intimate/tasks/twitcasting/twitcasting_task1/main_test.go

191 lines
4.8 KiB
Go
Raw Normal View History

2020-08-04 06:13:39 +00:00
package main
import (
2020-08-07 10:10:22 +00:00
"database/sql"
"encoding/json"
2020-08-05 10:49:47 +00:00
"intimate"
2020-08-07 10:10:22 +00:00
"net/http"
"net/url"
"os"
"os/signal"
"syscall"
2020-08-05 10:49:47 +00:00
"time"
"github.com/474420502/extractor"
2020-08-04 06:13:39 +00:00
"github.com/474420502/focus/compare"
"github.com/474420502/focus/tree/heap"
"log"
"testing"
2020-08-07 10:10:22 +00:00
_ "net/http/pprof"
2020-08-04 06:13:39 +00:00
"github.com/474420502/requests"
)
2020-08-07 10:10:22 +00:00
// Test is a scratch check of how url.Parse handles a path containing
// non-ASCII characters, reporting both the percent-escaped path and the
// reassembled URL.
//
// The original used t.Error as a print statement, which marked the test
// failed on every run; t.Log reports the same values (visible with
// `go test -v`) without failing the suite. The previously ignored
// url.Parse error is now checked.
func Test(t *testing.T) {
	rawurl := "https://twitcasting.tv/你好"
	u, err := url.Parse(rawurl)
	if err != nil {
		t.Fatal(err)
	}
	t.Log(u.EscapedPath())
	t.Log(u.String())
}
2020-08-05 10:49:47 +00:00
// sstore is the source-store instance used to persist raw scraped page
// data; see sql/intimate_source.sql for the backing table layout.
var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting))

// estore is the extractor-store connection used to persist parsed
// (extracted) results such as streamer rows and streamer lists.
var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
2020-08-04 06:13:39 +00:00
func TestMain(t *testing.T) {
2020-08-07 10:10:22 +00:00
f, _ := os.OpenFile("./log", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
log.SetFlags(log.Llongfile | log.Ltime)
log.SetOutput(f)
2020-08-04 06:13:39 +00:00
2020-08-07 10:10:22 +00:00
go func() {
log.Println(http.ListenAndServe(":4040", nil))
}()
homeurl := "https://twitcasting.tv"
2020-08-04 06:13:39 +00:00
searchurl := "https://twitcasting.tv/rankingindex.php"
queuedict := make(map[string]bool)
queue := heap.New(compare.String)
queue.Put(searchurl)
queuedict[searchurl] = true
2020-08-07 10:10:22 +00:00
ses := requests.NewSession()
ses.Config().SetTimeout(15)
var surl interface{}
var ok bool
var debugsp *SearchProfile
var content []byte
defer func() {
if ierr := recover(); ierr != nil {
log.Println(surl, debugsp)
f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
f.Write(content)
f.Close()
log.Panic(ierr)
}
}()
go func() {
signalchan := make(chan os.Signal)
signal.Notify(signalchan, syscall.SIGINT, syscall.SIGKILL, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGSTOP)
log.Println("accept stop command:", <-signalchan)
f, _ := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
f.Write(content)
f.Close()
os.Exit(1)
}()
for surl, ok = queue.Pop(); ok; surl, ok = queue.Pop() {
u, err := url.Parse(surl.(string))
if err != nil {
log.Println(err)
continue
}
2020-08-04 06:13:39 +00:00
2020-08-07 10:10:22 +00:00
resp, err := ses.Get(u.String()).Execute()
2020-08-04 06:13:39 +00:00
if err != nil {
2020-08-07 10:10:22 +00:00
log.Println(err)
log.Println(u.String(), surl)
continue
// log.Panic(err)
2020-08-04 06:13:39 +00:00
}
2020-08-07 10:10:22 +00:00
content = resp.Content()
2020-08-05 10:49:47 +00:00
etor := extractor.ExtractXml(resp.Content())
2020-08-07 10:10:22 +00:00
result, err := etor.XPath("//p[@class='taglist']/a[contains(@class, 'tag')]/@href")
2020-08-04 06:13:39 +00:00
if err != nil {
panic(err)
}
2020-08-05 10:49:47 +00:00
2020-08-04 06:13:39 +00:00
iter := result.NodeIter()
for iter.Next() {
2020-08-07 10:10:22 +00:00
wurl := homeurl + iter.Node().NodeValue()
2020-08-04 06:13:39 +00:00
if ok := queuedict[wurl]; !ok {
2020-08-05 10:49:47 +00:00
log.Println(wurl)
sl := &intimate.StreamerList{}
sl.Platform = intimate.Ptwitcasting
sl.Url = wurl
sl.Operator = 0
sl.UpdateInterval = 120
sl.UpdateTime = time.Now()
2020-08-07 10:10:22 +00:00
2020-08-05 10:49:47 +00:00
estore.InsertStreamerList(sl)
2020-08-07 10:10:22 +00:00
2020-08-04 06:13:39 +00:00
queue.Put(wurl)
queuedict[wurl] = true
}
}
2020-08-05 10:49:47 +00:00
// doc.Find("//div[@class='tw-search-result-row']")
xps, err := etor.XPaths("//div[@class='tw-search-result-row']")
if err != nil {
log.Println(surl, err)
continue
}
2020-08-07 10:10:22 +00:00
log.Println("extract tag")
2020-08-05 10:49:47 +00:00
var splist = xps.ForEachTag(SearchProfile{})
2020-08-07 10:10:22 +00:00
log.Println("finish extract tag")
2020-08-05 10:49:47 +00:00
for _, isp := range splist {
sp := isp.(*SearchProfile)
2020-08-07 10:10:22 +00:00
if sp.LiveUrl == "" {
continue
}
2020-08-05 10:49:47 +00:00
sp.UserId = sp.LiveUrl[1:]
2020-08-07 10:10:22 +00:00
for i := 0; i < len(sp.TagUrl); i++ {
wurl := homeurl + sp.TagUrl[i]
sp.TagUrl[i] = wurl
if ok := queuedict[wurl]; !ok {
sl := &intimate.StreamerList{}
sl.Platform = intimate.Ptwitcasting
sl.Url = wurl
sl.Operator = 0
sl.UpdateInterval = 120
sl.UpdateTime = time.Now()
estore.InsertStreamerList(sl)
queue.Put(wurl)
queuedict[wurl] = true
}
}
2020-08-05 10:49:47 +00:00
// log.Println(sp.(SearchProfile))
}
2020-08-07 10:10:22 +00:00
log.Println("find user:", len(splist))
2020-08-05 10:49:47 +00:00
for _, isp := range splist {
2020-08-07 10:10:22 +00:00
sp := isp.(*SearchProfile)
// log.Println(sp)
streamer := &intimate.Streamer{}
streamer.Platform = intimate.Ptwitcasting
streamer.LiveUrl = sql.NullString{String: sp.LiveUrl, Valid: true}
if btags, err := json.Marshal(sp.Tag); err != nil {
log.Println(err)
} else {
streamer.Tags = btags
}
streamer.UpdateInterval = 120
streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true}
streamer.UserName = sql.NullString{String: sp.UserName, Valid: true}
streamer.UserId = sp.UserId
debugsp = sp
estore.InsertStreamer(streamer)
2020-08-05 10:49:47 +00:00
}
2020-08-04 06:13:39 +00:00
log.Println("finish remain", queue.Size())
}
}
2020-08-05 10:49:47 +00:00
// SearchProfile is the per-row extraction target for the
// tw-search-result-row blocks; the extractor package fills it from the
// `exp` (XPath) and `method` struct tags via xps.ForEachTag.
type SearchProfile struct {
	// UserName is the streamer's display name.
	UserName string `exp:".//span[@class='username']" method:"Text"`
	// UserId is not extracted directly; it is derived from LiveUrl
	// (leading slash stripped) after extraction.
	UserId string // `exp:".//span[@class='fullname']" method:"Text"`
	// LiveUrl is the href of the streamer's page, relative to the site root.
	LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"`
	// Tag holds the visible text of each mini tag on the result row.
	Tag []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Text"`
	// TagUrl holds each tag link's href; rewritten to absolute URLs
	// after extraction.
	TagUrl []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Attribute,href Value"`
}