package main

import (
	"database/sql"
	"encoding/json"
	"log"
	"net/http"
	"net/url"
	"os"
	"os/signal"
	"syscall"
	"testing"
	"time"

	_ "net/http/pprof"

	"intimate"

	"github.com/474420502/extractor"
	"github.com/474420502/focus/compare"
	"github.com/474420502/focus/tree/heap"
	"github.com/474420502/requests"
)

// Test demonstrates how net/url escapes a path containing non-ASCII
// characters; it intentionally uses t.Error so both forms are printed.
func Test(t *testing.T) {
	rawurl := "https://twitcasting.tv/你好"
	u, _ := url.Parse(rawurl)
	t.Error(u.EscapedPath())
	t.Error(u.String())
}

// sstore is the source-storage instance that persists raw source data.
// See sql/intimate_source.sql for the concrete table layout.
var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting))

// estore is the extractor-storage connection instance.
var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()

// TestMain crawls twitcasting.tv breadth-first starting from the ranking
// index, discovering tag pages and streamer profiles, and persists
// StreamerList/Streamer rows through estore.
func TestMain(t *testing.T) {
	f, err := os.OpenFile("./log", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
	if err != nil {
		t.Fatal(err)
	}
	log.SetFlags(log.Llongfile | log.Ltime)
	log.SetOutput(f)

	// Expose pprof on :4040 for live profiling while the crawl runs.
	go func() {
		log.Println(http.ListenAndServe(":4040", nil))
	}()

	homeurl := "https://twitcasting.tv"
	searchurl := "https://twitcasting.tv/rankingindex.php"

	// queuedict is the visited set: every url ever queued, to avoid re-crawling.
	queuedict := make(map[string]bool)
	queue := heap.New(compare.String)
	queue.Put(searchurl)
	queuedict[searchurl] = true

	ses := requests.NewSession()
	ses.Config().SetTimeout(15)

	var surl interface{}
	var ok bool
	var debugsp *SearchProfile
	// content holds the most recently fetched page body; it is dumped to
	// error.html when the crawl panics or is interrupted.
	var content []byte

	// dumpContent writes the last fetched page for post-mortem debugging.
	dumpContent := func() {
		ef, eerr := os.OpenFile("./error.html", os.O_TRUNC|os.O_CREATE|os.O_RDWR, os.ModePerm)
		if eerr != nil {
			log.Println(eerr)
			return
		}
		ef.Write(content)
		ef.Close()
	}

	defer func() {
		if ierr := recover(); ierr != nil {
			log.Println(surl, debugsp)
			dumpContent()
			log.Panic(ierr)
		}
	}()

	go func() {
		// The channel must be buffered: os/signal sends without blocking,
		// so an unbuffered channel can miss the signal (go vet flags this).
		signalchan := make(chan os.Signal, 1)
		// SIGKILL and SIGSTOP can never be caught; register only the
		// trappable stop signals.
		signal.Notify(signalchan, syscall.SIGINT, syscall.SIGQUIT, syscall.SIGTERM)
		log.Println("accept stop command:", <-signalchan)
		dumpContent()
		os.Exit(1)
	}()

	// enqueue records a newly discovered list url exactly once: persists a
	// StreamerList row and schedules the url for crawling.
	enqueue := func(wurl string) {
		if queuedict[wurl] {
			return
		}
		sl := &intimate.StreamerList{}
		sl.Platform = intimate.Ptwitcasting
		sl.Url = wurl
		sl.Operator = 0
		sl.UpdateInterval = 120
		sl.UpdateTime = time.Now()
		estore.InsertStreamerList(sl)
		queue.Put(wurl)
		queuedict[wurl] = true
	}

	for surl, ok = queue.Pop(); ok; surl, ok = queue.Pop() {
		u, err := url.Parse(surl.(string))
		if err != nil {
			log.Println(err)
			continue
		}

		resp, err := ses.Get(u.String()).Execute()
		if err != nil {
			log.Println(err)
			log.Println(u.String(), surl)
			continue
		}
		content = resp.Content()

		// Reuse the already-fetched body instead of calling Content() twice.
		etor := extractor.ExtractXml(content)
		result, err := etor.XPath("//p[@class='taglist']/a[contains(@class, 'tag')]/@href")
		if err != nil {
			panic(err)
		}

		// Discover tag pages linked from the tag list.
		iter := result.NodeIter()
		for iter.Next() {
			wurl := homeurl + iter.Node().NodeValue()
			if !queuedict[wurl] {
				log.Println(wurl)
				enqueue(wurl)
			}
		}

		xps, err := etor.XPaths("//div[@class='tw-search-result-row']")
		if err != nil {
			log.Println(surl, err)
			continue
		}

		log.Println("extract tag")
		splist := xps.ForEachTag(SearchProfile{})
		log.Println("finish extract tag")

		// Derive user ids and enqueue each profile's tag pages.
		for _, isp := range splist {
			sp := isp.(*SearchProfile)
			if sp.LiveUrl == "" {
				continue
			}
			// LiveUrl has the form "/<userid>"; strip the leading slash.
			sp.UserId = sp.LiveUrl[1:]
			for i := 0; i < len(sp.TagUrl); i++ {
				wurl := homeurl + sp.TagUrl[i]
				sp.TagUrl[i] = wurl
				enqueue(wurl)
			}
		}

		log.Println("find user:", len(splist))

		// Persist every extracted profile as a Streamer row.
		for _, isp := range splist {
			sp := isp.(*SearchProfile)
			streamer := &intimate.Streamer{}
			streamer.Platform = intimate.Ptwitcasting
			streamer.LiveUrl = sql.NullString{String: sp.LiveUrl, Valid: true}
			if btags, err := json.Marshal(sp.Tag); err != nil {
				log.Println(err)
			} else {
				streamer.Tags = btags
			}
			streamer.UpdateInterval = 120
			streamer.UpdateTime = sql.NullTime{Time: time.Now(), Valid: true}
			streamer.UserName = sql.NullString{String: sp.UserName, Valid: true}
			streamer.UserId = sp.UserId
			debugsp = sp
			estore.InsertStreamer(streamer)
		}

		log.Println("finish remain", queue.Size())
	}
}

// SearchProfile maps one twitcasting search-result row to streamer fields.
// The `exp`/`method` struct tags are xpath expressions evaluated by the
// extractor package relative to each result-row node.
type SearchProfile struct {
	UserName string `exp:".//span[@class='username']" method:"Text"`
	// UserId is derived from LiveUrl after extraction; it has no xpath tag.
	// (formerly: `exp:".//span[@class='fullname']" method:"Text"`)
	UserId  string
	LiveUrl string   `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"`
	Tag     []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Text"`
	TagUrl  []string `exp:".//a[contains(@class, 'tag tag-mini')]" method:"Attribute,href Value"`
}