Extractor upgrade

This commit is contained in:
eson 2020-08-05 18:49:47 +08:00
parent 6158976986
commit 0bff7169ec
10 changed files with 180 additions and 15 deletions

View File

@ -3,6 +3,7 @@ package intimate
import ( import (
"database/sql" "database/sql"
"reflect" "reflect"
"time"
"github.com/474420502/hunter" "github.com/474420502/hunter"
"github.com/tidwall/gjson" "github.com/tidwall/gjson"
@ -11,6 +12,34 @@ import (
type GetSet struct { type GetSet struct {
} }
type StreamerList struct {
UrlHash []byte //
Platform Platform //
Url string //
Label sql.NullString //
Serialize interface{}
UpdateInterval int32
UpdateTime time.Time //
ErrorMsg sql.NullString
Operator int32
LastOperator int32
}
// Get Simple Value
func (sl *StreamerList) Get(field string) interface{} {
return reflect.ValueOf(sl).Elem().FieldByName(field).Interface()
}
// Set Simple Value
func (sl *StreamerList) Set(field string, value interface{}) {
reflect.ValueOf(sl).Elem().FieldByName(field).Set(reflect.ValueOf(value))
}
type Streamer struct { type Streamer struct {
Uid int64 // Uid int64 //
Platform Platform // Platform Platform //

1
go.mod
View File

@ -3,6 +3,7 @@ module intimate
go 1.14 go 1.14
require ( require (
github.com/474420502/extractor v0.2.2
github.com/474420502/focus v0.12.0 github.com/474420502/focus v0.12.0
github.com/474420502/gcurl v0.1.2 github.com/474420502/gcurl v0.1.2
github.com/474420502/hunter v0.3.4 github.com/474420502/hunter v0.3.4

8
go.sum
View File

@ -2,12 +2,16 @@ cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMT
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU=
cloud.google.com/go v0.41.0/go.mod h1:OauMR7DV8fzvZIl2qg6rkaIhD/vmgk4iwEw/h6ercmg= cloud.google.com/go v0.41.0/go.mod h1:OauMR7DV8fzvZIl2qg6rkaIhD/vmgk4iwEw/h6ercmg=
github.com/474420502/extractor v0.2.2 h1:hGao2iZt5CEI8oqYjQW938osQdHKgNWL/bwRJQNgHTM=
github.com/474420502/extractor v0.2.2/go.mod h1:OVFijdKLDghigpIYISHzlognL5q8eeVenT2fRhCyFns=
github.com/474420502/focus v0.12.0 h1:+icbmj7IEOefvTegHt5EpcHt6WFbe2miIrceUJx2Evo= github.com/474420502/focus v0.12.0 h1:+icbmj7IEOefvTegHt5EpcHt6WFbe2miIrceUJx2Evo=
github.com/474420502/focus v0.12.0/go.mod h1:d0PMjtMxFz1a9HIhwyFPkWa+JF+0LgOrEUfd8iZka6s= github.com/474420502/focus v0.12.0/go.mod h1:d0PMjtMxFz1a9HIhwyFPkWa+JF+0LgOrEUfd8iZka6s=
github.com/474420502/gcurl v0.1.2 h1:ON9Yz3IgAdtDlFlHfkAJ3aIEBDxH0RiViPE5ST5ohKg= github.com/474420502/gcurl v0.1.2 h1:ON9Yz3IgAdtDlFlHfkAJ3aIEBDxH0RiViPE5ST5ohKg=
github.com/474420502/gcurl v0.1.2/go.mod h1:hws5q/Ao64bXLLDnldz9VyTQUndTWc/i5DzdEazFfoM= github.com/474420502/gcurl v0.1.2/go.mod h1:hws5q/Ao64bXLLDnldz9VyTQUndTWc/i5DzdEazFfoM=
github.com/474420502/hunter v0.3.4 h1:fyLAgI84jWe3IcqsISC53j1w3CXI1FERxX//Potns0M= github.com/474420502/hunter v0.3.4 h1:fyLAgI84jWe3IcqsISC53j1w3CXI1FERxX//Potns0M=
github.com/474420502/hunter v0.3.4/go.mod h1:pe4Xr/I+2agvq339vS/OZV+EiHAWtpXQs75rioSW9oA= github.com/474420502/hunter v0.3.4/go.mod h1:pe4Xr/I+2agvq339vS/OZV+EiHAWtpXQs75rioSW9oA=
github.com/474420502/libxml2 v0.0.0-20200803084225-29e441d26406 h1:nLvl2D2y+hxCglLnRmLqwRGwmUsXQt8ga46zGySTU1I=
github.com/474420502/libxml2 v0.0.0-20200803084225-29e441d26406/go.mod h1:bUbcte7hFuLijGG6/+gGxurW3XvxE/CBdfAAlsIWj34=
github.com/474420502/requests v1.6.0 h1:f4h4j40eT0P5whhg9LdkotD8CaKjtuDu/vz9iSUkCgY= github.com/474420502/requests v1.6.0 h1:f4h4j40eT0P5whhg9LdkotD8CaKjtuDu/vz9iSUkCgY=
github.com/474420502/requests v1.6.0/go.mod h1:SLXrQ5dL9c7dkIeKNUCBAjOIt3J9KFCS2RQjWJecNwo= github.com/474420502/requests v1.6.0/go.mod h1:SLXrQ5dL9c7dkIeKNUCBAjOIt3J9KFCS2RQjWJecNwo=
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
@ -66,6 +70,8 @@ github.com/rogpeppe/go-charset v0.0.0-20180617210344-2471d30d28b4/go.mod h1:qgYe
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/tebeka/selenium v0.9.9 h1:cNziB+etNgyH/7KlNI7RMC1ua5aH1+5wUlFQyzeMh+w= github.com/tebeka/selenium v0.9.9 h1:cNziB+etNgyH/7KlNI7RMC1ua5aH1+5wUlFQyzeMh+w=
github.com/tebeka/selenium v0.9.9/go.mod h1:5Fr8+pUvU6B1OiPfkdCKdXZyr5znvVkxuPd0NOdZCQc= github.com/tebeka/selenium v0.9.9/go.mod h1:5Fr8+pUvU6B1OiPfkdCKdXZyr5znvVkxuPd0NOdZCQc=
github.com/tidwall/gjson v1.3.2/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls= github.com/tidwall/gjson v1.3.2/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls=
@ -162,6 +168,8 @@ gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU= gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU=
gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a h1:LJwr7TCTghdatWv40WobzlKXc9c4s8oGa7QKJUtHhWA= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a h1:LJwr7TCTghdatWv40WobzlKXc9c4s8oGa7QKJUtHhWA=

View File

@ -9,4 +9,7 @@ const (
// Ptwitch twitch 平台 // Ptwitch twitch 平台
Ptwitch Platform = "twitch" Ptwitch Platform = "twitch"
// Ptwitcasting twitcasting 平台
Ptwitcasting Platform = "twitcasting"
) )

View File

@ -1,6 +1,25 @@
create database if not exists `intimate_extractor`; create database if not exists `intimate_extractor`;
use intimate_extractor; use intimate_extractor;
CREATE TABLE IF NOT EXISTS `streamer_list` (
`urlhash` varchar(32) NOT NULL COMMENT '平台',
`url` text COMMENT 'url获取streamer列表的url',
`platform` varchar(255) NOT NULL COMMENT '平台',
`label` varchar(255) DEFAULT NULL COMMENT '必须的时候打上标签',
`serialize` blob DEFAULT NULL COMMENT '保存进程的必要计算数据',
`update_interval` int DEFAULT 120 COMMENT '分钟单位, 默认120分钟, 下次更新的时间间隔',
`update_time` Timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
`error_msg` text DEFAULT NULL COMMENT '错误信息',
`operator` int DEFAULT 0 COMMENT '操作标志位, 根据不同解析方法有不同标志',
PRIMARY KEY (`urlhash`),
KEY `platform_idx` (`platform`),
KEY `update_time_idx` (`update_time`),
KEY `operator_idx` (`operator`)
)
CREATE TABLE IF NOT EXISTS `streamer` ( CREATE TABLE IF NOT EXISTS `streamer` (
`uid` bigint AUTO_INCREMENT COMMENT '自增UID, 便于查询定位', `uid` bigint AUTO_INCREMENT COMMENT '自增UID, 便于查询定位',
`platform` varchar(255) NOT NULL COMMENT '平台', `platform` varchar(255) NOT NULL COMMENT '平台',

View File

@ -1,8 +1,11 @@
package intimate package intimate
import ( import (
"crypto/md5"
"database/sql" "database/sql"
"fmt"
"log" "log"
"strings"
"time" "time"
_ "github.com/go-sql-driver/mysql" _ "github.com/go-sql-driver/mysql"
@ -182,6 +185,9 @@ const StreamerTable string = "streamer"
// CollectLogTable 采集日志表 // CollectLogTable 采集日志表
const CollectLogTable string = "collect_log" const CollectLogTable string = "collect_log"
// StreamerListTable 主播表名称
const StreamerListTable string = "streamer_list"
type StoreExtractor struct { type StoreExtractor struct {
db *sql.DB db *sql.DB
@ -267,6 +273,48 @@ func (store *StoreExtractor) Pop(platform Platform, operators ...int32) (*Stream
return s, nil return s, nil
} }
// UpdateStreamerList streamerlist表, 更新数据
func (store *StoreExtractor) UpdateStreamerList(streamer IGet, fieldvalues ...interface{}) {
updateSQL := "UPDATE " + StreamerListTable + " SET "
var values []interface{}
for i := 0; i < len(fieldvalues); i += 2 {
field := fieldvalues[i]
values = append(values, fieldvalues[i+1])
updateSQL += field.(string) + " = ? "
}
updateSQL += "WHERE urlhash = ?"
values = append(values, streamer.Get("UrlHash"))
_, err := store.db.Exec(updateSQL, values...)
if err != nil {
panic(err)
}
}
// InsertStreamer streamerlist表, 插入数据
func (store *StoreExtractor) InsertStreamerList(streamerlist IGet) (isExists bool) {
urlstr := streamerlist.Get("Url").(string)
_, err := store.db.Exec("insert into streamer_list(urlhash, url, platform, label, serialize, update_interval, error_msg, operator) values(?,?,?,?,?,?,?,?)",
fmt.Sprintf("%x", md5.Sum([]byte(urlstr))),
urlstr,
streamerlist.Get("Platform"),
streamerlist.Get("Label"),
streamerlist.Get("Serialize"),
streamerlist.Get("UpdateInterval"),
streamerlist.Get("ErrorMsg"),
streamerlist.Get("Operator"),
)
if err != nil {
if !strings.HasPrefix(err.Error(), "Error 1062") {
log.Println(err)
}
return true
}
return false
}
// InsertStreamer Streamer表, 插入数据 // InsertStreamer Streamer表, 插入数据
func (store *StoreExtractor) InsertStreamer(streamer IGet) (isExists bool) { func (store *StoreExtractor) InsertStreamer(streamer IGet) (isExists bool) {
// select uid from table where platform = ? and user_id = ? // select uid from table where platform = ? and user_id = ?

View File

@ -9,4 +9,7 @@ const (
// STTwitch twitch源table名称 // STTwitch twitch源table名称
STTwitch SourceTable = "source_twitch" STTwitch SourceTable = "source_twitch"
// STTwitcasting STTwitcasting源table名称
STTwitcasting SourceTable = "source_twitcasting"
) )

View File

@ -1,6 +1,10 @@
package main package main
import ( import (
"intimate"
"time"
"github.com/474420502/extractor"
"github.com/474420502/focus/compare" "github.com/474420502/focus/compare"
"github.com/474420502/focus/tree/heap" "github.com/474420502/focus/tree/heap"
@ -8,9 +12,14 @@ import (
"testing" "testing"
"github.com/474420502/requests" "github.com/474420502/requests"
"github.com/lestrrat-go/libxml2"
) )
// sstore 源存储实例, 为存储源数据的实现. 表格具体参考sql/intimate_source.sql
var sstore *intimate.StoreSource = intimate.NewStoreSource(string(intimate.STTwitcasting))
// estore 解析存储连接实例
var estore *intimate.StoreExtractor = intimate.NewStoreExtractor()
func TestMain(t *testing.T) { func TestMain(t *testing.T) {
searchurl := "https://twitcasting.tv/rankingindex.php" searchurl := "https://twitcasting.tv/rankingindex.php"
@ -27,30 +36,73 @@ func TestMain(t *testing.T) {
panic(err) panic(err)
} }
doc, err := libxml2.ParseHTML(resp.Content()) etor := extractor.ExtractXml(resp.Content())
// doc, err := libxml2.ParseHTML(resp.Content())
// if err != nil {
// panic(err)
// }
// defer doc.Free()
result, err := etor.XPath("//*[contains(@class, 'tag')]/@href")
if err != nil { if err != nil {
panic(err) panic(err)
} }
defer doc.Free()
result, err := doc.Find("//*[contains(@class, 'tag')]/@href") // result, err := doc.Find("//*[contains(@class, 'tag')]/@href")
if err != nil { // if err != nil {
panic(err) // panic(err)
} // }
defer result.Free() // defer result.Free()
iter := result.NodeIter() iter := result.NodeIter()
for iter.Next() { for iter.Next() {
log.Println(iter.Node().NodeValue())
wurl := "https://twitcasting.tv" + iter.Node().NodeValue() wurl := "https://twitcasting.tv" + iter.Node().NodeValue()
if ok := queuedict[wurl]; !ok { if ok := queuedict[wurl]; !ok {
log.Println(wurl)
sl := &intimate.StreamerList{}
sl.Platform = intimate.Ptwitcasting
sl.Url = wurl
sl.Operator = 0
sl.UpdateInterval = 120
sl.UpdateTime = time.Now()
estore.InsertStreamerList(sl)
queue.Put(wurl) queue.Put(wurl)
queuedict[wurl] = true queuedict[wurl] = true
} }
} }
doc.Find("//div[@class='tw-search-result-row']") // doc.Find("//div[@class='tw-search-result-row']")
xps, err := etor.XPaths("//div[@class='tw-search-result-row']")
if err != nil {
log.Println(surl, err)
continue
}
// xps.ForEachTag(SearchProfile{})
// texts, errs := xps.ForEachText(".//span[@class='username']")
// if len(errs) > 0 {
// t.Error(errs)
// }
var splist = xps.ForEachTag(SearchProfile{})
for _, isp := range splist {
sp := isp.(*SearchProfile)
sp.UserId = sp.LiveUrl[1:]
// log.Println(sp.(SearchProfile))
}
for _, isp := range splist {
log.Println(isp.(*SearchProfile))
}
log.Println("finish remain", queue.Size()) log.Println("finish remain", queue.Size())
} }
} }
type SearchProfile struct {
UserName string `exp:".//span[@class='username']" method:"Text"`
UserId string // `exp:".//span[@class='fullname']" method:"Text"`
LiveUrl string `exp:".//div[@class='usertext']/a[@href]" method:"Attribute,href Value"`
}

View File

@ -29,13 +29,16 @@ func (cl *UserList) Execute() {
//article//a[@data-a-target='preview-card-title-link'] //article//a[@data-a-target='preview-card-title-link']
wd := intimate.GetChromeDriver(3030) wd := intimate.GetChromeDriver(3030)
defer wd.Close()
defer wd.Quit() defer wd.Quit()
ps := intimate.NewPerfectShutdown() ps := intimate.NewPerfectShutdown()
counter := intimate.NewCounter() counter := intimate.NewCounter()
counter.SetMaxLimit(100) counter.SetMaxLimit(100)
counter.SetMaxToDo(func(olist ...interface{}) error { counter.SetMaxToDo(func(olist ...interface{}) error {
owd := olist[0].(*selenium.WebDriver) owd := olist[0].(*selenium.WebDriver)
(*owd).Quit() if err := (*owd).Quit(); err != nil {
log.Println(err)
}
*owd = intimate.GetChromeDriver(3030) *owd = intimate.GetChromeDriver(3030)
return nil return nil
}, &wd) }, &wd)

View File

@ -103,13 +103,12 @@ func GetChromeDriver(port int) selenium.WebDriver {
panic(err) panic(err)
} }
runtime.SetFinalizer(wd, func(obj interface{}) { runtime.SetFinalizer(wd, func(obj interface{}) {
// if err := obj.(selenium.WebDriver).Close(); err != nil { if err := obj.(selenium.WebDriver).Close(); err != nil {
// log.Println(err) log.Println(err)
// } }
if err := obj.(selenium.WebDriver).Quit(); err != nil { if err := obj.(selenium.WebDriver).Quit(); err != nil {
log.Println(err) log.Println(err)
} }
}) })
wd.ExecuteScript("windows.navigator.webdriver = undefined", nil) wd.ExecuteScript("windows.navigator.webdriver = undefined", nil)
if err != nil { if err != nil {