// Source: intimate/extractor/openrec_extractor/openrec_extractor.go
// (repository viewer listing: 263 lines, 6.7 KiB, Go)

package main
import (
"database/sql"
"encoding/json"
"intimate"
"log"
// (history-view residue) 2020-07-16 10:31:13 +00:00
"os"
"os/signal"
"regexp"
"strconv"
"strings"
// (history-view residue) 2020-07-16 10:31:13 +00:00
"sync/atomic"
"syscall"
"time"
"github.com/tidwall/gjson"
)
2020-07-22 12:00:02 +00:00
var estore = intimate.NewStoreExtractor()
var sstore = intimate.NewStoreSource(string(intimate.STOpenrec))
// OpenrecExtractor pulls streamer and live-stream fields out of one Openrec
// source record.
//
// All three sources are replaced by Execute for every record it processes:
//   - user:       built from the record's "html_user" field; read by the
//     follower, user-name and view-count extractors.
//   - userLive:   built from the record's "html_live" field; read by the
//     live-title/time and tag extractors.
//   - supporters: built from the record's "json_supporters" field; read by
//     the giver/gratuity extractor.
type OpenrecExtractor struct {
	user, userLive, supporters *intimate.ExtractorSource
}
2020-07-16 10:31:13 +00:00
func (oe *OpenrecExtractor) Execute() {
var loop int32 = 1
go func() {
signalchan := make(chan os.Signal)
signal.Notify(signalchan, syscall.SIGKILL, syscall.SIGQUIT, syscall.SIGTERM, syscall.SIGSTOP)
log.Println("accept stop command:", <-signalchan)
atomic.StoreInt32(&loop, 0)
}()
var lasterr error = nil
2020-07-22 12:00:02 +00:00
execute := func() bool {
var err error
// if sstore.PopCount() >= 1000 {
// if err = estore.Close(); err != nil {
// log.Println(err)
// }
// if err = sstore.Close(); err != nil {
// log.Println(err)
// }
// estore = intimate.NewStoreExtractor()
// sstore = intimate.NewStoreSource(string(intimate.STOpenrec))
// oe.supporters.Clear()
// oe.user.Clear()
// oe.userLive.Clear()
// runtime.GC() // 主动gc
// log.Println("1000次执行, gc 重新建立sql链接")
// }
source, err := sstore.Pop(intimate.TOpenrecUser, 0)
2020-07-16 10:31:13 +00:00
if err != nil {
if err != lasterr {
log.Println(err, lasterr)
lasterr = err
}
time.Sleep(time.Second * 2)
2020-07-22 12:00:02 +00:00
return true
2020-07-16 10:31:13 +00:00
}
sdata := source.Ext.([]byte)
datamap := gjson.ParseBytes(sdata).Map()
source.Operator = int32(intimate.OperatorError)
userId := datamap["var_user_id"].String()
2020-07-16 10:31:13 +00:00
streamer := &intimate.Streamer{}
streamer.UserId = userId
streamer.Platform = intimate.Popenrec
2020-07-16 10:31:13 +00:00
2020-07-22 12:00:02 +00:00
htmlUser := datamap["html_user"]
oe.user = intimate.NewExtractorSource(&htmlUser)
oe.user.CreateExtractor()
2020-07-16 10:31:13 +00:00
2020-07-22 12:00:02 +00:00
htmlLive := datamap["html_live"]
oe.userLive = intimate.NewExtractorSource(&htmlLive)
oe.userLive.CreateExtractor()
2020-07-16 10:31:13 +00:00
2020-07-22 12:00:02 +00:00
jsonSupporters := datamap["json_supporters"]
oe.supporters = intimate.NewExtractorSource(&jsonSupporters)
clog := &intimate.CollectLog{}
2020-07-16 10:31:13 +00:00
// log.Println(anchorId)
2020-07-16 10:31:13 +00:00
oe.extractFollowers(clog)
oe.extractUserName(streamer)
oe.extractViewsAndLiveStreaming(clog)
oe.extractGiversAndGratuity(clog)
oe.extractLive(clog)
oe.extractTags(clog)
2020-07-16 10:31:13 +00:00
streamer.Uid = source.StreamerId.Int64
streamer.UpdateTime = source.UpdateTime
2020-07-16 10:31:13 +00:00
clog.Platform = string(intimate.Popenrec)
clog.UserId = userId
clog.UpdateTime = source.UpdateTime
2020-07-16 10:31:13 +00:00
logUid := estore.InsertCollectLog(clog)
2020-07-16 10:31:13 +00:00
LiveUrl := "https://www.openrec.tv/live/" + userId
streamer.LiveUrl = sql.NullString{String: LiveUrl, Valid: true}
streamer.LatestLogUid = logUid
streamer.Operator = 0
estore.UpdateStreamer(streamer)
2020-07-16 10:31:13 +00:00
source.Operator = int32(intimate.OperatorExtractorOK)
sstore.UpdateOperator(source)
2020-07-16 10:31:13 +00:00
2020-07-22 12:00:02 +00:00
return true
}
for atomic.LoadInt32(&loop) > 0 {
if !execute() {
break
}
2020-07-16 10:31:13 +00:00
}
}
// extractFollowers reads the follower count from the user page and stores it
// in clog under "Followers" as a valid sql.NullInt64.
//
// Thousands separators (",") and surrounding whitespace are removed before
// parsing. Nothing is stored when the XPath query fails, the node is missing,
// or the text is not a base-10 integer, so a failed extraction is never
// recorded as a valid count of zero.
func (oe *OpenrecExtractor) extractFollowers(clog intimate.ISet) {
	extractor := oe.user.GetExtractor()
	xp, err := extractor.XPathResult("//p[@class='c-global__user__count__row__right js-userCountFollowers']/text()")
	if err != nil {
		// The result must not be used after a failed query.
		log.Println(err)
		return
	}
	if !xp.NodeIter().Next() {
		log.Println("不存在粉丝数")
		return
	}

	followers := strings.ReplaceAll(strings.TrimSpace(xp.String()), ",", "")
	followersInt, err := strconv.ParseInt(followers, 10, 64)
	if err != nil {
		log.Println(err)
		return
	}
	clog.Set("Followers", sql.NullInt64{Int64: followersInt, Valid: true})
}
// extractUserName looks up the profile-name text node on the user page and,
// when one is present, stores its text in streamer under "UserName" as a
// valid sql.NullString. A failed XPath query is logged and nothing is stored.
func (oe *OpenrecExtractor) extractUserName(streamer intimate.ISet) {
	const nameXPath = "//p[ contains(@class, 'c-global__user__profile__list__name__text')]/text()"

	result, err := oe.user.GetExtractor().XPathResult(nameXPath)
	if err != nil {
		log.Println(err)
		return
	}
	if !result.NodeIter().Next() {
		return
	}
	streamer.Set("UserName", sql.NullString{String: result.String(), Valid: true})
}
func (oe *OpenrecExtractor) extractViewsAndLiveStreaming(clog intimate.ISet) {
extractor := oe.user.GetExtractor()
// c-contents
xp, err := extractor.XPathResult("//ul[@class='c-contents']//p[@class='c-thumbnailVideo__footer__liveCount']/text()")
if err != nil {
log.Println(err)
}
2020-07-22 12:00:02 +00:00
if xp.NodeIter().Next() {
views := regexp.MustCompile(`[0-9,]+`).FindString(xp.String())
views = strings.ReplaceAll(views, ",", "")
viewsint, err := strconv.Atoi(views)
if err != nil {
log.Println(err)
}
clog.Set("Views", sql.NullInt64{Int64: int64(viewsint), Valid: true})
clog.Set("IsLiveStreaming", true)
}
}
// extractGiversAndGratuity walks the supporters source, which is treated as a
// JSON array whose elements are themselves JSON documents.
//
// Every entry under "data.items" of each element is collected as a giver, and
// its "total_yells" value is added to the running gratuity total. The givers
// are JSON-encoded and stored under "Giver"; if encoding fails the error is
// logged and its message is stored under "ErrorMsg" instead. The total is
// always stored under "Gratuity" as a valid sql.NullInt64.
func (oe *OpenrecExtractor) extractGiversAndGratuity(clog intimate.ISet) {
	var (
		collected []interface{}
		total     int64
	)

	for _, page := range oe.supporters.GetSource().Array() {
		parsed := gjson.Parse(page.String())
		for _, supporter := range parsed.Get("data.items").Array() {
			collected = append(collected, supporter.Map())
			total += supporter.Get("total_yells").Int()
		}
	}

	if encoded, err := json.Marshal(collected); err == nil {
		clog.Set("Giver", encoded)
	} else {
		log.Println(err)
		clog.Set("ErrorMsg", sql.NullString{String: err.Error(), Valid: true})
	}

	clog.Set("Gratuity", sql.NullInt64{Int64: total, Valid: true})
}
// openrecLiveTitleRe captures the text of the <h1> whose tag contains
// "MovieTitle__Title". The capture is limited to 1-50 characters, so a longer
// title does not match at all. Compiled once instead of once per record.
var openrecLiveTitleRe = regexp.MustCompile("MovieTitle__Title[^>]+>(.{1,50})</h1>")

// extractLive fills the live-stream fields of clog from the live page.
//
// If the title pattern does not match, nothing is stored. Otherwise:
//   - "LiveTitle" is set to the captured title.
//   - "LiveStartTime" is set from the uploadDate <meta> content, parsed as
//     RFC 3339 and converted to local time.
//   - "LiveEndTime" is set to the start time plus the duration <meta> content.
//
// Each step stops at the first query or parse failure (after logging it), so
// a field is only stored when its value was actually parsed; the zero time is
// never recorded as a valid start, and the start time is never recorded as
// the end.
func (oe *OpenrecExtractor) extractLive(clog intimate.ISet) {
	extractor := oe.userLive.GetExtractor()

	mathes := openrecLiveTitleRe.FindStringSubmatch(oe.userLive.GetSource().Str)
	if len(mathes) != 2 {
		return
	}
	clog.Set("LiveTitle", sql.NullString{String: mathes[1], Valid: true})

	content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content")
	if err != nil {
		log.Println(err)
		return
	}
	iter := content.NodeIter()
	if !iter.Next() {
		return
	}
	// time.RFC3339 is the same layout string the literal used to spell out.
	tm, err := time.ParseInLocation(time.RFC3339, iter.Node().NodeValue(), time.Local)
	if err != nil {
		log.Println(err)
		return
	}
	clog.Set("LiveStartTime", sql.NullTime{Time: tm.Local(), Valid: true})

	duration, err := extractor.XPathResult("//meta[@itemprop='duration']/@content")
	if err != nil {
		log.Println(err)
		return
	}
	diter := duration.NodeIter()
	if !diter.Next() {
		return
	}
	dt, err := intimate.ParseDuration(diter.Node().NodeValue())
	if err != nil {
		log.Println(err)
		return
	}
	clog.Set("LiveEndTime", sql.NullTime{Time: tm.Add(dt).Local(), Valid: true})
}
func (oe *OpenrecExtractor) extractTags(clog intimate.ISet) {
var tags []string
2020-07-16 10:31:13 +00:00
matheslist := regexp.MustCompile(`<[^>]+TagButton[^>]+>([^<]{1,100})<`).FindAllStringSubmatch(oe.userLive.GetSource().Str, -1)
for _, m := range matheslist {
tags = append(tags, m[1])
}
tagsBytes, err := json.Marshal(tags)
if err != nil {
log.Println(err)
}
clog.Set("Tags", tagsBytes)
}