From d1298dc3f374be4f2996c742f2abba8e7ad2c035 Mon Sep 17 00:00:00 2001 From: eson Date: Mon, 17 Aug 2020 13:10:29 +0800 Subject: [PATCH] =?UTF-8?q?finish:=20=E9=87=8D=E6=9E=84openrec?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- extractor/openrec_extractor/main.go | 3 +- .../openrec_extractor/openrec_extractor.go | 238 ++++++------------ extractor/openrec_extractor/openrec_test.go | 3 +- go.mod | 2 +- go.sum | 2 + 5 files changed, 84 insertions(+), 164 deletions(-) diff --git a/extractor/openrec_extractor/main.go b/extractor/openrec_extractor/main.go index 4596421..469a80d 100644 --- a/extractor/openrec_extractor/main.go +++ b/extractor/openrec_extractor/main.go @@ -15,6 +15,5 @@ import ( */ func main() { - oe := &OpenrecExtractor{} - oe.Execute() + Execute() } diff --git a/extractor/openrec_extractor/openrec_extractor.go b/extractor/openrec_extractor/openrec_extractor.go index 4ac446f..4bd51df 100644 --- a/extractor/openrec_extractor/openrec_extractor.go +++ b/extractor/openrec_extractor/openrec_extractor.go @@ -3,11 +3,9 @@ package main import ( "database/sql" "encoding/json" + "errors" "intimate" "log" - "regexp" - "strconv" - "strings" "time" "github.com/474420502/extractor" @@ -17,20 +15,23 @@ import ( var estore = intimate.NewStoreExtractor() var sstore = intimate.NewStoreSource(string(intimate.STOpenrec)) -// OpenrecExtractor 提取方法 -type OpenrecExtractor struct { - user *intimate.ExtractorSource - userLive *intimate.ExtractorSource - supporters *intimate.ExtractorSource -} - +//UserInfo 提取信息的结构体 type UserInfo struct { UserName string `exp:"//p[ contains(@class, 'c-global__user__profile__list__name__text')]"` - Followers int `exp:"//p[@class='c-global__user__count__row__right js-userCountFollowers']" mth:"r:ParseNumber"` - Views int `exp:"//ul[@class='c-contents']//p[@class='c-thumbnailVideo__footer__liveCount']" mth:"r:ExtractNumber"` + Followers int64 `exp:"//p[@class='c-global__user__count__row__right 
js-userCountFollowers']" mth:"r:ParseNumber"` + Views int64 `exp:"//ul[@class='c-contents']//p[@class='c-thumbnailVideo__footer__liveCount']" mth:"r:ExtractNumber"` } -func (oe *OpenrecExtractor) Execute() { +//UserLive 提取信息的结构体 +type UserLive struct { + Title string `exp:"//h1[contains(@class,'MovieTitle__Title')]"` + LiveStartTime string `exp:"//meta[@itemprop='uploadDate']/@content"` + LiveEndTime string `exp:"//meta[@itemprop='duration']/@content"` + Tags []string `exp:"//a[contains(@class,'TagButton')]"` +} + +// Execute 执行 +func Execute() { ps := intimate.NewPerfectShutdown() @@ -47,6 +48,7 @@ func (oe *OpenrecExtractor) Execute() { time.Sleep(time.Second * 5) continue } + lasterr = nil sdata := source.Ext.([]byte) datamap := gjson.ParseBytes(sdata).Map() @@ -56,31 +58,80 @@ func (oe *OpenrecExtractor) Execute() { streamer := &intimate.Streamer{} streamer.UserId = userId - // streamer.Platform = intimate.Popenrec + // streamer.Platform = intimate.Popenrec 不需要更新字段 htmlUser := datamap["html_user"] - oe.user = intimate.NewExtractorSource(&htmlUser) - oe.user.CreateExtractor() userEtor := extractor.ExtractHtmlString(htmlUser.String()) - log.Println(userEtor.GetObjectByTag(UserInfo{})) + ui, ok1 := userEtor.GetObjectByTag(UserInfo{}).(*UserInfo) htmlLive := datamap["html_live"] - oe.userLive = intimate.NewExtractorSource(&htmlLive) - oe.userLive.CreateExtractor() + + liveEtor := extractor.ExtractHtmlString(htmlLive.String()) + ul, ok2 := liveEtor.GetObjectByTag(UserLive{}).(*UserLive) jsonSupporters := datamap["json_supporters"] - oe.supporters = intimate.NewExtractorSource(&jsonSupporters) clog := &intimate.CollectLog{} - // log.Println(anchorId) + if ok1 { + clog.Followers = sql.NullInt64{Int64: ui.Followers, Valid: true} + clog.Views = sql.NullInt64{Int64: ui.Views, Valid: true} + if ui.Views != 0 { + clog.IsLiveStreaming = true + } + streamer.UserName = sql.NullString{String: ui.UserName, Valid: true} - oe.extractFollowers(clog) - oe.extractUserName(streamer) 
- oe.extractViewsAndLiveStreaming(clog) - oe.extractGiversAndGratuity(clog) - oe.extractLive(clog) - oe.extractTags(clog) + giverjson := jsonSupporters + var givers []interface{} + var gratuity int64 = 0 + + for _, v := range giverjson.Array() { + giverSource := gjson.Parse(v.String()) + for _, item := range giverSource.Get("data.items").Array() { + givers = append(givers, item.Map()) + gratuity += item.Get("total_yells").Int() + } + } + + giversbytes, err := json.Marshal(givers) + if err != nil { + log.Println(err) + clog.ErrorMsg = sql.NullString{String: err.Error(), Valid: true} + } else { + clog.Giver = giversbytes + } + + clog.Gratuity = sql.NullInt64{Int64: gratuity, Valid: true} + } else { + log.Println("UserInfo may be not exists") + estore.UpdateError(streamer, errors.New("UserInfo may be not exists")) + continue + } + + //log.Println(ul) + if ok2 { + clog.LiveTitle = sql.NullString{String: ul.Title, Valid: true} + + startTime, err := time.ParseInLocation("2006-01-02T15:04:05Z07:00", ul.LiveStartTime, time.Local) + if err != nil { + log.Println(err) + } else { + clog.LiveStartTime = sql.NullTime{Time: startTime.Local(), Valid: true} + duration, err := intimate.ParseDuration(ul.LiveEndTime) + if err != nil { + log.Println(err) + } else { + endTime := startTime.Add(duration) + clog.LiveEndTime = sql.NullTime{Time: endTime.Local(), Valid: true} + } + } + + if tags, err := json.Marshal(ul.Tags); err == nil { + clog.Tags = tags + } else { + log.Println("json error", ul.Tags, clog.Tags) + } + } streamer.Uid = source.StreamerId.Int64 streamer.UpdateTime = source.UpdateTime @@ -89,6 +140,7 @@ func (oe *OpenrecExtractor) Execute() { clog.Platform = intimate.Popenrec clog.UserId = userId clog.UpdateTime = source.UpdateTime + clog.StreamerUid = streamer.Uid logUid := estore.InsertClog(clog) @@ -113,135 +165,3 @@ func (oe *OpenrecExtractor) Execute() { } } - -func (oe *OpenrecExtractor) extractFollowers(clog intimate.ISet) { - extractor := oe.user.GetExtractor() - 
xp, err := extractor.XPathResult("//p[@class='c-global__user__count__row__right js-userCountFollowers']/text()") - if err != nil { - log.Println(err) - } - if !xp.NodeIter().Next() { - log.Println("不存在粉丝数") - } - - followers := strings.ReplaceAll(xp.String(), ",", "") - followersInt, err := strconv.ParseInt(followers, 10, 64) - if err != nil { - log.Println(err) - } - - clog.Set("Followers", sql.NullInt64{Int64: followersInt, Valid: true}) -} - -func (oe *OpenrecExtractor) extractUserName(streamer intimate.ISet) { - extractor := oe.user.GetExtractor() - xp, err := extractor.XPathResult("//p[ contains(@class, 'c-global__user__profile__list__name__text')]/text()") - if err != nil { - log.Println(err) - } else { - if xp.NodeIter().Next() { - userName := xp.String() - streamer.Set("UserName", sql.NullString{String: userName, Valid: true}) - } - } -} - -func (oe *OpenrecExtractor) extractViewsAndLiveStreaming(clog intimate.ISet) { - extractor := oe.user.GetExtractor() - // c-contents - xp, err := extractor.XPathResult("//ul[@class='c-contents']//p[@class='c-thumbnailVideo__footer__liveCount']/text()") - if err != nil { - log.Println(err) - } - - if xp.NodeIter().Next() { - views := regexp.MustCompile(`[0-9,]+`).FindString(xp.String()) - views = strings.ReplaceAll(views, ",", "") - viewsint, err := strconv.Atoi(views) - if err != nil { - log.Println(err) - } - - clog.Set("Views", sql.NullInt64{Int64: int64(viewsint), Valid: true}) - clog.Set("IsLiveStreaming", true) - } -} - -func (oe *OpenrecExtractor) extractGiversAndGratuity(clog intimate.ISet) { - // extractor := oe.user.GetExtractor() - giverjson := oe.supporters.GetSource() - var givers []interface{} - var gratuity int64 = 0 - - for _, v := range giverjson.Array() { - giverSource := gjson.Parse(v.String()) - for _, item := range giverSource.Get("data.items").Array() { - givers = append(givers, item.Map()) - gratuity += item.Get("total_yells").Int() - } - } - - giversbytes, err := json.Marshal(givers) - if err != 
nil { - log.Println(err) - clog.Set("ErrorMsg", sql.NullString{String: err.Error(), Valid: true}) - } else { - clog.Set("Giver", giversbytes) - } - - clog.Set("Gratuity", sql.NullInt64{Int64: gratuity, Valid: true}) -} - -func (oe *OpenrecExtractor) extractLive(clog intimate.ISet) { - extractor := oe.userLive.GetExtractor() - mathes := regexp.MustCompile("MovieTitle__Title[^>]+>(.{1,50})").FindStringSubmatch(oe.userLive.GetSource().Str) - if len(mathes) == 2 { - - clog.Set("LiveTitle", sql.NullString{String: mathes[1], Valid: true}) - - content, err := extractor.XPathResult("//meta[@itemprop='uploadDate']/@content") - if err != nil { - log.Println(err) - } - - iter := content.NodeIter() - if iter.Next() { - tm, err := time.ParseInLocation("2006-01-02T15:04:05Z07:00", iter.Node().NodeValue(), time.Local) - if err != nil { - log.Println(err) - } - // log.Println(iter.Node().NodeValue(), tm.Local()) - clog.Set("LiveStartTime", sql.NullTime{Time: tm.Local(), Valid: true}) - - duration, err := extractor.XPathResult("//meta[@itemprop='duration']/@content") - if err != nil { - log.Println(err) - } - - diter := duration.NodeIter() - if diter.Next() { - - dt, err := intimate.ParseDuration(diter.Node().NodeValue()) - if err != nil { - log.Println(err) - } - endtm := tm.Add(dt) - clog.Set("LiveEndTime", sql.NullTime{Time: endtm.Local(), Valid: true}) - } - } - } -} - -func (oe *OpenrecExtractor) extractTags(clog intimate.ISet) { - var tags []string - matheslist := regexp.MustCompile(`<[^>]+TagButton[^>]+>([^<]{1,100})<`).FindAllStringSubmatch(oe.userLive.GetSource().Str, -1) - for _, m := range matheslist { - tags = append(tags, m[1]) - } - tagsBytes, err := json.Marshal(tags) - if err != nil { - log.Println(err) - } - - clog.Set("Tags", tagsBytes) -} diff --git a/extractor/openrec_extractor/openrec_test.go b/extractor/openrec_extractor/openrec_test.go index 2028e34..e9bc878 100644 --- a/extractor/openrec_extractor/openrec_test.go +++ 
b/extractor/openrec_extractor/openrec_test.go @@ -114,6 +114,5 @@ func TestUserName(t *testing.T) { } func TestExtractor(t *testing.T) { - oe := &OpenrecExtractor{} - oe.Execute() + Execute() } diff --git a/go.mod b/go.mod index 16c756a..e58f8be 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module intimate go 1.14 require ( - github.com/474420502/extractor v0.9.4-0.20200814111732-bc270321f8f9 + github.com/474420502/extractor v0.9.4-0.20200817020657-7d26da5d1e89 github.com/474420502/focus v0.12.0 github.com/474420502/gcurl v0.2.0 github.com/474420502/hunter v0.3.4 diff --git a/go.sum b/go.sum index 1573290..e71679a 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/474420502/extractor v0.9.3 h1:Cjri64DbgWQQ64EjPiBSQfUH9l0cYlzU8py0PQu github.com/474420502/extractor v0.9.3/go.mod h1:Ss0KTfwsdB4XBpNda/V50rx21V9bl6/eQmyl50mjAS4= github.com/474420502/extractor v0.9.4-0.20200814111732-bc270321f8f9 h1:cxgsTQwRJSiML4yBL40n/0pD/FbEqkCIXE7qq6hJyLg= github.com/474420502/extractor v0.9.4-0.20200814111732-bc270321f8f9/go.mod h1:Ss0KTfwsdB4XBpNda/V50rx21V9bl6/eQmyl50mjAS4= +github.com/474420502/extractor v0.9.4-0.20200817020657-7d26da5d1e89 h1:6g4sPgooFdsVAdxNMhP6sqKQ0Z5EPBb4tGj9/absPoY= +github.com/474420502/extractor v0.9.4-0.20200817020657-7d26da5d1e89/go.mod h1:Ss0KTfwsdB4XBpNda/V50rx21V9bl6/eQmyl50mjAS4= github.com/474420502/focus v0.12.0 h1:+icbmj7IEOefvTegHt5EpcHt6WFbe2miIrceUJx2Evo= github.com/474420502/focus v0.12.0/go.mod h1:d0PMjtMxFz1a9HIhwyFPkWa+JF+0LgOrEUfd8iZka6s= github.com/474420502/gcurl v0.1.2 h1:ON9Yz3IgAdtDlFlHfkAJ3aIEBDxH0RiViPE5ST5ohKg=