scraper: update RealjamVR scraper for their new website (#1117)

* RealjamVr scraper rewrite * Update scraper_id introduced in PR #1113
xbapps · Feb 6, 2023 · 555ec5f · 555ec5f
1 parent cc2e759
commit 555ec5f
Show file tree

Hide file tree

Showing 2 changed files with 126 additions and 37 deletions.
diff --git a/pkg/models/model_scraper.go b/pkg/models/model_scraper.go
@@ -18,6 +18,7 @@ type Scraper struct {
 
 type ScrapedScene struct {
 	SceneID     string   `json:"_id"`
+	ScraperID   string   `json:"xbvr_site"`
 	SiteID      string   `json:"scene_id"`
 	SceneType   string   `json:"scene_type"`
 	Title       string   `json:"title"`

diff --git a/pkg/scrape/realjamvr.go b/pkg/scrape/realjamvr.go
@@ -1,14 +1,17 @@
 package scrape
 
 import (
-	"net/url"
+	"encoding/json"
+	"net/http"
 	"regexp"
 	"strconv"
 	"strings"
 	"sync"
+	"time"
 
 	"github.com/gocolly/colly"
 	"github.com/mozillazg/go-slugify"
+	"github.com/nleeper/goment"
 	"github.com/thoas/go-funk"
 	"github.com/xbapps/xbvr/pkg/models"
 )
@@ -22,89 +25,174 @@ func RealJamVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out ch
 	sceneCollector := createCollector("realjamvr.com")
 	siteCollector := createCollector("realjamvr.com")
 
+	var c = siteCollector.Cookies("realjamvr.com")
+	cookie := http.Cookie{Name: "age_confirmed", Value: "Tru", Domain: "realjamvr.com", Path: "/", Expires: time.Now().Add(time.Hour)}
+	c = append(c, &cookie)
+	siteCollector.SetCookies("https://realjamvr.com", c)
+	sceneCollector.SetCookies("https://realjamvr.com", c)
+
 	sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
 		sc := models.ScrapedScene{}
+		sc.ScraperID = scraperID
 		sc.SceneType = "VR"
 		sc.Studio = "Real Jam Network"
 		sc.Site = siteID
 		sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]
+		if strings.HasSuffix(sc.HomepageURL, "/") {
+			// make the homepage URL consistent (no trailing slash)
+			sc.HomepageURL = sc.HomepageURL[0 : len(sc.HomepageURL)-1]
+		}
 
-		// Scene ID - get from URL
-		tmp := strings.Split(sc.HomepageURL, "/")
-		sc.SiteID = strings.Split(tmp[len(tmp)-1], "-")[0]
-		sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
+		// source the scene_id from the trailer filename.  This is not the best approach, but it is the only id source we have
+		trailerId := ""
+		e.ForEach(`dl8-video source[src]`, func(id int, e *colly.HTMLElement) {
+			re := regexp.MustCompile(`/([0-9]+)_[0-9]+p.mp4.`)
+			match := re.FindStringSubmatch(e.Attr("src"))
+			if len(match) > 0 {
+				if trailerId != "" {
+					if trailerId != match[1] {
+						// don't trust trailer files, make sure they all return the same id
+						trailerId = "mismatch"
+					}
+				}
+				_, err := strconv.Atoi(match[1])
+				if err == nil {
+					// only assign the id if it's a valid number
+					trailerId = match[1]
+				}
+			}
+		})
+		sc.SceneID = slugify.Slugify(sc.Site) + "-" + trailerId
 
 		// trailer details
-		sc.TrailerType = "deovr"
-		sc.TrailerSrc = `https://realjamvr.com/deovr/video/id/` + sc.SiteID
+		sc.TrailerType = "scrape_html"
+		params := models.TrailerScrape{SceneUrl: sc.HomepageURL, HtmlElement: "dl8-video source", ContentPath: "src", QualityPath: "quality"}
+		strParams, _ := json.Marshal(params)
+		sc.TrailerSrc = string(strParams)
 
 		// Cast
-		e.ForEach(`.featuring a`, func(id int, e *colly.HTMLElement) {
+		e.ForEach(`div.scene-view a[href^='/actor/']`, func(id int, e *colly.HTMLElement) {
 			sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
 		})
 
-		// Duration
-		sc.Duration, _ = strconv.Atoi(strings.Split(strings.TrimSpace(e.ChildText(`.duration`)), ":")[0])
-
 		// Released
-		sc.Released = strings.TrimSuffix(strings.TrimSpace(e.ChildText(`.date`)), ",")
+		e.ForEach(`.bi-calendar3`, func(id int, e *colly.HTMLElement) {
+			p := e.DOM.Parent()
+			d, err := goment.New(p.Text(), "MMM DD, YYYY")
+			if err != nil {
+				log.Infof("%v", err)
+			}
+			sc.Released = d.Format("YYYY-MM-DD")
+		})
+
+		// Duration
+		e.ForEach(`.bi-clock-history`, func(id int, e *colly.HTMLElement) {
+			p := e.DOM.Parent()
+			t, _ := time.Parse("15:04:05", p.Text())
+			sc.Duration = t.Minute() + t.Hour()*60
+		})
 
 		// Title
 		sc.Title = strings.TrimSpace(e.ChildText(`h1`))
 
 		// Cover URL
-		re := regexp.MustCompile(`background(?:-image)?\s*?:\s*?url\s*?\(\s*?(.*?)\s*?\)`)
-		coverURL := re.FindStringSubmatch(strings.TrimSpace(e.ChildAttr(`.splash-screen`, "style")))[1]
-		if len(coverURL) > 0 {
-			sc.Covers = append(sc.Covers, coverURL)
-		}
+		e.ForEach(`dl8-video`, func(id int, e *colly.HTMLElement) {
+			coverURL := e.Attr("poster")
+			if len(coverURL) > 0 {
+				sc.Covers = append(sc.Covers, coverURL)
+			}
+		})
 
 		// Gallery
-		e.ForEach(`.scene-previews-container a`, func(id int, e *colly.HTMLElement) {
-			sc.Gallery = append(sc.Gallery, strings.TrimSpace(e.Attr("href")))
+		e.ForEach(`.img-wrapper img`, func(id int, e *colly.HTMLElement) {
+			sc.Gallery = append(sc.Gallery, strings.TrimSpace(e.Attr("src")))
 		})
 
 		// Synopsis
-		sc.Synopsis = strings.TrimSpace(e.ChildText(`div.desc`))
+		e.ForEach(`div.my-2`, func(id int, e *colly.HTMLElement) {
+			if !strings.HasPrefix(strings.TrimSpace(e.Text), "Tags:") {
+				sc.Synopsis = strings.TrimSpace(e.Text)
+			}
+		})
 
 		// Tags
-		e.ForEach(`div.tags a`, func(id int, e *colly.HTMLElement) {
+		e.ForEach(`div a.tag`, func(id int, e *colly.HTMLElement) {
 			sc.Tags = append(sc.Tags, strings.TrimSpace(e.Text))
 		})
 
 		// Filenames
-		set := make(map[string]struct{})
-		e.ForEach(`.downloads a`, func(id int, e *colly.HTMLElement) {
-			u, _ := url.Parse(e.Attr("href"))
-			q := u.Query()
-			r, _ := regexp.Compile("attachment; filename=\"(.*?)\"")
-			match := r.FindStringSubmatch(q.Get("response-content-disposition"))
-			if len(match) > 0 {
-				set[match[1]] = struct{}{}
+		cnt := 0
+		fileMask := ""
+		// any "download/" links on the public site will be for trailers; use one trailer to get the basis of the scene's filenames
+		e.ForEach(`a[href^='download/']`, func(id int, e *colly.HTMLElement) {
+			if cnt == 0 {
+				trailerurl := sc.HomepageURL + "/" + e.Attr("href")
+				// url does not point directly to a file, need to resolve redirects with http.Head
+				resp, _ := http.Head(trailerurl)
+				fileMask = strings.Split(strings.Split(resp.Request.URL.String(), "attachment%3B%20filename%3D")[1], "&")[0]
+				tmp := strings.Split(fileMask, "_")
+				fileMask = strings.TrimSuffix(tmp[0], "-Trailer") + "-Full_$res_$fps_" + tmp[3] + "_" + tmp[4]
+				cnt += 1
+			}
+		})
+
+		// any "/join/" links on the public site will be for the full movie
+		uniqueFilenames := make(map[string]bool)
+		e.ForEach(`a[href='/join/']`, func(id int, e *colly.HTMLElement) {
+			resolution := ""
+			fps := ""
+			e.ForEach(`div div`, func(id int, e *colly.HTMLElement) {
+				txt := strings.TrimSpace(e.Text)
+				if strings.HasPrefix(txt, "Full ") {
+					index := strings.Index(txt, "p")
+					if index != -1 {
+						resolution = txt[5:index]
+					}
+				} else {
+					if strings.HasSuffix(txt, "fps") {
+						fps = strings.TrimSuffix(txt, "fps")
+					}
+				}
+			})
+			if resolution != "" && fps != "" {
+				filename := strings.Replace(fileMask, "$res", resolution, 1)
+				filename = strings.Replace(filename, "$fps", fps, 1)
+				if !uniqueFilenames[filename] {
+					uniqueFilenames[filename] = true
+					sc.Filenames = append(sc.Filenames, filename)
+				}
 			}
 		})
-		for f := range set {
-			sc.Filenames = append(sc.Filenames, strings.ReplaceAll(strings.ReplaceAll(f, " ", "_"), ":", "_"))
-		}
 
-		out <- sc
+		switch trailerId {
+		case "":
+			log.Errorf("Could not determine Scene Id for %, Id not found", sc.HomepageURL)
+		case "mismatch":
+			log.Errorf("Could not determine Scene Id for %, inconsistent trailer filenames", sc.HomepageURL)
+		default:
+			out <- sc
+		}
 	})
 
-	siteCollector.OnHTML(`.c-pagination a`, func(e *colly.HTMLElement) {
+	siteCollector.OnHTML(`a.page-link`, func(e *colly.HTMLElement) {
 		pageURL := e.Request.AbsoluteURL(e.Attr("href"))
 		siteCollector.Visit(pageURL)
 	})
 
-	siteCollector.OnHTML(`div.movies-list a:not(.promo__info):not(.c-pagination a)`, func(e *colly.HTMLElement) {
+	siteCollector.OnHTML(`div.panel a`, func(e *colly.HTMLElement) {
 		sceneURL := e.Request.AbsoluteURL(e.Attr("href"))
 
+		if strings.HasSuffix(sceneURL, "/") {
+			// make a consistent URL
+			sceneURL = sceneURL[0 : len(sceneURL)-1]
+		}
 		// If scene exist in database, there's no need to scrape
-		if !funk.ContainsString(knownScenes, sceneURL) && !strings.Contains(sceneURL, "/join") {
+		if !funk.ContainsString(knownScenes, sceneURL) && strings.Contains(sceneURL, "realjamvr.com/scene/") {
 			sceneCollector.Visit(sceneURL)
 		}
 	})
 
-	siteCollector.Visit("https://realjamvr.com/virtualreality/list")
+	siteCollector.Visit("https://realjamvr.com/scenes")
 
 	if updateSite {
 		updateSiteLastUpdate(scraperID)