Skip to content

Commit

Permalink
scraper: update RealjamVR scraper for their new website (#1117)
Browse files Browse the repository at this point in the history
* RealjamVr scraper rewrite

* Update scraper_id introduced in PR #1113
  • Loading branch information
toshski authored Feb 6, 2023
1 parent cc2e759 commit 555ec5f
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 37 deletions.
1 change: 1 addition & 0 deletions pkg/models/model_scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ type Scraper struct {

type ScrapedScene struct {
SceneID string `json:"_id"`
ScraperID string `json:"xbvr_site"`
SiteID string `json:"scene_id"`
SceneType string `json:"scene_type"`
Title string `json:"title"`
Expand Down
162 changes: 125 additions & 37 deletions pkg/scrape/realjamvr.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
package scrape

import (
"net/url"
"encoding/json"
"net/http"
"regexp"
"strconv"
"strings"
"sync"
"time"

"github.com/gocolly/colly"
"github.com/mozillazg/go-slugify"
"github.com/nleeper/goment"
"github.com/thoas/go-funk"
"github.com/xbapps/xbvr/pkg/models"
)
Expand All @@ -22,89 +25,174 @@ func RealJamVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out ch
sceneCollector := createCollector("realjamvr.com")
siteCollector := createCollector("realjamvr.com")

var c = siteCollector.Cookies("realjamvr.com")
cookie := http.Cookie{Name: "age_confirmed", Value: "Tru", Domain: "realjamvr.com", Path: "/", Expires: time.Now().Add(time.Hour)}
c = append(c, &cookie)
siteCollector.SetCookies("https://realjamvr.com", c)
sceneCollector.SetCookies("https://realjamvr.com", c)

sceneCollector.OnHTML(`html`, func(e *colly.HTMLElement) {
sc := models.ScrapedScene{}
sc.ScraperID = scraperID
sc.SceneType = "VR"
sc.Studio = "Real Jam Network"
sc.Site = siteID
sc.HomepageURL = strings.Split(e.Request.URL.String(), "?")[0]
if strings.HasSuffix(sc.HomepageURL, "/") {
// make homepage url consistent
sc.HomepageURL = sc.HomepageURL[0 : len(sc.HomepageURL)-1]
}

// Scene ID - get from URL
tmp := strings.Split(sc.HomepageURL, "/")
sc.SiteID = strings.Split(tmp[len(tmp)-1], "-")[0]
sc.SceneID = slugify.Slugify(sc.Site) + "-" + sc.SiteID
// source the scene_id from the trailer filename. This is not the best approach but the only id source we have
trailerId := ""
e.ForEach(`dl8-video source[src]`, func(id int, e *colly.HTMLElement) {
re := regexp.MustCompile(`/([0-9]+)_[0-9]+p.mp4.`)
match := re.FindStringSubmatch(e.Attr("src"))
if len(match) > 0 {
if trailerId != "" {
if trailerId != match[1] {
// don't trust trailer files, make sure they all return the same id
trailerId = "mismatch"
}
}
_, err := strconv.Atoi(match[1])
if err == nil {
// only assign the id if it's a valid number
trailerId = match[1]
}
}
})
sc.SceneID = slugify.Slugify(sc.Site) + "-" + trailerId

// trailer details
sc.TrailerType = "deovr"
sc.TrailerSrc = `https://realjamvr.com/deovr/video/id/` + sc.SiteID
sc.TrailerType = "scrape_html"
params := models.TrailerScrape{SceneUrl: sc.HomepageURL, HtmlElement: "dl8-video source", ContentPath: "src", QualityPath: "quality"}
strParams, _ := json.Marshal(params)
sc.TrailerSrc = string(strParams)

// Cast
e.ForEach(`.featuring a`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div.scene-view a[href^='/actor/']`, func(id int, e *colly.HTMLElement) {
sc.Cast = append(sc.Cast, strings.TrimSpace(e.Text))
})

// Duration
sc.Duration, _ = strconv.Atoi(strings.Split(strings.TrimSpace(e.ChildText(`.duration`)), ":")[0])

// Released
sc.Released = strings.TrimSuffix(strings.TrimSpace(e.ChildText(`.date`)), ",")
e.ForEach(`.bi-calendar3`, func(id int, e *colly.HTMLElement) {
p := e.DOM.Parent()
d, err := goment.New(p.Text(), "MMM DD, YYYY")
if err != nil {
log.Infof("%v", err)
}
sc.Released = d.Format("YYYY-MM-DD")
})

// Duration
e.ForEach(`.bi-clock-history`, func(id int, e *colly.HTMLElement) {
p := e.DOM.Parent()
t, _ := time.Parse("15:04:05", p.Text())
sc.Duration = t.Minute() + t.Hour()*60
})

// Title
sc.Title = strings.TrimSpace(e.ChildText(`h1`))

// Cover URL
re := regexp.MustCompile(`background(?:-image)?\s*?:\s*?url\s*?\(\s*?(.*?)\s*?\)`)
coverURL := re.FindStringSubmatch(strings.TrimSpace(e.ChildAttr(`.splash-screen`, "style")))[1]
if len(coverURL) > 0 {
sc.Covers = append(sc.Covers, coverURL)
}
e.ForEach(`dl8-video`, func(id int, e *colly.HTMLElement) {
coverURL := e.Attr("poster")
if len(coverURL) > 0 {
sc.Covers = append(sc.Covers, coverURL)
}
})

// Gallery
e.ForEach(`.scene-previews-container a`, func(id int, e *colly.HTMLElement) {
sc.Gallery = append(sc.Gallery, strings.TrimSpace(e.Attr("href")))
e.ForEach(`.img-wrapper img`, func(id int, e *colly.HTMLElement) {
sc.Gallery = append(sc.Gallery, strings.TrimSpace(e.Attr("src")))
})

// Synopsis
sc.Synopsis = strings.TrimSpace(e.ChildText(`div.desc`))
e.ForEach(`div.my-2`, func(id int, e *colly.HTMLElement) {
if !strings.HasPrefix(strings.TrimSpace(e.Text), "Tags:") {
sc.Synopsis = strings.TrimSpace(e.Text)
}
})

// Tags
e.ForEach(`div.tags a`, func(id int, e *colly.HTMLElement) {
e.ForEach(`div a.tag`, func(id int, e *colly.HTMLElement) {
sc.Tags = append(sc.Tags, strings.TrimSpace(e.Text))
})

// Filenames
set := make(map[string]struct{})
e.ForEach(`.downloads a`, func(id int, e *colly.HTMLElement) {
u, _ := url.Parse(e.Attr("href"))
q := u.Query()
r, _ := regexp.Compile("attachment; filename=\"(.*?)\"")
match := r.FindStringSubmatch(q.Get("response-content-disposition"))
if len(match) > 0 {
set[match[1]] = struct{}{}
cnt := 0
fileMask := ""
// any "download/" links on the public site will be for trailers, use one trailer to get the basis of the scenes filenames
e.ForEach(`a[href^='download/']`, func(id int, e *colly.HTMLElement) {
if cnt == 0 {
trailerurl := sc.HomepageURL + "/" + e.Attr("href")
// url does not point directly to a file, need to resolve redirects with http.Head
resp, _ := http.Head(trailerurl)
fileMask = strings.Split(strings.Split(resp.Request.URL.String(), "attachment%3B%20filename%3D")[1], "&")[0]
tmp := strings.Split(fileMask, "_")
fileMask = strings.TrimSuffix(tmp[0], "-Trailer") + "-Full_$res_$fps_" + tmp[3] + "_" + tmp[4]
cnt += 1
}
})

// any "/join/" links on the public site will be for the full movie
uniqueFilenames := make(map[string]bool)
e.ForEach(`a[href='/join/']`, func(id int, e *colly.HTMLElement) {
resolution := ""
fps := ""
e.ForEach(`div div`, func(id int, e *colly.HTMLElement) {
txt := strings.TrimSpace(e.Text)
if strings.HasPrefix(txt, "Full ") {
index := strings.Index(txt, "p")
if index != -1 {
resolution = txt[5:index]
}
} else {
if strings.HasSuffix(txt, "fps") {
fps = strings.TrimSuffix(txt, "fps")
}
}
})
if resolution != "" && fps != "" {
filename := strings.Replace(fileMask, "$res", resolution, 1)
filename = strings.Replace(filename, "$fps", fps, 1)
if !uniqueFilenames[filename] {
uniqueFilenames[filename] = true
sc.Filenames = append(sc.Filenames, filename)
}
}
})
for f := range set {
sc.Filenames = append(sc.Filenames, strings.ReplaceAll(strings.ReplaceAll(f, " ", "_"), ":", "_"))
}

out <- sc
switch trailerId {
case "":
log.Errorf("Could not determine Scene Id for %, Id not found", sc.HomepageURL)
case "mismatch":
log.Errorf("Could not determine Scene Id for %, inconsistent trailer filenames", sc.HomepageURL)
default:
out <- sc
}
})

siteCollector.OnHTML(`.c-pagination a`, func(e *colly.HTMLElement) {
siteCollector.OnHTML(`a.page-link`, func(e *colly.HTMLElement) {
pageURL := e.Request.AbsoluteURL(e.Attr("href"))
siteCollector.Visit(pageURL)
})

siteCollector.OnHTML(`div.movies-list a:not(.promo__info):not(.c-pagination a)`, func(e *colly.HTMLElement) {
siteCollector.OnHTML(`div.panel a`, func(e *colly.HTMLElement) {
sceneURL := e.Request.AbsoluteURL(e.Attr("href"))

if strings.HasSuffix(sceneURL, "/") {
// make a consistent URL
sceneURL = sceneURL[0 : len(sceneURL)-1]
}
// If scene exist in database, there's no need to scrape
if !funk.ContainsString(knownScenes, sceneURL) && !strings.Contains(sceneURL, "/join") {
if !funk.ContainsString(knownScenes, sceneURL) && strings.Contains(sceneURL, "realjamvr.com/scene/") {
sceneCollector.Visit(sceneURL)
}
})

siteCollector.Visit("https://realjamvr.com/virtualreality/list")
siteCollector.Visit("https://realjamvr.com/scenes")

if updateSite {
updateSiteLastUpdate(scraperID)
Expand Down

0 comments on commit 555ec5f

Please sign in to comment.