2 changes: 2 additions & 0 deletions catalog/common/models.py
@@ -35,6 +35,7 @@ class SiteName(models.TextChoices):
     Goodreads = "goodreads", _("Goodreads")
     GoogleBooks = "googlebooks", _("Google Books")
     BooksTW = "bookstw", _("BooksTW")
+    Bookwyrm = "bookwyrm", _("Bookwyrm")
     BibliotekDK = "bibliotekdk", _("Bibliotek.dk")
     BibliotekDK_eReolen = "eReolen", _("eReolen.dk")
     IMDB = "imdb", _("IMDb")
@@ -82,6 +83,7 @@ class IdType(models.TextChoices):
     DoubanDrama = "doubandrama", _("Douban Drama")
     DoubanDramaVersion = "doubandrama_version", _("Douban Drama Version")
     BooksTW = "bookstw", _("BooksTW Book")
+    Bookwyrm = "bookwyrm", _("Bookwyrm")
     BibliotekDK_Edition = "bibliotekdk_edition", _("Bibliotek.dk")
     BibliotekDK_eReolen = "bibliotekdk_ereolen", _("eReolen.dk")
     BibliotekDK_Work = "bibliotekdk_work", _("Bibliotek.dk")
1 change: 1 addition & 0 deletions catalog/sites/__init__.py
@@ -7,6 +7,7 @@
 from .bgg import BoardGameGeek
 from .bibliotek_dk import BibliotekDK_Edition, BibliotekDK_Work
 from .bookstw import BooksTW
+from .bookwyrm import Bookwyrm
 from .discogs import DiscogsMaster, DiscogsRelease
 from .douban_book import DoubanBook
 from .douban_drama import DoubanDrama
128 changes: 128 additions & 0 deletions catalog/sites/bookwyrm.py
@@ -0,0 +1,128 @@
import re
from urllib.parse import urlparse

from lxml.html import fromstring

from catalog.common import *
from catalog.models import Edition
from common.models import detect_language


@SiteManager.register
class Bookwyrm(AbstractSite):
    SITE_NAME = SiteName.Bookwyrm
    ID_TYPE = IdType.Bookwyrm
    DEFAULT_MODEL = Edition
    URL_PATTERNS = []

    @classmethod
    def id_to_url(cls, id_value):
        return id_value

    @classmethod
    def url_to_id(cls, url: str):
        return url

    @classmethod
    def validate_url_fallback(cls, url: str):
        parsed = urlparse(url)
        probe_url = "https://" + parsed.hostname + "/nodeinfo/2.0"  # type: ignore

Review comment (Member): would be good if the result of the host check could be cached.
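
A minimal sketch of one way to cache that per-host check, assuming Django's cache framework is available in this project; the helper name, cache key, and timeout below are illustrative and not part of this PR (CachedDownloader comes from catalog.common via the star import above):

    from django.core.cache import cache

    def is_bookwyrm_host(hostname: str) -> bool:
        # Cache the per-host nodeinfo probe so repeated validations of
        # URLs on the same instance skip the network round-trip.
        key = "bookwyrm_host:" + hostname
        cached = cache.get(key)
        if cached is None:
            try:
                info = (
                    CachedDownloader("https://" + hostname + "/nodeinfo/2.0")
                    .download()
                    .json()
                )
                cached = (info.get("software") or {}).get("name") == "bookwyrm"
            except Exception:
                cached = False
            cache.set(key, cached, 3600)  # re-probe a host after an hour
        return cached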

        software = (
            CachedDownloader(probe_url).download().json().get("software").get("name")
        )

Review comment (medium): This code block could raise exceptions if the probe_url is unreachable or if the JSON response doesn't contain the expected structure. Consider adding a try...except block to handle potential RequestException or KeyError exceptions and return False in case of failure.

Suggested change:

    try:
        software = (
            CachedDownloader(probe_url).download().json().get("software").get("name")
        )
    except (requests.RequestException, KeyError):
        return False

        if software == "bookwyrm":
            p = parsed.path
            if re.compile("^/book/[0-9]+").match(p):
                return True
            else:
                return False
        else:
            return False
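
For reference, the probe above keys off the NodeInfo 2.0 document that fediverse servers publish at /nodeinfo/2.0; a Bookwyrm instance's response is shaped roughly like the dict below (abridged, values illustrative only):

    # Illustrative shape of a /nodeinfo/2.0 response; only "software.name"
    # matters to the check above.
    nodeinfo = {
        "version": "2.0",
        "software": {"name": "bookwyrm", "version": "x.y"},
        "protocols": ["activitypub"],
    }
    assert nodeinfo.get("software", {}).get("name") == "bookwyrm"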

    def scrape(self, response=None):
        r = BasicDownloader(self.id_value).download()
        tree = fromstring(r.text)
        data = {}
        title = "".join(tree.xpath("//h1[contains(@itemprop,'name')]//text()")).strip()  # type: ignore

        author = tree.xpath("//a[contains(@itemprop,'author')]//text()")
        isbn = "".join(tree.xpath("//dd[contains(@itemprop,'isbn')]//text()")).replace(  # type: ignore
            "-", ""
        )

        pub_date = (
            "".join(
                map(
                    str,
                    tree.xpath("//meta[contains(@itemprop,'datePublished')]/@content"),  # type: ignore
                )
            )
            .strip()
            .split("-")
        )

        pub_house = "".join(
            map(str, tree.xpath("//meta[contains(@itemprop,'publisher')]/@content"))  # type: ignore
        ).strip()

        cover_src = tree.xpath("//img[contains(@class,'book-cover')]/@src")[0]  # type: ignore

Review comment (medium): This line assumes that cover_src is always present in the xpath result. If the element is not found, indexing it will raise an IndexError. Consider checking that the list is not empty before accessing the first element.

Suggested change:

    cover_src = tree.xpath("//img[contains(@class,'book-cover')]/@src")
    cover_src = cover_src[0] if cover_src else None  # type: ignore

        pages = "".join(
            map(str, tree.xpath("//meta[contains(@itemprop,'numberOfPages')]/@content"))  # type: ignore
        ).strip()

        brief = "".join(
            tree.xpath("//div[contains(@itemprop,'abstract')]//text()")  # type: ignore
        ).strip()

        subtitle = "".join(
            map(
                str,
                tree.xpath(
                    "//meta[contains(@itemprop,'alternativeHeadline')]/@content"  # type: ignore
                ),
            )
        ).strip()

        series = "".join(
            tree.xpath(
                "//span[contains(@itemprop,'isPartOf')]//span[contains(@itemprop,'name')]//text()"  # type: ignore
            )
        ).strip()

        lang = detect_language(title + " " + brief) if brief else detect_language(title)

        book_base = "https://" + urlparse(self.id_value).hostname  # type: ignore
        if re.compile("^https://").match(cover_src):  # type: ignore
            data["cover_image_url"] = cover_src
        else:
            data["cover_image_url"] = book_base + cover_src if cover_src else None  # type: ignore

        if len(pub_date) == 3:
            data["pub_year"] = pub_date[0]
            data["pub_month"] = pub_date[1]
Review comment on lines +108 to +110 (medium): It's good to check the length of pub_date, but consider adding a check to ensure that the elements at index 0 and 1 exist before accessing them. This will prevent IndexError if the pub_date list has fewer than 2 elements.

Suggested change:

    if len(pub_date) >= 2:
        data["pub_year"] = pub_date[0]
        data["pub_month"] = pub_date[1]


data["pub_house"] = pub_house if pub_house else None

data["pages"] = pages if pages else None

data["isbn"] = isbn if isbn else None

data["series"] = series if series else None

data["author"] = author

data["localized_title"] = [{"lang": lang, "text": title}]

data["localized_subtitle"] = (
[{"lang": lang, "text": subtitle}] if subtitle else None
)

data["localized_description"] = (
[{"lang": lang, "text": brief}] if brief else None
)

pd = ResourceContent(
metadata=data,
lookup_ids={IdType.ISBN: isbn},
)
return pd
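
To close the loop, a rough sketch of how this site class would typically be driven; the SiteManager lookup helper and the field access below mirror how other catalog site classes are used, so treat the helper name and the book id as assumptions rather than part of this diff:

    # Hypothetical driver code; get_site_by_url and the book id are assumptions.
    site = SiteManager.get_site_by_url("https://bookwyrm.social/book/12345")
    if site:
        content = site.scrape()  # the ResourceContent built above
        print(content.metadata["localized_title"])
        print(content.lookup_ids.get(IdType.ISBN))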