[draft] import_page: support file scheme and use bs4 to work around missing 'body' element #456

Open · wants to merge 6 commits into base: master
Changes from 2 commits
2 changes: 1 addition & 1 deletion v7/import_page/README.md → v8/import_page/README.md
@@ -1,4 +1,4 @@
-Plugin to import arbitrary web pages.
+Plugin to import arbitrary web pages (from a URL or a local file).
 
 Usage:
 
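The remainder of the Usage section is collapsed in this view. Based on the command's behavior in the diff below (each positional argument is passed to `_import_page`), an invocation would look like the following; the local-file form is what this PR adds, and both arguments are placeholders:

    nikola import_page https://example.com/some-page.html
    nikola import_page path/to/local-page.html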
File renamed without changes.
File renamed without changes.
75 changes: 46 additions & 29 deletions v7/import_page/import_page.py → v8/import_page/import_page.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright © 2015 Roberto Alsina and others
+# Copyright © 2025 Roberto Alsina and others
 
 # Permission is hereby granted, free of charge, to any
 # person obtaining a copy of this software and associated
@@ -29,10 +29,9 @@
 import codecs
 
 try:
-    import libextract.api
+    from bs4 import BeautifulSoup
 except ImportError:
-    libextract = None
-import lxml.html
+    BeautifulSoup = None
 import requests
 import sys
 
@@ -61,34 +60,52 @@ class CommandImportPage(Command):

     def _execute(self, options, args):
         """Import a Page."""
-        if libextract is None:
-            utils.req_missing(['libextract'], 'use the import_page plugin')
+        if BeautifulSoup is None:
+            utils.req_missing(['bs4'], 'use the import_page plugin')
         for url in args:
             self._import_page(url)
 
     def _import_page(self, url):
-        r = requests.get(url)
-        if 199 < r.status_code < 300:  # Got it
-            # Use the page's title
-            doc = lxml.html.fromstring(r.content)
-            title = doc.find('*//title').text
-            if sys.version_info[0] == 2 and isinstance(title, str):
-                title = title.decode('utf-8')
-            try:
-                slug = utils.slugify(title, lang='')
-            except TypeError:
-                slug = utils.slugify(title)
-            nodes = list(libextract.api.extract(r.content))
-            # Let's assume the node with more text is the good one
-            lengths = [len(n.text_content()) for n in nodes]
-            node = nodes[lengths.index(max(lengths))]
-            document = doc_template.format(
-                title=title,
-                slug=slug,
-                content=lxml.html.tostring(node, encoding='utf8', method='html', pretty_print=True).decode('utf8')
-            )
-            with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
-                outf.write(document)
-        else:
-            LOGGER.error('Error fetching URL: {}'.format(url))
+        parse = requests.utils.urlparse(url)
+        if 'http' in parse.scheme:
+            r = requests.get(url)
+            if not (199 < r.status_code < 300):  # Did not get it
+                LOGGER.error(f'Error fetching URL: {url}')
+                return 1
+            html = r.content.decode(r.encoding).encode('utf-8') if r.encoding and 'utf-8' \
+                not in r.encoding.lower() else r.content
+        else:
+            try:
+                with open(url, 'rb') as f:
+                    html = f.read()
+            except FileNotFoundError:
+                LOGGER.error(f'Error file does not exist: {url}')
+                return 1
+            except (OSError, IOError) as e:
+                LOGGER.error(f'Error opening file "{url}": {e}')
+                return 1
+
+        try:
+            soup = BeautifulSoup(html, "lxml")
+        except ImportError:
+            soup = BeautifulSoup(html, "html.parser")
+
+        title = soup.title.text if soup.title else "Untitled Page"
+        try:
+            slug = utils.slugify(title, lang='')
+        except TypeError:
+            slug = utils.slugify(title)
+
+        candidates = soup.find_all(["p", "div", "article", "section"])
+        if candidates:
+            node = max(candidates, key=lambda n: len(n.get_text(strip=True)))
+        else:
+            node = None  # empty
+
+        document = doc_template.format(
+            title=title,
+            slug=slug,
+            content=node.get_text(strip=True)
+        )
+        with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
+            outf.write(document)
File renamed without changes.
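A note on the scheme dispatch above: `requests.utils.urlparse` is the standard library's `urllib.parse.urlparse` re-exported, so `'http' in parse.scheme` routes both `http` and `https` URLs through `requests.get`, and everything else falls through to `open()`. A small sketch of what that test sees for different inputs:

    from urllib.parse import urlparse  # requests.utils.urlparse is this same function

    print(urlparse('https://example.com/page.html').scheme)  # 'https' -> requests.get branch
    print(urlparse('http://example.com/page.html').scheme)   # 'http'  -> requests.get branch
    print(urlparse('pages/local.html').scheme)               # ''      -> open() branch
    print(urlparse('file:///tmp/page.html').scheme)          # 'file'  -> open() branch, but
    # open('file:///tmp/page.html', 'rb') raises FileNotFoundError, so true file://
    # support would presumably need to pass urlparse(url).path rather than the raw URL.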
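The re-encoding one-liner in the `http` branch is dense enough to be easy to misread. An expanded equivalent with the same semantics (`response_as_utf8` is a hypothetical name, not part of the diff):

    import requests

    def response_as_utf8(r: requests.Response) -> bytes:
        # Hypothetical helper: expanded form of the one-line conditional above.
        if r.encoding and 'utf-8' not in r.encoding.lower():
            # The server declared a non-UTF-8 charset: decode with it, then
            # re-encode so BeautifulSoup always receives UTF-8 bytes.
            return r.content.decode(r.encoding).encode('utf-8')
        # No declared charset, or already UTF-8: pass the raw bytes through.
        return r.content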
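On the parser fallback: when lxml is not installed, `BeautifulSoup(html, "lxml")` raises `bs4.FeatureNotFound` (a `ValueError` subclass), not `ImportError`, so the `except ImportError` clause in the diff never fires. A sketch of a fallback that catches the right exception (`make_soup` is a hypothetical helper):

    from bs4 import BeautifulSoup, FeatureNotFound

    def make_soup(html):
        # Hypothetical helper: prefer lxml, which tolerates markup that lacks
        # a <body> element, and fall back to the stdlib parser otherwise.
        try:
            return BeautifulSoup(html, "lxml")
        except FeatureNotFound:
            # Raised when the requested tree builder is unavailable; an
            # ImportError never propagates out of the BeautifulSoup call.
            return BeautifulSoup(html, "html.parser")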
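On the content heuristic: an element's `get_text()` includes the text of all of its descendants, so an outer wrapper `<div>` will usually outscore the `<article>` nested inside it, and the selection tends toward nearly the whole page. Also, if `candidates` is empty, `node` is `None` and the later `node.get_text(strip=True)` raises `AttributeError`. A minimal illustration of the wrapper effect:

    from bs4 import BeautifulSoup

    html = """<html><div id="wrapper">
    <p>navigation crumbs</p>
    <article>A much longer piece of actual article text lives here.</article>
    </div></html>"""

    soup = BeautifulSoup(html, "html.parser")
    candidates = soup.find_all(["p", "div", "article", "section"])
    node = max(candidates, key=lambda n: len(n.get_text(strip=True)))
    print(node.name)  # 'div': the wrapper wins because its text includes every child's

Note also that `content=node.get_text(strip=True)` flattens all markup, so unlike the old `lxml.html.tostring(...)` path, the imported document keeps plain text only.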