-
Notifications
You must be signed in to change notification settings - Fork 99
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[draft] import_page: support file scheme and use bs4 to workaround missing 'body' element #456
base: master
Are you sure you want to change the base?
Changes from all commits
f7757d1
1df98ec
a066672
e8a5507
ab15bed
82cda9e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# Copyright © 2025 Roberto Alsina and others | ||
|
||
# Permission is hereby granted, free of charge, to any | ||
# person obtaining a copy of this software and associated | ||
# documentation files (the "Software"), to deal in the | ||
# Software without restriction, including without limitation | ||
# the rights to use, copy, modify, merge, publish, | ||
# distribute, sublicense, and/or sell copies of the | ||
# Software, and to permit persons to whom the Software is | ||
# furnished to do so, subject to the following conditions: | ||
# | ||
# The above copyright notice and this permission notice | ||
# shall be included in all copies or substantial portions of | ||
# the Software. | ||
# | ||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY | ||
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE | ||
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR | ||
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS | ||
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR | ||
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR | ||
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | ||
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
|
||
from __future__ import unicode_literals, print_function | ||
|
||
import codecs | ||
|
||
try: | ||
from bs4 import BeautifulSoup | ||
except ImportError: | ||
BeautifulSoup = None | ||
import requests | ||
import sys | ||
|
||
from nikola.plugin_categories import Command | ||
from nikola import utils | ||
|
||
# Logger for this plugin, obtained from nikola.utils with the STDERR handler.
LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER) | ||
|
||
# Snapshot of the process command line (minus the program name), taken once
# when this module is imported.
args = sys.argv[1:] | ||
# Module-level defaults for the CSS selector, the extractor expression and the
# list of pages to import; the trailing comments show example values.
selector = None # 'body' | ||
extractor = None # 'lambda node: BeautifulSoup(node.decode_contents(), "html.parser").prettify()' | ||
urls = [] | ||
|
||
# Template for each generated page, filled in with str.format using the
# fields {title}, {slug} and {content}: an HTML comment holding the
# ".. title:" / ".. slug:" metadata lines, followed by the page content.
# NOTE(review): the line starting "There was a problem hiding this comment"
# and the " | ||" column markers look like page-extraction residue rather than
# intended template text — confirm against the original file before relying
# on the literal below.
doc_template = '''<!-- | ||
.. title: {title} | ||
.. slug: {slug} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider also adding |
||
--> | ||
|
||
{content} | ||
''' | ||
|
||
|
||
class CommandImportPage(Command):
    """Import web pages or local HTML files as standalone ``.html`` pages.

    Every page is loaded (over HTTP(S), from a plain path, or from a
    ``file://`` URL), reduced to one content node, and written to
    ``<slug>.html`` in the current working directory using ``doc_template``.
    """

    name = "import_page"
    needs_config = False
    doc_usage = "[options] page_url [page_url,...]"
    doc_purpose = "import arbitrary web pages"

    def _execute(self, options, args):
        """Import every page named in *args*.

        Recognised arguments:

        * ``-s SELECTOR``: CSS selector of the node to import.
        * ``-e EXTRACTOR``: Python expression evaluating to a callable that
          turns the selected node into the output text.
        * anything else: a page URL or a local file path.

        :param options: command options; not used by this method.
        :param args: list of raw command-line arguments; it is not modified.
        :return: ``0`` when every page was imported, ``1`` otherwise.
        """
        if BeautifulSoup is None:
            utils.req_missing(['bs4'], 'use the import_page plugin')
            # Defensive: never continue without bs4, whatever req_missing does.
            return 1

        # Start from the module-level defaults, but keep the working values in
        # locals with distinct names. Assigning to ``selector``/``extractor``
        # here would make them function locals and raise UnboundLocalError
        # whenever the matching option is not given.
        chosen_selector = selector
        chosen_extractor = extractor
        # Copy so that repeated invocations do not accumulate URLs in the
        # shared module-level list.
        page_urls = list(urls)

        # NOTE(review): a reviewer of this change suggested using doit's
        # built-in option support instead of parsing arguments by hand; the
        # hand-rolled parsing is kept so the command line stays unchanged.
        pending = iter(args)
        for arg in pending:
            if arg in ("-s", "-e"):
                value = next(pending, None)
                if value is None:
                    LOGGER.error(f'Option {arg} requires a value.')
                    return 1
                if arg == "-s":
                    chosen_selector = value
                else:
                    chosen_extractor = value
            else:
                page_urls.append(arg)  # Assume it's a page URL

        if not page_urls:
            LOGGER.error('No page URL or file path provided.')
            return 1

        # Keep going after a failure so one bad page does not block the rest,
        # but report the failure through the return value.
        failures = 0
        for url in page_urls:
            if self._import_page(url, chosen_selector, chosen_extractor):
                failures += 1
        return 1 if failures else 0

    def _load_html(self, url):
        """Return the raw bytes behind *url*, or ``None`` after logging why not.

        ``http``/``https`` URLs are downloaded with ``requests``; ``file://``
        URLs are converted to a local path; anything else is treated as a
        local file path as given.
        """
        from urllib.parse import urlparse
        from urllib.request import url2pathname

        parsed = urlparse(url)
        # Exact match: a substring test would also accept unrelated schemes
        # that merely contain "http".
        if parsed.scheme in ('http', 'https'):
            try:
                response = requests.get(url, timeout=30)
            except requests.RequestException as e:
                LOGGER.error(f'Error fetching URL: {url} ({e})')
                return None
            if not (200 <= response.status_code < 300):  # Did not get it
                LOGGER.error(f'Error fetching URL: {url}')
                return None
            return response.content

        # Anything that is not a file:// URL (including Windows drive paths,
        # which parse with a one-letter "scheme") is used verbatim as a path.
        path = url2pathname(parsed.path) if parsed.scheme == 'file' else url
        try:
            with open(path, 'rb') as f:
                return f.read()
        except FileNotFoundError:
            LOGGER.error(f'Error file does not exist: {url}')
        except (OSError, IOError) as e:
            LOGGER.error(f'Error opening file "{url}": {e}')
        return None

    def _import_page(self, url, selector, extractor):
        """Import a single page and write it to ``<slug>.html``.

        :param url: HTTP(S) URL, ``file://`` URL or local path of the page.
        :param selector: CSS selector of the node to import; when falsy, the
            ``p``/``div``/``article``/``section`` element with the most text
            is used.
        :param extractor: Python expression evaluating to a callable that
            receives the node and returns the output text; when falsy, the
            node's prettified markup is used.
        :return: ``0`` on success, ``1`` when the page could not be imported
            (the reason is logged).
        """
        from bs4 import FeatureNotFound

        html = self._load_html(url)
        if html is None:
            return 1

        # bs4 reports a missing parser with FeatureNotFound, not ImportError.
        try:
            soup = BeautifulSoup(html, "lxml")
        except (FeatureNotFound, ImportError):
            soup = BeautifulSoup(html, "html.parser")

        # Collapse whitespace so the title stays on one metadata line.
        title = ' '.join(soup.title.get_text().split()) if soup.title else ''
        if not title:
            title = "Untitled Page"
        try:
            slug = utils.slugify(title, lang='')
        except TypeError:
            # Fallback for a slugify() that does not accept ``lang``.
            slug = utils.slugify(title)
        if not slug:
            # An empty slug would produce a file literally named ".html".
            LOGGER.error(f'Could not derive a file name from the title of "{url}"')
            return 1

        node = None
        if selector:
            node = soup.select_one(selector)
        else:
            candidates = soup.find_all(["p", "div", "article", "section"])
            if candidates:
                node = max(candidates, key=lambda n: len(n.get_text(strip=True)))

        if node is None:  # no content
            LOGGER.error(f'No content found in "{url}"')
            return 1

        if extractor:
            # SECURITY: the extractor is evaluated as arbitrary Python code.
            # Only pass expressions you trust; never forward untrusted input.
            try:
                extract = eval(extractor)
                content = extract(node)
            except Exception as e:
                LOGGER.error(f'Invalid extractor function: {extractor}. Error: {e}')
                return 1
        else:
            content = node.prettify()

        if not content:  # no content
            LOGGER.error(f'No content found in "{url}"')
            return 1

        document = doc_template.format(
            title=title,
            slug=slug,
            content=content
        )
        with open(slug + '.html', 'w', encoding='utf-8') as outf:
            outf.write(document)
        return 0
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
requests | ||
bs4 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this used?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will come up with a better solution,
extractor_module,
so one can use an external file with its own code.