diff --git a/v7/import_page/import_page.py b/v7/import_page/import_page.py
deleted file mode 100644
index 618d6bfc..00000000
--- a/v7/import_page/import_page.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright © 2015 Roberto Alsina and others
-
-# Permission is hereby granted, free of charge, to any
-# person obtaining a copy of this software and associated
-# documentation files (the "Software"), to deal in the
-# Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish,
-# distribute, sublicense, and/or sell copies of the
-# Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice
-# shall be included in all copies or substantial portions of
-# the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
-# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
-# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
-# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
-# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-from __future__ import unicode_literals, print_function
-
-import codecs
-
-try:
-    import libextract.api
-except ImportError:
-    libextract = None
-import lxml.html
-import requests
-import sys
-
-from nikola.plugin_categories import Command
-from nikola import utils
-
-LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER)
-
-
-doc_template = '''<!--
-.. title: {title}
-.. slug: {slug}
--->
-
-{content}
-'''
-
-
-class CommandImportPage(Command):
-    """Import a Page."""
-
-    name = "import_page"
-    needs_config = False
-    doc_usage = "[options] page_url [page_url,...]"
-    doc_purpose = "import arbitrary web pages"
-
-    def _execute(self, options, args):
-        """Import a Page."""
-        if libextract is None:
-            utils.req_missing(['libextract'], 'use the import_page plugin')
-        for url in args:
-            self._import_page(url)
-
-    def _import_page(self, url):
-        r = requests.get(url)
-        if 199 < r.status_code < 300:  # Got it
-            # Use the page's title
-            doc = lxml.html.fromstring(r.content)
-            title = doc.find('*//title').text
-            if sys.version_info[0] == 2 and isinstance(title, str):
-                title = title.decode('utf-8')
-            try:
-                slug = utils.slugify(title, lang='')
-            except TypeError:
-                slug = utils.slugify(title)
-            nodes = list(libextract.api.extract(r.content))
-            # Let's assume the node with more text is the good one
-            lengths = [len(n.text_content()) for n in nodes]
-            node = nodes[lengths.index(max(lengths))]
-            document = doc_template.format(
-                title=title,
-                slug=slug,
-                content=lxml.html.tostring(node, encoding='utf8', method='html', pretty_print=True).decode('utf8')
-            )
-            with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
-                outf.write(document)
-
-        else:
-            LOGGER.error('Error fetching URL: {}'.format(url))
diff --git a/v7/import_page/requirements.txt b/v7/import_page/requirements.txt
deleted file mode 100644
index e2c47fd9..00000000
--- a/v7/import_page/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-requests
-libextract
diff --git a/v7/import_page/README.md b/v8/import_page/README.md
similarity index 76%
rename from v7/import_page/README.md
rename to v8/import_page/README.md
index 0a1ec271..3dbc4c3e 100644
--- a/v7/import_page/README.md
+++ b/v8/import_page/README.md
@@ -1,4 +1,4 @@
-Plugin to import arbitrary web pages.
+Plugin to import arbitrary web pages (from a URL or a local file).
 
 Usage:
 
diff --git a/v7/import_page/conf.py.sample b/v8/import_page/conf.py.sample
similarity index 100%
rename from v7/import_page/conf.py.sample
rename to v8/import_page/conf.py.sample
diff --git a/v7/import_page/import_page.plugin b/v8/import_page/import_page.plugin
similarity index 100%
rename from v7/import_page/import_page.plugin
rename to v8/import_page/import_page.plugin
diff --git a/v8/import_page/import_page.py b/v8/import_page/import_page.py
new file mode 100644
index 00000000..5561bb61
--- /dev/null
+++ b/v8/import_page/import_page.py
@@ -0,0 +1,150 @@
+# -*- coding: utf-8 -*-
+
+# Copyright © 2025 Roberto Alsina and others
+
+# Permission is hereby granted, free of charge, to any
+# person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the
+# Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the
+# Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice
+# shall be included in all copies or substantial portions of
+# the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
+# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
+# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+from __future__ import unicode_literals, print_function
+
+import codecs
+
+try:
+    from bs4 import BeautifulSoup, FeatureNotFound
+except ImportError:
+    BeautifulSoup = None
+    FeatureNotFound = None
+import requests
+
+from nikola.plugin_categories import Command
+from nikola import utils
+
+LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER)
+
+doc_template = '''<!--
+.. title: {title}
+.. slug: {slug}
+-->
+
+{content}
+'''
+
+
+class CommandImportPage(Command):
+    """Import a Page."""
+
+    name = "import_page"
+    needs_config = False
+    doc_usage = "[-s selector] [-e extractor] page_url_or_path [page_url_or_path,...]"
+    doc_purpose = "import arbitrary web pages"
+
+    def _execute(self, options, args):
+        """Import a Page."""
+        if BeautifulSoup is None:
+            utils.req_missing(['bs4'], 'use the import_page plugin')
+
+        selector = None  # e.g. 'body'
+        extractor = None  # e.g. 'lambda node: BeautifulSoup(node.decode_contents(), "html.parser").prettify()'
+        urls = []
+        while args:
+            arg = args.pop(0)
+            if arg == "-s" and args:
+                selector = args.pop(0)
+            elif arg == "-e" and args:
+                extractor = args.pop(0)
+            else:
+                urls.append(arg)  # Assume it's a page URL or file path
+
+        if not urls:
+            LOGGER.error('No page URL or file path provided.')
+            return 1
+
+        for url in urls:
+            self._import_page(url, selector, extractor)
+
+    def _import_page(self, url, selector, extractor):
+        parse = requests.utils.urlparse(url)
+        if parse.scheme in ('http', 'https'):
+            r = requests.get(url)
+            if not (199 < r.status_code < 300):  # Did not get it
+                LOGGER.error(f'Error fetching URL: {url}')
+                return 1
+            html = r.content
+        else:
+            try:
+                with open(url, 'rb') as f:
+                    html = f.read()
+            except FileNotFoundError:
+                LOGGER.error(f'Error: file does not exist: {url}')
+                return 1
+            except OSError as e:
+                LOGGER.error(f'Error opening file "{url}": {e}')
+                return 1
+
+        # Prefer the faster lxml parser; fall back to the stdlib one
+        try:
+            soup = BeautifulSoup(html, "lxml")
+        except FeatureNotFound:
+            soup = BeautifulSoup(html, "html.parser")
+
+        title = soup.title.text if soup.title else "Untitled Page"
+        try:
+            slug = utils.slugify(title, lang='')
+        except TypeError:
+            slug = utils.slugify(title)
+
+        node = None
+        if selector:
+            node = soup.select_one(selector)
+        else:
+            # Let's assume the node with the most text is the good one
+            candidates = soup.find_all(["p", "div", "article", "section"])
+            if candidates:
+                node = max(candidates, key=lambda n: len(n.get_text(strip=True)))
+
+        if not node:  # no content
+            LOGGER.error(f'No content found in "{url}"')
+            return 1
+
+        if extractor:
+            try:
+                # The -e argument is a Python expression that must
+                # evaluate to a callable taking the selected node.
+                extractor = eval(extractor)
+                content = extractor(node)
+            except Exception as e:
+                LOGGER.error(f'Invalid extractor function: {extractor}. Error: {e}')
+                return 1
+        else:
+            content = node.prettify()
+
+        if not content:  # no content
+            LOGGER.error(f'No content found in "{url}"')
+            return 1
+
+        document = doc_template.format(
+            title=title,
+            slug=slug,
+            content=content
+        )
+        with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
+            outf.write(document)
diff --git a/v8/import_page/requirements.txt b/v8/import_page/requirements.txt
new file mode 100644
index 00000000..dc1536fb
--- /dev/null
+++ b/v8/import_page/requirements.txt
@@ -0,0 +1,2 @@
+requests
+bs4
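
Note: a minimal sketch of how the new -s/-e options fit together, assuming beautifulsoup4 is installed; the HTML snippet, selector, and URL below are illustrative, not part of the patch:

    from bs4 import BeautifulSoup

    html = '<html><head><title>Demo</title></head><body><div class="post"><p>Hello <b>world</b></p></div></body></html>'
    soup = BeautifulSoup(html, "html.parser")

    # -s SELECTOR: a CSS selector picks the content node (soup.select_one above)
    node = soup.select_one("div.post")

    # -e EXTRACTOR: a Python expression eval()'d into a callable applied to the node
    extractor = eval('lambda node: node.decode_contents()')
    print(extractor(node))  # -> <p>Hello <b>world</b></p>

This would correspond to an invocation like (hypothetical URL):

    nikola import_page -s "div.post" -e "lambda node: node.decode_contents()" https://example.com/page.html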