-
Notifications
You must be signed in to change notification settings - Fork 99
/
Copy pathimport_page.py
111 lines (92 loc) · 3.52 KB
/
import_page.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# -*- coding: utf-8 -*-
# Copyright © 2025 Roberto Alsina and others
# Permission is hereby granted, free of charge, to any
# person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the
# Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the
# Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice
# shall be included in all copies or substantial portions of
# the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
# PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from __future__ import unicode_literals, print_function
import codecs
try:
from bs4 import BeautifulSoup
except ImportError:
BeautifulSoup = None
import requests
import sys
from nikola.plugin_categories import Command
from nikola import utils
# Module-level logger named 'import_page', obtained through nikola.utils.
LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER)
# Skeleton of every generated page: an HTML comment holding the ``title`` and
# ``slug`` metadata fields, followed by the extracted ``content``. It is
# filled in with str.format() using exactly those three keyword names.
doc_template = '''<!--
.. title: {title}
.. slug: {slug}
-->
{content}
'''
class CommandImportPage(Command):
    """Import arbitrary web pages (or local HTML files) as pages.

    For each argument the HTML is fetched over HTTP(S) or read from disk, the
    document title and the element holding the most text are extracted, and
    the result is written to ``<slug>.html`` (a path relative to the current
    working directory).
    """

    name = "import_page"
    needs_config = False
    doc_usage = "[options] page_url [page_url,...]"
    doc_purpose = "import arbitrary web pages"

    def _execute(self, options, args):
        """Import every page listed in *args*.

        Every argument is attempted even if an earlier one failed.

        Returns:
            0 if all pages were imported, 1 if at least one import failed.
        """
        if BeautifulSoup is None:
            utils.req_missing(['bs4'], 'use the import_page plugin')

        failed = False
        for url in args:
            # _import_page returns a truthy error code on failure.
            if self._import_page(url):
                failed = True
        return 1 if failed else 0

    def _read_html(self, url):
        """Return the markup behind *url*, or None after logging an error.

        *url* is fetched with requests when its scheme is http/https and is
        otherwise treated as a local file path. The result is ``bytes``
        unless the HTTP response declared a non-UTF-8 encoding, in which case
        it is already-decoded ``str``.
        """
        parse = requests.utils.urlparse(url)
        # Exact match: a substring test would also accept odd schemes that
        # merely contain "http".
        if parse.scheme in ('http', 'https'):
            try:
                r = requests.get(url)
            except requests.RequestException as e:
                LOGGER.error(f'Error fetching URL "{url}": {e}')
                return None
            if not 200 <= r.status_code < 300:  # Did not get it
                LOGGER.error(f'Error fetching URL: {url}')
                return None
            encoding = r.encoding
            if encoding and 'utf-8' not in encoding.lower():
                try:
                    # Hand the parser decoded text. Re-encoding it to UTF-8
                    # bytes would let the parser re-detect the encoding from
                    # a <meta charset> declaration that no longer matches.
                    return r.content.decode(encoding)
                except (LookupError, UnicodeDecodeError):
                    # Unknown or wrong declared encoding: fall back to the
                    # raw bytes and let the parser detect the encoding.
                    pass
            return r.content

        try:
            with open(url, 'rb') as f:
                return f.read()
        except FileNotFoundError:
            LOGGER.error(f'Error file does not exist: {url}')
        except OSError as e:
            LOGGER.error(f'Error opening file "{url}": {e}')
        return None

    def _make_soup(self, html):
        """Parse *html* with lxml, falling back to the built-in html.parser."""
        # Imported here because bs4 is an optional dependency of this module.
        from bs4 import FeatureNotFound

        try:
            return BeautifulSoup(html, "lxml")
        except (ImportError, FeatureNotFound):
            # bs4 reports a missing parser with FeatureNotFound rather than
            # ImportError; ImportError is kept for backward compatibility.
            return BeautifulSoup(html, "html.parser")

    def _import_page(self, url):
        """Import a single page and write it to ``<slug>.html``.

        Returns:
            1 if the page could not be fetched or read, None on success.
        """
        html = self._read_html(url)
        if html is None:
            return 1

        soup = self._make_soup(html)

        # Collapse whitespace so the title fits on the single metadata line,
        # and fall back to a placeholder for missing or blank titles.
        raw_title = soup.title.text if soup.title else ""
        title = " ".join(raw_title.split()) or "Untitled Page"
        try:
            slug = utils.slugify(title, lang='')
        except TypeError:
            # slugify() implementations without a ``lang`` parameter.
            slug = utils.slugify(title)

        # Heuristic: the element containing the most text is the main content.
        candidates = soup.find_all(["p", "div", "article", "section"])
        if candidates:
            content = max(
                (node.get_text(strip=True) for node in candidates),
                key=len,
            )
        else:
            LOGGER.warning(f'No content found in: {url}')
            content = ''

        document = doc_template.format(
            title=title,
            slug=slug,
            content=content,
        )
        with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
            outf.write(document)