Skip to content

Commit ab15bed

Browse files
committed
introduce selector and extractor; fix requirements.txt
1 parent e8a5507 commit ab15bed

File tree

2 files changed

+50
-9
lines changed

2 files changed

+50
-9
lines changed

v8/import_page/import_page.py

+49-8
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,10 @@
4040

4141
LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER)
4242

43+
args = sys.argv[1:]
44+
selector = None # 'body'
45+
extractor = None # 'lambda node: BeautifulSoup(node.decode_contents(), "html.parser").prettify()'
46+
path_or_url = None
4347

4448
doc_template = '''<!--
4549
.. title: {title}
@@ -62,10 +66,27 @@ def _execute(self, options, args):
6266
"""Import a Page."""
6367
if BeautifulSoup is None:
6468
utils.req_missing(['bs4'], 'use the import_page plugin')
65-
for url in args:
66-
self._import_page(url)
6769

68-
def _import_page(self, url):
70+
urls = []
71+
selector = None
72+
extractor = None
73+
74+
while args:
75+
arg = args.pop(0)
76+
if arg == "-s" and args:
77+
selector = args.pop(0)
78+
elif arg == "-e" and args:
79+
extractor = args.pop(0)
80+
else:
81+
urls.append(arg) # Assume it's a page URL
82+
83+
if not urls:
84+
LOGGER.error(f'No page URL or file path provided.')
85+
86+
for url in urls:
87+
self._import_page(url, selector, extractor)
88+
89+
def _import_page(self, url, selector, extractor):
6990
parse = requests.utils.urlparse(url)
7091
if 'http' in parse.scheme:
7192
r = requests.get(url)
@@ -95,16 +116,36 @@ def _import_page(self, url):
95116
except TypeError:
96117
slug = utils.slugify(title)
97118

98-
candidates = soup.find_all(["p", "div", "article", "section"])
99-
if candidates:
100-
node = max(candidates, key=lambda n: len(n.get_text(strip=True)))
119+
node = None
120+
if selector:
121+
node = soup.select_one(selector)
122+
else:
123+
candidates = soup.find_all(["p", "div", "article", "section"])
124+
if candidates:
125+
node = max(candidates, key=lambda n: len(n.get_text(strip=True)))
126+
127+
if not node: # no content
128+
LOGGER.error(f'No content found in "{url}"')
129+
return 1
130+
131+
if extractor:
132+
try:
133+
extractor = eval(extractor)
134+
content = extractor(node)
135+
except Exception as e:
136+
LOGGER.error(f'Invalid extractor function: {extractor}. Error: {e}')
137+
return 1
101138
else:
102-
node = None # empty
139+
content = node.prettify()
140+
141+
if not content: # no content
142+
LOGGER.error(f'No content found in "{url}"')
143+
return 1
103144

104145
document = doc_template.format(
105146
title=title,
106147
slug=slug,
107-
content=node.prettify()
148+
content=content
108149
)
109150
with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
110151
outf.write(document)

v8/import_page/requirements.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
11
requests
2-
libextract
2+
bs4

0 commit comments

Comments
 (0)