40
40
41
41
# Plugin-wide logger, writing to stderr per Nikola convention.
LOGGER = utils.get_logger('import_page', utils.STDERR_HANDLER)

# NOTE(review): parsing sys.argv at module import time is unusual for a plugin;
# _execute() re-parses its own `args` parameter, so these module-level values
# look like unused scaffolding/leftovers — confirm before relying on them.
args = sys.argv[1:]
selector = None  # example value: 'body' (a CSS selector, normally set via -s)
extractor = None  # example value: 'lambda node: BeautifulSoup(node.decode_contents(), "html.parser").prettify()' (set via -e)
path_or_url = None  # presumably a page URL or local file path — unused below; TODO confirm
43
47
44
48
doc_template = '''<!--
45
49
.. title: {title}
@@ -62,10 +66,27 @@ def _execute(self, options, args):
62
66
"""Import a Page."""
63
67
if BeautifulSoup is None :
64
68
utils .req_missing (['bs4' ], 'use the import_page plugin' )
65
- for url in args :
66
- self ._import_page (url )
67
69
68
- def _import_page (self , url ):
70
+ urls = []
71
+ selector = None
72
+ extractor = None
73
+
74
+ while args :
75
+ arg = args .pop (0 )
76
+ if arg == "-s" and args :
77
+ selector = args .pop (0 )
78
+ elif arg == "-e" and args :
79
+ extractor = args .pop (0 )
80
+ else :
81
+ urls .append (arg ) # Assume it's a page URL
82
+
83
+ if not urls :
84
+ LOGGER .error (f'No page URL or file path provided.' )
85
+
86
+ for url in urls :
87
+ self ._import_page (url , selector , extractor )
88
+
89
    def _import_page(self, url, selector, extractor):
        """Import a single page from *url*.

        url -- an HTTP(S) URL or (presumably) a local file path; TODO confirm
               the non-HTTP branch, which is outside this excerpt.
        selector -- optional CSS selector string locating the content node.
        extractor -- optional Python source text for a callable applied to
                     the selected node to produce the page content.
        """
        parse = requests.utils.urlparse(url)
        # Matches both 'http' and 'https' (substring test on the scheme).
        if 'http' in parse.scheme:
            # NOTE(review): no timeout is passed to requests.get — a hung
            # server will block the import indefinitely; consider confirming
            # upstream whether a timeout should be added.
            r = requests.get(url)
@@ -95,16 +116,36 @@ def _import_page(self, url):
95
116
        # NOTE(review): the matching `try:` is outside this excerpt; this
        # presumably guards slugify() over a title that may not be a string.
        except TypeError:
            slug = utils.slugify(title)

        # Locate the content node: an explicit CSS selector wins; otherwise
        # fall back to a heuristic — the candidate element with the most text.
        node = None
        if selector:
            node = soup.select_one(selector)
        else:
            candidates = soup.find_all(["p", "div", "article", "section"])
            if candidates:
                node = max(candidates, key=lambda n: len(n.get_text(strip=True)))

        if not node:  # no content
            LOGGER.error(f'No content found in "{url}"')
            return 1

        if extractor:
            try:
                # SECURITY: eval() executes arbitrary user-supplied code.
                # Acceptable only because the string comes from the local
                # command line — never feed it untrusted input.
                extractor = eval(extractor)
                # NOTE(review): `extractor` is rebound to the evaluated
                # callable above, so if the *call* (not the eval) fails the
                # error below prints the function repr instead of the
                # original source text the user typed.
                content = extractor(node)
            except Exception as e:
                LOGGER.error(f'Invalid extractor function: {extractor}. Error: {e}')
                return 1
        else:
            # Default: pretty-printed HTML of the selected node.
            content = node.prettify()

        if not content:  # no content
            LOGGER.error(f'No content found in "{url}"')
            return 1

        # Render the page document and write it next to the CWD as slug.html.
        document = doc_template.format(
            title=title,
            slug=slug,
            content=content
        )
        with codecs.open(slug + '.html', 'w+', encoding='utf-8') as outf:
            outf.write(document)
0 commit comments