-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodified_parser.py
More file actions
129 lines (101 loc) · 4 KB
/
modified_parser.py
File metadata and controls
129 lines (101 loc) · 4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import argparse
import sys
import logging
import os
from parser.helper import infer_edition_from_url, get_html_tree_from_string, get_html_tree_from_url
import importlib
if sys.version_info[0:3] >= (3, 0, 0): # python 3 (tested)
from zim.zimpy_p3 import ZimFile
else: # python 2 (not tested)
from zim.zimpy_p2 import ZimFile
def setup_logger():
    """Route root-logger output to ``log/parser.log`` at DEBUG level.

    The ``log/`` directory is created relative to the current working
    directory when it does not exist yet.

    Raises:
        OSError: if ``log/`` cannot be created and is not already a directory.
    """
    # Try to create the directory and tolerate "already exists" instead of
    # checking first: a check-then-create pair can fail when the directory
    # appears between the two calls.
    try:
        os.mkdir('log/')
    except OSError:
        if not os.path.isdir('log/'):
            raise
    LOG_FILENAME = "log/parser.log"
    logging.basicConfig(filename=LOG_FILENAME, level=logging.DEBUG)
# Registry of known parsers, filled by import_all_parsers() and, on demand,
# by get_parser().
# key: Wiktionary edition code
# value: parser class (not parser instance)
parsers = {}
# Column names of the output; test_zim() and test_html() print them joined
# by commas as a header line before parsing any page.
headers = ['head_lang', 'pos', 'FW', 'english_translation']
def import_all_parsers():
    """Eagerly import every listed parser module and register its class.

    For each edition code the module ``parser.parse_<code>`` is imported and
    its ``<Code>Parser`` class is stored in the module-level ``parsers``
    dict under that code.
    """
    edition_codes = ('ja', 'vi', 'tr', 'fr', 'ru', 'uz', 'de', 'az', 'nl', 'en')
    for code in edition_codes:
        parser_module = importlib.import_module('.parse_' + code, package='parser')
        class_name = code.capitalize() + "Parser"
        parsers[code] = getattr(parser_module, class_name)
def get_parser(edition):
    """Return a new parser instance for a Wiktionary edition, or ``None``.

    The parser class is looked up in the module-level ``parsers`` dict. When
    it is not registered yet, the module ``parser.parse_<edition>`` is
    imported and its ``<Edition>Parser`` class is cached there.

    Args:
        edition: Wiktionary edition code, e.g. ``'en'``.

    Returns:
        A freshly constructed parser object, or ``None`` when the module or
        the class inside it cannot be found.
    """
    if edition not in parsers:
        module_to_import = '.parse_' + edition
        try:
            module = importlib.import_module(module_to_import, package='parser')
            parser_class = getattr(module, edition.capitalize() + "Parser")
        except (ImportError, AttributeError) as e:
            # Only "module missing" / "class missing" mean "no parser for this
            # edition"; any other error is a real bug and should surface.
            # Report through logging rather than print() so the diagnostic
            # does not end up inside the CSV written to stdout.
            logging.warning("No parser available for edition %r: %s", edition, e)
            return None
        parsers[edition] = parser_class
    # instantiate the class
    return parsers[edition]()
def read_zim_file(file):
    """Yield the UTF-8 decoded body of each non-empty main article.

    Only entries whose namespace is ``b'A'`` (the main articles) are read.
    Redirects are not followed, and entries with an empty body are skipped.

    Args:
        file: an opened ZIM file object providing ``articles()`` and
            ``get_article_by_index()``.
    """
    main_namespace = b'A'
    for entry in file.articles():
        if entry['namespace'] == main_namespace:
            content = file.get_article_by_index(entry['index'], follow_redirect=False)[0]
            if content:
                yield content.decode('utf-8')
def test_zim(filename, edition=None):
    """Parse every main article stored in a ZIM file.

    The header line is printed first. If no parser exists for the edition, a
    message naming the language codes is printed and nothing is parsed.

    Args:
        filename: path of the ZIM file to open.
        edition: Wiktionary edition code to use. When omitted, it is derived
            from the ``language`` entry of the ZIM metadata.
    """
    zim = ZimFile(filename=filename)
    edition_lang_code = zim.metadata()['language'].decode('utf-8')
    if not edition:
        # The conversion table is only needed when the edition is not given.
        import parser.lang_code_conversion as languages
        edition_wikt_code = languages.get_wikt_code_from_iso639_3(edition_lang_code)
    else:
        edition_wikt_code = edition
    print(','.join(headers))
    page_parser = get_parser(edition_wikt_code)
    if page_parser is None:
        print("We don't have a parser for {}/{} language yet.".format(edition_lang_code, edition_wikt_code))
        return
    for page in read_zim_file(zim):
        page_parser.parse_page(get_html_tree_from_string(page))
def test_html(filename, edition=None):
    """Parse every page whose URL is listed in a text file.

    The file holds one URL per line; blank lines are ignored. The header line
    is printed once a parser has been found, then each URL is loaded with
    ``get_html_tree_from_url`` and handed to the parser.

    Args:
        filename: path of the text file containing the URLs.
        edition: Wiktionary edition code to use. When omitted, it is inferred
            from the first URL in the file.
    """
    with open(filename) as url_file:
        stripped_lines = (line.strip() for line in url_file.read().splitlines())
        url_list = [url for url in stripped_lines if url]
    if not url_list:
        # Without this guard an empty file fails with IndexError on
        # url_list[0] below.
        print("No urls found in {}.".format(filename))
        return
    if edition is None:
        edition = infer_edition_from_url(url_list[0])
    parser = get_parser(edition)
    if parser is None:
        # get_parser() returns None for unsupported editions; calling
        # parse_page on it would raise AttributeError.
        print("We don't have a parser for {} language yet.".format(edition))
        return
    print(','.join(headers))
    for url in url_list:
        soup = get_html_tree_from_url(url)
        parser.parse_page(soup)
def main():
    """Command-line entry point: parse the arguments and run one mode.

    Exactly one input option is required:
      --zim / -z        parse the pages stored in a ZIM file (test_zim)
      --url_list / -ul  parse the pages whose URLs are listed in a file
                        (test_html)
      --url_zim / -uz   use a ZIM file as the source of URLs

    ``--edition`` / ``-e`` optionally forces the language edition.
    """
    setup_logger()
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--url_zim', '-uz', help='use a zim file as the source of urls and get html from the Internet')
    group.add_argument('--url_list', '-ul', help='use a file containing a list of urls and get html from the Internet')
    group.add_argument('--zim', '-z', help='use the zim file as input instead of html')
    parser.add_argument('--edition', '-e', help='explicitly specify the language edition, for either html or zim')
    args = parser.parse_args()
    if args.zim:
        test_zim(args.zim, args.edition)
    elif args.url_list:
        test_html(args.url_list, args.edition)
    elif args.url_zim:
        # This module neither defines nor imports use_url_zim, so calling it
        # directly ends in a NameError traceback. Report a usage error
        # instead, but still run the handler if one is ever provided.
        try:
            handler = use_url_zim
        except NameError:
            parser.error('--url_zim is not implemented yet')
        else:
            handler(args.url_zim, args.edition)


# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()