wikichomp.py
#!/usr/bin/env python3
import wikipedia
import json
from sys import argv
import re
import pdb
from num2words import num2words
from whiptail import Whiptail
import db
# word = argv[1]
# if "/" in word: word = word.split('/')[-1]
def disambiguouizer(disambuchoices, ambiguous_word):
    '''Pop up a whiptail menu so the user can pick which article an ambiguous word means.'''
    whippyGUI = Whiptail()
    whippyGUI.title = "You've found the disambiguouizer"
    whippyGUI.backtitle = ambiguous_word + " may refer to:\n"
    user_choice, exitstatus = whippyGUI.menu(msg='what do you mean by ' + ambiguous_word + '?', items=disambuchoices, prefix='')
    return user_choice
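
# A minimal sketch (not in the original) of how disambiguouizer gets called:
# the wikipedia lib raises DisambiguationError with an .options list, and each
# option string is itself a valid page title to retry with.
#   try:
#       page = wikipedia.page("Mercury", None, False)
#   except wikipedia.DisambiguationError as err:
#       page = wikipedia.page(disambiguouizer(err.options, "Mercury"))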
def wiki_query_prep(word_or_phrase):
    '''Hola, pass me any string and I'll format it as a Wikipedia article title,
    so "legislative chamber" becomes "Legislative_chamber".'''
    if ' ' in word_or_phrase:
        words = word_or_phrase.split(' ')
        wiki_article = words[0].capitalize() + '_' + '_'.join(words[1:])
        return wiki_article
    else:
        return word_or_phrase.capitalize()
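
# Quick examples (illustrative only):
#   wiki_query_prep("legislative chamber")  # -> "Legislative_chamber"
#   wiki_query_prep("ocelot")               # -> "Ocelot"
# Note that capitalize() lowercases everything after the first letter, so an
# all-caps term like "NASA" comes back as "Nasa".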
def make_word_xword(wrd):
    '''Spell out any digits in a clue so it can go in a crossword.'''
    numbers = re.compile('[0-9]')
    if len(numbers.findall(wrd)) > 0:
        for numba in re.findall('[0-9]+', wrd):
            # num2words wants a number, not a string of digits
            wrd = re.sub(numba, num2words(int(numba), to="year"), wrd)
    return wrd
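
# Illustrative results (a sketch; exact wording depends on num2words):
#   make_word_xword("25 or 6 to 4")  # -> "twenty-five or six to four"
#   make_word_xword("1984")          # -> "nineteen eighty-four"
# Crossword grids can't hold digits, hence spelling numbers out.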
def wikipedia_grab_chomp(wikiterm):
    '''I retrieve linked articles, summaries, and picture locations from Wikipedia.
    I used to put the chomped wiki stuff in a dir, but now I put that junk in a sqlite3 db.'''
    words = db.db_query("select * from word where word LIKE ?", wikiterm)
    if len(words) < 1:
        try:
            # auto_suggest=False keeps the wikipedia lib from "correcting" the term
            page = wikipedia.page(wikiterm, None, False)
        except wikipedia.DisambiguationError as disambu_choices:
            # TODO: handle non-CLI contexts here
            #print(f'DisambiguationError {wikiterm}')
            page = wikipedia.page(disambiguouizer(disambu_choices.options, wikiterm))
        wordid = db.db_insert("word", word=wikiterm, url=page.url, summary=page.summary, content=page.content)
        links = list(page.links)
        # for link in page.links:
        #     links.append(link.split('(')[0])
        bad_words = ['LIST', 'ISBN', 'ISDN', 'OCLC', 'LCCN', 'NKVD', 'IMDb', 'ISNI'] # TODO: config
        # this is the proper place to sanitize the links:
        for each_bad_word in bad_words:
            try:
                links.remove(each_bad_word)
            except ValueError:
                pass #print(f'no {each_bad_word} found in word links')
        # iterate over a copy; removing from a list mid-iteration skips entries
        for each_link in list(links):
            if each_link[0:4].upper() in bad_words:
                links.remove(each_link)
        pattern = re.compile(r'[\W_]+') # currently unused
        numbers = re.compile(r'[0-9]') # currently unused
        goodwords = set(links)
        for wrd in list(goodwords):
            # in: clue w/ digits, out: same clue spelled out
            # e.g. '25 or 6 to 4' -> 'twenty-five or six to four'
            goodword = make_word_xword(wrd)
            goodwords.add(goodword)
            if wrd != goodword:
                goodwords.remove(wrd)
        goodwords = list(goodwords)
        # with open('acronym/links/' + wikiterm, 'w') as links:
        #     links.write(json.dumps(goodwords))
        for goodword in goodwords:
            linkid = db.db_insert("word_links", word_id=wordid, link=goodword)
        for image in page.images:
            linkid = db.db_insert("word_images", word_id=wordid, image_url=image)
        # with open('acronym/summary/' + wikiterm, 'w') as summary:
        #     summary.write(json.dumps(page.summary))
        #db.db_insert("word_links",)
        # with open('acronym/images/' + wikiterm, 'w') as images:
        #     images.write(json.dumps(page.images))
        # with open('acronym/content/' + wikiterm, 'w') as content:
        #     content.write(json.dumps(page.content))
#inasrafieldtest = open(".eggspine.txt",'w')
#if __name__ == "__main__": wikipedia_grab_chomp(word)
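
# A hedged sketch of the CLI entry point, reconstructed from the commented-out
# argv handling at the top of this file; the exact invocation is an assumption.
if __name__ == "__main__":
    word = argv[1]
    if "/" in word: word = word.split('/')[-1] # allow pasting a full article URL
    wikipedia_grab_chomp(word)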