-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlanguage.py
186 lines (148 loc) · 6.53 KB
/
language.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
""" Defines the Language class, which allows comparisons between documents.
Written by Colin Hamilton, May 2016
"""
import os
import json
from math import sqrt
from ngramtrie import NGramTrie
def dot_product(lang1, lang2):
    """Return the dot product of two frequency mappings.

    Args:
        lang1: A dict mapping n-grams to numeric frequencies.
        lang2: A dict mapping n-grams to numeric frequencies.

    Returns:
        The sum, over every n-gram present in both mappings, of the
        product of its two frequencies.  Disjoint mappings yield 0.
    """
    # keys() views support set intersection, so we only visit shared keys
    # and avoid shadowing the builtin `sum` with an accumulator variable.
    shared = lang1.keys() & lang2.keys()
    return sum(lang1[key] * lang2[key] for key in shared)
def norm(lang1):
    """Return the norm of the language, ||A|| = sqrt(A*A)."""
    self_similarity = dot_product(lang1, lang1)
    return sqrt(self_similarity)
class Language:
    """A language maintains linguistic statistics, and uses them for comparisons.

    In general, a Language object should be created for each language you wish to
    analyze, including for any unknown languages.  Then add_file should be
    called for each document you wish to analyze.

    n-grams are currently the mechanism used, so the max size of these n-grams
    should be given when the Language object is created.

    To implement your own preprocessing of files (for example, converting
    punctuation and whitespace, dealing with capital vs lowercase letters), this
    class can be subclassed and the transformation functions customized.

    By default, multiple whitespace and nonalphabetic characters are condensed
    into a single space (' ') character.  Letters are converted to a standard
    case using str.casefold().  This is because, in general, and particularly
    for small documents, these features are not considered useful in
    distinguishing languages.
    """

    def __init__(self, n=3):
        """Initializes an NGramTrie with the given max n-gram size (defaults to 3)."""
        self.n_grams = NGramTrie(n)

    # These functions handle transforming characters before counting them.
    # They are intended to be overwritten and customized by subclassing.

    def transform(self, char, gram):
        """Transforms a character read from a document.

        By default, standardizes alphabetic characters with standardize().
        Transforms nonalphabetic characters with transform_nonalpha().  If the
        result is a space, it is kept only if the previous n-gram did not
        already end with that space (otherwise the character is dropped).
        This has the effect of condensing runs of nonalphabetic characters
        into a single delimiter between words.

        Args:
            char: The character read from the document.
            gram: The previous n-gram extracted from the document.

        Returns:
            A string, representing the transformed character.  This is often
            the same as the input character.  It may be the empty string,
            meaning the character should be ignored.
        """
        if not char.isalpha():
            char = self.transform_nonalpha(char)
        # Drop a delimiter that would merely repeat the one already at the
        # end of the current gram (condenses whitespace runs).
        if char.isspace() and gram.endswith(char):
            return ""
        else:
            return self.standardize(char)

    def transform_nonalpha(self, char):
        """Called when char is nonalphabetic; maps it to a single-space delimiter."""
        return " "

    def standardize(self, char):
        """Called on alphabetic characters; normalizes case via casefold()."""
        return char.casefold()

    def first_gram(self):
        """Called at the start of a file; returns an initial (typically delimiter) string."""
        return " "

    def last_gram(self):
        """Called at the end of a file; returns a final (typically delimiter) string."""
        return " "

    def add_file(self, filename):
        """Analyses the given file, integrating it into the Language.

        Each character from the file is transformed with the transform()
        method.  These characters are then compiled into n-grams, and counted
        accordingly (every complete n_max-length gram is added to the trie).

        Args:
            filename: A string with the name of the document to analyse.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        trie = self.n_grams

        def _update(gram, char):
            # Append the (possibly empty) transformed char, keep only the
            # trailing n_max characters, and count the gram once it is full.
            gram = gram + self.transform(char, gram)
            gram = gram[-min(len(gram), trie.n_max):]
            if len(gram) == trie.n_max:
                trie.add(gram)
            return gram

        gram = ""
        for char in self.first_gram():
            gram = _update(gram, char)
        with open(filename, "r") as file:
            for line in file:
                for char in line:
                    gram = _update(gram, char)
        for char in self.last_gram():
            gram = _update(gram, char)
        # NOTE: the trie is mutated in place; no reassignment of self.n_grams
        # is needed (the original redundantly re-bound the same object).

    def read_cache(self, cache, expected=None):
        """Restores n-gram counts from a cache dict produced by to_cache().

        Args:
            cache: A dict with keys "files" (filename -> mtime), "n" (max
                n-gram size), and "grams" (gram -> count).
            expected: Optional iterable of filenames the cache should cover.
                (Previously a mutable default `[]`; now None to avoid the
                shared-mutable-default pitfall.)

        Returns:
            True if the cache was valid and loaded; False if it is stale
            (a cached file has been modified since) or malformed.
        """
        if expected is None:
            expected = []
        found = {}  # Should probably be done in language_match
        for file in expected:
            found[file] = False
        if "files" in cache:
            for file in cache["files"]:
                if os.path.getmtime(file) > cache["files"][file]:
                    return False  # A more recent version is available
                found[file] = True
        if "n" not in cache:
            return False
        self.n_grams = NGramTrie(cache["n"])
        if "grams" not in cache:
            return False
        for gram in cache["grams"]:
            self.n_grams.add(gram, count=cache["grams"][gram])
        return True

    def to_cache(self, files):
        """Serializes this Language to a dict suitable for read_cache().

        Args:
            files: Iterable of filenames whose modification times are recorded
                so a later read_cache() can detect staleness.

        Returns:
            A dict with keys "files", "n", and "grams".
        """
        cache = {
            "files": {file: os.path.getmtime(file) for file in files},
            "n": self.n_grams.n_max,
            "grams": self.n_grams.gram_counts(),
        }
        return cache

    def __str__(self):
        """A string representation of sorted n-gram frequencies of the Language."""
        grams = self.n_grams.frequencies()
        # join() avoids the quadratic `string +=` pattern and the trailing
        # "\n" trim of the original implementation.
        lines = ["'" + gram + "' " + str(freq)
                 for gram, freq in sorted(grams.items(), key=lambda x: x[1])]
        return "\n".join(lines)

    def compare(self, other):
        """Compares self to other via cosine similarity of n-gram frequencies.

        Args:
            other: Another Language object to compare to.

        Returns:
            A number between 0 and 1 representing how closely correlated
            they are; close to 1 means strong correlation, while close to 0
            means virtually unrelated.  For n=3, two objects of the same
            language will typically have a correlation between 0.8 and 0.95.
            Returns 0 if either language has no recorded n-grams.
        """
        lang1 = self.n_grams.frequencies()
        lang2 = other.n_grams.frequencies()
        denom = norm(lang1) * norm(lang2)
        if denom == 0:
            return 0
        else:
            return dot_product(lang1, lang2) / denom

    def predict_next_char(self, start, random=True):
        """Predicts the character following `start`.

        Args:
            start: The preceding string context.
            random: If True, sample randomly per the trie's distribution;
                otherwise return the single most likely next character.
        """
        if random:
            return self.n_grams.next_random(start)
        else:
            return self.n_grams.next_most_likely(start)