diff --git a/semanticizest/__init__.py b/semanticizest/__init__.py
index 8dee4bf..5900511 100644
--- a/semanticizest/__init__.py
+++ b/semanticizest/__init__.py
@@ -1 +1,3 @@
 from ._version import __version__
+
+from ._semanticizer import Semanticizer
diff --git a/semanticizest/_semanticizer.py b/semanticizest/_semanticizer.py
new file mode 100644
index 0000000..265ee32
--- /dev/null
+++ b/semanticizest/_semanticizer.py
@@ -0,0 +1,51 @@
+from collections import defaultdict
+import operator
+
+import six
+
+from semanticizest._util import ngrams_with_pos, tosequence
+
+
+class Semanticizer(object):
+    def __init__(self, link_count, N=7):
+        commonness = defaultdict(list)
+
+        for (target, anchor), count in six.iteritems(link_count):
+            commonness[anchor].append((target, count))
+        for anchor, targets in six.iteritems(commonness):
+            targets.sort(key=operator.itemgetter(1))
+
+            # Turn counts into probabilities.
+            # XXX should we preserve the counts as well?
+            total = float(sum(count for _, count in targets))
+            commonness[anchor] = [(t, count / total) for t, count in targets]
+
+        self.commonness = commonness
+        self.N = N
+
+    def all_candidates(self, s):
+        """Retrieve all candidate entities.
+
+        Parameters
+        ----------
+        s : {string, iterable over string}
+            Tokens. If a string, it will be tokenized using a naive heuristic.
+
+        Returns
+        -------
+        candidates : iterable over (int, int, string, float)
+            Candidate entities: 4-tuples of start index, end index
+            (both in tokenized input), target entity and probability
+            (commonness).
+        """
+
+        if isinstance(s, six.string_types):
+            # XXX need a smarter tokenizer!
+            s = s.split()
+        else:
+            s = tosequence(s)
+
+        for i, j, ngram in ngrams_with_pos(s, self.N):
+            if ngram in self.commonness:
+                for target, prob in self.commonness[ngram]:
+                    yield i, j, target, prob
diff --git a/semanticizest/_util.py b/semanticizest/_util.py
index 4dcf2b3..4fc1c61 100644
--- a/semanticizest/_util.py
+++ b/semanticizest/_util.py
@@ -1,15 +1,25 @@
+from collections import Sequence
+
 from six.moves import xrange
 from six.moves.urllib.parse import quote
 
 
-def ngrams(lst, N):
+def ngrams_with_pos(lst, N):
-    """Generate n-grams for 1 <= n <= N from lst."""
+    """Generate (start, end, n-gram) triples for 1 <= n <= N from lst."""
     join = " ".join
-    for n in xrange(N):
-        for start in xrange(len(lst) - n):
-            yield join(lst[start:start + n + 1])
+    for start in xrange(len(lst)):
+        for n in xrange(1, 1 + min(N, len(lst) - start)):
+            yield start, start + n, join(lst[start:start + n])
+
+
+def ngrams(lst, N):
+    return (text for _, _, text in ngrams_with_pos(lst, N))
+
+
+def tosequence(x):
+    """Cast x to sequence. Returns x if at all possible."""
+    return x if isinstance(x, Sequence) else list(x)
 
 
 def url_from_title(title, wiki):
diff --git a/semanticizest/tests/test_semanticizer.py b/semanticizest/tests/test_semanticizer.py
new file mode 100644
index 0000000..62f86b7
--- /dev/null
+++ b/semanticizest/tests/test_semanticizer.py
@@ -0,0 +1,25 @@
+import os.path
+import re
+
+from nose.tools import assert_less_equal
+from semanticizest import Semanticizer
+from semanticizest._wiki_dump_parser import parse_dump
+
+
+def test_semanticizer():
+    here = os.path.dirname(os.path.abspath(__file__))
+    dump = os.path.join(here, 'nlwiki-20140927-pages-articles-sample.xml')
+    link_count, ngram_count = parse_dump(dump, N=2)
+    sem = Semanticizer(link_count)
+
+    text = """Aangezien de aarde een planeet is, kunnen de aardwetenschappen
+ook als een tak van de planetologie beschouwd worden. Aardwetenschappelijke
+kennis, met name geomorfologie, wordt bijvoorbeeld ook toegepast voor de
+zoektocht naar sporen van water, sneeuw en ijs op de planeet Mars."""
+    tokens = re.split(r'\W+', text)
+
+    expected = set(['planeet', 'planetologie', 'kennis (wetenschap)',
+                    'geomorfologie', 'Mars (planeet)'])
+    concepts = set(string for _, _, string, _ in sem.all_candidates(tokens))
+
+    assert_less_equal(expected, concepts)
diff --git a/semanticizest/tests/test_util.py b/semanticizest/tests/test_util.py
index e405211..beef3a6 100644
--- a/semanticizest/tests/test_util.py
+++ b/semanticizest/tests/test_util.py
@@ -1,8 +1,8 @@
 from collections import Counter
 
-from semanticizest._util import ngrams, url_from_title
+from semanticizest._util import ngrams, ngrams_with_pos, url_from_title
 
-from nose.tools import assert_equal, assert_true
+from nose.tools import assert_equal, assert_in, assert_true
 
 
 def test_ngrams():
@@ -15,6 +15,10 @@ def test_ngrams():
     assert_equal(set(ng), expected)
     assert_true(all(freq == 1 for freq in ng.values()))
 
+    with_pos = list(ngrams_with_pos(text, N=2))
+    assert_in((0, 2, 'Hello ,'), with_pos)
+    assert_in((1, 3, ', world'), with_pos)
+
 
 def test_url_from_title():
     """Test article title -> Wikipedia URL conversion."""
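
Note on the reworked helper in semanticizest/_util.py: ngrams_with_pos yields
(start, end, text) triples where end is exclusive, so lst[start:end] recovers
the n-gram's tokens, and ngrams keeps its old strings-only contract on top of
it. A minimal sketch of the behavior under this patch; the token list is made
up for illustration:

    from __future__ import print_function  # so the output below matches on Py2

    from semanticizest._util import ngrams, ngrams_with_pos

    # Made-up token list, purely for illustration.
    tokens = ['Hello', ',', 'world']

    # Triples are grouped by start position, shortest n-gram first.
    for start, end, text in ngrams_with_pos(tokens, N=2):
        print(start, end, text)
    # 0 1 Hello
    # 0 2 Hello ,
    # 1 2 ,
    # 1 3 , world
    # 2 3 world

    # ngrams() still yields just the joined strings.
    assert list(ngrams(tokens, N=1)) == ['Hello', ',', 'world']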
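
And an end-to-end sketch of the new public API. The link_count mapping below
is an invented stand-in for the (target, anchor) -> count statistics that
parse_dump produces; the numbers are toy values, not real dump counts:

    from __future__ import print_function

    from semanticizest import Semanticizer

    # Toy link statistics: (target_page, anchor_text) -> count.
    link_count = {
        ('Mars (planeet)', 'Mars'): 8,
        ('Mars (mythologie)', 'Mars'): 2,
        ('Planeet', 'planeet'): 5,
    }

    sem = Semanticizer(link_count)

    # A plain string is naively split on whitespace; candidates come back as
    # (start, end, target, commonness) with token-level offsets.
    for start, end, target, prob in sem.all_candidates('de planeet Mars'):
        print(start, end, target, prob)
    # 1 2 Planeet 1.0
    # 2 3 Mars (mythologie) 0.2
    # 2 3 Mars (planeet) 0.8

Because __init__ sorts each anchor's targets by ascending count before
normalizing, the most common sense of an ambiguous anchor ('Mars (planeet)'
here) is yielded last; callers that want the best sense first can sort on the
returned probability.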