Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions semanticizest/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from ._version import __version__

from ._semanticizer import Semanticizer
51 changes: 51 additions & 0 deletions semanticizest/_semanticizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from collections import defaultdict
import operator

import six

from semanticizest._util import ngrams_with_pos, tosequence


class Semanticizer(object):
    """Entity linker based on link statistics from a Wikipedia dump.

    Parameters
    ----------
    link_count : mapping from (target, anchor) to int
        For each (target page, anchor text) pair, the number of links
        with that anchor text pointing to that target.
    N : int, optional
        Maximum n-gram length considered when looking up anchors
        (default 7).
    """

    def __init__(self, link_count, N=7):
        commonness = defaultdict(list)

        # Group per-(target, anchor) counts by anchor text.
        for (target, anchor), count in six.iteritems(link_count):
            commonness[anchor].append((target, count))
        for anchor, targets in six.iteritems(commonness):
            targets.sort(key=operator.itemgetter(1))

            # Turn counts into probabilities.
            # XXX should we preserve the counts as well?
            total = float(sum(count for _, count in targets))
            # Bug fix: the original rebound the loop-local name `targets`
            # to a throwaway generator and never stored it, so the lists
            # in `commonness` kept raw counts instead of probabilities.
            # Normalize in place instead.
            targets[:] = [(t, count / total) for t, count in targets]

        self.commonness = commonness
        self.N = N

    def all_candidates(self, s):
        """Retrieve all candidate entities.

        Parameters
        ----------
        s : {string, iterable over string}
            Tokens. If a string, it will be tokenized using a naive heuristic.

        Returns
        -------
        candidates : iterable over (int, int, string, float)
            Candidate entities: 4-tuples of start index, end index
            (both in tokenized input), target entity and probability
            (commonness).
        """
        if isinstance(s, six.string_types):
            # XXX need a smarter tokenizer!
            tokens = s.split()
        else:
            tokens = tosequence(s)

        # Look every n-gram (up to length N) up in the commonness table.
        for start, end, ngram in ngrams_with_pos(tokens, self.N):
            if ngram in self.commonness:
                for target, prob in self.commonness[ngram]:
                    yield start, end, target, prob
18 changes: 14 additions & 4 deletions semanticizest/_util.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
from collections import Sequence
from six.moves import xrange
from six.moves.urllib.parse import quote


def ngrams(lst, N):
def ngrams_with_pos(lst, N):
    """Generate all n-grams with 1 <= n <= N from lst.

    Yields (start, end, text) triples, where text is the space-joined
    slice lst[start:end].
    """
    length = len(lst)
    for begin in xrange(length):
        # Longest n-gram that still fits starting at `begin`.
        longest = min(N, length - begin)
        for end in xrange(begin + 1, begin + longest + 1):
            yield begin, end, " ".join(lst[begin:end])


def ngrams(lst, N):
    """Generate the texts of all n-grams (1 <= n <= N) from lst,
    discarding position information.
    """
    for _, _, text in ngrams_with_pos(lst, N):
        yield text


def tosequence(x):
    """Cast x to a sequence. Returns x if at all possible.

    If x is already a Sequence (list, tuple, string, ...), it is
    returned unchanged; otherwise the iterable is materialized into
    a list.
    """
    # The Sequence ABC lives in collections.abc; the alias in the
    # top-level collections module was removed in Python 3.10.
    # Fall back to the old location for Python 2.
    try:
        from collections.abc import Sequence
    except ImportError:  # Python 2
        from collections import Sequence
    return x if isinstance(x, Sequence) else list(x)


def url_from_title(title, wiki):
Expand Down
25 changes: 25 additions & 0 deletions semanticizest/tests/test_semanticizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os.path
import re

from nose.tools import assert_less_equal
from semanticizest import Semanticizer
from semanticizest._wiki_dump_parser import parse_dump


def test_semanticizer():
    """End-to-end smoke test: build a Semanticizer from a small Dutch
    Wikipedia dump sample and check that known entities are found."""
    here = os.path.dirname(os.path.abspath(__file__))
    # Sample dump shipped alongside the tests.
    dump = os.path.join(here, 'nlwiki-20140927-pages-articles-sample.xml')
    link_count, ngram_count = parse_dump(dump, N=2)
    sem = Semanticizer(link_count)

    # Dutch sample text mentioning several concepts from the dump.
    text = """Aangezien de aarde een planeet is, kunnen de aardwetenschappen
ook als een tak van de planetologie beschouwd worden. Aardwetenschappelijke
kennis, met name geomorfologie, wordt bijvoorbeeld ook toegepast voor de
zoektocht naar sporen van water, sneeuw en ijs op de planeet Mars."""
    # Naive tokenization on non-word characters.
    tokens = re.split(r'\W+', text)

    expected = set(['planeet', 'planetologie', 'kennis (wetenschap)',
                    'geomorfologie', 'Mars (planeet)'])
    concepts = set(string for _, _, string, _ in sem.all_candidates(tokens))

    # Subset check: every expected entity must appear among the candidates.
    assert_less_equal(expected, concepts)
8 changes: 6 additions & 2 deletions semanticizest/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from collections import Counter

from semanticizest._util import ngrams, url_from_title
from semanticizest._util import ngrams, ngrams_with_pos, url_from_title

from nose.tools import assert_equal, assert_true
from nose.tools import assert_equal, assert_in, assert_true


def test_ngrams():
Expand All @@ -15,6 +15,10 @@ def test_ngrams():
assert_equal(set(ng), expected)
assert_true(all(freq == 1 for freq in ng.values()))

with_pos = list(ngrams_with_pos(text, N=2))
assert_in((0, 2, 'Hello ,'), with_pos)
assert_in((1, 3, ', world'), with_pos)


def test_url_from_title():
"""Test article title -> Wikipedia URL conversion."""
Expand Down