Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions semanticizest/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from ._version import __version__

from ._semanticizer import Semanticizer
51 changes: 51 additions & 0 deletions semanticizest/_semanticizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from collections import defaultdict
import operator

import six

from semanticizest._util import ngrams_with_pos, tosequence


class Semanticizer(object):
    """Entity linker based on link statistics from a Wikipedia dump.

    Parameters
    ----------
    link_count : mapping from (target, anchor) to int
        For each (target page, anchor text) pair, the number of links
        with that anchor text pointing to that target.
    N : int, optional
        Maximum n-gram length considered when looking up anchors
        (default 7).
    """

    def __init__(self, link_count, N=7):
        commonness = defaultdict(list)

        # Group per-(target, anchor) counts by anchor text.
        for (target, anchor), count in six.iteritems(link_count):
            commonness[anchor].append((target, count))
        for anchor, targets in six.iteritems(commonness):
            targets.sort(key=operator.itemgetter(1))

            # Turn counts into probabilities.
            # XXX should we preserve the counts as well?
            total = float(sum(count for _, count in targets))
            # Bug fix: the original rebound the loop-local name `targets`
            # to a throwaway generator and never stored it, so the lists
            # in `commonness` kept raw counts instead of probabilities.
            # Normalize in place instead.
            targets[:] = [(t, count / total) for t, count in targets]

        self.commonness = commonness
        self.N = N

    def all_candidates(self, s):
        """Retrieve all candidate entities.

        Parameters
        ----------
        s : {string, iterable over string}
            Tokens. If a string, it will be tokenized using a naive heuristic.

        Returns
        -------
        candidates : iterable over (int, int, string, float)
            Candidate entities: 4-tuples of start index, end index
            (both in tokenized input), target entity and probability
            (commonness).
        """
        if isinstance(s, six.string_types):
            # XXX need a smarter tokenizer!
            tokens = s.split()
        else:
            tokens = tosequence(s)

        # Look every n-gram (up to length N) up in the commonness table.
        for start, end, ngram in ngrams_with_pos(tokens, self.N):
            if ngram in self.commonness:
                for target, prob in self.commonness[ngram]:
                    yield start, end, target, prob
18 changes: 14 additions & 4 deletions semanticizest/_util.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,25 @@
from collections import Sequence
from six.moves import xrange
from six.moves.urllib.parse import quote


def ngrams(lst, N):
def ngrams_with_pos(lst, N):
    """Generate all n-grams with 1 <= n <= N from lst.

    Yields (start, end, text) triples, where text is the space-joined
    slice lst[start:end].
    """
    length = len(lst)
    for begin in xrange(length):
        # Longest n-gram that still fits starting at `begin`.
        longest = min(N, length - begin)
        for end in xrange(begin + 1, begin + longest + 1):
            yield begin, end, " ".join(lst[begin:end])


def ngrams(lst, N):
    """Generate the texts of all n-grams (1 <= n <= N) from lst,
    discarding position information.
    """
    for _, _, text in ngrams_with_pos(lst, N):
        yield text


def tosequence(x):
    """Cast x to a sequence. Returns x if at all possible.

    If x is already a Sequence (list, tuple, string, ...), it is
    returned unchanged; otherwise the iterable is materialized into
    a list.
    """
    # The Sequence ABC lives in collections.abc; the alias in the
    # top-level collections module was removed in Python 3.10.
    # Fall back to the old location for Python 2.
    try:
        from collections.abc import Sequence
    except ImportError:  # Python 2
        from collections import Sequence
    return x if isinstance(x, Sequence) else list(x)


def url_from_title(title, wiki):
Expand Down
25 changes: 25 additions & 0 deletions semanticizest/tests/test_semanticizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os.path
import re

from nose.tools import assert_less_equal
from semanticizest import Semanticizer
from semanticizest._wiki_dump_parser import parse_dump


def test_semanticizer():
    """End-to-end smoke test: build a Semanticizer from a small Dutch
    Wikipedia dump sample and check that known entities are found."""
    here = os.path.dirname(os.path.abspath(__file__))
    # Sample dump shipped alongside the tests.
    dump = os.path.join(here, 'nlwiki-20140927-pages-articles-sample.xml')
    link_count, ngram_count = parse_dump(dump, N=2)
    sem = Semanticizer(link_count)

    # Dutch sample text mentioning several concepts from the dump.
    text = """Aangezien de aarde een planeet is, kunnen de aardwetenschappen
ook als een tak van de planetologie beschouwd worden. Aardwetenschappelijke
kennis, met name geomorfologie, wordt bijvoorbeeld ook toegepast voor de
zoektocht naar sporen van water, sneeuw en ijs op de planeet Mars."""
    # Naive tokenization on non-word characters.
    tokens = re.split(r'\W+', text)

    expected = set(['planeet', 'planetologie', 'kennis (wetenschap)',
                    'geomorfologie', 'Mars (planeet)'])
    concepts = set(string for _, _, string, _ in sem.all_candidates(tokens))

    # Subset check: every expected entity must appear among the candidates.
    assert_less_equal(expected, concepts)
8 changes: 6 additions & 2 deletions semanticizest/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from collections import Counter

from semanticizest._util import ngrams, url_from_title
from semanticizest._util import ngrams, ngrams_with_pos, url_from_title

from nose.tools import assert_equal, assert_true
from nose.tools import assert_equal, assert_in, assert_true


def test_ngrams():
Expand All @@ -15,6 +15,10 @@ def test_ngrams():
assert_equal(set(ng), expected)
assert_true(all(freq == 1 for freq in ng.values()))

with_pos = list(ngrams_with_pos(text, N=2))
assert_in((0, 2, 'Hello ,'), with_pos)
assert_in((1, 3, ', world'), with_pos)


def test_url_from_title():
"""Test article title -> Wikipedia URL conversion."""
Expand Down