Skip to content

Commit a3d8ba0

Browse files
committed
generate n-grams in order, return positions to caller
1 parent 84e42de commit a3d8ba0

File tree

2 files changed

+19
-6
lines changed

2 files changed

+19
-6
lines changed

semanticizest/_util.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,14 +2,23 @@
22
from six.moves.urllib.parse import quote
33

44

5-
def ngrams(lst, N):
5+
def ngrams_with_pos(lst, N):
    """Generate n-grams for 1 <= n <= N from lst, with their positions.

    Yields ``(start, end, ngram)`` tuples, where ``lst[start:end]`` holds
    the tokens of the n-gram and ``ngram`` is those tokens joined by single
    spaces. N-grams are produced in order of start position and, for each
    start position, in order of increasing length.

    ``lst`` must support ``len()`` and slicing, and its items must be
    strings. Nothing is yielded when ``lst`` is empty or ``N < 1``.
    """
    join = " ".join
    length = len(lst)

    # The builtin range exists on both Python 2 and 3, so this does not
    # rely on an xrange name being in scope.
    for start in range(length):
        # Cap the n-gram length so the slice never runs past the end of lst.
        longest = min(N, length - start)
        for n in range(1, longest + 1):
            yield start, start + n, join(lst[start:start + n])
13+
14+
15+
def ngrams(lst, N):
    """Generate n-grams for 1 <= n <= N from lst, without positions.

    Returns a lazy generator over the n-gram strings produced by
    ``ngrams_with_pos(lst, N)``, in the same order, with the start and end
    offsets dropped.
    """
    return (ng for _, _, ng in ngrams_with_pos(lst, N))
17+
18+
19+
def tosequence(x):
    """Cast x to sequence. Returns x if at all possible.

    If ``x`` is already a ``Sequence`` (e.g. a list, tuple or string) it is
    returned unchanged, i.e. the very same object. Any other iterable is
    consumed into a new list.
    """
    # Resolved locally so the function does not depend on a module-level
    # import. The ABCs live in collections.abc on Python 3 (the plain
    # ``collections`` alias was removed in 3.10) and in collections on
    # Python 2.
    try:
        from collections.abc import Sequence
    except ImportError:  # Python 2
        from collections import Sequence

    return x if isinstance(x, Sequence) else list(x)
1322

1423

1524
def url_from_title(title, wiki):

semanticizest/tests/test_util.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
from collections import Counter
22

3-
from semanticizest._util import ngrams, url_from_title
3+
from semanticizest._util import ngrams, ngrams_with_pos, url_from_title
44

5-
from nose.tools import assert_equal, assert_true
5+
from nose.tools import assert_equal, assert_in, assert_true
66

77

88
def test_ngrams():
@@ -15,6 +15,10 @@ def test_ngrams():
1515
assert_equal(set(ng), expected)
1616
assert_true(all(freq == 1 for freq in ng.values()))
1717

18+
with_pos = list(ngrams_with_pos(text, N=2))
19+
assert_in((0, 2, 'Hello ,'), with_pos)
20+
assert_in((1, 3, ', world'), with_pos)
21+
1822

1923
def test_url_from_title():
2024
"""Test article title -> Wikipedia URL conversion."""

0 commit comments

Comments
 (0)