Skip to content

Commit 025c782

Browse files
committed
allow assign BIO tags using fuzzy string match
this can be useful to generate training data when partially labeled data is available
1 parent 4019595 commit 025c782

File tree

3 files changed

+199
-1
lines changed

3 files changed

+199
-1
lines changed

webstruct/fuzzymatch.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
"""
2+
This module helps to generate training data from partially labelled data with fuzzywuzzy_.
3+
4+
.. _fuzzywuzzy: https://github.com/seatgeek/fuzzywuzzy
5+
6+
"""
7+
import re
8+
import warnings
9+
from webstruct.sequence_encoding import IobEncoder
10+
11+
from .base import BaseSequenceClassifier
12+
13+
# Whitespace characters, from
# http://en.wikipedia.org/wiki/Space_(punctuation)#Spaces_in_Unicode
# NOTE: fixed \u18e0 -> \u180e (MONGOLIAN VOWEL SEPARATOR); U+18E0 is a
# Canadian Syllabics letter, not a space, and does not appear in the
# cited list.  Written as a non-raw unicode literal (backslash escaped
# for the regex \s) so the same source is valid on Python 2 and 3; the
# runtime value is identical to the previous ur'' literal.
SPACES_SRE = u'[\\s\u0020\u00a0\u1680\u180e\u2000-\u200d\u202f\u205f\u2060\u3000\ufeff]+'
15+
16+
17+
class FuzzyMatchClassifier(BaseSequenceClassifier):
18+
"""
19+
Class for predicting the labels by matching with fuzzymatch_.
20+
21+
It first finds the candidates using the given regex pattern and then
22+
compare similarity of the matched text to the text in ``choices``,
23+
24+
if the any one of the similarities larger than the ``threshold``,
25+
assign the ``BIO`` tags to corresponding input.
26+
27+
Parameters
28+
----------
29+
entity : string
30+
the entitiy type (e.g. ADDR or ORG).
31+
32+
pattern: string
33+
a regex pattern used to find the matched string from ``html_tokens``.
34+
35+
choices : list of string
36+
a list of string to calculate the similarity to the matched string.
37+
38+
threshold: float
39+
a float to decide if matched text should assign to given tag.
40+
41+
postprocess: function
42+
a function to process the matched text before compare to ``choices``.
43+
44+
References
45+
----------
46+
.. _fuzzywuzzy: https://github.com/seatgeek/fuzzywuzzy
47+
48+
Notes
49+
-----
50+
the ``pattern`` should include the whitespaces, see ``SPACES_SRE``.
51+
52+
"""
53+
def __init__(self, entity, pattern, choices, threshold=0.9, \
54+
postprocess=lambda x: x, verbose=False):
55+
self.entity = entity
56+
self.pattern = pattern
57+
self.choices = choices
58+
self.threshold = threshold
59+
self.postprocess = postprocess
60+
self.verbose = verbose
61+
62+
def predict(self, X):
63+
"""
64+
Make a prediction.
65+
66+
Parameters
67+
----------
68+
X : list of lists of ``HtmlToken``
69+
70+
Returns
71+
-------
72+
y : list of lists
73+
predicted labels
74+
75+
"""
76+
from fuzzywuzzy import process
77+
78+
y = []
79+
for html_tokens in X:
80+
tokens = [html_token.token for html_token in html_tokens]
81+
iob_encoder = IobEncoder()
82+
83+
def repl(m):
84+
extracted = self.postprocess(m.group(0))
85+
if self.verbose:
86+
print extracted, choices
87+
88+
if process.extractBests(extracted, self.choices, score_cutoff=self.threshold * 100):
89+
return u' __START_{0}__ {1} __END_{0}__ '.format(self.entity, m.group(0))
90+
return m.group(0)
91+
92+
text = re.sub(self.pattern, repl, u" ".join(tokens), flags=re.I | re.U | re.DOTALL)
93+
tags = [tag for _, tag in iob_encoder.encode(text.split())]
94+
assert len(html_tokens) == len(tags), 'len(html_tokens): %s and len(tags): %s are not matched' % \
95+
(len(html_tokens), len(tags))
96+
y.append(tags)
97+
98+
return y
99+
100+
def merge_bio_tags(*tag_list):
    """
    Merge several BIO tag sequences element-wise into one.

    Parameters
    ----------
    *tag_list : lists of string
        BIO tag sequences of equal length.

    Returns
    -------
    list of string
        For each position the last non-``'O'`` tag wins; ``'O'`` is kept
        only when every sequence has ``'O'`` there.  A warning is issued
        when two non-``'O'`` tags disagree on the entity type (the later
        one still wins).
    """
    # ``reduce`` is a builtin on Python 2 but lives in functools on
    # Python 3; functools.reduce exists since 2.6, so this is compatible
    # with both.
    from functools import reduce

    def select_tag(x, y):

        # conflicting entity types (e.g. B-ORG vs B-ADDR) are reported
        # but not resolved
        if x != 'O' and y != 'O' and x[2:] != y[2:]:
            warnings.warn('conflict BIO tag: %s %s' % (x, y))

        # later one wins
        if y != 'O':
            return y
        if x != 'O':
            return x
        return 'O'
    return [reduce(select_tag, i) for i in zip(*tag_list)]

webstruct/model.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22
"""
3-
:mod:`webstruct.model` contains convetional wrappers for creating NER models.
3+
:mod:`webstruct.model` contains conventional wrappers for creating NER models.
44
"""
55
from __future__ import absolute_import
66
import urllib2
@@ -111,6 +111,13 @@ def annotate(self, bytes_data, pretty_print=False):
111111
Return annotated HTML data in WebAnnotator format.
112112
"""
113113
html_tokens, tags = self.extract_raw(bytes_data)
114+
return self.annotate_tokens(html_tokens, tags)
115+
116+
def annotate_tokens(self, html_tokens, tags, pretty_print=False):
    """
    Return annotated HTML data in WebAnnotator format; input is a list
    of ``html_tokens`` and a parallel list of ``tags``.

    NOTE(review): the ``annotate`` method calls this without forwarding
    its own ``pretty_print`` argument -- confirm that is intended.
    """
    assert len(html_tokens) == len(tags)
    # rebuild a single tree from tokens + tags, colorize entities for
    # WebAnnotator, then serialize
    tree = self.html_tokenizer.detokenize_single(html_tokens, tags)
    tree = to_webannotator(tree, self.entity_colors)
    return tostring(tree, pretty_print=pretty_print)

webstruct/tests/test_fuzzymatch.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
from webstruct.fuzzymatch import FuzzyMatchClassifier, SPACES_SRE, merge_bio_tags
2+
from webstruct import HtmlTokenizer
3+
from webstruct.utils import html_document_fromstring
4+
5+
def test_fuzzy_assign_bio_tags():
6+
html = """<div id="intro_contact">
7+
013-Witgoedreparaties.nl<br/>Postbus 27<br/>
8+
4500 AA Oostburg<br/><a href="mailto:[email protected]">[email protected]</a><br/> tel: 013-5444999<br/></div></div>
9+
</div>
10+
"""
11+
html_tokenizer = HtmlTokenizer()
12+
html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
13+
pattern = ur'(^|{0})Postbus.*?Oostburg({0}|$)'.format(SPACES_SRE)
14+
choices = ['Postbus 22 4500AA Oostburg']
15+
16+
clf = FuzzyMatchClassifier(entity='ADDR', pattern=pattern, choices=choices)
17+
tags = clf.predict([html_tokens])
18+
19+
assert tags[0] == ['O', 'B-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'O', 'O', 'O']
20+
21+
html = """<div id="intro_contact">
22+
013-Witgoedreparaties.nl<br/>Postbus 27<br/>
23+
4500 AA Oostburg<br/>The Netherlands</br><a href="mailto:[email protected]">[email protected]</a><br/> tel: 013-5444999<br/></div></div>
24+
</div>
25+
"""
26+
html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
27+
pattern = ur'(^|{0})Postbus.*?Oostburg{0}((the\s+)?ne(d|th)erlands?)?'.format(SPACES_SRE)
28+
choices = ['Postbus 22 4500AA Oostburg', 'Postbus 22 4500AA Oostburg the netherlands']
29+
30+
clf = FuzzyMatchClassifier(entity='ADDR', pattern=pattern, choices=choices)
31+
tags = clf.predict([html_tokens])
32+
assert tags[0] == ['O', 'B-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'O', 'O', 'O']
33+
34+
html = """<title>013-witgoedreparaties.nl | 013-witgoedreparaties.nl | title </title>"""
35+
html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
36+
pattern = ur'(^|{0})013-Witgoedreparaties.nl({0}|$)'.format(SPACES_SRE)
37+
choices = ['013-witgoedreparaties.nl']
38+
39+
clf = FuzzyMatchClassifier(entity='ORG', pattern=pattern, choices=choices)
40+
tags = clf.predict([html_tokens])
41+
42+
assert tags[0] == ['B-ORG', 'O', 'B-ORG', 'O', 'O']
43+
44+
45+
def test_fuzzy_assign_bio_tags_with_non_break_spaces():
46+
html_tokenizer = HtmlTokenizer()
47+
html = """<title>013-witgoedreparaties.nl | &#8203;013-witgoedreparaties.nl | title </title>"""
48+
html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
49+
pattern = ur'(^|{0})013-Witgoedreparaties.nl({0}|$)'.format(SPACES_SRE)
50+
choices = ['013-witgoedreparaties.nl']
51+
52+
clf = FuzzyMatchClassifier(entity='ORG', pattern=pattern, choices=choices)
53+
tags = clf.predict([html_tokens])
54+
55+
assert tags[0] == ['B-ORG', 'O', 'B-ORG', 'O', 'O']
56+
57+
html = """<title>013-witgoedreparaties.nl | &nbsp;013-witgoedreparaties.nl | title </title>"""
58+
html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
59+
choices = ['013-witgoedreparaties.nl']
60+
61+
clf = FuzzyMatchClassifier(entity='ORG', pattern=pattern, choices=choices)
62+
tags = clf.predict([html_tokens])
63+
64+
assert tags[0] == ['B-ORG', 'O', 'B-ORG', 'O', 'O']
65+
66+
67+
def test_merge_bio_tags():
    # non-'O' tags from the second sequence fill gaps in the first
    merged = merge_bio_tags(['B-ORG', 'O', 'B-ORG', 'O', 'O'],
                            ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'])
    assert merged == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O']

    # I-ORG vs O: the non-'O' tag survives
    merged = merge_bio_tags(['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O'],
                            ['B-ORG', 'O', 'B-ORG', 'O', 'O'])
    assert merged == ['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O']

    # merging three sequences at once
    merged = merge_bio_tags(['B-ORG', 'O', 'B-ORG', 'O', 'O'],
                            ['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O'],
                            ['O', 'O', 'O', 'B-ADDR', 'I-ADDR'])
    assert merged == ['B-ORG', 'I-ORG', 'B-ORG', 'B-ADDR', 'I-ADDR']

0 commit comments

Comments
 (0)