Skip to content

Commit 025c782

Browse files
committed
allow assign BIO tags using fuzzy string match
this can be useful to generate training data when partially labeled data is available
1 parent 4019595 commit 025c782

File tree

3 files changed

+199
-1
lines changed

3 files changed

+199
-1
lines changed

webstruct/fuzzymatch.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
"""
2+
This module helps to generate training data from partially labelled data with fuzzywuzzy_.
3+
4+
.. _fuzzywuzzy: https://github.com/seatgeek/fuzzywuzzy
5+
6+
"""
7+
import re
8+
import warnings
9+
from webstruct.sequence_encoding import IobEncoder
10+
11+
from .base import BaseSequenceClassifier
12+
13+
# Whitespace characters, from
# http://en.wikipedia.org/wiki/Space_(punctuation)#Spaces_in_Unicode
# NOTE: fixed \u18e0 -> \u180e (MONGOLIAN VOWEL SEPARATOR); U+18E0 is a
# Canadian Syllabics letter, not a space, and does not appear in the
# cited list.  Written as a non-raw unicode literal (backslash escaped
# for the regex \s) so the same source is valid on Python 2 and 3; the
# runtime value is identical to the previous ur'' literal.
SPACES_SRE = u'[\\s\u0020\u00a0\u1680\u180e\u2000-\u200d\u202f\u205f\u2060\u3000\ufeff]+'
15+
16+
17+
class FuzzyMatchClassifier(BaseSequenceClassifier):
18+
"""
19+
Class for predicting the labels by matching with fuzzymatch_.
20+
21+
It first finds the candidates using the given regex pattern and then
22+
compare similarity of the matched text to the text in ``choices``,
23+
24+
if the any one of the similarities larger than the ``threshold``,
25+
assign the ``BIO`` tags to corresponding input.
26+
27+
Parameters
28+
----------
29+
entity : string
30+
the entitiy type (e.g. ADDR or ORG).
31+
32+
pattern: string
33+
a regex pattern used to find the matched string from ``html_tokens``.
34+
35+
choices : list of string
36+
a list of string to calculate the similarity to the matched string.
37+
38+
threshold: float
39+
a float to decide if matched text should assign to given tag.
40+
41+
postprocess: function
42+
a function to process the matched text before compare to ``choices``.
43+
44+
References
45+
----------
46+
.. _fuzzywuzzy: https://github.com/seatgeek/fuzzywuzzy
47+
48+
Notes
49+
-----
50+
the ``pattern`` should include the whitespaces, see ``SPACES_SRE``.
51+
52+
"""
53+
def __init__(self, entity, pattern, choices, threshold=0.9, \
54+
postprocess=lambda x: x, verbose=False):
55+
self.entity = entity
56+
self.pattern = pattern
57+
self.choices = choices
58+
self.threshold = threshold
59+
self.postprocess = postprocess
60+
self.verbose = verbose
61+
62+
def predict(self, X):
63+
"""
64+
Make a prediction.
65+
66+
Parameters
67+
----------
68+
X : list of lists of ``HtmlToken``
69+
70+
Returns
71+
-------
72+
y : list of lists
73+
predicted labels
74+
75+
"""
76+
from fuzzywuzzy import process
77+
78+
y = []
79+
for html_tokens in X:
80+
tokens = [html_token.token for html_token in html_tokens]
81+
iob_encoder = IobEncoder()
82+
83+
def repl(m):
84+
extracted = self.postprocess(m.group(0))
85+
if self.verbose:
86+
print extracted, choices
87+
88+
if process.extractBests(extracted, self.choices, score_cutoff=self.threshold * 100):
89+
return u' __START_{0}__ {1} __END_{0}__ '.format(self.entity, m.group(0))
90+
return m.group(0)
91+
92+
text = re.sub(self.pattern, repl, u" ".join(tokens), flags=re.I | re.U | re.DOTALL)
93+
tags = [tag for _, tag in iob_encoder.encode(text.split())]
94+
assert len(html_tokens) == len(tags), 'len(html_tokens): %s and len(tags): %s are not matched' % \
95+
(len(html_tokens), len(tags))
96+
y.append(tags)
97+
98+
return y
99+
100+
def merge_bio_tags(*tag_list):
    """
    Merge several BIO tag sequences element-wise into one.

    Parameters
    ----------
    *tag_list : lists of string
        BIO tag sequences of equal length.

    Returns
    -------
    list of string
        For each position the last non-``'O'`` tag wins; ``'O'`` is kept
        only when every sequence has ``'O'`` there.  A warning is issued
        when two non-``'O'`` tags disagree on the entity type (the later
        one still wins).
    """
    # ``reduce`` is a builtin on Python 2 but lives in functools on
    # Python 3; functools.reduce exists since 2.6, so this is compatible
    # with both.
    from functools import reduce

    def select_tag(x, y):

        # conflicting entity types (e.g. B-ORG vs B-ADDR) are reported
        # but not resolved
        if x != 'O' and y != 'O' and x[2:] != y[2:]:
            warnings.warn('conflict BIO tag: %s %s' % (x, y))

        # later one wins
        if y != 'O':
            return y
        if x != 'O':
            return x
        return 'O'
    return [reduce(select_tag, i) for i in zip(*tag_list)]

webstruct/model.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22
"""
3-
:mod:`webstruct.model` contains convetional wrappers for creating NER models.
3+
:mod:`webstruct.model` contains conventional wrappers for creating NER models.
44
"""
55
from __future__ import absolute_import
66
import urllib2
@@ -111,6 +111,13 @@ def annotate(self, bytes_data, pretty_print=False):
111111
Return annotated HTML data in WebAnnotator format.
112112
"""
113113
html_tokens, tags = self.extract_raw(bytes_data)
114+
return self.annotate_tokens(html_tokens, tags)
115+
116+
def annotate_tokens(self, html_tokens, tags, pretty_print=False):
    """
    Return annotated HTML data in WebAnnotator format; input is a list
    of ``html_tokens`` and a parallel list of ``tags``.

    NOTE(review): the ``annotate`` method calls this without forwarding
    its own ``pretty_print`` argument -- confirm that is intended.
    """
    assert len(html_tokens) == len(tags)
    # rebuild a single tree from tokens + tags, colorize entities for
    # WebAnnotator, then serialize
    tree = self.html_tokenizer.detokenize_single(html_tokens, tags)
    tree = to_webannotator(tree, self.entity_colors)
    return tostring(tree, pretty_print=pretty_print)

webstruct/tests/test_fuzzymatch.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
from webstruct.fuzzymatch import FuzzyMatchClassifier, SPACES_SRE, merge_bio_tags
2+
from webstruct import HtmlTokenizer
3+
from webstruct.utils import html_document_fromstring
4+
5+
def test_fuzzy_assign_bio_tags():
6+
html = """<div id="intro_contact">
7+
013-Witgoedreparaties.nl<br/>Postbus 27<br/>
8+
4500 AA Oostburg<br/><a href="mailto:[email protected]">[email protected]</a><br/> tel: 013-5444999<br/></div></div>
9+
</div>
10+
"""
11+
html_tokenizer = HtmlTokenizer()
12+
html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
13+
pattern = ur'(^|{0})Postbus.*?Oostburg({0}|$)'.format(SPACES_SRE)
14+
choices = ['Postbus 22 4500AA Oostburg']
15+
16+
clf = FuzzyMatchClassifier(entity='ADDR', pattern=pattern, choices=choices)
17+
tags = clf.predict([html_tokens])
18+
19+
assert tags[0] == ['O', 'B-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'O', 'O', 'O']
20+
21+
html = """<div id="intro_contact">
22+
013-Witgoedreparaties.nl<br/>Postbus 27<br/>
23+
4500 AA Oostburg<br/>The Netherlands</br><a href="mailto:[email protected]">[email protected]</a><br/> tel: 013-5444999<br/></div></div>
24+
</div>
25+
"""
26+
html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
27+
pattern = ur'(^|{0})Postbus.*?Oostburg{0}((the\s+)?ne(d|th)erlands?)?'.format(SPACES_SRE)
28+
choices = ['Postbus 22 4500AA Oostburg', 'Postbus 22 4500AA Oostburg the netherlands']
29+
30+
clf = FuzzyMatchClassifier(entity='ADDR', pattern=pattern, choices=choices)
31+
tags = clf.predict([html_tokens])
32+
assert tags[0] == ['O', 'B-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'I-ADDR', 'O', 'O', 'O']
33+
34+
html = """<title>013-witgoedreparaties.nl | 013-witgoedreparaties.nl | title </title>"""
35+
html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
36+
pattern = ur'(^|{0})013-Witgoedreparaties.nl({0}|$)'.format(SPACES_SRE)
37+
choices = ['013-witgoedreparaties.nl']
38+
39+
clf = FuzzyMatchClassifier(entity='ORG', pattern=pattern, choices=choices)
40+
tags = clf.predict([html_tokens])
41+
42+
assert tags[0] == ['B-ORG', 'O', 'B-ORG', 'O', 'O']
43+
44+
45+
def test_fuzzy_assign_bio_tags_with_non_break_spaces():
46+
html_tokenizer = HtmlTokenizer()
47+
html = """<title>013-witgoedreparaties.nl | &#8203;013-witgoedreparaties.nl | title </title>"""
48+
html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
49+
pattern = ur'(^|{0})013-Witgoedreparaties.nl({0}|$)'.format(SPACES_SRE)
50+
choices = ['013-witgoedreparaties.nl']
51+
52+
clf = FuzzyMatchClassifier(entity='ORG', pattern=pattern, choices=choices)
53+
tags = clf.predict([html_tokens])
54+
55+
assert tags[0] == ['B-ORG', 'O', 'B-ORG', 'O', 'O']
56+
57+
html = """<title>013-witgoedreparaties.nl | &nbsp;013-witgoedreparaties.nl | title </title>"""
58+
html_tokens, tags = html_tokenizer.tokenize_single(html_document_fromstring(html))
59+
choices = ['013-witgoedreparaties.nl']
60+
61+
clf = FuzzyMatchClassifier(entity='ORG', pattern=pattern, choices=choices)
62+
tags = clf.predict([html_tokens])
63+
64+
assert tags[0] == ['B-ORG', 'O', 'B-ORG', 'O', 'O']
65+
66+
67+
def test_merge_bio_tags():
    # non-'O' tags from the second sequence fill gaps in the first
    merged = merge_bio_tags(['B-ORG', 'O', 'B-ORG', 'O', 'O'],
                            ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O'])
    assert merged == ['B-ORG', 'B-ADDR', 'B-ORG', 'O', 'O']

    # I-ORG vs O: the non-'O' tag survives
    merged = merge_bio_tags(['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O'],
                            ['B-ORG', 'O', 'B-ORG', 'O', 'O'])
    assert merged == ['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O']

    # merging three sequences at once
    merged = merge_bio_tags(['B-ORG', 'O', 'B-ORG', 'O', 'O'],
                            ['B-ORG', 'I-ORG', 'B-ORG', 'O', 'O'],
                            ['O', 'O', 'O', 'B-ADDR', 'I-ADDR'])
    assert merged == ['B-ORG', 'I-ORG', 'B-ORG', 'B-ADDR', 'I-ADDR']

0 commit comments

Comments
 (0)