Skip to content

Commit

Permalink
End of day
Browse files Browse the repository at this point in the history
  • Loading branch information
apmoore1 committed Oct 6, 2021
1 parent 3fb79ae commit 61dd1ec
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 15 deletions.
8 changes: 4 additions & 4 deletions pymusas/lexicon_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def add_lexicon_entry(self, value: LexiconEntry,

@staticmethod
def from_tsv(tsv_file_path: Union[PathLike, str], include_pos: bool = True
) -> "LexiconCollection":
) -> Dict[str, List[str]]:
'''
If `include_pos` is True and the TSV file does not contain a
`pos` field heading then this will return a LexiconCollection that is
Expand All @@ -130,8 +130,8 @@ def from_tsv(tsv_file_path: Union[PathLike, str], include_pos: bool = True
adding the `LexiconEntry` into the returned
`LexiconCollection`. For more information on this
see the `add_lexicon_entry` method.
:returns: A `LexiconCollection` that has been created from the data
within the TSV file.
:returns: A dictionary object that can be used to create a
`LexiconCollection`
:raises: ValueError if the minimum field headings, lemma and
semantic_tags, do not exist in the given TSV file.
'''
Expand Down Expand Up @@ -175,4 +175,4 @@ def from_tsv(tsv_file_path: Union[PathLike, str], include_pos: bool = True
collection_from_tsv.add_lexicon_entry(LexiconEntry(**row_data),
include_pos=include_pos)

return collection_from_tsv
return collection_from_tsv.data
17 changes: 17 additions & 0 deletions pymusas/spacy_lexicon_collection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from os import PathLike
from typing import Optional, Dict, List, Union

import spacy

from .lexicon_collection import LexiconCollection



@spacy.util.registry.misc('lexicon_collection')
def lexicon_collection(data: Optional[Dict[str, List[str]]] = None,
                       tsv_file_path: Optional[Union[PathLike, str]] = None
                       ) -> LexiconCollection:
    '''
    Creates a `LexiconCollection` either from an in-memory dictionary or
    from a TSV lexicon file. Registered in the spaCy `misc` registry under
    the name `lexicon_collection` so it can be referenced from a config.

    :param data: A dictionary mapping lexicon keys to lists of semantic
                 tags; used directly to build the collection when given.
    :param tsv_file_path: A path to a TSV lexicon file; used only when
                          `data` is not given.
    :returns: A `LexiconCollection` built from `data` if provided,
              otherwise from the TSV file at `tsv_file_path`.
    :raises ValueError: If neither `data` nor `tsv_file_path` is given.
    '''
    if data is not None:
        return LexiconCollection(data)
    if tsv_file_path is None:
        raise ValueError('Either `data` or `tsv_file_path` has to be '
                         'given, both cannot be None.')
    # `LexiconCollection.from_tsv` returns a plain dictionary, so wrap it
    # to honour the declared `LexiconCollection` return type.
    return LexiconCollection(LexiconCollection.from_tsv(tsv_file_path))
17 changes: 9 additions & 8 deletions pymusas/spacy_tagger.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional, List, Iterable, Callable
from typing import Optional, List, Iterable, Callable, Dict

from spacy.training import Example
from spacy.language import Language
Expand All @@ -13,6 +13,8 @@ def __init__(self, nlp: Language, lexicon_lookup: Optional[LexiconCollection] =
lexicon_lemma_lookup: Optional[LexiconCollection] = None
) -> None:
print(nlp.pipe_names)
self.lexicon_lookup = lexicon_lookup
self.lexicon_lemma_lookup = lexicon_lemma_lookup
if lexicon_lookup is None:
self.lexicon_lookup: LexiconCollection = LexiconCollection()
if lexicon_lemma_lookup is None:
Expand All @@ -27,21 +29,21 @@ def tag_token(text: str, lemma: str, pos: str,
if pos == 'punc':
return ["PUNCT"]

text_pos = f"{text}_{pos}"
text_pos = f"{text}|{pos}"
if text_pos in lexicon_lookup:
return lexicon_lookup[text_pos]

lemma_pos = f"{lemma}_{pos}"
lemma_pos = f"{lemma}|{pos}"
if lemma_pos in lexicon_lookup:
return lexicon_lookup[lemma_pos]

text_lower = text.lower()
text_pos_lower = f"{text_lower}_{pos}"
text_pos_lower = f"{text_lower}|{pos}"
if text_pos_lower in lexicon_lookup:
return lexicon_lookup[text_pos_lower]

lemma_lower = lemma.lower()
lemma_pos_lower = f"{lemma_lower}_{pos}"
lemma_pos_lower = f"{lemma_lower}|{pos}"
if lemma_pos_lower in lexicon_lookup:
return lexicon_lookup[lemma_pos_lower]

Expand Down Expand Up @@ -102,6 +104,5 @@ def initialize(self, get_examples: Optional[Callable[[], Iterable[Example]]] = N

@Language.factory("usas_tagger")
def create_spacy_rule_based_tagger_component(nlp: Language, name: str,
lexicon_lookup: Optional[LexiconCollection] = None,
lexicon_lemma_lookup: Optional[LexiconCollection] = None):
return SpacyRuleBasedTagger(nlp, lexicon_lookup, lexicon_lemma_lookup)
lexicon_lookup: Optional[Dict[str, List[str]]] = None):
return SpacyRuleBasedTagger(nlp, lexicon_lookup, None)
17 changes: 14 additions & 3 deletions test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
import spacy

from pymusas.lexicon_collection import LexiconCollection, LexiconEntry
from pymusas.spacy_tagger import create_spacy_rule_based_tagger_component
from pymusas.spacy_lexicon_collection import lexicon_collection
'''
from __future__ import annotations
from spacy.language import Language
from spacy.tokens import Doc
Expand All @@ -14,7 +20,12 @@ def create_spacy_rule_based_tagger_component(nlp: Language, name: str):
class SpacyRuleBasedTagger:
def __init__(self, nlp: Language):
pass

'''
# Add the component to the pipeline and configure it
nlp = spacy.blank("en")
nlp.add_pipe("ucrel")
a_collection = lexicon_collection(data={'London|PROPN': ['Z2']}).data
print(a_collection)
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("usas_tagger", last=True, config={'lexicon_lookup':a_collection})
doc = nlp('London is great today')
for token in doc:
print(f'{token.text} {token._.usas_tags}')

0 comments on commit 61dd1ec

Please sign in to comment.