Skip to content

Commit

Permalink
[#77] Implement PrepareDatasetPipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
abojchevski committed Aug 18, 2015
1 parent aecf2bc commit 650f500
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 22 deletions.
24 changes: 2 additions & 22 deletions demo_predict.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,8 @@
from nala.utils.readers import TextFilesReader
from nala.utils.readers import StringReader
from nala.utils.writers import ConsoleWriter
from nala.structures.pipelines import PrepareDatasetPipeline
from nala.learning.crfsuite import CRFSuite
from nala.preprocessing.spliters import NLTKSplitter
from nala.preprocessing.tokenizers import TmVarTokenizer
from nala.features.simple import SimpleFeatureGenerator
from nala.features.stemming import PorterStemFeatureGenerator
from nala.features.tmvar import TmVarFeatureGenerator
from nala.features.tmvar import TmVarDictionaryFeatureGenerator
from nala.features.window import WindowFeatureGenerator
import pkg_resources


Expand Down Expand Up @@ -41,21 +35,7 @@
else:
raise FileNotFoundError('directory or file "{}" does not exist'.format(args.dir_or_file))

# split and tokenize
NLTKSplitter().split(dataset)
TmVarTokenizer().tokenize(dataset)

# generate features
SimpleFeatureGenerator().generate(dataset)
PorterStemFeatureGenerator().generate(dataset)
TmVarFeatureGenerator().generate(dataset)
TmVarDictionaryFeatureGenerator().generate(dataset)

window_include_list = ['pattern0[0]', 'pattern1[0]', 'pattern2[0]', 'pattern3[0]', 'pattern4[0]', 'pattern5[0]',
'pattern6[0]', 'pattern7[0]', 'pattern8[0]', 'pattern9[0]', 'pattern10[0]', 'word[0]', 'stem[0]']

# generate features in a window
WindowFeatureGenerator(template=(-3, -2, -1, 1, 2, 3), include_list=window_include_list).generate(dataset)
PrepareDatasetPipeline().execute(dataset)

# get the predictions
crf = CRFSuite(args.crf_suite_dir)
Expand Down
64 changes: 64 additions & 0 deletions nala/structures/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from nala.preprocessing.spliters import Splitter, NLTKSplitter
from nala.preprocessing.tokenizers import Tokenizer, TmVarTokenizer
from nala.features import FeatureGenerator
from nala.features.simple import SimpleFeatureGenerator
from nala.features.stemming import PorterStemFeatureGenerator
from nala.features.tmvar import TmVarFeatureGenerator, TmVarDictionaryFeatureGenerator
from nala.features.window import WindowFeatureGenerator


class PrepareDatasetPipeline:
    """
    Prepares an instance of a dataset by executing modules in fixed order.

    * First executes the sentence splitter
    * Next executes the tokenizer
    * Finally executes each feature generator in the order they were provided

    :type splitter: nala.preprocessing.spliters.Splitter
    :param splitter: the module responsible for splitting the text into sentences
    :type tokenizer: nala.preprocessing.tokenizers.Tokenizer
    :param tokenizer: the module responsible for splitting the sentences into tokens
    :type feature_generators: collections.Iterable[FeatureGenerator]
    :param feature_generators: one or more modules responsible for generating features
    """

    def __init__(self, splitter=None, tokenizer=None, feature_generators=None):
        # Compare against None explicitly: a caller-supplied object that happens
        # to be falsy (e.g. an empty iterable of feature generators) must NOT be
        # silently replaced by the defaults.
        if splitter is None:
            splitter = NLTKSplitter()
        if tokenizer is None:
            tokenizer = TmVarTokenizer()
        if feature_generators is None:
            # Default feature set: simple + stemming + tmVar features, then a
            # window generator that replicates the listed base features over a
            # +-3 token neighbourhood.
            include = ['pattern0[0]', 'pattern1[0]', 'pattern2[0]', 'pattern3[0]', 'pattern4[0]', 'pattern5[0]',
                       'pattern6[0]', 'pattern7[0]', 'pattern8[0]', 'pattern9[0]', 'pattern10[0]', 'word[0]', 'stem[0]']
            feature_generators = [SimpleFeatureGenerator(), PorterStemFeatureGenerator(), TmVarFeatureGenerator(),
                                  TmVarDictionaryFeatureGenerator(),
                                  WindowFeatureGenerator(template=(-3, -2, -1, 1, 2, 3), include_list=include)]

        if isinstance(splitter, Splitter):
            self.splitter = splitter
        else:
            raise TypeError('not an instance that implements Splitter')

        if isinstance(tokenizer, Tokenizer):
            self.tokenizer = tokenizer
        else:
            raise TypeError('not an instance that implements Tokenizer')

        # Accept either a single FeatureGenerator (wrapped into a list) or any
        # iterable of them; validate each element so a bad entry fails fast
        # with its position rather than at execute() time.
        if hasattr(feature_generators, '__iter__'):
            for index, feature_generator in enumerate(feature_generators):
                if not isinstance(feature_generator, FeatureGenerator):
                    raise TypeError('not an instance that implements FeatureGenerator at index {}'.format(index))
            self.feature_generators = feature_generators
        elif isinstance(feature_generators, FeatureGenerator):
            self.feature_generators = [feature_generators]
        else:
            raise TypeError('not an instance or iterable of instances that implements FeatureGenerator')

    def execute(self, dataset):
        """
        Run the full pipeline over *dataset* in place: split sentences,
        tokenize them, then apply each feature generator in order.

        :type dataset: nala.structures.data.Dataset
        """
        self.splitter.split(dataset)
        self.tokenizer.tokenize(dataset)
        for feature_generator in self.feature_generators:
            feature_generator.generate(dataset)

0 comments on commit 650f500

Please sign in to comment.