diff --git a/doc/conf.py b/doc/conf.py index a6be0e8..6403136 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -46,8 +46,8 @@ master_doc = 'index' # General information about the project. -project = u'PyPLN' -copyright = u'2011, Flávio Codeço Coelho' +project = 'PyPLN' +copyright = '2011, Flávio Codeço Coelho' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -187,8 +187,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'PyPLN.tex', u'PyPLN Documentation', - u'Flávio Codeço Coelho', 'manual'), + ('index', 'PyPLN.tex', 'PyPLN Documentation', + 'Flávio Codeço Coelho', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -220,6 +220,6 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'pypln', u'PyPLN Documentation', - [u'Flávio Codeço Coelho'], 1) + ('index', 'pypln', 'PyPLN Documentation', + ['Flávio Codeço Coelho'], 1) ] diff --git a/pypln/backend/celery_app.py b/pypln/backend/celery_app.py index 342c5be..895d9cd 100644 --- a/pypln/backend/celery_app.py +++ b/pypln/backend/celery_app.py @@ -19,7 +19,7 @@ from celery import Celery from kombu import Exchange, Queue -import config +from . import config app = Celery('pypln_workers', backend='mongodb', broker='amqp://', include=['pypln.backend.workers']) diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py index 2d3d93d..0b1c235 100644 --- a/pypln/backend/celery_task.py +++ b/pypln/backend/celery_task.py @@ -31,7 +31,7 @@ from pypln.backend import config -mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS) +mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS, _connect=False) database = mongo_client[config.MONGODB_DBNAME] document_collection = database[config.MONGODB_COLLECTION] diff --git a/pypln/backend/config.py b/pypln/backend/config.py index ec1d48e..e5bd6d3 100644 --- a/pypln/backend/config.py +++ b/pypln/backend/config.py @@ -1,16 +1,12 @@ import os +import urllib.parse from decouple import config, Csv -try: - import urlparse -except ImportError: - import urllib.parse as urlparse - def parse_url(url): - urlparse.uses_netloc.append('mongodb') - urlparse.uses_netloc.append('celery') - url = urlparse.urlparse(url) + urllib.parse.uses_netloc.append('mongodb') + urllib.parse.uses_netloc.append('celery') + url = urllib.parse.urlparse(url) path = url.path[1:] path = path.split('?', 2)[0] diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py index 0125bde..9ca1ec2 100644 --- a/pypln/backend/workers/__init__.py +++ b/pypln/backend/workers/__init__.py @@ -17,18 +17,18 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . 
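The bare `import config` fixed in celery_app.py above and the import block rewritten below are the same Python 3 issue: PEP 328 removed implicit relative imports. A minimal sketch of the difference (module names are the package's own; the commented form is the one that breaks):

    # inside the pypln.backend package:
    from . import config    # explicit relative import: works on Python 2 and 3
    # import config         # implicit relative import: Python 3 searches
    #                       # sys.path for a top-level 'config' and fails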
-from extractor import Extractor -from tokenizer import Tokenizer -from freqdist import FreqDist -from pos import POS -from statistics import Statistics -from bigrams import Bigrams -from palavras_raw import PalavrasRaw -from lemmatizer_pt import Lemmatizer -from palavras_noun_phrase import NounPhrase -from palavras_semantic_tagger import SemanticTagger -from word_cloud import WordCloud -from elastic_indexer import ElasticIndexer +from .extractor import Extractor +from .tokenizer import Tokenizer +from .freqdist import FreqDist +from .pos import POS +from .statistics import Statistics +from .bigrams import Bigrams +from .palavras_raw import PalavrasRaw +from .lemmatizer_pt import Lemmatizer +from .palavras_noun_phrase import NounPhrase +from .palavras_semantic_tagger import SemanticTagger +from .word_cloud import WordCloud +from .elastic_indexer import ElasticIndexer __all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics', diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index 302482f..c99cb95 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -16,33 +16,31 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +from collections import OrderedDict -import nltk -from collections import defaultdict - -from nltk.collocations import BigramCollocationFinder +from nltk import BigramCollocationFinder, BigramAssocMeasures from pypln.backend.celery_task import PyPLNTask +METRICS = ['chi_sq', + 'dice', + 'jaccard', + 'likelihood_ratio', + 'mi_like', + 'phi_sq', + 'pmi', + 'poisson_stirling', + 'raw_freq', + 'student_t'] -class Bigrams(PyPLNTask): - """Create a NLTK bigram finder and return a table in JSON format""" +class Bigrams(PyPLNTask): def process(self, document): - #todo: support filtering by stopwords - bigram_measures = nltk.collocations.BigramAssocMeasures() - metrics = ['chi_sq', - 'dice', - 'jaccard', - 'likelihood_ratio', - 'mi_like', - 'phi_sq', - 'pmi', - 'poisson_stirling', - 'raw_freq', - 'student_t'] bigram_finder = BigramCollocationFinder.from_words(document['tokens']) - br = defaultdict(lambda :[]) - for m in metrics: - for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)): - br[res[0]].append(res[1]) - return {'metrics': metrics, 'bigram_rank': br.items()} + bigram_rankings = OrderedDict() + for metric_name in METRICS: + metric = getattr(BigramAssocMeasures, metric_name) + for ranking in bigram_finder.score_ngrams(metric): + bigram = ranking[0] + d = bigram_rankings.setdefault(bigram, {}) + d[metric_name] = ranking[1] + return {'bigram_rankings': list(bigram_rankings.items())} diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 110730b..eed8f33 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -18,16 +18,15 @@ # along with PyPLN. If not, see . 
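A minimal sketch of the NLTK calls behind the Bigrams rewrite above (the token list is made up for illustration):

    from nltk import BigramCollocationFinder, BigramAssocMeasures

    finder = BigramCollocationFinder.from_words(
        ['o', 'verme', 'roeu', 'o', 'verme'])
    # score_ngrams() returns (bigram, score) pairs sorted best-first; the
    # worker collects one entry per metric into a dict keyed by the bigram.
    for bigram, score in finder.score_ngrams(BigramAssocMeasures.pmi):
        print(bigram, score)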
 import base64
+import html
 import shlex
-from HTMLParser import HTMLParser
 from tempfile import NamedTemporaryFile
 from os import unlink
 from subprocess import Popen, PIPE
-from mimetypes import guess_type
 from re import compile as regexp_compile, DOTALL, escape

-import cld
+import pycld2 as cld
 import magic

 from pypln.backend.celery_task import PyPLNTask

@@ -46,6 +45,10 @@
              '/h2', 'h3', '/h3', 'h4', '/h4', 'h5', '/h5', 'h6', '/h6',
              'br', 'br/']
 double_breakline = ['table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']

+cld_error_re = regexp_compile('input contains invalid UTF-8 around byte '
+                              '(?P<index>\d+) \(of \d+\)')
+MAX_CLD_BYTES_TO_REMOVE = 1024
+

 def clean(text):
     text = regexp_spaces_start.sub(r'\1', text)
@@ -84,10 +87,10 @@ def parse_html(html, remove_tags=None, remove_inside=None,
                           [''] * (total_to_remove - 2)
         content_between[index + 1] = '\n'
     complete_tags.append('')
-    result = ''.join(sum(zip(content_between, complete_tags), tuple()))
+    result = ''.join(sum(list(zip(content_between, complete_tags)), tuple()))
     return clean(result)

-def get_pdf_metadata(data):
+def get_pdf_metadata(data: str) -> dict:
     lines = data.strip().splitlines()
     metadata = {}
     for line in lines:
@@ -98,7 +101,7 @@ def get_pdf_metadata(data):
             metadata[key.strip()] = value.strip()
     return metadata

-def extract_pdf(data):
+def extract_pdf(data: bytes) -> (str, dict):
     temp = NamedTemporaryFile(delete=False)
     filename = temp.name
     temp.close()
@@ -112,14 +115,16 @@ def extract_pdf(data):
         unlink(filename + '_ind.html')
         unlink(filename + 's.html')
         text = parse_html(html.replace('&nbsp;', ' '), True, ['script', 'style'])
-        pdfinfo = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE,
-                        stderr=PIPE)
-        meta_out, meta_err = pdfinfo.communicate(input=data)
+
+        info_process = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE,
+                             stderr=PIPE)
+        meta_out, meta_err = info_process.communicate(input=data)
         try:
-            metadata = get_pdf_metadata(meta_out)
-        except:
+            metadata = get_pdf_metadata(meta_out.decode('utf-8'))
+        except Exception:
+            # TODO: what should I do here?
             metadata = {}
-            #TODO: what should I do here?
+
         if not (text and metadata):
             return '', {}
         elif not html_err:
@@ -128,41 +133,57 @@
         return '', {}

-def trial_decode(text):
+def decode_text_bytes(text: bytes) -> str:
     """
-    Tries to detect text encoding using `magic`. If the detected encoding is
-    not supported, try utf-8, iso-8859-1 and ultimately falls back to decoding
-    as utf-8 replacing invalid chars with `U+FFFD` (the replacement character).
-
-    This is far from an ideal solution, but the extractor and the rest of the
-    pipeline need an unicode object.
+    Tries to detect text encoding using file magic. If that fails or the
+    detected encoding is not supported, tries using utf-8. If that doesn't
+    work, tries using iso8859-1.
     """
-    with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
-        content_encoding = m.id_buffer(text)
+    try:
+        with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
+            content_encoding = m.id_buffer(text)
+    except magic.MagicError:
+        pass  # This can happen for instance if text is a single char
+    else:
+        try:
+            return text.decode(content_encoding)
+        except LookupError:  # The detected encoding is not supported
+            pass

-    forced_decoding = False
     try:
-        result = text.decode(content_encoding)
-    except LookupError:
-        # If the detected encoding is not supported, we try to decode it as
-        # utf-8.
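    # (Illustrative aside: iso8859-1 assigns a character to every byte
    # value 0x00-0xff, so the final fallback below can never raise
    # UnicodeDecodeError, e.g. b'caf\xe9'.decode('iso8859-1') == 'café'.)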
+ result = text.decode('utf-8') + except UnicodeDecodeError: + # Decoding with iso8859-1 doesn't raise UnicodeDecodeError, so this is + # a last resort. + result = text.decode('iso8859-1') + return result + + +def detect_language(text: str) -> str: + # CLD seems to have an issue with some bytes that Python considers + # to be valid utf-8. Remove up to MAX_CLD_BYTES_TO_REMOVE of such + # "invalid" bytes + # TODO: alert the user somehow if we give up removing them + detected_language = None + text_bytes = text.encode('utf-8') + for i in range(MAX_CLD_BYTES_TO_REMOVE): try: - result = text.decode('utf-8') - except UnicodeDecodeError: - # Is there a better way of doing this than nesting try/except - # blocks? This smells really bad. - try: - result = text.decode('iso-8859-1') - except UnicodeDecodeError: - # If neither utf-8 nor iso-885901 work are capable of handling - # this text, we just decode it using utf-8 and replace invalid - # chars with U+FFFD. - # Two somewhat arbitrary decisions were made here: use utf-8 - # and use 'replace' instead of 'ignore'. - result = text.decode('utf-8', 'replace') - forced_decoding = True - - return result, forced_decoding + languages = cld.detect(text_bytes)[2] + except cld.error as exc: + message = exc.args[0] if exc.args else '' + match = cld_error_re.match(message) + if match: + byte_index = int(match.group('index')) + text_bytes = (text_bytes[:byte_index] + + text_bytes[byte_index + 1:]) + else: + raise + else: + if languages: + detected_language = languages[0][1] + break + + return detected_language class Extractor(PyPLNTask): @@ -173,11 +194,12 @@ def process(self, file_data): contents = base64.b64decode(file_data['contents']) with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m: file_mime_type = m.id_buffer(contents) + metadata = {} - if file_mime_type == 'text/plain': - text = contents - elif file_mime_type == 'text/html': - text = parse_html(contents, True, ['script', 'style']) + if file_mime_type in ('text/plain', 'text/html'): + text = decode_text_bytes(contents) + if file_mime_type == 'text/html': + text = parse_html(text, True, ['script', 'style']) elif file_mime_type == 'application/pdf': text, metadata = extract_pdf(contents) else: @@ -191,22 +213,10 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - text, forced_decoding = trial_decode(text) - - if isinstance(text, unicode): - # HTMLParser only handles unicode objects. We can't pass the text - # through it if we don't know the encoding, and it's possible we - # also shouldn't. There's no way of knowing if it's a badly encoded - # html or a binary blob that happens do have bytes that look liked - # html entities. 
- text = HTMLParser().unescape(text) - + text = html.unescape(text) text = clean(text) - - if isinstance(text, unicode): - language = cld.detect(text.encode('utf-8'))[1] - else: - language = cld.detect(text)[1] - - return {'text': text, 'file_metadata': metadata, 'language': language, - 'mimetype': file_mime_type, 'forced_decoding': forced_decoding} + return {'text': text, + 'file_metadata': metadata, + 'language': detect_language(text), + 'mimetype': file_mime_type, + 'forced_decoding': None} diff --git a/pypln/backend/workers/freqdist.py b/pypln/backend/workers/freqdist.py index 7bb7f7e..bdf3712 100644 --- a/pypln/backend/workers/freqdist.py +++ b/pypln/backend/workers/freqdist.py @@ -27,7 +27,7 @@ def process(self, document): tokens = [info.lower() for info in document_tokens] frequency_distribution = {token: tokens.count(token) \ for token in set(tokens)} - fd = frequency_distribution.items() - fd.sort(lambda x, y: cmp(y[1], x[1])) + fd = list(frequency_distribution.items()) + fd.sort(key=lambda x: (-x[1], x[0])) return {'freqdist': fd} diff --git a/pypln/backend/workers/palavras_noun_phrase.py b/pypln/backend/workers/palavras_noun_phrase.py index 76e3a18..f9dde80 100644 --- a/pypln/backend/workers/palavras_noun_phrase.py +++ b/pypln/backend/workers/palavras_noun_phrase.py @@ -40,7 +40,7 @@ def process(self, document): stdout=subprocess.PIPE, stderr=subprocess.PIPE) palavras_output = document['palavras_raw'] - if isinstance(palavras_output, unicode): + if isinstance(palavras_output, str): # we *need* to send a 'str' to the process. Otherwise it's going to try to use ascii. palavras_output = palavras_output.encode('utf-8') stdout, stderr = process.communicate(palavras_output) diff --git a/pypln/backend/workers/palavras_raw.py b/pypln/backend/workers/palavras_raw.py index 77e2d9a..95161ba 100644 --- a/pypln/backend/workers/palavras_raw.py +++ b/pypln/backend/workers/palavras_raw.py @@ -39,14 +39,15 @@ def process(self, document): text = document['text'] - # For some reason, in some pypln installations the document['text'] is - # not always unicode as it should be. This may be due to errors during - # the decoding process that we fixed earlier. That meant that, when we - # got a non-unicode string, python would try to decode it using the - # default codec (ascii) in `text.encode(PALAVRAS_ENCODING)`. Since we - # know the text came from mongodb, we can just decode it using utf-8 to - # make sure we have a unicode object. - if not isinstance(text, unicode): + # This code is here because when using python2 for some + # reason, sometimes document['text'] was not a unicode object + # (as it should be, coming from pymongo). Since we're now + # using python3, we should really always get a str (unicode) + # object. But, since we do not know the real reason for the + # original error, we will keep this code here for now. As + # before, if we receive a bytes object, since it came from + # mongodb we can be sure it will be encoded in utf-8. 
+ if isinstance(text, bytes): text = text.decode('utf-8') process = subprocess.Popen([BASE_PARSER, PARSER_MODE], @@ -55,4 +56,4 @@ def process(self, document): stderr=subprocess.PIPE) stdout, stderr = process.communicate(text.encode(PALAVRAS_ENCODING)) - return {'palavras_raw': stdout, 'palavras_raw_ran': True} + return {'palavras_raw': stdout.decode('utf-8'), 'palavras_raw_ran': True} diff --git a/pypln/backend/workers/palavras_semantic_tagger.py b/pypln/backend/workers/palavras_semantic_tagger.py index 3f35ca5..a66d42c 100644 --- a/pypln/backend/workers/palavras_semantic_tagger.py +++ b/pypln/backend/workers/palavras_semantic_tagger.py @@ -26,381 +26,381 @@ { 'Animal': { - '': u'Animal, umbrella tag (clone, fêmea, fóssil, parasito, predador)' , - '': u'Group of animals (cardume, enxame, passarada, ninhada)', - '': u'Domestic animal or big mammal (likely to have female forms etc.: terneiro, leão/leoa, cachorro)', - '': u'Group of domestic animals (boiada)', - '': u'Water-animal (tubarão, delfim)', - '': u'Mythological animal (basilisco)', - '': u'Land-animal (raposa)', - '': u'Bird (águia, bem-te-vi)', - '': u'Insect (borboleta)', - '': u'Cell-animal (bacteria, blood cells: linfócito)', + '': 'Animal, umbrella tag (clone, fêmea, fóssil, parasito, predador)' , + '': 'Group of animals (cardume, enxame, passarada, ninhada)', + '': 'Domestic animal or big mammal (likely to have female forms etc.: terneiro, leão/leoa, cachorro)', + '': 'Group of domestic animals (boiada)', + '': 'Water-animal (tubarão, delfim)', + '': 'Mythological animal (basilisco)', + '': 'Land-animal (raposa)', + '': 'Bird (águia, bem-te-vi)', + '': 'Insect (borboleta)', + '': 'Cell-animal (bacteria, blood cells: linfócito)', }, 'Plant': { - '': u'Plant, umbrella tag', - '': u'Group of plants, plantation (field, forest etc.: mata, nabal)', - '': u'Tree (oliveira, palmeira)', - '': u'Flower (rosa, taraxaco)', - '': u'Bush, shrub (rododendro, tamariz)', - '': u'(fruit, berries, nuts: maçã, morango, avelã, melancia)', - '': u'(vegetable espargo, funcho)', + '': 'Plant, umbrella tag', + '': 'Group of plants, plantation (field, forest etc.: mata, nabal)', + '': 'Tree (oliveira, palmeira)', + '': 'Flower (rosa, taraxaco)', + '': 'Bush, shrub (rododendro, tamariz)', + '': '(fruit, berries, nuts: maçã, morango, avelã, melancia)', + '': '(vegetable espargo, funcho)', }, 'Human': { - '': u'Human, umbrella tag', - '': u'Group of humans (organisations, teams, companies, e.g. editora)', - '': u'Attributive human umbrella tag (many -ista, -ante)', - '': u'Human classified by biological criteria (race, age etc., caboclo, mestiço, bebé, adulto)', - '': u'Human with family or other private relation (pai, noiva)', - '': u'Ideological human (comunista, implies ), also: follower, disciple (dadaista)', - '': u'Humanoid mythical (gods, fairy tale humanoids, curupira, duende)', - '': u'Nationality human (brasileiro, alemão), also: inhabitant (lisboeta)', - '': u'Professional human (marinheiro, implies ), also: sport, hobby (alpinista)', - '': u'Sick human (few: asmático, diabético, cp )', - '': u'Title noun (rei, senhora)', + '': 'Human, umbrella tag', + '': 'Group of humans (organisations, teams, companies, e.g. 
editora)', + '': 'Attributive human umbrella tag (many -ista, -ante)', + '': 'Human classified by biological criteria (race, age etc., caboclo, mestiço, bebé, adulto)', + '': 'Human with family or other private relation (pai, noiva)', + '': 'Ideological human (comunista, implies ), also: follower, disciple (dadaista)', + '': 'Humanoid mythical (gods, fairy tale humanoids, curupira, duende)', + '': 'Nationality human (brasileiro, alemão), also: inhabitant (lisboeta)', + '': 'Professional human (marinheiro, implies ), also: sport, hobby (alpinista)', + '': 'Sick human (few: asmático, diabético, cp )', + '': 'Title noun (rei, senhora)', }, 'Place and spatial': { - '': u'Place, umbrella tag', - '': u'Abstract place (anverso. auge)', - '': u'Civitas, town, country, county (equals + , cidade, país)', - '': u'Cover, lid (colcha, lona, tampa)', - '': u'Functional place, human built or human-used (aeroporto, anfiteatro, cp. for just a building)', - '': u'opening, hole (apertura, fossa)', - '': u'Path (road, street etc.: rua, pista)' , - '': u'Star object (planets, comets: planeta, quasar)', - '': u'surface (face, verniz, cp. )', - '': u'tip place, edge (pico, pontinha, cp. )', - '': u'Geographical, natural place (promontório, pântano)', - '': u'trap place (armadilha, armazelo)', - '': u'Water place (river, lake, sea: fonte, foz, lagoa)', - '': u'barrier noun (dique, limite, muralha)', - '': u'(building)', - '': u'(institution)', - '': u'(picture)', - '': u'(situation)', - '': u'anatomical/body position (few: desaprumo)', - '': u'social position, job (emprego, condado, capitania, presidência)', + '': 'Place, umbrella tag', + '': 'Abstract place (anverso. auge)', + '': 'Civitas, town, country, county (equals + , cidade, país)', + '': 'Cover, lid (colcha, lona, tampa)', + '': 'Functional place, human built or human-used (aeroporto, anfiteatro, cp. for just a building)', + '': 'opening, hole (apertura, fossa)', + '': 'Path (road, street etc.: rua, pista)' , + '': 'Star object (planets, comets: planeta, quasar)', + '': 'surface (face, verniz, cp. )', + '': 'tip place, edge (pico, pontinha, cp. )', + '': 'Geographical, natural place (promontório, pântano)', + '': 'trap place (armadilha, armazelo)', + '': 'Water place (river, lake, sea: fonte, foz, lagoa)', + '': 'barrier noun (dique, limite, muralha)', + '': '(building)', + '': '(institution)', + '': '(picture)', + '': '(situation)', + '': 'anatomical/body position (few: desaprumo)', + '': 'social position, job (emprego, condado, capitania, presidência)', }, 'Vehicle': { - '': u'Vehicle, umbrella tag and ground vehicle (car, train: carro, comboio, tanque, teleférico)', - '': u'Group of vehicles (armada, convoy: frota, esquadra)', - '': u'Water vehicle (ship: navio, submersível, canoa)', - '': u'Air vehicle (plane: hidroplano, jatinho)', + '': 'Vehicle, umbrella tag and ground vehicle (car, train: carro, comboio, tanque, teleférico)', + '': 'Group of vehicles (armada, convoy: frota, esquadra)', + '': 'Water vehicle (ship: navio, submersível, canoa)', + '': 'Air vehicle (plane: hidroplano, jatinho)', }, 'Abstract': { - '': u'Abstract countable, umbrella tag (alternativa, chance, lazer)', - '': u'Category word (latinismo, número atômico)', - '': u'sign, symbol (parêntese, semicolcheia)', - '': u'Abstract mass/non-countable, umbrella tag (still contains many cases that could be , e.g. habilidade, legalidade)', - '': u'Abstract/concept, neither countable nor mass (endogamia), cp. 
, etc.', - '': u'(features)', - '': u'direction noun (estibordo, contrasenso, norte)', - '': u'(shapes)', - '': u'meta noun (tipo, espécie)', - '': u'(MARCA) brand', - '': u'(DISCIPLINA) subject matter', - '': u'(ESCOLA) school of thought', - '': u'(IDEA) idea, concept', - '': u'(PLANO) named plan, project', - '': u'(OBRA) artist-s name, standing for body of work', - '': u'(NOME)', - '': u'(ESTADO) physiological state, in particular: disease', + '': 'Abstract countable, umbrella tag (alternativa, chance, lazer)', + '': 'Category word (latinismo, número atômico)', + '': 'sign, symbol (parêntese, semicolcheia)', + '': 'Abstract mass/non-countable, umbrella tag (still contains many cases that could be , e.g. habilidade, legalidade)', + '': 'Abstract/concept, neither countable nor mass (endogamia), cp. , etc.', + '': '(features)', + '': 'direction noun (estibordo, contrasenso, norte)', + '': '(shapes)', + '': 'meta noun (tipo, espécie)', + '': '(MARCA) brand', + '': '(DISCIPLINA) subject matter', + '': '(ESCOLA) school of thought', + '': '(IDEA) idea, concept', + '': '(PLANO) named plan, project', + '': '(OBRA) artist-s name, standing for body of work', + '': '(NOME)', + '': '(ESTADO) physiological state, in particular: disease', }, 'Concept': { - '': u'convention (social rule or law, lei, preceito)', - '': u'subject matter, profession, cf. , anatomia, citricultura, dactilografia)', - '': u'ideology or other value system (anarquismo, anti-ocidentalismo, apartheid)', - '': u'', - '': u'language (alemão, catalão, bengali)', - '': u'', - '': u'', - '': u'therapy (also and , acupuntura, balneoterapia)', + '': 'convention (social rule or law, lei, preceito)', + '': 'subject matter, profession, cf. , anatomia, citricultura, dactilografia)', + '': 'ideology or other value system (anarquismo, anti-ocidentalismo, apartheid)', + '': '', + '': 'language (alemão, catalão, bengali)', + '': '', + '': '', + '': 'therapy (also and , acupuntura, balneoterapia)', }, 'Game': { - '': u'play, game (bilhar, ioiô, poker, also )', + '': 'play, game (bilhar, ioiô, poker, also )', }, 'Genre': { - '': u'genre (especially art genre, cf. , modernismo, tropicalismo)', + '': 'genre (especially art genre, cf. , modernismo, tropicalismo)', }, 'Quantity': { - '': u'', - '': u'quantity noun (bocada, teor, sem-fim)', - '': u'currency noun (countable, implies , cf. , dirham, euro, real, dólar)', - '': u'amount of money (bolsa, custo, imposto, cf. )', + '': '', + '': 'quantity noun (bocada, teor, sem-fim)', + '': 'currency noun (countable, implies , cf. , dirham, euro, real, dólar)', + '': 'amount of money (bolsa, custo, imposto, cf. 
)', }, 'Action': { - '': u'Action umbrella tag (+CONTROL, PERFECTIVE)', - '': u'beat-action (thrashing, pancada, surra)', - '': u'do-action (typically dar/fazer + N, tentativa, teste, homenagem)', - '': u'speech act or communicative act (proposta, ordem)', - '': u'trick-action (cheat, fraud, ruse, jeito, fraude, similar to )', - '': u'Activity, umbrella tag (+CONTROL, IMPERFECTIVE, correria, manejo)', - '': u'', - '': u'', - '': u'', - '': u'dance (both , and , calipso, flamenco, forró)', - '': u'fight, conflict (also and +TEMP, briga, querela)', - '': u'speech situation, talk, discussion, quarrel (implies and , entrevista, lero-lero)', + '': 'Action umbrella tag (+CONTROL, PERFECTIVE)', + '': 'beat-action (thrashing, pancada, surra)', + '': 'do-action (typically dar/fazer + N, tentativa, teste, homenagem)', + '': 'speech act or communicative act (proposta, ordem)', + '': 'trick-action (cheat, fraud, ruse, jeito, fraude, similar to )', + '': 'Activity, umbrella tag (+CONTROL, IMPERFECTIVE, correria, manejo)', + '': '', + '': '', + '': '', + '': 'dance (both , and , calipso, flamenco, forró)', + '': 'fight, conflict (also and +TEMP, briga, querela)', + '': 'speech situation, talk, discussion, quarrel (implies and , entrevista, lero-lero)', }, 'Anatomical': { - '': u'Anatomical noun, umbrella tag (carótida, clítoris, dorso)', - '': u'Movable anatomy (arm, leg, braço, bíceps, cotovelo)', - '': u'Organ (heart, liver, hipófise, coração, testículo)', - '': u'Bone (calcâneo, fíbula, vértebra)', - '': u'Animal anatomy (rúmen, carapaça, chifres, tromba)', - '': u'Bird anatomy (bico, pluma)', - '': u'Fish anatomy (few: bránquias, siba)', - '': u'Insect anatomy (few: tentáculo, olho composto)', - '': u'Plant anatomy (bulbo, caule, folha)', - '': u'(human anatomical feature)', + '': 'Anatomical noun, umbrella tag (carótida, clítoris, dorso)', + '': 'Movable anatomy (arm, leg, braço, bíceps, cotovelo)', + '': 'Organ (heart, liver, hipófise, coração, testículo)', + '': 'Bone (calcâneo, fíbula, vértebra)', + '': 'Animal anatomy (rúmen, carapaça, chifres, tromba)', + '': 'Bird anatomy (bico, pluma)', + '': 'Fish anatomy (few: bránquias, siba)', + '': 'Insect anatomy (few: tentáculo, olho composto)', + '': 'Plant anatomy (bulbo, caule, folha)', + '': '(human anatomical feature)', }, 'Thing': { - '': u'Concrete countable object, umbrella tag (briquete, coágulo, normally movable things, unlike )', - '': u'Artifact, umbrella tag (so far empty category in PALAVRAS)', - '': u'ornamental object (few: guirlanda, rufo)', - '': u'flat long object (few: board, plank, lousa, tabla)', - '': u'fire object (bonfire, spark, chispa, fogo, girândola)', - '': u'handle (garra, ansa, chupadouro)', - '': u'light artifact (lampião, farol, projector) ', - '': u'(atomic) particle (few: cátion, eletrônio)', - '': u'read object (carteira, cupom, bilhete, carta, cf. )', - '': u'cloth object (towel, napkin, carpet, rag) , cp. ', - '': u'(cc-round) stones and stone-sized round objects (pedra, itá, amonite, tijolo)', - '': u'stick object (long and thin, vara, lançe, paulito)', - '': u'(OBJECT) named object', - '': u'(OBJECT) common noun used as name', - '': u'(SUBSTANCIA) substance', - '': u'(CLASSE) classification category for things', - '': u'(CLASSE) plant name', - '': u'(MOEDA) currency name (also marked on the number)', - '': u'mass noun (e.g. 
"leite", "a-gua")', - '': u'furniture (cama, cadeira, tambo, quadro)', - '': u'container (implies quantifying, ampola, chícara, aquário)', + '': 'Concrete countable object, umbrella tag (briquete, coágulo, normally movable things, unlike )', + '': 'Artifact, umbrella tag (so far empty category in PALAVRAS)', + '': 'ornamental object (few: guirlanda, rufo)', + '': 'flat long object (few: board, plank, lousa, tabla)', + '': 'fire object (bonfire, spark, chispa, fogo, girândola)', + '': 'handle (garra, ansa, chupadouro)', + '': 'light artifact (lampião, farol, projector) ', + '': '(atomic) particle (few: cátion, eletrônio)', + '': 'read object (carteira, cupom, bilhete, carta, cf. )', + '': 'cloth object (towel, napkin, carpet, rag) , cp. ', + '': '(cc-round) stones and stone-sized round objects (pedra, itá, amonite, tijolo)', + '': 'stick object (long and thin, vara, lançe, paulito)', + '': '(OBJECT) named object', + '': '(OBJECT) common noun used as name', + '': '(SUBSTANCIA) substance', + '': '(CLASSE) classification category for things', + '': '(CLASSE) plant name', + '': '(MOEDA) currency name (also marked on the number)', + '': 'mass noun (e.g. "leite", "a-gua")', + '': 'furniture (cama, cadeira, tambo, quadro)', + '': 'container (implies quantifying, ampola, chícara, aquário)', }, 'Substance': { - '': u'concrete mass/non-countable, umbrella tag, substance (cf. , terra, choça, magma)', - '': u'human-made substance (cf. , cemento)', - '': u'chemical substance, also biological (acetileno, amônio, anilina, bilirrubina', - '': u'gas substance (so far few: argônio, overlap with. and )', - '': u'liquid substance (azeite, gasolina, plasma, overlap with and )', - '': u'remedy (medical or hygiene, antibiótico, canabis, quinina, part of , overlap with )', + '': 'concrete mass/non-countable, umbrella tag, substance (cf. , terra, choça, magma)', + '': 'human-made substance (cf. , cemento)', + '': 'chemical substance, also biological (acetileno, amônio, anilina, bilirrubina', + '': 'gas substance (so far few: argônio, overlap with. and )', + '': 'liquid substance (azeite, gasolina, plasma, overlap with and )', + '': 'remedy (medical or hygiene, antibiótico, canabis, quinina, part of , overlap with )', }, 'Materials': { - '': u'material (argila, bronze, granito, cf. )', - '': u'cloth material (seda, couro, vison, kevlar), cp. ', - '': u'cord, string, rope, tape (previously , arame, fio, fibrila)', + '': 'material (argila, bronze, granito, cf. )', + '': 'cloth material (seda, couro, vison, kevlar), cp. ', + '': 'cord, string, rope, tape (previously , arame, fio, fibrila)', }, 'Clothing': { - '': u'animal clothing (sela, xabraque)', - '': u'human clothing (albornoz, anoraque, babadouro, bermudas)', - '': u'beauty clothing (e.g. jewelry, diadema, pendente, pulseira)', - '': u'hat (sombrero, mitra, coroa)', - '': u'shoe (bota, chinela, patim)', - '': u'cloth material (seda, couro, vison, kevlar), cp. ', - '': u'(clothing)', + '': 'animal clothing (sela, xabraque)', + '': 'human clothing (albornoz, anoraque, babadouro, bermudas)', + '': 'beauty clothing (e.g. jewelry, diadema, pendente, pulseira)', + '': 'hat (sombrero, mitra, coroa)', + '': 'shoe (bota, chinela, patim)', + '': 'cloth material (seda, couro, vison, kevlar), cp. 
', + '': '(clothing)', }, 'Collective': { - '': u'set,collective (random or systematic collection/compound/multitude of similar but distinct small parts, conjunto, série)', - '': u'thing collective, pile (baralho, lanço)', - '': u'plant-part collective (buquê, folhagem)', - '': u'semantic collective, collection (arquivo, repertório)', - '': u'tool collective, set (intrumentário, prataria)', - '': u'(group)', - '': u'(herd)', - '': u'(plantation)', - '': u'(convoy)', + '': 'set,collective (random or systematic collection/compound/multitude of similar but distinct small parts, conjunto, série)', + '': 'thing collective, pile (baralho, lanço)', + '': 'plant-part collective (buquê, folhagem)', + '': 'semantic collective, collection (arquivo, repertório)', + '': 'tool collective, set (intrumentário, prataria)', + '': '(group)', + '': '(herd)', + '': '(plantation)', + '': '(convoy)', }, 'Time_Event': { - '': u'duration noun (test: durar+, implies , e.g. átimo, mês, hora)', - '': u'temporal object, point in time (amanhecer, novilúnio, test: até+, cf. and )', - '': u'non-organised event (-CONTROL, PERFECTIVE, milagre, morte)', - '': u'occasion, human/social event (copa do mundo, aniversário, jantar, desfile, cp. unorganized ) ', - '': u'process (-CONTROL, -PERFECTIVE, cp. , balcanização, convecção, estagnação)', - '': u'', - '': u'', - '': u'(EFEMERIDE) one-time [historical] occurrence', - '': u'(DATA) date', - '': u'(HORA) hour', - '': u'(PERIODO) period', - '': u'(CICLICO) cyclic time expression', - '': u'month noun/name (agosto, julho, part of )', - '': u'period of time (prototypical test: durante, e.g. guerra, década, cf. and )', + '': 'duration noun (test: durar+, implies , e.g. átimo, mês, hora)', + '': 'temporal object, point in time (amanhecer, novilúnio, test: até+, cf. and )', + '': 'non-organised event (-CONTROL, PERFECTIVE, milagre, morte)', + '': 'occasion, human/social event (copa do mundo, aniversário, jantar, desfile, cp. unorganized ) ', + '': 'process (-CONTROL, -PERFECTIVE, cp. , balcanização, convecção, estagnação)', + '': '', + '': '', + '': '(EFEMERIDE) one-time [historical] occurrence', + '': '(DATA) date', + '': '(HORA) hour', + '': '(PERIODO) period', + '': '(CICLICO) cyclic time expression', + '': 'month noun/name (agosto, julho, part of )', + '': 'period of time (prototypical test: durante, e.g. guerra, década, cf. and )', }, 'Feature': { - '': u'feature/property, umbrella tag (problematicidade, proporcionalidade)', - '': u'anatomical "local" feature, includes countables, e.g. barbela, olheiras)', - '': u'general countable feature (vestígio, laivos, vinco)', - '': u'human physical feature, not countable (lindura, compleição, same as , cp. anatomical local features )', - '': u'', - '': u'human psychological feature (passionalidade, pavonice, cp. passing states )', - '': u'quantifiable feature (e.g. circunferência, calor, DanGram-s covers both and )', - '': u'', - '': u'human social feature (right or duty): e.g. copyright, privilégio, imperativo legal)', - '': u'', - '': u'(human state)', + '': 'feature/property, umbrella tag (problematicidade, proporcionalidade)', + '': 'anatomical "local" feature, includes countables, e.g. barbela, olheiras)', + '': 'general countable feature (vestígio, laivos, vinco)', + '': 'human physical feature, not countable (lindura, compleição, same as , cp. anatomical local features )', + '': '', + '': 'human psychological feature (passionalidade, pavonice, cp. passing states )', + '': 'quantifiable feature (e.g. 
circunferência, calor, DanGram-s covers both and )', + '': '', + '': 'human social feature (right or duty): e.g. copyright, privilégio, imperativo legal)', + '': '', + '': '(human state)', }, 'Food': { - '': u'natural/simplex food (aveia, açúcar, carne, so far including )', - '': u'countable food (few: ovo, dente de alho, most are or )', - '': u'human-prepared/complex culinary food (caldo verde, lasanha)', - '': u'culinary countable food (biscoito, enchido, panetone, pastel)', - '': u'drink (cachaça, leite, guaraná, moca)', - '': u'fruit, berry, nut (still mostly marked as , abricote, amora, avelã, cebola)', - '': u'condiments, pepper', + '': 'natural/simplex food (aveia, açúcar, carne, so far including )', + '': 'countable food (few: ovo, dente de alho, most are or )', + '': 'human-prepared/complex culinary food (caldo verde, lasanha)', + '': 'culinary countable food (biscoito, enchido, panetone, pastel)', + '': 'drink (cachaça, leite, guaraná, moca)', + '': 'fruit, berry, nut (still mostly marked as , abricote, amora, avelã, cebola)', + '': 'condiments, pepper', }, 'Part': { - '': u'distinctive or functional part (ingrediente, parte, trecho)', - '': u'structural part of building or vehicle (balustrada, porta, estai)', - '': u'indistinctive (little) piece (pedaço, raspa)', - '': u'', - '': u'', + '': 'distinctive or functional part (ingrediente, parte, trecho)', + '': 'structural part of building or vehicle (balustrada, porta, estai)', + '': 'indistinctive (little) piece (pedaço, raspa)', + '': '', + '': '', }, 'Perception': { - '': u'what you feel (senses or sentiment, pain, e.g. arrepio, aversão, desagrado, cócegas, some overlap with )', - '': u'sound (what you hear, apitadela, barrulho, berro, crepitação)', - '': u'olfactory impression (what you smell, bafo, chamuscom fragrância)', - '': u'what you taste (PALAVRAS: not implemented)', - '': u'visual impression (what you see, arco-iris, réstia, vislumbre)', + '': 'what you feel (senses or sentiment, pain, e.g. arrepio, aversão, desagrado, cócegas, some overlap with )', + '': 'sound (what you hear, apitadela, barrulho, berro, crepitação)', + '': 'olfactory impression (what you smell, bafo, chamuscom fragrância)', + '': 'what you taste (PALAVRAS: not implemented)', + '': 'visual impression (what you see, arco-iris, réstia, vislumbre)', }, 'Semantic Product': { - '': u'semiotic artifact, work of art, umbrella tag (all specified in PALAVRAS)', - '': u'cognition product (concept, plan, system, conjetura, esquema, plano, prejuízo)', - '': u'listen-work (music, cantarola, prelúdio, at the same time : bossa nova)', - '': u'nonsense, rubbish (implies , galimatias, farelório)', - '': u'read-work (biografia, dissertação, e-mail, ficha cadastral)', - '': u'speak-work (palestra, piada, exposto)', - '': u'watch-work (filme, esquete, mininovela)', - '': u'(speach act)', - '': u'', + '': 'semiotic artifact, work of art, umbrella tag (all specified in PALAVRAS)', + '': 'cognition product (concept, plan, system, conjetura, esquema, plano, prejuízo)', + '': 'listen-work (music, cantarola, prelúdio, at the same time : bossa nova)', + '': 'nonsense, rubbish (implies , galimatias, farelório)', + '': 'read-work (biografia, dissertação, e-mail, ficha cadastral)', + '': 'speak-work (palestra, piada, exposto)', + '': 'watch-work (filme, esquete, mininovela)', + '': '(speach act)', + '': '', }, 'Disease': { - '': u'disease (acne, AIDS, sida, alcoolismo, cp. 
)', - '': u'', - '': u'countable disease-object (abscesso, berruga, cicatriz, gangrena)', + '': 'disease (acne, AIDS, sida, alcoolismo, cp. )', + '': '', + '': 'countable disease-object (abscesso, berruga, cicatriz, gangrena)', }, 'State-of-affairs': { - '': u'psychological situation or physical state of affairs (reclusão, arruaça, ilegalidade, more complex and more "locative" than and ', - '': u'state (of something, otherwise ), abundância, calma, baixa-mar, equilíbrio', - '': u'human state (desamparo, desesperança, dormência, euforia, febre', - '': u'', - '': u'', + '': 'psychological situation or physical state of affairs (reclusão, arruaça, ilegalidade, more complex and more "locative" than and ', + '': 'state (of something, otherwise ), abundância, calma, baixa-mar, equilíbrio', + '': 'human state (desamparo, desesperança, dormência, euforia, febre', + '': '', + '': '', }, 'Sport': { - '': u'sport (capoeira, futebol, golfe, also and )', + '': 'sport (capoeira, futebol, golfe, also and )', }, 'Tool': { - '': u'tool, umbrella tag (abana-moscas, lápis, computador, maceta, "handable", cf. )', - '': u'cutting tool, knife (canivete, espada)', - '': u'shooting tool, gun (carabina, metralhadora, helicanão, in Dangram: )', - '': u'musical instrument (clavicórdio, ocarina, violão)', - '': u'sailing tool, sail (vela latina, joanete, coringa)', - '': u'machine (complex, usually with moving parts, betoneira, embrulhador, limpa-pratos, cp. )', - '': u'tube object (cânula, gasoduto, zarabatana, shape-category, typically with another category, like or )', + '': 'tool, umbrella tag (abana-moscas, lápis, computador, maceta, "handable", cf. )', + '': 'cutting tool, knife (canivete, espada)', + '': 'shooting tool, gun (carabina, metralhadora, helicanão, in Dangram: )', + '': 'musical instrument (clavicórdio, ocarina, violão)', + '': 'sailing tool, sail (vela latina, joanete, coringa)', + '': 'machine (complex, usually with moving parts, betoneira, embrulhador, limpa-pratos, cp. )', + '': 'tube object (cânula, gasoduto, zarabatana, shape-category, typically with another category, like or )', }, 'Unit': { - '': u'unit noun (always implying , implied by and , e.g. caloria, centímetro, lúmen))', + '': 'unit noun (always implying , implied by and , e.g. caloria, centímetro, lúmen))', }, 'Weather': { - '': u'weather (states), umbrella tag (friagem, bruma)', - '': u'countable weather phenomenon (nuvem, tsunami)', - '': u'rain and other precipitation (chuvisco, tromba d-água, granizo)', - '': u'wind, storm (brisa, furacão)', + '': 'weather (states), umbrella tag (friagem, bruma)', + '': 'countable weather phenomenon (nuvem, tsunami)', + '': 'rain and other precipitation (chuvisco, tromba d-água, granizo)', + '': 'wind, storm (brisa, furacão)', }, 'Person': { - '': u'(INDIVIDUAL) person name (cp. )', - '': u'(CARGO) official function (~ cp. and )', - '': u'(MEMBRO) member', + '': '(INDIVIDUAL) person name (cp. )', + '': '(CARGO) official function (~ cp. and )', + '': '(MEMBRO) member', }, 'Organization_Group': { - '': u'(ADMINISTRACAO, ORG.) administrative body (government, town administration etc.)', - '': u'(INSTITUICAO/EMPRESA) commercial or non-commercial, non-administrative non-party organisations (not place-bound, therefore not the same as )', - '': u'(EMPRESA) organized site (e.g. restaurant, cp. )', - '': u'(EMPRESA) media organisation (e.g. 
newspaper, tv channel)', - '': u'(INSTITUICAO) political party', - '': u'(SUB) organized part of any of the above', - '': u'currently unsupported: (EMPRESA) company (not site-bound, unlike , now fused with. )', + '': '(ADMINISTRACAO, ORG.) administrative body (government, town administration etc.)', + '': '(INSTITUICAO/EMPRESA) commercial or non-commercial, non-administrative non-party organisations (not place-bound, therefore not the same as )', + '': '(EMPRESA) organized site (e.g. restaurant, cp. )', + '': '(EMPRESA) media organisation (e.g. newspaper, tv channel)', + '': '(INSTITUICAO) political party', + '': '(SUB) organized part of any of the above', + '': 'currently unsupported: (EMPRESA) company (not site-bound, unlike , now fused with. )', }, 'Group': { - '': u'(GROUPOIND) people, family', - '': u'(GROUPOCARGO) board, government (not fully implemented)', - '': u'currently unsupported (GROUPOMEMBRO) club, e.g. football club (now fused with )', + '': '(GROUPOIND) people, family', + '': '(GROUPOCARGO) board, government (not fully implemented)', + '': 'currently unsupported (GROUPOMEMBRO) club, e.g. football club (now fused with )', }, 'Place': { - '': u'(GEOGRAFICO) geographical location (cp. )', - '': u'(ADMINISTRACAO, LOC.) civitas (country, town, state, cp. )', - '
': u'(CORREIO) address (including numbers etc.)', - '': u'(ALARGADO) functional place (cp. )', - '': u'(VIRTUAL) virtual place', - '': u'(OBJECTO) astronomical place (in HAREM object, not place)', - '': u'suggested (ALARGADO) roads, motorway (unlike
)', + '': '(GEOGRAFICO) geographical location (cp. )', + '': '(ADMINISTRACAO, LOC.) civitas (country, town, state, cp. )', + '
': '(CORREIO) address (including numbers etc.)', + '': '(ALARGADO) functional place (cp. )', + '': '(VIRTUAL) virtual place', + '': '(OBJECTO) astronomical place (in HAREM object, not place)', + '': 'suggested (ALARGADO) roads, motorway (unlike
)', }, 'Work_of_Art': { - '': u'(REPRODUZIDO) [title of] reproduced work, copy', - '': u'(PUBLICACAO) [scientific] publication', - '': u'(PRODUTO) product brand', - '': u'(PRODUTO) vehicle brand (cp. , , )', - '': u'(ARTE) work of art', - '': u'picture (combination of , and , caricatura, cintilograma, diapositivo)', + '': '(REPRODUZIDO) [title of] reproduced work, copy', + '': '(PUBLICACAO) [scientific] publication', + '': '(PRODUTO) product brand', + '': '(PRODUTO) vehicle brand (cp. , , )', + '': '(ARTE) work of art', + '': 'picture (combination of , and , caricatura, cintilograma, diapositivo)', }, 'Colours': { - '': u'colours', + '': 'colours', }, 'Numeric_and_Math': { - '': u'(QUANTIDADE) simple measuring numeral', - '': u'(CLASSIFICADO) predicating numeral', - '': u'(MOEDA) currency name (also marked on the unit)', - '': u'geometry noun (circle, shape, e.g. losango, octógono, elipse)', - '': u'line (few: linha, percentil, curvas isobáricas)', + '': '(QUANTIDADE) simple measuring numeral', + '': '(CLASSIFICADO) predicating numeral', + '': '(MOEDA) currency name (also marked on the unit)', + '': 'geometry noun (circle, shape, e.g. losango, octógono, elipse)', + '': 'line (few: linha, percentil, curvas isobáricas)', }, 'Modifying_Adjectives': { - '': u'adjective modifying human noun', - '': u'adjective modifying inanimate noun ', - '': u'adjective modifying animal', - '': u'adjective modifying plant', - '': u'color adjective', - '': u'nationality adjective (also: from a certain town etc.)', - '': u'(human) attributive adjective (not fully implemented, cp. , e.g. "um presidente COMUNISTA")', + '': 'adjective modifying human noun', + '': 'adjective modifying inanimate noun ', + '': 'adjective modifying animal', + '': 'adjective modifying plant', + '': 'color adjective', + '': 'nationality adjective (also: from a certain town etc.)', + '': '(human) attributive adjective (not fully implemented, cp. , e.g. "um presidente COMUNISTA")', }, 'Verbs_related_human_things': { - '': u'verb with human subject', - '': u'verb with inanimate subject', + '': 'verb with human subject', + '': 'verb with inanimate subject', }, } diff --git a/pypln/backend/workers/pos/__init__.py b/pypln/backend/workers/pos/__init__.py index 9400fd1..4647189 100644 --- a/pypln/backend/workers/pos/__init__.py +++ b/pypln/backend/workers/pos/__init__.py @@ -18,8 +18,8 @@ # along with PyPLN. If not, see . -import en_nltk -import pt_palavras +from . import en_nltk +from . 
import pt_palavras

 from pypln.backend.workers.palavras_raw import palavras_installed
 from pypln.backend.celery_task import PyPLNTask
@@ -48,7 +48,7 @@ def process(self, document):
         if language in MAPPING:
             tagset, tagged_text = MAPPING[language](document)
             text = document['text']
-            if not isinstance(text, unicode):
+            if not isinstance(text, str):
                 text = text.decode('utf-8')
             tagged_text_with_offset = put_offset(text, tagged_text)
             return {'pos': tagged_text_with_offset, 'tagset': tagset}
diff --git a/pypln/backend/workers/pos/pt_palavras.py b/pypln/backend/workers/pos/pt_palavras.py
index 19d4b9d..d24efa1 100644
--- a/pypln/backend/workers/pos/pt_palavras.py
+++ b/pypln/backend/workers/pos/pt_palavras.py
@@ -23,56 +23,56 @@
 PALAVRAS_ENCODING = 'utf-8'

 WORD_CLASSES = {
-    u'N': u'Nouns',
-    u'PROP': u'Proper nouns',
-    u'SPEC': u'Specifiers',
-    u'DET': u'Determiners',
-    u'PERS': u'Personal pronouns',
-    u'ADJ': u'Adjectives',
-    u'ADV': u'Adverbs',
-    u'V': u'Verbs',
-    u'NUM': u'Numerals',
-    u'PRP': u'Preposition',
-    u'KS': u'Subordinating conjunctions',
-    u'KC': u'Coordinationg conjunctions',
-    u'IN': u'Interjections',
-    u'EC': u'Hyphen-separated prefix',
-    u'BL': u'Blank Line',
-    u'ES': u'End of Sentence',
-    u'NW': u'Non Word',
+    'N': 'Nouns',
+    'PROP': 'Proper nouns',
+    'SPEC': 'Specifiers',
+    'DET': 'Determiners',
+    'PERS': 'Personal pronouns',
+    'ADJ': 'Adjectives',
+    'ADV': 'Adverbs',
+    'V': 'Verbs',
+    'NUM': 'Numerals',
+    'PRP': 'Preposition',
+    'KS': 'Subordinating conjunctions',
+    'KC': 'Coordinating conjunctions',
+    'IN': 'Interjections',
+    'EC': 'Hyphen-separated prefix',
+    'BL': 'Blank Line',
+    'ES': 'End of Sentence',
+    'NW': 'Non Word',
 }

 def pos(document):
     if 'palavras_raw' not in document:
-        return u'', []
+        return '', []

     palavras_output = document['palavras_raw']
-    if not isinstance(palavras_output, unicode):
+    if not isinstance(palavras_output, str):
         palavras_output = palavras_output.decode(PALAVRAS_ENCODING)

     tagged_text = []
-    for line in palavras_output.split(u'\n'):
+    for line in palavras_output.split('\n'):
         line = line.strip()
         #print(line)
-        if line.isspace() or line == u'':
+        if line.isspace() or line == '':
             continue
-        elif line.startswith(u'<'):
+        elif line.startswith('<'):
             continue
-        elif line.startswith(u'$'):
+        elif line.startswith('$'):
             non_word = line.split()[0][1:]
             if non_word.isdigit():
-                non_word_tag = u'NUM'
+                non_word_tag = 'NUM'
             else:
                 non_word_tag = non_word
             tagged_text.append((non_word, non_word_tag))
-        elif len(line.split(u'\t')) < 2: # Discard malformed lines
+        elif len(line.split('\t')) < 2: # Discard malformed lines
             continue
         else:
-            info = line.split(u'\t')
-            final = u'\t'.join(info[1:]).split()
+            info = line.split('\t')
+            final = '\t'.join(info[1:]).split()
             word = info[0].strip()
             syntatic_semantic_tags = final[1:]
-            tags = filter(lambda x: x in WORD_CLASSES, syntatic_semantic_tags)
+            tags = [x for x in syntatic_semantic_tags if x in WORD_CLASSES]
             if tags:
                 pos_tag = tags[0]
                 tagged_text.append((word, pos_tag))
diff --git a/pypln/backend/workers/spellchecker.py b/pypln/backend/workers/spellchecker.py
index 4a6afb8..eeac5d3 100644
--- a/pypln/backend/workers/spellchecker.py
+++ b/pypln/backend/workers/spellchecker.py
@@ -16,32 +16,40 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
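The filter()-to-list-comprehension change in pt_palavras.py above is behavioral, not just stylistic; a short sketch of the Python 3 pitfall it avoids:

    matches = filter(lambda x: x in {'N', 'V'}, [])
    bool(matches)   # True on Python 3: a filter object is always truthy,
                    # so the 'if tags:' test after it would always pass
    matches = [x for x in [] if x in {'N', 'V'}]
    bool(matches)   # False: the empty list behaves as the code expects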
-
+import warnings
 import enchant
 from enchant.checker import SpellChecker
+
 from pypln.backend.celery_task import PyPLNTask
+
+
+class MissingDictionaryWarning(RuntimeWarning):
+    pass
+
+
 class SpellingChecker(PyPLNTask):
     """
     This worker performs spellchecking in the plain text of a document
     """
     def __init__(self):
-        # This method is only called once per process, but that is no problem
-        # since the enchant languange list should not change. Don't use this
-        # method for anything that should run every time the task is called.
-        # See http://docs.celeryproject.org/en/latest/userguide/tasks.html#instantiation
-        # for more information.
-        self.checkers = {lang: SpellChecker(lang) for lang in enchant.list_languages()}
+        # This method is only called once per process
+        self.checkers = {lang: SpellChecker(lang)
+                         for lang in enchant.list_languages()}

     def process(self, document):
-        #TODO: this worker may be enhanced by also checking the errors against an specific vocabulary supplied with the document
-        try:
-            checker = self.checkers[document['language']]
+        # TODO: this worker may be enhanced by also checking the errors against
+        # a specific vocabulary supplied with the document
+        checker = self.checkers.get(document['language'])
+        if checker is None:
+            # Maybe this should be an exception instead
+            warnings.warn('%s dictionary missing. If running on Linux, '
+                          'install the corresponding myspell package'
+                          % document['language'],
+                          MissingDictionaryWarning)
+            errors = None
+        else:
             checker.set_text(document['text'])
             errors = [[e.word, e.wordpos, e.suggest()] for e in checker]
-        except KeyError:
-            errors = None

         return {'spelling_errors': errors}
-
diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py
index 4ad46ef..8778bed 100644
--- a/pypln/backend/workers/trigrams.py
+++ b/pypln/backend/workers/trigrams.py
@@ -45,11 +45,11 @@ def process(self, document):
                 # We cannot store the trigram as a tuple (mongo keys need to be
                 # strings). We decided to join tokens using spaces since a
                 # space will never be in a token.
-                key = u' '.join(res[0])
+                key = ' '.join(res[0])
                 # Mongo cannot have `.` or `$` in key names. Unfortunatelly
                 # this means we need to replace them with placeholders.
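                # (Aside, for illustration: MongoDB rejects '.' anywhere in
                # a key and '$' at the start of a key, so a trigram such as
                # ('U', '.', 'S') could not be stored under the key 'U . S'
                # without the placeholder substitution performed below.)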
-                key = key.replace(u'$', u'\dollarsign')
-                key = key.replace(u'.', u'\dot')
+                key = key.replace('$', '\\dollarsign')
+                key = key.replace('.', '\\dot')
                 tr[key].append(res[1])
         return {'trigram_rank': tr, 'metrics':metrics}
diff --git a/pypln/backend/workers/word_cloud.py b/pypln/backend/workers/word_cloud.py
index 4f55dad..5bf5efc 100644
--- a/pypln/backend/workers/word_cloud.py
+++ b/pypln/backend/workers/word_cloud.py
@@ -19,7 +19,7 @@
 import base64
 import string

-from StringIO import StringIO
+from io import BytesIO

 import numpy
 import nltk
@@ -32,7 +32,7 @@ def filter_stopwords(fdist, lang):
     stopwords = list(string.punctuation)
     if lang in long_name:
         stopwords += nltk.corpus.stopwords.words(long_name[lang])
-    return filter(lambda pair: pair[0].lower() not in stopwords, fdist)
+    return [pair for pair in fdist if pair[0].lower() not in stopwords]

 class WordCloud(PyPLNTask):
@@ -41,7 +41,7 @@ def process(self, document):
         words = numpy.array([t[0] for t in fdist])
         counts = numpy.array([t[1] for t in fdist])
         wordcloud_img = make_wordcloud(words, counts)
-        fd = StringIO()
+        fd = BytesIO()
         wordcloud_img.save(fd, format="PNG")
         fd.seek(0)
         result = {'wordcloud': base64.b64encode(fd.read())}
diff --git a/requirements/production.txt b/requirements/production.txt
index bb43589..c043e27 100644
--- a/requirements/production.txt
+++ b/requirements/production.txt
@@ -1,17 +1,10 @@
 # Common
-celery
+celery==3.1.23
 pymongo==2.8.1

-# The newest pyparsing (2.0) only supports python 3,
-# so we explicitly install 1.5.7 (the last version that
-# supports python 2) before one of our dependencies tries
-# to install it.
-# http://sourceforge.net/projects/pyparsing/forums/forum/337293/topic/6481050
-pyparsing>=1.5.6,<2.0
-
 # Backend
 psutil
-chromium_compact_language_detector
+pycld2
 filemagic
 numpy
 nltk>=2.7.8
diff --git a/scripts/add_pipelines.py b/scripts/add_pipelines.py
index 2450bda..4d30735 100755
--- a/scripts/add_pipelines.py
+++ b/scripts/add_pipelines.py
@@ -18,7 +18,7 @@
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
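The StringIO-to-BytesIO swap in word_cloud.py above is needed because PIL writes binary PNG data under Python 3; a runnable sketch of the round-trip (the blank image is a stand-in for the rendered word cloud):

    import base64
    from io import BytesIO
    from PIL import Image

    img = Image.new('RGB', (4, 4))         # stand-in for make_wordcloud() output
    fd = BytesIO()                         # StringIO would raise TypeError here
    img.save(fd, format='PNG')
    fd.seek(0)
    encoded = base64.b64encode(fd.read())  # bytes, as stored by the worker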
-from __future__ import print_function + import sys from logging import Logger, StreamHandler, Formatter from pymongo import Connection diff --git a/scripts/create_fake_measures.py b/scripts/create_fake_measures.py index 8e6b82f..a542187 100644 --- a/scripts/create_fake_measures.py +++ b/scripts/create_fake_measures.py @@ -25,17 +25,17 @@ data = \ {'host': {'cpu': {'cpu percent': 4.9, 'number of cpus': 4}, - 'memory': {'buffers': 214372352L, + 'memory': {'buffers': 214372352, 'cached': 919220224, - 'free': 1369661440L, + 'free': 1369661440, 'free virtual': 0, 'percent': 65.21955293723627, - 'real free': 2503254016L, + 'real free': 2503254016, 'real percent': 36.433711831634305, - 'real used': 1434767360L, - 'total': 3938021376L, + 'real used': 1434767360, + 'total': 3938021376, 'total virtual': 0, - 'used': 2568359936L, + 'used': 2568359936, 'used virtual': 0}, 'network': {'cluster ip': '127.0.0.1', 'interfaces': {'eth0': {'bytes received': 171472224, @@ -105,8 +105,8 @@ def populate_collection(): db[collection_name].drop() collection = db[collection_name] - print '[{}] Inserting total of {} measures ({} for {} brokers)...'\ - .format(asctime(), measures * brokers, measures, brokers) + print('[{}] Inserting total of {} measures ({} for {} brokers)...'\ + .format(asctime(), measures * brokers, measures, brokers)) for measure in range(1, measures + 1): for broker in range(1, brokers + 1): if '_id' in data: @@ -117,17 +117,17 @@ def populate_collection(): random() collection.insert(data) if measure % 10000 == 0: - print ' [{}] Inserted {} measures'.format(asctime(), - measure * broker) - print '[{}] Done inserting measures!'.format(asctime()) + print(' [{}] Inserted {} measures'.format(asctime(), + measure * broker)) + print('[{}] Done inserting measures!'.format(asctime())) - print '[{}] Creating index for "host.network.cluster ip"'.format(asctime()) + print('[{}] Creating index for "host.network.cluster ip"'.format(asctime())) collection.ensure_index('host.network.cluster ip') - print '[{}] Done!'.format(asctime()) + print('[{}] Done!'.format(asctime())) - print '[{}] Creating index for "timestamp"'.format(asctime()) + print('[{}] Creating index for "timestamp"'.format(asctime())) collection.ensure_index([('timestamp', -1)]) - print '[{}] Done!'.format(asctime()) + print('[{}] Done!'.format(asctime())) database_name = 'pypln' collection_name = 'monitoring' # WARNING: it'll drop the collection! @@ -149,10 +149,10 @@ def populate_collection(): .distinct('host.network.cluster ip')) end_time = time() total_time = end_time - start_time -print 'Time to get broker IPs: {}. Broker IPs: {}'.format(total_time, - ', '.join(broker_ips)) +print('Time to get broker IPs: {}. 
Broker IPs: {}'.format(total_time, + ', '.join(broker_ips))) -print '[{}] Getting last measure for each broker...'.format(asctime()) +print('[{}] Getting last measure for each broker...'.format(asctime())) measures = {} start_time = time() for broker_ip in broker_ips: @@ -162,7 +162,7 @@ def populate_collection(): measures[broker_ip] = result end_time = time() total_time = end_time - start_time -print '[{}] Time to get all information: {}'.format(asctime(), total_time) -for broker_ip, measure_list in measures.iteritems(): - print 'Broker: {}, measure: {}'.format(broker_ip, measure_list[0]) +print('[{}] Time to get all information: {}'.format(asctime(), total_time)) +for broker_ip, measure_list in measures.items(): + print('Broker: {}, measure: {}'.format(broker_ip, measure_list[0])) connection.close() diff --git a/scripts/mongo2sphinx.py b/scripts/mongo2sphinx.py index 89f3438..b6fafd0 100755 --- a/scripts/mongo2sphinx.py +++ b/scripts/mongo2sphinx.py @@ -61,7 +61,7 @@ def serialize(doc,id): an unique unsigned integer `id`. We use a counter for this. """ document = Element("sphinx:document", attrib={'id':str(id)}) - for k,v in doc.iteritems(): + for k,v in doc.items(): if k == '_id': SubElement(document,k).text = str(v) continue diff --git a/tests/data/encoding_unknown_to_libmagic.txt b/tests/data/encoding_unknown_to_libmagic.txt deleted file mode 100644 index 9fb69b2..0000000 --- a/tests/data/encoding_unknown_to_libmagic.txt +++ /dev/null @@ -1 +0,0 @@ -This file has a weird byte () that makes it impossible for libmagic to recognize it's encoding. diff --git a/tests/test_celery_task.py b/tests/test_celery_task.py index fd1adde..0e087fa 100644 --- a/tests/test_celery_task.py +++ b/tests/test_celery_task.py @@ -17,14 +17,15 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . from pypln.backend.celery_task import PyPLNTask -from utils import TaskTest +from unittest import mock +from .utils import TaskTest class FakeTask(PyPLNTask): def process(self, document): return {'result': document['input']} class TestCeleryTask(TaskTest): - def test_task_should_get_the_correct_document(self): + def test_saves_returned_data_to_database(self): """This is a regression test. PyPLNTask was not filtering by _id. It was getting the first document it found. 
""" @@ -35,5 +36,15 @@ def test_task_should_get_the_correct_document(self): FakeTask().delay(correct_doc_id) refreshed_doc = self.collection.find_one({'_id': correct_doc_id}) + refreshed_wrong_doc = self.collection.find_one({'_id': wrong_doc_id}) self.assertEqual(refreshed_doc['result'], 'correct') + self.assertNotIn('result', refreshed_wrong_doc.keys()) + + @mock.patch.object(FakeTask, 'process') + def test_should_get_current_data_from_database(self, mocked_process): + document = {'input': 'correct'} + doc_id = self.collection.insert(document, w=1) + self.collection.insert({'input': 'wrong'}, w=1) + FakeTask().delay(doc_id) + mocked_process.assert_called_with(document) diff --git a/tests/test_elastic_indexer.py b/tests/test_elastic_indexer.py index faaafab..6f319d6 100644 --- a/tests/test_elastic_indexer.py +++ b/tests/test_elastic_indexer.py @@ -1,5 +1,5 @@ #-*- coding:utf-8 -*- -u""" +""" Created on 20/05/15 by fccoelho license: GPL V3 or Later diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index de605e2..027a701 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -16,43 +16,215 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . - -import nltk +from unittest import TestCase from pypln.backend.workers.bigrams import Bigrams -from utils import TaskTest - -bigram_measures = nltk.collocations.BigramAssocMeasures() - - -class TestBigramWorker(TaskTest): - def test_bigrams_should_return_correct_score(self): - # We need this list comprehension because we need to save the word list - # in mongo (thus, it needs to be json serializable). Also, a list is - # what will be available to the worker in real situations. - tokens = [w for w in - nltk.corpus.genesis.words('english-web.txt')] - - doc_id = self.collection.insert({'tokens': tokens}, w=1) - Bigrams().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - bigram_rank = refreshed_document['bigram_rank'] - result = bigram_rank[0][1][0] - # This is the value of the chi_sq measure for this bigram in this - # colocation - expected_chi_sq = 95.59393417173634 - self.assertEqual(result, expected_chi_sq) +TOKENS = ['Ao', 'verme', 'que', 'primeiro', 'roeu', 'as', 'frias', 'carnes', + 'do', 'meu', 'cadáver', 'dedico', 'como', 'saudosa', 'lembrança', + 'estas', 'Memórias', 'Póstumas', '.'] +RANKINGS = {'bigram_rankings': [(('Ao', 'verme'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('Memórias', 'Póstumas'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('Póstumas', '.'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('as', 'frias'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 
+                                (('cadáver', 'dedico'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('carnes', 'do'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('como', 'saudosa'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('dedico', 'como'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('do', 'meu'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('estas', 'Memórias'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('frias', 'carnes'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('lembrança', 'estas'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('meu', 'cadáver'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('primeiro', 'roeu'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('que', 'primeiro'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('roeu', 'as'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('saudosa', 'lembrança'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316}),
+                                (('verme', 'que'),
+                                 {'chi_sq': 19.0,
+                                  'dice': 1.0,
+                                  'jaccard': 1.0,
+                                  'likelihood_ratio': 7.835297924062801,
+                                  'mi_like': 1.0,
+                                  'phi_sq': 1.0,
+                                  'pmi': 4.247927513443585,
+                                  'poisson_stirling': 3.247927513443585,
+                                  'raw_freq': 0.05263157894736842,
+                                  'student_t': 0.9473684210526316})]}
 
-    def test_bigrams_could_contain_dollar_signs_and_dots(self):
-        tokens = ['$', '.']
-        doc_id = self.collection.insert({'tokens': tokens}, w=1)
-        Bigrams().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        bigram_rank = refreshed_document['bigram_rank']
-        result = bigram_rank[0][1][0]
-        # 2.0 is the value of the chi_sq measure for this bigram in this
-        # colocation
-        expected_chi_sq = 2.0
-        self.assertEqual(result, expected_chi_sq)
+
+class TestBigramWorker(TestCase):
+    def test_returns_bigram_rankings(self):
+        self.maxDiff = None
+        result = Bigrams().process({'tokens': TOKENS})
+        self.assertEqual(result, RANKINGS)
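Review note: the fixture is easy to sanity-check by hand. The 19 tokens are all distinct, so every word and every bigram occurs exactly once, which is why all 18 bigrams share identical scores; NLTK scores against the total word count N = 19. Assuming the standard contingency-table formulas, three of the constants fall out directly:

    import math

    N = 19  # total tokens; every word and bigram occurs exactly once
    assert abs(1 / N - 0.05263157894736842) < 1e-15       # raw_freq
    assert abs(math.log2(N) - 4.247927513443585) < 1e-12  # pmi = log2(N*1/(1*1))
    # chi_sq over the table (n_ii, n_io, n_oi, n_oo) = (1, 0, 0, 18):
    assert N * (1 * 18 - 0 * 0) ** 2 / (1 * 1 * 18 * 18) == 19.0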
diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py
index d7819a5..0079f78 100644
--- a/tests/test_worker_extractor.py
+++ b/tests/test_worker_extractor.py
@@ -20,22 +20,124 @@
 import base64
 import os
 from textwrap import dedent
 
-from pypln.backend.workers import Extractor
-from utils import TaskTest
+from unittest import TestCase
+from unittest.mock import patch, Mock, MagicMock, call
+
+from magic import MagicError
+import pycld2 as cld
+
+from pypln.backend.workers.extractor import (Extractor, decode_text_bytes,
+                                             detect_language)
 
 DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))
+MODULE = 'pypln.backend.workers.extractor.'
+
+
+class DecodeTextBytesTest(TestCase):
+    def setUp(self):
+        magic_mock = MagicMock()
+        magic_identifier = Mock()
+        self.id_buffer_mock = Mock(return_value='magic_codec')
+        magic_identifier.id_buffer = self.id_buffer_mock
+        magic_mock.return_value.__enter__.return_value = magic_identifier
+        self.magic_patcher = patch('magic.Magic', magic_mock)
+
+    def test_ignores_magic_error(self):
+        self.id_buffer_mock.side_effect = MagicError()
+        text = Mock()
+        with self.magic_patcher:
+            # noinspection PyTypeChecker
+            result = decode_text_bytes(text)
+        self.assertEqual(result, text.decode.return_value)
+        self.assertEqual(text.decode.call_args_list, [call('utf-8')])
+
+    def test_tries_decoding_with_encoding_returned_by_magic(self):
+        text = Mock()
+        with self.magic_patcher:
+            # noinspection PyTypeChecker
+            result = decode_text_bytes(text)
+        self.assertEqual(result, text.decode.return_value)
+        self.assertEqual(text.decode.call_args_list, [call('magic_codec')])
+
+    def test_tries_decoding_as_utf8(self):
+        text = Mock()
+        text.decode.side_effect = [LookupError(), 'result']
+        with self.magic_patcher:
+            # noinspection PyTypeChecker
+            result = decode_text_bytes(text)
+        self.assertEqual(result, 'result')
+        self.assertEqual(text.decode.call_args_list,
+                         [call('magic_codec'), call('utf-8')])
+
+    def test_tries_iso8859_1_if_all_else_fails(self):
+        text = Mock()
+
+        class FakeUnicodeDecodeError(UnicodeDecodeError):
+            def __init__(self):
+                pass
+
+        text.decode.side_effect = [LookupError(),
+                                   FakeUnicodeDecodeError(),
+                                   'result']
+        with self.magic_patcher:
+            # noinspection PyTypeChecker
+            result = decode_text_bytes(text)
+        self.assertEqual(result, 'result')
+        self.assertEqual(text.decode.call_args_list,
+                         [call('magic_codec'),
+                          call('utf-8'),
+                          call('iso8859-1')])
+
+
+def get_cld_exc(index):
+    return cld.error('input contains invalid UTF-8 around byte %s (of 42)'
+                     % index)
+
 
-class TestExtractorWorker(TaskTest):
+class DetectLanguageTest(TestCase):
+    def setUp(self):
+        self.cld_mock = Mock(return_value=(Mock(), Mock(),
+                                           [(Mock(), 'lang'),
+                                            (Mock(), 'other_lang')]))
+        self.cld_patcher = patch(MODULE + 'cld.detect', self.cld_mock)
+
+    def test_detects_portuguese(self):
+        """Sort of an integration test"""
+        text = 'Esse texto foi escrito por Álvaro em Português.'
+        self.assertEqual(detect_language(text), 'pt')
+
+    def test_removes_bytes_cld_considers_invalid(self):
+        self.cld_mock.side_effect = [get_cld_exc(0),
+                                     get_cld_exc(3),
+                                     self.cld_mock.return_value]
+        with self.cld_patcher:
+            self.assertEqual(detect_language('012345'), 'lang')
+        self.assertEqual(self.cld_mock.call_args_list,
+                         [call(b'012345'), call(b'12345'), call(b'1235')])
+
+    def test_removes_at_most_max_bytes_for_cld(self):
+        self.cld_mock.side_effect = [get_cld_exc(0)] * 4
+        with patch(MODULE + 'MAX_CLD_BYTES_TO_REMOVE', 3),\
+                self.cld_patcher:
+            self.assertIsNone(detect_language('012345'))
+        self.assertEqual(self.cld_mock.call_count, 3)
+
+    def test_doesnt_silence_other_cld_errors(self):
+        self.cld_mock.side_effect = [get_cld_exc(0), cld.error('another error')]
+        with self.cld_patcher:
+            self.assertRaises(cld.error, detect_language, 'text')
+
+
+class TestExtractorWorker(TestCase):
     def test_extraction_from_text_file(self):
         expected = "This is a test file.\nI'm testing PyPLN extractor worker!"
         filename = os.path.join(DATA_DIR, 'test.txt')
-        doc_id = self.collection.insert({'filename': filename,
-            'contents': base64.b64encode(open(filename).read())}, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['text'], expected)
-        self.assertEqual(refreshed_document['file_metadata'], {})
-        self.assertEqual(refreshed_document['mimetype'], 'text/plain')
+        data = {'filename': filename,
+                'contents': base64.b64encode(open(filename, 'rb').read())}
+        result = Extractor().process(data)
+        self.assertEqual(result['text'], expected)
+        self.assertEqual(result['file_metadata'], {})
+        self.assertEqual(result['mimetype'], 'text/plain')
 
     def test_extraction_from_html_file(self):
         expected = "This is a test file. I'm testing PyPLN extractor worker!"
@@ -46,47 +148,48 @@ def test_extraction_from_html_file(self):
         # wasn't a problem before because with mongodict we used to keep a
         # pickled representation of the data.
         data = {'filename': filename,
-                'contents': base64.b64encode(open(filename).read())}
-        doc_id = self.collection.insert(data, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['text'], expected)
-        self.assertEqual(refreshed_document['file_metadata'], {})
-        self.assertEqual(refreshed_document['mimetype'], 'text/html')
+                'contents': base64.b64encode(open(filename, 'rb').read())}
+        result = Extractor().process(data)
+        self.assertEqual(result['text'], expected)
+        self.assertEqual(result['file_metadata'], {})
+        self.assertEqual(result['mimetype'], 'text/html')
 
     def test_extraction_from_pdf_file(self):
         expected = "This is a test file.\nI'm testing PyPLN extractor worker!"
         filename = os.path.join(DATA_DIR, 'test.pdf')
         data = {'filename': filename,
-                'contents': base64.b64encode(open(filename).read())}
-        doc_id = self.collection.insert(data, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['text'], expected)
+                'contents': base64.b64encode(open(filename, 'rb').read())}
+        result = Extractor().process(data)
+        self.assertEqual(result['text'], expected)
         # Check that the expected metadata is a subset of what
         # our Extractor found (it may have found more details
         # depending on the toolset used to extract metadata)
         metadata_expected = {
-                u'Author': u'Álvaro Justen',
-                u'Creator': u'Writer',
-                u'Producer': u'LibreOffice 3.5',
-                u'CreationDate': u'Fri Jun 1 17:07:57 2012',
-                u'Tagged': u'no',
-                u'Pages': u'1',
-                u'Encrypted': u'no',
-                u'Page size': u'612 x 792 pts (letter)',
-                u'Optimized': u'no',
-                u'PDF version': u'1.4',
+                'Author': 'Álvaro Justen',
+                'Creator': 'Writer',
+                'Producer': 'LibreOffice 3.5',
+                'Tagged': 'no',
+                'Pages': '1',
+                'Encrypted': 'no',
+                'Page size': '612 x 792 pts (letter)',
+                'Optimized': 'no',
+                'PDF version': '1.4',
         }
-        metadata_expected_set = set(metadata_expected.iteritems())
-        metadata = refreshed_document['file_metadata']
-        metadata_set = set(metadata.iteritems())
+        metadata_expected_set = set(metadata_expected.items())
+        metadata = result['file_metadata']
+
+        # Newer versions of pdfinfo add the timezone to this field
+        self.assertIn(metadata['CreationDate'],
+                      ['Fri Jun 1 17:07:57 2012',
+                       'Fri Jun 1 17:07:57 2012 BRT'])
+
+        metadata_set = set(metadata.items())
 
         diff_set = metadata_expected_set - metadata_set
         self.assertTrue(metadata_expected_set.issubset(metadata_set),
                         ("Extracted metadata is not a subset of the expected metadata. "
                          "Items missing or with different values: {}").format(
-                            u", ".join(unicode(item) for item in diff_set)))
-        self.assertEqual(refreshed_document['mimetype'], 'application/pdf')
+                            ", ".join(str(item) for item in diff_set)))
+        self.assertEqual(result['mimetype'], 'application/pdf')
 
     def test_extraction_from_html(self):
         contents = dedent('''
@@ -113,9 +216,8 @@ def test_extraction_from_html(self):
             ''')
         data = {'filename': 'test.html',
-                'contents': base64.b64encode(contents)}
-        doc_id = self.collection.insert(data, w=1)
-        Extractor().delay(doc_id)
+                'contents': base64.b64encode(contents.encode('utf-8'))}
+        result = Extractor().process(data)
 
         expected = dedent('''
             Testing
 
@@ -133,92 +235,49 @@ def test_extraction_from_html(self):
             bla1
             bla2''').strip()
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['text'], expected)
-        self.assertEqual(refreshed_document['mimetype'], 'text/html')
-
-    def test_language_detection_pt(self):
-        text_pt = 'Esse texto foi escrito por Álvaro em Português.'
-        data_pt = {'filename': 'text-pt.txt',
-                   'contents': base64.b64encode(text_pt)}
-        doc_id = self.collection.insert(data_pt, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['language'], 'pt')
-
-    def test_language_detection_es(self):
-        text_es = 'Este texto ha sido escrito en Español por Álvaro.'
-        data_es = {'filename': 'text-es.txt',
-                   'contents': base64.b64encode(text_es)}
-        doc_id = self.collection.insert(data_es, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['language'], 'es')
-
-    def test_language_detection_en(self):
-        text_en = 'This text was written by Álvaro in English.'
-        data_en = {'filename': 'text-en.txt',
-                   'contents': base64.b64encode(text_en)}
-        doc_id = self.collection.insert(data_en, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['language'], 'en')
+        self.assertEqual(result['text'], expected)
+        self.assertEqual(result['mimetype'], 'text/html')
 
     def test_unescape_html_entities(self):
-        expected = (u"This text has html . Álvaro asked me to make"
-                    " sure it also has non ascii chars.")
+        expected = ("This text has html . Álvaro asked me to make"
+                    " sure it also has non ascii chars.")
         filename = os.path.join(DATA_DIR, 'test_html_entities.txt')
         data = {'filename': filename,
-                'contents': base64.b64encode(open(filename).read())}
-        doc_id = self.collection.insert(data, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['text'], expected)
+                'contents': base64.b64encode(open(filename, 'rb').read())}
+        result = Extractor().process(data)
+        self.assertEqual(result['text'], expected)
 
     def test_should_detect_encoding_and_return_a_unicode_object(self):
-        expected = u"Flávio"
+        expected = "Flávio"
         filename = os.path.join(DATA_DIR, 'test_iso-8859-1.txt')
         data = {'filename': filename,
-                'contents': base64.b64encode(open(filename).read())}
-        doc_id = self.collection.insert(data, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['text'], expected)
-        self.assertEqual(type(refreshed_document['text']), unicode)
+                'contents': base64.b64encode(open(filename, 'rb').read())}
+        result = Extractor().process(data)
+        self.assertEqual(result['text'], expected)
+        self.assertEqual(type(result['text']), str)
 
     def test_should_guess_mimetype_for_file_without_extension(self):
         contents = "This is a test file. I'm testing PyPLN extractor worker!"
         filename = os.path.join(DATA_DIR, 'text_file')
         data = {'filename': filename,
-                'contents': base64.b64encode(contents)}
-        doc_id = self.collection.insert(data, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['mimetype'], 'text/plain')
+                'contents': base64.b64encode(contents.encode('utf-8'))}
+        result = Extractor().process(data)
+        self.assertEqual(result['mimetype'], 'text/plain')
 
     def test_unknown_mimetype_should_be_flagged(self):
         filename = os.path.join(DATA_DIR, 'random_file')
         # we can't put the expected text content here, so we'll just make sure
         # it's equal to the input content, since
-        contents = open(filename).read()
-        data = {'filename': filename,
-                'contents': base64.b64encode(contents)}
-        doc_id = self.collection.insert(data, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['mimetype'], 'unknown')
-        self.assertEqual(refreshed_document['text'], "")
-        self.assertEqual(refreshed_document['language'], "")
-        self.assertEqual(refreshed_document['file_metadata'], {})
-
-    def test_unknown_encoding_should_be_ignored(self):
-        filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt')
-        expected = u"This file has a weird byte (\x96) that makes it impossible for libmagic to recognize it's encoding."
-        data = {'filename': filename,
-                'contents': base64.b64encode(open(filename).read())}
-        doc_id = self.collection.insert(data, w=1)
-        Extractor().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['text'], expected)
-        self.assertEqual(refreshed_document['file_metadata'], {})
-        self.assertEqual(refreshed_document['language'], 'en')
+        contents = open(filename, 'rb').read()
+        result = Extractor().process({'filename': filename,
+                                      'contents': base64.b64encode(contents)})
+        self.assertEqual(result['mimetype'], 'unknown')
+        self.assertEqual(result['text'], "")
+        self.assertEqual(result['language'], "")
+        self.assertEqual(result['file_metadata'], {})
+
+    def test_calls_detect_language(self):
+        with patch(MODULE + 'detect_language') as detect_language_mock:
+            result = Extractor().process({'contents': base64.b64encode(b'ok')})
+        self.assertEqual(result['language'],
+                         detect_language_mock.return_value)
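Review note: DecodeTextBytesTest and DetectLanguageTest fully specify the fallback behavior of the two helpers they cover. For reference while reading the tests, a sketch of what they imply — reconstructed from the assertions, assuming the filemagic API; the real module may differ in detail, and the default value of MAX_CLD_BYTES_TO_REMOVE is an assumption:

    import magic
    import pycld2 as cld

    MAX_CLD_BYTES_TO_REMOVE = 1024  # assumed; the tests only patch the name

    def decode_text_bytes(text):
        # Ask libmagic for the encoding; fall back to utf-8, then latin-1.
        try:
            with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
                encoding = m.id_buffer(text)
            try:
                return text.decode(encoding)
            except LookupError:  # libmagic named a codec Python lacks
                pass
        except magic.MagicError:
            pass
        try:
            return text.decode('utf-8')
        except UnicodeDecodeError:
            return text.decode('iso8859-1')  # latin-1 never fails

    def detect_language(text):
        text_bytes = text.encode('utf-8')
        removed = 0
        while removed < MAX_CLD_BYTES_TO_REMOVE:
            try:
                return cld.detect(text_bytes)[2][0][1]
            except cld.error as exc:
                message = exc.args[0]
                if 'input contains invalid UTF-8' not in message:
                    raise
                # cld reports the offending offset; drop that byte and retry.
                index = int(message.split('around byte ')[1].split(' ')[0])
                text_bytes = text_bytes[:index] + text_bytes[index + 1:]
                removed += 1
        return None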
diff --git a/tests/test_worker_freqdist.py b/tests/test_worker_freqdist.py
index bde9c98..f4613dc 100644
--- a/tests/test_worker_freqdist.py
+++ b/tests/test_worker_freqdist.py
@@ -16,24 +16,18 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-from pypln.backend.workers import FreqDist
-from utils import TaskTest
-
-
-class TestFreqDistWorker(TaskTest):
-    def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self):
-        tokens = [u'The', u'sky', u'is', u'blue', u',', u'the', u'sun', u'is',
-                  u'yellow', u'.']
-
-        expected_fd = [[u'is', 2], [u'the', 2], [u'blue', 1], [u'sun', 1],
-                       [u'sky', 1], [u',', 1], [u'yellow', 1], [u'.', 1]]
+from unittest import TestCase
 
+from pypln.backend.workers import FreqDist
 
-        # This is just preparing the expected input in the database
-        doc_id = self.collection.insert({'tokens': tokens}, w=1)
-        FreqDist().delay(doc_id)
 
+class TestFreqDistWorker(TestCase):
+    def test_freqdist_should_be_a_list_of_tuples_with_frequency_distribution(self):
+        tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is',
+                  'yellow', '.']
 
-        resulting_fd = self.collection.find_one({'_id': doc_id})['freqdist']
+        expected_fd = [('is', 2), ('the', 2), (',', 1), ('.', 1), ('blue', 1),
+                       ('sky', 1), ('sun', 1), ('yellow', 1)]
 
+        resulting_fd = FreqDist().process({'tokens': tokens})['freqdist']
         self.assertEqual(resulting_fd, expected_fd)
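Review note: the reordered expectation encodes the worker's sort contract: case-folded counts, ordered by frequency descending and then by token (which is why ',' and '.' sort before the alphabetic tokens). Assuming that contract, the fixture can be derived directly:

    from collections import Counter

    tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is',
              'yellow', '.']
    freqdist = sorted(Counter(t.lower() for t in tokens).items(),
                      key=lambda item: (-item[1], item[0]))
    assert freqdist == [('is', 2), ('the', 2), (',', 1), ('.', 1),
                        ('blue', 1), ('sky', 1), ('sun', 1), ('yellow', 1)]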
diff --git a/tests/test_worker_lemmatizer_pt.py b/tests/test_worker_lemmatizer_pt.py
index 3887d81..ed0f156 100644
--- a/tests/test_worker_lemmatizer_pt.py
+++ b/tests/test_worker_lemmatizer_pt.py
@@ -21,7 +21,7 @@
 from textwrap import dedent
 
 from pypln.backend.workers import Lemmatizer
-from utils import TaskTest
+from .utils import TaskTest
 
 
 class TestLemmatizerWorker(TaskTest):
diff --git a/tests/test_worker_palavras_noun_phrase.py b/tests/test_worker_palavras_noun_phrase.py
index e9982ba..4ed026a 100644
--- a/tests/test_worker_palavras_noun_phrase.py
+++ b/tests/test_worker_palavras_noun_phrase.py
@@ -22,7 +22,7 @@
 
 from pypln.backend.workers import NounPhrase
 from pypln.backend.workers.palavras_raw import palavras_installed
-from utils import TaskTest
+from .utils import TaskTest
 
 
 class TestNounPhraseWorker(TaskTest):
diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py
index de2b6b8..628eafc 100644
--- a/tests/test_worker_palavras_raw.py
+++ b/tests/test_worker_palavras_raw.py
@@ -20,39 +20,33 @@
 
 from unittest import skipIf
 from textwrap import dedent
+import unittest
 
 from pypln.backend.workers import palavras_raw
-from utils import TaskTest
 
 ORIGINAL_PATH = palavras_raw.BASE_PARSER
 
-class TestPalavrasRawWorker(TaskTest):
+class TestPalavrasRawWorker(unittest.TestCase):
     def test_should_run_only_if_language_is_portuguese(self):
-        doc_id = self.collection.insert({'text': 'There was a rock on the way.',
-                                         'language': 'en'}, w=1)
-
-        palavras_raw.PalavrasRaw().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['palavras_raw_ran'], False)
+        doc = {'text': 'There was a rock on the way.', 'language': 'en'}
+        result = palavras_raw.PalavrasRaw().process(doc)
+        self.assertEqual(result['palavras_raw_ran'], False)
 
     def test_palavras_not_installed(self):
         palavras_raw.BASE_PARSER = '/not-found'
-        doc_id = self.collection.insert(
-            {'text': 'Tinha uma pedra no meio do caminho.',
-             'language': 'pt'}, w=1)
-        palavras_raw.PalavrasRaw().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['palavras_raw_ran'], False)
+        doc = {'text': 'Tinha uma pedra no meio do caminho.',
+               'language': 'pt'}
+        result = palavras_raw.PalavrasRaw().process(doc)
+        self.assertEqual(result['palavras_raw_ran'], False)
 
     @skipIf(not palavras_raw.palavras_installed(),
             'palavras software is not installed')
     def test_palavras_should_return_raw_if_it_is_installed(self):
         palavras_raw.BASE_PARSER = ORIGINAL_PATH
-        doc_id = self.collection.insert(
-            {'text': 'Eu sei que neste momento falo para todo Brasil.',
-             'language': 'pt'}, w=1)
+        doc = {'text': 'Eu sei que neste momento falo para todo Brasil.',
+               'language': 'pt'}
         expected_raw = dedent('''
             Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2
             sei [saber] V PR 1S IND VFIN @FS-STA #2->0
@@ -67,7 +61,6 @@ def test_palavras_should_return_raw_if_it_is_installed(self):
             $. #11->0
             ''').strip() + '\n\n'
-        result = palavras_raw.PalavrasRaw().delay(doc_id)
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(refreshed_document['palavras_raw'], expected_raw)
-        self.assertEqual(refreshed_document['palavras_raw_ran'], True)
+        result = palavras_raw.PalavrasRaw().process(doc)
+        self.assertEqual(result['palavras_raw'], expected_raw)
+        self.assertEqual(result['palavras_raw_ran'], True)
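Review note: the three tests above define PalavrasRaw's guard clauses: skip unless the document is Portuguese, and skip when the parser binary configured in BASE_PARSER is missing. The implied control flow, sketched with a hypothetical run_parser helper (the real worker shells out to the PALAVRAS binary; palavras_installed and PyPLNTask come from the modules diffed above):

    class PalavrasRaw(PyPLNTask):
        def process(self, document):
            if document['language'] != 'pt' or not palavras_installed():
                return {'palavras_raw_ran': False}
            return {'palavras_raw': run_parser(document['text']),  # hypothetical
                    'palavras_raw_ran': True}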
diff --git a/tests/test_worker_palavras_semantic_tagger.py b/tests/test_worker_palavras_semantic_tagger.py
index 1b3abbe..f540fc6 100644
--- a/tests/test_worker_palavras_semantic_tagger.py
+++ b/tests/test_worker_palavras_semantic_tagger.py
@@ -20,7 +20,7 @@
 from textwrap import dedent
 
 from pypln.backend.workers import SemanticTagger
-from utils import TaskTest
+from .utils import TaskTest
 
 
 class TestSemanticTaggerWorker(TaskTest):
@@ -78,10 +78,10 @@ def test_ambiguous_tags(self):
         ''').strip() + '\n\n'
 
         expected_tags = {
-            'Non_Tagged': [u'Eu', u'bem', u'enquanto', u'ele', u'está',
-                           u'em', u'o'],
-            'Place and spatial': [u'canto'],
-            'Verbs_related_human_things': [u'canto']
+            'Non_Tagged': ['Eu', 'bem', 'enquanto', 'ele', 'está',
+                           'em', 'o'],
+            'Place and spatial': ['canto'],
+            'Verbs_related_human_things': ['canto']
         }
 
         doc_id = self.collection.insert({'palavras_raw': palavras_output,
                                          'palavras_raw_ran': True}, w=1)
diff --git a/tests/test_worker_pos.py b/tests/test_worker_pos.py
index 6307192..af0c302 100644
--- a/tests/test_worker_pos.py
+++ b/tests/test_worker_pos.py
@@ -22,7 +22,7 @@
 from textwrap import dedent
 
 from pypln.backend.workers.palavras_raw import palavras_installed
 from pypln.backend.workers import POS
-from utils import TaskTest
+from .utils import TaskTest
 
 
 class TestPosWorker(TaskTest):
@@ -56,7 +56,7 @@ def test_pos_should_run_pt_palavras_if_text_is_in_portuguese(self):
         ''').strip() + '\n\n'
 
         # '.' is the only named entity here.
-        expected = [[u'.', u'.', 29]]
+        expected = [['.', '.', 29]]
 
         doc_id = self.collection.insert({'text': text, 'tokens': tokens,
                                          'language': 'pt',
                                          'palavras_raw': palavras_raw}, w=1)
         POS().delay(doc_id)
diff --git a/tests/test_worker_spellchecker.py b/tests/test_worker_spellchecker.py
index b81bb93..6200f73 100644
--- a/tests/test_worker_spellchecker.py
+++ b/tests/test_worker_spellchecker.py
@@ -18,34 +18,35 @@
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
 
 import os
-from textwrap import dedent
 
-from pypln.backend.workers import spellchecker
-from utils import TaskTest
+from unittest import TestCase, mock
+
+from pypln.backend.workers.spellchecker import (SpellingChecker,
+                                                MissingDictionaryWarning)
 
 DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))
 
-class TestSpellcheckerWorker(TaskTest):
-    def test_spellchek_pt(self):
-        text = u"Meu cachoro é um pastor"
-        doc_id = self.collection.insert({'text': text, 'language': 'pt_BR'},
-                                        w=1)
-        spellchecker.SpellingChecker().delay(doc_id)
+class TestSpellcheckerWorker(TestCase):
+    def test_spellcheck_pt(self):
+        text = "Meu cachoro é um pastor"
+        result = SpellingChecker().process({'text': text, 'language': 'pt_BR'})
 
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(len(refreshed_document['spelling_errors']), 1)
-        self.assertIn('cachoro', refreshed_document['spelling_errors'][0])
-        self.assertIn('cachorro', refreshed_document['spelling_errors'][0][2])
-        self.assertEqual(refreshed_document['spelling_errors'][0][1], 4)
+        self.assertEqual(len(result['spelling_errors']), 1)
+        self.assertIn('cachoro', result['spelling_errors'][0])
+        self.assertIn('cachorro', result['spelling_errors'][0][2])
+        self.assertEqual(result['spelling_errors'][0][1], 4)
 
-    def test_spellchek_en(self):
-        text = u"The cat bit the doggyo"
-        doc_id = self.collection.insert({'text': text, 'language': 'en'}, w=1)
-        spellchecker.SpellingChecker().delay(doc_id)
+    def test_spellcheck_en(self):
+        text = "The cat bit the doggyo"
+        result = SpellingChecker().process({'text': text, 'language': 'en'})
 
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        self.assertEqual(len(refreshed_document['spelling_errors']), 1)
-        self.assertIn('doggyo', refreshed_document['spelling_errors'][0])
-        self.assertIn('doggy', refreshed_document['spelling_errors'][0][2])
-        self.assertEqual(refreshed_document['spelling_errors'][0][1], 16)
+        self.assertEqual(len(result['spelling_errors']), 1)
+        self.assertIn('doggyo', result['spelling_errors'][0])
+        self.assertIn('doggy', result['spelling_errors'][0][2])
+        self.assertEqual(result['spelling_errors'][0][1], 16)
 
+    @mock.patch('warnings.warn')
+    def test_warns_about_missing_dictionary(self, warn_mock):
+        SpellingChecker().process({'text': '',
+                                   'language': 'missing_language'})
+        warn_mock.assert_called_with(mock.ANY, MissingDictionaryWarning)
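Review note: the assertions above also document the shape of each entry in spelling_errors: a (word, offset, suggestions) triple. A usage sketch, assuming the pt_BR dictionary is installed:

    from pypln.backend.workers.spellchecker import SpellingChecker

    result = SpellingChecker().process({'text': 'Meu cachoro é um pastor',
                                        'language': 'pt_BR'})
    word, offset, suggestions = result['spelling_errors'][0]
    assert word == 'cachoro' and offset == 4  # 'Meu ' is four characters
    assert 'cachorro' in suggestions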
diff --git a/tests/test_worker_statistics.py b/tests/test_worker_statistics.py
index 3370e8d..4da4cfb 100644
--- a/tests/test_worker_statistics.py
+++ b/tests/test_worker_statistics.py
@@ -18,7 +18,7 @@
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
 
 from pypln.backend.workers import Statistics
-from utils import TaskTest
+from .utils import TaskTest
 
 
 class TestStatisticsWorker(TaskTest):
diff --git a/tests/test_worker_tokenizer.py b/tests/test_worker_tokenizer.py
index 9d59cac..67053a6 100644
--- a/tests/test_worker_tokenizer.py
+++ b/tests/test_worker_tokenizer.py
@@ -18,7 +18,7 @@
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
 
 from pypln.backend.workers import Tokenizer
-from utils import TaskTest
+from .utils import TaskTest
 
 
 class TestTokenizerWorker(TaskTest):
diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py
index 93575e9..683e714 100644
--- a/tests/test_worker_trigrams.py
+++ b/tests/test_worker_trigrams.py
@@ -18,9 +18,9 @@
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
 
 import nltk
-import cPickle
+import pickle
 
 from pypln.backend.workers.trigrams import Trigrams
-from utils import TaskTest
+from .utils import TaskTest
 
 trigram_measures = nltk.collocations.TrigramAssocMeasures()
 
@@ -33,7 +33,7 @@ def test_Trigrams_should_return_correct_score(self):
         Trigrams().delay(doc_id)
         refreshed_document = self.collection.find_one({'_id': doc_id})
         trigram_rank = refreshed_document['trigram_rank']
-        result = trigram_rank[u'olive leaf plucked'][0]
+        result = trigram_rank['olive leaf plucked'][0]
         # This is the value of the chi_sq measure for this trigram in this
         # colocation
         expected_chi_sq = 1940754916.9623578
@@ -45,7 +45,7 @@ def test_Trigrams_may_contain_dots_and_dollar_signs(self):
         Trigrams().delay(doc_id)
         refreshed_document = self.collection.find_one({'_id': doc_id})
         trigram_rank = refreshed_document['trigram_rank']
-        result = trigram_rank[u'\dollarsign test \dot'][0]
+        result = trigram_rank['\dollarsign test \dot'][0]
         # This is the value of the chi_sq measure for this trigram in this
         # colocation
         expected_chi_sq = 10.5
diff --git a/tests/test_worker_wordcloud.py b/tests/test_worker_wordcloud.py
index 23ed090..25ddba5 100644
--- a/tests/test_worker_wordcloud.py
+++ b/tests/test_worker_wordcloud.py
@@ -18,26 +18,23 @@
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
 
 import base64
-from StringIO import StringIO
+from io import BytesIO
+import unittest
 
 from PIL import Image
 
 from pypln.backend.workers import WordCloud
-from utils import TaskTest
 
-class TestFreqDistWorker(TaskTest):
-    name = "WordCloud"
 
+class TestFreqDistWorker(unittest.TestCase):
     def test_wordcloud_should_return_a_base64_encoded_png(self):
         doc = {'freqdist': [('is', 2), ('the', 2), ('blue', 1), ('sun', 1),
                             ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)],
                'language': 'en'}
 
-        doc_id = self.collection.insert(doc, w=1)
-        WordCloud().delay(doc_id)
+        result = WordCloud().process(doc)
 
-        refreshed_document = self.collection.find_one({'_id': doc_id})
-        raw_png_data = base64.b64decode(refreshed_document['wordcloud'])
+        raw_png_data = base64.b64decode(result['wordcloud'])
 
-        fake_file = StringIO(raw_png_data)
+        fake_file = BytesIO(raw_png_data)
         img = Image.open(fake_file)
         img.verify()
         self.assertEqual(img.format, 'PNG')
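Closing review note: one Python 3 detail left untouched by this patch is the '\dollarsign test \dot' key in the trigrams test. '\d' is not a recognized escape sequence, which Python 3 reports as a DeprecationWarning (a SyntaxWarning in later releases); a raw string keeps the exact same value and silences the warning:

    assert '\\dollarsign test \\dot' == r'\dollarsign test \dot'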