From 702dbf1d3ad57a017fc9f10048dbdbdd89673096 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 23 Nov 2016 14:02:32 -0200 Subject: [PATCH 01/33] Uses pycld2 instead of the (outdate) chrom[...]tector --- pypln/backend/workers/extractor.py | 6 +++--- requirements/production.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 110730b..c273125 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -27,7 +27,7 @@ from mimetypes import guess_type from re import compile as regexp_compile, DOTALL, escape -import cld +import cld2 import magic from pypln.backend.celery_task import PyPLNTask @@ -204,9 +204,9 @@ def process(self, file_data): text = clean(text) if isinstance(text, unicode): - language = cld.detect(text.encode('utf-8'))[1] + language = cld2.detect(text.encode('utf-8'))[1] else: - language = cld.detect(text)[1] + language = cld2.detect(text)[1] return {'text': text, 'file_metadata': metadata, 'language': language, 'mimetype': file_mime_type, 'forced_decoding': forced_decoding} diff --git a/requirements/production.txt b/requirements/production.txt index bb43589..915673a 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -11,7 +11,7 @@ pyparsing>=1.5.6,<2.0 # Backend psutil -chromium_compact_language_detector +pycld2 filemagic numpy nltk>=2.7.8 From cb3d1d28d03b9df7c0cd468c358999db5ba05ad1 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 23 Nov 2016 14:03:14 -0200 Subject: [PATCH 02/33] Removes pyparsing from requirements It looks like we are not using it anymore --- requirements/production.txt | 7 ------- 1 file changed, 7 deletions(-) diff --git a/requirements/production.txt b/requirements/production.txt index 915673a..0e325c3 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -2,13 +2,6 @@ celery pymongo==2.8.1 -# The newest pyparsing (2.0) only supports python 3, -# so we explicitly install 1.5.7 (the last version that -# supports python 2) before one of our dependencies tries -# to install it. 
-# http://sourceforge.net/projects/pyparsing/forums/forum/337293/topic/6481050 -pyparsing>=1.5.6,<2.0 - # Backend psutil pycld2 From 915efa72b8b1aa90d03aed2a93bcf488d628e047 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Wed, 23 Nov 2016 20:39:18 -0200 Subject: [PATCH 03/33] fix cld import --- pypln/backend/workers/extractor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index c273125..992ea4b 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -27,7 +27,7 @@ from mimetypes import guess_type from re import compile as regexp_compile, DOTALL, escape -import cld2 +import pycld2 as cld import magic from pypln.backend.celery_task import PyPLNTask @@ -204,9 +204,9 @@ def process(self, file_data): text = clean(text) if isinstance(text, unicode): - language = cld2.detect(text.encode('utf-8'))[1] + language = cld.detect(text.encode('utf-8'))[1] else: - language = cld2.detect(text)[1] + language = cld.detect(text)[1] return {'text': text, 'file_metadata': metadata, 'language': language, 'mimetype': file_mime_type, 'forced_decoding': forced_decoding} From 058959248f9c7c346294616eacfc3184f7e142ba Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Wed, 23 Nov 2016 20:40:17 -0200 Subject: [PATCH 04/33] prevent mongo from connecting at import time --- pypln/backend/celery_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py index 2d3d93d..0b1c235 100644 --- a/pypln/backend/celery_task.py +++ b/pypln/backend/celery_task.py @@ -31,7 +31,7 @@ from pypln.backend import config -mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS) +mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS, _connect=False) database = mongo_client[config.MONGODB_DBNAME] document_collection = database[config.MONGODB_COLLECTION] From 0b4ccf68656080449f95fc1dd2b0b09852587535 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Wed, 23 Nov 2016 22:17:58 -0200 Subject: [PATCH 05/33] run 2to3 --- doc/conf.py | 12 +- pypln/backend/celery_app.py | 2 +- pypln/backend/config.py | 8 +- pypln/backend/workers/__init__.py | 24 +- pypln/backend/workers/bigrams.py | 2 +- pypln/backend/workers/extractor.py | 8 +- pypln/backend/workers/freqdist.py | 2 +- pypln/backend/workers/palavras_noun_phrase.py | 2 +- pypln/backend/workers/palavras_raw.py | 2 +- .../workers/palavras_semantic_tagger.py | 530 +++++++++--------- pypln/backend/workers/pos/__init__.py | 6 +- pypln/backend/workers/pos/pt_palavras.py | 56 +- pypln/backend/workers/trigrams.py | 6 +- pypln/backend/workers/word_cloud.py | 4 +- scripts/add_pipelines.py | 2 +- scripts/create_fake_measures.py | 42 +- scripts/mongo2sphinx.py | 2 +- tests/test_celery_task.py | 2 +- tests/test_elastic_indexer.py | 2 +- tests/test_worker_bigrams.py | 2 +- tests/test_worker_extractor.py | 36 +- tests/test_worker_freqdist.py | 10 +- tests/test_worker_lemmatizer_pt.py | 2 +- tests/test_worker_palavras_noun_phrase.py | 2 +- tests/test_worker_palavras_raw.py | 2 +- tests/test_worker_palavras_semantic_tagger.py | 10 +- tests/test_worker_pos.py | 4 +- tests/test_worker_spellchecker.py | 6 +- tests/test_worker_statistics.py | 2 +- tests/test_worker_tokenizer.py | 2 +- tests/test_worker_trigrams.py | 8 +- tests/test_worker_wordcloud.py | 4 +- 32 files changed, 402 insertions(+), 402 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index a6be0e8..6403136 100644 --- a/doc/conf.py +++ 
b/doc/conf.py @@ -46,8 +46,8 @@ master_doc = 'index' # General information about the project. -project = u'PyPLN' -copyright = u'2011, Flávio Codeço Coelho' +project = 'PyPLN' +copyright = '2011, Flávio Codeço Coelho' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -187,8 +187,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'PyPLN.tex', u'PyPLN Documentation', - u'Flávio Codeço Coelho', 'manual'), + ('index', 'PyPLN.tex', 'PyPLN Documentation', + 'Flávio Codeço Coelho', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -220,6 +220,6 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'pypln', u'PyPLN Documentation', - [u'Flávio Codeço Coelho'], 1) + ('index', 'pypln', 'PyPLN Documentation', + ['Flávio Codeço Coelho'], 1) ] diff --git a/pypln/backend/celery_app.py b/pypln/backend/celery_app.py index 342c5be..895d9cd 100644 --- a/pypln/backend/celery_app.py +++ b/pypln/backend/celery_app.py @@ -19,7 +19,7 @@ from celery import Celery from kombu import Exchange, Queue -import config +from . import config app = Celery('pypln_workers', backend='mongodb', broker='amqp://', include=['pypln.backend.workers']) diff --git a/pypln/backend/config.py b/pypln/backend/config.py index ec1d48e..f89bd6f 100644 --- a/pypln/backend/config.py +++ b/pypln/backend/config.py @@ -3,14 +3,14 @@ from decouple import config, Csv try: - import urlparse + import urllib.parse except ImportError: import urllib.parse as urlparse def parse_url(url): - urlparse.uses_netloc.append('mongodb') - urlparse.uses_netloc.append('celery') - url = urlparse.urlparse(url) + urllib.parse.uses_netloc.append('mongodb') + urllib.parse.uses_netloc.append('celery') + url = urllib.parse.urlparse(url) path = url.path[1:] path = path.split('?', 2)[0] diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py index 0125bde..9ca1ec2 100644 --- a/pypln/backend/workers/__init__.py +++ b/pypln/backend/workers/__init__.py @@ -17,18 +17,18 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . 
-from extractor import Extractor -from tokenizer import Tokenizer -from freqdist import FreqDist -from pos import POS -from statistics import Statistics -from bigrams import Bigrams -from palavras_raw import PalavrasRaw -from lemmatizer_pt import Lemmatizer -from palavras_noun_phrase import NounPhrase -from palavras_semantic_tagger import SemanticTagger -from word_cloud import WordCloud -from elastic_indexer import ElasticIndexer +from .extractor import Extractor +from .tokenizer import Tokenizer +from .freqdist import FreqDist +from .pos import POS +from .statistics import Statistics +from .bigrams import Bigrams +from .palavras_raw import PalavrasRaw +from .lemmatizer_pt import Lemmatizer +from .palavras_noun_phrase import NounPhrase +from .palavras_semantic_tagger import SemanticTagger +from .word_cloud import WordCloud +from .elastic_indexer import ElasticIndexer __all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics', diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index 302482f..034972d 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -45,4 +45,4 @@ def process(self, document): for m in metrics: for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)): br[res[0]].append(res[1]) - return {'metrics': metrics, 'bigram_rank': br.items()} + return {'metrics': metrics, 'bigram_rank': list(br.items())} diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 992ea4b..2a864e6 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -20,7 +20,7 @@ import base64 import shlex -from HTMLParser import HTMLParser +from html.parser import HTMLParser from tempfile import NamedTemporaryFile from os import unlink from subprocess import Popen, PIPE @@ -84,7 +84,7 @@ def parse_html(html, remove_tags=None, remove_inside=None, [''] * (total_to_remove - 2) content_between[index + 1] = '\n' complete_tags.append('') - result = ''.join(sum(zip(content_between, complete_tags), tuple())) + result = ''.join(sum(list(zip(content_between, complete_tags)), tuple())) return clean(result) def get_pdf_metadata(data): @@ -193,7 +193,7 @@ def process(self, file_data): text, forced_decoding = trial_decode(text) - if isinstance(text, unicode): + if isinstance(text, str): # HTMLParser only handles unicode objects. We can't pass the text # through it if we don't know the encoding, and it's possible we # also shouldn't. 
There's no way of knowing if it's a badly encoded @@ -203,7 +203,7 @@ def process(self, file_data): text = clean(text) - if isinstance(text, unicode): + if isinstance(text, str): language = cld.detect(text.encode('utf-8'))[1] else: language = cld.detect(text)[1] diff --git a/pypln/backend/workers/freqdist.py b/pypln/backend/workers/freqdist.py index 7bb7f7e..82d75ee 100644 --- a/pypln/backend/workers/freqdist.py +++ b/pypln/backend/workers/freqdist.py @@ -27,7 +27,7 @@ def process(self, document): tokens = [info.lower() for info in document_tokens] frequency_distribution = {token: tokens.count(token) \ for token in set(tokens)} - fd = frequency_distribution.items() + fd = list(frequency_distribution.items()) fd.sort(lambda x, y: cmp(y[1], x[1])) return {'freqdist': fd} diff --git a/pypln/backend/workers/palavras_noun_phrase.py b/pypln/backend/workers/palavras_noun_phrase.py index 76e3a18..f9dde80 100644 --- a/pypln/backend/workers/palavras_noun_phrase.py +++ b/pypln/backend/workers/palavras_noun_phrase.py @@ -40,7 +40,7 @@ def process(self, document): stdout=subprocess.PIPE, stderr=subprocess.PIPE) palavras_output = document['palavras_raw'] - if isinstance(palavras_output, unicode): + if isinstance(palavras_output, str): # we *need* to send a 'str' to the process. Otherwise it's going to try to use ascii. palavras_output = palavras_output.encode('utf-8') stdout, stderr = process.communicate(palavras_output) diff --git a/pypln/backend/workers/palavras_raw.py b/pypln/backend/workers/palavras_raw.py index 77e2d9a..e45bb11 100644 --- a/pypln/backend/workers/palavras_raw.py +++ b/pypln/backend/workers/palavras_raw.py @@ -46,7 +46,7 @@ def process(self, document): # default codec (ascii) in `text.encode(PALAVRAS_ENCODING)`. Since we # know the text came from mongodb, we can just decode it using utf-8 to # make sure we have a unicode object. 
- if not isinstance(text, unicode): + if not isinstance(text, str): text = text.decode('utf-8') process = subprocess.Popen([BASE_PARSER, PARSER_MODE], diff --git a/pypln/backend/workers/palavras_semantic_tagger.py b/pypln/backend/workers/palavras_semantic_tagger.py index 3f35ca5..1e448de 100644 --- a/pypln/backend/workers/palavras_semantic_tagger.py +++ b/pypln/backend/workers/palavras_semantic_tagger.py @@ -26,381 +26,381 @@ { 'Animal': { - '': u'Animal, umbrella tag (clone, fêmea, fóssil, parasito, predador)' , - '': u'Group of animals (cardume, enxame, passarada, ninhada)', - '': u'Domestic animal or big mammal (likely to have female forms etc.: terneiro, leão/leoa, cachorro)', - '': u'Group of domestic animals (boiada)', - '': u'Water-animal (tubarão, delfim)', - '': u'Mythological animal (basilisco)', - '': u'Land-animal (raposa)', - '': u'Bird (águia, bem-te-vi)', - '': u'Insect (borboleta)', - '': u'Cell-animal (bacteria, blood cells: linfócito)', + '': 'Animal, umbrella tag (clone, fêmea, fóssil, parasito, predador)' , + '': 'Group of animals (cardume, enxame, passarada, ninhada)', + '': 'Domestic animal or big mammal (likely to have female forms etc.: terneiro, leão/leoa, cachorro)', + '': 'Group of domestic animals (boiada)', + '': 'Water-animal (tubarão, delfim)', + '': 'Mythological animal (basilisco)', + '': 'Land-animal (raposa)', + '': 'Bird (águia, bem-te-vi)', + '': 'Insect (borboleta)', + '': 'Cell-animal (bacteria, blood cells: linfócito)', }, 'Plant': { - '': u'Plant, umbrella tag', - '': u'Group of plants, plantation (field, forest etc.: mata, nabal)', - '': u'Tree (oliveira, palmeira)', - '': u'Flower (rosa, taraxaco)', - '': u'Bush, shrub (rododendro, tamariz)', - '': u'(fruit, berries, nuts: maçã, morango, avelã, melancia)', - '': u'(vegetable espargo, funcho)', + '': 'Plant, umbrella tag', + '': 'Group of plants, plantation (field, forest etc.: mata, nabal)', + '': 'Tree (oliveira, palmeira)', + '': 'Flower (rosa, taraxaco)', + '': 'Bush, shrub (rododendro, tamariz)', + '': '(fruit, berries, nuts: maçã, morango, avelã, melancia)', + '': '(vegetable espargo, funcho)', }, 'Human': { - '': u'Human, umbrella tag', - '': u'Group of humans (organisations, teams, companies, e.g. editora)', - '': u'Attributive human umbrella tag (many -ista, -ante)', - '': u'Human classified by biological criteria (race, age etc., caboclo, mestiço, bebé, adulto)', - '': u'Human with family or other private relation (pai, noiva)', - '': u'Ideological human (comunista, implies ), also: follower, disciple (dadaista)', - '': u'Humanoid mythical (gods, fairy tale humanoids, curupira, duende)', - '': u'Nationality human (brasileiro, alemão), also: inhabitant (lisboeta)', - '': u'Professional human (marinheiro, implies ), also: sport, hobby (alpinista)', - '': u'Sick human (few: asmático, diabético, cp )', - '': u'Title noun (rei, senhora)', + '': 'Human, umbrella tag', + '': 'Group of humans (organisations, teams, companies, e.g. 
editora)', + '': 'Attributive human umbrella tag (many -ista, -ante)', + '': 'Human classified by biological criteria (race, age etc., caboclo, mestiço, bebé, adulto)', + '': 'Human with family or other private relation (pai, noiva)', + '': 'Ideological human (comunista, implies ), also: follower, disciple (dadaista)', + '': 'Humanoid mythical (gods, fairy tale humanoids, curupira, duende)', + '': 'Nationality human (brasileiro, alemão), also: inhabitant (lisboeta)', + '': 'Professional human (marinheiro, implies ), also: sport, hobby (alpinista)', + '': 'Sick human (few: asmático, diabético, cp )', + '': 'Title noun (rei, senhora)', }, 'Place and spatial': { - '': u'Place, umbrella tag', - '': u'Abstract place (anverso. auge)', - '': u'Civitas, town, country, county (equals + , cidade, país)', - '': u'Cover, lid (colcha, lona, tampa)', - '': u'Functional place, human built or human-used (aeroporto, anfiteatro, cp. for just a building)', - '': u'opening, hole (apertura, fossa)', - '': u'Path (road, street etc.: rua, pista)' , - '': u'Star object (planets, comets: planeta, quasar)', - '': u'surface (face, verniz, cp. )', - '': u'tip place, edge (pico, pontinha, cp. )', - '': u'Geographical, natural place (promontório, pântano)', - '': u'trap place (armadilha, armazelo)', - '': u'Water place (river, lake, sea: fonte, foz, lagoa)', - '': u'barrier noun (dique, limite, muralha)', - '': u'(building)', - '': u'(institution)', - '': u'(picture)', - '': u'(situation)', - '': u'anatomical/body position (few: desaprumo)', - '': u'social position, job (emprego, condado, capitania, presidência)', + '': 'Place, umbrella tag', + '': 'Abstract place (anverso. auge)', + '': 'Civitas, town, country, county (equals + , cidade, país)', + '': 'Cover, lid (colcha, lona, tampa)', + '': 'Functional place, human built or human-used (aeroporto, anfiteatro, cp. for just a building)', + '': 'opening, hole (apertura, fossa)', + '': 'Path (road, street etc.: rua, pista)' , + '': 'Star object (planets, comets: planeta, quasar)', + '': 'surface (face, verniz, cp. )', + '': 'tip place, edge (pico, pontinha, cp. )', + '': 'Geographical, natural place (promontório, pântano)', + '': 'trap place (armadilha, armazelo)', + '': 'Water place (river, lake, sea: fonte, foz, lagoa)', + '': 'barrier noun (dique, limite, muralha)', + '': '(building)', + '': '(institution)', + '': '(picture)', + '': '(situation)', + '': 'anatomical/body position (few: desaprumo)', + '': 'social position, job (emprego, condado, capitania, presidência)', }, 'Vehicle': { - '': u'Vehicle, umbrella tag and ground vehicle (car, train: carro, comboio, tanque, teleférico)', - '': u'Group of vehicles (armada, convoy: frota, esquadra)', - '': u'Water vehicle (ship: navio, submersível, canoa)', - '': u'Air vehicle (plane: hidroplano, jatinho)', + '': 'Vehicle, umbrella tag and ground vehicle (car, train: carro, comboio, tanque, teleférico)', + '': 'Group of vehicles (armada, convoy: frota, esquadra)', + '': 'Water vehicle (ship: navio, submersível, canoa)', + '': 'Air vehicle (plane: hidroplano, jatinho)', }, 'Abstract': { - '': u'Abstract countable, umbrella tag (alternativa, chance, lazer)', - '': u'Category word (latinismo, número atômico)', - '': u'sign, symbol (parêntese, semicolcheia)', - '': u'Abstract mass/non-countable, umbrella tag (still contains many cases that could be , e.g. habilidade, legalidade)', - '': u'Abstract/concept, neither countable nor mass (endogamia), cp. 
, etc.', - '': u'(features)', - '': u'direction noun (estibordo, contrasenso, norte)', - '': u'(shapes)', - '': u'meta noun (tipo, espécie)', - '': u'(MARCA) brand', - '': u'(DISCIPLINA) subject matter', - '': u'(ESCOLA) school of thought', - '': u'(IDEA) idea, concept', - '': u'(PLANO) named plan, project', - '': u'(OBRA) artist-s name, standing for body of work', - '': u'(NOME)', - '': u'(ESTADO) physiological state, in particular: disease', + '': 'Abstract countable, umbrella tag (alternativa, chance, lazer)', + '': 'Category word (latinismo, número atômico)', + '': 'sign, symbol (parêntese, semicolcheia)', + '': 'Abstract mass/non-countable, umbrella tag (still contains many cases that could be , e.g. habilidade, legalidade)', + '': 'Abstract/concept, neither countable nor mass (endogamia), cp. , etc.', + '': '(features)', + '': 'direction noun (estibordo, contrasenso, norte)', + '': '(shapes)', + '': 'meta noun (tipo, espécie)', + '': '(MARCA) brand', + '': '(DISCIPLINA) subject matter', + '': '(ESCOLA) school of thought', + '': '(IDEA) idea, concept', + '': '(PLANO) named plan, project', + '': '(OBRA) artist-s name, standing for body of work', + '': '(NOME)', + '': '(ESTADO) physiological state, in particular: disease', }, 'Concept': { - '': u'convention (social rule or law, lei, preceito)', - '': u'subject matter, profession, cf. , anatomia, citricultura, dactilografia)', - '': u'ideology or other value system (anarquismo, anti-ocidentalismo, apartheid)', - '': u'', - '': u'language (alemão, catalão, bengali)', - '': u'', - '': u'', - '': u'therapy (also and , acupuntura, balneoterapia)', + '': 'convention (social rule or law, lei, preceito)', + '': 'subject matter, profession, cf. , anatomia, citricultura, dactilografia)', + '': 'ideology or other value system (anarquismo, anti-ocidentalismo, apartheid)', + '': '', + '': 'language (alemão, catalão, bengali)', + '': '', + '': '', + '': 'therapy (also and , acupuntura, balneoterapia)', }, 'Game': { - '': u'play, game (bilhar, ioiô, poker, also )', + '': 'play, game (bilhar, ioiô, poker, also )', }, 'Genre': { - '': u'genre (especially art genre, cf. , modernismo, tropicalismo)', + '': 'genre (especially art genre, cf. , modernismo, tropicalismo)', }, 'Quantity': { - '': u'', - '': u'quantity noun (bocada, teor, sem-fim)', - '': u'currency noun (countable, implies , cf. , dirham, euro, real, dólar)', - '': u'amount of money (bolsa, custo, imposto, cf. )', + '': '', + '': 'quantity noun (bocada, teor, sem-fim)', + '': 'currency noun (countable, implies , cf. , dirham, euro, real, dólar)', + '': 'amount of money (bolsa, custo, imposto, cf. 
)', }, 'Action': { - '': u'Action umbrella tag (+CONTROL, PERFECTIVE)', - '': u'beat-action (thrashing, pancada, surra)', - '': u'do-action (typically dar/fazer + N, tentativa, teste, homenagem)', - '': u'speech act or communicative act (proposta, ordem)', - '': u'trick-action (cheat, fraud, ruse, jeito, fraude, similar to )', - '': u'Activity, umbrella tag (+CONTROL, IMPERFECTIVE, correria, manejo)', - '': u'', - '': u'', - '': u'', - '': u'dance (both , and , calipso, flamenco, forró)', - '': u'fight, conflict (also and +TEMP, briga, querela)', - '': u'speech situation, talk, discussion, quarrel (implies and , entrevista, lero-lero)', + '': 'Action umbrella tag (+CONTROL, PERFECTIVE)', + '': 'beat-action (thrashing, pancada, surra)', + '': 'do-action (typically dar/fazer + N, tentativa, teste, homenagem)', + '': 'speech act or communicative act (proposta, ordem)', + '': 'trick-action (cheat, fraud, ruse, jeito, fraude, similar to )', + '': 'Activity, umbrella tag (+CONTROL, IMPERFECTIVE, correria, manejo)', + '': '', + '': '', + '': '', + '': 'dance (both , and , calipso, flamenco, forró)', + '': 'fight, conflict (also and +TEMP, briga, querela)', + '': 'speech situation, talk, discussion, quarrel (implies and , entrevista, lero-lero)', }, 'Anatomical': { - '': u'Anatomical noun, umbrella tag (carótida, clítoris, dorso)', - '': u'Movable anatomy (arm, leg, braço, bíceps, cotovelo)', - '': u'Organ (heart, liver, hipófise, coração, testículo)', - '': u'Bone (calcâneo, fíbula, vértebra)', - '': u'Animal anatomy (rúmen, carapaça, chifres, tromba)', - '': u'Bird anatomy (bico, pluma)', - '': u'Fish anatomy (few: bránquias, siba)', - '': u'Insect anatomy (few: tentáculo, olho composto)', - '': u'Plant anatomy (bulbo, caule, folha)', - '': u'(human anatomical feature)', + '': 'Anatomical noun, umbrella tag (carótida, clítoris, dorso)', + '': 'Movable anatomy (arm, leg, braço, bíceps, cotovelo)', + '': 'Organ (heart, liver, hipófise, coração, testículo)', + '': 'Bone (calcâneo, fíbula, vértebra)', + '': 'Animal anatomy (rúmen, carapaça, chifres, tromba)', + '': 'Bird anatomy (bico, pluma)', + '': 'Fish anatomy (few: bránquias, siba)', + '': 'Insect anatomy (few: tentáculo, olho composto)', + '': 'Plant anatomy (bulbo, caule, folha)', + '': '(human anatomical feature)', }, 'Thing': { - '': u'Concrete countable object, umbrella tag (briquete, coágulo, normally movable things, unlike )', - '': u'Artifact, umbrella tag (so far empty category in PALAVRAS)', - '': u'ornamental object (few: guirlanda, rufo)', - '': u'flat long object (few: board, plank, lousa, tabla)', - '': u'fire object (bonfire, spark, chispa, fogo, girândola)', - '': u'handle (garra, ansa, chupadouro)', - '': u'light artifact (lampião, farol, projector) ', - '': u'(atomic) particle (few: cátion, eletrônio)', - '': u'read object (carteira, cupom, bilhete, carta, cf. )', - '': u'cloth object (towel, napkin, carpet, rag) , cp. ', - '': u'(cc-round) stones and stone-sized round objects (pedra, itá, amonite, tijolo)', - '': u'stick object (long and thin, vara, lançe, paulito)', - '': u'(OBJECT) named object', - '': u'(OBJECT) common noun used as name', - '': u'(SUBSTANCIA) substance', - '': u'(CLASSE) classification category for things', - '': u'(CLASSE) plant name', - '': u'(MOEDA) currency name (also marked on the number)', - '': u'mass noun (e.g. 
"leite", "a-gua")', - '': u'furniture (cama, cadeira, tambo, quadro)', - '': u'container (implies quantifying, ampola, chícara, aquário)', + '': 'Concrete countable object, umbrella tag (briquete, coágulo, normally movable things, unlike )', + '': 'Artifact, umbrella tag (so far empty category in PALAVRAS)', + '': 'ornamental object (few: guirlanda, rufo)', + '': 'flat long object (few: board, plank, lousa, tabla)', + '': 'fire object (bonfire, spark, chispa, fogo, girândola)', + '': 'handle (garra, ansa, chupadouro)', + '': 'light artifact (lampião, farol, projector) ', + '': '(atomic) particle (few: cátion, eletrônio)', + '': 'read object (carteira, cupom, bilhete, carta, cf. )', + '': 'cloth object (towel, napkin, carpet, rag) , cp. ', + '': '(cc-round) stones and stone-sized round objects (pedra, itá, amonite, tijolo)', + '': 'stick object (long and thin, vara, lançe, paulito)', + '': '(OBJECT) named object', + '': '(OBJECT) common noun used as name', + '': '(SUBSTANCIA) substance', + '': '(CLASSE) classification category for things', + '': '(CLASSE) plant name', + '': '(MOEDA) currency name (also marked on the number)', + '': 'mass noun (e.g. "leite", "a-gua")', + '': 'furniture (cama, cadeira, tambo, quadro)', + '': 'container (implies quantifying, ampola, chícara, aquário)', }, 'Substance': { - '': u'concrete mass/non-countable, umbrella tag, substance (cf. , terra, choça, magma)', - '': u'human-made substance (cf. , cemento)', - '': u'chemical substance, also biological (acetileno, amônio, anilina, bilirrubina', - '': u'gas substance (so far few: argônio, overlap with. and )', - '': u'liquid substance (azeite, gasolina, plasma, overlap with and )', - '': u'remedy (medical or hygiene, antibiótico, canabis, quinina, part of , overlap with )', + '': 'concrete mass/non-countable, umbrella tag, substance (cf. , terra, choça, magma)', + '': 'human-made substance (cf. , cemento)', + '': 'chemical substance, also biological (acetileno, amônio, anilina, bilirrubina', + '': 'gas substance (so far few: argônio, overlap with. and )', + '': 'liquid substance (azeite, gasolina, plasma, overlap with and )', + '': 'remedy (medical or hygiene, antibiótico, canabis, quinina, part of , overlap with )', }, 'Materials': { - '': u'material (argila, bronze, granito, cf. )', - '': u'cloth material (seda, couro, vison, kevlar), cp. ', - '': u'cord, string, rope, tape (previously , arame, fio, fibrila)', + '': 'material (argila, bronze, granito, cf. )', + '': 'cloth material (seda, couro, vison, kevlar), cp. ', + '': 'cord, string, rope, tape (previously , arame, fio, fibrila)', }, 'Clothing': { - '': u'animal clothing (sela, xabraque)', - '': u'human clothing (albornoz, anoraque, babadouro, bermudas)', - '': u'beauty clothing (e.g. jewelry, diadema, pendente, pulseira)', - '': u'hat (sombrero, mitra, coroa)', - '': u'shoe (bota, chinela, patim)', - '': u'cloth material (seda, couro, vison, kevlar), cp. ', - '': u'(clothing)', + '': 'animal clothing (sela, xabraque)', + '': 'human clothing (albornoz, anoraque, babadouro, bermudas)', + '': 'beauty clothing (e.g. jewelry, diadema, pendente, pulseira)', + '': 'hat (sombrero, mitra, coroa)', + '': 'shoe (bota, chinela, patim)', + '': 'cloth material (seda, couro, vison, kevlar), cp. 
', + '': '(clothing)', }, 'Collective': { - '': u'set,collective (random or systematic collection/compound/multitude of similar but distinct small parts, conjunto, série)', - '': u'thing collective, pile (baralho, lanço)', - '': u'plant-part collective (buquê, folhagem)', - '': u'semantic collective, collection (arquivo, repertório)', - '': u'tool collective, set (intrumentário, prataria)', - '': u'(group)', - '': u'(herd)', - '': u'(plantation)', - '': u'(convoy)', + '': 'set,collective (random or systematic collection/compound/multitude of similar but distinct small parts, conjunto, série)', + '': 'thing collective, pile (baralho, lanço)', + '': 'plant-part collective (buquê, folhagem)', + '': 'semantic collective, collection (arquivo, repertório)', + '': 'tool collective, set (intrumentário, prataria)', + '': '(group)', + '': '(herd)', + '': '(plantation)', + '': '(convoy)', }, 'Time_Event': { - '': u'duration noun (test: durar+, implies , e.g. átimo, mês, hora)', - '': u'temporal object, point in time (amanhecer, novilúnio, test: até+, cf. and )', - '': u'non-organised event (-CONTROL, PERFECTIVE, milagre, morte)', - '': u'occasion, human/social event (copa do mundo, aniversário, jantar, desfile, cp. unorganized ) ', - '': u'process (-CONTROL, -PERFECTIVE, cp. , balcanização, convecção, estagnação)', - '': u'', - '': u'', - '': u'(EFEMERIDE) one-time [historical] occurrence', - '': u'(DATA) date', - '': u'(HORA) hour', - '': u'(PERIODO) period', - '': u'(CICLICO) cyclic time expression', - '': u'month noun/name (agosto, julho, part of )', - '': u'period of time (prototypical test: durante, e.g. guerra, década, cf. and )', + '': 'duration noun (test: durar+, implies , e.g. átimo, mês, hora)', + '': 'temporal object, point in time (amanhecer, novilúnio, test: até+, cf. and )', + '': 'non-organised event (-CONTROL, PERFECTIVE, milagre, morte)', + '': 'occasion, human/social event (copa do mundo, aniversário, jantar, desfile, cp. unorganized ) ', + '': 'process (-CONTROL, -PERFECTIVE, cp. , balcanização, convecção, estagnação)', + '': '', + '': '', + '': '(EFEMERIDE) one-time [historical] occurrence', + '': '(DATA) date', + '': '(HORA) hour', + '': '(PERIODO) period', + '': '(CICLICO) cyclic time expression', + '': 'month noun/name (agosto, julho, part of )', + '': 'period of time (prototypical test: durante, e.g. guerra, década, cf. and )', }, 'Feature': { - '': u'feature/property, umbrella tag (problematicidade, proporcionalidade)', - '': u'anatomical "local" feature, includes countables, e.g. barbela, olheiras)', - '': u'general countable feature (vestígio, laivos, vinco)', - '': u'human physical feature, not countable (lindura, compleição, same as , cp. anatomical local features )', - '': u'', - '': u'human psychological feature (passionalidade, pavonice, cp. passing states )', - '': u'quantifiable feature (e.g. circunferência, calor, DanGram-s covers both and )', - '': u'', - '': u'human social feature (right or duty): e.g. copyright, privilégio, imperativo legal)', - '': u'', - '': u'(human state)', + '': 'feature/property, umbrella tag (problematicidade, proporcionalidade)', + '': 'anatomical "local" feature, includes countables, e.g. barbela, olheiras)', + '': 'general countable feature (vestígio, laivos, vinco)', + '': 'human physical feature, not countable (lindura, compleição, same as , cp. anatomical local features )', + '': '', + '': 'human psychological feature (passionalidade, pavonice, cp. passing states )', + '': 'quantifiable feature (e.g. 
circunferência, calor, DanGram-s covers both and )', + '': '', + '': 'human social feature (right or duty): e.g. copyright, privilégio, imperativo legal)', + '': '', + '': '(human state)', }, 'Food': { - '': u'natural/simplex food (aveia, açúcar, carne, so far including )', - '': u'countable food (few: ovo, dente de alho, most are or )', - '': u'human-prepared/complex culinary food (caldo verde, lasanha)', - '': u'culinary countable food (biscoito, enchido, panetone, pastel)', - '': u'drink (cachaça, leite, guaraná, moca)', - '': u'fruit, berry, nut (still mostly marked as , abricote, amora, avelã, cebola)', - '': u'condiments, pepper', + '': 'natural/simplex food (aveia, açúcar, carne, so far including )', + '': 'countable food (few: ovo, dente de alho, most are or )', + '': 'human-prepared/complex culinary food (caldo verde, lasanha)', + '': 'culinary countable food (biscoito, enchido, panetone, pastel)', + '': 'drink (cachaça, leite, guaraná, moca)', + '': 'fruit, berry, nut (still mostly marked as , abricote, amora, avelã, cebola)', + '': 'condiments, pepper', }, 'Part': { - '': u'distinctive or functional part (ingrediente, parte, trecho)', - '': u'structural part of building or vehicle (balustrada, porta, estai)', - '': u'indistinctive (little) piece (pedaço, raspa)', - '': u'', - '': u'', + '': 'distinctive or functional part (ingrediente, parte, trecho)', + '': 'structural part of building or vehicle (balustrada, porta, estai)', + '': 'indistinctive (little) piece (pedaço, raspa)', + '': '', + '': '', }, 'Perception': { - '': u'what you feel (senses or sentiment, pain, e.g. arrepio, aversão, desagrado, cócegas, some overlap with )', - '': u'sound (what you hear, apitadela, barrulho, berro, crepitação)', - '': u'olfactory impression (what you smell, bafo, chamuscom fragrância)', - '': u'what you taste (PALAVRAS: not implemented)', - '': u'visual impression (what you see, arco-iris, réstia, vislumbre)', + '': 'what you feel (senses or sentiment, pain, e.g. arrepio, aversão, desagrado, cócegas, some overlap with )', + '': 'sound (what you hear, apitadela, barrulho, berro, crepitação)', + '': 'olfactory impression (what you smell, bafo, chamuscom fragrância)', + '': 'what you taste (PALAVRAS: not implemented)', + '': 'visual impression (what you see, arco-iris, réstia, vislumbre)', }, 'Semantic Product': { - '': u'semiotic artifact, work of art, umbrella tag (all specified in PALAVRAS)', - '': u'cognition product (concept, plan, system, conjetura, esquema, plano, prejuízo)', - '': u'listen-work (music, cantarola, prelúdio, at the same time : bossa nova)', - '': u'nonsense, rubbish (implies , galimatias, farelório)', - '': u'read-work (biografia, dissertação, e-mail, ficha cadastral)', - '': u'speak-work (palestra, piada, exposto)', - '': u'watch-work (filme, esquete, mininovela)', - '': u'(speach act)', - '': u'', + '': 'semiotic artifact, work of art, umbrella tag (all specified in PALAVRAS)', + '': 'cognition product (concept, plan, system, conjetura, esquema, plano, prejuízo)', + '': 'listen-work (music, cantarola, prelúdio, at the same time : bossa nova)', + '': 'nonsense, rubbish (implies , galimatias, farelório)', + '': 'read-work (biografia, dissertação, e-mail, ficha cadastral)', + '': 'speak-work (palestra, piada, exposto)', + '': 'watch-work (filme, esquete, mininovela)', + '': '(speach act)', + '': '', }, 'Disease': { - '': u'disease (acne, AIDS, sida, alcoolismo, cp. 
)', - '': u'', - '': u'countable disease-object (abscesso, berruga, cicatriz, gangrena)', + '': 'disease (acne, AIDS, sida, alcoolismo, cp. )', + '': '', + '': 'countable disease-object (abscesso, berruga, cicatriz, gangrena)', }, 'State-of-affairs': { - '': u'psychological situation or physical state of affairs (reclusão, arruaça, ilegalidade, more complex and more "locative" than and ', - '': u'state (of something, otherwise ), abundância, calma, baixa-mar, equilíbrio', - '': u'human state (desamparo, desesperança, dormência, euforia, febre', - '': u'', - '': u'', + '': 'psychological situation or physical state of affairs (reclusão, arruaça, ilegalidade, more complex and more "locative" than and ', + '': 'state (of something, otherwise ), abundância, calma, baixa-mar, equilíbrio', + '': 'human state (desamparo, desesperança, dormência, euforia, febre', + '': '', + '': '', }, 'Sport': { - '': u'sport (capoeira, futebol, golfe, also and )', + '': 'sport (capoeira, futebol, golfe, also and )', }, 'Tool': { - '': u'tool, umbrella tag (abana-moscas, lápis, computador, maceta, "handable", cf. )', - '': u'cutting tool, knife (canivete, espada)', - '': u'shooting tool, gun (carabina, metralhadora, helicanão, in Dangram: )', - '': u'musical instrument (clavicórdio, ocarina, violão)', - '': u'sailing tool, sail (vela latina, joanete, coringa)', - '': u'machine (complex, usually with moving parts, betoneira, embrulhador, limpa-pratos, cp. )', - '': u'tube object (cânula, gasoduto, zarabatana, shape-category, typically with another category, like or )', + '': 'tool, umbrella tag (abana-moscas, lápis, computador, maceta, "handable", cf. )', + '': 'cutting tool, knife (canivete, espada)', + '': 'shooting tool, gun (carabina, metralhadora, helicanão, in Dangram: )', + '': 'musical instrument (clavicórdio, ocarina, violão)', + '': 'sailing tool, sail (vela latina, joanete, coringa)', + '': 'machine (complex, usually with moving parts, betoneira, embrulhador, limpa-pratos, cp. )', + '': 'tube object (cânula, gasoduto, zarabatana, shape-category, typically with another category, like or )', }, 'Unit': { - '': u'unit noun (always implying , implied by and , e.g. caloria, centímetro, lúmen))', + '': 'unit noun (always implying , implied by and , e.g. caloria, centímetro, lúmen))', }, 'Weather': { - '': u'weather (states), umbrella tag (friagem, bruma)', - '': u'countable weather phenomenon (nuvem, tsunami)', - '': u'rain and other precipitation (chuvisco, tromba d-água, granizo)', - '': u'wind, storm (brisa, furacão)', + '': 'weather (states), umbrella tag (friagem, bruma)', + '': 'countable weather phenomenon (nuvem, tsunami)', + '': 'rain and other precipitation (chuvisco, tromba d-água, granizo)', + '': 'wind, storm (brisa, furacão)', }, 'Person': { - '': u'(INDIVIDUAL) person name (cp. )', - '': u'(CARGO) official function (~ cp. and )', - '': u'(MEMBRO) member', + '': '(INDIVIDUAL) person name (cp. )', + '': '(CARGO) official function (~ cp. and )', + '': '(MEMBRO) member', }, 'Organization_Group': { - '': u'(ADMINISTRACAO, ORG.) administrative body (government, town administration etc.)', - '': u'(INSTITUICAO/EMPRESA) commercial or non-commercial, non-administrative non-party organisations (not place-bound, therefore not the same as )', - '': u'(EMPRESA) organized site (e.g. restaurant, cp. )', - '': u'(EMPRESA) media organisation (e.g. 
newspaper, tv channel)', - '': u'(INSTITUICAO) political party', - '': u'(SUB) organized part of any of the above', - '': u'currently unsupported: (EMPRESA) company (not site-bound, unlike , now fused with. )', + '': '(ADMINISTRACAO, ORG.) administrative body (government, town administration etc.)', + '': '(INSTITUICAO/EMPRESA) commercial or non-commercial, non-administrative non-party organisations (not place-bound, therefore not the same as )', + '': '(EMPRESA) organized site (e.g. restaurant, cp. )', + '': '(EMPRESA) media organisation (e.g. newspaper, tv channel)', + '': '(INSTITUICAO) political party', + '': '(SUB) organized part of any of the above', + '': 'currently unsupported: (EMPRESA) company (not site-bound, unlike , now fused with. )', }, 'Group': { - '': u'(GROUPOIND) people, family', - '': u'(GROUPOCARGO) board, government (not fully implemented)', - '': u'currently unsupported (GROUPOMEMBRO) club, e.g. football club (now fused with )', + '': '(GROUPOIND) people, family', + '': '(GROUPOCARGO) board, government (not fully implemented)', + '': 'currently unsupported (GROUPOMEMBRO) club, e.g. football club (now fused with )', }, 'Place': { - '': u'(GEOGRAFICO) geographical location (cp. )', - '': u'(ADMINISTRACAO, LOC.) civitas (country, town, state, cp. )', - '
': u'(CORREIO) address (including numbers etc.)', - '': u'(ALARGADO) functional place (cp. )', - '': u'(VIRTUAL) virtual place', - '': u'(OBJECTO) astronomical place (in HAREM object, not place)', - '': u'suggested (ALARGADO) roads, motorway (unlike
)', + '': '(GEOGRAFICO) geographical location (cp. )', + '': '(ADMINISTRACAO, LOC.) civitas (country, town, state, cp. )', + '
': '(CORREIO) address (including numbers etc.)', + '': '(ALARGADO) functional place (cp. )', + '': '(VIRTUAL) virtual place', + '': '(OBJECTO) astronomical place (in HAREM object, not place)', + '': 'suggested (ALARGADO) roads, motorway (unlike
)', }, 'Work_of_Art': { - '': u'(REPRODUZIDO) [title of] reproduced work, copy', - '': u'(PUBLICACAO) [scientific] publication', - '': u'(PRODUTO) product brand', - '': u'(PRODUTO) vehicle brand (cp. , , )', - '': u'(ARTE) work of art', - '': u'picture (combination of , and , caricatura, cintilograma, diapositivo)', + '': '(REPRODUZIDO) [title of] reproduced work, copy', + '': '(PUBLICACAO) [scientific] publication', + '': '(PRODUTO) product brand', + '': '(PRODUTO) vehicle brand (cp. , , )', + '': '(ARTE) work of art', + '': 'picture (combination of , and , caricatura, cintilograma, diapositivo)', }, 'Colours': { - '': u'colours', + '': 'colours', }, 'Numeric_and_Math': { - '': u'(QUANTIDADE) simple measuring numeral', - '': u'(CLASSIFICADO) predicating numeral', - '': u'(MOEDA) currency name (also marked on the unit)', - '': u'geometry noun (circle, shape, e.g. losango, octógono, elipse)', - '': u'line (few: linha, percentil, curvas isobáricas)', + '': '(QUANTIDADE) simple measuring numeral', + '': '(CLASSIFICADO) predicating numeral', + '': '(MOEDA) currency name (also marked on the unit)', + '': 'geometry noun (circle, shape, e.g. losango, octógono, elipse)', + '': 'line (few: linha, percentil, curvas isobáricas)', }, 'Modifying_Adjectives': { - '': u'adjective modifying human noun', - '': u'adjective modifying inanimate noun ', - '': u'adjective modifying animal', - '': u'adjective modifying plant', - '': u'color adjective', - '': u'nationality adjective (also: from a certain town etc.)', - '': u'(human) attributive adjective (not fully implemented, cp. , e.g. "um presidente COMUNISTA")', + '': 'adjective modifying human noun', + '': 'adjective modifying inanimate noun ', + '': 'adjective modifying animal', + '': 'adjective modifying plant', + '': 'color adjective', + '': 'nationality adjective (also: from a certain town etc.)', + '': '(human) attributive adjective (not fully implemented, cp. , e.g. "um presidente COMUNISTA")', }, 'Verbs_related_human_things': { - '': u'verb with human subject', - '': u'verb with inanimate subject', + '': 'verb with human subject', + '': 'verb with inanimate subject', }, } @@ -425,7 +425,7 @@ def process(self, document): word_sem_tags = angle_brackets_contents.findall(line.strip()) is_tagged = False for tag in word_sem_tags: - for category, subcategories in SEMANTIC_TAGS.items(): + for category, subcategories in list(SEMANTIC_TAGS.items()): if tag in subcategories: tagged_entities.setdefault(category, []).append(word) is_tagged = True diff --git a/pypln/backend/workers/pos/__init__.py b/pypln/backend/workers/pos/__init__.py index 9400fd1..4647189 100644 --- a/pypln/backend/workers/pos/__init__.py +++ b/pypln/backend/workers/pos/__init__.py @@ -18,8 +18,8 @@ # along with PyPLN. If not, see . -import en_nltk -import pt_palavras +from . import en_nltk +from . 
import pt_palavras from pypln.backend.workers.palavras_raw import palavras_installed from pypln.backend.celery_task import PyPLNTask @@ -48,7 +48,7 @@ def process(self, document): if language in MAPPING: tagset, tagged_text = MAPPING[language](document) text = document['text'] - if not isinstance(text, unicode): + if not isinstance(text, str): text = text.decode('utf-8') tagged_text_with_offset = put_offset(text, tagged_text) return {'pos': tagged_text_with_offset, 'tagset': tagset} diff --git a/pypln/backend/workers/pos/pt_palavras.py b/pypln/backend/workers/pos/pt_palavras.py index 19d4b9d..d24efa1 100644 --- a/pypln/backend/workers/pos/pt_palavras.py +++ b/pypln/backend/workers/pos/pt_palavras.py @@ -23,56 +23,56 @@ PALAVRAS_ENCODING = 'utf-8' WORD_CLASSES = { - u'N': u'Nouns', - u'PROP': u'Proper nouns', - u'SPEC': u'Specifiers', - u'DET': u'Determiners', - u'PERS': u'Personal pronouns', - u'ADJ': u'Adjectives', - u'ADV': u'Adverbs', - u'V': u'Verbs', - u'NUM': u'Numerals', - u'PRP': u'Preposition', - u'KS': u'Subordinating conjunctions', - u'KC': u'Coordinationg conjunctions', - u'IN': u'Interjections', - u'EC': u'Hyphen-separated prefix', - u'BL': u'Blank Line', - u'ES': u'End of Sentence', - u'NW': u'Non Word', + 'N': 'Nouns', + 'PROP': 'Proper nouns', + 'SPEC': 'Specifiers', + 'DET': 'Determiners', + 'PERS': 'Personal pronouns', + 'ADJ': 'Adjectives', + 'ADV': 'Adverbs', + 'V': 'Verbs', + 'NUM': 'Numerals', + 'PRP': 'Preposition', + 'KS': 'Subordinating conjunctions', + 'KC': 'Coordinationg conjunctions', + 'IN': 'Interjections', + 'EC': 'Hyphen-separated prefix', + 'BL': 'Blank Line', + 'ES': 'End of Sentence', + 'NW': 'Non Word', } def pos(document): if 'palavras_raw' not in document: - return u'', [] + return '', [] palavras_output = document['palavras_raw'] - if not isinstance(palavras_output, unicode): + if not isinstance(palavras_output, str): palavras_output = palavras_output.decode(PALAVRAS_ENCODING) tagged_text = [] - for line in palavras_output.split(u'\n'): + for line in palavras_output.split('\n'): line = line.strip() #print(line) - if line.isspace() or line == u'': + if line.isspace() or line == '': continue - elif line.startswith(u'<'): + elif line.startswith('<'): continue - elif line.startswith(u'$'): + elif line.startswith('$'): non_word = line.split()[0][1:] if non_word.isdigit(): - non_word_tag = u'NUM' + non_word_tag = 'NUM' else: non_word_tag = non_word tagged_text.append((non_word, non_word_tag)) - elif len(line.split(u'\t')) < 2: # Discard malformed lines + elif len(line.split('\t')) < 2: # Discard malformed lines continue else: - info = line.split(u'\t') - final = u'\t'.join(info[1:]).split() + info = line.split('\t') + final = '\t'.join(info[1:]).split() word = info[0].strip() syntatic_semantic_tags = final[1:] - tags = filter(lambda x: x in WORD_CLASSES, syntatic_semantic_tags) + tags = [x for x in syntatic_semantic_tags if x in WORD_CLASSES] if tags: pos_tag = tags[0] tagged_text.append((word, pos_tag)) diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py index 4ad46ef..8778bed 100644 --- a/pypln/backend/workers/trigrams.py +++ b/pypln/backend/workers/trigrams.py @@ -45,11 +45,11 @@ def process(self, document): # We cannot store the trigram as a tuple (mongo keys need to be # strings). We decided to join tokens using spaces since a # space will never be in a token. - key = u' '.join(res[0]) + key = ' '.join(res[0]) # Mongo cannot have `.` or `$` in key names. 
Unfortunatelly # this means we need to replace them with placeholders. - key = key.replace(u'$', u'\dollarsign') - key = key.replace(u'.', u'\dot') + key = key.replace('$', '\dollarsign') + key = key.replace('.', '\dot') tr[key].append(res[1]) return {'trigram_rank': tr, 'metrics':metrics} diff --git a/pypln/backend/workers/word_cloud.py b/pypln/backend/workers/word_cloud.py index 4f55dad..149ec04 100644 --- a/pypln/backend/workers/word_cloud.py +++ b/pypln/backend/workers/word_cloud.py @@ -19,7 +19,7 @@ import base64 import string -from StringIO import StringIO +from io import StringIO import numpy import nltk @@ -32,7 +32,7 @@ def filter_stopwords(fdist, lang): stopwords = list(string.punctuation) if lang in long_name: stopwords += nltk.corpus.stopwords.words(long_name[lang]) - return filter(lambda pair: pair[0].lower() not in stopwords, fdist) + return [pair for pair in fdist if pair[0].lower() not in stopwords] class WordCloud(PyPLNTask): diff --git a/scripts/add_pipelines.py b/scripts/add_pipelines.py index 2450bda..4d30735 100755 --- a/scripts/add_pipelines.py +++ b/scripts/add_pipelines.py @@ -18,7 +18,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . -from __future__ import print_function + import sys from logging import Logger, StreamHandler, Formatter from pymongo import Connection diff --git a/scripts/create_fake_measures.py b/scripts/create_fake_measures.py index 8e6b82f..a542187 100644 --- a/scripts/create_fake_measures.py +++ b/scripts/create_fake_measures.py @@ -25,17 +25,17 @@ data = \ {'host': {'cpu': {'cpu percent': 4.9, 'number of cpus': 4}, - 'memory': {'buffers': 214372352L, + 'memory': {'buffers': 214372352, 'cached': 919220224, - 'free': 1369661440L, + 'free': 1369661440, 'free virtual': 0, 'percent': 65.21955293723627, - 'real free': 2503254016L, + 'real free': 2503254016, 'real percent': 36.433711831634305, - 'real used': 1434767360L, - 'total': 3938021376L, + 'real used': 1434767360, + 'total': 3938021376, 'total virtual': 0, - 'used': 2568359936L, + 'used': 2568359936, 'used virtual': 0}, 'network': {'cluster ip': '127.0.0.1', 'interfaces': {'eth0': {'bytes received': 171472224, @@ -105,8 +105,8 @@ def populate_collection(): db[collection_name].drop() collection = db[collection_name] - print '[{}] Inserting total of {} measures ({} for {} brokers)...'\ - .format(asctime(), measures * brokers, measures, brokers) + print('[{}] Inserting total of {} measures ({} for {} brokers)...'\ + .format(asctime(), measures * brokers, measures, brokers)) for measure in range(1, measures + 1): for broker in range(1, brokers + 1): if '_id' in data: @@ -117,17 +117,17 @@ def populate_collection(): random() collection.insert(data) if measure % 10000 == 0: - print ' [{}] Inserted {} measures'.format(asctime(), - measure * broker) - print '[{}] Done inserting measures!'.format(asctime()) + print(' [{}] Inserted {} measures'.format(asctime(), + measure * broker)) + print('[{}] Done inserting measures!'.format(asctime())) - print '[{}] Creating index for "host.network.cluster ip"'.format(asctime()) + print('[{}] Creating index for "host.network.cluster ip"'.format(asctime())) collection.ensure_index('host.network.cluster ip') - print '[{}] Done!'.format(asctime()) + print('[{}] Done!'.format(asctime())) - print '[{}] Creating index for "timestamp"'.format(asctime()) + print('[{}] Creating index for "timestamp"'.format(asctime())) collection.ensure_index([('timestamp', -1)]) - print '[{}] Done!'.format(asctime()) + print('[{}] 
Done!'.format(asctime())) database_name = 'pypln' collection_name = 'monitoring' # WARNING: it'll drop the collection! @@ -149,10 +149,10 @@ def populate_collection(): .distinct('host.network.cluster ip')) end_time = time() total_time = end_time - start_time -print 'Time to get broker IPs: {}. Broker IPs: {}'.format(total_time, - ', '.join(broker_ips)) +print('Time to get broker IPs: {}. Broker IPs: {}'.format(total_time, + ', '.join(broker_ips))) -print '[{}] Getting last measure for each broker...'.format(asctime()) +print('[{}] Getting last measure for each broker...'.format(asctime())) measures = {} start_time = time() for broker_ip in broker_ips: @@ -162,7 +162,7 @@ def populate_collection(): measures[broker_ip] = result end_time = time() total_time = end_time - start_time -print '[{}] Time to get all information: {}'.format(asctime(), total_time) -for broker_ip, measure_list in measures.iteritems(): - print 'Broker: {}, measure: {}'.format(broker_ip, measure_list[0]) +print('[{}] Time to get all information: {}'.format(asctime(), total_time)) +for broker_ip, measure_list in measures.items(): + print('Broker: {}, measure: {}'.format(broker_ip, measure_list[0])) connection.close() diff --git a/scripts/mongo2sphinx.py b/scripts/mongo2sphinx.py index 89f3438..b6fafd0 100755 --- a/scripts/mongo2sphinx.py +++ b/scripts/mongo2sphinx.py @@ -61,7 +61,7 @@ def serialize(doc,id): an unique unsigned integer `id`. We use a counter for this. """ document = Element("sphinx:document", attrib={'id':str(id)}) - for k,v in doc.iteritems(): + for k,v in doc.items(): if k == '_id': SubElement(document,k).text = str(v) continue diff --git a/tests/test_celery_task.py b/tests/test_celery_task.py index fd1adde..544dda9 100644 --- a/tests/test_celery_task.py +++ b/tests/test_celery_task.py @@ -17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . 
from pypln.backend.celery_task import PyPLNTask -from utils import TaskTest +from .utils import TaskTest class FakeTask(PyPLNTask): def process(self, document): diff --git a/tests/test_elastic_indexer.py b/tests/test_elastic_indexer.py index faaafab..6f319d6 100644 --- a/tests/test_elastic_indexer.py +++ b/tests/test_elastic_indexer.py @@ -1,5 +1,5 @@ #-*- coding:utf-8 -*- -u""" +""" Created on 20/05/15 by fccoelho license: GPL V3 or Later diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index de605e2..6adf67e 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -20,7 +20,7 @@ import nltk from pypln.backend.workers.bigrams import Bigrams -from utils import TaskTest +from .utils import TaskTest bigram_measures = nltk.collocations.BigramAssocMeasures() diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index d7819a5..a3b8f14 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -21,7 +21,7 @@ import os from textwrap import dedent from pypln.backend.workers import Extractor -from utils import TaskTest +from .utils import TaskTest DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) @@ -67,25 +67,25 @@ def test_extraction_from_pdf_file(self): # our Extractor found (it may have found more details # depending on the toolset used to extract metadata) metadata_expected = { - u'Author': u'Álvaro Justen', - u'Creator': u'Writer', - u'Producer': u'LibreOffice 3.5', - u'CreationDate': u'Fri Jun 1 17:07:57 2012', - u'Tagged': u'no', - u'Pages': u'1', - u'Encrypted': u'no', - u'Page size': u'612 x 792 pts (letter)', - u'Optimized': u'no', - u'PDF version': u'1.4', + 'Author': 'Álvaro Justen', + 'Creator': 'Writer', + 'Producer': 'LibreOffice 3.5', + 'CreationDate': 'Fri Jun 1 17:07:57 2012', + 'Tagged': 'no', + 'Pages': '1', + 'Encrypted': 'no', + 'Page size': '612 x 792 pts (letter)', + 'Optimized': 'no', + 'PDF version': '1.4', } - metadata_expected_set = set(metadata_expected.iteritems()) + metadata_expected_set = set(metadata_expected.items()) metadata = refreshed_document['file_metadata'] - metadata_set = set(metadata.iteritems()) + metadata_set = set(metadata.items()) diff_set = metadata_expected_set - metadata_set self.assertTrue(metadata_expected_set.issubset(metadata_set), ("Extracted metadata is not a subset of the expected metadata. " "Items missing or with different values: {}").format( - u", ".join(unicode(item) for item in diff_set))) + ", ".join(str(item) for item in diff_set))) self.assertEqual(refreshed_document['mimetype'], 'application/pdf') def test_extraction_from_html(self): @@ -165,7 +165,7 @@ def test_language_detection_en(self): self.assertEqual(refreshed_document['language'], 'en') def test_unescape_html_entities(self): - expected = (u"This text has html . Álvaro asked me to make" + expected = ("This text has html . 
Álvaro asked me to make" " sure it also has non ascii chars.") filename = os.path.join(DATA_DIR, 'test_html_entities.txt') data = {'filename': filename, @@ -176,7 +176,7 @@ def test_unescape_html_entities(self): self.assertEqual(refreshed_document['text'], expected) def test_should_detect_encoding_and_return_a_unicode_object(self): - expected = u"Flávio" + expected = "Flávio" filename = os.path.join(DATA_DIR, 'test_iso-8859-1.txt') data = {'filename': filename, 'contents': base64.b64encode(open(filename).read())} @@ -184,7 +184,7 @@ def test_should_detect_encoding_and_return_a_unicode_object(self): Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(type(refreshed_document['text']), unicode) + self.assertEqual(type(refreshed_document['text']), str) def test_should_guess_mimetype_for_file_without_extension(self): contents = "This is a test file. I'm testing PyPLN extractor worker!" @@ -213,7 +213,7 @@ def test_unknown_mimetype_should_be_flagged(self): def test_unknown_encoding_should_be_ignored(self): filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt') - expected = u"This file has a weird byte (\x96) that makes it impossible for libmagic to recognize it's encoding." + expected = "This file has a weird byte (\x96) that makes it impossible for libmagic to recognize it's encoding." data = {'filename': filename, 'contents': base64.b64encode(open(filename).read())} doc_id = self.collection.insert(data, w=1) diff --git a/tests/test_worker_freqdist.py b/tests/test_worker_freqdist.py index bde9c98..c23f280 100644 --- a/tests/test_worker_freqdist.py +++ b/tests/test_worker_freqdist.py @@ -17,16 +17,16 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . 
from pypln.backend.workers import FreqDist -from utils import TaskTest +from .utils import TaskTest class TestFreqDistWorker(TaskTest): def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self): - tokens = [u'The', u'sky', u'is', u'blue', u',', u'the', u'sun', u'is', - u'yellow', u'.'] + tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', + 'yellow', '.'] - expected_fd = [[u'is', 2], [u'the', 2], [u'blue', 1], [u'sun', 1], - [u'sky', 1], [u',', 1], [u'yellow', 1], [u'.', 1]] + expected_fd = [['is', 2], ['the', 2], ['blue', 1], ['sun', 1], + ['sky', 1], [',', 1], ['yellow', 1], ['.', 1]] # This is just preparing the expected input in the database diff --git a/tests/test_worker_lemmatizer_pt.py b/tests/test_worker_lemmatizer_pt.py index 3887d81..ed0f156 100644 --- a/tests/test_worker_lemmatizer_pt.py +++ b/tests/test_worker_lemmatizer_pt.py @@ -21,7 +21,7 @@ from textwrap import dedent from pypln.backend.workers import Lemmatizer -from utils import TaskTest +from .utils import TaskTest class TestLemmatizerWorker(TaskTest): diff --git a/tests/test_worker_palavras_noun_phrase.py b/tests/test_worker_palavras_noun_phrase.py index e9982ba..4ed026a 100644 --- a/tests/test_worker_palavras_noun_phrase.py +++ b/tests/test_worker_palavras_noun_phrase.py @@ -22,7 +22,7 @@ from pypln.backend.workers import NounPhrase from pypln.backend.workers.palavras_raw import palavras_installed -from utils import TaskTest +from .utils import TaskTest class TestNounPhraseWorker(TaskTest): diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py index de2b6b8..2c9853d 100644 --- a/tests/test_worker_palavras_raw.py +++ b/tests/test_worker_palavras_raw.py @@ -22,7 +22,7 @@ from textwrap import dedent from pypln.backend.workers import palavras_raw -from utils import TaskTest +from .utils import TaskTest ORIGINAL_PATH = palavras_raw.BASE_PARSER diff --git a/tests/test_worker_palavras_semantic_tagger.py b/tests/test_worker_palavras_semantic_tagger.py index 1b3abbe..f540fc6 100644 --- a/tests/test_worker_palavras_semantic_tagger.py +++ b/tests/test_worker_palavras_semantic_tagger.py @@ -20,7 +20,7 @@ from textwrap import dedent from pypln.backend.workers import SemanticTagger -from utils import TaskTest +from .utils import TaskTest class TestSemanticTaggerWorker(TaskTest): @@ -78,10 +78,10 @@ def test_ambiguous_tags(self): ''').strip() + '\n\n' expected_tags = { - 'Non_Tagged': [u'Eu', u'bem', u'enquanto', u'ele', u'está', - u'em', u'o'], - 'Place and spatial': [u'canto'], - 'Verbs_related_human_things': [u'canto'] + 'Non_Tagged': ['Eu', 'bem', 'enquanto', 'ele', 'está', + 'em', 'o'], + 'Place and spatial': ['canto'], + 'Verbs_related_human_things': ['canto'] } doc_id = self.collection.insert({'palavras_raw': palavras_output, 'palavras_raw_ran': True}, w=1) diff --git a/tests/test_worker_pos.py b/tests/test_worker_pos.py index 6307192..af0c302 100644 --- a/tests/test_worker_pos.py +++ b/tests/test_worker_pos.py @@ -22,7 +22,7 @@ from textwrap import dedent from pypln.backend.workers.palavras_raw import palavras_installed from pypln.backend.workers import POS -from utils import TaskTest +from .utils import TaskTest class TestPosWorker(TaskTest): @@ -56,7 +56,7 @@ def test_pos_should_run_pt_palavras_if_text_is_in_portuguese(self): ''').strip() + '\n\n' # '.' is the only named entity here. 
- expected = [[u'.', u'.', 29]] + expected = [['.', '.', 29]] doc_id = self.collection.insert({'text': text, 'tokens': tokens, 'language': 'pt', 'palavras_raw': palavras_raw}, w=1) POS().delay(doc_id) diff --git a/tests/test_worker_spellchecker.py b/tests/test_worker_spellchecker.py index b81bb93..09e07f6 100644 --- a/tests/test_worker_spellchecker.py +++ b/tests/test_worker_spellchecker.py @@ -20,14 +20,14 @@ import os from textwrap import dedent from pypln.backend.workers import spellchecker -from utils import TaskTest +from .utils import TaskTest DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) class TestSpellcheckerWorker(TaskTest): def test_spellchek_pt(self): - text = u"Meu cachoro é um pastor" + text = "Meu cachoro é um pastor" doc_id = self.collection.insert({'text': text, 'language': 'pt_BR'}, w=1) spellchecker.SpellingChecker().delay(doc_id) @@ -39,7 +39,7 @@ def test_spellchek_pt(self): self.assertEqual(refreshed_document['spelling_errors'][0][1], 4) def test_spellchek_en(self): - text = u"The cat bit the doggyo" + text = "The cat bit the doggyo" doc_id = self.collection.insert({'text': text, 'language': 'en'}, w=1) spellchecker.SpellingChecker().delay(doc_id) diff --git a/tests/test_worker_statistics.py b/tests/test_worker_statistics.py index 3370e8d..4da4cfb 100644 --- a/tests/test_worker_statistics.py +++ b/tests/test_worker_statistics.py @@ -18,7 +18,7 @@ # along with PyPLN. If not, see . from pypln.backend.workers import Statistics -from utils import TaskTest +from .utils import TaskTest class TestStatisticsWorker(TaskTest): diff --git a/tests/test_worker_tokenizer.py b/tests/test_worker_tokenizer.py index 9d59cac..67053a6 100644 --- a/tests/test_worker_tokenizer.py +++ b/tests/test_worker_tokenizer.py @@ -18,7 +18,7 @@ # along with PyPLN. If not, see . from pypln.backend.workers import Tokenizer -from utils import TaskTest +from .utils import TaskTest class TestTokenizerWorker(TaskTest): diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py index 93575e9..683e714 100644 --- a/tests/test_worker_trigrams.py +++ b/tests/test_worker_trigrams.py @@ -18,9 +18,9 @@ # along with PyPLN. If not, see . import nltk -import cPickle +import pickle from pypln.backend.workers.trigrams import Trigrams -from utils import TaskTest +from .utils import TaskTest trigram_measures = nltk.collocations.TrigramAssocMeasures() @@ -33,7 +33,7 @@ def test_Trigrams_should_return_correct_score(self): Trigrams().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) trigram_rank = refreshed_document['trigram_rank'] - result = trigram_rank[u'olive leaf plucked'][0] + result = trigram_rank['olive leaf plucked'][0] # This is the value of the chi_sq measure for this trigram in this # colocation expected_chi_sq = 1940754916.9623578 @@ -45,7 +45,7 @@ def test_Trigrams_may_contain_dots_and_dollar_signs(self): Trigrams().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) trigram_rank = refreshed_document['trigram_rank'] - result = trigram_rank[u'\dollarsign test \dot'][0] + result = trigram_rank['\dollarsign test \dot'][0] # This is the value of the chi_sq measure for this trigram in this # colocation expected_chi_sq = 10.5 diff --git a/tests/test_worker_wordcloud.py b/tests/test_worker_wordcloud.py index 23ed090..2f504f2 100644 --- a/tests/test_worker_wordcloud.py +++ b/tests/test_worker_wordcloud.py @@ -18,12 +18,12 @@ # along with PyPLN. If not, see . 
import base64 -from StringIO import StringIO +from io import StringIO from PIL import Image from pypln.backend.workers import WordCloud -from utils import TaskTest +from .utils import TaskTest class TestFreqDistWorker(TaskTest): From e41028ebd02962ed2962729e8931d4f9cf9f4c86 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 16:39:37 -0200 Subject: [PATCH 06/33] Removes redundant try/except block in urlparse import --- pypln/backend/config.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pypln/backend/config.py b/pypln/backend/config.py index f89bd6f..e5bd6d3 100644 --- a/pypln/backend/config.py +++ b/pypln/backend/config.py @@ -1,12 +1,8 @@ import os +import urllib.parse from decouple import config, Csv -try: - import urllib.parse -except ImportError: - import urllib.parse as urlparse - def parse_url(url): urllib.parse.uses_netloc.append('mongodb') urllib.parse.uses_netloc.append('celery') From ccfb5d9df2bd19a6af466e6d844f2b8507aae789 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 16:45:40 -0200 Subject: [PATCH 07/33] Pins celery version Celery 4.0 won't automatically register tasks that inherit from Task. We need to solve this before we can use the newer version. Celery's documentation only mentions class-based tasks in a way that is very different from what we do here, though. --- requirements/production.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/production.txt b/requirements/production.txt index 0e325c3..c043e27 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -1,5 +1,5 @@ # Common -celery +celery==3.1.23 pymongo==2.8.1 # Backend From 01a5fa63e0dced3f18fe1f55d39afe08e51c5ad1 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 17:02:30 -0200 Subject: [PATCH 08/33] Removes unnecessary cast to list that 2to3 inserted --- pypln/backend/workers/palavras_semantic_tagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypln/backend/workers/palavras_semantic_tagger.py b/pypln/backend/workers/palavras_semantic_tagger.py index 1e448de..a66d42c 100644 --- a/pypln/backend/workers/palavras_semantic_tagger.py +++ b/pypln/backend/workers/palavras_semantic_tagger.py @@ -425,7 +425,7 @@ def process(self, document): word_sem_tags = angle_brackets_contents.findall(line.strip()) is_tagged = False for tag in word_sem_tags: - for category, subcategories in list(SEMANTIC_TAGS.items()): + for category, subcategories in SEMANTIC_TAGS.items(): if tag in subcategories: tagged_entities.setdefault(category, []).append(word) is_tagged = True From b16be95e28855d213839bb13f8e0ffa3bdf475d9 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 17:23:21 -0200 Subject: [PATCH 09/33] Fixes test that expected str but receives bytes This test was changed by 2to3, that's why it broke. 
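
In Python 3, output read through subprocess.PIPE arrives as bytes rather
than str, which is why the expected value gains a b'' prefix here. A
minimal sketch of that behaviour, using echo as a stand-in for the
palavras parser:

    import subprocess

    # communicate() returns bytes unless an encoding (or
    # universal_newlines=True) is passed to Popen.
    process = subprocess.Popen(['echo', 'Eu sei'], stdout=subprocess.PIPE)
    stdout, _ = process.communicate()
    assert isinstance(stdout, bytes)                  # b'Eu sei\n'
    assert stdout.decode('utf-8').strip() == 'Eu sei'
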
--- tests/test_worker_palavras_raw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py index 2c9853d..24bdc63 100644 --- a/tests/test_worker_palavras_raw.py +++ b/tests/test_worker_palavras_raw.py @@ -53,7 +53,7 @@ def test_palavras_should_return_raw_if_it_is_installed(self): doc_id = self.collection.insert( {'text': 'Eu sei que neste momento falo para todo Brasil.', 'language': 'pt'}, w=1) - expected_raw = dedent(''' + expected_raw = dedent(b''' Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2 sei [saber] V PR 1S IND VFIN @FS-STA #2->0 que [que] KS @SUB #3->7 From 21aa0a6e515676c9f2bdaf65a67cbb5c41d3e608 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 17:41:05 -0200 Subject: [PATCH 10/33] Adds test to make sure the 'process' method receives the expected data --- tests/test_celery_task.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_celery_task.py b/tests/test_celery_task.py index 544dda9..4952817 100644 --- a/tests/test_celery_task.py +++ b/tests/test_celery_task.py @@ -17,6 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . from pypln.backend.celery_task import PyPLNTask +from unittest import mock from .utils import TaskTest class FakeTask(PyPLNTask): @@ -37,3 +38,11 @@ def test_task_should_get_the_correct_document(self): refreshed_doc = self.collection.find_one({'_id': correct_doc_id}) self.assertEqual(refreshed_doc['result'], 'correct') + + @mock.patch.object(FakeTask, 'process') + def test_should_get_current_data_from_database(self, mocked_process): + document = {'input': 'correct'} + doc_id = self.collection.insert(document, w=1) + self.collection.insert({'input': 'wrong'}, w=1) + FakeTask().delay(doc_id) + mocked_process.assert_called_with(document) From 7d540d00742c660bd9ea19fa9c6f32b0fd68cc8e Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 17:41:50 -0200 Subject: [PATCH 11/33] Fixes existing base task test This test now makes sure only the correct document is updated. --- tests/test_celery_task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_celery_task.py b/tests/test_celery_task.py index 4952817..0e087fa 100644 --- a/tests/test_celery_task.py +++ b/tests/test_celery_task.py @@ -25,7 +25,7 @@ def process(self, document): return {'result': document['input']} class TestCeleryTask(TaskTest): - def test_task_should_get_the_correct_document(self): + def test_saves_returned_data_to_database(self): """This is a regression test. PyPLNTask was not filtering by _id. It was getting the first document it found. """ @@ -36,8 +36,10 @@ def test_task_should_get_the_correct_document(self): FakeTask().delay(correct_doc_id) refreshed_doc = self.collection.find_one({'_id': correct_doc_id}) + refreshed_wrong_doc = self.collection.find_one({'_id': wrong_doc_id}) self.assertEqual(refreshed_doc['result'], 'correct') + self.assertNotIn('result', refreshed_wrong_doc.keys()) @mock.patch.object(FakeTask, 'process') def test_should_get_current_data_from_database(self, mocked_process): From aa4478a5c77cb2eeeedb69b2b3f2dbd19475af6a Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 18:20:37 -0200 Subject: [PATCH 12/33] Uses BytesIO instead of StringIO in wordcloud 2to3 introduced an error because it couldn't know what StringIO was being used for. 
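
PIL writes raw PNG bytes, so the in-memory buffer has to be a BytesIO;
io.StringIO only accepts str and would raise a TypeError on save(). A
small sketch of the pattern the worker relies on (the blank image is a
stand-in for the output of make_wordcloud):

    import base64
    from io import BytesIO

    from PIL import Image

    img = Image.new('RGB', (10, 10))        # placeholder image
    fd = BytesIO()
    img.save(fd, format='PNG')              # writes binary data to the buffer
    fd.seek(0)
    encoded = base64.b64encode(fd.read())   # bytes, safe to store in MongoDB
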
--- pypln/backend/workers/word_cloud.py | 4 ++-- tests/test_worker_wordcloud.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pypln/backend/workers/word_cloud.py b/pypln/backend/workers/word_cloud.py index 149ec04..5bf5efc 100644 --- a/pypln/backend/workers/word_cloud.py +++ b/pypln/backend/workers/word_cloud.py @@ -19,7 +19,7 @@ import base64 import string -from io import StringIO +from io import BytesIO import numpy import nltk @@ -41,7 +41,7 @@ def process(self, document): words = numpy.array([t[0] for t in fdist]) counts = numpy.array([t[1] for t in fdist]) wordcloud_img = make_wordcloud(words, counts) - fd = StringIO() + fd = BytesIO() wordcloud_img.save(fd, format="PNG") fd.seek(0) result = {'wordcloud': base64.b64encode(fd.read())} diff --git a/tests/test_worker_wordcloud.py b/tests/test_worker_wordcloud.py index 2f504f2..186c5a3 100644 --- a/tests/test_worker_wordcloud.py +++ b/tests/test_worker_wordcloud.py @@ -18,7 +18,7 @@ # along with PyPLN. If not, see . import base64 -from io import StringIO +from io import BytesIO from PIL import Image @@ -37,7 +37,7 @@ def test_wordcloud_should_return_a_base64_encoded_png(self): refreshed_document = self.collection.find_one({'_id': doc_id}) raw_png_data = base64.b64decode(refreshed_document['wordcloud']) - fake_file = StringIO(raw_png_data) + fake_file = BytesIO(raw_png_data) img = Image.open(fake_file) img.verify() self.assertEqual(img.format, 'PNG') From d311b74dce0ef5ef96db1c23e00ff0f0b88699bc Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 18:22:33 -0200 Subject: [PATCH 13/33] Changes Wordcloud test not to touch the database The next few commits will change tests in a similar manner. This test is no longer touching the database, because we rely on the PyPLNTask test to test the fetch/save functionality. 
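
With this approach a worker test builds a plain dict, calls process()
directly and asserts on the returned dict; no MongoDB insert or delay()
round-trip is involved. A minimal sketch of the pattern (the test class
name is illustrative), reusing the frequency distribution from the test
changed below:

    import base64
    import unittest

    from pypln.backend.workers import WordCloud

    class WordCloudProcessTest(unittest.TestCase):
        def test_returns_base64_encoded_wordcloud(self):
            # The worker is exercised directly; fetching from and saving to
            # the database is covered by the PyPLNTask tests instead.
            doc = {'freqdist': [('is', 2), ('the', 2), ('blue', 1), ('sun', 1),
                                ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)],
                   'language': 'en'}
            result = WordCloud().process(doc)
            self.assertIn('wordcloud', result)
            base64.b64decode(result['wordcloud'])  # decodes as base64
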
--- tests/test_worker_wordcloud.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test_worker_wordcloud.py b/tests/test_worker_wordcloud.py index 186c5a3..25ddba5 100644 --- a/tests/test_worker_wordcloud.py +++ b/tests/test_worker_wordcloud.py @@ -19,23 +19,20 @@ import base64 from io import BytesIO +import unittest from PIL import Image from pypln.backend.workers import WordCloud -from .utils import TaskTest -class TestFreqDistWorker(TaskTest): - name = "WordCloud" +class TestFreqDistWorker(unittest.TestCase): def test_wordcloud_should_return_a_base64_encoded_png(self): doc = {'freqdist': [('is', 2), ('the', 2), ('blue', 1), ('sun', 1), ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)], 'language': 'en'} - doc_id = self.collection.insert(doc, w=1) - WordCloud().delay(doc_id) + result = WordCloud().process(doc) - refreshed_document = self.collection.find_one({'_id': doc_id}) - raw_png_data = base64.b64decode(refreshed_document['wordcloud']) + raw_png_data = base64.b64decode(result['wordcloud']) fake_file = BytesIO(raw_png_data) img = Image.open(fake_file) From 65c07b1c61dfcb60da66fbd0cd135d8e2e575cea Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 18:58:23 -0200 Subject: [PATCH 14/33] Changes palavras_raw test to not touch the database --- pypln/backend/workers/palavras_raw.py | 19 +++++++------- tests/test_worker_palavras_raw.py | 37 +++++++++++---------------- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/pypln/backend/workers/palavras_raw.py b/pypln/backend/workers/palavras_raw.py index e45bb11..95161ba 100644 --- a/pypln/backend/workers/palavras_raw.py +++ b/pypln/backend/workers/palavras_raw.py @@ -39,14 +39,15 @@ def process(self, document): text = document['text'] - # For some reason, in some pypln installations the document['text'] is - # not always unicode as it should be. This may be due to errors during - # the decoding process that we fixed earlier. That meant that, when we - # got a non-unicode string, python would try to decode it using the - # default codec (ascii) in `text.encode(PALAVRAS_ENCODING)`. Since we - # know the text came from mongodb, we can just decode it using utf-8 to - # make sure we have a unicode object. - if not isinstance(text, str): + # This code is here because when using python2 for some + # reason, sometimes document['text'] was not a unicode object + # (as it should be, coming from pymongo). Since we're now + # using python3, we should really always get a str (unicode) + # object. But, since we do not know the real reason for the + # original error, we will keep this code here for now. As + # before, if we receive a bytes object, since it came from + # mongodb we can be sure it will be encoded in utf-8. 
+ if isinstance(text, bytes): text = text.decode('utf-8') process = subprocess.Popen([BASE_PARSER, PARSER_MODE], @@ -55,4 +56,4 @@ def process(self, document): stderr=subprocess.PIPE) stdout, stderr = process.communicate(text.encode(PALAVRAS_ENCODING)) - return {'palavras_raw': stdout, 'palavras_raw_ran': True} + return {'palavras_raw': stdout.decode('utf-8'), 'palavras_raw_ran': True} diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py index 24bdc63..628eafc 100644 --- a/tests/test_worker_palavras_raw.py +++ b/tests/test_worker_palavras_raw.py @@ -20,40 +20,34 @@ from unittest import skipIf from textwrap import dedent +import unittest from pypln.backend.workers import palavras_raw -from .utils import TaskTest ORIGINAL_PATH = palavras_raw.BASE_PARSER -class TestPalavrasRawWorker(TaskTest): +class TestPalavrasRawWorker(unittest.TestCase): def test_should_run_only_if_language_is_portuguese(self): - doc_id = self.collection.insert({'text': 'There was a rock on the way.', - 'language': 'en'}, w=1) - - palavras_raw.PalavrasRaw().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['palavras_raw_ran'], False) + doc = {'text': 'There was a rock on the way.', 'language': 'en'} + result = palavras_raw.PalavrasRaw().process(doc) + self.assertEqual(result['palavras_raw_ran'], False) def test_palavras_not_installed(self): palavras_raw.BASE_PARSER = '/not-found' - doc_id = self.collection.insert( - {'text': 'Tinha uma pedra no meio do caminho.', - 'language': 'pt'}, w=1) - palavras_raw.PalavrasRaw().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['palavras_raw_ran'], False) + doc = {'text': 'Tinha uma pedra no meio do caminho.', + 'language': 'pt'} + result = palavras_raw.PalavrasRaw().process(doc) + self.assertEqual(result['palavras_raw_ran'], False) @skipIf(not palavras_raw.palavras_installed(), 'palavras software is not installed') def test_palavras_should_return_raw_if_it_is_installed(self): palavras_raw.BASE_PARSER = ORIGINAL_PATH - doc_id = self.collection.insert( - {'text': 'Eu sei que neste momento falo para todo Brasil.', - 'language': 'pt'}, w=1) - expected_raw = dedent(b''' + doc = {'text': 'Eu sei que neste momento falo para todo Brasil.', + 'language': 'pt'} + expected_raw = dedent(''' Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2 sei [saber] V PR 1S IND VFIN @FS-STA #2->0 que [que] KS @SUB #3->7 @@ -67,7 +61,6 @@ def test_palavras_should_return_raw_if_it_is_installed(self): $. #11->0 ''').strip() + '\n\n' - result = palavras_raw.PalavrasRaw().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['palavras_raw'], expected_raw) - self.assertEqual(refreshed_document['palavras_raw_ran'], True) + result = palavras_raw.PalavrasRaw().process(doc) + self.assertEqual(result['palavras_raw'], expected_raw) + self.assertEqual(result['palavras_raw_ran'], True) From 9c8f952f8d433e6d1abd40be4d7876adfd5b5af9 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 20:06:25 -0200 Subject: [PATCH 15/33] Fix freqdist test and sorting List sorting changed in Python 3 and apparently string sorting did too. 
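
Python 3 removed both the cmp() builtin and the cmp argument to
list.sort(), so the comparator-based call has to become a key function;
sorting on (-count, token) also makes the order of equally frequent
tokens deterministic. A short sketch of the difference:

    # Python 2 (no longer valid):  fd.sort(lambda x, y: cmp(y[1], x[1]))
    # Python 3: sort with a key function instead of a comparator.
    fd = [('the', 2), ('sky', 1), ('is', 2), ('blue', 1)]
    fd.sort(key=lambda item: (-item[1], item[0]))
    assert fd == [('is', 2), ('the', 2), ('blue', 1), ('sky', 1)]
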
--- pypln/backend/workers/freqdist.py | 2 +- tests/test_worker_freqdist.py | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/pypln/backend/workers/freqdist.py b/pypln/backend/workers/freqdist.py index 82d75ee..bdf3712 100644 --- a/pypln/backend/workers/freqdist.py +++ b/pypln/backend/workers/freqdist.py @@ -28,6 +28,6 @@ def process(self, document): frequency_distribution = {token: tokens.count(token) \ for token in set(tokens)} fd = list(frequency_distribution.items()) - fd.sort(lambda x, y: cmp(y[1], x[1])) + fd.sort(key=lambda x: (-x[1], x[0])) return {'freqdist': fd} diff --git a/tests/test_worker_freqdist.py b/tests/test_worker_freqdist.py index c23f280..f4613dc 100644 --- a/tests/test_worker_freqdist.py +++ b/tests/test_worker_freqdist.py @@ -16,24 +16,18 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +from unittest import TestCase + from pypln.backend.workers import FreqDist -from .utils import TaskTest -class TestFreqDistWorker(TaskTest): - def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self): +class TestFreqDistWorker(TestCase): + def test_freqdist_should_be_a_list_of_tuples_with_frequency_distribution(self): tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', 'yellow', '.'] - expected_fd = [['is', 2], ['the', 2], ['blue', 1], ['sun', 1], - ['sky', 1], [',', 1], ['yellow', 1], ['.', 1]] - - - # This is just preparing the expected input in the database - doc_id = self.collection.insert({'tokens': tokens}, w=1) - - FreqDist().delay(doc_id) - - resulting_fd = self.collection.find_one({'_id': doc_id})['freqdist'] + expected_fd = [('is', 2), ('the', 2), (',', 1), ('.', 1), ('blue', 1), + ('sky', 1), ('sun', 1), ('yellow', 1)] + resulting_fd = FreqDist().process({'tokens': tokens})['freqdist'] self.assertEqual(resulting_fd, expected_fd) From 05594a15433b927d0ad6359918eec32c98048884 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 20:30:11 -0200 Subject: [PATCH 16/33] fix spellchecker tests --- pypln/backend/workers/spellchecker.py | 15 +++++------ tests/test_worker_spellchecker.py | 38 +++++++++++---------------- 2 files changed, 23 insertions(+), 30 deletions(-) diff --git a/pypln/backend/workers/spellchecker.py b/pypln/backend/workers/spellchecker.py index 4a6afb8..2e16185 100644 --- a/pypln/backend/workers/spellchecker.py +++ b/pypln/backend/workers/spellchecker.py @@ -20,22 +20,22 @@ import enchant from enchant.checker import SpellChecker + from pypln.backend.celery_task import PyPLNTask + class SpellingChecker(PyPLNTask): """ This worker performs spellchecking in the plain text of a document """ def __init__(self): - # This method is only called once per process, but that is no problem - # since the enchant languange list should not change. Don't use this - # method for anything that should run every time the task is called. - # See http://docs.celeryproject.org/en/latest/userguide/tasks.html#instantiation - # for more information. 
- self.checkers = {lang: SpellChecker(lang) for lang in enchant.list_languages()} + # This method is only called once per process + self.checkers = {lang: SpellChecker(lang) + for lang in enchant.list_languages()} def process(self, document): - #TODO: this worker may be enhanced by also checking the errors against an specific vocabulary supplied with the document + #TODO: this worker may be enhanced by also checking the errors against + # an specific vocabulary supplied with the document try: checker = self.checkers[document['language']] checker.set_text(document['text']) @@ -44,4 +44,3 @@ def process(self, document): errors = None return {'spelling_errors': errors} - diff --git a/tests/test_worker_spellchecker.py b/tests/test_worker_spellchecker.py index 09e07f6..746cb5d 100644 --- a/tests/test_worker_spellchecker.py +++ b/tests/test_worker_spellchecker.py @@ -18,34 +18,28 @@ # along with PyPLN. If not, see . import os -from textwrap import dedent -from pypln.backend.workers import spellchecker -from .utils import TaskTest +from unittest import TestCase + +from pypln.backend.workers.spellchecker import SpellingChecker DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) -class TestSpellcheckerWorker(TaskTest): - def test_spellchek_pt(self): +class TestSpellcheckerWorker(TestCase): + def test_spellcheck_pt(self): text = "Meu cachoro é um pastor" - doc_id = self.collection.insert({'text': text, 'language': 'pt_BR'}, - w=1) - spellchecker.SpellingChecker().delay(doc_id) + result = SpellingChecker().process({'text': text, 'language': 'pt_BR'}) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(len(refreshed_document['spelling_errors']), 1) - self.assertIn('cachoro', refreshed_document['spelling_errors'][0]) - self.assertIn('cachorro', refreshed_document['spelling_errors'][0][2]) - self.assertEqual(refreshed_document['spelling_errors'][0][1], 4) + self.assertEqual(len(result['spelling_errors']), 1) + self.assertIn('cachoro', result['spelling_errors'][0]) + self.assertIn('cachorro', result['spelling_errors'][0][2]) + self.assertEqual(result['spelling_errors'][0][1], 4) - def test_spellchek_en(self): + def test_spellcheck_en(self): text = "The cat bit the doggyo" - doc_id = self.collection.insert({'text': text, 'language': 'en'}, w=1) - spellchecker.SpellingChecker().delay(doc_id) - - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(len(refreshed_document['spelling_errors']), 1) - self.assertIn('doggyo', refreshed_document['spelling_errors'][0]) - self.assertIn('doggy', refreshed_document['spelling_errors'][0][2]) - self.assertEqual(refreshed_document['spelling_errors'][0][1], 16) + result = SpellingChecker().process({'text': text, 'language': 'en'}) + self.assertEqual(len(result['spelling_errors']), 1) + self.assertIn('doggyo', result['spelling_errors'][0]) + self.assertIn('doggy', result['spelling_errors'][0][2]) + self.assertEqual(result['spelling_errors'][0][1], 16) From 7b31c9853b9bb508b08573f568a096b6b4b3cd5d Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 20:45:45 -0200 Subject: [PATCH 17/33] spellchecker: warn if dictionary is missing --- pypln/backend/workers/spellchecker.py | 26 +++++++++++++++++++------- tests/test_worker_spellchecker.py | 11 +++++++++-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/pypln/backend/workers/spellchecker.py b/pypln/backend/workers/spellchecker.py index 2e16185..e6bc93b 100644 --- a/pypln/backend/workers/spellchecker.py +++ 
b/pypln/backend/workers/spellchecker.py @@ -16,7 +16,7 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . - +import warnings import enchant from enchant.checker import SpellChecker @@ -24,6 +24,10 @@ from pypln.backend.celery_task import PyPLNTask +class MissingDictionaryWarning(RuntimeWarning): + pass + + class SpellingChecker(PyPLNTask): """ This worker performs spellchecking in the plain text of a document @@ -34,13 +38,21 @@ def __init__(self): for lang in enchant.list_languages()} def process(self, document): - #TODO: this worker may be enhanced by also checking the errors against + # TODO: this worker may be enhanced by also checking the errors against # an specific vocabulary supplied with the document - try: - checker = self.checkers[document['language']] - checker.set_text(document['text']) - errors = [[e.word, e.wordpos, e.suggest()] for e in checker] - except KeyError: + checker = self.checkers.get(document['language']) + if checker is None: + # Maybe this should be an exception instead + warnings.warn('%s dictionary missing. If running on linux, ' + 'install the corresponding myspell package' + % document['language'], + MissingDictionaryWarning) errors = None + else: + try: + checker.set_text(document['text']) + errors = [[e.word, e.wordpos, e.suggest()] for e in checker] + except KeyError: + errors = None return {'spelling_errors': errors} diff --git a/tests/test_worker_spellchecker.py b/tests/test_worker_spellchecker.py index 746cb5d..6200f73 100644 --- a/tests/test_worker_spellchecker.py +++ b/tests/test_worker_spellchecker.py @@ -18,9 +18,10 @@ # along with PyPLN. If not, see . import os -from unittest import TestCase +from unittest import TestCase, mock -from pypln.backend.workers.spellchecker import SpellingChecker +from pypln.backend.workers.spellchecker import (SpellingChecker, + MissingDictionaryWarning) DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) @@ -43,3 +44,9 @@ def test_spellcheck_en(self): self.assertIn('doggyo', result['spelling_errors'][0]) self.assertIn('doggy', result['spelling_errors'][0][2]) self.assertEqual(result['spelling_errors'][0][1], 16) + + @mock.patch('warnings.warn') + def test_warns_about_missing_dictionary(self, warn_mock): + SpellingChecker().process({'text': '', + 'language': 'missing_language'}) + warn_mock.assert_called_with(mock.ANY, MissingDictionaryWarning) From 00cce60d7b325ca94b2dc5ed4a769dc238cb2ec5 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 21:04:18 -0200 Subject: [PATCH 18/33] fix test_unknown_mimetype_should_be_flagged test --- tests/test_worker_extractor.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index a3b8f14..b4f39c7 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -25,6 +25,7 @@ DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) + class TestExtractorWorker(TaskTest): def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" 
@@ -200,16 +201,13 @@ def test_unknown_mimetype_should_be_flagged(self): filename = os.path.join(DATA_DIR, 'random_file') # we can't put the expected text content here, so we'll just make sure # it's equal to the input content, since - contents = open(filename).read() - data = {'filename': filename, - 'contents': base64.b64encode(contents)} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['mimetype'], 'unknown') - self.assertEqual(refreshed_document['text'], "") - self.assertEqual(refreshed_document['language'], "") - self.assertEqual(refreshed_document['file_metadata'], {}) + contents = open(filename, 'rb').read() + result = Extractor().process({'filename': filename, + 'contents': base64.b64encode(contents)}) + self.assertEqual(result['mimetype'], 'unknown') + self.assertEqual(result['text'], "") + self.assertEqual(result['language'], "") + self.assertEqual(result['file_metadata'], {}) def test_unknown_encoding_should_be_ignored(self): filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt') From afaaa0bfec59103127c1d4d0e976c5325677711f Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 21:22:50 -0200 Subject: [PATCH 19/33] Update TestExtractorWorker.test_unknown_encoding_should_be_ignored This test is still not passing because cld is throwing an exception --- tests/test_worker_extractor.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index b4f39c7..ec555a6 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -211,12 +211,11 @@ def test_unknown_mimetype_should_be_flagged(self): def test_unknown_encoding_should_be_ignored(self): filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt') - expected = "This file has a weird byte (\x96) that makes it impossible for libmagic to recognize it's encoding." + expected = "This file has a weird byte (\x96) that makes it " \ + "impossible for libmagic to recognize it's encoding." 
data = {'filename': filename, - 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(refreshed_document['file_metadata'], {}) - self.assertEqual(refreshed_document['language'], 'en') + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) + self.assertEqual(result['file_metadata'], {}) + self.assertEqual(result['language'], 'en') From 427da7d51fc073494c4db185f0f273752f768629 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 21:28:52 -0200 Subject: [PATCH 20/33] fix TestExtractorWorker.test_unescape_html_entities --- tests/test_worker_extractor.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index ec555a6..b56dd81 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -170,11 +170,9 @@ def test_unescape_html_entities(self): " sure it also has non ascii chars.") filename = os.path.join(DATA_DIR, 'test_html_entities.txt') data = {'filename': filename, - 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) def test_should_detect_encoding_and_return_a_unicode_object(self): expected = "Flávio" From 2c0f8e8cd031c45053f8c31c5a1e752486171ce6 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 21:30:40 -0200 Subject: [PATCH 21/33] fix TestExtractorWorker.test_should_detect_encoding_and_return_a_unicode_object --- tests/test_worker_extractor.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index b56dd81..117ed8c 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -178,12 +178,10 @@ def test_should_detect_encoding_and_return_a_unicode_object(self): expected = "Flávio" filename = os.path.join(DATA_DIR, 'test_iso-8859-1.txt') data = {'filename': filename, - 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(type(refreshed_document['text']), str) + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) + self.assertEqual(type(result['text']), str) def test_should_guess_mimetype_for_file_without_extension(self): contents = "This is a test file. I'm testing PyPLN extractor worker!" 
From 69899361d9c59e4e475ff36815afafce0299c720 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 21:32:51 -0200 Subject: [PATCH 22/33] fix TestExtractorWorker.test_should_guess_mimetype_for_file_without_extension --- tests/test_worker_extractor.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index 117ed8c..f781f68 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -187,11 +187,9 @@ def test_should_guess_mimetype_for_file_without_extension(self): contents = "This is a test file. I'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'text_file') data = {'filename': filename, - 'contents': base64.b64encode(contents)} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['mimetype'], 'text/plain') + 'contents': base64.b64encode(contents.encode('utf-8'))} + result = Extractor().process(data) + self.assertEqual(result['mimetype'], 'text/plain') def test_unknown_mimetype_should_be_flagged(self): filename = os.path.join(DATA_DIR, 'random_file') From 17e47cb9ff0d1859ca86d272e9fc32fa4c206d5b Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Sat, 26 Nov 2016 13:07:21 -0200 Subject: [PATCH 23/33] updated more extractor tests --- tests/test_worker_extractor.py | 88 +++++++++++++++------------------- 1 file changed, 38 insertions(+), 50 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index f781f68..bc61127 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -20,23 +20,23 @@ import base64 import os from textwrap import dedent +from unittest import TestCase + from pypln.backend.workers import Extractor -from .utils import TaskTest DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) -class TestExtractorWorker(TaskTest): +class TestExtractorWorker(TestCase): def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'test.txt') - doc_id = self.collection.insert({'filename': filename, - 'contents': base64.b64encode(open(filename).read())}, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(refreshed_document['file_metadata'], {}) - self.assertEqual(refreshed_document['mimetype'], 'text/plain') + data = {'filename': filename, + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) + self.assertEqual(result['file_metadata'], {}) + self.assertEqual(result['mimetype'], 'text/plain') def test_extraction_from_html_file(self): expected = "This is a test file. I'm testing PyPLN extractor worker!" @@ -47,23 +47,19 @@ def test_extraction_from_html_file(self): # wasn't a problem before because with mongodict we used to keep a # pickled representation of the data. 
data = {'filename': filename, - 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(refreshed_document['file_metadata'], {}) - self.assertEqual(refreshed_document['mimetype'], 'text/html') + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) + self.assertEqual(result['file_metadata'], {}) + self.assertEqual(result['mimetype'], 'text/html') def test_extraction_from_pdf_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'test.pdf') data = {'filename': filename, - 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) # Check that the expected metadata is a subset of what # our Extractor found (it may have found more details # depending on the toolset used to extract metadata) @@ -80,14 +76,14 @@ def test_extraction_from_pdf_file(self): 'PDF version': '1.4', } metadata_expected_set = set(metadata_expected.items()) - metadata = refreshed_document['file_metadata'] + metadata = result['file_metadata'] metadata_set = set(metadata.items()) diff_set = metadata_expected_set - metadata_set self.assertTrue(metadata_expected_set.issubset(metadata_set), ("Extracted metadata is not a subset of the expected metadata. " "Items missing or with different values: {}").format( ", ".join(str(item) for item in diff_set))) - self.assertEqual(refreshed_document['mimetype'], 'application/pdf') + self.assertEqual(result['mimetype'], 'application/pdf') def test_extraction_from_html(self): contents = dedent(''' @@ -114,9 +110,8 @@ def test_extraction_from_html(self): ''') data = {'filename': 'test.html', - 'contents': base64.b64encode(contents)} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) + 'contents': base64.b64encode(contents.encode('utf-8'))} + result = Extractor().process(data) expected = dedent(''' Testing @@ -134,36 +129,29 @@ def test_extraction_from_html(self): bla1 bla2''').strip() - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(refreshed_document['mimetype'], 'text/html') + self.assertEqual(result['text'], expected) + self.assertEqual(result['mimetype'], 'text/html') def test_language_detection_pt(self): - text_pt = 'Esse texto foi escrito por Álvaro em Português.' - data_pt = {'filename': 'text-pt.txt', - 'contents': base64.b64encode(text_pt)} - doc_id = self.collection.insert(data_pt, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['language'], 'pt') + text = 'Esse texto foi escrito por Álvaro em Português.' + data = {'filename': 'text-pt.txt', + 'contents': base64.b64encode(text.encode('utf-8'))} + result = Extractor().process(data) + self.assertEqual(result['language'], 'pt') def test_language_detection_es(self): - text_es = 'Este texto ha sido escrito en Español por Álvaro.' 
- data_es = {'filename': 'text-es.txt', - 'contents': base64.b64encode(text_es)} - doc_id = self.collection.insert(data_es, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['language'], 'es') + text = 'Este texto ha sido escrito en Español por Álvaro.' + data = {'filename': 'text-es.txt', + 'contents': base64.b64encode(text.encode('utf-8'))} + result = Extractor().process(data) + self.assertEqual(result['language'], 'es') def test_language_detection_en(self): - text_en = 'This text was written by Álvaro in English.' - data_en = {'filename': 'text-en.txt', - 'contents': base64.b64encode(text_en)} - doc_id = self.collection.insert(data_en, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['language'], 'en') + text = 'This text was written by Álvaro in English.' + data = {'filename': 'text-en.txt', + 'contents': base64.b64encode(text.encode('utf-8'))} + result = Extractor().process(data) + self.assertEqual(result['language'], 'en') def test_unescape_html_entities(self): expected = ("This text has html . Álvaro asked me to make" From 4eb5f613c00507f672d59cf3e504b132f32649ee Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Sat, 26 Nov 2016 13:33:32 -0200 Subject: [PATCH 24/33] fix extractor.extract_pdf --- pypln/backend/workers/extractor.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 2a864e6..c0cc9aa 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -87,7 +87,7 @@ def parse_html(html, remove_tags=None, remove_inside=None, result = ''.join(sum(list(zip(content_between, complete_tags)), tuple())) return clean(result) -def get_pdf_metadata(data): +def get_pdf_metadata(data: str) -> dict: lines = data.strip().splitlines() metadata = {} for line in lines: @@ -98,7 +98,7 @@ def get_pdf_metadata(data): metadata[key.strip()] = value.strip() return metadata -def extract_pdf(data): +def extract_pdf(data: bytes) -> (str, dict): temp = NamedTemporaryFile(delete=False) filename = temp.name temp.close() @@ -112,14 +112,16 @@ def extract_pdf(data): unlink(filename + '_ind.html') unlink(filename + 's.html') text = parse_html(html.replace(' ', ' '), True, ['script', 'style']) - pdfinfo = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE, - stderr=PIPE) - meta_out, meta_err = pdfinfo.communicate(input=data) + + info_process = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE, + stderr=PIPE) + meta_out, meta_err = info_process.communicate(input=data) try: - metadata = get_pdf_metadata(meta_out) - except: + metadata = get_pdf_metadata(meta_out.decode('utf-8')) + except Exception: + # TODO: what should I do here? metadata = {} - #TODO: what should I do here? + if not (text and metadata): return '', {} elif not html_err: @@ -128,7 +130,7 @@ def extract_pdf(data): return '', {} -def trial_decode(text): +def trial_decode(text: bytes) -> str: """ Tries to detect text encoding using `magic`. 
If the detected encoding is not supported, try utf-8, iso-8859-1 and ultimately falls back to decoding @@ -173,6 +175,7 @@ def process(self, file_data): contents = base64.b64decode(file_data['contents']) with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m: file_mime_type = m.id_buffer(contents) + metadata = {} if file_mime_type == 'text/plain': text = contents @@ -191,7 +194,9 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - text, forced_decoding = trial_decode(text) + forced_decoding = False + if isinstance(text, bytes): + text, forced_decoding = trial_decode(text) if isinstance(text, str): # HTMLParser only handles unicode objects. We can't pass the text From 24c266fe8bfd479ef0f60b371831283943cdc42f Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Sat, 26 Nov 2016 23:41:51 -0200 Subject: [PATCH 25/33] Rewrite extractor.trial_decode and write tests for it Address a possible exception raised by Magic.id_buffer and remove the superfluous text.decode('utf-8', 'replace') call since decoding with the iso8859-1 codec will never raise a UnicodeDecodeError exception. --- pypln/backend/workers/extractor.py | 57 ++++++++++++------------------ tests/test_worker_extractor.py | 56 +++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 34 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index c0cc9aa..d9ed07c 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -130,41 +130,30 @@ def extract_pdf(data: bytes) -> (str, dict): return '', {} -def trial_decode(text: bytes) -> str: +def decode_text_bytes(text: bytes) -> str: """ - Tries to detect text encoding using `magic`. If the detected encoding is - not supported, try utf-8, iso-8859-1 and ultimately falls back to decoding - as utf-8 replacing invalid chars with `U+FFFD` (the replacement character). - - This is far from an ideal solution, but the extractor and the rest of the - pipeline need an unicode object. + Tries to detect text encoding using file magic. If that fails or the + detected encoding is not supported, tries using utf-8. If that doesn't work + tries using iso8859-1. """ - with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m: - content_encoding = m.id_buffer(text) - - forced_decoding = False try: - result = text.decode(content_encoding) - except LookupError: - # If the detected encoding is not supported, we try to decode it as - # utf-8. + with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m: + content_encoding = m.id_buffer(text) + except magic.MagicError: + pass # This can happen for instance if text is a single char + else: try: - result = text.decode('utf-8') - except UnicodeDecodeError: - # Is there a better way of doing this than nesting try/except - # blocks? This smells really bad. - try: - result = text.decode('iso-8859-1') - except UnicodeDecodeError: - # If neither utf-8 nor iso-885901 work are capable of handling - # this text, we just decode it using utf-8 and replace invalid - # chars with U+FFFD. - # Two somewhat arbitrary decisions were made here: use utf-8 - # and use 'replace' instead of 'ignore'. 
- result = text.decode('utf-8', 'replace') - forced_decoding = True - - return result, forced_decoding + return text.decode(content_encoding) + except LookupError: # The detected encoding is not supported + pass + + try: + result = text.decode('utf-8') + except UnicodeDecodeError: + # Decoding with iso8859-1 doesn't raise UnicodeDecodeError, so this is + # a last resort. + result = text.decode('iso8859-1') + return result class Extractor(PyPLNTask): @@ -194,9 +183,8 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - forced_decoding = False if isinstance(text, bytes): - text, forced_decoding = trial_decode(text) + text = decode_text_bytes(text) if isinstance(text, str): # HTMLParser only handles unicode objects. We can't pass the text @@ -213,5 +201,6 @@ def process(self, file_data): else: language = cld.detect(text)[1] + # TODO: check for uses of forced_decoding and remove them return {'text': text, 'file_metadata': metadata, 'language': language, - 'mimetype': file_mime_type, 'forced_decoding': forced_decoding} + 'mimetype': file_mime_type, 'forced_decoding': None} diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index bc61127..4c94349 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -21,12 +21,68 @@ import os from textwrap import dedent from unittest import TestCase +from unittest.mock import patch, Mock, MagicMock, call + +from magic import MagicError from pypln.backend.workers import Extractor +from pypln.backend.workers.extractor import decode_text_bytes DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) +class DecodeTextBytesTest(TestCase): + def setUp(self): + magic_mock = MagicMock() + magic_identifier = Mock() + self.id_buffer_mock = Mock(return_value='magic_codec') + magic_identifier.id_buffer = self.id_buffer_mock + magic_mock.return_value.__enter__.return_value = magic_identifier + self.magic_patcher = patch('magic.Magic', magic_mock) + + def test_ignores_magic_error(self): + self.id_buffer_mock.side_effect = MagicError() + text = Mock() + with self.magic_patcher: + result = decode_text_bytes(text) + self.assertEqual(result, text.decode.return_value) + self.assertEqual(text.decode.call_args_list, [call('utf-8')]) + + def test_tries_decoding_with_encoding_returned_by_magic(self): + text = Mock() + with self.magic_patcher: + result = decode_text_bytes(text) + self.assertEqual(result, text.decode.return_value) + self.assertEqual(text.decode.call_args_list, [call('magic_codec')]) + + def test_tries_decoding_as_utf8(self): + text = Mock() + text.decode.side_effect = [LookupError(), 'result'] + with self.magic_patcher: + result = decode_text_bytes(text) + self.assertEqual(result, 'result') + self.assertEqual(text.decode.call_args_list, + [call('magic_codec'), call('utf-8')]) + + def test_tries_iso8859_1_if_all_else_fails(self): + text = Mock() + + class FakeUnicodeDecodeError(UnicodeDecodeError): + def __init__(self): + pass + + text.decode.side_effect = [LookupError(), + FakeUnicodeDecodeError(), + 'result'] + with self.magic_patcher: + result = decode_text_bytes(text) + self.assertEqual(result, 'result') + self.assertEqual(text.decode.call_args_list, + [call('magic_codec'), + call('utf-8'), + call('iso8859-1')]) + + class TestExtractorWorker(TestCase): def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" 
From c08413276a189b9ae4a153d3e6ca7eb338131e7b Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Sun, 27 Nov 2016 00:15:18 -0200 Subject: [PATCH 26/33] extractor: convert text to string before calling parse_html --- pypln/backend/workers/extractor.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index d9ed07c..ce850b1 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -166,10 +166,10 @@ def process(self, file_data): file_mime_type = m.id_buffer(contents) metadata = {} - if file_mime_type == 'text/plain': - text = contents - elif file_mime_type == 'text/html': - text = parse_html(contents, True, ['script', 'style']) + if file_mime_type in ('text/plain', 'text/html'): + text = decode_text_bytes(contents) + if file_mime_type == 'text/html': + text = parse_html(text, True, ['script', 'style']) elif file_mime_type == 'application/pdf': text, metadata = extract_pdf(contents) else: @@ -183,9 +183,6 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - if isinstance(text, bytes): - text = decode_text_bytes(text) - if isinstance(text, str): # HTMLParser only handles unicode objects. We can't pass the text # through it if we don't know the encoding, and it's possible we From 8e67779d8250c0d056481d4ad59eb1b4cae9869e Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Sun, 27 Nov 2016 00:44:39 -0200 Subject: [PATCH 27/33] extractor: fix language detection --- pypln/backend/workers/extractor.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index ce850b1..505e6df 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -194,10 +194,17 @@ def process(self, file_data): text = clean(text) if isinstance(text, str): - language = cld.detect(text.encode('utf-8'))[1] + languages = cld.detect(text.encode('utf-8'))[2] else: - language = cld.detect(text)[1] + languages = cld.detect(text)[2] + + detected_language = None + if languages: + detected_language = languages[0][1] # TODO: check for uses of forced_decoding and remove them - return {'text': text, 'file_metadata': metadata, 'language': language, - 'mimetype': file_mime_type, 'forced_decoding': None} + return {'text': text, + 'file_metadata': metadata, + 'language': detected_language, + 'mimetype': file_mime_type, + 'forced_decoding': None} From 11c203c9ad245d1c34384d9756957b479ddbe105 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 2 Dec 2016 16:47:33 -0200 Subject: [PATCH 28/33] extractor: remove checks for text being a str, it will always be --- pypln/backend/workers/extractor.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 505e6df..626aa3a 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -18,13 +18,12 @@ # along with PyPLN. If not, see . 
import base64 +import html import shlex -from html.parser import HTMLParser from tempfile import NamedTemporaryFile from os import unlink from subprocess import Popen, PIPE -from mimetypes import guess_type from re import compile as regexp_compile, DOTALL, escape import pycld2 as cld @@ -183,21 +182,10 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - if isinstance(text, str): - # HTMLParser only handles unicode objects. We can't pass the text - # through it if we don't know the encoding, and it's possible we - # also shouldn't. There's no way of knowing if it's a badly encoded - # html or a binary blob that happens do have bytes that look liked - # html entities. - text = HTMLParser().unescape(text) - + text = html.unescape(text) text = clean(text) - if isinstance(text, str): - languages = cld.detect(text.encode('utf-8'))[2] - else: - languages = cld.detect(text)[2] - + languages = cld.detect(text.encode('utf-8'))[2] detected_language = None if languages: detected_language = languages[0][1] From c6b3296e9512a35dcf79c06503a4073e6cc3f560 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 2 Dec 2016 16:47:41 -0200 Subject: [PATCH 29/33] extractor: remove up to 1k bytes that cld says are invalid --- pypln/backend/workers/extractor.py | 40 ++++++++++++++---- tests/test_worker_extractor.py | 67 +++++++++++++++++++++++------- 2 files changed, 85 insertions(+), 22 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 626aa3a..eed8f33 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -45,6 +45,10 @@ '/h2', 'h3', '/h3', 'h4', '/h4', 'h5', '/h5', 'h6', '/h6', 'br', 'br/'] double_breakline = ['table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] +cld_error_re = regexp_compile('input contains invalid UTF-8 around byte ' + '(?P\d+) \(of \d+\)') +MAX_CLD_BYTES_TO_REMOVE = 1024 + def clean(text): text = regexp_spaces_start.sub(r'\1', text) @@ -155,6 +159,33 @@ def decode_text_bytes(text: bytes) -> str: return result +def detect_language(text: str) -> str: + # CLD seems to have an issue with some bytes that Python considers + # to be valid utf-8. Remove up to MAX_CLD_BYTES_TO_REMOVE of such + # "invalid" bytes + # TODO: alert the user somehow if we give up removing them + detected_language = None + text_bytes = text.encode('utf-8') + for i in range(MAX_CLD_BYTES_TO_REMOVE): + try: + languages = cld.detect(text_bytes)[2] + except cld.error as exc: + message = exc.args[0] if exc.args else '' + match = cld_error_re.match(message) + if match: + byte_index = int(match.group('index')) + text_bytes = (text_bytes[:byte_index] + + text_bytes[byte_index + 1:]) + else: + raise + else: + if languages: + detected_language = languages[0][1] + break + + return detected_language + + class Extractor(PyPLNTask): #TODO: need to verify some exceptions when trying to convert 'evil' PDFs #TODO: should 'replace_with' be '' when extracting from HTML? 
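
For reference, pycld2.detect() returns an (is_reliable, bytes_found,
details) tuple in which details lists (language_name, language_code,
percent, score) entries with the best match first; that is the structure
the indexing above relies on. A small usage sketch (the detected code for
this sentence is expected to be 'pt'):

    import pycld2 as cld

    text = 'Esse texto foi escrito por Álvaro em Português.'
    is_reliable, bytes_found, details = cld.detect(text.encode('utf-8'))
    language_code = details[0][1]   # e.g. 'pt' for this sentence
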
@@ -184,15 +215,8 @@ def process(self, file_data): text = html.unescape(text) text = clean(text) - - languages = cld.detect(text.encode('utf-8'))[2] - detected_language = None - if languages: - detected_language = languages[0][1] - - # TODO: check for uses of forced_decoding and remove them return {'text': text, 'file_metadata': metadata, - 'language': detected_language, + 'language': detect_language(text), 'mimetype': file_mime_type, 'forced_decoding': None} diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index 4c94349..e364546 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -24,11 +24,13 @@ from unittest.mock import patch, Mock, MagicMock, call from magic import MagicError +import pycld2 as cld -from pypln.backend.workers import Extractor -from pypln.backend.workers.extractor import decode_text_bytes +from pypln.backend.workers.extractor import (Extractor, decode_text_bytes, + detect_language) DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) +MODULE = 'pypln.backend.workers.extractor.' class DecodeTextBytesTest(TestCase): @@ -44,6 +46,7 @@ def test_ignores_magic_error(self): self.id_buffer_mock.side_effect = MagicError() text = Mock() with self.magic_patcher: + # noinspection PyTypeChecker result = decode_text_bytes(text) self.assertEqual(result, text.decode.return_value) self.assertEqual(text.decode.call_args_list, [call('utf-8')]) @@ -51,6 +54,7 @@ def test_ignores_magic_error(self): def test_tries_decoding_with_encoding_returned_by_magic(self): text = Mock() with self.magic_patcher: + # noinspection PyTypeChecker result = decode_text_bytes(text) self.assertEqual(result, text.decode.return_value) self.assertEqual(text.decode.call_args_list, [call('magic_codec')]) @@ -59,6 +63,7 @@ def test_tries_decoding_as_utf8(self): text = Mock() text.decode.side_effect = [LookupError(), 'result'] with self.magic_patcher: + # noinspection PyTypeChecker result = decode_text_bytes(text) self.assertEqual(result, 'result') self.assertEqual(text.decode.call_args_list, @@ -75,6 +80,7 @@ def __init__(self): FakeUnicodeDecodeError(), 'result'] with self.magic_patcher: + # noinspection PyTypeChecker result = decode_text_bytes(text) self.assertEqual(result, 'result') self.assertEqual(text.decode.call_args_list, @@ -83,6 +89,44 @@ def __init__(self): call('iso8859-1')]) +def get_cld_exc(index): + return cld.error('input contains invalid UTF-8 around byte %s (of 42)' + % index) + + +class DetectLanguageTest(TestCase): + def setUp(self): + self.cld_patcher = patch(MODULE + 'cld.detect', + return_value=(Mock(), Mock(), + [(Mock(), 'lang'), + (Mock(), 'other_lang')])) + self.cld_mock = self.cld_patcher.start() + + def tearDown(self): + self.cld_patcher.stop() + + def test_returns_most_likely_language(self): + self.assertEqual(detect_language('text'), 'lang') + + def test_removes_bytes_cld_considers_invalid(self): + self.cld_mock.side_effect = [get_cld_exc(0), + get_cld_exc(3), + self.cld_mock.return_value] + self.assertEqual(detect_language('012345'), 'lang') + self.assertEqual(self.cld_mock.call_args_list, + [call(b'012345'), call(b'12345'), call(b'1235')]) + + def test_removes_at_most_max_bytes_for_cld(self): + self.cld_mock.side_effect = [get_cld_exc(0)] * 4 + with patch(MODULE + 'MAX_CLD_BYTES_TO_REMOVE', 3): + self.assertIsNone(detect_language('012345')) + self.assertEqual(self.cld_mock.call_count, 3) + + def test_doesnt_silence_other_cld_errors(self): + self.cld_mock.side_effect = [get_cld_exc(0), cld.error('another 
error')] + self.assertRaises(cld.error, detect_language, 'text') + + class TestExtractorWorker(TestCase): def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" @@ -206,12 +250,12 @@ def test_language_detection_en(self): text = 'This text was written by Álvaro in English.' data = {'filename': 'text-en.txt', 'contents': base64.b64encode(text.encode('utf-8'))} - result = Extractor().process(data) + result = Extractor().process(data) self.assertEqual(result['language'], 'en') def test_unescape_html_entities(self): expected = ("This text has html . Álvaro asked me to make" - " sure it also has non ascii chars.") + " sure it also has non ascii chars.") filename = os.path.join(DATA_DIR, 'test_html_entities.txt') data = {'filename': filename, 'contents': base64.b64encode(open(filename, 'rb').read())} @@ -247,13 +291,8 @@ def test_unknown_mimetype_should_be_flagged(self): self.assertEqual(result['language'], "") self.assertEqual(result['file_metadata'], {}) - def test_unknown_encoding_should_be_ignored(self): - filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt') - expected = "This file has a weird byte (\x96) that makes it " \ - "impossible for libmagic to recognize it's encoding." - data = {'filename': filename, - 'contents': base64.b64encode(open(filename, 'rb').read())} - result = Extractor().process(data) - self.assertEqual(result['text'], expected) - self.assertEqual(result['file_metadata'], {}) - self.assertEqual(result['language'], 'en') + def test_calls_detect_language(self): + with patch(MODULE + 'detect_language') as detect_language_mock: + result = Extractor().process({'contents': base64.b64encode(b'ok')}) + self.assertEqual(result['language'], + detect_language_mock.return_value) From 25a8e54e0e106b536edba51633d50017029c0de5 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 2 Dec 2016 16:53:03 -0200 Subject: [PATCH 30/33] SpellingChecker: no need to check for KeyError from document keys --- pypln/backend/workers/spellchecker.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pypln/backend/workers/spellchecker.py b/pypln/backend/workers/spellchecker.py index e6bc93b..eeac5d3 100644 --- a/pypln/backend/workers/spellchecker.py +++ b/pypln/backend/workers/spellchecker.py @@ -49,10 +49,7 @@ def process(self, document): MissingDictionaryWarning) errors = None else: - try: - checker.set_text(document['text']) - errors = [[e.word, e.wordpos, e.suggest()] for e in checker] - except KeyError: - errors = None + checker.set_text(document['text']) + errors = [[e.word, e.wordpos, e.suggest()] for e in checker] return {'spelling_errors': errors} From 573a1117aa5acdb8a29d07008f2404b2932f6da9 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Tue, 6 Dec 2016 14:29:15 -0200 Subject: [PATCH 31/33] extractor: turn redundant tests into integration test --- tests/data/encoding_unknown_to_libmagic.txt | 1 - tests/test_worker_extractor.py | 48 ++++++--------------- 2 files changed, 14 insertions(+), 35 deletions(-) delete mode 100644 tests/data/encoding_unknown_to_libmagic.txt diff --git a/tests/data/encoding_unknown_to_libmagic.txt b/tests/data/encoding_unknown_to_libmagic.txt deleted file mode 100644 index 9fb69b2..0000000 --- a/tests/data/encoding_unknown_to_libmagic.txt +++ /dev/null @@ -1 +0,0 @@ -This file has a weird byte () that makes it impossible for libmagic to recognize it's encoding. 
diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index e364546..ac1df9e 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -96,35 +96,36 @@ def get_cld_exc(index): class DetectLanguageTest(TestCase): def setUp(self): - self.cld_patcher = patch(MODULE + 'cld.detect', - return_value=(Mock(), Mock(), - [(Mock(), 'lang'), - (Mock(), 'other_lang')])) - self.cld_mock = self.cld_patcher.start() + self.cld_mock = Mock(return_value=(Mock(), Mock(), + [(Mock(), 'lang'), + (Mock(), 'other_lang')])) + self.cld_patcher = patch(MODULE + 'cld.detect', self.cld_mock) - def tearDown(self): - self.cld_patcher.stop() - - def test_returns_most_likely_language(self): - self.assertEqual(detect_language('text'), 'lang') + def test_detects_portuguese(self): + """Sort of an integration test""" + text = 'Esse texto foi escrito por Álvaro em Português.' + self.assertEqual(detect_language(text), 'pt') def test_removes_bytes_cld_considers_invalid(self): self.cld_mock.side_effect = [get_cld_exc(0), get_cld_exc(3), self.cld_mock.return_value] - self.assertEqual(detect_language('012345'), 'lang') + with self.cld_patcher: + self.assertEqual(detect_language('012345'), 'lang') self.assertEqual(self.cld_mock.call_args_list, [call(b'012345'), call(b'12345'), call(b'1235')]) def test_removes_at_most_max_bytes_for_cld(self): self.cld_mock.side_effect = [get_cld_exc(0)] * 4 - with patch(MODULE + 'MAX_CLD_BYTES_TO_REMOVE', 3): + with patch(MODULE + 'MAX_CLD_BYTES_TO_REMOVE', 3),\ + self.cld_patcher: self.assertIsNone(detect_language('012345')) self.assertEqual(self.cld_mock.call_count, 3) def test_doesnt_silence_other_cld_errors(self): self.cld_mock.side_effect = [get_cld_exc(0), cld.error('another error')] - self.assertRaises(cld.error, detect_language, 'text') + with self.cld_patcher: + self.assertRaises(cld.error, detect_language, 'text') class TestExtractorWorker(TestCase): @@ -232,27 +233,6 @@ def test_extraction_from_html(self): self.assertEqual(result['text'], expected) self.assertEqual(result['mimetype'], 'text/html') - def test_language_detection_pt(self): - text = 'Esse texto foi escrito por Álvaro em Português.' - data = {'filename': 'text-pt.txt', - 'contents': base64.b64encode(text.encode('utf-8'))} - result = Extractor().process(data) - self.assertEqual(result['language'], 'pt') - - def test_language_detection_es(self): - text = 'Este texto ha sido escrito en Español por Álvaro.' - data = {'filename': 'text-es.txt', - 'contents': base64.b64encode(text.encode('utf-8'))} - result = Extractor().process(data) - self.assertEqual(result['language'], 'es') - - def test_language_detection_en(self): - text = 'This text was written by Álvaro in English.' - data = {'filename': 'text-en.txt', - 'contents': base64.b64encode(text.encode('utf-8'))} - result = Extractor().process(data) - self.assertEqual(result['language'], 'en') - def test_unescape_html_entities(self): expected = ("This text has html . 
Álvaro asked me to make" " sure it also has non ascii chars.") From 0265786bb08849dd82d1e2b60dc89f5773c3e296 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Tue, 6 Dec 2016 19:26:38 -0200 Subject: [PATCH 32/33] extractor tests: support newer version of pdfinfo --- tests/test_worker_extractor.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index ac1df9e..0079f78 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -168,7 +168,6 @@ def test_extraction_from_pdf_file(self): 'Author': 'Álvaro Justen', 'Creator': 'Writer', 'Producer': 'LibreOffice 3.5', - 'CreationDate': 'Fri Jun 1 17:07:57 2012', 'Tagged': 'no', 'Pages': '1', 'Encrypted': 'no', @@ -178,6 +177,12 @@ def test_extraction_from_pdf_file(self): } metadata_expected_set = set(metadata_expected.items()) metadata = result['file_metadata'] + + # Newer versions of pdfinfo add the timezone to this field + self.assertIn(metadata['CreationDate'], + ['Fri Jun 1 17:07:57 2012', + 'Fri Jun 1 17:07:57 2012 BRT']) + metadata_set = set(metadata.items()) diff_set = metadata_expected_set - metadata_set self.assertTrue(metadata_expected_set.issubset(metadata_set), From 7b84defcd4a10cdcca9ff86b3827372ca099dada Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Tue, 31 Jan 2017 17:32:27 -0200 Subject: [PATCH 33/33] change bigram worker to return metric names and respect bigram order --- pypln/backend/workers/bigrams.py | 44 +++--- tests/test_worker_bigrams.py | 242 ++++++++++++++++++++++++++----- 2 files changed, 228 insertions(+), 58 deletions(-) diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index 034972d..c99cb95 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -16,33 +16,31 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . 
+from collections import OrderedDict -import nltk -from collections import defaultdict - -from nltk.collocations import BigramCollocationFinder +from nltk import BigramCollocationFinder, BigramAssocMeasures from pypln.backend.celery_task import PyPLNTask +METRICS = ['chi_sq', + 'dice', + 'jaccard', + 'likelihood_ratio', + 'mi_like', + 'phi_sq', + 'pmi', + 'poisson_stirling', + 'raw_freq', + 'student_t'] -class Bigrams(PyPLNTask): - """Create a NLTK bigram finder and return a table in JSON format""" +class Bigrams(PyPLNTask): def process(self, document): - #todo: support filtering by stopwords - bigram_measures = nltk.collocations.BigramAssocMeasures() - metrics = ['chi_sq', - 'dice', - 'jaccard', - 'likelihood_ratio', - 'mi_like', - 'phi_sq', - 'pmi', - 'poisson_stirling', - 'raw_freq', - 'student_t'] bigram_finder = BigramCollocationFinder.from_words(document['tokens']) - br = defaultdict(lambda :[]) - for m in metrics: - for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)): - br[res[0]].append(res[1]) - return {'metrics': metrics, 'bigram_rank': list(br.items())} + bigram_rankings = OrderedDict() + for metric_name in METRICS: + metric = getattr(BigramAssocMeasures, metric_name) + for ranking in bigram_finder.score_ngrams(metric): + bigram = ranking[0] + d = bigram_rankings.setdefault(bigram, {}) + d[metric_name] = ranking[1] + return {'bigram_rankings': list(bigram_rankings.items())} diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index 6adf67e..027a701 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -16,43 +16,215 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . - -import nltk +from unittest import TestCase from pypln.backend.workers.bigrams import Bigrams -from .utils import TaskTest - -bigram_measures = nltk.collocations.BigramAssocMeasures() - - -class TestBigramWorker(TaskTest): - def test_bigrams_should_return_correct_score(self): - # We need this list comprehension because we need to save the word list - # in mongo (thus, it needs to be json serializable). Also, a list is - # what will be available to the worker in real situations. 
- tokens = [w for w in - nltk.corpus.genesis.words('english-web.txt')] - - doc_id = self.collection.insert({'tokens': tokens}, w=1) - Bigrams().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - bigram_rank = refreshed_document['bigram_rank'] - result = bigram_rank[0][1][0] - # This is the value of the chi_sq measure for this bigram in this - # colocation - expected_chi_sq = 95.59393417173634 - self.assertEqual(result, expected_chi_sq) +TOKENS = ['Ao', 'verme', 'que', 'primeiro', 'roeu', 'as', 'frias', 'carnes', + 'do', 'meu', 'cadáver', 'dedico', 'como', 'saudosa', 'lembrança', + 'estas', 'Memórias', 'Póstumas', '.'] +RANKINGS = {'bigram_rankings': [(('Ao', 'verme'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('Memórias', 'Póstumas'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('Póstumas', '.'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('as', 'frias'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('cadáver', 'dedico'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('carnes', 'do'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('como', 'saudosa'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('dedico', 'como'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('do', 'meu'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('estas', 'Memórias'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 
0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('frias', 'carnes'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('lembrança', 'estas'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('meu', 'cadáver'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('primeiro', 'roeu'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('que', 'primeiro'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('roeu', 'as'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('saudosa', 'lembrança'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('verme', 'que'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316})]} - def test_bigrams_could_contain_dollar_signs_and_dots(self): - tokens = ['$', '.'] - doc_id = self.collection.insert({'tokens': tokens}, w=1) - Bigrams().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - bigram_rank = refreshed_document['bigram_rank'] - result = bigram_rank[0][1][0] - # 2.0 is the value of the chi_sq measure for this bigram in this - # colocation - expected_chi_sq = 2.0 - self.assertEqual(result, expected_chi_sq) +class TestBigramWorker(TestCase): + def test_returns_bigram_rankings(self): + self.maxDiff = None + result = Bigrams().process({'tokens': TOKENS}) + self.assertEqual(result, RANKINGS)
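
As a usage illustration for the reworked Bigrams worker in the last patch (not part of the patch series itself): the worker now returns 'bigram_rankings' as a list of (bigram, {metric_name: score}) pairs, preserving the order in which bigrams first appear in the token stream. Below is a minimal sketch of sorting that structure by a single metric. The helper top_bigrams_by and the sample token list are hypothetical; calling Bigrams().process(...) directly mirrors the style of the new tests and assumes the pypln backend package is importable.

    # Sketch only: consumes the structure produced by the new Bigrams worker,
    # i.e. a list of (bigram, {metric_name: score}) pairs.
    from pypln.backend.workers.bigrams import Bigrams

    def top_bigrams_by(rankings, metric_name, n=5):
        # Sort bigrams by the chosen association metric, highest score first.
        return sorted(rankings, key=lambda item: item[1][metric_name],
                      reverse=True)[:n]

    # Hypothetical token list; in practice this comes from the tokenizer worker.
    tokens = ['o', 'rato', 'roeu', 'a', 'roupa', 'do', 'rei', 'de', 'Roma', '.']
    result = Bigrams().process({'tokens': tokens})  # same call style as the tests
    for bigram, scores in top_bigrams_by(result['bigram_rankings'], 'pmi'):
        print(bigram, scores['pmi'])

Sorting is left to consumers on purpose: since every metric is stored per bigram, a frontend can rank by whichever measure it needs without re-running the worker.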