From 702dbf1d3ad57a017fc9f10048dbdbdd89673096 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 23 Nov 2016 14:02:32 -0200 Subject: [PATCH 01/33] Uses pycld2 instead of the (outdate) chrom[...]tector --- pypln/backend/workers/extractor.py | 6 +++--- requirements/production.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 110730b..c273125 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -27,7 +27,7 @@ from mimetypes import guess_type from re import compile as regexp_compile, DOTALL, escape -import cld +import cld2 import magic from pypln.backend.celery_task import PyPLNTask @@ -204,9 +204,9 @@ def process(self, file_data): text = clean(text) if isinstance(text, unicode): - language = cld.detect(text.encode('utf-8'))[1] + language = cld2.detect(text.encode('utf-8'))[1] else: - language = cld.detect(text)[1] + language = cld2.detect(text)[1] return {'text': text, 'file_metadata': metadata, 'language': language, 'mimetype': file_mime_type, 'forced_decoding': forced_decoding} diff --git a/requirements/production.txt b/requirements/production.txt index bb43589..915673a 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -11,7 +11,7 @@ pyparsing>=1.5.6,<2.0 # Backend psutil -chromium_compact_language_detector +pycld2 filemagic numpy nltk>=2.7.8 From cb3d1d28d03b9df7c0cd468c358999db5ba05ad1 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 23 Nov 2016 14:03:14 -0200 Subject: [PATCH 02/33] Removes pyparsing from requirements It looks like we are not using it anymore --- requirements/production.txt | 7 ------- 1 file changed, 7 deletions(-) diff --git a/requirements/production.txt b/requirements/production.txt index 915673a..0e325c3 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -2,13 +2,6 @@ celery pymongo==2.8.1 -# The newest pyparsing (2.0) only supports python 3, -# so we explicitly install 1.5.7 (the last version that -# supports python 2) before one of our dependencies tries -# to install it. 
-# http://sourceforge.net/projects/pyparsing/forums/forum/337293/topic/6481050 -pyparsing>=1.5.6,<2.0 - # Backend psutil pycld2 From 915efa72b8b1aa90d03aed2a93bcf488d628e047 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Wed, 23 Nov 2016 20:39:18 -0200 Subject: [PATCH 03/33] fix cld import --- pypln/backend/workers/extractor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index c273125..992ea4b 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -27,7 +27,7 @@ from mimetypes import guess_type from re import compile as regexp_compile, DOTALL, escape -import cld2 +import pycld2 as cld import magic from pypln.backend.celery_task import PyPLNTask @@ -204,9 +204,9 @@ def process(self, file_data): text = clean(text) if isinstance(text, unicode): - language = cld2.detect(text.encode('utf-8'))[1] + language = cld.detect(text.encode('utf-8'))[1] else: - language = cld2.detect(text)[1] + language = cld.detect(text)[1] return {'text': text, 'file_metadata': metadata, 'language': language, 'mimetype': file_mime_type, 'forced_decoding': forced_decoding} From 058959248f9c7c346294616eacfc3184f7e142ba Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Wed, 23 Nov 2016 20:40:17 -0200 Subject: [PATCH 04/33] prevent mongo from connecting at import time --- pypln/backend/celery_task.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py index 2d3d93d..0b1c235 100644 --- a/pypln/backend/celery_task.py +++ b/pypln/backend/celery_task.py @@ -31,7 +31,7 @@ from pypln.backend import config -mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS) +mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS, _connect=False) database = mongo_client[config.MONGODB_DBNAME] document_collection = database[config.MONGODB_COLLECTION] From 0b4ccf68656080449f95fc1dd2b0b09852587535 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Wed, 23 Nov 2016 22:17:58 -0200 Subject: [PATCH 05/33] run 2to3 --- doc/conf.py | 12 +- pypln/backend/celery_app.py | 2 +- pypln/backend/config.py | 8 +- pypln/backend/workers/__init__.py | 24 +- pypln/backend/workers/bigrams.py | 2 +- pypln/backend/workers/extractor.py | 8 +- pypln/backend/workers/freqdist.py | 2 +- pypln/backend/workers/palavras_noun_phrase.py | 2 +- pypln/backend/workers/palavras_raw.py | 2 +- .../workers/palavras_semantic_tagger.py | 530 +++++++++--------- pypln/backend/workers/pos/__init__.py | 6 +- pypln/backend/workers/pos/pt_palavras.py | 56 +- pypln/backend/workers/trigrams.py | 6 +- pypln/backend/workers/word_cloud.py | 4 +- scripts/add_pipelines.py | 2 +- scripts/create_fake_measures.py | 42 +- scripts/mongo2sphinx.py | 2 +- tests/test_celery_task.py | 2 +- tests/test_elastic_indexer.py | 2 +- tests/test_worker_bigrams.py | 2 +- tests/test_worker_extractor.py | 36 +- tests/test_worker_freqdist.py | 10 +- tests/test_worker_lemmatizer_pt.py | 2 +- tests/test_worker_palavras_noun_phrase.py | 2 +- tests/test_worker_palavras_raw.py | 2 +- tests/test_worker_palavras_semantic_tagger.py | 10 +- tests/test_worker_pos.py | 4 +- tests/test_worker_spellchecker.py | 6 +- tests/test_worker_statistics.py | 2 +- tests/test_worker_tokenizer.py | 2 +- tests/test_worker_trigrams.py | 8 +- tests/test_worker_wordcloud.py | 4 +- 32 files changed, 402 insertions(+), 402 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index a6be0e8..6403136 100644 --- a/doc/conf.py +++ 
b/doc/conf.py @@ -46,8 +46,8 @@ master_doc = 'index' # General information about the project. -project = u'PyPLN' -copyright = u'2011, Flávio Codeço Coelho' +project = 'PyPLN' +copyright = '2011, Flávio Codeço Coelho' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -187,8 +187,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'PyPLN.tex', u'PyPLN Documentation', - u'Flávio Codeço Coelho', 'manual'), + ('index', 'PyPLN.tex', 'PyPLN Documentation', + 'Flávio Codeço Coelho', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -220,6 +220,6 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'pypln', u'PyPLN Documentation', - [u'Flávio Codeço Coelho'], 1) + ('index', 'pypln', 'PyPLN Documentation', + ['Flávio Codeço Coelho'], 1) ] diff --git a/pypln/backend/celery_app.py b/pypln/backend/celery_app.py index 342c5be..895d9cd 100644 --- a/pypln/backend/celery_app.py +++ b/pypln/backend/celery_app.py @@ -19,7 +19,7 @@ from celery import Celery from kombu import Exchange, Queue -import config +from . import config app = Celery('pypln_workers', backend='mongodb', broker='amqp://', include=['pypln.backend.workers']) diff --git a/pypln/backend/config.py b/pypln/backend/config.py index ec1d48e..f89bd6f 100644 --- a/pypln/backend/config.py +++ b/pypln/backend/config.py @@ -3,14 +3,14 @@ from decouple import config, Csv try: - import urlparse + import urllib.parse except ImportError: import urllib.parse as urlparse def parse_url(url): - urlparse.uses_netloc.append('mongodb') - urlparse.uses_netloc.append('celery') - url = urlparse.urlparse(url) + urllib.parse.uses_netloc.append('mongodb') + urllib.parse.uses_netloc.append('celery') + url = urllib.parse.urlparse(url) path = url.path[1:] path = path.split('?', 2)[0] diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py index 0125bde..9ca1ec2 100644 --- a/pypln/backend/workers/__init__.py +++ b/pypln/backend/workers/__init__.py @@ -17,18 +17,18 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . 
-from extractor import Extractor -from tokenizer import Tokenizer -from freqdist import FreqDist -from pos import POS -from statistics import Statistics -from bigrams import Bigrams -from palavras_raw import PalavrasRaw -from lemmatizer_pt import Lemmatizer -from palavras_noun_phrase import NounPhrase -from palavras_semantic_tagger import SemanticTagger -from word_cloud import WordCloud -from elastic_indexer import ElasticIndexer +from .extractor import Extractor +from .tokenizer import Tokenizer +from .freqdist import FreqDist +from .pos import POS +from .statistics import Statistics +from .bigrams import Bigrams +from .palavras_raw import PalavrasRaw +from .lemmatizer_pt import Lemmatizer +from .palavras_noun_phrase import NounPhrase +from .palavras_semantic_tagger import SemanticTagger +from .word_cloud import WordCloud +from .elastic_indexer import ElasticIndexer __all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics', diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index 302482f..034972d 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -45,4 +45,4 @@ def process(self, document): for m in metrics: for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)): br[res[0]].append(res[1]) - return {'metrics': metrics, 'bigram_rank': br.items()} + return {'metrics': metrics, 'bigram_rank': list(br.items())} diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 992ea4b..2a864e6 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -20,7 +20,7 @@ import base64 import shlex -from HTMLParser import HTMLParser +from html.parser import HTMLParser from tempfile import NamedTemporaryFile from os import unlink from subprocess import Popen, PIPE @@ -84,7 +84,7 @@ def parse_html(html, remove_tags=None, remove_inside=None, [''] * (total_to_remove - 2) content_between[index + 1] = '\n' complete_tags.append('') - result = ''.join(sum(zip(content_between, complete_tags), tuple())) + result = ''.join(sum(list(zip(content_between, complete_tags)), tuple())) return clean(result) def get_pdf_metadata(data): @@ -193,7 +193,7 @@ def process(self, file_data): text, forced_decoding = trial_decode(text) - if isinstance(text, unicode): + if isinstance(text, str): # HTMLParser only handles unicode objects. We can't pass the text # through it if we don't know the encoding, and it's possible we # also shouldn't. 
There's no way of knowing if it's a badly encoded @@ -203,7 +203,7 @@ def process(self, file_data): text = clean(text) - if isinstance(text, unicode): + if isinstance(text, str): language = cld.detect(text.encode('utf-8'))[1] else: language = cld.detect(text)[1] diff --git a/pypln/backend/workers/freqdist.py b/pypln/backend/workers/freqdist.py index 7bb7f7e..82d75ee 100644 --- a/pypln/backend/workers/freqdist.py +++ b/pypln/backend/workers/freqdist.py @@ -27,7 +27,7 @@ def process(self, document): tokens = [info.lower() for info in document_tokens] frequency_distribution = {token: tokens.count(token) \ for token in set(tokens)} - fd = frequency_distribution.items() + fd = list(frequency_distribution.items()) fd.sort(lambda x, y: cmp(y[1], x[1])) return {'freqdist': fd} diff --git a/pypln/backend/workers/palavras_noun_phrase.py b/pypln/backend/workers/palavras_noun_phrase.py index 76e3a18..f9dde80 100644 --- a/pypln/backend/workers/palavras_noun_phrase.py +++ b/pypln/backend/workers/palavras_noun_phrase.py @@ -40,7 +40,7 @@ def process(self, document): stdout=subprocess.PIPE, stderr=subprocess.PIPE) palavras_output = document['palavras_raw'] - if isinstance(palavras_output, unicode): + if isinstance(palavras_output, str): # we *need* to send a 'str' to the process. Otherwise it's going to try to use ascii. palavras_output = palavras_output.encode('utf-8') stdout, stderr = process.communicate(palavras_output) diff --git a/pypln/backend/workers/palavras_raw.py b/pypln/backend/workers/palavras_raw.py index 77e2d9a..e45bb11 100644 --- a/pypln/backend/workers/palavras_raw.py +++ b/pypln/backend/workers/palavras_raw.py @@ -46,7 +46,7 @@ def process(self, document): # default codec (ascii) in `text.encode(PALAVRAS_ENCODING)`. Since we # know the text came from mongodb, we can just decode it using utf-8 to # make sure we have a unicode object. 
- if not isinstance(text, unicode): + if not isinstance(text, str): text = text.decode('utf-8') process = subprocess.Popen([BASE_PARSER, PARSER_MODE], diff --git a/pypln/backend/workers/palavras_semantic_tagger.py b/pypln/backend/workers/palavras_semantic_tagger.py index 3f35ca5..1e448de 100644 --- a/pypln/backend/workers/palavras_semantic_tagger.py +++ b/pypln/backend/workers/palavras_semantic_tagger.py @@ -26,381 +26,381 @@ { 'Animal': { - '': u'Animal, umbrella tag (clone, fêmea, fóssil, parasito, predador)' , - '': u'Group of animals (cardume, enxame, passarada, ninhada)', - '': u'Domestic animal or big mammal (likely to have female forms etc.: terneiro, leão/leoa, cachorro)', - '': u'Group of domestic animals (boiada)', - '': u'Water-animal (tubarão, delfim)', - '': u'Mythological animal (basilisco)', - '': u'Land-animal (raposa)', - '': u'Bird (águia, bem-te-vi)', - '': u'Insect (borboleta)', - '': u'Cell-animal (bacteria, blood cells: linfócito)', + '': 'Animal, umbrella tag (clone, fêmea, fóssil, parasito, predador)' , + '': 'Group of animals (cardume, enxame, passarada, ninhada)', + '': 'Domestic animal or big mammal (likely to have female forms etc.: terneiro, leão/leoa, cachorro)', + '': 'Group of domestic animals (boiada)', + '': 'Water-animal (tubarão, delfim)', + '': 'Mythological animal (basilisco)', + '': 'Land-animal (raposa)', + '': 'Bird (águia, bem-te-vi)', + '': 'Insect (borboleta)', + '': 'Cell-animal (bacteria, blood cells: linfócito)', }, 'Plant': { - '': u'Plant, umbrella tag', - '': u'Group of plants, plantation (field, forest etc.: mata, nabal)', - '': u'Tree (oliveira, palmeira)', - '': u'Flower (rosa, taraxaco)', - '': u'Bush, shrub (rododendro, tamariz)', - '': u'(fruit, berries, nuts: maçã, morango, avelã, melancia)', - '': u'(vegetable espargo, funcho)', + '': 'Plant, umbrella tag', + '': 'Group of plants, plantation (field, forest etc.: mata, nabal)', + '': 'Tree (oliveira, palmeira)', + '': 'Flower (rosa, taraxaco)', + '': 'Bush, shrub (rododendro, tamariz)', + '': '(fruit, berries, nuts: maçã, morango, avelã, melancia)', + '': '(vegetable espargo, funcho)', }, 'Human': { - '': u'Human, umbrella tag', - '': u'Group of humans (organisations, teams, companies, e.g. editora)', - '': u'Attributive human umbrella tag (many -ista, -ante)', - '': u'Human classified by biological criteria (race, age etc., caboclo, mestiço, bebé, adulto)', - '': u'Human with family or other private relation (pai, noiva)', - '': u'Ideological human (comunista, implies ), also: follower, disciple (dadaista)', - '': u'Humanoid mythical (gods, fairy tale humanoids, curupira, duende)', - '': u'Nationality human (brasileiro, alemão), also: inhabitant (lisboeta)', - '': u'Professional human (marinheiro, implies ), also: sport, hobby (alpinista)', - '': u'Sick human (few: asmático, diabético, cp )', - '': u'Title noun (rei, senhora)', + '': 'Human, umbrella tag', + '': 'Group of humans (organisations, teams, companies, e.g. 
editora)', + '': 'Attributive human umbrella tag (many -ista, -ante)', + '': 'Human classified by biological criteria (race, age etc., caboclo, mestiço, bebé, adulto)', + '': 'Human with family or other private relation (pai, noiva)', + '': 'Ideological human (comunista, implies ), also: follower, disciple (dadaista)', + '': 'Humanoid mythical (gods, fairy tale humanoids, curupira, duende)', + '': 'Nationality human (brasileiro, alemão), also: inhabitant (lisboeta)', + '': 'Professional human (marinheiro, implies ), also: sport, hobby (alpinista)', + '': 'Sick human (few: asmático, diabético, cp )', + '': 'Title noun (rei, senhora)', }, 'Place and spatial': { - '': u'Place, umbrella tag', - '': u'Abstract place (anverso. auge)', - '': u'Civitas, town, country, county (equals + , cidade, país)', - '': u'Cover, lid (colcha, lona, tampa)', - '': u'Functional place, human built or human-used (aeroporto, anfiteatro, cp. for just a building)', - '': u'opening, hole (apertura, fossa)', - '': u'Path (road, street etc.: rua, pista)' , - '': u'Star object (planets, comets: planeta, quasar)', - '': u'surface (face, verniz, cp. )', - '': u'tip place, edge (pico, pontinha, cp. )', - '': u'Geographical, natural place (promontório, pântano)', - '': u'trap place (armadilha, armazelo)', - '': u'Water place (river, lake, sea: fonte, foz, lagoa)', - '': u'barrier noun (dique, limite, muralha)', - '': u'(building)', - '': u'(institution)', - '': u'(picture)', - '': u'(situation)', - '': u'anatomical/body position (few: desaprumo)', - '': u'social position, job (emprego, condado, capitania, presidência)', + '': 'Place, umbrella tag', + '': 'Abstract place (anverso. auge)', + '': 'Civitas, town, country, county (equals + , cidade, país)', + '': 'Cover, lid (colcha, lona, tampa)', + '': 'Functional place, human built or human-used (aeroporto, anfiteatro, cp. for just a building)', + '': 'opening, hole (apertura, fossa)', + '': 'Path (road, street etc.: rua, pista)' , + '': 'Star object (planets, comets: planeta, quasar)', + '': 'surface (face, verniz, cp. )', + '': 'tip place, edge (pico, pontinha, cp. )', + '': 'Geographical, natural place (promontório, pântano)', + '': 'trap place (armadilha, armazelo)', + '': 'Water place (river, lake, sea: fonte, foz, lagoa)', + '': 'barrier noun (dique, limite, muralha)', + '': '(building)', + '': '(institution)', + '': '(picture)', + '': '(situation)', + '': 'anatomical/body position (few: desaprumo)', + '': 'social position, job (emprego, condado, capitania, presidência)', }, 'Vehicle': { - '': u'Vehicle, umbrella tag and ground vehicle (car, train: carro, comboio, tanque, teleférico)', - '': u'Group of vehicles (armada, convoy: frota, esquadra)', - '': u'Water vehicle (ship: navio, submersível, canoa)', - '': u'Air vehicle (plane: hidroplano, jatinho)', + '': 'Vehicle, umbrella tag and ground vehicle (car, train: carro, comboio, tanque, teleférico)', + '': 'Group of vehicles (armada, convoy: frota, esquadra)', + '': 'Water vehicle (ship: navio, submersível, canoa)', + '': 'Air vehicle (plane: hidroplano, jatinho)', }, 'Abstract': { - '': u'Abstract countable, umbrella tag (alternativa, chance, lazer)', - '': u'Category word (latinismo, número atômico)', - '': u'sign, symbol (parêntese, semicolcheia)', - '': u'Abstract mass/non-countable, umbrella tag (still contains many cases that could be , e.g. habilidade, legalidade)', - '': u'Abstract/concept, neither countable nor mass (endogamia), cp. 
, etc.', - '': u'(features)', - '': u'direction noun (estibordo, contrasenso, norte)', - '': u'(shapes)', - '': u'meta noun (tipo, espécie)', - '': u'(MARCA) brand', - '': u'(DISCIPLINA) subject matter', - '': u'(ESCOLA) school of thought', - '': u'(IDEA) idea, concept', - '': u'(PLANO) named plan, project', - '': u'(OBRA) artist-s name, standing for body of work', - '': u'(NOME)', - '': u'(ESTADO) physiological state, in particular: disease', + '': 'Abstract countable, umbrella tag (alternativa, chance, lazer)', + '': 'Category word (latinismo, número atômico)', + '': 'sign, symbol (parêntese, semicolcheia)', + '': 'Abstract mass/non-countable, umbrella tag (still contains many cases that could be , e.g. habilidade, legalidade)', + '': 'Abstract/concept, neither countable nor mass (endogamia), cp. , etc.', + '': '(features)', + '': 'direction noun (estibordo, contrasenso, norte)', + '': '(shapes)', + '': 'meta noun (tipo, espécie)', + '': '(MARCA) brand', + '': '(DISCIPLINA) subject matter', + '': '(ESCOLA) school of thought', + '': '(IDEA) idea, concept', + '': '(PLANO) named plan, project', + '': '(OBRA) artist-s name, standing for body of work', + '': '(NOME)', + '': '(ESTADO) physiological state, in particular: disease', }, 'Concept': { - '': u'convention (social rule or law, lei, preceito)', - '': u'subject matter, profession, cf. , anatomia, citricultura, dactilografia)', - '': u'ideology or other value system (anarquismo, anti-ocidentalismo, apartheid)', - '': u'', - '': u'language (alemão, catalão, bengali)', - '': u'', - '': u'', - '': u'therapy (also and , acupuntura, balneoterapia)', + '': 'convention (social rule or law, lei, preceito)', + '': 'subject matter, profession, cf. , anatomia, citricultura, dactilografia)', + '': 'ideology or other value system (anarquismo, anti-ocidentalismo, apartheid)', + '': '', + '': 'language (alemão, catalão, bengali)', + '': '', + '': '', + '': 'therapy (also and , acupuntura, balneoterapia)', }, 'Game': { - '': u'play, game (bilhar, ioiô, poker, also )', + '': 'play, game (bilhar, ioiô, poker, also )', }, 'Genre': { - '': u'genre (especially art genre, cf. , modernismo, tropicalismo)', + '': 'genre (especially art genre, cf. , modernismo, tropicalismo)', }, 'Quantity': { - '': u'', - '': u'quantity noun (bocada, teor, sem-fim)', - '': u'currency noun (countable, implies , cf. , dirham, euro, real, dólar)', - '': u'amount of money (bolsa, custo, imposto, cf. )', + '': '', + '': 'quantity noun (bocada, teor, sem-fim)', + '': 'currency noun (countable, implies , cf. , dirham, euro, real, dólar)', + '': 'amount of money (bolsa, custo, imposto, cf. 
)', }, 'Action': { - '': u'Action umbrella tag (+CONTROL, PERFECTIVE)', - '': u'beat-action (thrashing, pancada, surra)', - '': u'do-action (typically dar/fazer + N, tentativa, teste, homenagem)', - '': u'speech act or communicative act (proposta, ordem)', - '': u'trick-action (cheat, fraud, ruse, jeito, fraude, similar to )', - '': u'Activity, umbrella tag (+CONTROL, IMPERFECTIVE, correria, manejo)', - '': u'', - '': u'', - '': u'', - '': u'dance (both , and , calipso, flamenco, forró)', - '': u'fight, conflict (also and +TEMP, briga, querela)', - '': u'speech situation, talk, discussion, quarrel (implies and , entrevista, lero-lero)', + '': 'Action umbrella tag (+CONTROL, PERFECTIVE)', + '': 'beat-action (thrashing, pancada, surra)', + '': 'do-action (typically dar/fazer + N, tentativa, teste, homenagem)', + '': 'speech act or communicative act (proposta, ordem)', + '': 'trick-action (cheat, fraud, ruse, jeito, fraude, similar to )', + '': 'Activity, umbrella tag (+CONTROL, IMPERFECTIVE, correria, manejo)', + '': '', + '': '', + '': '', + '': 'dance (both , and , calipso, flamenco, forró)', + '': 'fight, conflict (also and +TEMP, briga, querela)', + '': 'speech situation, talk, discussion, quarrel (implies and , entrevista, lero-lero)', }, 'Anatomical': { - '': u'Anatomical noun, umbrella tag (carótida, clítoris, dorso)', - '': u'Movable anatomy (arm, leg, braço, bíceps, cotovelo)', - '': u'Organ (heart, liver, hipófise, coração, testículo)', - '': u'Bone (calcâneo, fíbula, vértebra)', - '': u'Animal anatomy (rúmen, carapaça, chifres, tromba)', - '': u'Bird anatomy (bico, pluma)', - '': u'Fish anatomy (few: bránquias, siba)', - '': u'Insect anatomy (few: tentáculo, olho composto)', - '': u'Plant anatomy (bulbo, caule, folha)', - '': u'(human anatomical feature)', + '': 'Anatomical noun, umbrella tag (carótida, clítoris, dorso)', + '': 'Movable anatomy (arm, leg, braço, bíceps, cotovelo)', + '': 'Organ (heart, liver, hipófise, coração, testículo)', + '': 'Bone (calcâneo, fíbula, vértebra)', + '': 'Animal anatomy (rúmen, carapaça, chifres, tromba)', + '': 'Bird anatomy (bico, pluma)', + '': 'Fish anatomy (few: bránquias, siba)', + '': 'Insect anatomy (few: tentáculo, olho composto)', + '': 'Plant anatomy (bulbo, caule, folha)', + '': '(human anatomical feature)', }, 'Thing': { - '': u'Concrete countable object, umbrella tag (briquete, coágulo, normally movable things, unlike )', - '': u'Artifact, umbrella tag (so far empty category in PALAVRAS)', - '': u'ornamental object (few: guirlanda, rufo)', - '': u'flat long object (few: board, plank, lousa, tabla)', - '': u'fire object (bonfire, spark, chispa, fogo, girândola)', - '': u'handle (garra, ansa, chupadouro)', - '': u'light artifact (lampião, farol, projector) ', - '': u'(atomic) particle (few: cátion, eletrônio)', - '': u'read object (carteira, cupom, bilhete, carta, cf. )', - '': u'cloth object (towel, napkin, carpet, rag) , cp. ', - '': u'(cc-round) stones and stone-sized round objects (pedra, itá, amonite, tijolo)', - '': u'stick object (long and thin, vara, lançe, paulito)', - '': u'(OBJECT) named object', - '': u'(OBJECT) common noun used as name', - '': u'(SUBSTANCIA) substance', - '': u'(CLASSE) classification category for things', - '': u'(CLASSE) plant name', - '': u'(MOEDA) currency name (also marked on the number)', - '': u'mass noun (e.g. 
"leite", "a-gua")', - '': u'furniture (cama, cadeira, tambo, quadro)', - '': u'container (implies quantifying, ampola, chícara, aquário)', + '': 'Concrete countable object, umbrella tag (briquete, coágulo, normally movable things, unlike )', + '': 'Artifact, umbrella tag (so far empty category in PALAVRAS)', + '': 'ornamental object (few: guirlanda, rufo)', + '': 'flat long object (few: board, plank, lousa, tabla)', + '': 'fire object (bonfire, spark, chispa, fogo, girândola)', + '': 'handle (garra, ansa, chupadouro)', + '': 'light artifact (lampião, farol, projector) ', + '': '(atomic) particle (few: cátion, eletrônio)', + '': 'read object (carteira, cupom, bilhete, carta, cf. )', + '': 'cloth object (towel, napkin, carpet, rag) , cp. ', + '': '(cc-round) stones and stone-sized round objects (pedra, itá, amonite, tijolo)', + '': 'stick object (long and thin, vara, lançe, paulito)', + '': '(OBJECT) named object', + '': '(OBJECT) common noun used as name', + '': '(SUBSTANCIA) substance', + '': '(CLASSE) classification category for things', + '': '(CLASSE) plant name', + '': '(MOEDA) currency name (also marked on the number)', + '': 'mass noun (e.g. "leite", "a-gua")', + '': 'furniture (cama, cadeira, tambo, quadro)', + '': 'container (implies quantifying, ampola, chícara, aquário)', }, 'Substance': { - '': u'concrete mass/non-countable, umbrella tag, substance (cf. , terra, choça, magma)', - '': u'human-made substance (cf. , cemento)', - '': u'chemical substance, also biological (acetileno, amônio, anilina, bilirrubina', - '': u'gas substance (so far few: argônio, overlap with. and )', - '': u'liquid substance (azeite, gasolina, plasma, overlap with and )', - '': u'remedy (medical or hygiene, antibiótico, canabis, quinina, part of , overlap with )', + '': 'concrete mass/non-countable, umbrella tag, substance (cf. , terra, choça, magma)', + '': 'human-made substance (cf. , cemento)', + '': 'chemical substance, also biological (acetileno, amônio, anilina, bilirrubina', + '': 'gas substance (so far few: argônio, overlap with. and )', + '': 'liquid substance (azeite, gasolina, plasma, overlap with and )', + '': 'remedy (medical or hygiene, antibiótico, canabis, quinina, part of , overlap with )', }, 'Materials': { - '': u'material (argila, bronze, granito, cf. )', - '': u'cloth material (seda, couro, vison, kevlar), cp. ', - '': u'cord, string, rope, tape (previously , arame, fio, fibrila)', + '': 'material (argila, bronze, granito, cf. )', + '': 'cloth material (seda, couro, vison, kevlar), cp. ', + '': 'cord, string, rope, tape (previously , arame, fio, fibrila)', }, 'Clothing': { - '': u'animal clothing (sela, xabraque)', - '': u'human clothing (albornoz, anoraque, babadouro, bermudas)', - '': u'beauty clothing (e.g. jewelry, diadema, pendente, pulseira)', - '': u'hat (sombrero, mitra, coroa)', - '': u'shoe (bota, chinela, patim)', - '': u'cloth material (seda, couro, vison, kevlar), cp. ', - '': u'(clothing)', + '': 'animal clothing (sela, xabraque)', + '': 'human clothing (albornoz, anoraque, babadouro, bermudas)', + '': 'beauty clothing (e.g. jewelry, diadema, pendente, pulseira)', + '': 'hat (sombrero, mitra, coroa)', + '': 'shoe (bota, chinela, patim)', + '': 'cloth material (seda, couro, vison, kevlar), cp. 
', + '': '(clothing)', }, 'Collective': { - '': u'set,collective (random or systematic collection/compound/multitude of similar but distinct small parts, conjunto, série)', - '': u'thing collective, pile (baralho, lanço)', - '': u'plant-part collective (buquê, folhagem)', - '': u'semantic collective, collection (arquivo, repertório)', - '': u'tool collective, set (intrumentário, prataria)', - '': u'(group)', - '': u'(herd)', - '': u'(plantation)', - '': u'(convoy)', + '': 'set,collective (random or systematic collection/compound/multitude of similar but distinct small parts, conjunto, série)', + '': 'thing collective, pile (baralho, lanço)', + '': 'plant-part collective (buquê, folhagem)', + '': 'semantic collective, collection (arquivo, repertório)', + '': 'tool collective, set (intrumentário, prataria)', + '': '(group)', + '': '(herd)', + '': '(plantation)', + '': '(convoy)', }, 'Time_Event': { - '': u'duration noun (test: durar+, implies , e.g. átimo, mês, hora)', - '': u'temporal object, point in time (amanhecer, novilúnio, test: até+, cf. and )', - '': u'non-organised event (-CONTROL, PERFECTIVE, milagre, morte)', - '': u'occasion, human/social event (copa do mundo, aniversário, jantar, desfile, cp. unorganized ) ', - '': u'process (-CONTROL, -PERFECTIVE, cp. , balcanização, convecção, estagnação)', - '': u'', - '': u'', - '': u'(EFEMERIDE) one-time [historical] occurrence', - '': u'(DATA) date', - '': u'(HORA) hour', - '': u'(PERIODO) period', - '': u'(CICLICO) cyclic time expression', - '': u'month noun/name (agosto, julho, part of )', - '': u'period of time (prototypical test: durante, e.g. guerra, década, cf. and )', + '': 'duration noun (test: durar+, implies , e.g. átimo, mês, hora)', + '': 'temporal object, point in time (amanhecer, novilúnio, test: até+, cf. and )', + '': 'non-organised event (-CONTROL, PERFECTIVE, milagre, morte)', + '': 'occasion, human/social event (copa do mundo, aniversário, jantar, desfile, cp. unorganized ) ', + '': 'process (-CONTROL, -PERFECTIVE, cp. , balcanização, convecção, estagnação)', + '': '', + '': '', + '': '(EFEMERIDE) one-time [historical] occurrence', + '': '(DATA) date', + '': '(HORA) hour', + '': '(PERIODO) period', + '': '(CICLICO) cyclic time expression', + '': 'month noun/name (agosto, julho, part of )', + '': 'period of time (prototypical test: durante, e.g. guerra, década, cf. and )', }, 'Feature': { - '': u'feature/property, umbrella tag (problematicidade, proporcionalidade)', - '': u'anatomical "local" feature, includes countables, e.g. barbela, olheiras)', - '': u'general countable feature (vestígio, laivos, vinco)', - '': u'human physical feature, not countable (lindura, compleição, same as , cp. anatomical local features )', - '': u'', - '': u'human psychological feature (passionalidade, pavonice, cp. passing states )', - '': u'quantifiable feature (e.g. circunferência, calor, DanGram-s covers both and )', - '': u'', - '': u'human social feature (right or duty): e.g. copyright, privilégio, imperativo legal)', - '': u'', - '': u'(human state)', + '': 'feature/property, umbrella tag (problematicidade, proporcionalidade)', + '': 'anatomical "local" feature, includes countables, e.g. barbela, olheiras)', + '': 'general countable feature (vestígio, laivos, vinco)', + '': 'human physical feature, not countable (lindura, compleição, same as , cp. anatomical local features )', + '': '', + '': 'human psychological feature (passionalidade, pavonice, cp. passing states )', + '': 'quantifiable feature (e.g. 
circunferência, calor, DanGram-s covers both and )', + '': '', + '': 'human social feature (right or duty): e.g. copyright, privilégio, imperativo legal)', + '': '', + '': '(human state)', }, 'Food': { - '': u'natural/simplex food (aveia, açúcar, carne, so far including )', - '': u'countable food (few: ovo, dente de alho, most are or )', - '': u'human-prepared/complex culinary food (caldo verde, lasanha)', - '': u'culinary countable food (biscoito, enchido, panetone, pastel)', - '': u'drink (cachaça, leite, guaraná, moca)', - '': u'fruit, berry, nut (still mostly marked as , abricote, amora, avelã, cebola)', - '': u'condiments, pepper', + '': 'natural/simplex food (aveia, açúcar, carne, so far including )', + '': 'countable food (few: ovo, dente de alho, most are or )', + '': 'human-prepared/complex culinary food (caldo verde, lasanha)', + '': 'culinary countable food (biscoito, enchido, panetone, pastel)', + '': 'drink (cachaça, leite, guaraná, moca)', + '': 'fruit, berry, nut (still mostly marked as , abricote, amora, avelã, cebola)', + '': 'condiments, pepper', }, 'Part': { - '': u'distinctive or functional part (ingrediente, parte, trecho)', - '': u'structural part of building or vehicle (balustrada, porta, estai)', - '': u'indistinctive (little) piece (pedaço, raspa)', - '': u'', - '': u'', + '': 'distinctive or functional part (ingrediente, parte, trecho)', + '': 'structural part of building or vehicle (balustrada, porta, estai)', + '': 'indistinctive (little) piece (pedaço, raspa)', + '': '', + '': '', }, 'Perception': { - '': u'what you feel (senses or sentiment, pain, e.g. arrepio, aversão, desagrado, cócegas, some overlap with )', - '': u'sound (what you hear, apitadela, barrulho, berro, crepitação)', - '': u'olfactory impression (what you smell, bafo, chamuscom fragrância)', - '': u'what you taste (PALAVRAS: not implemented)', - '': u'visual impression (what you see, arco-iris, réstia, vislumbre)', + '': 'what you feel (senses or sentiment, pain, e.g. arrepio, aversão, desagrado, cócegas, some overlap with )', + '': 'sound (what you hear, apitadela, barrulho, berro, crepitação)', + '': 'olfactory impression (what you smell, bafo, chamuscom fragrância)', + '': 'what you taste (PALAVRAS: not implemented)', + '': 'visual impression (what you see, arco-iris, réstia, vislumbre)', }, 'Semantic Product': { - '': u'semiotic artifact, work of art, umbrella tag (all specified in PALAVRAS)', - '': u'cognition product (concept, plan, system, conjetura, esquema, plano, prejuízo)', - '': u'listen-work (music, cantarola, prelúdio, at the same time : bossa nova)', - '': u'nonsense, rubbish (implies , galimatias, farelório)', - '': u'read-work (biografia, dissertação, e-mail, ficha cadastral)', - '': u'speak-work (palestra, piada, exposto)', - '': u'watch-work (filme, esquete, mininovela)', - '': u'(speach act)', - '': u'', + '': 'semiotic artifact, work of art, umbrella tag (all specified in PALAVRAS)', + '': 'cognition product (concept, plan, system, conjetura, esquema, plano, prejuízo)', + '': 'listen-work (music, cantarola, prelúdio, at the same time : bossa nova)', + '': 'nonsense, rubbish (implies , galimatias, farelório)', + '': 'read-work (biografia, dissertação, e-mail, ficha cadastral)', + '': 'speak-work (palestra, piada, exposto)', + '': 'watch-work (filme, esquete, mininovela)', + '': '(speach act)', + '': '', }, 'Disease': { - '': u'disease (acne, AIDS, sida, alcoolismo, cp. 
)', - '': u'', - '': u'countable disease-object (abscesso, berruga, cicatriz, gangrena)', + '': 'disease (acne, AIDS, sida, alcoolismo, cp. )', + '': '', + '': 'countable disease-object (abscesso, berruga, cicatriz, gangrena)', }, 'State-of-affairs': { - '': u'psychological situation or physical state of affairs (reclusão, arruaça, ilegalidade, more complex and more "locative" than and ', - '': u'state (of something, otherwise ), abundância, calma, baixa-mar, equilíbrio', - '': u'human state (desamparo, desesperança, dormência, euforia, febre', - '': u'', - '': u'', + '': 'psychological situation or physical state of affairs (reclusão, arruaça, ilegalidade, more complex and more "locative" than and ', + '': 'state (of something, otherwise ), abundância, calma, baixa-mar, equilíbrio', + '': 'human state (desamparo, desesperança, dormência, euforia, febre', + '': '', + '': '', }, 'Sport': { - '': u'sport (capoeira, futebol, golfe, also and )', + '': 'sport (capoeira, futebol, golfe, also and )', }, 'Tool': { - '': u'tool, umbrella tag (abana-moscas, lápis, computador, maceta, "handable", cf. )', - '': u'cutting tool, knife (canivete, espada)', - '': u'shooting tool, gun (carabina, metralhadora, helicanão, in Dangram: )', - '': u'musical instrument (clavicórdio, ocarina, violão)', - '': u'sailing tool, sail (vela latina, joanete, coringa)', - '': u'machine (complex, usually with moving parts, betoneira, embrulhador, limpa-pratos, cp. )', - '': u'tube object (cânula, gasoduto, zarabatana, shape-category, typically with another category, like or )', + '': 'tool, umbrella tag (abana-moscas, lápis, computador, maceta, "handable", cf. )', + '': 'cutting tool, knife (canivete, espada)', + '': 'shooting tool, gun (carabina, metralhadora, helicanão, in Dangram: )', + '': 'musical instrument (clavicórdio, ocarina, violão)', + '': 'sailing tool, sail (vela latina, joanete, coringa)', + '': 'machine (complex, usually with moving parts, betoneira, embrulhador, limpa-pratos, cp. )', + '': 'tube object (cânula, gasoduto, zarabatana, shape-category, typically with another category, like or )', }, 'Unit': { - '': u'unit noun (always implying , implied by and , e.g. caloria, centímetro, lúmen))', + '': 'unit noun (always implying , implied by and , e.g. caloria, centímetro, lúmen))', }, 'Weather': { - '': u'weather (states), umbrella tag (friagem, bruma)', - '': u'countable weather phenomenon (nuvem, tsunami)', - '': u'rain and other precipitation (chuvisco, tromba d-água, granizo)', - '': u'wind, storm (brisa, furacão)', + '': 'weather (states), umbrella tag (friagem, bruma)', + '': 'countable weather phenomenon (nuvem, tsunami)', + '': 'rain and other precipitation (chuvisco, tromba d-água, granizo)', + '': 'wind, storm (brisa, furacão)', }, 'Person': { - '': u'(INDIVIDUAL) person name (cp. )', - '': u'(CARGO) official function (~ cp. and )', - '': u'(MEMBRO) member', + '': '(INDIVIDUAL) person name (cp. )', + '': '(CARGO) official function (~ cp. and )', + '': '(MEMBRO) member', }, 'Organization_Group': { - '': u'(ADMINISTRACAO, ORG.) administrative body (government, town administration etc.)', - '': u'(INSTITUICAO/EMPRESA) commercial or non-commercial, non-administrative non-party organisations (not place-bound, therefore not the same as )', - '': u'(EMPRESA) organized site (e.g. restaurant, cp. )', - '': u'(EMPRESA) media organisation (e.g. 
newspaper, tv channel)', - '': u'(INSTITUICAO) political party', - '': u'(SUB) organized part of any of the above', - '': u'currently unsupported: (EMPRESA) company (not site-bound, unlike , now fused with. )', + '': '(ADMINISTRACAO, ORG.) administrative body (government, town administration etc.)', + '': '(INSTITUICAO/EMPRESA) commercial or non-commercial, non-administrative non-party organisations (not place-bound, therefore not the same as )', + '': '(EMPRESA) organized site (e.g. restaurant, cp. )', + '': '(EMPRESA) media organisation (e.g. newspaper, tv channel)', + '': '(INSTITUICAO) political party', + '': '(SUB) organized part of any of the above', + '': 'currently unsupported: (EMPRESA) company (not site-bound, unlike , now fused with. )', }, 'Group': { - '': u'(GROUPOIND) people, family', - '': u'(GROUPOCARGO) board, government (not fully implemented)', - '': u'currently unsupported (GROUPOMEMBRO) club, e.g. football club (now fused with )', + '': '(GROUPOIND) people, family', + '': '(GROUPOCARGO) board, government (not fully implemented)', + '': 'currently unsupported (GROUPOMEMBRO) club, e.g. football club (now fused with )', }, 'Place': { - '': u'(GEOGRAFICO) geographical location (cp. )', - '': u'(ADMINISTRACAO, LOC.) civitas (country, town, state, cp. )', - '
': u'(CORREIO) address (including numbers etc.)', - '': u'(ALARGADO) functional place (cp. )', - '': u'(VIRTUAL) virtual place', - '': u'(OBJECTO) astronomical place (in HAREM object, not place)', - '': u'suggested (ALARGADO) roads, motorway (unlike
)', + '': '(GEOGRAFICO) geographical location (cp. )', + '': '(ADMINISTRACAO, LOC.) civitas (country, town, state, cp. )', + '
': '(CORREIO) address (including numbers etc.)', + '': '(ALARGADO) functional place (cp. )', + '': '(VIRTUAL) virtual place', + '': '(OBJECTO) astronomical place (in HAREM object, not place)', + '': 'suggested (ALARGADO) roads, motorway (unlike
)', }, 'Work_of_Art': { - '': u'(REPRODUZIDO) [title of] reproduced work, copy', - '': u'(PUBLICACAO) [scientific] publication', - '': u'(PRODUTO) product brand', - '': u'(PRODUTO) vehicle brand (cp. , , )', - '': u'(ARTE) work of art', - '': u'picture (combination of , and , caricatura, cintilograma, diapositivo)', + '': '(REPRODUZIDO) [title of] reproduced work, copy', + '': '(PUBLICACAO) [scientific] publication', + '': '(PRODUTO) product brand', + '': '(PRODUTO) vehicle brand (cp. , , )', + '': '(ARTE) work of art', + '': 'picture (combination of , and , caricatura, cintilograma, diapositivo)', }, 'Colours': { - '': u'colours', + '': 'colours', }, 'Numeric_and_Math': { - '': u'(QUANTIDADE) simple measuring numeral', - '': u'(CLASSIFICADO) predicating numeral', - '': u'(MOEDA) currency name (also marked on the unit)', - '': u'geometry noun (circle, shape, e.g. losango, octógono, elipse)', - '': u'line (few: linha, percentil, curvas isobáricas)', + '': '(QUANTIDADE) simple measuring numeral', + '': '(CLASSIFICADO) predicating numeral', + '': '(MOEDA) currency name (also marked on the unit)', + '': 'geometry noun (circle, shape, e.g. losango, octógono, elipse)', + '': 'line (few: linha, percentil, curvas isobáricas)', }, 'Modifying_Adjectives': { - '': u'adjective modifying human noun', - '': u'adjective modifying inanimate noun ', - '': u'adjective modifying animal', - '': u'adjective modifying plant', - '': u'color adjective', - '': u'nationality adjective (also: from a certain town etc.)', - '': u'(human) attributive adjective (not fully implemented, cp. , e.g. "um presidente COMUNISTA")', + '': 'adjective modifying human noun', + '': 'adjective modifying inanimate noun ', + '': 'adjective modifying animal', + '': 'adjective modifying plant', + '': 'color adjective', + '': 'nationality adjective (also: from a certain town etc.)', + '': '(human) attributive adjective (not fully implemented, cp. , e.g. "um presidente COMUNISTA")', }, 'Verbs_related_human_things': { - '': u'verb with human subject', - '': u'verb with inanimate subject', + '': 'verb with human subject', + '': 'verb with inanimate subject', }, } @@ -425,7 +425,7 @@ def process(self, document): word_sem_tags = angle_brackets_contents.findall(line.strip()) is_tagged = False for tag in word_sem_tags: - for category, subcategories in SEMANTIC_TAGS.items(): + for category, subcategories in list(SEMANTIC_TAGS.items()): if tag in subcategories: tagged_entities.setdefault(category, []).append(word) is_tagged = True diff --git a/pypln/backend/workers/pos/__init__.py b/pypln/backend/workers/pos/__init__.py index 9400fd1..4647189 100644 --- a/pypln/backend/workers/pos/__init__.py +++ b/pypln/backend/workers/pos/__init__.py @@ -18,8 +18,8 @@ # along with PyPLN. If not, see . -import en_nltk -import pt_palavras +from . import en_nltk +from . 
import pt_palavras from pypln.backend.workers.palavras_raw import palavras_installed from pypln.backend.celery_task import PyPLNTask @@ -48,7 +48,7 @@ def process(self, document): if language in MAPPING: tagset, tagged_text = MAPPING[language](document) text = document['text'] - if not isinstance(text, unicode): + if not isinstance(text, str): text = text.decode('utf-8') tagged_text_with_offset = put_offset(text, tagged_text) return {'pos': tagged_text_with_offset, 'tagset': tagset} diff --git a/pypln/backend/workers/pos/pt_palavras.py b/pypln/backend/workers/pos/pt_palavras.py index 19d4b9d..d24efa1 100644 --- a/pypln/backend/workers/pos/pt_palavras.py +++ b/pypln/backend/workers/pos/pt_palavras.py @@ -23,56 +23,56 @@ PALAVRAS_ENCODING = 'utf-8' WORD_CLASSES = { - u'N': u'Nouns', - u'PROP': u'Proper nouns', - u'SPEC': u'Specifiers', - u'DET': u'Determiners', - u'PERS': u'Personal pronouns', - u'ADJ': u'Adjectives', - u'ADV': u'Adverbs', - u'V': u'Verbs', - u'NUM': u'Numerals', - u'PRP': u'Preposition', - u'KS': u'Subordinating conjunctions', - u'KC': u'Coordinationg conjunctions', - u'IN': u'Interjections', - u'EC': u'Hyphen-separated prefix', - u'BL': u'Blank Line', - u'ES': u'End of Sentence', - u'NW': u'Non Word', + 'N': 'Nouns', + 'PROP': 'Proper nouns', + 'SPEC': 'Specifiers', + 'DET': 'Determiners', + 'PERS': 'Personal pronouns', + 'ADJ': 'Adjectives', + 'ADV': 'Adverbs', + 'V': 'Verbs', + 'NUM': 'Numerals', + 'PRP': 'Preposition', + 'KS': 'Subordinating conjunctions', + 'KC': 'Coordinationg conjunctions', + 'IN': 'Interjections', + 'EC': 'Hyphen-separated prefix', + 'BL': 'Blank Line', + 'ES': 'End of Sentence', + 'NW': 'Non Word', } def pos(document): if 'palavras_raw' not in document: - return u'', [] + return '', [] palavras_output = document['palavras_raw'] - if not isinstance(palavras_output, unicode): + if not isinstance(palavras_output, str): palavras_output = palavras_output.decode(PALAVRAS_ENCODING) tagged_text = [] - for line in palavras_output.split(u'\n'): + for line in palavras_output.split('\n'): line = line.strip() #print(line) - if line.isspace() or line == u'': + if line.isspace() or line == '': continue - elif line.startswith(u'<'): + elif line.startswith('<'): continue - elif line.startswith(u'$'): + elif line.startswith('$'): non_word = line.split()[0][1:] if non_word.isdigit(): - non_word_tag = u'NUM' + non_word_tag = 'NUM' else: non_word_tag = non_word tagged_text.append((non_word, non_word_tag)) - elif len(line.split(u'\t')) < 2: # Discard malformed lines + elif len(line.split('\t')) < 2: # Discard malformed lines continue else: - info = line.split(u'\t') - final = u'\t'.join(info[1:]).split() + info = line.split('\t') + final = '\t'.join(info[1:]).split() word = info[0].strip() syntatic_semantic_tags = final[1:] - tags = filter(lambda x: x in WORD_CLASSES, syntatic_semantic_tags) + tags = [x for x in syntatic_semantic_tags if x in WORD_CLASSES] if tags: pos_tag = tags[0] tagged_text.append((word, pos_tag)) diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py index 4ad46ef..8778bed 100644 --- a/pypln/backend/workers/trigrams.py +++ b/pypln/backend/workers/trigrams.py @@ -45,11 +45,11 @@ def process(self, document): # We cannot store the trigram as a tuple (mongo keys need to be # strings). We decided to join tokens using spaces since a # space will never be in a token. - key = u' '.join(res[0]) + key = ' '.join(res[0]) # Mongo cannot have `.` or `$` in key names. 
Unfortunatelly # this means we need to replace them with placeholders. - key = key.replace(u'$', u'\dollarsign') - key = key.replace(u'.', u'\dot') + key = key.replace('$', '\dollarsign') + key = key.replace('.', '\dot') tr[key].append(res[1]) return {'trigram_rank': tr, 'metrics':metrics} diff --git a/pypln/backend/workers/word_cloud.py b/pypln/backend/workers/word_cloud.py index 4f55dad..149ec04 100644 --- a/pypln/backend/workers/word_cloud.py +++ b/pypln/backend/workers/word_cloud.py @@ -19,7 +19,7 @@ import base64 import string -from StringIO import StringIO +from io import StringIO import numpy import nltk @@ -32,7 +32,7 @@ def filter_stopwords(fdist, lang): stopwords = list(string.punctuation) if lang in long_name: stopwords += nltk.corpus.stopwords.words(long_name[lang]) - return filter(lambda pair: pair[0].lower() not in stopwords, fdist) + return [pair for pair in fdist if pair[0].lower() not in stopwords] class WordCloud(PyPLNTask): diff --git a/scripts/add_pipelines.py b/scripts/add_pipelines.py index 2450bda..4d30735 100755 --- a/scripts/add_pipelines.py +++ b/scripts/add_pipelines.py @@ -18,7 +18,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . -from __future__ import print_function + import sys from logging import Logger, StreamHandler, Formatter from pymongo import Connection diff --git a/scripts/create_fake_measures.py b/scripts/create_fake_measures.py index 8e6b82f..a542187 100644 --- a/scripts/create_fake_measures.py +++ b/scripts/create_fake_measures.py @@ -25,17 +25,17 @@ data = \ {'host': {'cpu': {'cpu percent': 4.9, 'number of cpus': 4}, - 'memory': {'buffers': 214372352L, + 'memory': {'buffers': 214372352, 'cached': 919220224, - 'free': 1369661440L, + 'free': 1369661440, 'free virtual': 0, 'percent': 65.21955293723627, - 'real free': 2503254016L, + 'real free': 2503254016, 'real percent': 36.433711831634305, - 'real used': 1434767360L, - 'total': 3938021376L, + 'real used': 1434767360, + 'total': 3938021376, 'total virtual': 0, - 'used': 2568359936L, + 'used': 2568359936, 'used virtual': 0}, 'network': {'cluster ip': '127.0.0.1', 'interfaces': {'eth0': {'bytes received': 171472224, @@ -105,8 +105,8 @@ def populate_collection(): db[collection_name].drop() collection = db[collection_name] - print '[{}] Inserting total of {} measures ({} for {} brokers)...'\ - .format(asctime(), measures * brokers, measures, brokers) + print('[{}] Inserting total of {} measures ({} for {} brokers)...'\ + .format(asctime(), measures * brokers, measures, brokers)) for measure in range(1, measures + 1): for broker in range(1, brokers + 1): if '_id' in data: @@ -117,17 +117,17 @@ def populate_collection(): random() collection.insert(data) if measure % 10000 == 0: - print ' [{}] Inserted {} measures'.format(asctime(), - measure * broker) - print '[{}] Done inserting measures!'.format(asctime()) + print(' [{}] Inserted {} measures'.format(asctime(), + measure * broker)) + print('[{}] Done inserting measures!'.format(asctime())) - print '[{}] Creating index for "host.network.cluster ip"'.format(asctime()) + print('[{}] Creating index for "host.network.cluster ip"'.format(asctime())) collection.ensure_index('host.network.cluster ip') - print '[{}] Done!'.format(asctime()) + print('[{}] Done!'.format(asctime())) - print '[{}] Creating index for "timestamp"'.format(asctime()) + print('[{}] Creating index for "timestamp"'.format(asctime())) collection.ensure_index([('timestamp', -1)]) - print '[{}] Done!'.format(asctime()) + print('[{}] 
Done!'.format(asctime())) database_name = 'pypln' collection_name = 'monitoring' # WARNING: it'll drop the collection! @@ -149,10 +149,10 @@ def populate_collection(): .distinct('host.network.cluster ip')) end_time = time() total_time = end_time - start_time -print 'Time to get broker IPs: {}. Broker IPs: {}'.format(total_time, - ', '.join(broker_ips)) +print('Time to get broker IPs: {}. Broker IPs: {}'.format(total_time, + ', '.join(broker_ips))) -print '[{}] Getting last measure for each broker...'.format(asctime()) +print('[{}] Getting last measure for each broker...'.format(asctime())) measures = {} start_time = time() for broker_ip in broker_ips: @@ -162,7 +162,7 @@ def populate_collection(): measures[broker_ip] = result end_time = time() total_time = end_time - start_time -print '[{}] Time to get all information: {}'.format(asctime(), total_time) -for broker_ip, measure_list in measures.iteritems(): - print 'Broker: {}, measure: {}'.format(broker_ip, measure_list[0]) +print('[{}] Time to get all information: {}'.format(asctime(), total_time)) +for broker_ip, measure_list in measures.items(): + print('Broker: {}, measure: {}'.format(broker_ip, measure_list[0])) connection.close() diff --git a/scripts/mongo2sphinx.py b/scripts/mongo2sphinx.py index 89f3438..b6fafd0 100755 --- a/scripts/mongo2sphinx.py +++ b/scripts/mongo2sphinx.py @@ -61,7 +61,7 @@ def serialize(doc,id): an unique unsigned integer `id`. We use a counter for this. """ document = Element("sphinx:document", attrib={'id':str(id)}) - for k,v in doc.iteritems(): + for k,v in doc.items(): if k == '_id': SubElement(document,k).text = str(v) continue diff --git a/tests/test_celery_task.py b/tests/test_celery_task.py index fd1adde..544dda9 100644 --- a/tests/test_celery_task.py +++ b/tests/test_celery_task.py @@ -17,7 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . 
from pypln.backend.celery_task import PyPLNTask -from utils import TaskTest +from .utils import TaskTest class FakeTask(PyPLNTask): def process(self, document): diff --git a/tests/test_elastic_indexer.py b/tests/test_elastic_indexer.py index faaafab..6f319d6 100644 --- a/tests/test_elastic_indexer.py +++ b/tests/test_elastic_indexer.py @@ -1,5 +1,5 @@ #-*- coding:utf-8 -*- -u""" +""" Created on 20/05/15 by fccoelho license: GPL V3 or Later diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index de605e2..6adf67e 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -20,7 +20,7 @@ import nltk from pypln.backend.workers.bigrams import Bigrams -from utils import TaskTest +from .utils import TaskTest bigram_measures = nltk.collocations.BigramAssocMeasures() diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index d7819a5..a3b8f14 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -21,7 +21,7 @@ import os from textwrap import dedent from pypln.backend.workers import Extractor -from utils import TaskTest +from .utils import TaskTest DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) @@ -67,25 +67,25 @@ def test_extraction_from_pdf_file(self): # our Extractor found (it may have found more details # depending on the toolset used to extract metadata) metadata_expected = { - u'Author': u'Álvaro Justen', - u'Creator': u'Writer', - u'Producer': u'LibreOffice 3.5', - u'CreationDate': u'Fri Jun 1 17:07:57 2012', - u'Tagged': u'no', - u'Pages': u'1', - u'Encrypted': u'no', - u'Page size': u'612 x 792 pts (letter)', - u'Optimized': u'no', - u'PDF version': u'1.4', + 'Author': 'Álvaro Justen', + 'Creator': 'Writer', + 'Producer': 'LibreOffice 3.5', + 'CreationDate': 'Fri Jun 1 17:07:57 2012', + 'Tagged': 'no', + 'Pages': '1', + 'Encrypted': 'no', + 'Page size': '612 x 792 pts (letter)', + 'Optimized': 'no', + 'PDF version': '1.4', } - metadata_expected_set = set(metadata_expected.iteritems()) + metadata_expected_set = set(metadata_expected.items()) metadata = refreshed_document['file_metadata'] - metadata_set = set(metadata.iteritems()) + metadata_set = set(metadata.items()) diff_set = metadata_expected_set - metadata_set self.assertTrue(metadata_expected_set.issubset(metadata_set), ("Extracted metadata is not a subset of the expected metadata. " "Items missing or with different values: {}").format( - u", ".join(unicode(item) for item in diff_set))) + ", ".join(str(item) for item in diff_set))) self.assertEqual(refreshed_document['mimetype'], 'application/pdf') def test_extraction_from_html(self): @@ -165,7 +165,7 @@ def test_language_detection_en(self): self.assertEqual(refreshed_document['language'], 'en') def test_unescape_html_entities(self): - expected = (u"This text has html . Álvaro asked me to make" + expected = ("This text has html . 
Álvaro asked me to make" " sure it also has non ascii chars.") filename = os.path.join(DATA_DIR, 'test_html_entities.txt') data = {'filename': filename, @@ -176,7 +176,7 @@ def test_unescape_html_entities(self): self.assertEqual(refreshed_document['text'], expected) def test_should_detect_encoding_and_return_a_unicode_object(self): - expected = u"Flávio" + expected = "Flávio" filename = os.path.join(DATA_DIR, 'test_iso-8859-1.txt') data = {'filename': filename, 'contents': base64.b64encode(open(filename).read())} @@ -184,7 +184,7 @@ def test_should_detect_encoding_and_return_a_unicode_object(self): Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(type(refreshed_document['text']), unicode) + self.assertEqual(type(refreshed_document['text']), str) def test_should_guess_mimetype_for_file_without_extension(self): contents = "This is a test file. I'm testing PyPLN extractor worker!" @@ -213,7 +213,7 @@ def test_unknown_mimetype_should_be_flagged(self): def test_unknown_encoding_should_be_ignored(self): filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt') - expected = u"This file has a weird byte (\x96) that makes it impossible for libmagic to recognize it's encoding." + expected = "This file has a weird byte (\x96) that makes it impossible for libmagic to recognize it's encoding." data = {'filename': filename, 'contents': base64.b64encode(open(filename).read())} doc_id = self.collection.insert(data, w=1) diff --git a/tests/test_worker_freqdist.py b/tests/test_worker_freqdist.py index bde9c98..c23f280 100644 --- a/tests/test_worker_freqdist.py +++ b/tests/test_worker_freqdist.py @@ -17,16 +17,16 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . 
from pypln.backend.workers import FreqDist -from utils import TaskTest +from .utils import TaskTest class TestFreqDistWorker(TaskTest): def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self): - tokens = [u'The', u'sky', u'is', u'blue', u',', u'the', u'sun', u'is', - u'yellow', u'.'] + tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', + 'yellow', '.'] - expected_fd = [[u'is', 2], [u'the', 2], [u'blue', 1], [u'sun', 1], - [u'sky', 1], [u',', 1], [u'yellow', 1], [u'.', 1]] + expected_fd = [['is', 2], ['the', 2], ['blue', 1], ['sun', 1], + ['sky', 1], [',', 1], ['yellow', 1], ['.', 1]] # This is just preparing the expected input in the database diff --git a/tests/test_worker_lemmatizer_pt.py b/tests/test_worker_lemmatizer_pt.py index 3887d81..ed0f156 100644 --- a/tests/test_worker_lemmatizer_pt.py +++ b/tests/test_worker_lemmatizer_pt.py @@ -21,7 +21,7 @@ from textwrap import dedent from pypln.backend.workers import Lemmatizer -from utils import TaskTest +from .utils import TaskTest class TestLemmatizerWorker(TaskTest): diff --git a/tests/test_worker_palavras_noun_phrase.py b/tests/test_worker_palavras_noun_phrase.py index e9982ba..4ed026a 100644 --- a/tests/test_worker_palavras_noun_phrase.py +++ b/tests/test_worker_palavras_noun_phrase.py @@ -22,7 +22,7 @@ from pypln.backend.workers import NounPhrase from pypln.backend.workers.palavras_raw import palavras_installed -from utils import TaskTest +from .utils import TaskTest class TestNounPhraseWorker(TaskTest): diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py index de2b6b8..2c9853d 100644 --- a/tests/test_worker_palavras_raw.py +++ b/tests/test_worker_palavras_raw.py @@ -22,7 +22,7 @@ from textwrap import dedent from pypln.backend.workers import palavras_raw -from utils import TaskTest +from .utils import TaskTest ORIGINAL_PATH = palavras_raw.BASE_PARSER diff --git a/tests/test_worker_palavras_semantic_tagger.py b/tests/test_worker_palavras_semantic_tagger.py index 1b3abbe..f540fc6 100644 --- a/tests/test_worker_palavras_semantic_tagger.py +++ b/tests/test_worker_palavras_semantic_tagger.py @@ -20,7 +20,7 @@ from textwrap import dedent from pypln.backend.workers import SemanticTagger -from utils import TaskTest +from .utils import TaskTest class TestSemanticTaggerWorker(TaskTest): @@ -78,10 +78,10 @@ def test_ambiguous_tags(self): ''').strip() + '\n\n' expected_tags = { - 'Non_Tagged': [u'Eu', u'bem', u'enquanto', u'ele', u'está', - u'em', u'o'], - 'Place and spatial': [u'canto'], - 'Verbs_related_human_things': [u'canto'] + 'Non_Tagged': ['Eu', 'bem', 'enquanto', 'ele', 'está', + 'em', 'o'], + 'Place and spatial': ['canto'], + 'Verbs_related_human_things': ['canto'] } doc_id = self.collection.insert({'palavras_raw': palavras_output, 'palavras_raw_ran': True}, w=1) diff --git a/tests/test_worker_pos.py b/tests/test_worker_pos.py index 6307192..af0c302 100644 --- a/tests/test_worker_pos.py +++ b/tests/test_worker_pos.py @@ -22,7 +22,7 @@ from textwrap import dedent from pypln.backend.workers.palavras_raw import palavras_installed from pypln.backend.workers import POS -from utils import TaskTest +from .utils import TaskTest class TestPosWorker(TaskTest): @@ -56,7 +56,7 @@ def test_pos_should_run_pt_palavras_if_text_is_in_portuguese(self): ''').strip() + '\n\n' # '.' is the only named entity here. 
- expected = [[u'.', u'.', 29]] + expected = [['.', '.', 29]] doc_id = self.collection.insert({'text': text, 'tokens': tokens, 'language': 'pt', 'palavras_raw': palavras_raw}, w=1) POS().delay(doc_id) diff --git a/tests/test_worker_spellchecker.py b/tests/test_worker_spellchecker.py index b81bb93..09e07f6 100644 --- a/tests/test_worker_spellchecker.py +++ b/tests/test_worker_spellchecker.py @@ -20,14 +20,14 @@ import os from textwrap import dedent from pypln.backend.workers import spellchecker -from utils import TaskTest +from .utils import TaskTest DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) class TestSpellcheckerWorker(TaskTest): def test_spellchek_pt(self): - text = u"Meu cachoro é um pastor" + text = "Meu cachoro é um pastor" doc_id = self.collection.insert({'text': text, 'language': 'pt_BR'}, w=1) spellchecker.SpellingChecker().delay(doc_id) @@ -39,7 +39,7 @@ def test_spellchek_pt(self): self.assertEqual(refreshed_document['spelling_errors'][0][1], 4) def test_spellchek_en(self): - text = u"The cat bit the doggyo" + text = "The cat bit the doggyo" doc_id = self.collection.insert({'text': text, 'language': 'en'}, w=1) spellchecker.SpellingChecker().delay(doc_id) diff --git a/tests/test_worker_statistics.py b/tests/test_worker_statistics.py index 3370e8d..4da4cfb 100644 --- a/tests/test_worker_statistics.py +++ b/tests/test_worker_statistics.py @@ -18,7 +18,7 @@ # along with PyPLN. If not, see . from pypln.backend.workers import Statistics -from utils import TaskTest +from .utils import TaskTest class TestStatisticsWorker(TaskTest): diff --git a/tests/test_worker_tokenizer.py b/tests/test_worker_tokenizer.py index 9d59cac..67053a6 100644 --- a/tests/test_worker_tokenizer.py +++ b/tests/test_worker_tokenizer.py @@ -18,7 +18,7 @@ # along with PyPLN. If not, see . from pypln.backend.workers import Tokenizer -from utils import TaskTest +from .utils import TaskTest class TestTokenizerWorker(TaskTest): diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py index 93575e9..683e714 100644 --- a/tests/test_worker_trigrams.py +++ b/tests/test_worker_trigrams.py @@ -18,9 +18,9 @@ # along with PyPLN. If not, see . import nltk -import cPickle +import pickle from pypln.backend.workers.trigrams import Trigrams -from utils import TaskTest +from .utils import TaskTest trigram_measures = nltk.collocations.TrigramAssocMeasures() @@ -33,7 +33,7 @@ def test_Trigrams_should_return_correct_score(self): Trigrams().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) trigram_rank = refreshed_document['trigram_rank'] - result = trigram_rank[u'olive leaf plucked'][0] + result = trigram_rank['olive leaf plucked'][0] # This is the value of the chi_sq measure for this trigram in this # colocation expected_chi_sq = 1940754916.9623578 @@ -45,7 +45,7 @@ def test_Trigrams_may_contain_dots_and_dollar_signs(self): Trigrams().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) trigram_rank = refreshed_document['trigram_rank'] - result = trigram_rank[u'\dollarsign test \dot'][0] + result = trigram_rank['\dollarsign test \dot'][0] # This is the value of the chi_sq measure for this trigram in this # colocation expected_chi_sq = 10.5 diff --git a/tests/test_worker_wordcloud.py b/tests/test_worker_wordcloud.py index 23ed090..2f504f2 100644 --- a/tests/test_worker_wordcloud.py +++ b/tests/test_worker_wordcloud.py @@ -18,12 +18,12 @@ # along with PyPLN. If not, see . 
import base64 -from StringIO import StringIO +from io import StringIO from PIL import Image from pypln.backend.workers import WordCloud -from utils import TaskTest +from .utils import TaskTest class TestFreqDistWorker(TaskTest): From e41028ebd02962ed2962729e8931d4f9cf9f4c86 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 16:39:37 -0200 Subject: [PATCH 06/33] Removes redundant try/except block in urlparse import --- pypln/backend/config.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pypln/backend/config.py b/pypln/backend/config.py index f89bd6f..e5bd6d3 100644 --- a/pypln/backend/config.py +++ b/pypln/backend/config.py @@ -1,12 +1,8 @@ import os +import urllib.parse from decouple import config, Csv -try: - import urllib.parse -except ImportError: - import urllib.parse as urlparse - def parse_url(url): urllib.parse.uses_netloc.append('mongodb') urllib.parse.uses_netloc.append('celery') From ccfb5d9df2bd19a6af466e6d844f2b8507aae789 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 16:45:40 -0200 Subject: [PATCH 07/33] Pins celery version Celery 4.0 won't automatically register tasks that inherit from Task. We need to solve this before we can use the newer version. Celery's documentation only mentions class-based tasks in a way that is very different from what we do here, though. --- requirements/production.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/production.txt b/requirements/production.txt index 0e325c3..c043e27 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -1,5 +1,5 @@ # Common -celery +celery==3.1.23 pymongo==2.8.1 # Backend From 01a5fa63e0dced3f18fe1f55d39afe08e51c5ad1 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 17:02:30 -0200 Subject: [PATCH 08/33] Removes unnecessary cast to list that 2to3 inserted --- pypln/backend/workers/palavras_semantic_tagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypln/backend/workers/palavras_semantic_tagger.py b/pypln/backend/workers/palavras_semantic_tagger.py index 1e448de..a66d42c 100644 --- a/pypln/backend/workers/palavras_semantic_tagger.py +++ b/pypln/backend/workers/palavras_semantic_tagger.py @@ -425,7 +425,7 @@ def process(self, document): word_sem_tags = angle_brackets_contents.findall(line.strip()) is_tagged = False for tag in word_sem_tags: - for category, subcategories in list(SEMANTIC_TAGS.items()): + for category, subcategories in SEMANTIC_TAGS.items(): if tag in subcategories: tagged_entities.setdefault(category, []).append(word) is_tagged = True From b16be95e28855d213839bb13f8e0ffa3bdf475d9 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 17:23:21 -0200 Subject: [PATCH 09/33] Fixes test that expected str but receives bytes This test was changed by 2to3, that's why it broke. 
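
In Python 3, output read through subprocess.PIPE arrives as bytes rather
than str, which is why the expected value gains a b'' prefix here. A
minimal sketch of that behaviour, using echo as a stand-in for the
palavras parser:

    import subprocess

    # communicate() returns bytes unless an encoding (or
    # universal_newlines=True) is passed to Popen.
    process = subprocess.Popen(['echo', 'Eu sei'], stdout=subprocess.PIPE)
    stdout, _ = process.communicate()
    assert isinstance(stdout, bytes)                  # b'Eu sei\n'
    assert stdout.decode('utf-8').strip() == 'Eu sei'
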
--- tests/test_worker_palavras_raw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py index 2c9853d..24bdc63 100644 --- a/tests/test_worker_palavras_raw.py +++ b/tests/test_worker_palavras_raw.py @@ -53,7 +53,7 @@ def test_palavras_should_return_raw_if_it_is_installed(self): doc_id = self.collection.insert( {'text': 'Eu sei que neste momento falo para todo Brasil.', 'language': 'pt'}, w=1) - expected_raw = dedent(''' + expected_raw = dedent(b''' Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2 sei [saber] V PR 1S IND VFIN @FS-STA #2->0 que [que] KS @SUB #3->7 From 21aa0a6e515676c9f2bdaf65a67cbb5c41d3e608 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 17:41:05 -0200 Subject: [PATCH 10/33] Adds test to make sure the 'process' method receives the expected data --- tests/test_celery_task.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_celery_task.py b/tests/test_celery_task.py index 544dda9..4952817 100644 --- a/tests/test_celery_task.py +++ b/tests/test_celery_task.py @@ -17,6 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . from pypln.backend.celery_task import PyPLNTask +from unittest import mock from .utils import TaskTest class FakeTask(PyPLNTask): @@ -37,3 +38,11 @@ def test_task_should_get_the_correct_document(self): refreshed_doc = self.collection.find_one({'_id': correct_doc_id}) self.assertEqual(refreshed_doc['result'], 'correct') + + @mock.patch.object(FakeTask, 'process') + def test_should_get_current_data_from_database(self, mocked_process): + document = {'input': 'correct'} + doc_id = self.collection.insert(document, w=1) + self.collection.insert({'input': 'wrong'}, w=1) + FakeTask().delay(doc_id) + mocked_process.assert_called_with(document) From 7d540d00742c660bd9ea19fa9c6f32b0fd68cc8e Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 17:41:50 -0200 Subject: [PATCH 11/33] Fixes existing base task test This test now makes sure only the correct document is updated. --- tests/test_celery_task.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_celery_task.py b/tests/test_celery_task.py index 4952817..0e087fa 100644 --- a/tests/test_celery_task.py +++ b/tests/test_celery_task.py @@ -25,7 +25,7 @@ def process(self, document): return {'result': document['input']} class TestCeleryTask(TaskTest): - def test_task_should_get_the_correct_document(self): + def test_saves_returned_data_to_database(self): """This is a regression test. PyPLNTask was not filtering by _id. It was getting the first document it found. """ @@ -36,8 +36,10 @@ def test_task_should_get_the_correct_document(self): FakeTask().delay(correct_doc_id) refreshed_doc = self.collection.find_one({'_id': correct_doc_id}) + refreshed_wrong_doc = self.collection.find_one({'_id': wrong_doc_id}) self.assertEqual(refreshed_doc['result'], 'correct') + self.assertNotIn('result', refreshed_wrong_doc.keys()) @mock.patch.object(FakeTask, 'process') def test_should_get_current_data_from_database(self, mocked_process): From aa4478a5c77cb2eeeedb69b2b3f2dbd19475af6a Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 18:20:37 -0200 Subject: [PATCH 12/33] Uses BytesIO instead of StringIO in wordcloud 2to3 introduced an error because it couldn't know what StringIO was being used for. 
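
PIL writes raw PNG bytes, so the in-memory buffer has to be a BytesIO;
io.StringIO only accepts str and would raise a TypeError on save(). A
small sketch of the pattern the worker relies on (the blank image is a
stand-in for the output of make_wordcloud):

    import base64
    from io import BytesIO

    from PIL import Image

    img = Image.new('RGB', (10, 10))        # placeholder image
    fd = BytesIO()
    img.save(fd, format='PNG')              # writes binary data to the buffer
    fd.seek(0)
    encoded = base64.b64encode(fd.read())   # bytes, safe to store in MongoDB
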
--- pypln/backend/workers/word_cloud.py | 4 ++-- tests/test_worker_wordcloud.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pypln/backend/workers/word_cloud.py b/pypln/backend/workers/word_cloud.py index 149ec04..5bf5efc 100644 --- a/pypln/backend/workers/word_cloud.py +++ b/pypln/backend/workers/word_cloud.py @@ -19,7 +19,7 @@ import base64 import string -from io import StringIO +from io import BytesIO import numpy import nltk @@ -41,7 +41,7 @@ def process(self, document): words = numpy.array([t[0] for t in fdist]) counts = numpy.array([t[1] for t in fdist]) wordcloud_img = make_wordcloud(words, counts) - fd = StringIO() + fd = BytesIO() wordcloud_img.save(fd, format="PNG") fd.seek(0) result = {'wordcloud': base64.b64encode(fd.read())} diff --git a/tests/test_worker_wordcloud.py b/tests/test_worker_wordcloud.py index 2f504f2..186c5a3 100644 --- a/tests/test_worker_wordcloud.py +++ b/tests/test_worker_wordcloud.py @@ -18,7 +18,7 @@ # along with PyPLN. If not, see . import base64 -from io import StringIO +from io import BytesIO from PIL import Image @@ -37,7 +37,7 @@ def test_wordcloud_should_return_a_base64_encoded_png(self): refreshed_document = self.collection.find_one({'_id': doc_id}) raw_png_data = base64.b64decode(refreshed_document['wordcloud']) - fake_file = StringIO(raw_png_data) + fake_file = BytesIO(raw_png_data) img = Image.open(fake_file) img.verify() self.assertEqual(img.format, 'PNG') From d311b74dce0ef5ef96db1c23e00ff0f0b88699bc Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 18:22:33 -0200 Subject: [PATCH 13/33] Changes Wordcloud test not to touch the database The next few commits will change tests in a similar manner. This test is no longer touching the database, because we rely on the PyPLNTask test to test the fetch/save functionality. 
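
With this approach a worker test builds a plain dict, calls process()
directly and asserts on the returned dict; no MongoDB insert or delay()
round-trip is involved. A minimal sketch of the pattern (the test class
name is illustrative), reusing the frequency distribution from the test
changed below:

    import base64
    import unittest

    from pypln.backend.workers import WordCloud

    class WordCloudProcessTest(unittest.TestCase):
        def test_returns_base64_encoded_wordcloud(self):
            # The worker is exercised directly; fetching from and saving to
            # the database is covered by the PyPLNTask tests instead.
            doc = {'freqdist': [('is', 2), ('the', 2), ('blue', 1), ('sun', 1),
                                ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)],
                   'language': 'en'}
            result = WordCloud().process(doc)
            self.assertIn('wordcloud', result)
            base64.b64decode(result['wordcloud'])  # decodes as base64
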
--- tests/test_worker_wordcloud.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test_worker_wordcloud.py b/tests/test_worker_wordcloud.py index 186c5a3..25ddba5 100644 --- a/tests/test_worker_wordcloud.py +++ b/tests/test_worker_wordcloud.py @@ -19,23 +19,20 @@ import base64 from io import BytesIO +import unittest from PIL import Image from pypln.backend.workers import WordCloud -from .utils import TaskTest -class TestFreqDistWorker(TaskTest): - name = "WordCloud" +class TestFreqDistWorker(unittest.TestCase): def test_wordcloud_should_return_a_base64_encoded_png(self): doc = {'freqdist': [('is', 2), ('the', 2), ('blue', 1), ('sun', 1), ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)], 'language': 'en'} - doc_id = self.collection.insert(doc, w=1) - WordCloud().delay(doc_id) + result = WordCloud().process(doc) - refreshed_document = self.collection.find_one({'_id': doc_id}) - raw_png_data = base64.b64decode(refreshed_document['wordcloud']) + raw_png_data = base64.b64decode(result['wordcloud']) fake_file = BytesIO(raw_png_data) img = Image.open(fake_file) From 65c07b1c61dfcb60da66fbd0cd135d8e2e575cea Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 24 Nov 2016 18:58:23 -0200 Subject: [PATCH 14/33] Changes palavras_raw test to not touch the database --- pypln/backend/workers/palavras_raw.py | 19 +++++++------- tests/test_worker_palavras_raw.py | 37 +++++++++++---------------- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/pypln/backend/workers/palavras_raw.py b/pypln/backend/workers/palavras_raw.py index e45bb11..95161ba 100644 --- a/pypln/backend/workers/palavras_raw.py +++ b/pypln/backend/workers/palavras_raw.py @@ -39,14 +39,15 @@ def process(self, document): text = document['text'] - # For some reason, in some pypln installations the document['text'] is - # not always unicode as it should be. This may be due to errors during - # the decoding process that we fixed earlier. That meant that, when we - # got a non-unicode string, python would try to decode it using the - # default codec (ascii) in `text.encode(PALAVRAS_ENCODING)`. Since we - # know the text came from mongodb, we can just decode it using utf-8 to - # make sure we have a unicode object. - if not isinstance(text, str): + # This code is here because when using python2 for some + # reason, sometimes document['text'] was not a unicode object + # (as it should be, coming from pymongo). Since we're now + # using python3, we should really always get a str (unicode) + # object. But, since we do not know the real reason for the + # original error, we will keep this code here for now. As + # before, if we receive a bytes object, since it came from + # mongodb we can be sure it will be encoded in utf-8. 
+ if isinstance(text, bytes): text = text.decode('utf-8') process = subprocess.Popen([BASE_PARSER, PARSER_MODE], @@ -55,4 +56,4 @@ def process(self, document): stderr=subprocess.PIPE) stdout, stderr = process.communicate(text.encode(PALAVRAS_ENCODING)) - return {'palavras_raw': stdout, 'palavras_raw_ran': True} + return {'palavras_raw': stdout.decode('utf-8'), 'palavras_raw_ran': True} diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py index 24bdc63..628eafc 100644 --- a/tests/test_worker_palavras_raw.py +++ b/tests/test_worker_palavras_raw.py @@ -20,40 +20,34 @@ from unittest import skipIf from textwrap import dedent +import unittest from pypln.backend.workers import palavras_raw -from .utils import TaskTest ORIGINAL_PATH = palavras_raw.BASE_PARSER -class TestPalavrasRawWorker(TaskTest): +class TestPalavrasRawWorker(unittest.TestCase): def test_should_run_only_if_language_is_portuguese(self): - doc_id = self.collection.insert({'text': 'There was a rock on the way.', - 'language': 'en'}, w=1) - - palavras_raw.PalavrasRaw().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['palavras_raw_ran'], False) + doc = {'text': 'There was a rock on the way.', 'language': 'en'} + result = palavras_raw.PalavrasRaw().process(doc) + self.assertEqual(result['palavras_raw_ran'], False) def test_palavras_not_installed(self): palavras_raw.BASE_PARSER = '/not-found' - doc_id = self.collection.insert( - {'text': 'Tinha uma pedra no meio do caminho.', - 'language': 'pt'}, w=1) - palavras_raw.PalavrasRaw().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['palavras_raw_ran'], False) + doc = {'text': 'Tinha uma pedra no meio do caminho.', + 'language': 'pt'} + result = palavras_raw.PalavrasRaw().process(doc) + self.assertEqual(result['palavras_raw_ran'], False) @skipIf(not palavras_raw.palavras_installed(), 'palavras software is not installed') def test_palavras_should_return_raw_if_it_is_installed(self): palavras_raw.BASE_PARSER = ORIGINAL_PATH - doc_id = self.collection.insert( - {'text': 'Eu sei que neste momento falo para todo Brasil.', - 'language': 'pt'}, w=1) - expected_raw = dedent(b''' + doc = {'text': 'Eu sei que neste momento falo para todo Brasil.', + 'language': 'pt'} + expected_raw = dedent(''' Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2 sei [saber] V PR 1S IND VFIN @FS-STA #2->0 que [que] KS @SUB #3->7 @@ -67,7 +61,6 @@ def test_palavras_should_return_raw_if_it_is_installed(self): $. #11->0 ''').strip() + '\n\n' - result = palavras_raw.PalavrasRaw().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['palavras_raw'], expected_raw) - self.assertEqual(refreshed_document['palavras_raw_ran'], True) + result = palavras_raw.PalavrasRaw().process(doc) + self.assertEqual(result['palavras_raw'], expected_raw) + self.assertEqual(result['palavras_raw_ran'], True) From 9c8f952f8d433e6d1abd40be4d7876adfd5b5af9 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 20:06:25 -0200 Subject: [PATCH 15/33] Fix freqdist test and sorting List sorting changed in Python 3 and apparently string sorting did too. 
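
Python 3 removed both the cmp() builtin and the cmp argument to
list.sort(), so the comparator-based call has to become a key function;
sorting on (-count, token) also makes the order of equally frequent
tokens deterministic. A short sketch of the difference:

    # Python 2 (no longer valid):  fd.sort(lambda x, y: cmp(y[1], x[1]))
    # Python 3: sort with a key function instead of a comparator.
    fd = [('the', 2), ('sky', 1), ('is', 2), ('blue', 1)]
    fd.sort(key=lambda item: (-item[1], item[0]))
    assert fd == [('is', 2), ('the', 2), ('blue', 1), ('sky', 1)]
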
--- pypln/backend/workers/freqdist.py | 2 +- tests/test_worker_freqdist.py | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/pypln/backend/workers/freqdist.py b/pypln/backend/workers/freqdist.py index 82d75ee..bdf3712 100644 --- a/pypln/backend/workers/freqdist.py +++ b/pypln/backend/workers/freqdist.py @@ -28,6 +28,6 @@ def process(self, document): frequency_distribution = {token: tokens.count(token) \ for token in set(tokens)} fd = list(frequency_distribution.items()) - fd.sort(lambda x, y: cmp(y[1], x[1])) + fd.sort(key=lambda x: (-x[1], x[0])) return {'freqdist': fd} diff --git a/tests/test_worker_freqdist.py b/tests/test_worker_freqdist.py index c23f280..f4613dc 100644 --- a/tests/test_worker_freqdist.py +++ b/tests/test_worker_freqdist.py @@ -16,24 +16,18 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +from unittest import TestCase + from pypln.backend.workers import FreqDist -from .utils import TaskTest -class TestFreqDistWorker(TaskTest): - def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self): +class TestFreqDistWorker(TestCase): + def test_freqdist_should_be_a_list_of_tuples_with_frequency_distribution(self): tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', 'yellow', '.'] - expected_fd = [['is', 2], ['the', 2], ['blue', 1], ['sun', 1], - ['sky', 1], [',', 1], ['yellow', 1], ['.', 1]] - - - # This is just preparing the expected input in the database - doc_id = self.collection.insert({'tokens': tokens}, w=1) - - FreqDist().delay(doc_id) - - resulting_fd = self.collection.find_one({'_id': doc_id})['freqdist'] + expected_fd = [('is', 2), ('the', 2), (',', 1), ('.', 1), ('blue', 1), + ('sky', 1), ('sun', 1), ('yellow', 1)] + resulting_fd = FreqDist().process({'tokens': tokens})['freqdist'] self.assertEqual(resulting_fd, expected_fd) From 05594a15433b927d0ad6359918eec32c98048884 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 20:30:11 -0200 Subject: [PATCH 16/33] fix spellchecker tests --- pypln/backend/workers/spellchecker.py | 15 +++++------ tests/test_worker_spellchecker.py | 38 +++++++++++---------------- 2 files changed, 23 insertions(+), 30 deletions(-) diff --git a/pypln/backend/workers/spellchecker.py b/pypln/backend/workers/spellchecker.py index 4a6afb8..2e16185 100644 --- a/pypln/backend/workers/spellchecker.py +++ b/pypln/backend/workers/spellchecker.py @@ -20,22 +20,22 @@ import enchant from enchant.checker import SpellChecker + from pypln.backend.celery_task import PyPLNTask + class SpellingChecker(PyPLNTask): """ This worker performs spellchecking in the plain text of a document """ def __init__(self): - # This method is only called once per process, but that is no problem - # since the enchant languange list should not change. Don't use this - # method for anything that should run every time the task is called. - # See http://docs.celeryproject.org/en/latest/userguide/tasks.html#instantiation - # for more information. 
- self.checkers = {lang: SpellChecker(lang) for lang in enchant.list_languages()} + # This method is only called once per process + self.checkers = {lang: SpellChecker(lang) + for lang in enchant.list_languages()} def process(self, document): - #TODO: this worker may be enhanced by also checking the errors against an specific vocabulary supplied with the document + #TODO: this worker may be enhanced by also checking the errors against + # an specific vocabulary supplied with the document try: checker = self.checkers[document['language']] checker.set_text(document['text']) @@ -44,4 +44,3 @@ def process(self, document): errors = None return {'spelling_errors': errors} - diff --git a/tests/test_worker_spellchecker.py b/tests/test_worker_spellchecker.py index 09e07f6..746cb5d 100644 --- a/tests/test_worker_spellchecker.py +++ b/tests/test_worker_spellchecker.py @@ -18,34 +18,28 @@ # along with PyPLN. If not, see . import os -from textwrap import dedent -from pypln.backend.workers import spellchecker -from .utils import TaskTest +from unittest import TestCase + +from pypln.backend.workers.spellchecker import SpellingChecker DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) -class TestSpellcheckerWorker(TaskTest): - def test_spellchek_pt(self): +class TestSpellcheckerWorker(TestCase): + def test_spellcheck_pt(self): text = "Meu cachoro é um pastor" - doc_id = self.collection.insert({'text': text, 'language': 'pt_BR'}, - w=1) - spellchecker.SpellingChecker().delay(doc_id) + result = SpellingChecker().process({'text': text, 'language': 'pt_BR'}) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(len(refreshed_document['spelling_errors']), 1) - self.assertIn('cachoro', refreshed_document['spelling_errors'][0]) - self.assertIn('cachorro', refreshed_document['spelling_errors'][0][2]) - self.assertEqual(refreshed_document['spelling_errors'][0][1], 4) + self.assertEqual(len(result['spelling_errors']), 1) + self.assertIn('cachoro', result['spelling_errors'][0]) + self.assertIn('cachorro', result['spelling_errors'][0][2]) + self.assertEqual(result['spelling_errors'][0][1], 4) - def test_spellchek_en(self): + def test_spellcheck_en(self): text = "The cat bit the doggyo" - doc_id = self.collection.insert({'text': text, 'language': 'en'}, w=1) - spellchecker.SpellingChecker().delay(doc_id) - - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(len(refreshed_document['spelling_errors']), 1) - self.assertIn('doggyo', refreshed_document['spelling_errors'][0]) - self.assertIn('doggy', refreshed_document['spelling_errors'][0][2]) - self.assertEqual(refreshed_document['spelling_errors'][0][1], 16) + result = SpellingChecker().process({'text': text, 'language': 'en'}) + self.assertEqual(len(result['spelling_errors']), 1) + self.assertIn('doggyo', result['spelling_errors'][0]) + self.assertIn('doggy', result['spelling_errors'][0][2]) + self.assertEqual(result['spelling_errors'][0][1], 16) From 7b31c9853b9bb508b08573f568a096b6b4b3cd5d Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 20:45:45 -0200 Subject: [PATCH 17/33] spellchecker: warn if dictionary is missing --- pypln/backend/workers/spellchecker.py | 26 +++++++++++++++++++------- tests/test_worker_spellchecker.py | 11 +++++++++-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/pypln/backend/workers/spellchecker.py b/pypln/backend/workers/spellchecker.py index 2e16185..e6bc93b 100644 --- a/pypln/backend/workers/spellchecker.py +++ 
b/pypln/backend/workers/spellchecker.py @@ -16,7 +16,7 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . - +import warnings import enchant from enchant.checker import SpellChecker @@ -24,6 +24,10 @@ from pypln.backend.celery_task import PyPLNTask +class MissingDictionaryWarning(RuntimeWarning): + pass + + class SpellingChecker(PyPLNTask): """ This worker performs spellchecking in the plain text of a document @@ -34,13 +38,21 @@ def __init__(self): for lang in enchant.list_languages()} def process(self, document): - #TODO: this worker may be enhanced by also checking the errors against + # TODO: this worker may be enhanced by also checking the errors against # an specific vocabulary supplied with the document - try: - checker = self.checkers[document['language']] - checker.set_text(document['text']) - errors = [[e.word, e.wordpos, e.suggest()] for e in checker] - except KeyError: + checker = self.checkers.get(document['language']) + if checker is None: + # Maybe this should be an exception instead + warnings.warn('%s dictionary missing. If running on linux, ' + 'install the corresponding myspell package' + % document['language'], + MissingDictionaryWarning) errors = None + else: + try: + checker.set_text(document['text']) + errors = [[e.word, e.wordpos, e.suggest()] for e in checker] + except KeyError: + errors = None return {'spelling_errors': errors} diff --git a/tests/test_worker_spellchecker.py b/tests/test_worker_spellchecker.py index 746cb5d..6200f73 100644 --- a/tests/test_worker_spellchecker.py +++ b/tests/test_worker_spellchecker.py @@ -18,9 +18,10 @@ # along with PyPLN. If not, see . import os -from unittest import TestCase +from unittest import TestCase, mock -from pypln.backend.workers.spellchecker import SpellingChecker +from pypln.backend.workers.spellchecker import (SpellingChecker, + MissingDictionaryWarning) DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) @@ -43,3 +44,9 @@ def test_spellcheck_en(self): self.assertIn('doggyo', result['spelling_errors'][0]) self.assertIn('doggy', result['spelling_errors'][0][2]) self.assertEqual(result['spelling_errors'][0][1], 16) + + @mock.patch('warnings.warn') + def test_warns_about_missing_dictionary(self, warn_mock): + SpellingChecker().process({'text': '', + 'language': 'missing_language'}) + warn_mock.assert_called_with(mock.ANY, MissingDictionaryWarning) From 00cce60d7b325ca94b2dc5ed4a769dc238cb2ec5 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 21:04:18 -0200 Subject: [PATCH 18/33] fix test_unknown_mimetype_should_be_flagged test --- tests/test_worker_extractor.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index a3b8f14..b4f39c7 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -25,6 +25,7 @@ DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) + class TestExtractorWorker(TaskTest): def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" 
@@ -200,16 +201,13 @@ def test_unknown_mimetype_should_be_flagged(self): filename = os.path.join(DATA_DIR, 'random_file') # we can't put the expected text content here, so we'll just make sure # it's equal to the input content, since - contents = open(filename).read() - data = {'filename': filename, - 'contents': base64.b64encode(contents)} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['mimetype'], 'unknown') - self.assertEqual(refreshed_document['text'], "") - self.assertEqual(refreshed_document['language'], "") - self.assertEqual(refreshed_document['file_metadata'], {}) + contents = open(filename, 'rb').read() + result = Extractor().process({'filename': filename, + 'contents': base64.b64encode(contents)}) + self.assertEqual(result['mimetype'], 'unknown') + self.assertEqual(result['text'], "") + self.assertEqual(result['language'], "") + self.assertEqual(result['file_metadata'], {}) def test_unknown_encoding_should_be_ignored(self): filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt') From afaaa0bfec59103127c1d4d0e976c5325677711f Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 21:22:50 -0200 Subject: [PATCH 19/33] Update TestExtractorWorker.test_unknown_encoding_should_be_ignored This test is still not passing because cld is throwing an exception --- tests/test_worker_extractor.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index b4f39c7..ec555a6 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -211,12 +211,11 @@ def test_unknown_mimetype_should_be_flagged(self): def test_unknown_encoding_should_be_ignored(self): filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt') - expected = "This file has a weird byte (\x96) that makes it impossible for libmagic to recognize it's encoding." + expected = "This file has a weird byte (\x96) that makes it " \ + "impossible for libmagic to recognize it's encoding." 
data = {'filename': filename, - 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(refreshed_document['file_metadata'], {}) - self.assertEqual(refreshed_document['language'], 'en') + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) + self.assertEqual(result['file_metadata'], {}) + self.assertEqual(result['language'], 'en') From 427da7d51fc073494c4db185f0f273752f768629 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 21:28:52 -0200 Subject: [PATCH 20/33] fix TestExtractorWorker.test_unescape_html_entities --- tests/test_worker_extractor.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index ec555a6..b56dd81 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -170,11 +170,9 @@ def test_unescape_html_entities(self): " sure it also has non ascii chars.") filename = os.path.join(DATA_DIR, 'test_html_entities.txt') data = {'filename': filename, - 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) def test_should_detect_encoding_and_return_a_unicode_object(self): expected = "Flávio" From 2c0f8e8cd031c45053f8c31c5a1e752486171ce6 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 21:30:40 -0200 Subject: [PATCH 21/33] fix TestExtractorWorker.test_should_detect_encoding_and_return_a_unicode_object --- tests/test_worker_extractor.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index b56dd81..117ed8c 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -178,12 +178,10 @@ def test_should_detect_encoding_and_return_a_unicode_object(self): expected = "Flávio" filename = os.path.join(DATA_DIR, 'test_iso-8859-1.txt') data = {'filename': filename, - 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(type(refreshed_document['text']), str) + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) + self.assertEqual(type(result['text']), str) def test_should_guess_mimetype_for_file_without_extension(self): contents = "This is a test file. I'm testing PyPLN extractor worker!" 
From 69899361d9c59e4e475ff36815afafce0299c720 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 25 Nov 2016 21:32:51 -0200 Subject: [PATCH 22/33] fix TestExtractorWorker.test_should_guess_mimetype_for_file_without_extension --- tests/test_worker_extractor.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index 117ed8c..f781f68 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -187,11 +187,9 @@ def test_should_guess_mimetype_for_file_without_extension(self): contents = "This is a test file. I'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'text_file') data = {'filename': filename, - 'contents': base64.b64encode(contents)} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['mimetype'], 'text/plain') + 'contents': base64.b64encode(contents.encode('utf-8'))} + result = Extractor().process(data) + self.assertEqual(result['mimetype'], 'text/plain') def test_unknown_mimetype_should_be_flagged(self): filename = os.path.join(DATA_DIR, 'random_file') From 17e47cb9ff0d1859ca86d272e9fc32fa4c206d5b Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Sat, 26 Nov 2016 13:07:21 -0200 Subject: [PATCH 23/33] updated more extractor tests --- tests/test_worker_extractor.py | 88 +++++++++++++++------------------- 1 file changed, 38 insertions(+), 50 deletions(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index f781f68..bc61127 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -20,23 +20,23 @@ import base64 import os from textwrap import dedent +from unittest import TestCase + from pypln.backend.workers import Extractor -from .utils import TaskTest DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) -class TestExtractorWorker(TaskTest): +class TestExtractorWorker(TestCase): def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'test.txt') - doc_id = self.collection.insert({'filename': filename, - 'contents': base64.b64encode(open(filename).read())}, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(refreshed_document['file_metadata'], {}) - self.assertEqual(refreshed_document['mimetype'], 'text/plain') + data = {'filename': filename, + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) + self.assertEqual(result['file_metadata'], {}) + self.assertEqual(result['mimetype'], 'text/plain') def test_extraction_from_html_file(self): expected = "This is a test file. I'm testing PyPLN extractor worker!" @@ -47,23 +47,19 @@ def test_extraction_from_html_file(self): # wasn't a problem before because with mongodict we used to keep a # pickled representation of the data. 
data = {'filename': filename, - 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(refreshed_document['file_metadata'], {}) - self.assertEqual(refreshed_document['mimetype'], 'text/html') + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) + self.assertEqual(result['file_metadata'], {}) + self.assertEqual(result['mimetype'], 'text/html') def test_extraction_from_pdf_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'test.pdf') data = {'filename': filename, - 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) + 'contents': base64.b64encode(open(filename, 'rb').read())} + result = Extractor().process(data) + self.assertEqual(result['text'], expected) # Check that the expected metadata is a subset of what # our Extractor found (it may have found more details # depending on the toolset used to extract metadata) @@ -80,14 +76,14 @@ def test_extraction_from_pdf_file(self): 'PDF version': '1.4', } metadata_expected_set = set(metadata_expected.items()) - metadata = refreshed_document['file_metadata'] + metadata = result['file_metadata'] metadata_set = set(metadata.items()) diff_set = metadata_expected_set - metadata_set self.assertTrue(metadata_expected_set.issubset(metadata_set), ("Extracted metadata is not a subset of the expected metadata. " "Items missing or with different values: {}").format( ", ".join(str(item) for item in diff_set))) - self.assertEqual(refreshed_document['mimetype'], 'application/pdf') + self.assertEqual(result['mimetype'], 'application/pdf') def test_extraction_from_html(self): contents = dedent(''' @@ -114,9 +110,8 @@ def test_extraction_from_html(self): ''') data = {'filename': 'test.html', - 'contents': base64.b64encode(contents)} - doc_id = self.collection.insert(data, w=1) - Extractor().delay(doc_id) + 'contents': base64.b64encode(contents.encode('utf-8'))} + result = Extractor().process(data) expected = dedent(''' Testing @@ -134,36 +129,29 @@ def test_extraction_from_html(self): bla1 bla2''').strip() - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['text'], expected) - self.assertEqual(refreshed_document['mimetype'], 'text/html') + self.assertEqual(result['text'], expected) + self.assertEqual(result['mimetype'], 'text/html') def test_language_detection_pt(self): - text_pt = 'Esse texto foi escrito por Álvaro em Português.' - data_pt = {'filename': 'text-pt.txt', - 'contents': base64.b64encode(text_pt)} - doc_id = self.collection.insert(data_pt, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['language'], 'pt') + text = 'Esse texto foi escrito por Álvaro em Português.' + data = {'filename': 'text-pt.txt', + 'contents': base64.b64encode(text.encode('utf-8'))} + result = Extractor().process(data) + self.assertEqual(result['language'], 'pt') def test_language_detection_es(self): - text_es = 'Este texto ha sido escrito en Español por Álvaro.' 
- data_es = {'filename': 'text-es.txt', - 'contents': base64.b64encode(text_es)} - doc_id = self.collection.insert(data_es, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['language'], 'es') + text = 'Este texto ha sido escrito en Español por Álvaro.' + data = {'filename': 'text-es.txt', + 'contents': base64.b64encode(text.encode('utf-8'))} + result = Extractor().process(data) + self.assertEqual(result['language'], 'es') def test_language_detection_en(self): - text_en = 'This text was written by Álvaro in English.' - data_en = {'filename': 'text-en.txt', - 'contents': base64.b64encode(text_en)} - doc_id = self.collection.insert(data_en, w=1) - Extractor().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(refreshed_document['language'], 'en') + text = 'This text was written by Álvaro in English.' + data = {'filename': 'text-en.txt', + 'contents': base64.b64encode(text.encode('utf-8'))} + result = Extractor().process(data) + self.assertEqual(result['language'], 'en') def test_unescape_html_entities(self): expected = ("This text has html . Álvaro asked me to make" From 4eb5f613c00507f672d59cf3e504b132f32649ee Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Sat, 26 Nov 2016 13:33:32 -0200 Subject: [PATCH 24/33] fix extractor.extract_pdf --- pypln/backend/workers/extractor.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 2a864e6..c0cc9aa 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -87,7 +87,7 @@ def parse_html(html, remove_tags=None, remove_inside=None, result = ''.join(sum(list(zip(content_between, complete_tags)), tuple())) return clean(result) -def get_pdf_metadata(data): +def get_pdf_metadata(data: str) -> dict: lines = data.strip().splitlines() metadata = {} for line in lines: @@ -98,7 +98,7 @@ def get_pdf_metadata(data): metadata[key.strip()] = value.strip() return metadata -def extract_pdf(data): +def extract_pdf(data: bytes) -> (str, dict): temp = NamedTemporaryFile(delete=False) filename = temp.name temp.close() @@ -112,14 +112,16 @@ def extract_pdf(data): unlink(filename + '_ind.html') unlink(filename + 's.html') text = parse_html(html.replace(' ', ' '), True, ['script', 'style']) - pdfinfo = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE, - stderr=PIPE) - meta_out, meta_err = pdfinfo.communicate(input=data) + + info_process = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE, + stderr=PIPE) + meta_out, meta_err = info_process.communicate(input=data) try: - metadata = get_pdf_metadata(meta_out) - except: + metadata = get_pdf_metadata(meta_out.decode('utf-8')) + except Exception: + # TODO: what should I do here? metadata = {} - #TODO: what should I do here? + if not (text and metadata): return '', {} elif not html_err: @@ -128,7 +130,7 @@ def extract_pdf(data): return '', {} -def trial_decode(text): +def trial_decode(text: bytes) -> str: """ Tries to detect text encoding using `magic`. 
If the detected encoding is not supported, try utf-8, iso-8859-1 and ultimately falls back to decoding @@ -173,6 +175,7 @@ def process(self, file_data): contents = base64.b64decode(file_data['contents']) with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m: file_mime_type = m.id_buffer(contents) + metadata = {} if file_mime_type == 'text/plain': text = contents @@ -191,7 +194,9 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - text, forced_decoding = trial_decode(text) + forced_decoding = False + if isinstance(text, bytes): + text, forced_decoding = trial_decode(text) if isinstance(text, str): # HTMLParser only handles unicode objects. We can't pass the text From 24c266fe8bfd479ef0f60b371831283943cdc42f Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Sat, 26 Nov 2016 23:41:51 -0200 Subject: [PATCH 25/33] Rewrite extractor.trial_decode and write tests for it Address a possible exception raised by Magic.id_buffer and remove the superfluous text.decode('utf-8', 'replace') call since decoding with the iso8859-1 codec will never raise a UnicodeDecodeError exception. --- pypln/backend/workers/extractor.py | 57 ++++++++++++------------------ tests/test_worker_extractor.py | 56 +++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 34 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index c0cc9aa..d9ed07c 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -130,41 +130,30 @@ def extract_pdf(data: bytes) -> (str, dict): return '', {} -def trial_decode(text: bytes) -> str: +def decode_text_bytes(text: bytes) -> str: """ - Tries to detect text encoding using `magic`. If the detected encoding is - not supported, try utf-8, iso-8859-1 and ultimately falls back to decoding - as utf-8 replacing invalid chars with `U+FFFD` (the replacement character). - - This is far from an ideal solution, but the extractor and the rest of the - pipeline need an unicode object. + Tries to detect text encoding using file magic. If that fails or the + detected encoding is not supported, tries using utf-8. If that doesn't work + tries using iso8859-1. """ - with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m: - content_encoding = m.id_buffer(text) - - forced_decoding = False try: - result = text.decode(content_encoding) - except LookupError: - # If the detected encoding is not supported, we try to decode it as - # utf-8. + with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m: + content_encoding = m.id_buffer(text) + except magic.MagicError: + pass # This can happen for instance if text is a single char + else: try: - result = text.decode('utf-8') - except UnicodeDecodeError: - # Is there a better way of doing this than nesting try/except - # blocks? This smells really bad. - try: - result = text.decode('iso-8859-1') - except UnicodeDecodeError: - # If neither utf-8 nor iso-885901 work are capable of handling - # this text, we just decode it using utf-8 and replace invalid - # chars with U+FFFD. - # Two somewhat arbitrary decisions were made here: use utf-8 - # and use 'replace' instead of 'ignore'. 
- result = text.decode('utf-8', 'replace') - forced_decoding = True - - return result, forced_decoding + return text.decode(content_encoding) + except LookupError: # The detected encoding is not supported + pass + + try: + result = text.decode('utf-8') + except UnicodeDecodeError: + # Decoding with iso8859-1 doesn't raise UnicodeDecodeError, so this is + # a last resort. + result = text.decode('iso8859-1') + return result class Extractor(PyPLNTask): @@ -194,9 +183,8 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - forced_decoding = False if isinstance(text, bytes): - text, forced_decoding = trial_decode(text) + text = decode_text_bytes(text) if isinstance(text, str): # HTMLParser only handles unicode objects. We can't pass the text @@ -213,5 +201,6 @@ def process(self, file_data): else: language = cld.detect(text)[1] + # TODO: check for uses of forced_decoding and remove them return {'text': text, 'file_metadata': metadata, 'language': language, - 'mimetype': file_mime_type, 'forced_decoding': forced_decoding} + 'mimetype': file_mime_type, 'forced_decoding': None} diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index bc61127..4c94349 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -21,12 +21,68 @@ import os from textwrap import dedent from unittest import TestCase +from unittest.mock import patch, Mock, MagicMock, call + +from magic import MagicError from pypln.backend.workers import Extractor +from pypln.backend.workers.extractor import decode_text_bytes DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) +class DecodeTextBytesTest(TestCase): + def setUp(self): + magic_mock = MagicMock() + magic_identifier = Mock() + self.id_buffer_mock = Mock(return_value='magic_codec') + magic_identifier.id_buffer = self.id_buffer_mock + magic_mock.return_value.__enter__.return_value = magic_identifier + self.magic_patcher = patch('magic.Magic', magic_mock) + + def test_ignores_magic_error(self): + self.id_buffer_mock.side_effect = MagicError() + text = Mock() + with self.magic_patcher: + result = decode_text_bytes(text) + self.assertEqual(result, text.decode.return_value) + self.assertEqual(text.decode.call_args_list, [call('utf-8')]) + + def test_tries_decoding_with_encoding_returned_by_magic(self): + text = Mock() + with self.magic_patcher: + result = decode_text_bytes(text) + self.assertEqual(result, text.decode.return_value) + self.assertEqual(text.decode.call_args_list, [call('magic_codec')]) + + def test_tries_decoding_as_utf8(self): + text = Mock() + text.decode.side_effect = [LookupError(), 'result'] + with self.magic_patcher: + result = decode_text_bytes(text) + self.assertEqual(result, 'result') + self.assertEqual(text.decode.call_args_list, + [call('magic_codec'), call('utf-8')]) + + def test_tries_iso8859_1_if_all_else_fails(self): + text = Mock() + + class FakeUnicodeDecodeError(UnicodeDecodeError): + def __init__(self): + pass + + text.decode.side_effect = [LookupError(), + FakeUnicodeDecodeError(), + 'result'] + with self.magic_patcher: + result = decode_text_bytes(text) + self.assertEqual(result, 'result') + self.assertEqual(text.decode.call_args_list, + [call('magic_codec'), + call('utf-8'), + call('iso8859-1')]) + + class TestExtractorWorker(TestCase): def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" 
From c08413276a189b9ae4a153d3e6ca7eb338131e7b Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Sun, 27 Nov 2016 00:15:18 -0200 Subject: [PATCH 26/33] extractor: convert text to string before calling parse_html --- pypln/backend/workers/extractor.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index d9ed07c..ce850b1 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -166,10 +166,10 @@ def process(self, file_data): file_mime_type = m.id_buffer(contents) metadata = {} - if file_mime_type == 'text/plain': - text = contents - elif file_mime_type == 'text/html': - text = parse_html(contents, True, ['script', 'style']) + if file_mime_type in ('text/plain', 'text/html'): + text = decode_text_bytes(contents) + if file_mime_type == 'text/html': + text = parse_html(text, True, ['script', 'style']) elif file_mime_type == 'application/pdf': text, metadata = extract_pdf(contents) else: @@ -183,9 +183,6 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - if isinstance(text, bytes): - text = decode_text_bytes(text) - if isinstance(text, str): # HTMLParser only handles unicode objects. We can't pass the text # through it if we don't know the encoding, and it's possible we From 8e67779d8250c0d056481d4ad59eb1b4cae9869e Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Sun, 27 Nov 2016 00:44:39 -0200 Subject: [PATCH 27/33] extractor: fix language detection --- pypln/backend/workers/extractor.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index ce850b1..505e6df 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -194,10 +194,17 @@ def process(self, file_data): text = clean(text) if isinstance(text, str): - language = cld.detect(text.encode('utf-8'))[1] + languages = cld.detect(text.encode('utf-8'))[2] else: - language = cld.detect(text)[1] + languages = cld.detect(text)[2] + + detected_language = None + if languages: + detected_language = languages[0][1] # TODO: check for uses of forced_decoding and remove them - return {'text': text, 'file_metadata': metadata, 'language': language, - 'mimetype': file_mime_type, 'forced_decoding': None} + return {'text': text, + 'file_metadata': metadata, + 'language': detected_language, + 'mimetype': file_mime_type, + 'forced_decoding': None} From 11c203c9ad245d1c34384d9756957b479ddbe105 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 2 Dec 2016 16:47:33 -0200 Subject: [PATCH 28/33] extractor: remove checks for text being a str, it will always be --- pypln/backend/workers/extractor.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 505e6df..626aa3a 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -18,13 +18,12 @@ # along with PyPLN. If not, see . 
import base64 +import html import shlex -from html.parser import HTMLParser from tempfile import NamedTemporaryFile from os import unlink from subprocess import Popen, PIPE -from mimetypes import guess_type from re import compile as regexp_compile, DOTALL, escape import pycld2 as cld @@ -183,21 +182,10 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - if isinstance(text, str): - # HTMLParser only handles unicode objects. We can't pass the text - # through it if we don't know the encoding, and it's possible we - # also shouldn't. There's no way of knowing if it's a badly encoded - # html or a binary blob that happens do have bytes that look liked - # html entities. - text = HTMLParser().unescape(text) - + text = html.unescape(text) text = clean(text) - if isinstance(text, str): - languages = cld.detect(text.encode('utf-8'))[2] - else: - languages = cld.detect(text)[2] - + languages = cld.detect(text.encode('utf-8'))[2] detected_language = None if languages: detected_language = languages[0][1] From c6b3296e9512a35dcf79c06503a4073e6cc3f560 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 2 Dec 2016 16:47:41 -0200 Subject: [PATCH 29/33] extractor: remove up to 1k bytes that cld says are invalid --- pypln/backend/workers/extractor.py | 40 ++++++++++++++---- tests/test_worker_extractor.py | 67 +++++++++++++++++++++++------- 2 files changed, 85 insertions(+), 22 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 626aa3a..eed8f33 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -45,6 +45,10 @@ '/h2', 'h3', '/h3', 'h4', '/h4', 'h5', '/h5', 'h6', '/h6', 'br', 'br/'] double_breakline = ['table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] +cld_error_re = regexp_compile('input contains invalid UTF-8 around byte ' + '(?P\d+) \(of \d+\)') +MAX_CLD_BYTES_TO_REMOVE = 1024 + def clean(text): text = regexp_spaces_start.sub(r'\1', text) @@ -155,6 +159,33 @@ def decode_text_bytes(text: bytes) -> str: return result +def detect_language(text: str) -> str: + # CLD seems to have an issue with some bytes that Python considers + # to be valid utf-8. Remove up to MAX_CLD_BYTES_TO_REMOVE of such + # "invalid" bytes + # TODO: alert the user somehow if we give up removing them + detected_language = None + text_bytes = text.encode('utf-8') + for i in range(MAX_CLD_BYTES_TO_REMOVE): + try: + languages = cld.detect(text_bytes)[2] + except cld.error as exc: + message = exc.args[0] if exc.args else '' + match = cld_error_re.match(message) + if match: + byte_index = int(match.group('index')) + text_bytes = (text_bytes[:byte_index] + + text_bytes[byte_index + 1:]) + else: + raise + else: + if languages: + detected_language = languages[0][1] + break + + return detected_language + + class Extractor(PyPLNTask): #TODO: need to verify some exceptions when trying to convert 'evil' PDFs #TODO: should 'replace_with' be '' when extracting from HTML? 
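
For reference, pycld2.detect() returns an (is_reliable, bytes_found,
details) tuple in which details lists (language_name, language_code,
percent, score) entries with the best match first; that is the structure
the indexing above relies on. A small usage sketch (the detected code for
this sentence is expected to be 'pt'):

    import pycld2 as cld

    text = 'Esse texto foi escrito por Álvaro em Português.'
    is_reliable, bytes_found, details = cld.detect(text.encode('utf-8'))
    language_code = details[0][1]   # e.g. 'pt' for this sentence
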
@@ -184,15 +215,8 @@ def process(self, file_data): text = html.unescape(text) text = clean(text) - - languages = cld.detect(text.encode('utf-8'))[2] - detected_language = None - if languages: - detected_language = languages[0][1] - - # TODO: check for uses of forced_decoding and remove them return {'text': text, 'file_metadata': metadata, - 'language': detected_language, + 'language': detect_language(text), 'mimetype': file_mime_type, 'forced_decoding': None} diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index 4c94349..e364546 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -24,11 +24,13 @@ from unittest.mock import patch, Mock, MagicMock, call from magic import MagicError +import pycld2 as cld -from pypln.backend.workers import Extractor -from pypln.backend.workers.extractor import decode_text_bytes +from pypln.backend.workers.extractor import (Extractor, decode_text_bytes, + detect_language) DATA_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')) +MODULE = 'pypln.backend.workers.extractor.' class DecodeTextBytesTest(TestCase): @@ -44,6 +46,7 @@ def test_ignores_magic_error(self): self.id_buffer_mock.side_effect = MagicError() text = Mock() with self.magic_patcher: + # noinspection PyTypeChecker result = decode_text_bytes(text) self.assertEqual(result, text.decode.return_value) self.assertEqual(text.decode.call_args_list, [call('utf-8')]) @@ -51,6 +54,7 @@ def test_ignores_magic_error(self): def test_tries_decoding_with_encoding_returned_by_magic(self): text = Mock() with self.magic_patcher: + # noinspection PyTypeChecker result = decode_text_bytes(text) self.assertEqual(result, text.decode.return_value) self.assertEqual(text.decode.call_args_list, [call('magic_codec')]) @@ -59,6 +63,7 @@ def test_tries_decoding_as_utf8(self): text = Mock() text.decode.side_effect = [LookupError(), 'result'] with self.magic_patcher: + # noinspection PyTypeChecker result = decode_text_bytes(text) self.assertEqual(result, 'result') self.assertEqual(text.decode.call_args_list, @@ -75,6 +80,7 @@ def __init__(self): FakeUnicodeDecodeError(), 'result'] with self.magic_patcher: + # noinspection PyTypeChecker result = decode_text_bytes(text) self.assertEqual(result, 'result') self.assertEqual(text.decode.call_args_list, @@ -83,6 +89,44 @@ def __init__(self): call('iso8859-1')]) +def get_cld_exc(index): + return cld.error('input contains invalid UTF-8 around byte %s (of 42)' + % index) + + +class DetectLanguageTest(TestCase): + def setUp(self): + self.cld_patcher = patch(MODULE + 'cld.detect', + return_value=(Mock(), Mock(), + [(Mock(), 'lang'), + (Mock(), 'other_lang')])) + self.cld_mock = self.cld_patcher.start() + + def tearDown(self): + self.cld_patcher.stop() + + def test_returns_most_likely_language(self): + self.assertEqual(detect_language('text'), 'lang') + + def test_removes_bytes_cld_considers_invalid(self): + self.cld_mock.side_effect = [get_cld_exc(0), + get_cld_exc(3), + self.cld_mock.return_value] + self.assertEqual(detect_language('012345'), 'lang') + self.assertEqual(self.cld_mock.call_args_list, + [call(b'012345'), call(b'12345'), call(b'1235')]) + + def test_removes_at_most_max_bytes_for_cld(self): + self.cld_mock.side_effect = [get_cld_exc(0)] * 4 + with patch(MODULE + 'MAX_CLD_BYTES_TO_REMOVE', 3): + self.assertIsNone(detect_language('012345')) + self.assertEqual(self.cld_mock.call_count, 3) + + def test_doesnt_silence_other_cld_errors(self): + self.cld_mock.side_effect = [get_cld_exc(0), cld.error('another 
error')] + self.assertRaises(cld.error, detect_language, 'text') + + class TestExtractorWorker(TestCase): def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" @@ -206,12 +250,12 @@ def test_language_detection_en(self): text = 'This text was written by Álvaro in English.' data = {'filename': 'text-en.txt', 'contents': base64.b64encode(text.encode('utf-8'))} - result = Extractor().process(data) + result = Extractor().process(data) self.assertEqual(result['language'], 'en') def test_unescape_html_entities(self): expected = ("This text has html . Álvaro asked me to make" - " sure it also has non ascii chars.") + " sure it also has non ascii chars.") filename = os.path.join(DATA_DIR, 'test_html_entities.txt') data = {'filename': filename, 'contents': base64.b64encode(open(filename, 'rb').read())} @@ -247,13 +291,8 @@ def test_unknown_mimetype_should_be_flagged(self): self.assertEqual(result['language'], "") self.assertEqual(result['file_metadata'], {}) - def test_unknown_encoding_should_be_ignored(self): - filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt') - expected = "This file has a weird byte (\x96) that makes it " \ - "impossible for libmagic to recognize it's encoding." - data = {'filename': filename, - 'contents': base64.b64encode(open(filename, 'rb').read())} - result = Extractor().process(data) - self.assertEqual(result['text'], expected) - self.assertEqual(result['file_metadata'], {}) - self.assertEqual(result['language'], 'en') + def test_calls_detect_language(self): + with patch(MODULE + 'detect_language') as detect_language_mock: + result = Extractor().process({'contents': base64.b64encode(b'ok')}) + self.assertEqual(result['language'], + detect_language_mock.return_value) From 25a8e54e0e106b536edba51633d50017029c0de5 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Fri, 2 Dec 2016 16:53:03 -0200 Subject: [PATCH 30/33] SpellingChecker: no need to check for KeyError from document keys --- pypln/backend/workers/spellchecker.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pypln/backend/workers/spellchecker.py b/pypln/backend/workers/spellchecker.py index e6bc93b..eeac5d3 100644 --- a/pypln/backend/workers/spellchecker.py +++ b/pypln/backend/workers/spellchecker.py @@ -49,10 +49,7 @@ def process(self, document): MissingDictionaryWarning) errors = None else: - try: - checker.set_text(document['text']) - errors = [[e.word, e.wordpos, e.suggest()] for e in checker] - except KeyError: - errors = None + checker.set_text(document['text']) + errors = [[e.word, e.wordpos, e.suggest()] for e in checker] return {'spelling_errors': errors} From 573a1117aa5acdb8a29d07008f2404b2932f6da9 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Tue, 6 Dec 2016 14:29:15 -0200 Subject: [PATCH 31/33] extractor: turn redundant tests into integration test --- tests/data/encoding_unknown_to_libmagic.txt | 1 - tests/test_worker_extractor.py | 48 ++++++--------------- 2 files changed, 14 insertions(+), 35 deletions(-) delete mode 100644 tests/data/encoding_unknown_to_libmagic.txt diff --git a/tests/data/encoding_unknown_to_libmagic.txt b/tests/data/encoding_unknown_to_libmagic.txt deleted file mode 100644 index 9fb69b2..0000000 --- a/tests/data/encoding_unknown_to_libmagic.txt +++ /dev/null @@ -1 +0,0 @@ -This file has a weird byte () that makes it impossible for libmagic to recognize it's encoding. 
diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index e364546..ac1df9e 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -96,35 +96,36 @@ def get_cld_exc(index): class DetectLanguageTest(TestCase): def setUp(self): - self.cld_patcher = patch(MODULE + 'cld.detect', - return_value=(Mock(), Mock(), - [(Mock(), 'lang'), - (Mock(), 'other_lang')])) - self.cld_mock = self.cld_patcher.start() + self.cld_mock = Mock(return_value=(Mock(), Mock(), + [(Mock(), 'lang'), + (Mock(), 'other_lang')])) + self.cld_patcher = patch(MODULE + 'cld.detect', self.cld_mock) - def tearDown(self): - self.cld_patcher.stop() - - def test_returns_most_likely_language(self): - self.assertEqual(detect_language('text'), 'lang') + def test_detects_portuguese(self): + """Sort of an integration test""" + text = 'Esse texto foi escrito por Álvaro em Português.' + self.assertEqual(detect_language(text), 'pt') def test_removes_bytes_cld_considers_invalid(self): self.cld_mock.side_effect = [get_cld_exc(0), get_cld_exc(3), self.cld_mock.return_value] - self.assertEqual(detect_language('012345'), 'lang') + with self.cld_patcher: + self.assertEqual(detect_language('012345'), 'lang') self.assertEqual(self.cld_mock.call_args_list, [call(b'012345'), call(b'12345'), call(b'1235')]) def test_removes_at_most_max_bytes_for_cld(self): self.cld_mock.side_effect = [get_cld_exc(0)] * 4 - with patch(MODULE + 'MAX_CLD_BYTES_TO_REMOVE', 3): + with patch(MODULE + 'MAX_CLD_BYTES_TO_REMOVE', 3),\ + self.cld_patcher: self.assertIsNone(detect_language('012345')) self.assertEqual(self.cld_mock.call_count, 3) def test_doesnt_silence_other_cld_errors(self): self.cld_mock.side_effect = [get_cld_exc(0), cld.error('another error')] - self.assertRaises(cld.error, detect_language, 'text') + with self.cld_patcher: + self.assertRaises(cld.error, detect_language, 'text') class TestExtractorWorker(TestCase): @@ -232,27 +233,6 @@ def test_extraction_from_html(self): self.assertEqual(result['text'], expected) self.assertEqual(result['mimetype'], 'text/html') - def test_language_detection_pt(self): - text = 'Esse texto foi escrito por Álvaro em Português.' - data = {'filename': 'text-pt.txt', - 'contents': base64.b64encode(text.encode('utf-8'))} - result = Extractor().process(data) - self.assertEqual(result['language'], 'pt') - - def test_language_detection_es(self): - text = 'Este texto ha sido escrito en Español por Álvaro.' - data = {'filename': 'text-es.txt', - 'contents': base64.b64encode(text.encode('utf-8'))} - result = Extractor().process(data) - self.assertEqual(result['language'], 'es') - - def test_language_detection_en(self): - text = 'This text was written by Álvaro in English.' - data = {'filename': 'text-en.txt', - 'contents': base64.b64encode(text.encode('utf-8'))} - result = Extractor().process(data) - self.assertEqual(result['language'], 'en') - def test_unescape_html_entities(self): expected = ("This text has html . 
Álvaro asked me to make" " sure it also has non ascii chars.") From 0265786bb08849dd82d1e2b60dc89f5773c3e296 Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Tue, 6 Dec 2016 19:26:38 -0200 Subject: [PATCH 32/33] extractor tests: support newer version of pdfinfo --- tests/test_worker_extractor.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index ac1df9e..0079f78 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -168,7 +168,6 @@ def test_extraction_from_pdf_file(self): 'Author': 'Álvaro Justen', 'Creator': 'Writer', 'Producer': 'LibreOffice 3.5', - 'CreationDate': 'Fri Jun 1 17:07:57 2012', 'Tagged': 'no', 'Pages': '1', 'Encrypted': 'no', @@ -178,6 +177,12 @@ def test_extraction_from_pdf_file(self): } metadata_expected_set = set(metadata_expected.items()) metadata = result['file_metadata'] + + # Newer versions of pdfinfo add the timezone to this field + self.assertIn(metadata['CreationDate'], + ['Fri Jun 1 17:07:57 2012', + 'Fri Jun 1 17:07:57 2012 BRT']) + metadata_set = set(metadata.items()) diff_set = metadata_expected_set - metadata_set self.assertTrue(metadata_expected_set.issubset(metadata_set), From 7b84defcd4a10cdcca9ff86b3827372ca099dada Mon Sep 17 00:00:00 2001 From: Luiz Geron Date: Tue, 31 Jan 2017 17:32:27 -0200 Subject: [PATCH 33/33] change bigram worker to return metric names and respect bigram order --- pypln/backend/workers/bigrams.py | 44 +++--- tests/test_worker_bigrams.py | 242 ++++++++++++++++++++++++++----- 2 files changed, 228 insertions(+), 58 deletions(-) diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index 034972d..c99cb95 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -16,33 +16,31 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . 
+from collections import OrderedDict -import nltk -from collections import defaultdict - -from nltk.collocations import BigramCollocationFinder +from nltk import BigramCollocationFinder, BigramAssocMeasures from pypln.backend.celery_task import PyPLNTask +METRICS = ['chi_sq', + 'dice', + 'jaccard', + 'likelihood_ratio', + 'mi_like', + 'phi_sq', + 'pmi', + 'poisson_stirling', + 'raw_freq', + 'student_t'] -class Bigrams(PyPLNTask): - """Create a NLTK bigram finder and return a table in JSON format""" +class Bigrams(PyPLNTask): def process(self, document): - #todo: support filtering by stopwords - bigram_measures = nltk.collocations.BigramAssocMeasures() - metrics = ['chi_sq', - 'dice', - 'jaccard', - 'likelihood_ratio', - 'mi_like', - 'phi_sq', - 'pmi', - 'poisson_stirling', - 'raw_freq', - 'student_t'] bigram_finder = BigramCollocationFinder.from_words(document['tokens']) - br = defaultdict(lambda :[]) - for m in metrics: - for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)): - br[res[0]].append(res[1]) - return {'metrics': metrics, 'bigram_rank': list(br.items())} + bigram_rankings = OrderedDict() + for metric_name in METRICS: + metric = getattr(BigramAssocMeasures, metric_name) + for ranking in bigram_finder.score_ngrams(metric): + bigram = ranking[0] + d = bigram_rankings.setdefault(bigram, {}) + d[metric_name] = ranking[1] + return {'bigram_rankings': list(bigram_rankings.items())} diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index 6adf67e..027a701 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -16,43 +16,215 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . - -import nltk +from unittest import TestCase from pypln.backend.workers.bigrams import Bigrams -from .utils import TaskTest - -bigram_measures = nltk.collocations.BigramAssocMeasures() - - -class TestBigramWorker(TaskTest): - def test_bigrams_should_return_correct_score(self): - # We need this list comprehension because we need to save the word list - # in mongo (thus, it needs to be json serializable). Also, a list is - # what will be available to the worker in real situations. 
- tokens = [w for w in - nltk.corpus.genesis.words('english-web.txt')] - - doc_id = self.collection.insert({'tokens': tokens}, w=1) - Bigrams().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - bigram_rank = refreshed_document['bigram_rank'] - result = bigram_rank[0][1][0] - # This is the value of the chi_sq measure for this bigram in this - # colocation - expected_chi_sq = 95.59393417173634 - self.assertEqual(result, expected_chi_sq) +TOKENS = ['Ao', 'verme', 'que', 'primeiro', 'roeu', 'as', 'frias', 'carnes', + 'do', 'meu', 'cadáver', 'dedico', 'como', 'saudosa', 'lembrança', + 'estas', 'Memórias', 'Póstumas', '.'] +RANKINGS = {'bigram_rankings': [(('Ao', 'verme'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('Memórias', 'Póstumas'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('Póstumas', '.'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('as', 'frias'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('cadáver', 'dedico'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('carnes', 'do'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('como', 'saudosa'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('dedico', 'como'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('do', 'meu'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('estas', 'Memórias'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 
0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('frias', 'carnes'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('lembrança', 'estas'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('meu', 'cadáver'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('primeiro', 'roeu'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('que', 'primeiro'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('roeu', 'as'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('saudosa', 'lembrança'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316}), + (('verme', 'que'), + {'chi_sq': 19.0, + 'dice': 1.0, + 'jaccard': 1.0, + 'likelihood_ratio': 7.835297924062801, + 'mi_like': 1.0, + 'phi_sq': 1.0, + 'pmi': 4.247927513443585, + 'poisson_stirling': 3.247927513443585, + 'raw_freq': 0.05263157894736842, + 'student_t': 0.9473684210526316})]} - def test_bigrams_could_contain_dollar_signs_and_dots(self): - tokens = ['$', '.'] - doc_id = self.collection.insert({'tokens': tokens}, w=1) - Bigrams().delay(doc_id) - refreshed_document = self.collection.find_one({'_id': doc_id}) - bigram_rank = refreshed_document['bigram_rank'] - result = bigram_rank[0][1][0] - # 2.0 is the value of the chi_sq measure for this bigram in this - # colocation - expected_chi_sq = 2.0 - self.assertEqual(result, expected_chi_sq) +class TestBigramWorker(TestCase): + def test_returns_bigram_rankings(self): + self.maxDiff = None + result = Bigrams().process({'tokens': TOKENS}) + self.assertEqual(result, RANKINGS)
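
As a usage illustration for the reworked Bigrams worker in the last patch (not part of the patch series itself): the worker now returns 'bigram_rankings' as a list of (bigram, {metric_name: score}) pairs, preserving the order in which bigrams first appear in the token stream. Below is a minimal sketch of sorting that structure by a single metric. The helper top_bigrams_by and the sample token list are hypothetical; calling Bigrams().process(...) directly mirrors the style of the new tests and assumes the pypln backend package is importable.

    # Sketch only: consumes the structure produced by the new Bigrams worker,
    # i.e. a list of (bigram, {metric_name: score}) pairs.
    from pypln.backend.workers.bigrams import Bigrams

    def top_bigrams_by(rankings, metric_name, n=5):
        # Sort bigrams by the chosen association metric, highest score first.
        return sorted(rankings, key=lambda item: item[1][metric_name],
                      reverse=True)[:n]

    # Hypothetical token list; in practice this comes from the tokenizer worker.
    tokens = ['o', 'rato', 'roeu', 'a', 'roupa', 'do', 'rei', 'de', 'Roma', '.']
    result = Bigrams().process({'tokens': tokens})  # same call style as the tests
    for bigram, scores in top_bigrams_by(result['bigram_rankings'], 'pmi'):
        print(bigram, scores['pmi'])

Sorting is left to consumers on purpose: since every metric is stored per bigram, a frontend can rank by whichever measure it needs without re-running the worker.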