Feature/port to py3 #184

Open · wants to merge 33 commits into base: develop

Commits (33)
702dbf1
Uses pycld2 instead of the (outdated) chrom[...]tector
flavioamieiro Nov 23, 2016
cb3d1d2
Removes pyparsing from requirements
flavioamieiro Nov 23, 2016
915efa7
fix cld import
geron Nov 23, 2016
0589592
prevent mongo from connecting at import time
geron Nov 23, 2016
0b4ccf6
run 2to3
geron Nov 24, 2016
e41028e
Removes redundant try/except block in urlparse import
flavioamieiro Nov 24, 2016
ccfb5d9
Pins celery version
flavioamieiro Nov 24, 2016
01a5fa6
Removes unnecessary cast to list that 2to3 inserted
flavioamieiro Nov 24, 2016
b16be95
Fixes test that expected str but received bytes
flavioamieiro Nov 24, 2016
21aa0a6
Adds test to make sure the 'process' method receives the expected data
flavioamieiro Nov 24, 2016
7d540d0
Fixes existing base task test
flavioamieiro Nov 24, 2016
aa4478a
Uses BytesIO instead of StringIO in wordcloud
flavioamieiro Nov 24, 2016
d311b74
Changes Wordcloud test not to touch the database
flavioamieiro Nov 24, 2016
65c07b1
Changes palavras_raw test to not touch the database
flavioamieiro Nov 24, 2016
9c8f952
Fix freqdist test and sorting
geron Nov 25, 2016
05594a1
fix spellchecker tests
geron Nov 25, 2016
7b31c98
spellchecker: warn if dictionary is missing
geron Nov 25, 2016
00cce60
fix test_unknown_mimetype_should_be_flagged test
geron Nov 25, 2016
afaaa0b
Update TestExtractorWorker.test_unknown_encoding_should_be_ignored
geron Nov 25, 2016
427da7d
fix TestExtractorWorker.test_unescape_html_entities
geron Nov 25, 2016
2c0f8e8
fix TestExtractorWorker.test_should_detect_encoding_and_return_a_unic…
geron Nov 25, 2016
6989936
fix TestExtractorWorker.test_should_guess_mimetype_for_file_without_e…
geron Nov 25, 2016
17e47cb
updated more extractor tests
geron Nov 26, 2016
4eb5f61
fix extractor.extract_pdf
geron Nov 26, 2016
24c266f
Rewrite extractor.trial_decode and write tests for it
geron Nov 27, 2016
c084132
extractor: convert text to string before calling parse_html
geron Nov 27, 2016
8e67779
extractor: fix language detection
geron Nov 27, 2016
11c203c
extractor: remove checks for text being a str, it will always be
geron Dec 2, 2016
c6b3296
extractor: remove up to 1k bytes that cld says are invalid
geron Dec 2, 2016
25a8e54
SpellingChecker: no need to check for KeyError from document keys
geron Dec 2, 2016
573a111
extractor: turn redundant tests into integration test
geron Dec 6, 2016
0265786
extractor tests: support newer version of pdfinfo
geron Dec 6, 2016
7b84def
change bigram worker to return metric names and respect bigram order
geron Jan 31, 2017
12 changes: 6 additions & 6 deletions doc/conf.py
@@ -46,8 +46,8 @@
master_doc = 'index'

# General information about the project.
project = u'PyPLN'
copyright = u'2011, Flávio Codeço Coelho'
project = 'PyPLN'
copyright = '2011, Flávio Codeço Coelho'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -187,8 +187,8 @@
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'PyPLN.tex', u'PyPLN Documentation',
u'Flávio Codeço Coelho', 'manual'),
('index', 'PyPLN.tex', 'PyPLN Documentation',
'Flávio Codeço Coelho', 'manual'),
]

# The name of an image file (relative to this directory) to place at the top of
@@ -220,6 +220,6 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'pypln', u'PyPLN Documentation',
[u'Flávio Codeço Coelho'], 1)
('index', 'pypln', 'PyPLN Documentation',
['Flávio Codeço Coelho'], 1)
]
2 changes: 1 addition & 1 deletion pypln/backend/celery_app.py
@@ -19,7 +19,7 @@

from celery import Celery
from kombu import Exchange, Queue
import config
from . import config

app = Celery('pypln_workers', backend='mongodb',
broker='amqp://', include=['pypln.backend.workers'])
2 changes: 1 addition & 1 deletion pypln/backend/celery_task.py
@@ -31,7 +31,7 @@
from pypln.backend import config


mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS)
mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS, _connect=False)
database = mongo_client[config.MONGODB_DBNAME]
document_collection = database[config.MONGODB_COLLECTION]

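Note: a minimal sketch of the lazy-connection behaviour this change aims for. The underscore-prefixed _connect keyword above targets pymongo 2.x; in pymongo 3.x the public spelling is connect=False, so treat the exact keyword as version-dependent and this snippet as an assumption:

import pymongo

# No socket is opened here; the client only connects when the first
# operation (find, insert, ...) actually runs.
client = pymongo.MongoClient('mongodb://localhost:27017', connect=False)
database = client['pypln']
document_collection = database['analysis']
# document_collection.find_one()  # the connection is established at this point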
12 changes: 4 additions & 8 deletions pypln/backend/config.py
@@ -1,16 +1,12 @@
import os
import urllib.parse

from decouple import config, Csv

try:
import urlparse
except ImportError:
import urllib.parse as urlparse

def parse_url(url):
urlparse.uses_netloc.append('mongodb')
urlparse.uses_netloc.append('celery')
url = urlparse.urlparse(url)
urllib.parse.uses_netloc.append('mongodb')
urllib.parse.uses_netloc.append('celery')
url = urllib.parse.urlparse(url)

path = url.path[1:]
path = path.split('?', 2)[0]
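Note: a small illustration of what the ported parse_url now relies on; the example URL and database name are made up, not taken from the PR:

import urllib.parse

# Mirror the scheme registration done in config.py, then let urlparse
# expose credentials, host, port and the database name from the path.
urllib.parse.uses_netloc.append('mongodb')
url = urllib.parse.urlparse('mongodb://user:secret@localhost:27017/pypln')
# url.username == 'user', url.hostname == 'localhost', url.port == 27017
# url.path[1:] == 'pypln'  (the database name)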
24 changes: 12 additions & 12 deletions pypln/backend/workers/__init__.py
@@ -17,18 +17,18 @@
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

from extractor import Extractor
from tokenizer import Tokenizer
from freqdist import FreqDist
from pos import POS
from statistics import Statistics
from bigrams import Bigrams
from palavras_raw import PalavrasRaw
from lemmatizer_pt import Lemmatizer
from palavras_noun_phrase import NounPhrase
from palavras_semantic_tagger import SemanticTagger
from word_cloud import WordCloud
from elastic_indexer import ElasticIndexer
from .extractor import Extractor
from .tokenizer import Tokenizer
from .freqdist import FreqDist
from .pos import POS
from .statistics import Statistics
from .bigrams import Bigrams
from .palavras_raw import PalavrasRaw
from .lemmatizer_pt import Lemmatizer
from .palavras_noun_phrase import NounPhrase
from .palavras_semantic_tagger import SemanticTagger
from .word_cloud import WordCloud
from .elastic_indexer import ElasticIndexer


__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics',
44 changes: 21 additions & 23 deletions pypln/backend/workers/bigrams.py
@@ -16,33 +16,31 @@
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
from collections import OrderedDict

import nltk
from collections import defaultdict

from nltk.collocations import BigramCollocationFinder
from nltk import BigramCollocationFinder, BigramAssocMeasures
from pypln.backend.celery_task import PyPLNTask

METRICS = ['chi_sq',
'dice',
'jaccard',
'likelihood_ratio',
'mi_like',
'phi_sq',
'pmi',
'poisson_stirling',
'raw_freq',
'student_t']

class Bigrams(PyPLNTask):
"""Create a NLTK bigram finder and return a table in JSON format"""

class Bigrams(PyPLNTask):
def process(self, document):
#todo: support filtering by stopwords
bigram_measures = nltk.collocations.BigramAssocMeasures()
metrics = ['chi_sq',
'dice',
'jaccard',
'likelihood_ratio',
'mi_like',
'phi_sq',
'pmi',
'poisson_stirling',
'raw_freq',
'student_t']
bigram_finder = BigramCollocationFinder.from_words(document['tokens'])
br = defaultdict(lambda :[])
for m in metrics:
for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)):
br[res[0]].append(res[1])
return {'metrics': metrics, 'bigram_rank': br.items()}
bigram_rankings = OrderedDict()
for metric_name in METRICS:
metric = getattr(BigramAssocMeasures, metric_name)
for ranking in bigram_finder.score_ngrams(metric):
bigram = ranking[0]
d = bigram_rankings.setdefault(bigram, {})
d[metric_name] = ranking[1]
return {'bigram_rankings': list(bigram_rankings.items())}
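Note: a rough usage sketch of the reworked worker's output shape, using a made-up token list; each bigram ends up mapped to one score per association metric, so per-metric rankings can be rebuilt by the caller:

from nltk import BigramCollocationFinder, BigramAssocMeasures

tokens = ['the', 'quick', 'fox', 'the', 'quick', 'dog']
finder = BigramCollocationFinder.from_words(tokens)

rankings = {}
for metric_name in ('raw_freq', 'pmi'):
    metric = getattr(BigramAssocMeasures, metric_name)
    for bigram, score in finder.score_ngrams(metric):
        rankings.setdefault(bigram, {})[metric_name] = score

# rankings[('the', 'quick')] -> {'raw_freq': 0.4, 'pmi': ...}
# ('the', 'quick') occurs twice among the five bigrams, hence raw_freq == 0.4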
138 changes: 74 additions & 64 deletions pypln/backend/workers/extractor.py
@@ -18,16 +18,15 @@
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

import base64
import html
import shlex

from HTMLParser import HTMLParser
from tempfile import NamedTemporaryFile
from os import unlink
from subprocess import Popen, PIPE
from mimetypes import guess_type
from re import compile as regexp_compile, DOTALL, escape

import cld
import pycld2 as cld
import magic

from pypln.backend.celery_task import PyPLNTask
@@ -46,6 +45,10 @@
'/h2', 'h3', '/h3', 'h4', '/h4', 'h5', '/h5', 'h6', '/h6',
'br', 'br/']
double_breakline = ['table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
cld_error_re = regexp_compile('input contains invalid UTF-8 around byte '
'(?P<index>\d+) \(of \d+\)')
MAX_CLD_BYTES_TO_REMOVE = 1024


def clean(text):
text = regexp_spaces_start.sub(r'\1', text)
@@ -84,10 +84,10 @@ def parse_html(html, remove_tags=None, remove_inside=None,
[''] * (total_to_remove - 2)
content_between[index + 1] = '\n'
complete_tags.append('')
result = ''.join(sum(zip(content_between, complete_tags), tuple()))
result = ''.join(sum(list(zip(content_between, complete_tags)), tuple()))
return clean(result)

def get_pdf_metadata(data):
def get_pdf_metadata(data: str) -> dict:
lines = data.strip().splitlines()
metadata = {}
for line in lines:
@@ -98,7 +101,7 @@ def get_pdf_metadata(data):
metadata[key.strip()] = value.strip()
return metadata

def extract_pdf(data):
def extract_pdf(data: bytes) -> (str, dict):
temp = NamedTemporaryFile(delete=False)
filename = temp.name
temp.close()
@@ -112,14 +115,16 @@ def extract_pdf(data):
unlink(filename + '_ind.html')
unlink(filename + 's.html')
text = parse_html(html.replace('&#160;', ' '), True, ['script', 'style'])
pdfinfo = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE,
stderr=PIPE)
meta_out, meta_err = pdfinfo.communicate(input=data)

info_process = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE,
stderr=PIPE)
meta_out, meta_err = info_process.communicate(input=data)
try:
metadata = get_pdf_metadata(meta_out)
except:
metadata = get_pdf_metadata(meta_out.decode('utf-8'))
except Exception:
# TODO: what should I do here?
metadata = {}
#TODO: what should I do here?

if not (text and metadata):
return '', {}
elif not html_err:
@@ -128,41 +133,57 @@
return '', {}


def trial_decode(text):
def decode_text_bytes(text: bytes) -> str:
"""
Tries to detect text encoding using `magic`. If the detected encoding is
not supported, try utf-8, iso-8859-1 and ultimately falls back to decoding
as utf-8 replacing invalid chars with `U+FFFD` (the replacement character).

This is far from an ideal solution, but the extractor and the rest of the
pipeline need an unicode object.
Tries to detect text encoding using file magic. If that fails or the
detected encoding is not supported, tries using utf-8. If that doesn't work
tries using iso8859-1.
"""
with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
content_encoding = m.id_buffer(text)
try:
with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
content_encoding = m.id_buffer(text)
except magic.MagicError:
pass # This can happen for instance if text is a single char
else:
try:
return text.decode(content_encoding)
except LookupError: # The detected encoding is not supported
pass

forced_decoding = False
try:
result = text.decode(content_encoding)
except LookupError:
# If the detected encoding is not supported, we try to decode it as
# utf-8.
result = text.decode('utf-8')
except UnicodeDecodeError:
# Decoding with iso8859-1 doesn't raise UnicodeDecodeError, so this is
# a last resort.
result = text.decode('iso8859-1')
return result


def detect_language(text: str) -> str:
# CLD seems to have an issue with some bytes that Python considers
# to be valid utf-8. Remove up to MAX_CLD_BYTES_TO_REMOVE of such
# "invalid" bytes
# TODO: alert the user somehow if we give up removing them
detected_language = None
text_bytes = text.encode('utf-8')
for i in range(MAX_CLD_BYTES_TO_REMOVE):
try:
result = text.decode('utf-8')
except UnicodeDecodeError:
# Is there a better way of doing this than nesting try/except
# blocks? This smells really bad.
try:
result = text.decode('iso-8859-1')
except UnicodeDecodeError:
# If neither utf-8 nor iso-885901 work are capable of handling
# this text, we just decode it using utf-8 and replace invalid
# chars with U+FFFD.
# Two somewhat arbitrary decisions were made here: use utf-8
# and use 'replace' instead of 'ignore'.
result = text.decode('utf-8', 'replace')
forced_decoding = True

return result, forced_decoding
languages = cld.detect(text_bytes)[2]
except cld.error as exc:
message = exc.args[0] if exc.args else ''
match = cld_error_re.match(message)
if match:
byte_index = int(match.group('index'))
text_bytes = (text_bytes[:byte_index]
+ text_bytes[byte_index + 1:])
else:
raise
else:
if languages:
detected_language = languages[0][1]
break

return detected_language


class Extractor(PyPLNTask):
@@ -173,11 +194,12 @@ def process(self, file_data):
contents = base64.b64decode(file_data['contents'])
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
file_mime_type = m.id_buffer(contents)

metadata = {}
if file_mime_type == 'text/plain':
text = contents
elif file_mime_type == 'text/html':
text = parse_html(contents, True, ['script', 'style'])
if file_mime_type in ('text/plain', 'text/html'):
text = decode_text_bytes(contents)
if file_mime_type == 'text/html':
text = parse_html(text, True, ['script', 'style'])
elif file_mime_type == 'application/pdf':
text, metadata = extract_pdf(contents)
else:
@@ -191,22 +213,10 @@ def process(self, file_data):
return {'mimetype': 'unknown', 'text': "",
'file_metadata': {}, 'language': ""}

text, forced_decoding = trial_decode(text)

if isinstance(text, unicode):
# HTMLParser only handles unicode objects. We can't pass the text
# through it if we don't know the encoding, and it's possible we
# also shouldn't. There's no way of knowing if it's a badly encoded
# html or a binary blob that happens do have bytes that look liked
# html entities.
text = HTMLParser().unescape(text)

text = html.unescape(text)
text = clean(text)

if isinstance(text, unicode):
language = cld.detect(text.encode('utf-8'))[1]
else:
language = cld.detect(text)[1]

return {'text': text, 'file_metadata': metadata, 'language': language,
'mimetype': file_mime_type, 'forced_decoding': forced_decoding}
return {'text': text,
'file_metadata': metadata,
'language': detect_language(text),
'mimetype': file_mime_type,
'forced_decoding': None}
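Note: a condensed sketch of the decode-then-detect flow introduced above (the sample string is illustrative). decode_text_bytes falls back from the magic-detected encoding to utf-8 and finally to iso8859-1, and detect_language runs pycld2 on the re-encoded UTF-8 bytes:

import pycld2 as cld

data = 'Flávio Codeço Coelho'.encode('iso8859-1')  # not valid UTF-8
try:
    text = data.decode('utf-8')
except UnicodeDecodeError:
    text = data.decode('iso8859-1')  # last-resort fallback, as in decode_text_bytes

# pycld2 returns (is_reliable, bytes_found, details); details holds
# (language_name, language_code, percent, score) tuples.
is_reliable, bytes_found, details = cld.detect(text.encode('utf-8'))
language = details[0][1] if details else None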
4 changes: 2 additions & 2 deletions pypln/backend/workers/freqdist.py
@@ -27,7 +27,7 @@ def process(self, document):
tokens = [info.lower() for info in document_tokens]
frequency_distribution = {token: tokens.count(token) \
for token in set(tokens)}
fd = frequency_distribution.items()
fd.sort(lambda x, y: cmp(y[1], x[1]))
fd = list(frequency_distribution.items())
fd.sort(key=lambda x: (-x[1], x[0]))

return {'freqdist': fd}
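Note: a tiny worked example of the new sort key (sample tokens are made up); counts sort in descending order and ties now break alphabetically, which the old cmp-based sort left unspecified:

tokens = ['b', 'a', 'a', 'c', 'b']
frequency_distribution = {token: tokens.count(token) for token in set(tokens)}
fd = list(frequency_distribution.items())
fd.sort(key=lambda x: (-x[1], x[0]))
# fd == [('a', 2), ('b', 2), ('c', 1)]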
2 changes: 1 addition & 1 deletion pypln/backend/workers/palavras_noun_phrase.py
@@ -40,7 +40,7 @@ def process(self, document):
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
palavras_output = document['palavras_raw']
if isinstance(palavras_output, unicode):
if isinstance(palavras_output, str):
# we *need* to send a 'str' to the process. Otherwise it's going to try to use ascii.
palavras_output = palavras_output.encode('utf-8')
stdout, stderr = process.communicate(palavras_output)
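Note: a brief aside on the encode step above; Popen.communicate expects bytes when the pipes are in binary mode (the default), so str output from a previous worker is encoded as UTF-8 first. The command here is a placeholder, not the real palavras invocation:

import subprocess

palavras_output = 'São Paulo é grande'  # str produced by an earlier worker
process = subprocess.Popen(['cat'], stdin=subprocess.PIPE,
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate(palavras_output.encode('utf-8'))
text = stdout.decode('utf-8')  # bytes back to str for the rest of the pipeline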