diff --git a/doc/conf.py b/doc/conf.py
index a6be0e8..6403136 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -46,8 +46,8 @@
master_doc = 'index'
# General information about the project.
-project = u'PyPLN'
-copyright = u'2011, Flávio Codeço Coelho'
+project = 'PyPLN'
+copyright = '2011, Flávio Codeço Coelho'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
@@ -187,8 +187,8 @@
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
- ('index', 'PyPLN.tex', u'PyPLN Documentation',
- u'Flávio Codeço Coelho', 'manual'),
+ ('index', 'PyPLN.tex', 'PyPLN Documentation',
+ 'Flávio Codeço Coelho', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
@@ -220,6 +220,6 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
- ('index', 'pypln', u'PyPLN Documentation',
- [u'Flávio Codeço Coelho'], 1)
+ ('index', 'pypln', 'PyPLN Documentation',
+ ['Flávio Codeço Coelho'], 1)
]
diff --git a/pypln/backend/celery_app.py b/pypln/backend/celery_app.py
index 342c5be..895d9cd 100644
--- a/pypln/backend/celery_app.py
+++ b/pypln/backend/celery_app.py
@@ -19,7 +19,7 @@
from celery import Celery
from kombu import Exchange, Queue
-import config
+from . import config
app = Celery('pypln_workers', backend='mongodb',
broker='amqp://', include=['pypln.backend.workers'])
diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py
index 2d3d93d..0b1c235 100644
--- a/pypln/backend/celery_task.py
+++ b/pypln/backend/celery_task.py
@@ -31,7 +31,7 @@
from pypln.backend import config
-mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS)
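+# _connect=False skips connecting at import time; the connection is only
+# established on first use, inside the worker process that needs it.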
+mongo_client = pymongo.MongoClient(host=config.MONGODB_URIS, _connect=False)
database = mongo_client[config.MONGODB_DBNAME]
document_collection = database[config.MONGODB_COLLECTION]
diff --git a/pypln/backend/config.py b/pypln/backend/config.py
index ec1d48e..e5bd6d3 100644
--- a/pypln/backend/config.py
+++ b/pypln/backend/config.py
@@ -1,16 +1,12 @@
import os
+import urllib.parse
from decouple import config, Csv
-try:
- import urlparse
-except ImportError:
- import urllib.parse as urlparse
-
def parse_url(url):
- urlparse.uses_netloc.append('mongodb')
- urlparse.uses_netloc.append('celery')
- url = urlparse.urlparse(url)
+ urllib.parse.uses_netloc.append('mongodb')
+ urllib.parse.uses_netloc.append('celery')
+ url = urllib.parse.urlparse(url)
path = url.path[1:]
path = path.split('?', 2)[0]
diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py
index 0125bde..9ca1ec2 100644
--- a/pypln/backend/workers/__init__.py
+++ b/pypln/backend/workers/__init__.py
@@ -17,18 +17,18 @@
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-from extractor import Extractor
-from tokenizer import Tokenizer
-from freqdist import FreqDist
-from pos import POS
-from statistics import Statistics
-from bigrams import Bigrams
-from palavras_raw import PalavrasRaw
-from lemmatizer_pt import Lemmatizer
-from palavras_noun_phrase import NounPhrase
-from palavras_semantic_tagger import SemanticTagger
-from word_cloud import WordCloud
-from elastic_indexer import ElasticIndexer
+from .extractor import Extractor
+from .tokenizer import Tokenizer
+from .freqdist import FreqDist
+from .pos import POS
+from .statistics import Statistics
+from .bigrams import Bigrams
+from .palavras_raw import PalavrasRaw
+from .lemmatizer_pt import Lemmatizer
+from .palavras_noun_phrase import NounPhrase
+from .palavras_semantic_tagger import SemanticTagger
+from .word_cloud import WordCloud
+from .elastic_indexer import ElasticIndexer
__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics',
diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py
index 302482f..c99cb95 100644
--- a/pypln/backend/workers/bigrams.py
+++ b/pypln/backend/workers/bigrams.py
@@ -16,33 +16,31 @@
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+from collections import OrderedDict
-import nltk
-from collections import defaultdict
-
-from nltk.collocations import BigramCollocationFinder
+from nltk import BigramCollocationFinder, BigramAssocMeasures
from pypln.backend.celery_task import PyPLNTask
+METRICS = ['chi_sq',
+ 'dice',
+ 'jaccard',
+ 'likelihood_ratio',
+ 'mi_like',
+ 'phi_sq',
+ 'pmi',
+ 'poisson_stirling',
+ 'raw_freq',
+ 'student_t']
-class Bigrams(PyPLNTask):
- """Create a NLTK bigram finder and return a table in JSON format"""
+class Bigrams(PyPLNTask):
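+ """Build an NLTK bigram finder and score each bigram with all the
+ association measures listed in METRICS."""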
def process(self, document):
- #todo: support filtering by stopwords
- bigram_measures = nltk.collocations.BigramAssocMeasures()
- metrics = ['chi_sq',
- 'dice',
- 'jaccard',
- 'likelihood_ratio',
- 'mi_like',
- 'phi_sq',
- 'pmi',
- 'poisson_stirling',
- 'raw_freq',
- 'student_t']
bigram_finder = BigramCollocationFinder.from_words(document['tokens'])
- br = defaultdict(lambda :[])
- for m in metrics:
- for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)):
- br[res[0]].append(res[1])
- return {'metrics': metrics, 'bigram_rank': br.items()}
+ bigram_rankings = OrderedDict()
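+ # Group the scores per bigram: {bigram: {metric_name: score, ...}}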
+ for metric_name in METRICS:
+ metric = getattr(BigramAssocMeasures, metric_name)
+ for ranking in bigram_finder.score_ngrams(metric):
+ bigram = ranking[0]
+ d = bigram_rankings.setdefault(bigram, {})
+ d[metric_name] = ranking[1]
+ return {'bigram_rankings': list(bigram_rankings.items())}
diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py
index 110730b..eed8f33 100644
--- a/pypln/backend/workers/extractor.py
+++ b/pypln/backend/workers/extractor.py
@@ -18,16 +18,15 @@
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
import base64
+import html
import shlex
-from HTMLParser import HTMLParser
from tempfile import NamedTemporaryFile
from os import unlink
from subprocess import Popen, PIPE
-from mimetypes import guess_type
from re import compile as regexp_compile, DOTALL, escape
-import cld
+import pycld2 as cld
import magic
from pypln.backend.celery_task import PyPLNTask
@@ -46,6 +45,10 @@
'/h2', 'h3', '/h3', 'h4', '/h4', 'h5', '/h5', 'h6', '/h6',
'br', 'br/']
double_breakline = ['table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+cld_error_re = regexp_compile('input contains invalid UTF-8 around byte '
+ '(?P<index>\d+) \(of \d+\)')
+MAX_CLD_BYTES_TO_REMOVE = 1024
+
def clean(text):
text = regexp_spaces_start.sub(r'\1', text)
@@ -84,10 +87,10 @@ def parse_html(html, remove_tags=None, remove_inside=None,
[''] * (total_to_remove - 2)
content_between[index + 1] = '\n'
complete_tags.append('')
- result = ''.join(sum(zip(content_between, complete_tags), tuple()))
+ result = ''.join(sum(list(zip(content_between, complete_tags)), tuple()))
return clean(result)
-def get_pdf_metadata(data):
+def get_pdf_metadata(data: str) -> dict:
lines = data.strip().splitlines()
metadata = {}
for line in lines:
@@ -98,7 +101,7 @@ def get_pdf_metadata(data):
metadata[key.strip()] = value.strip()
return metadata
-def extract_pdf(data):
+def extract_pdf(data: bytes) -> (str, dict):
temp = NamedTemporaryFile(delete=False)
filename = temp.name
temp.close()
@@ -112,14 +115,16 @@ def extract_pdf(data):
unlink(filename + '_ind.html')
unlink(filename + 's.html')
text = parse_html(html.replace(' ', ' '), True, ['script', 'style'])
- pdfinfo = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE,
- stderr=PIPE)
- meta_out, meta_err = pdfinfo.communicate(input=data)
+
+ info_process = Popen(shlex.split('pdfinfo -'), stdin=PIPE, stdout=PIPE,
+ stderr=PIPE)
+ meta_out, meta_err = info_process.communicate(input=data)
try:
- metadata = get_pdf_metadata(meta_out)
- except:
+ metadata = get_pdf_metadata(meta_out.decode('utf-8'))
+ except Exception:
+ # TODO: what should I do here?
metadata = {}
- #TODO: what should I do here?
+
if not (text and metadata):
return '', {}
elif not html_err:
@@ -128,41 +133,57 @@ def extract_pdf(data):
return '', {}
-def trial_decode(text):
+def decode_text_bytes(text: bytes) -> str:
"""
- Tries to detect text encoding using `magic`. If the detected encoding is
- not supported, try utf-8, iso-8859-1 and ultimately falls back to decoding
- as utf-8 replacing invalid chars with `U+FFFD` (the replacement character).
-
- This is far from an ideal solution, but the extractor and the rest of the
- pipeline need an unicode object.
+ Tries to detect text encoding using file magic. If that fails or the
+ detected encoding is not supported, tries utf-8. If that doesn't work
+ either, falls back to iso8859-1.
"""
- with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
- content_encoding = m.id_buffer(text)
+ try:
+ with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
+ content_encoding = m.id_buffer(text)
+ except magic.MagicError:
+ pass # This can happen for instance if text is a single char
+ else:
+ try:
+ return text.decode(content_encoding)
+ except LookupError: # The detected encoding is not supported
+ pass
- forced_decoding = False
try:
- result = text.decode(content_encoding)
- except LookupError:
- # If the detected encoding is not supported, we try to decode it as
- # utf-8.
+ result = text.decode('utf-8')
+ except UnicodeDecodeError:
+ # iso8859-1 can decode any byte sequence, so it never raises
+ # UnicodeDecodeError; use it as the last resort.
+ result = text.decode('iso8859-1')
+ return result
+
+
+def detect_language(text: str) -> str:
+ # CLD seems to have an issue with some bytes that Python considers
+ # to be valid utf-8. Remove up to MAX_CLD_BYTES_TO_REMOVE such
+ # "invalid" bytes before giving up.
+ # TODO: alert the user somehow if we give up removing them
+ detected_language = None
+ text_bytes = text.encode('utf-8')
+ for i in range(MAX_CLD_BYTES_TO_REMOVE):
try:
- result = text.decode('utf-8')
- except UnicodeDecodeError:
- # Is there a better way of doing this than nesting try/except
- # blocks? This smells really bad.
- try:
- result = text.decode('iso-8859-1')
- except UnicodeDecodeError:
- # If neither utf-8 nor iso-885901 work are capable of handling
- # this text, we just decode it using utf-8 and replace invalid
- # chars with U+FFFD.
- # Two somewhat arbitrary decisions were made here: use utf-8
- # and use 'replace' instead of 'ignore'.
- result = text.decode('utf-8', 'replace')
- forced_decoding = True
-
- return result, forced_decoding
+ languages = cld.detect(text_bytes)[2]
+ except cld.error as exc:
+ message = exc.args[0] if exc.args else ''
+ match = cld_error_re.match(message)
+ if match:
+ byte_index = int(match.group('index'))
+ text_bytes = (text_bytes[:byte_index]
+ + text_bytes[byte_index + 1:])
+ else:
+ raise
+ else:
+ if languages:
+ detected_language = languages[0][1]
+ break
+
+ return detected_language
class Extractor(PyPLNTask):
@@ -173,11 +194,12 @@ def process(self, file_data):
contents = base64.b64decode(file_data['contents'])
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
file_mime_type = m.id_buffer(contents)
+
metadata = {}
- if file_mime_type == 'text/plain':
- text = contents
- elif file_mime_type == 'text/html':
- text = parse_html(contents, True, ['script', 'style'])
+ if file_mime_type in ('text/plain', 'text/html'):
+ text = decode_text_bytes(contents)
+ if file_mime_type == 'text/html':
+ text = parse_html(text, True, ['script', 'style'])
elif file_mime_type == 'application/pdf':
text, metadata = extract_pdf(contents)
else:
@@ -191,22 +213,10 @@ def process(self, file_data):
return {'mimetype': 'unknown', 'text': "",
'file_metadata': {}, 'language': ""}
- text, forced_decoding = trial_decode(text)
-
- if isinstance(text, unicode):
- # HTMLParser only handles unicode objects. We can't pass the text
- # through it if we don't know the encoding, and it's possible we
- # also shouldn't. There's no way of knowing if it's a badly encoded
- # html or a binary blob that happens do have bytes that look liked
- # html entities.
- text = HTMLParser().unescape(text)
-
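+ # By this point `text` is a str (the branches above decode any bytes),
+ # so HTML entities can be unescaped unconditionally.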
+ text = html.unescape(text)
text = clean(text)
-
- if isinstance(text, unicode):
- language = cld.detect(text.encode('utf-8'))[1]
- else:
- language = cld.detect(text)[1]
-
- return {'text': text, 'file_metadata': metadata, 'language': language,
- 'mimetype': file_mime_type, 'forced_decoding': forced_decoding}
+ return {'text': text,
+ 'file_metadata': metadata,
+ 'language': detect_language(text),
+ 'mimetype': file_mime_type,
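+ # always None now that decode_text_bytes() handles fallback decoding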
+ 'forced_decoding': None}
diff --git a/pypln/backend/workers/freqdist.py b/pypln/backend/workers/freqdist.py
index 7bb7f7e..bdf3712 100644
--- a/pypln/backend/workers/freqdist.py
+++ b/pypln/backend/workers/freqdist.py
@@ -27,7 +27,7 @@ def process(self, document):
tokens = [info.lower() for info in document_tokens]
frequency_distribution = {token: tokens.count(token) \
for token in set(tokens)}
- fd = frequency_distribution.items()
- fd.sort(lambda x, y: cmp(y[1], x[1]))
+ fd = list(frequency_distribution.items())
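+ # Sort by decreasing frequency, breaking ties alphabetically so the
+ # result is deterministic.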
+ fd.sort(key=lambda x: (-x[1], x[0]))
return {'freqdist': fd}
diff --git a/pypln/backend/workers/palavras_noun_phrase.py b/pypln/backend/workers/palavras_noun_phrase.py
index 76e3a18..f9dde80 100644
--- a/pypln/backend/workers/palavras_noun_phrase.py
+++ b/pypln/backend/workers/palavras_noun_phrase.py
@@ -40,7 +40,7 @@ def process(self, document):
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
palavras_output = document['palavras_raw']
- if isinstance(palavras_output, unicode):
+ if isinstance(palavras_output, str):
# we *need* to send a 'str' to the process. Otherwise it's going to try to use ascii.
palavras_output = palavras_output.encode('utf-8')
stdout, stderr = process.communicate(palavras_output)
diff --git a/pypln/backend/workers/palavras_raw.py b/pypln/backend/workers/palavras_raw.py
index 77e2d9a..95161ba 100644
--- a/pypln/backend/workers/palavras_raw.py
+++ b/pypln/backend/workers/palavras_raw.py
@@ -39,14 +39,15 @@ def process(self, document):
text = document['text']
- # For some reason, in some pypln installations the document['text'] is
- # not always unicode as it should be. This may be due to errors during
- # the decoding process that we fixed earlier. That meant that, when we
- # got a non-unicode string, python would try to decode it using the
- # default codec (ascii) in `text.encode(PALAVRAS_ENCODING)`. Since we
- # know the text came from mongodb, we can just decode it using utf-8 to
- # make sure we have a unicode object.
- if not isinstance(text, unicode):
+ # Under Python 2, document['text'] sometimes arrived as a byte
+ # string instead of the unicode object pymongo should return.
+ # With Python 3 we should always get str, but since the root
+ # cause of the original error was never identified, keep this
+ # safeguard. Any bytes object coming from MongoDB is known to
+ # be encoded in utf-8.
+ if isinstance(text, bytes):
text = text.decode('utf-8')
process = subprocess.Popen([BASE_PARSER, PARSER_MODE],
@@ -55,4 +56,4 @@ def process(self, document):
stderr=subprocess.PIPE)
stdout, stderr = process.communicate(text.encode(PALAVRAS_ENCODING))
- return {'palavras_raw': stdout, 'palavras_raw_ran': True}
+ return {'palavras_raw': stdout.decode('utf-8'), 'palavras_raw_ran': True}
diff --git a/pypln/backend/workers/palavras_semantic_tagger.py b/pypln/backend/workers/palavras_semantic_tagger.py
index 3f35ca5..a66d42c 100644
--- a/pypln/backend/workers/palavras_semantic_tagger.py
+++ b/pypln/backend/workers/palavras_semantic_tagger.py
@@ -26,381 +26,381 @@
{
'Animal':
{
- '': u'Animal, umbrella tag (clone, fêmea, fóssil, parasito, predador)' ,
- '': u'Group of animals (cardume, enxame, passarada, ninhada)',
- '': u'Domestic animal or big mammal (likely to have female forms etc.: terneiro, leão/leoa, cachorro)',
- '': u'Group of domestic animals (boiada)',
- '': u'Water-animal (tubarão, delfim)',
- '': u'Mythological animal (basilisco)',
- '': u'Land-animal (raposa)',
- '': u'Bird (águia, bem-te-vi)',
- '': u'Insect (borboleta)',
- '': u'Cell-animal (bacteria, blood cells: linfócito)',
+ '': 'Animal, umbrella tag (clone, fêmea, fóssil, parasito, predador)' ,
+ '': 'Group of animals (cardume, enxame, passarada, ninhada)',
+ '': 'Domestic animal or big mammal (likely to have female forms etc.: terneiro, leão/leoa, cachorro)',
+ '': 'Group of domestic animals (boiada)',
+ '': 'Water-animal (tubarão, delfim)',
+ '': 'Mythological animal (basilisco)',
+ '': 'Land-animal (raposa)',
+ '': 'Bird (águia, bem-te-vi)',
+ '': 'Insect (borboleta)',
+ '': 'Cell-animal (bacteria, blood cells: linfócito)',
},
'Plant':
{
- '': u'Plant, umbrella tag',
- '': u'Group of plants, plantation (field, forest etc.: mata, nabal)',
- '': u'Tree (oliveira, palmeira)',
- '': u'Flower (rosa, taraxaco)',
- '': u'Bush, shrub (rododendro, tamariz)',
- '': u'(fruit, berries, nuts: maçã, morango, avelã, melancia)',
- '': u'(vegetable espargo, funcho)',
+ '': 'Plant, umbrella tag',
+ '': 'Group of plants, plantation (field, forest etc.: mata, nabal)',
+ '': 'Tree (oliveira, palmeira)',
+ '': 'Flower (rosa, taraxaco)',
+ '': 'Bush, shrub (rododendro, tamariz)',
+ '': '(fruit, berries, nuts: maçã, morango, avelã, melancia)',
+ '': '(vegetable espargo, funcho)',
},
'Human':
{
- '': u'Human, umbrella tag',
- '': u'Group of humans (organisations, teams, companies, e.g. editora)',
- '': u'Attributive human umbrella tag (many -ista, -ante)',
- '': u'Human classified by biological criteria (race, age etc., caboclo, mestiço, bebé, adulto)',
- '': u'Human with family or other private relation (pai, noiva)',
- '': u'Ideological human (comunista, implies ), also: follower, disciple (dadaista)',
- '': u'Humanoid mythical (gods, fairy tale humanoids, curupira, duende)',
- '': u'Nationality human (brasileiro, alemão), also: inhabitant (lisboeta)',
- '': u'Professional human (marinheiro, implies ), also: sport, hobby (alpinista)',
- '': u'Sick human (few: asmático, diabético, cp )',
- '': u'Title noun (rei, senhora)',
+ '': 'Human, umbrella tag',
+ '': 'Group of humans (organisations, teams, companies, e.g. editora)',
+ '': 'Attributive human umbrella tag (many -ista, -ante)',
+ '': 'Human classified by biological criteria (race, age etc., caboclo, mestiço, bebé, adulto)',
+ '': 'Human with family or other private relation (pai, noiva)',
+ '': 'Ideological human (comunista, implies ), also: follower, disciple (dadaista)',
+ '': 'Humanoid mythical (gods, fairy tale humanoids, curupira, duende)',
+ '': 'Nationality human (brasileiro, alemão), also: inhabitant (lisboeta)',
+ '': 'Professional human (marinheiro, implies ), also: sport, hobby (alpinista)',
+ '': 'Sick human (few: asmático, diabético, cp )',
+ '': 'Title noun (rei, senhora)',
},
'Place and spatial':
{
- '': u'Place, umbrella tag',
- '': u'Abstract place (anverso. auge)',
- '': u'Civitas, town, country, county (equals + , cidade, país)',
- '': u'Cover, lid (colcha, lona, tampa)',
- '': u'Functional place, human built or human-used (aeroporto, anfiteatro, cp. for just a building)',
- '': u'opening, hole (apertura, fossa)',
- '': u'Path (road, street etc.: rua, pista)' ,
- '': u'Star object (planets, comets: planeta, quasar)',
- '': u'surface (face, verniz, cp. )',
- '': u'tip place, edge (pico, pontinha, cp. )',
- '': u'Geographical, natural place (promontório, pântano)',
- '': u'trap place (armadilha, armazelo)',
- '': u'Water place (river, lake, sea: fonte, foz, lagoa)',
- '': u'barrier noun (dique, limite, muralha)',
- '': u'(building)',
- '': u'(institution)',
- '': u'(picture)',
- '': u'(situation)',
- '': u'anatomical/body position (few: desaprumo)',
- '': u'social position, job (emprego, condado, capitania, presidência)',
+ '': 'Place, umbrella tag',
+ '': 'Abstract place (anverso. auge)',
+ '': 'Civitas, town, country, county (equals + , cidade, país)',
+ '': 'Cover, lid (colcha, lona, tampa)',
+ '': 'Functional place, human built or human-used (aeroporto, anfiteatro, cp. for just a building)',
+ '': 'opening, hole (apertura, fossa)',
+ '': 'Path (road, street etc.: rua, pista)' ,
+ '': 'Star object (planets, comets: planeta, quasar)',
+ '': 'surface (face, verniz, cp. )',
+ '': 'tip place, edge (pico, pontinha, cp. )',
+ '': 'Geographical, natural place (promontório, pântano)',
+ '': 'trap place (armadilha, armazelo)',
+ '': 'Water place (river, lake, sea: fonte, foz, lagoa)',
+ '': 'barrier noun (dique, limite, muralha)',
+ '': '(building)',
+ '': '(institution)',
+ '': '(picture)',
+ '': '(situation)',
+ '': 'anatomical/body position (few: desaprumo)',
+ '': 'social position, job (emprego, condado, capitania, presidência)',
},
'Vehicle':
{
- '': u'Vehicle, umbrella tag and ground vehicle (car, train: carro, comboio, tanque, teleférico)',
- '': u'Group of vehicles (armada, convoy: frota, esquadra)',
- '': u'Water vehicle (ship: navio, submersível, canoa)',
- '': u'Air vehicle (plane: hidroplano, jatinho)',
+ '': 'Vehicle, umbrella tag and ground vehicle (car, train: carro, comboio, tanque, teleférico)',
+ '': 'Group of vehicles (armada, convoy: frota, esquadra)',
+ '': 'Water vehicle (ship: navio, submersível, canoa)',
+ '': 'Air vehicle (plane: hidroplano, jatinho)',
},
'Abstract':
{
- '': u'Abstract countable, umbrella tag (alternativa, chance, lazer)',
- '': u'Category word (latinismo, número atômico)',
- '': u'sign, symbol (parêntese, semicolcheia)',
- '': u'Abstract mass/non-countable, umbrella tag (still contains many cases that could be , e.g. habilidade, legalidade)',
- '': u'Abstract/concept, neither countable nor mass (endogamia), cp. , etc.',
- '': u'(features)',
- '': u'direction noun (estibordo, contrasenso, norte)',
- '': u'(shapes)',
- '': u'meta noun (tipo, espécie)',
- '': u'(MARCA) brand',
- '': u'(DISCIPLINA) subject matter',
- '': u'(ESCOLA) school of thought',
- '': u'(IDEA) idea, concept',
- '': u'(PLANO) named plan, project',
- '': u'(OBRA) artist-s name, standing for body of work',
- '': u'(NOME)',
- '': u'(ESTADO) physiological state, in particular: disease',
+ '': 'Abstract countable, umbrella tag (alternativa, chance, lazer)',
+ '': 'Category word (latinismo, número atômico)',
+ '': 'sign, symbol (parêntese, semicolcheia)',
+ '': 'Abstract mass/non-countable, umbrella tag (still contains many cases that could be , e.g. habilidade, legalidade)',
+ '': 'Abstract/concept, neither countable nor mass (endogamia), cp. , etc.',
+ '': '(features)',
+ '': 'direction noun (estibordo, contrasenso, norte)',
+ '': '(shapes)',
+ '': 'meta noun (tipo, espécie)',
+ '': '(MARCA) brand',
+ '': '(DISCIPLINA) subject matter',
+ '': '(ESCOLA) school of thought',
+ '': '(IDEA) idea, concept',
+ '': '(PLANO) named plan, project',
+ '': '(OBRA) artist-s name, standing for body of work',
+ '': '(NOME)',
+ '': '(ESTADO) physiological state, in particular: disease',
},
'Concept':
{
- '': u'convention (social rule or law, lei, preceito)',
- '': u'subject matter, profession, cf. , anatomia, citricultura, dactilografia)',
- '': u'ideology or other value system (anarquismo, anti-ocidentalismo, apartheid)',
- '': u'',
- '': u'language (alemão, catalão, bengali)',
- '': u'',
- '': u'',
- '': u'therapy (also and , acupuntura, balneoterapia)',
+ '': 'convention (social rule or law, lei, preceito)',
+ '': 'subject matter, profession, cf. , anatomia, citricultura, dactilografia)',
+ '': 'ideology or other value system (anarquismo, anti-ocidentalismo, apartheid)',
+ '': '',
+ '': 'language (alemão, catalão, bengali)',
+ '': '',
+ '': '',
+ '': 'therapy (also and , acupuntura, balneoterapia)',
},
'Game':
{
- '': u'play, game (bilhar, ioiô, poker, also )',
+ '': 'play, game (bilhar, ioiô, poker, also )',
},
'Genre':
{
- '': u'genre (especially art genre, cf. , modernismo, tropicalismo)',
+ '': 'genre (especially art genre, cf. , modernismo, tropicalismo)',
},
'Quantity':
{
- '': u'',
- '': u'quantity noun (bocada, teor, sem-fim)',
- '': u'currency noun (countable, implies , cf. , dirham, euro, real, dólar)',
- '': u'amount of money (bolsa, custo, imposto, cf. )',
+ '': '',
+ '': 'quantity noun (bocada, teor, sem-fim)',
+ '': 'currency noun (countable, implies , cf. , dirham, euro, real, dólar)',
+ '': 'amount of money (bolsa, custo, imposto, cf. )',
},
'Action':
{
- '': u'Action umbrella tag (+CONTROL, PERFECTIVE)',
- '': u'beat-action (thrashing, pancada, surra)',
- '': u'do-action (typically dar/fazer + N, tentativa, teste, homenagem)',
- '': u'speech act or communicative act (proposta, ordem)',
- '': u'trick-action (cheat, fraud, ruse, jeito, fraude, similar to )',
- '': u'Activity, umbrella tag (+CONTROL, IMPERFECTIVE, correria, manejo)',
- '': u'',
- '': u'',
- '': u'',
- '': u'dance (both , and , calipso, flamenco, forró)',
- '': u'fight, conflict (also and +TEMP, briga, querela)',
- '': u'speech situation, talk, discussion, quarrel (implies and , entrevista, lero-lero)',
+ '': 'Action umbrella tag (+CONTROL, PERFECTIVE)',
+ '': 'beat-action (thrashing, pancada, surra)',
+ '': 'do-action (typically dar/fazer + N, tentativa, teste, homenagem)',
+ '': 'speech act or communicative act (proposta, ordem)',
+ '': 'trick-action (cheat, fraud, ruse, jeito, fraude, similar to )',
+ '': 'Activity, umbrella tag (+CONTROL, IMPERFECTIVE, correria, manejo)',
+ '': '',
+ '': '',
+ '': '',
+ '': 'dance (both , and , calipso, flamenco, forró)',
+ '': 'fight, conflict (also and +TEMP, briga, querela)',
+ '': 'speech situation, talk, discussion, quarrel (implies and , entrevista, lero-lero)',
},
'Anatomical':
{
- '': u'Anatomical noun, umbrella tag (carótida, clítoris, dorso)',
- '': u'Movable anatomy (arm, leg, braço, bíceps, cotovelo)',
- '': u'Organ (heart, liver, hipófise, coração, testículo)',
- '': u'Bone (calcâneo, fíbula, vértebra)',
- '': u'Animal anatomy (rúmen, carapaça, chifres, tromba)',
- '': u'Bird anatomy (bico, pluma)',
- '': u'Fish anatomy (few: bránquias, siba)',
- '': u'Insect anatomy (few: tentáculo, olho composto)',
- '': u'Plant anatomy (bulbo, caule, folha)',
- '': u'(human anatomical feature)',
+ '': 'Anatomical noun, umbrella tag (carótida, clítoris, dorso)',
+ '': 'Movable anatomy (arm, leg, braço, bíceps, cotovelo)',
+ '': 'Organ (heart, liver, hipófise, coração, testículo)',
+ '