empath-client

Ejhfast · Apr 16, 2016 · cf046b7 · cf046b7
commit cf046b7
Show file tree

Hide file tree

Showing 11 changed files with 248 additions and 0 deletions.
diff --git a/empath/__init__.py b/empath/__init__.py
@@ -0,0 +1 @@
+from .core import Empath
diff --git a/empath/__init__.pyc b/empath/__init__.pyc
diff --git a/empath/__pycache__/__init__.cpython-35.pyc b/empath/__pycache__/__init__.cpython-35.pyc
diff --git a/empath/__pycache__/core.cpython-35.pyc b/empath/__pycache__/core.cpython-35.pyc
diff --git a/empath/__pycache__/helpers.cpython-35.pyc b/empath/__pycache__/helpers.cpython-35.pyc
diff --git a/empath/core.py b/empath/core.py
@@ -0,0 +1,67 @@
+import os
+import sys
+from collections import defaultdict
+from . import helpers as util
+
+class Empath:
+    """Lexicon-based analyzer that counts category terms in a document.
+
+    ``cats`` maps each category name to its list of terms; ``invcats``
+    maps each term back to the categories that contain it.
+    """
+
+    def __init__(self, backend_url="http://localhost:8000"):
+        """Load the category lexicon from the package's data directory.
+
+        ``backend_url`` is accepted but not used anywhere in this class.
+        """
+        self.cats = defaultdict(list)
+        self.invcats = defaultdict(list)
+        self.base_dir = os.path.dirname(util.__file__)
+        self.load(os.path.join(self.base_dir, "data", "categories.tsv"))
+
+    def load(self,file):
+        """Read categories from a tab-separated file.
+
+        Each line holds a category name followed by its terms, one per
+        column. Entries are appended to ``cats`` and ``invcats``.
+        """
+        with open(file,"r") as f:
+            for line in f:
+                cols = line.strip().split("\t")
+                name = cols[0]
+                for t in cols[1:]:
+                    self.cats[name].append(t)
+                    self.invcats[t].append(name)
+
+    def analyze(self,doc,tokenizer="default",normalize=True):
+        """Count, per category, how many tokens of ``doc`` belong to it.
+
+        ``tokenizer`` is ``"default"`` (whitespace split), ``"bigrams"``,
+        or any callable that takes the document and returns an iterable
+        of tokens. With ``normalize`` set, each count is divided by the
+        total number of tokens; a document with no tokens yields zeros.
+
+        Returns a dict mapping every loaded category to a float.
+        Raises ``ValueError`` if ``tokenizer`` is neither a known name
+        nor a callable.
+        """
+        if tokenizer == "default":
+            tokenizer = util.default_tokenizer
+        elif tokenizer == "bigrams":
+            tokenizer = util.bigram_tokenizer
+        if not callable(tokenizer):
+            raise ValueError("invalid tokenizer")
+        count = dict.fromkeys(self.cats, 0.0)
+        tokens = 0.0
+        for tk in tokenizer(doc):
+            tokens += 1.0
+            # .get() keeps unknown tokens from being inserted into the
+            # defaultdict, which would grow it on every analyze() call.
+            for cat in self.invcats.get(tk, ()):
+                count[cat] += 1.0
+        # An empty document has no tokens; skip the division entirely.
+        if normalize and tokens:
+            for cat in count:
+                count[cat] /= tokens
+        return count
diff --git a/empath/core.pyc b/empath/core.pyc
diff --git a/empath/data/categories.tsv b/empath/data/categories.tsv
diff --git a/empath/helpers.py b/empath/helpers.py
@@ -0,0 +1,19 @@
+def default_tokenizer(doc):
+    """Split ``doc`` on whitespace and return the list of tokens."""
+    return doc.split()
+
+def bigram_tokenizer(doc):
+    """Yield each whitespace token and, between neighbours, their bigram.
+
+    A bigram is the two adjacent tokens joined with ``_``. Each token is
+    followed by the bigram it starts; the last token is yielded on its own.
+    An empty document yields nothing.
+    """
+    tokens = doc.split()
+    if not tokens:
+        # Nothing to emit; tokens[-1] below would raise IndexError.
+        return
+    for t1,t2 in zip(tokens,tokens[1:]):
+        yield t1
+        yield t1+"_"+t2
+    yield tokens[-1]
diff --git a/empath/helpers.pyc b/empath/helpers.pyc
diff --git a/test.py b/test.py
@@ -0,0 +1,4 @@
+# Smoke test: constructing Empath loads the bundled category lexicon.
+from empath import Empath
+empath = Empath()
+