empath-client
Ethan Fast authored and committed on Apr 16, 2016
0 parents commit cf046b7
Showing 11 changed files with 248 additions and 0 deletions.
1 change: 1 addition & 0 deletions empath/__init__.py
@@ -0,0 +1 @@
from .core import Empath
Binary file added empath/__init__.pyc
Binary file not shown.
Binary file added empath/__pycache__/__init__.cpython-35.pyc
Binary file not shown.
Binary file added empath/__pycache__/core.cpython-35.pyc
Binary file not shown.
Binary file added empath/__pycache__/helpers.cpython-35.pyc
Binary file not shown.
40 changes: 40 additions & 0 deletions empath/core.py
@@ -0,0 +1,40 @@
import os
import sys
from collections import defaultdict
from . import helpers as util

class Empath:
    def __init__(self, backend_url="http://localhost:8000"):
        # category name -> list of member terms
        self.cats = defaultdict(list)
        # term -> list of categories that contain it
        self.invcats = defaultdict(list)
        self.base_dir = os.path.dirname(util.__file__)
        self.load(self.base_dir + "/data/categories.tsv")

    def load(self, file):
        # each line of categories.tsv is: category_name <tab> term1 <tab> term2 ...
        with open(file, "r") as f:
            for line in f:
                cols = line.strip().split("\t")
                name = cols[0]
                terms = cols[1:]
                for t in terms:
                    self.cats[name].append(t)
                    self.invcats[t].append(name)

    def analyze(self, doc, tokenizer="default", normalize=True):
        if tokenizer == "default":
            tokenizer = util.default_tokenizer
        elif tokenizer == "bigrams":
            tokenizer = util.bigram_tokenizer
        if not hasattr(tokenizer, "__call__"):
            raise Exception("invalid tokenizer")
        count = {}
        tokens = 0.0
        for cat in self.cats.keys():
            count[cat] = 0.0
        for tk in tokenizer(doc):
            tokens += 1.0
            for cat in self.invcats[tk]:
                count[cat] += 1.0
        if normalize:
            if tokens == 0.0:
                # guard against dividing by zero on an empty document
                return None
            for cat in count.keys():
                count[cat] = count[cat] / tokens
        return count
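
A minimal usage sketch of analyze, assuming the lexicon is constructed as above: it returns one score per category, divided by the document's token count when normalize=True. The category names in the comment are illustrative only, since empath/data/categories.tsv is not rendered in this diff.

    from empath import Empath

    lexicon = Empath()
    # raw per-category token counts over a short document
    counts = lexicon.analyze("he hit the other person", normalize=False)
    # counts maps every category name to a float, e.g. {"violence": 1.0, ..., "sleep": 0.0}
    # normalized scores divide each count by the number of tokens (here 5)
    scores = lexicon.analyze("he hit the other person", normalize=True)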
Binary file added empath/core.pyc
Binary file not shown.
194 changes: 194 additions & 0 deletions empath/data/categories.tsv

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions empath/helpers.py
@@ -0,0 +1,9 @@
def default_tokenizer(doc):
    # unigram tokenizer: split the document on whitespace
    return doc.split()

def bigram_tokenizer(doc):
    # yield each token followed by the underscore-joined bigram it starts, then the final token
    tokens = doc.split()
    if not tokens:
        return
    for t1, t2 in zip(tokens, tokens[1:]):
        yield t1
        yield t1 + "_" + t2
    yield tokens[-1]
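
A quick sketch of what the bigram tokenizer yields: each unigram interleaved with the bigram it starts, closing with the final token.

    from empath.helpers import bigram_tokenizer

    list(bigram_tokenizer("the quick brown fox"))
    # -> ['the', 'the_quick', 'quick', 'quick_brown', 'brown', 'brown_fox', 'fox']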
Binary file added empath/helpers.pyc
Binary file not shown.
4 changes: 4 additions & 0 deletions test.py
@@ -0,0 +1,4 @@
from empath import Empath
empath = Empath()


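test.py only constructs the client. A slightly stronger smoke test might look like the sketch below; the expectation that every category receives a score follows from analyze above, while the exact category count (roughly one per line of the 194-line categories.tsv) is an assumption, not something this commit asserts.

    from empath import Empath

    lexicon = Empath()
    # categories.tsv has one category per line, so lexicon.cats should be non-empty
    assert len(lexicon.cats) > 0
    result = lexicon.analyze("he hit the other person", normalize=True)
    # every category should be present in the result
    assert result is not None and len(result) == len(lexicon.cats)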