-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy paththeme_extractor.py
More file actions
104 lines (86 loc) · 4.14 KB
/
theme_extractor.py
File metadata and controls
104 lines (86 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import sys
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
import string
import re
# Download required NLTK data on first import.
def _ensure_nltk_data():
    """Download each required NLTK resource only if it is actually missing.

    Checking each resource independently avoids re-downloading one that is
    already present when only the other is absent.
    NOTE(review): NLTK >= 3.8.2 ships the sentence tokenizer models as
    'punkt_tab'; if tokenization raises LookupError on a newer NLTK,
    'punkt_tab' must be downloaded as well -- confirm against the installed
    NLTK version.
    """
    for lookup_path, package_name in (
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
    ):
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(package_name)

_ensure_nltk_data()
def extract_themes(text):
    """
    Extract up to five themes from an oral-history transcript.

    The text is sentence- and word-tokenized with NLTK, stopwords and
    filler words are removed, and the remaining word frequencies are scored
    against a fixed set of theme categories.

    Args:
        text: Raw transcript text. May be empty or None.

    Returns:
        list[str]: 1-5 theme labels, strongest first, never empty.
            If no category matches, the three most frequent content words
            are returned instead; if the text is empty (or nothing survives
            filtering), a generic default list is returned.
    """
    if not text:
        return ["memory", "personal", "story"]

    # Standard English stopwords plus filler words common in oral histories.
    # The fillers must be individual lowercase tokens to match the lowercased
    # word tokens produced below -- the original multi-word phrases
    # ('you know', 'I mean') could never match a single token, so the
    # component words 'know' and 'mean' are listed instead.
    stop_words = set(stopwords.words('english'))
    stop_words.update([
        'um', 'uh', 'like', 'know', 'mean', 'well', 'so', 'actually',
        'really', 'basically', 'totally', 'honestly', 'literally', 'yeah',
    ])

    # Tokenize sentence by sentence: strip punctuation, lowercase, then drop
    # stopwords and very short (<= 2 char) words.
    all_words = []
    for sent in sent_tokenize(text):
        cleaned = re.sub(r'[^\w\s]', '', sent.lower())
        all_words.extend(
            word for word in word_tokenize(cleaned)
            if word not in stop_words and len(word) > 2
        )

    fdist = FreqDist(all_words)

    # Predefined theme categories and their associated keywords.
    # frozensets give O(1) membership tests in the scoring loop below.
    theme_keywords = {
        'family': frozenset([
            'family', 'mother', 'father', 'parent', 'brother', 'sister',
            'aunt', 'uncle', 'grandparent', 'grandmother', 'grandfather',
            'son', 'daughter', 'child', 'children',
        ]),
        'childhood': frozenset([
            'childhood', 'school', 'play', 'kid', 'young', 'grow',
            'growing', 'youth', 'childish',
        ]),
        'food': frozenset([
            'food', 'cook', 'recipe', 'meal', 'dish', 'eat', 'kitchen',
            'bake', 'ingredient', 'dinner', 'lunch', 'breakfast',
            'restaurant',
        ]),
        'traditions': frozenset([
            'tradition', 'custom', 'holiday', 'celebration', 'ritual',
            'heritage', 'culture', 'ceremony', 'festival', 'religion',
            'spiritual',
        ]),
        'work': frozenset([
            'work', 'job', 'career', 'profession', 'business', 'employment',
            'workplace', 'office', 'company', 'boss', 'employee', 'coworker',
        ]),
        'migration': frozenset([
            'migrate', 'immigration', 'move', 'relocate', 'journey',
            'travel', 'foreign', 'country', 'overseas', 'immigrant',
            'refugee', 'border', 'international',
        ]),
        'hardship': frozenset([
            'hardship', 'difficult', 'challenge', 'struggle', 'poverty',
            'suffer', 'overcome', 'adversity', 'problem', 'obstacle',
            'hurdle', 'tough',
        ]),
        'history': frozenset([
            'history', 'historical', 'past', 'era', 'decade', 'century',
            'remember', 'memory', 'ancestor', 'heritage', 'legacy',
            'historic',
        ]),
        'education': frozenset([
            'education', 'study', 'school', 'college', 'university',
            'learn', 'teacher', 'student', 'class', 'degree', 'knowledge',
            'academic',
        ]),
        'weather': frozenset([
            'weather', 'climate', 'storm', 'rain', 'snow', 'winter',
            'summer', 'season', 'cold', 'warm', 'hot', 'temperature',
            'wind', 'hurricane',
        ]),
    }

    # Score each theme: the summed frequency of every word in the transcript
    # that appears in that theme's keyword set.
    theme_matches = Counter()
    for word, freq in fdist.items():
        for theme, keywords in theme_keywords.items():
            if word in keywords:
                theme_matches[theme] += freq

    top_themes = [theme for theme, _ in theme_matches.most_common(5)]
    if top_themes:
        return top_themes

    # Fallback: no category matched -- surface the most frequent content
    # words, or the generic default when nothing survived filtering.
    common_words = [word for word, _ in fdist.most_common(3)]
    return common_words if common_words else ["memory", "personal", "story"]
if __name__ == "__main__":
    # Read the full transcript from stdin, extract its themes, and emit
    # the result as a single JSON object on stdout.
    transcript = sys.stdin.read()
    result = {"themes": extract_themes(transcript)}
    print(json.dumps(result))