-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy paththeme_extractor.py
More file actions
104 lines (86 loc) · 4.14 KB
/
theme_extractor.py
File metadata and controls
104 lines (86 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import sys
import json
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
import string
import re
# Download required NLTK data on first import.
def _ensure_nltk_data():
    """Download each required NLTK resource only if it is actually missing.

    Checking each resource independently avoids re-downloading one that is
    already present when only the other is absent.
    NOTE(review): NLTK >= 3.8.2 ships the sentence tokenizer models as
    'punkt_tab'; if tokenization raises LookupError on a newer NLTK,
    'punkt_tab' must be downloaded as well -- confirm against the installed
    NLTK version.
    """
    for lookup_path, package_name in (
        ('tokenizers/punkt', 'punkt'),
        ('corpora/stopwords', 'stopwords'),
    ):
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            nltk.download(package_name)

_ensure_nltk_data()
def extract_themes(text):
    """
    Extract up to five themes from an oral-history transcript.

    The text is sentence- and word-tokenized with NLTK, stopwords and
    filler words are removed, and the remaining word frequencies are scored
    against a fixed set of theme categories.

    Args:
        text: Raw transcript text. May be empty or None.

    Returns:
        list[str]: 1-5 theme labels, strongest first, never empty.
            If no category matches, the three most frequent content words
            are returned instead; if the text is empty (or nothing survives
            filtering), a generic default list is returned.
    """
    if not text:
        return ["memory", "personal", "story"]

    # Standard English stopwords plus filler words common in oral histories.
    # The fillers must be individual lowercase tokens to match the lowercased
    # word tokens produced below -- the original multi-word phrases
    # ('you know', 'I mean') could never match a single token, so the
    # component words 'know' and 'mean' are listed instead.
    stop_words = set(stopwords.words('english'))
    stop_words.update([
        'um', 'uh', 'like', 'know', 'mean', 'well', 'so', 'actually',
        'really', 'basically', 'totally', 'honestly', 'literally', 'yeah',
    ])

    # Tokenize sentence by sentence: strip punctuation, lowercase, then drop
    # stopwords and very short (<= 2 char) words.
    all_words = []
    for sent in sent_tokenize(text):
        cleaned = re.sub(r'[^\w\s]', '', sent.lower())
        all_words.extend(
            word for word in word_tokenize(cleaned)
            if word not in stop_words and len(word) > 2
        )

    fdist = FreqDist(all_words)

    # Predefined theme categories and their associated keywords.
    # frozensets give O(1) membership tests in the scoring loop below.
    theme_keywords = {
        'family': frozenset([
            'family', 'mother', 'father', 'parent', 'brother', 'sister',
            'aunt', 'uncle', 'grandparent', 'grandmother', 'grandfather',
            'son', 'daughter', 'child', 'children',
        ]),
        'childhood': frozenset([
            'childhood', 'school', 'play', 'kid', 'young', 'grow',
            'growing', 'youth', 'childish',
        ]),
        'food': frozenset([
            'food', 'cook', 'recipe', 'meal', 'dish', 'eat', 'kitchen',
            'bake', 'ingredient', 'dinner', 'lunch', 'breakfast',
            'restaurant',
        ]),
        'traditions': frozenset([
            'tradition', 'custom', 'holiday', 'celebration', 'ritual',
            'heritage', 'culture', 'ceremony', 'festival', 'religion',
            'spiritual',
        ]),
        'work': frozenset([
            'work', 'job', 'career', 'profession', 'business', 'employment',
            'workplace', 'office', 'company', 'boss', 'employee', 'coworker',
        ]),
        'migration': frozenset([
            'migrate', 'immigration', 'move', 'relocate', 'journey',
            'travel', 'foreign', 'country', 'overseas', 'immigrant',
            'refugee', 'border', 'international',
        ]),
        'hardship': frozenset([
            'hardship', 'difficult', 'challenge', 'struggle', 'poverty',
            'suffer', 'overcome', 'adversity', 'problem', 'obstacle',
            'hurdle', 'tough',
        ]),
        'history': frozenset([
            'history', 'historical', 'past', 'era', 'decade', 'century',
            'remember', 'memory', 'ancestor', 'heritage', 'legacy',
            'historic',
        ]),
        'education': frozenset([
            'education', 'study', 'school', 'college', 'university',
            'learn', 'teacher', 'student', 'class', 'degree', 'knowledge',
            'academic',
        ]),
        'weather': frozenset([
            'weather', 'climate', 'storm', 'rain', 'snow', 'winter',
            'summer', 'season', 'cold', 'warm', 'hot', 'temperature',
            'wind', 'hurricane',
        ]),
    }

    # Score each theme: the summed frequency of every word in the transcript
    # that appears in that theme's keyword set.
    theme_matches = Counter()
    for word, freq in fdist.items():
        for theme, keywords in theme_keywords.items():
            if word in keywords:
                theme_matches[theme] += freq

    top_themes = [theme for theme, _ in theme_matches.most_common(5)]
    if top_themes:
        return top_themes

    # Fallback: no category matched -- surface the most frequent content
    # words, or the generic default when nothing survived filtering.
    common_words = [word for word, _ in fdist.most_common(3)]
    return common_words if common_words else ["memory", "personal", "story"]
if __name__ == "__main__":
    # Read the full transcript from stdin, extract its themes, and emit
    # the result as a single JSON object on stdout.
    transcript = sys.stdin.read()
    result = {"themes": extract_themes(transcript)}
    print(json.dumps(result))