-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdebug_preprocessing.py
More file actions
78 lines (63 loc) · 2.77 KB
/
debug_preprocessing.py
File metadata and controls
78 lines (63 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Debug preprocessing to understand why similarity scores are 0
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.text_vectorizer import TextVectorizer
from src.utils import load_json_data
def debug_preprocessing():
    """Diagnose why TF-IDF similarity scores come out as 0 for Chinese text.

    Loads the sample documents, shows what ``TextVectorizer.preprocess_text``
    does to them, then fits two TF-IDF configurations — the sklearn default
    word tokenizer versus character n-grams — and compares how a Chinese
    query ("机器学习") vectorizes under each.  Prints matrix shapes and
    non-zero counts so the failing configuration is obvious.

    All failures are caught, printed, and traced; this is a throwaway
    diagnostic script, so a broad handler is intentional.
    """
    print("Starting debug...")
    try:
        # Load sample data
        documents = load_json_data("data/sample_documents.json")
        print(f"Loaded {len(documents)} documents")

        vectorizer = TextVectorizer()
        print("Created vectorizer")

        # Show the before/after of preprocessing on the first document.
        doc = documents[0]
        print(f"Original: {doc['content']}")
        processed = vectorizer.preprocess_text(doc['content'])
        print(f"Processed: '{processed}'")

        # Preprocess a small batch for the TF-IDF experiments below.
        # Distinct loop variables (d, t) avoid shadowing `doc` above.
        texts = [d['content'] for d in documents[:3]]
        processed_texts = [vectorizer.preprocess_text(t) for t in texts]
        print("\nProcessed texts:")  # plain string: no placeholders (fixes F541)
        for i, text in enumerate(processed_texts, start=1):
            print(f"{i}: '{text}'")

        # Exercise sklearn's TF-IDF directly to isolate it from our wrapper.
        from sklearn.feature_extraction.text import TfidfVectorizer
        print("\nTesting TF-IDF configurations:")

        # Test 1: default settings.  The default word tokenizer needs
        # whitespace-separated tokens, which unsegmented Chinese lacks —
        # the likely cause of an empty vocabulary / all-zero similarities.
        tfidf1 = TfidfVectorizer(max_features=1000)
        matrix1 = tfidf1.fit_transform(processed_texts)
        print(f"Default TF-IDF: {matrix1.shape}, non-zero: {matrix1.nnz}")
        vocab1 = tfidf1.get_feature_names_out()
        print(f"Vocabulary sample: {list(vocab1[:10])}")

        # Test 2: character n-grams within word boundaries — handles
        # Chinese without an external segmenter.
        tfidf2 = TfidfVectorizer(
            max_features=1000,
            # NOTE: token_pattern is ignored when analyzer='char_wb';
            # kept here only to mirror the configuration under test.
            token_pattern=r'(?u)\b\w+\b',  # Unicode aware pattern
            analyzer='char_wb',  # Character n-grams within word boundaries
            ngram_range=(1, 3)
        )
        matrix2 = tfidf2.fit_transform(processed_texts)
        print(f"Chinese TF-IDF: {matrix2.shape}, non-zero: {matrix2.nnz}")
        vocab2 = tfidf2.get_feature_names_out()
        print(f"Vocabulary sample: {list(vocab2[:10])}")

        # Transform a Chinese query under both fitted vectorizers; a
        # zero non-zero count means the query shares no vocabulary with
        # the corpus and every cosine similarity will be 0.
        query = "机器学习"
        query_processed = vectorizer.preprocess_text(query)
        print(f"\nQuery: '{query}' -> '{query_processed}'")
        query_vec1 = tfidf1.transform([query_processed])
        query_vec2 = tfidf2.transform([query_processed])
        print(f"Query vector 1 non-zero: {query_vec1.nnz}")
        print(f"Query vector 2 non-zero: {query_vec2.nnz}")
    except Exception as e:
        # Broad on purpose: report everything, never crash the diagnostic.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
# Run the diagnostic only when executed as a script, not on import.
if __name__ == "__main__":
    debug_preprocessing()