Commit db94546

Code fixed
1 parent dfdfb6e commit db94546

11 files changed: +8,179 -57 lines

.gitignore (+1 -1)

@@ -1,5 +1,5 @@
 my_tests/
-csv_files/
+
 ### JetBrains template
 # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
 # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

csv_files/01_reddit_posts/reddit_posts_0104.tsv (+97, large diff not rendered by default)
csv_files/01_reddit_posts/reddit_posts_1504.tsv (+809, large diff not rendered by default)
csv_files/01_reddit_posts/reddit_posts_combined.tsv (+3,617, large diff not rendered by default)
csv_files/02_preprocessed_data/preprocessed_data.tsv (+3,605, large diff not rendered by default)

main.py (+28 -10)

@@ -1,9 +1,24 @@
 import os
+import csv
+import glob
 
 from dotenv import load_dotenv
 
-from sentiment_analysis.etap1_reddit_scrapper import RedditScraper
 from sentiment_analysis.etap1_1_sentiment_analysis_preprocessing import SentimentAnalysisPreprocessor
+from sentiment_analysis.etap1_reddit_scrapper import RedditScraper
+
+
+def combine_tsv_files(file_list: list, output_file: str):
+    post_data = [
+        row for file in file_list for row in
+        csv.DictReader(open(file, 'r', newline='', encoding='utf-8'), delimiter='\t')
+    ]
+
+    with open(output_file, 'w', newline='', encoding='utf-8') as o_file:
+        writer = csv.DictWriter(o_file, fieldnames=post_data[0].keys(), delimiter='\t')
+        writer.writeheader()
+        writer.writerows(post_data)
+
 
 if __name__ == '__main__':
     load_dotenv()
@@ -19,12 +34,15 @@
         'cybersecurity', 'education', 'humanrights', 'globaldevelopment'
     ]
 
-    scraper = RedditScraper(client_id, client_secret, user_agent, subreddits, num_posts=1000)
-    scraper.scrape(output_file="csv_files/reddit_posts_20230503.csv")
-    #
-    # preprocessor = SentimentAnalysisPreprocessor(
-    #     input_file='csv_files/reddit_posts_20230503.csv',
-    #     output_file='csv_files/preprocessed_data_20230503.csv',
-    #     num_clusters=3
-    # )
-    # preprocessor.preprocess()
+    scraper = RedditScraper(client_id, client_secret, user_agent, subreddits, num_posts=10000)
+    # scraper.scrape(output_file="csv_files/01_reddit_posts/reddit_posts_20230503.tsv")
+    # Run the line above if you do not have any .tsv files yet
+
+    reddit_posts = glob.glob("csv_files/01_reddit_posts/*.tsv")
+    combine_tsv_files(reddit_posts, "csv_files/01_reddit_posts/reddit_posts_combined.tsv")
+
+    preprocessor = SentimentAnalysisPreprocessor(
+        input_file='csv_files/01_reddit_posts/reddit_posts_combined.tsv',
+        output_file='csv_files/02_preprocessed_data/preprocessed_data.tsv'
+    )
+    preprocessor.preprocess()
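
Review note: as committed, `combine_tsv_files` opens each input file inside the list comprehension without ever closing it, and it indexes `post_data[0]` even when `file_list` is empty. A minimal sketch of a safer variant (editorial, not part of this commit):

```python
import csv

def combine_tsv_files(file_list: list, output_file: str) -> None:
    # Read every row from every input TSV, closing each file as we go.
    post_data = []
    for path in file_list:
        with open(path, 'r', newline='', encoding='utf-8') as in_file:
            post_data.extend(csv.DictReader(in_file, delimiter='\t'))

    if not post_data:
        return  # nothing to write; avoids IndexError on post_data[0]

    with open(output_file, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=post_data[0].keys(), delimiter='\t')
        writer.writeheader()
        writer.writerows(post_data)
```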

sentiment_analysis/etap1_1_sentiment_analysis_preprocessing.py (+11 -38)

@@ -4,17 +4,16 @@
 import numpy as np
 from nltk.corpus import stopwords
 from gensim.models import Word2Vec
+from sklearn.cluster import KMeans
 from sklearn.preprocessing import normalize
-from sklearn.model_selection import train_test_split
-from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import accuracy_score
 
 
 class SentimentAnalysisPreprocessor:
 
-    def __init__(self, input_file, output_file):
+    def __init__(self, input_file, output_file, num_clusters=3):
         self.input_file = input_file
         self.output_file = output_file
+        self.num_clusters = num_clusters
 
     @staticmethod
     def clean_text(text):
@@ -34,7 +33,7 @@ def average_vector(words, word_vectors):
 
     def preprocess(self):
         # Load the collected data
-        df = pd.read_csv(self.input_file)
+        df = pd.read_csv(self.input_file, sep='\t')
 
         # Perform data cleaning and preprocessing
         df['cleaned_title'] = df['title'].apply(lambda x: self.clean_text(x))
@@ -49,44 +48,18 @@ def preprocess(self):
         word_vectors = model.wv
 
         # Get the average vector for each title
+        # df['avg_vector'] = df['tokenized_title'].apply(lambda x: self.average_vector(x, word_vectors))
         df['avg_vector'] = df['tokenized_title'].apply(
             lambda x: self.average_vector(x, word_vectors)
             .tolist() if self.average_vector(x, word_vectors) is not None else None)
         df = df.dropna(subset=['avg_vector'])
 
-        # Manually label a portion of the data for training the classifier
-        # Add a new column 'manual_label' to the DataFrame and set it to None
-        df['manual_label'] = None
+        # Apply K-Means clustering to the embeddings
+        X = np.vstack(df['avg_vector'].values)
+        kmeans = KMeans(n_clusters=self.num_clusters, random_state=0).fit(X)
 
-        # Manually label some examples (replace the index and label with appropriate values)
-        # 0 - negative, 1 - neutral, 2 - positive
-        df.at[0, 'manual_label'] = 2
-        df.at[5, 'manual_label'] = 0
-        # ...
-
-        # Split the manually labeled data into training and testing sets
-        labeled_data = df.dropna(subset=['manual_label'])
-        X_train, X_test, y_train, y_test = train_test_split(
-            np.vstack(labeled_data['avg_vector'].values),
-            labeled_data['manual_label'].values.astype(int),
-            test_size=0.3,
-            random_state=42)
-
-        # Train a classifier (e.g., logistic regression) on
-        clf = LogisticRegression(random_state=42)
-        clf.fit(X_train, y_train)
-
-        # Evaluate the classifier on the test set
-        y_pred = clf.predict(X_test)
-        print(f"Classifier accuracy: {accuracy_score(y_test, y_pred)}")
-
-        # Assign sentiment labels to the remaining data using the trained classifier
-        unlabeled_data = df[df['manual_label'].isnull()]
-        unlabeled_data['predicted_label'] = clf.predict(np.vstack(unlabeled_data['avg_vector'].values))
-
-        # Combine the manually labeled and predicted data
-        labeled_data['predicted_label'] = labeled_data['manual_label']
-        result_df = pd.concat([labeled_data, unlabeled_data], ignore_index=True)
+        # Assign sentiment labels to the clusters
+        df['cluster'] = kmeans.labels_
 
         # Save the preprocessed data
-        result_df.to_csv(self.output_file, index=False, lineterminator='\n', float_format='%.8f', header=True)
+        df.to_csv(self.output_file, index=False, lineterminator='\n', float_format='%.8f', header=True, sep='\t')
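
Review note: this commit swaps the manually labeled LogisticRegression step for unsupervised K-Means, but the resulting `cluster` column holds arbitrary cluster IDs, not ordered sentiment, so the comment "Assign sentiment labels to the clusters" overstates what `kmeans.labels_` provides. (The retained `avg_vector` lambda also calls `average_vector` twice per row.) A sketch of one way to turn cluster IDs into sentiment labels, assuming a hand-inspected mapping that is not in the commit:

```python
import numpy as np
from sklearn.cluster import KMeans

# Stand-in for the averaged Word2Vec title vectors.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 8))

kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X)

# Hypothetical mapping, chosen after reading sample titles per cluster;
# cluster IDs carry no inherent negative/neutral/positive ordering.
cluster_to_sentiment = {0: 'negative', 1: 'neutral', 2: 'positive'}
labels = [cluster_to_sentiment[c] for c in kmeans.labels_]
```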

sentiment_analysis/etap1_reddit_scrapper.py (+7 -4)

@@ -1,3 +1,4 @@
+import re
 import praw
 import pandas as pd
 import logging
@@ -16,13 +17,15 @@ def __init__(self, client_id, client_secret, user_agent, subreddits, num_posts=1
         self.subreddits = subreddits
         self.num_posts = num_posts
 
-    def scrape(self, output_file='csv_files/reddit_posts.csv'):
+    def scrape(self, output_file):
         posts = []
         i, j = 0, 0
 
         for subreddit in self.subreddits:
             i += 1
-            for submission in self.reddit.subreddit(subreddit).top(limit=self.num_posts, time_filter='all'):
+            # for submission in self.reddit.subreddit('popular').search(query='', limit=self.num_posts):
+            for submission in self.reddit.subreddit('popular').top(limit=self.num_posts, time_filter='all'):
+            # for submission in self.reddit.subreddit(subreddit).top(limit=self.num_posts, time_filter='all'):
                 if submission.is_self:
                     posts.append({
                         'author': submission.author.name if submission.author else '',
@@ -33,10 +36,10 @@ def scrape(self, output_file='csv_files/reddit_posts.csv'):
                         'num_comments': submission.num_comments,
                         'created': pd.to_datetime(submission.created_utc, unit='s'),
                         'subreddit': subreddit,
-                        'selftext': submission.selftext
+                        'selftext': re.sub(r'["\n\t]', ' ', submission.selftext)
                     })
                     j += 1
        print(f"INFO: Parsed i: {i} and j: {j}")
 
        df = pd.DataFrame(posts)
-       df.to_csv(output_file, index=False)
+       df.to_csv(output_file, index=False, sep='\t')
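
Review note: with the uncommented line as committed, the loop queries r/popular once per entry in `self.subreddits`, so the same posts are fetched repeatedly and the `subreddit` column records the loop variable rather than each post's real origin. A sketch of a variant that fetches r/popular once and tags each post with its actual subreddit (`scrape_popular` is a hypothetical helper, not in the repo):

```python
import re
import praw

def scrape_popular(reddit: praw.Reddit, num_posts: int) -> list:
    """Fetch top self-posts from r/popular once, tagging each with its real subreddit."""
    posts = []
    for submission in reddit.subreddit('popular').top(limit=num_posts, time_filter='all'):
        if submission.is_self:
            posts.append({
                'author': submission.author.name if submission.author else '',
                'subreddit': submission.subreddit.display_name,  # actual origin
                'selftext': re.sub(r'["\n\t]', ' ', submission.selftext),
            })
    return posts
```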

sentiment_analysis/etap2_sentiment_analysis_classic_ml.py (+1 -1)

@@ -10,7 +10,7 @@
 from sklearn.preprocessing import label_binarize
 
 # Load preprocessed data
-df = pd.read_csv('../csv_files/preprocessed_data_1504.csv')
+df = pd.read_csv('../csv_files/02_preprocessed_data/preprocessed_data.tsv', sep='\t')
 df['avg_vector'] = df['avg_vector'].apply(lambda x: ast.literal_eval(x))
 
 # Split data into training and testing sets
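
For reference, the round-trip these scripts rely on: `avg_vector` is written to the TSV as the `str()` of a Python list (see the `.tolist()` call in the preprocessor), and `ast.literal_eval` parses it back. A minimal illustration with a hypothetical cell value:

```python
import ast

# Hypothetical avg_vector cell as it would appear in the TSV.
serialized = '[0.12345678, -0.87654321, 0.00000001]'
vector = ast.literal_eval(serialized)  # back to a list of floats
assert isinstance(vector, list) and all(isinstance(v, float) for v in vector)
```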

sentiment_analysis/etap3_sentiment_analysis_lstm_model.py (+1 -1)

@@ -7,7 +7,7 @@
 from tensorflow.keras.utils import to_categorical
 
 # Load preprocessed data
-df = pd.read_csv('../csv_files/preprocessed_data_1504.csv')
+df = pd.read_csv('../csv_files/02_preprocessed_data/preprocessed_data.tsv', sep='\t')
 df['avg_vector'] = df['avg_vector'].apply(lambda x: ast.literal_eval(x))
 
 # Split data into training and testing sets

sentiment_analysis/etap4.py (+2 -2)

@@ -7,7 +7,7 @@
 from transformers import get_linear_schedule_with_warmup
 
 # Load preprocessed data
-df = pd.read_csv('../csv_files/preprocessed_data_1504.csv')
+df = pd.read_csv('../csv_files/02_preprocessed_data/preprocessed_data.tsv', sep='\t')
 df['cleaned_title'] = df['cleaned_title'].astype(str)
 
 # Tokenize the text
@@ -61,7 +61,7 @@
     scheduler.step()
 
 # Evaluate the fine-tuned model on the test set
-test_df = pd.read_csv('../csv_files/reddit_posts_1504.csv')
+test_df = pd.read_csv('../csv_files/01_reddit_posts/reddit_posts_combined.tsv', sep='\t')
 test_df['cleaned_title'] = test_df['cleaned_title'].astype(str)
 test_inputs = torch.tensor(
     [tokenizer.encode(text, add_special_tokens=True, padding='max_length', max_length=max_len, truncation=True) for text
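
Review note: `reddit_posts_combined.tsv` is produced by the scraper and, as far as these diffs show, has no `cleaned_title` column; that column is only added by `SentimentAnalysisPreprocessor`, so `test_df['cleaned_title']` here looks like it would raise a KeyError. A guarded load (an editorial sketch, not in the commit) would surface the mismatch early:

```python
import pandas as pd

test_df = pd.read_csv('../csv_files/01_reddit_posts/reddit_posts_combined.tsv', sep='\t')
if 'cleaned_title' not in test_df.columns:
    # The raw scrape has title/selftext etc., but cleaning happens in the
    # preprocessing step; point this script at the preprocessed TSV instead.
    raise KeyError("expected 'cleaned_title'; run SentimentAnalysisPreprocessor first")
```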
