Commit db94546

Code fixed
1 parent dfdfb6e commit db94546

11 files changed: +8,179 -57 lines

.gitignore (+1 -1)

@@ -1,5 +1,5 @@
 my_tests/
-csv_files/
+
 ### JetBrains template
 # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
 # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

csv_files/01_reddit_posts/reddit_posts_0104.tsv (+97, large diff not rendered by default)
csv_files/01_reddit_posts/reddit_posts_1504.tsv (+809, large diff not rendered by default)
csv_files/01_reddit_posts/reddit_posts_combined.tsv (+3,617, large diff not rendered by default)
csv_files/02_preprocessed_data/preprocessed_data.tsv (+3,605, large diff not rendered by default)

main.py (+28 -10)

@@ -1,9 +1,24 @@
 import os
+import csv
+import glob
 
 from dotenv import load_dotenv
 
-from sentiment_analysis.etap1_reddit_scrapper import RedditScraper
 from sentiment_analysis.etap1_1_sentiment_analysis_preprocessing import SentimentAnalysisPreprocessor
+from sentiment_analysis.etap1_reddit_scrapper import RedditScraper
+
+
+def combine_tsv_files(file_list: list, output_file: str):
+    post_data = [
+        row for file in file_list for row in
+        csv.DictReader(open(file, 'r', newline='', encoding='utf-8'), delimiter='\t')
+    ]
+
+    with open(output_file, 'w', newline='', encoding='utf-8') as o_file:
+        writer = csv.DictWriter(o_file, fieldnames=post_data[0].keys(), delimiter='\t')
+        writer.writeheader()
+        writer.writerows(post_data)
+
 
 if __name__ == '__main__':
     load_dotenv()
@@ -19,12 +34,15 @@
         'cybersecurity', 'education', 'humanrights', 'globaldevelopment'
     ]
 
-    scraper = RedditScraper(client_id, client_secret, user_agent, subreddits, num_posts=1000)
-    scraper.scrape(output_file="csv_files/reddit_posts_20230503.csv")
-    #
-    # preprocessor = SentimentAnalysisPreprocessor(
-    #     input_file='csv_files/reddit_posts_20230503.csv',
-    #     output_file='csv_files/preprocessed_data_20230503.csv',
-    #     num_clusters=3
-    # )
-    # preprocessor.preprocess()
+    scraper = RedditScraper(client_id, client_secret, user_agent, subreddits, num_posts=10000)
+    # scraper.scrape(output_file="csv_files/01_reddit_posts/reddit_posts_20230503.tsv")
+    # Run the line above if you do not have any .tsv files yet
+
+    reddit_posts = glob.glob("csv_files/01_reddit_posts/*.tsv")
+    combine_tsv_files(reddit_posts, "csv_files/01_reddit_posts/reddit_posts_combined.tsv")
+
+    preprocessor = SentimentAnalysisPreprocessor(
+        input_file='csv_files/01_reddit_posts/reddit_posts_combined.tsv',
+        output_file='csv_files/02_preprocessed_data/preprocessed_data.tsv'
+    )
+    preprocessor.preprocess()
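
Review note: as committed, `combine_tsv_files` opens each input file inside the list comprehension without ever closing it, and it indexes `post_data[0]` even when `file_list` is empty. A minimal sketch of a safer variant (editorial, not part of this commit):

```python
import csv

def combine_tsv_files(file_list: list, output_file: str) -> None:
    # Read every row from every input TSV, closing each file as we go.
    post_data = []
    for path in file_list:
        with open(path, 'r', newline='', encoding='utf-8') as in_file:
            post_data.extend(csv.DictReader(in_file, delimiter='\t'))

    if not post_data:
        return  # nothing to write; avoids IndexError on post_data[0]

    with open(output_file, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=post_data[0].keys(), delimiter='\t')
        writer.writeheader()
        writer.writerows(post_data)
```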

sentiment_analysis/etap1_1_sentiment_analysis_preprocessing.py (+11 -38)

@@ -4,17 +4,16 @@
 import numpy as np
 from nltk.corpus import stopwords
 from gensim.models import Word2Vec
+from sklearn.cluster import KMeans
 from sklearn.preprocessing import normalize
-from sklearn.model_selection import train_test_split
-from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import accuracy_score
 
 
 class SentimentAnalysisPreprocessor:
 
-    def __init__(self, input_file, output_file):
+    def __init__(self, input_file, output_file, num_clusters=3):
         self.input_file = input_file
         self.output_file = output_file
+        self.num_clusters = num_clusters
 
     @staticmethod
     def clean_text(text):
@@ -34,7 +33,7 @@ def average_vector(words, word_vectors):
 
     def preprocess(self):
         # Load the collected data
-        df = pd.read_csv(self.input_file)
+        df = pd.read_csv(self.input_file, sep='\t')
 
         # Perform data cleaning and preprocessing
         df['cleaned_title'] = df['title'].apply(lambda x: self.clean_text(x))
@@ -49,44 +48,18 @@ def preprocess(self):
         word_vectors = model.wv
 
         # Get the average vector for each title
+        # df['avg_vector'] = df['tokenized_title'].apply(lambda x: self.average_vector(x, word_vectors))
         df['avg_vector'] = df['tokenized_title'].apply(
             lambda x: self.average_vector(x, word_vectors)
             .tolist() if self.average_vector(x, word_vectors) is not None else None)
         df = df.dropna(subset=['avg_vector'])
 
-        # Manually label a portion of the data for training the classifier
-        # Add a new column 'manual_label' to the DataFrame and set it to None
-        df['manual_label'] = None
+        # Apply K-Means clustering to the embeddings
+        X = np.vstack(df['avg_vector'].values)
+        kmeans = KMeans(n_clusters=self.num_clusters, random_state=0).fit(X)
 
-        # Manually label some examples (replace the index and label with appropriate values)
-        # 0 - negative, 1 - neutral, 2 - positive
-        df.at[0, 'manual_label'] = 2
-        df.at[5, 'manual_label'] = 0
-        # ...
-
-        # Split the manually labeled data into training and testing sets
-        labeled_data = df.dropna(subset=['manual_label'])
-        X_train, X_test, y_train, y_test = train_test_split(
-            np.vstack(labeled_data['avg_vector'].values),
-            labeled_data['manual_label'].values.astype(int),
-            test_size=0.3,
-            random_state=42)
-
-        # Train a classifier (e.g., logistic regression) on
-        clf = LogisticRegression(random_state=42)
-        clf.fit(X_train, y_train)
-
-        # Evaluate the classifier on the test set
-        y_pred = clf.predict(X_test)
-        print(f"Classifier accuracy: {accuracy_score(y_test, y_pred)}")
-
-        # Assign sentiment labels to the remaining data using the trained classifier
-        unlabeled_data = df[df['manual_label'].isnull()]
-        unlabeled_data['predicted_label'] = clf.predict(np.vstack(unlabeled_data['avg_vector'].values))
-
-        # Combine the manually labeled and predicted data
-        labeled_data['predicted_label'] = labeled_data['manual_label']
-        result_df = pd.concat([labeled_data, unlabeled_data], ignore_index=True)
+        # Assign sentiment labels to the clusters
+        df['cluster'] = kmeans.labels_
 
         # Save the preprocessed data
-        result_df.to_csv(self.output_file, index=False, lineterminator='\n', float_format='%.8f', header=True)
+        df.to_csv(self.output_file, index=False, lineterminator='\n', float_format='%.8f', header=True, sep='\t')
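
Review note: this commit swaps the manually labeled LogisticRegression step for unsupervised K-Means, but the resulting `cluster` column holds arbitrary cluster IDs, not ordered sentiment, so the comment "Assign sentiment labels to the clusters" overstates what `kmeans.labels_` provides. (The retained `avg_vector` lambda also calls `average_vector` twice per row.) A sketch of one way to turn cluster IDs into sentiment labels, assuming a hand-inspected mapping that is not in the commit:

```python
import numpy as np
from sklearn.cluster import KMeans

# Stand-in for the averaged Word2Vec title vectors.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 8))

kmeans = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X)

# Hypothetical mapping, chosen after reading sample titles per cluster;
# cluster IDs carry no inherent negative/neutral/positive ordering.
cluster_to_sentiment = {0: 'negative', 1: 'neutral', 2: 'positive'}
labels = [cluster_to_sentiment[c] for c in kmeans.labels_]
```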

sentiment_analysis/etap1_reddit_scrapper.py (+7 -4)

@@ -1,3 +1,4 @@
+import re
 import praw
 import pandas as pd
 import logging
@@ -16,13 +17,15 @@ def __init__(self, client_id, client_secret, user_agent, subreddits, num_posts=1
         self.subreddits = subreddits
         self.num_posts = num_posts
 
-    def scrape(self, output_file='csv_files/reddit_posts.csv'):
+    def scrape(self, output_file):
         posts = []
         i, j = 0, 0
 
         for subreddit in self.subreddits:
             i += 1
-            for submission in self.reddit.subreddit(subreddit).top(limit=self.num_posts, time_filter='all'):
+            # for submission in self.reddit.subreddit('popular').search(query='', limit=self.num_posts):
+            for submission in self.reddit.subreddit('popular').top(limit=self.num_posts, time_filter='all'):
+            # for submission in self.reddit.subreddit(subreddit).top(limit=self.num_posts, time_filter='all'):
                 if submission.is_self:
                     posts.append({
                         'author': submission.author.name if submission.author else '',
@@ -33,10 +36,10 @@ def scrape(self, output_file='csv_files/reddit_posts.csv'):
                         'num_comments': submission.num_comments,
                         'created': pd.to_datetime(submission.created_utc, unit='s'),
                         'subreddit': subreddit,
-                        'selftext': submission.selftext
+                        'selftext': re.sub(r'["\n\t]', ' ', submission.selftext)
                     })
                     j += 1
        print(f"INFO: Parsed i: {i} and j: {j}")
 
        df = pd.DataFrame(posts)
-       df.to_csv(output_file, index=False)
+       df.to_csv(output_file, index=False, sep='\t')
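
Review note: with the uncommented line as committed, the loop queries r/popular once per entry in `self.subreddits`, so the same posts are fetched repeatedly and the `subreddit` column records the loop variable rather than each post's real origin. A sketch of a variant that fetches r/popular once and tags each post with its actual subreddit (`scrape_popular` is a hypothetical helper, not in the repo):

```python
import re
import praw

def scrape_popular(reddit: praw.Reddit, num_posts: int) -> list:
    """Fetch top self-posts from r/popular once, tagging each with its real subreddit."""
    posts = []
    for submission in reddit.subreddit('popular').top(limit=num_posts, time_filter='all'):
        if submission.is_self:
            posts.append({
                'author': submission.author.name if submission.author else '',
                'subreddit': submission.subreddit.display_name,  # actual origin
                'selftext': re.sub(r'["\n\t]', ' ', submission.selftext),
            })
    return posts
```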

sentiment_analysis/etap2_sentiment_analysis_classic_ml.py (+1 -1)

@@ -10,7 +10,7 @@
 from sklearn.preprocessing import label_binarize
 
 # Load preprocessed data
-df = pd.read_csv('../csv_files/preprocessed_data_1504.csv')
+df = pd.read_csv('../csv_files/02_preprocessed_data/preprocessed_data.tsv', sep='\t')
 df['avg_vector'] = df['avg_vector'].apply(lambda x: ast.literal_eval(x))
 
 # Split data into training and testing sets
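
For reference, the round-trip these scripts rely on: `avg_vector` is written to the TSV as the `str()` of a Python list (see the `.tolist()` call in the preprocessor), and `ast.literal_eval` parses it back. A minimal illustration with a hypothetical cell value:

```python
import ast

# Hypothetical avg_vector cell as it would appear in the TSV.
serialized = '[0.12345678, -0.87654321, 0.00000001]'
vector = ast.literal_eval(serialized)  # back to a list of floats
assert isinstance(vector, list) and all(isinstance(v, float) for v in vector)
```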

sentiment_analysis/etap3_sentiment_analysis_lstm_model.py (+1 -1)

@@ -7,7 +7,7 @@
 from tensorflow.keras.utils import to_categorical
 
 # Load preprocessed data
-df = pd.read_csv('../csv_files/preprocessed_data_1504.csv')
+df = pd.read_csv('../csv_files/02_preprocessed_data/preprocessed_data.tsv', sep='\t')
 df['avg_vector'] = df['avg_vector'].apply(lambda x: ast.literal_eval(x))
 
 # Split data into training and testing sets

sentiment_analysis/etap4.py (+2 -2)

@@ -7,7 +7,7 @@
 from transformers import get_linear_schedule_with_warmup
 
 # Load preprocessed data
-df = pd.read_csv('../csv_files/preprocessed_data_1504.csv')
+df = pd.read_csv('../csv_files/02_preprocessed_data/preprocessed_data.tsv', sep='\t')
 df['cleaned_title'] = df['cleaned_title'].astype(str)
 
 # Tokenize the text
@@ -61,7 +61,7 @@
     scheduler.step()
 
 # Evaluate the fine-tuned model on the test set
-test_df = pd.read_csv('../csv_files/reddit_posts_1504.csv')
+test_df = pd.read_csv('../csv_files/01_reddit_posts/reddit_posts_combined.tsv', sep='\t')
 test_df['cleaned_title'] = test_df['cleaned_title'].astype(str)
 test_inputs = torch.tensor(
     [tokenizer.encode(text, add_special_tokens=True, padding='max_length', max_length=max_len, truncation=True) for text
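
Review note: `reddit_posts_combined.tsv` is produced by the scraper and, as far as these diffs show, has no `cleaned_title` column; that column is only added by `SentimentAnalysisPreprocessor`, so `test_df['cleaned_title']` here looks like it would raise a KeyError. A guarded load (an editorial sketch, not in the commit) would surface the mismatch early:

```python
import pandas as pd

test_df = pd.read_csv('../csv_files/01_reddit_posts/reddit_posts_combined.tsv', sep='\t')
if 'cleaned_title' not in test_df.columns:
    # The raw scrape has title/selftext etc., but cleaning happens in the
    # preprocessing step; point this script at the preprocessed TSV instead.
    raise KeyError("expected 'cleaned_title'; run SentimentAnalysisPreprocessor first")
```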
