
Commit dfdfb6e

first commit
0 parents  commit dfdfb6e

11 files changed: 1,070 additions, 0 deletions

.gitignore (+447 lines)

Large diffs are not rendered by default.

README.md

Whitespace-only changes.

aaa.py (+100 lines)

@@ -0,0 +1,100 @@
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# Load preprocessed data
df = pd.read_csv('csv_files/preprocessed_data.csv')
df['cleaned_title'] = df['cleaned_title'].astype(str)

# Tokenize the text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True) for text in df['cleaned_title']]
max_len = max([len(txt) for txt in tokenized_texts])
input_ids = torch.tensor(
    [tokenizer.encode(text, add_special_tokens=True, padding='max_length', max_length=max_len, truncation=True)
     for text in df['cleaned_title']])

# Create attention masks (1.0 for real tokens, 0.0 for [PAD] tokens, which have id 0)
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
attention_masks = torch.tensor(attention_masks)

# Create data loaders
batch_size = 32
dataset = TensorDataset(input_ids, attention_masks, torch.tensor(df['cluster'].values))
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3, output_attentions=False,
                                                      output_hidden_states=False)

# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.train()  # enable dropout for fine-tuning; from_pretrained() leaves the model in eval mode
for epoch in range(epochs):
    for batch in train_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
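
# --- Illustrative sketch, not part of the committed aaa.py: validation_dataloader is created
# --- above but never consumed in this commit. One possible held-out accuracy check after
# --- training, reusing only variables already defined in this file, could look like this:
model.eval()
val_correct, val_total = 0, 0
with torch.no_grad():
    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        logits = model(b_input_ids, attention_mask=b_input_mask)[0]
        val_correct += (logits.argmax(dim=1) == b_labels).sum().item()
        val_total += b_labels.size(0)
print("Validation accuracy: {:.4f}".format(val_correct / val_total))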

# Evaluate the fine-tuned model on the test set
test_df = pd.read_csv('csv_files/reddit_posts_0104.csv')
test_df['cleaned_title'] = test_df['cleaned_title'].astype(str)
test_inputs = torch.tensor(
    [tokenizer.encode(text, add_special_tokens=True, padding='max_length', max_length=max_len, truncation=True)
     for text in test_df['cleaned_title']])
test_masks = []
for seq in test_inputs:
    seq_mask = [float(i > 0) for i in seq]
    test_masks.append(seq_mask)
test_masks = torch.tensor(test_masks)
test_labels = torch.tensor(test_df['cluster'].values)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

model.eval()

test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


for step, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs[0].detach().cpu().numpy()  # move logits to CPU NumPy before flat_accuracy
    label_ids = b_labels.cpu().numpy()
    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    nb_test_steps += 1

print("Test Accuracy: {}".format(test_accuracy / nb_test_steps))

main.py (+30 lines)

@@ -0,0 +1,30 @@
import os

from dotenv import load_dotenv

from sentiment_analysis.etap1_reddit_scrapper import RedditScraper
from sentiment_analysis.etap1_1_sentiment_analysis_preprocessing import SentimentAnalysisPreprocessor

if __name__ == '__main__':
    load_dotenv()

    client_id = os.getenv('CLIENT_ID')
    client_secret = os.getenv('CLIENT_SECRET')
    user_agent = os.getenv('USER_AGENT')

    subreddits = [
        'worldnews', 'news', 'poland', 'ukraine', 'economics',
        'geopolitics', 'europe', 'finance', 'business', 'technology',
        'environment', 'science', 'globalhealth', 'energy', 'internationalpolitics',
        'cybersecurity', 'education', 'humanrights', 'globaldevelopment'
    ]

    scraper = RedditScraper(client_id, client_secret, user_agent, subreddits, num_posts=1000)
    scraper.scrape(output_file="csv_files/reddit_posts_20230503.csv")

    # preprocessor = SentimentAnalysisPreprocessor(
    #     input_file='csv_files/reddit_posts_20230503.csv',
    #     output_file='csv_files/preprocessed_data_20230503.csv',
    #     num_clusters=3
    # )
    # preprocessor.preprocess()
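
main.py reads CLIENT_ID, CLIENT_SECRET, and USER_AGENT from the environment via python-dotenv, so a local .env file (not committed; placeholder values only) would typically look like:

CLIENT_ID=your_reddit_client_id
CLIENT_SECRET=your_reddit_client_secret
USER_AGENT=script:sentiment_analysis:v0.1 (by /u/your_username)

The RedditScraper class itself (sentiment_analysis/etap1_reddit_scrapper.py) does not appear in the section above, so the following is only a hypothetical sketch of the interface main.py calls, written here with praw (pinned in requirements.txt); the column names and the use of subreddit.hot() are assumptions, not the repository's actual code:

import praw
import pandas as pd


class RedditScraper:
    def __init__(self, client_id, client_secret, user_agent, subreddits, num_posts=1000):
        self.reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)
        self.subreddits = subreddits
        self.num_posts = num_posts

    def scrape(self, output_file):
        rows = []
        for name in self.subreddits:
            # hypothetical choice: pull the current "hot" posts from each subreddit
            for post in self.reddit.subreddit(name).hot(limit=self.num_posts):
                rows.append({'subreddit': name, 'title': post.title, 'created_utc': post.created_utc})
        pd.DataFrame(rows).to_csv(output_file, index=False)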

requirements.txt (+169 lines)

@@ -0,0 +1,169 @@
absl-py==1.4.0
anyio==3.6.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
astunparse==1.6.3
attrs==22.2.0
backcall==0.2.0
beautifulsoup4==4.12.0
bleach==6.0.0
cachetools==5.3.0
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.3
cmake==3.26.1
comm==0.1.3
contourpy==1.0.7
cycler==0.11.0
debugpy==1.6.6
decorator==5.1.1
defusedxml==0.7.1
executing==1.2.0
fastjsonschema==2.16.3
filelock==3.10.7
flatbuffers==23.3.3
fonttools==4.39.3
fqdn==1.5.1
gast==0.4.0
gensim==4.3.1
google-auth==2.17.1
google-auth-oauthlib==1.0.0
google-pasta==0.2.0
grpcio==1.53.0
h5py==3.8.0
huggingface-hub==0.13.3
idna==3.4
importlib-metadata==6.1.0
importlib-resources==5.12.0
ipykernel==6.22.0
ipython==8.12.0
ipython-genutils==0.2.0
ipywidgets==8.0.6
isoduration==20.11.0
jax==0.4.8
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jsonpointer==2.3
jsonschema==4.17.3
jupyter==1.0.0
jupyter-console==6.6.3
jupyter-events==0.6.3
jupyter_client==8.1.0
jupyter_core==5.3.0
jupyter_server==2.5.0
jupyter_server_terminals==0.4.4
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.7
keras==2.12.0
kiwisolver==1.4.4
libclang==16.0.0
lit==16.0.0
Markdown==3.4.3
MarkupSafe==2.1.2
matplotlib==3.7.1
matplotlib-inline==0.1.6
mistune==2.0.5
ml-dtypes==0.0.4
mpmath==1.3.0
nbclassic==0.5.4
nbclient==0.7.2
nbconvert==7.2.10
nbformat==5.8.0
nest-asyncio==1.5.6
networkx==3.0
nltk==3.8.1
notebook==6.5.3
notebook_shim==0.2.2
numpy==1.23.5
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
oauthlib==3.2.2
opt-einsum==3.3.0
packaging==23.0
pandas==1.5.3
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.5.0
pkgutil_resolve_name==1.3.10
platformdirs==3.2.0
praw==7.7.0
prawcore==2.3.0
prometheus-client==0.16.0
prompt-toolkit==3.0.38
protobuf==4.22.1
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.21
Pygments==2.14.0
pyparsing==3.0.9
pyrsistent==0.19.3
python-dateutil==2.8.2
python-dotenv==1.0.0
python-json-logger==2.0.7
pytz==2023.3
PyYAML==6.0
pyzmq==25.0.2
qtconsole==5.4.1
QtPy==2.3.1
regex==2023.3.23
requests==2.28.2
requests-oauthlib==1.3.1
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rsa==4.9
scikit-learn==1.2.2
scipy==1.10.1
Send2Trash==1.8.0
six==1.16.0
smart-open==6.3.0
sniffio==1.3.0
soupsieve==2.4
stack-data==0.6.2
sympy==1.11.1
tensorboard==2.12.1
tensorboard-data-server==0.7.0
tensorboard-plugin-wit==1.8.1
tensorflow==2.12.0
tensorflow-estimator==2.12.0
tensorflow-io-gcs-filesystem==0.32.0
termcolor==2.2.0
terminado==0.17.1
threadpoolctl==3.1.0
tinycss2==1.2.1
tokenizers==0.13.2
torch==2.0.0
tornado==6.2
tqdm==4.65.0
traitlets==5.9.0
transformers==4.27.4
triton==2.0.0
typing_extensions==4.5.0
update-checker==0.18.0
uri-template==1.2.0
urllib3==1.26.15
wcwidth==0.2.6
webcolors==1.13
webencodings==0.5.1
websocket-client==1.5.1
Werkzeug==2.2.3
widgetsnbextension==4.0.7
wrapt==1.14.1
zipp==3.15.0

sentiment_analysis/__init__.py

Whitespace-only changes.

sentiment_analysis/etap1_1_sentiment_analysis_preprocessing.py (+92 lines)

@@ -0,0 +1,92 @@
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

nltk.download('stopwords', quiet=True)  # make sure the English stopword list is available


class SentimentAnalysisPreprocessor:

    def __init__(self, input_file, output_file):
        self.input_file = input_file
        self.output_file = output_file

    @staticmethod
    def clean_text(text):
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\@\w+|\#', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
        return text

    @staticmethod
    def average_vector(words, word_vectors):
        word_vectors = [word_vectors[word] for word in words if word in word_vectors]
        if len(word_vectors) == 0:
            return None
        return normalize(sum(word_vectors).reshape(1, -1))

    def preprocess(self):
        # Load the collected data
        df = pd.read_csv(self.input_file)

        # Perform data cleaning and preprocessing
        df['cleaned_title'] = df['title'].apply(lambda x: self.clean_text(x))

        # Tokenize text
        df['tokenized_title'] = df['cleaned_title'].apply(lambda x: x.split())

        # Create word embeddings using a pre-trained model (e.g., Word2Vec)
        model = Word2Vec(df['tokenized_title'].tolist(), vector_size=100, window=5, min_count=1, workers=4)

        # Get the vector representation of each word in the titles
        word_vectors = model.wv

        # Get the average vector for each title
        df['avg_vector'] = df['tokenized_title'].apply(
            lambda x: self.average_vector(x, word_vectors).tolist()
            if self.average_vector(x, word_vectors) is not None else None)
        df = df.dropna(subset=['avg_vector'])

        # Manually label a portion of the data for training the classifier
        # Add a new column 'manual_label' to the DataFrame and set it to None
        df['manual_label'] = None

        # Manually label some examples (replace the index and label with appropriate values)
        # 0 - negative, 1 - neutral, 2 - positive
        df.at[0, 'manual_label'] = 2
        df.at[5, 'manual_label'] = 0
        # ...

        # Split the manually labeled data into training and testing sets
        labeled_data = df.dropna(subset=['manual_label'])
        X_train, X_test, y_train, y_test = train_test_split(
            np.vstack(labeled_data['avg_vector'].values),
            labeled_data['manual_label'].values.astype(int),
            test_size=0.3,
            random_state=42)

        # Train a classifier (e.g., logistic regression) on the labeled vectors
        clf = LogisticRegression(random_state=42)
        clf.fit(X_train, y_train)

        # Evaluate the classifier on the test set
        y_pred = clf.predict(X_test)
        print(f"Classifier accuracy: {accuracy_score(y_test, y_pred)}")

        # Assign sentiment labels to the remaining data using the trained classifier
        unlabeled_data = df[df['manual_label'].isnull()].copy()  # .copy() avoids SettingWithCopyWarning
        unlabeled_data['predicted_label'] = clf.predict(np.vstack(unlabeled_data['avg_vector'].values))

        # Combine the manually labeled and predicted data
        labeled_data['predicted_label'] = labeled_data['manual_label']
        result_df = pd.concat([labeled_data, unlabeled_data], ignore_index=True)

        # Save the preprocessed data
        result_df.to_csv(self.output_file, index=False, lineterminator='\n', float_format='%.8f', header=True)
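
For context (not part of the commit): the commented-out block in main.py shows how this class is meant to be driven. A usage sketch matching the __init__ signature defined here (which, unlike that commented call, takes no num_clusters argument) and assuming the module path implied by the import in main.py:

from sentiment_analysis.etap1_1_sentiment_analysis_preprocessing import SentimentAnalysisPreprocessor

preprocessor = SentimentAnalysisPreprocessor(
    input_file='csv_files/reddit_posts_20230503.csv',
    output_file='csv_files/preprocessed_data_20230503.csv',
)
preprocessor.preprocess()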
