
Commit dfdfb6e

first commit
0 parents  commit dfdfb6e

11 files changed: 1,070 additions, 0 deletions

.gitignore (+447 lines)

Large diffs are not rendered by default.

README.md

Whitespace-only changes.

aaa.py (+100 lines)

@@ -0,0 +1,100 @@
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup

# Load preprocessed data
df = pd.read_csv('csv_files/preprocessed_data.csv')
df['cleaned_title'] = df['cleaned_title'].astype(str)

# Tokenize the text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True) for text in df['cleaned_title']]
max_len = max([len(txt) for txt in tokenized_texts])
input_ids = torch.tensor(
    [tokenizer.encode(text, add_special_tokens=True, padding='max_length', max_length=max_len, truncation=True)
     for text in df['cleaned_title']])

# Create attention masks (1.0 for real tokens, 0.0 for [PAD] tokens, which have id 0)
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
attention_masks = torch.tensor(attention_masks)

# Create data loaders
batch_size = 32
dataset = TensorDataset(input_ids, attention_masks, torch.tensor(df['cluster'].values))
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

# Load pre-trained model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3, output_attentions=False,
                                                      output_hidden_states=False)

# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.train()  # enable dropout for fine-tuning; from_pretrained() leaves the model in eval mode
for epoch in range(epochs):
    for batch in train_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
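
# --- Illustrative sketch, not part of the committed aaa.py: validation_dataloader is created
# --- above but never consumed in this commit. One possible held-out accuracy check after
# --- training, reusing only variables already defined in this file, could look like this:
model.eval()
val_correct, val_total = 0, 0
with torch.no_grad():
    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
        logits = model(b_input_ids, attention_mask=b_input_mask)[0]
        val_correct += (logits.argmax(dim=1) == b_labels).sum().item()
        val_total += b_labels.size(0)
print("Validation accuracy: {:.4f}".format(val_correct / val_total))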

# Evaluate the fine-tuned model on the test set
test_df = pd.read_csv('csv_files/reddit_posts_0104.csv')
test_df['cleaned_title'] = test_df['cleaned_title'].astype(str)
test_inputs = torch.tensor(
    [tokenizer.encode(text, add_special_tokens=True, padding='max_length', max_length=max_len, truncation=True)
     for text in test_df['cleaned_title']])
test_masks = []
for seq in test_inputs:
    seq_mask = [float(i > 0) for i in seq]
    test_masks.append(seq_mask)
test_masks = torch.tensor(test_masks)
test_labels = torch.tensor(test_df['cluster'].values)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

model.eval()

test_loss, test_accuracy = 0, 0
nb_test_steps, nb_test_examples = 0, 0


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


for step, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs[0].detach().cpu().numpy()  # move logits to CPU NumPy before flat_accuracy
    label_ids = b_labels.cpu().numpy()
    tmp_test_accuracy = flat_accuracy(logits, label_ids)
    test_accuracy += tmp_test_accuracy
    nb_test_steps += 1

print("Test Accuracy: {}".format(test_accuracy / nb_test_steps))

main.py (+30 lines)

@@ -0,0 +1,30 @@
import os

from dotenv import load_dotenv

from sentiment_analysis.etap1_reddit_scrapper import RedditScraper
from sentiment_analysis.etap1_1_sentiment_analysis_preprocessing import SentimentAnalysisPreprocessor

if __name__ == '__main__':
    load_dotenv()

    client_id = os.getenv('CLIENT_ID')
    client_secret = os.getenv('CLIENT_SECRET')
    user_agent = os.getenv('USER_AGENT')

    subreddits = [
        'worldnews', 'news', 'poland', 'ukraine', 'economics',
        'geopolitics', 'europe', 'finance', 'business', 'technology',
        'environment', 'science', 'globalhealth', 'energy', 'internationalpolitics',
        'cybersecurity', 'education', 'humanrights', 'globaldevelopment'
    ]

    scraper = RedditScraper(client_id, client_secret, user_agent, subreddits, num_posts=1000)
    scraper.scrape(output_file="csv_files/reddit_posts_20230503.csv")

    # preprocessor = SentimentAnalysisPreprocessor(
    #     input_file='csv_files/reddit_posts_20230503.csv',
    #     output_file='csv_files/preprocessed_data_20230503.csv',
    #     num_clusters=3
    # )
    # preprocessor.preprocess()
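
main.py reads CLIENT_ID, CLIENT_SECRET, and USER_AGENT from the environment via python-dotenv, so a local .env file (not committed; placeholder values only) would typically look like:

CLIENT_ID=your_reddit_client_id
CLIENT_SECRET=your_reddit_client_secret
USER_AGENT=script:sentiment_analysis:v0.1 (by /u/your_username)

The RedditScraper class itself (sentiment_analysis/etap1_reddit_scrapper.py) does not appear in the section above, so the following is only a hypothetical sketch of the interface main.py calls, written here with praw (pinned in requirements.txt); the column names and the use of subreddit.hot() are assumptions, not the repository's actual code:

import praw
import pandas as pd


class RedditScraper:
    def __init__(self, client_id, client_secret, user_agent, subreddits, num_posts=1000):
        self.reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)
        self.subreddits = subreddits
        self.num_posts = num_posts

    def scrape(self, output_file):
        rows = []
        for name in self.subreddits:
            # hypothetical choice: pull the current "hot" posts from each subreddit
            for post in self.reddit.subreddit(name).hot(limit=self.num_posts):
                rows.append({'subreddit': name, 'title': post.title, 'created_utc': post.created_utc})
        pd.DataFrame(rows).to_csv(output_file, index=False)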

requirements.txt (+169 lines)

@@ -0,0 +1,169 @@
absl-py==1.4.0
anyio==3.6.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
astunparse==1.6.3
attrs==22.2.0
backcall==0.2.0
beautifulsoup4==4.12.0
bleach==6.0.0
cachetools==5.3.0
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.1.0
click==8.1.3
cmake==3.26.1
comm==0.1.3
contourpy==1.0.7
cycler==0.11.0
debugpy==1.6.6
decorator==5.1.1
defusedxml==0.7.1
executing==1.2.0
fastjsonschema==2.16.3
filelock==3.10.7
flatbuffers==23.3.3
fonttools==4.39.3
fqdn==1.5.1
gast==0.4.0
gensim==4.3.1
google-auth==2.17.1
google-auth-oauthlib==1.0.0
google-pasta==0.2.0
grpcio==1.53.0
h5py==3.8.0
huggingface-hub==0.13.3
idna==3.4
importlib-metadata==6.1.0
importlib-resources==5.12.0
ipykernel==6.22.0
ipython==8.12.0
ipython-genutils==0.2.0
ipywidgets==8.0.6
isoduration==20.11.0
jax==0.4.8
jedi==0.18.2
Jinja2==3.1.2
joblib==1.2.0
jsonpointer==2.3
jsonschema==4.17.3
jupyter==1.0.0
jupyter-console==6.6.3
jupyter-events==0.6.3
jupyter_client==8.1.0
jupyter_core==5.3.0
jupyter_server==2.5.0
jupyter_server_terminals==0.4.4
jupyterlab-pygments==0.2.2
jupyterlab-widgets==3.0.7
keras==2.12.0
kiwisolver==1.4.4
libclang==16.0.0
lit==16.0.0
Markdown==3.4.3
MarkupSafe==2.1.2
matplotlib==3.7.1
matplotlib-inline==0.1.6
mistune==2.0.5
ml-dtypes==0.0.4
mpmath==1.3.0
nbclassic==0.5.4
nbclient==0.7.2
nbconvert==7.2.10
nbformat==5.8.0
nest-asyncio==1.5.6
networkx==3.0
nltk==3.8.1
notebook==6.5.3
notebook_shim==0.2.2
numpy==1.23.5
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
oauthlib==3.2.2
opt-einsum==3.3.0
packaging==23.0
pandas==1.5.3
pandocfilters==1.5.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==9.5.0
pkgutil_resolve_name==1.3.10
platformdirs==3.2.0
praw==7.7.0
prawcore==2.3.0
prometheus-client==0.16.0
prompt-toolkit==3.0.38
protobuf==4.22.1
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.21
Pygments==2.14.0
pyparsing==3.0.9
pyrsistent==0.19.3
python-dateutil==2.8.2
python-dotenv==1.0.0
python-json-logger==2.0.7
pytz==2023.3
PyYAML==6.0
pyzmq==25.0.2
qtconsole==5.4.1
QtPy==2.3.1
regex==2023.3.23
requests==2.28.2
requests-oauthlib==1.3.1
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rsa==4.9
scikit-learn==1.2.2
scipy==1.10.1
Send2Trash==1.8.0
six==1.16.0
smart-open==6.3.0
sniffio==1.3.0
soupsieve==2.4
stack-data==0.6.2
sympy==1.11.1
tensorboard==2.12.1
tensorboard-data-server==0.7.0
tensorboard-plugin-wit==1.8.1
tensorflow==2.12.0
tensorflow-estimator==2.12.0
tensorflow-io-gcs-filesystem==0.32.0
termcolor==2.2.0
terminado==0.17.1
threadpoolctl==3.1.0
tinycss2==1.2.1
tokenizers==0.13.2
torch==2.0.0
tornado==6.2
tqdm==4.65.0
traitlets==5.9.0
transformers==4.27.4
triton==2.0.0
typing_extensions==4.5.0
update-checker==0.18.0
uri-template==1.2.0
urllib3==1.26.15
wcwidth==0.2.6
webcolors==1.13
webencodings==0.5.1
websocket-client==1.5.1
Werkzeug==2.2.3
widgetsnbextension==4.0.7
wrapt==1.14.1
zipp==3.15.0

sentiment_analysis/__init__.py

Whitespace-only changes.

sentiment_analysis/etap1_1_sentiment_analysis_preprocessing.py (+92 lines)

@@ -0,0 +1,92 @@
import pandas as pd
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

nltk.download('stopwords', quiet=True)  # make sure the English stopword list is available


class SentimentAnalysisPreprocessor:

    def __init__(self, input_file, output_file):
        self.input_file = input_file
        self.output_file = output_file

    @staticmethod
    def clean_text(text):
        text = text.lower()
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        text = re.sub(r'\@\w+|\#', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
        return text

    @staticmethod
    def average_vector(words, word_vectors):
        word_vectors = [word_vectors[word] for word in words if word in word_vectors]
        if len(word_vectors) == 0:
            return None
        return normalize(sum(word_vectors).reshape(1, -1))

    def preprocess(self):
        # Load the collected data
        df = pd.read_csv(self.input_file)

        # Perform data cleaning and preprocessing
        df['cleaned_title'] = df['title'].apply(lambda x: self.clean_text(x))

        # Tokenize text
        df['tokenized_title'] = df['cleaned_title'].apply(lambda x: x.split())

        # Create word embeddings using a pre-trained model (e.g., Word2Vec)
        model = Word2Vec(df['tokenized_title'].tolist(), vector_size=100, window=5, min_count=1, workers=4)

        # Get the vector representation of each word in the titles
        word_vectors = model.wv

        # Get the average vector for each title
        df['avg_vector'] = df['tokenized_title'].apply(
            lambda x: self.average_vector(x, word_vectors).tolist()
            if self.average_vector(x, word_vectors) is not None else None)
        df = df.dropna(subset=['avg_vector'])

        # Manually label a portion of the data for training the classifier
        # Add a new column 'manual_label' to the DataFrame and set it to None
        df['manual_label'] = None

        # Manually label some examples (replace the index and label with appropriate values)
        # 0 - negative, 1 - neutral, 2 - positive
        df.at[0, 'manual_label'] = 2
        df.at[5, 'manual_label'] = 0
        # ...

        # Split the manually labeled data into training and testing sets
        labeled_data = df.dropna(subset=['manual_label'])
        X_train, X_test, y_train, y_test = train_test_split(
            np.vstack(labeled_data['avg_vector'].values),
            labeled_data['manual_label'].values.astype(int),
            test_size=0.3,
            random_state=42)

        # Train a classifier (e.g., logistic regression) on the labeled vectors
        clf = LogisticRegression(random_state=42)
        clf.fit(X_train, y_train)

        # Evaluate the classifier on the test set
        y_pred = clf.predict(X_test)
        print(f"Classifier accuracy: {accuracy_score(y_test, y_pred)}")

        # Assign sentiment labels to the remaining data using the trained classifier
        unlabeled_data = df[df['manual_label'].isnull()].copy()  # .copy() avoids SettingWithCopyWarning
        unlabeled_data['predicted_label'] = clf.predict(np.vstack(unlabeled_data['avg_vector'].values))

        # Combine the manually labeled and predicted data
        labeled_data['predicted_label'] = labeled_data['manual_label']
        result_df = pd.concat([labeled_data, unlabeled_data], ignore_index=True)

        # Save the preprocessed data
        result_df.to_csv(self.output_file, index=False, lineterminator='\n', float_format='%.8f', header=True)
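
For context (not part of the commit): the commented-out block in main.py shows how this class is meant to be driven. A usage sketch matching the __init__ signature defined here (which, unlike that commented call, takes no num_clusters argument) and assuming the module path implied by the import in main.py:

from sentiment_analysis.etap1_1_sentiment_analysis_preprocessing import SentimentAnalysisPreprocessor

preprocessor = SentimentAnalysisPreprocessor(
    input_file='csv_files/reddit_posts_20230503.csv',
    output_file='csv_files/preprocessed_data_20230503.csv',
)
preprocessor.preprocess()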
