# -*- coding: utf-8 -*-
"""Sentiment Analysis Using Natural Language Processing(NLTK).ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1UDUw22JjPLFNN2aP3i-trq0xamYomOjC
"""
import csv
import re

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier  # base estimator used by AdaBoost below
from sklearn.metrics import classification_report

# Download the NLTK resources used by the preprocessing step
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # needed by the WordNet lemmatizer in newer NLTK releases
# Define a function to preprocess text
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stops = set(stopwords.words("english"))  # English stopwords
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtag symbols
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    words = text.lower().split()  # Lowercase and split into words
    words = [lemmatizer.lemmatize(w) for w in words if w not in stops]  # Lemmatize and drop stopwords
    return " ".join(words)
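# Illustrative example (assumed input, not from the original script):
#   preprocess_text("Loved the demo at http://example.com #NLP @friend!!!")
# should return roughly "loved demo nlp": the URL, mention, hashtag symbol, punctuation
# and stopwords are stripped, and the default noun lemmatization leaves "loved" unchanged.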
# Manually handle CSV reading so rows with an unexpected shape are skipped
def clean_csv(file_path):
    data = []
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        reader = csv.reader(file)
        for row in reader:
            # Only accept rows that have the expected number of columns
            if len(row) == 6:
                data.append([row[0], row[5]])  # Sentiment label is in the first column, tweet text in the sixth
    return data
# Load and preprocess the data
data = clean_csv('/content/training.1600000.processed.noemoticon.csv')
df = pd.DataFrame(data, columns=['sentiment', 'text'])
df['sentiment'] = df['sentiment'].replace('4', '1')  # Sentiment140 uses 0 = negative, 4 = positive; map to binary 0/1
df['text'] = df['text'].apply(preprocess_text)  # Apply preprocessing
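# Optional sanity check (illustrative, not in the original script): the mapped labels
# should be split between '0' (negative) and '1' (positive).
# print(df['sentiment'].value_counts())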
# Feature Extraction
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['text'])
y = df['sentiment'].astype(int)
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
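# Optional inspection (illustrative): peek at the learned TF-IDF vocabulary and the split sizes.
# get_feature_names_out() is the scikit-learn >= 1.0 name; older releases use get_feature_names().
# print(vectorizer.get_feature_names_out()[:10])
# print(X_train.shape, X_test.shape)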
# Train models
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)
gbm.fit(X_train, y_train)
ada.fit(X_train, y_train)
# Evaluate models
gbm_preds = gbm.predict(X_test)
ada_preds = ada.predict(X_test)
print("Gradient Boosting Machine Performance:")
print(classification_report(y_test, gbm_preds))
print("AdaBoost Performance:")
print(classification_report(y_test, ada_preds))
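# Minimal scoring sketch (illustrative; the sample tweet is made up): new text must go through
# the same preprocessing and the fitted vectorizer before it can be classified.
sample = preprocess_text("Loving the new update, it works great!")
sample_vec = vectorizer.transform([sample])
print("Predicted label (0 = negative, 1 = positive after the mapping above):", gbm.predict(sample_vec)[0])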
# Quick look at the raw CSV: the file has no header row and is ISO-8859-1 encoded,
# so pass header=None, an explicit encoding, and column names following the usual Sentiment140 layout
nRowsRead = 1000
df1 = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', delimiter=',', header=None,
                  nrows=nRowsRead, encoding='ISO-8859-1',
                  names=['sentiment', 'id', 'date', 'flag', 'user', 'text'])
df1.dataframeName = 'training.1600000.processed.noemoticon.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')
print(df1.head(5))