# -*- coding: utf-8 -*-
"""Sentiment Analysis Using Natural Language Processing(NLTK).ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1UDUw22JjPLFNN2aP3i-trq0xamYomOjC
"""
import csv
import re

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier  # base estimator used by AdaBoost below
from sklearn.metrics import classification_report

# Download the NLTK resources used by the preprocessing step
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # needed by the WordNet lemmatizer in newer NLTK releases
# Define a function to preprocess text
def preprocess_text(text):
    lemmatizer = WordNetLemmatizer()
    stops = set(stopwords.words("english"))  # English stopwords
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtag symbols
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    words = text.lower().split()  # Lowercase and split into words
    words = [lemmatizer.lemmatize(w) for w in words if w not in stops]  # Lemmatize and drop stopwords
    return " ".join(words)
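# Illustrative example (assumed input, not from the original script):
#   preprocess_text("Loved the demo at http://example.com #NLP @friend!!!")
# should return roughly "loved demo nlp": the URL, mention, hashtag symbol, punctuation
# and stopwords are stripped, and the default noun lemmatization leaves "loved" unchanged.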
# Manually handle CSV reading so rows with an unexpected shape are skipped
def clean_csv(file_path):
    data = []
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        reader = csv.reader(file)
        for row in reader:
            # Only accept rows that have the expected number of columns
            if len(row) == 6:
                data.append([row[0], row[5]])  # Sentiment label is in the first column, tweet text in the sixth
    return data
# Load and preprocess the data
data = clean_csv('/content/training.1600000.processed.noemoticon.csv')
df = pd.DataFrame(data, columns=['sentiment', 'text'])
df['sentiment'] = df['sentiment'].replace('4', '1')  # Sentiment140 uses 0 = negative, 4 = positive; map to binary 0/1
df['text'] = df['text'].apply(preprocess_text)  # Apply preprocessing
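# Optional sanity check (illustrative, not in the original script): the mapped labels
# should be split between '0' (negative) and '1' (positive).
# print(df['sentiment'].value_counts())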
# Feature Extraction
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['text'])
y = df['sentiment'].astype(int)
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
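# Optional inspection (illustrative): peek at the learned TF-IDF vocabulary and the split sizes.
# get_feature_names_out() is the scikit-learn >= 1.0 name; older releases use get_feature_names().
# print(vectorizer.get_feature_names_out()[:10])
# print(X_train.shape, X_test.shape)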
# Train models
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)
gbm.fit(X_train, y_train)
ada.fit(X_train, y_train)
# Evaluate models
gbm_preds = gbm.predict(X_test)
ada_preds = ada.predict(X_test)
print("Gradient Boosting Machine Performance:")
print(classification_report(y_test, gbm_preds))
print("AdaBoost Performance:")
print(classification_report(y_test, ada_preds))
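# Minimal scoring sketch (illustrative; the sample tweet is made up): new text must go through
# the same preprocessing and the fitted vectorizer before it can be classified.
sample = preprocess_text("Loving the new update, it works great!")
sample_vec = vectorizer.transform([sample])
print("Predicted label (0 = negative, 1 = positive after the mapping above):", gbm.predict(sample_vec)[0])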
# Quick look at the raw CSV: the file has no header row and is ISO-8859-1 encoded,
# so pass header=None, an explicit encoding, and column names following the usual Sentiment140 layout
nRowsRead = 1000
df1 = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', delimiter=',', header=None,
                  nrows=nRowsRead, encoding='ISO-8859-1',
                  names=['sentiment', 'id', 'date', 'flag', 'user', 'text'])
df1.dataframeName = 'training.1600000.processed.noemoticon.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')
print(df1.head(5))