Commit
Merge pull request #1 from pankace/Tomy's-experiments
Tomy's experiments
pankace authored Mar 2, 2024
2 parents 3dae960 + 2966caa commit 6e656c5
Showing 11 changed files with 10,206 additions and 0 deletions.

Large diffs are not rendered by default.


144 changes: 144 additions & 0 deletions experiments/Tomy's-experiments/Purification_and_analysis.py
@@ -0,0 +1,144 @@
import threading
import pandas as pd
import numpy as np
from support_fuctions import numeric_analysis_arm, enhanced_diagnostics, dataframe_generator, tokenization, remove_emojis, inpurity_purging_protocol

file_path = r"C:\Users\Tomy\PycharmProjects\Experiment - 7\Industrial machine learning course files\Racism classification\Data analysis\tranlated_tweets.csv"
source_data = pd.read_csv(file_path)
developer_mode = 0
loop_viewer = 0

print("<------------------>")
column_names = list(source_data.columns)
column_data_types_detected = []
object_data_columns = []
analysis_compatible_columns = []

for c_names in column_names:
column_data = source_data[c_names].to_numpy()
types_detected = column_data.dtype
column_data_types_detected.append(types_detected)
if types_detected == "int64":
analysis_compatible_columns.append(c_names)
elif types_detected == "object":
object_data_columns.append(c_names)

if developer_mode == 1:
if len(column_names) == len(column_data_types_detected):
for element in range(len(column_names)):
print(f"Column name: {column_names[element]} | Data type detected: {column_data_types_detected[element]}")
elif developer_mode == 0:
print("Developer mode inactive")
print("<------------------>")
print()
print(f"Analysis compatible columns: {analysis_compatible_columns}")
numeric_columns = []
object_columns = []

input_mode = True
print("Type '-1' when no more target columns are needed.")
while input_mode:
    target = "-1"  # input("Column name: ") is disabled here, so no numeric columns are queued by default
    if target == "-1":
        input_mode = False
    else:
        numeric_columns.append(target)

locked_in_columns = numeric_columns
print("Numeric analysis columns:")
print(locked_in_columns)

for c_name_analysis_input in locked_in_columns:
    column_isolation = source_data[c_name_analysis_input].to_numpy()
    if isinstance(c_name_analysis_input, str) and isinstance(column_isolation, (list, np.ndarray)):
        numeric_analysis_arm(c_name_analysis_input, column_isolation, developer_mode)
        enhanced_diagnostics(c_name_analysis_input, column_isolation, developer_mode)

text_isolation = source_data["Description Cleaned Translated"].to_numpy()
text_isolation = inpurity_purging_protocol(text_isolation)
unique_locations = []
for tweets in text_isolation:
if tweets not in unique_locations:
unique_locations.append(tweets)

target_isolation = source_data["Analysis results"].to_numpy()
numbers_detected = []
for numbers in target_isolation:
    if numbers not in numbers_detected:
        numbers_detected.append(numbers)

if len(target_isolation) == len(text_isolation):
print(f"Target isolated data: {len(target_isolation)}")
print(f"Text isolated data: {len(text_isolation)}")


zero = 0
one = 0
asociation_text = []
asociation_target = []
if len(target_isolation) == len(text_isolation):
for elements in range(len(target_isolation)):
asociation_text.append(text_isolation[elements])
asociation_target.append(target_isolation[elements])

if target_isolation[elements] == 0:
zero += 1
elif target_isolation[elements] == 1:
one += 1

full_set = one + zero
print(f"Full set: {full_set}")

if developer_mode == 1:
print(f"Zero targets: {zero}")
print(f"Ones targets: {one}")

if full_set != 0:
    zero_percentage = (zero / full_set) * 100
    one_percentage = (one / full_set) * 100
else:
    # float("nan") keeps the round() calls below from raising on an empty set
    zero_percentage = float("nan")
    one_percentage = float("nan")

description = ["Cleared", "Flagged", "Percentage ratio of Cleared", "Percentage ratio of Flagged"]
results = [zero, one, f"{round(zero_percentage, 2)} %", f"{round(one_percentage, 2)} %"]

# Token analysis with stop words purge integrated
text_isolation = text_isolation.flatten()
tokenized_conversion = tokenization(text_isolation)

# Emoji removal in classic for-loop format
post_purge_storage = []
for pre_purge in tokenized_conversion:
post_purge = remove_emojis(pre_purge)
post_purge_storage.append(post_purge)

# Calculate unique tokens after stop words removal
unique_tokens, counts = np.unique(post_purge_storage, return_counts=True)
token_counts = dict(zip(unique_tokens, counts))

# Prepare for dataframe generation
token_keys = list(token_counts.keys())
token_values = list(token_counts.values())
token_des_column = ["Unique tokens", "Total tokens", "Percentage of unique tokens"]
token_res_column = [len(unique_tokens), sum(counts), f"{(len(unique_tokens) / sum(counts) * 100)} %"]
multithreading_deployment = 1
if multithreading_deployment == 1:
    if __name__ == "__main__":
        # Pass the callable and its arguments separately; calling dataframe_generator(...)
        # inside Thread(target=...) would run it immediately in the main thread instead of
        # deferring the work to the thread.
        t1 = threading.Thread(target=dataframe_generator, args=(asociation_text, asociation_target), kwargs={"c1": "Cleaned tweet", "c2": "Tag", "column_name": "Purification verification", "developer_mode": developer_mode})
        t2 = threading.Thread(target=dataframe_generator, args=(description, results), kwargs={"c1": "Cleared/Flagged quantity", "c2": "Analysis output", "column_name": "Tags analysis", "developer_mode": developer_mode})
        t3 = threading.Thread(target=dataframe_generator, args=(token_des_column, token_res_column), kwargs={"c1": "Token details", "c2": "Token count", "column_name": "Token analysis", "developer_mode": developer_mode})
        t4 = threading.Thread(target=dataframe_generator, args=(token_keys, token_values), kwargs={"c1": "Individual words", "c2": "Occurrences", "column_name": "Frequency of token usage", "developer_mode": developer_mode})

        threads = [t1, t2, t3, t4]
        for individual_threads in threads:
            individual_threads.start()
        for initiated_threads in threads:
            initiated_threads.join()
155 changes: 155 additions & 0 deletions experiments/Tomy's-experiments/Random_forrest_real_data_test.py
@@ -0,0 +1,155 @@
import time

import nltk
from sklearn.ensemble import RandomForestClassifier
start_time = time.time()

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')  # required by word_tokenize below

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, recall_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA, TruncatedSVD
from nltk.tokenize import word_tokenize
import pandas as pd

def binary_classifier(X, y):
def error_metrics(y_true, y_pred):
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

return accuracy, precision, recall, f1

upgrade_deployment = 0
if type(y) != np.ndarray:
y = np.array(y)

if upgrade_deployment == 1:
svd = TruncatedSVD(n_components=75)
X = svd.fit_transform(X)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Prediction
prediction = model.predict(X_test)
print(f"Prediction: {prediction}")
accuracy_metric, precision_metric, recall_metric, f1_metric = error_metrics(y_test, prediction)
return accuracy_metric, precision_metric, recall_metric, f1_metric

def remove_stopwords_from_text(tokens, stop_words_applied):
    assert isinstance(tokens, (list, np.ndarray))
    if not isinstance(tokens, np.ndarray):
        tokens = np.array(tokens)

    filtered_tokens = []
    for token in tokens:
        if token.lower() not in stop_words_applied and token.lower() not in spanish_stop_words:
            filtered_tokens.append(token)
    return filtered_tokens


purified_data = pd.read_csv(r"C:\Users\Tomy\PycharmProjects\Experiment - 7\Industrial machine learning course files\Racism classification\Data analysis\Results\Token based\Post_purging\32\Purification verification data analysis results.csv")

cleaned_tweet = purified_data["Cleaned tweet"].to_numpy()
tags = purified_data["Tag"].to_numpy()

# Tokenization
token_conversion = []
for tweets in cleaned_tweet:
post_conversion = word_tokenize(tweets)
token_conversion.append(post_conversion)

print("Tokenization complete")
for i in range(3):
print(token_conversion[i])

# Stopwords purge
english_stop_words = set(stopwords.words("english"))
spanish_stop_words = set(stopwords.words("spanish"))
print("<------------->")
print("Purging of Spanish and English stop words in progress...")
print(f"English stop words (length: {len(english_stop_words)}): {english_stop_words}")
print(f"Spanish stop words (length: {len(spanish_stop_words)}): {spanish_stop_words}")

stage_0 = []
# English stopwords removal
for stage_0_element in token_conversion:
english_cleaned_text = remove_stopwords_from_text(stage_0_element, english_stop_words)
stage_0.append(english_cleaned_text)

# Spanish stop words removal
stage_1 = []
for stage_1_elements in stage_0:
    spanish_cleaned_text = remove_stopwords_from_text(stage_1_elements, spanish_stop_words)
    stage_1.append(spanish_cleaned_text)
print("Purge of Spanish and English stop words completed...")
print("<------------->")


# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = []
for post_processed_tokens in stage_1:
    processed_tokens = []
    for token in post_processed_tokens:
        if token not in english_stop_words and token not in spanish_stop_words and token.isalpha():
            # Lemmatize the token and append to the result list
            lemmatized_token = lemmatizer.lemmatize(token)
            processed_tokens.append(lemmatized_token)
    lemmatized_tokens.append(processed_tokens)

preprocessed_texts = [" ".join(tokens) for tokens in lemmatized_tokens]
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(preprocessed_texts)
y = tags
if type(y) != np.ndarray:
y = np.array(y)
print(f"X type: {type(X)}")
print(f"y type: {type(y)}")
accuracy_metric, precision_metric, recall_metric, f1_metric = binary_classifier(X, y)
end_time = time.time()
print(f"Accuracy: {round(accuracy_metric, 4)}")
print(f"Precision: {round(precision_metric, 4)}")
print(f"Recall: {round(recall_metric, 4)}")
print(f"F1: {round(f1_metric, 4)}")
processing_time = end_time - start_time
if processing_time < 60:
    print(f"Processing time: {processing_time} seconds")
else:
    minutes_conversion = processing_time / 60
    if 1 <= minutes_conversion < 2:
        print(f"Processing time: {minutes_conversion} minute")
    else:
        print(f"Processing time: {minutes_conversion} minutes")
"""
Benchmark to beat:
------------------
Prediction: [0 1 1 1 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 1 1 1 1 0 1 0
0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0
1 1 0 1 1 0 1 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 1 1 0 1 1
0 0 0 0 1 0 0 1 0 0 0 1 1 1 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1
1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 1 1 1
1 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 1 1 1 1 1
0 0 0 1 1 1 0 1 0 0 1 1 1 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 0 0 1 1 1
0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 1
1 0 0 1]
Accuracy: 79.0%
Precision: 79.43262%
Recall: 76.71233%
F1-Score: 78.04878%
Processing time: 3.4483914375305176 seconds
Observation: Attempts to push accuracy higher have so far made the other metrics decrease. Any assistance is welcome.
Note: The processing time fluctuates drastically, from about 3.45 seconds up to roughly 8 seconds. This is not an important consideration as long as it stays within the range of seconds.
"""
17 changes: 17 additions & 0 deletions experiments/Tomy's-experiments/Simple ANN.py
@@ -0,0 +1,17 @@
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Define CNN model
model = Sequential([
Embedding(input_dim=500, output_dim=100, input_length=500), # Placeholder values used
Conv1D(filters=128, kernel_size=5, activation='relu'),
GlobalMaxPooling1D(),
Dense(64, activation='relu'),
Dense(1, activation='sigmoid')
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
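# A minimal sketch of the training step, assuming padded integer sequences of
# length 500 and binary labels; the arrays below are random placeholders, since
# no real dataset is wired into this script.
import numpy as np

X_placeholder = np.random.randint(0, 500, size=(256, 500))
y_placeholder = np.random.randint(0, 2, size=(256,))
model.fit(X_placeholder, y_placeholder, epochs=3, batch_size=32, validation_split=0.1)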

@@ -0,0 +1,5 @@
,Cleared/Flagged quantity,Analysis output
0,Cleared,1027
1,Flagged,973
2,Percentage ratio of flagged,48.65 %
3,Percentage ratio of Cleared,51.35 %
@@ -0,0 +1,4 @@
,Token details,Token count
0,Unique tokens,7589
1,Total tokens,59875
2,Percentage of unique tokens,12.674739039665972 %
11 changes: 11 additions & 0 deletions experiments/Tomy's-experiments/Tokenization.py
@@ -0,0 +1,11 @@

# initializing list
test_list = ["Testing for tokenization", "Fat cat syndrome", "Test 01"]

# printing original list
print("The original list : " + str(test_list))
# List splitting
res = [sub.split() for sub in test_list]

# print result
print("The list after split of strings is : " + str(res))
55 changes: 55 additions & 0 deletions experiments/Tomy's-experiments/advanced_prototype.py
@@ -0,0 +1,55 @@
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv1D, BatchNormalization, Dropout, MaxPooling1D, LSTM, Bidirectional, Dense, Flatten
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = tf.keras.models.Sequential()

num_samples = 1000
input_shape = (1000, 1)

X = np.random.rand(num_samples, *input_shape)
y = np.random.randint(2, size=num_samples)

# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

regularization_standard = 0.005
# Convolution layers
conv1d_layer_sync_dropout_min = 0.20
conv1d_layer_sync_dropout_max = 0.24
# Dense layers
dense_layer_sync_dropout_min = 0.20
dense_layer_sync_dropout_max = 0.25
# Recurrent layers
recurring_layer_sync_dropout_min = 0.32
recurring_layer_sync_dropout_max = 0.36

model.add(Conv1D(filters=12, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(np.random.uniform(conv1d_layer_sync_dropout_min, conv1d_layer_sync_dropout_max)))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=16, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(np.random.uniform(conv1d_layer_sync_dropout_min, conv1d_layer_sync_dropout_max)))
model.add(MaxPooling1D(pool_size=2))

model.add(LSTM(20, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(regularization_standard)))
model.add(Dropout(np.random.uniform(recurring_layer_sync_dropout_min, recurring_layer_sync_dropout_max)))

model.add(Bidirectional(LSTM(24, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(regularization_standard))))
model.add(Dropout(np.random.uniform(recurring_layer_sync_dropout_min, recurring_layer_sync_dropout_max)))
model.add(Flatten())
model.add(Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(regularization_standard)))
model.add(BatchNormalization())
model.add(Dropout(np.random.uniform(dense_layer_sync_dropout_min, dense_layer_sync_dropout_max)))
# Single-unit sigmoid output for binary classification (required for binary_crossentropy)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))
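# Evaluation sketch on the held-out split, using the metric imports above;
# predictions from the sigmoid output are thresholded at 0.5 (an assumed,
# conventional cut-off, not something prescribed by the original script).
y_prob = model.predict(X_test)
y_pred = (y_prob.flatten() >= 0.5).astype(int)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1: {f1_score(y_test, y_pred):.4f}")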
1 change: 1 addition & 0 deletions experiments/Tomy's-experiments/readme.md
@@ -0,0 +1 @@
