Commit
Merge pull request #1 from pankace/Tomy's-experiments
Tomy's experiments
Showing 11 changed files with 10,206 additions and 0 deletions.
7,590 changes: 7,590 additions & 0 deletions
experiments/Tomy's-experiments/Frequency of token usage data analysis results.csv
Large diffs are not rendered by default.
2,001 changes: 2,001 additions & 0 deletions
experiments/Tomy's-experiments/Purification verification data analysis results.csv
Large diffs are not rendered by default.
144 changes: 144 additions & 0 deletions
experiments/Tomy's-experiments/Purification_and_analysis.py
@@ -0,0 +1,144 @@
import threading
import pandas as pd
import numpy as np
from support_fuctions import numeric_analysis_arm, enhanced_diagnostics, dataframe_generator, tokenization, remove_emojis, inpurity_purging_protocol

file_path = r"C:\Users\Tomy\PycharmProjects\Experiment - 7\Industrial machine learning course files\Racism classification\Data analysis\tranlated_tweets.csv"
source_data = pd.read_csv(file_path)
developer_mode = 0
loop_viewer = 0

print("<------------------>")
column_names = list(source_data.columns)
column_data_types_detected = []
object_data_columns = []
analysis_compatible_columns = []

for c_names in column_names:
    column_data = source_data[c_names].to_numpy()
    types_detected = column_data.dtype
    column_data_types_detected.append(types_detected)
    if types_detected == "int64":
        analysis_compatible_columns.append(c_names)
    elif types_detected == "object":
        object_data_columns.append(c_names)

if developer_mode == 1:
    if len(column_names) == len(column_data_types_detected):
        for element in range(len(column_names)):
            print(f"Column name: {column_names[element]} | Data type detected: {column_data_types_detected[element]}")
elif developer_mode == 0:
    print("Developer mode inactive")
print("<------------------>")
print()
print(f"Analysis compatible columns: {analysis_compatible_columns}")
numeric_columns = []
object_columns = []

input_mode = True
print("Type '-1' when no more target columns are needed.")
while input_mode == True:
    target = "-1"  # input("Column name: ")
    if target == "-1":
        input_mode = False
    elif target != "-1":
        numeric_columns.append(target)

locked_in_columns = numeric_columns
print("Numeric analysis columns:")
print(locked_in_columns)

for names in locked_in_columns:
    c_name_analysis_input = names
    column_isolation = source_data[c_name_analysis_input].to_numpy()
    if type(c_name_analysis_input) == str and (type(column_isolation) == list or type(column_isolation) == np.ndarray):
        numeric_analysis_arm(c_name_analysis_input, column_isolation, developer_mode)
        enhanced_diagnostics(c_name_analysis_input, column_isolation, developer_mode)

text_isolation = source_data["Description Cleaned Translated"].to_numpy()
text_isolation = inpurity_purging_protocol(text_isolation)
unique_locations = []
for tweets in text_isolation:
    if tweets not in unique_locations:
        unique_locations.append(tweets)

target_isolation = source_data["Analysis results"].to_numpy()
numbers_detected = []
for numbers in target_isolation:
    if numbers not in numbers_detected:
        numbers_detected.append(numbers)
    elif numbers in numbers_detected:
        continue

if len(target_isolation) == len(text_isolation):
    print(f"Target isolated data: {len(target_isolation)}")
    print(f"Text isolated data: {len(text_isolation)}")

zero = 0
one = 0
asociation_text = []
asociation_target = []
if len(target_isolation) == len(text_isolation):
    for elements in range(len(target_isolation)):
        asociation_text.append(text_isolation[elements])
        asociation_target.append(target_isolation[elements])

        if target_isolation[elements] == 0:
            zero += 1
        elif target_isolation[elements] == 1:
            one += 1

full_set = one + zero
print(f"Full set: {full_set}")

if developer_mode == 1:
    print(f"Zero targets: {zero}")
    print(f"Ones targets: {one}")

if full_set != 0:
    zero_percentage = (zero / full_set) * 100
    one_percentage = (one / full_set) * 100
else:
    zero_percentage = "Infinity"
    one_percentage = "Infinity"

if one > zero:
    description = ["Cleared", "Flagged", "Percentage ratio of Cleared", "Percentage ratio of Flagged"]
    results = [zero, one, f"{round(zero_percentage, 2)} %", f"{round(one_percentage, 2)} %"]
else:
    description = ["Cleared", "Flagged", "Percentage ratio of flagged", "Percentage ratio of Cleared"]
    results = [zero, one, f"{round(one_percentage, 2)} %", f"{round(zero_percentage, 2)} %"]

# Token analysis with stop words purge integrated
text_isolation = text_isolation.flatten()
tokenized_conversion = tokenization(text_isolation)

# Emoji removal in classic for-loop format
post_purge_storage = []
for pre_purge in tokenized_conversion:
    post_purge = remove_emojis(pre_purge)
    post_purge_storage.append(post_purge)

# Calculate unique tokens after stop words removal
unique_tokens, counts = np.unique(post_purge_storage, return_counts=True)
token_counts = dict(zip(unique_tokens, counts))

# Prepare for dataframe generation
token_keys = list(token_counts.keys())
token_values = list(token_counts.values())
token_des_column = ["Unique tokens", "Total tokens", "Percentage of unique tokens"]
token_res_column = [len(unique_tokens), sum(counts), f"{(len(unique_tokens) / sum(counts) * 100)} %"]
multithreading_deployment = 1
if multithreading_deployment == 1:
    if __name__ == "__main__":
        # Pass the callable and its arguments separately; target=dataframe_generator(...) would
        # run the function immediately in the main thread and hand Thread only its return value.
        t1 = threading.Thread(target=dataframe_generator, args=(asociation_text, asociation_target), kwargs={"c1": "Cleaned tweet", "c2": "Tag", "column_name": "Purification verification", "developer_mode": developer_mode})
        t2 = threading.Thread(target=dataframe_generator, args=(description, results), kwargs={"c1": "Cleared/Flagged quantity", "c2": "Analysis output", "column_name": "Tags analysis", "developer_mode": developer_mode})
        t3 = threading.Thread(target=dataframe_generator, args=(token_des_column, token_res_column), kwargs={"c1": "Token details", "c2": "Token count", "column_name": "Token analysis", "developer_mode": developer_mode})
        t4 = threading.Thread(target=dataframe_generator, args=(token_keys, token_values), kwargs={"c1": "Individual words", "c2": "Occurrences", "column_name": "Frequency of token usage", "developer_mode": developer_mode})

        threads = [t1, t2, t3, t4]
        for individual_threads in threads:
            individual_threads.start()
        for initiated_threads in threads:
            initiated_threads.join()
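The support_fuctions module this script imports from is not part of this commit, so dataframe_generator is only known through its call sites. Purely as a hypothetical sketch, inferred from the keyword arguments above and from the "<column_name> data analysis results.csv" files added alongside this script, the helper presumably does something along these lines:

import pandas as pd

def dataframe_generator(first_column, second_column, c1, c2, column_name, developer_mode=0):
    # Hypothetical reconstruction: pair the two sequences as named columns and write them to disk.
    frame = pd.DataFrame({c1: first_column, c2: second_column})
    output_name = f"{column_name} data analysis results.csv"
    frame.to_csv(output_name)
    if developer_mode == 1:
        print(f"{column_name}: wrote {len(frame)} rows to {output_name}")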
155 changes: 155 additions & 0 deletions
experiments/Tomy's-experiments/Random_forrest_real_data_test.py
@@ -0,0 +1,155 @@
import time

import nltk
from sklearn.ensemble import RandomForestClassifier
start_time = time.time()

nltk.download('stopwords')
nltk.download('wordnet')

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, recall_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA, TruncatedSVD
from nltk.tokenize import word_tokenize
import pandas as pd

def binary_classifier(X, y):
    def error_metrics(y_true, y_pred):
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        return accuracy, precision, recall, f1

    upgrade_deployment = 0
    if type(y) != np.ndarray:
        y = np.array(y)

    if upgrade_deployment == 1:
        svd = TruncatedSVD(n_components=75)
        X = svd.fit_transform(X)
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Prediction
    prediction = model.predict(X_test)
    print(f"Prediction: {prediction}")
    accuracy_metric, precision_metric, recall_metric, f1_metric = error_metrics(y_test, prediction)
    return accuracy_metric, precision_metric, recall_metric, f1_metric

def remove_stopwords_from_text(tokens, stop_words_applied):
    assert(type(tokens) == list or type(tokens) == np.ndarray)
    if type(tokens) != np.ndarray:
        tokens = np.array(tokens)

    filtered_tokens = []
    for token in tokens:
        if token.lower() not in stop_words_applied and token.lower() not in spanish_stop_words:
            filtered_tokens.append(token)
    return filtered_tokens


purified_data = pd.read_csv(r"C:\Users\Tomy\PycharmProjects\Experiment - 7\Industrial machine learning course files\Racism classification\Data analysis\Results\Token based\Post_purging\32\Purification verification data analysis results.csv")

cleaned_tweet = purified_data["Cleaned tweet"].to_numpy()
tags = purified_data["Tag"].to_numpy()

# Tokenization
token_conversion = []
for tweets in cleaned_tweet:
    post_conversion = word_tokenize(tweets)
    token_conversion.append(post_conversion)

print("Tokenization complete")
for i in range(3):
    print(token_conversion[i])

# Stopwords purge
english_stop_words = set(stopwords.words("english"))  # NLTK stop word lists use lowercase language fileids
spanish_stop_words = set(stopwords.words("spanish"))
print("<------------->")
print("Purging of Spanish and English stop words in progress...")
print(f"English stop words (length: {len(english_stop_words)}): {english_stop_words}")
print(f"Spanish stop words (length: {len(spanish_stop_words)}): {spanish_stop_words}")

stage_0 = []
# English stop words removal
for stage_0_element in token_conversion:
    english_cleaned_text = remove_stopwords_from_text(stage_0_element, english_stop_words)
    stage_0.append(english_cleaned_text)

stop_words_stage_1 = []
# Spanish stop words removal
stage_1 = []
for stage_1_elements in stage_0:
    spanish_cleaned_text = remove_stopwords_from_text(stage_1_elements, spanish_stop_words)
    stage_1.append(spanish_cleaned_text)
print("Purge of Spanish and English stop words completed...")
print("<------------->")

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = []
for post_processed_tokens in stage_1:
    processed_tokens = []
    for token in post_processed_tokens:
        if token not in english_stop_words and token not in spanish_stop_words and token.isalpha():
            # Lemmatize the token and append to the result list
            lemmatized_token = lemmatizer.lemmatize(token)
            processed_tokens.append(lemmatized_token)
    lemmatized_tokens.append(processed_tokens)

preprocessed_texts = [" ".join(tokens) for tokens in lemmatized_tokens]
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(preprocessed_texts)
y = tags
if type(y) != np.ndarray:
    y = np.array(y)
print(f"X type: {type(X)}")
print(f"y type: {type(y)}")
accuracy_metric, precision_metric, recall_metric, f1_metric = binary_classifier(X, y)
end_time = time.time()
print(f"Accuracy: {round(accuracy_metric, 4)}")
print(f"Precision: {round(precision_metric, 4)}")
print(f"Recall: {round(recall_metric, 4)}")
print(f"F1: {round(f1_metric, 4)}")
processing_time = end_time - start_time
if processing_time < 60:
    print(f"Processing time: {processing_time} seconds")
elif processing_time >= 60:
    minutes_conversion = processing_time / 60
    if minutes_conversion > 1 and minutes_conversion < 2:
        print(f"Processing time: {minutes_conversion} minute")
    else:
        print(f"Processing time: {minutes_conversion} minutes")
""" | ||
Benchmark to beat: | ||
------------------ | ||
Prediction: [0 1 1 1 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 1 1 1 1 0 1 0 | ||
0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 | ||
1 1 0 1 1 0 1 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 1 1 0 1 1 | ||
0 0 0 0 1 0 0 1 0 0 0 1 1 1 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 | ||
1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 1 1 1 | ||
1 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 1 1 1 1 1 | ||
0 0 0 1 1 1 0 1 0 0 1 1 1 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 0 0 1 1 1 | ||
0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 1 | ||
1 0 0 1] | ||
Accuracy: 79.0% | ||
Precision: 79.43262% | ||
Recall: 76.71233% | ||
F1-Score: 78.04878% | ||
Processing time: 3.4483914375305176 seconds | ||
Observation: Any attempt to increase accuracy makes these metrics decrease. Any assist is welcomed. | ||
Note: The processing time fluctuates drastically from 3.45 seconds to sometimes 8 seconds. Do not consider that an important consideration or anything if it stays in the seconds for processing time. | ||
""" |
@@ -0,0 +1,17 @@
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Define CNN model
model = Sequential([
    Embedding(input_dim=500, output_dim=100, input_length=500),  # Placeholder values used
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
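The file stops at the training comment and defines no data pipeline. As a minimal, hypothetical sketch (the arrays below are placeholders shaped to match the Embedding layer's input_dim=500 vocabulary and input_length=500 sequence length, not part of the commit), training could be wired up like this:

import numpy as np

# Dummy data shaped for the model above: token ids in [0, 500), sequences padded to length 500.
X = np.random.randint(0, 500, size=(1000, 500))
y = np.random.randint(0, 2, size=(1000,))

# Train model (placeholder hyperparameters)
model.fit(X, y, epochs=5, batch_size=32, validation_split=0.2)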
5 changes: 5 additions & 0 deletions
experiments/Tomy's-experiments/Tags analysis data analysis results.csv
@@ -0,0 +1,5 @@
,Cleared/Flagged quantity,Analysis output
0,Cleared,1027
1,Flagged,973
2,Percentage ratio of flagged,48.65 %
3,Percentage ratio of Cleared,51.35 %
4 changes: 4 additions & 0 deletions
experiments/Tomy's-experiments/Token analysis data analysis results.csv
@@ -0,0 +1,4 @@
,Token details,Token count
0,Unique tokens,7589
1,Total tokens,59875
2,Percentage of unique tokens,12.674739039665972 %
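The last row follows directly from the two counts above it, 7589 unique tokens out of 59875 in total:

print(f"{7589 / 59875 * 100} %")  # ~12.674739 %, the "Percentage of unique tokens" value reported above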
@@ -0,0 +1,11 @@

# initializing list
test_list = ["Testing for tokenization", "Fat cat syndrome", "Test 01"]

# printing original list
print("The original list : " + str(test_list))
# List splitting
res = [sub.split() for sub in test_list]

# print result
print("The list after split of strings is : " + str(res))
@@ -0,0 +1,55 @@
import numpy as np
import tensorflow as tf
from keras.layers import Conv1D, BatchNormalization, Dropout, MaxPooling1D, LSTM, Bidirectional, Dense, Flatten
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = tf.keras.models.Sequential()

num_samples = 1000
input_shape = (1000, 1)

X = np.random.rand(num_samples, *input_shape)
y = np.random.randint(2, size=num_samples)

# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

regularization_standard = 0.005
# Convolution layers
conv1d_layer_sync_dropout_min = 0.20
conv1d_layer_sync_dropout_max = 0.24
# Dense layers
dense_layer_sync_dropout_min = 0.20
dense_layer_sync_dropout_max = 0.25
# Recurrent layers
recurring_layer_sync_dropout_min = 0.32
recurring_layer_sync_dropout_max = 0.36

model.add(Conv1D(filters=12, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(np.random.uniform(conv1d_layer_sync_dropout_min, conv1d_layer_sync_dropout_max)))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=16, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(np.random.uniform(conv1d_layer_sync_dropout_min, conv1d_layer_sync_dropout_max)))
model.add(MaxPooling1D(pool_size=2))

model.add(LSTM(20, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(regularization_standard)))
model.add(Dropout(np.random.uniform(recurring_layer_sync_dropout_min, recurring_layer_sync_dropout_max)))

model.add(Bidirectional(LSTM(24, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(regularization_standard))))
model.add(Dropout(np.random.uniform(recurring_layer_sync_dropout_min, recurring_layer_sync_dropout_max)))
model.add(Flatten())
model.add(Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(regularization_standard)))
model.add(BatchNormalization())
model.add(Dropout(np.random.uniform(dense_layer_sync_dropout_min, dense_layer_sync_dropout_max)))
# Single-unit sigmoid output so the binary_crossentropy loss below matches the binary labels
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))
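The accuracy_score, precision_score, recall_score and f1_score imports at the top of this file are never used, and the held-out X_test / y_test split is never scored. A minimal sketch of how that evaluation could follow the fit call, assuming the single sigmoid output unit added above:

# Score the held-out test split with the already-imported sklearn metrics.
y_prob = model.predict(X_test)                 # predicted probabilities, shape (n_samples, 1)
y_pred = (y_prob.ravel() >= 0.5).astype(int)   # threshold at 0.5 to get hard class labels

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1: {f1_score(y_test, y_pred):.4f}")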
@@ -0,0 +1 @@