Commit
Merge pull request #1 from pankace/Tomy's-experiments
Tomy's experiments
Showing 11 changed files with 10,206 additions and 0 deletions.
7,590 changes: 7,590 additions & 0 deletions
experiments/Tomy's-experiments/Frequency of token usage data analysis results.csv
Large diffs are not rendered by default.
2,001 changes: 2,001 additions & 0 deletions
experiments/Tomy's-experiments/Purification verification data analysis results.csv
Large diffs are not rendered by default.
144 changes: 144 additions & 0 deletions
experiments/Tomy's-experiments/Purification_and_analysis.py
@@ -0,0 +1,144 @@
import threading
import pandas as pd
import numpy as np
from support_fuctions import numeric_analysis_arm, enhanced_diagnostics, dataframe_generator, tokenization, remove_emojis, inpurity_purging_protocol

file_path = r"C:\Users\Tomy\PycharmProjects\Experiment - 7\Industrial machine learning course files\Racism classification\Data analysis\tranlated_tweets.csv"
source_data = pd.read_csv(file_path)
developer_mode = 0
loop_viewer = 0

print("<------------------>")
column_names = list(source_data.columns)
column_data_types_detected = []
object_data_columns = []
analysis_compatible_columns = []

for c_names in column_names:
    column_data = source_data[c_names].to_numpy()
    types_detected = column_data.dtype
    column_data_types_detected.append(types_detected)
    if types_detected == "int64":
        analysis_compatible_columns.append(c_names)
    elif types_detected == "object":
        object_data_columns.append(c_names)

if developer_mode == 1:
    if len(column_names) == len(column_data_types_detected):
        for element in range(len(column_names)):
            print(f"Column name: {column_names[element]} | Data type detected: {column_data_types_detected[element]}")
elif developer_mode == 0:
    print("Developer mode inactive")
print("<------------------>")
print()
print(f"Analysis compatible columns: {analysis_compatible_columns}")
numeric_columns = []
object_columns = []

input_mode = True
print("Type '-1' when no more target columns are needed.")
while input_mode == True:
    target = "-1"  # input("Column name: ")
    if target == "-1":
        input_mode = False
    elif target != "-1":
        numeric_columns.append(target)

locked_in_columns = numeric_columns
print("Numeric analysis columns:")
print(locked_in_columns)

for names in locked_in_columns:
    c_name_analysis_input = names
    column_isolation = source_data[c_name_analysis_input].to_numpy()
    if type(c_name_analysis_input) == str and (type(column_isolation) == list or type(column_isolation) == np.ndarray):
        numeric_analysis_arm(c_name_analysis_input, column_isolation, developer_mode)
        enhanced_diagnostics(c_name_analysis_input, column_isolation, developer_mode)

text_isolation = source_data["Description Cleaned Translated"].to_numpy()
text_isolation = inpurity_purging_protocol(text_isolation)
unique_locations = []
for tweets in text_isolation:
    if tweets not in unique_locations:
        unique_locations.append(tweets)

target_isolation = source_data["Analysis results"].to_numpy()
numbers_detected = []
for numbers in target_isolation:
    if numbers not in numbers_detected:
        numbers_detected.append(numbers)
    elif numbers in numbers_detected:
        continue

if len(target_isolation) == len(text_isolation):
    print(f"Target isolated data: {len(target_isolation)}")
    print(f"Text isolated data: {len(text_isolation)}")

zero = 0
one = 0
asociation_text = []
asociation_target = []
if len(target_isolation) == len(text_isolation):
    for elements in range(len(target_isolation)):
        asociation_text.append(text_isolation[elements])
        asociation_target.append(target_isolation[elements])

        if target_isolation[elements] == 0:
            zero += 1
        elif target_isolation[elements] == 1:
            one += 1

full_set = one + zero
print(f"Full set: {full_set}")

if developer_mode == 1:
    print(f"Zero targets: {zero}")
    print(f"Ones targets: {one}")

if full_set != 0:
    zero_percentage = (zero / full_set) * 100
    one_percentage = (one / full_set) * 100
else:
    zero_percentage = "Infinity"
    one_percentage = "Infinity"

if one > zero:
    description = ["Cleared", "Flagged", "Percentage ratio of Cleared", "Percentage ratio of Flagged"]
    results = [zero, one, f"{round(zero_percentage, 2)} %", f"{round(one_percentage, 2)} %"]
else:
    description = ["Cleared", "Flagged", "Percentage ratio of flagged", "Percentage ratio of Cleared"]
    results = [zero, one, f"{round(one_percentage, 2)} %", f"{round(zero_percentage, 2)} %"]

# Token analysis with stop words purge integrated
text_isolation = text_isolation.flatten()
tokenized_conversion = tokenization(text_isolation)

# Emoji removal in classic for-loop format
post_purge_storage = []
for pre_purge in tokenized_conversion:
    post_purge = remove_emojis(pre_purge)
    post_purge_storage.append(post_purge)

# Calculate unique tokens after stop words removal
unique_tokens, counts = np.unique(post_purge_storage, return_counts=True)
token_counts = dict(zip(unique_tokens, counts))

# Prepare for dataframe generation
token_keys = list(token_counts.keys())
token_values = list(token_counts.values())
token_des_column = ["Unique tokens", "Total tokens", "Percentage of unique tokens"]
token_res_column = [len(unique_tokens), sum(counts), f"{(len(unique_tokens) / sum(counts) * 100)} %"]
multithreading_deployment = 1
if multithreading_deployment == 1:
    if __name__ == "__main__":
        # Pass the callable and its arguments separately; target=dataframe_generator(...) would
        # run the function immediately in the main thread and hand Thread only its return value.
        t1 = threading.Thread(target=dataframe_generator, args=(asociation_text, asociation_target), kwargs={"c1": "Cleaned tweet", "c2": "Tag", "column_name": "Purification verification", "developer_mode": developer_mode})
        t2 = threading.Thread(target=dataframe_generator, args=(description, results), kwargs={"c1": "Cleared/Flagged quantity", "c2": "Analysis output", "column_name": "Tags analysis", "developer_mode": developer_mode})
        t3 = threading.Thread(target=dataframe_generator, args=(token_des_column, token_res_column), kwargs={"c1": "Token details", "c2": "Token count", "column_name": "Token analysis", "developer_mode": developer_mode})
        t4 = threading.Thread(target=dataframe_generator, args=(token_keys, token_values), kwargs={"c1": "Individual words", "c2": "Occurrences", "column_name": "Frequency of token usage", "developer_mode": developer_mode})

        threads = [t1, t2, t3, t4]
        for individual_threads in threads:
            individual_threads.start()
        for initiated_threads in threads:
            initiated_threads.join()
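The support_fuctions module this script imports from is not part of this commit, so dataframe_generator is only known through its call sites. Purely as a hypothetical sketch, inferred from the keyword arguments above and from the "<column_name> data analysis results.csv" files added alongside this script, the helper presumably does something along these lines:

import pandas as pd

def dataframe_generator(first_column, second_column, c1, c2, column_name, developer_mode=0):
    # Hypothetical reconstruction: pair the two sequences as named columns and write them to disk.
    frame = pd.DataFrame({c1: first_column, c2: second_column})
    output_name = f"{column_name} data analysis results.csv"
    frame.to_csv(output_name)
    if developer_mode == 1:
        print(f"{column_name}: wrote {len(frame)} rows to {output_name}")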
155 changes: 155 additions & 0 deletions
experiments/Tomy's-experiments/Random_forrest_real_data_test.py
@@ -0,0 +1,155 @@
import time

import nltk
from sklearn.ensemble import RandomForestClassifier
start_time = time.time()

nltk.download('stopwords')
nltk.download('wordnet')

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, f1_score, confusion_matrix, recall_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.decomposition import PCA, TruncatedSVD
from nltk.tokenize import word_tokenize
import pandas as pd

def binary_classifier(X, y):
    def error_metrics(y_true, y_pred):
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        return accuracy, precision, recall, f1

    upgrade_deployment = 0
    if type(y) != np.ndarray:
        y = np.array(y)

    if upgrade_deployment == 1:
        svd = TruncatedSVD(n_components=75)
        X = svd.fit_transform(X)
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Prediction
    prediction = model.predict(X_test)
    print(f"Prediction: {prediction}")
    accuracy_metric, precision_metric, recall_metric, f1_metric = error_metrics(y_test, prediction)
    return accuracy_metric, precision_metric, recall_metric, f1_metric

def remove_stopwords_from_text(tokens, stop_words_applied):
    assert(type(tokens) == list or type(tokens) == np.ndarray)
    if type(tokens) != np.ndarray:
        tokens = np.array(tokens)

    filtered_tokens = []
    for token in tokens:
        if token.lower() not in stop_words_applied and token.lower() not in spanish_stop_words:
            filtered_tokens.append(token)
    return filtered_tokens


purified_data = pd.read_csv(r"C:\Users\Tomy\PycharmProjects\Experiment - 7\Industrial machine learning course files\Racism classification\Data analysis\Results\Token based\Post_purging\32\Purification verification data analysis results.csv")

cleaned_tweet = purified_data["Cleaned tweet"].to_numpy()
tags = purified_data["Tag"].to_numpy()

# Tokenization
token_conversion = []
for tweets in cleaned_tweet:
    post_conversion = word_tokenize(tweets)
    token_conversion.append(post_conversion)

print("Tokenization complete")
for i in range(3):
    print(token_conversion[i])

# Stopwords purge
english_stop_words = set(stopwords.words("english"))  # NLTK stop word lists use lowercase language fileids
spanish_stop_words = set(stopwords.words("spanish"))
print("<------------->")
print("Purging of Spanish and English stop words in progress...")
print(f"English stop words (length: {len(english_stop_words)}): {english_stop_words}")
print(f"Spanish stop words (length: {len(spanish_stop_words)}): {spanish_stop_words}")

stage_0 = []
# English stop words removal
for stage_0_element in token_conversion:
    english_cleaned_text = remove_stopwords_from_text(stage_0_element, english_stop_words)
    stage_0.append(english_cleaned_text)

stop_words_stage_1 = []
# Spanish stop words removal
stage_1 = []
for stage_1_elements in stage_0:
    spanish_cleaned_text = remove_stopwords_from_text(stage_1_elements, spanish_stop_words)
    stage_1.append(spanish_cleaned_text)
print("Purge of Spanish and English stop words completed...")
print("<------------->")

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = []
for post_processed_tokens in stage_1:
    processed_tokens = []
    for token in post_processed_tokens:
        if token not in english_stop_words and token not in spanish_stop_words and token.isalpha():
            # Lemmatize the token and append to the result list
            lemmatized_token = lemmatizer.lemmatize(token)
            processed_tokens.append(lemmatized_token)
    lemmatized_tokens.append(processed_tokens)

preprocessed_texts = [" ".join(tokens) for tokens in lemmatized_tokens]
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(preprocessed_texts)
y = tags
if type(y) != np.ndarray:
    y = np.array(y)
print(f"X type: {type(X)}")
print(f"y type: {type(y)}")
accuracy_metric, precision_metric, recall_metric, f1_metric = binary_classifier(X, y)
end_time = time.time()
print(f"Accuracy: {round(accuracy_metric, 4)}")
print(f"Precision: {round(precision_metric, 4)}")
print(f"Recall: {round(recall_metric, 4)}")
print(f"F1: {round(f1_metric, 4)}")
processing_time = end_time - start_time
if processing_time < 60:
    print(f"Processing time: {processing_time} seconds")
elif processing_time >= 60:
    minutes_conversion = processing_time / 60
    if minutes_conversion > 1 and minutes_conversion < 2:
        print(f"Processing time: {minutes_conversion} minute")
    else:
        print(f"Processing time: {minutes_conversion} minutes")
""" | ||
Benchmark to beat: | ||
------------------ | ||
Prediction: [0 1 1 1 1 0 1 0 1 1 0 0 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0 1 1 1 1 0 1 0 | ||
0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 0 1 1 0 1 0 1 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 | ||
1 1 0 1 1 0 1 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0 1 1 1 0 1 1 | ||
0 0 0 0 1 0 0 1 0 0 0 1 1 1 0 1 1 0 1 0 0 0 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 | ||
1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 1 1 1 | ||
1 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 1 1 1 1 1 | ||
0 0 0 1 1 1 0 1 0 0 1 1 1 1 0 0 1 1 0 0 1 1 0 0 1 0 0 0 1 1 1 1 0 0 1 1 1 | ||
0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 1 | ||
1 0 0 1] | ||
Accuracy: 79.0% | ||
Precision: 79.43262% | ||
Recall: 76.71233% | ||
F1-Score: 78.04878% | ||
Processing time: 3.4483914375305176 seconds | ||
Observation: Any attempt to increase accuracy makes these metrics decrease. Any assist is welcomed. | ||
Note: The processing time fluctuates drastically from 3.45 seconds to sometimes 8 seconds. Do not consider that an important consideration or anything if it stays in the seconds for processing time. | ||
""" |
@@ -0,0 +1,17 @@
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Define CNN model
model = Sequential([
    Embedding(input_dim=500, output_dim=100, input_length=500),  # Placeholder values used
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
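The file stops at the training comment and defines no data pipeline. As a minimal, hypothetical sketch (the arrays below are placeholders shaped to match the Embedding layer's input_dim=500 vocabulary and input_length=500 sequence length, not part of the commit), training could be wired up like this:

import numpy as np

# Dummy data shaped for the model above: token ids in [0, 500), sequences padded to length 500.
X = np.random.randint(0, 500, size=(1000, 500))
y = np.random.randint(0, 2, size=(1000,))

# Train model (placeholder hyperparameters)
model.fit(X, y, epochs=5, batch_size=32, validation_split=0.2)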
5 changes: 5 additions & 0 deletions
experiments/Tomy's-experiments/Tags analysis data analysis results.csv
@@ -0,0 +1,5 @@
,Cleared/Flagged quantity,Analysis output
0,Cleared,1027
1,Flagged,973
2,Percentage ratio of flagged,48.65 %
3,Percentage ratio of Cleared,51.35 %
4 changes: 4 additions & 0 deletions
experiments/Tomy's-experiments/Token analysis data analysis results.csv
@@ -0,0 +1,4 @@
,Token details,Token count
0,Unique tokens,7589
1,Total tokens,59875
2,Percentage of unique tokens,12.674739039665972 %
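The last row follows directly from the two counts above it, 7589 unique tokens out of 59875 in total:

print(f"{7589 / 59875 * 100} %")  # ~12.674739 %, the "Percentage of unique tokens" value reported above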
@@ -0,0 +1,11 @@

# initializing list
test_list = ["Testing for tokenization", "Fat cat syndrome", "Test 01"]

# printing original list
print("The original list : " + str(test_list))
# List splitting
res = [sub.split() for sub in test_list]

# print result
print("The list after split of strings is : " + str(res))
@@ -0,0 +1,55 @@
import numpy as np
import tensorflow as tf
from keras.layers import Conv1D, BatchNormalization, Dropout, MaxPooling1D, LSTM, Bidirectional, Dense, Flatten
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = tf.keras.models.Sequential()

num_samples = 1000
input_shape = (1000, 1)

X = np.random.rand(num_samples, *input_shape)
y = np.random.randint(2, size=num_samples)

# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

regularization_standard = 0.005
# Convolution layers
conv1d_layer_sync_dropout_min = 0.20
conv1d_layer_sync_dropout_max = 0.24
# Dense layers
dense_layer_sync_dropout_min = 0.20
dense_layer_sync_dropout_max = 0.25
# Recurrent layers
recurring_layer_sync_dropout_min = 0.32
recurring_layer_sync_dropout_max = 0.36

model.add(Conv1D(filters=12, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(np.random.uniform(conv1d_layer_sync_dropout_min, conv1d_layer_sync_dropout_max)))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=16, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(np.random.uniform(conv1d_layer_sync_dropout_min, conv1d_layer_sync_dropout_max)))
model.add(MaxPooling1D(pool_size=2))

model.add(LSTM(20, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(regularization_standard)))
model.add(Dropout(np.random.uniform(recurring_layer_sync_dropout_min, recurring_layer_sync_dropout_max)))

model.add(Bidirectional(LSTM(24, return_sequences=True, kernel_regularizer=tf.keras.regularizers.l2(regularization_standard))))
model.add(Dropout(np.random.uniform(recurring_layer_sync_dropout_min, recurring_layer_sync_dropout_max)))
model.add(Flatten())
model.add(Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(regularization_standard)))
model.add(BatchNormalization())
model.add(Dropout(np.random.uniform(dense_layer_sync_dropout_min, dense_layer_sync_dropout_max)))
# Single-unit sigmoid output so the binary_crossentropy loss below matches the binary labels
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))
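The accuracy_score, precision_score, recall_score and f1_score imports at the top of this file are never used, and the held-out X_test / y_test split is never scored. A minimal sketch of how that evaluation could follow the fit call, assuming the single sigmoid output unit added above:

# Score the held-out test split with the already-imported sklearn metrics.
y_prob = model.predict(X_test)                 # predicted probabilities, shape (n_samples, 1)
y_pred = (y_prob.ravel() >= 0.5).astype(int)   # threshold at 0.5 to get hard class labels

print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1: {f1_score(y_test, y_pred):.4f}")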
@@ -0,0 +1 @@