From 5526024e637abb05e9941c05a9ee98dc51e72fd5 Mon Sep 17 00:00:00 2001
From: Ravi Roy <raviroy2002@gmail.com>
Date: Mon, 28 Jul 2025 11:38:57 +0530
Subject: [PATCH] Replace tabbed app with clustering playground (KMeans, DBSCAN)

---
 app.py                                  | 94 +++++++------------------
 docs/dbscan.md                          | 53 ++++++++++++++
 docs/kmeans.md                          | 45 ++++++++++++
 sample_data.csv                         |  7 ++
 unsupervised_algos/dbscan_clustering.py | 74 +++++++++++++++++++
 unsupervised_algos/kmeans_clustering.py | 69 ++++++++++++++++++
 6 files changed, 275 insertions(+), 67 deletions(-)
 create mode 100644 docs/dbscan.md
 create mode 100644 docs/kmeans.md
 create mode 100644 sample_data.csv
 create mode 100644 unsupervised_algos/dbscan_clustering.py
 create mode 100644 unsupervised_algos/kmeans_clustering.py
diff --git a/app.py b/app.py
index 5a37089..dff6239 100644
--- a/app.py
+++ b/app.py
@@ -1,68 +1,28 @@
-# Importing required library
 import streamlit as st
-
-
-# title  of the  page
-st.title("🔬Algo Labs Visualize  and Learn")
-
-
-# Navigation : all the files are in algo_files folder
-tab1, tab2, tab3= st.tabs(["Visualize Data","Supervised Learning", "Unsupervised Learning"])
-with tab1:
-    from algo_files.data_visualize import  visualize
-    visualize()
-with tab2:
-    from  algo_files.supervised_learning import  supervised
-    supervised()
-with tab3:
-    from  algo_files.unsupervised_learning import  unsupervised
-    unsupervised()
-
-
-# Sidebar : Data Uploading and Data Generation  
-with st.sidebar:
-    options = ["Upload Dataset", "Generate Dataset"]
-    selected_option = st.radio("Choose your preferred option:", options, index=0)
-    
-    if selected_option == "Upload Dataset":
-        uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
-    elif selected_option == "Generate Dataset":
-        no_of_sample = st.slider("No. of Samples",10,2000)
-        no_of_feature = st.slider("No. of Features",2,20)
-        noise_level= st.slider("Noise Level",0.00,50.00)
-        no_of_class= st.text_input("No. of Classes")
-        class_separation= st.slider("Class Separation",0.50,2.00)
-        def my_callback():
-            st.write("Data  Generated!")
-        st.button("Generate Data", on_click=my_callback)
-
-
-
-
-#Footer
-st.markdown("""
-<style>
-.footer {
-    position: fixed;
-    left: 0;
-    bottom: 0;
-    width: 100%;
-    background-color: #f0f2f6;
-    color: black;
-    text-align: right;
-    padding: 10px;
-    border-top: 1px solid #e0e0e0;
-    height:50px;
-}
-</style>
-<div class="footer">
-    <p>© 2025 GGSOC❤️ </p>
-</div>
-""", unsafe_allow_html=True)
-
-
-
-       
-
-
-
+import pandas as pd
+from unsupervised_algos.kmeans_clustering import run_kmeans
+from unsupervised_algos.dbscan_clustering import run_dbscan
+
+st.title("🧠 Clustering Playground")
+
+uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
+if uploaded_file:  # everything below is rendered only once a file has been uploaded
+    data = pd.read_csv(uploaded_file)
+    st.write("### Data Preview", data.head())
+
+    # Default the remembered algorithm to "KMeans" when session state has no choice yet
+    if 'selected_algo' not in st.session_state:
+        st.session_state.selected_algo = "KMeans"
+
+    # Algorithm dropdown, pre-selected to the remembered choice; the pick is stored back
+    st.session_state.selected_algo = st.selectbox(
+        "Choose Clustering Algorithm",
+        ["KMeans", "DBSCAN"],
+        index=["KMeans", "DBSCAN"].index(st.session_state.selected_algo)
+    )
+
+    # Hand the uploaded DataFrame to the UI of the selected algorithm
+    if st.session_state.selected_algo == "KMeans":
+        run_kmeans(data)
+    elif st.session_state.selected_algo == "DBSCAN":
+        run_dbscan(data)
diff --git a/docs/dbscan.md b/docs/dbscan.md
new file mode 100644
index 0000000..533ceeb
--- /dev/null
+++ b/docs/dbscan.md
@@ -0,0 +1,53 @@
+
+# DBSCAN Clustering
+
+## 📌 What is DBSCAN?
+
+DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is an unsupervised clustering algorithm based on the **density** of points.
+
+It can find clusters of **arbitrary shapes** and identify outliers (noise).
+
+---
+
+## 🛠️ Parameters
+
+| Parameter | Description |
+|-----------|-------------|
+| `eps` | Maximum distance between two samples to be considered neighbors |
+| `min_samples` | Minimum number of samples within `eps` of a point (the point itself included) for it to count as a core point |
+
+---
+
+## 🎯 Output
+
+- **Clustered Scatter Plot**
+- **Silhouette Score** (if valid)
+- **Clustered Data Preview**
+
+---
+
+## 🧑‍💻 How It Works in Streamlit UI
+
+1. Upload a `.csv` file with 2 numeric columns
+2. Select `eps` and `min_samples` via sliders
+3. Click **"Run DBSCAN"**
+4. Get visual clusters and evaluation score (if possible)
+
+---
+
+## ⚠️ Special Note
+
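+If DBSCAN finds fewer than two distinct labels (e.g. a single cluster, or every point marked as noise), the silhouette score cannot be calculated. This app also skips the score whenever any point is labeled as noise (`-1`).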
+
+---
+
+## ✅ Example CSV Format
+
+```csv
+x,y
+1,2
+2,3
+3,4
+8,7
+9,6
+10,8
diff --git a/docs/kmeans.md b/docs/kmeans.md
new file mode 100644
index 0000000..b8082e5
--- /dev/null
+++ b/docs/kmeans.md
@@ -0,0 +1,45 @@
+# KMeans Clustering
+
+## 📌 What is KMeans?
+
+KMeans is a centroid-based unsupervised clustering algorithm that groups data into **k clusters**. It tries to minimize the distance between data points and their respective cluster centers.
+
+---
+
+## 🛠️ Parameters
+
+| Parameter | Description |
+|-----------|-------------|
+| `n_clusters` | Number of clusters to form |
+| `random_state` | Ensures reproducibility |
+| `n_init='auto'` | Lets scikit-learn choose the number of centroid initializations; setting it explicitly avoids the `n_init` FutureWarning |
+
+---
+
+## 🎯 Output
+
+- **Clustered Scatter Plot**
+- **Silhouette Score** to evaluate cluster separation
+- **Preview of Clustered Data**
+
+---
+
+## 🧑‍💻 How It Works in Streamlit UI
+
+1. Upload a `.csv` file with 2 numeric columns (e.g., x, y)
+2. Select the number of clusters using a slider
+3. Click **"Run KMeans Clustering"**
+4. Get visual clusters and evaluation score
+
+---
+
+## ✅ Example CSV Format
+
+```csv
+x,y
+1,2
+2,3
+3,4
+8,7
+9,6
+10,8
diff --git a/sample_data.csv b/sample_data.csv
new file mode 100644
index 0000000..c189873
--- /dev/null
+++ b/sample_data.csv
@@ -0,0 +1,7 @@
+x,y
+1,2
+2,3
+3,4
+8,7
+9,6
+10,8
diff --git a/unsupervised_algos/dbscan_clustering.py b/unsupervised_algos/dbscan_clustering.py
new file mode 100644
index 0000000..cbdec50
--- /dev/null
+++ b/unsupervised_algos/dbscan_clustering.py
@@ -0,0 +1,74 @@
+import streamlit as st
+from sklearn.cluster import DBSCAN
+from sklearn.metrics import silhouette_score
+import matplotlib.pyplot as plt
+import uuid
+import pandas as pd
+
+def run_dbscan(data: pd.DataFrame):
+    st.subheader("DBSCAN Clustering")
+
+    # Seed session-state defaults on first use: the two parameters plus empty result slots
+    if 'dbscan_eps' not in st.session_state:
+        st.session_state.dbscan_eps = 0.5
+    if 'dbscan_min_samples' not in st.session_state:
+        st.session_state.dbscan_min_samples = 5
+    if 'dbscan_score' not in st.session_state:
+        st.session_state.dbscan_score = None
+    if 'dbscan_plot_fig' not in st.session_state:
+        st.session_state.dbscan_plot_fig = None
+    if 'dbscan_clustered_data' not in st.session_state:
+        st.session_state.dbscan_clustered_data = None
+
+    # Parameter form; the slider defaults are the values currently held in session state
+    with st.form(key="dbscan_form"):
+        eps = st.slider("Epsilon (eps)", 0.1, 5.0, st.session_state.dbscan_eps, step=0.1)
+        min_samples = st.slider("Min Samples", 1, 20, st.session_state.dbscan_min_samples)
+        submitted = st.form_submit_button("Run DBSCAN")
+
+        if submitted:
+            st.write("✅ Inside DBSCAN submit")  # NOTE(review): looks like leftover debug output
+
+            input_data = data.copy()
+            input_data = input_data.apply(pd.to_numeric, errors='coerce')  # non-numeric values become NaN
+            input_data.dropna(inplace=True)  # drop every row that contains a NaN
+
+            # Drop a pre-existing "Cluster" column before fitting so it is not clustered on
+            if "Cluster" in input_data.columns:
+                input_data.drop("Cluster", axis=1, inplace=True)
+
+            model = DBSCAN(eps=eps, min_samples=min_samples)
+            labels = model.fit_predict(input_data)
+            input_data["Cluster"] = labels
+
+            st.session_state.dbscan_eps = eps
+            st.session_state.dbscan_min_samples = min_samples
+            st.session_state.dbscan_clustered_data = input_data.head()  # only the head is kept for display
+
+            if len(set(labels)) > 1 and -1 not in set(labels):
+                score = silhouette_score(input_data.iloc[:, :-1], labels)  # every column except the appended "Cluster"
+                st.session_state.dbscan_score = score
+            else:  # a single distinct label, or at least one point labeled -1 (DBSCAN noise)
+                st.session_state.dbscan_score = "Not enough clusters to compute score"
+
+            # Scatter plot of the first two columns, colored by cluster label
+            fig, ax = plt.subplots(figsize=(6, 4))
+            ax.scatter(input_data.iloc[:, 0], input_data.iloc[:, 1], c=labels, cmap='plasma')
+            ax.set_title("DBSCAN Clustering")
+            ax.set_xlabel("X")
+            ax.set_ylabel("Y")
+            st.session_state.dbscan_plot_fig = fig
+
+    # Render whatever results are stored in session state; a str score is the stored warning text
+    if st.session_state.dbscan_score is not None:
+        if isinstance(st.session_state.dbscan_score, str):
+            st.warning(st.session_state.dbscan_score)
+        else:
+            st.write(f"### Silhouette Score: {st.session_state.dbscan_score:.2f}")
+
+    if st.session_state.dbscan_plot_fig:
+        st.pyplot(st.session_state.dbscan_plot_fig)
+        plt.close(st.session_state.dbscan_plot_fig)
+
+    if st.session_state.dbscan_clustered_data is not None:
+        st.write("### Clustered Data", st.session_state.dbscan_clustered_data)
diff --git a/unsupervised_algos/kmeans_clustering.py b/unsupervised_algos/kmeans_clustering.py
new file mode 100644
index 0000000..1e656c9
--- /dev/null
+++ b/unsupervised_algos/kmeans_clustering.py
@@ -0,0 +1,69 @@
+import streamlit as st
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+import matplotlib.pyplot as plt
+import pandas as pd # Added for type hinting and clarity
+
+def run_kmeans(data: pd.DataFrame):
+    st.subheader("KMeans Clustering")
+
+    # Initialize the session-state variables so that results persist
+    if 'silhouette_score' not in st.session_state:
+        st.session_state.silhouette_score = None
+    if 'kmeans_plot_fig' not in st.session_state:
+        st.session_state.kmeans_plot_fig = None
+    if 'clustered_data_head' not in st.session_state:
+        st.session_state.clustered_data_head = None
+    if 'kmeans_n_clusters' not in st.session_state: # stores the n_clusters used last time
+        st.session_state.kmeans_n_clusters = 3 # default value
+
+    # Use a stable key for st.form
+    with st.form(key="kmeans_params_form"):
+        # Take the slider's default value from session_state
+        n_clusters = st.slider("Select number of clusters", 2, 10, st.session_state.kmeans_n_clusters, key="kmeans_n_clusters_slider")
+        submitted = st.form_submit_button("Run KMeans Clustering")
+        st.write("Data Types:", data.dtypes)
+
+        if submitted:
+            st.write("✅ Inside submit block")
+
+            # Make a copy of the input data
+            input_data = data.copy()
+
+            # Drop the 'Cluster' column if it already exists
+            if "Cluster" in input_data.columns:
+                input_data.drop("Cluster", axis=1, inplace=True)
+
+            # Fit KMeans on the data as uploaded (NOTE(review): no numeric coercion/NaN drop here, unlike run_dbscan)
+            # n_init='auto' is passed to KMeans to avoid a FutureWarning
+            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
+            kmeans.fit(input_data)
+
+            input_data["Cluster"] = kmeans.labels_
+
+            # Compute the silhouette score and store it in session_state
+            score = silhouette_score(input_data.iloc[:, :-1], input_data["Cluster"])
+            st.session_state.silhouette_score = score
+            st.session_state.kmeans_n_clusters = n_clusters # remember the chosen n_clusters
+
+            # Build the plot and store the figure in session_state
+            fig, ax = plt.subplots(figsize=(6, 4))
+            ax.scatter(input_data.iloc[:, 0], input_data.iloc[:, 1], c=input_data["Cluster"], cmap='viridis')
+            ax.set_xlabel("Feature 1")
+            ax.set_ylabel("Feature 2")
+            ax.set_title("KMeans Clustering")
+            st.session_state.kmeans_plot_fig = fig
+
+            # Store the head of the clustered data in session_state
+            st.session_state.clustered_data_head = input_data.head()
+
+    # Display results outside the 'if submitted' block, but only when they exist in session_state
+    if st.session_state.silhouette_score is not None:
+        st.write(f"### Silhouette Score: {st.session_state.silhouette_score:.2f}")
+
+    if st.session_state.kmeans_plot_fig is not None:
+        st.pyplot(st.session_state.kmeans_plot_fig)
+        plt.close(st.session_state.kmeans_plot_fig) # close the figure to avoid a memory leak
+
+    if st.session_state.clustered_data_head is not None:
+        st.write("### Clustered Data", st.session_state.clustered_data_head)
\ No newline at end of file