From 5526024e637abb05e9941c05a9ee98dc51e72fd5 Mon Sep 17 00:00:00 2001 From: Ravi Roy Date: Mon, 28 Jul 2025 11:38:57 +0530 Subject: [PATCH] Some fixes or improvements --- app.py | 94 +++++++------------------ docs/dbscan.md | 53 ++++++++++++++ docs/kmeans.md | 45 ++++++++++++ sample_data.csv | 7 ++ unsupervised_algos/dbscan_clustering.py | 74 +++++++++++++++++++ unsupervised_algos/kmeans_clustering.py | 69 ++++++++++++++++++ 6 files changed, 275 insertions(+), 67 deletions(-) create mode 100644 docs/dbscan.md create mode 100644 docs/kmeans.md create mode 100644 sample_data.csv create mode 100644 unsupervised_algos/dbscan_clustering.py create mode 100644 unsupervised_algos/kmeans_clustering.py diff --git a/app.py b/app.py index 5a37089..dff6239 100644 --- a/app.py +++ b/app.py @@ -1,68 +1,28 @@ -# Importing required library import streamlit as st - - -# title of the page -st.title("ЁЯФмAlgo Labs Visualize and Learn") - - -# Navigation : all the files are in algo_files folder -tab1, tab2, tab3= st.tabs(["Visualize Data","Supervised Learning", "Unsupervised Learning"]) -with tab1: - from algo_files.data_visualize import visualize - visualize() -with tab2: - from algo_files.supervised_learning import supervised - supervised() -with tab3: - from algo_files.unsupervised_learning import unsupervised - unsupervised() - - -# Sidebar : Data Uploading and Data Generation -with st.sidebar: - options = ["Upload Dataset", "Generate Dataset"] - selected_option = st.radio("Choose your preferred option:", options, index=0) - - if selected_option == "Upload Dataset": - uploaded_file = st.file_uploader("Choose a CSV file", type="csv") - elif selected_option == "Generate Dataset": - no_of_sample = st.slider("No. of Samples",10,2000) - no_of_feature = st.slider("No. of Features",2,20) - noise_level= st.slider("Noise Level",0.00,50.00) - no_of_class= st.text_input("No. of Classes") - class_separation= st.slider("Class Separation",0.50,2.00) - def my_callback(): - st.write("Data Generated!") - st.button("Generate Data", on_click=my_callback) - - - - -#Footer -st.markdown(""" - - -""", unsafe_allow_html=True) - - - - - - - +import pandas as pd +from unsupervised_algos.kmeans_clustering import run_kmeans +from unsupervised_algos.dbscan_clustering import run_dbscan + +st.title("ЁЯза Clustering Playground") + +uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"]) +if uploaded_file: + data = pd.read_csv(uploaded_file) + st.write("### Data Preview", data.head()) + + # Session state initialization for selected algo + if 'selected_algo' not in st.session_state: + st.session_state.selected_algo = "KMeans" + + # Algorithm dropdown + st.session_state.selected_algo = st.selectbox( + "Choose Clustering Algorithm", + ["KMeans", "DBSCAN"], + index=["KMeans", "DBSCAN"].index(st.session_state.selected_algo) + ) + + # Algo caller + if st.session_state.selected_algo == "KMeans": + run_kmeans(data) + elif st.session_state.selected_algo == "DBSCAN": + run_dbscan(data) diff --git a/docs/dbscan.md b/docs/dbscan.md new file mode 100644 index 0000000..533ceeb --- /dev/null +++ b/docs/dbscan.md @@ -0,0 +1,53 @@ + +# DBSCAN Clustering + +## ЁЯУМ What is DBSCAN? + +DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is an unsupervised clustering algorithm based on the **density** of points. + +It can find clusters of **arbitrary shapes** and identify outliers (noise). + +--- + +## ЁЯЫая╕П Parameters + +| Parameter | Description | +|-----------|-------------| +| `eps` | Maximum distance between two samples to be considered neighbors | +| `min_samples` | Minimum number of neighbors to form a dense region | + +--- + +## ЁЯОп Output + +- **Clustered Scatter Plot** +- **Silhouette Score** (if valid) +- **Clustered Data Preview** + +--- + +## ЁЯзСтАНЁЯТ╗ How It Works in Streamlit UI + +1. Upload a `.csv` file with 2 numeric columns +2. Select `eps` and `min_samples` via sliders +3. Click **"Run DBSCAN"** +4. Get visual clusters and evaluation score (if possible) + +--- + +## тЪая╕П Special Note + +If DBSCAN finds only one cluster or marks all points as noise, silhouette score cannot be calculated. + +--- + +## тЬЕ Example CSV Format + +```csv +x,y +1,2 +2,3 +3,4 +8,7 +9,6 +10,8 diff --git a/docs/kmeans.md b/docs/kmeans.md new file mode 100644 index 0000000..b8082e5 --- /dev/null +++ b/docs/kmeans.md @@ -0,0 +1,45 @@ +# KMeans Clustering + +## ЁЯУМ What is KMeans? + +KMeans is a centroid-based unsupervised clustering algorithm that groups data into **k clusters**. It tries to minimize the distance between data points and their respective cluster centers. + +--- + +## ЁЯЫая╕П Parameters + +| Parameter | Description | +|-----------|-------------| +| `n_clusters` | Number of clusters to form | +| `random_state` | Ensures reproducibility | +| `n_init='auto'` | Stable initialization | + +--- + +## ЁЯОп Output + +- **Clustered Scatter Plot** +- **Silhouette Score** to evaluate cluster separation +- **Preview of Clustered Data** + +--- + +## ЁЯзСтАНЁЯТ╗ How It Works in Streamlit UI + +1. Upload a `.csv` file with 2 numeric columns (e.g., x, y) +2. Select the number of clusters using a slider +3. Click **"Run KMeans Clustering"** +4. Get visual clusters and evaluation score + +--- + +## тЬЕ Example CSV Format + +```csv +x,y +1,2 +2,3 +3,4 +8,7 +9,6 +10,8 diff --git a/sample_data.csv b/sample_data.csv new file mode 100644 index 0000000..c189873 --- /dev/null +++ b/sample_data.csv @@ -0,0 +1,7 @@ +x,y +1,2 +2,3 +3,4 +8,7 +9,6 +10,8 diff --git a/unsupervised_algos/dbscan_clustering.py b/unsupervised_algos/dbscan_clustering.py new file mode 100644 index 0000000..cbdec50 --- /dev/null +++ b/unsupervised_algos/dbscan_clustering.py @@ -0,0 +1,74 @@ +import streamlit as st +from sklearn.cluster import DBSCAN +from sklearn.metrics import silhouette_score +import matplotlib.pyplot as plt +import uuid +import pandas as pd + +def run_dbscan(data: pd.DataFrame): + st.subheader("DBSCAN Clustering") + + # Session state variables + if 'dbscan_eps' not in st.session_state: + st.session_state.dbscan_eps = 0.5 + if 'dbscan_min_samples' not in st.session_state: + st.session_state.dbscan_min_samples = 5 + if 'dbscan_score' not in st.session_state: + st.session_state.dbscan_score = None + if 'dbscan_plot_fig' not in st.session_state: + st.session_state.dbscan_plot_fig = None + if 'dbscan_clustered_data' not in st.session_state: + st.session_state.dbscan_clustered_data = None + + # UI form + with st.form(key="dbscan_form"): + eps = st.slider("Epsilon (eps)", 0.1, 5.0, st.session_state.dbscan_eps, step=0.1) + min_samples = st.slider("Min Samples", 1, 20, st.session_state.dbscan_min_samples) + submitted = st.form_submit_button("Run DBSCAN") + + if submitted: + st.write("тЬЕ Inside DBSCAN submit") + + input_data = data.copy() + input_data = input_data.apply(pd.to_numeric, errors='coerce') + input_data.dropna(inplace=True) + + # Drop old cluster column + if "Cluster" in input_data.columns: + input_data.drop("Cluster", axis=1, inplace=True) + + model = DBSCAN(eps=eps, min_samples=min_samples) + labels = model.fit_predict(input_data) + input_data["Cluster"] = labels + + st.session_state.dbscan_eps = eps + st.session_state.dbscan_min_samples = min_samples + st.session_state.dbscan_clustered_data = input_data.head() + + if len(set(labels)) > 1 and -1 not in set(labels): + score = silhouette_score(input_data.iloc[:, :-1], labels) + st.session_state.dbscan_score = score + else: + st.session_state.dbscan_score = "Not enough clusters to compute score" + + # Plot + fig, ax = plt.subplots(figsize=(6, 4)) + ax.scatter(input_data.iloc[:, 0], input_data.iloc[:, 1], c=labels, cmap='plasma') + ax.set_title("DBSCAN Clustering") + ax.set_xlabel("X") + ax.set_ylabel("Y") + st.session_state.dbscan_plot_fig = fig + + # Result display (outside submit) + if st.session_state.dbscan_score is not None: + if isinstance(st.session_state.dbscan_score, str): + st.warning(st.session_state.dbscan_score) + else: + st.write(f"### Silhouette Score: {st.session_state.dbscan_score:.2f}") + + if st.session_state.dbscan_plot_fig: + st.pyplot(st.session_state.dbscan_plot_fig) + plt.close(st.session_state.dbscan_plot_fig) + + if st.session_state.dbscan_clustered_data is not None: + st.write("### Clustered Data", st.session_state.dbscan_clustered_data) diff --git a/unsupervised_algos/kmeans_clustering.py b/unsupervised_algos/kmeans_clustering.py new file mode 100644 index 0000000..1e656c9 --- /dev/null +++ b/unsupervised_algos/kmeans_clustering.py @@ -0,0 +1,69 @@ +import streamlit as st +from sklearn.cluster import KMeans +from sklearn.metrics import silhouette_score +import matplotlib.pyplot as plt +import pandas as pd # Added for type hinting and clarity + +def run_kmeans(data: pd.DataFrame): + st.subheader("KMeans Clustering") + + # Session state variables рдХреЛ рдЗрдирд┐рд╢рд┐рдпрд▓рд╛рдЗрдЬрд╝ рдХрд░реЗрдВ рддрд╛рдХрд┐ рдкрд░рд┐рдгрд╛рдо рдмрдиреЗ рд░рд╣реЗрдВ + if 'silhouette_score' not in st.session_state: + st.session_state.silhouette_score = None + if 'kmeans_plot_fig' not in st.session_state: + st.session_state.kmeans_plot_fig = None + if 'clustered_data_head' not in st.session_state: + st.session_state.clustered_data_head = None + if 'kmeans_n_clusters' not in st.session_state: # рдкрд┐рдЫрд▓реА рдмрд╛рд░ рдЙрдкрдпреЛрдЧ рдХрд┐рдП рдЧрдП n_clusters рдХреЛ рд╕реНрдЯреЛрд░ рдХрд░рдиреЗ рдХреЗ рд▓рд┐рдП + st.session_state.kmeans_n_clusters = 3 # рдбрд┐рдлрд╝реЙрд▓реНрдЯ рдорд╛рди + + # st.form рдХреЗ рд▓рд┐рдП рдПрдХ рд╕реНрдерд┐рд░ key рдХрд╛ рдЙрдкрдпреЛрдЧ рдХрд░реЗрдВ + with st.form(key="kmeans_params_form"): + # рд╕реНрд▓рд╛рдЗрдбрд░ рдХрд╛ рдбрд┐рдлрд╝реЙрд▓реНрдЯ рдорд╛рди session_state рд╕реЗ рд▓реЗрдВ + n_clusters = st.slider("Select number of clusters", 2, 10, st.session_state.kmeans_n_clusters, key="kmeans_n_clusters_slider") + submitted = st.form_submit_button("Run KMeans Clustering") + st.write("Data Types:", data.dtypes) + + if submitted: + st.write("тЬЕ Inside submit block") + + # рдЗрдирдкреБрдЯ рдбреЗрдЯрд╛ рдХреА рдХреЙрдкреА рдмрдирд╛рдПрдВ + input_data = data.copy() + + # рдпрджрд┐ 'Cluster' рдХреЙрд▓рдо рдкрд╣рд▓реЗ рд╕реЗ рдореМрдЬреВрдж рд╣реИ рддреЛ рдЙрд╕реЗ рд╣рдЯрд╛ рджреЗрдВ + if "Cluster" in input_data.columns: + input_data.drop("Cluster", axis=1, inplace=True) + + # KMeans рдлрд┐рдЯ рдХрд░реЗрдВ + # KMeans рдХреЗ рд▓рд┐рдП n_init='auto' рдЬреЛрдбрд╝рд╛ рдЧрдпрд╛ рддрд╛рдХрд┐ рднрд╡рд┐рд╖реНрдп рдХреА рдЪреЗрддрд╛рд╡рдиреА рд╕реЗ рдмрдЪрд╛ рдЬрд╛ рд╕рдХреЗ + kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto') + kmeans.fit(input_data) + + input_data["Cluster"] = kmeans.labels_ + + # рд╕рд┐рд▓реБрдПрдЯ рд╕реНрдХреЛрд░ рдХреА рдЧрдгрдирд╛ рдХрд░реЗрдВ рдФрд░ session_state рдореЗрдВ рд╕реНрдЯреЛрд░ рдХрд░реЗрдВ + score = silhouette_score(input_data.iloc[:, :-1], input_data["Cluster"]) + st.session_state.silhouette_score = score + st.session_state.kmeans_n_clusters = n_clusters # рдЪреБрдиреЗ рдЧрдП n_clusters рдХреЛ рд╕реНрдЯреЛрд░ рдХрд░реЗрдВ + + # рдкреНрд▓реЙрдЯ рдмрдирд╛рдПрдВ рдФрд░ figure рдХреЛ session_state рдореЗрдВ рд╕реНрдЯреЛрд░ рдХрд░реЗрдВ + fig, ax = plt.subplots(figsize=(6, 4)) + ax.scatter(input_data.iloc[:, 0], input_data.iloc[:, 1], c=input_data["Cluster"], cmap='viridis') + ax.set_xlabel("Feature 1") + ax.set_ylabel("Feature 2") + ax.set_title("KMeans Clustering") + st.session_state.kmeans_plot_fig = fig + + # рдХреНрд▓рд╕реНрдЯрд░ рдХрд┐рдП рдЧрдП рдбреЗрдЯрд╛ рдХрд╛ head session_state рдореЗрдВ рд╕реНрдЯреЛрд░ рдХрд░реЗрдВ + st.session_state.clustered_data_head = input_data.head() + + # рдкрд░рд┐рдгрд╛рдо 'if submitted' рдмреНрд▓реЙрдХ рдХреЗ рдмрд╛рд╣рд░ рдкреНрд░рджрд░реНрд╢рд┐рдд рдХрд░реЗрдВ, рд▓реЗрдХрд┐рди рддрднреА рдЬрдм рд╡реЗ session_state рдореЗрдВ рдореМрдЬреВрдж рд╣реЛрдВ + if st.session_state.silhouette_score is not None: + st.write(f"### Silhouette Score: {st.session_state.silhouette_score:.2f}") + + if st.session_state.kmeans_plot_fig is not None: + st.pyplot(st.session_state.kmeans_plot_fig) + plt.close(st.session_state.kmeans_plot_fig) # рдореЗрдореЛрд░реА рд▓реАрдХ рд╕реЗ рдмрдЪрдиреЗ рдХреЗ рд▓рд┐рдП figure рдХреЛ рдмрдВрдж рдХрд░реЗрдВ + + if st.session_state.clustered_data_head is not None: + st.write("### Clustered Data", st.session_state.clustered_data_head) \ No newline at end of file