diff --git a/app.py b/app.py index e16ff7a..ec5cea5 100644 --- a/app.py +++ b/app.py @@ -1,30 +1,10 @@ - -# Importing required library -import streamlit as st -import pandas as pd - -#import upload_validate() from data validation - -# ============================== -# app.py - AlgoLab Main Script -# ---------------------------- -# - Handles UI and navigation -# - Dataset Upload & Generation -# - Calls interactive_model_tuning() -# ============================== - import streamlit as st import pandas as pd -from supervised_module import interactive_model_tuning - +from unsupervised_algos.kmeans_clustering import run_kmeans +from unsupervised_algos.dbscan_clustering import run_dbscan from data_handler.upload_validate import upload_and_validate from sklearn.datasets import make_classification - - - -# Page configuration -======= -# ✅ Page configuration +from supervised_module import interactive_model_tuning st.set_page_config( page_title="Algo Lab", @@ -33,10 +13,8 @@ initial_sidebar_state="expanded" ) -# ✅ App Title st.title("🔬 Algo Labs - Visualize and Learn") -# ✅ Motivational Quote Box st.markdown("""
.footer { diff --git a/docs/dbscan.md b/docs/dbscan.md new file mode 100644 index 0000000..533ceeb --- /dev/null +++ b/docs/dbscan.md @@ -0,0 +1,53 @@ + +# DBSCAN Clustering + +## 📌 What is DBSCAN? + +DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is an unsupervised clustering algorithm based on the **density** of points. + +It can find clusters of **arbitrary shapes** and identify outliers (noise). + +--- + +## 🛠️ Parameters + +| Parameter | Description | +|-----------|-------------| +| `eps` | Maximum distance between two samples to be considered neighbors | +| `min_samples` | Minimum number of samples (including the point itself) within `eps` of a point for it to count as a core point | + +--- + +## 🎯 Output + +- **Clustered Scatter Plot** +- **Silhouette Score** (if valid) +- **Clustered Data Preview** + +--- + +## 🧑‍💻 How It Works in Streamlit UI + +1. Upload a `.csv` file with 2 numeric columns +2. Select `eps` and `min_samples` via sliders +3. Click **"Run DBSCAN"** +4. Get visual clusters and evaluation score (if possible) + +--- + +## ⚠️ Special Note + +If DBSCAN finds only one cluster or marks all points as noise, the silhouette score cannot be calculated. + +--- + +## ✅ Example CSV Format + +```csv +x,y +1,2 +2,3 +3,4 +8,7 +9,6 +10,8 diff --git a/docs/kmeans.md b/docs/kmeans.md new file mode 100644 index 0000000..b8082e5 --- /dev/null +++ b/docs/kmeans.md @@ -0,0 +1,45 @@ +# KMeans Clustering + +## 📌 What is KMeans? + +KMeans is a centroid-based unsupervised clustering algorithm that groups data into **k clusters**. It tries to minimize the distance between data points and their respective cluster centers. 
def run_dbscan(data: pd.DataFrame):
    """Render the DBSCAN controls, cluster ``data`` on submit, and show results.

    The chosen parameters, the silhouette score (or a message when it cannot
    be computed), the scatter-plot figure and the first rows of the labelled
    data are kept in ``st.session_state`` under ``dbscan_*`` keys, so the
    last result is redrawn on every rerun and not only on the rerun triggered
    by the submit button.

    Args:
        data: Table to cluster. It is copied, never modified. Values are
            coerced to numbers; columns without any numeric value and rows
            with missing values are discarded. The first two remaining
            columns are used as the plot axes.
    """
    st.subheader("DBSCAN Clustering")

    # Seed session state once so widgets and the result area have defaults.
    initial_state = {
        "dbscan_eps": 0.5,
        "dbscan_min_samples": 5,
        "dbscan_score": None,
        "dbscan_plot_fig": None,
        "dbscan_clustered_data": None,
    }
    for key, value in initial_state.items():
        if key not in st.session_state:
            st.session_state[key] = value

    # Parameter form: nothing is recomputed until the button is pressed.
    with st.form(key="dbscan_form"):
        eps = st.slider("Epsilon (eps)", 0.1, 5.0, st.session_state.dbscan_eps, step=0.1)
        min_samples = st.slider("Min Samples", 1, 20, st.session_state.dbscan_min_samples)
        submitted = st.form_submit_button("Run DBSCAN")

    if submitted:
        st.write("✅ Inside DBSCAN submit")

        features = data.copy()
        # A label column left over from an earlier run must not be clustered.
        if "Cluster" in features.columns:
            features = features.drop(columns="Cluster")
        features = features.apply(pd.to_numeric, errors="coerce")
        # Drop columns that are entirely non-numeric first; otherwise the
        # row-wise dropna below would discard every row of the table.
        features = features.dropna(axis=1, how="all").dropna()

        if features.empty or features.shape[1] < 2:
            # Without this guard DBSCAN raises on empty input and the scatter
            # plot raises IndexError when the second column is missing.
            st.error("DBSCAN needs at least two numeric columns and one complete row of data.")
        else:
            labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(features)

            clustered = features.copy()
            clustered["Cluster"] = labels

            st.session_state.dbscan_eps = eps
            st.session_state.dbscan_min_samples = min_samples
            st.session_state.dbscan_clustered_data = clustered.head()

            # Noise points (label -1) are not a cluster, so they are left out
            # of the score. silhouette_score needs between 2 and
            # n_samples - 1 distinct labels and raises ValueError otherwise.
            in_cluster = labels != -1
            cluster_count = len(set(labels[in_cluster]))
            if 2 <= cluster_count < int(in_cluster.sum()):
                st.session_state.dbscan_score = silhouette_score(
                    features.loc[in_cluster], labels[in_cluster]
                )
            else:
                st.session_state.dbscan_score = "Not enough clusters to compute score"

            # Scatter plot of the first two feature columns, coloured by label.
            fig, ax = plt.subplots(figsize=(6, 4))
            ax.scatter(features.iloc[:, 0], features.iloc[:, 1], c=labels, cmap='plasma')
            ax.set_title("DBSCAN Clustering")
            ax.set_xlabel("X")
            ax.set_ylabel("Y")
            st.session_state.dbscan_plot_fig = fig

    # Result display lives outside the submit branch so that stored results
    # are shown again on reruns that were not caused by the button.
    score = st.session_state.dbscan_score
    if score is not None:
        if isinstance(score, str):
            st.warning(score)
        else:
            st.write(f"### Silhouette Score: {score:.2f}")

    if st.session_state.dbscan_plot_fig is not None:
        st.pyplot(st.session_state.dbscan_plot_fig)
        # Deregister the figure from pyplot so figures do not pile up.
        plt.close(st.session_state.dbscan_plot_fig)

    if st.session_state.dbscan_clustered_data is not None:
        st.write("### Clustered Data", st.session_state.dbscan_clustered_data)
def run_kmeans(data: pd.DataFrame):
    """Render the KMeans controls, cluster ``data`` on submit, and show results.

    The silhouette score, the scatter-plot figure, the first rows of the
    labelled data and the last used cluster count are kept in
    ``st.session_state`` so the last result is redrawn on every rerun and not
    only on the rerun triggered by the submit button.

    Args:
        data: Table to cluster. It is copied, never modified. Values are
            coerced to numbers; columns without any numeric value and rows
            with missing values are discarded. The first two remaining
            columns are used as the plot axes.
    """
    st.subheader("KMeans Clustering")

    # Initialise session state so results persist across reruns.
    initial_state = {
        "silhouette_score": None,
        "kmeans_plot_fig": None,
        "clustered_data_head": None,
        "kmeans_n_clusters": 3,  # default / last used number of clusters
    }
    for key, value in initial_state.items():
        if key not in st.session_state:
            st.session_state[key] = value

    # A fixed form key keeps the widget identity stable between reruns.
    with st.form(key="kmeans_params_form"):
        # The slider default comes from the previously used value.
        n_clusters = st.slider(
            "Select number of clusters",
            2,
            10,
            st.session_state.kmeans_n_clusters,
            key="kmeans_n_clusters_slider",
        )
        submitted = st.form_submit_button("Run KMeans Clustering")
    st.write("Data Types:", data.dtypes)

    if submitted:
        st.write("✅ Inside submit block")

        # Work on a copy of the input data.
        features = data.copy()
        # Remove a 'Cluster' column left over from an earlier run.
        if "Cluster" in features.columns:
            features = features.drop(columns="Cluster")
        # KMeans cannot fit text or NaN values: coerce to numbers, drop
        # columns that are entirely non-numeric, then drop incomplete rows.
        features = features.apply(pd.to_numeric, errors="coerce")
        features = features.dropna(axis=1, how="all").dropna()
        row_count = len(features)

        if features.empty or features.shape[1] < 2:
            # The scatter plot below needs two feature columns.
            st.error("KMeans needs at least two numeric columns and one complete row of data.")
        elif n_clusters > row_count:
            # KMeans raises ValueError when asked for more clusters than rows.
            st.error(
                f"Cannot form {n_clusters} clusters from {row_count} rows; "
                "choose fewer clusters or provide more data."
            )
        else:
            # n_init='auto' avoids the scikit-learn FutureWarning about the
            # changing default.
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
            kmeans.fit(features)
            labels = kmeans.labels_

            clustered = features.copy()
            clustered["Cluster"] = labels

            # silhouette_score needs between 2 and n_samples - 1 distinct
            # labels and raises ValueError otherwise.
            if 2 <= len(set(labels)) < row_count:
                st.session_state.silhouette_score = silhouette_score(features, labels)
            else:
                st.session_state.silhouette_score = None
                st.warning("Silhouette score is undefined for this clustering.")
            st.session_state.kmeans_n_clusters = n_clusters  # remember the choice

            # Build the plot and keep the figure in session state.
            fig, ax = plt.subplots(figsize=(6, 4))
            ax.scatter(features.iloc[:, 0], features.iloc[:, 1], c=labels, cmap='viridis')
            ax.set_xlabel("Feature 1")
            ax.set_ylabel("Feature 2")
            ax.set_title("KMeans Clustering")
            st.session_state.kmeans_plot_fig = fig

            # Keep a preview of the labelled data.
            st.session_state.clustered_data_head = clustered.head()

    # Show results outside the submit branch, but only once they exist.
    if st.session_state.silhouette_score is not None:
        st.write(f"### Silhouette Score: {st.session_state.silhouette_score:.2f}")

    if st.session_state.kmeans_plot_fig is not None:
        st.pyplot(st.session_state.kmeans_plot_fig)
        # Close the figure so pyplot does not keep it registered.
        plt.close(st.session_state.kmeans_plot_fig)

    if st.session_state.clustered_data_head is not None:
        st.write("### Clustered Data", st.session_state.clustered_data_head)