changed plotting and added data description

ACSE-vg822 · Sep 14, 2024 · d805076 · d805076
1 parent 80fdab1
commit d805076
Show file tree

Hide file tree

Showing 4 changed files with 96 additions and 49 deletions.
diff --git a/src/components/data_ingestion.py b/src/components/data_ingestion.py
@@ -1,8 +1,10 @@
 import os
 import sys
 import numpy as np
+
 from src.exception import CustomException
 from src.logger import logging
+from src.utils import save_object, load_object, convert_and_set_time_index
 
 import pandas as pd
 from sklearn.model_selection import train_test_split
@@ -37,10 +39,7 @@ def initiate_data_ingestion(self):
             logging.info('Read the dataset as dataframe')
 
             # Step 2: Set 'Time' column as index
-            if 'Time' in df.columns:
-                logging.info("Converting 'Time' column to human-readable format.")
-                df['Time'] = pd.to_datetime(df['Time'], unit='s')  # Assuming Unix timestamps
-                df.set_index('Time', inplace=True)
+            df = convert_and_set_time_index(df)
 
             # Step 3: Normalize the data using MinMaxScaler
             scaler = MinMaxScaler()

diff --git a/src/pipeline/predict_pipeline.py b/src/pipeline/predict_pipeline.py
@@ -5,7 +5,7 @@
 from tensorflow.keras.models import load_model
 from src.exception import CustomException
 from src.logger import logging
-from src.utils import save_object, load_object  # Assuming save_object and load_object are present for reusability
+from src.utils import save_object, load_object, convert_and_set_time_index  # Assuming save_object and load_object are present for reusability
 from sklearn.preprocessing import MinMaxScaler
 from dataclasses import dataclass
 
@@ -63,10 +63,12 @@ def predict(self, df, seq_length = 10):
             threshold = self.load_threshold()
 
             # Step 3: Convert the 'time' column to human-readable format
-            if 'Time' in df.columns:
-                logging.info("Converting 'Time' column to human-readable format.")
-                df['Time'] = pd.to_datetime(df['Time'], unit='s')  # Assuming Unix timestamps
-                df.set_index('Time', inplace=True)
+            # if 'Time' in df.columns:
+            #     logging.info("Converting 'Time' column to human-readable format.")
+            #     df['Time'] = pd.to_datetime(df['Time'], unit='s')  # Assuming Unix timestamps
+            #     df.set_index('Time', inplace=True)
+            df = convert_and_set_time_index(df)
+
             logging.info(df.head())
 
             # Step 4: Preprocess the input dataframe

diff --git a/src/utils.py b/src/utils.py
@@ -1,14 +1,17 @@
 import os
 import sys
+import streamlit as st
 
 import numpy as np 
 import pandas as pd
-#import dill
+import matplotlib.pyplot as plt
+
 import pickle
 from sklearn.metrics import r2_score
 from sklearn.model_selection import GridSearchCV
 
 from src.exception import CustomException
+from src.logger import logging
 
 def save_object(file_path, obj):
     try:
@@ -29,4 +32,45 @@ def load_object(file_path):
 
     except Exception as e:
         raise CustomException(e, sys)
-
+
+
+def plot_train_data_with_anomalies(df, anomalies_df):
+    """Draw one Streamlit chart per column of ``df`` with anomalies overlaid in red.
+
+    Args:
+        df: DataFrame whose index is the x-axis; every column gets its own figure.
+        anomalies_df: DataFrame of points to highlight; columns missing here get no overlay.
+    """
+    for col in df.columns:
+        # An explicit Figure can be handed to Streamlit and then released.
+        fig, ax = plt.subplots(figsize=(25, 8))
+        ax.plot(df.index, df[col], marker='.', linestyle='-', label=f'Training {col}')
+        if col in anomalies_df.columns:
+            # s=100 enlarges the markers; zorder=5 draws them on top of the line.
+            ax.scatter(anomalies_df.index, anomalies_df[col], color='red', marker='o',
+                       label=f'Anomalies in {col}', s=100, zorder=5)
+        ax.set_title(f'{col} Sensor Data with Anomalies', fontsize=16)
+        ax.set_xlabel('Time', fontsize=14)
+        ax.set_ylabel(col, fontsize=14)
+        ax.legend(fontsize=12)
+        ax.grid(True)
+        st.pyplot(fig)
+        # pyplot keeps every figure it creates alive until closed, so release it now.
+        plt.close(fig)
+
+def convert_and_set_time_index(df, time_column='Time'):
+    """Make ``time_column`` a datetime index, accepting epoch seconds or date strings.
+
+    Modifies ``df`` in place and returns it; a frame without the column is returned as is.
+    """
+    if time_column not in df.columns:
+        return df
+    logging.info(f"Checking if '{time_column}' column needs to be converted to human-readable format.")
+    if not pd.api.types.is_datetime64_any_dtype(df[time_column]):
+        logging.info(f"Converting '{time_column}' column to human-readable format.")
+        # unit='s' only applies to numeric epochs; date strings are parsed as written.
+        unit = 's' if pd.api.types.is_numeric_dtype(df[time_column]) else None
+        df[time_column] = pd.to_datetime(df[time_column], unit=unit)
+    df.set_index(time_column, inplace=True)
+    logging.info(f"Set '{time_column}' column as the index.")
+    return df
diff --git a/streamlit_app.py b/streamlit_app.py
@@ -1,34 +1,10 @@
 import streamlit as st
 import pandas as pd
-import matplotlib.pyplot as plt
 import numpy as np
 from src.pipeline.predict_pipeline import PredictionPipeline
 import os
 import time  # Import the time module for sleep
-
-# Function to plot training data with anomalies
-def plot_train_data_with_anomalies(df, anomalies_df):
-    # Iterate over each sensor/column in the DataFrame
-    for col in df.columns:
-        # Create a new figure for each column
-        plt.figure(figsize=(25, 8))  # Adjust the figure size as needed
-
-        # Plot training data for the current column
-        plt.plot(df.index, df[col], marker='.', linestyle='-', label=f'Training {col}')
-
-        # Plot anomaly points (if any) for the current column
-        if col in anomalies_df.columns:
-            plt.scatter(anomalies_df.index, anomalies_df[col], color='red', marker='o', label=f'Anomalies in {col}', s=100, zorder=5)  # Set the size of the dots (s=100)
-
-        # Set titles and labels
-        plt.title(f'{col} Sensor Data with Anomalies', fontsize=16)
-        plt.xlabel('Time', fontsize=14)
-        plt.ylabel(col, fontsize=14)
-        plt.legend(fontsize=12)
-        plt.grid(True)
-
-        # Display the plot in Streamlit
-        st.pyplot(plt)
+from src.utils import plot_train_data_with_anomalies, convert_and_set_time_index
 
 # Custom CSS Styling
 st.markdown("""
@@ -48,19 +24,47 @@ def plot_train_data_with_anomalies(df, anomalies_df):
         background-color: #fafafa;
         color: #003366;
     }
+    .sample-data-box {
+        background-color: #e6f7ff;
+        padding: 20px;
+        border-radius: 8px;
+        margin-bottom: 20px;
+        border: 1px solid #007acc;
+    }
     </style>
     """, unsafe_allow_html=True)
 
 # Sidebar Navigation
 st.sidebar.title("Anomaly Detection Dashboard")
 st.sidebar.subheader("Choose Your Options")
 uploaded_file = st.sidebar.file_uploader("Upload your CSV file", type=["csv"])
-use_default = st.sidebar.checkbox("Use default dataset source repo", value=True)
+use_default = st.sidebar.checkbox("Use default dataset from source repo", value=True)
 
 # Main Title
-#st.title("Anomaly Detection in IoT Sensor Data")
 st.markdown('<div class="title"><h1>Anomaly Detection in IoT Sensor Data</h1></div>', unsafe_allow_html=True)
 
+# Sample Data Box
+# NOTE: this section is not wrapped in a <div>; no opening tag is emitted, so a
+# closing </div> must not be emitted either (it would be stray markup).
+st.subheader("Sample Data Format")
+st.markdown("""
+**Default Dataset Description**: The default dataset is collected from a 25 m² room over a period of 24 hours with 2 people present. 
+
+**Expected Data Format**:
+- **Time**: Unix timestamp (seconds) or human readable format
+- **Temperature**: Degrees Celsius (°C)
+- **Humidity**: Percentage (%)
+- **Air Quality**: Index or ppm (specific air quality measurement)
+- **Light**: Lux (lx)
+- **Loudness**: Decibels (dB)
+
+Example:
+| Time                      | Temperature (°C) | Humidity (%) | Air Quality | Light (lx) | Loudness (dB) |
+| -------------             | ---------------- | ------------ | ----------- | ---------- | ------------- |
+| 2021-06-15 18:21:46       | 37.94            | 28.94        | 75          | 644        | 106           |
+| 2021-06-15 18:21:56       | 37.94            | 29.00        | 75          | 645        | 145           |
+""")
+
 # Progress Bar
 progress_bar = st.sidebar.progress(0)
 
@@ -73,34 +77,32 @@ def plot_train_data_with_anomalies(df, anomalies_df):
             st.stop()
         else:
             df = pd.read_csv(uploaded_file)
+            data_set_for_plot = df.copy()#pd.read_csv(uploaded_file)
             st.success("File uploaded successfully!")
     else:
         st.info(f"Using default dataset from source")
         df = pd.read_csv('artifacts/data.csv')
+        data_set_for_plot = pd.read_csv('artifacts/data.csv')
 
-    # Convert 'Time' column to human-readable format if it exists
-    if 'Time' in df.columns:
-        df['Time'] = pd.to_datetime(df['Time'], unit='s')
-        df.set_index('Time', inplace=True)
+    # Convert 'Time' column to human-readable format
+    df = convert_and_set_time_index(df)
 
     # Display the dataset
     st.subheader("Chosen Dataset Preview")
     st.write(df.head())
 
     # Load the raw dataset
-    raw_df = pd.read_csv('artifacts/data.csv')
-
-    if 'Time' in raw_df.columns:
-        raw_df['Time'] = pd.to_datetime(raw_df['Time'], unit='s')
-        raw_df.set_index('Time', inplace=True)
+    # raw_df = pd.read_csv('artifacts/data.csv')
+    # raw_df = convert_and_set_time_index(raw_df)
+    data_set_for_plot = convert_and_set_time_index(data_set_for_plot)
 
     # Create prediction pipeline
     predictor = PredictionPipeline()
 
     # Simulate progress
     for i in range(100):
         progress_bar.progress(i + 1)
-        time.sleep(0.05)  # Use time.sleep instead of st.sleep
+        time.sleep(0.005)
 
     # Predict anomalies
     results = predictor.predict(df)
@@ -114,7 +116,7 @@ def plot_train_data_with_anomalies(df, anomalies_df):
             st.write(results["anomalous_data"])
 
             # Plot training data with anomalies
-            plot_train_data_with_anomalies(raw_df, results["anomalous_data"])
+            plot_train_data_with_anomalies(data_set_for_plot, results["anomalous_data"])
 
             # Download button for anomalous data
             st.download_button(