Skip to content

Commit

Permalink
changed plotting and added data description
Browse files Browse the repository at this point in the history
  • Loading branch information
ACSE-vg822 committed Sep 14, 2024
1 parent 80fdab1 commit d805076
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 49 deletions.
7 changes: 3 additions & 4 deletions src/components/data_ingestion.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import os
import sys
import numpy as np

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object, load_object, convert_and_set_time_index

import pandas as pd
from sklearn.model_selection import train_test_split
Expand Down Expand Up @@ -37,10 +39,7 @@ def initiate_data_ingestion(self):
logging.info('Read the dataset as dataframe')

# Step 2: Set 'Time' column as index
if 'Time' in df.columns:
logging.info("Converting 'Time' column to human-readable format.")
df['Time'] = pd.to_datetime(df['Time'], unit='s') # Assuming Unix timestamps
df.set_index('Time', inplace=True)
df = convert_and_set_time_index(df)

# Step 3: Normalize the data using MinMaxScaler
scaler = MinMaxScaler()
Expand Down
12 changes: 7 additions & 5 deletions src/pipeline/predict_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from tensorflow.keras.models import load_model
from src.exception import CustomException
from src.logger import logging
from src.utils import save_object, load_object # Assuming save_object and load_object are present for reusability
from src.utils import save_object, load_object, convert_and_set_time_index # Assuming save_object and load_object are present for reusability
from sklearn.preprocessing import MinMaxScaler
from dataclasses import dataclass

Expand Down Expand Up @@ -63,10 +63,12 @@ def predict(self, df, seq_length = 10):
threshold = self.load_threshold()

# Step 3: Convert the 'Time' column to datetime and set it as the index
if 'Time' in df.columns:
logging.info("Converting 'Time' column to human-readable format.")
df['Time'] = pd.to_datetime(df['Time'], unit='s') # Assuming Unix timestamps
df.set_index('Time', inplace=True)
# if 'Time' in df.columns:
# logging.info("Converting 'Time' column to human-readable format.")
# df['Time'] = pd.to_datetime(df['Time'], unit='s') # Assuming Unix timestamps
# df.set_index('Time', inplace=True)
df = convert_and_set_time_index(df)

logging.info(df.head())

# Step 4: Preprocess the input dataframe
Expand Down
48 changes: 46 additions & 2 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import os
import sys
import streamlit as st

import numpy as np
import pandas as pd
#import dill
import matplotlib.pyplot as plt

import pickle
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

from src.exception import CustomException
from src.logger import logging

def save_object(file_path, obj):
try:
Expand All @@ -29,4 +32,45 @@ def load_object(file_path):

except Exception as e:
raise CustomException(e, sys)



# Function to plot training data with anomalies
def plot_train_data_with_anomalies(df, anomalies_df):
    """Render one Streamlit chart per column of ``df`` with anomalies overlaid.

    For every column in ``df`` a separate figure is drawn: the column's values
    are plotted against ``df.index`` and, when ``anomalies_df`` has a column
    with the same name, those values are overlaid as red markers positioned at
    ``anomalies_df.index``.

    Args:
        df: DataFrame whose columns are plotted against its index.
        anomalies_df: DataFrame holding the anomalous rows; only columns that
            also exist in ``df`` are drawn.

    Returns:
        None. Each figure is sent to the page through ``st.pyplot``.
    """
    for col in df.columns:
        # Work on an explicit Figure/Axes pair rather than pyplot's implicit
        # "current figure", so the exact figure can be rendered and released.
        fig, ax = plt.subplots(figsize=(25, 8))
        try:
            # Full series for the current column.
            ax.plot(df.index, df[col], marker='.', linestyle='-', label=f'Training {col}')

            # Anomalous points for the current column, if that column was flagged.
            if col in anomalies_df.columns:
                ax.scatter(
                    anomalies_df.index,
                    anomalies_df[col],
                    color='red',
                    marker='o',
                    label=f'Anomalies in {col}',
                    s=100,      # marker size
                    zorder=5,   # draw on top of the line
                )

            ax.set_title(f'{col} Sensor Data with Anomalies', fontsize=16)
            ax.set_xlabel('Time', fontsize=14)
            ax.set_ylabel(col, fontsize=14)
            ax.legend(fontsize=12)
            ax.grid(True)

            # Display the plot in Streamlit.
            st.pyplot(fig)
        finally:
            # pyplot keeps a reference to every figure it creates until it is
            # closed; without this, one large figure per column would pile up
            # on every call.
            plt.close(fig)

def convert_and_set_time_index(df, time_column='Time'):
    """Convert ``time_column`` to datetimes and make it the index of ``df``.

    The column is interpreted as follows:

    * already a datetime dtype: used unchanged;
    * numeric, or strings that are all numeric: treated as Unix timestamps
      in seconds;
    * anything else (e.g. ``"2021-06-15 18:21:46"``): parsed as datetime
      strings.

    Args:
        df: DataFrame to process. It is modified in place.
        time_column: Name of the column holding the timestamps.

    Returns:
        The same ``df`` object. If ``time_column`` is not among its columns
        (for example because it is already the index) the frame is returned
        untouched.
    """
    if time_column not in df.columns:
        return df

    logging.info(f"Checking if '{time_column}' column needs to be converted to human-readable format.")

    # Only convert when the column is not already in a datetime format.
    if not pd.api.types.is_datetime64_any_dtype(df[time_column]):
        logging.info(f"Converting '{time_column}' column to human-readable format.")
        try:
            # Epoch seconds, possibly stored as numeric strings.
            seconds = pd.to_numeric(df[time_column])
        except (ValueError, TypeError):
            # Not numeric: the values are formatted date strings, which
            # cannot be parsed with ``unit='s'``.
            df[time_column] = pd.to_datetime(df[time_column])
        else:
            df[time_column] = pd.to_datetime(seconds, unit='s')

    # Set the time column as the index regardless of its original format.
    df.set_index(time_column, inplace=True)
    logging.info(f"Set '{time_column}' column as the index.")

    return df
78 changes: 40 additions & 38 deletions streamlit_app.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,10 @@
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from src.pipeline.predict_pipeline import PredictionPipeline
import os
import time # Import the time module for sleep

# Function to plot training data with anomalies
def plot_train_data_with_anomalies(df, anomalies_df):
    """Draw each column of ``df`` in its own figure and show it in Streamlit.

    The column's values are plotted against ``df.index``. When ``anomalies_df``
    contains a column with the same name, its values are overlaid as red
    markers at ``anomalies_df.index``.

    Args:
        df: DataFrame whose columns are plotted against its index.
        anomalies_df: DataFrame of anomalous rows; columns not present in
            ``df`` are never drawn.

    Returns:
        None. Figures are emitted through ``st.pyplot``.
    """
    for col in df.columns:
        # An explicit figure handle lets us render exactly this figure and
        # close it afterwards instead of relying on pyplot's global state.
        fig, ax = plt.subplots(figsize=(25, 8))
        try:
            # Line plot of the full series for this column.
            ax.plot(df.index, df[col], marker='.', linestyle='-', label=f'Training {col}')

            # Overlay anomalous points when this column was flagged.
            if col in anomalies_df.columns:
                ax.scatter(
                    anomalies_df.index,
                    anomalies_df[col],
                    color='red',
                    marker='o',
                    label=f'Anomalies in {col}',
                    s=100,      # marker size
                    zorder=5,   # keep markers above the line
                )

            ax.set_title(f'{col} Sensor Data with Anomalies', fontsize=16)
            ax.set_xlabel('Time', fontsize=14)
            ax.set_ylabel(col, fontsize=14)
            ax.legend(fontsize=12)
            ax.grid(True)

            # Display the plot in Streamlit.
            st.pyplot(fig)
        finally:
            # Figures created through pyplot stay registered until closed;
            # release this one so repeated calls do not accumulate figures.
            plt.close(fig)
from src.utils import plot_train_data_with_anomalies, convert_and_set_time_index

# Custom CSS Styling
st.markdown("""
Expand All @@ -48,19 +24,47 @@ def plot_train_data_with_anomalies(df, anomalies_df):
background-color: #fafafa;
color: #003366;
}
.sample-data-box {
background-color: #e6f7ff;
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
border: 1px solid #007acc;
}
</style>
""", unsafe_allow_html=True)

# Sidebar Navigation
st.sidebar.title("Anomaly Detection Dashboard")
st.sidebar.subheader("Choose Your Options")
uploaded_file = st.sidebar.file_uploader("Upload your CSV file", type=["csv"])
use_default = st.sidebar.checkbox("Use default dataset source repo", value=True)
use_default = st.sidebar.checkbox("Use default dataset from source repo", value=True)

# Main Title
#st.title("Anomaly Detection in IoT Sensor Data")
st.markdown('<div class="title"><h1>Anomaly Detection in IoT Sensor Data</h1></div>', unsafe_allow_html=True)

# Sample Data Box
#st.markdown('<div class="sample-data-box">', unsafe_allow_html=True)
st.subheader("Sample Data Format")
st.markdown("""
**Default Dataset Description**: The default dataset is collected from a 25 m² room over a period of 24 hours with 2 people present.
**Expected Data Format**:
- **Time**: Unix timestamp (seconds) or human readable format
- **Temperature**: Degrees Celsius (°C)
- **Humidity**: Percentage (%)
- **Air Quality**: Index or ppm (specific air quality measurement)
- **Light**: Lux (lx)
- **Loudness**: Decibels (dB)
Example:
| Time | Temperature (°C) | Humidity (%) | Air Quality | Light (lx) | Loudness (dB) |
| ------------- | ---------------- | ------------ | ----------- | ---------- | ------------- |
| 2021-06-15 18:21:46 | 37.94 | 28.94 | 75 | 644 | 106 |
| 2021-06-15 18:21:56 | 37.94 | 29.00 | 75 | 645 | 145 |
""")
st.markdown('</div>', unsafe_allow_html=True)

# Progress Bar
progress_bar = st.sidebar.progress(0)

Expand All @@ -73,34 +77,32 @@ def plot_train_data_with_anomalies(df, anomalies_df):
st.stop()
else:
df = pd.read_csv(uploaded_file)
data_set_for_plot = df.copy()#pd.read_csv(uploaded_file)
st.success("File uploaded successfully!")
else:
st.info(f"Using default dataset from source")
df = pd.read_csv('artifacts/data.csv')
data_set_for_plot = pd.read_csv('artifacts/data.csv')

# Convert 'Time' column to human-readable format if it exists
if 'Time' in df.columns:
df['Time'] = pd.to_datetime(df['Time'], unit='s')
df.set_index('Time', inplace=True)
# Convert 'Time' column to human-readable format
df = convert_and_set_time_index(df)

# Display the dataset
st.subheader("Chosen Dataset Preview")
st.write(df.head())

# Load the raw dataset
raw_df = pd.read_csv('artifacts/data.csv')

if 'Time' in raw_df.columns:
raw_df['Time'] = pd.to_datetime(raw_df['Time'], unit='s')
raw_df.set_index('Time', inplace=True)
# raw_df = pd.read_csv('artifacts/data.csv')
# raw_df = convert_and_set_time_index(raw_df)
data_set_for_plot = convert_and_set_time_index(data_set_for_plot)

# Create prediction pipeline
predictor = PredictionPipeline()

# Simulate progress
for i in range(100):
progress_bar.progress(i + 1)
time.sleep(0.05) # Use time.sleep instead of st.sleep
time.sleep(0.005)

# Predict anomalies
results = predictor.predict(df)
Expand All @@ -114,7 +116,7 @@ def plot_train_data_with_anomalies(df, anomalies_df):
st.write(results["anomalous_data"])

# Plot training data with anomalies
plot_train_data_with_anomalies(raw_df, results["anomalous_data"])
plot_train_data_with_anomalies(data_set_for_plot, results["anomalous_data"])

# Download button for anomalous data
st.download_button(
Expand Down

0 comments on commit d805076

Please sign in to comment.