Titanic Data analysis Using Streamlit and EDA

vemeruva · web-flow · commit 24937839006d · 2025-02-22T12:57:08.000+05:30
diff --git a/app.py b/app.py
@@ -0,0 +1,92 @@
+import seaborn as sns
+import streamlit as st
+import pandas as pd
+from matplotlib import pyplot as plt
+import numpy as np
+
+#load Titanic dataset
+@st.cache_data
+def load_data():
+    data = pd.read_csv(r"C:\Users\vemer\Documents\Naresh Technology\New\21st FEB EDA\titanic dataset.csv")
+    return data
+
+data = load_data()
+
+#Title and description
+st.title('Exploratoy data analysis of Titan Dataset')
+st.write(" This is an EDA on the titanic dataset")
+st.write("First few rows of dataset:")
+st.dataframe(data.head())
+
+#data cleaning section
+st.subheader('Missing values')
+missing_data=data.isnull().sum()
+st.write(missing_data)
+
+if st.checkbox('Fill missing Age with median'):
+    data['Age'].fillna(data['Age'].mean(), inplace=True)
+
+if st.checkbox('Fill missing Embarked with mode'):
+    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
+
+if st.checkbox('Drop duplicates'):
+    data.drop_duplicates(inplace=True)
+
+st.subheader('Cleaned Dataset')
+st.dataframe(data.head())
+
+# EDA Section
+st.subheader('Statistical Summary of the Data')
+st.write(data.describe())
+
+# Age Distribution
+st.subheader('Age Distribution')
+fig, ax = plt.subplots()
+sns.histplot(data['Age'], kde=True, ax=ax)
+ax.set_title('Age Distribution')
+st.pyplot(fig)
+
+# Gender Distribution
+st.subheader('Gender Distribution')
+fig, ax = plt.subplots()
+sns.countplot(x='Sex', data=data, ax=ax)
+ax.set_title('Gender Distribution')
+st.pyplot(fig)
+
+# Pclass vs Survived
+st.subheader('Pclass vs Survived')
+fig, ax = plt.subplots()
+sns.countplot(x='Pclass', hue='Survived', data=data, ax=ax)
+ax.set_title('Pclass vs Survived')
+st.pyplot(fig)
+
+'''
+# Correlation Heatmap
+st.subheader('Correlation Heatmap')
+fig, ax = plt.subplots()
+data = pd.get_dummies(data)
+sns.heatmap(data.corr(), annot=True, cmap='coolwarm', ax=ax)
+#sns.heatmap(data.corr(), annot=True, cmap='coolwarm', ax=ax)
+ax.set_title('Correlation Heatmap')
+st.pyplot(fig)
+
+'''
+
+# Feature Engineering Section
+st.subheader('Feature Engineering: Family Size')
+data['FamilySize'] = data['SibSp'] + data['Parch']
+fig, ax = plt.subplots()
+sns.histplot(data['FamilySize'], kde=True, ax=ax)
+ax.set_title('Family Size Distribution')
+st.pyplot(fig)
+
+# Conclusion Section
+st.subheader('Key Insights')
+insights = """
+- Females have a higher survival rate than males.
+- Passengers in 1st class had the highest survival rate.
+- The majority of passengers are in Pclass 3.
+- Younger passengers tended to survive more often.
+"""
+st.write(insights)
+