-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
92 lines (75 loc) · 2.5 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import seaborn as sns
import streamlit as st
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
#load Titanic dataset
@st.cache_data
def load_data(
    path: str = r"C:\Users\vemer\Documents\Naresh Technology\New\21st FEB EDA\titanic dataset.csv",
) -> pd.DataFrame:
    """Read the Titanic CSV file into a DataFrame.

    The function is wrapped in ``st.cache_data``, so Streamlit can reuse the
    parsed result instead of re-reading the file on every script rerun.

    Args:
        path: Location of the CSV file. Defaults to the path the script was
            originally written against, so ``load_data()`` behaves as before;
            pass another path to run the app on a different machine.

    Returns:
        The parsed CSV contents.
    """
    return pd.read_csv(path)
data = load_data()

# Title and description
# (title spelling corrected: "Exploratory", and the dataset is "Titanic")
st.title('Exploratory data analysis of Titanic Dataset')
st.write(" This is an EDA on the titanic dataset")

# Preview of the raw data before any cleaning option is applied
st.write("First few rows of dataset:")
st.dataframe(data.head())
# Data cleaning section
st.subheader('Missing values')
missing_data = data.isnull().sum()  # per-column count of null cells
st.write(missing_data)

if st.checkbox('Fill missing Age with median'):
    # Use the median so the behaviour matches the checkbox label (the code
    # previously filled with the mean). The result is assigned back to the
    # column because calling fillna(inplace=True) on a column selection is
    # chained assignment, which pandas deprecates and which does not update
    # the frame under Copy-on-Write.
    data['Age'] = data['Age'].fillna(data['Age'].median())

if st.checkbox('Fill missing Embarked with mode'):
    embarked_modes = data['Embarked'].mode()
    # mode() is empty when the column has no non-null values; skip the fill
    # in that case instead of raising on the lookup.
    if not embarked_modes.empty:
        data['Embarked'] = data['Embarked'].fillna(embarked_modes.iloc[0])

if st.checkbox('Drop duplicates'):
    data.drop_duplicates(inplace=True)

st.subheader('Cleaned Dataset')
st.dataframe(data.head())
# EDA section: descriptive statistics for `data` as it stands after the
# cleaning options above
st.subheader('Statistical Summary of the Data')
summary_stats = data.describe()
st.write(summary_stats)
# Age Distribution: histogram of the Age column with a KDE overlay
st.subheader('Age Distribution')
fig, ax = plt.subplots()
sns.histplot(data['Age'], kde=True, ax=ax)
ax.set_title('Age Distribution')
st.pyplot(fig)
# pyplot keeps every figure it creates alive until it is closed, and
# Streamlit re-executes this script on each interaction, so release the
# figure once it has been rendered.
plt.close(fig)
# Gender Distribution: number of rows per value of the Sex column
st.subheader('Gender Distribution')
fig, ax = plt.subplots()
sns.countplot(x='Sex', data=data, ax=ax)
ax.set_title('Gender Distribution')
st.pyplot(fig)
# pyplot keeps every figure it creates alive until it is closed, and
# Streamlit re-executes this script on each interaction, so release the
# figure once it has been rendered.
plt.close(fig)
# Pclass vs Survived: row counts per Pclass, split by the Survived column
st.subheader('Pclass vs Survived')
fig, ax = plt.subplots()
sns.countplot(x='Pclass', hue='Survived', data=data, ax=ax)
ax.set_title('Pclass vs Survived')
st.pyplot(fig)
# pyplot keeps every figure it creates alive until it is closed, and
# Streamlit re-executes this script on each interaction, so release the
# figure once it has been rendered.
plt.close(fig)
# Correlation Heatmap (disabled)
# NOTE: this section used to be parked inside a bare triple-quoted string.
# Streamlit's "magic" feature writes a standalone literal to the page, so the
# disabled code was shown in the app as text. `#` comments are not rendered.
# NOTE(review): the reason the heatmap was disabled is not recorded here;
# confirm that pd.get_dummies(data) / data.corr() work on this dataset before
# re-enabling.
# st.subheader('Correlation Heatmap')
# fig, ax = plt.subplots()
# data = pd.get_dummies(data)
# sns.heatmap(data.corr(), annot=True, cmap='coolwarm', ax=ax)
# ax.set_title('Correlation Heatmap')
# st.pyplot(fig)
# Feature Engineering Section
st.subheader('Feature Engineering: Family Size')
# FamilySize is the sum of the SibSp and Parch columns for each row
data['FamilySize'] = data['SibSp'] + data['Parch']
fig, ax = plt.subplots()
sns.histplot(data['FamilySize'], kde=True, ax=ax)
ax.set_title('Family Size Distribution')
st.pyplot(fig)
# pyplot keeps every figure it creates alive until it is closed, and
# Streamlit re-executes this script on each interaction, so release the
# figure once it has been rendered.
plt.close(fig)
# Conclusion Section: fixed list of takeaways shown as a bulleted list
st.subheader('Key Insights')
insight_lines = (
    "- Females have a higher survival rate than males.",
    "- Passengers in 1st class had the highest survival rate.",
    "- The majority of passengers are in Pclass 3.",
    "- Younger passengers tended to survive more often.",
)
# Leading and trailing newlines are kept so the text passed to st.write is
# exactly one bullet per line, wrapped in blank-line padding.
insights = "\n" + "\n".join(insight_lines) + "\n"
st.write(insights)