streamlit_app.py
import pandas as pd
import numpy as np
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from plotnine.data import txhousing

# Work on a copy so the dataset shipped with plotnine is not modified in place
df = txhousing.copy()
# Drop rows with NaN values
df.dropna(inplace=True)
try:
    data = df
    if data is not None:
        # txhousing already provides integer 'year' and 'month' columns; its
        # 'date' column is a numeric year fraction rather than a datetime, so
        # no pd.to_datetime conversion is needed.
        selected_features = ['volume', 'median', 'listings', 'inventory', 'year', 'month']

        # Feature matrix and target for the entire dataset
        X = data[selected_features]
        y = data['sales']

        st.title("Sales Prediction App - txhousing dataset")
        # Select city
        st.subheader("Select City")
        selected_city = st.selectbox("Choose a City from the dropdown:", df['city'].unique())

        # New DataFrame filtered for the selected city
        city_data = df[df['city'] == selected_city]

        # Information about the selected city
        st.write(f"Selected City: {selected_city}")
        st.write(f"Total data points available for {selected_city}: {len(city_data)}")

        ########
        # Feature matrix and target for the selected city (same feature set as above)
        X_city = city_data[selected_features]
        y_city = city_data['sales']

        # Train-test split for the selected city
        X_train_city, X_test_city, y_train_city, y_test_city = train_test_split(
            X_city, y_city, test_size=0.2, random_state=42
        )

        # Models trained only on the selected city's data
        models_city = {
            'Linear Regression (City)': LinearRegression(),
            'Random Forest (City)': RandomForestRegressor(),
            'Gradient Boosting (City)': GradientBoostingRegressor()
        }
        ########
        ########
        # Train-test split for the entire dataset:
        # 20% of the rows are held out for testing, the remaining 80% are used for training.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Models trained on the entire dataset
        models_all = {
            'Linear Regression (All)': LinearRegression(),
            'Random Forest (All)': RandomForestRegressor(),
            'Gradient Boosting (All)': GradientBoostingRegressor()
        }
        ########
        # User input form
        st.sidebar.header("Enter User Data")
        min_values = city_data[selected_features].min()
        max_values = city_data[selected_features].max()

        # Input fields bounded by the selected city's observed min and max values
        volume = st.sidebar.number_input("Volume - total value of sales", min_value=float(min_values['volume']), max_value=float(max_values['volume']), value=float(min_values['volume']))
        median = st.sidebar.number_input("Median sale price", min_value=float(min_values['median']), max_value=float(max_values['median']), value=float(min_values['median']))
        listings = st.sidebar.number_input("Total active listings", min_value=float(min_values['listings']), max_value=float(max_values['listings']), value=float(min_values['listings']))
        # Months-in-inventory input is left unbounded (no min/max constraint)
        inventory = st.sidebar.number_input("Months in inventory")

        st.sidebar.subheader("Enter Date:")
        year = st.sidebar.slider("Year", min_value=int(min_values['year']), max_value=int(max_values['year']), value=int(min_values['year']))
        month = st.sidebar.slider("Month", min_value=1, max_value=12, value=int(min_values['month']))

        # Collect all user inputs into a single row for prediction
        user_data = np.array([[volume, median, listings, inventory, year, month]])
        # Model selection for the entire dataset
        st.subheader("Select Model - trained on the entire dataset")
        model_selector_all = st.selectbox("Select a Model (All)", list(models_all.keys()))
        selected_model_all = models_all[model_selector_all]
        selected_model_all.fit(X_train, y_train)
        prediction_all = selected_model_all.predict(user_data)
        st.write(f"Predicted sales: {prediction_all[0]}")

        # Model selection for the selected city
        st.subheader(f"Select Model - trained only on data for {selected_city}")
        model_selector_city = st.selectbox(f"Select a Model ({selected_city})", list(models_city.keys()))
        selected_model_city = models_city[model_selector_city]
        selected_model_city.fit(X_train_city, y_train_city)
        prediction_city = selected_model_city.predict(user_data)
        st.write(f"Predicted sales for {selected_city}: {prediction_city[0]}")
        ########## CSV File Upload
        st.header("Upload CSV File")
        csv_file = st.file_uploader("Upload a CSV file for predictions", type=["csv"])
        if csv_file is not None:
            df_uploaded = pd.read_csv(csv_file)
            X_uploaded = df_uploaded[selected_features]
            if 'Random Forest (City)' in models_city:
                selected_model_city = models_city['Random Forest (City)']
                selected_model_city.fit(X_train_city, y_train_city)
                predictions_uploaded = selected_model_city.predict(X_uploaded)
                df_uploaded['Predicted_Sales'] = predictions_uploaded
                st.subheader("Predictions for Uploaded CSV:")
                st.write(df_uploaded)
                # Download predictions as a CSV file
                st.download_button(
                    label="Download Predictions",
                    data=df_uploaded.to_csv(index=False),
                    file_name="predictions.csv",
                    key="download_button",
                )
            else:
                st.write("Error: Random Forest (City) model not found.")
    else:
        st.write("Error: Data not loaded.")
except Exception as e:
    st.write(f"Error: {e}")