# -*- coding: utf-8 -*-
"""Housing Price Prediction.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1kQUcNKWXtvLTcsdQyjYWEOjzWuVJnoqu
"""

import os
from google.colab import drive

# Mount Google Drive and make it the working directory.
drive.mount('/content/drive')

path = "/content/drive/My Drive"
os.chdir(path)
os.listdir(path)

import pandas as pd

df = pd.read_csv("/content/drive/My Drive/house_data.csv")

import numpy as np
import warnings
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from matplotlib import pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
import seaborn as sns
from scipy.stats import norm  # only norm is used below, so avoid the star import

# Suppress the flood of warnings.
warnings.filterwarnings('ignore')

df.head()
df.info()
df.describe()

df.hist(bins=10, figsize=(20, 15))
plt.show()

sns.distplot(df['price'])

corrs = df.corr()
plt.figure(figsize=(16, 16))
sns.heatmap(corrs, annot=True)

# Focus on sqft_living, grade, and sqft_above.
df.corr()['price'].sort_values(ascending=False)

sns.pairplot(df[["sqft_living", "grade", "sqft_above", "price"]])

df.plot(kind='scatter', x='long', y='lat', alpha=0.4,
        c="price", cmap=plt.get_cmap("jet"), colorbar=True)

def outliers_proc(data, col_name, scale=3):
    """
    Remove outliers from one column, using a box-plot rule (scale=3) by default.
    :param data: a pandas DataFrame
    :param col_name: name of the column to clean
    :param scale: multiplier applied to the IQR
    :return: the cleaned DataFrame
    """

    def box_plot_outliers(data_ser, box_scale):
        """
        Flag outliers with the box-plot (IQR) rule.
        :param data_ser: a pandas Series
        :param box_scale: multiplier applied to the IQR
        :return: boolean masks for the low/high outliers and the two cutoffs
        """
        iqr = box_scale * (data_ser.quantile(0.75) - data_ser.quantile(0.25))
        val_low = data_ser.quantile(0.25) - iqr
        val_up = data_ser.quantile(0.75) + iqr
        rule_low = (data_ser < val_low)
        rule_up = (data_ser > val_up)
        return (rule_low, rule_up), (val_low, val_up)

    data_n = data.copy()
    data_series = data_n[col_name]
    rule, value = box_plot_outliers(data_series, box_scale=scale)
    index = np.arange(data_series.shape[0])[rule[0] | rule[1]]
    print("Delete number is: {}".format(len(index)))
    data_n = data_n.drop(index)
    data_n.reset_index(drop=True, inplace=True)
    print("Now row number is: {}".format(data_n.shape[0]))
    index_low = np.arange(data_series.shape[0])[rule[0]]
    outliers = data_series.iloc[index_low]
    print("Description of data less than the lower bound is:")
    print(pd.Series(outliers).describe())
    index_up = np.arange(data_series.shape[0])[rule[1]]
    outliers = data_series.iloc[index_up]
    print("Description of data larger than the upper bound is:")
    print(pd.Series(outliers).describe())

    # Box plots before (left) and after (right) removal.
    fig, ax = plt.subplots(1, 2, figsize=(10, 7))
    sns.boxplot(y=data[col_name], data=data, palette="Set1", ax=ax[0])
    sns.boxplot(y=data_n[col_name], data=data_n, palette="Set1", ax=ax[1])
    return data_n

df.drop(columns=['id'], inplace=True)
df.drop(columns=['date'], inplace=True)
df.drop(columns=['zipcode'], inplace=True)
for i in df.columns.values.tolist():
    # Compare the column's dtype, not the column itself.
    if str(df[i].dtype) != 'int64':
        df[i] = df[i].apply(pd.to_numeric)
        df = outliers_proc(df, i)
    else:
        df = outliers_proc(df, i)
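
# Added illustration (toy values, not from the dataset): with scale=3 the rule
# keeps everything inside [Q1 - 3*IQR, Q3 + 3*IQR], so only the extreme value
# is flagged below.
toy = pd.Series([1, 2, 3, 4, 5, 100])
toy_iqr = 3 * (toy.quantile(0.75) - toy.quantile(0.25))
low, up = toy.quantile(0.25) - toy_iqr, toy.quantile(0.75) + toy_iqr
print(low, up)                         # -5.25 and 12.25
print(toy[(toy < low) | (toy > up)])   # only 100 falls outside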

df.info()
df.describe()

df.plot(kind='scatter', x='long', y='lat', alpha=0.4,
        c="price", cmap=plt.get_cmap("jet"), colorbar=True)

# Preview the distributions of the key features after cleaning.
sns.pairplot(df[["sqft_living", "grade", "sqft_above", "price"]])

# Split into training and test sets.
X = df.iloc[:, 1:]
y = df.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Recombine features and target; the resulting DataFrames use integer column
# labels, with the target in the last column.
train = np.column_stack((X_train, y_train))
test = np.column_stack((X_test, y_test))
train = pd.DataFrame(train)
test = pd.DataFrame(test)

y.head()

test.head()

df.head()

# As the plots above show, the target is clearly right-skewed, so take the log
# to bring it closer to a normal distribution (column 15 holds the target here).
p = train[15]
sns.distplot(p, fit=norm)
print('Skewness of target:', p.skew())
print('Kurtosis of target:', p.kurtosis())

p = np.log1p(p)
print('Skewness of target:', p.skew())
print('Kurtosis of target:', p.kurtosis())
sns.distplot(p, fit=norm)
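
# Added note: a model trained on the log1p-transformed target predicts in log
# space; predictions must be mapped back with expm1, the inverse of log1p.
assert np.allclose(np.expm1(p), train[15])  # expm1 undoes the transform above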

from sklearn.model_selection import KFold, cross_val_score

# Use 10-fold cross-validation.
n_folds = 10

def rmse_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=20)
    # cross_val_score returns the negated MSE, so flip the sign before the sqrt.
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf))
    return rmse

def display_scores(scores):
    print("RMSE:", scores)
    print("Mean RMSE:", scores.mean())
    print("RMSE std:", scores.std())

from sklearn.linear_model import LinearRegression

n_folds = 10

# Linear-regression baseline.
def train_models():
    lin_reg = LinearRegression()
    lin_scores = cross_val_score(lin_reg, X, y,
                                 scoring="neg_mean_squared_error",
                                 cv=KFold(n_folds, shuffle=True, random_state=20))
    lin_rmse_scores = np.sqrt(-lin_scores)
    display_scores(lin_rmse_scores)

train_models()

from sklearn.decomposition import PCA
from math import sqrt
from sklearn.metrics import mean_squared_error
from timeit import timeit

def exec_without_pca():
    gbr = GradientBoostingRegressor(loss='huber',
                                    criterion='mse',
                                    learning_rate=0.1,
                                    n_estimators=600,
                                    max_depth=4,
                                    subsample=0.6,
                                    min_samples_split=20,
                                    min_samples_leaf=5,
                                    max_features=0.6,
                                    random_state=42,
                                    alpha=0.5)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    score = rmse_cv(gbr)
    print(score.mean(), score.std())

def exec_with_pca():
    gbr = GradientBoostingRegressor(loss='huber',
                                    criterion='mse',
                                    learning_rate=0.1,
                                    n_estimators=600,
                                    max_depth=4,
                                    subsample=0.6,
                                    min_samples_split=20,
                                    min_samples_leaf=5,
                                    max_features=0.6,
                                    random_state=42,
                                    alpha=0.5)
    # Project the features onto a single principal component (y is ignored by PCA).
    pca = PCA(n_components=1)
    pca.fit(X_train, y_train)
    X_train_reduced = pca.transform(X_train)
    X_test_reduced = pca.transform(X_test)
    gbr.fit(X_train_reduced, y_train)
    # Note: rmse_cv clones the estimator and refits it on the full X, so the
    # two methods differ mainly in the timing of the fit above.
    score = rmse_cv(gbr)
    print(score.mean(), score.std())

if __name__ == '__main__':
    print('Time of method[exec_without_pca] costs:',
          timeit('exec_without_pca()', setup='from __main__ import exec_without_pca', number=1))
    print('----' * 10)
    print('Time of method[exec_with_pca] costs:',
          timeit('exec_with_pca()', setup='from __main__ import exec_with_pca', number=1))
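
# Added note: n_components=1 keeps only the first principal component; the
# cumulative explained-variance ratio below shows how much information each
# additional component would retain and can guide the choice of n_components.
pca_probe = PCA().fit(X_train)
print(pca_probe.explained_variance_ratio_.cumsum())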

# Define the candidate models and their cross-validation setup.
from sklearn.linear_model import Lasso, LassoCV, ElasticNet, ElasticNetCV, Ridge, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.svm import SVR

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

# Lasso
lasso_alpha = [0.00005, 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0]
lasso = make_pipeline(RobustScaler(), LassoCV(alphas=lasso_alpha, random_state=2))

# ElasticNet
enet_beta = [0.1, 0.2, 0.5, 0.6, 0.8, 0.9]
enet_alpha = [0.00005, 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01]
ENet = make_pipeline(RobustScaler(), ElasticNetCV(l1_ratio=enet_beta, alphas=enet_alpha, random_state=12))

# Ridge
rid_alpha = [0.00005, 0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0]
rid = make_pipeline(RobustScaler(), RidgeCV(alphas=rid_alpha))

# Gradient Boosting
gbr_params = {'loss': 'huber',
              'criterion': 'mse',
              'learning_rate': 0.1,
              'n_estimators': 600,
              'max_depth': 4,
              'subsample': 0.6,
              'min_samples_split': 20,
              'min_samples_leaf': 5,
              'max_features': 0.6,
              'random_state': 32,
              'alpha': 0.5}
gbr = GradientBoostingRegressor(**gbr_params)
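
# GridSearchCV is imported at the top but never used in the notebook; this is a
# hedged sketch (not the notebook's actual tuning) of how the GBR settings
# above could be searched. The grid values are illustrative assumptions.
gbr_grid = GridSearchCV(GradientBoostingRegressor(random_state=32),
                        param_grid={'n_estimators': [300, 600],
                                    'max_depth': [3, 4, 5]},
                        scoring='neg_mean_squared_error',
                        cv=5)
# gbr_grid.fit(X_train, y_train); print(gbr_grid.best_params_)  # slow to run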

# Use 10-fold cross-validation on the log-transformed target.
n_folds = 10

def rmse_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=20)
    # Score against the recombined training frame; drop the last column, which
    # holds the raw target, so it is not leaked into the features.
    rmse = np.sqrt(-cross_val_score(model, train.values[:, :-1], p,
                                    scoring='neg_mean_squared_error', cv=kf))
    return rmse

models_name = ['Lasso', 'ElasticNet', 'Ridge', 'Gradient Boosting']
models = [lasso, ENet, rid, gbr]
for i, model in enumerate(models):
    score = rmse_cv(model)
    print('{} score: {}({})'.format(models_name[i], score.mean(), score.std()))
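
# StackingRegressor is imported above but never used; a minimal sketch (an
# assumption, not the notebook's own ensembling) of stacking the four base
# models under a RidgeCV meta-learner, scored with the same rmse_cv helper.
stack = StackingRegressor(estimators=[('lasso', lasso), ('enet', ENet),
                                      ('ridge', rid), ('gbr', gbr)],
                          final_estimator=RidgeCV(),
                          cv=5)
stack_score = rmse_cv(stack)
print('Stacking score: {}({})'.format(stack_score.mean(), stack_score.std()))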