# classification.py

# -*- coding: utf-8 -*-
"""Classification.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/github/aymanmostafa11/Taxi-Ride-Fare-Prediction/blob/main/Classification.ipynb
"""

# import gdown
# gdown.download_folder('https://drive.google.com/drive/folders/1fBs3klrABIbNxvReGZ96lLdOnLTRYEZ5?usp=sharing')
#
# !curl --remote-name https://raw.githubusercontent.com/aymanmostafa11/Taxi-Ride-Fare-Prediction/main/helpers.py

import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC

from helpers import Model , PreProcessing

import warnings
warnings.filterwarnings("ignore")

# Seed handed to the models fitted further down as their random_state.
RANDOM_STATE = 42

# Folder that holds both CSV files (absolute, machine-specific path).
dataPath = 'E:/Edu/Academic/Year 3/Second Term/ML/Project/taxi-rides-classification'

# Read both tables in one statement; the generator variable stays local to
# the generator expression and does not become a module-level name.
taxiRides, weather = (
    pd.read_csv(dataPath + csvName)
    for csvName in ('/taxi-rides-classification.csv', '/weather.csv')
)

# Alternative relative paths:
# taxiRides = pd.read_csv('taxi/taxi-rides-classification.csv')
# weather = pd.read_csv('taxi/weather.csv')

# Quick size report for each table, followed by a peek at its first rows
# (the head() result is only displayed inside a notebook).
print(f"Taxi Rides has {taxiRides.shape[0]} Rows and {taxiRides.shape[1]} Columns")
taxiRides.head()

print(f"Weather has {weather.shape[0]} Rows and {weather.shape[1]} Columns")
weather.head()

"""# Data Cleaning

## Taxi Rides
"""

# info() writes its report (dtypes, non-null counts) to stdout by itself.
taxiRides.info()

"""## Null"""

# Outside a notebook a bare expression is evaluated and thrown away, which
# left each header below with nothing under it; print the values explicitly.
print("Null Values in columns")
print(taxiRides.isnull().sum())

"""## product_id and name"""

print("Value counts of 'product_id' feature\n")
print(taxiRides['product_id'].value_counts())

print("Value counts of 'name' feature\n")
print(taxiRides['name'].value_counts())

taxiRides.head()

"""product_id and name represent the same feature so we can drop one of them

## Encoding Timestamps to date
"""

# Weather timestamps are parsed as epoch seconds and ride timestamps as epoch
# milliseconds; both are reduced to a 'YYYY-MM-DD' string so the two tables
# can be joined on the calendar day.
# The vectorized .dt.strftime replaces a per-row Python lambda and, unlike
# calling strftime on each element, does not raise on missing timestamps.
weatherDate = pd.to_datetime(weather['time_stamp'], unit='s').dt.strftime('%Y-%m-%d')
taxiRidesDate = pd.to_datetime(taxiRides['time_stamp'], unit='ms').dt.strftime('%Y-%m-%d')
weather['date'] = weatherDate
taxiRides['date'] = taxiRidesDate

weather.head()

"""## Joining Dataframes based on date"""

# The raw timestamps are dropped from both tables; the 'date' column derived
# from them is what the join below uses.
taxiRides.drop(columns=['time_stamp'], inplace=True)
weather.drop(columns=['time_stamp'], inplace=True)

# Keep a single weather record per (date, location) so the left join cannot
# multiply ride rows, then attach it to every ride whose date and `source`
# match the record's date and `location`.
mergedData = taxiRides.merge(
    weather.drop_duplicates(subset=['date', 'location']),
    how='left',
    left_on=['date', 'source'],
    right_on=['date', 'location'],
)

# Null count per column after the join (only displayed inside a notebook).
mergedData.isnull().sum()

"""## Rain Feature"""

# Share of weather rows that have no rain reading.
weather['rain'].isnull().sum() / weather['rain'].shape[0]
weather['rain'].hist()
# Render and finish this figure now; otherwise the next Series.hist() call in
# the script draws onto the same, still-current axes.
plt.show()
# Do the rows with a null rain value cluster around particular values?
# (Values are printed explicitly: a bare expression shows nothing in a script.)
print("Rows with null rain value statistics")
print(weather[weather['rain'].isnull()].describe())

print("Rows with non-null rain value statistics")
print(weather[weather['rain'].notna()].describe())

weather['rain'].value_counts()

print(f"Values of 0 in the rain feature :{(weather['rain'] == 0).sum()}")
# Rain feature nulls could indicate no rain

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is not guaranteed to reach the
# parent frame (it is a no-op under pandas Copy-on-Write).
weather['rain'] = weather['rain'].fillna(0)
# The merge above already copied the unfilled rain values into mergedData, so
# the same "null means no rain" rule has to be applied there as well.
mergedData['rain'] = mergedData['rain'].fillna(0)

"""# Data Visualization"""

# Distribution of rides per cab type.
taxiRides['cab_type'].hist()
plt.show()

# Distribution of the surge multiplier. It is shown right away, like the plot
# above, instead of staying open until the next plt.show() further down.
taxiRides['surge_multiplier'].hist()
plt.show()

"""# Preprocessing"""

mergedData.head()

# Helper object from helpers.py; the rest of the script calls it for
# encoding, dimensionality reduction and scaling.
preProcessing = PreProcessing()

# Columns removed from the merged frame before encoding.
columnsToDrop = ['id', 'date', 'product_id', 'location']

mergedData.drop(columns=columnsToDrop, inplace=True)

mergedData.head()

# Remaining null count per column (only displayed inside a notebook).
mergedData.isnull().sum()

"""## Encoding

###  <i>name</i> Feature
"""

# Encode the 'name' column and the 'RideCategory' label with the fixed
# mappings declared on PreProcessing.
# NOTE(review): the return values are discarded, so encodeManually presumably
# rewrites the passed column in place -- confirm in helpers.py.
preProcessing.encodeManually(mergedData['name'], PreProcessing.nameFeatureMap)

preProcessing.encodeManually(mergedData['RideCategory'], PreProcessing.labelFeatureMap)

"""### Other Features"""

# Every column whose dtype is still `object` at this point.
nonIntegerColumns = [col for col in mergedData.columns if mergedData[col].dtypes == object]
print(f"Non Integer Columns : {nonIntegerColumns}")

# NOTE(review): result discarded; presumably encodes these columns in place
# and keeps the fitted encoders on the helper (preProcessing.encoders is
# pickled at the end of the script) -- confirm in helpers.py.
preProcessing.encode(mergedData,nonIntegerColumns)

"""### Rain Feature Engineering"""

# Summary statistics of the merged rain column; as a bare expression the
# result is only displayed when this runs as a notebook cell.
mergedData['rain'].describe()

"""Referring to google:
<blockquote>Light rainfall is considered <b>less than 0.10 inches</b> of rain per hour. Moderate rainfall measures <b>0.10 to 0.30 inches</b> of rain per hour. Heavy rainfall is more than <b>0.30 inches</b>
 of rain per hour.</blockquote>
0 : no rain <br>
1 : light rain <br>
2 : mid rain <br>
3 : heavy rain (doesn't exist in the data)
"""

# Bucket the hourly rain amount into an ordinal code:
#   0 = no rain (also used for missing readings, whose comparisons are False)
#   1 = light    (0 < rain < 0.1)
#   2 = moderate (0.1 <= rain <= 0.3)
#   3 = heavy    (rain > 0.3)
# Writes go through .loc on the frame itself: the chained form
# mergedData['rainType'][mask] = value may assign to a temporary copy.
# Both 0.1 and 0.3 belong to "moderate"; with strict comparisons on both
# sides a reading of exactly 0.1 stayed classified as "no rain".
mergedData['rainType'] = 0

mergedData.loc[(mergedData['rain'] > 0) & (mergedData['rain'] < 0.1), 'rainType'] = 1

mergedData.loc[(mergedData['rain'] >= 0.1) & (mergedData['rain'] <= 0.3), 'rainType'] = 2

# Heavy rain is part of the documented scale, so it gets its own code
# instead of silently remaining 0.
mergedData.loc[mergedData['rain'] > 0.3, 'rainType'] = 3

mergedData['rainType'].value_counts()

"""## Clouds engineering
Assuming clouds are reported on a normalized [Okta Scale](https://polarpedia.eu/en/okta-scale/), values of 0.1 or less are treated as sunny days.
"""

# Binary flag: 1 when cloud cover is at most 0.1, otherwise 0 (rows with a
# missing clouds value compare False and therefore stay 0).
mergedData['sunnyDay'] = 0

# Assign through .loc on the frame itself; the chained form
# mergedData['sunnyDay'][mask] = 1 may write to a temporary copy.
mergedData.loc[mergedData['clouds'] <= 0.1, 'sunnyDay'] = 1

mergedData['sunnyDay'].value_counts()

"""### Correlation Before PCA"""

def plotCorrelationHeatmap(data, figureSize):
    """Draw an annotated correlation heatmap of ``data`` and show it.

    Args:
        data: DataFrame whose pairwise column correlations (``data.corr()``)
            are plotted.
        figureSize: ``(width, height)`` tuple passed to ``plt.figure``.

    Returns:
        The Axes object returned by ``sns.heatmap``.
    """
    plt.figure(figsize=figureSize)

    ax = sns.heatmap(data.corr(), annot=True)
    ax.tick_params(axis='x', rotation=60)
    ax.tick_params(axis='y', rotation=0)
    # These two lines work around a matplotlib bug in which the top and
    # bottom heatmap rows are cut in half: the y-limits are widened by half
    # a cell on each side.
    # NOTE(review): presumably only needed for the matplotlib release that
    # had the bug -- confirm against the installed version.
    bottom, top = ax.get_ylim()
    ax.set_ylim(bottom + 0.5, top - 0.5)

    plt.show()
    return ax


# Correlations of every column before the weather features are collapsed.
ax = plotCorrelationHeatmap(mergedData, (16, 10))

"""### Dimensionality Reduction"""

# Weather-related inputs handed to the reduction step; the raw 'clouds' and
# 'rain' columns are represented by their engineered 'sunnyDay' / 'rainType'
# versions and are simply dropped together with the rest.
subsetOfData = mergedData[['temp','sunnyDay','rainType','wind','pressure','humidity']]
mergedData.drop(['temp','clouds','sunnyDay','rainType','rain','wind','pressure','humidity'],axis=1,inplace=True)

# NOTE(review): the result is stored as a single column, so the helper
# presumably reduces the subset to one component -- confirm in helpers.py.
lowerDimensionWeatherData = preProcessing.reduceDimentionsOf(subsetOfData)
mergedData['weatherState'] = lowerDimensionWeatherData

"""### Correlation After PCA"""

# Same plot again, now with the single 'weatherState' column in place of the
# individual weather features.
ax = plotCorrelationHeatmap(mergedData, (10, 10))

"""# Model"""

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform

# Separate the target column from the feature matrix.
dataLabel = mergedData['RideCategory']
dataFeatures = mergedData.drop(columns=['RideCategory'])

dataFeatures.head()

# Helper from helpers.py that wraps splitting and model fitting.
model = Model()

# splitData is indexed below by 'trainFeatures' / 'testFeatures'.
splitData = model.splitData(dataFeatures, dataLabel)
# NOTE(review): the return value is discarded, so scale presumably min-max
# scales the train and test features in place -- confirm in helpers.py.
preProcessing.scale(splitData['trainFeatures'], 'minMax', splitData['testFeatures'])

"""### Linear Logistic"""

# Hyper-parameter search that was used to pick the values below (kept for reference):
# searchParameters = {'solver' : ['lbfgs', 'sag'],
#                     'penalty': ['l2', 'none'],
#                     'C' : uniform(loc=0, scale=10)}
# logisticParamSearch = RandomizedSearchCV(linear_model.LogisticRegression(), searchParameters, n_iter = 2)

# logisticParamSearch.fit(splitData['trainFeatures'], splitData['trainLabel'])
# logisticParamSearch.best_params_

# Keyword arguments for LogisticRegression.
# NOTE(review): C is presumably ignored while penalty is 'none' -- confirm
# against the installed scikit-learn version.
params = dict(
    solver='lbfgs',
    multi_class='multinomial',
    C=0.47469656914931324,
    penalty='none',
    random_state=RANDOM_STATE,
)
# The helper receives the estimator class plus its parameters, the metric
# function and the split data.
logisticRegression = model.fitClassificationModel(
    linear_model.LogisticRegression,
    params,
    evaluationFunction=metrics.f1_score,
    dataDictionary=splitData,
)

"""### Polynomial Logistic"""

# Degree handed to the helper's changeDegreeOf for both feature sets.
polyDegree = 3

polyFeaturesTrain = model.changeDegreeOf(splitData['trainFeatures'], polyDegree)
polyFeaturesValidation = model.changeDegreeOf(splitData['testFeatures'], polyDegree)

# Work on a copy so the original split keeps its untransformed features for
# the models fitted afterwards; only the two feature entries are replaced.
polyData = splitData.copy()
polyData['trainFeatures'] = polyFeaturesTrain
polyData['testFeatures'] = polyFeaturesValidation

# Same LogisticRegression settings as the linear model, applied to the
# transformed features.
params = dict(
    solver='lbfgs',
    multi_class='multinomial',
    C=0.47469656914931324,
    penalty='none',
    random_state=RANDOM_STATE,
)
polyModel = model.fitClassificationModel(
    linear_model.LogisticRegression,
    params,
    evaluationFunction=metrics.f1_score,
    dataDictionary=polyData,
)

"""### Decision Tree"""

# Hyper-parameter search kept for reference (disabled):
# searchParameters = {'criterion' : ['gini', 'entropy'],
#                     'min_samples_split': [i for i in range(1, 1001, 100)],
#                     'max_depth' : [i for i in range(10, 50, 5)]}
# treeParamSearch = RandomizedSearchCV(DecisionTreeClassifier(), searchParameters, n_iter = 10)

# treeParamSearch.fit(dataFeatures, dataLabel)
# treeParamSearch.best_params

# Keyword arguments for DecisionTreeClassifier.
params = dict(
    criterion='entropy',
    max_depth=30,
    random_state=RANDOM_STATE,
    min_samples_split=501,
)
tree = model.fitClassificationModel(
    DecisionTreeClassifier,
    params,
    evaluationFunction=metrics.f1_score,
    dataDictionary=splitData,
)

"""### Random Forest"""

# Candidate values for the (currently disabled) randomized search.
# An integer min_samples_split must be at least 2 in scikit-learn, so the
# grid starts at 2 instead of the invalid 1; the other candidates
# (101, 201, ..., 901) are unchanged.
searchParameters = {'criterion' : ['gini', 'entropy'],
                    'min_samples_split': [2] + list(range(101, 1001, 100)),
                    'max_depth' : list(range(10, 50, 5)),
                    'n_estimators' : list(range(10, 101, 10))}
# Both the forest and the sampler are seeded so that re-running the search
# draws the same candidates and builds the same trees.
forestParamSearch = RandomizedSearchCV(RandomForestClassifier(random_state=RANDOM_STATE),
                                       searchParameters,
                                       n_iter = 2,
                                       random_state=RANDOM_STATE)

# forestParamSearch.fit(dataFeatures, dataLabel)
# forestParamSearch.best_params_

# Keyword arguments for the RandomForestClassifier that is actually fitted.
params = {'criterion'  : 'entropy' ,
          'max_depth' : 30,
          'min_samples_split' : 500,
          'n_estimators' : 50,
          'random_state' : RANDOM_STATE,}
forest = model.fitClassificationModel(RandomForestClassifier,
                                     params,
                                     evaluationFunction= metrics.f1_score,
                                     dataDictionary=splitData)

"""### SVM"""

# params = {'C' : 10, 'random_state' : RANDOM_STATE}
# svm = model.fitClassificationModel(SVC,
#                                     params,
#                                     evaluationFunction= metrics.f1_score,
#                                     dataDictionary=splitScaledData)

"""### Saving Encoders and Models"""

# Fitted models keyed by the name they are stored under in the cache.
models = dict(
    LogisticRegression=logisticRegression,
    PolynomialLogistic=polyModel,
    DecisionTree=tree,
    RandomForest=forest,
    # SVM=svm
)

# One fill value per taxiRides column: the most frequent value for object
# columns, the mean for every other column.
imputers = {
    feature: (
        taxiRides[feature].mode()[0]
        if taxiRides[feature].dtype == object
        else taxiRides[feature].mean()
    )
    for feature in taxiRides.columns
}

# Everything that gets pickled: the models plus the state collected on the
# preprocessing helper and the fill values above.
classification_cache = dict(
    models=models,
    encoders=preProcessing.encoders,
    scalers=preProcessing.scalingCache,
    imputers=imputers,
    categoricalFeaturesValues=preProcessing.featuresUniqueValues,
)

from pickle import dump

# Name of the pickle file written to the current working directory.
filename = 'classification_cache'
# The context manager flushes and closes the handle even if pickling fails;
# the bare open() call used before never closed the file.
with open(filename, 'wb') as cacheFile:
    dump(classification_cache, cacheFile)