Skip to content

Commit 314d1ac

Browse files
committed
Implemented XGBoost in Python and R
1 parent 58ba08f commit 314d1ac

File tree

2 files changed

+92
-0
lines changed

2 files changed

+92
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Importing the dataset
# ---------------------
# Keep columns 4-14: the model features plus the 'Exited' target.
dataset <- read.csv("../../data_files/Churn_Modelling.csv")
dataset <- dataset[4:14]

# Encoding the categorical variables as factor
# Map the string columns onto numeric codes via factor labels
# (Geography: France/Spain/Germany -> 1/2/3, Gender: Female/Male -> 1/2).
geography_factor <- factor(
  dataset$Geography,
  levels = c("France", "Spain", "Germany"),
  labels = c(1, 2, 3)
)
dataset$Geography <- as.numeric(geography_factor)

gender_factor <- factor(
  dataset$Gender,
  levels = c("Female", "Male"),
  labels = c(1, 2)
)
dataset$Gender <- as.numeric(gender_factor)

# Splitting the dataset into the Training set and Test set
# --------------------------------------------------------
# install.packages('caTools')
library(caTools)
set.seed(123)  # reproducible split
split_mask <- sample.split(dataset$Exited, SplitRatio = 0.8)
training_set <- subset(dataset, split_mask == TRUE)
test_set <- subset(dataset, split_mask == FALSE)
22+
23+
# Fitting XGBoost to the Training set
# -----------------------------------
library(xgboost)

# Column 11 of training_set is the 'Exited' target; every other column
# is a feature. xgboost() wants a numeric matrix plus a label vector.
feature_matrix <- as.matrix(training_set[-11])
target <- training_set$Exited
classifier <- xgboost(data = feature_matrix, label = target, nrounds = 10)
31+
32+
# Applying k-Fold Cross Validation
# --------------------------------
# NOTE(review): three defects fixed relative to the original:
#   1. Folds were built from 'Purchased', a column that does not exist in
#      this dataset — the target here is 'Exited'.
#   2. Each fold's model was trained on the full training_set instead of
#      the fold's own training split, so the CV accuracy was meaningless.
#   3. The confusion matrix compared predictions against column 3 instead
#      of the 'Exited' target.
library(caret)
folds <- createFolds(training_set$Exited, k = 10)
cv <- lapply(folds, function(fold_idx) {
  # Hold out the fold rows for testing, train on the remainder.
  training_fold <- training_set[-fold_idx, ]
  test_fold <- training_set[fold_idx, ]
  classifier <- xgboost(
    data = as.matrix(training_fold[-11]),
    label = training_fold$Exited,
    nrounds = 10
  )
  # Predicted probabilities -> class labels at the 0.5 threshold.
  y_pred <- predict(classifier, newdata = as.matrix(test_fold[-11]))
  y_pred <- (y_pred >= 0.5)
  cm <- table(test_fold$Exited, y_pred)
  # Accuracy = (true negatives + true positives) / all predictions.
  (cm[1, 1] + cm[2, 2]) / sum(cm)
})
accuracy <- mean(as.numeric(cv))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset.
# The first three columns (row number, customer id, surname) carry no
# predictive signal, so X keeps columns 3..12 and y is column 13 ('Exited').
dataset = pd.read_csv('../../data_files/Churn_Modelling.csv')
X = dataset.iloc[:, 3:13].to_numpy()
y = dataset.iloc[:, 13].to_numpy()
10+
11+
# ------ Part-1: Data preprocessing ----------

# Encoding categorical data.
# 'Geography' (column 1) and 'Gender' (column 2) are strings: label-encode
# both, then one-hot encode 'Geography' (three countries).
# NOTE(review): OneHotEncoder(categorical_features=...) was removed in
# scikit-learn 0.22; ColumnTransformer reproduces the old behavior
# (encoded columns placed first, remaining columns passed through after).
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])
ct = ColumnTransformer(
    [('geography', OneHotEncoder(), [1])],
    remainder='passthrough',
)
# Cast to float64 to match the dense float array the old .toarray() produced.
X = ct.fit_transform(X).astype(np.float64)
# Drop the first dummy column to avoid the dummy-variable trap.
X = X[:, 1:]
22+
23+
# Splitting the dataset into the Training set and Test set.
# Hold out 20% for evaluation; fixed random_state keeps the split stable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fitting XGBoost to the Training set (default hyperparameters).
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)
31+
32+
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying K-Fold Cross Validation.
# NOTE(review): the original ended with bare `accuracies.mean()` /
# `accuracies.std()` expressions whose results were silently discarded in
# a script; assign and report them instead.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
mean_accuracy = accuracies.mean()
std_accuracy = accuracies.std()
print('CV accuracy: {:.4f} (+/- {:.4f})'.format(mean_accuracy, std_accuracy))

0 commit comments

Comments
 (0)