-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsolution4b.py
More file actions
119 lines (96 loc) · 3.41 KB
/
solution4b.py
File metadata and controls
119 lines (96 loc) · 3.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
from csv import writer

import six
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

import utilities
# Decide read/write mode based on python version (the csv writer below
# additionally needs binary mode on PY2 -- handled at the open() call).
read_mode, write_mode = ('r', 'w') if six.PY2 else ('rt', 'wt')

# Path to the consolidated files.  Overridable through the PROJECT_PATH
# environment variable so the script is not tied to one machine; the
# original hard-coded location is kept as the default.
path = os.environ.get(
    'PROJECT_PATH', '/Users/chrysovalantis/Documents/UCY/EPL451/Project')
os.chdir(path)

# File names
ftrain = 'train_consolidation.txt'
ftest = 'test_consolidation.txt'
flabel = 'trainLabels.csv'
fsubmission = 'submission.csv'

labels = utilities.read_labels(flabel)

# Dimensions for train set
ntrain = 10868
nfeature = 16 ** 2 + 1 + 1  # two_byte_codes (256) + no_que_marks + label

train = utilities.read_train(ntrain, nfeature, labels, ftrain)
X = train[:, :-1]  # feature columns
y = train[:, -1]   # label column (last)

# Free memory: the raw label list and full matrix are no longer needed.
del labels
del train

# Shared parameters for the tree ensembles.
random_state = 5342
n_jobs = 8
verbose = 1
n_estimators = 89
# ExtraTreesClassifier - classifier 1 (clf1 ranks features, clf2 is
# later trained on the reduced matrix)
clf1 = ExtraTreesClassifier(criterion='gini', random_state=random_state,
                            n_jobs=n_jobs, verbose=verbose,
                            n_estimators=n_estimators, max_features=None)
clf2 = ExtraTreesClassifier(criterion='gini', random_state=random_state,
                            n_jobs=n_jobs, verbose=verbose,
                            n_estimators=n_estimators, max_features=None)
# RandomForestClassifier - classifier 2 (clf3 ranks features, clf4 is
# later trained on the reduced matrix)
n_estimators = 89
clf3 = RandomForestClassifier(criterion='gini', random_state=random_state,
                              n_jobs=n_jobs, verbose=verbose,
                              n_estimators=n_estimators, max_features=None)
clf4 = RandomForestClassifier(criterion='gini', random_state=random_state,
                              n_jobs=n_jobs, verbose=verbose,
                              n_estimators=n_estimators, max_features=None)

# Start training
print('training started')
clf1.fit(X, y)
# Estimator.transform() for feature selection was deprecated in
# scikit-learn 0.17 and removed in 0.19; SelectFromModel(prefit=True)
# is the supported equivalent of the old call
# clf1.transform(X, '1.25*median').
X_new1 = SelectFromModel(clf1, threshold='1.25*median',
                         prefit=True).transform(X)
clf3.fit(X, y)
# Default threshold is 'mean', matching the old clf3.transform(X).
X_new2 = SelectFromModel(clf3, prefit=True).transform(X)
print('n_components = ', len(X_new1[0]), len(X_new2[0]))
# NOTE(review): a commented-out KFold log-loss evaluation block was
# removed here; it depended on the long-removed sklearn
# cross_validation module.  Re-implement with
# sklearn.model_selection.KFold if the evaluation is needed again.

# Train the second-stage classifiers on the reduced feature matrices.
clf2.fit(X_new1, y)
clf4.fit(X_new2, y)
print('training completed')

# We don't need training set now
del X_new1
del X_new2

# Dimensions for test set (original comment wrongly said "train set")
ntest = 10873
nfeature = 16 ** 2 + 1  # two_byte_codes (256) + no_que_marks, no label

test, Ids = utilities.read_test(ntest, nfeature, ftest)

# Apply the same feature selection that was fitted on the train set.
# Estimator.transform() was removed in scikit-learn 0.19;
# SelectFromModel(prefit=True) reproduces it from the fitted estimators
# (selection is deterministic given the fitted feature importances).
test_new1 = SelectFromModel(clf1, threshold='1.25*median',
                            prefit=True).transform(test)
test_new2 = SelectFromModel(clf3, prefit=True).transform(test)
del test

# Predict for whole test set
pred1 = clf2.predict_proba(test_new1)
pred2 = clf4.predict_proba(test_new2)

# calculate the average probabilities
final_pred = utilities.avg_proba2(pred1, pred2)
# Writing results to file: one row per test sample (Id + 9 class
# probabilities).  Per the csv module docs the file must be opened in
# binary mode on PY2 and with newline='' on PY3 -- otherwise every row
# gains a spurious blank line on Windows.
csv_mode = 'wb' if six.PY2 else write_mode
open_kwargs = {} if six.PY2 else {'newline': ''}
with open(fsubmission, csv_mode, **open_kwargs) as f:
    fw = writer(f)
    # Header preparation
    header = ['Id'] + ['Prediction' + str(i) for i in range(1, 10)]
    fw.writerow(header)
    for t, (Id, pred) in enumerate(zip(Ids, final_pred.tolist())):
        fw.writerow([Id] + pred)
        # Progress indicator every 1000 rows.
        if (t + 1) % 1000 == 0:
            print(t + 1, 'prediction written')
print('all done!')