-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.py
147 lines (129 loc) · 6.06 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
from sklearn import datasets
import numpy as np
# --- Load the iris data set -------------------------------------------------
iris = datasets.load_iris()
# http://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
# Keep only feature columns 2 and 3 (petal length and petal width in
# scikit-learn's iris column ordering).
X = iris.data[:, [2, 3]]
y = iris.target  # class (species) label of every sample
print('Class labels:', np.unique(y))
# Output:Class labels: [0 1 2]

# --- Pick the right home of train_test_split --------------------------------
# Try the modern location first and fall back to the pre-0.18 module.  This
# replaces the former LooseVersion-vs-string comparison, which relied on
# distutils (deprecated, removed from the standard library in Python 3.12).
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30 % of the samples for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# The bare expressions below only display something in an interactive
# session / notebook; they have no effect when the file runs as a script.
X_train.shape
# Output:(105, 2)
X_test.shape
# Output:(45, 2)
X.shape
# Output:(150, 2)
y_train.shape
# Output: (105,)
y_test.shape
# Output: (45,)

# --- Standardise the features -----------------------------------------------
# General usage pattern noted by the original author:
# scaler = sklearn.preprocessing.StandardScaler().fit(train)
# scaler.transform(train);scaler.transform(test)
# fit() estimates the scaling parameters, transform() applies them.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()  # scaler object used to transform the data sets
sc.fit(X_train)  # estimate the parameters on X_train and store them on sc
# Output:StandardScaler(copy=True, with_mean=True, with_std=True)
# type(sc.fit(X_train))
# Output:sklearn.preprocessing.data.StandardScaler
sc.scale_  # original note: sc.std_ printed the same values
# Output:array([ 1.79595918, 0.77769705])
sc.mean_
# Output:array([ 3.82857143, 1.22666667])
import numpy as np  # re-import kept from the original; numpy is already imported above
# Both sets are transformed with the parameters fitted on the training set.
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# --- Sanity check of the standardisation ------------------------------------
# Undo the transform on the first five training rows: x = x_std * scale + mean.
at = X_train_std[:5] * sc.scale_ + sc.mean_
a = X_train[:5]
# Compare with a tolerance: the round trip goes through floating-point
# arithmetic, so exact element-wise equality is not guaranteed.
np.allclose(at, a)
# Output: True
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import warnings
def versiontuple(v):
    """Convert a dotted version string into a tuple of ints for comparison.

    Only the leading numeric release components are used, so pre-release,
    development and local-version suffixes do not break parsing:
    ``'1.9.0' -> (1, 9, 0)``, ``'1.20.0rc1' -> (1, 20, 0)``,
    ``'2.1.0.dev0+git1' -> (2, 1, 0)``.

    Args:
        v: version string such as ``np.__version__``.

    Returns:
        Tuple of ints, one per numeric release component.

    Raises:
        ValueError: if ``v`` does not start with a numeric component.
    """
    import re  # local import keeps this helper self-contained

    release = []
    for component in v.split("."):
        match = re.match(r"\s*([0-9]+)", component)
        if match is None:
            # e.g. "dev0+git1": the numeric release part is over.
            break
        release.append(int(match.group(1)))
        if match.end() != len(component.rstrip()):
            # e.g. "0rc1": keep the 0, ignore the suffix and what follows.
            break
    if not release:
        raise ValueError("invalid version string: %r" % (v,))
    return tuple(release)
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over two features plus the samples.

    Everything is drawn on the current pyplot figure; nothing is returned and
    ``plt.show()`` is left to the caller.

    Args:
        X: 2-D feature array; column 0 is plotted on the x-axis and column 1
            on the y-axis.
        y: class label for every row of ``X``.  At most five distinct labels
            are supported, because five markers/colours are defined.
        classifier: object whose ``predict`` accepts an ``(n, 2)`` array of
            grid points and returns one label per point.
        test_idx: optional indices into ``X`` of the samples to highlight as
            the test set; ``None`` or an empty collection skips highlighting.
        resolution: step size of the grid on which predictions are evaluated.
    """
    # Marker and colour per class, in the order returned by np.unique(y).
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Plot area: the range of each feature, padded by 1 on both sides.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    # Predict a label for every grid point (one row per point, two columns),
    # then fold the flat predictions back into the grid's shape.
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)

    # Filled contours colour each region by its predicted class.
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Scatter all samples, one marker/colour per class.
    for idx, cl in enumerate(classes):
        # `color=` instead of `c=<single RGBA tuple>`: a 4-tuple passed as `c`
        # is ambiguous (it is read as data values when a class has exactly
        # four samples) and triggers a matplotlib warning.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, color=colors[idx],
                    marker=markers[idx], label=cl)

    # Highlight the test samples.  np.size copes with lists, ranges and
    # arrays alike, whereas plain truthiness raises on multi-element arrays.
    if test_idx is not None and np.size(test_idx) > 0:
        if versiontuple(np.__version__) < versiontuple('1.9.0'):
            # The original code converts the indices to a list for NumPy
            # releases older than 1.9.0 and asks the user to upgrade.
            X_test = X[list(test_idx), :]
            warnings.warn('Please update to NumPy 1.9.0 or newer')
        else:
            X_test = X[test_idx, :]

        # Hollow circles around the test samples.  The former c='' is
        # rejected as an invalid colour by current matplotlib.
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    edgecolors='black',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

# --- Train a perceptron on the standardised training data -------------------
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html#sklearn.linear_model.Perceptron
# ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0)
ppn = Perceptron()  # linear decision function y = w.x + b
ppn.fit(X_train_std, y_train)
# Output:Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
#        n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
#        verbose=0, warm_start=False)

# The bare expressions below only display something in an interactive
# session / notebook; they have no effect when the file runs as a script.
ppn.coef_  # feature weights w of the decision function
# Output:array([[-1.48746619, -1.1229737 ],
#        [ 3.0624304 , -2.18594118],
#        [ 2.9272062 , 2.64027405]])
ppn.intercept_  # bias term b of the decision function
# Output:array([-1., 0., -2.])

# --- Predict the classes of the held-out test set ---------------------------
y_pred = ppn.predict(X_test_std)
y_pred
# Output:array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0,
#        0, 2, 0, 0, 1, 0, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0, 0])
y_test
# Output:array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0,
#        0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0, 0])
y_pred == y_test
# Output:array([ True, True, True, True, True, True, True, True, True,
#        True, False, True, True, True, True, True, True, True,
#        True, True, True, True, True, True, True, True, True,
#        True, False, True, True, True, True, True, True, True,
#        True, False, True, True, True, True, True, True, True], dtype=bool)

# --- Report the results -----------------------------------------------------
n_misclassified = (y_test != y_pred).sum()
print('Misclassified samples: %d' % n_misclassified)
# Output:Misclassified samples: 3
# Accuracy is the share of correct predictions; the original note gives
# (len(y_test)-3)/len(y_test) = 0.9333333333333333.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % accuracy)
# Output:Accuracy: 0.93

# plt.savefig('./figures/logistic_regression.png', dpi=300)
# NOTE(review): nothing in the visible script calls plot_decision_regions
# before this point, so there may be no figure to show - confirm intent.
plt.show()