其他
万字长文总结机器学习的模型评估与调参 | 附代码下载
The following article is from SAMshare Author Samshare
import urllib
import urllib.error

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Download the wdbc.data file of the breast-cancer-wisconsin dataset (the
# file has no header row). The UCI archive is tried first; if that URL
# cannot be opened, fall back to the GitHub mirror.
try:
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases'
                     '/breast-cancer-wisconsin/wdbc.data', header=None)
except urllib.error.URLError:
    df = pd.read_csv('https://raw.githubusercontent.com/rasbt/'
                     'python-machine-learning-book/master/code/'
                     'datasets/wdbc/wdbc.data', header=None)
print('rows, columns:', df.shape)
df.head()
from sklearn.preprocessing import LabelEncoder

# Columns 2 onward are used as the feature matrix, column 1 as the target.
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
# Convert the target into a 0/1 variable.
y = le.fit_transform(y)
# Show which integer code each of the two class labels was mapped to.
le.transform(['M', 'B'])
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1)
from sklearn.decomposition import PCA  # feature dimensionality reduction
from sklearn.linear_model import LogisticRegression  # model used for prediction
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Chain standardisation, a 2-component PCA and logistic regression so that
# all three steps are fitted and applied together as a single estimator.
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', LogisticRegression(random_state=1))])
pipe_lr.fit(X_train, y_train)
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
y_pred = pipe_lr.predict(X_test)
from sklearn.model_selection import StratifiedKFold

# random_state is deliberately not passed: it has no effect unless
# shuffle=True, and current scikit-learn raises ValueError for that
# combination. The unshuffled folds are deterministic anyway.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
scores = []
for k, (train, test) in enumerate(kfold):
    # `train` / `test` are index arrays into the training set for this fold.
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))
print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
from sklearn.model_selection import cross_val_score

# Same 10-fold evaluation as a manual fold loop, done by scikit-learn's helper.
scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
曲线调参
from sklearn.model_selection import learning_curve

# Standardisation followed by L2-regularised logistic regression.
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(penalty='l2', random_state=0))])
# Score the pipeline with 10-fold CV at ten different training-set sizes.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),  # 10 evenly spaced values in [0.1, 1.0]
    cv=10,
    n_jobs=1)
# Aggregate along axis 1 to get one mean / std per training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')
# Shaded band: mean +/- one standard deviation.
plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')
plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')
plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.0])
plt.tight_layout()
plt.show()
from sklearn.model_selection import validation_curve

# Candidate values for the logistic-regression parameter C.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# 'clf__C' addresses parameter C of the pipeline step named 'clf'.
train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    param_name='clf__C',
    param_range=param_range,
    cv=10)
# Aggregate along axis 1 to get one mean / std per value of C.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)
plt.plot(param_range, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')
# Shaded band: mean +/- one standard deviation.
plt.fill_between(param_range, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='blue')
plt.plot(param_range, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')
plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')
plt.grid()
# The candidate values span several orders of magnitude, so use a log axis.
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.tight_layout()
plt.show()
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

iris = load_iris()
# Iris-specific names are used so this demo does not overwrite the
# X_train / X_test / y_train / y_test split used by the other snippets
# (which index feature columns that the iris data does not have).
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(
    iris.data, iris.target, random_state=0)
print("Size of training set: %d size of test set: %d" % (X_iris_train.shape[0], X_iris_test.shape[0]))
best_score = 0
# Initialised up front so the final print cannot hit an unbound name.
best_parameters = None
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters
        # train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_iris_train, y_iris_train)
        # evaluate the SVC on the test set
        # NOTE(review): the parameters are selected on the same test set that
        # produces the reported score, so that score is optimistic.
        score = svm.score(X_iris_test, y_iris_test)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
print("best score: ", best_score)
print("best parameters: ", best_parameters)
Size of training set: 112 size of test set: 38
best score: 0.973684210526
best parameters: {'C': 100, 'gamma': 0.001}
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

pipe_svc = Pipeline([('scl', StandardScaler()),
                     ('clf', SVC(random_state=1))])
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
# Two sub-grids: the linear kernel is tuned over C only, the RBF kernel over
# C and gamma. 'clf__' routes each parameter to the pipeline's 'clf' step.
param_grid = [{'clf__C': param_range,
               'clf__kernel': ['linear']},
              {'clf__C': param_range,
               'clf__gamma': param_range,
               'clf__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)  # -1: use all available CPU cores
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
0.978021978022
{'clf__C': 0.1, 'clf__kernel': 'linear'}
# Take the best pipeline found by the grid search and score it on the
# held-out test set.
clf = gs.best_estimator_
clf.fit(X_train, y_train)
print('Test accuracy: %.3f' % clf.score(X_test, y_test))
# Nested cross-validation: the inner GridSearchCV (2 folds) tunes the
# hyperparameters, the outer cross_val_score (5 folds) scores the tuned model.
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2)

# Note: cv=2 in the GridSearchCV above together with cv=5 below
# produces the 5 x 2 nested CV that is shown in the figure.

scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
from sklearn.tree import DecisionTreeClassifier

# Same 5 x 2 nested cross-validation, this time tuning the depth of a
# decision tree (None places no limit on the depth).
gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                  param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
                  scoring='accuracy',
                  cv=2)
scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
from sklearn.metrics import confusion_matrix

# Fit the SVC pipeline on the training data and tabulate its test-set
# predictions against the true labels (rows: true, columns: predicted).
pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)
[[71 1]
[ 2 40]]
# Draw the confusion matrix as a shaded grid with the count written in
# each cell.
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        # x is the column (predicted label), y is the row (true label).
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

plt.xlabel('predicted label')
plt.ylabel('true label')

plt.tight_layout()
plt.show()
from sklearn.metrics import precision_score, recall_score, f1_score

# Summarise the test-set predictions with precision, recall and F1.
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))
Precision: 0.976
Recall: 0.952
F1: 0.964
from sklearn.metrics import f1_score, make_scorer

# Custom scorer: F1 computed with label 0 treated as the positive class.
scorer = make_scorer(f1_score, pos_label=0)

c_gamma_range = [0.01, 0.1, 1.0, 10.0]

# The linear kernel is tuned over C only; the RBF kernel over C and gamma.
param_grid = [{'clf__C': c_gamma_range,
               'clf__kernel': ['linear']},
              {'clf__C': c_gamma_range,
               'clf__gamma': c_gamma_range,
               'clf__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
{'clf__C': 0.1, 'clf__kernel': 'linear'}
比如说:
5个样本,真实的target(目标标签)是y=c(1,1,0,0,1)
模型分类器将预测样本为1的概率p=c(0.5,0.6,0.55,0.4,0.7)
我们需要选定阈值才能把概率转化为类别,如果我们选定阈值为0.1,那么5个样本被分进1的类别。如果选定0.3,结果仍然一样。如果选了0.45作为阈值,那么只有样本4被分进0。
之后对得到的所有分类结果计算FPR(假正率)和TPR(真正率),并绘制成线,就可以得到ROC曲线了,当threshold(阈值)取值越多,ROC曲线越平滑。
from sklearn.metrics import roc_curve, auc

pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', LogisticRegression(penalty='l2',
                                               random_state=0,
                                               C=100.0))])

# Only two features are used: with every feature the predictions are too
# good for the ROC curves to be interesting to look at.
X_train2 = X_train[:, [4, 14]]

# random_state is not passed: it has no effect unless shuffle=True, and
# current scikit-learn raises ValueError for that combination.
cv = list(StratifiedKFold(n_splits=3).split(X_train2, y_train))

fig = plt.figure(figsize=(7, 5))

# All fold curves are interpolated onto this common FPR grid so that they
# can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)

for i, (train, test) in enumerate(cv):
    probas = pipe_lr.fit(X_train2[train],
                         y_train[train]).predict_proba(X_train2[test])

    # Column 1 of predict_proba is the probability of class 1.
    fpr, tpr, thresholds = roc_curve(y_train[test],
                                     probas[:, 1],
                                     pos_label=1)
    # np.interp replaces scipy.interp, which current SciPy no longer provides.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0  # force the averaged curve to start at (0, 0)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             lw=1,
             label='ROC fold %d (area = %0.2f)'
                   % (i+1, roc_auc))

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='random guessing')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0  # force the averaged curve to end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='mean ROC (area = %0.2f)' % mean_auc, lw=2)
plt.plot([0, 0, 1],
         [0, 1, 1],
         lw=2,
         linestyle=':',
         color='black',
         label='perfect performance')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.title('Receiver Operator Characteristic')
plt.legend(loc="lower right")

plt.tight_layout()
plt.show()
from sklearn.metrics import roc_auc_score, accuracy_score

# Refit on the full two-feature training set; otherwise the pipeline keeps
# whatever state it was last fitted to (e.g. a single cross-validation fold).
pipe_lr = pipe_lr.fit(X_train[:, [4, 14]], y_train)
y_labels = pipe_lr.predict(X_test[:, [4, 14]])
y_probas = pipe_lr.predict_proba(X_test[:, [4, 14]])[:, 1]
# note that we use probabilities for roc_auc
# the `[:, 1]` selects the positive class label only
print('ROC AUC: %.3f' % roc_auc_score(y_true=y_test, y_score=y_probas))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_labels))
Accuracy: 0.711