Deep Learning:预测皮马印第安人5年内糖尿病发作的概率(决策树与随机森林实现)

代码实现如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

1
2
3
4
5
6
# Load the data set from the CSV file (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer="diabetes.csv")

# Bare expression: shows the DataFrame when run interactively (e.g. a notebook cell).
data

1
2
3
4
5
6
7
8
# Every column except the last is a feature; the last column is the target.
feature_columns = data.iloc[:, :-1]
target_column = data.iloc[:, -1]

# Work with the underlying NumPy arrays from here on.
x = feature_columns.values
y = target_column.values

# Print both arrays for a quick visual check.
for values in (x, y):
    print(values)

1
2
3
4
5
6
7
8
9
10
# Decision-tree model (entropy criterion, depth capped at 5), evaluated with
# 10-fold cross-validation.
model_dtc = DecisionTreeClassifier(criterion='entropy',max_depth=5)

# Score all three metrics in ONE cross-validation pass: each fold's tree is
# fitted once and accuracy/precision/recall are computed on that same fit.
# Running three separate cross_val_score calls would refit the (unseeded) tree
# for every metric, tripling the work and scoring each metric on different fits.
cv_scores_dtc = cross_validate(
    model_dtc,
    x,
    y,
    cv=10,
    scoring=('accuracy', 'precision', 'recall'),
)

# Per-fold score arrays (one entry per fold), under the names used elsewhere.
acc_score_dtc = cv_scores_dtc['test_accuracy']         # accuracy
precision_score_dtc = cv_scores_dtc['test_precision']  # precision
recall_score_dtc = cv_scores_dtc['test_recall']        # recall

1
2
# Per-fold accuracy scores of the decision tree. As a bare expression this is
# only displayed when run interactively (e.g. as the last line of a notebook cell).
acc_score_dtc

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# Visualise the structure of a fitted decision tree with Graphviz.

# Feature labels: every column name except the last one (the target column).
feature_name = data.columns[:-1].values

# Hold out 30% of the samples for testing (no random_state is passed).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Fit the tree on the training portion only.
model_dtc.fit(x_train, y_train)

# out_file=None makes export_graphviz return the DOT source as a string
# instead of writing it to a file.
dot_data = tree.export_graphviz(
    model_dtc,
    out_file=None,
    feature_names=feature_name,
    class_names=['class_0', 'class_1'],
    filled=True,
    rounded=True,
)

# NOTE(review): the rendered output uses the base name "iris" although the data
# comes from diabetes.csv -- looks like a leftover name; confirm before renaming.
graph = graphviz.Source(dot_data)
graph.render("iris")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Random-forest estimator with default settings.
rfClassifier = RandomForestClassifier()

# Candidate hyper-parameter values for the grid search over the random forest.
paramGrid = {
    "max_depth": list(range(1, 6)),           # maximum tree depth: 1..5
    "criterion": ["gini", "entropy"],         # node-split quality criterion
    "max_leaf_nodes": list(range(3, 9)),      # maximum number of leaves: 3..8
    "n_estimators": [10, 50, 100, 150, 200],  # number of trees in the forest
}

1
2
3
4
5
6
7
8
9
10
# Build the grid-search object: exhaustive search over paramGrid, scored by
# accuracy under 10-fold cross-validation, running 2 jobs in parallel.
gridSearchCV = GridSearchCV(
    estimator=rfClassifier,
    param_grid=paramGrid,
    scoring=make_scorer(accuracy_score),
    cv=10,
    n_jobs=2,
    verbose=1,
)

1
2
3
4
5
6
7
8
9
10
11
12
# Run the grid search on the full data set to find the best hyper-parameters.
grid = gridSearchCV.fit(x, y)

# Report the best cross-validated accuracy first, then the parameters behind it.
for best_value in (grid.best_score_, grid.best_params_):
    print(best_value)

# Best hyper-parameters obtained (as recorded by the author):
# 'max_depth': 3, 'max_leaf_nodes': 8, 'n_estimators': 150

1
2
3
4
5
6
# Accumulators for the forest-size sweep, filled in parallel (one entry per run):
# tree counts, error rates, and accuracies.
trees, errs, accs = [], [], []

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Split the data into training and test sets (20% held out for testing).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Forest sizes to evaluate: 1, then 10, 20, ..., 200.
t_num = [1] + list(range(10, 201, 10))

for t in t_num:
    trees.append(t)

    # Build and train a forest of t trees with fixed depth/leaf limits.
    rfClassifier = RandomForestClassifier(
        n_estimators=t,
        criterion="gini",
        max_depth=3,
        max_leaf_nodes=8,
    )
    rfClassifier.fit(x_train, y_train)

    # Accuracy of the trained forest on the held-out test set.
    y_test_predict = rfClassifier.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=y_test_predict)
    accs.append(acc)

    # Error rate is the complement of accuracy.
    err = 1 - acc
    errs.append(err)

# Collect the sweep results into a table and plot error rate against tree count.
df = pd.DataFrame({"trees": trees, "errs": errs, "accs": accs})
df.plot(x="trees", y="errs", figsize=(10, 5))

1
2
3
4
# Print the accuracy of both models side by side.
# NOTE(review): `acc` is the value left over from the last iteration of the
# forest-size sweep (a single train/test split), whereas the decision-tree
# figure is the mean over its cross-validation folds -- not directly comparable.
print("随机森林准确率:", acc)
dtc_mean_acc = acc_score_dtc.mean()
print("决策树准确率:", dtc_mean_acc)

尾巴