In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

import warnings
warnings.filterwarnings('ignore')
In [2]:
raw_data = pd.read_excel('data.xlsx', engine="openpyxl")

raw_data.info()
raw_data.describe()
raw_data.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788 entries, 0 to 787
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   棱柱体抗拉强度ft    788 non-null    float64
 1   圆柱体抗压强度fc'   788 non-null    float64
 2   立方体抗压强度fcu   788 non-null    float64
 3   棱柱体抗压强度fc    788 non-null    float64
 4   截面宽度b        788 non-null    int64  
 5   截面有效高度h0     788 non-null    int64  
 6   剪跨比a/d       788 non-null    float64
 7   纵筋配筋率ρ       558 non-null    float64
 8   箍筋屈服强度       230 non-null    float64
 9   配箍率          230 non-null    float64
 10  试件极限受剪承载力Vu  788 non-null    float64
dtypes: float64(9), int64(2)
memory usage: 67.8 KB
Out[2]:
棱柱体抗拉强度ft 圆柱体抗压强度fc' 立方体抗压强度fcu 棱柱体抗压强度fc 截面宽度b 截面有效高度h0 剪跨比a/d 纵筋配筋率ρ 箍筋屈服强度 配箍率 试件极限受剪承载力Vu
0 2.8 28.4 35.5 27.0 190 270 3.0 2.07 NaN NaN 61.5
1 2.8 28.4 35.5 27.0 190 270 3.0 2.07 NaN NaN 78.5
2 2.8 28.4 35.5 27.0 190 270 4.0 2.07 NaN NaN 62.0
3 2.8 28.4 35.5 27.0 190 270 4.0 2.07 NaN NaN 69.5
4 2.9 29.7 37.1 28.2 190 278 5.0 2.01 NaN NaN 63.5
In [3]:
# Preprocessing: normalize column names to lowercase with underscores

raw_data.columns = [x.lower().strip().replace(' ','_') for x in raw_data.columns]

def show_missing_features(x):
    """Summarize missing-value count, percentage, and dtype per column."""
    total = x.isnull().sum()
    percent = x.isnull().sum() / x.isnull().count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    tt['Types'] = [str(x[col].dtype) for col in x.columns]  # use the argument, not the global raw_data
    return tt.T

show_missing_features(raw_data)
Out[3]:
棱柱体抗拉强度ft 圆柱体抗压强度fc' 立方体抗压强度fcu 棱柱体抗压强度fc 截面宽度b 截面有效高度h0 剪跨比a/d 纵筋配筋率ρ 箍筋屈服强度 配箍率 试件极限受剪承载力vu
Total 0 0 0 0 0 0 0 230 558 558 0
Percent 0 0 0 0 0 0 0 29.1878 70.8122 70.8122 0
Types float64 float64 float64 float64 int64 int64 float64 float64 float64 float64 float64
  • Missing values are filled with 0 here; for the stirrup yield strength and stirrup ratio columns this is natural, since specimens without stirrups simply have none.
In [4]:
# Fill the missing values with 0
# (all columns with gaps are float columns, so a single fillna suffices)

raw_data = raw_data.fillna(0)

raw_data.head()
Out[4]:
棱柱体抗拉强度ft 圆柱体抗压强度fc' 立方体抗压强度fcu 棱柱体抗压强度fc 截面宽度b 截面有效高度h0 剪跨比a/d 纵筋配筋率ρ 箍筋屈服强度 配箍率 试件极限受剪承载力vu
0 2.8 28.4 35.5 27.0 190 270 3.0 2.07 0.0 0.0 61.5
1 2.8 28.4 35.5 27.0 190 270 3.0 2.07 0.0 0.0 78.5
2 2.8 28.4 35.5 27.0 190 270 4.0 2.07 0.0 0.0 62.0
3 2.8 28.4 35.5 27.0 190 270 4.0 2.07 0.0 0.0 69.5
4 2.9 29.7 37.1 28.2 190 278 5.0 2.01 0.0 0.0 63.5
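A quick sanity check that nothing was left unfilled (a one-line verification, not in the original run):

raw_data.isnull().sum().sum()  # expect 0 after the fill above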

Data Splitting and Feature Engineering¶

In [5]:
# Train/test split

label_col = ['试件极限受剪承载力vu']
cols = [x for x in raw_data.columns if x not in label_col]  # these columns are the features used for prediction

raw_X = raw_data[cols]
raw_y = raw_data['试件极限受剪承载力vu']

X_train, X_test, y_train, y_test = train_test_split(raw_X, raw_y, test_size = 0.2, random_state=101)
X_train
Out[5]:
棱柱体抗拉强度ft 圆柱体抗压强度fc' 立方体抗压强度fcu 棱柱体抗压强度fc 截面宽度b 截面有效高度h0 剪跨比a/d 纵筋配筋率ρ 箍筋屈服强度 配箍率
174 3.4 39.0 51.3 39.0 300 925 2.92 1.01 0.0 0.00
416 2.9 28.8 37.8 28.8 154 268 2.53 0.76 0.0 0.00
744 3.3 37.8 47.2 35.6 153 500 0.78 0.00 333.0 0.89
355 2.7 25.2 33.1 25.2 203 403 3.78 2.55 0.0 0.00
68 3.0 31.1 38.8 29.5 152 1096 3.97 2.73 0.0 0.00
... ... ... ... ... ... ... ... ... ... ...
75 2.8 28.6 35.8 27.2 612 270 3.02 2.73 0.0 0.00
599 2.6 24.3 30.3 23.1 307 464 4.93 0.00 325.4 0.10
575 3.5 42.7 53.4 40.9 254 456 4.01 0.00 372.3 0.16
337 2.1 16.3 21.5 16.3 152 268 3.41 1.89 0.0 0.00
523 2.5 22.4 28.0 21.3 120 460 0.87 0.82 0.0 0.00

630 rows × 10 columns

In [6]:
# Feature engineering: drop features whose absolute pairwise correlation
# exceeds 0.95, i.e. remove features carrying essentially duplicate information

corr_matrix = raw_data.corr().abs()

# Keep only the upper triangle so that each pair is examined once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

threshold = 0.95
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)
X_train
Out[6]:
棱柱体抗拉强度ft 截面宽度b 截面有效高度h0 剪跨比a/d 纵筋配筋率ρ 箍筋屈服强度 配箍率
174 3.4 300 925 2.92 1.01 0.0 0.00
416 2.9 154 268 2.53 0.76 0.0 0.00
744 3.3 153 500 0.78 0.00 333.0 0.89
355 2.7 203 403 3.78 2.55 0.0 0.00
68 3.0 152 1096 3.97 2.73 0.0 0.00
... ... ... ... ... ... ... ...
75 2.8 612 270 3.02 2.73 0.0 0.00
599 2.6 307 464 4.93 0.00 325.4 0.10
575 3.5 254 456 4.01 0.00 372.3 0.16
337 2.1 152 268 3.41 1.89 0.0 0.00
523 2.5 120 460 0.87 0.82 0.0 0.00

630 rows × 7 columns
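One caveat: corr() above runs over every column of raw_data, target included, so a feature that correlates strongly with the label could in principle land in to_drop. A variant restricted to the features only, reusing the names already defined (a sketch; for this dataset it should drop the same three strength columns):

corr_feat = raw_X.corr().abs()
upper_feat = corr_feat.where(np.triu(np.ones(corr_feat.shape), k=1).astype(bool))
to_drop_feat = [c for c in upper_feat.columns if (upper_feat[c] > threshold).any()]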

Hyperparameter Tuning with Cross-Validation¶

In [7]:
# Decision tree regression
from sklearn.tree import DecisionTreeRegressor
dr = DecisionTreeRegressor()

# Linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

# SVM regression
from sklearn.svm import SVR
svr = SVR()

# KNN regression
from sklearn.neighbors import KNeighborsRegressor
kr = KNeighborsRegressor()

### The following are ensemble methods
# Random forest regression
from sklearn import ensemble
rf = ensemble.RandomForestRegressor()

# AdaBoost regression
from sklearn.ensemble import AdaBoostRegressor
ar = AdaBoostRegressor()

# GBDT regression
from sklearn.ensemble import GradientBoostingRegressor
gr = GradientBoostingRegressor()

# Bagging regression
from sklearn.ensemble import BaggingRegressor
br = BaggingRegressor()

# ExtraTree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor
er = ExtraTreeRegressor()

Decision Tree Regression¶

Its main hyperparameters:

  1. max_depth: the maximum tree depth. With many samples and many features, it is worth capping to control overfitting.
  2. min_samples_split: if a node holds fewer samples than min_samples_split, no further split of that node is attempted.
  3. min_samples_leaf: the minimum number of samples in a leaf; leaves smaller than this are pruned away.
In [8]:
max_depths=[2,4,6,8,10]
min_samples_splits=[2,4,6,8,10]
min_samples_leafs=[1,2,3,4,5]
random_states=[3]

parameters=dict(max_depth=max_depths,
                min_samples_split=min_samples_splits,
                min_samples_leaf=min_samples_leafs,
                random_state=random_states)
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(dr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-3050.3989197979013
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 3}
In [9]:
dr = DecisionTreeRegressor(max_depth=10, min_samples_leaf=1, min_samples_split=2, random_state=3)
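Since GridSearchCV refits the best configuration on the full training set by default (refit=True), the tuned model can also be taken straight from the search object instead of re-typing the parameters:

dr = grid.best_estimator_  # equivalent to the explicit re-instantiation above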

SVM Regression¶

The main hyperparameters of support vector regression are the kernel, the penalty coefficient C, and the kernel coefficient gamma:

  1. The kernel is usually rbf or linear, for non-linearly and linearly separable problems respectively.
  2. C is the penalty coefficient of the objective: a large C tolerates little error and tends to overfit.
  3. gamma is the rbf kernel coefficient: it controls how local each training point's influence is, which affects model complexity and the number of support vectors, and hence training and prediction speed.
In [10]:
kernels=['rbf','linear']
Cs=[0.01,0.1,1]
gammas=[0.1,0.2,0.4,0.6]

parameters=dict(kernel=kernels,C=Cs,gamma=gammas)
grid=GridSearchCV(svr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-8962.711340550044
{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
In [11]:
svr = SVR(C=1, gamma=0.1, kernel='linear')
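SVR's comparatively poor CV score is likely tied to feature scaling: SVR is scale-sensitive, and the features here span very different ranges (section width in mm versus reinforcement ratios in %). A sketch of how standardization could be folded in via a pipeline (not part of the original tuning, so the scores would need re-checking):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The pipeline standardizes features before every fit/predict and can be
# passed to GridSearchCV or cross_val_score like any plain estimator.
svr_scaled = make_pipeline(StandardScaler(), SVR(C=1, gamma=0.1, kernel='linear'))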

K-Nearest Neighbors Regression¶

Its main hyperparameters are the number of neighbors K, the neighbor weighting scheme weights, and the distance metric:

  1. n_neighbors: the K value, 5 by default.
  2. weights: either uniform weighting or weighting by inverse distance.
  3. p: the distance metric; p=1 is Manhattan distance, p=2 is Euclidean distance.
In [12]:
k=range(1,10)
weightss=['uniform','distance']
ps=[1,2]

parameters=dict(n_neighbors=k,weights=weightss,p=ps)
grid=GridSearchCV(kr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-5279.807461163706
{'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
In [13]:
kr = KNeighborsRegressor(n_neighbors=7, p=1, weights='distance')

Random Forest Regression¶

A random forest inherits the decision tree's hyperparameters and adds ensemble-level ones such as the number of trees:

  1. More trees generally give better results, but training and prediction get slower.
  2. The per-tree hyperparameters have a smaller effect and can be left at their defaults.
In [14]:
random_states=[3]
n_estimators=[100,200,300,400,500]

parameters=dict(random_state=random_states,
                n_estimators=n_estimators
               )
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(rf,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2345.780706138281
{'n_estimators': 200, 'random_state': 3}
In [15]:
rf = ensemble.RandomForestRegressor(n_estimators=200, random_state=3)

AdaBoost Regression¶

With decision trees as base learners, AdaBoost inherits the tree hyperparameters and adds the number of estimators and the learning rate:

  1. More trees generally give better results, but training and prediction get slower.
  2. The learning rate should be tuned jointly with the number of trees.
In [16]:
random_states=[3]
n_estimators=[50,100,200,300,400]
learning_rates=[0.1,0.2,0.4,0.8,1.6]

parameters=dict(random_state=random_states,
                n_estimators=n_estimators,
                learning_rate=learning_rates
               )
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(ar,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-4231.662325003929
{'learning_rate': 0.2, 'n_estimators': 100, 'random_state': 3}
In [17]:
ar = AdaBoostRegressor(learning_rate=0.2, n_estimators=100, random_state=3) 

GBDT Regression¶

Gradient-boosted decision trees are similar to AdaBoost: decision trees serve as base learners, so the tree hyperparameters apply, plus the number of trees and the learning rate:

  1. More trees generally give better results, but training and prediction get slower.
  2. The learning rate should be tuned jointly with the number of trees.
In [18]:
random_states=[3]
n_estimators=[50,100,200,300,400]
learning_rates=[0.1,0.2,0.4,0.8,1.6]

parameters=dict(random_state=random_states,
                n_estimators=n_estimators,
                learning_rate=learning_rates
               )
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(gr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2141.260501893247
{'learning_rate': 0.2, 'n_estimators': 200, 'random_state': 3}
In [19]:
gr = GradientBoostingRegressor(learning_rate=0.2, n_estimators=200, random_state=3)

Bagging Regression¶

Given a training set D of size n, bagging draws m subsets Di of size n' from it uniformly with replacement (i.e. bootstrap sampling) and uses each as a new training set. With decision trees as the base learner, the main hyperparameter is the number of estimators (a bootstrap sketch follows this list).

  1. More trees generally give better results, but training and prediction get slower.
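A minimal sketch of one bootstrap draw, just to make "uniformly with replacement" concrete (illustrative only, not part of the pipeline):

rng = np.random.default_rng(3)
idx = rng.integers(0, len(X_train), size=len(X_train))  # n row indices drawn with replacement
X_boot, y_boot = X_train.iloc[idx], y_train.iloc[idx]   # one bootstrap replicate of the training set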
In [20]:
random_states=[3]
n_estimators=[50,100,200,300,400]

parameters=dict(random_state=random_states,
                n_estimators=n_estimators,
               )
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(br,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2397.1728460506847
{'n_estimators': 200, 'random_state': 3}
In [21]:
br = BaggingRegressor(n_estimators=200,random_state=3)

Extra-Tree (Extremely Randomized Tree) Regression¶

sklearn.tree.ExtraTreeRegressor is a single extremely randomized tree (the forest version is ExtraTreesRegressor): candidate split thresholds are drawn at random rather than optimized, which makes it more random than a tree grown inside a regular random forest. The decision-tree hyperparameters apply, notably max_features, the number of features considered at each split.

In [22]:
random_states=[3]
max_features=[2,5,7]  # X_train only has 7 features, so 7 is the largest valid value

parameters=dict(random_state=random_states,
                max_features=max_features
               )
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(er,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-5171.304530630513
{'max_features': 5, 'random_state': 3}
In [23]:
er = ExtraTreeRegressor(max_features=5,random_state=3)

Cross-Validation to Select the Best Model¶

In [24]:
# Cross-validation: compare the tuned models by 3-fold CV mean squared error

models = [dr, lr, svr, kr, rf, ar, gr, br, er]

cross_mse = []
for model in models:
    scores = cross_val_score(model, X_train, y_train,
                             scoring='neg_mean_squared_error', cv=3)
    cross_mse.append(-scores.mean())  # flip the sign back to a plain MSE

cross_mse
Out[24]:
[3050.3989197979013,
 8591.61818230716,
 8962.711340550044,
 5279.807461163706,
 2345.780706138281,
 4231.662325003929,
 2141.260501893247,
 2397.1728460506847,
 5171.304530630513]
In [25]:
# Visualize the cross-validation MSE of each model
model_list = ['DR', 'LR','SVR','KR','RF','AR','GR','BR','ER']

plt.rcParams['figure.figsize']=4,4
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y=cross_mse, palette = "rocket", saturation =2.0)
plt.xlabel('Regression Models', fontsize = 12 )
plt.ylabel('MSE', fontsize = 12)
plt.title('MSE of different Regression Models', fontsize = 16)
plt.xticks(fontsize = 12, horizontalalignment = 'center')
plt.yticks(fontsize = 12)
for i in ax.patches:
    width, height = i.get_width(), i.get_height()
    x, y = i.get_xy() 
    ax.annotate(f'{round(height,2)}', (x + width/2, y + height), ha='center', fontsize = 10)
plt.show()

Model Selection¶

Cross-validation shows that GBDT (gradient-boosted decision trees) achieves the lowest MSE of the nine models; its performance on the test set is shown below.

In [26]:
def try_different_method(model):
    """Fit on the training split, report R² on the test split, and plot predictions."""
    model.fit(X_train,y_train)
    score = model.score(X_test, y_test)  # R² on the held-out test set
    print('R2:',score)
    result = model.predict(X_test)
    plt.figure()
    plt.plot(np.arange(len(result)), y_test,'go-',label='true value')
    plt.plot(np.arange(len(result)),result,'ro-',label='predict value')
    plt.title('score: %f'%score)
    plt.legend()
    plt.show()

try_different_method(gr)
R2: 0.8657070812586823
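For a like-for-like comparison with the cross-validation numbers above, the test-set MSE can be computed as well (a small sketch; gr was fitted inside try_different_method):

from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, gr.predict(X_test))  # directly comparable to the cross_mse values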
In [27]:
gr.fit(X_train,y_train)
import joblib

# Save the fitted model; joblib.dump only needs the object and a file name,
# which is arguably friendlier than the raw pickle interface
joblib.dump(gr,'model.dat')
Out[27]:
['model.dat']
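To reuse the saved model later, it can be loaded back with joblib (a usage sketch):

model = joblib.load('model.dat')
model.predict(X_test[:5])  # predictions from the restored model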