import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
raw_data = pd.read_excel('data.xlsx', engine="openpyxl")
raw_data.info()
raw_data.describe()
raw_data.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788 entries, 0 to 787
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   棱柱体抗拉强度ft      788 non-null    float64
 1   圆柱体抗压强度fc'     788 non-null    float64
 2   立方体抗压强度fcu     788 non-null    float64
 3   棱柱体抗压强度fc      788 non-null    float64
 4   截面宽度b           788 non-null    int64
 5   截面有效高度h0       788 non-null    int64
 6   剪跨比a/d          788 non-null    float64
 7   纵筋配筋率ρ         558 non-null    float64
 8   箍筋屈服强度         230 non-null    float64
 9   配箍率             230 non-null    float64
 10  试件极限受剪承载力Vu   788 non-null    float64
dtypes: float64(9), int64(2)
memory usage: 67.8 KB
|    | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比a/d | 纵筋配筋率ρ | 箍筋屈服强度 | 配箍率 | 试件极限受剪承载力Vu |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 3.0 | 2.07 | NaN | NaN | 61.5 |
| 1 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 3.0 | 2.07 | NaN | NaN | 78.5 |
| 2 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 4.0 | 2.07 | NaN | NaN | 62.0 |
| 3 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 4.0 | 2.07 | NaN | NaN | 69.5 |
| 4 | 2.9 | 29.7 | 37.1 | 28.2 | 190 | 278 | 5.0 | 2.01 | NaN | NaN | 63.5 |
# Preprocessing
raw_data.columns = [x.lower().strip().replace(' ','_') for x in raw_data.columns]
def show_missing_features(x):
    # Tabulate the count, percentage, and dtype of missing values per column
    total = x.isnull().sum()
    percent = x.isnull().sum() / x.isnull().count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    tt['Types'] = [str(x[col].dtype) for col in x.columns]
    return np.transpose(tt)
show_missing_features(raw_data)
|    | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比a/d | 纵筋配筋率ρ | 箍筋屈服强度 | 配箍率 | 试件极限受剪承载力vu |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Total | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 230 | 558 | 558 | 0 |
| Percent | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 29.1878 | 70.8122 | 70.8122 | 0 |
| Types | float64 | float64 | float64 | float64 | int64 | int64 | float64 | float64 | float64 | float64 | float64 |
# Fill missing values: every column with missing entries is float64, so a single fillna suffices
raw_data = raw_data.fillna(0)
raw_data.head()
|    | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比a/d | 纵筋配筋率ρ | 箍筋屈服强度 | 配箍率 | 试件极限受剪承载力vu |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 3.0 | 2.07 | 0.0 | 0.0 | 61.5 |
| 1 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 3.0 | 2.07 | 0.0 | 0.0 | 78.5 |
| 2 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 4.0 | 2.07 | 0.0 | 0.0 | 62.0 |
| 3 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 4.0 | 2.07 | 0.0 | 0.0 | 69.5 |
| 4 | 2.9 | 29.7 | 37.1 | 28.2 | 190 | 278 | 5.0 | 2.01 | 0.0 | 0.0 | 63.5 |
# Train/test split
label_col = ['试件极限受剪承载力vu']
cols = [x for x in raw_data.columns if x not in label_col] # These columns are the features we can use to predict
raw_X = raw_data[cols]
raw_y = raw_data['试件极限受剪承载力vu']
X_train, X_test, y_train, y_test = train_test_split(raw_X, raw_y, test_size = 0.2, random_state=101)
X_train
|    | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比a/d | 纵筋配筋率ρ | 箍筋屈服强度 | 配箍率 |
|---|---|---|---|---|---|---|---|---|---|---|
| 174 | 3.4 | 39.0 | 51.3 | 39.0 | 300 | 925 | 2.92 | 1.01 | 0.0 | 0.00 |
| 416 | 2.9 | 28.8 | 37.8 | 28.8 | 154 | 268 | 2.53 | 0.76 | 0.0 | 0.00 |
| 744 | 3.3 | 37.8 | 47.2 | 35.6 | 153 | 500 | 0.78 | 0.00 | 333.0 | 0.89 |
| 355 | 2.7 | 25.2 | 33.1 | 25.2 | 203 | 403 | 3.78 | 2.55 | 0.0 | 0.00 |
| 68 | 3.0 | 31.1 | 38.8 | 29.5 | 152 | 1096 | 3.97 | 2.73 | 0.0 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 75 | 2.8 | 28.6 | 35.8 | 27.2 | 612 | 270 | 3.02 | 2.73 | 0.0 | 0.00 |
| 599 | 2.6 | 24.3 | 30.3 | 23.1 | 307 | 464 | 4.93 | 0.00 | 325.4 | 0.10 |
| 575 | 3.5 | 42.7 | 53.4 | 40.9 | 254 | 456 | 4.01 | 0.00 | 372.3 | 0.16 |
| 337 | 2.1 | 16.3 | 21.5 | 16.3 | 152 | 268 | 3.41 | 1.89 | 0.0 | 0.00 |
| 523 | 2.5 | 22.4 | 28.0 | 21.3 | 120 | 460 | 0.87 | 0.82 | 0.0 | 0.00 |
630 rows × 10 columns
# Feature engineering: compute the absolute correlation matrix and, with a 0.95 threshold,
# drop features whose information duplicates another feature's
corr_matrix = raw_data.corr().abs()
corr_matrix.head()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))  # keep the upper triangle only
upper.head()
threshold=0.95
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
X_train = X_train.drop(columns = to_drop)
X_test = X_test.drop(columns = to_drop)
X_train
|    | 棱柱体抗拉强度ft | 截面宽度b | 截面有效高度h0 | 剪跨比a/d | 纵筋配筋率ρ | 箍筋屈服强度 | 配箍率 |
|---|---|---|---|---|---|---|---|
| 174 | 3.4 | 300 | 925 | 2.92 | 1.01 | 0.0 | 0.00 |
| 416 | 2.9 | 154 | 268 | 2.53 | 0.76 | 0.0 | 0.00 |
| 744 | 3.3 | 153 | 500 | 0.78 | 0.00 | 333.0 | 0.89 |
| 355 | 2.7 | 203 | 403 | 3.78 | 2.55 | 0.0 | 0.00 |
| 68 | 3.0 | 152 | 1096 | 3.97 | 2.73 | 0.0 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 75 | 2.8 | 612 | 270 | 3.02 | 2.73 | 0.0 | 0.00 |
| 599 | 2.6 | 307 | 464 | 4.93 | 0.00 | 325.4 | 0.10 |
| 575 | 3.5 | 254 | 456 | 4.01 | 0.00 | 372.3 | 0.16 |
| 337 | 2.1 | 152 | 268 | 3.41 | 1.89 | 0.0 | 0.00 |
| 523 | 2.5 | 120 | 460 | 0.87 | 0.82 | 0.0 | 0.00 |
630 rows × 7 columns
# Decision tree regression
from sklearn.tree import DecisionTreeRegressor
dr = DecisionTreeRegressor()
# Linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# SVM regression
from sklearn.svm import SVR
svr = SVR()
# KNN regression
from sklearn.neighbors import KNeighborsRegressor
kr = KNeighborsRegressor()
### The following are ensemble methods
# Random forest regression
from sklearn import ensemble
rf = ensemble.RandomForestRegressor()
# AdaBoost regression
from sklearn.ensemble import AdaBoostRegressor
ar = AdaBoostRegressor()
# GBDT (gradient boosting) regression
from sklearn.ensemble import GradientBoostingRegressor
gr = GradientBoostingRegressor()
# Bagging regression
from sklearn.ensemble import BaggingRegressor
br = BaggingRegressor()
# Extra-tree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor
er = ExtraTreeRegressor()
The decision tree's main hyperparameters are max_depth, min_samples_split, and min_samples_leaf:
max_depths=[2,4,6,8,10]
min_samples_splits=[2,4,6,8,10]
min_samples_leafs=[1,2,3,4,5]
random_states=[3]
parameters=dict(max_depth=max_depths,
min_samples_split=min_samples_splits,
min_samples_leaf=min_samples_leafs,
random_state=random_states)
# 3-fold cross-validated grid search for the best hyperparameters
grid=GridSearchCV(dr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-3050.3989197979013 {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 3}
dr = DecisionTreeRegressor(max_depth=10, min_samples_leaf=1, min_samples_split=2, random_state=3)
The main SVM hyperparameters to tune are the kernel function, the penalty coefficient C, and the kernel coefficient gamma:
kernels=['rbf','linear']
Cs=[0.01,0.1,1]
gammas=[0.1,0.2,0.4,0.6]
parameters=dict(kernel=kernels,C=Cs,gamma=gammas)
grid=GridSearchCV(svr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-8962.711340550044 {'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
svr = SVR(C=1, gamma=0.1, kernel='linear')
KNN's main hyperparameters are the number of neighbors K, the neighbor weighting scheme weights, and the distance metric p (1 = Manhattan, 2 = Euclidean):
k=range(1,10)
weightss=['uniform','distance']
ps=[1,2]
parameters=dict(n_neighbors=k,weights=weightss,p=ps)
grid=GridSearchCV(kr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-5279.807461163706 {'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
kr = KNeighborsRegressor(n_neighbors=7, p=1, weights='distance')
random_states=[3]
n_estimators=[100,200,300,400,500]
parameters=dict(random_state=random_states,
n_estimators=n_estimators
)
# 3-fold cross-validated grid search for the best hyperparameters
grid=GridSearchCV(rf,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2345.780706138281 {'n_estimators': 200, 'random_state': 3}
rf = ensemble.RandomForestRegressor(n_estimators=200, random_state=3)
random_states=[3]
n_estimators=[50,100,200,300,400]
learning_rates=[0.1,0.2,0.4,0.8,1.6]
parameters=dict(random_state=random_states,
n_estimators=n_estimators,
learning_rate=learning_rates
)
# 3-fold cross-validated grid search for the best hyperparameters
grid=GridSearchCV(ar,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-4231.662325003929 {'learning_rate': 0.2, 'n_estimators': 100, 'random_state': 3}
ar = AdaBoostRegressor(learning_rate=0.2, n_estimators=100, random_state=3)
random_states=[3]
n_estimators=[50,100,200,300,400]
learning_rates=[0.1,0.2,0.4,0.8,1.6]
parameters=dict(random_state=random_states,
n_estimators=n_estimators,
learning_rate=learning_rates
)
# 3-fold cross-validated grid search for the best hyperparameters
grid=GridSearchCV(gr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2141.260501893247 {'learning_rate': 0.2, 'n_estimators': 200, 'random_state': 3}
gr = GradientBoostingRegressor(learning_rate=0.2, n_estimators=200, random_state=3)
Given a training set D of size n, Bagging draws m subsets Di of size n' from it uniformly and with replacement (i.e., bootstrap sampling) and uses them as new training sets. With decision trees as base learners, the main hyperparameter is the number of estimators; the resampling step is sketched below.
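To make the bootstrap step concrete, here is a minimal illustrative sketch of drawing one subset Di with numpy (not part of the pipeline; `n_prime`, `Di_X`, and `Di_y` are names introduced here for illustration):
# Illustrative sketch: draw one bootstrap subset Di of size n_prime from the
# training set, uniformly and with replacement, as BaggingRegressor does internally
rng = np.random.default_rng(3)
n = len(X_train)
n_prime = n  # BaggingRegressor's default subset size equals n
idx = rng.integers(0, n, size=n_prime)  # indices drawn with replacement
Di_X, Di_y = X_train.iloc[idx], y_train.iloc[idx]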
random_states=[3]
n_estimators=[50,100,200,300,400]
parameters=dict(random_state=random_states,
n_estimators=n_estimators,
)
# 3-fold cross-validated grid search for the best hyperparameters
grid=GridSearchCV(br,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2397.1728460506847 {'n_estimators': 200, 'random_state': 3}
br = BaggingRegressor(n_estimators=200,random_state=3)
Extra-trees aggregate the outputs of many decorrelated decision trees collected in a forest and are even more random than a regular random forest; with decision trees as base learners, the hyperparameters include the maximum number of features (see the note on the single-tree vs. ensemble variants below).
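Note that `sklearn.tree.ExtraTreeRegressor` used below is a single extremely randomized tree; the forest described above corresponds to the ensemble variant, which sklearn also provides. A hedged sketch of that alternative (parameter values illustrative, not tuned here):
# Alternative (not used in this pipeline): the ensemble of extra trees,
# which aggregates many decorrelated randomized trees as described above
from sklearn.ensemble import ExtraTreesRegressor
etr = ExtraTreesRegressor(n_estimators=100, random_state=3)  # illustrative parameters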
random_states=[3]
max_features=[2,5,10]
parameters=dict(random_state=random_states,
max_features=max_features
)
# 3-fold cross-validated grid search for the best hyperparameters
grid=GridSearchCV(er,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-5171.304530630513 {'max_features': 5, 'random_state': 3}
er = ExtraTreeRegressor(max_features=5,random_state=3)
# Cross-validation: 3-fold mean squared error for each tuned model
models = [dr, lr, svr, kr, rf, ar, gr, br, er]
cross_mse = []
for model in models:
    scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=3)
    cross_mse.append(-scores.mean())
cross_mse
[3050.3989197979013, 8591.61818230716, 8962.711340550044, 5279.807461163706, 2345.780706138281, 4231.662325003929, 2141.260501893247, 2397.1728460506847, 5171.304530630513]
# Visualize the cross-validation MSE of each model
model_list = ['DR', 'LR','SVR','KR','RF','AR','GR','BR','ER']
plt.rcParams['figure.figsize']=4,4
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y=cross_mse, palette = "rocket", saturation =2.0)
plt.xlabel('Regression Models', fontsize = 12 )
plt.ylabel('MSE', fontsize = 12)
plt.title('MSE of different Regression Models', fontsize = 16)
plt.xticks(fontsize = 12, horizontalalignment = 'center')
plt.yticks(fontsize = 12)
for i in ax.patches:
    width, height = i.get_width(), i.get_height()
    x, y = i.get_xy()
    ax.annotate(f'{round(height, 2)}', (x + width / 2, y + height), ha='center', fontsize=10)
plt.show()
Cross-validation shows that the GBDT (gradient boosted decision tree) model gives the best predictions; its performance on the test set is shown below.
def try_different_method(model):
    # Fit on the training split and report R2 on the held-out test split
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print('R2:', score)
    result = model.predict(X_test)
    # Plot true vs. predicted values across the test samples
    plt.figure()
    plt.plot(np.arange(len(result)), y_test, 'go-', label='true value')
    plt.plot(np.arange(len(result)), result, 'ro-', label='predict value')
    plt.title('score: %f' % score)
    plt.legend()
    plt.show()
try_different_method(gr)
R2: 0.8657070812586823
gr.fit(X_train,y_train)
import joblib
# Save the model
joblib.dump(gr, 'model.dat')  # only a file name is needed here, arguably friendlier than pickle
['model.dat']
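To reuse the saved model later, reload it with joblib (a minimal usage sketch, assuming the 'model.dat' file saved above):
# Reload the saved model and verify it reproduces the test-set R2
loaded = joblib.load('model.dat')
print(loaded.score(X_test, y_test))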