import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
# Load the shear-test dataset from Excel (openpyxl engine is required for .xlsx).
raw_data = pd.read_excel('datanowebbar.xlsx', engine="openpyxl")
# Quick look: dtypes/non-null counts, summary statistics, and first rows.
raw_data.info()
raw_data.describe()
raw_data.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 558 entries, 0 to 557 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 棱柱体抗拉强度ft 558 non-null float64 1 圆柱体抗压强度fc' 558 non-null float64 2 立方体抗压强度fcu 558 non-null float64 3 棱柱体抗压强度fc 558 non-null float64 4 截面宽度b 558 non-null int64 5 截面有效高度h0 558 non-null int64 6 剪跨比a/d 558 non-null float64 7 纵筋配筋率ρ 558 non-null float64 8 试件极限受剪承载力Vu 558 non-null float64 dtypes: float64(7), int64(2) memory usage: 39.4 KB
棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比a/d | 纵筋配筋率ρ | 试件极限受剪承载力Vu | |
---|---|---|---|---|---|---|---|---|---|
0 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 3.0 | 2.07 | 61.5 |
1 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 3.0 | 2.07 | 78.5 |
2 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 4.0 | 2.07 | 62.0 |
3 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 4.0 | 2.07 | 69.5 |
4 | 2.9 | 29.7 | 37.1 | 28.2 | 190 | 278 | 5.0 | 2.01 | 63.5 |
# Preprocessing: normalize column names to lower-case snake_case.
raw_data.columns = [col.strip().lower().replace(' ', '_') for col in raw_data.columns]
def show_missing_features(x):
    """Summarize missing values per column of DataFrame ``x``.

    Returns a transposed DataFrame with three rows:
      - Total:   count of NaN values in each column
      - Percent: share of NaN values in each column (0-100)
      - Types:   dtype name of each column
    """
    total = x.isnull().sum()
    percent = x.isnull().sum() / x.isnull().count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    # BUG FIX: iterate the argument's columns, not the global raw_data's —
    # the original silently depended on the module-level DataFrame and
    # would break (or misreport) for any other input.
    tt['Types'] = [str(x[col].dtype) for col in x.columns]
    return np.transpose(tt)
# Report missing-value counts/percentages and dtypes for the raw dataset.
show_missing_features(raw_data)
棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比a/d | 纵筋配筋率ρ | 试件极限受剪承载力vu | |
---|---|---|---|---|---|---|---|---|---|
Total | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Percent | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Types | float64 | float64 | float64 | float64 | int64 | int64 | float64 | float64 | float64 |
# Fill missing data with 0.
# The original first looped over float columns calling Series.fillna(...,
# inplace=True) — a pandas chained-assignment hazard — and then filled the
# whole frame anyway. A single whole-frame fillna covers every column in
# one pass with identical results.
raw_data = raw_data.fillna(0)
raw_data.head()
棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比a/d | 纵筋配筋率ρ | 试件极限受剪承载力vu | |
---|---|---|---|---|---|---|---|---|---|
0 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 3.0 | 2.07 | 61.5 |
1 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 3.0 | 2.07 | 78.5 |
2 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 4.0 | 2.07 | 62.0 |
3 | 2.8 | 28.4 | 35.5 | 27.0 | 190 | 270 | 4.0 | 2.07 | 69.5 |
4 | 2.9 | 29.7 | 37.1 | 28.2 | 190 | 278 | 5.0 | 2.01 | 63.5 |
# Train/test split: separate the label column from the feature columns,
# then hold out 20% of the rows for testing (fixed seed for reproducibility).
label_col = ['试件极限受剪承载力vu']
# Every remaining column is a feature available for prediction.
cols = [name for name in raw_data.columns if name not in label_col]
raw_X = raw_data[cols]
raw_y = raw_data[label_col[0]]
X_train, X_test, y_train, y_test = train_test_split(
    raw_X, raw_y, test_size=0.2, random_state=101)
X_train
棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比a/d | 纵筋配筋率ρ | |
---|---|---|---|---|---|---|---|---|
147 | 5.2 | 81.4 | 107.1 | 89.0 | 152 | 298 | 3.60 | 3.36 |
202 | 2.7 | 24.9 | 32.8 | 24.9 | 187 | 221 | 3.80 | 1.93 |
33 | 2.8 | 28.1 | 35.1 | 26.7 | 151 | 136 | 1.00 | 2.76 |
513 | 3.7 | 75.6 | 94.5 | 70.9 | 158 | 266 | 1.92 | 5.32 |
89 | 2.1 | 17.1 | 21.3 | 16.2 | 152 | 254 | 5.14 | 3.10 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
552 | 5.5 | 87.4 | 109.2 | 95.9 | 200 | 550 | 2.50 | 1.23 |
393 | 2.4 | 20.1 | 26.4 | 20.1 | 152 | 273 | 1.99 | 0.50 |
75 | 2.8 | 28.6 | 35.8 | 27.2 | 612 | 270 | 3.02 | 2.73 |
337 | 2.1 | 16.3 | 21.5 | 16.3 | 152 | 268 | 3.41 | 1.89 |
523 | 2.5 | 22.4 | 28.0 | 21.3 | 120 | 460 | 0.87 | 0.82 |
446 rows × 8 columns
# Feature engineering: drop features whose absolute pairwise correlation
# with an earlier feature exceeds the 0.95 threshold (redundant information).
corr_matrix = raw_data.corr().abs()
corr_matrix.head()
# Keep only the upper triangle (k=1 excludes the diagonal) so each pair is
# inspected exactly once.
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24 —
# the builtin bool is the correct dtype here.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
upper.head()
threshold = 0.95
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)
X_train
棱柱体抗拉强度ft | 截面宽度b | 截面有效高度h0 | 剪跨比a/d | 纵筋配筋率ρ | |
---|---|---|---|---|---|
147 | 5.2 | 152 | 298 | 3.60 | 3.36 |
202 | 2.7 | 187 | 221 | 3.80 | 1.93 |
33 | 2.8 | 151 | 136 | 1.00 | 2.76 |
513 | 3.7 | 158 | 266 | 1.92 | 5.32 |
89 | 2.1 | 152 | 254 | 5.14 | 3.10 |
... | ... | ... | ... | ... | ... |
552 | 5.5 | 200 | 550 | 2.50 | 1.23 |
393 | 2.4 | 152 | 273 | 1.99 | 0.50 |
75 | 2.8 | 612 | 270 | 3.02 | 2.73 |
337 | 2.1 | 152 | 268 | 3.41 | 1.89 |
523 | 2.5 | 120 | 460 | 0.87 | 0.82 |
446 rows × 5 columns
# --- Candidate regressors -------------------------------------------------
# Single learners: decision tree, linear, SVM, k-nearest neighbours.
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

dr = DecisionTreeRegressor()
lr = LinearRegression()
svr = SVR()
kr = KNeighborsRegressor()

# Ensemble learners: random forest, AdaBoost, GBDT, bagging, extra tree.
from sklearn import ensemble
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import ExtraTreeRegressor

rf = ensemble.RandomForestRegressor()
ar = AdaBoostRegressor()
gr = GradientBoostingRegressor()
br = BaggingRegressor()
er = ExtraTreeRegressor()
下面对决策树回归进行调参,其主要超参数包括最大深度、最小分裂样本数和最小叶节点样本数
# Decision-tree search space: depth, split/leaf minima, fixed seed.
max_depths = [2, 4, 6, 8, 10]
min_samples_splits = [2, 4, 6, 8, 10]
min_samples_leafs = [1, 2, 3, 4, 5]
random_states = [3]
parameters = {
    'max_depth': max_depths,
    'min_samples_split': min_samples_splits,
    'min_samples_leaf': min_samples_leafs,
    'random_state': random_states,
}
# 3-fold grid search for the best hyperparameters (negated MSE scoring).
grid = GridSearchCV(dr, parameters, cv=3, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)
-3011.2148499547748 {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 4, 'random_state': 3}
# Rebuild the decision-tree regressor with the best hyperparameters found by the grid search.
dr = DecisionTreeRegressor(max_depth=10, min_samples_leaf=1, min_samples_split=4, random_state=3)
支持向量机所需要调整的超参数主要包括核函数、惩罚系数C以及核函数系数gamma
# SVR search space: kernel type, penalty C, and kernel coefficient gamma.
kernels = ['rbf', 'linear']
Cs = [0.01, 0.1, 1]
gammas = [0.1, 0.2, 0.4, 0.6]
parameters = dict(kernel=kernels, C=Cs, gamma=gammas)
# 3-fold grid search for the best hyperparameters (negated MSE scoring).
grid = GridSearchCV(svr, parameters, cv=3, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)
# BUG FIX: the original called SVR(grid.best_params_), passing the dict as a
# single positional argument — a TypeError on current scikit-learn, where
# SVR's constructor parameters are keyword-only. Unpack the dict instead.
svr = SVR(**grid.best_params_)
-7195.764634588294 {'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
# Rebuild the SVR with the best hyperparameters found by the grid search.
svr = SVR(C=1, gamma=0.1, kernel='linear')
其主要超参数包括K值、邻居标签的权重weights以及距离算法
# KNN search space: neighbour count K, neighbour weighting, Minkowski power p.
k = range(1, 10)
weightss = ['uniform', 'distance']
ps = [1, 2]
parameters = {'n_neighbors': k, 'weights': weightss, 'p': ps}
# 3-fold grid search for the best hyperparameters (negated MSE scoring).
grid = GridSearchCV(kr, parameters, cv=3, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)
-4316.574772738964 {'n_neighbors': 2, 'p': 1, 'weights': 'distance'}
# Rebuild the KNN regressor with the best hyperparameters found by the grid search.
kr = KNeighborsRegressor(n_neighbors=2, p=1, weights='distance')
# Random-forest search space: number of trees, fixed seed.
random_states = [3]
n_estimators = [300, 400, 500, 600, 700, 800]
parameters = {
    'random_state': random_states,
    'n_estimators': n_estimators,
}
# 3-fold grid search for the best hyperparameters (negated MSE scoring).
grid = GridSearchCV(rf, parameters, cv=3, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)
-1896.8281900124919 {'n_estimators': 500, 'random_state': 3}
# Rebuild the random forest with the best hyperparameters found by the grid search.
rf = ensemble.RandomForestRegressor(n_estimators=500, random_state=3)
# AdaBoost search space: number of estimators, learning rate, fixed seed.
random_states = [3]
n_estimators = [25, 50, 100, 200, 300]
learning_rates = [0.1, 0.2, 0.4, 0.8, 1.6]
parameters = {
    'random_state': random_states,
    'n_estimators': n_estimators,
    'learning_rate': learning_rates,
}
# 3-fold grid search for the best hyperparameters (negated MSE scoring).
grid = GridSearchCV(ar, parameters, cv=3, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)
-3202.8595944912827 {'learning_rate': 0.2, 'n_estimators': 50, 'random_state': 3}
# Rebuild the AdaBoost regressor with the best hyperparameters found by the grid search.
ar = AdaBoostRegressor(learning_rate=0.2, n_estimators=50, random_state=3)
# GBDT search space: number of estimators, learning rate, fixed seed.
random_states = [3]
n_estimators = [50, 100, 200, 300, 400]
learning_rates = [0.1, 0.2, 0.4, 0.8, 1.6]
parameters = {
    'random_state': random_states,
    'n_estimators': n_estimators,
    'learning_rate': learning_rates,
}
# 3-fold grid search for the best hyperparameters (negated MSE scoring).
grid = GridSearchCV(gr, parameters, cv=3, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)
-1386.8153578902009 {'learning_rate': 0.4, 'n_estimators': 200, 'random_state': 3}
# Rebuild the GBDT regressor with the best hyperparameters found by the grid search.
gr = GradientBoostingRegressor(learning_rate=0.4, n_estimators=200, random_state=3)
给定一个大小为n的训练集D,Bagging算法从中均匀、有放回地(即使用自助抽样法)选出m个大小为n'的子集Di,作为新的训练集。采用决策树作为基学习器,超参数包括决策树的个数。
# Bagging search space: number of base decision trees, fixed seed.
random_states = [3]
n_estimators = [50, 100, 200, 300, 400]
parameters = {
    'random_state': random_states,
    'n_estimators': n_estimators,
}
# 3-fold grid search for the best hyperparameters (negated MSE scoring).
grid = GridSearchCV(br, parameters, cv=3, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)
-1900.9513275926747 {'n_estimators': 200, 'random_state': 3}
# Rebuild the bagging regressor with the best hyperparameters found by the grid search.
br = BaggingRegressor(n_estimators=200,random_state=3)
将森林中收集的多个去相关决策树的结果聚集起来输出分类结果,比常规随机森林更具有随机性,以决策树为基学习器,包含最大特征数等超参数
# Extra-tree search space: maximum features per split, fixed seed.
random_states = [3]
max_features = [2, 5, 10]
parameters = {
    'random_state': random_states,
    'max_features': max_features,
}
# 3-fold grid search for the best hyperparameters (negated MSE scoring).
grid = GridSearchCV(er, parameters, cv=3, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_params_)
-3892.1530888254456 {'max_features': 2, 'random_state': 3}
# Rebuild the extra-tree regressor with the best hyperparameters found by the grid search.
er = ExtraTreeRegressor(max_features=2,random_state=3)
# Cross-validation: 3-fold CV mean squared error for every candidate model.
# The original was nine copy-pasted stanzas (one per model, each with a
# redundant ca_* temporary); a single loop produces the identical list in
# the same order (DR, LR, SVR, KR, RF, AR, GR, BR, ER).
cross_mse = []
for model in (dr, lr, svr, kr, rf, ar, gr, br, er):
    scores = cross_val_score(model, X_train, y_train,
                             scoring='neg_mean_squared_error', cv=3)
    # scikit-learn returns negated MSE; flip the sign back to a positive MSE.
    cross_mse.append(-scores.mean())
cross_mse
[3011.2148499547748, 5571.789511806758, 7195.764634588294, 4316.574772738964, 1896.8281900124919, 3202.8595944912827, 1386.8153578902009, 1900.9513275926747, 3892.1530888254456]
# Visualize the cross-validated MSE of every model as a labelled bar chart.
model_list = ['DR', 'LR', 'SVR', 'KR', 'RF', 'AR', 'GR', 'BR', 'ER']
plt.rcParams['figure.figsize'] = 4, 4
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y=cross_mse, palette="rocket", saturation=2.0)
plt.xlabel('Regression Models', fontsize=12)
plt.ylabel('MSE', fontsize=12)
plt.title('MSE of different Regression Models', fontsize=16)
plt.xticks(fontsize=12, horizontalalignment='center')
plt.yticks(fontsize=12)
# Print each bar's rounded value centred above it.
for patch in ax.patches:
    width, height = patch.get_width(), patch.get_height()
    x, y = patch.get_xy()
    ax.annotate(f'{round(height,2)}', (x + width / 2, y + height),
                ha='center', fontsize=10)
plt.show()
交叉验证后可以发现,采用GBDT梯度提升决策树的预测效果最好,在测试集中的结果如下所示
def try_different_method(model):
    """Fit *model* on the training split, print its test-set R2 score, and
    plot predicted vs. true values across the test samples."""
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print('R2:', score)
    result = model.predict(X_test)
    plt.figure()
    xs = np.arange(len(result))
    plt.plot(xs, y_test, 'go-', label='true value')
    plt.plot(xs, result, 'ro-', label='predict value')
    plt.title('score: %f' % score)
    plt.legend()
    plt.show()
# Evaluate the best cross-validated model (GBDT) on the held-out test split.
try_different_method(gr)
R2: 0.6390886480867133
# Refit the chosen GBDT model on the training split before exporting it.
gr.fit(X_train,y_train)
import joblib
# Persist the fitted model to disk.
joblib.dump(gr,'modelnowebbar.dat') # the second argument is just the target file name (simpler than raw pickle)
['modelnowebbar.dat']