import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import warnings
warnings.filterwarnings('ignore')
raw_data = pd.read_excel('datahaswebbar.xlsx', engine="openpyxl")
raw_data.info()
raw_data.describe()
raw_data.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   棱柱体抗拉强度ft      230 non-null    float64
 1   圆柱体抗压强度fc'     230 non-null    float64
 2   立方体抗压强度fcu     230 non-null    float64
 3   棱柱体抗压强度fc      230 non-null    float64
 4   截面宽度b           230 non-null    int64
 5   截面有效高度h0        230 non-null    int64
 6   剪跨比             230 non-null    float64
 7   箍筋屈服强度          230 non-null    float64
 8   配箍率             230 non-null    float64
 9   试件极限受剪承载力Vu     230 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 18.1 KB
|   | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比 | 箍筋屈服强度 | 配箍率 | 试件极限受剪承载力Vu |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.4 | 40.1 | 50.2 | 38.1 | 254 | 456 | 4.01 | 341.3 | 0.17 | 206.8 |
| 1 | 3.3 | 38.8 | 48.5 | 36.9 | 254 | 456 | 4.01 | 341.3 | 0.11 | 159.7 |
| 2 | 3.3 | 37.6 | 47.1 | 35.8 | 254 | 456 | 4.01 | 341.3 | 0.11 | 160.1 |
| 3 | 3.3 | 38.9 | 48.6 | 36.9 | 254 | 456 | 4.01 | 341.3 | 0.07 | 148.1 |
| 4 | 3.3 | 37.2 | 46.5 | 35.3 | 254 | 456 | 4.01 | 372.3 | 0.11 | 216.6 |
# Preprocessing: normalize column names (lowercase, strip, spaces -> underscores)
raw_data.columns = [x.lower().strip().replace(' ','_') for x in raw_data.columns]
def show_missing_features(x):
    # Report per-column missing counts, percentages, and dtypes
    total = x.isnull().sum()
    percent = x.isnull().sum() / x.isnull().count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    tt['Types'] = [str(x[col].dtype) for col in x.columns]
    return np.transpose(tt)
show_missing_features(raw_data)
|   | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比 | 箍筋屈服强度 | 配箍率 | 试件极限受剪承载力vu |
|---|---|---|---|---|---|---|---|---|---|---|
| Total | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Percent | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Types | float64 | float64 | float64 | float64 | int64 | int64 | float64 | float64 | float64 | float64 |
# Fill missing values with 0 (none are present here, as the report above shows)
raw_data = raw_data.fillna(0)
raw_data.head()
|   | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比 | 箍筋屈服强度 | 配箍率 | 试件极限受剪承载力vu |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.4 | 40.1 | 50.2 | 38.1 | 254 | 456 | 4.01 | 341.3 | 0.17 | 206.8 |
| 1 | 3.3 | 38.8 | 48.5 | 36.9 | 254 | 456 | 4.01 | 341.3 | 0.11 | 159.7 |
| 2 | 3.3 | 37.6 | 47.1 | 35.8 | 254 | 456 | 4.01 | 341.3 | 0.11 | 160.1 |
| 3 | 3.3 | 38.9 | 48.6 | 36.9 | 254 | 456 | 4.01 | 341.3 | 0.07 | 148.1 |
| 4 | 3.3 | 37.2 | 46.5 | 35.3 | 254 | 456 | 4.01 | 372.3 | 0.11 | 216.6 |
# Train/test split
label_col = ['试件极限受剪承载力vu']
cols = [x for x in raw_data.columns if x not in label_col] # These columns are the features we can use to predict
raw_X = raw_data[cols]
raw_y = raw_data['试件极限受剪承载力vu']
X_train, X_test, y_train, y_test = train_test_split(raw_X, raw_y, test_size = 0.13, random_state=101)
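As a quick sanity check (not in the original notebook), the split sizes can be confirmed; with test_size=0.13 on 230 samples, train_test_split rounds the test set up to 30 rows, leaving 200 for training:

# Verify the split sizes
print(X_train.shape, X_test.shape)   # expected: (200, 9) (30, 9)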
X_train
|     | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比 | 箍筋屈服强度 | 配箍率 |
|---|---|---|---|---|---|---|---|---|---|
| 32 | 3.8 | 48.1 | 60.2 | 46.9 | 152 | 254 | 3.60 | 279.9 | 0.41 |
| 208 | 2.3 | 20.2 | 25.2 | 19.2 | 200 | 537 | 2.80 | 210.0 | 0.49 |
| 91 | 2.6 | 24.6 | 30.8 | 23.4 | 203 | 390 | 1.95 | 331.1 | 0.37 |
| 31 | 3.5 | 43.0 | 53.7 | 41.2 | 152 | 254 | 3.36 | 269.9 | 0.21 |
| 112 | 2.6 | 24.0 | 30.0 | 22.8 | 203 | 390 | 1.17 | 331.1 | 0.61 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 63 | 2.5 | 23.2 | 29.0 | 22.1 | 150 | 315 | 2.00 | 354.4 | 0.23 |
| 70 | 2.7 | 27.0 | 33.8 | 25.7 | 151 | 265 | 4.21 | 344.8 | 0.21 |
| 81 | 2.6 | 24.6 | 30.8 | 23.4 | 140 | 464 | 1.75 | 330.3 | 0.74 |
| 11 | 3.3 | 37.6 | 47.0 | 35.7 | 254 | 456 | 4.01 | 372.3 | 0.06 |
| 95 | 3.5 | 42.1 | 52.7 | 40.3 | 203 | 390 | 1.95 | 331.1 | 0.37 |

200 rows × 9 columns
# Feature engineering: use the absolute correlation matrix with a 0.95 threshold
# to drop features that duplicate information carried by other features
corr_matrix = raw_data.corr().abs()
corr_matrix.head()
# Keep only the upper triangle so each feature pair is examined once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
upper.head()
threshold = 0.95
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
X_train = X_train.drop(columns = to_drop)
X_test = X_test.drop(columns = to_drop)
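To see which features the 0.95 threshold removed (an illustrative check, not in the original run), the list can be printed; judging from the X_train display below, the three compressive-strength columns are dropped, leaving six features:

# Columns flagged as redundant by the correlation filter
print(to_drop)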
X_train
|     | 棱柱体抗拉强度ft | 截面宽度b | 截面有效高度h0 | 剪跨比 | 箍筋屈服强度 | 配箍率 |
|---|---|---|---|---|---|---|
| 32 | 3.8 | 152 | 254 | 3.60 | 279.9 | 0.41 |
| 208 | 2.3 | 200 | 537 | 2.80 | 210.0 | 0.49 |
| 91 | 2.6 | 203 | 390 | 1.95 | 331.1 | 0.37 |
| 31 | 3.5 | 152 | 254 | 3.36 | 269.9 | 0.21 |
| 112 | 2.6 | 203 | 390 | 1.17 | 331.1 | 0.61 |
| ... | ... | ... | ... | ... | ... | ... |
| 63 | 2.5 | 150 | 315 | 2.00 | 354.4 | 0.23 |
| 70 | 2.7 | 151 | 265 | 4.21 | 344.8 | 0.21 |
| 81 | 2.6 | 140 | 464 | 1.75 | 330.3 | 0.74 |
| 11 | 3.3 | 254 | 456 | 4.01 | 372.3 | 0.06 |
| 95 | 3.5 | 203 | 390 | 1.95 | 331.1 | 0.37 |

200 rows × 6 columns
# Decision tree regression
from sklearn.tree import DecisionTreeRegressor
dr = DecisionTreeRegressor()
# Linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# SVM regression
from sklearn.svm import SVR
svr = SVR()
# KNN regression
from sklearn.neighbors import KNeighborsRegressor
kr = KNeighborsRegressor()
### The models below are ensemble methods
# Random forest regression
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
# AdaBoost regression
from sklearn.ensemble import AdaBoostRegressor
ar = AdaBoostRegressor()
# GBDT regression
from sklearn.ensemble import GradientBoostingRegressor
gr = GradientBoostingRegressor()
# Bagging regression
from sklearn.ensemble import BaggingRegressor
br = BaggingRegressor()
# Extra-tree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor
er = ExtraTreeRegressor()
The decision tree's main hyperparameters are the maximum depth, the minimum number of samples required to split a node, and the minimum number of samples per leaf.
max_depths=[2,4,6,8,10]
min_samples_splits=[2,4,6,8,10]
min_samples_leafs=[1,2,3,4,5]
random_states=[3]
parameters=dict(max_depth=max_depths,
min_samples_split=min_samples_splits,
min_samples_leaf=min_samples_leafs,
random_state=random_states)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(dr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-3202.931581968067
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 6, 'random_state': 3}
dr = DecisionTreeRegressor(max_depth=10, min_samples_leaf=1, min_samples_split=6, random_state=3)
The main hyperparameters to tune for the support vector machine are the kernel function, the penalty coefficient C, and the kernel coefficient gamma.
kernels=['rbf','linear']
Cs=[0.01,0.1,1]
gammas=[0.1,0.2,0.4,0.6]
parameters=dict(kernel=kernels,C=Cs,gamma=gammas)
grid=GridSearchCV(svr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-6273.949828273424
{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
svr = SVR(C=1, gamma=0.1, kernel='linear')
The main hyperparameters of KNN are the number of neighbors K, the neighbor weighting scheme (weights), and the distance metric (the Minkowski power parameter p).
k=range(1,10)
weightss=['uniform','distance']
ps=[1,2]
parameters=dict(n_neighbors=k,weights=weightss,p=ps)
grid=GridSearchCV(kr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-3981.6608777688257
{'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
kr = KNeighborsRegressor(n_neighbors=6, p=1, weights='distance')
The random forest's main hyperparameter tuned here is the number of trees (n_estimators).
random_states=[3]
n_estimators=[50,100,200,300,400]
parameters=dict(random_state=random_states,
n_estimators=n_estimators
)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(rf,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2325.4599213494507
{'n_estimators': 100, 'random_state': 3}
rf = RandomForestRegressor(n_estimators=100, random_state=3)
For AdaBoost, the number of estimators and the learning rate are tuned.
random_states=[3]
n_estimators=[20,50,100,200,300]
learning_rates=[0.1,0.2,0.4,0.8,1.6]
parameters=dict(random_state=random_states,
n_estimators=n_estimators,
learning_rate=learning_rates
)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(ar,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-3533.2810668705733
{'learning_rate': 0.4, 'n_estimators': 50, 'random_state': 3}
ar = AdaBoostRegressor(learning_rate=0.4, n_estimators=50, random_state=3)
For GBDT, the number of estimators and the learning rate are tuned as well.
random_states=[3]
n_estimators=[50,100,200,300,400]
learning_rates=[0.05,0.1,0.2,0.4,0.8]
parameters=dict(random_state=random_states,
n_estimators=n_estimators,
learning_rate=learning_rates
)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(gr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2514.4621345140968
{'learning_rate': 0.1, 'n_estimators': 100, 'random_state': 3}
gr = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, random_state=3)
Given a training set D of size n, Bagging selects m new training sets Di, each of size n', by sampling from D uniformly with replacement (i.e., bootstrap sampling). A decision tree serves as the base learner, and the hyperparameters include the number of trees.
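A minimal illustrative sketch of one bootstrap draw with NumPy (BaggingRegressor does this internally; the seed here is arbitrary):

# One bootstrap sample: draw n row indices uniformly with replacement
rng = np.random.default_rng(3)
boot_idx = rng.integers(0, len(X_train), size=len(X_train))
X_boot, y_boot = X_train.iloc[boot_idx], y_train.iloc[boot_idx]
# On average about 63.2% of the unique rows appear in each bootstrap sample
print(len(np.unique(boot_idx)) / len(X_train))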
random_states=[3]
n_estimators=[50,100,200,300,400]
parameters=dict(random_state=random_states,
n_estimators=n_estimators,
)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(br,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2303.512217164249
{'n_estimators': 100, 'random_state': 3}
br = BaggingRegressor(n_estimators=100,random_state=3)
Extremely randomized trees choose split thresholds more randomly than the de-correlated trees of a regular random forest; for the single-tree ExtraTreeRegressor used here, with a decision tree as the base learner, the hyperparameters include the maximum number of features considered per split.
random_states=[3]
max_features=[2,5,10]
parameters=dict(random_state=random_states,
max_features=max_features
)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(er,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-4779.930741951191
{'max_features': 5, 'random_state': 3}
er = ExtraTreeRegressor(max_features=5,random_state=3)
# Cross-validation: mean 3-fold MSE for each tuned model
models = [dr, lr, svr, kr, rf, ar, gr, br, er]
cross_mse = []
for model in models:
    scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=3)
    cross_mse.append(-scores.mean())
cross_mse
[3202.931581968067, 5085.57280973098, 6273.949828273424, 3981.6608777688257, 2325.4599213494507, 3533.2810668705733, 2514.4621345140968, 2303.512217164249, 4779.930741951191]
# Demonstrate model results on the test split
model_list = ['DR', 'LR','SVR','KR','RF','AR','GR','BR','ER']
plt.rcParams['figure.figsize'] = 4, 4
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y=cross_mse, palette='rocket', saturation=2.0)
plt.xlabel('Regression Models', fontsize=12)
plt.ylabel('MSE', fontsize=12)
plt.title('MSE of different Regression Models', fontsize=16)
plt.xticks(fontsize=12, horizontalalignment='center')
plt.yticks(fontsize=12)
# Annotate each bar with its MSE value
for bar in ax.patches:
    width, height = bar.get_width(), bar.get_height()
    x, y = bar.get_xy()
    ax.annotate(f'{round(height, 2)}', (x + width / 2, y + height), ha='center', fontsize=10)
plt.show()
Cross-validation shows that the Bagging regressor gives the best predictions. Its results on the test set are shown below.
def try_different_method(model):
    # Fit on the training split and plot predictions against true values
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)   # R2 on the test split
    print('R2:', score)
    result = model.predict(X_test)
    plt.figure()
    plt.plot(np.arange(len(result)), y_test, 'go-', label='true value')
    plt.plot(np.arange(len(result)), result, 'ro-', label='predict value')
    plt.title('score: %f' % score)
    plt.legend()
    plt.show()
try_different_method(br)
R2: 0.8331440202893755
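For a fuller picture than R2 alone, the test-set MSE and MAE can also be computed (a supplementary check, not part of the original notebook):

from sklearn.metrics import mean_squared_error, mean_absolute_error
pred = br.predict(X_test)
print('MSE:', mean_squared_error(y_test, pred))
print('MAE:', mean_absolute_error(y_test, pred))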
br.fit(X_train,y_train)
import joblib
# Save the model: joblib.dump only needs the file name as its second argument,
# which is arguably friendlier than pickle
joblib.dump(br, 'modelhaswebbar.dat')
['modelhaswebbar.dat']
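The saved model can later be restored with joblib.load and used directly for prediction (a minimal usage sketch):

# Reload the persisted Bagging model and check it predicts as before
loaded_br = joblib.load('modelhaswebbar.dat')
print(loaded_br.predict(X_test.iloc[:5]))   # should match br.predict(X_test.iloc[:5])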