import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import warnings
warnings.filterwarnings('ignore')
raw_data = pd.read_excel('datahaswebbar.xlsx', engine="openpyxl")
raw_data.info()
raw_data.describe()
raw_data.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   棱柱体抗拉强度ft      230 non-null    float64
 1   圆柱体抗压强度fc'     230 non-null    float64
 2   立方体抗压强度fcu     230 non-null    float64
 3   棱柱体抗压强度fc      230 non-null    float64
 4   截面宽度b           230 non-null    int64
 5   截面有效高度h0        230 non-null    int64
 6   剪跨比             230 non-null    float64
 7   箍筋屈服强度          230 non-null    float64
 8   配箍率             230 non-null    float64
 9   试件极限受剪承载力Vu     230 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 18.1 KB
|   | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比 | 箍筋屈服强度 | 配箍率 | 试件极限受剪承载力Vu |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.4 | 40.1 | 50.2 | 38.1 | 254 | 456 | 4.01 | 341.3 | 0.17 | 206.8 |
| 1 | 3.3 | 38.8 | 48.5 | 36.9 | 254 | 456 | 4.01 | 341.3 | 0.11 | 159.7 |
| 2 | 3.3 | 37.6 | 47.1 | 35.8 | 254 | 456 | 4.01 | 341.3 | 0.11 | 160.1 |
| 3 | 3.3 | 38.9 | 48.6 | 36.9 | 254 | 456 | 4.01 | 341.3 | 0.07 | 148.1 |
| 4 | 3.3 | 37.2 | 46.5 | 35.3 | 254 | 456 | 4.01 | 372.3 | 0.11 | 216.6 |
# Preprocessing: normalize column names (lowercase, strip, spaces -> underscores)
raw_data.columns = [x.lower().strip().replace(' ','_') for x in raw_data.columns]
def show_missing_features(x):
    # Report per-column missing counts, percentages, and dtypes
    total = x.isnull().sum()
    percent = x.isnull().sum() / x.isnull().count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    tt['Types'] = [str(x[col].dtype) for col in x.columns]
    return np.transpose(tt)
show_missing_features(raw_data)
|   | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比 | 箍筋屈服强度 | 配箍率 | 试件极限受剪承载力vu |
|---|---|---|---|---|---|---|---|---|---|---|
| Total | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Percent | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| Types | float64 | float64 | float64 | float64 | int64 | int64 | float64 | float64 | float64 | float64 |
# Fill missing values with 0 (none are present here, as the report above shows)
raw_data = raw_data.fillna(0)
raw_data.head()
|   | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比 | 箍筋屈服强度 | 配箍率 | 试件极限受剪承载力vu |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.4 | 40.1 | 50.2 | 38.1 | 254 | 456 | 4.01 | 341.3 | 0.17 | 206.8 |
| 1 | 3.3 | 38.8 | 48.5 | 36.9 | 254 | 456 | 4.01 | 341.3 | 0.11 | 159.7 |
| 2 | 3.3 | 37.6 | 47.1 | 35.8 | 254 | 456 | 4.01 | 341.3 | 0.11 | 160.1 |
| 3 | 3.3 | 38.9 | 48.6 | 36.9 | 254 | 456 | 4.01 | 341.3 | 0.07 | 148.1 |
| 4 | 3.3 | 37.2 | 46.5 | 35.3 | 254 | 456 | 4.01 | 372.3 | 0.11 | 216.6 |
# Train/test split
label_col = ['试件极限受剪承载力vu']
cols = [x for x in raw_data.columns if x not in label_col] # These columns are the features we can use to predict
raw_X = raw_data[cols]
raw_y = raw_data['试件极限受剪承载力vu']
X_train, X_test, y_train, y_test = train_test_split(raw_X, raw_y, test_size = 0.13, random_state=101)
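As a quick sanity check (not in the original notebook), the split sizes can be confirmed; with test_size=0.13 on 230 samples, train_test_split rounds the test set up to 30 rows, leaving 200 for training:

# Verify the split sizes
print(X_train.shape, X_test.shape)   # expected: (200, 9) (30, 9)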
X_train
|     | 棱柱体抗拉强度ft | 圆柱体抗压强度fc' | 立方体抗压强度fcu | 棱柱体抗压强度fc | 截面宽度b | 截面有效高度h0 | 剪跨比 | 箍筋屈服强度 | 配箍率 |
|---|---|---|---|---|---|---|---|---|---|
| 32 | 3.8 | 48.1 | 60.2 | 46.9 | 152 | 254 | 3.60 | 279.9 | 0.41 |
| 208 | 2.3 | 20.2 | 25.2 | 19.2 | 200 | 537 | 2.80 | 210.0 | 0.49 |
| 91 | 2.6 | 24.6 | 30.8 | 23.4 | 203 | 390 | 1.95 | 331.1 | 0.37 |
| 31 | 3.5 | 43.0 | 53.7 | 41.2 | 152 | 254 | 3.36 | 269.9 | 0.21 |
| 112 | 2.6 | 24.0 | 30.0 | 22.8 | 203 | 390 | 1.17 | 331.1 | 0.61 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 63 | 2.5 | 23.2 | 29.0 | 22.1 | 150 | 315 | 2.00 | 354.4 | 0.23 |
| 70 | 2.7 | 27.0 | 33.8 | 25.7 | 151 | 265 | 4.21 | 344.8 | 0.21 |
| 81 | 2.6 | 24.6 | 30.8 | 23.4 | 140 | 464 | 1.75 | 330.3 | 0.74 |
| 11 | 3.3 | 37.6 | 47.0 | 35.7 | 254 | 456 | 4.01 | 372.3 | 0.06 |
| 95 | 3.5 | 42.1 | 52.7 | 40.3 | 203 | 390 | 1.95 | 331.1 | 0.37 |

200 rows × 9 columns
# Feature engineering: use the absolute correlation matrix with a 0.95 threshold
# to drop features that duplicate information carried by other features
corr_matrix = raw_data.corr().abs()
corr_matrix.head()
# Keep only the upper triangle so each feature pair is examined once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
upper.head()
threshold = 0.95
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
X_train = X_train.drop(columns = to_drop)
X_test = X_test.drop(columns = to_drop)
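To see which features the 0.95 threshold removed (an illustrative check, not in the original run), the list can be printed; judging from the X_train display below, the three compressive-strength columns are dropped, leaving six features:

# Columns flagged as redundant by the correlation filter
print(to_drop)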
X_train
|     | 棱柱体抗拉强度ft | 截面宽度b | 截面有效高度h0 | 剪跨比 | 箍筋屈服强度 | 配箍率 |
|---|---|---|---|---|---|---|
| 32 | 3.8 | 152 | 254 | 3.60 | 279.9 | 0.41 |
| 208 | 2.3 | 200 | 537 | 2.80 | 210.0 | 0.49 |
| 91 | 2.6 | 203 | 390 | 1.95 | 331.1 | 0.37 |
| 31 | 3.5 | 152 | 254 | 3.36 | 269.9 | 0.21 |
| 112 | 2.6 | 203 | 390 | 1.17 | 331.1 | 0.61 |
| ... | ... | ... | ... | ... | ... | ... |
| 63 | 2.5 | 150 | 315 | 2.00 | 354.4 | 0.23 |
| 70 | 2.7 | 151 | 265 | 4.21 | 344.8 | 0.21 |
| 81 | 2.6 | 140 | 464 | 1.75 | 330.3 | 0.74 |
| 11 | 3.3 | 254 | 456 | 4.01 | 372.3 | 0.06 |
| 95 | 3.5 | 203 | 390 | 1.95 | 331.1 | 0.37 |

200 rows × 6 columns
# Decision tree regression
from sklearn.tree import DecisionTreeRegressor
dr = DecisionTreeRegressor()
# Linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
# SVM regression
from sklearn.svm import SVR
svr = SVR()
# KNN regression
from sklearn.neighbors import KNeighborsRegressor
kr = KNeighborsRegressor()
### The models below are ensemble methods
# Random forest regression
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
# AdaBoost regression
from sklearn.ensemble import AdaBoostRegressor
ar = AdaBoostRegressor()
# GBDT regression
from sklearn.ensemble import GradientBoostingRegressor
gr = GradientBoostingRegressor()
# Bagging regression
from sklearn.ensemble import BaggingRegressor
br = BaggingRegressor()
# Extra-tree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor
er = ExtraTreeRegressor()
The decision tree's main hyperparameters are the maximum depth, the minimum number of samples required to split a node, and the minimum number of samples per leaf.
max_depths=[2,4,6,8,10]
min_samples_splits=[2,4,6,8,10]
min_samples_leafs=[1,2,3,4,5]
random_states=[3]
parameters=dict(max_depth=max_depths,
min_samples_split=min_samples_splits,
min_samples_leaf=min_samples_leafs,
random_state=random_states)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(dr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-3202.931581968067
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 6, 'random_state': 3}
dr = DecisionTreeRegressor(max_depth=10, min_samples_leaf=1, min_samples_split=6, random_state=3)
The main hyperparameters to tune for the support vector machine are the kernel function, the penalty coefficient C, and the kernel coefficient gamma.
kernels=['rbf','linear']
Cs=[0.01,0.1,1]
gammas=[0.1,0.2,0.4,0.6]
parameters=dict(kernel=kernels,C=Cs,gamma=gammas)
grid=GridSearchCV(svr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-6273.949828273424
{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
svr = SVR(C=1, gamma=0.1, kernel='linear')
The main hyperparameters of KNN are the number of neighbors K, the neighbor weighting scheme (weights), and the distance metric (the Minkowski power parameter p).
k=range(1,10)
weightss=['uniform','distance']
ps=[1,2]
parameters=dict(n_neighbors=k,weights=weightss,p=ps)
grid=GridSearchCV(kr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-3981.6608777688257
{'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
kr = KNeighborsRegressor(n_neighbors=6, p=1, weights='distance')
The random forest's main hyperparameter tuned here is the number of trees (n_estimators).
random_states=[3]
n_estimators=[50,100,200,300,400]
parameters=dict(random_state=random_states,
n_estimators=n_estimators
)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(rf,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2325.4599213494507
{'n_estimators': 100, 'random_state': 3}
rf = RandomForestRegressor(n_estimators=100, random_state=3)
For AdaBoost, the number of estimators and the learning rate are tuned.
random_states=[3]
n_estimators=[20,50,100,200,300]
learning_rates=[0.1,0.2,0.4,0.8,1.6]
parameters=dict(random_state=random_states,
n_estimators=n_estimators,
learning_rate=learning_rates
)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(ar,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-3533.2810668705733
{'learning_rate': 0.4, 'n_estimators': 50, 'random_state': 3}
ar = AdaBoostRegressor(learning_rate=0.4, n_estimators=50, random_state=3)
For GBDT, the number of estimators and the learning rate are tuned as well.
random_states=[3]
n_estimators=[50,100,200,300,400]
learning_rates=[0.05,0.1,0.2,0.4,0.8]
parameters=dict(random_state=random_states,
n_estimators=n_estimators,
learning_rate=learning_rates
)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(gr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2514.4621345140968
{'learning_rate': 0.1, 'n_estimators': 100, 'random_state': 3}
gr = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, random_state=3)
Given a training set D of size n, Bagging selects m new training sets Di, each of size n', by sampling from D uniformly with replacement (i.e., bootstrap sampling). A decision tree serves as the base learner, and the hyperparameters include the number of trees.
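A minimal illustrative sketch of one bootstrap draw with NumPy (BaggingRegressor does this internally; the seed here is arbitrary):

# One bootstrap sample: draw n row indices uniformly with replacement
rng = np.random.default_rng(3)
boot_idx = rng.integers(0, len(X_train), size=len(X_train))
X_boot, y_boot = X_train.iloc[boot_idx], y_train.iloc[boot_idx]
# On average about 63.2% of the unique rows appear in each bootstrap sample
print(len(np.unique(boot_idx)) / len(X_train))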
random_states=[3]
n_estimators=[50,100,200,300,400]
parameters=dict(random_state=random_states,
n_estimators=n_estimators,
)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(br,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2303.512217164249
{'n_estimators': 100, 'random_state': 3}
br = BaggingRegressor(n_estimators=100,random_state=3)
Extremely randomized trees choose split thresholds more randomly than the de-correlated trees of a regular random forest; for the single-tree ExtraTreeRegressor used here, with a decision tree as the base learner, the hyperparameters include the maximum number of features considered per split.
random_states=[3]
max_features=[2,5,10]
parameters=dict(random_state=random_states,
max_features=max_features
)
# 3-fold grid search to find the optimal hyperparameters
grid=GridSearchCV(er,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-4779.930741951191
{'max_features': 5, 'random_state': 3}
er = ExtraTreeRegressor(max_features=5,random_state=3)
# Cross-validation: mean 3-fold MSE for each tuned model
models = [dr, lr, svr, kr, rf, ar, gr, br, er]
cross_mse = []
for model in models:
    scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=3)
    cross_mse.append(-scores.mean())
cross_mse
[3202.931581968067, 5085.57280973098, 6273.949828273424, 3981.6608777688257, 2325.4599213494507, 3533.2810668705733, 2514.4621345140968, 2303.512217164249, 4779.930741951191]
# Demonstrate model results on the test split
model_list = ['DR', 'LR','SVR','KR','RF','AR','GR','BR','ER']
plt.rcParams['figure.figsize'] = 4, 4
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y=cross_mse, palette='rocket', saturation=2.0)
plt.xlabel('Regression Models', fontsize=12)
plt.ylabel('MSE', fontsize=12)
plt.title('MSE of different Regression Models', fontsize=16)
plt.xticks(fontsize=12, horizontalalignment='center')
plt.yticks(fontsize=12)
# Annotate each bar with its MSE value
for bar in ax.patches:
    width, height = bar.get_width(), bar.get_height()
    x, y = bar.get_xy()
    ax.annotate(f'{round(height, 2)}', (x + width / 2, y + height), ha='center', fontsize=10)
plt.show()
Cross-validation shows that the Bagging regressor gives the best predictions. Its results on the test set are shown below.
def try_different_method(model):
    # Fit on the training split and plot predictions against true values
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)   # R2 on the test split
    print('R2:', score)
    result = model.predict(X_test)
    plt.figure()
    plt.plot(np.arange(len(result)), y_test, 'go-', label='true value')
    plt.plot(np.arange(len(result)), result, 'ro-', label='predict value')
    plt.title('score: %f' % score)
    plt.legend()
    plt.show()
try_different_method(br)
R2: 0.8331440202893755
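For a fuller picture than R2 alone, the test-set MSE and MAE can also be computed (a supplementary check, not part of the original notebook):

from sklearn.metrics import mean_squared_error, mean_absolute_error
pred = br.predict(X_test)
print('MSE:', mean_squared_error(y_test, pred))
print('MAE:', mean_absolute_error(y_test, pred))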
br.fit(X_train,y_train)
import joblib
# Save the model: joblib.dump only needs the file name as its second argument,
# which is arguably friendlier than pickle
joblib.dump(br, 'modelhaswebbar.dat')
['modelhaswebbar.dat']
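The saved model can later be restored with joblib.load and used directly for prediction (a minimal usage sketch):

# Reload the persisted Bagging model and check it predicts as before
loaded_br = joblib.load('modelhaswebbar.dat')
print(loaded_br.predict(X_test.iloc[:5]))   # should match br.predict(X_test.iloc[:5])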