In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

import warnings
warnings.filterwarnings('ignore')
In [2]:
raw_data = pd.read_excel('data.xlsx', engine="openpyxl")

raw_data.info()
raw_data.describe()
raw_data.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788 entries, 0 to 787
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   棱柱体抗拉强度ft    788 non-null    float64
 1   圆柱体抗压强度fc'   788 non-null    float64
 2   立方体抗压强度fcu   788 non-null    float64
 3   棱柱体抗压强度fc    788 non-null    float64
 4   截面宽度b        788 non-null    int64  
 5   截面有效高度h0     788 non-null    int64  
 6   剪跨比a/d       788 non-null    float64
 7   纵筋配筋率ρ       558 non-null    float64
 8   箍筋屈服强度       230 non-null    float64
 9   配箍率          230 non-null    float64
 10  试件极限受剪承载力Vu  788 non-null    float64
dtypes: float64(9), int64(2)
memory usage: 67.8 KB
Out[2]:
棱柱体抗拉强度ft 圆柱体抗压强度fc' 立方体抗压强度fcu 棱柱体抗压强度fc 截面宽度b 截面有效高度h0 剪跨比a/d 纵筋配筋率ρ 箍筋屈服强度 配箍率 试件极限受剪承载力Vu
0 2.8 28.4 35.5 27.0 190 270 3.0 2.07 NaN NaN 61.5
1 2.8 28.4 35.5 27.0 190 270 3.0 2.07 NaN NaN 78.5
2 2.8 28.4 35.5 27.0 190 270 4.0 2.07 NaN NaN 62.0
3 2.8 28.4 35.5 27.0 190 270 4.0 2.07 NaN NaN 69.5
4 2.9 29.7 37.1 28.2 190 278 5.0 2.01 NaN NaN 63.5
In [3]:
# Preprocessing: normalize column names to lowercase with underscores

raw_data.columns = [x.lower().strip().replace(' ','_') for x in raw_data.columns]

def show_missing_features(x):
    """Summarize missing-value count, percentage, and dtype per column."""
    total = x.isnull().sum()
    percent = x.isnull().sum() / x.isnull().count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    tt['Types'] = [str(x[col].dtype) for col in x.columns]  # use the argument, not the global raw_data
    return tt.T

show_missing_features(raw_data)
Out[3]:
棱柱体抗拉强度ft 圆柱体抗压强度fc' 立方体抗压强度fcu 棱柱体抗压强度fc 截面宽度b 截面有效高度h0 剪跨比a/d 纵筋配筋率ρ 箍筋屈服强度 配箍率 试件极限受剪承载力vu
Total 0 0 0 0 0 0 0 230 558 558 0
Percent 0 0 0 0 0 0 0 29.1878 70.8122 70.8122 0
Types float64 float64 float64 float64 int64 int64 float64 float64 float64 float64 float64
  • Missing values are filled with 0 here; for the stirrup yield strength and stirrup ratio columns this is natural, since specimens without stirrups simply have none.
In [4]:
# Fill the missing values with 0
# (all columns with gaps are float columns, so a single fillna suffices)

raw_data = raw_data.fillna(0)

raw_data.head()
Out[4]:
棱柱体抗拉强度ft 圆柱体抗压强度fc' 立方体抗压强度fcu 棱柱体抗压强度fc 截面宽度b 截面有效高度h0 剪跨比a/d 纵筋配筋率ρ 箍筋屈服强度 配箍率 试件极限受剪承载力vu
0 2.8 28.4 35.5 27.0 190 270 3.0 2.07 0.0 0.0 61.5
1 2.8 28.4 35.5 27.0 190 270 3.0 2.07 0.0 0.0 78.5
2 2.8 28.4 35.5 27.0 190 270 4.0 2.07 0.0 0.0 62.0
3 2.8 28.4 35.5 27.0 190 270 4.0 2.07 0.0 0.0 69.5
4 2.9 29.7 37.1 28.2 190 278 5.0 2.01 0.0 0.0 63.5
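A quick sanity check that nothing was left unfilled (a one-line verification, not in the original run):

raw_data.isnull().sum().sum()  # expect 0 after the fill above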

Data Splitting and Feature Engineering¶

In [5]:
# Train/test split

label_col = ['试件极限受剪承载力vu']
cols = [x for x in raw_data.columns if x not in label_col]  # these columns are the features used for prediction

raw_X = raw_data[cols]
raw_y = raw_data['试件极限受剪承载力vu']

X_train, X_test, y_train, y_test = train_test_split(raw_X, raw_y, test_size = 0.2, random_state=101)
X_train
Out[5]:
棱柱体抗拉强度ft 圆柱体抗压强度fc' 立方体抗压强度fcu 棱柱体抗压强度fc 截面宽度b 截面有效高度h0 剪跨比a/d 纵筋配筋率ρ 箍筋屈服强度 配箍率
174 3.4 39.0 51.3 39.0 300 925 2.92 1.01 0.0 0.00
416 2.9 28.8 37.8 28.8 154 268 2.53 0.76 0.0 0.00
744 3.3 37.8 47.2 35.6 153 500 0.78 0.00 333.0 0.89
355 2.7 25.2 33.1 25.2 203 403 3.78 2.55 0.0 0.00
68 3.0 31.1 38.8 29.5 152 1096 3.97 2.73 0.0 0.00
... ... ... ... ... ... ... ... ... ... ...
75 2.8 28.6 35.8 27.2 612 270 3.02 2.73 0.0 0.00
599 2.6 24.3 30.3 23.1 307 464 4.93 0.00 325.4 0.10
575 3.5 42.7 53.4 40.9 254 456 4.01 0.00 372.3 0.16
337 2.1 16.3 21.5 16.3 152 268 3.41 1.89 0.0 0.00
523 2.5 22.4 28.0 21.3 120 460 0.87 0.82 0.0 0.00

630 rows × 10 columns

In [6]:
# Feature engineering: drop features whose absolute pairwise correlation
# exceeds 0.95, i.e. remove features carrying essentially duplicate information

corr_matrix = raw_data.corr().abs()

# Keep only the upper triangle so that each pair is examined once
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

threshold = 0.95
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)
X_train
Out[6]:
棱柱体抗拉强度ft 截面宽度b 截面有效高度h0 剪跨比a/d 纵筋配筋率ρ 箍筋屈服强度 配箍率
174 3.4 300 925 2.92 1.01 0.0 0.00
416 2.9 154 268 2.53 0.76 0.0 0.00
744 3.3 153 500 0.78 0.00 333.0 0.89
355 2.7 203 403 3.78 2.55 0.0 0.00
68 3.0 152 1096 3.97 2.73 0.0 0.00
... ... ... ... ... ... ... ...
75 2.8 612 270 3.02 2.73 0.0 0.00
599 2.6 307 464 4.93 0.00 325.4 0.10
575 3.5 254 456 4.01 0.00 372.3 0.16
337 2.1 152 268 3.41 1.89 0.0 0.00
523 2.5 120 460 0.87 0.82 0.0 0.00

630 rows × 7 columns
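One caveat: corr() above runs over every column of raw_data, target included, so a feature that correlates strongly with the label could in principle land in to_drop. A variant restricted to the features only, reusing the names already defined (a sketch; for this dataset it should drop the same three strength columns):

corr_feat = raw_X.corr().abs()
upper_feat = corr_feat.where(np.triu(np.ones(corr_feat.shape), k=1).astype(bool))
to_drop_feat = [c for c in upper_feat.columns if (upper_feat[c] > threshold).any()]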

Hyperparameter Tuning with Cross-Validation¶

In [7]:
# Decision tree regression
from sklearn.tree import DecisionTreeRegressor
dr = DecisionTreeRegressor()

# Linear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

# SVM regression
from sklearn.svm import SVR
svr = SVR()

# KNN regression
from sklearn.neighbors import KNeighborsRegressor
kr = KNeighborsRegressor()

### The following are ensemble methods
# Random forest regression
from sklearn import ensemble
rf = ensemble.RandomForestRegressor()

# AdaBoost regression
from sklearn.ensemble import AdaBoostRegressor
ar = AdaBoostRegressor()

# GBDT regression
from sklearn.ensemble import GradientBoostingRegressor
gr = GradientBoostingRegressor()

# Bagging regression
from sklearn.ensemble import BaggingRegressor
br = BaggingRegressor()

# ExtraTree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor
er = ExtraTreeRegressor()

Decision Tree Regression¶

Its main hyperparameters:

  1. max_depth: the maximum tree depth. With many samples and many features, it is worth capping to control overfitting.
  2. min_samples_split: if a node holds fewer samples than min_samples_split, no further split of that node is attempted.
  3. min_samples_leaf: the minimum number of samples in a leaf; leaves smaller than this are pruned away.
In [8]:
max_depths=[2,4,6,8,10]
min_samples_splits=[2,4,6,8,10]
min_samples_leafs=[1,2,3,4,5]
random_states=[3]

parameters=dict(max_depth=max_depths,
                min_samples_split=min_samples_splits,
                min_samples_leaf=min_samples_leafs,
                random_state=random_states)
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(dr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-3050.3989197979013
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 3}
In [9]:
dr = DecisionTreeRegressor(max_depth=10, min_samples_leaf=1, min_samples_split=2, random_state=3)
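Since GridSearchCV refits the best configuration on the full training set by default (refit=True), the tuned model can also be taken straight from the search object instead of re-typing the parameters:

dr = grid.best_estimator_  # equivalent to the explicit re-instantiation above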

SVM Regression¶

The main hyperparameters of support vector regression are the kernel, the penalty coefficient C, and the kernel coefficient gamma:

  1. The kernel is usually rbf or linear, for non-linearly and linearly separable problems respectively.
  2. C is the penalty coefficient of the objective: a large C tolerates little error and tends to overfit.
  3. gamma is the rbf kernel coefficient: it controls how local each training point's influence is, which affects model complexity and the number of support vectors, and hence training and prediction speed.
In [10]:
kernels=['rbf','linear']
Cs=[0.01,0.1,1]
gammas=[0.1,0.2,0.4,0.6]

parameters=dict(kernel=kernels,C=Cs,gamma=gammas)
grid=GridSearchCV(svr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-8962.711340550044
{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
In [11]:
svr = SVR(C=1, gamma=0.1, kernel='linear')
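SVR's comparatively poor CV score is likely tied to feature scaling: SVR is scale-sensitive, and the features here span very different ranges (section width in mm versus reinforcement ratios in %). A sketch of how standardization could be folded in via a pipeline (not part of the original tuning, so the scores would need re-checking):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The pipeline standardizes features before every fit/predict and can be
# passed to GridSearchCV or cross_val_score like any plain estimator.
svr_scaled = make_pipeline(StandardScaler(), SVR(C=1, gamma=0.1, kernel='linear'))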

K-Nearest Neighbors Regression¶

Its main hyperparameters are the number of neighbors K, the neighbor weighting scheme weights, and the distance metric:

  1. n_neighbors: the K value, 5 by default.
  2. weights: either uniform weighting or weighting by inverse distance.
  3. p: the distance metric; p=1 is Manhattan distance, p=2 is Euclidean distance.
In [12]:
k=range(1,10)
weightss=['uniform','distance']
ps=[1,2]

parameters=dict(n_neighbors=k,weights=weightss,p=ps)
grid=GridSearchCV(kr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-5279.807461163706
{'n_neighbors': 7, 'p': 1, 'weights': 'distance'}
In [13]:
kr = KNeighborsRegressor(n_neighbors=7, p=1, weights='distance')

Random Forest Regression¶

A random forest inherits the decision tree's hyperparameters and adds ensemble-level ones such as the number of trees:

  1. More trees generally give better results, but training and prediction get slower.
  2. The per-tree hyperparameters have a smaller effect and can be left at their defaults.
In [14]:
random_states=[3]
n_estimators=[100,200,300,400,500]

parameters=dict(random_state=random_states,
                n_estimators=n_estimators
               )
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(rf,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2345.780706138281
{'n_estimators': 200, 'random_state': 3}
In [15]:
rf = ensemble.RandomForestRegressor(n_estimators=200, random_state=3)

AdaBoost Regression¶

With decision trees as base learners, AdaBoost inherits the tree hyperparameters and adds the number of estimators and the learning rate:

  1. More trees generally give better results, but training and prediction get slower.
  2. The learning rate should be tuned jointly with the number of trees.
In [16]:
random_states=[3]
n_estimators=[50,100,200,300,400]
learning_rates=[0.1,0.2,0.4,0.8,1.6]

parameters=dict(random_state=random_states,
                n_estimators=n_estimators,
                learning_rate=learning_rates
               )
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(ar,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-4231.662325003929
{'learning_rate': 0.2, 'n_estimators': 100, 'random_state': 3}
In [17]:
ar = AdaBoostRegressor(learning_rate=0.2, n_estimators=100, random_state=3) 

GBDT Regression¶

Gradient-boosted decision trees are similar to AdaBoost: decision trees serve as base learners, so the tree hyperparameters apply, plus the number of trees and the learning rate:

  1. More trees generally give better results, but training and prediction get slower.
  2. The learning rate should be tuned jointly with the number of trees.
In [18]:
random_states=[3]
n_estimators=[50,100,200,300,400]
learning_rates=[0.1,0.2,0.4,0.8,1.6]

parameters=dict(random_state=random_states,
                n_estimators=n_estimators,
                learning_rate=learning_rates
               )
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(gr,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2141.260501893247
{'learning_rate': 0.2, 'n_estimators': 200, 'random_state': 3}
In [19]:
gr = GradientBoostingRegressor(learning_rate=0.2, n_estimators=200, random_state=3)

Bagging Regression¶

Given a training set D of size n, bagging draws m subsets Di of size n' from it uniformly with replacement (i.e. bootstrap sampling) and uses each as a new training set. With decision trees as the base learner, the main hyperparameter is the number of estimators (a bootstrap sketch follows this list).

  1. More trees generally give better results, but training and prediction get slower.
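A minimal sketch of one bootstrap draw, just to make "uniformly with replacement" concrete (illustrative only, not part of the pipeline):

rng = np.random.default_rng(3)
idx = rng.integers(0, len(X_train), size=len(X_train))  # n row indices drawn with replacement
X_boot, y_boot = X_train.iloc[idx], y_train.iloc[idx]   # one bootstrap replicate of the training set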
In [20]:
random_states=[3]
n_estimators=[50,100,200,300,400]

parameters=dict(random_state=random_states,
                n_estimators=n_estimators,
               )
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(br,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-2397.1728460506847
{'n_estimators': 200, 'random_state': 3}
In [21]:
br = BaggingRegressor(n_estimators=200,random_state=3)

Extra-Tree (Extremely Randomized Tree) Regression¶

sklearn.tree.ExtraTreeRegressor is a single extremely randomized tree (the forest version is ExtraTreesRegressor): candidate split thresholds are drawn at random rather than optimized, which makes it more random than a tree grown inside a regular random forest. The decision-tree hyperparameters apply, notably max_features, the number of features considered at each split.

In [22]:
random_states=[3]
max_features=[2,5,7]  # X_train only has 7 features, so 7 is the largest valid value

parameters=dict(random_state=random_states,
                max_features=max_features
               )
# 3-fold grid search for the best hyperparameters
grid=GridSearchCV(er,parameters,cv=3,scoring='neg_mean_squared_error')
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)
-5171.304530630513
{'max_features': 5, 'random_state': 3}
In [23]:
er = ExtraTreeRegressor(max_features=5,random_state=3)

Cross-Validation to Select the Best Model¶

In [24]:
# Cross-validation: compare the tuned models by 3-fold CV mean squared error

models = [dr, lr, svr, kr, rf, ar, gr, br, er]

cross_mse = []
for model in models:
    scores = cross_val_score(model, X_train, y_train,
                             scoring='neg_mean_squared_error', cv=3)
    cross_mse.append(-scores.mean())  # flip the sign back to a plain MSE

cross_mse
Out[24]:
[3050.3989197979013,
 8591.61818230716,
 8962.711340550044,
 5279.807461163706,
 2345.780706138281,
 4231.662325003929,
 2141.260501893247,
 2397.1728460506847,
 5171.304530630513]
In [25]:
# Visualize the cross-validation MSE of each model
model_list = ['DR', 'LR','SVR','KR','RF','AR','GR','BR','ER']

plt.rcParams['figure.figsize']=4,4
sns.set_style('darkgrid')
ax = sns.barplot(x=model_list, y=cross_mse, palette = "rocket", saturation =2.0)
plt.xlabel('Regression Models', fontsize = 12 )
plt.ylabel('MSE', fontsize = 12)
plt.title('MSE of different Regression Models', fontsize = 16)
plt.xticks(fontsize = 12, horizontalalignment = 'center')
plt.yticks(fontsize = 12)
for i in ax.patches:
    width, height = i.get_width(), i.get_height()
    x, y = i.get_xy() 
    ax.annotate(f'{round(height,2)}', (x + width/2, y + height), ha='center', fontsize = 10)
plt.show()

Model Selection¶

Cross-validation shows that GBDT (gradient-boosted decision trees) achieves the lowest MSE of the nine models; its performance on the test set is shown below.

In [26]:
def try_different_method(model):
    """Fit on the training split, report R² on the test split, and plot predictions."""
    model.fit(X_train,y_train)
    score = model.score(X_test, y_test)  # R² on the held-out test set
    print('R2:',score)
    result = model.predict(X_test)
    plt.figure()
    plt.plot(np.arange(len(result)), y_test,'go-',label='true value')
    plt.plot(np.arange(len(result)),result,'ro-',label='predict value')
    plt.title('score: %f'%score)
    plt.legend()
    plt.show()

try_different_method(gr)
R2: 0.8657070812586823
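For a like-for-like comparison with the cross-validation numbers above, the test-set MSE can be computed as well (a small sketch; gr was fitted inside try_different_method):

from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, gr.predict(X_test))  # directly comparable to the cross_mse values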
In [27]:
gr.fit(X_train,y_train)
import joblib

# Save the fitted model; joblib.dump only needs the object and a file name,
# which is arguably friendlier than the raw pickle interface
joblib.dump(gr,'model.dat')
Out[27]:
['model.dat']
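To reuse the saved model later, it can be loaded back with joblib (a usage sketch):

model = joblib.load('model.dat')
model.predict(X_test[:5])  # predictions from the restored model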