您現在的位置：程式師世界 >> 編程語言 >> 更多編程語言 >> Python
Python數據處理課程設計-房屋價格預測-代碼

編輯：Python
代碼部分如下所示：
#!/usr/bin/env python
# coding: utf-8
# ## 導包
# In[1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as ppf #探索性數據分析（EDA）
import warnings##忽略警告
warnings.filterwarnings('ignore')
get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('ggplot')
# In[2]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder#標簽編碼
from sklearn.preprocessing import RobustScaler, StandardScaler#去除異常值與數據標准化
from sklearn.pipeline import Pipeline, make_pipeline#構建管道
from scipy.stats import skew#偏度
from sklearn.impute import SimpleImputer
# ## 讀取並查看原數據
# In[3]:
# Load the training split. NOTE: the path is hard-coded to one machine's
# drive layout and must be adjusted before running anywhere else.
train = pd.read_csv(r"G:\study類\大三上\機器學習\課程設計\datas\train.csv")  # read the data in
# In[4]:
# Load the test split from the same hard-coded directory.
test = pd.read_csv(r"G:\study類\大三上\機器學習\課程設計\datas\test.csv")  # read the data in
# In[5]:
train.head()  # shows the first five rows by default
# In[6]:
test.head()
從中可以看出還是有很多數據需要處理的
# ## 數據探索性分析 pandas_profiling
# In[7]:
# Build the pandas_profiling report for the training frame.
ppf.ProfileReport(train)
# In[8]:
train.YearBuilt  # display this column
# In[9]:
train.SalePrice
# ## Use a box plot to look for outliers
# In[10]:
plt.figure(figsize=(12, 8))
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there. The keyword
# form works on old and new releases alike.
sns.boxplot(x=train.YearBuilt, y=train.SalePrice)
# ## 通過散點圖來觀察存在線型的關系
# In[11]:
def _scatter_against_price(frame, column):
    """Scatter-plot ``frame[column]`` against ``frame.SalePrice``.

    Opens a new 12x6 figure, labels both axes and clamps the y axis to
    0..800000 so that successive plots are directly comparable.
    """
    plt.figure(figsize=(12, 6))
    plt.scatter(x=frame[column], y=frame.SalePrice)
    plt.xlabel(column, fontsize=13)
    plt.ylabel("SalePrice", fontsize=13)
    plt.ylim(0, 800000)


_scatter_against_price(train, "TotalBsmtSF")
# In[12]:
# Drop the rows whose basement area is above 5000.
train.drop(train[train["TotalBsmtSF"] > 5000].index, inplace=True)
# In[13]:
# Same plot again, after the drop.
_scatter_against_price(train, "TotalBsmtSF")
# In[14]:
_scatter_against_price(train, "GrLivArea")
# ## Remove the points that stray too far from the linear trend (drop by index)
# In[15]:
train.drop(
    train[(train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)].index,
    inplace=True,
)
# Plot after the removal
# In[16]:
_scatter_against_price(train, "GrLivArea")
# ### 把test數據也做相同的處理
# In[17]:
# Stack train and test into one frame so both receive identical preprocessing;
# ignore_index=True renumbers the rows 0..n-1.
full = pd.concat([train,test],ignore_index=True)
# ### The Id column duplicates the row index, so drop it
# In[18]:
full.drop("Id",axis=1,inplace=True)
# In[19]:
full.head()  # inspect the frame after dropping the column
# In[20]:
full.info()  # column dtypes / non-null counts after the drop
# # Data cleaning -- filling and removing missing values
# #### Count the missing values per column and sort the counts
# NOTE(review): the original heading asked for a high-to-low ordering, but the
# sort below uses ascending=True (low to high) -- confirm which one is wanted.
# In[21]:
miss = full.isnull().sum()  # number of nulls per column
# In[22]:
miss[miss>0]
# In[23]:
miss[miss>0].sort_values(ascending=True)  # sorted from low to high
# In[24]:
full.info()  # show the frame summary again
# ## 空值的填充與刪除
# 對字符類型的進行填充
# In[25]:
# Columns whose missing entries are replaced by the literal string "None".
cols1 = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish", "GarageYrBlt", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
for col in cols1:
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form is not guaranteed to write
    # through to `full` (it does not under pandas Copy-on-Write).
    full[col] = full[col].fillna("None")
# In[26]:
full.head()
# Fill the numeric columns with 0
# In[27]:
cols=["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
for col in cols:
    full[col] = full[col].fillna(0)
# Fill the missing LotFrontage values with the column mean
# In[28]:
full["LotFrontage"] = full["LotFrontage"].fillna(np.mean(full["LotFrontage"]))
# Fill the columns below with their most frequent value (mode)
# In[29]:
cols2 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual", "SaleType","Exterior1st", "Exterior2nd"]
for col in cols2:
    full[col] = full[col].fillna(full[col].mode()[0])
# Check whether any column still has unfilled values
# In[30]:
full.isnull().sum()[full.isnull().sum()>0]
發現只有test的沒有標簽列，故已經把數據中的空值處理好了
# ## 數據預處理--把字符變成數值型
# In[31]:
# Most frequent MSZoning value (the value used to fill that column's gaps).
full["MSZoning"].mode()[0]
# In[32]:
pd.set_option('display.max_rows', None)  # lift the row limit; otherwise long output is elided with "..." and part of the data is hidden
full.MSZoning
從上面可以發現有一些取值並不是單純的類別名稱，比如第31行的「C (all)」。因此先把這些列統一轉換成字符串類型（其中數字型的列也會被當作類別特征處理），再使用LabelEncoder把每個類別編碼成數字。
# In[33]:
# Cast the mode-filled columns to str so every value is treated as a category
# label.
for col in cols2:
    full[col] = full[col].astype(str)
# In[34]:
lab = LabelEncoder()  # maps discrete numbers or text to integer codes
# #### Convert the string columns below to integer codes
# In[35]:
# One loop replaces the former one-line-per-column copy/paste (which encoded
# BsmtFinType1 twice). astype(str) is needed for GarageYrBlt, which holds
# years mixed with the "None" filler; for columns that already hold strings
# it leaves the values unchanged.
for col in cols1:
    full[col] = lab.fit_transform(full[col].astype(str))
# In[36]:
full.head()
# Encode the remaining (mode-filled) columns as integers too
# In[37]:
for col in cols2:
    full[col] = lab.fit_transform(full[col])
# In[38]:
full.head()
# #### Some columns are still strings at this point
# In[39]:
# Drop the target column so that `full` holds features only.
full.drop("SalePrice", axis=1, inplace=True)
# #### 從結果可以看出，行和列變得很多了
# #### 可以看到所有數據都顯示為數字型了
# In[40]:
##自己寫一個轉換函數
class labelenc(BaseEstimator, TransformerMixin):
    """Stateless transformer that label-encodes four columns.

    ``YearBuilt``, ``YearRemodAdd``, ``GarageYrBlt`` and ``BldgType`` are each
    replaced by the integer codes produced by a freshly fitted
    ``LabelEncoder``.
    """

    # Columns re-encoded by transform().
    _COLUMNS = ("YearBuilt", "YearRemodAdd", "GarageYrBlt", "BldgType")

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step fits in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four columns label-encoded.

        The input frame is left untouched; a transformer should not modify
        the data handed to it.
        """
        X = X.copy()
        for column in self._COLUMNS:
            X[column] = LabelEncoder().fit_transform(X[column])
        return X
# In[41]:
#寫一個轉換函數
class skew_dummies(BaseEstimator, TransformerMixin):
    """Log-transform strongly skewed columns, then one-hot encode.

    Parameters
    ----------
    skew : float, default 0.5
        Non-object columns whose absolute skewness is at least this value
        are replaced by ``log1p`` of themselves.
    """

    def __init__(self, skew=0.5):
        self.skew = skew

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step fits in a Pipeline."""
        return self

    def transform(self, X):
        """Return a transformed copy of ``X``; the input is not modified."""
        X = X.copy()
        # Skewness is only measured on the non-object columns.
        X_numeric = X.select_dtypes(exclude=["object"])
        # `skew` here is the module-level scipy.stats function; the threshold
        # lives on self.skew.
        skewness = X_numeric.apply(skew)
        skewness_features = skewness[abs(skewness) >= self.skew].index
        # log1p pulls long right tails in, moving the columns closer to a
        # normal shape.
        X[skewness_features] = np.log1p(X[skewness_features])
        # One-hot encode whatever categorical columns remain.
        return pd.get_dummies(X)
# In[42]:
from scipy.stats import norm
from scipy import stats


def get_dist_data(series):
    """Plot the distribution of ``series`` and print its skewness and kurtosis.

    Draws a distribution plot with a fitted normal curve, then a probability
    plot on a second figure, and finally prints the two shape statistics.
    """
    sns.distplot(series, fit=norm)
    plt.figure()  # separate figure for the probability plot
    stats.probplot(series, plot=plt)
    print("Skewness: %f" % series.skew())
    print("Kurtosis: %f" % series.kurt())


# Sale price. The training frame in this file is named `train`; the former
# `train_df` is never defined and raised a NameError.
get_dist_data(train['SalePrice'])
# In[43]:
# Log-transform the price and look at the distribution again
log_SalePrice = np.log(train['SalePrice'] + 1)
get_dist_data(log_SalePrice)
# In[44]:
#對數化處理
# Before/after distribution plots for the log transform, two panels per
# feature on a 12x4 grid.
# NOTE(review): `skewed`, `df` and `df_temp` are not defined anywhere in the
# visible file, so this cell cannot run as-is. Presumably `skewed` is a Series
# indexed by feature name, `df` the raw features and `df_temp` their
# log-transformed counterpart -- confirm against the original notebook.
plot_no = 0
plt.figure(figsize=(18, 60))
for feature in skewed.index:
    plt.subplot(12, 4, plot_no + 1)
    sns.distplot(df[feature], kde=True, fit=norm, color="purple")
    plt.title("Before", fontsize=20)
    plt.subplot(12, 4, plot_no + 2)
    sns.distplot(df_temp[feature], kde=True, fit=norm, color="green")
    plt.title("After", fontsize=20)
    plot_no += 2
plt.tight_layout()
# In[45]:
# 構建管道
# Two-step preprocessing pipeline: label-encode the year/BldgType columns,
# then log-transform columns with |skewness| >= 2 and one-hot encode.
pipe = Pipeline([  # build the pipeline
    ('labenc', labelenc()),
    ('skew_dummies', skew_dummies(skew=2)),
])
# In[46]:
# Work on a copy so the original frame is kept for later use
full2 = full.copy()
# In[47]:
pipeline_data = pipe.fit_transform(full2)
# In[48]:
pipeline_data.shape
# In[49]:
pipeline_data.head()
# In[50]:
from sklearn.linear_model import Lasso  # fitted here to obtain feature importances
# X_scaled / y_log are only assigned in a later cell (In[57]), so a
# top-to-bottom run used to fail here with a NameError. Build them from the
# same inputs: the first n_train rows of pipeline_data are the training rows.
n_train = train.shape[0]
X_scaled = StandardScaler().fit_transform(pipeline_data[:n_train])
y_log = np.log(train.SalePrice)
lasso = Lasso(alpha=0.001)
lasso.fit(X_scaled, y_log)
# In[51]:
# One row per feature: the pipeline's column names indexed against the
# fitted Lasso coefficients.
FI_lasso = pd.DataFrame(
    {"Feature Importance": lasso.coef_}, index=pipeline_data.columns
)
# In[52]:
FI_lasso.sort_values("Feature Importance", ascending=False)  # highest first
# In[53]:
# Visualise the non-zero coefficients; kind="barh" draws horizontal bars.
FI_lasso[FI_lasso["Feature Importance"] != 0].sort_values("Feature Importance").plot(kind="barh", figsize=(15, 25))
plt.xticks(rotation=90)
plt.show()
# ## 得到特征重要性圖之後就可以進行特征選擇與重做
# In[54]:
class add_feature(BaseEstimator, TransformerMixin):
    """Append hand-crafted area and interaction features.

    Parameters
    ----------
    additional : int, default 1
        ``1`` adds only ``TotalHouse`` and ``TotalArea``. Any other value also
        adds the interaction and aggregate columns listed in ``transform``.

    NOTE(review): the extended mode reads ``oMSZoning``, ``oNeighborhood``,
    ``oFunctional`` and ``oCondition1``. Nothing in the visible file creates
    those columns, so with this file's data the extended mode would raise a
    ``KeyError`` -- confirm where they are supposed to come from.
    """

    def __init__(self, additional=1):
        self.additional = additional

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step fits in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the extra columns; the input is not modified."""
        X = X.copy()
        # These two totals are produced in both modes.
        X["TotalHouse"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"]
        X["TotalArea"] = X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"]
        if self.additional == 1:
            return X

        X["+_TotalHouse_OverallQual"] = X["TotalHouse"] * X["OverallQual"]
        X["+_GrLivArea_OverallQual"] = X["GrLivArea"] * X["OverallQual"]
        X["+_oMSZoning_TotalHouse"] = X["oMSZoning"] * X["TotalHouse"]
        X["+_oMSZoning_OverallQual"] = X["oMSZoning"] + X["OverallQual"]
        X["+_oMSZoning_YearBuilt"] = X["oMSZoning"] + X["YearBuilt"]
        X["+_oNeighborhood_TotalHouse"] = X["oNeighborhood"] * X["TotalHouse"]
        X["+_oNeighborhood_OverallQual"] = X["oNeighborhood"] + X["OverallQual"]
        X["+_oNeighborhood_YearBuilt"] = X["oNeighborhood"] + X["YearBuilt"]
        X["+_BsmtFinSF1_OverallQual"] = X["BsmtFinSF1"] * X["OverallQual"]
        X["-_oFunctional_TotalHouse"] = X["oFunctional"] * X["TotalHouse"]
        X["-_oFunctional_OverallQual"] = X["oFunctional"] + X["OverallQual"]
        X["-_LotArea_OverallQual"] = X["LotArea"] * X["OverallQual"]
        X["-_TotalHouse_LotArea"] = X["TotalHouse"] + X["LotArea"]
        X["-_oCondition1_TotalHouse"] = X["oCondition1"] * X["TotalHouse"]
        X["-_oCondition1_OverallQual"] = X["oCondition1"] + X["OverallQual"]
        X["Bsmt"] = X["BsmtFinSF1"] + X["BsmtFinSF2"] + X["BsmtUnfSF"]
        X["Rooms"] = X["FullBath"] + X["TotRmsAbvGrd"]
        X["PorchArea"] = X["OpenPorchSF"] + X["EnclosedPorch"] + X["3SsnPorch"] + X["ScreenPorch"]
        X["TotalPlace"] = (
            X["TotalBsmtSF"] + X["1stFlrSF"] + X["2ndFlrSF"] + X["GarageArea"]
            + X["OpenPorchSF"] + X["EnclosedPorch"] + X["3SsnPorch"] + X["ScreenPorch"]
        )
        return X
# In[55]:
# Extended pipeline that also adds the engineered features.
# NOTE(review): this pipeline is only displayed below; the visible code never
# fits it, and `pipeline_data` still comes from the earlier two-step pipeline.
pipe = Pipeline([
    ('labenc', labelenc()),
    ('add_feature', add_feature(additional=2)),
    ('skew_dummies', skew_dummies(skew=4)),
])
# In[56]:
pipe
# In[57]:
n_train = train.shape[0]  # number of training rows
X = pipeline_data[:n_train]  # processed training rows
test_X = pipeline_data[n_train:]  # everything after them is the test set
y = train.SalePrice
# Fit the scaler on the training rows only and reuse it for the test rows.
# Fitting a second scaler on the test set (as before) puts train and test
# features on different scales, so the model would see shifted inputs at
# prediction time.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
y_log = np.log(train.SalePrice)  # log target: closer to a normal distribution
test_X_scaled = scaler.transform(test_X)
# ## 模型的構建
# #### Baseline model: decision-tree regressor
# (the original heading said "linear regression", but the code below fits a
# DecisionTreeRegressor)
# In[58]:
from sklearn.tree import DecisionTreeRegressor  # import the model
# In[59]:
model = DecisionTreeRegressor()
# In[60]:
# Fit on the scaled training features against the log-transformed price.
model1 =model.fit(X_scaled,y_log)
# ## 前期比較簡單的處理得到結果，並沒有進行模型的堆疊
# In[61]:
#predict = modexp.predict(test_x)
# In[62]:
# result=pd.DataFrame({'Id':test.Id, 'SalePrice':predict})
# result.to_csv("submission1.csv",index=False)
# In[63]:
# predict = np.exp(model1.predict(test_X_scaled))#np.exp是對上面的對數變換之後的反變換
# In[64]:
# result=pd.DataFrame({'Id':test.Id, 'SalePrice':predict})
# result.to_csv("submission.csv",index=False)
# ## 模型的堆疊與集成並且選擇最優參數，模型和評估方式
# In[65]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold#交叉驗證，網格搜索，k折驗證
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
# In[66]:
#定義交叉驗證的策略，以及評估函數
def rmse_cv(model, X, y):
    """Return the per-fold RMSE of ``model`` under 5-fold cross-validation.

    ``cross_val_score`` reports the negated MSE for this scoring name, so the
    sign is flipped before taking the square root.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=5
    )
    return np.sqrt(-neg_mse_per_fold)
# In[67]:
# Candidate regressors to compare; the order matches `names` below.
models = [
    LinearRegression(),
    Ridge(),
    Lasso(alpha=0.01, max_iter=10000),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
    SVR(),
    LinearSVR(),
    ElasticNet(alpha=0.001, max_iter=10000),
    SGDRegressor(max_iter=1000, tol=1e-3),
    BayesianRidge(),
    KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
    ExtraTreesRegressor(),
    XGBRegressor(),
]
# In[68]:
names = [
    "LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR",
    "Ela", "SGD", "Bay", "Ker", "Extra", "Xgb",
]
# Print "<name>: <mean RMSE>, <std of RMSE>" for every candidate.
for name, model in zip(names, models):
    score = rmse_cv(model, X_scaled, y_log)
    print(f"{name}: {score.mean():.6f}, {score.std():.4f}")
# In[69]:
#定義交叉方式，先指定模型後指定參數，方便測試多個模型，網格交叉驗證
class grid():
    """Grid-search helper: bind the model once, then try parameter grids on it."""

    def __init__(self, model):
        self.model = model

    def grid_get(self, X, y, param_grid):
        """Run a 5-fold grid search and print the results as RMSE.

        Prints the best parameter set with its RMSE, followed by a table of
        every candidate's parameters, mean RMSE and ``std_test_score`` (the
        latter is printed exactly as reported by the search).
        """
        searcher = GridSearchCV(
            self.model, param_grid, cv=5, scoring="neg_mean_squared_error"
        )
        searcher.fit(X, y)
        best_rmse = np.sqrt(-searcher.best_score_)
        print(searcher.best_params_, best_rmse)
        report = pd.DataFrame(searcher.cv_results_)
        # Scores are negated MSE; convert the mean column to RMSE.
        report['mean_test_score'] = np.sqrt(-report['mean_test_score'])
        print(report[['params', 'mean_test_score', 'std_test_score']])
# In[70]:
# Lasso: search the regularisation strength alpha.
grid(Lasso()).grid_get(X_scaled,y_log,{
    'alpha': [0.0004,0.0005,0.0007,0.0006,0.0009,0.0008],'max_iter':[10000]})
# In[71]:
# Ridge: search alpha.
grid(Ridge()).grid_get(X_scaled,y_log,{
    'alpha':[35,40,45,50,55,60,65,70,80,90]})
# In[72]:
grid(SVR()).grid_get(X_scaled,y_log,{
    'C':[11,12,13,14,15],'kernel':["rbf"],"gamma":[0.0003,0.0004],"epsilon":[0.008,0.009]})  # support-vector regression
# In[73]:
param_grid={
    'alpha':[0.2,0.3,0.4,0.5], 'kernel':["polynomial"], 'degree':[3],'coef0':[0.8,1,1.2]}  # parameter grid, expressed as a dict
grid(KernelRidge()).grid_get(X_scaled,y_log,param_grid)
# In[74]:
# ElasticNet: search alpha together with l1_ratio.
grid(ElasticNet()).grid_get(X_scaled,y_log,{
    'alpha':[0.0005,0.0008,0.004,0.005],'l1_ratio':[0.08,0.1,0.3,0.5,0.7],'max_iter':[10000]})
# In[75]:
#定義加權平均值，就相當於自己寫fit_transform（）
class AverageWeight(BaseEstimator, RegressorMixin):
    """Ensemble whose prediction is a weighted sum of its members' predictions.

    Parameters
    ----------
    mod : sequence of estimators
        Base regressors; each one is cloned and fitted in ``fit``.
    weight : sequence of float
        One weight per base regressor, in the same order as ``mod``.
    """

    def __init__(self, mod, weight):
        self.mod = mod
        self.weight = weight

    def fit(self, X, y):
        """Clone every base regressor and fit the clones on ``(X, y)``.

        Raises
        ------
        ValueError
            If ``mod`` and ``weight`` differ in length. Previously the longer
            one was silently truncated at prediction time.
        """
        if len(self.mod) != len(self.weight):
            raise ValueError(
                "mod and weight must have the same length, got "
                f"{len(self.mod)} and {len(self.weight)}"
            )
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Return the weighted sum of the fitted models' predictions as an array."""
        # pred has one row per model and one column per sample.
        pred = np.array([model.predict(X) for model in self.models_])
        # A single matrix product replaces the former per-sample Python loop
        # and returns an ndarray instead of a list.
        return np.asarray(self.weight) @ pred
# In[76]:
#指定每一個算法的參數
# Fix each algorithm's hyper-parameters (values taken from the grids searched
# above).
lasso = Lasso(alpha=0.0005, max_iter=10000)
ridge = Ridge(alpha=60)
svr = SVR(gamma=0.0004, kernel='rbf', C=13, epsilon=0.009)
ker = KernelRidge(alpha=0.2, kernel='polynomial', degree=3, coef0=0.8)
ela = ElasticNet(alpha=0.005, l1_ratio=0.08, max_iter=10000)
bay = BayesianRidge()
# In[77]:
# Six weights, one per model, in the order the models are passed below
w1 = 0.02
w2 = 0.2
w3 = 0.25
w4 = 0.3
w5 = 0.03
w6 = 0.2
# In[78]:
weight_avg = AverageWeight(mod=[lasso, ridge, svr, ker, ela, bay], weight=[w1, w2, w3, w4, w5, w6])
# In[79]:
# Run the cross-validation once and reuse the scores for the mean, instead of
# refitting every model a second time.
weight_avg_scores = rmse_cv(weight_avg, X_scaled, y_log)
weight_avg_scores, weight_avg_scores.mean()
# ## 模型的堆疊
# In[80]:
class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    """Two-level stacked regressor.

    Every base model in ``mod`` is trained fold by fold; its out-of-fold
    predictions become one input column for ``meta_model``.

    Parameters
    ----------
    mod : sequence of estimators
        First-level regressors (cloned for every fold).
    meta_model : estimator
        Second-level regressor, fitted on the out-of-fold predictions.

    ``X`` and ``y`` are indexed positionally with integer arrays
    (``X[train_index]``), so they must support that form of indexing.
    """

    def __init__(self, mod, meta_model):
        self.mod = mod
        self.meta_model = meta_model
        # 5-fold shuffled split with a fixed seed.
        self.kf = KFold(n_splits=5, random_state=42, shuffle=True)

    def fit(self, X, y):
        """Fit the base models per fold, then the meta model on their OOF predictions."""
        # saved_model[i] collects the per-fold clones of base model i.
        self.saved_model = [list() for i in self.mod]
        oof_train = np.zeros((X.shape[0], len(self.mod)))
        for i, model in enumerate(self.mod):
            for train_index, val_index in self.kf.split(X, y):
                renew_model = clone(model)
                renew_model.fit(X[train_index], y[train_index])
                self.saved_model[i].append(renew_model)
                # Predict only the held-out fold.
                oof_train[val_index, i] = renew_model.predict(X[val_index])
        self.meta_model.fit(oof_train, y)
        return self

    def predict(self, X):
        """Average each base model's per-fold predictions and feed them to the meta model."""
        # The stack functions are given lists: current NumPy rejects a bare
        # generator with a TypeError.
        whole_test = np.column_stack([
            np.column_stack([model.predict(X) for model in single_model]).mean(axis=1)
            for single_model in self.saved_model
        ])
        return self.meta_model.predict(whole_test)

    def get_oof(self, X, y, test_X):
        """Return ``(oof, test_mean)`` without fitting the meta model.

        ``oof`` has one column per base model holding its out-of-fold
        predictions for ``X``; ``test_mean`` has one column per base model
        holding its fold-averaged predictions for ``test_X``.
        """
        # Materialise the folds once; the width of the per-fold buffer then
        # follows the splitter instead of a hard-coded 5.
        folds = list(self.kf.split(X, y))
        oof = np.zeros((X.shape[0], len(self.mod)))
        test_single = np.zeros((test_X.shape[0], len(folds)))
        test_mean = np.zeros((test_X.shape[0], len(self.mod)))
        for i, model in enumerate(self.mod):
            for j, (train_index, val_index) in enumerate(folds):
                clone_model = clone(model)
                clone_model.fit(X[train_index], y[train_index])
                oof[val_index, i] = clone_model.predict(X[val_index])
                test_single[:, j] = clone_model.predict(test_X)
            # Every column of test_single was overwritten for this model.
            test_mean[:, i] = test_single.mean(axis=1)
        return oof, test_mean
# In[81]:
#經過預處理之後才能放到堆疊的模型裡面去計算
# The stacked model indexes its inputs positionally, so pass plain arrays;
# SimpleImputer also fills any remaining gaps.
a = SimpleImputer().fit_transform(X_scaled)  # features
b = SimpleImputer().fit_transform(y_log.values.reshape(-1, 1)).ravel()  # target
# In[82]:
# First-level models plus the second-level (meta) model
stack_model = stacking(mod=[lasso, ridge, svr, ker, ela, bay], meta_model=ker)
# In[83]:
# Evaluate once and reuse the scores; each evaluation refits every base model
# on every fold.
stack_scores = rmse_cv(stack_model, a, b)
print(stack_scores)
print(stack_scores.mean())
# In[84]:
# Out-of-fold predictions for the training rows and fold-averaged predictions
# for the test rows
X_train_stack, X_test_stack = stack_model.get_oof(a, b, test_X_scaled)
# In[85]:
X_train_stack.shape, a.shape
# In[86]:
# Append the stacked predictions to the original features as extra columns
X_train_add = np.hstack((a, X_train_stack))
X_test_add = np.hstack((test_X_scaled, X_test_stack))
X_train_add.shape, X_test_add.shape
# In[87]:
stack_add_scores = rmse_cv(stack_model, X_train_add, b)
print(stack_add_scores)
print(stack_add_scores.mean())
# In[88]:
stack_model = stacking(mod=[lasso, ridge, svr, ker, ela, bay], meta_model=ker)
# In[89]:
stack_model.fit(a, b)  # train on the full training set
# In[90]:
# np.exp undoes the log applied to the target
pred = np.exp(stack_model.predict(test_X_scaled))
# In[91]:
result = pd.DataFrame({'Id': test.Id, 'SalePrice': pred})
result.to_csv("submission3.csv", index=False)