Using the Variance Inflation Factor (VIF) to find multicollinearity among attributes


I am trying to understand possible multicollinearity among the different attributes/variables of a dataset by using VIF, which is commonly used for regression tasks. For this I am using the Boston Housing dataset. After finding the correlated features, I try to remove them and see the effect on the performance of a linear regression model.
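
For reference (my own paraphrase of the standard definition, not taken from the original post): the VIF of feature j is VIF_j = 1 / (1 - R_j^2), where R_j^2 is the R-squared obtained by regressing feature j on all the remaining features. A VIF near 1 means feature j has little linear relationship with the others, while a large VIF means it is almost a linear combination of them.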

The code I have written is as follows-

import pandas as pd, numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt, seaborn as sns


'''
Regression Problem-
Boston Housing dataset. Target attribute- 'medv'

For details about dataset, refer to-
https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

'''


# Read CSV data file-
boston_data = pd.read_csv("Boston_Housing.csv")

# Get dimension of dataset-
print("\nDimension/Shape of dataset- {0}\n".format(boston_data.shape))
# Dimension/Shape of dataset- (333, 15)


# Get attribute names of dataset-
print("\nAttribute names of dataset are:\n{0}\n\n".format(boston_data.columns.tolist()))
'''
Attribute names of dataset are:
['ID', 'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio',
'black', 'lstat', 'medv']
'''

# Dropping 'ID' attribute from dataset-
boston_data.drop('ID', axis = 1, inplace=True)

print("\nDimension/Shape of dataset after dropping 'ID' =  {0}\n".forma(boston_data.shape))
# Dimension/Shape of dataset after dropping 'ID' =  (333, 14)


# Check for missing values-
print("\nDoes dataset have missing values? {0}\n".format(boston_data.isnull().values.any()))
# Does dataset have missing values? False


# Visualize dataset distribution using boxplots-
sns.boxplot(data=boston_data)

plt.xticks(rotation = 20)
plt.title("Boston Housing dataset distribution using boxplots")
plt.show()


# Create correlogram-

# Compute correlation matrix of dataset-
boston_data_corr = boston_data.corr()                                  

sns.heatmap(boston_data_corr)

plt.xticks(rotation = 20)
plt.title("Boston Housing dataset Correlogram")
plt.show()


# Separate dataset into features (X) and label (y)-
X = boston_data.drop('medv', axis = 1)
y = boston_data['medv']


# Create training and testing sets-
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

print("\nDimensions of training and testing sets are:")
print("X_train = {0}, y_train = {1}, X_test = {2} and y_test = {3}\n\n".format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))
# Dimensions of training and testing sets are:
# X_train = (233, 13), y_train = (233,), X_test = (100, 13) and y_test = (100,)
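
'''
Note (not from the original post): 'train_test_split' above is not seeded,
so every run produces a different train/test partition and therefore
different metrics - which likely explains the two different metric blocks
reported near the end. A minimal tweak, with 42 as an arbitrary seed of my
own choosing, shown commented out so the split above is left untouched:
'''
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)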




# Detect Multicollinearity using VIF-
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Create a pandas Series to get labeled VIF for each attribute-
vif_info = pd.Series(vif, index=X.columns)

print("\nVIF calculated for each attribute in dataset are:\n{0}\n".format(vif_info.sort_values(ascending = True)))
'''
VIF calculated for each attribute in dataset are:
chas        1.131439
zn          2.554112
crim        2.649245
lstat      11.341896
indus      13.801806
rad        14.649941
dis        15.707447
age        21.929478
black      22.853979
tax        56.334001
nox        75.640420
rm         83.056962
ptratio    86.123039
dtype: float64
'''
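
'''
Aside (a property of the statsmodels implementation, not something from the
original post): variance_inflation_factor() expects the design matrix to
already contain an intercept column. Computed on raw X as above, it returns
"uncentered" VIFs, which are typically much larger than the usual centered
ones. A sketch of the centered variant using sm.add_constant:
'''
X_const = sm.add_constant(X)

vif_centered = pd.Series(
    [variance_inflation_factor(X_const.values, i) for i in range(X_const.shape[1])],
    index = X_const.columns)

# Exclude the intercept's own VIF before inspecting the attributes-
print(vif_centered.drop('const').sort_values())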


# Removing collinear attributes/variables-
'''
We create a function to remove the collinear variables.
We choose a threshold of 5 which means if VIF is more than 5 for a particular
variable/attribute then that variable will be removed.
'''
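
'''
The function mentioned above is missing from the listing; the following is
a minimal sketch of what such an iterative filter could look like. The name
'remove_collinear_features' and its exact behaviour are my own choices, not
from the original post.
'''
def remove_collinear_features(df, threshold = 5.0):
    """Iteratively drop the attribute with the highest VIF until all
    remaining attributes have VIF <= threshold."""
    features = df.copy()
    while True:
        vif = pd.Series(
            [variance_inflation_factor(features.values, i) for i in range(features.shape[1])],
            index = features.columns)
        if vif.max() <= threshold:
            return features
        # VIFs change whenever a column is removed, so drop only the single
        # worst offender per iteration and recompute-
        features = features.drop(vif.idxmax(), axis = 1)

# Example usage: X_filtered = remove_collinear_features(X, threshold = 5.0)
# Because VIFs are recomputed after every drop, the surviving set can differ
# from simply thresholding the initial VIF table once.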




# Linear Regression-

# First train a Linear Regression using all attributes to get a baseline score
lr_model = LinearRegression()

lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2s = r2_score(y_test, y_pred)

print("\nLinear Regression model metrics are:")
print("MSE = {0:.4f}, MAE = {1:.4f} & R2-Score = {2:.4f}\n\n".format(mse, mae, r2s))
# Linear Regression model metrics are:
# MSE = 33.7151, MAE = 3.8272 & R2-Score = 0.6831



# Training a Linear Regression using attributes whose VIF <= 5

# Attributes to use for model training using VIF-
filtered_cols = ['chas', 'zn', 'crim']

lr_model_filtered = LinearRegression()

lr_model_filtered.fit(X_train.loc[:, filtered_cols], y_train)

y_pred_filtered = lr_model_filtered.predict(X_test.loc[:, filtered_cols])

mse_f = mean_squared_error(y_test, y_pred_filtered)
mae_f = mean_absolute_error(y_test, y_pred_filtered)
r2s_f = r2_score(y_test, y_pred_filtered)

print("\nLinear Regression model metrics using selected attributes are:")
print("MSE = {0:.4f}, MAE = {1:.4f} & R2-Score = {2:.4f}\n\n".format(mse_f, mae_f, r2s_f))
# Linear Regression model metrics using selected attributes are:
# MSE = 83.6148, MAE = 6.2886 & R2-Score = 0.2141

# A second run produced (the train/test split is not seeded, so the
# metrics vary between runs):
# MSE = 46.3255, MAE = 4.9335 & R2-Score = 0.5646

Now, based on the computed VIF scores, only 3 attributes have a VIF score <= 10, namely 'chas', 'zn' and 'crim' (with this dataset these are also the only attributes satisfying the VIF <= 5 threshold used in the code above, since the next smallest VIF, for 'lstat', is already above 11), because in various statistics literature I have read that attributes with VIF > 10 may be removed.

If I train a linear regression model using only these 3 attributes, the Mean Absolute Error (MAE), Mean Squared Error (MSE) and R2-score are all poor, and they only start improving as I keep adding more attributes back.

So why does the VIF approach not work when using just the 3 attributes with VIF <= 10?

Thanks!

