找到给出最高调整后R平方值的所有变量

import pandas as pd
import statsmodels.api as sm

# Sample data set: six candidate predictor columns plus `price`,
# ten observations each.
data = {
    'accommodates': [2, 2, 3, 2, 2, 6, 8, 4, 3, 2],
    'bedrooms': [1, 2, 1, 1, 3, 4, 2, 2, 2, 3],
    'instant_bookable': [1, 0, 1, 1, 1, 1, 0, 0, 0, 1],
    'availability_365': [123, 3, 33, 14, 15, 16, 3, 41, 61, 74],
    'minimum_nights': [3, 12, 1, 4, 6, 7, 2, 3, 6, 10],
    'beds': [2, 2, 3, 4, 1, 5, 6, 2, 3, 2],
    'price': [59, 234, 15, 162, 56, 42, 28, 52, 22, 31],
}

# `columns` pins the column order explicitly, with `price` last.
df = pd.DataFrame(
    data,
    columns=[
        'accommodates',
        'bedrooms',
        'instant_bookable',
        'availability_365',
        'minimum_nights',
        'beds',
        'price',
    ],
)

# Fit one single-predictor OLS model per candidate column and record its
# R-squared, keyed by the column name.
fit_d = {}

# The response is the same for every fit, so build it once outside the loop.
Y = df['price']

for column in [x for x in df.columns if x != 'price']:
    # add_constant appends the intercept column to the design matrix.
    X = sm.add_constant(df[column])
    model = sm.OLS(Y, X, missing='drop').fit()
    fit_d[column] = model.rsquared

# Bare expression: shows the resulting dict when run in a notebook/REPL.
fit_d

1条回答

网友

1楼 · 发布于 2024-10-05 14:27:14

这里有一种“蛮力”方法：用 itertools 的 combinations 枚举所有可能长度的变量组合，逐一拟合模型，再从中找出 R 平方指标最高的那一组变量。思路是使用两层循环：外层循环遍历要选取的变量个数，内层循环遍历该个数下的所有变量组合。

from itertools import combinations

# Exhaustive ("brute force") best-subset search: fit an OLS model for every
# non-empty combination of predictor columns and keep the combination with
# the highest adjusted R-squared.

# All candidate predictor columns (everything except the response).
cols = [x for x in df.columns if x != 'price']
# The response is the same across all fits.
Y = df['price']
# Maps each tuple of column names to the adjusted R-squared of its model.
fit_d = {}

# Outer loop: subset size, from 1 up to all predictors.
for i in range(1, len(cols) + 1):
    # Inner loop: every combination of exactly i predictors.
    for comb in combinations(cols, i):
        # Design matrix for this combination, with an intercept column.
        X = sm.add_constant(df[list(comb)])
        model = sm.OLS(Y, X, missing='drop').fit()
        # Rank by *adjusted* R-squared. Plain R-squared never decreases when
        # a regressor is added, so it would always select the full set and
        # make the search pointless.
        fit_d[comb] = model.rsquared_adj

# Combination whose model has the highest adjusted R-squared.
key_max = max(fit_d, key=fit_d.get)

print(
    f'Best variables {key_max} for an adjusted R-squared of '
    f'{round(fit_d[key_max], 5)}'
)
# NOTE: when this script ranked by plain R-squared it reported all six
# variables ('accommodates', 'bedrooms', 'instant_bookable',
# 'availability_365', 'minimum_nights', 'beds') with a value of 0.78506.
# Adjusted R-squared penalizes extra regressors, so the winning subset and
# value printed above may differ from that.

相关问题更多 >

编程相关推荐

热门问题

热门文章