Pandas滚动回归：循环的替代方案

from datetime import date

import numpy as np
import pandas as pd
import statsmodels.api as sm
from pandas_datareader.data import DataReader

# FRED series id -> column name used in the rest of the script.
syms = {'TWEXBMTH': 'usd', 'T10Y2YM': 'term_spread', 'PCOPPUSDM': 'copper'}

start = date(2000, 1, 1)
data = (DataReader(syms.keys(), 'fred', start)
        .pct_change()
        .dropna())
data = data.rename(columns=syms)
data = data.assign(intercept=1.)  # required by statsmodels OLS


def sliding_windows(x, window):
    """Create rolling/sliding windows of length ``window``.

    Given an array of shape (y, z), return "blocks" of shape
    (y - window + 1, window, z): one block per window position along
    the first axis.
    """
    return np.array([x[i:i + window]
                     for i in range(0, x.shape[0] - window + 1)])


data.head(3)
# Out[33]:
#                  usd  term_spread    copper  intercept
# DATE
# 2000-02-01  0.012573    -1.409091 -0.019972        1.0
# 2000-03-01 -0.000079     2.000000 -0.037202        1.0
# 2000-04-01  0.005642     0.518519 -0.033275        1.0

window = 36
wins = sliding_windows(data.values, window=window)
# Column 0 (usd) is the dependent variable; the remaining columns,
# including the constant, are the regressors.
y, x = wins[:, :, 0], wins[:, :, 1:]

coefs = []
for endog, exog in zip(y, x):
    # OLS is taken from statsmodels.api: the array-based class is not
    # exposed by statsmodels.formula.api in newer statsmodels releases.
    model = sm.OLS(endog, exog).fit()
    # The full set of model attributes gets lost with each loop
    coefs.append(model.params)

df = pd.DataFrame(coefs, columns=data.iloc[:, 1:].columns,
                  index=data.index[window - 1:])

df.head(3)  # rolling 36m coefficients
# Out[70]:
#             term_spread    copper  intercept
# DATE
# 2003-01-01    -0.000122 -0.018426   0.001937
# 2003-02-01     0.000391 -0.015740   0.001597
# 2003-03-01     0.000655 -0.016811   0.001546

1条回答

网友

1楼 · 发布于 2024-10-05 10:21:30

我创建了一个 ols 模块，用来模拟 pandas 中已弃用的 MovingOLS；它位于 pyfinance 包中（即下文示例导入的 pyfinance.ols）。

它有三个核心类：

OLS：静态（单窗口）普通最小二乘回归。输出是 NumPy 数组。
RollingOLS：滚动（多窗口）普通最小二乘回归。输出是高维NumPy数组。
PandasRollingOLS：将 RollingOLS 的结果包装为 pandas 的 Series 和 DataFrame。旨在模仿已弃用的 pandas 模块的外观。

注意，该模块是一个包（我目前正在将其上传到 PyPI）的一部分，并且它需要一个跨包导入。

上面的前两个类完全用NumPy实现，主要使用矩阵代数。RollingOLS还广泛利用广播。属性很大程度上模拟了statsmodels的OLSRegressionResultsWrapper。

例如：

import urllib.parse
import pandas as pd
from pyfinance.ols import PandasRollingOLS

# pandas-datareader can fetch the same series; this example builds the
# FRED CSV request by hand instead.
url = "https://fred.stlouisfed.org/graph/fredgraph.csv"

# FRED series id -> column name used below.
syms = dict(
    TWEXBMTH="usd",
    T10Y2YM="term_spread",
    GOLDAMGBD228NLBM="gold",
)

# Query-string parameters sent to the CSV endpoint.
params = dict(
    fq="Monthly,Monthly,Monthly",
    id=",".join(syms),
    cosd="2000-01-01",
    coed="2019-02-01",
)

# Keep the commas literal so the multi-valued parameters stay readable.
query = urllib.parse.urlencode(params, safe=",")
raw = pd.read_csv(
    f"{url}?{query}",
    na_values={"."},
    parse_dates=["DATE"],
    index_col=0,
)
# Period-over-period changes, incomplete rows dropped, then friendly names.
changes = raw.pct_change().dropna()
data = changes.rename(columns=syms)
print(data.head())
#                  usd  term_spread      gold
# DATE                                       
# 2000-02-01  0.012580    -1.409091  0.057152
# 2000-03-01 -0.000113     2.000000 -0.047034
# 2000-04-01  0.005634     0.518519 -0.023520
# 2000-05-01  0.022017    -0.097561 -0.016675
# 2000-06-01 -0.010116     0.027027  0.036599

# Regress usd on every other column.
y = data["usd"]
x = data.drop(columns="usd")

window = 12  # months
model = PandasRollingOLS(y=y, x=x, window=window)

beta = model.beta  # Coefficients excluding the intercept
print(beta.head())
#             term_spread      gold
# DATE                             
# 2001-01-01     0.000033 -0.054261
# 2001-02-01     0.000277 -0.188556
# 2001-03-01     0.002432 -0.294865
# 2001-04-01     0.002796 -0.334880
# 2001-05-01     0.002448 -0.241902

fstat = model.fstat
print(fstat.head())
# DATE
# 2001-01-01    0.136991
# 2001-02-01    1.233794
# 2001-03-01    3.053000
# 2001-04-01    3.997486
# 2001-05-01    3.855118
# Name: fstat, dtype: float64

rsq = model.rsq  # R-squared
print(rsq.head())
# DATE
# 2001-01-01    0.029543
# 2001-02-01    0.215179
# 2001-03-01    0.404210
# 2001-04-01    0.470432
# 2001-05-01    0.461408
# Name: rsq, dtype: float64

相关问题更多 >

编程相关推荐

热门问题

热门文章