= ["SP500", "DTWEXAFEGS"] # "SP500" does not contain dividends; note: "DTWEXM" discontinued as of Jan 2020
factors_r = ["DGS10", "BAMLH0A0HYM2"] factors_d
= ["BAICX"] # fund inception date is "2011-11-28" tickers
= True intercept
Regression analysis
import math
import statsmodels.api as sm
Ordinary least squares
Coefficients
\[ \begin{aligned} \hat{\beta}=(X^\mathrm{T}WX)^{-1}X^\mathrm{T}Wy \end{aligned} \]
def lm_coef(x, y, weights, intercept):
if (intercept): x = sm.add_constant(x)
= np.dot(np.linalg.inv(np.dot(x.T, np.multiply(weights, x))),
result
np.dot(x.T, np.multiply(weights, y)))
return np.ravel(result)
lm_coef(overlap_x_df, overlap_y_df, weights, intercept)
array([-8.29110694e-06, 1.72197207e-01, -6.43201203e-02, 3.36952840e+00,
2.04055999e+00])
if (intercept): overlap_x_df = sm.add_constant(overlap_x_df)
= sm.WLS(overlap_y_df, overlap_x_df, weights = weights).fit()
fit
if (intercept): overlap_x_df = overlap_x_df.iloc[:, 1:]
np.array(fit.params)
array([-8.29110694e-06, 1.72197207e-01, -6.43201203e-02, 3.36952840e+00,
2.04055999e+00])
R-squared
\[ \begin{aligned} R^{2}=\frac{\hat{\beta}^\mathrm{T}(X^\mathrm{T}WX)\hat{\beta}}{y^\mathrm{T}Wy} \end{aligned} \]
def lm_rsq(x, y, weights, intercept):
= np.matrix(lm_coef(x, y, weights, intercept))
coef
if (intercept):
= sm.add_constant(x)
x = x - np.average(x, axis = 0, weights = weights.reshape(-1))
x = y - np.average(y, axis = 0, weights = weights.reshape(-1))
y
= np.dot(coef, np.dot(np.dot(x.T, np.multiply(weights, x)), coef.T)) / \
result
np.dot(y.T, np.multiply(weights, y))
return result.item()
lm_rsq(overlap_x_df, overlap_y_df, weights, intercept)
0.8368528819018047
fit.rsquared
0.8368528819018044
Standard errors
\[ \begin{aligned} \sigma_{\hat{\beta}}^{2}&=\sigma_{\varepsilon}^{2}(X^\mathrm{T}WX)^{-1}\\ &=\frac{(1-R^{2})}{n-p}(X^\mathrm{T}WX)^{-1}\\ &=\frac{SSE}{df_{E}}(X^\mathrm{T}WX)^{-1}\\ \sigma_{\hat{\alpha}}^{2}&=\sigma_{\varepsilon}^{2}\left(\frac{1}{n}+\mu^\mathrm{T}(X^\mathrm{T}WX)^{-1}\mu\right) \end{aligned} \]
def lm_se(x, y, weights, intercept):
= x.shape[0]
n_rows = x.shape[1]
n_cols
= lm_rsq(x, y, weights, intercept)
rsq
if (intercept):
= sm.add_constant(x)
x = y - np.average(y, axis = 0, weights = weights.reshape(-1))
y
= n_rows - n_cols - 1
df_resid
else:
= n_rows - n_cols
df_resid
= np.dot(y.T, np.multiply(weights, y))
var_y = (1 - rsq) * var_y / df_resid
var_resid
= np.sqrt(var_resid * np.linalg.inv(np.dot(x.T, np.multiply(weights, x))).diagonal())
result
return np.ravel(result)
lm_se(overlap_x_df, overlap_y_df, weights, intercept)
array([5.06351899e-05, 2.03221060e-02, 3.66481317e-02, 1.99052033e-01,
1.72678066e-01])
np.array(fit.bse)
array([5.06351899e-05, 2.03221060e-02, 3.66481317e-02, 1.99052033e-01,
1.72678066e-01])
Shapley values
\[ R^{2}_{i}=\sum_{S\subseteq N\setminus\{i\}}{\frac{|S|!\;(n-|S|-1)!}{n!}}(R^{2}(S\cup\{i\})-R^{2}(S)) \]
def lm_shap(x, y, weights, intercept):
= x.shape[0]
n_rows = x.shape[1]
n_cols = 2 ** n_cols
n_combn = np.zeros(n_combn)
n_vec = np.zeros((n_cols, n_combn))
ix_mat = np.zeros(n_combn)
rsq = np.zeros(n_cols)
result
# number of binary combinations
for k in range(n_combn):
= 0
n = k
n_size
# find the binary combination
for j in range(n_cols):
if n_size % 2 == 0:
+= 1
n
= j + 1
ix_mat[j, k]
//= 2
n_size
= n
n_vec[k]
if n > 0:
= np.where(ix_mat[:, k] != 0)[0]
ix_subset = x.iloc[:, ix_subset]
x_subset
= lm_rsq(x_subset, y, weights, intercept)
rsq[k]
# calculate the exact Shapley value for r-squared
for j in range(n_cols):
= np.where(ix_mat[j, :] != 0)[0]
ix_pos = np.where(ix_mat[j, :] == 0)[0]
ix_neg = n_vec[ix_neg]
ix_n = rsq[ix_pos] - rsq[ix_neg]
rsq_diff
for k in range(int(n_combn / 2)):
= int(ix_n[k])
s = math.factorial(s) * math.factorial(n_cols - s - 1) \
weight / math.factorial(n_cols)
+= weight * rsq_diff[k]
result[j]
return result
lm_shap(overlap_x_df, overlap_y_df, weights, intercept)
array([0.28814527, 0.10128527, 0.24069149, 0.20673085])
Principal component regression
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
= 1 comps
Coefficients
\[ \begin{aligned} W_{k}&=\mathbf{X}V_{k}=[\mathbf{X}\mathbf{v}_{1},\ldots,\mathbf{X}\mathbf{v}_{k}]\\ {\widehat{\gamma}}_{k}&=\left(W_{k}^\mathrm{T}W_{k}\right)^{-1}W_{k}^\mathrm{T}\mathbf{Y}\\ {\widehat{\boldsymbol{\beta}}}_{k}&=V_{k}{\widehat{\gamma}}_{k} \end{aligned} \]
def pcr_coef(x, y, comps):
= x - np.average(x, axis = 0)
x = np.linalg.eig(np.cov(x.T, ddof = 1))
L, V = L.argsort()[::-1]
idx = V[:, idx]
V
= np.dot(x, V)
W = np.dot(np.dot(np.linalg.inv(np.dot(W.T, W)), W.T), y)
gamma
= np.dot(V[:, :comps], gamma[:comps])
result
return np.ravel(result)
= (overlap_x_df - np.average(overlap_x_df, axis = 0)) \
scale_x_df / np.std(overlap_x_df, axis = 0, ddof = 1)
pcr_coef(scale_x_df, overlap_y_df, comps)
array([ 0.00075894, -0.00060485, 0.00035895, 0.00058206])
pcr_coef(overlap_x_df, overlap_y_df, comps)
array([ 0.38941837, -0.07539636, 0.00743874, 0.03004974])
= PCA(n_components = len(factors))
pca = pca.fit_transform(scale_x_df)
pca_x_df
= LinearRegression(fit_intercept = False).fit(pca_x_df, overlap_y_df)
fit
= fit.coef_
gamma np.dot(pca.components_.T[:, :comps], gamma.T[:comps]).ravel()
array([ 0.00075894, -0.00060485, 0.00035895, 0.00058206])
R-squared
def pcr_rsq(x, y, comps):
= np.matrix(pcr_coef(x, y, comps))
coef
= x - np.average(x, axis = 0)
x = y - np.average(y, axis = 0)
y
= np.dot(np.dot(coef, np.dot(x.T, x)), coef.T) / np.dot(y.T, y)
result
return result.item()
pcr_rsq(scale_x_df, overlap_y_df, comps)
0.7445836210862191
pcr_rsq(overlap_x_df, overlap_y_df, comps)
0.5939657482901217
Standard errors
\[ \begin{aligned} \text{Var}({\widehat{\boldsymbol{\beta}}}_{k})&=\sigma^{2}V_{k}(W_{k}^\mathrm{T}W_{k})^{-1}V_{k}^\mathrm{T}\\ &=\sigma^{2}V_{k}\text{diag}\left(\lambda_{1}^{-1},\ldots,\lambda_{k}^{-1}\right)V_{k}^\mathrm{T}\\ &=\sigma^{2}\sum_{j=1}^{k}{\frac{\mathbf{v}_{j}\mathbf{v}_{j}^\mathrm{T}}{\lambda_{j}}} \end{aligned} \]
# unable to verify the result
def pcr_se(x, y, comps):
= x.shape[0]
n_rows = x.shape[1]
n_cols
= pcr_rsq(x, y, comps)
rsq
= y - np.average(y, axis = 0)
y
= n_rows - n_cols - 1
df_resid
= np.dot(y.T, y)
var_y = (1 - rsq) * var_y / df_resid
var_resid
# uses statsmodels for illustrative purposes
= sm.multivariate.PCA(x, standardize = False, demean = True)
pca = pca.eigenvals[:comps]
L = pca.eigenvecs.iloc[:, :comps]
V
= np.sqrt(var_resid * np.dot(V, np.dot(np.diag(1 / L), V.T)).diagonal())
result
return np.ravel(result)
pcr_se(scale_x_df, overlap_y_df, comps)
array([2.82830797e-05, 2.25407619e-05, 1.33768755e-05, 2.16911934e-05])
pcr_se(overlap_x_df, overlap_y_df, comps)
array([0.02048654, 0.00396646, 0.00039134, 0.00158086])
Partial least squares
Risk decomposition
Standalone risk
\[ \begin{aligned} \text{SAR}_{k}&=\sqrt{w_{k}^{2}\sigma_{k}^{2}}\\ \text{SAR}_{\varepsilon}&=\sqrt{(1-R^{2})\sigma_{y}^{2}} \end{aligned} \]
def cov_wt(x, weights, center):
= sum(weights)
sum_w = sum(np.power(weights, 2))
sumsq_w
if (center):
= x - np.average(x, axis = 0, weights = weights.reshape(-1))
x
= np.dot(x.T, np.multiply(weights, x)) / (sum_w - sumsq_w / sum_w)
result
return result
def lm_sar(x, y, weights, intercept):
= lm_coef(x, y, weights, intercept)
coef = lm_rsq(x, y, weights, intercept)
rsq
if (intercept): x = sm.add_constant(x)
# sigma = np.cov(np.concatenate((x, y), axis = 1).T,
# aweights = weights.reshape(-1))
= cov_wt(np.concatenate((x, y), axis = 1), weights, intercept)
sigma = np.multiply(np.power(coef, 2).T, sigma[:-1, :-1].diagonal())
sar = (1 - rsq) * sigma[-1, -1]
sar_eps
= np.sqrt(np.concatenate((np.matrix(sigma[-1, -1]),
result
np.matrix(sar),= 1))
np.matrix(sar_eps)), axis
return np.ravel(result)
* np.sqrt(scale["periods"] * scale["overlap"]) lm_sar(overlap_x_df, overlap_y_df, weights, intercept)
array([0.06697784, 0. , 0.0219343 , 0.00367218, 0.03734573,
0.03050915, 0.02705334])
Risk contribution
\[ \begin{aligned} \text{MCR}&=w^\mathrm{T}\frac{\partial\sigma_{y}}{\partial w}\\ &=w^\mathrm{T}\frac{\Sigma w}{\sigma_{y}}\\ \text{MCR}_{\varepsilon}&=\sigma_{y}-\sum_{k=1}^{n}\text{MCR}_{k} \end{aligned} \]
def lm_mcr(x, y, weights, intercept):
= np.matrix(lm_coef(x, y, weights, intercept)).T
coef = lm_rsq(x, y, weights, intercept)
rsq
if (intercept): x = sm.add_constant(x)
# sigma = np.cov(np.concatenate((x, y), axis = 1).T,
# aweights = weights.reshape(-1))
= cov_wt(np.concatenate((x, y), axis = 1), weights, intercept)
sigma = np.multiply(coef, np.dot(sigma[:-1, :-1], coef)) / np.sqrt(sigma[-1, -1])
mcr = np.sqrt(sigma[-1, -1]) - sum(mcr)
mcr_eps
= np.concatenate((np.sqrt(np.matrix(sigma[-1, -1])),
result
np.matrix(mcr).T,= 1)
np.matrix(mcr_eps)), axis
return np.ravel(result)
* np.sqrt(scale["periods"] * scale["overlap"]) lm_mcr(overlap_x_df, overlap_y_df, weights, intercept)
array([ 0.06697784, -0. , 0.01648449, 0.00188136, 0.02052117,
0.01716357, 0.01092724])
Scenario analysis
Implied shocks
\[ \begin{aligned} \hat{\beta}&=(Z^\mathrm{T}WZ)^{-1}Z^\mathrm{T}WX \end{aligned} \]
def implied_shocks(shocks, x, z, weights):
= np.linalg.lstsq(np.multiply(weights, z), np.multiply(weights, x), rcond = None)[0]
beta
= np.dot(shocks, beta)
result
return np.ravel(result)
= np.array([-0.1, 0.1])
shocks = overlap_x_df.iloc[:, [0, 1]] overlap_z_df
implied_shocks(shocks, overlap_x_df, overlap_z_df, weights)
array([-0.1 , 0.1 , -0.00951065, -0.00516176])
Stress P&L
def pnl_stress(shocks, x, y, z, weights, intercept):
= lm_coef(x, y, weights, intercept)
coef
if (intercept): x = sm.add_constant(x)
= np.multiply(coef.T, implied_shocks(shocks, x, z, weights))
result
return np.ravel(result)
pnl_stress(shocks, overlap_x_df, overlap_y_df, overlap_z_df, weights, intercept)
array([ 5.29387044e-05, -1.72197207e-02, -6.43201203e-03, -3.20464075e-02,
-1.05328881e-02])