diff --git a/docs/ExpectedReturns.rst b/docs/ExpectedReturns.rst index a1c7247c..960af8cd 100755 --- a/docs/ExpectedReturns.rst +++ b/docs/ExpectedReturns.rst @@ -46,6 +46,14 @@ superior models and feed them into the optimizer. the mean historical return. However, if you plan on rebalancing much more frequently, there is a case to be made for lowering the span in order to capture recent trends. + .. autofunction:: james_stein_return + + The James-Stein estimator shrinks each asset's sample mean return towards + the cross-sectional (grand) mean, reducing the estimation error that + mean-variance optimizers are so sensitive to. This is most valuable when + there are many assets relative to the length of the price history. The + shrinkage intensity can be set manually, or estimated from the data. + .. autofunction:: capm_return .. autofunction:: returns_from_prices diff --git a/pypfopt/expected_returns.py b/pypfopt/expected_returns.py index bbbf69c6..9782ffc8 100644 --- a/pypfopt/expected_returns.py +++ b/pypfopt/expected_returns.py @@ -15,6 +15,7 @@ - general return model function, allowing you to run any return model from one function. - mean historical return - exponentially weighted mean historical return + - James-Stein shrinkage estimate of returns - CAPM estimate of returns Additionally, we provide utility functions to convert from returns to prices and vice-versa. @@ -104,6 +105,7 @@ def return_model(prices, method="mean_historical_return", **kwargs): - ``mean_historical_return`` - ``ema_historical_return`` + - ``james_stein_return`` - ``capm_return`` Raises @@ -122,6 +124,8 @@ def return_model(prices, method="mean_historical_return", **kwargs): return ema_historical_return(prices, **kwargs) elif method == "capm_return": return capm_return(prices, **kwargs) + elif method == "james_stein_return": + return james_stein_return(prices, **kwargs) else: raise NotImplementedError("Return model {} not implemented".format(method)) @@ -223,6 +227,110 @@ def ema_historical_return( return returns.ewm(span=span).mean().iloc[-1] * frequency +def james_stein_return( + prices, + returns_data=False, + shrinkage=None, + compounding=True, + frequency=252, + log_returns=False, +): + """ + Estimate annualised expected returns using James-Stein shrinkage. + + The per-asset sample estimate of expected return is shrunk towards the + cross-sectional (grand) mean. This reduces estimation error in the mean + vector, which mean-variance optimisers are notoriously sensitive to, and + is especially helpful when the number of assets is large relative to the + length of the available price history. + + Parameters + ---------- + prices : pd.DataFrame + adjusted closing prices of the asset, each row is a date + and each column is a ticker/id. + returns_data : bool, optional + if true, the first argument is returns instead of prices. + These **should not** be log returns. Defaults to False. + shrinkage : float, optional + shrinkage intensity in [0, 1]. ``0`` returns the sample estimate + (identical to :func:`mean_historical_return`) and ``1`` returns the + grand mean for every asset. If ``None`` (default), the intensity is + estimated from the data via a James-Stein/SURE rule. + compounding : bool, optional + computes geometric mean returns if True, + arithmetic otherwise. Defaults to True. + frequency : int, optional + number of time periods in a year, defaults to 252 (the number + of trading days in a year) + log_returns : bool, optional + whether to compute using log returns. Defaults to False. + + Returns + ------- + pd.Series + annualised James-Stein shrinkage estimate of expected returns + + Raises + ------ + ValueError + if ``shrinkage`` is supplied and is not in [0, 1] + + References + ---------- + Stein, C. (1956). Inadmissibility of the usual estimator for the mean of a + multivariate normal distribution. *Proc. Third Berkeley Symp. on Math. + Statist. and Prob.*, Vol. 1, 197-206. + """ + if not isinstance(prices, pd.DataFrame): + warnings.warn("prices are not in a dataframe", RuntimeWarning) + prices = pd.DataFrame(prices) + + if shrinkage is not None and not 0 <= shrinkage <= 1: + raise ValueError("shrinkage must be in [0, 1], got {}".format(shrinkage)) + + if returns_data: + returns = prices + else: + returns = returns_from_prices(prices, log_returns) + + _check_returns(returns) + + # Per-asset annualised expected returns: identical to mean_historical_return, + # so that shrinkage=0 recovers the sample estimate exactly. + if compounding: + mu = (1 + returns).prod() ** (frequency / returns.count()) - 1 + else: + mu = returns.mean() * frequency + grand_mean = mu.mean() + + if shrinkage is None: + # Base the James-Stein dimensionality only on assets that carry usable + # information. All-NaN or single-observation columns have no meaningful + # mean/variance and would otherwise inflate ``p`` and bias the shrinkage + # intensity. + valid = (returns.count() >= 2) & np.isfinite(mu) + p = int(valid.sum()) + dispersion = float(((mu[valid] - grand_mean) ** 2).sum()) + if p <= 2 or dispersion <= 1e-12: + # Degenerate case: no meaningful cross-sectional dispersion to shrink, + # or too few assets for the James-Stein rule. Fall back to full + # shrinkage when all means coincide, otherwise none. + shrinkage = 1.0 if dispersion <= 1e-12 else 0.0 + else: + # Variance of each annualised mean estimator (~ frequency**2 * var / n), + # averaged across the valid assets. For the arithmetic (non-compounding) + # mean this frequency**2 factor cancels against the annualised + # dispersion, making the intensity frequency-invariant; for the + # geometric/compounding mean the invariance holds only approximately. + tau_squared = (frequency**2 * returns.var(ddof=1) / returns.count())[ + valid + ].mean() + shrinkage = float(np.clip((p - 2) * tau_squared / dispersion, 0.0, 1.0)) + + return shrinkage * grand_mean + (1 - shrinkage) * mu + + def capm_return( prices, market_prices=None, diff --git a/tests/test_expected_returns.py b/tests/test_expected_returns.py index 4e16583b..aed42e4a 100644 --- a/tests/test_expected_returns.py +++ b/tests/test_expected_returns.py @@ -299,3 +299,97 @@ def test_log_return_passthrough(): except AssertionError: return assert False + + +def test_james_stein_return(): + df = get_data() + js_return = expected_returns.james_stein_return(df) + + assert isinstance(js_return, pd.Series) + assert list(js_return.index) == list(df.columns) + assert js_return.notnull().all() + assert js_return.dtype == "float64" + + +def test_james_stein_return_shrinkage(): + df = get_data() + + # With shrinkage=0 we recover the sample estimate (mean_historical_return). + js_return_0 = expected_returns.james_stein_return(df, shrinkage=0.0) + sample_mean = expected_returns.mean_historical_return(df) + np.testing.assert_array_almost_equal(js_return_0.values, sample_mean.values) + + # With shrinkage=1 every asset collapses to the grand mean. + js_return_1 = expected_returns.james_stein_return(df, shrinkage=1.0) + assert (js_return_1 == js_return_1.iloc[0]).all() + + # A manual intermediate intensity still yields a valid Series. + js_return_05 = expected_returns.james_stein_return(df, shrinkage=0.5) + assert isinstance(js_return_05, pd.Series) + + +def test_james_stein_return_auto_shrinkage(): + df = get_data() + js_return_auto = expected_returns.james_stein_return(df, shrinkage=None) + assert isinstance(js_return_auto, pd.Series) + assert js_return_auto.notnull().all() + + +def test_james_stein_return_invalid_shrinkage(): + df = get_data() + with pytest.raises(ValueError): + expected_returns.james_stein_return(df, shrinkage=-0.1) + with pytest.raises(ValueError): + expected_returns.james_stein_return(df, shrinkage=1.5) + + +def test_james_stein_return_compounding(): + df = get_data() + js_compound = expected_returns.james_stein_return(df, compounding=True) + js_simple = expected_returns.james_stein_return(df, compounding=False) + assert isinstance(js_compound, pd.Series) + assert isinstance(js_simple, pd.Series) + assert js_compound.notnull().all() + assert js_simple.notnull().all() + + +def test_james_stein_return_frequency(): + df = get_data() + js_annual = expected_returns.james_stein_return( + df, compounding=False, frequency=252 + ) + js_monthly = expected_returns.james_stein_return( + df, compounding=False, frequency=12 + ) + np.testing.assert_array_almost_equal(js_annual / 252, js_monthly / 12) + + +def test_james_stein_return_with_return_model(): + df = get_data() + js_direct = expected_returns.james_stein_return(df, shrinkage=0.3) + js_dispatch = expected_returns.return_model( + df, method="james_stein_return", shrinkage=0.3 + ) + pd.testing.assert_series_equal(js_direct, js_dispatch) + + +def test_james_stein_return_data_warning(): + df = get_data() + with pytest.warns(RuntimeWarning): + js_return = expected_returns.james_stein_return(df.to_numpy()) + assert isinstance(js_return, pd.Series) + + +def test_james_stein_return_log_returns(): + df = get_data() + js_normal = expected_returns.james_stein_return(df, log_returns=False) + js_log = expected_returns.james_stein_return(df, log_returns=True) + assert not (js_normal == js_log).all() + + +def test_james_stein_return_returns_data(): + df = get_data() + returns = expected_returns.returns_from_prices(df) + js_prices = expected_returns.james_stein_return(df) + js_returns = expected_returns.james_stein_return(returns, returns_data=True) + pd.testing.assert_series_equal(js_prices, js_returns)