Coverage for src/cvx/risk/linalg/pca.py: 100%

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

"""PCA analysis (pure NumPy implementation).

This module provides Principal Component Analysis (PCA) for dimensionality
reduction of return data. PCA is commonly used to construct factor models
for portfolio optimization.

Example:
    Perform PCA on stock returns:

    >>> import numpy as np
    >>> import pandas as pd
    >>> from cvx.risk.linalg import pca
    >>> # Create sample returns data
    >>> np.random.seed(42)
    >>> returns = pd.DataFrame(
    ...     np.random.randn(100, 5),
    ...     columns=['A', 'B', 'C', 'D', 'E']
    ... )
    >>> # Compute PCA with 3 components
    >>> result = pca(returns, n_components=3)
    >>> # Access explained variance
    >>> len(result.explained_variance)
    3
    >>> # Access factors (principal components)
    >>> result.factors.shape
    (100, 3)
    >>> # Access factor exposures (loadings)
    >>> result.exposure.shape
    (3, 5)

"""

46from __future__ import annotations

48from collections import namedtuple

50import numpy as np

51import pandas as pd

PCA = namedtuple(
    "PCA",
    ["explained_variance", "factors", "exposure", "cov", "systematic", "idiosyncratic"],
)
# The documentation is assigned to ``__doc__`` explicitly. A bare string literal
# placed after an assignment is evaluated and thrown away, so ``help(PCA)`` and
# doctest would otherwise only see the auto-generated ``PCA(explained_variance, ...)``.
PCA.__doc__ = """Named tuple containing the results of PCA analysis.

Attributes:
    explained_variance: Explained variance ratio for each component.
        An array of shape (n_components,) where each element represents
        the proportion of total variance explained by that component.
    factors: Factor returns (principal components) as a DataFrame.
        Shape is (n_samples, n_components). Each column is a factor.
    exposure: Factor exposures (loadings) for each asset as a DataFrame.
        Shape is (n_components, n_assets). Each row contains the loadings
        of one component on all assets.
    cov: Covariance matrix of the factors as a DataFrame.
        Shape is (n_components, n_components).
    systematic: Systematic returns explained by the factors as a DataFrame.
        Shape is (n_samples, n_assets). This is the part of returns
        explained by the factor model.
    idiosyncratic: Idiosyncratic returns not explained by factors as a DataFrame.
        Shape is (n_samples, n_assets). This is the residual part of returns.

Example:
    >>> import numpy as np
    >>> import pandas as pd
    >>> from cvx.risk.linalg import pca
    >>> np.random.seed(42)
    >>> returns = pd.DataFrame(np.random.randn(50, 4))
    >>> result = pca(returns, n_components=2)
    >>> # Check explained variance sums to less than 1
    >>> bool(result.explained_variance.sum() < 1)
    True
    >>> # Systematic + idiosyncratic approximately equals original
    >>> np.allclose(
    ...     result.systematic.values + result.idiosyncratic.values,
    ...     returns.values,
    ...     atol=1e-10
    ... )
    True

"""

def pca(returns: pd.DataFrame, n_components: int = 10) -> PCA:
    """Compute the first n principal components for a return matrix using SVD.

    This function performs Principal Component Analysis on asset returns to
    extract the main sources of variance. The results can be used to construct
    a factor model for portfolio optimization.

    Args:
        returns: DataFrame of asset returns with shape (n_samples, n_assets).
            Rows represent time periods, columns represent assets.
        n_components: Number of principal components to extract. Defaults to 10.
            If it exceeds the number of components the decomposition can
            provide, ``min(n_samples, n_assets)``, it is capped at that number
            so the default works for small universes.

    Returns:
        PCA named tuple containing:
            - explained_variance: Ratio of variance explained by each component
            - factors: Factor returns (scores)
            - exposure: Factor exposures (loadings)
            - cov: Factor covariance matrix
            - systematic: Returns explained by factors
            - idiosyncratic: Residual returns

    Raises:
        ValueError: If ``n_components`` is negative.

    Example:
        Basic PCA on synthetic returns:

        >>> import numpy as np
        >>> import pandas as pd
        >>> from cvx.risk.linalg import pca
        >>> np.random.seed(42)
        >>> # Create returns with 100 periods and 10 assets
        >>> returns = pd.DataFrame(np.random.randn(100, 10))
        >>> result = pca(returns, n_components=3)
        >>> # First component explains most variance
        >>> bool(result.explained_variance[0] > result.explained_variance[1])
        True
        >>> # Factors are orthogonal
        >>> factor_corr = np.corrcoef(result.factors.T)
        >>> bool(np.allclose(factor_corr, np.eye(3), atol=0.1))
        True

        Requesting more components than the data supports is capped:

        >>> small = pd.DataFrame(np.random.randn(20, 3))
        >>> pca(small).factors.shape
        (20, 3)

        Using PCA results for a factor model:

        >>> from cvx.risk.factor import FactorModel
        >>> import cvxpy as cp
        >>> model = FactorModel(assets=10, k=3)
        >>> model.update(
        ...     exposure=result.exposure.values,
        ...     cov=result.cov.values,
        ...     idiosyncratic_risk=result.idiosyncratic.std().values,
        ...     lower_assets=np.zeros(10),
        ...     upper_assets=np.ones(10),
        ...     lower_factors=-np.ones(3),
        ...     upper_factors=np.ones(3)
        ... )

    """
    # A negative count would silently slice from the end of the component
    # arrays; reject it up front instead.
    if n_components < 0:
        raise ValueError(f"n_components must be non-negative, got {n_components}")

    # Demean the returns
    x = returns.to_numpy()
    x_mean = x.mean(axis=0)
    x_centered = x - x_mean

    # Singular Value Decomposition
    # x = u s V^T, where columns of V are principal axes
    u, s_full, vt = np.linalg.svd(x_centered, full_matrices=False)

    # The thin SVD yields len(s_full) = min(n_samples, n_assets) components.
    # Cap the request so the column labels below always match the data;
    # otherwise the DataFrame constructor fails with a shape mismatch.
    k = min(n_components, s_full.shape[0])

    # Take only the first k components
    u = u[:, :k]
    s = s_full[:k]
    vt = vt[:k, :]

    # Factor returns (scores): projection of data onto components.
    # Computed once and reused for the factors and the reconstruction.
    scores = u * s

    # Factor exposures (loadings): each component's weight per asset
    exposure = pd.DataFrame(vt, columns=returns.columns)

    factors = pd.DataFrame(
        scores,
        index=returns.index,
        columns=[f"PC{i + 1}" for i in range(k)],
    )

    # Explained variance ratio (normalize by total variance across ALL components)
    explained_variance = (s**2) / np.sum(s_full**2)

    # Covariance of factor returns
    cov = factors.cov()

    # Rank-k reconstruction of the centered returns
    reconstruction = scores @ vt

    # Systematic + Idiosyncratic returns; the mean is added back to the
    # systematic part so that systematic + idiosyncratic equals the input.
    systematic = pd.DataFrame(
        data=reconstruction + x_mean,
        index=returns.index,
        columns=returns.columns,
    )
    idiosyncratic = pd.DataFrame(
        data=x_centered - reconstruction,
        index=returns.index,
        columns=returns.columns,
    )

    return PCA(
        explained_variance=explained_variance,
        factors=factors,
        exposure=exposure,
        cov=cov,
        systematic=systematic,
        idiosyncratic=idiosyncratic,
    )

Coverage for src / cvx / risk / linalg / pca.py: 100%

21 statements