Coverage for src / cvx / risk / linalg / pca.py: 100%

21 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-09 03:39 +0000

1# Copyright 2023 Stanford University Convex Optimization Group 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14"""PCA analysis (pure NumPy implementation). 

15 

16This module provides Principal Component Analysis (PCA) for dimensionality 

17reduction of return data. PCA is commonly used to construct factor models 

18for portfolio optimization. 

19 

20Example: 

21 Perform PCA on stock returns: 

22 

23 >>> import numpy as np 

24 >>> import pandas as pd 

25 >>> from cvx.risk.linalg import pca 

26 >>> # Create sample returns data 

27 >>> np.random.seed(42) 

28 >>> returns = pd.DataFrame( 

29 ... np.random.randn(100, 5), 

30 ... columns=['A', 'B', 'C', 'D', 'E'] 

31 ... ) 

32 >>> # Compute PCA with 3 components 

33 >>> result = pca(returns, n_components=3) 

34 >>> # Access explained variance 

35 >>> len(result.explained_variance) 

36 3 

37 >>> # Access factors (principal components) 

38 >>> result.factors.shape 

39 (100, 3) 

40 >>> # Access factor exposures (loadings) 

41 >>> result.exposure.shape 

42 (3, 5) 

43 

44""" 

45 

46from __future__ import annotations 

47 

48from collections import namedtuple 

49 

50import numpy as np 

51import pandas as pd 

52 

PCA = namedtuple(
    "PCA",
    ["explained_variance", "factors", "exposure", "cov", "systematic", "idiosyncratic"],
)
# Assign the documentation to ``__doc__`` explicitly. A bare string literal placed
# after the assignment is only a discarded expression statement: it would leave
# ``PCA.__doc__`` at the auto-generated "PCA(explained_variance, ...)" text, so
# ``help(PCA)`` would not show it and doctest would never execute the example.
PCA.__doc__ = """Named tuple containing the results of PCA analysis.

Attributes:
    explained_variance: Explained variance ratio for each component.
        An array of shape (n_components,) where each element represents
        the proportion of total variance explained by that component.
    factors: Factor returns (principal components) as a DataFrame.
        Shape is (n_samples, n_components). Each column is a factor.
    exposure: Factor exposures (loadings) for each asset as a DataFrame.
        Shape is (n_components, n_assets). Each row contains the loadings
        of one component on all assets.
    cov: Covariance matrix of the factors as a DataFrame.
        Shape is (n_components, n_components).
    systematic: Systematic returns explained by the factors as a DataFrame.
        Shape is (n_samples, n_assets). This is the part of returns
        explained by the factor model.
    idiosyncratic: Idiosyncratic returns not explained by factors as a DataFrame.
        Shape is (n_samples, n_assets). This is the residual part of returns.

Example:
    >>> import numpy as np
    >>> import pandas as pd
    >>> from cvx.risk.linalg import pca
    >>> np.random.seed(42)
    >>> returns = pd.DataFrame(np.random.randn(50, 4))
    >>> result = pca(returns, n_components=2)
    >>> # Explained variance of a truncated model sums to less than 1
    >>> bool(result.explained_variance.sum() < 1)
    True
    >>> # Systematic + idiosyncratic approximately equals original
    >>> np.allclose(
    ...     result.systematic.values + result.idiosyncratic.values,
    ...     returns.values,
    ...     atol=1e-10
    ... )
    True

"""

95 

96 

def pca(returns: pd.DataFrame, n_components: int = 10) -> PCA:
    """Compute the first n principal components for a return matrix using SVD.

    This function performs Principal Component Analysis on asset returns to
    extract the main sources of variance. The results can be used to construct
    a factor model for portfolio optimization.

    Args:
        returns: DataFrame of asset returns with shape (n_samples, n_assets).
            Rows represent time periods, columns represent assets.
        n_components: Number of principal components to extract. Defaults to 10.
            Must lie between 0 and min(n_samples, n_assets), which is the
            number of components the reduced SVD can provide.

    Returns:
        PCA named tuple containing:
            - explained_variance: Ratio of variance explained by each component
            - factors: Factor returns (scores)
            - exposure: Factor exposures (loadings)
            - cov: Factor covariance matrix
            - systematic: Returns explained by factors
            - idiosyncratic: Residual returns

    Raises:
        ValueError: If ``n_components`` is negative or larger than
            min(n_samples, n_assets).

    Example:
        Basic PCA on synthetic returns:

        >>> import numpy as np
        >>> import pandas as pd
        >>> from cvx.risk.linalg import pca
        >>> np.random.seed(42)
        >>> # Create returns with 100 periods and 10 assets
        >>> returns = pd.DataFrame(np.random.randn(100, 10))
        >>> result = pca(returns, n_components=3)
        >>> # First component explains most variance
        >>> bool(result.explained_variance[0] > result.explained_variance[1])
        True
        >>> # Factors are orthogonal
        >>> factor_corr = np.corrcoef(result.factors.T)
        >>> bool(np.allclose(factor_corr, np.eye(3), atol=0.1))
        True

        Using PCA results for a factor model:

        >>> from cvx.risk.factor import FactorModel
        >>> import cvxpy as cp
        >>> model = FactorModel(assets=10, k=3)
        >>> model.update(
        ...     exposure=result.exposure.values,
        ...     cov=result.cov.values,
        ...     idiosyncratic_risk=result.idiosyncratic.std().values,
        ...     lower_assets=np.zeros(10),
        ...     upper_assets=np.ones(10),
        ...     lower_factors=-np.ones(3),
        ...     upper_factors=np.ones(3)
        ... )

        Verifying variance decomposition (systematic + idiosyncratic = total):

        >>> np.random.seed(123)
        >>> returns = pd.DataFrame(np.random.randn(50, 5))
        >>> result = pca(returns, n_components=3)
        >>> # Systematic variance + idiosyncratic variance ≈ total variance
        >>> total_var = returns.var().sum()
        >>> systematic_var = result.systematic.var().sum()
        >>> idio_var = result.idiosyncratic.var().sum()
        >>> # Note: small differences due to demeaning
        >>> bool(np.isclose(systematic_var + idio_var, total_var, rtol=0.1))
        True

        Exposure matrix has orthonormal rows (loadings are orthogonal):

        >>> np.random.seed(42)
        >>> returns = pd.DataFrame(np.random.randn(100, 6))
        >>> result = pca(returns, n_components=3)
        >>> # exposure @ exposure^T should be identity (orthonormal loadings)
        >>> VtV = result.exposure.values @ result.exposure.values.T
        >>> bool(np.allclose(VtV, np.eye(3), atol=1e-10))
        True

        Explained variance is ordered (first component explains most):

        >>> all(result.explained_variance[i] >= result.explained_variance[i+1]
        ...     for i in range(len(result.explained_variance)-1))
        True

        Reconstructing returns from factors and exposures:

        >>> # systematic = factors @ exposure (plus mean)
        >>> reconstructed = result.factors.values @ result.exposure.values
        >>> # Should match systematic (centered part)
        >>> centered_systematic = result.systematic.values - returns.values.mean(axis=0)
        >>> bool(np.allclose(reconstructed, centered_systematic, atol=1e-10))
        True

        Requesting more components than the data can provide is rejected:

        >>> pca(pd.DataFrame(np.random.randn(20, 4)), n_components=10)
        Traceback (most recent call last):
            ...
        ValueError: n_components=10 must be between 0 and min(n_samples, n_assets)=4

    """
    # Demean the returns
    x = returns.to_numpy()
    x_mean = x.mean(axis=0)
    x_centered = x - x_mean

    # The reduced SVD yields exactly min(n_samples, n_assets) components. Asking
    # for more would leave fewer score columns than "PC{i}" labels, and a negative
    # count would silently drop trailing components via slicing; both used to
    # surface as an opaque pandas shape-mismatch error, so fail early instead.
    max_components = min(x.shape)
    if not 0 <= n_components <= max_components:
        raise ValueError(
            f"n_components={n_components} must be between 0 and min(n_samples, n_assets)={max_components}"
        )

    # Singular Value Decomposition
    # x = u s V^T, where columns of V are principal axes
    u, s_full, vt = np.linalg.svd(x_centered, full_matrices=False)

    # Take only the first n components
    u = u[:, :n_components]
    s = s_full[:n_components]
    vt = vt[:n_components, :]

    # Factor exposures (loadings): each component's weight per asset
    exposure = pd.DataFrame(vt, columns=returns.columns)

    # Factor returns (scores): projection of data onto components
    scores = u * s
    factors = pd.DataFrame(
        scores,
        index=returns.index,
        columns=[f"PC{i + 1}" for i in range(n_components)],
    )

    # Explained variance ratio (normalize by total variance across ALL components)
    explained_variance = (s**2) / np.sum(s_full**2)

    # Covariance of factor returns
    cov = factors.cov()

    # Rank-k reconstruction of the centered returns, shared by both parts below
    fitted = scores @ vt

    # Systematic returns: the reconstruction with the column means added back
    systematic = pd.DataFrame(
        data=fitted + x_mean,
        index=returns.index,
        columns=returns.columns,
    )
    # Idiosyncratic returns: what the retained components do not explain
    idiosyncratic = pd.DataFrame(
        data=x_centered - fitted,
        index=returns.index,
        columns=returns.columns,
    )

    return PCA(
        explained_variance=explained_variance,
        factors=factors,
        exposure=exposure,
        cov=cov,
        systematic=systematic,
        idiosyncratic=idiosyncratic,
    )