Coverage for src / cvx / risk / linalg / pca.py: 100%

21 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-15 12:21 +0000

1# Copyright 2023 Stanford University Convex Optimization Group 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14"""PCA analysis (pure NumPy implementation). 

15 

16This module provides Principal Component Analysis (PCA) for dimensionality 

17reduction of return data. PCA is commonly used to construct factor models 

18for portfolio optimization. 

19 

20Example: 

21 Perform PCA on stock returns: 

22 

23 >>> import numpy as np 

24 >>> import pandas as pd 

25 >>> from cvx.risk.linalg import pca 

26 >>> # Create sample returns data 

27 >>> np.random.seed(42) 

28 >>> returns = pd.DataFrame( 

29 ... np.random.randn(100, 5), 

30 ... columns=['A', 'B', 'C', 'D', 'E'] 

31 ... ) 

32 >>> # Compute PCA with 3 components 

33 >>> result = pca(returns, n_components=3) 

34 >>> # Access explained variance 

35 >>> len(result.explained_variance) 

36 3 

37 >>> # Access factors (principal components) 

38 >>> result.factors.shape 

39 (100, 3) 

40 >>> # Access factor exposures (loadings) 

41 >>> result.exposure.shape 

42 (3, 5) 

43 

44""" 

45 

46from __future__ import annotations 

47 

48from collections import namedtuple 

49 

50import numpy as np 

51import pandas as pd 

52 

PCA = namedtuple(
    typename="PCA",
    field_names=(
        "explained_variance",
        "factors",
        "exposure",
        "cov",
        "systematic",
        "idiosyncratic",
    ),
)
"""Immutable record bundling everything produced by a PCA run.

Attributes:
    explained_variance: Share of the total variance captured by each
        retained component, as an array of shape (n_components,).
    factors: DataFrame of factor returns (the principal component scores)
        with shape (n_samples, n_components); one column per factor.
    exposure: DataFrame of factor exposures (loadings) with shape
        (n_components, n_assets); row ``i`` holds the weights of
        component ``i`` on every asset.
    cov: DataFrame holding the covariance matrix of the factor returns,
        shape (n_components, n_components).
    systematic: DataFrame of shape (n_samples, n_assets) with the part of
        the returns that the factor model explains.
    idiosyncratic: DataFrame of shape (n_samples, n_assets) with the
        residual returns the factors leave unexplained.

Example:
    >>> import numpy as np
    >>> import pandas as pd
    >>> from cvx.risk.linalg import pca
    >>> np.random.seed(42)
    >>> returns = pd.DataFrame(np.random.randn(50, 4))
    >>> result = pca(returns, n_components=2)
    >>> # Two of four components cannot explain all of the variance
    >>> bool(result.explained_variance.sum() < 1)
    True
    >>> # Systematic and idiosyncratic parts add back up to the input
    >>> np.allclose(
    ...     result.systematic.values + result.idiosyncratic.values,
    ...     returns.values,
    ...     atol=1e-10
    ... )
    True

"""

95 

96 

def pca(returns: pd.DataFrame, n_components: int = 10) -> PCA:
    """Compute the first n principal components for a return matrix using SVD.

    This function performs Principal Component Analysis on asset returns to
    extract the main sources of variance. The results can be used to construct
    a factor model for portfolio optimization.

    Args:
        returns: DataFrame of asset returns with shape (n_samples, n_assets).
            Rows represent time periods, columns represent assets.
        n_components: Number of principal components to extract. Defaults to 10.
            The data supports at most ``min(n_samples, n_assets)`` components;
            if more are requested, all available components are returned.

    Returns:
        PCA named tuple containing:
            - explained_variance: Ratio of variance explained by each component
            - factors: Factor returns (scores)
            - exposure: Factor exposures (loadings)
            - cov: Factor covariance matrix
            - systematic: Returns explained by factors
            - idiosyncratic: Residual returns

    Raises:
        ValueError: If ``n_components`` is negative.

    Example:
        Basic PCA on synthetic returns:

        >>> import numpy as np
        >>> import pandas as pd
        >>> from cvx.risk.linalg import pca
        >>> np.random.seed(42)
        >>> # Create returns with 100 periods and 10 assets
        >>> returns = pd.DataFrame(np.random.randn(100, 10))
        >>> result = pca(returns, n_components=3)
        >>> # First component explains most variance
        >>> bool(result.explained_variance[0] > result.explained_variance[1])
        True
        >>> # Factors are orthogonal
        >>> factor_corr = np.corrcoef(result.factors.T)
        >>> bool(np.allclose(factor_corr, np.eye(3), atol=0.1))
        True

        Asking for more components than the data supports returns all of them:

        >>> small = pd.DataFrame(np.random.randn(20, 4))
        >>> pca(small, n_components=10).factors.shape
        (20, 4)

        Using PCA results for a factor model:

        >>> from cvx.risk.factor import FactorModel
        >>> import cvxpy as cp
        >>> model = FactorModel(assets=10, k=3)
        >>> model.update(
        ...     exposure=result.exposure.values,
        ...     cov=result.cov.values,
        ...     idiosyncratic_risk=result.idiosyncratic.std().values,
        ...     lower_assets=np.zeros(10),
        ...     upper_assets=np.ones(10),
        ...     lower_factors=-np.ones(3),
        ...     upper_factors=np.ones(3)
        ... )

    """
    if n_components < 0:
        raise ValueError(f"n_components must be non-negative, got {n_components}")

    # Demean the returns
    x = returns.to_numpy()
    x_mean = x.mean(axis=0)
    x_centered = x - x_mean

    # Singular Value Decomposition
    # x = u s V^T, where columns of V are principal axes
    u, s_full, vt = np.linalg.svd(x_centered, full_matrices=False)

    # The thin SVD only yields min(n_samples, n_assets) components. Without
    # this clamp the slices below silently truncate while the PC labels are
    # still built from the requested count, and pandas fails with a
    # shape-mismatch error.
    n_components = min(n_components, s_full.shape[0])

    # Take only the first n components
    u = u[:, :n_components]
    s = s_full[:n_components]
    vt = vt[:n_components, :]

    # Factor exposures (loadings): each component's weight per asset
    exposure = pd.DataFrame(vt, columns=returns.columns)

    # Factor returns (scores): projection of data onto components
    scores = u * s
    factors = pd.DataFrame(
        scores,
        index=returns.index,
        columns=[f"PC{i + 1}" for i in range(n_components)],
    )

    # Explained variance ratio (normalize by total variance across ALL components)
    explained_variance = (s**2) / np.sum(s_full**2)

    # Covariance of factor returns
    cov = factors.cov()

    # Rank-n reconstruction of the centered data, shared by both parts below
    reconstruction = scores @ vt

    # Systematic part: reconstruction with the per-asset mean added back
    systematic = pd.DataFrame(
        data=reconstruction + x_mean,
        index=returns.index,
        columns=returns.columns,
    )
    # Idiosyncratic part: what the retained components leave unexplained
    idiosyncratic = pd.DataFrame(
        data=x_centered - reconstruction,
        index=returns.index,
        columns=returns.columns,
    )

    return PCA(
        explained_variance=explained_variance,
        factors=factors,
        exposure=exposure,
        cov=cov,
        systematic=systematic,
        idiosyncratic=idiosyncratic,
    )

197 )