Coverage for src / cvx / risk / linalg / pca.py: 100%
21 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-09 03:39 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-09 03:39 +0000
1# Copyright 2023 Stanford University Convex Optimization Group
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""PCA analysis (pure NumPy implementation).
16This module provides Principal Component Analysis (PCA) for dimensionality
17reduction of return data. PCA is commonly used to construct factor models
18for portfolio optimization.
20Example:
21 Perform PCA on stock returns:
23 >>> import numpy as np
24 >>> import pandas as pd
25 >>> from cvx.risk.linalg import pca
26 >>> # Create sample returns data
27 >>> np.random.seed(42)
28 >>> returns = pd.DataFrame(
29 ... np.random.randn(100, 5),
30 ... columns=['A', 'B', 'C', 'D', 'E']
31 ... )
32 >>> # Compute PCA with 3 components
33 >>> result = pca(returns, n_components=3)
34 >>> # Access explained variance
35 >>> len(result.explained_variance)
36 3
37 >>> # Access factors (principal components)
38 >>> result.factors.shape
39 (100, 3)
40 >>> # Access factor exposures (loadings)
41 >>> result.exposure.shape
42 (3, 5)
44"""
46from __future__ import annotations
48from collections import namedtuple
50import numpy as np
51import pandas as pd
# Lightweight, immutable result container returned by ``pca``.
PCA = namedtuple(
    "PCA",
    ["explained_variance", "factors", "exposure", "cov", "systematic", "idiosyncratic"],
)
# The documentation is assigned explicitly: a bare string literal placed after
# the assignment is evaluated and discarded at runtime, so it would never reach
# ``PCA.__doc__`` (and hence ``help(PCA)`` or doctest).
PCA.__doc__ = """Named tuple containing the results of a principal component analysis.

Attributes:
    explained_variance: Explained variance ratio for each component.
        An array of shape (n_components,) where each element represents
        the proportion of total variance explained by that component.
    factors: Factor returns (principal components) as a DataFrame.
        Shape is (n_samples, n_components). Each column is a factor.
    exposure: Factor exposures (loadings) for each asset as a DataFrame.
        Shape is (n_components, n_assets). Each row contains the loadings
        of one component on all assets.
    cov: Covariance matrix of the factors as a DataFrame.
        Shape is (n_components, n_components).
    systematic: Systematic returns explained by the factors as a DataFrame.
        Shape is (n_samples, n_assets). This is the part of returns
        explained by the factor model.
    idiosyncratic: Idiosyncratic returns not explained by factors as a DataFrame.
        Shape is (n_samples, n_assets). This is the residual part of returns.

Example:
    >>> import numpy as np
    >>> import pandas as pd
    >>> from cvx.risk.linalg import pca
    >>> np.random.seed(42)
    >>> returns = pd.DataFrame(np.random.randn(50, 4))
    >>> result = pca(returns, n_components=2)
    >>> # Check explained variance sums to less than 1
    >>> bool(result.explained_variance.sum() < 1)
    True
    >>> # Systematic + idiosyncratic approximately equals original
    >>> np.allclose(
    ...     result.systematic.values + result.idiosyncratic.values,
    ...     returns.values,
    ...     atol=1e-10
    ... )
    True

"""
def pca(returns: pd.DataFrame, n_components: int = 10) -> PCA:
    """Compute the first n principal components for a return matrix using SVD.

    This function performs Principal Component Analysis on asset returns to
    extract the main sources of variance. The results can be used to construct
    a factor model for portfolio optimization.

    Args:
        returns: DataFrame of asset returns with shape (n_samples, n_assets).
            Rows represent time periods, columns represent assets.
        n_components: Number of principal components to extract. Must satisfy
            0 <= n_components <= min(n_samples, n_assets). Defaults to 10,
            so the default requires at least 10 samples and 10 assets.

    Returns:
        PCA named tuple containing:
            - explained_variance: Ratio of variance explained by each component
            - factors: Factor returns (scores)
            - exposure: Factor exposures (loadings)
            - cov: Factor covariance matrix
            - systematic: Returns explained by factors
            - idiosyncratic: Residual returns

    Raises:
        ValueError: If n_components is negative or larger than
            min(n_samples, n_assets), the number of components the
            decomposition can provide.

    Example:
        Basic PCA on synthetic returns:

        >>> import numpy as np
        >>> import pandas as pd
        >>> from cvx.risk.linalg import pca
        >>> np.random.seed(42)
        >>> # Create returns with 100 periods and 10 assets
        >>> returns = pd.DataFrame(np.random.randn(100, 10))
        >>> result = pca(returns, n_components=3)
        >>> # First component explains most variance
        >>> bool(result.explained_variance[0] > result.explained_variance[1])
        True
        >>> # Factors are orthogonal
        >>> factor_corr = np.corrcoef(result.factors.T)
        >>> bool(np.allclose(factor_corr, np.eye(3), atol=0.1))
        True

        Using PCA results for a factor model:

        >>> from cvx.risk.factor import FactorModel
        >>> import cvxpy as cp
        >>> model = FactorModel(assets=10, k=3)
        >>> model.update(
        ...     exposure=result.exposure.values,
        ...     cov=result.cov.values,
        ...     idiosyncratic_risk=result.idiosyncratic.std().values,
        ...     lower_assets=np.zeros(10),
        ...     upper_assets=np.ones(10),
        ...     lower_factors=-np.ones(3),
        ...     upper_factors=np.ones(3)
        ... )

        Verifying variance decomposition (systematic + idiosyncratic = total):

        >>> np.random.seed(123)
        >>> returns = pd.DataFrame(np.random.randn(50, 5))
        >>> result = pca(returns, n_components=3)
        >>> # Systematic variance + idiosyncratic variance ≈ total variance
        >>> total_var = returns.var().sum()
        >>> systematic_var = result.systematic.var().sum()
        >>> idio_var = result.idiosyncratic.var().sum()
        >>> # Note: small differences due to demeaning
        >>> bool(np.isclose(systematic_var + idio_var, total_var, rtol=0.1))
        True

        Exposure matrix has orthonormal rows (loadings are orthogonal):

        >>> np.random.seed(42)
        >>> returns = pd.DataFrame(np.random.randn(100, 6))
        >>> result = pca(returns, n_components=3)
        >>> # V^T @ V should be identity (orthonormal loadings)
        >>> VtV = result.exposure.values @ result.exposure.values.T
        >>> bool(np.allclose(VtV, np.eye(3), atol=1e-10))
        True

        Explained variance is ordered (first component explains most):

        >>> all(result.explained_variance[i] >= result.explained_variance[i+1]
        ...     for i in range(len(result.explained_variance)-1))
        True

        Reconstructing returns from factors and exposures:

        >>> # systematic = factors @ exposure (plus mean)
        >>> reconstructed = result.factors.values @ result.exposure.values
        >>> # Should match systematic (centered part)
        >>> centered_systematic = result.systematic.values - returns.values.mean(axis=0)
        >>> bool(np.allclose(reconstructed, centered_systematic, atol=1e-10))
        True

    """
    # Demean the returns
    x = returns.to_numpy()
    x_mean = x.mean(axis=0)
    x_centered = x - x_mean

    # A reduced SVD yields exactly min(n_samples, n_assets) components.
    # Validate up front: slicing past that bound silently returns fewer
    # columns, which would otherwise only surface later as an opaque
    # shape-mismatch error when the factor DataFrame is labelled.
    max_components = min(x_centered.shape)
    if not 0 <= n_components <= max_components:
        raise ValueError(
            f"n_components must be between 0 and {max_components} "
            f"(the minimum of n_samples and n_assets), got {n_components}"
        )

    # Singular Value Decomposition
    # x = u s V^T, where columns of V are principal axes
    u, s_full, vt = np.linalg.svd(x_centered, full_matrices=False)

    # Take only the first n components
    u = u[:, :n_components]
    s = s_full[:n_components]
    vt = vt[:n_components, :]

    # Factor returns (scores): projection of data onto components.
    # Computed once and reused for the factors and the reconstruction below.
    scores = u * s

    # Part of the centered returns reproduced by the retained components
    explained = scores @ vt

    # Factor exposures (loadings): each component's weight per asset
    exposure = pd.DataFrame(vt, columns=returns.columns)

    factors = pd.DataFrame(
        scores,
        index=returns.index,
        columns=[f"PC{i + 1}" for i in range(n_components)],
    )

    # Explained variance ratio (normalize by total variance across ALL components)
    explained_variance = (s**2) / np.sum(s_full**2)

    # Covariance of factor returns
    cov = factors.cov()

    # Systematic returns add the column means back; idiosyncratic returns are
    # the centered residual, so systematic + idiosyncratic equals the input.
    systematic = pd.DataFrame(
        data=explained + x_mean,
        index=returns.index,
        columns=returns.columns,
    )
    idiosyncratic = pd.DataFrame(
        data=x_centered - explained,
        index=returns.index,
        columns=returns.columns,
    )

    return PCA(
        explained_variance=explained_variance,
        factors=factors,
        exposure=exposure,
        cov=cov,
        systematic=systematic,
        idiosyncratic=idiosyncratic,
    )