Coverage for src / cvx / risk / linalg / pca.py: 100%
21 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-15 12:21 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-15 12:21 +0000
1# Copyright 2023 Stanford University Convex Optimization Group
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""PCA analysis (pure NumPy implementation).
16This module provides Principal Component Analysis (PCA) for dimensionality
17reduction of return data. PCA is commonly used to construct factor models
18for portfolio optimization.
20Example:
21 Perform PCA on stock returns:
23 >>> import numpy as np
24 >>> import pandas as pd
25 >>> from cvx.risk.linalg import pca
26 >>> # Create sample returns data
27 >>> np.random.seed(42)
28 >>> returns = pd.DataFrame(
29 ... np.random.randn(100, 5),
30 ... columns=['A', 'B', 'C', 'D', 'E']
31 ... )
32 >>> # Compute PCA with 3 components
33 >>> result = pca(returns, n_components=3)
34 >>> # Access explained variance
35 >>> len(result.explained_variance)
36 3
37 >>> # Access factors (principal components)
38 >>> result.factors.shape
39 (100, 3)
40 >>> # Access factor exposures (loadings)
41 >>> result.exposure.shape
42 (3, 5)
44"""
46from __future__ import annotations
48from collections import namedtuple
50import numpy as np
51import pandas as pd
PCA = namedtuple(
    "PCA",
    ["explained_variance", "factors", "exposure", "cov", "systematic", "idiosyncratic"],
)
"""Named tuple containing the results of PCA analysis.

Attributes:
    explained_variance: Explained variance ratio for each retained component.
        A NumPy array of shape (n_components,). Each element is the squared
        singular value of that component divided by the sum of squared
        singular values over ALL components, so the entries sum to at most 1.
    factors: Factor returns (principal component scores) as a DataFrame.
        Shape is (n_samples, n_components). Rows share the index of the input
        returns; columns are labelled "PC1", "PC2", ...
    exposure: Factor exposures (loadings) for each asset as a DataFrame.
        Shape is (n_components, n_assets). Columns are the asset labels of the
        input returns; rows carry a default integer index (0, 1, ...), not the
        "PC" labels used by ``factors``.
    cov: Covariance matrix of the factor returns as a DataFrame
        (``factors.cov()``). Shape is (n_components, n_components).
    systematic: Systematic returns explained by the factors as a DataFrame.
        Shape is (n_samples, n_assets). The per-asset mean of the input is
        added back, so this is the factor-model reconstruction of the
        original (non-demeaned) returns.
    idiosyncratic: Idiosyncratic returns not explained by factors as a DataFrame.
        Shape is (n_samples, n_assets). This is the residual of the demeaned
        returns after removing the factor reconstruction.

Example:
    >>> import numpy as np
    >>> import pandas as pd
    >>> from cvx.risk.linalg import pca
    >>> np.random.seed(42)
    >>> returns = pd.DataFrame(np.random.randn(50, 4))
    >>> result = pca(returns, n_components=2)
    >>> # Check explained variance sums to less than 1
    >>> bool(result.explained_variance.sum() < 1)
    True
    >>> # Systematic + idiosyncratic approximately equals original
    >>> np.allclose(
    ...     result.systematic.values + result.idiosyncratic.values,
    ...     returns.values,
    ...     atol=1e-10
    ... )
    True

"""


def pca(returns: pd.DataFrame, n_components: int = 10) -> PCA:
    """Compute the first n principal components for a return matrix using SVD.

    This function performs Principal Component Analysis on asset returns to
    extract the main sources of variance. The results can be used to construct
    a factor model for portfolio optimization.

    Args:
        returns: DataFrame of asset returns with shape (n_samples, n_assets).
            Rows represent time periods, columns represent assets.
        n_components: Number of principal components to extract. Defaults to 10.
            The thin SVD of the demeaned returns yields only
            ``min(n_samples, n_assets)`` components; if more are requested,
            all available components are returned instead of failing.

    Returns:
        PCA named tuple containing:
            - explained_variance: Ratio of variance explained by each component
            - factors: Factor returns (scores)
            - exposure: Factor exposures (loadings)
            - cov: Factor covariance matrix
            - systematic: Returns explained by factors
            - idiosyncratic: Residual returns

    Raises:
        ValueError: If ``n_components`` is negative.

    Example:
        Basic PCA on synthetic returns:

        >>> import numpy as np
        >>> import pandas as pd
        >>> from cvx.risk.linalg import pca
        >>> np.random.seed(42)
        >>> # Create returns with 100 periods and 10 assets
        >>> returns = pd.DataFrame(np.random.randn(100, 10))
        >>> result = pca(returns, n_components=3)
        >>> # First component explains most variance
        >>> bool(result.explained_variance[0] > result.explained_variance[1])
        True
        >>> # Factors are orthogonal
        >>> factor_corr = np.corrcoef(result.factors.T)
        >>> bool(np.allclose(factor_corr, np.eye(3), atol=0.1))
        True

        Requesting more components than exist returns all of them:

        >>> pca(returns.iloc[:, :4], n_components=10).factors.shape
        (100, 4)

        Using PCA results for a factor model:

        >>> from cvx.risk.factor import FactorModel
        >>> import cvxpy as cp
        >>> model = FactorModel(assets=10, k=3)
        >>> model.update(
        ...     exposure=result.exposure.values,
        ...     cov=result.cov.values,
        ...     idiosyncratic_risk=result.idiosyncratic.std().values,
        ...     lower_assets=np.zeros(10),
        ...     upper_assets=np.ones(10),
        ...     lower_factors=-np.ones(3),
        ...     upper_factors=np.ones(3)
        ... )

    """
    # A negative count would silently slice from the end (u[:, :-1]) and pick
    # the wrong components, so reject it up front.
    if n_components < 0:
        raise ValueError(f"n_components must be non-negative, got {n_components}")

    # Demean the returns
    x = returns.to_numpy()
    x_mean = x.mean(axis=0)
    x_centered = x - x_mean

    # Singular Value Decomposition
    # x = u s V^T, where columns of V are principal axes
    u, s_full, vt = np.linalg.svd(x_centered, full_matrices=False)

    # The thin SVD provides len(s_full) components at most. Clamp the request
    # so that the "PC" labels below always match the number of score columns.
    k = min(n_components, s_full.shape[0])

    # Take only the first k components
    u = u[:, :k]
    s = s_full[:k]
    vt = vt[:k, :]

    # Factor returns (scores): projection of data onto components
    scores = u * s

    # Factor exposures (loadings): each component's weight per asset
    exposure = pd.DataFrame(vt, columns=returns.columns)

    factors = pd.DataFrame(
        scores,
        index=returns.index,
        columns=[f"PC{i + 1}" for i in range(k)],
    )

    # Explained variance ratio (normalize by total variance across ALL components)
    explained_variance = (s**2) / np.sum(s_full**2)

    # Covariance of factor returns
    cov = factors.cov()

    # Rank-k reconstruction of the demeaned returns, shared by both outputs below
    fitted = scores @ vt

    # Systematic + Idiosyncratic returns
    systematic = pd.DataFrame(
        data=fitted + x_mean,
        index=returns.index,
        columns=returns.columns,
    )
    idiosyncratic = pd.DataFrame(
        data=x_centered - fitted,
        index=returns.index,
        columns=returns.columns,
    )

    return PCA(
        explained_variance=explained_variance,
        factors=factors,
        exposure=exposure,
        cov=cov,
        systematic=systematic,
        idiosyncratic=idiosyncratic,
    )