Coverage for src/cvxrisk/linalg/pca.py: 100%
21 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-11-24 06:34 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-11-24 06:34 +0000
1# Copyright 2023 Stanford University Convex Optimization Group
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""PCA analysis (pure NumPy implementation)."""
16from __future__ import annotations
18from collections import namedtuple
20import numpy as np
21import pandas as pd
PCA = namedtuple(
    "PCA",
    "explained_variance factors exposure cov systematic idiosyncratic",
)
"""
Immutable result record returned by the PCA routine in this module.

Attributes:
    explained_variance (numpy.ndarray): Share of total variance captured by each retained component
    factors (pandas.DataFrame): Factor returns, i.e. the data projected onto the principal axes
    exposure (pandas.DataFrame): Loadings of every asset on each retained component
    cov (pandas.DataFrame): Covariance matrix of the factor returns
    systematic (pandas.DataFrame): Part of the returns reproduced by the retained factors
    idiosyncratic (pandas.DataFrame): Residual returns left unexplained by the retained factors
"""
def pca(returns: pd.DataFrame, n_components: int = 10) -> PCA:
    """Compute the leading principal components of a return matrix using SVD.

    The returns are demeaned column-wise and decomposed with a thin SVD.
    If ``n_components`` exceeds the number of components the decomposition
    yields (``min(n_rows, n_columns)``), all available components are returned
    instead of failing.

    Args:
        returns: DataFrame of asset returns (rows: time, columns: assets)
        n_components: Maximum number of principal components to extract.
            Defaults to 10.

    Returns:
        PCA named tuple with the results. ``factors`` has one column per
        retained component, labelled ``PC1``, ``PC2``, ...; ``exposure`` has
        one row per retained component and one column per asset.
        ``systematic`` includes the column means, ``idiosyncratic`` is the
        residual of the demeaned data.

    Raises:
        ValueError: If ``n_components`` is negative.
    """
    # A negative count would be interpreted as "drop trailing components" by
    # the slices below, which is never what the caller means.
    if n_components < 0:
        raise ValueError(f"n_components must be non-negative, got {n_components}")

    # Demean the returns
    x = returns.to_numpy()
    x_mean = x.mean(axis=0)
    x_centered = x - x_mean

    # Singular Value Decomposition
    # x = u s V^T, where columns of V are principal axes
    u, s_full, vt = np.linalg.svd(x_centered, full_matrices=False)

    # The thin SVD provides at most min(n_rows, n_columns) components; clamp the
    # request so the component labels below always match the sliced arrays.
    k = min(n_components, s_full.shape[0])

    # Take only the first k components
    u = u[:, :k]
    s = s_full[:k]
    vt = vt[:k, :]

    # Factor exposures (loadings): each component's weight per asset
    exposure = pd.DataFrame(vt, columns=returns.columns)

    # Factor returns (scores): projection of data onto components
    scores = u * s
    factors = pd.DataFrame(scores, index=returns.index, columns=[f"PC{i + 1}" for i in range(k)])

    # Explained variance ratio (normalize by total variance across ALL components)
    explained_variance = (s**2) / np.sum(s_full**2)

    # Covariance of factor returns
    cov = factors.cov()

    # Systematic + Idiosyncratic returns; the rank-k reconstruction is shared
    reconstruction = scores @ vt
    systematic = pd.DataFrame(
        data=reconstruction + x_mean,
        index=returns.index,
        columns=returns.columns,
    )
    idiosyncratic = pd.DataFrame(
        data=x_centered - reconstruction,
        index=returns.index,
        columns=returns.columns,
    )

    return PCA(
        explained_variance=explained_variance,
        factors=factors,
        exposure=exposure,
        cov=cov,
        systematic=systematic,
        idiosyncratic=idiosyncratic,
    )