Coverage for src/cvxrisk/linalg/pca.py: 100%
13 statements
« prev ^ index » next coverage.py v7.10.2, created at 2025-08-12 07:32 +0000
« prev ^ index » next coverage.py v7.10.2, created at 2025-08-12 07:32 +0000
1# Copyright 2023 Stanford University Convex Optimization Group
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""PCA analysis."""
16from __future__ import annotations
18from collections import namedtuple
20import numpy as np
21import pandas as pd
22import sklearn.decomposition as skdecomp
PCA = namedtuple(
    "PCA",
    ["explained_variance", "factors", "exposure", "cov", "systematic", "idiosyncratic"],
)
# A bare string placed after the assignment is not attached to the class at
# runtime, so the documentation is assigned to ``__doc__`` explicitly. This
# makes it visible to ``help(PCA)`` and other introspection tools.
PCA.__doc__ = """A named tuple containing the results of PCA analysis.

Attributes:
    explained_variance (numpy.ndarray): The explained variance ratio for each component
    factors (pandas.DataFrame): The factor returns (returns projected onto the components)
    exposure (pandas.DataFrame): The factor exposures (loadings), one column per asset
    cov (pandas.DataFrame): The covariance matrix of the factors
    systematic (pandas.DataFrame): The systematic returns explained by the factors
    idiosyncratic (pandas.DataFrame): The idiosyncratic returns not explained by the factors
"""
def pca(returns: pd.DataFrame, n_components: int = 10) -> PCA:
    """Compute the first n principal components for a return matrix.

    Fits a scikit-learn PCA model on the returns, projects the returns onto
    the fitted component directions to obtain factor returns, and splits the
    returns into the part reconstructed from the factors (systematic) and the
    remainder (idiosyncratic).

    Args:
        returns: DataFrame of asset returns. Its columns label the assets and
            its index labels the observations; both are reused for the
            systematic and idiosyncratic outputs.
        n_components: Number of principal components to extract. Defaults to 10.

    Returns:
        A named tuple containing the PCA results with the following fields:
            - explained_variance: The explained variance ratio for each component
            - factors: The factor returns (returns projected onto the components)
            - exposure: The factor exposures (loadings) for each asset
            - cov: The covariance matrix of the factors
            - systematic: The systematic returns explained by the factors
            - idiosyncratic: The idiosyncratic returns not explained by the factors

    """
    # Only the fitted attributes are needed, so fit() is used rather than
    # fit_transform(), whose transformed output would simply be discarded.
    model = skdecomp.PCA(n_components=n_components)
    model.fit(returns)

    exposure = model.components_

    # The returns are projected as given; no mean is subtracted in this step.
    factors = returns @ np.transpose(exposure)

    # Reconstruction of the returns from the factors. It is computed once and
    # shared by the systematic and idiosyncratic parts.
    explained = factors.to_numpy() @ exposure

    return PCA(
        explained_variance=model.explained_variance_ratio_,
        factors=factors,
        exposure=pd.DataFrame(data=exposure, columns=returns.columns),
        cov=factors.cov(),
        systematic=pd.DataFrame(data=explained, index=returns.index, columns=returns.columns),
        idiosyncratic=pd.DataFrame(
            data=returns.to_numpy() - explained,
            index=returns.index,
            columns=returns.columns,
        ),
    )