Coverage for src/cvxrisk/linalg/pca.py: 100%

13 statements  

« prev     ^ index     » next       coverage.py v7.10.2, created at 2025-08-12 07:32 +0000

1# Copyright 2023 Stanford University Convex Optimization Group 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14"""PCA analysis.""" 

15 

16from __future__ import annotations 

17 

18from collections import namedtuple 

19 

20import numpy as np 

21import pandas as pd 

22import sklearn.decomposition as skdecomp 

23 

PCA = namedtuple(
    "PCA",
    ["explained_variance", "factors", "exposure", "cov", "systematic", "idiosyncratic"],
)
# A bare string placed after the assignment is evaluated and thrown away, so it
# never reaches help(PCA) or PCA.__doc__. Assign it explicitly instead.
PCA.__doc__ = """A named tuple containing the results of PCA analysis.

Attributes:
    explained_variance (numpy.ndarray): The explained variance ratio for each component
    factors (pandas.DataFrame): The factor returns (returns projected onto the components)
    exposure (pandas.DataFrame): The factor exposures (loadings), one row per component
        and one column per asset
    cov (pandas.DataFrame): The covariance matrix of the factors
    systematic (pandas.DataFrame): The systematic returns explained by the factors
    idiosyncratic (pandas.DataFrame): The idiosyncratic returns not explained by the factors
"""

39 

40 

def pca(returns: pd.DataFrame, n_components: int = 10) -> PCA:
    """Compute the first n principal components for a return matrix.

    Fits scikit-learn's PCA on the returns and uses the fitted components
    as factor exposures. Factor returns are obtained by projecting the
    returns onto those components; the systematic part is the projection
    mapped back into asset space and the idiosyncratic part is the remainder.

    Args:
        returns: DataFrame of asset returns (rows are observations, columns are assets).
        n_components: Number of principal components to extract. Defaults to 10.

    Returns:
        A named tuple containing the PCA results with the following fields:
            - explained_variance: The explained variance ratio for each component
            - factors: DataFrame of factor returns, indexed like ``returns``
            - exposure: DataFrame of loadings, one row per component, columns are the assets
            - cov: The covariance matrix of the factors
            - systematic: DataFrame of returns explained by the factors
            - idiosyncratic: DataFrame of returns not explained by the factors

    """
    # Only the fitted state (components_, explained_variance_ratio_) is used,
    # so fit() is enough; fit_transform() would build a projection that is
    # immediately discarded.
    model = skdecomp.PCA(n_components=n_components)
    model.fit(returns)

    exposure = model.components_

    # NOTE: the projection uses `returns` exactly as passed in (no explicit
    # centering is applied here), matching the established behaviour.
    factors = returns @ np.transpose(exposure)

    # Map the factor returns back into asset space once and reuse the result
    # for both the systematic and the idiosyncratic part.
    explained = factors.to_numpy() @ exposure

    return PCA(
        explained_variance=model.explained_variance_ratio_,
        factors=factors,
        exposure=pd.DataFrame(data=exposure, columns=returns.columns),
        cov=factors.cov(),
        systematic=pd.DataFrame(data=explained, index=returns.index, columns=returns.columns),
        idiosyncratic=pd.DataFrame(
            data=returns.to_numpy() - explained,
            index=returns.index,
            columns=returns.columns,
        ),
    )