Coverage for src/cvxrisk/linalg/pca.py: 100%

21 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-11-24 06:34 +0000

1# Copyright 2023 Stanford University Convex Optimization Group 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14"""PCA analysis (pure NumPy implementation).""" 

15 

16from __future__ import annotations 

17 

18from collections import namedtuple 

19 

20import numpy as np 

21import pandas as pd 

22 

PCA = namedtuple(
    "PCA",
    ["explained_variance", "factors", "exposure", "cov", "systematic", "idiosyncratic"],
)
# A bare string literal placed after the assignment is a no-op at runtime and
# never becomes the class docstring. Assign it explicitly so that help(PCA)
# and documentation tools that read __doc__ show the field descriptions.
PCA.__doc__ = """A named tuple containing the results of PCA analysis.

Attributes:
    explained_variance (numpy.ndarray): Explained variance ratio for each component
    factors (pandas.DataFrame): Factor returns (principal components)
    exposure (pandas.DataFrame): Factor exposures (loadings) for each asset
    cov (pandas.DataFrame): Covariance matrix of the factors
    systematic (pandas.DataFrame): Systematic returns explained by the factors
    idiosyncratic (pandas.DataFrame): Idiosyncratic returns not explained by the factors
"""

38 

39 

def pca(returns: pd.DataFrame, n_components: int = 10) -> PCA:
    """Compute the leading principal components of a return matrix using SVD.

    The returns are demeaned column by column and decomposed as
    ``x_centered = u @ diag(s) @ vt``. The rows of ``vt`` are the principal
    axes (exposures) and ``u * s`` are the projections of the data onto them
    (factor returns).

    Args:
        returns: DataFrame of asset returns (rows: time, columns: assets).
        n_components: Number of principal components to extract. Defaults to 10.
            When this exceeds the number of singular values returned by the
            decomposition, all available components are returned instead of
            failing on a shape mismatch.

    Returns:
        PCA named tuple with:
            explained_variance: squared singular values of the kept components
                divided by the sum of squares of ALL singular values.
            factors: ``u * s``, indexed like ``returns``, columns ``PC1..PCk``.
            exposure: kept rows of ``vt`` with the asset names as columns
                (default integer row index).
            cov: ``factors.cov()``.
            systematic: rank-k reconstruction with the column means added back.
            idiosyncratic: centered returns minus the rank-k reconstruction.

    Raises:
        ValueError: If ``n_components`` is negative.
    """
    if n_components < 0:
        # A negative value would otherwise be read as a "drop the last k"
        # slice, which is never what the caller means.
        raise ValueError(f"n_components must be non-negative, got {n_components}")

    # Demean the returns
    x = returns.to_numpy()
    x_mean = x.mean(axis=0)
    x_centered = x - x_mean

    # Singular Value Decomposition
    # x = u s V^T, where columns of V are principal axes
    u, s_full, vt = np.linalg.svd(x_centered, full_matrices=False)

    # Never ask for more components than the decomposition produced; the
    # slices below would silently truncate while the column labels would not.
    k = min(n_components, s_full.shape[0])

    # Take only the first k components
    u = u[:, :k]
    s = s_full[:k]
    vt = vt[:k, :]

    # Scores and their rank-k reconstruction are reused below; compute once.
    scores = u * s
    low_rank = scores @ vt

    # Factor exposures (loadings): each component's weight per asset
    exposure = pd.DataFrame(vt, columns=returns.columns)

    # Factor returns (scores): projection of data onto components
    factors = pd.DataFrame(
        scores,
        index=returns.index,
        columns=[f"PC{i + 1}" for i in range(k)],
    )

    # Explained variance ratio (normalize by total variance across ALL components)
    explained_variance = (s**2) / np.sum(s_full**2)

    # Covariance of factor returns
    cov = factors.cov()

    # Systematic + Idiosyncratic returns
    systematic = pd.DataFrame(
        data=low_rank + x_mean,
        index=returns.index,
        columns=returns.columns,
    )
    idiosyncratic = pd.DataFrame(
        data=x_centered - low_rank,
        index=returns.index,
        columns=returns.columns,
    )

    return PCA(
        explained_variance=explained_variance,
        factors=factors,
        exposure=exposure,
        cov=cov,
        systematic=systematic,
        idiosyncratic=idiosyncratic,
    )