Coverage for src/cvxrisk/linalg/pca.py: 100%

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14"""PCA analysis (pure NumPy implementation)."""

16from __future__ import annotations

18from collections import namedtuple

20import numpy as np

21import pandas as pd

# Result container returned by the PCA routine in this module.
PCA = namedtuple(
    "PCA",
    (
        "explained_variance",
        "factors",
        "exposure",
        "cov",
        "systematic",
        "idiosyncratic",
    ),
)
"""
Named tuple bundling the outputs of a principal component analysis.

Attributes:
    explained_variance (numpy.ndarray): Share of the total variance captured
        by each retained component
    factors (pandas.DataFrame): Factor returns, i.e. the data projected onto
        the principal components
    exposure (pandas.DataFrame): Loadings of every asset on each component
    cov (pandas.DataFrame): Covariance matrix of the factor returns
    systematic (pandas.DataFrame): Part of the returns reproduced by the factors
    idiosyncratic (pandas.DataFrame): Residual returns left unexplained by
        the factors
"""

def pca(returns: pd.DataFrame, n_components: int = 10) -> PCA:
    """Compute the first n principal components for a return matrix using SVD.

    The returns are demeaned column-wise and decomposed with a thin singular
    value decomposition. The leading ``n_components`` singular triplets give
    the factor returns (scores), the factor exposures (loadings) and a
    low-rank reconstruction of the data.

    Args:
        returns: DataFrame of asset returns (rows: time, columns: assets)
        n_components: Number of principal components to extract. Must lie
            between 0 and ``min(returns.shape)``. Defaults to 10.

    Returns:
        PCA named tuple with the results.

    Raises:
        ValueError: If ``n_components`` is negative or larger than the number
            of components the thin SVD can provide, ``min(returns.shape)``.
    """
    # The thin SVD yields exactly min(rows, columns) components. Without this
    # check an oversized request fails later with an opaque pandas shape
    # error, and a negative one is silently treated as a "drop the last k"
    # slice.
    max_components = min(returns.shape)
    if not 0 <= n_components <= max_components:
        raise ValueError(
            f"n_components must be between 0 and {max_components} "
            f"(the smaller dimension of returns), got {n_components}"
        )

    # Demean the returns
    x = returns.to_numpy()
    x_mean = x.mean(axis=0)
    x_centered = x - x_mean

    # Singular Value Decomposition
    # x = u s V^T, where columns of V are principal axes
    u, s_full, vt = np.linalg.svd(x_centered, full_matrices=False)

    # Take only the first n components
    u = u[:, :n_components]
    s = s_full[:n_components]
    vt = vt[:n_components, :]

    # Scores and low-rank reconstruction are each needed twice; compute once.
    scores = u * s
    reconstruction = scores @ vt

    # Factor exposures (loadings): each component's weight per asset
    exposure = pd.DataFrame(vt, columns=returns.columns)

    # Factor returns (scores): projection of data onto components
    factors = pd.DataFrame(
        scores,
        index=returns.index,
        columns=[f"PC{i + 1}" for i in range(n_components)],
    )

    # Explained variance ratio (normalize by total variance across ALL components)
    explained_variance = (s**2) / np.sum(s_full**2)

    # Covariance of factor returns
    cov = factors.cov()

    # Systematic part adds the column means back; the idiosyncratic part is
    # the residual of the centered data, so the two sum to the original returns.
    systematic = pd.DataFrame(
        data=reconstruction + x_mean,
        index=returns.index,
        columns=returns.columns,
    )
    idiosyncratic = pd.DataFrame(
        data=x_centered - reconstruction,
        index=returns.index,
        columns=returns.columns,
    )

    return PCA(
        explained_variance=explained_variance,
        factors=factors,
        exposure=exposure,
        cov=cov,
        systematic=systematic,
        idiosyncratic=idiosyncratic,
    )