Coverage for src/cvxsimulator/utils/interpolation.py: 100%
54 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-10 18:45 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-10 18:45 +0000
1# Copyright 2023 Stanford University Convex Optimization Group
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""Interpolation utilities for time series data.
16This module provides functions for interpolating missing values in time series
17and validating that time series don't have missing values in the middle.
18"""
20import pandas as pd
21import polars as pl
def interpolate(ts):
    """Forward-fill missing values between the first and last valid observations.

    Gaps that lie inside the observed range are filled with the most recent
    valid value. Missing values before the first or after the last valid
    observation are left untouched. The work is done by position, so the
    result does not depend on the index being unique or sorted, and a new
    series is always returned for pandas input.

    Parameters
    ----------
    ts : pd.Series or pl.Series
        The time series to interpolate

    Returns
    -------
    pd.Series or pl.Series
        The interpolated time series

    Raises
    ------
    TypeError
        If ``ts`` is neither a pandas nor a polars Series

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> ts = pd.Series([1, np.nan, np.nan, 4, 5])
    >>> interpolate(ts)
    0    1.0
    1    1.0
    2    1.0
    3    4.0
    4    5.0
    dtype: float64

    """
    # Reject anything that is not a supported series type
    if not isinstance(ts, pd.Series | pl.Series):
        raise TypeError(f"Expected pd.Series or pl.Series, got {type(ts)}")

    # Polars input is handled by the polars-specific implementation
    if isinstance(ts, pl.Series):
        return interpolate_pl(ts)

    # A backward fill leaves only the trailing gap missing, so its non-null
    # mask marks every position up to (and including) the last valid value.
    # A plain array is used so that `where` applies it by position instead of
    # aligning on (possibly duplicated) index labels.
    up_to_last_valid = ts.bfill().notna().to_numpy()

    # A forward fill never touches the leading gap; masking with the array
    # above restores the trailing gap that the forward fill would have filled.
    return ts.ffill().where(up_to_last_valid)
def valid(ts) -> bool:
    """Tell whether a time series is free of gaps inside its observed range.

    Missing values (NaN/null) are tolerated at the very beginning and at the
    very end of the series; any missing value located between two valid
    observations makes the series invalid.

    Parameters
    ----------
    ts : pd.Series or pl.Series
        The time series to check

    Returns
    -------
    bool
        True if there is no missing value between the first and last valid
        indices, False otherwise

    Raises
    ------
    TypeError
        If ``ts`` is neither a pandas nor a polars Series

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> ts1 = pd.Series([np.nan, 1, 2, 3, np.nan])  # NaNs only at beginning and end
    >>> valid(ts1)
    True
    >>> ts2 = pd.Series([1, 2, np.nan, 4, 5])  # NaN in the middle
    >>> valid(ts2)
    False

    """
    # Polars series are delegated to the polars-specific check
    if isinstance(ts, pl.Series):
        return valid_pl(ts)

    # Everything else must be a pandas series
    if not isinstance(ts, pd.Series):
        raise TypeError(f"Expected pd.Series or pl.Series, got {type(ts)}")

    # Labels that carry a value before and after filling the interior gaps.
    # Filling can only add labels, so the two sets coincide exactly when
    # there was nothing to fill, i.e. when the series has no interior gap.
    observed_labels = ts.dropna().index
    filled_labels = interpolate(ts).dropna().index
    return observed_labels.equals(filled_labels)
def interpolate_pl(ts: pl.Series) -> pl.Series:
    """Forward-fill nulls in a polars series between its first and last valid values.

    Nulls located between two valid observations are replaced by the most
    recent valid value. Nulls before the first or after the last valid
    observation are kept. The returned series keeps the name and dtype of
    the input.

    Parameters
    ----------
    ts : pl.Series
        The time series to interpolate

    Returns
    -------
    pl.Series
        The interpolated time series; the input itself is returned when it
        contains no valid value at all

    Examples
    --------
    >>> import polars as pl
    >>> ts = pl.Series([1, None, None, 4, 5])
    >>> interpolate_pl(ts)
    shape: (5,)
    Series: '' [i64]
    [
        1
        1
        1
        4
        5
    ]

    """
    # Positions of all non-null entries
    valid_positions = ts.is_not_null().arg_true()

    # Nothing to anchor a fill on: hand the series back unchanged
    if len(valid_positions) == 0:
        return ts

    first = valid_positions[0]
    last = valid_positions[-1]

    values = ts.to_list()

    # values[first] is non-null by construction, so inside (first, last] the
    # previous entry is always populated (either original or already filled).
    for i in range(first + 1, last + 1):
        if values[i] is None:
            values[i] = values[i - 1]

    # Rebuild with the original name and dtype so the result is a drop-in
    # replacement for the input series.
    return pl.Series(name=ts.name, values=values, dtype=ts.dtype)
def valid_pl(ts: pl.Series) -> bool:
    """Tell whether a polars series is free of nulls inside its observed range.

    Nulls are tolerated at the beginning and at the end of the series; a null
    located between two valid observations makes the series invalid.

    Parameters
    ----------
    ts : pl.Series
        The time series to check

    Returns
    -------
    bool
        True if there is no null between the first and last valid indices,
        False otherwise. A series with at most one valid value is always valid.

    Examples
    --------
    >>> import polars as pl
    >>> ts1 = pl.Series([None, 1, 2, 3, None])  # Nulls only at beginning and end
    >>> valid_pl(ts1)
    True
    >>> ts2 = pl.Series([1, 2, None, 4, 5])  # Null in the middle
    >>> valid_pl(ts2)
    False

    """
    # Positions of all non-null entries
    non_null_indices = ts.is_not_null().arg_true()
    non_null_count = len(non_null_indices)

    # Zero or one valid value cannot enclose a gap
    if non_null_count <= 1:
        return True

    # Number of positions spanned by the first and last valid values
    span = non_null_indices[-1] - non_null_indices[0] + 1

    # Every non-null position lies inside that span, so the span is gap-free
    # exactly when it contains as many positions as there are non-null values.
    return non_null_count == span
def interpolate_df_pl(df: pl.DataFrame) -> pl.DataFrame:
    """Forward-fill interior nulls column by column in a polars DataFrame.

    Each column is passed through interpolate_pl independently: nulls between
    a column's first and last valid values are filled with the most recent
    valid value, while leading and trailing nulls of that column are kept.

    Parameters
    ----------
    df : pl.DataFrame
        The DataFrame to interpolate

    Returns
    -------
    pl.DataFrame
        A new DataFrame with the same column names holding the interpolated columns

    Examples
    --------
    >>> import polars as pl
    >>> df = pl.DataFrame({
    ...     'A': [1.0, None, None, 4.0, 5.0],
    ...     'B': [None, 2.0, None, 4.0, None]
    ... })
    >>> interpolate_df_pl(df)
    shape: (5, 2)
    ┌─────┬──────┐
    │ A   ┆ B    │
    │ --- ┆ ---  │
    │ f64 ┆ f64  │
    ╞═════╪══════╡
    │ 1.0 ┆ null │
    │ 1.0 ┆ 2.0  │
    │ 1.0 ┆ 2.0  │
    │ 4.0 ┆ 4.0  │
    │ 5.0 ┆ null │
    └─────┴──────┘

    """
    # Map every column name to its interpolated column, preserving column order
    interpolated = {name: interpolate_pl(df[name]) for name in df.columns}
    return pl.DataFrame(interpolated)
def valid_df_pl(df: pl.DataFrame) -> bool:
    """Tell whether every column of a polars DataFrame is free of interior nulls.

    Each column is checked with valid_pl: nulls at the beginning or end of a
    column are acceptable, a null between two valid values is not.

    Parameters
    ----------
    df : pl.DataFrame
        The DataFrame to check

    Returns
    -------
    bool
        True if no column has a null between its first and last valid indices,
        False as soon as one column does

    Examples
    --------
    >>> import polars as pl
    >>> df1 = pl.DataFrame({
    ...     'A': [None, 1, 2, 3, None],  # Nulls only at beginning and end
    ...     'B': [None, 2, 3, 4, None]   # Nulls only at beginning and end
    ... })
    >>> valid_df_pl(df1)
    True
    >>> df2 = pl.DataFrame({
    ...     'A': [1, 2, None, 4, 5],  # Null in the middle
    ...     'B': [1, 2, 3, 4, 5]      # No nulls
    ... })
    >>> valid_df_pl(df2)
    False

    """
    # all() stops at the first column that fails the check
    return all(valid_pl(df[name]) for name in df.columns)