Coverage for src/cvxsimulator/utils/interpolation.py: 100%

54 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-10 18:45 +0000

1# Copyright 2023 Stanford University Convex Optimization Group 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14"""Interpolation utilities for time series data. 

15 

16This module provides functions for interpolating missing values in time series 

17and validating that time series don't have missing values in the middle. 

18""" 

19 

20import pandas as pd 

21import polars as pl 

22 

23 

def interpolate(ts):
    """Forward-fill the interior gaps of a time series.

    Missing values are replaced by the most recent valid observation, but only
    inside the window spanned by the first and the last valid index. Leading
    and trailing NaN/null entries are left untouched.

    Parameters
    ----------
    ts : pd.Series or pl.Series
        The time series to interpolate

    Returns
    -------
    pd.Series or pl.Series
        The interpolated time series. A pandas series without any valid
        value is handed back as is.

    Raises
    ------
    TypeError
        If ``ts`` is neither a pandas nor a polars Series

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> ts = pd.Series([1, np.nan, np.nan, 4, 5])
    >>> interpolate(ts)
    0    1.0
    1    1.0
    2    1.0
    3    4.0
    4    5.0
    dtype: float64

    """
    # Reject anything that is not one of the two supported series types
    if not isinstance(ts, pd.Series | pl.Series):
        raise TypeError(f"Expected pd.Series or pl.Series, got {type(ts)}")

    # Polars input is delegated to the polars-specific implementation
    if isinstance(ts, pl.Series):
        return interpolate_pl(ts)

    start, stop = ts.first_valid_index(), ts.last_valid_index()

    # Nothing to fill when the series holds no valid observation at all
    if start is None or stop is None:
        return ts

    # Work on a copy so the caller's series is not modified
    filled = ts.copy()
    filled.loc[start:stop] = ts.loc[start:stop].ffill()
    return filled

71 

72 

def valid(ts) -> bool:
    """Tell whether a time series is free of gaps between its first and last valid indices.

    NaN/null values at the very beginning or the very end of the series are
    acceptable; any NaN/null value in the middle makes the series invalid.

    Parameters
    ----------
    ts : pd.Series or pl.Series
        The time series to check

    Returns
    -------
    bool
        True if there are no missing values between the first and last valid indices,
        False otherwise

    Raises
    ------
    TypeError
        If ``ts`` is neither a pandas nor a polars Series

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> ts1 = pd.Series([np.nan, 1, 2, 3, np.nan])  # NaNs only at beginning and end
    >>> valid(ts1)
    True
    >>> ts2 = pd.Series([1, 2, np.nan, 4, 5])  # NaN in the middle
    >>> valid(ts2)
    False

    """
    # Reject anything that is not one of the two supported series types
    if not isinstance(ts, pd.Series | pl.Series):
        raise TypeError(f"Expected pd.Series or pl.Series, got {type(ts)}")

    # Polars input is delegated to the polars-specific implementation
    if isinstance(ts, pl.Series):
        return valid_pl(ts)

    # Interpolation only adds values where an interior gap existed, so the
    # non-missing labels before and after interpolation coincide exactly when
    # the series has no gap in the middle.
    observed_labels = ts.dropna().index
    filled_labels = interpolate(ts).dropna().index
    return observed_labels.equals(filled_labels)

112 

113 

def interpolate_pl(ts: pl.Series) -> pl.Series:
    """Interpolate missing values in a polars time series between the first and last valid indices.

    This function fills forward (ffill) missing values in a time series, but only
    between the first and last valid indices. Values outside this range remain null.
    The name and dtype of the input series are preserved.

    Parameters
    ----------
    ts : pl.Series
        The time series to interpolate

    Returns
    -------
    pl.Series
        The interpolated time series. A series without any non-null value
        is returned as is.

    Examples
    --------
    >>> import polars as pl
    >>> ts = pl.Series([1, None, None, 4, 5])
    >>> interpolate_pl(ts)
    shape: (5,)
    Series: '' [i64]
    [
        1
        1
        1
        4
        5
    ]

    """
    # Positions of all non-null entries
    non_null_indices = ts.is_not_null().arg_true()

    if len(non_null_indices) == 0:
        return ts

    last = non_null_indices[-1]

    # Forward-filling the prefix that ends at the last valid position leaves
    # leading nulls untouched (there is nothing to propagate yet), while the
    # trailing nulls are kept out of the fill and re-attached unchanged.
    head = ts.head(last + 1).forward_fill()
    tail = ts.slice(last + 1)

    # Concatenating native slices keeps the name and dtype of the input
    return pl.concat([head, tail])

168 

169 

def valid_pl(ts: pl.Series) -> bool:
    """Check if a polars time series has no missing values between the first and last valid indices.

    This function verifies that a time series doesn't have any null values in the middle.
    It's acceptable to have nulls at the beginning or end of the series.

    Parameters
    ----------
    ts : pl.Series
        The time series to check

    Returns
    -------
    bool
        True if the time series has no missing values between the first and last valid indices,
        False otherwise. A series with at most one non-null value is always valid.

    Examples
    --------
    >>> import polars as pl
    >>> ts1 = pl.Series([None, 1, 2, 3, None])  # Nulls only at beginning and end
    >>> valid_pl(ts1)
    True
    >>> ts2 = pl.Series([1, 2, None, 4, 5])  # Null in the middle
    >>> valid_pl(ts2)
    False

    """
    # Positions of all non-null entries
    non_null_indices = ts.is_not_null().arg_true()

    # Zero or one valid value cannot enclose a gap
    if len(non_null_indices) <= 1:
        return True

    first = non_null_indices[0]
    last = non_null_indices[-1]

    # Every non-null position lies in [first, last] by construction, so the
    # series is gap-free exactly when the number of non-null entries equals
    # the size of that range.
    return len(non_null_indices) == last - first + 1

212 

213 

def interpolate_df_pl(df: pl.DataFrame) -> pl.DataFrame:
    """Forward-fill the interior gaps of every column of a polars DataFrame.

    Each column is passed through interpolate_pl independently, so missing
    values are filled forward only between the first and last valid indices
    of that column. Values outside this range remain null.

    Parameters
    ----------
    df : pl.DataFrame
        The DataFrame to interpolate

    Returns
    -------
    pl.DataFrame
        The interpolated DataFrame

    Examples
    --------
    >>> import polars as pl
    >>> df = pl.DataFrame({
    ...     'A': [1.0, None, None, 4.0, 5.0],
    ...     'B': [None, 2.0, None, 4.0, None]
    ... })
    >>> interpolate_df_pl(df)
    shape: (5, 2)
    ┌─────┬──────┐
    │ A   ┆ B    │
    │ --- ┆ ---  │
    │ f64 ┆ f64  │
    ╞═════╪══════╡
    │ 1.0 ┆ null │
    │ 1.0 ┆ 2.0  │
    │ 1.0 ┆ 2.0  │
    │ 4.0 ┆ 4.0  │
    │ 5.0 ┆ null │
    └─────┴──────┘

    """
    # Rebuild the frame column by column, keyed by the original column names
    return pl.DataFrame({name: interpolate_pl(df[name]) for name in df.columns})

259 

260 

def valid_df_pl(df: pl.DataFrame) -> bool:
    """Tell whether every column of a polars DataFrame is free of interior gaps.

    A column passes when it has no null values between its first and last
    valid indices. Nulls at the beginning or end of a column are acceptable.

    Parameters
    ----------
    df : pl.DataFrame
        The DataFrame to check

    Returns
    -------
    bool
        True if all columns in the DataFrame have no missing values between their first and last valid indices,
        False otherwise

    Examples
    --------
    >>> import polars as pl
    >>> df1 = pl.DataFrame({
    ...     'A': [None, 1, 2, 3, None],  # Nulls only at beginning and end
    ...     'B': [None, 2, 3, 4, None]   # Nulls only at beginning and end
    ... })
    >>> valid_df_pl(df1)
    True
    >>> df2 = pl.DataFrame({
    ...     'A': [1, 2, None, 4, 5],  # Null in the middle
    ...     'B': [1, 2, 3, 4, 5]      # No nulls
    ... })
    >>> valid_df_pl(df2)
    False

    """
    # all() stops at the first column that fails the check
    return all(valid_pl(df[name]) for name in df.columns)