Coverage for src/cvxsimulator/utils/interpolation.py: 100%

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Interpolation utilities for time series data.

This module provides functions for interpolating missing values in time series
and validating that time series don't have missing values in the middle.
"""

20import pandas as pd

21import polars as pl

def interpolate(ts):
    """Forward-fill missing values between the first and last valid observations.

    Gaps strictly inside the observed span are filled with the most recent
    valid value. Missing values before the first or after the last valid
    observation are left untouched.

    The fill window is located by position rather than by index label, so the
    result is correct for series whose index contains duplicate or unsorted
    labels.

    Parameters
    ----------
    ts : pd.Series or pl.Series
        The time series to interpolate. Polars input is delegated to
        ``interpolate_pl``.

    Returns
    -------
    pd.Series or pl.Series
        The interpolated time series. A pandas input that contains no valid
        value at all is returned as-is (same object); otherwise a new series
        is returned and the input is not modified.

    Raises
    ------
    TypeError
        If ``ts`` is neither a pandas nor a polars Series.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> ts = pd.Series([1, np.nan, np.nan, 4, 5])
    >>> interpolate(ts)
    0    1.0
    1    1.0
    2    1.0
    3    4.0
    4    5.0
    dtype: float64

    """
    if not isinstance(ts, pd.Series):
        # Polars series have their own implementation; anything else is rejected.
        if isinstance(ts, pl.Series):
            return interpolate_pl(ts)
        raise TypeError(f"Expected pd.Series or pl.Series, got {type(ts)}")

    observed = ts.notna().to_numpy()
    if not observed.any():
        # No valid value (or empty series): there is nothing to fill.
        return ts

    positions = observed.nonzero()[0]
    start = int(positions[0])
    stop = int(positions[-1]) + 1

    result = ts.copy()
    # Positional assignment of raw values: no label alignment is involved, so
    # duplicate or non-monotonic index labels cannot widen or break the window.
    result.iloc[start:stop] = ts.iloc[start:stop].ffill().to_numpy()
    return result

def valid(ts) -> bool:
    """Check that a time series has no missing values between its first and last valid observations.

    Leading and trailing NaNs/nulls are acceptable; only gaps strictly inside
    the observed span make the series invalid. A series with no valid value at
    all (including an empty one) is considered valid.

    The check is positional and therefore independent of the index labels, so
    duplicate or unsorted labels do not affect the answer.

    Parameters
    ----------
    ts : pd.Series or pl.Series
        The time series to check. Polars input is delegated to ``valid_pl``.

    Returns
    -------
    bool
        True if the time series has no missing values between the first and last valid indices,
        False otherwise

    Raises
    ------
    TypeError
        If ``ts`` is neither a pandas nor a polars Series.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> ts1 = pd.Series([np.nan, 1, 2, 3, np.nan])  # NaNs only at beginning and end
    >>> valid(ts1)
    True
    >>> ts2 = pd.Series([1, 2, np.nan, 4, 5])  # NaN in the middle
    >>> valid(ts2)
    False

    """
    if not isinstance(ts, pd.Series):
        # Polars series have their own implementation; anything else is rejected.
        if isinstance(ts, pl.Series):
            return valid_pl(ts)
        raise TypeError(f"Expected pd.Series or pl.Series, got {type(ts)}")

    observed = ts.notna().to_numpy()
    positions = observed.nonzero()[0]
    if positions.size == 0:
        # Nothing observed means there is no interior in which a gap could sit.
        return True

    # Every position from the first to the last observation must itself be observed.
    return bool(observed[positions[0] : positions[-1] + 1].all())

112

113

def interpolate_pl(ts: pl.Series) -> pl.Series:
    """Forward-fill nulls in a polars series between its first and last valid values.

    Nulls strictly inside the observed span are replaced by the most recent
    non-null value. Nulls before the first or after the last non-null value
    remain null. The name and dtype of the input series are carried over to
    the result.

    Parameters
    ----------
    ts : pl.Series
        The time series to interpolate

    Returns
    -------
    pl.Series
        The interpolated time series. If the input is empty or entirely null
        it is returned unchanged.

    Examples
    --------
    >>> import polars as pl
    >>> ts = pl.Series([1, None, None, 4, 5])
    >>> interpolate_pl(ts)
    shape: (5,)
    Series: '' [i64]
    [
        1
        1
        1
        4
        5
    ]

    """
    # Empty or all-null input: there is no observed span to fill.
    if ts.null_count() == len(ts):
        return ts

    # A forward fill never touches leading nulls (nothing precedes them), but
    # it does spill the last value into the trailing nulls.
    forward = ts.forward_fill()

    # A backward fill reaches exactly the positions at or before the last
    # non-null value, so this mask is False only on the trailing nulls.
    up_to_last_valid = ts.backward_fill().is_not_null()

    # Keep the filled values up to the last valid position and fall back to the
    # original (null) entries afterwards.
    return forward.zip_with(up_to_last_valid, ts)

168

169

def valid_pl(ts: pl.Series) -> bool:
    """Check that a polars series has no nulls between its first and last valid values.

    Leading and trailing nulls are acceptable; only nulls strictly inside the
    observed span make the series invalid. A series with at most one non-null
    value is always valid.

    Parameters
    ----------
    ts : pl.Series
        The time series to check

    Returns
    -------
    bool
        True if the time series has no missing values between the first and last valid indices,
        False otherwise

    Examples
    --------
    >>> import polars as pl
    >>> ts1 = pl.Series([None, 1, 2, 3, None])  # Nulls only at beginning and end
    >>> valid_pl(ts1)
    True
    >>> ts2 = pl.Series([1, 2, None, 4, 5])  # Null in the middle
    >>> valid_pl(ts2)
    False

    """
    # Positions of the non-null values, in ascending order
    non_null_indices = ts.is_not_null().arg_true()
    observed = len(non_null_indices)

    if observed <= 1:
        return True

    # The observed positions are gap-free exactly when there are as many of
    # them as there are slots between the first and the last one (inclusive).
    span = non_null_indices[-1] - non_null_indices[0] + 1
    return observed == span

212

213

def interpolate_df_pl(df: pl.DataFrame) -> pl.DataFrame:
    """Forward-fill nulls column by column, each within its own valid span.

    Every column is passed through ``interpolate_pl`` independently: nulls
    between a column's first and last non-null value are filled forward,
    while nulls outside that range remain null.

    Parameters
    ----------
    df : pl.DataFrame
        The DataFrame to interpolate

    Returns
    -------
    pl.DataFrame
        A new DataFrame with the same column names, holding the interpolated columns

    Examples
    --------
    >>> import polars as pl
    >>> df = pl.DataFrame({
    ...     'A': [1.0, None, None, 4.0, 5.0],
    ...     'B': [None, 2.0, None, 4.0, None]
    ... })
    >>> interpolate_df_pl(df)
    shape: (5, 2)
    ┌─────┬──────┐
    │ A   ┆ B    │
    │ --- ┆ ---  │
    │ f64 ┆ f64  │
    ╞═════╪══════╡
    │ 1.0 ┆ null │
    │ 1.0 ┆ 2.0  │
    │ 1.0 ┆ 2.0  │
    │ 4.0 ┆ 4.0  │
    │ 5.0 ┆ null │
    └─────┴──────┘

    """
    # Map each column name to its interpolated series, preserving column order
    filled_columns = {name: interpolate_pl(df[name]) for name in df.columns}
    return pl.DataFrame(filled_columns)

259

260

def valid_df_pl(df: pl.DataFrame) -> bool:
    """Check that no column of a polars DataFrame has nulls inside its valid span.

    Each column is checked with ``valid_pl``: nulls at the beginning or end of
    a column are acceptable, nulls between its first and last non-null value
    are not. Checking stops at the first column that fails.

    Parameters
    ----------
    df : pl.DataFrame
        The DataFrame to check

    Returns
    -------
    bool
        True if all columns in the DataFrame have no missing values between their first and last valid indices,
        False otherwise

    Examples
    --------
    >>> import polars as pl
    >>> df1 = pl.DataFrame({
    ...     'A': [None, 1, 2, 3, None],  # Nulls only at beginning and end
    ...     'B': [None, 2, 3, 4, None]   # Nulls only at beginning and end
    ... })
    >>> valid_df_pl(df1)
    True
    >>> df2 = pl.DataFrame({
    ...     'A': [1, 2, None, 4, 5],  # Null in the middle
    ...     'B': [1, 2, 3, 4, 5]      # No nulls
    ... })
    >>> valid_df_pl(df2)
    False

    """
    # all() short-circuits on the first invalid column
    return all(valid_pl(df[name]) for name in df.columns)

299

300 return True