In [1]:
import pandas as pd
import numpy as np
import hvplot.pandas
Comparing Time Series with Different Time Steps¶
In [ ]:
ROOT = ""
station = "abur"  # or "abed"
sensor = "rad"    # or "bub"
def load_ts(station, sensor):
    # Load the observed and simulated time series for one station, as named pandas Series
    obs = pd.read_parquet(ROOT + f"data/obs/{station}_{sensor}.parquet")
    sim = pd.read_parquet(ROOT + f"data/models/seareport-v2.2/{station}.parquet")
    obs = pd.Series(obs[obs.columns[0]], name="obs")
    sim = pd.Series(sim[sim.columns[0]], name="sim")
    sim = sim.sort_index()
    return sim, obs
sim, obs = load_ts(station, sensor)
Let's test the different approaches on one week of data: that's more than enough to compare them.
In [3]:
YEAR = 2022
sim_subset = sim.loc[f"{YEAR}-09-13":f"{YEAR}-09-20"]
obs_subset = obs.loc[f"{YEAR}-09-13":f"{YEAR}-09-20"]
In [4]:
sim.sort_index().index.diff().dropna().value_counts()
obs.sort_index().index.diff().dropna().value_counts()
Out[4]:
0 days 00:06:40    236736
0 days 00:00:00         2
Name: count, dtype: int64
Out[4]:
time
0 days 00:01:00    1563040
0 days 00:11:00        351
0 days 00:21:00         35
0 days 00:07:00         22
0 days 00:31:00          9
0 days 01:01:00          6
0 days 00:08:00          6
0 days 00:41:00          5
0 days 00:51:00          4
0 days 00:02:00          3
0 days 04:51:00          2
0 days 01:11:00          2
0 days 01:31:00          2
0 days 00:04:00          2
0 days 01:21:00          2
0 days 00:06:00          1
0 days 08:51:00          1
4 days 03:01:00          1
0 days 00:12:00          1
0 days 01:51:00          1
0 days 00:03:00          1
0 days 03:41:00          1
Name: count, dtype: int64
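So the model is on a regular 6 min 40 s step, while the gauge is nominally 1-minute data with occasional gaps. A compact way to extract the dominant step of each series directly from its index (a small sketch using only the series loaded above; the median is insensitive to the isolated gaps listed in the value_counts) is:
In [ ]:
# Median of the index differences: the typical time step of each series
sim_step = sim.index.to_series().diff().median()   # 0 days 00:06:40, per the output above
obs_step = obs.index.to_series().diff().median()   # 0 days 00:01:00, per the output above
print(f"model step: {sim_step}, observation step: {obs_step}")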
In [5]:
(obs_subset.hvplot()*sim_subset.hvplot()).opts(width=1300, height=800, title = f"simple comparison: model vs. observation, station: '{station}'")
Out[5]:
Method 1. Nearest-Neighbor Alignment¶
Method: for each model timestamp, find the closest observation timestamp within a defined tolerance window.
Advantages:
- No interpolation (no fictional data)
- Preserves actual observation values
Cons:
- No control over the aligned signal, which can result in:
  - missed peaks
  - or even a missed trend (if the signal is noisy, the aligned series looks chaotic)
In [6]:
aligned_data = pd.merge_asof(
    sim_subset, obs_subset,
    left_index=True, right_index=True,
    tolerance=pd.Timedelta('2min'),  # maximum distance between a model and an observation timestamp
    direction='nearest'
)
aligned_data = aligned_data.rename(columns={"obs": "obs_aligned"})
aligned_data.sort_index().index.diff().dropna().value_counts()
(obs_subset.hvplot() * aligned_data.hvplot()).opts(width=1300, height=800, title=f"Method 1. Nearest-Neighbor Alignment, station: '{station}'")
Out[6]:
0 days 00:06:40    1727
Name: count, dtype: int64
Out[6]:
Not interesting enough to be considered further.
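That judgement can be backed up with a quick check (a sketch reusing aligned_data from the cell above, assuming the sim/obs_aligned column names produced there): compare the observed peak with the peak that survives the alignment, and count the model timestamps that found no observation within the 2-minute tolerance.
In [ ]:
peak_raw = obs_subset.max()
peak_aligned = aligned_data["obs_aligned"].max()
unmatched = aligned_data["obs_aligned"].isna().sum()  # model points with no observation within tolerance
print(f"observed peak: {peak_raw:.3f}, aligned peak: {peak_aligned:.3f}, unmatched model points: {unmatched}")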
Method 2. Window-Based Aggregation¶
Method: use the model timestamps as reference points and aggregate the observations within a window centred on each of them (a simpler resample-based variant is sketched just after this list).
Advantages:
- We can choose the aggregation statistic (e.g. max or mean)
Cons:
- We drop some maxima (those falling outside the averaging window) if the window size is not well chosen
- We end up losing information, because data points are dropped
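Since the model grid here is perfectly regular (6 min 40 s = 400 s), a simpler variant of the same idea is a plain resample of the observations onto that grid. The sketch below assumes a constant model step; the rolling-window implementation in the next cell is more general because it tolerates gaps and irregular model steps.
In [ ]:
# Hedged alternative: bin the observations onto the regular 400 s model grid.
# 'origin' anchors the bins on the first model timestamp so the bin labels match model times.
obs_binned = obs_subset.resample("400s", origin=sim_subset.index[0]).max()
obs_on_model_grid = obs_binned.reindex(sim_subset.index, method="nearest", tolerance=pd.Timedelta("1s"))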
In [7]:
def aggregate(sim, obs, window_size='7min'):
    # Build a union index so the rolling window sees the observations at their true timestamps
    all_times = pd.DatetimeIndex(sorted(set(sim.index) | set(obs.index)))
    full_obs = pd.Series(np.nan, index=all_times)
    full_obs.loc[obs.index] = obs
    window = pd.Timedelta(window_size)
    rolling_stats = pd.DataFrame({
        'obs_mean': full_obs.rolling(window=window, center=True).mean(),
        'obs_max': full_obs.rolling(window=window, center=True).max(),
        'obs_count': full_obs.rolling(window=window, center=True).count(),
    })
    # Keep only the model timestamps, dropping those with no observation inside the window
    result = pd.DataFrame({'sim': sim})
    result = result.join(rolling_stats)
    result = result[result['obs_count'] > 0].copy()
    return result
df1 = aggregate(sim_subset, obs_subset)
df1
Out[7]:
 | sim | obs_mean | obs_max | obs_count |
---|---|---|---|---|
2022-09-13 00:00:00 | 0.075297 | 0.081856 | 0.086098 | 4.0 |
2022-09-13 00:06:40 | 0.077474 | 0.085185 | 0.094198 | 7.0 |
2022-09-13 00:13:20 | 0.079457 | 0.106724 | 0.113791 | 7.0 |
2022-09-13 00:20:00 | 0.081009 | 0.097287 | 0.112626 | 7.0 |
2022-09-13 00:26:40 | 0.082073 | 0.085271 | 0.100874 | 7.0 |
... | ... | ... | ... | ... |
2022-09-20 23:26:40 | 0.060263 | 0.100853 | 0.115913 | 7.0 |
2022-09-20 23:33:20 | 0.060033 | 0.119736 | 0.124353 | 7.0 |
2022-09-20 23:40:00 | 0.060690 | 0.117228 | 0.122574 | 7.0 |
2022-09-20 23:46:40 | 0.062372 | 0.104759 | 0.108953 | 7.0 |
2022-09-20 23:53:20 | 0.064460 | 0.097619 | 0.099798 | 7.0 |
1726 rows × 4 columns
In [8]:
df2 = aggregate(sim_subset, obs_subset, window_size="2min")
df2
Out[8]:
 | sim | obs_mean | obs_max | obs_count |
---|---|---|---|---|
2022-09-13 00:00:00 | 0.075297 | 0.083636 | 0.086098 | 2.0 |
2022-09-13 00:06:40 | 0.077474 | 0.081077 | 0.083344 | 2.0 |
2022-09-13 00:13:20 | 0.079457 | 0.109276 | 0.113236 | 2.0 |
2022-09-13 00:20:00 | 0.081009 | 0.095198 | 0.100256 | 2.0 |
2022-09-13 00:26:40 | 0.082073 | 0.082142 | 0.084789 | 2.0 |
... | ... | ... | ... | ... |
2022-09-20 23:26:40 | 0.060263 | 0.099891 | 0.102171 | 2.0 |
2022-09-20 23:33:20 | 0.060033 | 0.119389 | 0.120241 | 2.0 |
2022-09-20 23:40:00 | 0.060690 | 0.114853 | 0.117292 | 2.0 |
2022-09-20 23:46:40 | 0.062372 | 0.106439 | 0.108953 | 2.0 |
2022-09-20 23:53:20 | 0.064460 | 0.095060 | 0.095505 | 2.0 |
1726 rows × 4 columns
In [9]:
(obs_subset.hvplot()
* sim_subset.hvplot()
* df1.obs_max.hvplot(label = "obs_max window: 7min")
* df2.obs_max.hvplot(label = "obs_max window: 2min")
).opts(width=1300, height=800, title = f"Method 2. Window-Based Aggregation, station: '{station}'")
Out[9]:
Method 3. Interpolating the model on the observed TS index¶
Advantages:
- No observation data is dropped
Cons:
- We create fictional data (for the model TS)
- May be computationally heavy if the observed signal has a high sampling rate (a compact reindex-based sketch of this method follows this list)
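For reference, the same idea can be written more compactly with reindex plus time interpolation. This is only a sketch (it assumes unique timestamps and uses linear-in-time interpolation), whereas the helper defined in the next cell guards against duplicated indices and uses cubic interpolation:
In [ ]:
# Interpolate the model onto the union of both indices, then keep only observation timestamps
union_index = sim_subset.index.union(obs_subset.index)
sim_on_obs_index = (
    sim_subset.reindex(union_index)
              .interpolate(method="time")   # linear in time between model points
              .reindex(obs_subset.index)
)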
In [10]:
def sim_on_obs(sim, obs):
    # Drop exact duplicate values; duplicated timestamps are handled after the merge
    sim = sim.drop_duplicates()
    obs = obs.drop_duplicates()
    df = pd.merge(sim, obs, left_index=True, right_index=True, how='outer')
    df = df.drop_duplicates()
    df = df[~df.index.duplicated(keep='first')]
    print("merged df:")
    print(df.iloc[:30])
    # Fill the model column on the (denser) union index, then keep only observation timestamps
    df['sim'] = df['sim'].interpolate(method="cubic", limit_direction="both")
    df = df.dropna(subset=['obs'])
    return df
df = sim_on_obs(sim_subset, obs_subset)
print("final df: ")
df
merged df:
                          sim       obs
2022-09-13 00:00:00  0.075297  0.086098
2022-09-13 00:01:00       NaN  0.081174
2022-09-13 00:02:00       NaN  0.082495
2022-09-13 00:03:00       NaN  0.077659
2022-09-13 00:04:00       NaN  0.079066
2022-09-13 00:05:00       NaN  0.080416
2022-09-13 00:06:00       NaN  0.078809
2022-09-13 00:06:40  0.077474       NaN
2022-09-13 00:07:00       NaN  0.083344
2022-09-13 00:08:00       NaN  0.087921
2022-09-13 00:09:00       NaN  0.092539
2022-09-13 00:10:00       NaN  0.094198
2022-09-13 00:11:00       NaN  0.101897
2022-09-13 00:12:00       NaN  0.106637
2022-09-13 00:13:00       NaN  0.105317
2022-09-13 00:13:20  0.079457       NaN
2022-09-13 00:14:00       NaN  0.113236
2022-09-13 00:15:00       NaN  0.111994
2022-09-13 00:16:00       NaN  0.113791
2022-09-13 00:17:00       NaN  0.112626
2022-09-13 00:18:00       NaN  0.108499
2022-09-13 00:19:00       NaN  0.101309
2022-09-13 00:20:00  0.081009  0.100256
2022-09-13 00:21:00       NaN  0.090140
2022-09-13 00:22:00       NaN  0.086060
2022-09-13 00:23:00       NaN  0.082116
2022-09-13 00:24:00       NaN  0.081208
2022-09-13 00:25:00       NaN  0.071134
2022-09-13 00:26:00       NaN  0.079494
2022-09-13 00:26:40  0.082073       NaN
final df:
Out[10]:
 | sim | obs |
---|---|---|
2022-09-13 00:00:00 | 0.075297 | 0.086098 |
2022-09-13 00:01:00 | 0.075626 | 0.081174 |
2022-09-13 00:02:00 | 0.075956 | 0.082495 |
2022-09-13 00:03:00 | 0.076285 | 0.077659 |
2022-09-13 00:04:00 | 0.076613 | 0.079066 |
... | ... | ... |
2022-09-20 23:55:00 | NaN | 0.099470 |
2022-09-20 23:56:00 | NaN | 0.097208 |
2022-09-20 23:57:00 | NaN | 0.104120 |
2022-09-20 23:58:00 | NaN | 0.107905 |
2022-09-20 23:59:00 | NaN | 0.111764 |
11498 rows × 2 columns
In [11]:
sim_, obs_ = df["sim"], df["obs"]
(obs_ == obs_subset).all()
Out[11]:
np.True_
In [12]:
(obs_.hvplot() * sim_.hvplot(label='model: interpolated') * sim_subset.hvplot(label='model: original')).opts(
width=1300, height=800, title = f"Method 3. Interpolating model on observation TS, station: '{station}'")
Out[12]:
Now let's test with a station whose sampling rate is lower than the simulation's:
In [13]:
station = "abed"
sensor = "bub"
sim, obs = load_ts(station, sensor)
sim.sort_index().index.diff().dropna().value_counts()
obs.sort_index().index.diff().dropna().value_counts()
YEAR = 2024
sim_subset = sim.loc[f"{YEAR}-01-22":f"{YEAR}-02-01"]
obs_subset = obs.loc[f"{YEAR}-01-22":f"{YEAR}-02-01"]
Out[13]:
0 days 00:06:40    236736
0 days 00:00:00         2
Name: count, dtype: int64
Out[13]:
time
0 days 00:15:00    103752
0 days 00:30:00       181
0 days 00:45:00        41
0 days 01:00:00        11
0 days 01:30:00        10
0 days 01:15:00         9
0 days 01:45:00         5
0 days 02:15:00         4
0 days 02:00:00         3
0 days 03:15:00         3
0 days 04:15:00         2
0 days 04:00:00         1
0 days 03:45:00         1
0 days 17:15:00         1
3 days 06:30:00         1
0 days 06:15:00         1
0 days 02:45:00         1
0 days 03:00:00         1
0 days 23:15:00         1
0 days 02:30:00         1
Name: count, dtype: int64
In [14]:
df = aggregate(sim_subset, obs_subset)
df
(obs_subset.hvplot()
* sim_subset.hvplot()
* df.obs_max.hvplot(label = "obs_max window: 7min")
).opts(width=1300, height=800, title = f"Method 2. Window-Based Aggregation, station: '{station}'")
Out[14]:
 | sim | obs_mean | obs_max | obs_count |
---|---|---|---|---|
2024-01-22 00:00:00 | -0.081508 | -0.072578 | -0.072578 | 1.0 |
2024-01-22 00:13:20 | -0.084692 | -0.035246 | -0.035246 | 1.0 |
2024-01-22 00:26:40 | -0.084821 | -0.088447 | -0.088447 | 1.0 |
2024-01-22 00:33:20 | -0.081714 | -0.088447 | -0.088447 | 1.0 |
2024-01-22 00:46:40 | -0.069040 | -0.112014 | -0.112014 | 1.0 |
... | ... | ... | ... | ... |
2024-02-01 23:00:00 | -0.100931 | -0.135878 | -0.135878 | 1.0 |
2024-02-01 23:13:20 | -0.093476 | -0.124947 | -0.124947 | 1.0 |
2024-02-01 23:26:40 | -0.083951 | -0.080565 | -0.080565 | 1.0 |
2024-02-01 23:33:20 | -0.078872 | -0.080565 | -0.080565 | 1.0 |
2024-02-01 23:46:40 | -0.069120 | -0.094202 | -0.094202 | 1.0 |
1320 rows × 4 columns
Out[14]:
In [15]:
df = sim_on_obs(sim_subset, obs_subset)
sim_, obs_ = df["sim"], df["obs"]
(obs_.hvplot() * sim_.hvplot(label='model: interpolated') * sim_subset.hvplot(label='model: original')).opts(
width=1300, height=800, title = f"Method 3. Interpolating model on observation TS, station: '{station}'")
merged df:
                          sim       obs
2024-01-22 00:00:00 -0.081508 -0.072578
2024-01-22 00:06:40 -0.083077       NaN
2024-01-22 00:13:20 -0.084692       NaN
2024-01-22 00:15:00       NaN -0.035246
2024-01-22 00:20:00 -0.085591       NaN
2024-01-22 00:26:40 -0.084821       NaN
2024-01-22 00:30:00       NaN -0.088447
2024-01-22 00:33:20 -0.081714       NaN
2024-01-22 00:40:00 -0.076266       NaN
2024-01-22 00:45:00       NaN -0.112014
2024-01-22 00:46:40 -0.069040       NaN
2024-01-22 00:53:20 -0.060762       NaN
2024-01-22 01:00:00 -0.052036 -0.049075
2024-01-22 01:06:40 -0.043210       NaN
2024-01-22 01:13:20 -0.034527       NaN
2024-01-22 01:15:00       NaN -0.049939
2024-01-22 01:20:00 -0.026361       NaN
2024-01-22 01:26:40 -0.019113       NaN
2024-01-22 01:30:00       NaN -0.018956
2024-01-22 01:33:20 -0.012782       NaN
2024-01-22 01:40:00 -0.006738       NaN
2024-01-22 01:45:00       NaN -0.021372
2024-01-22 01:46:40 -0.000075       NaN
2024-01-22 01:53:20  0.007859       NaN
2024-01-22 02:00:00  0.017258  0.032797
2024-01-22 02:06:40  0.028108       NaN
2024-01-22 02:13:20  0.040120       NaN
2024-01-22 02:15:00       NaN -0.029174
2024-01-22 02:20:00  0.052665       NaN
2024-01-22 02:26:40  0.065018       NaN
Out[15]:
Compare stats for the whole series
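For reference, the headline metrics in the tables below are assumed to follow the usual definitions (hedged: seastats is taken to use these conventional formulas) for model values $m_i$ and observations $o_i$, $i = 1, \dots, N$:

$$\mathrm{bias} = \frac{1}{N}\sum_{i=1}^{N}(m_i - o_i), \qquad \mathrm{RMSE} = \sqrt{\frac{1}{N}\sum_{i=1}^{N}(m_i - o_i)^2}$$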
In [16]:
station = "abur"
sensor = "rad"
sim, obs = load_ts(station, sensor)
In [17]:
import seastats
# Method 3: interpolate the model on the observation index
df = sim_on_obs(sim, obs)
stats = seastats.get_stats(df["sim"], df["obs"], quantile=0.0)
ext3 = seastats.storms.match_extremes(df["sim"], df["obs"], quantile=0.99)
m3 = pd.DataFrame(stats, index=["method 3"])
# Method 2: window-based aggregation of the observations
df = aggregate(sim, obs)
stats = seastats.get_stats(df["sim"], df["obs_max"], quantile=0.0)
ext2 = seastats.storms.match_extremes(df["sim"], df["obs_max"], quantile=0.99)
m2 = pd.DataFrame(stats, index=["method 2"])
pd.concat([m2, m3])
merged df:
                          sim       obs
2022-01-01 00:00:00  0.000000       NaN
2022-01-01 00:01:00       NaN -0.114223
2022-01-01 00:02:00       NaN -0.138153
2022-01-01 00:03:00       NaN -0.159002
2022-01-01 00:04:00       NaN -0.158472
2022-01-01 00:05:00       NaN -0.173261
2022-01-01 00:06:00       NaN -0.181971
2022-01-01 00:06:40 -0.001448       NaN
2022-01-01 00:07:00       NaN -0.175402
2022-01-01 00:08:00       NaN -0.168954
2022-01-01 00:09:00       NaN -0.168528
2022-01-01 00:10:00       NaN -0.156023
2022-01-01 00:11:00       NaN -0.146540
2022-01-01 00:12:00       NaN -0.152280
2022-01-01 00:13:00       NaN -0.145942
2022-01-01 00:13:20 -0.003688       NaN
2022-01-01 00:14:00       NaN -0.139627
2022-01-01 00:15:00       NaN -0.145435
2022-01-01 00:16:00       NaN -0.145167
2022-01-01 00:17:00       NaN -0.141923
2022-01-01 00:18:00       NaN -0.144803
2022-01-01 00:19:00       NaN -0.141607
2022-01-01 00:20:00 -0.007483 -0.150636
2022-01-01 00:21:00       NaN -0.153589
2022-01-01 00:22:00       NaN -0.156568
2022-01-01 00:23:00       NaN -0.159573
2022-01-01 00:24:00       NaN -0.162603
2022-01-01 00:25:00       NaN -0.162659
2022-01-01 00:26:00       NaN -0.150442
2022-01-01 00:26:40 -0.016760       NaN
Out[17]:
 | R1 | R3 | bias | cr | error | kge | nse | rms | rmse |
---|---|---|---|---|---|---|---|---|---|
method 2 | 0.549898 | 0.604785 | -0.048054 | 0.349005 | 0.604785 | 0.139450 | -0.506296 | 0.094535 | 0.106048 |
method 3 | 0.549674 | 0.604614 | -0.030002 | 0.366957 | 0.604614 | 0.271801 | -0.303839 | 0.092457 | 0.097203 |
In [18]:
ext2.iloc[:5] # method 2
Out[18]:
time observed | observed | model | time model | diff | error | error_norm | tdiff |
---|---|---|---|---|---|---|---|
2022-09-18 07:46:40 | 1.147230 | 0.597332 | 2022-09-18 09:00:00 | -0.549898 | 0.549898 | 0.479326 | 1.222222 |
2022-01-15 16:26:40 | 0.589841 | -0.069832 | 2022-01-16 00:06:40 | -0.659673 | 0.659673 | 1.118391 | 7.666667 |
2023-08-08 14:06:40 | 0.472867 | 0.306800 | 2023-08-08 12:46:40 | -0.166067 | 0.166067 | 0.351192 | -1.333333 |
2024-02-04 22:40:00 | 0.453307 | 0.041071 | 2024-02-05 00:06:40 | -0.412236 | 0.412236 | 0.909398 | 1.444444 |
2024-03-25 19:40:00 | 0.432861 | 0.085566 | 2024-03-25 23:40:00 | -0.347295 | 0.347295 | 0.802324 | 4.000000 |
In [19]:
ext3.iloc[:5] # method 3
Out[19]:
time observed | observed | model | time model | diff | error | error_norm | tdiff |
---|---|---|---|---|---|---|---|
2022-09-18 07:45:00 | 1.147230 | 0.597556 | 2022-09-18 08:59:00 | -0.549674 | 0.549674 | 0.479132 | 1.233333 |
2022-01-15 16:29:00 | 0.589841 | -0.069713 | 2022-01-16 00:10:00 | -0.659553 | 0.659553 | 1.118189 | 7.683333 |
2023-08-08 14:10:00 | 0.472867 | 0.307113 | 2023-08-08 12:49:00 | -0.165754 | 0.165754 | 0.350530 | -1.350000 |
2024-02-04 22:41:00 | 0.453307 | 0.041166 | 2024-02-05 00:05:00 | -0.412141 | 0.412141 | 0.909187 | 1.400000 |
2024-03-25 19:38:00 | 0.432861 | 0.085666 | 2024-03-25 23:38:00 | -0.347195 | 0.347195 | 0.802094 | 4.000000 |
In [20]:
station = "abed"
sensor = "bub"
sim, obs = load_ts(station, sensor)
In [21]:
import seastats
# Method 3: interpolate the model on the observation index
df = sim_on_obs(sim, obs)
stats = seastats.get_stats(df["sim"], df["obs"], quantile=0.0)
ext3 = seastats.storms.match_extremes(df["sim"], df["obs"], quantile=0.99)
m3 = pd.DataFrame(stats, index=["method 3"])
# Method 2: window-based aggregation of the observations
df = aggregate(sim, obs)
stats = seastats.get_stats(df["sim"], df["obs_max"], quantile=0.0)
ext2 = seastats.storms.match_extremes(df["sim"], df["obs_max"], quantile=0.99)
m2 = pd.DataFrame(stats, index=["method 2"])
pd.concat([m2, m3])
merged df:
                          sim       obs
2022-01-01 00:00:00  0.000000       NaN
2022-01-01 00:06:40  0.000863       NaN
2022-01-01 00:13:20  0.002603       NaN
2022-01-01 00:15:00       NaN -0.094530
2022-01-01 00:20:00  0.004427       NaN
2022-01-01 00:26:40  0.006277       NaN
2022-01-01 00:30:00       NaN -0.101112
2022-01-01 00:33:20  0.008207       NaN
2022-01-01 00:40:00  0.010070       NaN
2022-01-01 00:45:00       NaN -0.083943
2022-01-01 00:46:40  0.011820       NaN
2022-01-01 00:53:20  0.013458       NaN
2022-01-01 01:00:00  0.014803 -0.094749
2022-01-01 01:06:40  0.015785       NaN
2022-01-01 01:13:20  0.016642       NaN
2022-01-01 01:15:00       NaN -0.139401
2022-01-01 01:20:00  0.017535       NaN
2022-01-01 01:26:40  0.018257       NaN
2022-01-01 01:30:00       NaN -0.129927
2022-01-01 01:33:20  0.018475       NaN
2022-01-01 01:40:00  0.018118       NaN
2022-01-01 01:45:00       NaN -0.131556
2022-01-01 01:46:40  0.017351       NaN
2022-01-01 01:53:20  0.016302       NaN
2022-01-01 02:00:00  0.014954 -0.117781
2022-01-01 02:06:40  0.013234       NaN
2022-01-01 02:13:20  0.011144       NaN
2022-01-01 02:15:00       NaN -0.149402
2022-01-01 02:20:00  0.008853       NaN
2022-01-01 02:26:40  0.006667       NaN
Out[21]:
 | R1 | R3 | bias | cr | error | kge | nse | rms | rmse |
---|---|---|---|---|---|---|---|---|---|
method 2 | 0.241109 | 0.230598 | 0.008629 | 0.803094 | 0.230598 | 0.795181 | 0.603346 | 0.096010 | 0.096397 |
method 3 | 0.241247 | 0.230667 | 0.008674 | 0.803105 | 0.230667 | 0.795110 | 0.603296 | 0.096004 | 0.096395 |