In [1]:
import pandas as pd
import numpy as np
import hvplot.pandas

Comparing Time Series with Different Time Steps¶

In [ ]:
ROOT = ""
station = "abur" #"abed"
sensor =  "rad" #"bub"

def load_ts(station, sensor):
    obs = pd.read_parquet(ROOT + f"data/obs/{station}_{sensor}.parquet")
    sim = pd.read_parquet(ROOT + f"data/models/seareport-v2.2/{station}.parquet")
    obs = pd.Series(obs[obs.columns[0]], name = "obs")
    sim = pd.Series(sim[sim.columns[0]], name = "sim")
    sim = sim.sort_index()
    return sim, obs
sim, obs = load_ts(station, sensor)

Let's test the different approaches on one week of data: it's more than enough

In [3]:
YEAR = 2022
sim_subset = sim.loc[f"{YEAR}-09-13":f"{YEAR}-09-20"]
obs_subset = obs.loc[f"{YEAR}-09-13":f"{YEAR}-09-20"]
In [4]:
sim.sort_index().index.diff().dropna().value_counts()
obs.sort_index().index.diff().dropna().value_counts()
Out[4]:
0 days 00:06:40    236736
0 days 00:00:00         2
Name: count, dtype: int64
Out[4]:
time
0 days 00:01:00    1563040
0 days 00:11:00        351
0 days 00:21:00         35
0 days 00:07:00         22
0 days 00:31:00          9
0 days 01:01:00          6
0 days 00:08:00          6
0 days 00:41:00          5
0 days 00:51:00          4
0 days 00:02:00          3
0 days 04:51:00          2
0 days 01:11:00          2
0 days 01:31:00          2
0 days 00:04:00          2
0 days 01:21:00          2
0 days 00:06:00          1
0 days 08:51:00          1
4 days 03:01:00          1
0 days 00:12:00          1
0 days 01:51:00          1
0 days 00:03:00          1
0 days 03:41:00          1
Name: count, dtype: int64
In [5]:
(obs_subset.hvplot()*sim_subset.hvplot()).opts(width=1300, height=800, title = f"simple comparison: model vs. observation, station: '{station}'")
Out[5]:

1. Nearest-Neighbor Alignment¶

Method: For each model timestamp, find the closest observation timestamp within a defined tolerance window Advantages:

  • No interpolation (no fictional data)
  • Preserves actual observation values

Cons:

  • No control on the aligned signal, resulting in:
    • missing peaks
    • or even missing the trend (if there is noise signal looks chaotic)
In [6]:
aligned_data = pd.merge_asof(
    sim_subset, obs_subset, 
    left_index=True, right_index=True,
    tolerance=pd.Timedelta('2min'),  # Set appropriate tolerance
    direction='nearest'
)
aligned_data = aligned_data.rename(columns={"obs": "obs_aligned"})
aligned_data.sort_index().index.diff().dropna().value_counts()
(obs_subset.hvplot() * aligned_data.hvplot()).opts(width=1300, height = 800,  title=f"Method 1. Nearest-Neighbor Alignment, station: '{station}'")
Out[6]:
0 days 00:06:40    1727
Name: count, dtype: int64
Out[6]:

not insteresting enough to be considered

Method 2. Window-Based Aggregation¶

Method: Use the model timestamps as reference points and aggregate observations within a window

Advantages:

  • we can choose between max or mean

Cons

  • We drop some maxima (outside ouf the averaging window) if the window is not adapted
  • We end loosing information, because dropping data points
In [7]:
def aggregate(sim, obs, window_size='7min'):
    all_times = pd.DatetimeIndex(sorted(set(sim.index) | set(obs.index)))
    
    full_obs = pd.Series(np.nan, index=all_times)
    full_obs.loc[obs.index] = obs
    
    window = pd.Timedelta(window_size)
    rolling_stats = pd.DataFrame({
        'obs_mean': full_obs.rolling(window=window, center=True).mean(),
        'obs_max': full_obs.rolling(window=window, center=True).max(),
        'obs_count': full_obs.rolling(window=window, center=True).count()
    })
    
    result = pd.DataFrame({'sim': sim})
    result = result.join(rolling_stats)
    result = result[result['obs_count'] > 0].copy()
    
    return result

df1 = aggregate(sim_subset, obs_subset)
df1
Out[7]:
sim obs_mean obs_max obs_count
2022-09-13 00:00:00 0.075297 0.081856 0.086098 4.0
2022-09-13 00:06:40 0.077474 0.085185 0.094198 7.0
2022-09-13 00:13:20 0.079457 0.106724 0.113791 7.0
2022-09-13 00:20:00 0.081009 0.097287 0.112626 7.0
2022-09-13 00:26:40 0.082073 0.085271 0.100874 7.0
... ... ... ... ...
2022-09-20 23:26:40 0.060263 0.100853 0.115913 7.0
2022-09-20 23:33:20 0.060033 0.119736 0.124353 7.0
2022-09-20 23:40:00 0.060690 0.117228 0.122574 7.0
2022-09-20 23:46:40 0.062372 0.104759 0.108953 7.0
2022-09-20 23:53:20 0.064460 0.097619 0.099798 7.0

1726 rows × 4 columns

In [8]:
df2 = aggregate(sim_subset, obs_subset, window_size="2min")
df2
Out[8]:
sim obs_mean obs_max obs_count
2022-09-13 00:00:00 0.075297 0.083636 0.086098 2.0
2022-09-13 00:06:40 0.077474 0.081077 0.083344 2.0
2022-09-13 00:13:20 0.079457 0.109276 0.113236 2.0
2022-09-13 00:20:00 0.081009 0.095198 0.100256 2.0
2022-09-13 00:26:40 0.082073 0.082142 0.084789 2.0
... ... ... ... ...
2022-09-20 23:26:40 0.060263 0.099891 0.102171 2.0
2022-09-20 23:33:20 0.060033 0.119389 0.120241 2.0
2022-09-20 23:40:00 0.060690 0.114853 0.117292 2.0
2022-09-20 23:46:40 0.062372 0.106439 0.108953 2.0
2022-09-20 23:53:20 0.064460 0.095060 0.095505 2.0

1726 rows × 4 columns

In [9]:
(obs_subset.hvplot() 
 * sim_subset.hvplot()
 * df1.obs_max.hvplot(label = "obs_max window: 7min")
 * df2.obs_max.hvplot(label = "obs_max window: 2min")
).opts(width=1300, height=800, title = f"Method 2. Window-Based Aggregation, station: '{station}'")
Out[9]:

3. Interpolating the model on the observed TS index¶

Advantages:

  • No observation data is dropped

Cons:

  • We create fictional data (for the model TS)
  • May result in heavy process if observed signal has high sample rate
In [10]:
def sim_on_obs(sim, obs):
    sim = sim.drop_duplicates()
    obs = obs.drop_duplicates()


    df = pd.merge(sim, obs, left_index=True, right_index=True, how='outer')
    df = df.drop_duplicates()
    df = df[~df.index.duplicated(keep='first')]
    print("merged df:")
    print(df.iloc[:30])
    df['sim'] = df['sim'].interpolate(method="cubic", limit_direction="both")
    df = df.dropna(subset=['obs'])
    
    return df

df = sim_on_obs(sim_subset, obs_subset)
print("final df: ")
df
merged df:
                          sim       obs
2022-09-13 00:00:00  0.075297  0.086098
2022-09-13 00:01:00       NaN  0.081174
2022-09-13 00:02:00       NaN  0.082495
2022-09-13 00:03:00       NaN  0.077659
2022-09-13 00:04:00       NaN  0.079066
2022-09-13 00:05:00       NaN  0.080416
2022-09-13 00:06:00       NaN  0.078809
2022-09-13 00:06:40  0.077474       NaN
2022-09-13 00:07:00       NaN  0.083344
2022-09-13 00:08:00       NaN  0.087921
2022-09-13 00:09:00       NaN  0.092539
2022-09-13 00:10:00       NaN  0.094198
2022-09-13 00:11:00       NaN  0.101897
2022-09-13 00:12:00       NaN  0.106637
2022-09-13 00:13:00       NaN  0.105317
2022-09-13 00:13:20  0.079457       NaN
2022-09-13 00:14:00       NaN  0.113236
2022-09-13 00:15:00       NaN  0.111994
2022-09-13 00:16:00       NaN  0.113791
2022-09-13 00:17:00       NaN  0.112626
2022-09-13 00:18:00       NaN  0.108499
2022-09-13 00:19:00       NaN  0.101309
2022-09-13 00:20:00  0.081009  0.100256
2022-09-13 00:21:00       NaN  0.090140
2022-09-13 00:22:00       NaN  0.086060
2022-09-13 00:23:00       NaN  0.082116
2022-09-13 00:24:00       NaN  0.081208
2022-09-13 00:25:00       NaN  0.071134
2022-09-13 00:26:00       NaN  0.079494
2022-09-13 00:26:40  0.082073       NaN
final df: 
Out[10]:
sim obs
2022-09-13 00:00:00 0.075297 0.086098
2022-09-13 00:01:00 0.075626 0.081174
2022-09-13 00:02:00 0.075956 0.082495
2022-09-13 00:03:00 0.076285 0.077659
2022-09-13 00:04:00 0.076613 0.079066
... ... ...
2022-09-20 23:55:00 NaN 0.099470
2022-09-20 23:56:00 NaN 0.097208
2022-09-20 23:57:00 NaN 0.104120
2022-09-20 23:58:00 NaN 0.107905
2022-09-20 23:59:00 NaN 0.111764

11498 rows × 2 columns

In [11]:
sim_, obs_ = df["sim"], df["obs"]
(obs_ == obs_subset).all()
Out[11]:
np.True_
In [12]:
(obs_.hvplot() * sim_.hvplot(label='model: interpolated') * sim_subset.hvplot(label='model: original')).opts(
    width=1300, height=800, title = f"Method 3. Interpolating model on observation TS, station: '{station}'")
Out[12]:

testing now with a station with a lower sampling rate than the simulation:

In [13]:
station = "abed"
sensor = "bub"

sim, obs = load_ts(station, sensor)
sim.sort_index().index.diff().dropna().value_counts()
obs.sort_index().index.diff().dropna().value_counts()

YEAR = 2024
sim_subset = sim.loc[f"{YEAR}-01-22":f"{YEAR}-02-01"]
obs_subset = obs.loc[f"{YEAR}-01-22":f"{YEAR}-02-01"]
Out[13]:
0 days 00:06:40    236736
0 days 00:00:00         2
Name: count, dtype: int64
Out[13]:
time
0 days 00:15:00    103752
0 days 00:30:00       181
0 days 00:45:00        41
0 days 01:00:00        11
0 days 01:30:00        10
0 days 01:15:00         9
0 days 01:45:00         5
0 days 02:15:00         4
0 days 02:00:00         3
0 days 03:15:00         3
0 days 04:15:00         2
0 days 04:00:00         1
0 days 03:45:00         1
0 days 17:15:00         1
3 days 06:30:00         1
0 days 06:15:00         1
0 days 02:45:00         1
0 days 03:00:00         1
0 days 23:15:00         1
0 days 02:30:00         1
Name: count, dtype: int64
In [14]:
df = aggregate(sim_subset, obs_subset)
df
(obs_subset.hvplot() 
 * sim_subset.hvplot()
 * df.obs_max.hvplot(label = "obs_max window: 7min")
).opts(width=1300, height=800,  title = f"Method 2. Window-Based Aggregation, station: '{station}'")
Out[14]:
sim obs_mean obs_max obs_count
2024-01-22 00:00:00 -0.081508 -0.072578 -0.072578 1.0
2024-01-22 00:13:20 -0.084692 -0.035246 -0.035246 1.0
2024-01-22 00:26:40 -0.084821 -0.088447 -0.088447 1.0
2024-01-22 00:33:20 -0.081714 -0.088447 -0.088447 1.0
2024-01-22 00:46:40 -0.069040 -0.112014 -0.112014 1.0
... ... ... ... ...
2024-02-01 23:00:00 -0.100931 -0.135878 -0.135878 1.0
2024-02-01 23:13:20 -0.093476 -0.124947 -0.124947 1.0
2024-02-01 23:26:40 -0.083951 -0.080565 -0.080565 1.0
2024-02-01 23:33:20 -0.078872 -0.080565 -0.080565 1.0
2024-02-01 23:46:40 -0.069120 -0.094202 -0.094202 1.0

1320 rows × 4 columns

Out[14]:
In [15]:
df = sim_on_obs(sim_subset, obs_subset)
sim_, obs_ = df["sim"], df["obs"]
(obs_.hvplot() * sim_.hvplot(label='model: interpolated') * sim_subset.hvplot(label='model: original')).opts(
    width=1300, height=800, title = f"Method 3. Interpolating model on observation TS, station: '{station}'")
merged df:
                          sim       obs
2024-01-22 00:00:00 -0.081508 -0.072578
2024-01-22 00:06:40 -0.083077       NaN
2024-01-22 00:13:20 -0.084692       NaN
2024-01-22 00:15:00       NaN -0.035246
2024-01-22 00:20:00 -0.085591       NaN
2024-01-22 00:26:40 -0.084821       NaN
2024-01-22 00:30:00       NaN -0.088447
2024-01-22 00:33:20 -0.081714       NaN
2024-01-22 00:40:00 -0.076266       NaN
2024-01-22 00:45:00       NaN -0.112014
2024-01-22 00:46:40 -0.069040       NaN
2024-01-22 00:53:20 -0.060762       NaN
2024-01-22 01:00:00 -0.052036 -0.049075
2024-01-22 01:06:40 -0.043210       NaN
2024-01-22 01:13:20 -0.034527       NaN
2024-01-22 01:15:00       NaN -0.049939
2024-01-22 01:20:00 -0.026361       NaN
2024-01-22 01:26:40 -0.019113       NaN
2024-01-22 01:30:00       NaN -0.018956
2024-01-22 01:33:20 -0.012782       NaN
2024-01-22 01:40:00 -0.006738       NaN
2024-01-22 01:45:00       NaN -0.021372
2024-01-22 01:46:40 -0.000075       NaN
2024-01-22 01:53:20  0.007859       NaN
2024-01-22 02:00:00  0.017258  0.032797
2024-01-22 02:06:40  0.028108       NaN
2024-01-22 02:13:20  0.040120       NaN
2024-01-22 02:15:00       NaN -0.029174
2024-01-22 02:20:00  0.052665       NaN
2024-01-22 02:26:40  0.065018       NaN
Out[15]:

Compare stats for the whole series

In [16]:
station = "abur"
sensor = "rad"

sim, obs = load_ts(station, sensor)
In [17]:
import seastats
df = sim_on_obs(sim, obs)
stats = seastats.get_stats(df["sim"], df["obs"], quantile = 0.)
ext3 = seastats.storms.match_extremes(df["sim"], df["obs"], quantile = 0.99)
m3 = pd.DataFrame(stats, index = ["method 3"])

df = aggregate(sim, obs)
stats = seastats.get_stats(df["sim"], df["obs_max"], quantile = 0.)
ext2 = seastats.storms.match_extremes(df["sim"], df["obs_max"], quantile = 0.99)
m2 = pd.DataFrame(stats, index = ["method 2"])
pd.concat([m2,m3])
merged df:
                          sim       obs
2022-01-01 00:00:00  0.000000       NaN
2022-01-01 00:01:00       NaN -0.114223
2022-01-01 00:02:00       NaN -0.138153
2022-01-01 00:03:00       NaN -0.159002
2022-01-01 00:04:00       NaN -0.158472
2022-01-01 00:05:00       NaN -0.173261
2022-01-01 00:06:00       NaN -0.181971
2022-01-01 00:06:40 -0.001448       NaN
2022-01-01 00:07:00       NaN -0.175402
2022-01-01 00:08:00       NaN -0.168954
2022-01-01 00:09:00       NaN -0.168528
2022-01-01 00:10:00       NaN -0.156023
2022-01-01 00:11:00       NaN -0.146540
2022-01-01 00:12:00       NaN -0.152280
2022-01-01 00:13:00       NaN -0.145942
2022-01-01 00:13:20 -0.003688       NaN
2022-01-01 00:14:00       NaN -0.139627
2022-01-01 00:15:00       NaN -0.145435
2022-01-01 00:16:00       NaN -0.145167
2022-01-01 00:17:00       NaN -0.141923
2022-01-01 00:18:00       NaN -0.144803
2022-01-01 00:19:00       NaN -0.141607
2022-01-01 00:20:00 -0.007483 -0.150636
2022-01-01 00:21:00       NaN -0.153589
2022-01-01 00:22:00       NaN -0.156568
2022-01-01 00:23:00       NaN -0.159573
2022-01-01 00:24:00       NaN -0.162603
2022-01-01 00:25:00       NaN -0.162659
2022-01-01 00:26:00       NaN -0.150442
2022-01-01 00:26:40 -0.016760       NaN
Out[17]:
R1 R3 bias cr error kge nse rms rmse
method 2 0.549898 0.604785 -0.048054 0.349005 0.604785 0.139450 -0.506296 0.094535 0.106048
method 3 0.549674 0.604614 -0.030002 0.366957 0.604614 0.271801 -0.303839 0.092457 0.097203
In [18]:
ext2.iloc[:5] # method 2
Out[18]:
observed model time model diff error error_norm tdiff
time observed
2022-09-18 07:46:40 1.147230 0.597332 2022-09-18 09:00:00 -0.549898 0.549898 0.479326 1.222222
2022-01-15 16:26:40 0.589841 -0.069832 2022-01-16 00:06:40 -0.659673 0.659673 1.118391 7.666667
2023-08-08 14:06:40 0.472867 0.306800 2023-08-08 12:46:40 -0.166067 0.166067 0.351192 -1.333333
2024-02-04 22:40:00 0.453307 0.041071 2024-02-05 00:06:40 -0.412236 0.412236 0.909398 1.444444
2024-03-25 19:40:00 0.432861 0.085566 2024-03-25 23:40:00 -0.347295 0.347295 0.802324 4.000000
In [19]:
ext3.iloc[:5] # method 3
Out[19]:
observed model time model diff error error_norm tdiff
time observed
2022-09-18 07:45:00 1.147230 0.597556 2022-09-18 08:59:00 -0.549674 0.549674 0.479132 1.233333
2022-01-15 16:29:00 0.589841 -0.069713 2022-01-16 00:10:00 -0.659553 0.659553 1.118189 7.683333
2023-08-08 14:10:00 0.472867 0.307113 2023-08-08 12:49:00 -0.165754 0.165754 0.350530 -1.350000
2024-02-04 22:41:00 0.453307 0.041166 2024-02-05 00:05:00 -0.412141 0.412141 0.909187 1.400000
2024-03-25 19:38:00 0.432861 0.085666 2024-03-25 23:38:00 -0.347195 0.347195 0.802094 4.000000
In [20]:
station = "abed"
sensor = "bub"

sim, obs = load_ts(station, sensor)
In [21]:
import seastats
df = sim_on_obs(sim, obs)
stats = seastats.get_stats(df["sim"], df["obs"], quantile = 0.)
ext3 = seastats.storms.match_extremes(df["sim"], df["obs"], quantile = 0.99)
m3 = pd.DataFrame(stats, index = ["method 3"])

df = aggregate(sim, obs)
stats = seastats.get_stats(df["sim"], df["obs_max"], quantile = 0.)
ext2 = seastats.storms.match_extremes(df["sim"], df["obs_max"], quantile = 0.99)
m2 = pd.DataFrame(stats, index = ["method 2"])
pd.concat([m2,m3])
merged df:
                          sim       obs
2022-01-01 00:00:00  0.000000       NaN
2022-01-01 00:06:40  0.000863       NaN
2022-01-01 00:13:20  0.002603       NaN
2022-01-01 00:15:00       NaN -0.094530
2022-01-01 00:20:00  0.004427       NaN
2022-01-01 00:26:40  0.006277       NaN
2022-01-01 00:30:00       NaN -0.101112
2022-01-01 00:33:20  0.008207       NaN
2022-01-01 00:40:00  0.010070       NaN
2022-01-01 00:45:00       NaN -0.083943
2022-01-01 00:46:40  0.011820       NaN
2022-01-01 00:53:20  0.013458       NaN
2022-01-01 01:00:00  0.014803 -0.094749
2022-01-01 01:06:40  0.015785       NaN
2022-01-01 01:13:20  0.016642       NaN
2022-01-01 01:15:00       NaN -0.139401
2022-01-01 01:20:00  0.017535       NaN
2022-01-01 01:26:40  0.018257       NaN
2022-01-01 01:30:00       NaN -0.129927
2022-01-01 01:33:20  0.018475       NaN
2022-01-01 01:40:00  0.018118       NaN
2022-01-01 01:45:00       NaN -0.131556
2022-01-01 01:46:40  0.017351       NaN
2022-01-01 01:53:20  0.016302       NaN
2022-01-01 02:00:00  0.014954 -0.117781
2022-01-01 02:06:40  0.013234       NaN
2022-01-01 02:13:20  0.011144       NaN
2022-01-01 02:15:00       NaN -0.149402
2022-01-01 02:20:00  0.008853       NaN
2022-01-01 02:26:40  0.006667       NaN
Out[21]:
R1 R3 bias cr error kge nse rms rmse
method 2 0.241109 0.230598 0.008629 0.803094 0.230598 0.795181 0.603346 0.096010 0.096397
method 3 0.241247 0.230667 0.008674 0.803105 0.230667 0.795110 0.603296 0.096004 0.096395