In [37]:
import copernicusmarine as cm
import pandas as pd
import xarray as xr
import hvplot.xarray
import hvplot.pandas
import numpy as np
import holoviews as hv
In [ ]:
# Authenticate with the Copernicus Marine Service; credentials are prompted for
# and cached locally so the later cm.subset() downloads can run unattended.
cm.login()
In [24]:
# Load the IOC sea-level station cleanup table (one row per tide gauge;
# first CSV column is the station index). Displayed below: 169 rows x 77 cols.
ioc_cleanup = pd.read_csv('ioc_cleanup_2023.csv', index_col=0)
ioc_cleanup
Out[24]:
Station_Name | ioc_code | gloss_id | latitude | longitude | Country | connection | contacts | dcp_id | last_observation_level | ... | number_of_years | time_zone_hours | datum_information | instrument | precision | null_value | gauge_type | overall_record_quality | gesla3_id | seaset_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
125 | Base O'Higgins | ohig | NaN | -63.321 | -57.901 | Antarctica | SXCH40 | Servicio Hidrográfico y Oceanográfico de la Ar... | ADC04BE6 | 1.75 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 125 |
135 | Puerto Deseado | dese | 190.0 | -47.754 | -65.915 | Argentina | SEPO40 | Armada Argentina Servicio de Hidrografía Naval... | 33912088 | 3.69 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 135 |
136 | Puerto Madryn | madry | 191.0 | -42.763 | -65.031 | Argentina | SEPO40 | Armada Argentina Servicio de Hidrografía Naval... | 335665D2 | 6.60 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 136 |
139 | Battery Point | bapj | NaN | -42.892 | 147.338 | Australia | SZAU01 | National Tidal Centre/Australian Bureau of Met... | 61221 | 0.91 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 139 |
140 | Broome | brom | 40.0 | -18.001 | 122.219 | Australia | SZAU01 | National Tidal Centre/Australian Bureau of Met... | 62650 | 7.69 | ... | 32.0 | 0.0 | Unspecified | Unspecified | Unspecified | -99.9999 | Coastal | No obvious issues | Broome | 140 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5451 | Woods Hole, MA | wood | NaN | 41.523 | -70.672 | USA | web | National Ocean Service-NOAA ( USA ) | 3340A79C | 1.08 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5451 |
5462 | Yakutat | yaku | NaN | 59.548 | -139.735 | USA | web | National Ocean Service-NOAA ( USA ) | 336B0522 | 2.04 | ... | 61.0 | 0.0 | Station Datum | Unspecified | Unspecified | -99.9999 | Coastal | No obvious issues | Yakutat | 5462 |
6129 | Colombo | colo | 33.0 | 6.941 | 79.850 | Sri Lanka | SWIO40 | National Aquatic Resources Research and Develo... | 0650018E | 1.39 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6129 |
6141 | DART New Zealand E Coast | dnze | NaN | -36.049 | -177.708 | New Zealand | web | GNS Science ( New Zealand ) | 5401001 | 5776.60 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6141 |
6161 | Prince Rupert | prin2 | 155.0 | 54.317 | -130.317 | Canada | web | Fisheries and Oceans Canada ( Canada ) | NaN | 6.28 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6161 |
169 rows × 77 columns
In [27]:
# subset
# Spot-check a single station (Boston, ioc_code 'boma') before running the
# full pipeline over all 169 stations.
ioc_re = ioc_cleanup.query("ioc_code == 'boma'")
ioc_re
Out[27]:
Station_Name | ioc_code | gloss_id | latitude | longitude | Country | connection | contacts | dcp_id | last_observation_level | ... | number_of_years | time_zone_hours | datum_information | instrument | precision | null_value | gauge_type | overall_record_quality | gesla3_id | seaset_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3547 | Boston | boma | NaN | 42.355 | -71.052 | USA | web | National Ocean Service-NOAA ( USA ) | 335E54EE | 1.68 | ... | 101.0 | 0.0 | Station Datum | Unspecified | Unspecified | -99.9999 | Coastal | No obvious issues | Boston | 3547 |
1 rows × 77 columns
In [4]:
import os

# Ensure the output directories exist: raw NetCDF downloads and the
# per-station parquet extracts.
for out_dir in ('data/nc/', 'data/parquet/'):
    os.makedirs(out_dir, exist_ok=True)
In [ ]:
# Download a small box of hourly sea-surface height ("zos") around every
# station from the CMEMS global 1/12-degree analysis/forecast model,
# writing one NetCDF file per station into data/nc/.
for it, item in ioc_cleanup.iterrows():
    # NOTE(review): the +1 makes the longitude window one degree wider than
    # the latitude window (ceil+1 vs ceil) — confirm the asymmetry is
    # intentional (e.g. to guarantee at least one wet cell eastward).
    lonmax = np.ceil(item.longitude) + 1
    lonmin = np.floor(item.longitude)
    latmax = np.ceil(item.latitude)
    latmin = np.floor(item.latitude)
    cm.subset(
        dataset_id="cmems_mod_glo_phy_anfc_0.083deg_PT1H-m",  # hourly surface fields
        variables=["zos"],  # sea surface height above geoid
        minimum_longitude=lonmin,
        maximum_longitude=lonmax,
        minimum_latitude=latmin,
        maximum_latitude=latmax,
        start_datetime="2022-01-01T00:00:00",
        end_datetime="2023-12-31T23:00:00",
        output_filename = f"data/nc/{item.ioc_code}.nc",
        # NOTE(review): force_download is deprecated/removed in
        # copernicusmarine >= 2.0 — verify against the installed version.
        force_download=True
    )
In [30]:
def get_closest_coordinates(x, y, ds):
    """Return (ilon, ilat) grid indices of the wet cell nearest to (x, y).

    Grid cells that are NaN in the zos field (masked / land points) are
    excluded, so the returned indices always address a cell with data.

    Parameters
    ----------
    x, y : float
        Target longitude and latitude (same units as the dataset coords).
    ds : dataset with `longitude`, `latitude` and 4-D `zos` variables
        The model subset to search.
    """
    lon_grid, lat_grid = np.meshgrid(ds.longitude.values, ds.latitude.values)
    flat_lon = lon_grid.ravel()
    flat_lat = lat_grid.ravel()
    # Use the last time step / first depth level to identify masked cells.
    field = ds.zos[-1, 0, :, :].values.ravel()
    wet = ~np.isnan(field)
    wet_lon = flat_lon[wet]
    wet_lat = flat_lat[wet]
    # Squared Euclidean distance in coordinate space is enough for argmin.
    nearest = np.argmin((wet_lon - x) ** 2 + (wet_lat - y) ** 2)
    lon_sel = wet_lon[nearest]
    lat_sel = wet_lat[nearest]
    # Map the selected coordinate values back to axis indices.
    ilon = np.argmin(np.abs(ds.longitude.values - lon_sel))
    ilat = np.argmin(np.abs(ds.latitude.values - lat_sel))
    return ilon, ilat
def extract_parquet(stations: pd.DataFrame) -> None:
    """Extract the zos time series at the nearest wet grid cell per station.

    For each row of `stations`, opens the previously downloaded NetCDF file
    data/nc/<ioc_code>.nc, finds the closest non-NaN grid point to the
    station's (longitude, latitude), and writes the hourly series to
    data/parquet/<ioc_code>.parquet with the model time stamps as index.

    Parameters
    ----------
    stations : pd.DataFrame
        Must provide `ioc_code`, `longitude` and `latitude` columns.
    """
    for it, item in stations.iterrows():
        print(item.ioc_code)
        # Context manager closes each NetCDF handle; the original loop left
        # every file open, leaking ~170 file handles over a full run.
        with xr.open_dataset(f'data/nc/{item.ioc_code}.nc') as ds:
            ilon, ilat = get_closest_coordinates(item.longitude, item.latitude, ds)
            df = pd.DataFrame({'zos': ds.zos[:, 0, ilat, ilon].values},
                              index=ds.time.values)
            df.to_parquet(f'data/parquet/{item.ioc_code}.parquet')
In [31]:
# Run the extraction for all 169 stations; prints each ioc_code as it goes.
extract_parquet(ioc_cleanup)
ohig dese madry bapj brom barn djve darw pkem pmur ross sprg thev trst oste bele bamf prin stjo vhbc greg ohig3 cald coqu corr pich ptal pcha pwil qtro quir talc viti cher dzao herb mare nuku stqy stqy2 tubua cuxh helg horn itea LA23 abur fuka hmda hana ishig kusm kush naga naha omae sado saig tosa waka kant huat chst ande honn malo rorv treg vard davo dapi hie2 fue2 coru arko fors furu gokr holm kalit karl klag kung kungr land olan oska oxel rata simp simr smog spik visb zanz nkfa amas anta igne sile abed bang crom dove harw heys holy holy2 ilfa kinl leit lerw lerw2 live lowe mhav mill mumb nhav newl newl2 npor nshi plym porp prus ptmt shee stor whit wick work alam aren asto atka acnj bamd bame benc boma bgct amal chrp cwfl cres dkwa datu dpnc dutc elak fpnt guam hilo kahu kawa kwfl lajo lime pagb pslu sdpt sitk wpwa wood yaku colo dnze prin2
In [46]:
# check if we have data
for it, item in ioc_cleanup.iterrows():
print(it, item.ioc_code)
df = pd.read_parquet(f'./data/parquet/{item.ioc_code}.parquet')
df.hvplot()
if it > 200:
break
125 ohig
Out[46]:
135 dese
Out[46]:
136 madry
Out[46]:
139 bapj
Out[46]:
140 brom
Out[46]:
141 barn
Out[46]:
149 djve
Out[46]:
152 darw
Out[46]:
161 pkem
Out[46]:
164 pmur
Out[46]:
165 ross
Out[46]:
167 sprg
Out[46]:
168 thev
Out[46]:
169 trst
Out[46]:
221 oste
Out[46]: