This notebook contains the methodology for:
- choosing the best IOC candidates for storm surge validation purposes
- extracting STOFS2D data

This notebook will be edited over time to account for:
- the data availability
- the data quality of the IOC candidates
import geopandas as gp
import pandas as pd
import searvey
from datetime import datetime
import numpy as np
import sklearn.neighbors
import xarray as xr
import hvplot.pandas
import os
We will extract modelled data from the STOFS2D model, focusing on the station (point) locations exported by the model.
The files have the following naming convention: stofs_2d_glo.tCCz.points.{cwl,htp,swl}.nc
described as "Six-minute station time series water level (m, MSL) forecast files at verification sites", where:
- swl is the storm surge only
- htp is the astronomical tide only
- cwl is the combined water level (swl + htp)

More information can be found on this page: https://noaa-gestofs-pds.s3.amazonaws.com/README.html
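As a quick sanity check, the three variables can be compared directly. A minimal sketch, assuming local copies of the three point files for one forecast cycle and assuming the water-level variable is named `zeta` (both are assumptions, not confirmed by this notebook):
import numpy as np
import xarray as xr
# hypothetical local copies of the swl/htp/cwl files for the same cycle
swl = xr.open_dataset("stofs2d/20231010_stofs_2d_glo.t18z.points.swl.nc")
htp = xr.open_dataset("stofs2d/20231010_stofs_2d_glo.t18z.points.htp.nc")
cwl = xr.open_dataset("stofs2d/20231010_stofs_2d_glo.t18z.points.cwl.nc")
# cwl should be (approximately) the sum of surge and tide
print(np.allclose(cwl.zeta, swl.zeta + htp.zeta, atol=1e-3, equal_nan=True))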
# A file available at the following address references all STOFS 2D stations:
def get_stofs():
    mycols = [str(i) for i in range(6)]  # placeholder names for the fields we parse
    stof2d = pd.read_csv(
        "https://polar.ncep.noaa.gov/stofs/data/stofs_2d_glo_elev_stat_v2_1_0",
        names=mycols,
        sep="\t+|!",
        header=None,
        skiprows=1,
        engine="python",  # the regex separator requires the python engine
    )
    stof2d['Info'] = stof2d.apply(lambda row: ' '.join(filter(None, row[2:])), axis=1)
    stof2d['ID'] = stof2d['Info'].apply(lambda x: ' '.join(x.split()[:3]))
    stof2d['Info'] = stof2d.apply(lambda row: row['Info'].replace(row['ID'], '').strip(), axis=1)
    stof2d = stof2d.drop(columns=["2", "3", "4", "5"])
    stof2d.rename(columns={"0": 'lon', "1": 'lat'}, inplace=True)
    return stof2d
stofs = get_stofs()
stofs
 | lon | lat | Info | ID
---|---|---|---|---|
0 | -66.981003 | 44.903000 | ME Eastport | PSBM1 SOUS41 8410140 |
1 | -67.204002 | 44.654499 | ME Cutler Farris Wharf | CFWM1 SOUS41 8411060 |
2 | -68.203499 | 44.394001 | ME Bar Harbor | ATGM1 SOUS41 8413320 |
3 | -70.246002 | 43.656101 | ME Portland | CASM1 SOUS41 8418150 |
4 | -70.563301 | 43.320000 | ME Wells | WELM1 SOUS41 8419317 |
... | ... | ... | ... | ... |
1683 | -178.580000 | 58.960000 | UJ850 SOUS00 SA850 | |
1684 | -144.570000 | 58.960000 | UJ851 SOUS00 SA851 | |
1685 | -68.020000 | 58.960000 | UJ852 SOUS00 SA852 | |
1686 | -51.010000 | 58.960000 | UJ853 SOUS00 SA853 | |
1687 | -25.500000 | 58.960000 | UJ854 SOUS00 SA854 |
1688 rows × 4 columns
A caveat is that the station (points) output files evolve over time:
stofs1 = xr.open_dataset("stofs2d/20220912_stofs_2d_glo.t18z.points.swl.nc")
stofs2 = xr.open_dataset("stofs2d/20231010_stofs_2d_glo.t18z.points.swl.nc")
stofs3 = xr.open_dataset("stofs2d/20241229_stofs_2d_glo.t12z.points.swl.nc")
stofs_2022 = stofs[stofs.ID.isin([' '.join(s.decode("utf-8").strip().split()[:3]) for s in stofs1.station_name.values])];len(stofs_2022)
stofs_2023 = stofs[stofs.ID.isin([' '.join(s.decode("utf-8").strip().split()[:3]) for s in stofs2.station_name.values])];len(stofs_2023)
stofs_2024 = stofs[stofs.ID.isin([' '.join(s.decode("utf-8").strip().split()[:3]) for s in stofs3.station_name.values])];len(stofs_2024)
562
1687
1688
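The ID reconstruction above is repeated for each file; a small helper (not part of the original notebook) that wraps it:
def stofs_ids(ds: xr.Dataset) -> list:
    """Decode the byte-string station names and keep the first three tokens as the ID."""
    return [" ".join(s.decode("utf-8").strip().split()[:3]) for s in ds.station_name.values]
# e.g. stofs_2023 = stofs[stofs.ID.isin(stofs_ids(stofs2))]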
Luckily, the new stations were appended at the end of the file, so concatenating data across all the files will be easier:
stofs_2022[:557].equals(stofs_2023[:557])
stofs_2022[:557].equals(stofs_2024[:557])
True
True
We need to compare the modelled storm surge with observations. We use the IOC tide gauge stations:
def get_meta() -> pd.DataFrame:
    meta_web = searvey.get_ioc_stations().drop(columns=["lon", "lat"])
    meta_api = (
        pd.read_json(
            "http://www.ioc-sealevelmonitoring.org/service.php?query=stationlist&showall=all"
        )
        .drop_duplicates()
        .drop(columns=["lon", "lat"])
        .rename(columns={"Code": "ioc_code", "Lon": "lon", "Lat": "lat"})
    )
    merged = pd.merge(
        meta_web,
        meta_api[["ioc_code", "lon", "lat"]].drop_duplicates(),
        on=["ioc_code"],
    )
    return merged.drop(columns=["geometry"])
ioc_ = get_meta()
We have already established a database of clean IOC data between 2022 and 2023; we'll use it as a reference:
IOC_CLEANUP = "ioc_cleanup_2023.csv"
ioc_cleanup = pd.read_csv(IOC_CLEANUP, index_col=0).rename(columns={"longitude": 'lon', "latitude": 'lat', "Station_Name":"location","Country":"country"})
stofs_plot = stofs_2022.hvplot.scatter(x= "lon", y="lat", hover_cols = "ID", s=130, c='lightgrey', label = 'STOFS 2022 output stations')
stofs_plot1 = stofs_2023.hvplot.scatter(x="lon", y="lat", hover_cols = "ID", s=150, c='grey', label = 'STOFS 2023 output stations')
stofs_plot2 = stofs_2024.hvplot.scatter(x="lon", y="lat", hover_cols = "ID", s=200, c='k', label = 'STOFS 2024 output stations')
ioc_plot = ioc_.hvplot.scatter(x="lon", y="lat",hover_cols = "ioc_code", s= 30 , c = 'y', label = 'all IOC stations')
ioc_cleanup_plot = ioc_cleanup.hvplot.scatter(x="lon", y="lat",s = 80, c='r', label = "stations cleaned for 2022-2023")
(stofs_plot2 * stofs_plot1 * stofs_plot * ioc_cleanup_plot* ioc_plot).opts(width = 1600, height = 800)
We graphically identified all stations that are not already in ioc_cleanup and that coincide with STOFS2D output locations:
station_to_add = ["juan", "sanf", "anto", "ptmo", "valp", "ferg", "ambon", "bitu", "saum", "sho2", "ushu",
"espr", "gamb", "riki", "prud", "vald", "cord", "paak", "dsea", "ketc", "june", "skag", "sewa", "anch", "niki", "seld", "kodi", "alak",
"dshu", "dkod", "nome", "adak", "niko", "dchu", "midx", "fren", "sthl", "ascen", "jask", "chab", "kara", "musc",
"masi", "mais", "kerg", "syow", "ver1", "vern", "wait", "stpa", "sala", "tara", "marsh", "kwaj", "wake", "fong",
"solo", "vanu", "numbo", "numb2", "levu", "wlgt", "jack", "hako", "abas", "ofun", "mera", "toya", "nawi", "brpt", "heeia",
"moku", "mane", "john", "plmy", "xmas", "penr", "hiva", "pape", "raro", "pago", "pagx", "east", "garc", "Male2", "ganm", "male", "hani",
"mini", "coch", "vish", "chtt", "sitt", "moul", "ptbl", "komi", "kota", "lank", "ms001", "sab2", "saba", "vung", "quin",
"quar", "curri", "subi", "mani", "luba", "lega", "tkao", "tkee", "chij", "mins", "saip", "mala", "chuu", "kapi", "deke", "naur", "nauu",
"dumo", "espe", "porl", "hill", "waik", "lemba", "beno", "prgi", "prig", "cili", "cila", "tjls", "chrs", "ffcj", "cocb", "telu", "sibo",
"sib2", "tanjo", "bupo", "padn", "pada", "fpga", "winc", "wbnc", "oinc", "kpva", "leva", "simd", "wsdc", "cbmd", "ocmd", "cmnj", "phap",
"mhpa", "btny", "shnj", "mony", "ptme", "cwme", "epme", "hali", "nain", "nuk1", "nuuk", "qaqo", "reyk", "scor", "rptx", "cctx", "pitx",
"pric", "ftfr", "rose", "barb", "stcr", "lame", "isab", "vieq", "yobu", "yabu", "faja", "sanj", "arac", "maya", "magi", "penu", "mona",
"ptpr", "ptpl", "sama", "bull", "elpo", "limon", "quepo", "sana", "acaj", "acap", "acya", "manz", "mnza", "cabo", "fort", "call", "lobos",
"tala", "lali", "vkfl", "nafl", "fmfl", "spfl", "pnfl", "pbfl", "apfl", "tpfl", "fbfl", "moal", "wlms", "psla", "gila", "pfla", "ncla",
"apla", "eila", "cpla", "sptx", "gptx", "fptx", "bres", "sthm", "casc", "gibr", "ceut", "mars", "TR22", "gvd9", "alex", "palm", "pdas",
"plus", "dakar", "tako", "tkdi", "lagos", "pntn", "sitin", "walvi", "prte", "durb", "pemba", "mtwa", "momb", "lamu", "pmon", "aric", "mata",
"plat", "salv"]
Some stations can appear under several variant names:
possible_stations = []
all_ioc = ioc_.ioc_code.values
for stat in station_to_add:
    if any(stat in station for station in all_ioc):
        for station in all_ioc:
            if stat in station:
                possible_stations.append(station)
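For reference, the same selection can be written with vectorized pandas string matching; an equivalent sketch (the regex pattern here is purely illustrative):
import re
pattern = "|".join(map(re.escape, station_to_add))
possible_stations = ioc_.loc[ioc_.ioc_code.str.contains(pattern, na=False), "ioc_code"].tolist()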
ioc_to_add = ioc_[ioc_.ioc_code.isin(possible_stations)]
ioc_to_add
 | ioc_code | gloss_id | country | location | connection | dcp_id | last_observation_level | last_observation_time | delay | interval | ... | observations_ratio_per_week | observations_arrived_per_month | observations_expected_per_month | observations_ratio_per_month | observations_ratio_per_day | sample_interval | average_delay_per_day | transmit_interval | lon | lat
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | abas | 327 | Japan | Abashiri | SWJP40 | ABASHIRI | 1.69 | 09:39 | 15' | 10' | ... | 100 | 44580 | 44640.0 | 100 | 100 | 1' | 7' | 10' | 144.290000 | 44.020000 |
3 | acaj | 182 | El Salvador | Acajutla SV | SZXX01 | 300434064008810 | 0.94 | 09:49 | 5' | 5' | ... | 93 | 43295 | 44640.0 | 97 | 99 | 1' | 5' | 5' | -89.838128 | 13.573792 |
4 | acap | 267 | Mexico | Acapulco MX | SEPA40 | 3540E15A | 8.26 | -down- | 2744d | 5' | ... | 0 | -down- | NaN | 0 | 0 | 1' | NaN | 5' | -99.916600 | 16.833300 |
5 | acap2 | 267 | Mexico | Acapulco API | SOMX10 | 0100D7CA | 4.65 | 09:46 | 8' | 10' | ... | 100 | 44535 | 44640.0 | 100 | 100 | 1' | 7' | 10' | -99.903000 | 16.837933 |
9 | acya | 267 | Mexico | Acapulco Club de Yates | ftp | NaN | 1.53 | 2025-01-08 14:43 | 2d | 10' | ... | 0 | 650 | 44640.0 | 1 | 0 | 1' | NaN | 10' | -99.902980 | 16.837990 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1615 | wlms2 | <NA> | USA | Bay Waveland Yacht Club, MS | SXXX03 | 33456542 | 16.02 | -down- | 675d | 6' | ... | 0 | -down- | NaN | 0 | 0 | 1' | NaN | 6' | -89.325800 | 30.326400 |
1621 | wsdc | <NA> | USA | Washington DC | SXXX50 | 335861BA | NaN | NaN | NaN | 6' | ... | 0 | NaN | NaN | 0 | 0 | ' | NaN | 6' | -77.020000 | 38.870000 |
1624 | xmas | 146 | Kiribati | Christmas KI | SZXX01 | 301434060001890 | 1.30 | 09:50 | 4' | 5' | ... | 93 | 43291 | 44640.0 | 97 | 98 | 1' | 5' | 5' | -157.473000 | 1.984000 |
1625 | yabu | <NA> | USA | Yabucoa Harbor, PR | SXXX03 | 3366B5CA | -6.28 | 09:50 | 4' | 6' | ... | 97 | 43620 | 44640.0 | 98 | 98 | 1' | 5' | 6' | -65.833000 | 18.055100 |
1630 | yobu | <NA> | Puerto Rico | Yobucou PR | SXXX03 | 3366B5CA | NaN | NaN | NaN | 6' | ... | 0 | NaN | NaN | 0 | 0 | ' | NaN | 6' | -65.833000 | 18.050100 |
334 rows × 24 columns
stofs_plot = stofs_2022.hvplot.scatter(x= "lon", y="lat", hover_cols = "ID", s=130, c='lightgrey', label = 'STOFS 2022 output stations')
stofs_plot1 = stofs_2023.hvplot.scatter(x="lon", y="lat", hover_cols = "ID", s=150, c='grey', label = 'STOFS 2023 output stations')
stofs_plot2 = stofs_2024.hvplot.scatter(x="lon", y="lat", hover_cols = "ID", s=200, c='k', label = 'STOFS 2024 output stations')
ioc_plot = ioc_.hvplot.scatter(x="lon", y="lat",hover_cols = "ioc_code", s= 30 , c = 'y',label = 'all IOC stations')
ioc_cleanup_plot = ioc_cleanup.hvplot.scatter(x="lon", y="lat",s = 90, c='r',label = 'stations already cleaned for 2022-2023')
ioc_to_add_plot = ioc_to_add.hvplot.scatter(x="lon", y="lat", s=90, c='g', label='stations to be added')
(stofs_plot2 * stofs_plot1 * stofs_plot * ioc_to_add_plot * ioc_cleanup_plot).opts(width = 1800, height = 800)
The 2024 IOC cleanup database is the union of the red and green points:
ioc_cleanup_2024 = pd.concat([ioc_cleanup,ioc_to_add])
ioc_cleanup_2024
ioc_cleanup_2024.to_csv("ioc_cleanup_2024.csv")
 | location | ioc_code | gloss_id | lat | lon | country | connection | contacts | dcp_id | last_observation_level | ... | number_of_years | time_zone_hours | datum_information | instrument | precision | null_value | gauge_type | overall_record_quality | gesla3_id | seaset_id
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
125 | Base O'Higgins | ohig | <NA> | -63.3210 | -57.9010 | Antarctica | SXCH40 | Servicio Hidrográfico y Oceanográfico de la Ar... | ADC04BE6 | 1.75 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 125.0 |
135 | Puerto Deseado | dese | 190.0 | -47.7540 | -65.9150 | Argentina | SEPO40 | Armada Argentina Servicio de Hidrografía Naval... | 33912088 | 3.69 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 135.0 |
136 | Puerto Madryn | madry | 191.0 | -42.7630 | -65.0310 | Argentina | SEPO40 | Armada Argentina Servicio de Hidrografía Naval... | 335665D2 | 6.60 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 136.0 |
139 | Battery Point | bapj | <NA> | -42.8920 | 147.3380 | Australia | SZAU01 | National Tidal Centre/Australian Bureau of Met... | 61221 | 0.91 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 139.0 |
140 | Broome | brom | 40.0 | -18.0010 | 122.2190 | Australia | SZAU01 | National Tidal Centre/Australian Bureau of Met... | 62650 | 7.69 | ... | 32.0 | 0.0 | Unspecified | Unspecified | Unspecified | -99.9999 | Coastal | No obvious issues | Broome | 140.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1615 | Bay Waveland Yacht Club, MS | wlms2 | <NA> | 30.3264 | -89.3258 | USA | SXXX03 | National Ocean Service-NOAA ( USA ) | 33456542 | 16.02 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1621 | Washington DC | wsdc | <NA> | 38.8700 | -77.0200 | USA | SXXX50 | National Ocean Service-NOAA ( USA ) | 335861BA | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1624 | Christmas KI | xmas | 146.0 | 1.9840 | -157.4730 | Kiribati | SZXX01 | Kiribati Met Office ( Kiribati ) +University o... | 301434060001890 | 1.30 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1625 | Yabucoa Harbor, PR | yabu | <NA> | 18.0551 | -65.8330 | USA | SXXX03 | Puerto Rico Seismic Network ( USA ) +National ... | 3366B5CA | -6.28 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1630 | Yobucou PR | yobu | <NA> | 18.0501 | -65.8330 | Puerto Rico | SXXX03 | NaN | 3366B5CA | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
503 rows × 77 columns
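Since the cleaned 2022-2023 stations and the new additions could in principle overlap, a small safety check (an optional addition, not part of the original workflow) can flag duplicated codes before the file is used downstream:
# flag IOC codes that appear more than once in the concatenated table
dupes = ioc_cleanup_2024[ioc_cleanup_2024.ioc_code.duplicated(keep=False)]
if not dupes.empty:
    print("duplicated IOC codes:", sorted(dupes.ioc_code.unique()))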
def find_nearest_nodes(
    mesh_nodes: pd.DataFrame,
    points: pd.DataFrame,
    metric: str = "haversine",
    earth_radius = 6371000,
):
    """
    Calculate the mesh nodes that are nearest to the specified `points`.

    Both `mesh_nodes` and `points` must be `pandas.DataFrames` that have
    columns named `lon` and `lat` and the coords must be in EPSG:4326.

    Returns the `points` DataFrame after adding these extra columns:

    - `mesh_index` which is the index of the node in the `hgrid.gr3` file
    - `mesh_lon` which is the longitude of the nearest mesh node
    - `mesh_lat` which is the latitude of the nearest mesh node
    - `distance` which is the distance in meters between the point and the nearest mesh node

    Examples:
        >>> mesh_nodes = pd.DataFrame({
        ...     "lon": [0, 10, 20],
        ...     "lat": [0, 5, 0],
        ... })
        >>> points = pd.DataFrame({
        ...     "lon": [1, 11, 21],
        ...     "lat": [1, 4, 1],
        ...     "id": ["a", "b", "c"],
        ... })
        >>> nearest_nodes = find_nearest_nodes(mesh_nodes, points)
        >>> nearest_nodes
           lon  lat id  mesh_index  mesh_lon  mesh_lat       distance
        0    1    1  a           0         0         0  157249.381272
        1   11    4  b           1        10         5  157010.162641
        2   21    1  c           2        20         0  157249.381272
    """
    # The only requirement is that both `mesh_nodes` and `points` have `lon/lat` columns
    tree = sklearn.neighbors.BallTree(
        np.radians(mesh_nodes[["lat", "lon"]]),
        metric=metric,
    )
    distances, indices = tree.query(np.radians(points[["lat", "lon"]].values))
    closest_nodes = (
        mesh_nodes
        .rename(columns={"lon": "mesh_lon", "lat": "mesh_lat"})
        .iloc[indices.flatten()]
        .assign(distance=(distances.flatten() * earth_radius))
        .reset_index(names=["mesh_index"])
    )
    return pd.concat((points.reset_index(drop=True), closest_nodes), axis="columns")
# 2 - find the nearest STOFS2D output point for each IOC station
nearest_nodes_2022 = find_nearest_nodes(stofs_2022, ioc_cleanup_2024[["lon","lat","ioc_code","location"]])
nearest_nodes_2023 = find_nearest_nodes(stofs_2023, ioc_cleanup_2024[["lon","lat","ioc_code","location"]])
nearest_nodes_2024 = find_nearest_nodes(stofs_2024, ioc_cleanup_2024[["lon","lat","ioc_code","location"]])
nearest_nodes_2022 = nearest_nodes_2022[~nearest_nodes_2022.mesh_index.isna()]
nearest_nodes_2023 = nearest_nodes_2023[~nearest_nodes_2023.mesh_index.isna()]
nearest_nodes_2024 = nearest_nodes_2024[~nearest_nodes_2024.mesh_index.isna()]
keep_nodes_2022 = nearest_nodes_2022[nearest_nodes_2022.distance < 5000]
keep_nodes_2023 = nearest_nodes_2023[nearest_nodes_2023.distance < 5000]
keep_nodes_2024 = nearest_nodes_2024[nearest_nodes_2024.distance < 5000]
keep_nodes_2022.to_csv("keep_nodes_2022.csv")
keep_nodes_2023.to_csv("keep_nodes_2023.csv")
keep_nodes_2024.to_csv("keep_nodes_2024.csv")
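A quick summary of how many stations survive the 5 km threshold for each station list (a convenience check, not in the original notebook):
for year, kept in (("2022", keep_nodes_2022), ("2023", keep_nodes_2023), ("2024", keep_nodes_2024)):
    print(f"{year}: {len(kept)} IOC stations matched to a STOFS2D point within 5 km")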
The red points are all the STOFS2D stations to be extracted:
p2 = stofs_2022.hvplot.scatter(x="lon", y="lat", hover_cols = "ID", s=50, c='grey', label = 'STOFS 2022 output stations')
ip = ioc_cleanup_2024.hvplot.scatter(x="lon", y="lat",s = 10, c='g',label = 'IOC_CLEANUP 2022-2024')
k2 = keep_nodes_2022.hvplot.scatter(x="lon", y="lat", c = 'red', s = 10, label = "STOFS2D stations to be extracted")
(p2 * ip * k2).opts(width = 1800, height = 800)
Download the STOFS2D files:
import pathlib
import pandas as pd
import httpx
import multifutures
base_path = "stofs2d"
pathlib.Path(base_path).mkdir(exist_ok=True)
# The URL changed at 2023-01-08
# These are the new urls
base_url = "https://noaa-nos-stofs2d-pds.s3.amazonaws.com/stofs_2d_glo.{date_str}/stofs_2d_glo.t{time}z.points.swl.nc"
new_url_names = {}
for date in pd.date_range("2023-01-08", "2024-12-31"):
    date_str = date.strftime("%Y%m%d")
    for time in ("00", "06", "12", "18"):
        url = base_url.format(date_str=date_str, time=time)
        name = f"{date_str}_{url.rsplit('/', 1)[1]}"
        new_url_names[url] = name
# These are the old urls
base_url = "https://noaa-nos-stofs2d-pds.s3.amazonaws.com/estofs.{date_str}/estofs.t{time}z.points.swl.nc"
old_url_names = {}
for date in pd.date_range("2022-01-01", "2023-01-07"):
    date_str = date.strftime("%Y%m%d")
    for time in ("00", "06", "12", "18"):
        url = base_url.format(date_str=date_str, time=time)
        name = f"{date_str}_{url.rsplit('/', 1)[1].replace('estofs', 'stofs_2d_glo')}"
        old_url_names[url] = name
def download_file(client, url, name):
    with open(name, "wb") as fd:
        with client.stream("GET", url) as response:
            for data in response.iter_bytes():
                fd.write(data)
with httpx.Client() as client:
    func_kwargs = [{"client": client, "url": url, "name": f"{base_path}/{name}"} for (url, name) in new_url_names.items()]
    multifutures.multithread(func=download_file, func_kwargs=func_kwargs, check=True)
with httpx.Client() as client:
    func_kwargs = [{"client": client, "url": url, "name": f"{base_path}/{name}"} for (url, name) in old_url_names.items()]
    multifutures.multithread(func=download_file, func_kwargs=func_kwargs, check=True)
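The helper above downloads every file unconditionally; a hedged variant (an illustration only, not the original helper) skips files already on disk and reports non-200 responses, which is convenient when re-running the notebook:
import pathlib
import httpx

def download_file_if_missing(client: httpx.Client, url: str, name: str) -> None:
    # skip files that are already on disk so the loop can be re-run incrementally
    if pathlib.Path(name).exists():
        return
    with client.stream("GET", url) as response:
        if response.status_code != 200:
            # some cycles may be missing on S3
            print(f"skipping {url}: HTTP {response.status_code}")
            return
        with open(name, "wb") as fd:
            for data in response.iter_bytes():
                fd.write(data)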
STOFS_FOLDER = "stofs2d/"
Extract the station time series:
import glob
# gather STOFS2D data
ds1 = []
ds2 = []
ds3 = []
for file in sorted(glob.glob(STOFS_FOLDER + "*.swl.nc")):
    root, filename = os.path.split(file)
    date = datetime.strptime(filename, "%Y%m%d_stofs_2d_glo.t%Hz.points.swl.nc")
    tmp = xr.open_dataset(file)
    # keep only the first 6 hours of each forecast cycle
    if date > datetime(2024, 5, 14, 11):
        ds3.append(tmp.sel(time=slice(date, date + pd.Timedelta(hours=6))))
        print(date, len(tmp.station_name), "ds3")
    elif date > datetime(2023, 1, 7, 23):
        ds2.append(tmp.sel(time=slice(date, date + pd.Timedelta(hours=6))))
        print(date, len(tmp.station_name), "ds2")
    else:  # dates up to 2023-01-07
        ds1.append(tmp.sel(time=slice(date, date + pd.Timedelta(hours=6))))
        print(date, len(tmp.station_name), "ds1")
ds1 = xr.concat(ds1, dim="time")
ds2 = xr.concat(ds2, dim="time")
ds3 = xr.concat(ds3, dim="time")
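As a sanity check (a sketch, using the `station` dimension that is indexed below), the three concatenated datasets should reflect the station counts seen earlier:
print(ds1.sizes["station"], ds2.sizes["station"], ds3.sizes["station"])  # expected 562, 1687, 1688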
os.makedirs("data", exist_ok=True)  # make sure the output folder exists
for it, station in keep_nodes_2022.iterrows():  # take 2022 because it has fewer nodes
    print(f'Station {station.ioc_code} at {station.lon:.2f}, {station.lat:.2f}')
    ds1_subset = ds1.isel(station=int(station.mesh_index))
    ds2_subset = ds2.isel(station=int(station.mesh_index))
    ds3_subset = ds3.isel(station=int(station.mesh_index))
    ds_subset = xr.concat([ds1_subset, ds2_subset, ds3_subset], dim="time")
    df = ds_subset.to_pandas()
    df.to_parquet(f'data/{station.ioc_code}.parquet')
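To verify the extraction, one of the saved series can be read back and plotted; a minimal sketch (the station code is just one example from the list above, and the water-level column is assumed to be named `zeta`):
import pandas as pd
import hvplot.pandas

check = pd.read_parquet("data/abas.parquet")
print(check.index.min(), check.index.max(), len(check))
check["zeta"].hvplot(title="STOFS2D storm surge at station 'abas'")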