Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add climatology features #38

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mlpp_features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
from mlpp_features.obs import *
from mlpp_features.terrain import *
from mlpp_features.time import *
from mlpp_features.climatology import *
45 changes: 45 additions & 0 deletions mlpp_features/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,51 @@ def wind_from_direction(self):
da.attrs["units"] = "degrees"
return da.to_dataset(name="wind_from_direction")

def rolling_mean_days_and_hours(self, days: int, hours: int) -> xr.Dataset:
"""
Compute the rolling mean of the input dataset over a given number of days and hours.
"""
ds = self.ds

ds = ds.assign_coords(hourofday=ds["time"].dt.hour).assign_coords(
dayofyear=ds["time"].dt.dayofyear
)

rolling_mean_hour = ds.rolling(
time=hours * 2 + 1, center=True, min_periods=1
).mean()

ds_rolled_days_list = []
days_list = list(range(1, 367))
for day in range(1, 367):
days_range = [
(day + i) % 366 if (day + i) % 366 != 0 else 366
for i in range(-days, days + 1)
]

try:
rolling_mean_day = (
rolling_mean_hour.where(
rolling_mean_hour["dayofyear"].isin(days_range), drop=True
)
.groupby("time.hour")
.mean()
)
except ValueError as e:
if "hour must not be empty" in str(e):
days_list.remove(day)
continue
else:
raise e
Comment on lines +331 to +344
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sort of fix to pass the pytest.
It fails (without that fix) if one the days window is empty.


ds_rolled_days_list.append(rolling_mean_day)

rolling_mean_ds = xr.concat(
ds_rolled_days_list, dim=xr.DataArray(days_list, dims="dayofyear")
).rename({"hour": "hourofday"})

return rolling_mean_ds

def rankdata(
self,
dim: str = "realization",
Expand Down
60 changes: 60 additions & 0 deletions mlpp_features/climatology.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import logging
from typing import Dict

import pandas as pd
import xarray as xr

from mlpp_features.decorators import out_format

LOGGER = logging.getLogger(__name__)

# Set global options
xr.set_options(keep_attrs=True)


# @out_format()
# def cloud_area_fraction_rollingmean_1h_10d(
# data: Dict[str, xr.Dataset], stations, reftimes, leadtimes, **kwargs
# ) -> xr.DataArray:
# """
# Climatology of cloud cover: the values were averaged at +-1h and +-10 days to account for seasonal variability
# """
# return (
# data["climatology"]
# .mlpp.get("cloud_area_fraction")
# .mlpp.unstack_time(reftimes, leadtimes)
# .astype("float32")
# )


@out_format()
def cloud_area_fraction_rollingmean_1h_10d(
data: Dict[str, xr.Dataset], stations, reftimes, leadtimes, **kwargs
) -> xr.DataArray:
"""
Climatology of cloud cover: the values were averaged at +-1h and +-10 days to account for seasonal variability
"""
rolling_mean_ds = (
data["obs"]
.mlpp.get("cloud_area_fraction")
.mlpp.rolling_mean_days_and_hours(hours=1, days=10)
)
clim_ds = xr.Dataset(
None,
coords={
"forecast_reference_time": pd.to_datetime(reftimes).astype("datetime64[ns]"),
"lead_time": leadtimes.astype("timedelta64[ns]"),
},
)

clim_ds = clim_ds.assign_coords(time=clim_ds.forecast_reference_time + clim_ds.lead_time)
days = clim_ds.time.dt.dayofyear
hours = clim_ds.time.dt.hour

clim_ds = (
rolling_mean_ds.sel(dayofyear=days, hourofday=hours)
.mlpp.unstack_time(reftimes, leadtimes)
.astype("float32")
)

return clim_ds
2 changes: 1 addition & 1 deletion mlpp_features/discover.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def discover_inputs(pipelines: Union[str, List[str]]) -> List:
pipelines = [pipelines]

# pass an empty dataset to trigger a KeyError
data = {"nwp": xr.Dataset(), "terrain": xr.Dataset(), "obs": xr.Dataset()}
data = {"nwp": xr.Dataset(), "terrain": xr.Dataset(), "obs": xr.Dataset(), "climatology": xr.Dataset()}
inputs = []
for pipeline in pipelines:
try:
Expand Down
41 changes: 41 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,47 @@ def _data():
return _data


@pytest.fixture
def clim_dataset():
"""Create climatology dataset as if loaded from zarr files, still unprocessed."""

def _data():

variables = [
"cloud_area_fraction",
]

stations = _stations_dataframe()
times = pd.date_range("2000-01-01T00", "2000-01-02T00", freq="1h")

n_times = len(times)
n_stations = len(stations)

var_shape = (n_times, n_stations)
ds = xr.Dataset(
None,
coords={
"time": times,
"station": stations.index,
"longitude": ("station", stations.longitude),
"latitude": ("station", stations.latitude),
"height_masl": ("station", stations.height_masl),
"owner_id": ("station", np.random.randint(1, 5, stations.shape[0])),
"pole_height": ("station", np.random.randint(5, 15, stations.shape[0])),
"roof_height": ("station", np.zeros(stations.shape[0])),
},
)
for var in variables:
measurements = np.random.randn(*var_shape)
nan_idx = [np.random.randint(0, d, 60) for d in var_shape]
measurements[nan_idx[0], nan_idx[1]] = np.nan
ds[var] = (("time", "station"), measurements)
return ds

return _data
Comment on lines +275 to +312
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we move to using the "obs" part of the variable dictionnary, we likely don't need this anymore




@pytest.fixture
def preproc_dataset():
def _data():
Expand Down
5 changes: 4 additions & 1 deletion tests/test_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ class TestFeatures:

@pytest.fixture(autouse=True)
def _make_datasets(
self, nwp_dataset, obs_dataset, terrain_dataset, stations_dataframe
self, nwp_dataset, obs_dataset, terrain_dataset, clim_dataset, stations_dataframe
):
self._nwp = nwp_dataset(1e4)
self._obs = obs_dataset()
self._terrain = terrain_dataset(1e4)
self._climatology = clim_dataset()
self._stations = stations_dataframe()

@pytest.mark.parametrize("pipeline,", pipelines)
Expand All @@ -33,6 +34,7 @@ def test_raise_keyerror(self, pipeline):
"nwp": xr.Dataset(),
"terrain": xr.Dataset(),
"obs": xr.Dataset(),
"climatology": xr.Dataset(),
}
with pytest.raises(KeyError):
pipeline(empty_data, None, None, None)
Expand All @@ -44,6 +46,7 @@ def test_features(self, pipeline):
"nwp": self._nwp,
"terrain": self._terrain,
"obs": self._obs,
"climatology": self._climatology,
}
stations = self._stations
reftimes = pd.date_range("2000-01-01T00", "2000-01-01T18", periods=4)
Expand Down