MeteoSwiss · louisPoulain · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024 · Nov 12, 2024
diff --git a/mlpp_features/__init__.py b/mlpp_features/__init__.py
@@ -4,3 +4,4 @@
 from mlpp_features.obs import *
 from mlpp_features.terrain import *
 from mlpp_features.time import *
+from mlpp_features.climatology import *
diff --git a/mlpp_features/accessors.py b/mlpp_features/accessors.py
@@ -306,6 +306,51 @@ def wind_from_direction(self):
         da.attrs["units"] = "degrees"
         return da.to_dataset(name="wind_from_direction")
 
+    def rolling_mean_days_and_hours(self, days: int, hours: int) -> xr.Dataset:
+        """
+        Compute the rolling mean of the input dataset over a given number of days and hours.
+        """
+        ds = self.ds
+
+        ds = ds.assign_coords(hourofday=ds["time"].dt.hour).assign_coords(
+            dayofyear=ds["time"].dt.dayofyear
+        )
+
+        rolling_mean_hour = ds.rolling(
+            time=hours * 2 + 1, center=True, min_periods=1
+        ).mean()
+
+        ds_rolled_days_list = []
+        days_list = list(range(1, 367))
+        for day in range(1, 367):
+            days_range = [
+                (day + i) % 366 if (day + i) % 366 != 0 else 366
+                for i in range(-days, days + 1)
+            ]
+
+            try:
+                rolling_mean_day = (
+                    rolling_mean_hour.where(
+                        rolling_mean_hour["dayofyear"].isin(days_range), drop=True
+                    )
+                    .groupby("time.hour")
+                    .mean()
+                )
+            except ValueError as e:
+                if "hour must not be empty" in str(e):
+                    days_list.remove(day)
+                    continue
+                else:
+                    raise e
+
+            ds_rolled_days_list.append(rolling_mean_day)
+
+        rolling_mean_ds = xr.concat(
+            ds_rolled_days_list, dim=xr.DataArray(days_list, dims="dayofyear")
+        ).rename({"hour": "hourofday"})
+
+        return rolling_mean_ds
+
     def rankdata(
         self,
         dim: str = "realization",

diff --git a/mlpp_features/climatology.py b/mlpp_features/climatology.py
@@ -0,0 +1,60 @@
+import logging
+from typing import Dict
+
+import pandas as pd
+import xarray as xr
+
+from mlpp_features.decorators import out_format
+
+LOGGER = logging.getLogger(__name__)
+
+# Set global options
+xr.set_options(keep_attrs=True)
+
+
+# @out_format()
+# def cloud_area_fraction_rollingmean_1h_10d(
+#     data: Dict[str, xr.Dataset], stations, reftimes, leadtimes, **kwargs
+# ) -> xr.DataArray:
+#     """
+#     Climatology of cloud cover: the values were averaged at +-1h and +-10 days to account for seasonal variability 
+#     """
+#     return (
+#         data["climatology"]
+#         .mlpp.get("cloud_area_fraction")
+#         .mlpp.unstack_time(reftimes, leadtimes)
+#         .astype("float32")
+#     )
+
+
+@out_format()
+def cloud_area_fraction_rollingmean_1h_10d(
+    data: Dict[str, xr.Dataset], stations, reftimes, leadtimes, **kwargs
+) -> xr.DataArray:
+    """
+    Climatology of cloud cover: the values were averaged at +-1h and +-10 days to account for seasonal variability 
+    """
+    rolling_mean_ds = (
+        data["obs"]
+        .mlpp.get("cloud_area_fraction")
+        .mlpp.rolling_mean_days_and_hours(hours=1, days=10)
+    )
+    clim_ds = xr.Dataset(
+        None,
+        coords={
+            "forecast_reference_time": pd.to_datetime(reftimes).astype("datetime64[ns]"),
+            "lead_time": leadtimes.astype("timedelta64[ns]"),
+        },
+    )
+
+    clim_ds = clim_ds.assign_coords(time=clim_ds.forecast_reference_time + clim_ds.lead_time)
+    days = clim_ds.time.dt.dayofyear
+    hours = clim_ds.time.dt.hour
+
+    clim_ds = (
+        rolling_mean_ds.sel(dayofyear=days, hourofday=hours)
+        .mlpp.unstack_time(reftimes, leadtimes)
+        .astype("float32")
+    )
+
+    return clim_ds
diff --git a/mlpp_features/discover.py b/mlpp_features/discover.py
@@ -14,7 +14,7 @@ def discover_inputs(pipelines: Union[str, List[str]]) -> List:
         pipelines = [pipelines]
 
     # pass an empty dataset to trigger a KeyError
-    data = {"nwp": xr.Dataset(), "terrain": xr.Dataset(), "obs": xr.Dataset()}
+    data = {"nwp": xr.Dataset(), "terrain": xr.Dataset(), "obs": xr.Dataset(), "climatology": xr.Dataset()}
     inputs = []
     for pipeline in pipelines:
         try:

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -272,6 +272,47 @@ def _data():
     return _data
 
 
+@pytest.fixture
+def clim_dataset():
+    """Create climatology dataset as if loaded from zarr files, still unprocessed."""
+
+    def _data():
+
+        variables = [
+            "cloud_area_fraction",
+        ]
+
+        stations = _stations_dataframe()
+        times = pd.date_range("2000-01-01T00", "2000-01-02T00", freq="1h")
+
+        n_times = len(times)
+        n_stations = len(stations)
+
+        var_shape = (n_times, n_stations)
+        ds = xr.Dataset(
+            None,
+            coords={
+                "time": times,
+                "station": stations.index,
+                "longitude": ("station", stations.longitude),
+                "latitude": ("station", stations.latitude),
+                "height_masl": ("station", stations.height_masl),
+                "owner_id": ("station", np.random.randint(1, 5, stations.shape[0])),
+                "pole_height": ("station", np.random.randint(5, 15, stations.shape[0])),
+                "roof_height": ("station", np.zeros(stations.shape[0])),
+            },
+        )
+        for var in variables:
+            measurements = np.random.randn(*var_shape)
+            nan_idx = [np.random.randint(0, d, 60) for d in var_shape]
+            measurements[nan_idx[0], nan_idx[1]] = np.nan
+            ds[var] = (("time", "station"), measurements)
+        return ds
+
+    return _data
+
+
+
 @pytest.fixture
 def preproc_dataset():
     def _data():

diff --git a/tests/test_features.py b/tests/test_features.py
@@ -19,11 +19,12 @@ class TestFeatures:
 
     @pytest.fixture(autouse=True)
     def _make_datasets(
-        self, nwp_dataset, obs_dataset, terrain_dataset, stations_dataframe
+        self, nwp_dataset, obs_dataset, terrain_dataset, clim_dataset, stations_dataframe
     ):
         self._nwp = nwp_dataset(1e4)
         self._obs = obs_dataset()
         self._terrain = terrain_dataset(1e4)
+        self._climatology = clim_dataset()
         self._stations = stations_dataframe()
 
     @pytest.mark.parametrize("pipeline,", pipelines)
@@ -33,6 +34,7 @@ def test_raise_keyerror(self, pipeline):
             "nwp": xr.Dataset(),
             "terrain": xr.Dataset(),
             "obs": xr.Dataset(),
+            "climatology": xr.Dataset(),
         }
         with pytest.raises(KeyError):
             pipeline(empty_data, None, None, None)
@@ -44,6 +46,7 @@ def test_features(self, pipeline):
             "nwp": self._nwp,
             "terrain": self._terrain,
             "obs": self._obs,
+            "climatology": self._climatology,
         }
         stations = self._stations
         reftimes = pd.date_range("2000-01-01T00", "2000-01-01T18", periods=4)