modify self-made loss func to avoid this issue: thouska/spotpy#319

OuyangWenyu · Mar 26, 2024 · 262ec8e · 262ec8e
1 parent a522194
commit 262ec8e
Show file tree

Hide file tree

Showing 13 changed files with 517 additions and 418 deletions.
diff --git a/README.md b/README.md
@@ -45,6 +45,8 @@ $ python -m ipykernel install --user --name hydromodel --display-name "hydromode
 
 You can use the CAMELS dataset (see [here](https://github.com/OuyangWenyu/hydrodataset) to prepare it) to run the model.
 
+If CAMELS is used, you can skip this step.
+
 To use your own data to run the model, you need prepare the data in the required format.
 
 We provide some transformation functions in the "scripts" directory. You can use them to transform your data to the required format.
@@ -81,24 +83,25 @@ No more unnecessary columns are allowed.
 For time series csv files, et and node1_flow are optional. If you don't have them, you can ignore them.
 The units of all variables could be different, but they cannot be missed and should be put in `()` in the column name.
 
-2. download [prepare_data.py](https://github.com/OuyangWenyu/hydro-model-xaj/tree/master/scripts) and run the following code to transform the data format to the required format:
+2. Download [prepare_data.py](https://github.com/OuyangWenyu/hydro-model-xaj/tree/master/scripts) and run the following code to transform the data format to the required format:
 ```Shell
 $ python prepare_data.py --origin_data_dir <your_data_directory_for_hydromodel>
 ```
 
-3. If the format is wrong, please do step 1 again carefully. If the format is right, you can run the following code to preprocess the data, such as cross-validation, etc.:
+### Run the model
+
+To run calibration with CAMLES dataset, you can use the following code:
+
 ```Shell
-$ python datapreprocess4calibrate.py --data <name of the data file> --exp <name of the directory of the prepared data>
+$ python calibrate_xaj.py --exp camels --warmup_length 365 --model {\"name\":\"xaj_mz\",\"source_type\":\"sources\",\"source_book\":\"HF\"} --algorithm {\"name\":\"SCE_UA\",\"random_seed\":1234,\"rep\":5000,\"ngs\":20,\"kstop\":3,\"peps\":0.1,\"pcento\":0.1}
 ```
 
-### Run the model
-
-Run the following code:
+To use your own data, run the following code:
 
 ```Shell
 # you can change the algorithm parameters:
 $ python calibrate_xaj.py --exp example --warmup_length 365 --model {\"name\":\"xaj_mz\",\"source_type\":\"sources\",\"source_book\":\"HF\"} --algorithm {\"name\":\"SCE_UA\",\"random_seed\":1234,\"rep\":5000,\"ngs\":20,\"kstop\":3,\"peps\":0.1,\"pcento\":0.1}
-# for advices of hyper-parameters of sceua, please see the help comment of the function 'calibrate_xaj.py'
+# for advices of hyper-parameters of sceua, please see the comment of the function 'calibrate_xaj.py'
 # python calibrate_xaj.py --exp <name of directory of the prepared data> --warmup_length <hydromodel need some warm-up period> --model <model function parameters> --algorithm <calibration algorithm parameters>
 ```
 

diff --git a/env-dev.yml b/env-dev.yml
@@ -18,6 +18,7 @@ dependencies:
   - sphinx
   - black
   - flake8
+  - pytest
   # pip
   - pip
   - pip:

diff --git a/hydromodel/datasets/data_postprocess.py b/hydromodel/datasets/data_postprocess.py
@@ -34,7 +34,7 @@ def read_save_sceua_calibrated_params(basin_id, save_dir, sceua_calibrated_file_
         results
     )  # 结果数组中具有最小目标函数的位置的索引
     best_model_run = results[bestindex]
-    fields = [word for word in best_model_run.dtype.names if word.startswith("par")] 
+    fields = [word for word in best_model_run.dtype.names if word.startswith("par")]
     best_calibrate_params = pd.DataFrame(list(best_model_run[fields]))
     save_file = os.path.join(save_dir, basin_id + "_calibrate_params.txt")
     best_calibrate_params.to_csv(save_file, sep=",", index=False, header=True)

diff --git a/hydromodel/models/model_dict.py b/hydromodel/models/model_dict.py
@@ -1,12 +1,13 @@
 """
 Author: Wenyu Ouyang
 Date: 2024-03-23 08:25:49
-LastEditTime: 2024-03-26 11:44:04
+LastEditTime: 2024-03-26 18:11:44
 LastEditors: Wenyu Ouyang
-Description: CRITERION_DICT and MODEL_DICT
+Description: LOSS_DICT and MODEL_DICT
 FilePath: \hydro-model-xaj\hydromodel\models\model_dict.py
 Copyright (c) 2023-2024 Wenyu Ouyang. All rights reserved.
 """
+
 import numpy as np
 from spotpy.objectivefunctions import rmse
 from hydromodel.models.xaj import xaj
@@ -15,14 +16,37 @@
 
 
 def rmse43darr(obs, sim):
+    """RMSE for 3D array
+
+    Parameters
+    ----------
+    obs : np.ndarray
+        observation data
+    sim : np.ndarray
+        simulation data
+
+    Returns
+    -------
+    _type_
+        _description_
+
+    Raises
+    ------
+    ValueError
+        _description_
+    """
     rmses = np.sqrt(np.nanmean((sim - obs) ** 2, axis=0))
     rmse = rmses.mean(axis=0)
-    if rmse is np.nan:
-        raise ValueError("RMSE is nan, please check the input data.")
-    return rmse
+    if np.isnan(rmse) or any(np.isnan(sim)):
+        raise ValueError(
+            "RMSE is nan or there are nan values in the simulation data, please check the input data."
+        )
+    # tolist is necessary for spotpy to get the value
+    # otherwise the print will incur to an issue https://github.com/thouska/spotpy/issues/319
+    return rmse.tolist()
 
 
-CRITERION_DICT = {
+LOSS_DICT = {
     "RMSE": rmse43darr,
     "spotpy_rmse": rmse,
 }

diff --git a/hydromodel/trainers/calibrate_ga.py b/hydromodel/trainers/calibrate_ga.py
@@ -1,7 +1,7 @@
 """
 Author: Wenyu Ouyang
 Date: 2021-12-10 23:01:02
-LastEditTime: 2024-03-22 21:26:01
+LastEditTime: 2024-03-26 15:22:19
 LastEditors: Wenyu Ouyang
 Description: Calibrate XAJ model using DEAP
 FilePath: \hydro-model-xaj\hydromodel\trainers\calibrate_ga.py
@@ -21,9 +21,8 @@
 
 
 from hydromodel.models.model_config import MODEL_PARAM_DICT
-from hydromodel.models.model_dict import MODEL_DICT
+from hydromodel.models.model_dict import MODEL_DICT, rmse43darr
 from hydromodel.trainers.train_utils import plot_sim_and_obs, plot_train_iteration
-from models.model_dict import rmse43darr
 
 
 def evaluate(individual, x_input, y_true, warmup_length, model):

diff --git a/hydromodel/trainers/calibrate_sceua.py b/hydromodel/trainers/calibrate_sceua.py
@@ -5,11 +5,11 @@
 import pandas as pd
 from spotpy.parameter import Uniform, ParameterSet
 from hydromodel.models.model_config import MODEL_PARAM_DICT
-from hydromodel.models.model_dict import CRITERION_DICT, MODEL_DICT
+from hydromodel.models.model_dict import LOSS_DICT, MODEL_DICT
 
 
 class SpotSetup(object):
-    def __init__(self, p_and_e, qobs, warmup_length=365, model=None, metric=None):
+    def __init__(self, p_and_e, qobs, warmup_length=365, model=None, loss=None):
         """
         Set up for Spotpy
         NOTE: once for a basin in one sampler or
@@ -27,17 +27,17 @@ def __init__(self, p_and_e, qobs, warmup_length=365, model=None, metric=None):
             we support "gr4j", "hymod", and "xaj"
         model_func_param
             parameters of model function
-        metric
-            metric configs including objective function, typically RMSE
+        loss
+            loss configs including objective function, typically RMSE
         """
         if model is None:
             model = {
                 "name": "xaj_mz",
                 "source_type": "sources5mm",
                 "source_book": "HF",
             }
-        if metric is None:
-            metric = {
+        if loss is None:
+            loss = {
                 "type": "time_series",
                 "obj_func": "rmse",
                 "events": None,
@@ -49,7 +49,7 @@ def __init__(self, p_and_e, qobs, warmup_length=365, model=None, metric=None):
             Uniform(par_name, low=0.0, high=1.0) for par_name in self.parameter_names
         )
         # Just a way to keep this example flexible and applicable to various examples
-        self.metric = metric
+        self.loss = loss
         # Load Observation data from file
         self.p_and_e = p_and_e
         # chose observation data after warmup period
@@ -98,6 +98,7 @@ def objectivefunction(
         self,
         simulation: Union[list, np.array],
         evaluation: Union[list, np.array],
+        params=None,  # this cannot be removed
     ) -> float:
         """
         A user defined objective function to calculate fitness.
@@ -114,10 +115,10 @@ def objectivefunction(
         float
             likelihood
         """
-        if self.metric["type"] == "time_series":
-            return CRITERION_DICT[self.metric["obj_func"]](evaluation, simulation)
+        if self.loss["type"] == "time_series":
+            return LOSS_DICT[self.loss["obj_func"]](evaluation, simulation)
         # for events
-        time = self.metric["events"]
+        time = self.loss["events"]
         if time is None:
             raise ValueError(
                 "time should not be None since you choose events, otherwise choose time_series"
@@ -137,7 +138,7 @@ def objectivefunction(
                 ) / pd.Timedelta(hours=1)
                 start_num = int(start_num)
                 end_num = int(end_num)
-                like_ = CRITERION_DICT[self.metric["obj_func"]](
+                like_ = LOSS_DICT[self.loss["obj_func"]](
                     evaluation[start_num:end_num,], simulation[start_num:end_num,]
                 )
                 count += 1
@@ -154,7 +155,7 @@ def calibrate_by_sceua(
     warmup_length=365,
     model=None,
     algorithm=None,
-    metric=None,
+    loss=None,
 ):
     """
     Function for calibrating model by SCE-UA
@@ -178,8 +179,8 @@ def calibrate_by_sceua(
     algorithm
         calibrate algorithm. For example, if you want to calibrate xaj model,
         and use sce-ua algorithm -- random seed=2000, rep=5000, ngs=7, kstop=3, peps=0.1, pcento=0.1
-    metric
-        metric configs for events calculation or
+    loss
+        loss configs for events calculation or
         just one long time-series calculation
         with an objective function, typically RMSE
 
@@ -203,8 +204,8 @@ def calibrate_by_sceua(
             "peps": 0.1,
             "pcento": 0.1,
         }
-    if metric is None:
-        metric = {
+    if loss is None:
+        loss = {
             "type": "time_series",
             "obj_func": "RMSE",
             # when "type" is "events", this is not None, but idxs of events in time series
@@ -226,11 +227,11 @@ def calibrate_by_sceua(
             qobs[:, i : i + 1, :],
             warmup_length=warmup_length,
             model=model,
-            metric=metric,
+            loss=loss,
         )
+        if not os.path.exists(dbname):
+            os.makedirs(dbname)
         db_basin = os.path.join(dbname, basins[i])
-        if not os.path.exists(db_basin):
-            os.makedirs(db_basin)
         # Select number of maximum allowed repetitions
         sampler = spotpy.algorithms.sceua(
             spot_setup,