From 262ec8e9315983987b9c3eb89722e33c659154c8 Mon Sep 17 00:00:00 2001 From: "ouyang,wenyu" Date: Tue, 26 Mar 2024 18:17:32 +0800 Subject: [PATCH] modify self-made loss func to avoid this issue: https://github.com/thouska/spotpy/issues/319 --- README.md | 17 +- env-dev.yml | 1 + hydromodel/datasets/data_postprocess.py | 2 +- hydromodel/models/model_dict.py | 36 ++- hydromodel/trainers/calibrate_ga.py | 5 +- hydromodel/trainers/calibrate_sceua.py | 39 +-- scripts/calibrate_xaj.py | 389 +++++++++++------------- scripts/calibrate_xaj_camels.py | 156 ---------- scripts/calibrate_xaj_for_multicases.py | 2 +- scripts/datapostprocess4calibrate.py | 18 +- scripts/evaluate_xaj.py | 226 ++++++++++++++ test/test_calibrate.py | 2 +- test/test_data_postprocess.py | 42 ++- 13 files changed, 517 insertions(+), 418 deletions(-) delete mode 100644 scripts/calibrate_xaj_camels.py create mode 100644 scripts/evaluate_xaj.py diff --git a/README.md b/README.md index 5c5a7b7..c70d262 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,8 @@ $ python -m ipykernel install --user --name hydromodel --display-name "hydromode You can use the CAMELS dataset (see [here](https://github.com/OuyangWenyu/hydrodataset) to prepare it) to run the model. +If CAMELS is used, you can skip this step. + To use your own data to run the model, you need prepare the data in the required format. We provide some transformation functions in the "scripts" directory. You can use them to transform your data to the required format. @@ -81,24 +83,25 @@ No more unnecessary columns are allowed. For time series csv files, et and node1_flow are optional. If you don't have them, you can ignore them. The units of all variables could be different, but they cannot be missed and should be put in `()` in the column name. -2. download [prepare_data.py](https://github.com/OuyangWenyu/hydro-model-xaj/tree/master/scripts) and run the following code to transform the data format to the required format: +2. Download [prepare_data.py](https://github.com/OuyangWenyu/hydro-model-xaj/tree/master/scripts) and run the following code to transform the data format to the required format: ```Shell $ python prepare_data.py --origin_data_dir ``` -3. If the format is wrong, please do step 1 again carefully. 
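A quick way to sanity-check the column-name convention described above (every variable column must carry its unit in parentheses) before going back to step 1; the file name below is only a placeholder:

```Python
import re
import pandas as pd

# placeholder file name: point this at one of your prepared time-series csv files
df = pd.read_csv("your_basin_timeseries.csv")

for col in df.columns:
    # the date/time column (whatever it is named) carries no unit
    if "time" in col.lower() or "date" in col.lower():
        continue
    if not re.search(r"\(.+\)$", col):
        print(f"column '{col}' is missing its unit in parentheses")
```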
If the format is right, you can run the following code to preprocess the data, such as cross-validation, etc.: +### Run the model + +To run calibration with CAMLES dataset, you can use the following code: + ```Shell -$ python datapreprocess4calibrate.py --data --exp +$ python calibrate_xaj.py --exp camels --warmup_length 365 --model {\"name\":\"xaj_mz\",\"source_type\":\"sources\",\"source_book\":\"HF\"} --algorithm {\"name\":\"SCE_UA\",\"random_seed\":1234,\"rep\":5000,\"ngs\":20,\"kstop\":3,\"peps\":0.1,\"pcento\":0.1} ``` -### Run the model - -Run the following code: +To use your own data, run the following code: ```Shell # you can change the algorithm parameters: $ python calibrate_xaj.py --exp example --warmup_length 365 --model {\"name\":\"xaj_mz\",\"source_type\":\"sources\",\"source_book\":\"HF\"} --algorithm {\"name\":\"SCE_UA\",\"random_seed\":1234,\"rep\":5000,\"ngs\":20,\"kstop\":3,\"peps\":0.1,\"pcento\":0.1} -# for advices of hyper-parameters of sceua, please see the help comment of the function 'calibrate_xaj.py' +# for advices of hyper-parameters of sceua, please see the comment of the function 'calibrate_xaj.py' # python calibrate_xaj.py --exp --warmup_length --model --algorithm ``` diff --git a/env-dev.yml b/env-dev.yml index a0bfd98..bf5d19a 100644 --- a/env-dev.yml +++ b/env-dev.yml @@ -18,6 +18,7 @@ dependencies: - sphinx - black - flake8 + - pytest # pip - pip - pip: diff --git a/hydromodel/datasets/data_postprocess.py b/hydromodel/datasets/data_postprocess.py index 75d71fb..cd7dd76 100644 --- a/hydromodel/datasets/data_postprocess.py +++ b/hydromodel/datasets/data_postprocess.py @@ -34,7 +34,7 @@ def read_save_sceua_calibrated_params(basin_id, save_dir, sceua_calibrated_file_ results ) # 结果数组中具有最小目标函数的位置的索引 best_model_run = results[bestindex] - fields = [word for word in best_model_run.dtype.names if word.startswith("par")] + fields = [word for word in best_model_run.dtype.names if word.startswith("par")] best_calibrate_params = pd.DataFrame(list(best_model_run[fields])) save_file = os.path.join(save_dir, basin_id + "_calibrate_params.txt") best_calibrate_params.to_csv(save_file, sep=",", index=False, header=True) diff --git a/hydromodel/models/model_dict.py b/hydromodel/models/model_dict.py index a9361d0..938b1e0 100644 --- a/hydromodel/models/model_dict.py +++ b/hydromodel/models/model_dict.py @@ -1,12 +1,13 @@ """ Author: Wenyu Ouyang Date: 2024-03-23 08:25:49 -LastEditTime: 2024-03-26 11:44:04 +LastEditTime: 2024-03-26 18:11:44 LastEditors: Wenyu Ouyang -Description: CRITERION_DICT and MODEL_DICT +Description: LOSS_DICT and MODEL_DICT FilePath: \hydro-model-xaj\hydromodel\models\model_dict.py Copyright (c) 2023-2024 Wenyu Ouyang. All rights reserved. """ + import numpy as np from spotpy.objectivefunctions import rmse from hydromodel.models.xaj import xaj @@ -15,14 +16,37 @@ def rmse43darr(obs, sim): + """RMSE for 3D array + + Parameters + ---------- + obs : np.ndarray + observation data + sim : np.ndarray + simulation data + + Returns + ------- + _type_ + _description_ + + Raises + ------ + ValueError + _description_ + """ rmses = np.sqrt(np.nanmean((sim - obs) ** 2, axis=0)) rmse = rmses.mean(axis=0) - if rmse is np.nan: - raise ValueError("RMSE is nan, please check the input data.") - return rmse + if np.isnan(rmse) or any(np.isnan(sim)): + raise ValueError( + "RMSE is nan or there are nan values in the simulation data, please check the input data." 
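        # A minimal usage sketch (assuming spotpy hands this function numpy arrays
        # shaped (time, basin, 1), as the calibration setup prepares them):
        #     obs = np.ones((10, 1, 1))
        #     sim = np.full((10, 1, 1), 1.5)
        #     rmse43darr(obs, sim)  # -> [0.5], a plain Python list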
+ ) + # tolist is necessary for spotpy to get the value + # otherwise the print will incur to an issue https://github.com/thouska/spotpy/issues/319 + return rmse.tolist() -CRITERION_DICT = { +LOSS_DICT = { "RMSE": rmse43darr, "spotpy_rmse": rmse, } diff --git a/hydromodel/trainers/calibrate_ga.py b/hydromodel/trainers/calibrate_ga.py index 9facfcd..708f09e 100644 --- a/hydromodel/trainers/calibrate_ga.py +++ b/hydromodel/trainers/calibrate_ga.py @@ -1,7 +1,7 @@ """ Author: Wenyu Ouyang Date: 2021-12-10 23:01:02 -LastEditTime: 2024-03-22 21:26:01 +LastEditTime: 2024-03-26 15:22:19 LastEditors: Wenyu Ouyang Description: Calibrate XAJ model using DEAP FilePath: \hydro-model-xaj\hydromodel\trainers\calibrate_ga.py @@ -21,9 +21,8 @@ from hydromodel.models.model_config import MODEL_PARAM_DICT -from hydromodel.models.model_dict import MODEL_DICT +from hydromodel.models.model_dict import MODEL_DICT, rmse43darr from hydromodel.trainers.train_utils import plot_sim_and_obs, plot_train_iteration -from models.model_dict import rmse43darr def evaluate(individual, x_input, y_true, warmup_length, model): diff --git a/hydromodel/trainers/calibrate_sceua.py b/hydromodel/trainers/calibrate_sceua.py index 6656a91..6d1758a 100644 --- a/hydromodel/trainers/calibrate_sceua.py +++ b/hydromodel/trainers/calibrate_sceua.py @@ -5,11 +5,11 @@ import pandas as pd from spotpy.parameter import Uniform, ParameterSet from hydromodel.models.model_config import MODEL_PARAM_DICT -from hydromodel.models.model_dict import CRITERION_DICT, MODEL_DICT +from hydromodel.models.model_dict import LOSS_DICT, MODEL_DICT class SpotSetup(object): - def __init__(self, p_and_e, qobs, warmup_length=365, model=None, metric=None): + def __init__(self, p_and_e, qobs, warmup_length=365, model=None, loss=None): """ Set up for Spotpy NOTE: once for a basin in one sampler or @@ -27,8 +27,8 @@ def __init__(self, p_and_e, qobs, warmup_length=365, model=None, metric=None): we support "gr4j", "hymod", and "xaj" model_func_param parameters of model function - metric - metric configs including objective function, typically RMSE + loss + loss configs including objective function, typically RMSE """ if model is None: model = { @@ -36,8 +36,8 @@ def __init__(self, p_and_e, qobs, warmup_length=365, model=None, metric=None): "source_type": "sources5mm", "source_book": "HF", } - if metric is None: - metric = { + if loss is None: + loss = { "type": "time_series", "obj_func": "rmse", "events": None, @@ -49,7 +49,7 @@ def __init__(self, p_and_e, qobs, warmup_length=365, model=None, metric=None): Uniform(par_name, low=0.0, high=1.0) for par_name in self.parameter_names ) # Just a way to keep this example flexible and applicable to various examples - self.metric = metric + self.loss = loss # Load Observation data from file self.p_and_e = p_and_e # chose observation data after warmup period @@ -98,6 +98,7 @@ def objectivefunction( self, simulation: Union[list, np.array], evaluation: Union[list, np.array], + params=None, # this cannot be removed ) -> float: """ A user defined objective function to calculate fitness. 
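For reference, a minimal sketch of how the renamed `loss` argument is wired into the setup class. The shapes and values below are synthetic; it assumes the `hydromodel` package is importable and that `MODEL_PARAM_DICT` knows the `xaj_mz` model:

```Python
import numpy as np
from hydromodel.trainers.calibrate_sceua import SpotSetup

# synthetic forcing (prcp, pet) and observed flow for one basin: (time, basin, variable)
p_and_e = np.random.rand(400, 1, 2)
qobs = np.random.rand(400, 1, 1)

spot_setup = SpotSetup(
    p_and_e,
    qobs,
    warmup_length=365,
    model={"name": "xaj_mz", "source_type": "sources", "source_book": "HF"},
    # "RMSE" selects rmse43darr from LOSS_DICT; "spotpy_rmse" would use spotpy's built-in rmse
    loss={"type": "time_series", "obj_func": "RMSE", "events": None},
)
```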
@@ -114,10 +115,10 @@ def objectivefunction( float likelihood """ - if self.metric["type"] == "time_series": - return CRITERION_DICT[self.metric["obj_func"]](evaluation, simulation) + if self.loss["type"] == "time_series": + return LOSS_DICT[self.loss["obj_func"]](evaluation, simulation) # for events - time = self.metric["events"] + time = self.loss["events"] if time is None: raise ValueError( "time should not be None since you choose events, otherwise choose time_series" @@ -137,7 +138,7 @@ def objectivefunction( ) / pd.Timedelta(hours=1) start_num = int(start_num) end_num = int(end_num) - like_ = CRITERION_DICT[self.metric["obj_func"]]( + like_ = LOSS_DICT[self.loss["obj_func"]]( evaluation[start_num:end_num,], simulation[start_num:end_num,] ) count += 1 @@ -154,7 +155,7 @@ def calibrate_by_sceua( warmup_length=365, model=None, algorithm=None, - metric=None, + loss=None, ): """ Function for calibrating model by SCE-UA @@ -178,8 +179,8 @@ def calibrate_by_sceua( algorithm calibrate algorithm. For example, if you want to calibrate xaj model, and use sce-ua algorithm -- random seed=2000, rep=5000, ngs=7, kstop=3, peps=0.1, pcento=0.1 - metric - metric configs for events calculation or + loss + loss configs for events calculation or just one long time-series calculation with an objective function, typically RMSE @@ -203,8 +204,8 @@ def calibrate_by_sceua( "peps": 0.1, "pcento": 0.1, } - if metric is None: - metric = { + if loss is None: + loss = { "type": "time_series", "obj_func": "RMSE", # when "type" is "events", this is not None, but idxs of events in time series @@ -226,11 +227,11 @@ def calibrate_by_sceua( qobs[:, i : i + 1, :], warmup_length=warmup_length, model=model, - metric=metric, + loss=loss, ) + if not os.path.exists(dbname): + os.makedirs(dbname) db_basin = os.path.join(dbname, basins[i]) - if not os.path.exists(db_basin): - os.makedirs(db_basin) # Select number of maximum allowed repetitions sampler = spotpy.algorithms.sceua( spot_setup, diff --git a/scripts/calibrate_xaj.py b/scripts/calibrate_xaj.py index a558833..1e4d462 100644 --- a/scripts/calibrate_xaj.py +++ b/scripts/calibrate_xaj.py @@ -1,234 +1,198 @@ -import argparse +""" +Author: Wenyu Ouyang +Date: 2022-11-19 17:27:05 +LastEditTime: 2024-03-26 16:54:05 +LastEditors: Wenyu Ouyang +Description: the script to calibrate a model for CAMELS basin +FilePath: \hydro-model-xaj\scripts\calibrate_xaj.py +Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved. 
+""" + import json -import socket -import fnmatch -from datetime import datetime import numpy as np -import pandas as pd -import os +import argparse import sys +import os from pathlib import Path -from hydroutils import hydro_file +import xarray as xr +import yaml -sys.path.append(os.path.dirname(Path(os.path.abspath(__file__)).parent)) +from hydrodataset import Camels +from hydrodata.utils.utils import streamflow_unit_conv + + +repo_path = os.path.dirname(Path(os.path.abspath(__file__)).parent) +sys.path.append(repo_path) +from hydromodel import SETTING +from hydromodel.datasets import * +from hydromodel.datasets.data_preprocess import cross_valid_data, split_train_test from hydromodel.trainers.calibrate_sceua import calibrate_by_sceua -from hydromodel.datasets.data_postprocess import ( - renormalize_params, - read_save_sceua_calibrated_params, - save_streamflow, - summarize_metrics, - summarize_parameters, -) -from hydromodel.trainers.train_utils import show_calibrate_result, show_test_result -from hydromodel.models.xaj import xaj -from hydromodel.trainers.calibrate_ga import calibrate_by_ga, show_ga_result def calibrate(args): + data_type = args.data_type + data_dir = args.data_dir exp = args.exp - warmup = args.warmup_length + cv_fold = args.cv_fold + train_period = args.calibrate_period + test_period = args.test_period + periods = args.period + warmup = args.warmup + basin_ids = args.basin_id model_info = args.model algo_info = args.algorithm - comment = args.comment - data_dir = os.path.join('D:/研究生/毕业论文/new毕业论文/预答辩/碧流河水库/模型运行/') - kfold = [ - int(f_name[len("data_info_fold") : -len("_test.json")]) - for f_name in os.listdir(data_dir) #输出文件夹下的所有文件 - if fnmatch.fnmatch(f_name, "*_fold*_test.json") - ] - kfold = np.sort(kfold) - for fold in kfold: - print(f"Start to calibrate the {fold}-th fold") - train_data_info_file = os.path.join( - data_dir, f"data_info_fold{str(fold)}_train.json" - ) - train_data_file = os.path.join( - data_dir, f"data_info_fold{str(fold)}_train.npy" + loss = args.loss + if data_type == "camels": + camels_data_dir = os.path.join( + SETTING["local_data_path"]["datasets-origin"], "camels", data_dir ) - test_data_info_file = os.path.join( - data_dir, f"data_info_fold{str(fold)}_test.json" + camels = Camels(camels_data_dir) + ts_data = camels.read_ts_xrdataset( + basin_ids, periods, ["prcp", "PET", "streamflow"] ) - test_data_file = os.path.join( - data_dir, f"data_info_fold{str(fold)}_test.npy" + elif data_type == "owndata": + ts_data = xr.open_dataset( + os.path.join(os.path.dirname(data_dir), "timeseries.nc") ) - if ( - os.path.exists(train_data_info_file) is False - or os.path.exists(train_data_file) is False - or os.path.exists(test_data_info_file) is False - or os.path.exists(test_data_file) is False - ): - raise FileNotFoundError( - "The data files are not found, please run datapreprocess4calibrate.py first." 
- ) - data_train = hydro_file.unserialize_numpy(train_data_file) - data_test = hydro_file.unserialize_numpy(test_data_file) - data_info_train = hydro_file.unserialize_json_ordered(train_data_info_file) - data_info_test = hydro_file.unserialize_json_ordered(test_data_info_file) - current_time = datetime.now().strftime("%b%d_%H-%M-%S") - save_dir = os.path.join( - data_dir, - current_time - + "_" - + socket.gethostname() - + "_fold" - + str(fold) - + "_" - + comment, + else: + raise NotImplementedError( + "You should set the data type as 'camels' or 'owndata'" ) - # 读输入文件 - if os.path.exists(save_dir) is False: - os.makedirs(save_dir) - hydro_file.serialize_json(vars(args), os.path.join(save_dir, "args.json")) - if algo_info["name"] == "SCE_UA": - for i in range(len(data_info_train["basin"])): - basin_id = data_info_train["basin"][i] - basin_area = data_info_train["area"][i] - # one directory for one model + one hyperparam setting and one basin - spotpy_db_dir = os.path.join( # 一个模型一个文件夹 - save_dir, - basin_id, - ) - if not os.path.exists(spotpy_db_dir): - os.makedirs(spotpy_db_dir) - db_name = os.path.join(spotpy_db_dir, "SCEUA_" + model_info["name"]) - sampler = calibrate_by_sceua( # 率定 - data_train[:, i : i + 1, 0:2], - data_train[:, i : i + 1, -1:], - db_name, - warmup_length=warmup, - model=model_info, - algorithm=algo_info, - ) + where_save = Path(os.path.join(repo_path, "result", exp)) + if os.path.exists(where_save) is False: + os.makedirs(where_save) - show_calibrate_result( # 展示率定结果 - sampler.setup, - db_name, - warmup_length=warmup, - save_dir=spotpy_db_dir, - basin_id=basin_id, - train_period=data_info_train["time"], - basin_area=basin_area, - prcp=data_train[365:, i : i + 1, 0:1].flatten(), - ) - - params = read_save_sceua_calibrated_params( # 保存率定的参数文件 - basin_id, spotpy_db_dir, db_name - ) - # _ is et which we didn't use here - qsim, _ = xaj( # 计算模拟结果 - data_test[:, i : i + 1, 0:2], - params, - warmup_length=0 , - **model_info, - ) + if cv_fold <= 1: + # no cross validation + periods = np.sort( + [train_period[0], train_period[1], test_period[0], test_period[1]] + ) + if cv_fold > 1: + train_and_test_data = cross_valid_data(ts_data, periods, warmup, cv_fold) + else: + # when using train_test_split, the warmup period is not used + # so you should include the warmup period in the train and test period + train_and_test_data = split_train_test(ts_data, train_period, test_period) + print("Start to calibrate the model") - qsim = units.convert_unit( - qsim, - # TODO: to unify "mm/hour" - unit_now="mm/day", - unit_final=units.unit["streamflow"], - basin_area=basin_area, - ) - qobs = units.convert_unit( - data_test[warmup:, i : i + 1, -1:], - # TODO: to unify "mm/hour" - unit_now="mm/day", - unit_final=units.unit["streamflow"], - basin_area=basin_area, - ) - test_result_file = os.path.join( - spotpy_db_dir, - "test_qsim_" + model_info["name"] + "_" + str(basin_id) + ".csv", - ) - pd.DataFrame(qsim.reshape(-1, 1)).to_csv( - test_result_file, - sep=",", - index=False, - header=False, - ) - test_date = pd.to_datetime(data_info_test["time"][:]).values.astype( - "datetime64[h]" - ) - show_test_result( - basin_id, - test_date, - qsim, - qobs, - save_dir=spotpy_db_dir, - warmup_length=warmup, - prcp=data_test[365:, i : i + 1, 0:1].flatten(), - ) - elif algo_info["name"] == "GA": - for i in range(len(data_info_train["basin"])): - basin_id = data_info_train["basin"][i] - basin_area = data_info_train["area"][i] - # one directory for one model + one hyperparam setting and one basin - deap_db_dir 
= os.path.join( - save_dir, - basin_id, - ) - if not os.path.exists(deap_db_dir): - os.makedirs(deap_db_dir) - calibrate_by_ga( - data_train[:, i : i + 1, 0:2], - data_train[:, i : i + 1, -1:], - deap_db_dir, - warmup_length=warmup, - model=model_info, - ga_param=algo_info, - ) - show_ga_result( - deap_db_dir, - warmup_length=warmup, - basin_id=basin_id, - the_data=data_train[:, i : i + 1, :], - the_period=data_info_train["time"], - basin_area=basin_area, - model_info=model_info, - train_mode=True, - ) - show_ga_result( - deap_db_dir, - warmup_length=warmup, - basin_id=basin_id, - the_data=data_test[:, i : i + 1, :], - the_period=data_info_test["time"], - basin_area=basin_area, - model_info=model_info, - train_mode=False, - ) - else: - raise NotImplementedError( - "We don't provide this calibrate method! Choose from 'SCE_UA' or 'GA'!" - ) - summarize_parameters(save_dir, model_info) - renormalize_params(save_dir, model_info) - summarize_metrics(save_dir, model_info) - save_streamflow(save_dir, model_info, fold=fold) - print(f"Finish calibrating the {fold}-th fold") + if data_type == "camels": + basin_area = camels.read_area(basin_ids) + p_and_e = ( + train_and_test_data[0][["prcp", "PET"]] + .to_array() + .to_numpy() + .transpose(2, 1, 0) + ) + # trans unit to mm/day + qobs_ = train_and_test_data[0][["streamflow"]] + r_mmd = streamflow_unit_conv(qobs_, basin_area, target_unit="mm/d") + qobs = np.expand_dims(r_mmd["streamflow"].to_numpy().transpose(1, 0), axis=2) + elif data_type == "owndata": + attr_data = xr.open_dataset( + os.path.join(os.path.dirname(data_dir), "attributes.nc") + ) + basin_area = attr_data["area"].values + p_and_e = ( + train_and_test_data[0][[PRCP_NAME, PET_NAME]] + .to_array() + .to_numpy() + .transpose(2, 1, 0) + ) + qobs = np.expand_dims( + train_and_test_data[0][[FLOW_NAME]].to_array().to_numpy().transpose(1, 0), + axis=2, + ) + else: + raise NotImplementedError( + "You should set the data type as 'camels' or 'owndata'" + ) + calibrate_by_sceua( + basin_ids, + p_and_e, + qobs, + os.path.join(where_save, "sceua_xaj"), + warmup, + model=model_info, + algorithm=algo_info, + loss=loss, + ) + # Convert the arguments to a dictionary + args_dict = vars(args) + # Save the arguments to a YAML file + with open(os.path.join(where_save, "config.yaml"), "w") as f: + yaml.dump(args_dict, f) -# NOTE: Before run this command, you should run data_preprocess.py file to save your data as hydro-model-xaj data format, -# the exp must be same as the exp in data_preprocess.py -# python calibrate_xaj.py --exp exp201 --warmup_length 365 --model {\"name\":\"xaj_mz\",\"source_type\":\"sources\",\"source_book\":\"HF\"} --algorithm {\"name\":\"SCE_UA\",\"random_seed\":1234,\"rep\":2000,\"ngs\":20,\"kstop\":3,\"peps\":0.1,\"pcento\":0.1} -# python calibrate_xaj.py --exp exp61561 --warmup_length 365 --model {\"name\":\"xaj_mz\",\"source_type\":\"sources\",\"source_book\":\"HF\"} --algorithm {\"name\":\"GA\",\"random_seed\":1234,\"run_counts\":50,\"pop_num\":50,\"cross_prob\":0.5,\"mut_prob\":0.5,\"save_freq\":1} -if __name__ == "__main__": # 固定套路 - parser = argparse.ArgumentParser(description="Calibrate a hydrological model.") +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Run hydro-model-xaj models with the CAMELS dataset" + ) + parser.add_argument( + "--data_type", + dest="data_type", + help="CAMELS dataset or your own data, such as 'camels' or 'owndata'", + default="camels", + type=str, + ) + parser.add_argument( + "--data_dir", + dest="data_dir", + help="The 
directory of the CAMELS dataset or your own data, for CAMELS," + + " as we use SETTING to set the data path, you can directly choose camels_us;" + + " for your own data, you should set the absolute path of your data directory", + default="camels_us", + type=str, + ) parser.add_argument( "--exp", dest="exp", - help="An exp is corresponding to a data plan from data_preprocess.py", - default="/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/model_run_wuxi7", + help="An exp is corresponding to one data setting", + default="expcamels001", type=str, ) parser.add_argument( - "--warmup_length", - dest="warmup_length", - help="the length of warmup period for hydro model", + "--cv_fold", + dest="cv_fold", + help="the number of cross-validation fold", + default=1, + type=int, + ) + parser.add_argument( + "--warmup", + dest="warmup", + help="the number of warmup days", default=365, type=int, ) + parser.add_argument( + "--period", + dest="period", + help="The whole period", + default=["2007-01-01", "2014-01-01"], + nargs="+", + ) + parser.add_argument( + "--calibrate_period", + dest="calibrate_period", + help="The training period", + default=["2007-01-01", "2014-01-01"], + nargs="+", + ) + parser.add_argument( + "--test_period", + dest="test_period", + help="The testing period", + default=["2007-01-01", "2014-01-01"], + nargs="+", + ) + parser.add_argument( + "--basin_id", + dest="basin_id", + help="The basins' ids", + default=["01439500", "06885500", "08104900", "09510200"], + nargs="+", + ) parser.add_argument( "--model", dest="model", @@ -252,11 +216,12 @@ def calibrate(args): default={ "name": "SCE_UA", "random_seed": 1234, - "rep": 600, - "ngs": 1000, - "kstop": 1000, - "peps": 0.01, - "pcento": 0.01, + # these params are just for test + "rep": 10, + "ngs": 10, + "kstop": 5, + "peps": 0.1, + "pcento": 0.1, }, # default={ # "name": "GA", @@ -270,11 +235,15 @@ def calibrate(args): type=json.loads, ) parser.add_argument( - "--comment", - dest="comment", + "--loss", + dest="loss", help="A tag for a plan, we will use it when postprocessing results", - default="HF", - type=str, + default={ + "type": "time_series", + "obj_func": "RMSE", + "events": None, + }, + type=json.loads, ) the_args = parser.parse_args() calibrate(the_args) diff --git a/scripts/calibrate_xaj_camels.py b/scripts/calibrate_xaj_camels.py deleted file mode 100644 index 18bd949..0000000 --- a/scripts/calibrate_xaj_camels.py +++ /dev/null @@ -1,156 +0,0 @@ -""" -Author: Wenyu Ouyang -Date: 2022-11-19 17:27:05 -LastEditTime: 2024-03-26 10:16:49 -LastEditors: Wenyu Ouyang -Description: the script to calibrate a model for CAMELS basin -FilePath: \hydro-model-xaj\scripts\calibrate_xaj_camels.py -Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved. 
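Putting the new argument list of calibrate_xaj.py above together, typical invocations look like the following (paths and experiment names are placeholders; note that for `owndata` the script opens `timeseries.nc` and `attributes.nc` under `os.path.dirname(data_dir)`):

```Shell
# CAMELS data, defaults spelled out
$ python calibrate_xaj.py --data_type camels --data_dir camels_us --exp expcamels001 --cv_fold 1 --warmup 365 --period 2007-01-01 2014-01-01 --calibrate_period 2007-01-01 2014-01-01 --test_period 2007-01-01 2014-01-01 --basin_id 01439500 06885500

# your own data
$ python calibrate_xaj.py --data_type owndata --data_dir /path/to/your/data_dir --exp myexp001
```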
-""" - -import numpy as np -import argparse -import sys -import os -from pathlib import Path - -from hydrodataset import Camels -from hydrodata.utils.utils import streamflow_unit_conv - -repo_path = os.path.dirname(Path(os.path.abspath(__file__)).parent) -sys.path.append(repo_path) -from hydromodel import SETTING -from hydromodel.datasets.data_preprocess import cross_valid_data, split_train_test -from hydromodel.trainers.calibrate_sceua import calibrate_by_sceua - - -def main(args): - camels_name = args.camels_name - exp = args.exp - cv_fold = args.cv_fold - train_period = args.calibrate_period - test_period = args.test_period - periods = args.period - warmup = args.warmup - basin_ids = args.basin_id - camels_data_dir = os.path.join( - SETTING["local_data_path"]["datasets-origin"], "camels", camels_name - ) - camels = Camels(camels_data_dir) - ts_data = camels.read_ts_xrdataset( - basin_ids, periods, ["prcp", "PET", "streamflow"] - ) - where_save = Path(os.path.join(repo_path, "result", exp)) - if os.path.exists(where_save) is False: - os.makedirs(where_save) - - if cv_fold <= 1: - # no cross validation - periods = np.sort( - [train_period[0], train_period[1], test_period[0], test_period[1]] - ) - if cv_fold > 1: - train_and_test_data = cross_valid_data(ts_data, periods, warmup, cv_fold) - else: - # when using train_test_split, the warmup period is not used - # so you should include the warmup period in the train and test period - train_and_test_data = split_train_test(ts_data, train_period, test_period) - print("Start to calibrate the model") - p_and_e = ( - train_and_test_data[0][["prcp", "PET"]].to_array().to_numpy().transpose(2, 1, 0) - ) - # trans unit to mm/day - qobs_ = train_and_test_data[0][["streamflow"]] - basin_area = camels.read_area(basin_ids) - r_mmd = streamflow_unit_conv(qobs_, basin_area, target_unit="mm/d") - qobs = np.expand_dims(r_mmd["streamflow"].to_numpy().transpose(1, 0), axis=2) - calibrate_by_sceua( - basin_ids, - p_and_e, - qobs, - os.path.join(where_save, "sceua_xaj"), - warmup, - model={ - "name": "xaj_mz", - "source_type": "sources", - "source_book": "HF", - }, - algorithm={ - "name": "SCE_UA", - "random_seed": 1234, - "rep": 5, - "ngs": 7, - "kstop": 3, - "peps": 0.1, - "pcento": 0.1, - }, - metric={ - "type": "time_series", - "obj_func": "RMSE", - "events": None, - }, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Run hydro-model-xaj models with the CAMELS dataset" - ) - parser.add_argument( - "--camels_name", - dest="camels_name", - help="the name of CAMELS-formatted data directory", - default="camels_us", - type=str, - ) - parser.add_argument( - "--exp", - dest="exp", - help="An exp is corresponding to one data setting", - default="expcamels001", - type=str, - ) - parser.add_argument( - "--cv_fold", - dest="cv_fold", - help="the number of cross-validation fold", - default=1, - type=int, - ) - parser.add_argument( - "--warmup", - dest="warmup", - help="the number of warmup days", - default=365, - type=int, - ) - parser.add_argument( - "--period", - dest="period", - help="The whole period", - default=["2007-01-01", "2014-01-01"], - nargs="+", - ) - parser.add_argument( - "--calibrate_period", - dest="calibrate_period", - help="The training period", - default=["2007-01-01", "2014-01-01"], - nargs="+", - ) - parser.add_argument( - "--test_period", - dest="test_period", - help="The testing period", - default=["2007-01-01", "2014-01-01"], - nargs="+", - ) - parser.add_argument( - "--basin_id", - dest="basin_id", - 
help="The basins' ids", - default=["01439500", "06885500", "08104900", "09510200"], - nargs="+", - ) - the_args = parser.parse_args() - main(the_args) diff --git a/scripts/calibrate_xaj_for_multicases.py b/scripts/calibrate_xaj_for_multicases.py index 7a64bc3..64d3021 100644 --- a/scripts/calibrate_xaj_for_multicases.py +++ b/scripts/calibrate_xaj_for_multicases.py @@ -13,7 +13,7 @@ from pathlib import Path sys.path.append(os.path.dirname(Path(os.path.abspath(__file__)).parent)) -from scripts.calibrate_xaj import calibrate +from scripts.evaluate_xaj import calibrate matplotlib.use("Agg") exp = "exp61561" diff --git a/scripts/datapostprocess4calibrate.py b/scripts/datapostprocess4calibrate.py index 7ec964e..52615a3 100644 --- a/scripts/datapostprocess4calibrate.py +++ b/scripts/datapostprocess4calibrate.py @@ -1,7 +1,7 @@ """ Author: Wenyu Ouyang Date: 2022-11-19 17:27:05 -LastEditTime: 2024-03-21 18:44:31 +LastEditTime: 2024-03-26 15:43:08 LastEditors: Wenyu Ouyang Description: the script to postprocess calibrated models in hydro-model-xaj FilePath: \hydro-model-xaj\scripts\datapostprocess4calibrate.py @@ -17,9 +17,8 @@ import os from pathlib import Path - -sys.path.append(os.path.dirname(Path(os.path.abspath(__file__)).parent)) -import definitions +repo_dir = os.path.dirname(Path(os.path.abspath(__file__)).parent) +sys.path.append(repo_dir) from hydromodel.datasets.data_postprocess import read_and_save_et_ouputs @@ -27,15 +26,10 @@ def statistics(args): exp = args.exp cases = args.cases cv_fold = args.cv_fold - where_save_cache = Path( - os.path.join( - "/home/ldaning/code/biye/hydro-model-xaj/hydromodel/example/model_run_wuxi7" - ) - # definitions.ROOT_DIR, "hydromodel", "example", exp - ) + where_save_cache = Path(os.path.join(repo_dir, "result", exp)) if os.path.exists(where_save_cache) is False: raise NotImplementedError( - "You should run datapreprocess4calibrate.py and calibrate_xaj_camels_cc.py first." + "You should run prepare_data and calibrate scripts at first." 
) if cases is None: cases = os.listdir(where_save_cache) @@ -142,7 +136,7 @@ def statistics(args): "--exp", dest="exp", help="An exp is corresponding to one data setting", - default="exp61561", + default="test_camels_us", type=str, ) parser.add_argument( diff --git a/scripts/evaluate_xaj.py b/scripts/evaluate_xaj.py new file mode 100644 index 0000000..7a44e97 --- /dev/null +++ b/scripts/evaluate_xaj.py @@ -0,0 +1,226 @@ +import argparse +import json +import socket +import fnmatch +from datetime import datetime +import numpy as np +import pandas as pd +import os +import sys +from pathlib import Path +from hydroutils import hydro_file + +sys.path.append(os.path.dirname(Path(os.path.abspath(__file__)).parent)) +from hydromodel.trainers.calibrate_sceua import calibrate_by_sceua +from hydromodel.datasets.data_postprocess import ( + renormalize_params, + read_save_sceua_calibrated_params, + save_streamflow, + summarize_metrics, + summarize_parameters, +) +from hydromodel.trainers.train_utils import show_calibrate_result, show_test_result +from hydromodel.models.xaj import xaj +from hydromodel.trainers.calibrate_ga import calibrate_by_ga, show_ga_result + + +def calibrate(args): + exp = args.exp + warmup = args.warmup_length + data_dir = os.path.join( + "D:/研究生/毕业论文/new毕业论文/预答辩/碧流河水库/模型运行/" + ) + kfold = [ + int(f_name[len("data_info_fold") : -len("_test.json")]) + for f_name in os.listdir(data_dir) # 输出文件夹下的所有文件 + if fnmatch.fnmatch(f_name, "*_fold*_test.json") + ] + kfold = np.sort(kfold) + for fold in kfold: + print(f"Start to calibrate the {fold}-th fold") + train_data_info_file = os.path.join( + data_dir, f"data_info_fold{str(fold)}_train.json" + ) + train_data_file = os.path.join(data_dir, f"data_info_fold{str(fold)}_train.npy") + test_data_info_file = os.path.join( + data_dir, f"data_info_fold{str(fold)}_test.json" + ) + test_data_file = os.path.join(data_dir, f"data_info_fold{str(fold)}_test.npy") + if ( + os.path.exists(train_data_info_file) is False + or os.path.exists(train_data_file) is False + or os.path.exists(test_data_info_file) is False + or os.path.exists(test_data_file) is False + ): + raise FileNotFoundError( + "The data files are not found, please run datapreprocess4calibrate.py first." 
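            # The four files expected per fold follow the naming used above, e.g. for fold 0:
            #   data_info_fold0_train.json / data_info_fold0_train.npy
            #   data_info_fold0_test.json  / data_info_fold0_test.npy
            # which is also why the fold number can be recovered via
            #   "data_info_fold0_test.json"[len("data_info_fold"):-len("_test.json")]  # -> "0"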
+ ) + data_train = hydro_file.unserialize_numpy(train_data_file) + data_test = hydro_file.unserialize_numpy(test_data_file) + data_info_train = hydro_file.unserialize_json_ordered(train_data_info_file) + data_info_test = hydro_file.unserialize_json_ordered(test_data_info_file) + current_time = datetime.now().strftime("%b%d_%H-%M-%S") + save_dir = os.path.join( + data_dir, + current_time + + "_" + + socket.gethostname() + + "_fold" + + str(fold) + + "_" + + comment, + ) + # 读输入文件 + if os.path.exists(save_dir) is False: + os.makedirs(save_dir) + hydro_file.serialize_json(vars(args), os.path.join(save_dir, "args.json")) + if algo_info["name"] == "SCE_UA": + for i in range(len(data_info_train["basin"])): + basin_id = data_info_train["basin"][i] + basin_area = data_info_train["area"][i] + # one directory for one model + one hyperparam setting and one basin + spotpy_db_dir = os.path.join( # 一个模型一个文件夹 + save_dir, + basin_id, + ) + if not os.path.exists(spotpy_db_dir): + os.makedirs(spotpy_db_dir) + db_name = os.path.join(spotpy_db_dir, "SCEUA_" + model_info["name"]) + + sampler = calibrate_by_sceua( # 率定 + data_train[:, i : i + 1, 0:2], + data_train[:, i : i + 1, -1:], + db_name, + warmup_length=warmup, + model=model_info, + algorithm=algo_info, + ) + + show_calibrate_result( # 展示率定结果 + sampler.setup, + db_name, + warmup_length=warmup, + save_dir=spotpy_db_dir, + basin_id=basin_id, + train_period=data_info_train["time"], + basin_area=basin_area, + prcp=data_train[365:, i : i + 1, 0:1].flatten(), + ) + + params = read_save_sceua_calibrated_params( # 保存率定的参数文件 + basin_id, spotpy_db_dir, db_name + ) + # _ is et which we didn't use here + qsim, _ = xaj( # 计算模拟结果 + data_test[:, i : i + 1, 0:2], + params, + warmup_length=0, + **model_info, + ) + + qsim = units.convert_unit( + qsim, + # TODO: to unify "mm/hour" + unit_now="mm/day", + unit_final=units.unit["streamflow"], + basin_area=basin_area, + ) + qobs = units.convert_unit( + data_test[warmup:, i : i + 1, -1:], + # TODO: to unify "mm/hour" + unit_now="mm/day", + unit_final=units.unit["streamflow"], + basin_area=basin_area, + ) + test_result_file = os.path.join( + spotpy_db_dir, + "test_qsim_" + model_info["name"] + "_" + str(basin_id) + ".csv", + ) + pd.DataFrame(qsim.reshape(-1, 1)).to_csv( + test_result_file, + sep=",", + index=False, + header=False, + ) + test_date = pd.to_datetime(data_info_test["time"][:]).values.astype( + "datetime64[h]" + ) + show_test_result( + basin_id, + test_date, + qsim, + qobs, + save_dir=spotpy_db_dir, + warmup_length=warmup, + prcp=data_test[365:, i : i + 1, 0:1].flatten(), + ) + elif algo_info["name"] == "GA": + for i in range(len(data_info_train["basin"])): + basin_id = data_info_train["basin"][i] + basin_area = data_info_train["area"][i] + # one directory for one model + one hyperparam setting and one basin + deap_db_dir = os.path.join( + save_dir, + basin_id, + ) + if not os.path.exists(deap_db_dir): + os.makedirs(deap_db_dir) + calibrate_by_ga( + data_train[:, i : i + 1, 0:2], + data_train[:, i : i + 1, -1:], + deap_db_dir, + warmup_length=warmup, + model=model_info, + ga_param=algo_info, + ) + show_ga_result( + deap_db_dir, + warmup_length=warmup, + basin_id=basin_id, + the_data=data_train[:, i : i + 1, :], + the_period=data_info_train["time"], + basin_area=basin_area, + model_info=model_info, + train_mode=True, + ) + show_ga_result( + deap_db_dir, + warmup_length=warmup, + basin_id=basin_id, + the_data=data_test[:, i : i + 1, :], + the_period=data_info_test["time"], + basin_area=basin_area, + 
model_info=model_info, + train_mode=False, + ) + else: + raise NotImplementedError( + "We don't provide this calibrate method! Choose from 'SCE_UA' or 'GA'!" + ) + summarize_parameters(save_dir, model_info) + renormalize_params(save_dir, model_info) + summarize_metrics(save_dir, model_info) + save_streamflow(save_dir, model_info, fold=fold) + print(f"Finish calibrating the {fold}-th fold") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="evaluate a calibrated hydrological model." + ) + parser.add_argument( + "--exp", + dest="exp", + help="An exp is corresponding to a data plan from calibrate_xaj.py", + default="expcamels001", + type=str, + ) + parser.add_argument( + "--warmup_length", + dest="warmup_length", + help="the length of warmup period for hydro model", + default=365, + type=int, + ) + the_args = parser.parse_args() + calibrate(the_args) diff --git a/test/test_calibrate.py b/test/test_calibrate.py index bb2f698..4ca756a 100644 --- a/test/test_calibrate.py +++ b/test/test_calibrate.py @@ -46,7 +46,7 @@ def test_calibrate_xaj_sceua(basins, p_and_e, qobs, warmup_length, db_dir): "peps": 0.1, "pcento": 0.1, }, - metric={ + loss={ "type": "time_series", "obj_func": "RMSE", "events": None, diff --git a/test/test_data_postprocess.py b/test/test_data_postprocess.py index 2db5253..9d0ffed 100644 --- a/test/test_data_postprocess.py +++ b/test/test_data_postprocess.py @@ -1,16 +1,20 @@ """ Author: Wenyu Ouyang Date: 2022-10-25 21:16:22 -LastEditTime: 2024-03-26 11:56:33 +LastEditTime: 2024-03-26 17:01:09 LastEditors: Wenyu Ouyang Description: Test for data preprocess FilePath: \hydro-model-xaj\test\test_data_postprocess.py Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved. """ +import os +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt import spotpy from spotpy.examples.spot_setup_hymod_python import spot_setup as hymod_setup -import matplotlib.pyplot as plt +from hydromodel.datasets.data_postprocess import read_save_sceua_calibrated_params def test_run_hymod_calibration(): @@ -56,3 +60,37 @@ def test_run_hymod_calibration(): plt.ylabel("Discharge [l s-1]") plt.legend(loc="upper right") plt.show() + + +def test_read_save_sceua_calibrated_params(tmpdir): + # Create a temporary directory for testing + temp_dir = tmpdir.mkdir("test_data") + + # Generate some dummy data + results = np.array( + [(1, 2, 3), (4, 5, 6), (7, 8, 9)], + dtype=[("par1", int), ("par2", int), ("par3", int)], + ) + spotpy.analyser.load_csv_results = lambda _: results + spotpy.analyser.get_minlikeindex = lambda _: (0, 0) + + # Call the function + basin_id = "test_basin" + save_dir = temp_dir + sceua_calibrated_file_name = "test_results.csv" + result = read_save_sceua_calibrated_params( + basin_id, save_dir, sceua_calibrated_file_name + ) + + # Check if the file is saved correctly + expected_file_path = os.path.join(save_dir, basin_id + "_calibrate_params.txt") + assert os.path.exists(expected_file_path) + + # Check if the saved file contains the expected data + expected_data = pd.DataFrame([(1, 2, 3)], columns=["par1", "par2", "par3"]) + saved_data = pd.read_csv(expected_file_path) + pd.testing.assert_frame_equal(saved_data, expected_data) + + # Check if the returned result is correct + expected_result = np.array([(1, 2, 3)]) + np.testing.assert_array_equal(result, expected_result)
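
A small aside on the new test above: assigning `spotpy.analyser.load_csv_results = lambda _: results` replaces the real functions for the rest of the test session. If that matters, the same setup can be done with pytest's `monkeypatch` fixture, which restores the attributes after the test; a short sketch:

```Python
import numpy as np
import spotpy


def test_read_save_sceua_calibrated_params_isolated(tmpdir, monkeypatch):
    results = np.array(
        [(1, 2, 3), (4, 5, 6), (7, 8, 9)],
        dtype=[("par1", int), ("par2", int), ("par3", int)],
    )
    # patched attributes are undone automatically when the test finishes
    monkeypatch.setattr(spotpy.analyser, "load_csv_results", lambda _: results)
    monkeypatch.setattr(spotpy.analyser, "get_minlikeindex", lambda _: (0, 0))
    # ...the remaining calls and assertions are the same as in the test above
```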