From 49b7865b092898ce47bc512bda1916e196ac4c59 Mon Sep 17 00:00:00 2001
From: Aashis Khanal
Date: Tue, 5 Dec 2023 18:00:09 -0500
Subject: [PATCH] Updated for logger.

---
 easytorch/data/data.py             |  4 ++++
 easytorch/easytorch.py             | 31 ++++++++++++++-----------------
 easytorch/runner.py                |  2 +-
 easytorch/utils/__init__.py        |  4 ++--
 easytorch/vision/plotter.py        |  4 +++-
 examples/MNIST_easytorch_CNN.ipynb |  2 +-
 examples/MNIST_easytorch_CNN.py    |  7 ++++---
 setup.py                           |  2 +-
 8 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/easytorch/data/data.py b/easytorch/data/data.py
index 1167b50..baff301 100644
--- a/easytorch/data/data.py
+++ b/easytorch/data/data.py
@@ -152,6 +152,10 @@ def get_data_split(self):
         elif p.suffix == '.txt':
             with open(str(p)) as fw:
                 files = fw.read().splitlines()
+
+        elif p.is_file():
+            files = [self.data_source]
+
         else:
             raise ValueError(f"Unknown data source: {self.data_source}")
 
diff --git a/easytorch/easytorch.py b/easytorch/easytorch.py
index 7747462..f739719 100644
--- a/easytorch/easytorch.py
+++ b/easytorch/easytorch.py
@@ -89,8 +89,6 @@ def __init__(self, config_source=_conf.args_parser(), dataloader_args=None, **kw
         self._ddp_setup()
         self._make_reproducible()
         self.conf.update(is_master=self.conf.get('is_master', True))
-        self.conf['RUN-ID'] = _dtime.now().strftime("ET-%Y-%m-%d-%H%M%S-") + _uuid.uuid4().hex[:8].upper()
-
         self.conf['save_dir'] = self.conf['output_base_dir'] + _sep + (
             self.conf['phase'].upper() + _sep + self.conf["name"]
         )
@@ -207,16 +205,14 @@ def _run_training_and_eval(self, data_split, engine, dataset_cls):
         engine.save_checkpoint(engine.conf['save_dir'] + _sep + engine.cache['latest_checkpoint'])
 
-        train_log = engine.conf['save_dir'] + _sep + ".train_log.npy"
-        val_log = engine.conf['save_dir'] + _sep + ".validation_log.npy"
+        train_log = engine.conf['save_dir'] + _sep + ".train_log.csv"
+        val_log = engine.conf['save_dir'] + _sep + ".validation_log.csv"
 
-        _np.save(train_log, _np.array(engine.cache[LogKey.TRAIN_LOG]))
-        _np.save(val_log, _np.array(engine.cache[LogKey.TRAIN_LOG]))
+        _np.savetxt(train_log, _np.array(engine.cache[LogKey.TRAIN_LOG]), delimiter=',', fmt='%.5f')
+        _np.savetxt(val_log, _np.array(engine.cache[LogKey.VALIDATION_LOG]), delimiter=',', fmt='%.5f')
 
         engine.cache[LogKey.TRAIN_LOG] = train_log
         engine.cache[LogKey.VALIDATION_LOG] = val_log
 
-        _utils.save_cache(self.conf, engine.cache, name=engine.conf['name'] + "_train")
-        engine.cache['_saved'] = True
 
     def _run_test(self, data_split, engine, dataset_cls, distributed=False) -> dict:
         test_dataset = engine.data_handle.get_dataset(Phase.TEST, data_split, dataset_cls)
@@ -233,18 +229,13 @@ def _run_test(self, data_split, engine, dataset_cls, distributed=False) -> dict:
 
         """ Run and save experiment test scores """
         engine.cache[
             'output_csv_TEST'
-        ] = f"{engine.conf['save_dir']}{_sep}TEST_results_{engine.conf['RUN-ID']}.csv"
+        ] = f"{engine.conf['save_dir']}{_sep}test_results_{engine.conf['RUN-ID']}.csv"
         with open(engine.cache[f'output_csv_TEST'], 'w') as rw:
             test_out = engine.evaluation(dataloader=dataloader, mode=Phase.TEST,
                                          save_predictions=True, results_writer=rw)
             test_meter = engine.reduce_scores([test_out], distributed=False)
-            engine.cache[LogKey.TEST_METRICS] = [test_meter.get()]
-            _utils.save_scores(self.conf['save_dir'], engine.cache, name=engine.conf['name'],
-                               file_keys=[LogKey.TEST_METRICS])
-
-        if not engine.cache.get('_saved'):
-            _utils.save_cache(self.conf, engine.cache, name=f"{engine.conf['name']}_test")
+            engine.cache[LogKey.TEST_METRICS] = f"{test_meter}"
         return test_out
 
     def _inference(self, data_split, engine, dataset_cls):
@@ -260,16 +251,16 @@ def _inference(self, data_split, engine, dataset_cls):
 
         engine.cache[
             'output_csv_INFERENCE'
-        ] = f"{engine.conf['save_dir']}{_sep}INFERENCE_results_{engine.conf['RUN-ID']}.csv"
+        ] = f"{engine.conf['save_dir']}{_sep}inference_results_{engine.conf['RUN-ID']}.csv"
 
         with open(engine.cache[f'output_csv_INFERENCE'], 'w') as rw:
             engine.inference(dataloader=dataloader, results_writer=rw)
-        _utils.save_cache(self.conf, engine.cache, name=f"{engine.conf['name']}_inference")
 
     def run(self, runner_cls: typing.Type[ETRunner],
             dataset_cls: typing.Type[ETDataset] = ETDataset,
             data_handle_cls: typing.Type[ETDataHandle] = ETDataHandle):
 
         if self.conf['is_master']:
+            """To avoid problems if the mount is the same location for multiple nodes (usually the case)."""
             self._maybe_advance_run()
             _os.makedirs(self.conf['save_dir'], exist_ok=self.conf['force'])
@@ -283,6 +274,7 @@ def run(self, runner_cls: typing.Type[ETRunner],
         self._run(runner_cls, dataset_cls, data_handle_cls)
 
     def _run(self, runner_cls, dataset_cls, data_handle_cls):
+        self.conf['RUN-ID'] = f"RUN{self.conf.get('world_rank', 0)}-" + _uuid.uuid4().hex[:8].upper()
 
         engine = runner_cls(
             conf=self.conf,
@@ -292,6 +284,9 @@ def _run(self, runner_cls, dataset_cls, data_handle_cls):
             )
         )
 
+        engine.cache['START-TIME'] = _dtime.now().strftime("%Y-%m-%d %H:%M:%S")
+        _utils.save_cache(self.conf, {}, name=f"{self.conf['name']}_{self.conf['phase']}".upper())
+
         self._prepare_nn_engine(engine)
 
         data_split = {}
@@ -307,3 +302,5 @@ def _run(self, runner_cls, dataset_cls, data_handle_cls):
         if self.conf['phase'] == Phase.INFERENCE:
             self._inference(data_split, engine, dataset_cls)
         _cleanup(engine, engine.data_handle)
+        engine.cache['END-TIME'] = _dtime.now().strftime("%Y-%m-%d %H:%M:%S")
+        _utils.save_cache(self.conf, engine.cache, name=f"{engine.conf['name']}_{self.conf['phase']}".upper())
diff --git a/easytorch/runner.py b/easytorch/runner.py
index 1fe3c18..54ff047 100644
--- a/easytorch/runner.py
+++ b/easytorch/runner.py
@@ -245,7 +245,7 @@ def _update_scores(_out, _it, _meter):
             _update_scores(None, it, meter)
             if self.conf['verbose'] and lazy_debug(i, add=epoch):
-                info(f" Itr:{i}/{len(dataloader)}, {it['meter']}")
+                info(f" Itr:{i}/{len(dataloader)}, {meter}")
 
         # Accumulative score
         if self.conf['verbose']:
             info(f" {mode}, {meter}")
diff --git a/easytorch/utils/__init__.py b/easytorch/utils/__init__.py
index 24be2ca..300de2e 100644
--- a/easytorch/utils/__init__.py
+++ b/easytorch/utils/__init__.py
@@ -65,12 +65,12 @@ def clean_recursive(obj):
 
 def save_cache(conf, cache, name=''):
     _cache = {**cache, 'conf': conf}
-    with open(conf['save_dir'] + _os.sep + f"{name}_log.json", 'w') as fp:
+    with open(conf['save_dir'] + _os.sep + f"{name}.json", 'w') as fp:
         try:
             log = _copy.deepcopy(_cache)
             clean_recursive(log)
             _json.dump(log, fp)
         except Exception as e:
-            with open(conf['save_dir'] + _os.sep + f"{name}_log.txt", 'w') as raw:
+            with open(conf['save_dir'] + _os.sep + f"{name}.txt", 'w') as raw:
                 raw.write(f"{e}\n")
                 raw.write(f"{_cache}")
diff --git a/easytorch/vision/plotter.py b/easytorch/vision/plotter.py
index 4ef2978..d675099 100644
--- a/easytorch/vision/plotter.py
+++ b/easytorch/vision/plotter.py
@@ -19,6 +19,8 @@ def plot_progress(save_dir, cache, name='', plot_keys=[], num_points=31, epoch=N
     r""" Custom plot to plot data from the cache by keys.
""" + save_to = save_dir + _os.sep + "_plots" + _os.makedirs(save_to, exist_ok=True) for k in plot_keys: D = _np.array(cache.get(k, [])) if len(D) == 0 or cache.get('log_header') is None: @@ -57,6 +59,6 @@ def plot_progress(save_dir, cache, name='', plot_keys=[], num_points=31, epoch=N ax.set_xticklabels(xticks_range) _plt.xlabel('Epochs') - _plt.savefig(save_dir + _os.sep + f"{name}_{k}_{plot_id}.png", bbox_inches='tight') + _plt.savefig(save_to + _os.sep + f"{name}_{k}_{plot_id}.png", bbox_inches='tight') _plt.close('all') i = j diff --git a/examples/MNIST_easytorch_CNN.ipynb b/examples/MNIST_easytorch_CNN.ipynb index dfda2be..a28a1d8 100644 --- a/examples/MNIST_easytorch_CNN.ipynb +++ b/examples/MNIST_easytorch_CNN.ipynb @@ -119,7 +119,7 @@ " def new_meter(self):\n", " return ETMeter(\n", " num_averages=2, # Since we are tracing two losses\n", - " cmf=ConfusionMatrix(num_classes=10),\n", + " cmf=ConfusionMatrix(num_classes=10, device=self.device['gpu']),\n", " auc=AUCROCMetrics()\n", " )" ] diff --git a/examples/MNIST_easytorch_CNN.py b/examples/MNIST_easytorch_CNN.py index 8296e3c..823a99e 100644 --- a/examples/MNIST_easytorch_CNN.py +++ b/examples/MNIST_easytorch_CNN.py @@ -42,8 +42,8 @@ def init_cache(self): def new_meter(self): return ETMeter( num_averages=2, # Since we are tracing two losses - cmf=ConfusionMatrix(num_classes=10), - auc=AUCROCMetrics() + cmf=ConfusionMatrix(num_classes=10, device=self.device['gpu']), + auc=AUCROCMetrics(), ) @@ -53,7 +53,8 @@ def new_meter(self): transform=transform) dataloader_args = {'train': {'dataset': train_dataset}, - 'validation': {'dataset': val_dataset}} + 'validation': {'dataset': val_dataset}, + 'test': {'dataset': val_dataset}} runner = EasyTorch(phase='train', distributed_validation=True, batch_size=512, epochs=21, dataloader_args=dataloader_args, diff --git a/setup.py b/setup.py index 83cafdd..88c1489 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ # This call to setup() does all the work setup( name="easytorch", - version="3.8.2", + version="3.8.3", description="Easy Neural Network Experiments with pytorch", long_description=_README, long_description_content_type="text/markdown",