From 49b7865b092898ce47bc512bda1916e196ac4c59 Mon Sep 17 00:00:00 2001
From: Aashis Khanal
Date: Tue, 5 Dec 2023 18:00:09 -0500
Subject: [PATCH] Updated for logger.

---
 easytorch/data/data.py             |  4 ++++
 easytorch/easytorch.py             | 31 ++++++++++++++-----------------
 easytorch/runner.py                |  2 +-
 easytorch/utils/__init__.py        |  4 ++--
 easytorch/vision/plotter.py        |  4 +++-
 examples/MNIST_easytorch_CNN.ipynb |  2 +-
 examples/MNIST_easytorch_CNN.py    |  7 ++++---
 setup.py                           |  2 +-
 8 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/easytorch/data/data.py b/easytorch/data/data.py
index 1167b50..baff301 100644
--- a/easytorch/data/data.py
+++ b/easytorch/data/data.py
@@ -152,6 +152,10 @@ def get_data_split(self):
         elif p.suffix == '.txt':
             with open(str(p)) as fw:
                 files = fw.read().splitlines()
+
+        elif p.is_file():
+            files = [self.data_source]
+
         else:
             raise ValueError(f"Unknown data source: {self.data_source}")
 
diff --git a/easytorch/easytorch.py b/easytorch/easytorch.py
index 7747462..f739719 100644
--- a/easytorch/easytorch.py
+++ b/easytorch/easytorch.py
@@ -89,8 +89,6 @@ def __init__(self, config_source=_conf.args_parser(), dataloader_args=None, **kw
         self._ddp_setup()
         self._make_reproducible()
         self.conf.update(is_master=self.conf.get('is_master', True))
-        self.conf['RUN-ID'] = _dtime.now().strftime("ET-%Y-%m-%d-%H%M%S-") + _uuid.uuid4().hex[:8].upper()
-
         self.conf['save_dir'] = self.conf['output_base_dir'] + _sep + (
             self.conf['phase'].upper() + _sep + self.conf["name"]
         )
@@ -207,16 +205,14 @@ def _run_training_and_eval(self, data_split, engine, dataset_cls):
         engine.save_checkpoint(engine.conf['save_dir'] + _sep + engine.cache['latest_checkpoint'])
 
-        train_log = engine.conf['save_dir'] + _sep + ".train_log.npy"
-        val_log = engine.conf['save_dir'] + _sep + ".validation_log.npy"
+        train_log = engine.conf['save_dir'] + _sep + ".train_log.csv"
+        val_log = engine.conf['save_dir'] + _sep + ".validation_log.csv"
 
-        _np.save(train_log, _np.array(engine.cache[LogKey.TRAIN_LOG]))
-        _np.save(val_log, _np.array(engine.cache[LogKey.TRAIN_LOG]))
+        _np.savetxt(train_log, _np.array(engine.cache[LogKey.TRAIN_LOG]), delimiter=',', fmt='%.5f')
+        _np.savetxt(val_log, _np.array(engine.cache[LogKey.VALIDATION_LOG]), delimiter=',', fmt='%.5f')
 
         engine.cache[LogKey.TRAIN_LOG] = train_log
         engine.cache[LogKey.VALIDATION_LOG] = val_log
 
-        _utils.save_cache(self.conf, engine.cache, name=engine.conf['name'] + "_train")
-        engine.cache['_saved'] = True
 
     def _run_test(self, data_split, engine, dataset_cls, distributed=False) -> dict:
         test_dataset = engine.data_handle.get_dataset(Phase.TEST, data_split, dataset_cls)
@@ -233,18 +229,13 @@ def _run_test(self, data_split, engine, dataset_cls, distributed=False) -> dict:
 
         """ Run and save experiment test scores """
         engine.cache[
             'output_csv_TEST'
-        ] = f"{engine.conf['save_dir']}{_sep}TEST_results_{engine.conf['RUN-ID']}.csv"
+        ] = f"{engine.conf['save_dir']}{_sep}test_results_{engine.conf['RUN-ID']}.csv"
         with open(engine.cache[f'output_csv_TEST'], 'w') as rw:
             test_out = engine.evaluation(dataloader=dataloader, mode=Phase.TEST,
                                          save_predictions=True, results_writer=rw)
             test_meter = engine.reduce_scores([test_out], distributed=False)
-            engine.cache[LogKey.TEST_METRICS] = [test_meter.get()]
-            _utils.save_scores(self.conf['save_dir'], engine.cache, name=engine.conf['name'],
-                               file_keys=[LogKey.TEST_METRICS])
-
-        if not engine.cache.get('_saved'):
-            _utils.save_cache(self.conf, engine.cache, name=f"{engine.conf['name']}_test")
+            engine.cache[LogKey.TEST_METRICS] = f"{test_meter}"
         return test_out
 
     def _inference(self, data_split, engine, dataset_cls):
@@ -260,16 +251,16 @@ def _inference(self, data_split, engine, dataset_cls):
 
         engine.cache[
             'output_csv_INFERENCE'
-        ] = f"{engine.conf['save_dir']}{_sep}INFERENCE_results_{engine.conf['RUN-ID']}.csv"
+        ] = f"{engine.conf['save_dir']}{_sep}inference_results_{engine.conf['RUN-ID']}.csv"
 
         with open(engine.cache[f'output_csv_INFERENCE'], 'w') as rw:
             engine.inference(dataloader=dataloader, results_writer=rw)
-        _utils.save_cache(self.conf, engine.cache, name=f"{engine.conf['name']}_inference")
 
     def run(self, runner_cls: typing.Type[ETRunner],
             dataset_cls: typing.Type[ETDataset] = ETDataset,
             data_handle_cls: typing.Type[ETDataHandle] = ETDataHandle):
 
         if self.conf['is_master']:
+            """To avoid problems if the mount is the same location for multiple nodes (usually the case)."""
             self._maybe_advance_run()
             _os.makedirs(self.conf['save_dir'], exist_ok=self.conf['force'])
@@ -283,6 +274,7 @@ def run(self, runner_cls: typing.Type[ETRunner],
         self._run(runner_cls, dataset_cls, data_handle_cls)
 
     def _run(self, runner_cls, dataset_cls, data_handle_cls):
+        self.conf['RUN-ID'] = f"RUN{self.conf.get('world_rank', 0)}-" + _uuid.uuid4().hex[:8].upper()
 
         engine = runner_cls(
             conf=self.conf,
@@ -292,6 +284,9 @@ def _run(self, runner_cls, dataset_cls, data_handle_cls):
             )
         )
 
+        engine.cache['START-TIME'] = _dtime.now().strftime("%Y-%m-%d %H:%M:%S")
+        _utils.save_cache(self.conf, {}, name=f"{self.conf['name']}_{self.conf['phase']}".upper())
+
         self._prepare_nn_engine(engine)
 
         data_split = {}
@@ -307,3 +302,5 @@ def _run(self, runner_cls, dataset_cls, data_handle_cls):
         if self.conf['phase'] == Phase.INFERENCE:
             self._inference(data_split, engine, dataset_cls)
         _cleanup(engine, engine.data_handle)
+        engine.cache['END-TIME'] = _dtime.now().strftime("%Y-%m-%d %H:%M:%S")
+        _utils.save_cache(self.conf, engine.cache, name=f"{engine.conf['name']}_{self.conf['phase']}".upper())
diff --git a/easytorch/runner.py b/easytorch/runner.py
index 1fe3c18..54ff047 100644
--- a/easytorch/runner.py
+++ b/easytorch/runner.py
@@ -245,7 +245,7 @@ def _update_scores(_out, _it, _meter):
             _update_scores(None, it, meter)
             if self.conf['verbose'] and lazy_debug(i, add=epoch):
-                info(f" Itr:{i}/{len(dataloader)}, {it['meter']}")
+                info(f" Itr:{i}/{len(dataloader)}, {meter}")
 
         # Accumulative score
         if self.conf['verbose']:
             info(f" {mode}, {meter}")
diff --git a/easytorch/utils/__init__.py b/easytorch/utils/__init__.py
index 24be2ca..300de2e 100644
--- a/easytorch/utils/__init__.py
+++ b/easytorch/utils/__init__.py
@@ -65,12 +65,12 @@ def clean_recursive(obj):
 
 def save_cache(conf, cache, name=''):
     _cache = {**cache, 'conf': conf}
-    with open(conf['save_dir'] + _os.sep + f"{name}_log.json", 'w') as fp:
+    with open(conf['save_dir'] + _os.sep + f"{name}.json", 'w') as fp:
         try:
             log = _copy.deepcopy(_cache)
             clean_recursive(log)
             _json.dump(log, fp)
         except Exception as e:
-            with open(conf['save_dir'] + _os.sep + f"{name}_log.txt", 'w') as raw:
+            with open(conf['save_dir'] + _os.sep + f"{name}.txt", 'w') as raw:
                 raw.write(f"{e}\n")
                 raw.write(f"{_cache}")
diff --git a/easytorch/vision/plotter.py b/easytorch/vision/plotter.py
index 4ef2978..d675099 100644
--- a/easytorch/vision/plotter.py
+++ b/easytorch/vision/plotter.py
@@ -19,6 +19,8 @@ def plot_progress(save_dir, cache, name='', plot_keys=[], num_points=31, epoch=N
     r""" Custom plot to plot data from the cache by keys.
""" + save_to = save_dir + _os.sep + "_plots" + _os.makedirs(save_to, exist_ok=True) for k in plot_keys: D = _np.array(cache.get(k, [])) if len(D) == 0 or cache.get('log_header') is None: @@ -57,6 +59,6 @@ def plot_progress(save_dir, cache, name='', plot_keys=[], num_points=31, epoch=N ax.set_xticklabels(xticks_range) _plt.xlabel('Epochs') - _plt.savefig(save_dir + _os.sep + f"{name}_{k}_{plot_id}.png", bbox_inches='tight') + _plt.savefig(save_to + _os.sep + f"{name}_{k}_{plot_id}.png", bbox_inches='tight') _plt.close('all') i = j diff --git a/examples/MNIST_easytorch_CNN.ipynb b/examples/MNIST_easytorch_CNN.ipynb index dfda2be..a28a1d8 100644 --- a/examples/MNIST_easytorch_CNN.ipynb +++ b/examples/MNIST_easytorch_CNN.ipynb @@ -119,7 +119,7 @@ " def new_meter(self):\n", " return ETMeter(\n", " num_averages=2, # Since we are tracing two losses\n", - " cmf=ConfusionMatrix(num_classes=10),\n", + " cmf=ConfusionMatrix(num_classes=10, device=self.device['gpu']),\n", " auc=AUCROCMetrics()\n", " )" ] diff --git a/examples/MNIST_easytorch_CNN.py b/examples/MNIST_easytorch_CNN.py index 8296e3c..823a99e 100644 --- a/examples/MNIST_easytorch_CNN.py +++ b/examples/MNIST_easytorch_CNN.py @@ -42,8 +42,8 @@ def init_cache(self): def new_meter(self): return ETMeter( num_averages=2, # Since we are tracing two losses - cmf=ConfusionMatrix(num_classes=10), - auc=AUCROCMetrics() + cmf=ConfusionMatrix(num_classes=10, device=self.device['gpu']), + auc=AUCROCMetrics(), ) @@ -53,7 +53,8 @@ def new_meter(self): transform=transform) dataloader_args = {'train': {'dataset': train_dataset}, - 'validation': {'dataset': val_dataset}} + 'validation': {'dataset': val_dataset}, + 'test': {'dataset': val_dataset}} runner = EasyTorch(phase='train', distributed_validation=True, batch_size=512, epochs=21, dataloader_args=dataloader_args, diff --git a/setup.py b/setup.py index 83cafdd..88c1489 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ # This call to setup() does all the work setup( name="easytorch", - version="3.8.2", + version="3.8.3", description="Easy Neural Network Experiments with pytorch", long_description=_README, long_description_content_type="text/markdown",