From 039a931f6decae00fd53c937a66a56e1724be7af Mon Sep 17 00:00:00 2001
From: savan
Date: Tue, 27 Oct 2020 18:18:14 -0500
Subject: [PATCH 01/70] use python instead of python3

---
 examples/trials/mnist-tfv2/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/trials/mnist-tfv2/config.yml b/examples/trials/mnist-tfv2/config.yml
index 06e9af6be3..936aedd940 100644
--- a/examples/trials/mnist-tfv2/config.yml
+++ b/examples/trials/mnist-tfv2/config.yml
@@ -12,6 +12,6 @@ tuner:
   classArgs:
     optimize_mode: maximize  # choices: maximize, minimize
 trial:
-  command: python3 mnist.py
+  command: python mnist.py
   codeDir: .
   gpuNum: 0

From 060677fc99ee3aa60ac340a5a8e0da91f1d475e1 Mon Sep 17 00:00:00 2001
From: savan
Date: Wed, 28 Oct 2020 16:58:32 -0500
Subject: [PATCH 02/70] store models in the /mnt/output directory

---
 examples/nas/enas/search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py
index ae1e615a92..d229102989 100644
--- a/examples/nas/enas/search.py
+++ b/examples/nas/enas/search.py
@@ -12,7 +12,7 @@
 from macro import GeneralNetwork
 from micro import MicroNetwork
 from nni.nas.pytorch import enas
-from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint,
+from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint, ModelCheckpoint,
                                        LRSchedulerCallback)
 from utils import accuracy, reward_accuracy
 
@@ -49,7 +49,7 @@
                            metrics=accuracy,
                            reward_function=reward_accuracy,
                            optimizer=optimizer,
-                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./checkpoints")],
+                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("/mnt/output"), ModelCheckpoint("/mnt/output")],
                            batch_size=args.batch_size,
                            num_epochs=num_epochs,
                            dataset_train=dataset_train,

From 5a0087f70ab18b038657bd9cbdbe6fa6ffb08669 Mon Sep 17 00:00:00 2001
From: savan
Date: Fri, 30 Oct 2020 13:34:48 -0500
Subject: [PATCH 03/70] add template for nas

---
 template.yaml | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 template.yaml

diff --git a/template.yaml b/template.yaml
new file mode 100644
index 0000000000..65c080cdc0
--- /dev/null
+++ b/template.yaml
@@ -0,0 +1,85 @@
+entrypoint: main
+arguments:
+  parameters:
+  - name: source
+    value: https://github.com/onepanelio/nni.git
+  - name: epochs
+    value: 1
+  - name: search-method
+    value: macro
+    type: select.select
+    options:
+    - name: 'Macro'
+      value: macro
+    - name: 'Micro'
+      value: micro
+  - displayName: Node pool
+    hint: Name of node pool or group to run this workflow task
+    type: select.select
+    visibility: public
+    name: sys-node-pool
+    value: Standard_D4s_v3
+    required: true
+    options:
+    - name: 'CPU: 2, RAM: 8GB'
+      value: Standard_D2s_v3
+    - name: 'CPU: 4, RAM: 16GB'
+      value: Standard_D4s_v3
+    - name: 'GPU: 1xK80, CPU: 6, RAM: 56GB'
+      value: Standard_NC6
+    - name: 'GPU: 1xV100, CPU: 6, RAM: 56GB'
+      value: Standard_NC6s_v3
+
+volumeClaimTemplates:
+  - metadata:
+      name: data
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 2Gi
+  - metadata:
+      name: output
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 2Gi
+templates:
+  - name: main
+    dag:
+      tasks:
+      - name: train-model
+        template: pytorch
+  - name: pytorch
+    inputs:
+      artifacts:
+        - name: src
+          path: /mnt/src
+          git:
+            repo: "{{workflow.parameters.source}}"
+            revision: "dev"
+    outputs:
+      artifacts:
+        - name: model
+          path: /mnt/output
+          optional: true
+          archive:
+            none: {}
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ && \
+        pip install setuptools nni && \
+        python /mnt/src/examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}}
+      workingDir: /mnt
+      volumeMounts:
+        - name: data
+          mountPath: /mnt/data
+        - name: output
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
\ No newline at end of file

From 4e04cb36d50ed175a4cc95456ddcecc8ca56d6cd Mon Sep 17 00:00:00 2001
From: savan
Date: Fri, 30 Oct 2020 18:19:14 -0500
Subject: [PATCH 04/70] place viz logs into /mnt/output

---
 examples/nas/enas/search.py | 4 ++--
 nni/nas/pytorch/trainer.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py
index d229102989..1041c4b3ed 100644
--- a/examples/nas/enas/search.py
+++ b/examples/nas/enas/search.py
@@ -25,7 +25,7 @@
     parser.add_argument("--log-frequency", default=10, type=int)
     parser.add_argument("--search-for", choices=["macro", "micro"], default="macro")
     parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)")
-    parser.add_argument("--visualization", default=False, action="store_true")
+    parser.add_argument("--visualization", default=True, action="store_true")
     args = parser.parse_args()
 
     dataset_train, dataset_valid = datasets.get_dataset("cifar10")
@@ -49,7 +49,7 @@
                            metrics=accuracy,
                            reward_function=reward_accuracy,
                            optimizer=optimizer,
-                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("/mnt/output"), ModelCheckpoint("/mnt/output")],
+                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./output"), ModelCheckpoint("./output")],
                            batch_size=args.batch_size,
                            num_epochs=num_epochs,
                            dataset_train=dataset_train,

diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py
index 6a3881177a..7264c3792e 100644
--- a/nni/nas/pytorch/trainer.py
+++ b/nni/nas/pytorch/trainer.py
@@ -92,7 +92,7 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs,
         self.batch_size = batch_size
         self.workers = workers
         self.log_frequency = log_frequency
-        self.log_dir = os.path.join("logs", str(time.time()))
+        self.log_dir = os.path.join("/mnt/output", str(time.time()))
         os.makedirs(self.log_dir, exist_ok=True)
         self.status_writer = open(os.path.join(self.log_dir, "log"), "w")
         self.callbacks = callbacks if callbacks is not None else []

From 63610ed57020d689cb42c3aa939d5a038e82a0d7 Mon Sep 17 00:00:00 2001
From: savan
Date: Mon, 2 Nov 2020 17:06:36 -0600
Subject: [PATCH 05/70] update paths for tf enas

---
 examples/nas/enas-tf/search.py | 2 +-
 examples/nas/enas/search.py    | 4 ++--
 nni/nas/pytorch/trainer.py     | 2 ++
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/nas/enas-tf/search.py b/examples/nas/enas-tf/search.py
index b68daf62f3..b7d5ee93d2 100644
--- a/examples/nas/enas-tf/search.py
+++ b/examples/nas/enas-tf/search.py
@@ -5,7 +5,7 @@
 from tensorflow.keras.losses import Reduction, SparseCategoricalCrossentropy
 from tensorflow.keras.optimizers import SGD
 
-from nni.nas.tensorflow import enas
+from nni.algorithms.nas.tensorflow import enas
 
 import datasets
 from macro import GeneralNetwork

diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py
index 1041c4b3ed..380483cb1e 100644
--- a/examples/nas/enas/search.py
+++ b/examples/nas/enas/search.py
@@ -11,7 +11,7 @@
 import datasets
 from macro import GeneralNetwork
 from micro import MicroNetwork
-from nni.nas.pytorch import enas
+from nni.algorithms.nas.pytorch import enas
 from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint, ModelCheckpoint,
                                        LRSchedulerCallback)
 from utils import accuracy, reward_accuracy
@@ -49,7 +49,7 @@
                            metrics=accuracy,
                            reward_function=reward_accuracy,
                            optimizer=optimizer,
-                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./output"), ModelCheckpoint("./output")],
+                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("/mnt/output"), ModelCheckpoint("/mnt/output")],
                            batch_size=args.batch_size,
                            num_epochs=num_epochs,
                            dataset_train=dataset_train,

diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py
index 7264c3792e..cc740e7595 100644
--- a/nni/nas/pytorch/trainer.py
+++ b/nni/nas/pytorch/trainer.py
@@ -92,6 +92,7 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs,
         self.batch_size = batch_size
         self.workers = workers
         self.log_frequency = log_frequency
+        print("Log dir...")
         self.log_dir = os.path.join("/mnt/output", str(time.time()))
         os.makedirs(self.log_dir, exist_ok=True)
         self.status_writer = open(os.path.join(self.log_dir, "log"), "w")
@@ -184,6 +185,7 @@ def enable_visualization(self):
             break
         if sample is None:
             _logger.warning("Sample is %s.", sample)
+        _logger.info("Visualization: %s",self.log_dir)
         _logger.info("Creating graph json, writing to %s. Visualization enabled.", self.log_dir)
         with open(os.path.join(self.log_dir, "graph.json"), "w") as f:
             json.dump(self.mutator.graph(sample), f)

From c1c5a3182478177ed3957398383873ca5a9c3913 Mon Sep 17 00:00:00 2001
From: savan
Date: Mon, 2 Nov 2020 18:46:12 -0600
Subject: [PATCH 06/70] allow users to specify dataset
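
Note: the new custom_classification path is a thin wrapper around
torchvision's ImageFolder, so it assumes one subdirectory per class
label under each data directory. A minimal sketch of the expected
layout (class and file names here are hypothetical):

    train_data/
        cat/
            0001.jpg
        dog/
            0002.jpg
    test_data/
        cat/
            0101.jpg
        dog/
            0102.jpg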
---
 examples/nas/enas/datasets.py              | 18 +++++++++++++++++-
 examples/nas/enas/search.py                | 11 ++++++++---
 nni/algorithms/nas/pytorch/enas/trainer.py |  1 +
 nni/nas/pytorch/trainer.py                 |  3 +--
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/examples/nas/enas/datasets.py b/examples/nas/enas/datasets.py
index 9a5128a8a9..d781ba0a8a 100644
--- a/examples/nas/enas/datasets.py
+++ b/examples/nas/enas/datasets.py
@@ -3,9 +3,23 @@
 
 from torchvision import transforms
 from torchvision.datasets import CIFAR10
+from torchvision.datasets import ImageFolder
 
+def get_custom_dataset(train_dir, valid_dir):
+    """ Load a custom classification dataset using ImageFolder.
+        The train and valid directories should contain one subdirectory per class label.
 
-def get_dataset(cls):
+    """
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Resize((32, 32))
+    ])
+    train_dataset = ImageFolder(root=train_dir, transform=transform)
+    valid_dataset = ImageFolder(root=valid_dir, transform=transform)
+    return train_dataset, valid_dataset
+
+
+def get_dataset(cls, train_dir=None, valid_data=None):
     MEAN = [0.49139968, 0.48215827, 0.44653124]
     STD = [0.24703233, 0.24348505, 0.26158768]
     transf = [
@@ -23,6 +37,8 @@
     if cls == "cifar10":
         dataset_train = CIFAR10(root="./data", train=True, download=True, transform=train_transform)
         dataset_valid = CIFAR10(root="./data", train=False, download=True, transform=valid_transform)
+    elif cls == "custom_classification":
+        dataset_train, dataset_valid = get_custom_dataset(train_dir, valid_data)
     else:
         raise NotImplementedError
     return dataset_train, dataset_valid

diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py
index 380483cb1e..073c96c44d 100644
--- a/examples/nas/enas/search.py
+++ b/examples/nas/enas/search.py
@@ -23,18 +23,23 @@
     parser = ArgumentParser("enas")
     parser.add_argument("--batch-size", default=128, type=int)
     parser.add_argument("--log-frequency", default=10, type=int)
+    parser.add_argument("--num-classes", default=2, type=int)
+    parser.add_argument("--dataset", default="cifar10", choices=["cifar10", "custom_classification"])
     parser.add_argument("--search-for", choices=["macro", "micro"], default="macro")
     parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)")
     parser.add_argument("--visualization", default=True, action="store_true")
+    parser.add_argument("--train-data-dir", default="/home/savan/Documents/train_data", help="train dataset for classification")
+    parser.add_argument("--valid-data-dir", default="/home/savan/Documents/test_data", help="validation dataset for classification")
     args = parser.parse_args()
 
-    dataset_train, dataset_valid = datasets.get_dataset("cifar10")
+    dataset_train, dataset_valid = datasets.get_dataset(args.dataset, train_dir=args.train_data_dir, valid_data=args.valid_data_dir)
+    print(len(dataset_train))
     if args.search_for == "macro":
-        model = GeneralNetwork()
+        model = GeneralNetwork(num_classes=args.num_classes)
         num_epochs = args.epochs or 310
         mutator = None
     elif args.search_for == "micro":
-        model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True)
+        model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, num_classes=args.num_classes, use_aux_heads=True)
         num_epochs = args.epochs or 150
         mutator = enas.EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True)
     else:

diff --git a/nni/algorithms/nas/pytorch/enas/trainer.py b/nni/algorithms/nas/pytorch/enas/trainer.py
index 5e7a966580..f67c38060a 100644
--- a/nni/algorithms/nas/pytorch/enas/trainer.py
+++ b/nni/algorithms/nas/pytorch/enas/trainer.py
@@ -100,6 +100,7 @@ def init_dataloader(self):
         indices = list(range(n_train))
         train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:-split])
         valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[-split:])
+        print("Loading dataset of size", n_train)
         self.train_loader = torch.utils.data.DataLoader(self.dataset_train,
                                                         batch_size=self.batch_size,
                                                         sampler=train_sampler,

diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py
index cc740e7595..6024a05a4a 100644
--- a/nni/nas/pytorch/trainer.py
+++ b/nni/nas/pytorch/trainer.py
@@ -92,7 +92,6 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs,
         self.batch_size = batch_size
         self.workers = workers
         self.log_frequency = log_frequency
-        print("Log dir...")
         self.log_dir = os.path.join("/mnt/output", str(time.time()))
         os.makedirs(self.log_dir, exist_ok=True)
         self.status_writer = open(os.path.join(self.log_dir, "log"), "w")
@@ -180,12 +179,12 @@ def enable_visualization(self):
         Enable visualization. Write graph and training log to folder ``logs/``.
         """
         sample = None
+        # print(len(self.train_loader))
         for x, _ in self.train_loader:
             sample = x.to(self.device)[:2]
             break
         if sample is None:
             _logger.warning("Sample is %s.", sample)
-        _logger.info("Visualization: %s",self.log_dir)
         _logger.info("Creating graph json, writing to %s. Visualization enabled.", self.log_dir)
         with open(os.path.join(self.log_dir, "graph.json"), "w") as f:
             json.dump(self.mutator.graph(sample), f)

From 812bdbbedc0d5b65520239fe9e946f7edb062e55 Mon Sep 17 00:00:00 2001
From: savan
Date: Tue, 3 Nov 2020 10:31:57 -0600
Subject: [PATCH 07/70] perform resizing before converting it into tensor

---
 examples/nas/enas/datasets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/nas/enas/datasets.py b/examples/nas/enas/datasets.py
index d781ba0a8a..3a45d97bef 100644
--- a/examples/nas/enas/datasets.py
+++ b/examples/nas/enas/datasets.py
@@ -11,8 +11,8 @@ def get_custom_dataset(train_dir, valid_dir):
 
     """
     transform = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Resize((32, 32))
+        transforms.Resize((32, 32)),
+        transforms.ToTensor()
     ])
     train_dataset = ImageFolder(root=train_dir, transform=transform)
     valid_dataset = ImageFolder(root=valid_dir, transform=transform)

From d04ab5ec82a2d00f217d8c0f2ed7c9eaaa4812d7 Mon Sep 17 00:00:00 2001
From: savan
Date: Thu, 5 Nov 2020 17:11:13 -0600
Subject: [PATCH 08/70] add data processing script
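
Note: the script only reads <label>, <image> and <tag> nodes, so any
CVAT XML dump shaped roughly like the following should work (names
and labels below are hypothetical):

    <annotations>
      <meta>
        <label><name>cat</name></label>
        <label><name>dog</name></label>
      </meta>
      <image name="0001.jpg">
        <tag label="cat"/>
      </image>
    </annotations>

Images are moved out of --image_dir into one folder per label under
--data_dir.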
---
 examples/nas/enas/search.py |  1 -
 prepare_data.py             | 27 +++++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 prepare_data.py

diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py
index 073c96c44d..d4e2b03443 100644
--- a/examples/nas/enas/search.py
+++ b/examples/nas/enas/search.py
@@ -33,7 +33,6 @@
     args = parser.parse_args()
 
     dataset_train, dataset_valid = datasets.get_dataset(args.dataset, train_dir=args.train_data_dir, valid_data=args.valid_data_dir)
-    print(len(dataset_train))
     if args.search_for == "macro":
         model = GeneralNetwork(num_classes=args.num_classes)
         num_epochs = args.epochs or 310

diff --git a/prepare_data.py b/prepare_data.py
new file mode 100644
index 0000000000..33806b66ab
--- /dev/null
+++ b/prepare_data.py
@@ -0,0 +1,27 @@
+import xml.etree.ElementTree as ET
+import os
+import argparse
+
+def main(args):
+
+    tree = ET.parse(args.xml_path)
+    root = tree.getroot()
+
+    # create directories
+    for label in root.iter('label'):
+        os.makedirs(os.path.join(args.data_dir, label.find('name').text))
+
+    for img in root.iter('image'):
+        # move image
+        lbl = img.find('tag').attrib['label']
+        if lbl:
+            os.rename(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, lbl, img.attrib['name']))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--xml_path', default='/mnt/data/datasets/annotations/default.xml')
+    parser.add_argument('--data_dir', default='/mnt/data/datasets/processed_data')
+    parser.add_argument('--image_dir', default='/mnt/data/datasets/images')
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file

From da463dd758dc2ba6697341ba90fe45a3dd95f6ae Mon Sep 17 00:00:00 2001
From: savan
Date: Fri, 6 Nov 2020 19:00:35 -0600
Subject: [PATCH 09/70] add generic pytorch classifier
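
The trial is a plain argparse script, so it can be smoke-tested
outside of NNI first (paths below are placeholders):

    python3 main.py --train_dir /path/to/train --test_dir /path/to/test \
        --model_type googlenet --num_classes 2 --epochs 1

and then launched as an experiment with:

    nnictl create --config examples/trials/pytorch-classifier/config.yml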
---
 examples/trials/pytorch-classifier/config.yml      |  21 +++
 .../trials/pytorch-classifier/config_aml.yml       |  25 +++
 .../pytorch-classifier/config_assessor.yml         |  27 +++
 .../config_frameworkcontroller.yml                 |  40 ++++
 .../pytorch-classifier/config_kubeflow.yml         |  32 ++++
 .../trials/pytorch-classifier/config_pai.yml       |  35 ++++
 .../pytorch-classifier/config_windows.yml          |  21 +++
 examples/trials/pytorch-classifier/main.py         | 172 ++++++++++++++++++
 .../pytorch-classifier/requirements.txt            |   2 +
 .../pytorch-classifier/search_space.json           |   6 +
 10 files changed, 381 insertions(+)
 create mode 100644 examples/trials/pytorch-classifier/config.yml
 create mode 100644 examples/trials/pytorch-classifier/config_aml.yml
 create mode 100644 examples/trials/pytorch-classifier/config_assessor.yml
 create mode 100644 examples/trials/pytorch-classifier/config_frameworkcontroller.yml
 create mode 100644 examples/trials/pytorch-classifier/config_kubeflow.yml
 create mode 100644 examples/trials/pytorch-classifier/config_pai.yml
 create mode 100644 examples/trials/pytorch-classifier/config_windows.yml
 create mode 100644 examples/trials/pytorch-classifier/main.py
 create mode 100644 examples/trials/pytorch-classifier/requirements.txt
 create mode 100644 examples/trials/pytorch-classifier/search_space.json

diff --git a/examples/trials/pytorch-classifier/config.yml b/examples/trials/pytorch-classifier/config.yml
new file mode 100644
index 0000000000..c671dcf3dd
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config.yml
@@ -0,0 +1,21 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 10
+#choice: local, remote, pai
+trainingServicePlatform: local
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 main.py
+  codeDir: .
+  gpuNum: 0

diff --git a/examples/trials/pytorch-classifier/config_aml.yml b/examples/trials/pytorch-classifier/config_aml.yml
new file mode 100644
index 0000000000..d627e78b14
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_aml.yml
@@ -0,0 +1,25 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 10
+trainingServicePlatform: aml
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 main.py
+  codeDir: .
+  image: msranni/nni
+amlConfig:
+  subscriptionId: ${replace_to_your_subscriptionId}
+  resourceGroup: ${replace_to_your_resourceGroup}
+  workspaceName: ${replace_to_your_workspaceName}
+  computeTarget: ${replace_to_your_computeTarget}

diff --git a/examples/trials/pytorch-classifier/config_assessor.yml b/examples/trials/pytorch-classifier/config_assessor.yml
new file mode 100644
index 0000000000..3aca3ffb5d
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_assessor.yml
@@ -0,0 +1,27 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 50
+#choice: local, remote
+trainingServicePlatform: local
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+assessor:
+  #choice: Medianstop, Curvefitting
+  builtinAssessorName: Curvefitting
+  classArgs:
+    epoch_num: 20
+    threshold: 0.9
+trial:
+  command: python3 main.py
+  codeDir: .
+  gpuNum: 0

diff --git a/examples/trials/pytorch-classifier/config_frameworkcontroller.yml b/examples/trials/pytorch-classifier/config_frameworkcontroller.yml
new file mode 100644
index 0000000000..aecf6b18bf
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_frameworkcontroller.yml
@@ -0,0 +1,40 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 10
+#choice: local, remote, pai, kubeflow
+trainingServicePlatform: frameworkcontroller
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+assessor:
+  builtinAssessorName: Medianstop
+  classArgs:
+    optimize_mode: maximize
+trial:
+  codeDir: .
+  taskRoles:
+    - name: worker
+      taskNum: 1
+      command: python3 main.py
+      gpuNum: 1
+      cpuNum: 1
+      memoryMB: 8192
+      image: msranni/nni:latest
+      frameworkAttemptCompletionPolicy:
+        minFailedTaskCount: 1
+        minSucceededTaskCount: 1
+frameworkcontrollerConfig:
+  storage: nfs
+  nfs:
+    # Your NFS server IP, like 10.10.10.10
+    server: {your_nfs_server_ip}
+    # Your NFS server export path, like /var/nfs/nni
+    path: {your_nfs_server_export_path}
\ No newline at end of file

diff --git a/examples/trials/pytorch-classifier/config_kubeflow.yml b/examples/trials/pytorch-classifier/config_kubeflow.yml
new file mode 100644
index 0000000000..5bf2804352
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_kubeflow.yml
@@ -0,0 +1,32 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 1
+#choice: local, remote, pai, kubeflow
+trainingServicePlatform: kubeflow
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  codeDir: .
+  worker:
+    replicas: 1
+    command: python3 main.py
+    gpuNum: 0
+    cpuNum: 1
+    memoryMB: 8192
+    image: msranni/nni:latest
+kubeflowConfig:
+  operator: tf-operator
+  apiVersion: v1alpha2
+  storage: nfs
+  nfs:
+    server: 10.10.10.10
+    path: /var/nfs/general
\ No newline at end of file

diff --git a/examples/trials/pytorch-classifier/config_pai.yml b/examples/trials/pytorch-classifier/config_pai.yml
new file mode 100644
index 0000000000..032525f54d
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_pai.yml
@@ -0,0 +1,35 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 10
+#choice: local, remote, pai
+trainingServicePlatform: pai
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 main.py
+  codeDir: .
+  gpuNum: 0
+  cpuNum: 1
+  memoryMB: 8196
+  #The docker image to run nni job on pai
+  image: msranni/nni:latest
+  nniManagerNFSMountPath: {replace_to_your_nfs_mount_path}
+  containerNFSMountPath: {replace_to_your_container_mount_path}
+  paiStorageConfigName: {replace_to_your_storage_config_name}
+paiConfig:
+  #The username to login pai
+  userName: username
+  #The token to login pai
+  token: token
+  #The host of restful server of pai
+  host: 10.10.10.10
\ No newline at end of file

diff --git a/examples/trials/pytorch-classifier/config_windows.yml b/examples/trials/pytorch-classifier/config_windows.yml
new file mode 100644
index 0000000000..3dd7325b33
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_windows.yml
@@ -0,0 +1,21 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 10
+#choice: local, remote, pai
+trainingServicePlatform: local
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python main.py
+  codeDir: .
+  gpuNum: 0

diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py
new file mode 100644
index 0000000000..1a01193de5
--- /dev/null
+++ b/examples/trials/pytorch-classifier/main.py
@@ -0,0 +1,172 @@
+""" + +import os +import argparse +import logging +import nni +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from nni.utils import merge_parameter +from torchvision.datasets import ImageFolder +import torchvision.models as models +from torchvision import datasets, transforms + +logger = logging.getLogger('pytorch_classifier') + + +# mean = 0.0 +# for images, _ in loader: +# batch_samples = images.size(0) +# images = images.view(batch_samples, images.size(1), -1) +# mean += images.mean(2).sum(0) +# mean = mean / len(loader.dataset) + +# var = 0.0 +# for images, _ in loader: +# batch_samples = images.size(0) +# images = images.view(batch_samples, images.size(1), -1) +# var += ((images - mean.unsqueeze(1))**2).sum([0,2]) +# std = torch.sqrt(var / (len(loader.dataset)*224*224)) + + +def build_model(model_type, num_classes): + if model_type == "googlenet": + model = models.googlenet(pretrained=True) + in_features = 1024 + elif model_type == "resnet50": + model = models.resnet50(pretrained=True) + in_features = 2048 + model.fc = nn.Sequential(nn.Linear(in_features, num_classes), + nn.LogSoftmax(dim=1)) + return model + + +def train_one_epoch(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + if (args['batch_num'] is not None) and batch_idx >= args['batch_num']: + break + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args['log_interval'] == 0: + logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. * batch_idx / len(train_loader), loss.item())) + + +def test(args, model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, reduction='sum').item() + # get the index of the max log-probability + pred = output.argmax(dim=1, keepdim=True) + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + accuracy = 100. 
* correct / len(test_loader.dataset) + + logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), accuracy)) + + return accuracy + + +def train(args): + use_cuda = not args['no_cuda'] and torch.cuda.is_available() + + torch.manual_seed(args['seed']) + + device = torch.device("cuda" if use_cuda else "cpu") + + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + train_loader = torch.utils.data.DataLoader( + ImageFolder(root=args['train_dir'], transform=transforms.Compose([ + transforms.ToTensor(), + # add Normlize with mean and std + ])), + batch_size=args['batch_size'], shuffle=True, **kwargs) + test_loader = torch.utils.data.DataLoader( + ImageFolder(root=args['test_dir'], transform=transforms.Compose([ + transforms.ToTensor(), + # add Normlize with mean and std + ])), + batch_size=args['batch_size'], shuffle=True, **kwargs) + + + model = build_model(args['model_type'], args['num_classes']).to(device) + optimizer = optim.SGD(model.parameters(), lr=args['lr'], + momentum=args['momentum']) + + for epoch in range(1, args['epochs'] + 1): + train_one_epoch(args, model, device, train_loader, optimizer, epoch) + test_acc = test(args, model, device, test_loader) + + # report intermediate result + nni.report_intermediate_result(test_acc) + logger.debug('test accuracy %g', test_acc) + logger.debug('Pipe send intermediate result done.') + + # report final result + nni.report_final_result(test_acc) + logger.debug('Final result is %g', test_acc) + logger.debug('Send final result done.') + + +def get_params(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch Classification Example') + parser.add_argument("--train_dir", type=str, + default='/home/savan/Documents/train_data', help="train data directory") + parser.add_argument("--test_dir", type=str, + default='/home/savan/Documents/test_data', help="test data directory") + parser.add_argument("--model_type", type=str, + default='googlenet', help="model to train") + parser.add_argument('--batch_size', type=int, default=1, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument("--batch_num", type=int, default=None) + parser.add_argument("--num_classes", type=int, default=2, metavar='N', + help='number of classes in the dataset') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--no_cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--log_interval', type=int, default=1000, metavar='N', + help='how many batches to wait before logging training status') + + + args, _ = parser.parse_known_args() + return args + + +if __name__ == '__main__': + try: + tuner_params = nni.get_next_parameter() + logger.debug(tuner_params) + params = vars(merge_parameter(get_params(), tuner_params)) + print("Current Parameters:\n") + print(params) + train(params) + except Exception as exception: + logger.exception(exception) + raise diff --git a/examples/trials/pytorch-classifier/requirements.txt b/examples/trials/pytorch-classifier/requirements.txt new file mode 100644 index 
0000000000..01f6b72556 --- /dev/null +++ b/examples/trials/pytorch-classifier/requirements.txt @@ -0,0 +1,2 @@ +torch +torchvision diff --git a/examples/trials/pytorch-classifier/search_space.json b/examples/trials/pytorch-classifier/search_space.json new file mode 100644 index 0000000000..c26cdce369 --- /dev/null +++ b/examples/trials/pytorch-classifier/search_space.json @@ -0,0 +1,6 @@ +{ + "batch_size": {"_type":"choice", "_value": [16, 32, 64, 128]}, + "hidden_size":{"_type":"choice","_value":[128, 256, 512, 1024]}, + "lr":{"_type":"choice","_value":[0.0001, 0.001, 0.01, 0.1]}, + "momentum":{"_type":"uniform","_value":[0, 1]} +} From 7688fd281d7e2b5da0b1b209eb9d774171b12718 Mon Sep 17 00:00:00 2001 From: Savan Visalpara Date: Mon, 9 Nov 2020 05:17:23 -0600 Subject: [PATCH 10/70] train model with specific parameters --- .../trials/pytorch-classifier/train_model.py | 167 ++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 examples/trials/pytorch-classifier/train_model.py diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py new file mode 100644 index 0000000000..250c1e9b9e --- /dev/null +++ b/examples/trials/pytorch-classifier/train_model.py @@ -0,0 +1,167 @@ +""" +A general purpose classification script using PyTorch. +""" + +import argparse +import logging +import nni +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision.datasets import ImageFolder +import torchvision.models as models +from torchvision import datasets, transforms + +logger = logging.getLogger('pytorch_classifier') + + +# mean = 0.0 +# for images, _ in loader: +# batch_samples = images.size(0) +# images = images.view(batch_samples, images.size(1), -1) +# mean += images.mean(2).sum(0) +# mean = mean / len(loader.dataset) + +# var = 0.0 +# for images, _ in loader: +# batch_samples = images.size(0) +# images = images.view(batch_samples, images.size(1), -1) +# var += ((images - mean.unsqueeze(1))**2).sum([0,2]) +# std = torch.sqrt(var / (len(loader.dataset)*224*224)) + + +def build_model(model_type, num_classes): + if model_type == "googlenet": + model = models.googlenet(pretrained=True) + in_features = 1024 + elif model_type == "resnet50": + model = models.resnet50(pretrained=True) + in_features = 2048 + model.fc = nn.Sequential(nn.Linear(in_features, num_classes), + nn.LogSoftmax(dim=1)) + return model + + +def train_one_epoch(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + if (args['batch_num'] is not None) and batch_idx >= args['batch_num']: + break + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args['log_interval'] == 0: + logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. 
* batch_idx / len(train_loader), loss.item())) + + +def test(args, model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, reduction='sum').item() + # get the index of the max log-probability + pred = output.argmax(dim=1, keepdim=True) + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + accuracy = 100. * correct / len(test_loader.dataset) + + logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), accuracy)) + + return accuracy + + +def train(args): + use_cuda = not args['no_cuda'] and torch.cuda.is_available() + + torch.manual_seed(args['seed']) + + device = torch.device("cuda" if use_cuda else "cpu") + + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + train_loader = torch.utils.data.DataLoader( + ImageFolder(root=args['train_dir'], transform=transforms.Compose([ + transforms.ToTensor(), + # add Normlize with mean and std + ])), + batch_size=args['batch_size'], shuffle=True, **kwargs) + test_loader = torch.utils.data.DataLoader( + ImageFolder(root=args['test_dir'], transform=transforms.Compose([ + transforms.ToTensor(), + # add Normlize with mean and std + ])), + batch_size=args['batch_size'], shuffle=True, **kwargs) + + + model = build_model(args['model_type'], args['num_classes']).to(device) + optimizer = optim.SGD(model.parameters(), lr=args['lr'], + momentum=args['momentum']) + + for epoch in range(1, args['epochs'] + 1): + train_one_epoch(args, model, device, train_loader, optimizer, epoch) + test_acc = test(args, model, device, test_loader) + + # report intermediate result + nni.report_intermediate_result(test_acc) + logger.debug('test accuracy %g', test_acc) + logger.debug('Pipe send intermediate result done.') + + # report final result + nni.report_final_result(test_acc) + logger.debug('Final result is %g', test_acc) + logger.debug('Send final result done.') + + +def get_params(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch Classification Example') + parser.add_argument("--train_dir", type=str, + default='/home/savan/Documents/train_data', help="train data directory") + parser.add_argument("--test_dir", type=str, + default='/home/savan/Documents/test_data', help="test data directory") + parser.add_argument("--model_type", type=str, + default='googlenet', help="model to train") + parser.add_argument('--batch_size', type=int, default=1, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument("--batch_num", type=int, default=None) + parser.add_argument("--num_classes", type=int, default=2, metavar='N', + help='number of classes in the dataset') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--no_cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--log_interval', type=int, default=1000, metavar='N', + 
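
train_model.py mirrors main.py but drops the tuner handshake
(nni.get_next_parameter / merge_parameter), so a single fixed
configuration can be trained directly (paths below are placeholders):

    python3 train_model.py --model_type googlenet --num_classes 2 \
        --lr 0.01 --momentum 0.5 --batch_size 16 --epochs 10 \
        --train_dir /path/to/train --test_dir /path/to/test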
---
 .../trials/pytorch-classifier/train_model.py | 167 ++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 examples/trials/pytorch-classifier/train_model.py

diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py
new file mode 100644
index 0000000000..250c1e9b9e
--- /dev/null
+++ b/examples/trials/pytorch-classifier/train_model.py
@@ -0,0 +1,167 @@
+"""
+A general purpose classification script using PyTorch.
+"""
+
+import argparse
+import logging
+import nni
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision.datasets import ImageFolder
+import torchvision.models as models
+from torchvision import datasets, transforms
+
+logger = logging.getLogger('pytorch_classifier')
+
+
+# mean = 0.0
+# for images, _ in loader:
+#     batch_samples = images.size(0)
+#     images = images.view(batch_samples, images.size(1), -1)
+#     mean += images.mean(2).sum(0)
+# mean = mean / len(loader.dataset)
+
+# var = 0.0
+# for images, _ in loader:
+#     batch_samples = images.size(0)
+#     images = images.view(batch_samples, images.size(1), -1)
+#     var += ((images - mean.unsqueeze(1))**2).sum([0,2])
+# std = torch.sqrt(var / (len(loader.dataset)*224*224))
+
+
+def build_model(model_type, num_classes):
+        if model_type == "googlenet":
+            model = models.googlenet(pretrained=True)
+            in_features = 1024
+        elif model_type == "resnet50":
+            model = models.resnet50(pretrained=True)
+            in_features = 2048
+        model.fc = nn.Sequential(nn.Linear(in_features, num_classes),
+                             nn.LogSoftmax(dim=1))
+        return model
+
+
+def train_one_epoch(args, model, device, train_loader, optimizer, epoch):
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        if (args['batch_num'] is not None) and batch_idx >= args['batch_num']:
+            break
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % args['log_interval'] == 0:
+            logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                epoch, batch_idx * len(data), len(train_loader.dataset),
+                100. * batch_idx / len(train_loader), loss.item()))
+
+
+def test(args, model, device, test_loader):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            # sum up batch loss
+            test_loss += F.nll_loss(output, target, reduction='sum').item()
+            # get the index of the max log-probability
+            pred = output.argmax(dim=1, keepdim=True)
+            correct += pred.eq(target.view_as(pred)).sum().item()
+
+    test_loss /= len(test_loader.dataset)
+
+    accuracy = 100. * correct / len(test_loader.dataset)
+
+    logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+        test_loss, correct, len(test_loader.dataset), accuracy))
+
+    return accuracy
+
+
+def train(args):
+    use_cuda = not args['no_cuda'] and torch.cuda.is_available()
+
+    torch.manual_seed(args['seed'])
+
+    device = torch.device("cuda" if use_cuda else "cpu")
+
+    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
+    train_loader = torch.utils.data.DataLoader(
+       ImageFolder(root=args['train_dir'], transform=transforms.Compose([
+       transforms.ToTensor(),
+            # add Normalize with mean and std
+        ])),
+        batch_size=args['batch_size'], shuffle=True, **kwargs)
+    test_loader = torch.utils.data.DataLoader(
+        ImageFolder(root=args['test_dir'], transform=transforms.Compose([
+       transforms.ToTensor(),
+            # add Normalize with mean and std
+        ])),
+        batch_size=args['batch_size'], shuffle=True, **kwargs)
+
+
+    model = build_model(args['model_type'], args['num_classes']).to(device)
+    optimizer = optim.SGD(model.parameters(), lr=args['lr'],
+                          momentum=args['momentum'])
+
+    for epoch in range(1, args['epochs'] + 1):
+        train_one_epoch(args, model, device, train_loader, optimizer, epoch)
+        test_acc = test(args, model, device, test_loader)
+
+        # report intermediate result
+        nni.report_intermediate_result(test_acc)
+        logger.debug('test accuracy %g', test_acc)
+        logger.debug('Pipe send intermediate result done.')
+
+    # report final result
+    nni.report_final_result(test_acc)
+    logger.debug('Final result is %g', test_acc)
+    logger.debug('Send final result done.')
+
+
+def get_params():
+    # Training settings
+    parser = argparse.ArgumentParser(description='PyTorch Classification Example')
+    parser.add_argument("--train_dir", type=str,
+                        default='/home/savan/Documents/train_data', help="train data directory")
+    parser.add_argument("--test_dir", type=str,
+                        default='/home/savan/Documents/test_data', help="test data directory")
+    parser.add_argument("--model_type", type=str,
+                        default='googlenet', help="model to train")
+    parser.add_argument('--batch_size', type=int, default=1, metavar='N',
+                        help='input batch size for training (default: 1)')
+    parser.add_argument("--batch_num", type=int, default=None)
+    parser.add_argument("--num_classes", type=int, default=2, metavar='N',
+                        help='number of classes in the dataset')
+    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
+                        help='learning rate (default: 0.01)')
+    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
+                        help='SGD momentum (default: 0.5)')
+    parser.add_argument('--epochs', type=int, default=10, metavar='N',
+                        help='number of epochs to train (default: 10)')
+    parser.add_argument('--seed', type=int, default=1, metavar='S',
+                        help='random seed (default: 1)')
+    parser.add_argument('--no_cuda', action='store_true', default=False,
+                        help='disables CUDA training')
+    parser.add_argument('--log_interval', type=int, default=1000, metavar='N',
+                        help='how many batches to wait before logging training status')
+
+    args, _ = parser.parse_known_args()
+    return args
+
+
+if __name__ == '__main__':
+    try:
+        params = get_params()
+        print("Current Parameters:\n")
+        print(params)
+        train(params)
+    except Exception as exception:
+        logger.exception(exception)
+        raise

From 4ba1420000a75dc1f854575291d944fa65eedadb Mon Sep 17 00:00:00 2001
From: Savan Visalpara
Date: Mon, 9 Nov 2020 05:28:51 -0600
Subject: [PATCH 11/70] add workflow template

---
 .../trials/pytorch-classifier/template.yml | 196 ++++++++++++++++++
 1 file changed, 196 insertions(+)
 create mode 100644 examples/trials/pytorch-classifier/template.yml

diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml
new file mode 100644
index 0000000000..6e3a45808d
--- /dev/null
+++ b/examples/trials/pytorch-classifier/template.yml
@@ -0,0 +1,196 @@
+entrypoint: main
+arguments:
+  parameters:
+  - name: source
+    value: https://github.com/onepanelio/nni.git
+  - name: cvat-annotation-path
+    value: annotation-dump/animals/11052020231652
+    displayName: Dataset path
+    hint: Path to annotated data in default object storage (i.e S3). In CVAT, this parameter will be pre-populated.
+    visibility: private
+  - name: num-classes
+    displayName: Number of classes
+    visibility: public
+    value: 2
+  - name: learning-rate
+    value: 0.01
+    displayName: Learning rate
+    visibility: public
+  - name: batch-size
+    value: 1
+    displayName: Batch size
+    visibility: public
+  - name: momentum
+    value: 0.5
+    displayName: Momentum
+    visibility: public
+  - name: model-type
+    displayName: Model type
+    visibility: public
+    value: googlenet
+    options:
+    - name: 'GoogleNet'
+      value: 'googlenet'
+    - name: 'ResNet50'
+      value: 'resnet50'
+  - name: epochs
+    value: 1
+  - name: search-method
+    value: macro
+    type: select.select
+    options:
+    - name: 'Macro'
+      value: macro
+    - name: 'Micro'
+      value: micro
+  - displayName: Node pool
+    hint: Name of node pool or group to run this workflow task
+    type: select.select
+    visibility: public
+    name: sys-node-pool
+    value: Standard_D4s_v3
+    required: true
+    options:
+    - name: 'CPU: 2, RAM: 8GB'
+      value: Standard_D2s_v3
+    - name: 'CPU: 4, RAM: 16GB'
+      value: Standard_D4s_v3
+    - name: 'GPU: 1xK80, CPU: 6, RAM: 56GB'
+      value: Standard_NC6
+    - name: 'GPU: 1xV100, CPU: 6, RAM: 56GB'
+      value: Standard_NC6s_v3
+
+volumeClaimTemplates:
+  - metadata:
+      name: data
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 2Gi
+  - metadata:
+      name: output
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 2Gi
+templates:
+  - name: main
+    dag:
+      tasks:
+      - name: nas
+        template: pytorch
+      - name: hyperparameter-tuning
+        template: hyperop
+      - name: model-training
+        template: model-param
+  - name: pytorch
+    inputs:
+      artifacts:
+      - name: data
+        path: /mnt/data/datasets/
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+    outputs:
+      artifacts:
+      - name: model
+        path: /mnt/output
+        optional: true
+        archive:
+          none: {}
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ git && \
+        python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \
+        git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
+        cd nni/ && \
+        python3 setup.py install && \
+        python3 prepare_data.py
+        python3 examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}} --num-classes {{workflow.parameters.num-classes}} \
+        --dataset custom_classification --train-data-dir /mnt/data/datasets/processed_data --valid-data-dir /mnt/data/datasets/processed_data
+      workingDir: /mnt
+      volumeMounts:
+        - name: data
+          mountPath: /mnt/data
+        - name: output
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
+  - name: hyperop
+    inputs:
+      artifacts:
+      - name: data
+        path: /mnt/data/datasets/
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+    outputs:
+      artifacts:
+      - name: model
+        path: /mnt/output
+        optional: true
+        archive:
+          none: {}
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ git && \
+        python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \
+        git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
+        cd nni/ && \
+        python3 setup.py install && \
+        python3 prepare_data.py && \
+        python3 examples/trials/pytorch-classifier/main.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \
+        --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data
+      workingDir: /mnt
+      volumeMounts:
+        - name: data
+          mountPath: /mnt/data
+        - name: output
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
+
+  - name: model-param
+    inputs:
+      artifacts:
+      - name: data
+        path: /mnt/data/datasets/
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+    outputs:
+      artifacts:
+      - name: model
+        path: /mnt/output
+        optional: true
+        archive:
+          none: {}
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ git && \
+        python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \
+        git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
+        cd nni/ && \
+        python3 setup.py install && \
+        python3 prepare_data.py && \
+        python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \
+        --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}}
+      workingDir: /mnt
+      volumeMounts:
+        - name: data
+          mountPath: /mnt/data
+        - name: output
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
\ No newline at end of file

From 6c6333d238c652108e0fee5311467516ceac0962 Mon Sep 17 00:00:00 2001
From: Savan Visalpara
Date: Mon, 9 Nov 2020 13:16:48 -0600
Subject: [PATCH 12/70] use shutil to move files between different file
 systems

---
 prepare_data.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/prepare_data.py b/prepare_data.py
index 33806b66ab..5ac01adfc4 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -1,5 +1,6 @@
 import xml.etree.ElementTree as ET
 import os
+import shutil
 import argparse
 
 def main(args):
@@ -15,7 +16,7 @@ def main(args):
         # move image
         lbl = img.find('tag').attrib['label']
         if lbl:
-            os.rename(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, lbl, img.attrib['name']))
+            shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, lbl, img.attrib['name']))
 
 
 if __name__ == '__main__':
@@ -24,4 +25,4 @@ def main(args):
     parser.add_argument('--data_dir', default='/mnt/data/datasets/processed_data')
     parser.add_argument('--image_dir', default='/mnt/data/datasets/images')
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+    main(args)

From a1e447bde4235c20d8f43b1e3e4488bd6a61a092 Mon Sep 17 00:00:00 2001
From: Savan Visalpara
Date: Mon, 9 Nov 2020 15:22:31 -0600
Subject: [PATCH 13/70] add script for model training with specific parameters

---
 .../trials/pytorch-classifier/template.yml   | 159 ++++++++++++++++--
 .../trials/pytorch-classifier/train_model.py |   1 -
 2 files changed, 148 insertions(+), 12 deletions(-)

diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml
index 6e3a45808d..6ed90c9258 100644
--- a/examples/trials/pytorch-classifier/template.yml
+++ b/examples/trials/pytorch-classifier/template.yml
@@ -8,6 +8,9 @@ arguments:
     displayName: Dataset path
     hint: Path to annotated data in default object storage (i.e S3). In CVAT, this parameter will be pre-populated.
     visibility: private
+  - name: cvat-output-path
+    value: workflow-data/output/nas/nas-model-comparison
+    visibility: private
   - name: num-classes
     displayName: Number of classes
     visibility: public
@@ -67,11 +70,67 @@ volumeClaimTemplates:
       accessModes: [ "ReadWriteOnce" ]
       resources:
        requests:
-          storage: 2Gi
+          storage: 20Gi
   - metadata:
       name: output
     spec:
      accessModes: [ "ReadWriteOnce" ]
       resources:
         requests:
-          storage: 2Gi
+          storage: 20Gi
+  - metadata:
+      name: data2
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: output2
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: data3
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: output3
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: data4
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: output4
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: data5
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: output5
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
@@ -78,10 +137,18 @@ templates:
   - name: main
     dag:
      tasks:
-      - name: nas
+      - name: process-data
+        template: process-data
+      - name: neural-architecture-search
         template: pytorch
+        dependencies: [process-data]
       - name: hyperparameter-tuning
         template: hyperop
+        dependencies: [process-data]
-      - name: model-training
+      - name: train-model
         template: model-param
+        dependencies: [process-data]
+      - name: compare-models
+        template: compare-models
+        dependencies: [neural-architecture-search, hyperparameter-tuning, train-model]
   - name: pytorch
     inputs:
       artifacts:
@@ -91,7 +158,7 @@ templates:
       - name: data
         path: /mnt/data/datasets/
         s3:
-          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-output-path}}/{{workflow.name}}'
     outputs:
       artifacts:
       - name: model
@@ -110,8 +177,7 @@ templates:
         git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
         cd nni/ && \
         python3 setup.py install && \
-        python3 prepare_data.py
         python3 examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}} --num-classes {{workflow.parameters.num-classes}} \
         --dataset custom_classification --train-data-dir /mnt/data/datasets/processed_data --valid-data-dir /mnt/data/datasets/processed_data
       workingDir: /mnt
@@ -122,14 +188,15 @@ templates:
       nodeSelector:
         beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
+
   - name: hyperop
     inputs:
       artifacts:
       - name: data
         path: /mnt/data/datasets/
         s3:
-          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-output-path}}/{{workflow.name}}'
     outputs:
       artifacts:
       - name: model
@@ -146,14 +213,13 @@ templates:
         git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
         cd nni/ && \
         python3 setup.py install && \
-        python3 prepare_data.py && \
         python3 examples/trials/pytorch-classifier/main.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \
         --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data
       workingDir: /mnt
       volumeMounts:
-        - name: data
+        - name: data2
           mountPath: /mnt/data
-        - name: output
+        - name: output2
           mountPath: /mnt/output
       nodeSelector:
         beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
@@ -163,7 +229,7 @@ templates:
   - name: model-param
     inputs:
       artifacts:
       - name: data
         path: /mnt/data/datasets/
         s3:
-          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-output-path}}/{{workflow.name}}'
     outputs:
       artifacts:
       - name: model
@@ -186,16 +252,81 @@ templates:
         git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
         cd nni/ && \
         python3 setup.py install && \
-        python3 prepare_data.py && \
         python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \
         --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}}
       workingDir: /mnt
       volumeMounts:
-        - name: data
+        - name: data3
           mountPath: /mnt/data
-        - name: output
+        - name: output3
           mountPath: /mnt/output
       nodeSelector:
-        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
\ No newline at end of file
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
+
+  - name: compare-models
+    inputs:
+      artifacts:
+      - name: data
+        path: /mnt/data/datasets/
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+    outputs:
+      artifacts:
+      - name: model
+        path: /mnt/output
+        optional: true
+        archive:
+          none: {}
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ git && \
+        python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \
+        git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
+        cd nni/ && \
+        python3 setup.py install && \
+        python3 prepare_data.py && \
+        python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \
+        --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}}
+      workingDir: /mnt
+      volumeMounts:
+        - name: data4
+          mountPath: /mnt/data
+        - name: output4
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
+
+  - name: process-data
+    inputs:
+      artifacts:
+      - name: data
+        path: /mnt/data/datasets/
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+    outputs:
+      artifacts:
+      - name: model
+        path: /mnt/output
+        optional: true
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-output-path}}/{{workflow.name}}'
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ git && \
+        python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \
+        git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
+        cd nni/ && \
+        python3 prepare_data.py --data_dir=/mnt/output/processed_data
+      workingDir: /mnt
+      volumeMounts:
+        - name: data5
+          mountPath: /mnt/data
+        - name: output5
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
\ No newline at end of file

diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py
index 250c1e9b9e..efa9403427 100644
--- a/examples/trials/pytorch-classifier/train_model.py
+++ b/examples/trials/pytorch-classifier/train_model.py
@@ -75,7 +75,6 @@ def test(args, model, device, test_loader):
             correct += pred.eq(target.view_as(pred)).sum().item()
 
     test_loss /= len(test_loader.dataset)
-
     accuracy = 100. * correct / len(test_loader.dataset)
 
     logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(

From 4609a4c2b756e2210444ce65930147d0ff01ceaa Mon Sep 17 00:00:00 2001
From: Savan Visalpara
Date: Mon, 9 Nov 2020 15:40:20 -0600
Subject: [PATCH 14/70] split dataset into train and test set

---
 prepare_data.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/prepare_data.py b/prepare_data.py
index 5ac01adfc4..9006914470 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -2,6 +2,7 @@
 import os
 import shutil
 import argparse
+import random
 
 def main(args):
 
@@ -10,13 +11,17 @@ def main(args):
 
     # create directories
     for label in root.iter('label'):
-        os.makedirs(os.path.join(args.data_dir, label.find('name').text))
+        os.makedirs(os.path.join(args.data_dir, 'train', label.find('name').text))
+        os.makedirs(os.path.join(args.data_dir, 'test', label.find('name').text))
 
     for img in root.iter('image'):
         # move image
         lbl = img.find('tag').attrib['label']
         if lbl:
-            shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, lbl, img.attrib['name']))
+            if random.randrange(100) < args.test_split:
+                shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'test', lbl, img.attrib['name']))
+            else:
+                shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'train', lbl, img.attrib['name']))
 
 
 if __name__ == '__main__':
@@ -24,5 +29,6 @@ def main(args):
     parser.add_argument('--xml_path', default='/mnt/data/datasets/annotations/default.xml')
     parser.add_argument('--data_dir', default='/mnt/data/datasets/processed_data')
     parser.add_argument('--image_dir', default='/mnt/data/datasets/images')
+    parser.add_argument('--test_split', default=20, type=int)
     args = parser.parse_args()
     main(args)

From 3e05a4b46ca5344b3d1008ec1ba69f7b7a482280 Mon Sep 17 00:00:00 2001
From: Savan Visalpara
Date: Mon, 9 Nov 2020 16:43:32 -0600
Subject: [PATCH 15/70] update logic for dataset split

---
 prepare_data.py | 7 +++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/prepare_data.py b/prepare_data.py
index 9006914470..6697653ced 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -13,13 +13,16 @@ def main(args):
     for label in root.iter('label'):
         os.makedirs(os.path.join(args.data_dir, 'train', label.find('name').text))
os.makedirs(os.path.join(args.data_dir, 'test', label.find('name').text)) - + images_len = len(list(root.iter('tag'))) + test_len = (images_len * args.test_split )// 100 + count = 0 for img in root.iter('image'): #move image lbl = img.find('tag').attrib['label'] if lbl: - if random.randrange(100) < args.test_split: + if bool(random.getrandbits(1)) and count <= test_len : shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'test', lbl, img.attrib['name'])) + count += 1 else: shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'train', lbl, img.attrib['name'])) From d96cd131fd18a93a9b9293161405b8dc9bafd19b Mon Sep 17 00:00:00 2001 From: Savan Visalpara Date: Mon, 9 Nov 2020 17:03:53 -0600 Subject: [PATCH 16/70] add support for vgg and alexnet --- examples/trials/pytorch-classifier/main.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 1a01193de5..702f52b7a4 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -40,7 +40,17 @@ def build_model(model_type, num_classes): elif model_type == "resnet50": model = models.resnet50(pretrained=True) in_features = 2048 - model.fc = nn.Sequential(nn.Linear(in_features, num_classes), + elif model_type == "alexnet": + model = models.alexnet(pretrained=True) + in_features = 4096 + elif model_type == "vgg19": + model = models.alexnet(pretrained=True) + in_features = 4096 + if model_type in ['alexnet', 'vgg19']: + model.classifier._modules['6'] = nn.Sequential(nn.Linear(in_features, num_classes), + nn.LogSoftmax(dim=1)) + else: + model.fc = nn.Sequential(nn.Linear(in_features, num_classes), nn.LogSoftmax(dim=1)) return model From 1a99b518f80f153e69e8f265f0d10f1c04940767 Mon Sep 17 00:00:00 2001 From: Savan Visalpara Date: Mon, 9 Nov 2020 17:54:36 -0600 Subject: [PATCH 17/70] resolve indentation and subscription issue --- examples/trials/pytorch-classifier/main.py | 12 ++++++------ examples/trials/pytorch-classifier/train_model.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 702f52b7a4..a08eac344a 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -34,12 +34,12 @@ def build_model(model_type, num_classes): - if model_type == "googlenet": - model = models.googlenet(pretrained=True) - in_features = 1024 - elif model_type == "resnet50": - model = models.resnet50(pretrained=True) - in_features = 2048 + if model_type == "googlenet": + model = models.googlenet(pretrained=True) + in_features = 1024 + elif model_type == "resnet50": + model = models.resnet50(pretrained=True) + in_features = 2048 elif model_type == "alexnet": model = models.alexnet(pretrained=True) in_features = 4096 diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index efa9403427..bc99af1072 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -157,7 +157,7 @@ def get_params(): if __name__ == '__main__': try: - params = get_params() + params = vars(get_params()) print("Current Parameters:\n") print(params) train(params) From fb23a16f58f9a32d2abe9ac090224fc35bb0fe08 Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 9 Nov 2020 19:59:10 -0600 Subject: 
From fb23a16f58f9a32d2abe9ac090224fc35bb0fe08 Mon Sep 17 00:00:00 2001
From: savan
Date: Mon, 9 Nov 2020 19:59:10 -0600
Subject: [PATCH 18/70] add alexnet and vgg support for specific param training

---
 examples/trials/pytorch-classifier/main.py        |  8 ++---
 .../trials/pytorch-classifier/train_model.py      | 32 ++++++++++++-------
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py
index a08eac344a..de2d591467 100644
--- a/examples/trials/pytorch-classifier/main.py
+++ b/examples/trials/pytorch-classifier/main.py
@@ -50,9 +50,9 @@ def build_model(model_type, num_classes):
         model.classifier._modules['6'] = nn.Sequential(nn.Linear(in_features, num_classes),
                                 nn.LogSoftmax(dim=1))
     else:
-        model.fc = nn.Sequential(nn.Linear(in_features, num_classes),
+        model.fc = nn.Sequential(nn.Linear(in_features, num_classes), 
                          nn.LogSoftmax(dim=1))
-    return model
+    return model 
@@ -105,8 +105,8 @@ def train(args):
     kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
     train_loader = torch.utils.data.DataLoader(
-        ImageFolder(root=args['train_dir'], transform=transforms.Compose([
-            transforms.ToTensor(),
+        ImageFolder(root=args['train_dir'], transform=transforms.Compose([ 
+            transforms.ToTensor(), 
             # add Normlize with mean and std
         ])),
         batch_size=args['batch_size'], shuffle=True, **kwargs)
diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py
index bc99af1072..9edf6ebdf4 100644
--- a/examples/trials/pytorch-classifier/train_model.py
+++ b/examples/trials/pytorch-classifier/train_model.py
@@ -32,15 +32,25 @@ def build_model(model_type, num_classes):
-    if model_type == "googlenet":
-        model = models.googlenet(pretrained=True)
-        in_features = 1024
-    elif model_type == "resnet50":
-        model = models.resnet50(pretrained=True)
-        in_features = 2048
-        model.fc = nn.Sequential(nn.Linear(in_features, num_classes),
+    if model_type == "googlenet":
+        model = models.googlenet(pretrained=True)
+        in_features = 1024
+    elif model_type == "resnet50":
+        model = models.resnet50(pretrained=True)
+        in_features = 2048
+    elif model_type == "alexnet":
+        model = models.alexnet(pretrained=True)
+        in_features = 4096
+    elif model_type == "vgg19":
+        model = models.vgg19(pretrained=True)
+        in_features = 4096
+    if model_type in ['alexnet', 'vgg19']:
+        model.classifier._modules['6'] = nn.Sequential(nn.Linear(in_features, num_classes),
+                                nn.LogSoftmax(dim=1))
+    else:
+        model.fc = nn.Sequential(nn.Linear(in_features, num_classes),
                          nn.LogSoftmax(dim=1))
-    return model
+    return model 
@@ -92,14 +102,14 @@ def train(args):
     kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
     train_loader = torch.utils.data.DataLoader(
-        ImageFolder(root=args['train_dir'], transform=transforms.Compose([
-            transforms.ToTensor(),
+        ImageFolder(root=args['train_dir'], transform=transforms.Compose([ 
+            transforms.ToTensor(), 
             # add Normlize with mean and std
         ])),
         batch_size=args['batch_size'], shuffle=True, **kwargs)
     test_loader = torch.utils.data.DataLoader(
         ImageFolder(root=args['test_dir'], transform=transforms.Compose([
-            transforms.ToTensor(),
+            transforms.ToTensor(), 
             # add Normlize with mean and std
         ])),
         batch_size=args['batch_size'], shuffle=True, **kwargs)

From 00ba638f382b716ef88dec4fe9dae915dedd1b05 Mon Sep 17 00:00:00 2001
From: savan
Date: Tue, 10 Nov 2020 16:40:17 -0600
Subject: [PATCH 19/70] changes to persist metrics

---
 examples/nas/enas/search.py | 6 ++--
examples/trials/pytorch-classifier/main.py | 9 ++--- .../trials/pytorch-classifier/template.yml | 12 +++++-- .../trials/pytorch-classifier/train_model.py | 33 ++++++++++++------- nni/algorithms/nas/pytorch/enas/trainer.py | 1 + nni/nas/pytorch/trainer.py | 2 +- 6 files changed, 42 insertions(+), 21 deletions(-) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index d4e2b03443..c1ac75cdff 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -2,9 +2,8 @@ # Licensed under the MIT license. import logging -import time from argparse import ArgumentParser - +import json import torch import torch.nn as nn @@ -63,3 +62,6 @@ if args.visualization: trainer.enable_visualization() trainer.train() + metrics = [{'name':'accuracy', 'value':trainer.val_model_summary['acc1'].avg}, {'name':'loss', 'value':trainer.val_model_summary['loss'].avg}] + with open('/tmp/sys-metrics.json', 'w') as f: + json.dump(metrics, f) \ No newline at end of file diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index de2d591467..46c32bbb0c 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -93,7 +93,7 @@ def test(args, model, device, test_loader): logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( test_loss, correct, len(test_loader.dataset), accuracy)) - return accuracy + return accuracy, test_loss def train(args): @@ -124,7 +124,7 @@ def train(args): for epoch in range(1, args['epochs'] + 1): train_one_epoch(args, model, device, train_loader, optimizer, epoch) - test_acc = test(args, model, device, test_loader) + test_acc, test_loss = test(args, model, device, test_loader) # report intermediate result nni.report_intermediate_result(test_acc) @@ -133,7 +133,8 @@ def train(args): # report final result nni.report_final_result(test_acc) - logger.debug('Final result is %g', test_acc) + print(test_acc, test_loss) + logger.debug('Final result is %g and loss is %g', test_acc, test_loss) logger.debug('Send final result done.') @@ -145,7 +146,7 @@ def get_params(): parser.add_argument("--test_dir", type=str, default='/home/savan/Documents/test_data', help="test data directory") parser.add_argument("--model_type", type=str, - default='googlenet', help="model to train") + default='alexnet', help="model to train") parser.add_argument('--batch_size', type=int, default=1, metavar='N', help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index 6ed90c9258..9b5c32946d 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -30,7 +30,7 @@ arguments: - name: model-type displayName: Model type visibility: public - value: googlenet + value: alexnet options: - name: 'GoogleNet' value: 'googlenet' @@ -152,6 +152,14 @@ templates: - name: compare-models template: compare-models dependencies: [neural-architecture-search, hyperparameter-tuning, train-model] + arguments: + artifacts: + - name: nas-metrics + from: "{{tasks.neural-architecture-search.outputs.artifacts.sys-metrics}}" + - name: hyperop-metrics + from: "{{tasks.hyperparameter-tuning.outputs.artifacts.sys-metrics}}" + - name: singlemodel-metrics + from: "{{tasks.model-param.outputs.artifacts.sys-metrics}}" - name: pytorch inputs: artifacts: @@ -249,7 +257,7 @@ templates: git clone 
--single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 setup.py install && \ - python3 examples/trials/pytorch-classifier/train_main.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ + python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}} workingDir: /mnt volumeMounts: diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 9edf6ebdf4..61e08e099a 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -4,8 +4,9 @@ import argparse import logging -import nni +import json import torch +import os import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -90,7 +91,7 @@ def test(args, model, device, test_loader): logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( test_loss, correct, len(test_loader.dataset), accuracy)) - return accuracy + return accuracy, test_loss def train(args): @@ -118,20 +119,20 @@ def train(args): model = build_model(args['model_type'], args['num_classes']).to(device) optimizer = optim.SGD(model.parameters(), lr=args['lr'], momentum=args['momentum']) + + if not os.path.exists('/mnt/output/fixed-params'): + os.makedirs('/mnt/output/fixed-params') for epoch in range(1, args['epochs'] + 1): train_one_epoch(args, model, device, train_loader, optimizer, epoch) - test_acc = test(args, model, device, test_loader) - + test_acc, test_loss = test(args, model, device, test_loader) + torch.save(model, '/mnt/output/fixed-params/fixed-params-model-epochs-{}-acc-{}'.format(epoch, round(test_acc, 2))) # report intermediate result - nni.report_intermediate_result(test_acc) - logger.debug('test accuracy %g', test_acc) - logger.debug('Pipe send intermediate result done.') + print('test accuracy: {} test loss: {}'.format(test_acc, test_loss)) # report final result - nni.report_final_result(test_acc) - logger.debug('Final result is %g', test_acc) - logger.debug('Send final result done.') + print('Final result is ', test_acc) + return test_acc, test_loss def get_params(): @@ -142,7 +143,7 @@ def get_params(): parser.add_argument("--test_dir", type=str, default='/home/savan/Documents/test_data', help="test data directory") parser.add_argument("--model_type", type=str, - default='googlenet', help="model to train") + default='alexnet', help="model to train") parser.add_argument('--batch_size', type=int, default=1, metavar='N', help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) @@ -170,7 +171,15 @@ def get_params(): params = vars(get_params()) print("Current Parameters:\n") print(params) - train(params) + acc, loss = train(params) + metrics = [ + {'name': 'accuracy', 'value': acc}, + {'name': 'loss', 'value': loss}, + ] + + # Write metrics to `/tmp/sys-metrics.json` + with open('/tmp/sys-metrics.json', 'w') as f: + json.dump(metrics, f) except Exception as exception: logger.exception(exception) raise diff --git a/nni/algorithms/nas/pytorch/enas/trainer.py 
b/nni/algorithms/nas/pytorch/enas/trainer.py index f67c38060a..33147b6174 100644 --- a/nni/algorithms/nas/pytorch/enas/trainer.py +++ b/nni/algorithms/nas/pytorch/enas/trainer.py @@ -208,3 +208,4 @@ def validate_one_epoch(self, epoch): logger.info("Test Epoch [%d/%d] Arc [%d/%d] Summary %s", epoch + 1, self.num_epochs, arc_id + 1, self.test_arc_per_epoch, meters.summary()) + return meters \ No newline at end of file diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py index 6024a05a4a..03a093328a 100644 --- a/nni/nas/pytorch/trainer.py +++ b/nni/nas/pytorch/trainer.py @@ -144,7 +144,7 @@ def train(self, validate=True): if validate: # validation _logger.info("Epoch %d Validating", epoch + 1) - self.validate_one_epoch(epoch) + self.val_model_summary = self.validate_one_epoch(epoch) for callback in self.callbacks: callback.on_epoch_end(epoch) From fd00e11e66dad037a1c017eef67ab4c0b9b41ba8 Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 10 Nov 2020 17:33:16 -0600 Subject: [PATCH 20/70] add model comparison script --- compare.py | 18 ++++++++++++++++++ examples/trials/pytorch-classifier/config.yml | 2 +- examples/trials/pytorch-classifier/main.py | 9 +++++++-- .../trials/pytorch-classifier/requirements.txt | 2 +- .../pytorch-classifier/search_space.json | 1 - 5 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 compare.py diff --git a/compare.py b/compare.py new file mode 100644 index 0000000000..253151583c --- /dev/null +++ b/compare.py @@ -0,0 +1,18 @@ +import json + +accuracies = {} +with open('/tmp/nas-metrics.json') as f: + nas = json.load(f) + +with open('/tmp/hyperop-metrics.json') as f: + hyper = json.load(f) + +with open('/tmp/singlemodel-metrics.json') as f: + fm = json.load(f) + +accuracies['nas_acc'] = [i['value'] for i in nas if i['name'] == 'accuracy'][0] +accuracies['hyper_acc'] = [i['value'] for i in hyper if i['name'] == 'accuracy'][0] +accuracies['fm_acc'] = [i['value'] for i in fm if i['name'] == 'accuracy'][0] + +max_acc_name = max(accuracies, key=accuracies.get) +print("Maximum accuracy was {} for {}".format(max(accuracies.values()), max_acc_name)) \ No newline at end of file diff --git a/examples/trials/pytorch-classifier/config.yml b/examples/trials/pytorch-classifier/config.yml index c671dcf3dd..d58b9e133d 100644 --- a/examples/trials/pytorch-classifier/config.yml +++ b/examples/trials/pytorch-classifier/config.yml @@ -2,7 +2,7 @@ authorName: default experimentName: pytorch_classifier trialConcurrency: 1 maxExecDuration: 10h -maxTrialNum: 10 +maxTrialNum: 15 #choice: local, remote, pai trainingServicePlatform: local searchSpacePath: search_space.json diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 46c32bbb0c..896fd9900c 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -103,16 +103,21 @@ def train(args): device = torch.device("cuda" if use_cuda else "cpu") + if args['model_type'] == 'alexnet': + w, h = 256, 256 + else: + w, h = 224, 224 + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch.utils.data.DataLoader( ImageFolder(root=args['train_dir'], transform=transforms.Compose([ - transforms.ToTensor(), + transforms.Resize((w, h)), transforms.ToTensor(), # add Normlize with mean and std ])), batch_size=args['batch_size'], shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader( ImageFolder(root=args['test_dir'], transform=transforms.Compose([ - transforms.ToTensor(), + 
transforms.Resize((w, h)), transforms.ToTensor(), # add Normlize with mean and std ])), batch_size=args['batch_size'], shuffle=True, **kwargs) diff --git a/examples/trials/pytorch-classifier/requirements.txt b/examples/trials/pytorch-classifier/requirements.txt index 01f6b72556..e7ccd30e3d 100644 --- a/examples/trials/pytorch-classifier/requirements.txt +++ b/examples/trials/pytorch-classifier/requirements.txt @@ -1,2 +1,2 @@ torch -torchvision +torchvision \ No newline at end of file diff --git a/examples/trials/pytorch-classifier/search_space.json b/examples/trials/pytorch-classifier/search_space.json index c26cdce369..978497f8fa 100644 --- a/examples/trials/pytorch-classifier/search_space.json +++ b/examples/trials/pytorch-classifier/search_space.json @@ -1,6 +1,5 @@ { "batch_size": {"_type":"choice", "_value": [16, 32, 64, 128]}, - "hidden_size":{"_type":"choice","_value":[128, 256, 512, 1024]}, "lr":{"_type":"choice","_value":[0.0001, 0.001, 0.01, 0.1]}, "momentum":{"_type":"uniform","_value":[0, 1]} } From d76ac418e97b2bad09b9d77b536a589f8e8822ee Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 10 Nov 2020 17:37:16 -0600 Subject: [PATCH 21/70] update template --- examples/trials/pytorch-classifier/template.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index 9b5c32946d..a37b43d7d5 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -276,6 +276,12 @@ templates: path: /mnt/data/datasets/ s3: key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}' + - name: nas-metrics + value: /tmp/nas-metrics.json + - name: hyperop-metrics + value: /tmp/hyperop-metrics.json + - name: singlemodel-metrics + value: /tmp/singlemodel-metrics.json outputs: artifacts: - name: model From cf5a6cd4f252d4fd182c927d0e6dd3d89f9bc946 Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 10 Nov 2020 17:39:47 -0600 Subject: [PATCH 22/70] correct typos in template --- examples/trials/pytorch-classifier/template.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index a37b43d7d5..e9afa5de53 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -159,7 +159,7 @@ templates: - name: hyperop-metrics from: "{{tasks.hyperparameter-tuning.outputs.artifacts.sys-metrics}}" - name: singlemodel-metrics - from: "{{tasks.model-param.outputs.artifacts.sys-metrics}}" + from: "{{tasks.train-model.outputs.artifacts.sys-metrics}}" - name: pytorch inputs: artifacts: @@ -277,11 +277,11 @@ templates: s3: key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}' - name: nas-metrics - value: /tmp/nas-metrics.json + path: /tmp/nas-metrics.json - name: hyperop-metrics - value: /tmp/hyperop-metrics.json + path: /tmp/hyperop-metrics.json - name: singlemodel-metrics - value: /tmp/singlemodel-metrics.json + path: /tmp/singlemodel-metrics.json outputs: artifacts: - name: model From c9a39042ad38253c99ae8705dc043d5a1c33f61b Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 10 Nov 2020 18:14:14 -0600 Subject: [PATCH 23/70] update path for processed data --- examples/trials/pytorch-classifier/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 
896fd9900c..9a5b2eff21 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -147,9 +147,9 @@ def get_params(): # Training settings parser = argparse.ArgumentParser(description='PyTorch Classification Example') parser.add_argument("--train_dir", type=str, - default='/home/savan/Documents/train_data', help="train data directory") + default='/mnt/data/datasets/processed_data/train', help="train data directory") parser.add_argument("--test_dir", type=str, - default='/home/savan/Documents/test_data', help="test data directory") + default='/mnt/data/datasets/processed_data/test', help="test data directory") parser.add_argument("--model_type", type=str, default='alexnet', help="model to train") parser.add_argument('--batch_size', type=int, default=1, metavar='N', From c628e5664de933726a02fff6fe25be3bd0bf630f Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 10 Nov 2020 19:13:27 -0600 Subject: [PATCH 24/70] handle case when loss is NaN --- examples/trials/pytorch-classifier/train_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 61e08e099a..46ae63e03b 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -172,6 +172,8 @@ def get_params(): print("Current Parameters:\n") print(params) acc, loss = train(params) + if loss is None: + loss = 0 metrics = [ {'name': 'accuracy', 'value': acc}, {'name': 'loss', 'value': loss}, From 72f491dde1ac152966596c8fd613dbc1ae16ddfb Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 11 Nov 2020 16:33:16 -0600 Subject: [PATCH 25/70] store model after every epoch --- examples/trials/pytorch-classifier/main.py | 5 +++++ examples/trials/pytorch-classifier/train_model.py | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 9a5b2eff21..b3ccacaaab 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -126,12 +126,17 @@ def train(args): model = build_model(args['model_type'], args['num_classes']).to(device) optimizer = optim.SGD(model.parameters(), lr=args['lr'], momentum=args['momentum']) + + if not os.path.exists('/mnt/output/hyper-params'): + os.makedirs('/mnt/output/hyper-params') for epoch in range(1, args['epochs'] + 1): train_one_epoch(args, model, device, train_loader, optimizer, epoch) test_acc, test_loss = test(args, model, device, test_loader) + torch.save(model, '/mnt/output/hyper-params/hyper-params-model-epochs-{}-acc-{}'.format(epoch, round(test_acc, 2))) # report intermediate result + print('test accuracy: {} test loss: {}'.format(test_acc, test_loss)) nni.report_intermediate_result(test_acc) logger.debug('test accuracy %g', test_acc) logger.debug('Pipe send intermediate result done.') diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 46ae63e03b..a6fad08c1e 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -101,16 +101,21 @@ def train(args): device = torch.device("cuda" if use_cuda else "cpu") + if args['model_type'] == 'alexnet': + w, h = 256, 256 + else: + w, h = 224, 224 + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch.utils.data.DataLoader( 
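# NOTE (a sketch, not part of this patch series): the Compose pipelines built
# just below still carry the "# add Normlize with mean and std" TODO. For the
# ImageNet-pretrained torchvision backbones this script loads, the usual
# choice would be the standard ImageNet statistics, e.g.:
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# added after transforms.ToTensor() in both the train and test pipelines.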
ImageFolder(root=args['train_dir'], transform=transforms.Compose([ - transforms.ToTensor(), + transforms.Resize((w, h)),transforms.ToTensor(), # add Normlize with mean and std ])), batch_size=args['batch_size'], shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader( ImageFolder(root=args['test_dir'], transform=transforms.Compose([ - transforms.ToTensor(), + transforms.Resize((w, h)),transforms.ToTensor(), # add Normlize with mean and std ])), batch_size=args['batch_size'], shuffle=True, **kwargs) From d899e4cee104e6d330d7d47b3554930a9a09fd43 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 11 Nov 2020 17:48:03 -0600 Subject: [PATCH 26/70] change nas log directory for visualization --- nni/nas/pytorch/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py index 03a093328a..08c1384bf3 100644 --- a/nni/nas/pytorch/trainer.py +++ b/nni/nas/pytorch/trainer.py @@ -92,7 +92,7 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs, self.batch_size = batch_size self.workers = workers self.log_frequency = log_frequency - self.log_dir = os.path.join("/mnt/output", str(time.time())) + self.log_dir = "/mnt/output/naslogs" os.makedirs(self.log_dir, exist_ok=True) self.status_writer = open(os.path.join(self.log_dir, "log"), "w") self.callbacks = callbacks if callbacks is not None else [] From f085f9eac093e7e5e60b6bfa0fffbe11759568a7 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 11 Nov 2020 18:31:46 -0600 Subject: [PATCH 27/70] get best parameter for hyper param tuning --- examples/trials/pytorch-classifier/main.py | 3 ++- nni/trial.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index b3ccacaaab..126cd5fbc6 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -143,7 +143,8 @@ def train(args): # report final result nni.report_final_result(test_acc) - print(test_acc, test_loss) + best_params = nni.get_best_params() + print("Best param and score: ", best_params) logger.debug('Final result is %g and loss is %g', test_acc, test_loss) logger.debug('Send final result done.') diff --git a/nni/trial.py b/nni/trial.py index cdb2b1e683..07fc5cd228 100644 --- a/nni/trial.py +++ b/nni/trial.py @@ -22,6 +22,8 @@ _trial_id = platform.get_trial_id() _sequence_id = platform.get_sequence_id() +#keep track of highest accuracy +_best_score = {'params':None, 'score':0} def get_next_parameter(): """ @@ -139,3 +141,14 @@ def report_final_result(metric): 'value': to_json(metric) }) platform.send_metric(metric) + update_score(metric) + +def update_score(score): + global _best_score + if score > _best_score['score']: + _best_score['score'] = score + _best_score['params'] = get_current_parameter() + +def get_best_params(): + global _best_score + return _best_score \ No newline at end of file From f4ac73548e4a84d1ef07f26254706410510d53c5 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 12 Nov 2020 14:06:07 -0600 Subject: [PATCH 28/70] update node package version --- nni/trial.py | 4 +++- setup_ts.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nni/trial.py b/nni/trial.py index 07fc5cd228..66d690b840 100644 --- a/nni/trial.py +++ b/nni/trial.py @@ -133,6 +133,7 @@ def report_final_result(metric): """ assert _params or trial_env_vars.NNI_PLATFORM is None, \ 'nni.get_next_parameter() needs to be called before 
report_final_result' + update_score(metric) metric = to_json({ 'parameter_id': _params['parameter_id'] if _params else None, 'trial_job_id': trial_env_vars.NNI_TRIAL_JOB_ID, @@ -141,7 +142,7 @@ def report_final_result(metric): 'value': to_json(metric) }) platform.send_metric(metric) - update_score(metric) + def update_score(score): global _best_score @@ -151,4 +152,5 @@ def update_score(score): def get_best_params(): global _best_score + print("Best Score", _best_score) return _best_score \ No newline at end of file diff --git a/setup_ts.py b/setup_ts.py index 5872ce7bbd..5eb3cba7f0 100644 --- a/setup_ts.py +++ b/setup_ts.py @@ -22,7 +22,7 @@ from zipfile import ZipFile -node_version = 'v10.22.1' +node_version = 'v14.15.0' yarn_version = 'v1.22.10' @@ -59,7 +59,7 @@ def clean(clean_all=False): if sys.platform == 'linux' or sys.platform == 'darwin': node_executable = 'node' node_spec = f'node-{node_version}-{sys.platform}-x64' - node_download_url = f'https://nodejs.org/dist/latest-v10.x/{node_spec}.tar.xz' + node_download_url = f'https://nodejs.org/dist/{node_version}/{node_spec}.tar.xz' node_extractor = lambda data: tarfile.open(fileobj=BytesIO(data), mode='r:xz') node_executable_in_tarball = 'bin/node' From 918f773ac6eee284a282a2518b21fcb50dd32c69 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 12 Nov 2020 16:42:20 -0600 Subject: [PATCH 29/70] fixed issue with metrics dumping of tuner --- examples/trials/pytorch-classifier/main.py | 6 +++ .../trials/pytorch-classifier/train_model.py | 12 +++++- nni/trial.py | 41 ++++++++++++++----- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 126cd5fbc6..a836e97f2a 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -6,6 +6,7 @@ import argparse import logging import nni +import json import torch import torch.nn as nn import torch.nn.functional as F @@ -145,6 +146,11 @@ def train(args): nni.report_final_result(test_acc) best_params = nni.get_best_params() print("Best param and score: ", best_params) + metrics = [ + {'name': 'accuracy', 'value': best_params['score']}, + ] + with open('/tmp/sys-metrics.json', 'w') as f: + json.dump(metrics, f) logger.debug('Final result is %g and loss is %g', test_acc, test_loss) logger.debug('Send final result done.') diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index a6fad08c1e..56b85ba104 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -13,7 +13,8 @@ from torchvision.datasets import ImageFolder import torchvision.models as models from torchvision import datasets, transforms - +from torch.utils.tensorboard import SummaryWriter +writer = SummaryWriter("/mnt/output/fixed_param__tb") logger = logging.getLogger('pytorch_classifier') @@ -63,6 +64,11 @@ def train_one_epoch(args, model, device, train_loader, optimizer, epoch): optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) + pred = output.argmax(dim=1, keepdim=True) + correct += pred.eq(target.view_as(pred)).sum().item() + accuracy = 100. 
* correct / len(train_loader.dataset)
+        writer.add_scalar("Loss/train", loss, batch_idx + (epoch * 10))
+        writer.add_scalar("Accuracy/train", accuracy, batch_idx + (epoch * 10))
         loss.backward()
         optimizer.step()
         if batch_idx % args['log_interval'] == 0:
@@ -105,7 +111,7 @@ def train(args):
         w, h = 256, 256
     else:
         w, h = 224, 224
-    
+
     kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
     train_loader = torch.utils.data.DataLoader(
         ImageFolder(root=args['train_dir'], transform=transforms.Compose([
@@ -131,6 +137,8 @@ def train(args):
     for epoch in range(1, args['epochs'] + 1):
         train_one_epoch(args, model, device, train_loader, optimizer, epoch)
         test_acc, test_loss = test(args, model, device, test_loader)
+        writer.add_scalar("Loss/test", test_loss, epoch )
+        writer.add_scalar("Accuracy/test", test_acc, epoch )
         torch.save(model, '/mnt/output/fixed-params/fixed-params-model-epochs-{}-acc-{}'.format(epoch, round(test_acc, 2)))
         # report intermediate result
         print('test accuracy: {} test loss: {}'.format(test_acc, test_loss))
diff --git a/nni/trial.py b/nni/trial.py
index 66d690b840..0d24274899 100644
--- a/nni/trial.py
+++ b/nni/trial.py
@@ -4,7 +4,7 @@
 from .utils import to_json
 from .runtime.env_vars import trial_env_vars
 from .runtime import platform
-
+import os, json
 __all__ = [
     'get_next_parameter',
@@ -13,7 +13,8 @@
     'report_final_result',
     'get_experiment_id',
     'get_trial_id',
-    'get_sequence_id'
+    'get_sequence_id',
+    'get_best_params'
 ]
@@ -23,7 +24,8 @@
 _sequence_id = platform.get_sequence_id()

 #keep track of highest accuracy
-_best_score = {'params':None, 'score':0}
+#_best_params = os.getenv('_BEST_PARAMS', None)
+#_best_score = os.getenv('_BEST_SCORE', 0)

 def get_next_parameter():
     """
@@ -144,13 +146,30 @@ def report_final_result(metric):
     platform.send_metric(metric)

-def update_score(score):
-    global _best_score
-    if score > _best_score['score']:
-        _best_score['score'] = score
-        _best_score['params'] = get_current_parameter()
+def update_score(metric):
+
+    #keep track of highest accuracy
+    _sysdir = trial_env_vars.NNI_SYS_DIR
+    _trials = os.path.dirname(_sysdir)
+    if os.path.exists(os.path.join(_trials, 'best_score.json')):
+        with open(os.path.join(_trials, 'best_score.json'), "r") as jsonFile:
+            data = json.load(jsonFile)
+        if float(data['score']) < metric:
+            data['score'] = str(metric)
+            data['params'] = str(get_current_parameter())
+            with open(os.path.join(_trials, 'best_score.json'), "w") as jsonFile2:
+                print("updating json file", data)
+                json.dump(data, jsonFile2)
+    else:
+        params = get_current_parameter()
+        with open(os.path.join(_trials, 'best_score.json'),'w') as f:
+            json.dump({'score':metric, 'params':str(params) } , f)

 def get_best_params():
-    global _best_score
-    print("Best Score", _best_score)
-    return _best_score
\ No newline at end of file
+    _sysdir = trial_env_vars.NNI_SYS_DIR
+    _trials = os.path.dirname(_sysdir)
+    if os.path.exists(os.path.join(_trials, 'best_score.json')):
+        with open(os.path.join(_trials, 'best_score.json'), "r") as jsonFile:
+            data = json.load(jsonFile)
+        return data
+    return None

From e25b239bd58bdf568fc89265d1397f1b00c9bf96 Mon Sep 17 00:00:00 2001
From: savan
Date: Thu, 12 Nov 2020 16:45:11 -0600
Subject: [PATCH 30/70] fixed a typo

---
 examples/trials/pytorch-classifier/train_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py
index 56b85ba104..93435b6163 100644
--- a/examples/trials/pytorch-classifier/train_model.py
+++ 
b/examples/trials/pytorch-classifier/train_model.py @@ -65,7 +65,7 @@ def train_one_epoch(args, model, device, train_loader, optimizer, epoch): output = model(data) loss = F.nll_loss(output, target) pred = output.argmax(dim=1, keepdim=True) - correct += pred.eq(target.view_as(pred)).sum().item() + correct = pred.eq(target.view_as(pred)).sum().item() accuracy = 100. * correct / len(train_loader.dataset) writer.add_scalar("Loss/train", loss, batch_idx + (epoch * 10)) writer.add_scalar("Accuracy/train", accuracy, batch_idx + (epoch * 10)) From 47f49824a4d58c9e3dc987821b96e1f56f146be5 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 12 Nov 2020 16:48:10 -0600 Subject: [PATCH 31/70] fixed a typo in a path --- examples/trials/pytorch-classifier/train_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 93435b6163..d42ad44593 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -14,7 +14,7 @@ import torchvision.models as models from torchvision import datasets, transforms from torch.utils.tensorboard import SummaryWriter -writer = SummaryWriter("/mnt/output/fixed_param__tb") +writer = SummaryWriter("/mnt/output/fixed_param_tb") logger = logging.getLogger('pytorch_classifier') From 5b048d6b1bd1eac12fdda68b45ae4735ff0b9001 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 12 Nov 2020 17:24:35 -0600 Subject: [PATCH 32/70] update template --- .../trials/pytorch-classifier/template.yml | 38 +++++++++++++++---- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index e9afa5de53..e82b326451 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -181,7 +181,7 @@ templates: - | apt-get update && \ apt-get install -y gcc g++ git && \ - python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ + python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 setup.py install && \ @@ -195,6 +195,20 @@ templates: mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' + sidecars: + - name: nni-web-ui + image: 'tensorflow/tensorflow:2.3.0' + command: + - sh + - '-c' + tty: true + args: + - | + pip install nni && \ + nnictl webui nas --logdir /mnt/output/naslogs --port 8888 + ports: + - containerPort: 8888 + name: nni - name: hyperop inputs: @@ -217,12 +231,11 @@ templates: - | apt-get update && \ apt-get install -y gcc g++ git && \ - python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ + python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 setup.py install && \ - python3 examples/trials/pytorch-classifier/main.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ - --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data + nnictl create --config examples/trials/pytorch-classifier/config.yml workingDir: /mnt volumeMounts: - name: data2 @@ 
-253,7 +266,7 @@ templates: - | apt-get update && \ apt-get install -y gcc g++ git && \ - python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ + python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 setup.py install && \ @@ -267,7 +280,18 @@ templates: mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' - + sidecars: + - name: tensorboard + image: 'tensorflow/tensorflow:2.3.0' + command: + - sh + - '-c' + tty: true + args: + - tensorboard --logdir /mnt/output/fixed_param_tb + ports: + - containerPort: 6006 + name: tensorboard - name: compare-models inputs: @@ -333,7 +357,7 @@ templates: - | apt-get update && \ apt-get install -y gcc g++ git && \ - python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ + python3 -m pip install setuptools && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 prepare_data.py --data_dir=/mnt/output/processed_data From 603638ccbb821dfe4cb394f4e7b3ac26ad18afd1 Mon Sep 17 00:00:00 2001 From: savan Date: Sun, 15 Nov 2020 18:55:25 -0600 Subject: [PATCH 33/70] update workflow --- examples/trials/pytorch-classifier/template.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index e82b326451..5b3825e6c2 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -180,11 +180,12 @@ templates: args: - | apt-get update && \ - apt-get install -y gcc g++ git && \ + apt-get install -y gcc g++ git curl && \ python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 setup.py install && \ + python3 examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}} --num-classes {{workflow.parameters.num-classes}} \ --dataset custom_classification --train-data-dir /mnt/data/datasets/processed_data --valid-data-dir /mnt/data/datasets/processed_data workingDir: /mnt @@ -230,11 +231,15 @@ templates: args: - | apt-get update && \ - apt-get install -y gcc g++ git && \ + apt-get install -y gcc g++ git curl wget && \ python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ + python3 -m pip install --upgrade requests git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 setup.py install && \ + curl -sL https://deb.nodesource.com/setup_14.x | bash - && \ + apt-get install -y nodejs && \ + wget https://github.com/onepanelio/nni/releases/download/2.0.0a0/nni-2.1-py3-none-manylinux1_x86_64.whl && \ + python3 -m pip install nni-2.1-py3-none-manylinux1_x86_64.whl && \ nnictl create --config examples/trials/pytorch-classifier/config.yml workingDir: /mnt volumeMounts: From 199aacf347263a71980f76eaffbe5c9d6b7149dd Mon Sep 17 00:00:00 2001 From: savan Date: Sun, 15 Nov 2020 23:37:50 -0600 Subject: [PATCH 34/70] handle case when loss is NaN --- examples/trials/pytorch-classifier/train_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/train_model.py 
b/examples/trials/pytorch-classifier/train_model.py index d42ad44593..438c7e944c 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -14,6 +14,7 @@ import torchvision.models as models from torchvision import datasets, transforms from torch.utils.tensorboard import SummaryWriter +import math writer = SummaryWriter("/mnt/output/fixed_param_tb") logger = logging.getLogger('pytorch_classifier') @@ -185,7 +186,7 @@ def get_params(): print("Current Parameters:\n") print(params) acc, loss = train(params) - if loss is None: + if loss is None or math.isnan(loss): loss = 0 metrics = [ {'name': 'accuracy', 'value': acc}, From b19ef2aeed06d698537e71da2cd8d404c02b2cc3 Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 16 Nov 2020 13:33:58 -0600 Subject: [PATCH 35/70] reduce default epochs for testing --- examples/trials/pytorch-classifier/main.py | 2 +- examples/trials/pytorch-classifier/train_model.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index a836e97f2a..1012e09d9e 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -173,7 +173,7 @@ def get_params(): help='learning rate (default: 0.01)') parser.add_argument('--momentum', type=float, default=0.5, metavar='M', help='SGD momentum (default: 0.5)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', + parser.add_argument('--epochs', type=int, default=1, metavar='N', help='number of epochs to train (default: 10)') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 438c7e944c..182de86d68 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -1,7 +1,6 @@ """ A general purpose classification script using PyTorch. 
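 Example standalone invocation (illustrative paths only, echoing the
 directories used elsewhere in this patch series rather than anything
 this script itself guarantees):
     python3 train_model.py --num_classes 2 \
         --train_dir /mnt/data/datasets/processed_data/train \
         --test_dir /mnt/data/datasets/processed_data/test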
""" - import argparse import logging import json From 3da6fb78fd15bb8e31132b8a2b4e13ad9289c290 Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 16 Nov 2020 15:59:24 -0600 Subject: [PATCH 36/70] shut down process when experiment is finished --- ts/nni_manager/core/nnimanager.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index 6ec4d0e21d..dfeb3c1bc9 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -588,6 +588,7 @@ class NNIManager implements Manager { await this.storeExperimentProfile(); // write this log for travis CI this.log.info('Experiment done.'); + return process.exit(0); } } } else { From 5acfbe4a8d29ef979a29d322ff8cfbbe36a86c0d Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 14:56:46 -0600 Subject: [PATCH 37/70] stop process when experiment is done --- .../trials/pytorch-classifier/template.yml | 62 ++++++++++--------- nni/tools/nnictl/launcher.py | 2 + 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index 5b3825e6c2..fa1d11c0f5 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -46,6 +46,26 @@ arguments: value: macro - name: 'Micro' value: micro + - name: hyperparamtuning-config + value: |- + epochs=1 + displayName: Settings for hyperparameter tuning + visibility: public + type: textarea.textarea + - name: nas-config + value: |- + epochs=1 + displayName: Settings for Neural Architecture Search + visibility: public + type: textarea.textarea + - name: fixedparam-config + value: |- + epochs=1 + momentum=0.5 + lr=0.01 + displayName: Settings for model training + visibility: public + type: textarea.textarea - displayName: Node pool hint: Name of node pool or group to run this workflow task type: select.select @@ -175,17 +195,12 @@ templates: archive: none: {} container: - image: pytorch/pytorch:latest + image: onepanel/nas:0.0.1 command: [sh,-c] args: - | - apt-get update && \ - apt-get install -y gcc g++ git curl && \ - python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 setup.py install && \ - python3 examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}} --num-classes {{workflow.parameters.num-classes}} \ --dataset custom_classification --train-data-dir /mnt/data/datasets/processed_data --valid-data-dir /mnt/data/datasets/processed_data workingDir: /mnt @@ -226,21 +241,13 @@ templates: archive: none: {} container: - image: pytorch/pytorch:latest + image: onepanel/nas:0.0.1 command: [sh,-c] args: - | - apt-get update && \ - apt-get install -y gcc g++ git curl wget && \ - python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ - python3 -m pip install --upgrade requests git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - curl -sL https://deb.nodesource.com/setup_14.x | bash - && \ - apt-get install -y nodejs && \ - wget https://github.com/onepanelio/nni/releases/download/2.0.0a0/nni-2.1-py3-none-manylinux1_x86_64.whl && \ - python3 -m pip install nni-2.1-py3-none-manylinux1_x86_64.whl && \ - nnictl create --config examples/trials/pytorch-classifier/config.yml + nnictl create --config 
examples/trials/pytorch-classifier/config.yml --port 8089 --foreground workingDir: /mnt volumeMounts: - name: data2 @@ -249,6 +256,13 @@ templates: mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' + sidecars: + - name: nni-web-ui + image: 'onepanel/nni-proxy:0.0.1' + tty: true + ports: + - containerPort: 8089 + name: nni - name: model-param inputs: @@ -265,16 +279,12 @@ templates: archive: none: {} container: - image: pytorch/pytorch:latest + image: onepanel/nas:0.0.1 command: [sh,-c] args: - | - apt-get update && \ - apt-get install -y gcc g++ git && \ - python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 setup.py install && \ python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}} workingDir: /mnt @@ -319,19 +329,13 @@ templates: archive: none: {} container: - image: pytorch/pytorch:latest + image: onepanel/nas:0.0.1 command: [sh,-c] args: - | - apt-get update && \ - apt-get install -y gcc g++ git && \ - python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 setup.py install && \ - python3 prepare_data.py && \ - python3 examples/trials/pytorch-classifier/train_main.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ - --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}} + python3 compare.py workingDir: /mnt volumeMounts: - name: data4 diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index fa0aa3baab..8a05ae9803 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -528,6 +528,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen while True: log_content = rest_process.stdout.readline().strip().decode('utf-8') print(log_content) + if 'Experiment done.' 
in log_content: + sys.exit(0) except KeyboardInterrupt: kill_command(rest_process.pid) print_normal('Stopping experiment...') From 9265381ec67ddf7c7c68dad6c61d5d9b38a3f50d Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 16:15:27 -0600 Subject: [PATCH 38/70] accept settings in a single param --- .../trials/pytorch-classifier/train_model.py | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 182de86d68..8d48553088 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -118,23 +118,23 @@ def train(args): transforms.Resize((w, h)),transforms.ToTensor(), # add Normlize with mean and std ])), - batch_size=args['batch_size'], shuffle=True, **kwargs) + batch_size=int(args['batch_size']), shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader( ImageFolder(root=args['test_dir'], transform=transforms.Compose([ transforms.Resize((w, h)),transforms.ToTensor(), # add Normlize with mean and std ])), - batch_size=args['batch_size'], shuffle=True, **kwargs) + batch_size=int(args['batch_size']), shuffle=True, **kwargs) - model = build_model(args['model_type'], args['num_classes']).to(device) - optimizer = optim.SGD(model.parameters(), lr=args['lr'], - momentum=args['momentum']) + model = build_model(args['model_type'], int(args['num_classes'])).to(device) + optimizer = optim.SGD(model.parameters(), lr=float(args['lr']), + momentum=float(args['momentum'])) if not os.path.exists('/mnt/output/fixed-params'): os.makedirs('/mnt/output/fixed-params') - for epoch in range(1, args['epochs'] + 1): + for epoch in range(1, int(args['epochs']) + 1): train_one_epoch(args, model, device, train_loader, optimizer, epoch) test_acc, test_loss = test(args, model, device, test_loader) writer.add_scalar("Loss/test", test_loss, epoch ) @@ -155,26 +155,26 @@ def get_params(): default='/home/savan/Documents/train_data', help="train data directory") parser.add_argument("--test_dir", type=str, default='/home/savan/Documents/test_data', help="test data directory") - parser.add_argument("--model_type", type=str, - default='alexnet', help="model to train") - parser.add_argument('--batch_size', type=int, default=1, metavar='N', - help='input batch size for training (default: 64)') +# parser.add_argument("--model_type", type=str, +# default='alexnet', help="model to train") +# parser.add_argument('--batch_size', type=int, default=1, metavar='N', +# help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) - parser.add_argument("--num_classes", type=int, default=2, metavar='N', - help='number of classes in the dataset') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') +# parser.add_argument("--num_classes", type=int, default=2, metavar='N', +# help='number of classes in the dataset') +# parser.add_argument('--lr', type=float, default=0.01, metavar='LR', +# help='learning rate (default: 0.01)') +# parser.add_argument('--momentum', type=float, default=0.5, metavar='M', +# help='SGD momentum (default: 0.5)') +# parser.add_argument('--epochs', type=int, default=10, 
metavar='N', +# help='number of epochs to train (default: 10)') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--log_interval', type=int, default=1000, metavar='N', help='how many batches to wait before logging training status') - + parser.add_argument('--config', help="hyperparameters or other configs") args, _ = parser.parse_known_args() return args @@ -182,9 +182,14 @@ def get_params(): if __name__ == '__main__': try: params = vars(get_params()) + extras = params['config'].split("\\n") + extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] + config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + config.update(params) + config.pop('config') print("Current Parameters:\n") - print(params) - acc, loss = train(params) + print(config) + acc, loss = train(config) if loss is None or math.isnan(loss): loss = 0 metrics = [ From a89139eb70dd00e1eadbcfab947d1171d8a8ebaf Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 17:30:01 -0600 Subject: [PATCH 39/70] get config args from user --- .../trials/pytorch-classifier/create_yaml.py | 36 +++++++++++++++++++ examples/trials/pytorch-classifier/main.py | 2 +- .../trials/pytorch-classifier/template.yml | 22 ++++++++---- .../trials/pytorch-classifier/train_model.py | 6 ++-- 4 files changed, 56 insertions(+), 10 deletions(-) create mode 100644 examples/trials/pytorch-classifier/create_yaml.py diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py new file mode 100644 index 0000000000..18a68830f2 --- /dev/null +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -0,0 +1,36 @@ +import yaml +import argparse +import json + +def main(args): + stream = open(args['config_path'], 'r') + data = yaml.load(stream) + data['trial']['command'] = "python3 main.py --num_classes {} --epochs {}".format(args['num_classes'], args['epochs']) + + with open(args['output_path'], 'w') as yaml_file: + yaml_file.write( yaml.dump(data, default_flow_style=False)) + mm_list = [int(item) for item in args['momentum_range'].split(',')] + lr_list = [float(item) for item in args['lr_list'].split(',')] + bs_list = [int(item) for item in args['batch_size_list'].split(',')] + with open(args['output_search_space_path'], 'w') as json_file: + json_data = {'batch_size': {'_type':'choice', '_value':bs_list}, 'lr':{"_type":"choice","_value":lr_list} , 'momentum':{"_type":"uniform","_value":mm_list}} + json.dump(json_data, json_file) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='PyTorch Classification Example') + parser.add_argument("--config_path", type=str, + default='/mnt/nni/examples/trials/pytorch-classifier/config.yml', help="train data directory") + parser.add_argument("--output_path", type=str, + default='/mnt/nni/examples/trials/pytorch-classifier/config.yml', help="model to train") + parser.add_argument("--output_search_space_path", type=str, + default='/mnt/nni/examples/trials/pytorch-classifier/search_space.json', help="model to train") + parser.add_argument("--num_classes", type=int, default=2, + help="number of classes in the dataset") + parser.add_argument("--config", default="batch_size_list=16,32,64,128\nlr_list=0.001,0.001\nmomentum_range=0,1\nepochs=10") + args = parser.parse_args() + extras = args.config.split("\n") + extras_processed = [i.split("#")[0].replace(" ","") 
for i in extras if i] + config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + config.update(vars(args)) + print(config) + main(config) \ No newline at end of file diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 1012e09d9e..3f8d54fa07 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -165,7 +165,7 @@ def get_params(): parser.add_argument("--model_type", type=str, default='alexnet', help="model to train") parser.add_argument('--batch_size', type=int, default=1, metavar='N', - help='input batch size for training (default: 64)') + help='input batch size for training (default: 1)') parser.add_argument("--batch_num", type=int, default=None) parser.add_argument("--num_classes", type=int, default=2, metavar='N', help='number of classes in the dataset') diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index fa1d11c0f5..fd07fbd21f 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -63,9 +63,17 @@ arguments: epochs=1 momentum=0.5 lr=0.01 + model_type=alexnet + batch_size=1 displayName: Settings for model training visibility: public type: textarea.textarea + - name: searchspace-config + value: |- + batch_size_list=16,32,64,128 + lr_list=0.0001,0.001,0.01,0.1 + momentum_range=0,1 + epochs=10 - displayName: Node pool hint: Name of node pool or group to run this workflow task type: select.select @@ -195,7 +203,7 @@ templates: archive: none: {} container: - image: onepanel/nas:0.0.1 + image: onepanel/nni:0.0.4 command: [sh,-c] args: - | @@ -241,12 +249,14 @@ templates: archive: none: {} container: - image: onepanel/nas:0.0.1 + image: onepanel/nni:0.0.4 command: [sh,-c] args: - | git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ + python3 examples/trials/pytorch-classifier/create_yaml.py --config {{workflow.parameters.searchspace-config}} \ + --num_classes {{workflow.parameters.num-classes}} nnictl create --config examples/trials/pytorch-classifier/config.yml --port 8089 --foreground workingDir: /mnt volumeMounts: @@ -279,14 +289,14 @@ templates: archive: none: {} container: - image: onepanel/nas:0.0.1 + image: onepanel/nni:0.0.4 command: [sh,-c] args: - | git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ - --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}} + python3 examples/trials/pytorch-classifier/train_model.py --num_classes {{workflow.parameters.num-classes}} \ + --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --config {{workflow.parameters.fixedparam-config}} workingDir: /mnt volumeMounts: - name: data3 @@ -329,7 +339,7 @@ templates: archive: none: {} container: - image: onepanel/nas:0.0.1 + image: onepanel/nni:0.0.4 command: [sh,-c] args: - | diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 8d48553088..7721b81290 100644 --- 
a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -160,8 +160,8 @@ def get_params(): # parser.add_argument('--batch_size', type=int, default=1, metavar='N', # help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) -# parser.add_argument("--num_classes", type=int, default=2, metavar='N', -# help='number of classes in the dataset') + parser.add_argument("--num_classes", type=int, default=2, metavar='N', + help='number of classes in the dataset') # parser.add_argument('--lr', type=float, default=0.01, metavar='LR', # help='learning rate (default: 0.01)') # parser.add_argument('--momentum', type=float, default=0.5, metavar='M', @@ -182,7 +182,7 @@ def get_params(): if __name__ == '__main__': try: params = vars(get_params()) - extras = params['config'].split("\\n") + extras = params['config'].split("\n") extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} config.update(params) From c6a770601b2748808301c1c9ad6ccb18395fdb1d Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 19:33:23 -0600 Subject: [PATCH 40/70] add log statements --- examples/trials/pytorch-classifier/train_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 7721b81290..e6179d201c 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -182,9 +182,13 @@ def get_params(): if __name__ == '__main__': try: params = vars(get_params()) + print("Older params:", params) extras = params['config'].split("\n") + print("extras", extras) extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] + print("extra processed", extras_processed) config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + print("config", config) config.update(params) config.pop('config') print("Current Parameters:\n") From ed43e9c3ad98e4b1e8652bc425207ad150a7fc45 Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 21:53:40 -0600 Subject: [PATCH 41/70] revert changes related to single param config --- .../trials/pytorch-classifier/train_model.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index e6179d201c..fc7423ccd9 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -182,18 +182,18 @@ def get_params(): if __name__ == '__main__': try: params = vars(get_params()) - print("Older params:", params) - extras = params['config'].split("\n") - print("extras", extras) - extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] - print("extra processed", extras_processed) - config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} - print("config", config) - config.update(params) - config.pop('config') + # print("Older params:", params) + # extras = params['config'].split("\n") + # print("extras", extras) + # extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] + # print("extra processed", extras_processed) + # config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + # print("config", config) + # config.update(params) + # config.pop('config') print("Current Parameters:\n") - 
print(config) - acc, loss = train(config) + print(params) + acc, loss = train(params) if loss is None or math.isnan(loss): loss = 0 metrics = [ From 7e49f77e2dcd71739656bfb2eab0d642f170b50f Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 23:18:24 -0600 Subject: [PATCH 42/70] revert parameter changes --- .../trials/pytorch-classifier/template.yml | 1 + .../trials/pytorch-classifier/train_model.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index fd07fbd21f..fadc1299d4 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -69,6 +69,7 @@ arguments: visibility: public type: textarea.textarea - name: searchspace-config + type: textarea.textarea value: |- batch_size_list=16,32,64,128 lr_list=0.0001,0.001,0.01,0.1 diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index fc7423ccd9..af86510659 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -155,19 +155,19 @@ def get_params(): default='/home/savan/Documents/train_data', help="train data directory") parser.add_argument("--test_dir", type=str, default='/home/savan/Documents/test_data', help="test data directory") -# parser.add_argument("--model_type", type=str, -# default='alexnet', help="model to train") -# parser.add_argument('--batch_size', type=int, default=1, metavar='N', -# help='input batch size for training (default: 64)') + parser.add_argument("--model_type", type=str, + default='alexnet', help="model to train") + parser.add_argument('--batch_size', type=int, default=1, metavar='N', + help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) parser.add_argument("--num_classes", type=int, default=2, metavar='N', help='number of classes in the dataset') -# parser.add_argument('--lr', type=float, default=0.01, metavar='LR', -# help='learning rate (default: 0.01)') -# parser.add_argument('--momentum', type=float, default=0.5, metavar='M', -# help='SGD momentum (default: 0.5)') -# parser.add_argument('--epochs', type=int, default=10, metavar='N', -# help='number of epochs to train (default: 10)') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--no_cuda', action='store_true', default=False, From 2f6f5c7310906f030146e7c79e750b2587fe9b2d Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 23:45:44 -0600 Subject: [PATCH 43/70] convert metrics to float explicitly --- compare.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compare.py b/compare.py index 253151583c..8e036f4872 100644 --- a/compare.py +++ b/compare.py @@ -10,9 +10,9 @@ with open('/tmp/singlemodel-metrics.json') as f: fm = json.load(f) -accuracies['nas_acc'] = [i['value'] for i in nas if i['name'] == 'accuracy'][0] -accuracies['hyper_acc'] = [i['value'] for i in hyper if i['name'] == 'accuracy'][0] -accuracies['fm_acc'] = [i['value'] for i in fm if 
i['name'] == 'accuracy'][0] +accuracies['nas_acc'] = [float(i['value']) for i in nas if i['name'] == 'accuracy'][0] +accuracies['hyper_acc'] = [float(i['value']) for i in hyper if i['name'] == 'accuracy'][0] +accuracies['fm_acc'] = [float(i['value']) for i in fm if i['name'] == 'accuracy'][0] max_acc_name = max(accuracies, key=accuracies.get) print("Maximum accuracy was {} for {}".format(max(accuracies.values()), max_acc_name)) \ No newline at end of file From 6dad848a47aba51b2eca40bc25cc7eacf839087c Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 13:13:19 -0600 Subject: [PATCH 44/70] read parameters from config --- .../trials/pytorch-classifier/train_model.py | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index af86510659..1aa03d19b6 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -155,19 +155,19 @@ def get_params(): default='/home/savan/Documents/train_data', help="train data directory") parser.add_argument("--test_dir", type=str, default='/home/savan/Documents/test_data', help="test data directory") - parser.add_argument("--model_type", type=str, - default='alexnet', help="model to train") - parser.add_argument('--batch_size', type=int, default=1, metavar='N', - help='input batch size for training (default: 64)') + # parser.add_argument("--model_type", type=str, + # default='alexnet', help="model to train") + # parser.add_argument('--batch_size', type=int, default=1, metavar='N', + # help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) parser.add_argument("--num_classes", type=int, default=2, metavar='N', help='number of classes in the dataset') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') + # parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + # help='learning rate (default: 0.01)') + # parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + # help='SGD momentum (default: 0.5)') + # parser.add_argument('--epochs', type=int, default=10, metavar='N', + # help='number of epochs to train (default: 10)') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--no_cuda', action='store_true', default=False, @@ -182,18 +182,18 @@ def get_params(): if __name__ == '__main__': try: params = vars(get_params()) - # print("Older params:", params) - # extras = params['config'].split("\n") - # print("extras", extras) - # extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] - # print("extra processed", extras_processed) - # config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} - # print("config", config) - # config.update(params) - # config.pop('config') + print("Older params:", params) + extras = params['config'].split("\n") + print("extras", extras) + extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] + print("extra processed", extras_processed) + config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + print("config", config) + config.update(params) + 
config.pop('config') print("Current Parameters:\n") - print(params) - acc, loss = train(params) + print(config) + acc, loss = train(config) if loss is None or math.isnan(loss): loss = 0 metrics = [ From 4454f50c36ab4a4f4fa620a8d57ca6a789e2661e Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 15:14:21 -0600 Subject: [PATCH 45/70] accept config for enas --- compare.py | 3 ++ examples/nas/enas/search.py | 35 ++++++++++++------- .../trials/pytorch-classifier/create_yaml.py | 4 ++- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/compare.py b/compare.py index 8e036f4872..b4ce35929f 100644 --- a/compare.py +++ b/compare.py @@ -3,12 +3,15 @@ accuracies = {} with open('/tmp/nas-metrics.json') as f: nas = json.load(f) + print("Metrics for Neural Architecture Search: ", nas) with open('/tmp/hyperop-metrics.json') as f: hyper = json.load(f) + print("Metrics for hyper parameter optimization: ", hyper) with open('/tmp/singlemodel-metrics.json') as f: fm = json.load(f) + print("Metrics for model trained with fixed parameters: ", fm) accuracies['nas_acc'] = [float(i['value']) for i in nas if i['name'] == 'accuracy'][0] accuracies['hyper_acc'] = [float(i['value']) for i in hyper if i['name'] == 'accuracy'][0] diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index c1ac75cdff..fece3fd380 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -20,25 +20,34 @@ if __name__ == "__main__": parser = ArgumentParser("enas") - parser.add_argument("--batch-size", default=128, type=int) + # parser.add_argument("--batch-size", default=128, type=int) parser.add_argument("--log-frequency", default=10, type=int) parser.add_argument("--num-classes", default=2, type=int) parser.add_argument("--dataset", default="cifar10", choices=["cifar10", "custom_classification"]) - parser.add_argument("--search-for", choices=["macro", "micro"], default="macro") - parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)") + # parser.add_argument("--search-for", choices=["macro", "micro"], default="macro") + # parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)") parser.add_argument("--visualization", default=True, action="store_true") parser.add_argument("--train-data-dir", default="/home/savan/Documents/train_data", help="train dataset for classification") parser.add_argument("--valid-data-dir", default="/home/savan/Documents/test_data", help="validation dataset for classification") + parser.add_argument("--config", default="batch-size=128 \n search-for=macro \n epochs=30") args = parser.parse_args() - dataset_train, dataset_valid = datasets.get_dataset(args.dataset, train_dir=args.train_data_dir, valid_data=args.valid_data_dir) - if args.search_for == "macro": - model = GeneralNetwork(num_classes=args.num_classes) - num_epochs = args.epochs or 310 + extras = args['config'].split("\n") + print("nas extras", extras) + extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] + print("nas extra processed", extras_processed) + config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + print("nas config", config) + config.update(args) + + dataset_train, dataset_valid = datasets.get_dataset(args['dataset'], train_dir=args['train_data_dir'], valid_data=args['valid_data_dir']) + if args['search_for'] == "macro": + model = GeneralNetwork(num_classes=args['num_classes']) + num_epochs = args['epochs'] or 310 mutator = None - elif 
args.search_for == "micro": - model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, num_classes=args.num_classes, use_aux_heads=True) - num_epochs = args.epochs or 150 + elif args['search_for'] == "micro": + model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, num_classes=args['num_classes'], use_aux_heads=True) + num_epochs = args['epochs'] or 150 mutator = enas.EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True) else: raise AssertionError @@ -53,13 +62,13 @@ reward_function=reward_accuracy, optimizer=optimizer, callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("/mnt/output"), ModelCheckpoint("/mnt/output")], - batch_size=args.batch_size, + batch_size=args['batch_size'], num_epochs=num_epochs, dataset_train=dataset_train, dataset_valid=dataset_valid, - log_frequency=args.log_frequency, + log_frequency=args['log_frequency'], mutator=mutator) - if args.visualization: + if args['visualization']: trainer.enable_visualization() trainer.train() metrics = [{'name':'accuracy', 'value':trainer.val_model_summary['acc1'].avg}, {'name':'loss', 'value':trainer.val_model_summary['loss'].avg}] diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py index 18a68830f2..ce641998c9 100644 --- a/examples/trials/pytorch-classifier/create_yaml.py +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -28,9 +28,11 @@ def main(args): help="number of classes in the dataset") parser.add_argument("--config", default="batch_size_list=16,32,64,128\nlr_list=0.001,0.001\nmomentum_range=0,1\nepochs=10") args = parser.parse_args() + print("Arguments: ", args) extras = args.config.split("\n") extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + print("[Create YAML] Config: ", config) config.update(vars(args)) - print(config) + print("Final Arguments: ", config) main(config) \ No newline at end of file From b4673e7820bcd2008f3b328be78fd8cc1bcc5f1a Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 15:17:06 -0600 Subject: [PATCH 46/70] convert score to float --- examples/trials/pytorch-classifier/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 3f8d54fa07..b38516e588 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -147,7 +147,7 @@ def train(args): best_params = nni.get_best_params() print("Best param and score: ", best_params) metrics = [ - {'name': 'accuracy', 'value': best_params['score']}, + {'name': 'accuracy', 'value': float(best_params['score'])}, ] with open('/tmp/sys-metrics.json', 'w') as f: json.dump(metrics, f) From 70517e652d9658c737dc10ea1128bef57a47a300 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 15:23:16 -0600 Subject: [PATCH 47/70] add error handling in comparison script --- compare.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/compare.py b/compare.py index b4ce35929f..2783e810ef 100644 --- a/compare.py +++ b/compare.py @@ -1,21 +1,30 @@ import json accuracies = {} -with open('/tmp/nas-metrics.json') as f: - nas = json.load(f) - print("Metrics for Neural Architecture Search: ", nas) -with open('/tmp/hyperop-metrics.json') as f: - hyper = json.load(f) - print("Metrics for hyper parameter optimization: ", hyper) 
+try: + with open('/tmp/nas-metrics.json') as f: + nas = json.load(f) + print("Metrics for Neural Architecture Search: ", nas) + accuracies['nas_acc'] = [float(i['value']) for i in nas if i['name'] == 'accuracy'][0] +except RuntimeError as e: + print("Error occurred while reading metrics for NAS: ", e) -with open('/tmp/singlemodel-metrics.json') as f: - fm = json.load(f) - print("Metrics for model trained with fixed parameters: ", fm) +try: + with open('/tmp/hyperop-metrics.json') as f: + hyper = json.load(f) + print("Metrics for hyper parameter optimization: ", hyper) + accuracies['hyper_acc'] = [float(i['value']) for i in hyper if i['name'] == 'accuracy'][0] +except RuntimeError as e: + print("Error occurred while reading metrics for hyperparameter optimization: ", e) -accuracies['nas_acc'] = [float(i['value']) for i in nas if i['name'] == 'accuracy'][0] -accuracies['hyper_acc'] = [float(i['value']) for i in hyper if i['name'] == 'accuracy'][0] -accuracies['fm_acc'] = [float(i['value']) for i in fm if i['name'] == 'accuracy'][0] +try: + with open('/tmp/singlemodel-metrics.json') as f: + fm = json.load(f) + print("Metrics for model trained with fixed parameters: ", fm) + accuracies['fm_acc'] = [float(i['value']) for i in fm if i['name'] == 'accuracy'][0] +except RuntimeError as e: + print("Error occurred while reading metrics for fixed-param model: ", e) max_acc_name = max(accuracies, key=accuracies.get) print("Maximum accuracy was {} for {}".format(max(accuracies.values()), max_acc_name)) \ No newline at end of file From c54a626c24b0cb3c79bc697ef13edf457cd46fd9 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 16:17:12 -0600 Subject: [PATCH 48/70] remove subscription from argparse var --- examples/nas/enas/search.py | 2 +- .../trials/pytorch-classifier/template.yml | 96 ++++++++----------- 2 files changed, 41 insertions(+), 57 deletions(-) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index fece3fd380..8a0e84b544 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -32,7 +32,7 @@ parser.add_argument("--config", default="batch-size=128 \n search-for=macro \n epochs=30") args = parser.parse_args() - extras = args['config'].split("\n") + extras = args.config.split("\n") print("nas extras", extras) extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] print("nas extra processed", extras_processed) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index fadc1299d4..af747fbc1a 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -1,8 +1,6 @@ entrypoint: main arguments: parameters: - - name: source - value: https://github.com/onepanelio/nni.git - name: cvat-annotation-path value: annotation-dump/animals/11052020231652 displayName: Dataset path @@ -14,67 +12,44 @@ arguments: - name: num-classes displayName: Number of classes visibility: public - value: 2 - - name: learning-rate - value: 0.01 - displayName: Learning rate + value: '2' + - name: test-split + displayName: Percentage of images to use for testing visibility: public - - name: batch-size - value: 1 - displayName: Batch size - visibility: public - - name: momentum - value: 0.5 - displayName: Momentum - visibility: public - - name: model-type - displayName: Model type - visibility: public - value: alexnet - options: - - name: 'GoogleNet' - value: 'googlenet' - - name: 'ResNet50' - value: 'resnet50' - - name: epochs - value: 1 - - name: 
search-method - value: macro - type: select.select - options: - - name: 'Macro' - value: macro - - name: 'Micro' - value: micro + value: '20' - name: hyperparamtuning-config value: |- - epochs=1 + epochs=10 displayName: Settings for hyperparameter tuning visibility: public type: textarea.textarea + - name: searchspace-config + type: textarea.textarea + displayName: Search space for hyperparameter tuning + value: |- + batch_size_list=16,32,64,128 + lr_list=0.0001,0.001,0.01,0.1 + momentum_range=0,1 + epochs=10 - name: nas-config value: |- - epochs=1 + epochs=20 + batch-size=128 + search-for=macro displayName: Settings for Neural Architecture Search visibility: public type: textarea.textarea - name: fixedparam-config value: |- - epochs=1 momentum=0.5 lr=0.01 model_type=alexnet - batch_size=1 + batch_size=16 + epochs=10 displayName: Settings for model training visibility: public type: textarea.textarea - - name: searchspace-config - type: textarea.textarea - value: |- - batch_size_list=16,32,64,128 - lr_list=0.0001,0.001,0.01,0.1 - momentum_range=0,1 - epochs=10 + - displayName: Node pool hint: Name of node pool or group to run this workflow task type: select.select @@ -208,10 +183,14 @@ templates: command: [sh,-c] args: - | - git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ + git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}} --num-classes {{workflow.parameters.num-classes}} \ - --dataset custom_classification --train-data-dir /mnt/data/datasets/processed_data --valid-data-dir /mnt/data/datasets/processed_data + python3 examples/nas/enas/search.py \ + --config="{{workflow.parameters.nas-config}}" \ + --num-classes="{{workflow.parameters.num-classes}}" \ + --dataset="custom_classification" \ + --train-data-dir="/mnt/data/datasets/processed_data" \ + --valid-data-dir="/mnt/data/datasets/processed_data" workingDir: /mnt volumeMounts: - name: data @@ -221,7 +200,7 @@ templates: nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' sidecars: - - name: nni-web-ui + - name: nni-nas-ui image: 'tensorflow/tensorflow:2.3.0' command: - sh @@ -254,10 +233,11 @@ templates: command: [sh,-c] args: - | - git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ + git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 examples/trials/pytorch-classifier/create_yaml.py --config {{workflow.parameters.searchspace-config}} \ - --num_classes {{workflow.parameters.num-classes}} + python3 examples/trials/pytorch-classifier/create_yaml.py \ + --config="{{workflow.parameters.searchspace-config}}" \ + --num_classes="{{workflow.parameters.num-classes}}" && \ nnictl create --config examples/trials/pytorch-classifier/config.yml --port 8089 --foreground workingDir: /mnt volumeMounts: @@ -268,7 +248,7 @@ templates: nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' sidecars: - - name: nni-web-ui + - name: nni-hyperparamopt-ui image: 'onepanel/nni-proxy:0.0.1' tty: true ports: @@ -294,10 +274,13 @@ templates: command: [sh,-c] args: - | - git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ + git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 
examples/trials/pytorch-classifier/train_model.py --num_classes {{workflow.parameters.num-classes}} \ - --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --config {{workflow.parameters.fixedparam-config}} + python3 examples/trials/pytorch-classifier/train_model.py \ + --num_classes="{{workflow.parameters.num-classes}}" \ + --train_dir="/mnt/data/datasets/processed_data" \ + --test_dir="/mnt/data/datasets/processed_data" \ + --config="{{workflow.parameters.fixedparam-config}}" \ workingDir: /mnt volumeMounts: - name: data3 @@ -380,7 +363,8 @@ templates: python3 -m pip install setuptools && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 prepare_data.py --data_dir=/mnt/output/processed_data + python3 prepare_data.py --data_dir="/mnt/output/processed_data" \ + --test_split="{{workflow.parameters.test-split}}" workingDir: /mnt volumeMounts: - name: data5 From e3a8f93f0a7ba4caac4f9a691b9b6df7c6a138a4 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 16:41:38 -0600 Subject: [PATCH 49/70] convert args to dictionary --- examples/nas/enas/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index 8a0e84b544..701f099d14 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -38,7 +38,7 @@ print("nas extra processed", extras_processed) config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} print("nas config", config) - config.update(args) + config.update(vars(args)) dataset_train, dataset_valid = datasets.get_dataset(args['dataset'], train_dir=args['train_data_dir'], valid_data=args['valid_data_dir']) if args['search_for'] == "macro": From 3fc027beea9a4e30decab7d60e44ec7900a292c3 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 16:59:04 -0600 Subject: [PATCH 50/70] assign config back to args --- examples/nas/enas/search.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index 701f099d14..2e574d8b28 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -39,6 +39,7 @@ config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} print("nas config", config) config.update(vars(args)) + args = config dataset_train, dataset_valid = datasets.get_dataset(args['dataset'], train_dir=args['train_data_dir'], valid_data=args['valid_data_dir']) if args['search_for'] == "macro": From ccee20fe296c01c370b118cd3ce14721b6e01006 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 17:51:32 -0600 Subject: [PATCH 51/70] convert numerican strings to int --- examples/nas/enas/search.py | 10 +++++----- examples/trials/pytorch-classifier/template.yml | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index 2e574d8b28..234e4b2189 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -43,12 +43,12 @@ dataset_train, dataset_valid = datasets.get_dataset(args['dataset'], train_dir=args['train_data_dir'], valid_data=args['valid_data_dir']) if args['search_for'] == "macro": - model = GeneralNetwork(num_classes=args['num_classes']) - num_epochs = args['epochs'] or 310 + model = GeneralNetwork(num_classes=int(args['num_classes'])) + num_epochs = int(args['epochs']) or 310 mutator = None elif args['search_for'] == "micro": - model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, 
dropout_rate=0.1, num_classes=args['num_classes'], use_aux_heads=True) - num_epochs = args['epochs'] or 150 + model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, num_classes=int(args['num_classes']), use_aux_heads=True) + num_epochs = int(args['epochs']) or 150 mutator = enas.EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True) else: raise AssertionError @@ -63,7 +63,7 @@ reward_function=reward_accuracy, optimizer=optimizer, callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("/mnt/output"), ModelCheckpoint("/mnt/output")], - batch_size=args['batch_size'], + batch_size=int(args['batch_size']), num_epochs=num_epochs, dataset_train=dataset_train, dataset_valid=dataset_valid, diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index af747fbc1a..a4b52ec377 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -34,8 +34,8 @@ arguments: - name: nas-config value: |- epochs=20 - batch-size=128 - search-for=macro + batch_size=128 + search_for=macro displayName: Settings for Neural Architecture Search visibility: public type: textarea.textarea From b890319123e4808094d10e402c44451397abee2c Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 17:56:45 -0600 Subject: [PATCH 52/70] round numbers to 2 decimal points --- examples/nas/enas/search.py | 2 +- examples/trials/pytorch-classifier/main.py | 2 +- examples/trials/pytorch-classifier/train_model.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index 234e4b2189..f6d7ea926e 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -72,6 +72,6 @@ if args['visualization']: trainer.enable_visualization() trainer.train() - metrics = [{'name':'accuracy', 'value':trainer.val_model_summary['acc1'].avg}, {'name':'loss', 'value':trainer.val_model_summary['loss'].avg}] + metrics = [{'name':'accuracy', 'value':round(trainer.val_model_summary['acc1'].avg, 2)}, {'name':'loss', 'value':round(trainer.val_model_summary['loss'].avg,2)}] with open('/tmp/sys-metrics.json', 'w') as f: json.dump(metrics, f) \ No newline at end of file diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index b38516e588..a901a7f67f 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -147,7 +147,7 @@ def train(args): best_params = nni.get_best_params() print("Best param and score: ", best_params) metrics = [ - {'name': 'accuracy', 'value': float(best_params['score'])}, + {'name': 'accuracy', 'value': round(float(best_params['score']),2)}, ] with open('/tmp/sys-metrics.json', 'w') as f: json.dump(metrics, f) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 1aa03d19b6..e63f01cb21 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -197,8 +197,8 @@ def get_params(): if loss is None or math.isnan(loss): loss = 0 metrics = [ - {'name': 'accuracy', 'value': acc}, - {'name': 'loss', 'value': loss}, + {'name': 'accuracy', 'value': round(acc,2)}, + {'name': 'loss', 'value': round(loss,2)}, ] # Write metrics to `/tmp/sys-metrics.json` From 63fb18378e9ff108e52725739780a82becb7e7c8 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 19 Nov 2020 19:22:56 -0600 Subject: [PATCH 
53/70] add a flag to skip the preprocessing --- examples/trials/pytorch-classifier/template.yml | 6 ------ prepare_data.py | 8 +++++++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index a4b52ec377..6f564fb401 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -17,12 +17,6 @@ arguments: displayName: Percentage of images to use for testing visibility: public value: '20' - - name: hyperparamtuning-config - value: |- - epochs=10 - displayName: Settings for hyperparameter tuning - visibility: public - type: textarea.textarea - name: searchspace-config type: textarea.textarea displayName: Search space for hyperparameter tuning diff --git a/prepare_data.py b/prepare_data.py index 6697653ced..c1368eca0a 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -33,5 +33,11 @@ def main(args): parser.add_argument('--data_dir', default='/mnt/data/datasets/processed_data') parser.add_argument('--image_dir', default='/mnt/data/datasets/images') parser.add_argument('--test_split', default=20, type=int) + parser.add_argument('--skip', default=True, type=bool) args = parser.parse_args() - main(args) + if not args.skip: + main(args) + else: + os.makedirs("/mnt/output/processed_data") + for imdir in os.listdir("/mnt/data/datasets/"): + shutil.move(os.path.join("/mnt/data/datasets", imdir), "/mnt/output/processed_data/") From aeeededa9ebc69b4635cc877a794d7faaae53b76 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 19 Nov 2020 19:27:35 -0600 Subject: [PATCH 54/70] change argument type for skip --- prepare_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_data.py b/prepare_data.py index c1368eca0a..b04d51cf31 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -35,7 +35,7 @@ def main(args): parser.add_argument('--test_split', default=20, type=int) parser.add_argument('--skip', default=True, type=bool) args = parser.parse_args() - if not args.skip: + if args.skip == "false": main(args) else: os.makedirs("/mnt/output/processed_data") From d85c27ca8f3c8ad378375bf68dbdae9da5594de9 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 19 Nov 2020 20:25:09 -0600 Subject: [PATCH 55/70] change argument type to string --- prepare_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_data.py b/prepare_data.py index b04d51cf31..8bcb91f80d 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -33,7 +33,7 @@ def main(args): parser.add_argument('--data_dir', default='/mnt/data/datasets/processed_data') parser.add_argument('--image_dir', default='/mnt/data/datasets/images') parser.add_argument('--test_split', default=20, type=int) - parser.add_argument('--skip', default=True, type=bool) + parser.add_argument('--skip', default="false") args = parser.parse_args() if args.skip == "false": main(args) From 198997d27ae4a7fef494e4dd497c96433c165d0b Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 19 Nov 2020 21:09:42 -0600 Subject: [PATCH 56/70] update template --- .../trials/pytorch-classifier/template.yml | 93 +++++++++++-------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index 6f564fb401..eaeecbf570 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -2,44 +2,60 @@ entrypoint: main arguments: parameters: - name: 
cvat-annotation-path - value: annotation-dump/animals/11052020231652 + value: annotation-dump/patch_medical/valid displayName: Dataset path hint: Path to annotated data in default object storage (i.e S3). In CVAT, this parameter will be pre-populated. visibility: private + - name: cvat-output-path value: workflow-data/output/nas/nas-model-comparison visibility: private + - name: num-classes displayName: Number of classes visibility: public value: '2' + hint: 'Number of classes in a dataset' + - name: test-split displayName: Percentage of images to use for testing visibility: public value: '20' + hint: 'Percentage of data to be used for test set' + + - name: skip-preprocessing + displayName: Whether to skip preprocessing data or not + visibility: public + value: 'false' + hint: 'Specify whether to skip preprocessing or not. Skip preprocessing if your dataset is already in a required format.' + - name: searchspace-config type: textarea.textarea displayName: Search space for hyperparameter tuning value: |- - batch_size_list=16,32,64,128 - lr_list=0.0001,0.001,0.01,0.1 - momentum_range=0,1 - epochs=10 + batch_size_list=16,32,64,128 # batch sizes for hyperparameter tuner + lr_list=0.0001,0.001,0.01,0.1 # learning rates for hyperparameter tuner + momentum_range=0,1 # range for momentum for hyperparameter tuner + epochs=10 # epochs to train each model for + hint: 'Define parameters for hyperparameter tuning' + - name: nas-config value: |- - epochs=20 - batch_size=128 - search_for=macro + epochs=20 # epochs to train a model for + batch_size=128 # batch size for training + search_for=macro # macro or micro search technique for ENAS displayName: Settings for Neural Architecture Search visibility: public + hint: 'Define parameters for Neural Architecture Search' + type: textarea.textarea - name: fixedparam-config value: |- - momentum=0.5 - lr=0.01 - model_type=alexnet - batch_size=16 - epochs=10 + momentum=0.5 # momentum to use for training + lr=0.01 # learning rate for training + model_type=alexnet # model to train (i.e alexnet, googlenet) + batch_size=16 # batch size for training + epochs=10 # epochs to train a model for displayName: Settings for model training visibility: public type: textarea.textarea @@ -68,70 +84,70 @@ volumeClaimTemplates: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: name: output spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: data2 + name: hyperparamtuning-data spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: output2 + name: hyperparamtuning-output spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: data3 + name: fixedparam-data spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: output3 + name: fixedparam-output spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: data4 + name: comparemodel-data spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: output4 + name: comparemodel-output spec: accessModes: [ "ReadWriteOnce" ] resources: requests: storage: 20Gi - metadata: - name: data5 + name: preprocess-data spec: accessModes: [ "ReadWriteOnce" ] resources: requests: storage: 20Gi - metadata: - name: output5 + name: preprocess-output spec: accessModes: [ 
"ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi templates: - name: main dag: @@ -235,9 +251,9 @@ templates: nnictl create --config examples/trials/pytorch-classifier/config.yml --port 8089 --foreground workingDir: /mnt volumeMounts: - - name: data2 + - name: hyperparamtuning-data mountPath: /mnt/data - - name: output2 + - name: hyperparamtuning-output mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' @@ -277,9 +293,9 @@ templates: --config="{{workflow.parameters.fixedparam-config}}" \ workingDir: /mnt volumeMounts: - - name: data3 + - name: fixedparam-data mountPath: /mnt/data - - name: output3 + - name: fixedparam-output mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' @@ -326,9 +342,9 @@ templates: python3 compare.py workingDir: /mnt volumeMounts: - - name: data4 + - name: comparemodel-data mountPath: /mnt/data - - name: output4 + - name: comparemodel-output mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' @@ -358,12 +374,13 @@ templates: git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 prepare_data.py --data_dir="/mnt/output/processed_data" \ - --test_split="{{workflow.parameters.test-split}}" + --test_split="{{workflow.parameters.test-split}}" \ + --skip="{{workflow.parameters.skip-preprocessing}}" workingDir: /mnt volumeMounts: - - name: data5 + - name: preprocess-data mountPath: /mnt/data - - name: output5 + - name: preprocess-output mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' \ No newline at end of file From 5ad2cfebef10dfecd5c4277a7534922932c648ff Mon Sep 17 00:00:00 2001 From: savan Date: Fri, 20 Nov 2020 12:14:06 -0600 Subject: [PATCH 57/70] handle case when data is already processed --- prepare_data.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/prepare_data.py b/prepare_data.py index 8bcb91f80d..afdf3855a8 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -3,6 +3,7 @@ import shutil import argparse import random +import glob def main(args): @@ -27,6 +28,25 @@ def main(args): shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'train', lbl, img.attrib['name'])) +def train_test_split(args): + + for dir in os.listdir('/mnt/data/datasets'): + os.makedirs(os.path.join(args.data_dir, 'train', dir)) + os.makedirs(os.path.join(args.data_dir, 'test', dir)) + a = glob.glob('/mnt/data/datasets/'+dir+'/*.jpg') + a.extend(glob.glob('/mnt/data/datasets/'+dir+'/*.png')) + test_len = (len(a) * int(args.test_split) )// 100 + count = 0 + for file in a: + print(file) + img_path = os.path.split(file)[-1] + if bool(random.getrandbits(1)) and count <= test_len: + shutil.move(file, os.path.join(args.data_dir, 'test', dir, img_path)) + count += 1 + else: + shutil.move(file, os.path.join(args.data_dir, 'train', dir, img_path)) + + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--xml_path', default='/mnt/data/datasets/annotations/default.xml') @@ -38,6 +58,4 @@ def main(args): if args.skip == "false": main(args) else: - os.makedirs("/mnt/output/processed_data") - for imdir in os.listdir("/mnt/data/datasets/"): - shutil.move(os.path.join("/mnt/data/datasets", imdir), "/mnt/output/processed_data/") + train_test_split(args) \ No newline at end of file From 
5235b99ad4ad8b7c88d525fdb85d76da0f2ee626 Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 23 Nov 2020 14:14:32 -0600 Subject: [PATCH 58/70] add log lines --- prepare_data.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/prepare_data.py b/prepare_data.py index afdf3855a8..6be6079a66 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -30,21 +30,20 @@ def main(args): def train_test_split(args): - for dir in os.listdir('/mnt/data/datasets'): - os.makedirs(os.path.join(args.data_dir, 'train', dir)) - os.makedirs(os.path.join(args.data_dir, 'test', dir)) + for dirn in os.listdir('/mnt/data/datasets'): + os.makedirs(os.path.join(args.data_dir, 'train', dirn)) + os.makedirs(os.path.join(args.data_dir, 'test', dirn)) a = glob.glob('/mnt/data/datasets/'+dir+'/*.jpg') a.extend(glob.glob('/mnt/data/datasets/'+dir+'/*.png')) test_len = (len(a) * int(args.test_split) )// 100 count = 0 for file in a: - print(file) img_path = os.path.split(file)[-1] if bool(random.getrandbits(1)) and count <= test_len: - shutil.move(file, os.path.join(args.data_dir, 'test', dir, img_path)) + shutil.move(file, os.path.join(args.data_dir, 'test', dirn, img_path)) count += 1 else: - shutil.move(file, os.path.join(args.data_dir, 'train', dir, img_path)) + shutil.move(file, os.path.join(args.data_dir, 'train', dirn, img_path)) if __name__ == '__main__': @@ -56,6 +55,8 @@ def train_test_split(args): parser.add_argument('--skip', default="false") args = parser.parse_args() if args.skip == "false": + print("Processing data...") main(args) else: + print("Moving files to appropriate directories...") train_test_split(args) \ No newline at end of file From d68b5f728637547c7338eec6d7ca5bc3a18b217c Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 23 Nov 2020 15:30:49 -0600 Subject: [PATCH 59/70] correct typo in var name --- prepare_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare_data.py b/prepare_data.py index 6be6079a66..4eac1e55ac 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -33,8 +33,8 @@ def train_test_split(args): for dirn in os.listdir('/mnt/data/datasets'): os.makedirs(os.path.join(args.data_dir, 'train', dirn)) os.makedirs(os.path.join(args.data_dir, 'test', dirn)) - a = glob.glob('/mnt/data/datasets/'+dir+'/*.jpg') - a.extend(glob.glob('/mnt/data/datasets/'+dir+'/*.png')) + a = glob.glob('/mnt/data/datasets/'+dirn+'/*.jpg') + a.extend(glob.glob('/mnt/data/datasets/'+dirn+'/*.png')) test_len = (len(a) * int(args.test_split) )// 100 count = 0 for file in a: From 50b9016fd37f89aa77bdb188c993de4442e6f3c9 Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 23 Nov 2020 17:32:37 -0600 Subject: [PATCH 60/70] replace logger with print --- examples/trials/pytorch-classifier/train_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index e63f01cb21..67e2d0c816 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -72,7 +72,7 @@ def train_one_epoch(args, model, device, train_loader, optimizer, epoch): loss.backward() optimizer.step() if batch_idx % args['log_interval'] == 0: - logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_loader.dataset), 100. 
* batch_idx / len(train_loader), loss.item())) From 56df7a8542955069c34d7cc85fca2b14fd7d6047 Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 24 Nov 2020 13:46:17 -0600 Subject: [PATCH 61/70] update template --- .../trials/pytorch-classifier/template.yml | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index eaeecbf570..b82467eb58 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -2,7 +2,7 @@ entrypoint: main arguments: parameters: - name: cvat-annotation-path - value: annotation-dump/patch_medical/valid + value: annotation-dump/patch_medical/compressed_valid displayName: Dataset path hint: Path to annotated data in default object storage (i.e S3). In CVAT, this parameter will be pre-populated. visibility: private @@ -26,7 +26,7 @@ arguments: - name: skip-preprocessing displayName: Whether to skip preprocessing data or not visibility: public - value: 'false' + value: 'true' hint: 'Specify whether to skip preprocessing or not. Skip preprocessing if your dataset is already in a required format.' - name: searchspace-config @@ -189,18 +189,20 @@ templates: archive: none: {} container: - image: onepanel/nni:0.0.4 + image: onepanel/nni:1.0.0 command: [sh,-c] args: - | + mv /mnt/data/datasets/processed_data.zip ./ && \ + unzip processed_data.zip -d /mnt/data/datasets/ && \ git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 examples/nas/enas/search.py \ --config="{{workflow.parameters.nas-config}}" \ --num-classes="{{workflow.parameters.num-classes}}" \ --dataset="custom_classification" \ - --train-data-dir="/mnt/data/datasets/processed_data" \ - --valid-data-dir="/mnt/data/datasets/processed_data" + --train-data-dir="/mnt/data/datasets/processed_data/train" \ + --valid-data-dir="/mnt/data/datasets/processed_data/test" workingDir: /mnt volumeMounts: - name: data @@ -239,10 +241,12 @@ templates: archive: none: {} container: - image: onepanel/nni:0.0.4 + image: onepanel/nni:1.0.0 command: [sh,-c] args: - | + mv /mnt/data/datasets/processed_data.zip ./ && \ + unzip processed_data.zip -d /mnt/data/datasets/ && \ git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 examples/trials/pytorch-classifier/create_yaml.py \ @@ -280,17 +284,20 @@ templates: archive: none: {} container: - image: onepanel/nni:0.0.4 + image: onepanel/nni:1.0.0 command: [sh,-c] args: - | + mv /mnt/data/datasets/processed_data.zip ./ && \ + unzip processed_data.zip -d /mnt/data/datasets/ && \ git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 examples/trials/pytorch-classifier/train_model.py \ --num_classes="{{workflow.parameters.num-classes}}" \ - --train_dir="/mnt/data/datasets/processed_data" \ - --test_dir="/mnt/data/datasets/processed_data" \ + --train_dir="/mnt/data/datasets/processed_data/train" \ + --test_dir="/mnt/data/datasets/processed_data/test" \ --config="{{workflow.parameters.fixedparam-config}}" \ + --log_interval=1 workingDir: /mnt volumeMounts: - name: fixedparam-data @@ -315,10 +322,6 @@ templates: - name: compare-models inputs: artifacts: - - name: data - path: /mnt/data/datasets/ - s3: - key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}' - name: nas-metrics path: /tmp/nas-metrics.json - name: 
hyperop-metrics @@ -333,11 +336,11 @@ templates: archive: none: {} container: - image: onepanel/nni:0.0.4 + image: onepanel/nni:1.0.0 command: [sh,-c] args: - | - git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ + git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 compare.py workingDir: /mnt @@ -356,6 +359,7 @@ templates: path: /mnt/data/datasets/ s3: key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}' + outputs: artifacts: - name: model @@ -369,13 +373,18 @@ templates: args: - | apt-get update && \ - apt-get install -y gcc g++ git && \ + apt-get install -y gcc g++ git unzip zip && \ python3 -m pip install setuptools && \ - git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ + cd /mnt/data/datasets && \ + unzip processed_data.zip && \ + rm -f processed_data.zip && \ + cd /mnt && \ + git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 prepare_data.py --data_dir="/mnt/output/processed_data" \ + python3 prepare_data.py --data_dir="./processed_data" \ --test_split="{{workflow.parameters.test-split}}" \ - --skip="{{workflow.parameters.skip-preprocessing}}" + --skip="{{workflow.parameters.skip-preprocessing}}" && \ + zip -r /mnt/output/processed_data.zip ./processed_data workingDir: /mnt volumeMounts: - name: preprocess-data From 9c6fb6f16d24b4e2df228d07b255e9af4b3cf092 Mon Sep 17 00:00:00 2001 From: savan Date: Sun, 29 Nov 2020 16:09:04 -0600 Subject: [PATCH 62/70] handle case when search params aren;t provided --- .../trials/pytorch-classifier/create_yaml.py | 18 +++++-- .../trials/pytorch-classifier/template.yml | 47 ++++++++++++++++--- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py index ce641998c9..8e28111c2b 100644 --- a/examples/trials/pytorch-classifier/create_yaml.py +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -8,12 +8,20 @@ def main(args): data['trial']['command'] = "python3 main.py --num_classes {} --epochs {}".format(args['num_classes'], args['epochs']) with open(args['output_path'], 'w') as yaml_file: - yaml_file.write( yaml.dump(data, default_flow_style=False)) - mm_list = [int(item) for item in args['momentum_range'].split(',')] - lr_list = [float(item) for item in args['lr_list'].split(',')] - bs_list = [int(item) for item in args['batch_size_list'].split(',')] + yaml_file.write(yaml.dump(data, default_flow_style=False)) + + json_data = {} + if 'momentum_range' in args: + mm_list = [int(item) for item in args['momentum_range'].split(',')] + json_data['momentum'] = {"_type":"uniform","_value":mm_list} + if 'lr_list' in args: + lr_list = [float(item) for item in args['lr_list'].split(',')] + json_data['lr'] = {"_type":"choice","_value":lr_list} + if 'batch_size_list' in args: + bs_list = [int(item) for item in args['batch_size_list'].split(',')] + json_data['batch_size'] = {'_type':'choice', '_value':bs_list} + with open(args['output_search_space_path'], 'w') as json_file: - json_data = {'batch_size': {'_type':'choice', '_value':bs_list}, 'lr':{"_type":"choice","_value":lr_list} , 'momentum':{"_type":"uniform","_value":mm_list}} json.dump(json_data, json_file) if __name__ == "__main__": diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index b82467eb58..a4d962deac 100644 --- 
a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -335,14 +335,47 @@ templates: optional: true archive: none: {} - container: + script: image: onepanel/nni:1.0.0 - command: [sh,-c] - args: - - | - git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ - cd nni/ && \ - python3 compare.py + command: [python3, '-u'] + source: | + import json + + accuracies = {} + try: + with open('/tmp/nas-metrics.json') as f: + nas = json.load(f) + print("Metrics for Neural Architecture Search: ", nas) + accuracies['nas_acc'] = [float(i['value']) for i in nas if i['name'] == 'accuracy'][0] + except RuntimeError as e: + print("Error occurred while reading metrics for NAS: ", e) + + try: + with open('/tmp/hyperop-metrics.json') as f: + hyper = json.load(f) + print("Metrics for hyper parameter optimization: ", hyper) + accuracies['hyperparam_acc'] = [float(i['value']) for i in hyper if i['name'] == 'accuracy'][0] + except RuntimeError as e: + print("Error occurred while reading metrics for hyperparameter optimization: ", e) + + try: + with open('/tmp/singlemodel-metrics.json') as f: + fm = json.load(f) + print("Metrics for model trained with fixed parameters: ", fm) + accuracies['fixedparam_acc'] = [float(i['value']) for i in fm if i['name'] == 'accuracy'][0] + except RuntimeError as e: + print("Error occurred while reading metrics for fixed-param model: ", e) + + max_acc_name = max(accuracies, key=accuracies.get) + print("Maximum accuracy was {} for {}".format(max(accuracies.values()), max_acc_name)) + + metrics = [{'name': mac_acc_name, 'value': round(max(accuracies.values()),2)}] + try: + with open('/tmp/sys-metrics.json') as f: + json.dump(metrics, f) + except: + pass + workingDir: /mnt volumeMounts: - name: comparemodel-data From 1e2f39eb9b91d8182e99f14e7f5f2d2bd5632fdb Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 1 Dec 2020 16:13:01 -0600 Subject: [PATCH 63/70] update maximum trials --- examples/trials/pytorch-classifier/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/config.yml b/examples/trials/pytorch-classifier/config.yml index d58b9e133d..ff6e8ff5cf 100644 --- a/examples/trials/pytorch-classifier/config.yml +++ b/examples/trials/pytorch-classifier/config.yml @@ -2,7 +2,7 @@ authorName: default experimentName: pytorch_classifier trialConcurrency: 1 maxExecDuration: 10h -maxTrialNum: 15 +maxTrialNum: 1 #choice: local, remote, pai trainingServicePlatform: local searchSpacePath: search_space.json From 24088cdf0afe306ba76d873f0e716edb8a593c89 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 2 Dec 2020 10:25:05 -0600 Subject: [PATCH 64/70] update data directory for preprocessing --- prepare_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/prepare_data.py b/prepare_data.py index 4eac1e55ac..343a01a10f 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -30,11 +30,11 @@ def main(args): def train_test_split(args): - for dirn in os.listdir('/mnt/data/datasets'): + for dirn in os.listdir(args.image_dir): os.makedirs(os.path.join(args.data_dir, 'train', dirn)) os.makedirs(os.path.join(args.data_dir, 'test', dirn)) - a = glob.glob('/mnt/data/datasets/'+dirn+'/*.jpg') - a.extend(glob.glob('/mnt/data/datasets/'+dirn+'/*.png')) + a = glob.glob(args.image_dir+'/'+dirn+'/*.jpg') + a.extend(glob.glob(args.image_dir+'/'+dirn+'/*.png')) test_len = (len(a) * int(args.test_split) )// 100 count = 0 for file in a: From 
2d29f3f364f6325b5eb7ea0f03a8a2499841f237 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 2 Dec 2020 11:58:50 -0600 Subject: [PATCH 65/70] delete lost+found directories --- prepare_data.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/prepare_data.py b/prepare_data.py index 343a01a10f..158001e55c 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -59,4 +59,11 @@ def train_test_split(args): main(args) else: print("Moving files to appropriate directories...") - train_test_split(args) \ No newline at end of file + train_test_split(args) + # clean up, lost+found directory causes PyTorch to think there are three classes + # so, remove it + try: + shutil.rmtree(os.path.join(args.data_dir, 'train', 'lost+found')) + shutil.rmtree(os.path.join(args.data_dir, 'test', 'lost+found')) + except: + pass \ No newline at end of file From 0bf71940b0cd57e7bad330f16e42efeb141bbf29 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 3 Dec 2020 14:22:38 -0600 Subject: [PATCH 66/70] pass model_type to main script --- examples/trials/pytorch-classifier/create_yaml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py index 8e28111c2b..9905c0b7af 100644 --- a/examples/trials/pytorch-classifier/create_yaml.py +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -5,7 +5,7 @@ def main(args): stream = open(args['config_path'], 'r') data = yaml.load(stream) - data['trial']['command'] = "python3 main.py --num_classes {} --epochs {}".format(args['num_classes'], args['epochs']) + data['trial']['command'] = "python3 main.py --num_classes {} --epochs {} --model_type {}".format(args['num_classes'], args['epochs'], args['model_type']) with open(args['output_path'], 'w') as yaml_file: yaml_file.write(yaml.dump(data, default_flow_style=False)) From 686a1b56137873013a7579c186d9e41cbb638367 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 3 Dec 2020 18:17:29 -0600 Subject: [PATCH 67/70] allow user to update settings in config.yml --- examples/trials/pytorch-classifier/create_yaml.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py index 9905c0b7af..9d6c2bc53d 100644 --- a/examples/trials/pytorch-classifier/create_yaml.py +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -6,6 +6,17 @@ def main(args): stream = open(args['config_path'], 'r') data = yaml.load(stream) data['trial']['command'] = "python3 main.py --num_classes {} --epochs {} --model_type {}".format(args['num_classes'], args['epochs'], args['model_type']) + # update config settings + if 'max_trial_num' in args: + data['maxTrialNum'] = int(args['max_trial_num']) + if 'max_exec_duration' in args: + data['maxExecDuration'] = args['max_exec_duration'] + if 'trial_concurrency' in args: + data['trialConcurrency'] = int(args['trial_concurrency']) + if 'use_annotation' in args: + data['useAnnotation'] = args['use_annotation'] + if 'tuner' in args: + data['builtinTunerName'] = args['tuner'] with open(args['output_path'], 'w') as yaml_file: yaml_file.write(yaml.dump(data, default_flow_style=False)) From 48baeb6720c7d16042fbac3d1893105de1ebb176 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 3 Dec 2020 18:33:42 -0600 Subject: [PATCH 68/70] resolve key access error --- examples/trials/pytorch-classifier/create_yaml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py
index 9d6c2bc53d..a89eced5ab 100644
--- a/examples/trials/pytorch-classifier/create_yaml.py
+++ b/examples/trials/pytorch-classifier/create_yaml.py
@@ -16,7 +16,7 @@ def main(args):
     if 'use_annotation' in args:
         data['useAnnotation'] = args['use_annotation']
     if 'tuner' in args:
-        data['builtinTunerName'] = args['tuner']
+        data['tuner']['builtinTunerName'] = args['tuner']
 
     with open(args['output_path'], 'w') as yaml_file:
         yaml_file.write(yaml.dump(data, default_flow_style=False))

From 1a840b5d444ee8f95f2a6090f7401ba1f1f3ecad Mon Sep 17 00:00:00 2001
From: savan
Date: Mon, 7 Dec 2020 21:58:47 -0600
Subject: [PATCH 69/70] add comments

---
 compare.py                                   |  2 +
 .../trials/pytorch-classifier/template.yml   | 72 +++++++++++++------
 nni/algorithms/nas/pytorch/enas/trainer.py   |  1 +
 nni/nas/pytorch/trainer.py                   |  2 +
 nni/trial.py                                 | 17 +++--
 prepare_data.py                              | 11 ++-
 6 files changed, 74 insertions(+), 31 deletions(-)

diff --git a/compare.py b/compare.py
index 2783e810ef..7e62fc923d 100644
--- a/compare.py
+++ b/compare.py
@@ -1,3 +1,5 @@
+# Read sys-metrics files and find the one with the highest accuracy.
+
 import json
 
 accuracies = {}
diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml
index a4d962deac..a5f099441f 100644
--- a/examples/trials/pytorch-classifier/template.yml
+++ b/examples/trials/pytorch-classifier/template.yml
@@ -33,10 +33,13 @@ arguments:
     type: textarea.textarea
     displayName: Search space for hyperparameter tuning
     value: |-
-      batch_size_list=16,32,64,128 # batch sizes for hyperparameter tuner
-      lr_list=0.0001,0.001,0.01,0.1 # learning rates for hyperparameter tuner
-      momentum_range=0,1 # range for momentum for hyperparameter tuner
-      epochs=10 # epochs to train each model for
+      model_type=alexnet # any model type supported by torchvision
+      batch_size_list=16,32,64,128 # batch sizes for hyperparameter tuner
+      lr_list=0.0001,0.001,0.01,0.1 # learning rates for hyperparameter tuner
+      momentum_range=0,1 # range for momentum for hyperparameter tuner
+      epochs=10 # epochs to train each model for
+      tuner=TPE # any tuner supported by NNI
+      max_trial_num=2 # max number of trials to run
     hint: 'Define parameters for hyperparameter tuning'
 
   - name: nas-config
@@ -189,7 +192,7 @@ templates:
           archive:
             none: {}
     container:
-      image: onepanel/nni:1.0.0
+      image: onepanel/nni:1.0.2
       command: [sh,-c]
       args:
        - |
@@ -241,7 +244,7 @@ templates:
          archive:
            none: {}
    container:
-      image: onepanel/nni:1.0.0
+      image: onepanel/nni:1.0.2
      command: [sh,-c]
      args:
       - |
@@ -284,12 +287,14 @@ templates:
           archive:
             none: {}
     container:
-      image: onepanel/nni:1.0.0
+      image: onepanel/nni:1.0.2
       command: [sh,-c]
       args:
        - |
         mv /mnt/data/datasets/processed_data.zip ./ && \
         unzip processed_data.zip -d /mnt/data/datasets/ && \
+        ls /mnt/data/datasets/ && \
+        ls /mnt/data/datasets/processed_data/train/ && \
         git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \
         cd nni/ && \
         python3 examples/trials/pytorch-classifier/train_model.py \
@@ -336,10 +341,15 @@ templates:
           archive:
             none: {}
     script:
-      image: onepanel/nni:1.0.0
+      image: onepanel/nni:1.0.2
       command: [python3, '-u']
       source: |
         import json
+        import os
+        import onepanel.core.api
+        from onepanel.core.api.models.metric import Metric
+        from onepanel.core.api.rest import ApiException
+        from onepanel.core.api.models import Parameter
 
         accuracies = {}
         try:
@@ -368,14 +378,32 @@ templates:
         max_acc_name = max(accuracies, key=accuracies.get)
         print("Maximum accuracy was {} for {}".format(max(accuracies.values()), max_acc_name))
 
+        metrics = [{'name': 'accuracy', 'value': round(max(accuracies.values()),2)}]
+        with open('/var/run/secrets/kubernetes.io/serviceaccount/token') as f:
+            token = f.read()
+
+        # Configure API authorization
+        configuration = onepanel.core.api.Configuration(
+            host = os.getenv('ONEPANEL_API_URL'),
+            api_key = {
+                'authorization': token
+            }
+        )
+        configuration.api_key_prefix['authorization'] = 'Bearer'
 
-        metrics = [{'name': max_acc_name, 'value': round(max(accuracies.values()),2)}]
-        try:
-            with open('/tmp/sys-metrics.json', 'w') as f:
-                json.dump(metrics, f)
-        except OSError:
-            pass
-
+        # Call SDK method to save metrics
+        with onepanel.core.api.ApiClient(configuration) as api_client:
+            api_instance = onepanel.core.api.WorkflowServiceApi(api_client)
+            namespace = '{{workflow.namespace}}'
+            uid = '{{workflow.name}}'
+            body = onepanel.core.api.AddWorkflowExecutionsMetricsRequest()
+            body.metrics = metrics
+            try:
+                api_response = api_instance.add_workflow_execution_metrics(namespace, uid, body)
+                print('Metrics added.')
+            except ApiException as e:
+                print("Exception when calling WorkflowServiceApi->add_workflow_execution_metrics: %s\n" % e)
+
       workingDir: /mnt
       volumeMounts:
         - name: comparemodel-data
@@ -389,9 +417,9 @@ templates:
     inputs:
       artifacts:
         - name: data
-          path: /mnt/data/datasets/
-          s3:
-            key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+          path: /mnt/data/patch_medical_valid.zip
+          http:
+            url: https://github.com/onepanelio/templates/releases/download/v0.2.0/patch_medical_valid.zip
     outputs:
       artifacts:
         apt-get update && \
         apt-get install -y gcc g++ git unzip zip && \
         python3 -m pip install setuptools && \
-        cd /mnt/data/datasets && \
-        unzip processed_data.zip && \
-        rm -f processed_data.zip && \
+        ls /mnt/data && \
+        cd /mnt/data/ && \
+        unzip patch_medical_valid.zip && \
+        rm -f patch_medical_valid.zip && \
         cd /mnt && \
         git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \
         cd nni/ && \
         python3 prepare_data.py --data_dir="./processed_data" \
                                 --test_split="{{workflow.parameters.test-split}}" \
+                                --image_dir="/mnt/data" \
                                 --skip="{{workflow.parameters.skip-preprocessing}}" && \
         zip -r /mnt/output/processed_data.zip ./processed_data
       workingDir: /mnt
diff --git a/nni/algorithms/nas/pytorch/enas/trainer.py b/nni/algorithms/nas/pytorch/enas/trainer.py
index 33147b6174..dc45d6da43 100644
--- a/nni/algorithms/nas/pytorch/enas/trainer.py
+++ b/nni/algorithms/nas/pytorch/enas/trainer.py
@@ -208,4 +208,5 @@ def validate_one_epoch(self, epoch):
             logger.info("Test Epoch [%d/%d] Arc [%d/%d] Summary %s",
                         epoch + 1, self.num_epochs, arc_id + 1, self.test_arc_per_epoch,
                         meters.summary())
+        # return metrics so that they can be saved later
         return meters
\ No newline at end of file
diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py
index 08c1384bf3..7f49e0c059 100644
--- a/nni/nas/pytorch/trainer.py
+++ b/nni/nas/pytorch/trainer.py
@@ -92,6 +92,7 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs,
         self.batch_size = batch_size
         self.workers = workers
         self.log_frequency = log_frequency
+        # update log dir for Onepanel
         self.log_dir = "/mnt/output/naslogs"
         os.makedirs(self.log_dir, exist_ok=True)
         self.status_writer = open(os.path.join(self.log_dir, "log"), "w")
@@ -144,6 +145,7 @@ def train(self, validate=True):
             if validate:
                 # validation
                 _logger.info("Epoch %d Validating", epoch + 1)
+                # keep track of metrics so that they can be used later
                 self.val_model_summary = self.validate_one_epoch(epoch)
 
             for callback in self.callbacks:
diff --git a/nni/trial.py b/nni/trial.py
index 0d24274899..13b6e3289e 100644
--- a/nni/trial.py
+++ b/nni/trial.py
@@ -23,10 +23,6 @@
 _trial_id = platform.get_trial_id()
 _sequence_id = platform.get_sequence_id()
 
-#keep track of highest accuracy
-#_best_params = os.getenv('_BEST_PARAMS', None)
-#_best_score = os.getenv('_BEST_SCORE', 0)
-
 def get_next_parameter():
     """
     Get the hyper paremeters generated by tuner. For a multiphase experiment, it returns a new group of hyper
@@ -147,25 +143,28 @@ def report_final_result(metric):
 
 
 def update_score(metric):
-
-    #keep track of highest accuracy
+    """
+    Keep track of metrics over trials. Maintain the highest accuracy so far.
+    """
     _sysdir = trial_env_vars.NNI_SYS_DIR
     _trials = os.path.dirname(_sysdir)
     if os.path.exists(os.path.join(_trials, 'best_score.json')):
         with open(os.path.join(_trials, 'best_score.json'), "r") as jsonFile:
             data = json.load(jsonFile)
-        if float(data['score']) < metric:
+        if float(data['score']) < metric: # new accuracy is higher than the previous one
             data['score'] = str(metric)
-
+            # update the best_score file with the new score
             with open(os.path.join(_trials, 'best_score.json'), "w") as jsonFile2:
                 print("updating json file", data)
                 json.dump(data, jsonFile2)
-    else:
+    else: # first trial, create a new best_score file
         params = get_current_parameter()
         with open(os.path.join(_trials, 'best_score.json'),'w') as f:
             json.dump({'score':metric, 'params':str(params) } , f)
 
 def get_best_params():
+    """ Read best_score.json and return the highest score (i.e. accuracy)
+    """
     _sysdir = trial_env_vars.NNI_SYS_DIR
     _trials = os.path.dirname(_sysdir)
     if os.path.exists(os.path.join(_trials, 'best_score.json')):
diff --git a/prepare_data.py b/prepare_data.py
index 158001e55c..eba66deb28 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -6,7 +6,12 @@
 import glob
 
 def main(args):
+    """ CVAT's XML export is a single xml file that contains annotations and paths to images.
+    The training script requires the data to be in PyTorch's ImageFolder layout, with one
+    directory per class, so this function creates that layout.
+    It also splits the data into train and test sets.
+    """
     tree = ET.parse(args.xml_path)
     root = tree.getroot()
@@ -14,6 +19,7 @@ def main(args):
     for label in root.iter('label'):
         os.makedirs(os.path.join(args.data_dir, 'train', label.find('name').text))
         os.makedirs(os.path.join(args.data_dir, 'test', label.find('name').text))
+
     images_len = len(list(root.iter('tag')))
     test_len = (images_len * args.test_split )// 100
     count = 0
@@ -22,6 +28,7 @@ def main(args):
         lbl = img.find('tag').attrib['label']
         if lbl:
             if bool(random.getrandbits(1)) and count <= test_len :
+                # randomly put image into test or train dir
                 shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'test', lbl, img.attrib['name']))
                 count += 1
             else:
@@ -29,7 +36,9 @@ def main(args):
 
 
 def train_test_split(args):
-
+    """
+    If images are already in ImageFolder format, then just split them into train and test.
+ """ for dirn in os.listdir(args.image_dir): os.makedirs(os.path.join(args.data_dir, 'train', dirn)) os.makedirs(os.path.join(args.data_dir, 'test', dirn)) From 346eee190d923b2d3a5f4e4b48a898b12680b79d Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 8 Dec 2020 14:59:58 -0600 Subject: [PATCH 70/70] add comments and support for new models --- .../trials/pytorch-classifier/create_yaml.py | 4 ++ examples/trials/pytorch-classifier/main.py | 42 +++++++++++-------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py index a89eced5ab..e415e2bed7 100644 --- a/examples/trials/pytorch-classifier/create_yaml.py +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -1,3 +1,4 @@ +# script to generate config yaml file dynamically import yaml import argparse import json @@ -5,6 +6,7 @@ def main(args): stream = open(args['config_path'], 'r') data = yaml.load(stream) + # update command based on args data['trial']['command'] = "python3 main.py --num_classes {} --epochs {} --model_type {}".format(args['num_classes'], args['epochs'], args['model_type']) # update config settings if 'max_trial_num' in args: @@ -21,6 +23,8 @@ def main(args): with open(args['output_path'], 'w') as yaml_file: yaml_file.write(yaml.dump(data, default_flow_style=False)) + # update search space for hyperparam tuning + # script needs to be updated for each new param json_data = {} if 'momentum_range' in args: mm_list = [int(item) for item in args['momentum_range'].split(',')] diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index a901a7f67f..6369dc4db4 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -18,22 +18,6 @@ logger = logging.getLogger('pytorch_classifier') - -# mean = 0.0 -# for images, _ in loader: -# batch_samples = images.size(0) -# images = images.view(batch_samples, images.size(1), -1) -# mean += images.mean(2).sum(0) -# mean = mean / len(loader.dataset) - -# var = 0.0 -# for images, _ in loader: -# batch_samples = images.size(0) -# images = images.view(batch_samples, images.size(1), -1) -# var += ((images - mean.unsqueeze(1))**2).sum([0,2]) -# std = torch.sqrt(var / (len(loader.dataset)*224*224)) - - def build_model(model_type, num_classes): if model_type == "googlenet": model = models.googlenet(pretrained=True) @@ -41,12 +25,36 @@ def build_model(model_type, num_classes): elif model_type == "resnet50": model = models.resnet50(pretrained=True) in_features = 2048 + elif model_type == "resnet18": + model = models.resnet18(pretrained=True) + in_features = 512 elif model_type == "alexnet": model = models.alexnet(pretrained=True) in_features = 4096 elif model_type == "vgg19": - model = models.alexnet(pretrained=True) + model = models.vgg19(pretrained=True) + in_features = 4096 + elif model_type == "vgg16": + model = models.vgg16(pretrained=True) in_features = 4096 + elif model_type == "mobilenet_v2": + model = models.mobilenet_v2(pretrained=True) + model.classifier[1] = nn.Linear(1280, num_classes) + return model + elif model_type == "inception_v3": + model = models.inception_v3(pretrained=True) + model.fc = nn.Linear(2048, num_classes) + return model + elif model_type == "densenet161": + model = models.densenet161(pretrained=True) # other variants are 121, 169, 201 + model.classifier = nn.Linear(2208, num_classes) + return model + elif model_type == "squeezenet": + # squeezenet has diff architecture, 
+        model = models.squeezenet1_0(pretrained=True)
+        model.classifier[1] = nn.Conv2d(512, num_classes, 1)
+        return model
+
-    if model_type in ['alexnet', 'vgg19']:
+    if model_type in ['alexnet', 'vgg19', 'vgg16']:
         model.classifier._modules['6'] = nn.Sequential(nn.Linear(in_features, num_classes), nn.LogSoftmax(dim=1))