From 039a931f6decae00fd53c937a66a56e1724be7af Mon Sep 17 00:00:00 2001
From: savan
Date: Tue, 27 Oct 2020 18:18:14 -0500
Subject: [PATCH 01/70] use python instead of python3

---
 examples/trials/mnist-tfv2/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/trials/mnist-tfv2/config.yml b/examples/trials/mnist-tfv2/config.yml
index 06e9af6be3..936aedd940 100644
--- a/examples/trials/mnist-tfv2/config.yml
+++ b/examples/trials/mnist-tfv2/config.yml
@@ -12,6 +12,6 @@ tuner:
   classArgs:
     optimize_mode: maximize  # choices: maximize, minimize
 trial:
-  command: python3 mnist.py
+  command: python mnist.py
   codeDir: .
   gpuNum: 0

From 060677fc99ee3aa60ac340a5a8e0da91f1d475e1 Mon Sep 17 00:00:00 2001
From: savan
Date: Wed, 28 Oct 2020 16:58:32 -0500
Subject: [PATCH 02/70] store models in the /mnt/output directory

---
 examples/nas/enas/search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py
index ae1e615a92..d229102989 100644
--- a/examples/nas/enas/search.py
+++ b/examples/nas/enas/search.py
@@ -12,7 +12,7 @@
 from macro import GeneralNetwork
 from micro import MicroNetwork
 from nni.nas.pytorch import enas
-from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint,
+from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint, ModelCheckpoint,
                                        LRSchedulerCallback)
 from utils import accuracy, reward_accuracy
 
@@ -49,7 +49,7 @@
                            metrics=accuracy,
                            reward_function=reward_accuracy,
                            optimizer=optimizer,
-                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./checkpoints")],
+                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("/mnt/output"), ModelCheckpoint("/mnt/output")],
                            batch_size=args.batch_size,
                            num_epochs=num_epochs,
                            dataset_train=dataset_train,

From 5a0087f70ab18b038657bd9cbdbe6fa6ffb08669 Mon Sep 17 00:00:00 2001
From: savan
Date: Fri, 30 Oct 2020 13:34:48 -0500
Subject: [PATCH 03/70] add template for nas

---
 template.yaml | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)
 create mode 100644 template.yaml

diff --git a/template.yaml b/template.yaml
new file mode 100644
index 0000000000..65c080cdc0
--- /dev/null
+++ b/template.yaml
@@ -0,0 +1,85 @@
+entrypoint: main
+arguments:
+  parameters:
+  - name: source
+    value: https://github.com/onepanelio/nni.git
+  - name: epochs
+    value: 1
+  - name: search-method
+    value: macro
+    type: select.select
+    options:
+    - name: 'Macro'
+      value: macro
+    - name: 'Micro'
+      value: micro
+  - displayName: Node pool
+    hint: Name of node pool or group to run this workflow task
+    type: select.select
+    visibility: public
+    name: sys-node-pool
+    value: Standard_D4s_v3
+    required: true
+    options:
+    - name: 'CPU: 2, RAM: 8GB'
+      value: Standard_D2s_v3
+    - name: 'CPU: 4, RAM: 16GB'
+      value: Standard_D4s_v3
+    - name: 'GPU: 1xK80, CPU: 6, RAM: 56GB'
+      value: Standard_NC6
+    - name: 'GPU: 1xV100, CPU: 6, RAM: 56GB'
+      value: Standard_NC6s_v3
+
+volumeClaimTemplates:
+  - metadata:
+      name: data
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 2Gi
+  - metadata:
+      name: output
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 2Gi
+templates:
+  - name: main
+    dag:
+      tasks:
+      - name: train-model
+        template: pytorch
+  - name: pytorch
+    inputs:
+      artifacts:
+        - name: src
+          path: /mnt/src
+          git:
+            repo: "{{workflow.parameters.source}}"
+            revision: "dev"
+    outputs:
+      artifacts:
+        - name: model
+          path: /mnt/output
+          optional: true
+          archive:
+            none: {}
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ && \
+        pip install setuptools nni && \
+        python /mnt/src/examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}}
+      workingDir: /mnt
+      volumeMounts:
+        - name: data
+          mountPath: /mnt/data
+        - name: output
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
\ No newline at end of file

From 4e04cb36d50ed175a4cc95456ddcecc8ca56d6cd Mon Sep 17 00:00:00 2001
From: savan
Date: Fri, 30 Oct 2020 18:19:14 -0500
Subject: [PATCH 04/70] place viz logs into /mnt/output

---
 examples/nas/enas/search.py | 4 ++--
 nni/nas/pytorch/trainer.py  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py
index d229102989..1041c4b3ed 100644
--- a/examples/nas/enas/search.py
+++ b/examples/nas/enas/search.py
@@ -25,7 +25,7 @@
     parser.add_argument("--log-frequency", default=10, type=int)
     parser.add_argument("--search-for", choices=["macro", "micro"], default="macro")
     parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)")
-    parser.add_argument("--visualization", default=False, action="store_true")
+    parser.add_argument("--visualization", default=True, action="store_true")
     args = parser.parse_args()
 
     dataset_train, dataset_valid = datasets.get_dataset("cifar10")
@@ -49,7 +49,7 @@
                            metrics=accuracy,
                            reward_function=reward_accuracy,
                            optimizer=optimizer,
-                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("/mnt/output"), ModelCheckpoint("/mnt/output")],
+                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./output"), ModelCheckpoint("./output")],
                            batch_size=args.batch_size,
                            num_epochs=num_epochs,
                            dataset_train=dataset_train,

diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py
index 6a3881177a..7264c3792e 100644
--- a/nni/nas/pytorch/trainer.py
+++ b/nni/nas/pytorch/trainer.py
@@ -92,7 +92,7 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs,
         self.batch_size = batch_size
         self.workers = workers
         self.log_frequency = log_frequency
-        self.log_dir = os.path.join("logs", str(time.time()))
+        self.log_dir = os.path.join("/mnt/output", str(time.time()))
         os.makedirs(self.log_dir, exist_ok=True)
         self.status_writer = open(os.path.join(self.log_dir, "log"), "w")
         self.callbacks = callbacks if callbacks is not None else []

From 63610ed57020d689cb42c3aa939d5a038e82a0d7 Mon Sep 17 00:00:00 2001
From: savan
Date: Mon, 2 Nov 2020 17:06:36 -0600
Subject: [PATCH 05/70] update paths for tf enas

---
 examples/nas/enas-tf/search.py | 2 +-
 examples/nas/enas/search.py    | 4 ++--
 nni/nas/pytorch/trainer.py     | 2 ++
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/nas/enas-tf/search.py b/examples/nas/enas-tf/search.py
index b68daf62f3..b7d5ee93d2 100644
--- a/examples/nas/enas-tf/search.py
+++ b/examples/nas/enas-tf/search.py
@@ -5,7 +5,7 @@
 from tensorflow.keras.losses import Reduction, SparseCategoricalCrossentropy
 from tensorflow.keras.optimizers import SGD
 
-from nni.nas.tensorflow import enas
+from nni.algorithms.nas.tensorflow import enas
 
 import datasets
 from macro import GeneralNetwork

diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py
index 1041c4b3ed..380483cb1e 100644
--- a/examples/nas/enas/search.py
+++ b/examples/nas/enas/search.py
@@ -11,7 +11,7 @@
 import datasets
 from macro import GeneralNetwork
 from micro import MicroNetwork
-from nni.nas.pytorch import enas
+from nni.algorithms.nas.pytorch import enas
 from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint, ModelCheckpoint,
                                        LRSchedulerCallback)
 from utils import accuracy, reward_accuracy
@@ -49,7 +49,7 @@
                            metrics=accuracy,
                            reward_function=reward_accuracy,
                            optimizer=optimizer,
-                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./output"), ModelCheckpoint("./output")],
+                           callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("/mnt/output"), ModelCheckpoint("/mnt/output")],
                            batch_size=args.batch_size,
                            num_epochs=num_epochs,
                            dataset_train=dataset_train,

diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py
index 7264c3792e..cc740e7595 100644
--- a/nni/nas/pytorch/trainer.py
+++ b/nni/nas/pytorch/trainer.py
@@ -92,6 +92,7 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs,
         self.batch_size = batch_size
         self.workers = workers
         self.log_frequency = log_frequency
+        print("Log dir...")
         self.log_dir = os.path.join("/mnt/output", str(time.time()))
         os.makedirs(self.log_dir, exist_ok=True)
         self.status_writer = open(os.path.join(self.log_dir, "log"), "w")
@@ -184,6 +185,7 @@ def enable_visualization(self):
             break
         if sample is None:
             _logger.warning("Sample is %s.", sample)
+        _logger.info("Visualization: %s",self.log_dir)
         _logger.info("Creating graph json, writing to %s. Visualization enabled.", self.log_dir)
         with open(os.path.join(self.log_dir, "graph.json"), "w") as f:
             json.dump(self.mutator.graph(sample), f)

From c1c5a3182478177ed3957398383873ca5a9c3913 Mon Sep 17 00:00:00 2001
From: savan
Date: Mon, 2 Nov 2020 18:46:12 -0600
Subject: [PATCH 06/70] allow users to specify dataset
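
Note: the new custom_classification path is a thin wrapper around
torchvision's ImageFolder, so it assumes one subdirectory per class
label under each data directory. A minimal sketch of the expected
layout (class and file names here are hypothetical):

    train_data/
        cat/
            0001.jpg
        dog/
            0002.jpg
    test_data/
        cat/
            0101.jpg
        dog/
            0102.jpg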
---
 examples/nas/enas/datasets.py              | 18 +++++++++++++++++-
 examples/nas/enas/search.py                | 11 ++++++++---
 nni/algorithms/nas/pytorch/enas/trainer.py |  1 +
 nni/nas/pytorch/trainer.py                 |  3 +--
 4 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/examples/nas/enas/datasets.py b/examples/nas/enas/datasets.py
index 9a5128a8a9..d781ba0a8a 100644
--- a/examples/nas/enas/datasets.py
+++ b/examples/nas/enas/datasets.py
@@ -3,9 +3,23 @@
 
 from torchvision import transforms
 from torchvision.datasets import CIFAR10
+from torchvision.datasets import ImageFolder
 
+def get_custom_dataset(train_dir, valid_dir):
+    """ Load a custom classification dataset using ImageFolder.
+        The train and valid directories should contain one subdirectory per class label.
 
-def get_dataset(cls):
+    """
+    transform = transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Resize((32, 32))
+    ])
+    train_dataset = ImageFolder(root=train_dir, transform=transform)
+    valid_dataset = ImageFolder(root=valid_dir, transform=transform)
+    return train_dataset, valid_dataset
+
+
+def get_dataset(cls, train_dir=None, valid_data=None):
     MEAN = [0.49139968, 0.48215827, 0.44653124]
     STD = [0.24703233, 0.24348505, 0.26158768]
     transf = [
@@ -23,6 +37,8 @@
     if cls == "cifar10":
         dataset_train = CIFAR10(root="./data", train=True, download=True, transform=train_transform)
         dataset_valid = CIFAR10(root="./data", train=False, download=True, transform=valid_transform)
+    elif cls == "custom_classification":
+        dataset_train, dataset_valid = get_custom_dataset(train_dir, valid_data)
     else:
         raise NotImplementedError
     return dataset_train, dataset_valid

diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py
index 380483cb1e..073c96c44d 100644
--- a/examples/nas/enas/search.py
+++ b/examples/nas/enas/search.py
@@ -23,18 +23,23 @@
     parser = ArgumentParser("enas")
     parser.add_argument("--batch-size", default=128, type=int)
     parser.add_argument("--log-frequency", default=10, type=int)
+    parser.add_argument("--num-classes", default=2, type=int)
+    parser.add_argument("--dataset", default="cifar10", choices=["cifar10", "custom_classification"])
     parser.add_argument("--search-for", choices=["macro", "micro"], default="macro")
     parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)")
     parser.add_argument("--visualization", default=True, action="store_true")
+    parser.add_argument("--train-data-dir", default="/home/savan/Documents/train_data", help="train dataset for classification")
+    parser.add_argument("--valid-data-dir", default="/home/savan/Documents/test_data", help="validation dataset for classification")
     args = parser.parse_args()
 
-    dataset_train, dataset_valid = datasets.get_dataset("cifar10")
+    dataset_train, dataset_valid = datasets.get_dataset(args.dataset, train_dir=args.train_data_dir, valid_data=args.valid_data_dir)
+    print(len(dataset_train))
     if args.search_for == "macro":
-        model = GeneralNetwork()
+        model = GeneralNetwork(num_classes=args.num_classes)
         num_epochs = args.epochs or 310
         mutator = None
     elif args.search_for == "micro":
-        model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True)
+        model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, num_classes=args.num_classes, use_aux_heads=True)
         num_epochs = args.epochs or 150
         mutator = enas.EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True)
     else:

diff --git a/nni/algorithms/nas/pytorch/enas/trainer.py b/nni/algorithms/nas/pytorch/enas/trainer.py
index 5e7a966580..f67c38060a 100644
--- a/nni/algorithms/nas/pytorch/enas/trainer.py
+++ b/nni/algorithms/nas/pytorch/enas/trainer.py
@@ -100,6 +100,7 @@ def init_dataloader(self):
         indices = list(range(n_train))
         train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:-split])
         valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[-split:])
+        print("Loading dataset of size", n_train)
         self.train_loader = torch.utils.data.DataLoader(self.dataset_train,
                                                         batch_size=self.batch_size,
                                                         sampler=train_sampler,

diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py
index cc740e7595..6024a05a4a 100644
--- a/nni/nas/pytorch/trainer.py
+++ b/nni/nas/pytorch/trainer.py
@@ -92,7 +92,6 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs,
         self.batch_size = batch_size
         self.workers = workers
         self.log_frequency = log_frequency
-        print("Log dir...")
         self.log_dir = os.path.join("/mnt/output", str(time.time()))
         os.makedirs(self.log_dir, exist_ok=True)
         self.status_writer = open(os.path.join(self.log_dir, "log"), "w")
@@ -180,12 +179,12 @@ def enable_visualization(self):
         Enable visualization. Write graph and training log to folder ``logs/``.
         """
         sample = None
+        # print(len(self.train_loader))
         for x, _ in self.train_loader:
             sample = x.to(self.device)[:2]
             break
         if sample is None:
             _logger.warning("Sample is %s.", sample)
-        _logger.info("Visualization: %s",self.log_dir)
         _logger.info("Creating graph json, writing to %s. Visualization enabled.", self.log_dir)
         with open(os.path.join(self.log_dir, "graph.json"), "w") as f:
             json.dump(self.mutator.graph(sample), f)

From 812bdbbedc0d5b65520239fe9e946f7edb062e55 Mon Sep 17 00:00:00 2001
From: savan
Date: Tue, 3 Nov 2020 10:31:57 -0600
Subject: [PATCH 07/70] perform resizing before converting it into tensor

---
 examples/nas/enas/datasets.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/nas/enas/datasets.py b/examples/nas/enas/datasets.py
index d781ba0a8a..3a45d97bef 100644
--- a/examples/nas/enas/datasets.py
+++ b/examples/nas/enas/datasets.py
@@ -11,8 +11,8 @@ def get_custom_dataset(train_dir, valid_dir):
 
     """
     transform = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Resize((32, 32))
+        transforms.Resize((32, 32)),
+        transforms.ToTensor()
     ])
     train_dataset = ImageFolder(root=train_dir, transform=transform)
     valid_dataset = ImageFolder(root=valid_dir, transform=transform)

From d04ab5ec82a2d00f217d8c0f2ed7c9eaaa4812d7 Mon Sep 17 00:00:00 2001
From: savan
Date: Thu, 5 Nov 2020 17:11:13 -0600
Subject: [PATCH 08/70] add data processing script
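
Note: the script only reads <label>, <image> and <tag> nodes, so any
CVAT XML dump shaped roughly like the following should work (names
and labels below are hypothetical):

    <annotations>
      <meta>
        <label><name>cat</name></label>
        <label><name>dog</name></label>
      </meta>
      <image name="0001.jpg">
        <tag label="cat"/>
      </image>
    </annotations>

Images are moved out of --image_dir into one folder per label under
--data_dir.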
---
 examples/nas/enas/search.py |  1 -
 prepare_data.py             | 27 +++++++++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 prepare_data.py

diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py
index 073c96c44d..d4e2b03443 100644
--- a/examples/nas/enas/search.py
+++ b/examples/nas/enas/search.py
@@ -33,7 +33,6 @@
     args = parser.parse_args()
 
     dataset_train, dataset_valid = datasets.get_dataset(args.dataset, train_dir=args.train_data_dir, valid_data=args.valid_data_dir)
-    print(len(dataset_train))
     if args.search_for == "macro":
         model = GeneralNetwork(num_classes=args.num_classes)
         num_epochs = args.epochs or 310

diff --git a/prepare_data.py b/prepare_data.py
new file mode 100644
index 0000000000..33806b66ab
--- /dev/null
+++ b/prepare_data.py
@@ -0,0 +1,27 @@
+import xml.etree.ElementTree as ET
+import os
+import argparse
+
+def main(args):
+
+    tree = ET.parse(args.xml_path)
+    root = tree.getroot()
+
+    # create directories
+    for label in root.iter('label'):
+        os.makedirs(os.path.join(args.data_dir, label.find('name').text))
+
+    for img in root.iter('image'):
+        # move image
+        lbl = img.find('tag').attrib['label']
+        if lbl:
+            os.rename(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, lbl, img.attrib['name']))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--xml_path', default='/mnt/data/datasets/annotations/default.xml')
+    parser.add_argument('--data_dir', default='/mnt/data/datasets/processed_data')
+    parser.add_argument('--image_dir', default='/mnt/data/datasets/images')
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file

From da463dd758dc2ba6697341ba90fe45a3dd95f6ae Mon Sep 17 00:00:00 2001
From: savan
Date: Fri, 6 Nov 2020 19:00:35 -0600
Subject: [PATCH 09/70] add generic pytorch classifier
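
The trial is a plain argparse script, so it can be smoke-tested
outside of NNI first (paths below are placeholders):

    python3 main.py --train_dir /path/to/train --test_dir /path/to/test \
        --model_type googlenet --num_classes 2 --epochs 1

and then launched as an experiment with:

    nnictl create --config examples/trials/pytorch-classifier/config.yml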
---
 examples/trials/pytorch-classifier/config.yml      |  21 +++
 .../trials/pytorch-classifier/config_aml.yml       |  25 +++
 .../pytorch-classifier/config_assessor.yml         |  27 +++
 .../config_frameworkcontroller.yml                 |  40 ++++
 .../pytorch-classifier/config_kubeflow.yml         |  32 ++++
 .../trials/pytorch-classifier/config_pai.yml       |  35 ++++
 .../pytorch-classifier/config_windows.yml          |  21 +++
 examples/trials/pytorch-classifier/main.py         | 172 ++++++++++++++++++
 .../pytorch-classifier/requirements.txt            |   2 +
 .../pytorch-classifier/search_space.json           |   6 +
 10 files changed, 381 insertions(+)
 create mode 100644 examples/trials/pytorch-classifier/config.yml
 create mode 100644 examples/trials/pytorch-classifier/config_aml.yml
 create mode 100644 examples/trials/pytorch-classifier/config_assessor.yml
 create mode 100644 examples/trials/pytorch-classifier/config_frameworkcontroller.yml
 create mode 100644 examples/trials/pytorch-classifier/config_kubeflow.yml
 create mode 100644 examples/trials/pytorch-classifier/config_pai.yml
 create mode 100644 examples/trials/pytorch-classifier/config_windows.yml
 create mode 100644 examples/trials/pytorch-classifier/main.py
 create mode 100644 examples/trials/pytorch-classifier/requirements.txt
 create mode 100644 examples/trials/pytorch-classifier/search_space.json

diff --git a/examples/trials/pytorch-classifier/config.yml b/examples/trials/pytorch-classifier/config.yml
new file mode 100644
index 0000000000..c671dcf3dd
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config.yml
@@ -0,0 +1,21 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 10
+#choice: local, remote, pai
+trainingServicePlatform: local
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 main.py
+  codeDir: .
+  gpuNum: 0

diff --git a/examples/trials/pytorch-classifier/config_aml.yml b/examples/trials/pytorch-classifier/config_aml.yml
new file mode 100644
index 0000000000..d627e78b14
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_aml.yml
@@ -0,0 +1,25 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 10
+trainingServicePlatform: aml
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 main.py
+  codeDir: .
+  image: msranni/nni
+amlConfig:
+  subscriptionId: ${replace_to_your_subscriptionId}
+  resourceGroup: ${replace_to_your_resourceGroup}
+  workspaceName: ${replace_to_your_workspaceName}
+  computeTarget: ${replace_to_your_computeTarget}

diff --git a/examples/trials/pytorch-classifier/config_assessor.yml b/examples/trials/pytorch-classifier/config_assessor.yml
new file mode 100644
index 0000000000..3aca3ffb5d
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_assessor.yml
@@ -0,0 +1,27 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 50
+#choice: local, remote
+trainingServicePlatform: local
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+assessor:
+  #choice: Medianstop, Curvefitting
+  builtinAssessorName: Curvefitting
+  classArgs:
+    epoch_num: 20
+    threshold: 0.9
+trial:
+  command: python3 main.py
+  codeDir: .
+  gpuNum: 0

diff --git a/examples/trials/pytorch-classifier/config_frameworkcontroller.yml b/examples/trials/pytorch-classifier/config_frameworkcontroller.yml
new file mode 100644
index 0000000000..aecf6b18bf
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_frameworkcontroller.yml
@@ -0,0 +1,40 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 1h
+maxTrialNum: 10
+#choice: local, remote, pai, kubeflow
+trainingServicePlatform: frameworkcontroller
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+assessor:
+  builtinAssessorName: Medianstop
+  classArgs:
+    optimize_mode: maximize
+trial:
+  codeDir: .
+  taskRoles:
+    - name: worker
+      taskNum: 1
+      command: python3 main.py
+      gpuNum: 1
+      cpuNum: 1
+      memoryMB: 8192
+      image: msranni/nni:latest
+      frameworkAttemptCompletionPolicy:
+        minFailedTaskCount: 1
+        minSucceededTaskCount: 1
+frameworkcontrollerConfig:
+  storage: nfs
+  nfs:
+    # Your NFS server IP, like 10.10.10.10
+    server: {your_nfs_server_ip}
+    # Your NFS server export path, like /var/nfs/nni
+    path: {your_nfs_server_export_path}
\ No newline at end of file

diff --git a/examples/trials/pytorch-classifier/config_kubeflow.yml b/examples/trials/pytorch-classifier/config_kubeflow.yml
new file mode 100644
index 0000000000..5bf2804352
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_kubeflow.yml
@@ -0,0 +1,32 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 1
+#choice: local, remote, pai, kubeflow
+trainingServicePlatform: kubeflow
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  codeDir: .
+  worker:
+    replicas: 1
+    command: python3 main.py
+    gpuNum: 0
+    cpuNum: 1
+    memoryMB: 8192
+    image: msranni/nni:latest
+kubeflowConfig:
+  operator: tf-operator
+  apiVersion: v1alpha2
+  storage: nfs
+  nfs:
+    server: 10.10.10.10
+    path: /var/nfs/general
\ No newline at end of file

diff --git a/examples/trials/pytorch-classifier/config_pai.yml b/examples/trials/pytorch-classifier/config_pai.yml
new file mode 100644
index 0000000000..032525f54d
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_pai.yml
@@ -0,0 +1,35 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 10
+#choice: local, remote, pai
+trainingServicePlatform: pai
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python3 main.py
+  codeDir: .
+  gpuNum: 0
+  cpuNum: 1
+  memoryMB: 8196
+  #The docker image to run nni job on pai
+  image: msranni/nni:latest
+  nniManagerNFSMountPath: {replace_to_your_nfs_mount_path}
+  containerNFSMountPath: {replace_to_your_container_mount_path}
+  paiStorageConfigName: {replace_to_your_storage_config_name}
+paiConfig:
+  #The username to login pai
+  userName: username
+  #The token to login pai
+  token: token
+  #The host of restful server of pai
+  host: 10.10.10.10
\ No newline at end of file

diff --git a/examples/trials/pytorch-classifier/config_windows.yml b/examples/trials/pytorch-classifier/config_windows.yml
new file mode 100644
index 0000000000..3dd7325b33
--- /dev/null
+++ b/examples/trials/pytorch-classifier/config_windows.yml
@@ -0,0 +1,21 @@
+authorName: default
+experimentName: pytorch_classifier
+trialConcurrency: 1
+maxExecDuration: 10h
+maxTrialNum: 10
+#choice: local, remote, pai
+trainingServicePlatform: local
+searchSpacePath: search_space.json
+#choice: true, false
+useAnnotation: false
+tuner:
+  #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner
+  #SMAC (SMAC should be installed through nnictl)
+  builtinTunerName: TPE
+  classArgs:
+    #choice: maximize, minimize
+    optimize_mode: maximize
+trial:
+  command: python main.py
+  codeDir: .
+  gpuNum: 0

diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py
new file mode 100644
index 0000000000..1a01193de5
--- /dev/null
+++ b/examples/trials/pytorch-classifier/main.py
@@ -0,0 +1,172 @@
+""" + +import os +import argparse +import logging +import nni +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from nni.utils import merge_parameter +from torchvision.datasets import ImageFolder +import torchvision.models as models +from torchvision import datasets, transforms + +logger = logging.getLogger('pytorch_classifier') + + +# mean = 0.0 +# for images, _ in loader: +# batch_samples = images.size(0) +# images = images.view(batch_samples, images.size(1), -1) +# mean += images.mean(2).sum(0) +# mean = mean / len(loader.dataset) + +# var = 0.0 +# for images, _ in loader: +# batch_samples = images.size(0) +# images = images.view(batch_samples, images.size(1), -1) +# var += ((images - mean.unsqueeze(1))**2).sum([0,2]) +# std = torch.sqrt(var / (len(loader.dataset)*224*224)) + + +def build_model(model_type, num_classes): + if model_type == "googlenet": + model = models.googlenet(pretrained=True) + in_features = 1024 + elif model_type == "resnet50": + model = models.resnet50(pretrained=True) + in_features = 2048 + model.fc = nn.Sequential(nn.Linear(in_features, num_classes), + nn.LogSoftmax(dim=1)) + return model + + +def train_one_epoch(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + if (args['batch_num'] is not None) and batch_idx >= args['batch_num']: + break + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args['log_interval'] == 0: + logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. * batch_idx / len(train_loader), loss.item())) + + +def test(args, model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, reduction='sum').item() + # get the index of the max log-probability + pred = output.argmax(dim=1, keepdim=True) + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + accuracy = 100. 
* correct / len(test_loader.dataset) + + logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), accuracy)) + + return accuracy + + +def train(args): + use_cuda = not args['no_cuda'] and torch.cuda.is_available() + + torch.manual_seed(args['seed']) + + device = torch.device("cuda" if use_cuda else "cpu") + + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + train_loader = torch.utils.data.DataLoader( + ImageFolder(root=args['train_dir'], transform=transforms.Compose([ + transforms.ToTensor(), + # add Normlize with mean and std + ])), + batch_size=args['batch_size'], shuffle=True, **kwargs) + test_loader = torch.utils.data.DataLoader( + ImageFolder(root=args['test_dir'], transform=transforms.Compose([ + transforms.ToTensor(), + # add Normlize with mean and std + ])), + batch_size=args['batch_size'], shuffle=True, **kwargs) + + + model = build_model(args['model_type'], args['num_classes']).to(device) + optimizer = optim.SGD(model.parameters(), lr=args['lr'], + momentum=args['momentum']) + + for epoch in range(1, args['epochs'] + 1): + train_one_epoch(args, model, device, train_loader, optimizer, epoch) + test_acc = test(args, model, device, test_loader) + + # report intermediate result + nni.report_intermediate_result(test_acc) + logger.debug('test accuracy %g', test_acc) + logger.debug('Pipe send intermediate result done.') + + # report final result + nni.report_final_result(test_acc) + logger.debug('Final result is %g', test_acc) + logger.debug('Send final result done.') + + +def get_params(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch Classification Example') + parser.add_argument("--train_dir", type=str, + default='/home/savan/Documents/train_data', help="train data directory") + parser.add_argument("--test_dir", type=str, + default='/home/savan/Documents/test_data', help="test data directory") + parser.add_argument("--model_type", type=str, + default='googlenet', help="model to train") + parser.add_argument('--batch_size', type=int, default=1, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument("--batch_num", type=int, default=None) + parser.add_argument("--num_classes", type=int, default=2, metavar='N', + help='number of classes in the dataset') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--no_cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--log_interval', type=int, default=1000, metavar='N', + help='how many batches to wait before logging training status') + + + args, _ = parser.parse_known_args() + return args + + +if __name__ == '__main__': + try: + tuner_params = nni.get_next_parameter() + logger.debug(tuner_params) + params = vars(merge_parameter(get_params(), tuner_params)) + print("Current Parameters:\n") + print(params) + train(params) + except Exception as exception: + logger.exception(exception) + raise diff --git a/examples/trials/pytorch-classifier/requirements.txt b/examples/trials/pytorch-classifier/requirements.txt new file mode 100644 index 
0000000000..01f6b72556 --- /dev/null +++ b/examples/trials/pytorch-classifier/requirements.txt @@ -0,0 +1,2 @@ +torch +torchvision diff --git a/examples/trials/pytorch-classifier/search_space.json b/examples/trials/pytorch-classifier/search_space.json new file mode 100644 index 0000000000..c26cdce369 --- /dev/null +++ b/examples/trials/pytorch-classifier/search_space.json @@ -0,0 +1,6 @@ +{ + "batch_size": {"_type":"choice", "_value": [16, 32, 64, 128]}, + "hidden_size":{"_type":"choice","_value":[128, 256, 512, 1024]}, + "lr":{"_type":"choice","_value":[0.0001, 0.001, 0.01, 0.1]}, + "momentum":{"_type":"uniform","_value":[0, 1]} +} From 7688fd281d7e2b5da0b1b209eb9d774171b12718 Mon Sep 17 00:00:00 2001 From: Savan Visalpara Date: Mon, 9 Nov 2020 05:17:23 -0600 Subject: [PATCH 10/70] train model with specific parameters --- .../trials/pytorch-classifier/train_model.py | 167 ++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 examples/trials/pytorch-classifier/train_model.py diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py new file mode 100644 index 0000000000..250c1e9b9e --- /dev/null +++ b/examples/trials/pytorch-classifier/train_model.py @@ -0,0 +1,167 @@ +""" +A general purpose classification script using PyTorch. +""" + +import argparse +import logging +import nni +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision.datasets import ImageFolder +import torchvision.models as models +from torchvision import datasets, transforms + +logger = logging.getLogger('pytorch_classifier') + + +# mean = 0.0 +# for images, _ in loader: +# batch_samples = images.size(0) +# images = images.view(batch_samples, images.size(1), -1) +# mean += images.mean(2).sum(0) +# mean = mean / len(loader.dataset) + +# var = 0.0 +# for images, _ in loader: +# batch_samples = images.size(0) +# images = images.view(batch_samples, images.size(1), -1) +# var += ((images - mean.unsqueeze(1))**2).sum([0,2]) +# std = torch.sqrt(var / (len(loader.dataset)*224*224)) + + +def build_model(model_type, num_classes): + if model_type == "googlenet": + model = models.googlenet(pretrained=True) + in_features = 1024 + elif model_type == "resnet50": + model = models.resnet50(pretrained=True) + in_features = 2048 + model.fc = nn.Sequential(nn.Linear(in_features, num_classes), + nn.LogSoftmax(dim=1)) + return model + + +def train_one_epoch(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + if (args['batch_num'] is not None) and batch_idx >= args['batch_num']: + break + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args['log_interval'] == 0: + logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. 
* batch_idx / len(train_loader), loss.item())) + + +def test(args, model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + # sum up batch loss + test_loss += F.nll_loss(output, target, reduction='sum').item() + # get the index of the max log-probability + pred = output.argmax(dim=1, keepdim=True) + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + accuracy = 100. * correct / len(test_loader.dataset) + + logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), accuracy)) + + return accuracy + + +def train(args): + use_cuda = not args['no_cuda'] and torch.cuda.is_available() + + torch.manual_seed(args['seed']) + + device = torch.device("cuda" if use_cuda else "cpu") + + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + train_loader = torch.utils.data.DataLoader( + ImageFolder(root=args['train_dir'], transform=transforms.Compose([ + transforms.ToTensor(), + # add Normlize with mean and std + ])), + batch_size=args['batch_size'], shuffle=True, **kwargs) + test_loader = torch.utils.data.DataLoader( + ImageFolder(root=args['test_dir'], transform=transforms.Compose([ + transforms.ToTensor(), + # add Normlize with mean and std + ])), + batch_size=args['batch_size'], shuffle=True, **kwargs) + + + model = build_model(args['model_type'], args['num_classes']).to(device) + optimizer = optim.SGD(model.parameters(), lr=args['lr'], + momentum=args['momentum']) + + for epoch in range(1, args['epochs'] + 1): + train_one_epoch(args, model, device, train_loader, optimizer, epoch) + test_acc = test(args, model, device, test_loader) + + # report intermediate result + nni.report_intermediate_result(test_acc) + logger.debug('test accuracy %g', test_acc) + logger.debug('Pipe send intermediate result done.') + + # report final result + nni.report_final_result(test_acc) + logger.debug('Final result is %g', test_acc) + logger.debug('Send final result done.') + + +def get_params(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch Classification Example') + parser.add_argument("--train_dir", type=str, + default='/home/savan/Documents/train_data', help="train data directory") + parser.add_argument("--test_dir", type=str, + default='/home/savan/Documents/test_data', help="test data directory") + parser.add_argument("--model_type", type=str, + default='googlenet', help="model to train") + parser.add_argument('--batch_size', type=int, default=1, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument("--batch_num", type=int, default=None) + parser.add_argument("--num_classes", type=int, default=2, metavar='N', + help='number of classes in the dataset') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--no_cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--log_interval', type=int, default=1000, metavar='N', + 
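
train_model.py mirrors main.py but drops the tuner handshake
(nni.get_next_parameter / merge_parameter), so a single fixed
configuration can be trained directly (paths below are placeholders):

    python3 train_model.py --model_type googlenet --num_classes 2 \
        --lr 0.01 --momentum 0.5 --batch_size 16 --epochs 10 \
        --train_dir /path/to/train --test_dir /path/to/test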
---
 .../trials/pytorch-classifier/train_model.py | 167 ++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 examples/trials/pytorch-classifier/train_model.py

diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py
new file mode 100644
index 0000000000..250c1e9b9e
--- /dev/null
+++ b/examples/trials/pytorch-classifier/train_model.py
@@ -0,0 +1,167 @@
+"""
+A general purpose classification script using PyTorch.
+"""
+
+import argparse
+import logging
+import nni
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from torchvision.datasets import ImageFolder
+import torchvision.models as models
+from torchvision import datasets, transforms
+
+logger = logging.getLogger('pytorch_classifier')
+
+
+# mean = 0.0
+# for images, _ in loader:
+#     batch_samples = images.size(0)
+#     images = images.view(batch_samples, images.size(1), -1)
+#     mean += images.mean(2).sum(0)
+# mean = mean / len(loader.dataset)
+
+# var = 0.0
+# for images, _ in loader:
+#     batch_samples = images.size(0)
+#     images = images.view(batch_samples, images.size(1), -1)
+#     var += ((images - mean.unsqueeze(1))**2).sum([0,2])
+# std = torch.sqrt(var / (len(loader.dataset)*224*224))
+
+
+def build_model(model_type, num_classes):
+        if model_type == "googlenet":
+            model = models.googlenet(pretrained=True)
+            in_features = 1024
+        elif model_type == "resnet50":
+            model = models.resnet50(pretrained=True)
+            in_features = 2048
+        model.fc = nn.Sequential(nn.Linear(in_features, num_classes),
+                             nn.LogSoftmax(dim=1))
+        return model
+
+
+def train_one_epoch(args, model, device, train_loader, optimizer, epoch):
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        if (args['batch_num'] is not None) and batch_idx >= args['batch_num']:
+            break
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % args['log_interval'] == 0:
+            logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
+                epoch, batch_idx * len(data), len(train_loader.dataset),
+                100. * batch_idx / len(train_loader), loss.item()))
+
+
+def test(args, model, device, test_loader):
+    model.eval()
+    test_loss = 0
+    correct = 0
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            # sum up batch loss
+            test_loss += F.nll_loss(output, target, reduction='sum').item()
+            # get the index of the max log-probability
+            pred = output.argmax(dim=1, keepdim=True)
+            correct += pred.eq(target.view_as(pred)).sum().item()
+
+    test_loss /= len(test_loader.dataset)
+
+    accuracy = 100. * correct / len(test_loader.dataset)
+
+    logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
+        test_loss, correct, len(test_loader.dataset), accuracy))
+
+    return accuracy
+
+
+def train(args):
+    use_cuda = not args['no_cuda'] and torch.cuda.is_available()
+
+    torch.manual_seed(args['seed'])
+
+    device = torch.device("cuda" if use_cuda else "cpu")
+
+    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
+    train_loader = torch.utils.data.DataLoader(
+       ImageFolder(root=args['train_dir'], transform=transforms.Compose([
+       transforms.ToTensor(),
+            # add Normalize with mean and std
+        ])),
+        batch_size=args['batch_size'], shuffle=True, **kwargs)
+    test_loader = torch.utils.data.DataLoader(
+        ImageFolder(root=args['test_dir'], transform=transforms.Compose([
+       transforms.ToTensor(),
+            # add Normalize with mean and std
+        ])),
+        batch_size=args['batch_size'], shuffle=True, **kwargs)
+
+
+    model = build_model(args['model_type'], args['num_classes']).to(device)
+    optimizer = optim.SGD(model.parameters(), lr=args['lr'],
+                          momentum=args['momentum'])
+
+    for epoch in range(1, args['epochs'] + 1):
+        train_one_epoch(args, model, device, train_loader, optimizer, epoch)
+        test_acc = test(args, model, device, test_loader)
+
+        # report intermediate result
+        nni.report_intermediate_result(test_acc)
+        logger.debug('test accuracy %g', test_acc)
+        logger.debug('Pipe send intermediate result done.')
+
+    # report final result
+    nni.report_final_result(test_acc)
+    logger.debug('Final result is %g', test_acc)
+    logger.debug('Send final result done.')
+
+
+def get_params():
+    # Training settings
+    parser = argparse.ArgumentParser(description='PyTorch Classification Example')
+    parser.add_argument("--train_dir", type=str,
+                        default='/home/savan/Documents/train_data', help="train data directory")
+    parser.add_argument("--test_dir", type=str,
+                        default='/home/savan/Documents/test_data', help="test data directory")
+    parser.add_argument("--model_type", type=str,
+                        default='googlenet', help="model to train")
+    parser.add_argument('--batch_size', type=int, default=1, metavar='N',
+                        help='input batch size for training (default: 1)')
+    parser.add_argument("--batch_num", type=int, default=None)
+    parser.add_argument("--num_classes", type=int, default=2, metavar='N',
+                        help='number of classes in the dataset')
+    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
+                        help='learning rate (default: 0.01)')
+    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
+                        help='SGD momentum (default: 0.5)')
+    parser.add_argument('--epochs', type=int, default=10, metavar='N',
+                        help='number of epochs to train (default: 10)')
+    parser.add_argument('--seed', type=int, default=1, metavar='S',
+                        help='random seed (default: 1)')
+    parser.add_argument('--no_cuda', action='store_true', default=False,
+                        help='disables CUDA training')
+    parser.add_argument('--log_interval', type=int, default=1000, metavar='N',
+                        help='how many batches to wait before logging training status')
+
+    args, _ = parser.parse_known_args()
+    return args
+
+
+if __name__ == '__main__':
+    try:
+        params = get_params()
+        print("Current Parameters:\n")
+        print(params)
+        train(params)
+    except Exception as exception:
+        logger.exception(exception)
+        raise

From 4ba1420000a75dc1f854575291d944fa65eedadb Mon Sep 17 00:00:00 2001
From: Savan Visalpara
Date: Mon, 9 Nov 2020 05:28:51 -0600
Subject: [PATCH 11/70] add workflow template

---
 .../trials/pytorch-classifier/template.yml | 196 ++++++++++++++++++
 1 file changed, 196 insertions(+)
 create mode 100644 examples/trials/pytorch-classifier/template.yml

diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml
new file mode 100644
index 0000000000..6e3a45808d
--- /dev/null
+++ b/examples/trials/pytorch-classifier/template.yml
@@ -0,0 +1,196 @@
+entrypoint: main
+arguments:
+  parameters:
+  - name: source
+    value: https://github.com/onepanelio/nni.git
+  - name: cvat-annotation-path
+    value: annotation-dump/animals/11052020231652
+    displayName: Dataset path
+    hint: Path to annotated data in default object storage (i.e S3). In CVAT, this parameter will be pre-populated.
+    visibility: private
+  - name: num-classes
+    displayName: Number of classes
+    visibility: public
+    value: 2
+  - name: learning-rate
+    value: 0.01
+    displayName: Learning rate
+    visibility: public
+  - name: batch-size
+    value: 1
+    displayName: Batch size
+    visibility: public
+  - name: momentum
+    value: 0.5
+    displayName: Momentum
+    visibility: public
+  - name: model-type
+    displayName: Model type
+    visibility: public
+    value: googlenet
+    options:
+    - name: 'GoogleNet'
+      value: 'googlenet'
+    - name: 'ResNet50'
+      value: 'resnet50'
+  - name: epochs
+    value: 1
+  - name: search-method
+    value: macro
+    type: select.select
+    options:
+    - name: 'Macro'
+      value: macro
+    - name: 'Micro'
+      value: micro
+  - displayName: Node pool
+    hint: Name of node pool or group to run this workflow task
+    type: select.select
+    visibility: public
+    name: sys-node-pool
+    value: Standard_D4s_v3
+    required: true
+    options:
+    - name: 'CPU: 2, RAM: 8GB'
+      value: Standard_D2s_v3
+    - name: 'CPU: 4, RAM: 16GB'
+      value: Standard_D4s_v3
+    - name: 'GPU: 1xK80, CPU: 6, RAM: 56GB'
+      value: Standard_NC6
+    - name: 'GPU: 1xV100, CPU: 6, RAM: 56GB'
+      value: Standard_NC6s_v3
+
+volumeClaimTemplates:
+  - metadata:
+      name: data
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 2Gi
+  - metadata:
+      name: output
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 2Gi
+templates:
+  - name: main
+    dag:
+      tasks:
+      - name: nas
+        template: pytorch
+      - name: hyperparameter-tuning
+        template: hyperop
+      - name: model-training
+        template: model-param
+  - name: pytorch
+    inputs:
+      artifacts:
+      - name: data
+        path: /mnt/data/datasets/
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+    outputs:
+      artifacts:
+      - name: model
+        path: /mnt/output
+        optional: true
+        archive:
+          none: {}
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ git && \
+        python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \
+        git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
+        cd nni/ && \
+        python3 setup.py install && \
+        python3 prepare_data.py
+        python3 examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}} --num-classes {{workflow.parameters.num-classes}} \
+        --dataset custom_classification --train-data-dir /mnt/data/datasets/processed_data --valid-data-dir /mnt/data/datasets/processed_data
+      workingDir: /mnt
+      volumeMounts:
+        - name: data
+          mountPath: /mnt/data
+        - name: output
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
+  - name: hyperop
+    inputs:
+      artifacts:
+      - name: data
+        path: /mnt/data/datasets/
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+    outputs:
+      artifacts:
+      - name: model
+        path: /mnt/output
+        optional: true
+        archive:
+          none: {}
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ git && \
+        python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \
+        git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
+        cd nni/ && \
+        python3 setup.py install && \
+        python3 prepare_data.py && \
+        python3 examples/trials/pytorch-classifier/main.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \
+        --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data
+      workingDir: /mnt
+      volumeMounts:
+        - name: data
+          mountPath: /mnt/data
+        - name: output
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
+
+  - name: model-param
+    inputs:
+      artifacts:
+      - name: data
+        path: /mnt/data/datasets/
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+    outputs:
+      artifacts:
+      - name: model
+        path: /mnt/output
+        optional: true
+        archive:
+          none: {}
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ git && \
+        python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \
+        git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
+        cd nni/ && \
+        python3 setup.py install && \
+        python3 prepare_data.py && \
+        python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \
+        --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}}
+      workingDir: /mnt
+      volumeMounts:
+        - name: data
+          mountPath: /mnt/data
+        - name: output
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
\ No newline at end of file

From 6c6333d238c652108e0fee5311467516ceac0962 Mon Sep 17 00:00:00 2001
From: Savan Visalpara
Date: Mon, 9 Nov 2020 13:16:48 -0600
Subject: [PATCH 12/70] use shutil to move files between different file
 systems

---
 prepare_data.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/prepare_data.py b/prepare_data.py
index 33806b66ab..5ac01adfc4 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -1,5 +1,6 @@
 import xml.etree.ElementTree as ET
 import os
+import shutil
 import argparse
 
 def main(args):
@@ -15,7 +16,7 @@ def main(args):
         # move image
         lbl = img.find('tag').attrib['label']
         if lbl:
-            os.rename(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, lbl, img.attrib['name']))
+            shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, lbl, img.attrib['name']))
 
 
 if __name__ == '__main__':
@@ -24,4 +25,4 @@ def main(args):
     parser.add_argument('--data_dir', default='/mnt/data/datasets/processed_data')
     parser.add_argument('--image_dir', default='/mnt/data/datasets/images')
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+    main(args)

From a1e447bde4235c20d8f43b1e3e4488bd6a61a092 Mon Sep 17 00:00:00 2001
From: Savan Visalpara
Date: Mon, 9 Nov 2020 15:22:31 -0600
Subject: [PATCH 13/70] add script for model training with specific parameters

---
 .../trials/pytorch-classifier/template.yml   | 159 ++++++++++++++++--
 .../trials/pytorch-classifier/train_model.py |   1 -
 2 files changed, 148 insertions(+), 12 deletions(-)

diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml
index 6e3a45808d..6ed90c9258 100644
--- a/examples/trials/pytorch-classifier/template.yml
+++ b/examples/trials/pytorch-classifier/template.yml
@@ -8,6 +8,9 @@ arguments:
     displayName: Dataset path
     hint: Path to annotated data in default object storage (i.e S3). In CVAT, this parameter will be pre-populated.
     visibility: private
+  - name: cvat-output-path
+    value: workflow-data/output/nas/nas-model-comparison
+    visibility: private
   - name: num-classes
     displayName: Number of classes
     visibility: public
@@ -67,11 +70,67 @@ volumeClaimTemplates:
       accessModes: [ "ReadWriteOnce" ]
       resources:
        requests:
-          storage: 2Gi
+          storage: 20Gi
   - metadata:
       name: output
     spec:
      accessModes: [ "ReadWriteOnce" ]
       resources:
         requests:
-          storage: 2Gi
+          storage: 20Gi
+  - metadata:
+      name: data2
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: output2
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: data3
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: output3
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: data4
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: output4
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: data5
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
+  - metadata:
+      name: output5
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 20Gi
@@ -78,10 +137,18 @@ templates:
   - name: main
     dag:
      tasks:
-      - name: nas
+      - name: process-data
+        template: process-data
+      - name: neural-architecture-search
         template: pytorch
+        dependencies: [process-data]
       - name: hyperparameter-tuning
         template: hyperop
+        dependencies: [process-data]
-      - name: model-training
+      - name: train-model
         template: model-param
+        dependencies: [process-data]
+      - name: compare-models
+        template: compare-models
+        dependencies: [neural-architecture-search, hyperparameter-tuning, train-model]
   - name: pytorch
     inputs:
       artifacts:
@@ -91,7 +158,7 @@ templates:
       - name: data
         path: /mnt/data/datasets/
         s3:
-          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-output-path}}/{{workflow.name}}'
     outputs:
       artifacts:
       - name: model
@@ -110,8 +177,7 @@ templates:
         git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
         cd nni/ && \
         python3 setup.py install && \
-        python3 prepare_data.py
         python3 examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}} --num-classes {{workflow.parameters.num-classes}} \
         --dataset custom_classification --train-data-dir /mnt/data/datasets/processed_data --valid-data-dir /mnt/data/datasets/processed_data
       workingDir: /mnt
@@ -122,14 +188,15 @@ templates:
       nodeSelector:
         beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
+
   - name: hyperop
     inputs:
       artifacts:
       - name: data
         path: /mnt/data/datasets/
         s3:
-          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-output-path}}/{{workflow.name}}'
     outputs:
       artifacts:
       - name: model
@@ -146,14 +213,13 @@ templates:
         git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
         cd nni/ && \
         python3 setup.py install && \
-        python3 prepare_data.py && \
         python3 examples/trials/pytorch-classifier/main.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \
         --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data
       workingDir: /mnt
       volumeMounts:
-        - name: data
+        - name: data2
           mountPath: /mnt/data
-        - name: output
+        - name: output2
           mountPath: /mnt/output
       nodeSelector:
         beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
@@ -163,7 +229,7 @@ templates:
   - name: model-param
     inputs:
       artifacts:
       - name: data
         path: /mnt/data/datasets/
         s3:
-          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-output-path}}/{{workflow.name}}'
     outputs:
       artifacts:
       - name: model
@@ -186,16 +252,81 @@ templates:
         git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
         cd nni/ && \
         python3 setup.py install && \
-        python3 prepare_data.py && \
         python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \
         --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}}
       workingDir: /mnt
       volumeMounts:
-        - name: data
+        - name: data3
           mountPath: /mnt/data
-        - name: output
+        - name: output3
           mountPath: /mnt/output
       nodeSelector:
-        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
\ No newline at end of file
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
+
+  - name: compare-models
+    inputs:
+      artifacts:
+      - name: data
+        path: /mnt/data/datasets/
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+    outputs:
+      artifacts:
+      - name: model
+        path: /mnt/output
+        optional: true
+        archive:
+          none: {}
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ git && \
+        python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \
+        git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
+        cd nni/ && \
+        python3 setup.py install && \
+        python3 prepare_data.py && \
+        python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \
+        --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}}
+      workingDir: /mnt
+      volumeMounts:
+        - name: data4
+          mountPath: /mnt/data
+        - name: output4
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
+
+  - name: process-data
+    inputs:
+      artifacts:
+      - name: data
+        path: /mnt/data/datasets/
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+    outputs:
+      artifacts:
+      - name: model
+        path: /mnt/output
+        optional: true
+        s3:
+          key: '{{workflow.namespace}}/{{workflow.parameters.cvat-output-path}}/{{workflow.name}}'
+    container:
+      image: pytorch/pytorch:latest
+      command: [sh,-c]
+      args:
+      - |
+        apt-get update && \
+        apt-get install -y gcc g++ git && \
+        python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \
+        git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \
+        cd nni/ && \
+        python3 prepare_data.py --data_dir=/mnt/output/processed_data
+      workingDir: /mnt
+      volumeMounts:
+        - name: data5
+          mountPath: /mnt/data
+        - name: output5
+          mountPath: /mnt/output
+      nodeSelector:
+        beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}'
\ No newline at end of file

diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py
index 250c1e9b9e..efa9403427 100644
--- a/examples/trials/pytorch-classifier/train_model.py
+++ b/examples/trials/pytorch-classifier/train_model.py
@@ -75,7 +75,6 @@ def test(args, model, device, test_loader):
             correct += pred.eq(target.view_as(pred)).sum().item()
 
     test_loss /= len(test_loader.dataset)
-
     accuracy = 100. * correct / len(test_loader.dataset)
 
     logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(

From 4609a4c2b756e2210444ce65930147d0ff01ceaa Mon Sep 17 00:00:00 2001
From: Savan Visalpara
Date: Mon, 9 Nov 2020 15:40:20 -0600
Subject: [PATCH 14/70] split dataset into train and test set

---
 prepare_data.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/prepare_data.py b/prepare_data.py
index 5ac01adfc4..9006914470 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -2,6 +2,7 @@
 import os
 import shutil
 import argparse
+import random
 
 def main(args):
 
@@ -10,13 +11,17 @@ def main(args):
 
     # create directories
     for label in root.iter('label'):
-        os.makedirs(os.path.join(args.data_dir, label.find('name').text))
+        os.makedirs(os.path.join(args.data_dir, 'train', label.find('name').text))
+        os.makedirs(os.path.join(args.data_dir, 'test', label.find('name').text))
 
     for img in root.iter('image'):
         # move image
         lbl = img.find('tag').attrib['label']
         if lbl:
-            shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, lbl, img.attrib['name']))
+            if random.randrange(100) < args.test_split:
+                shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'test', lbl, img.attrib['name']))
+            else:
+                shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'train', lbl, img.attrib['name']))
 
 
 if __name__ == '__main__':
@@ -24,5 +29,6 @@ def main(args):
     parser.add_argument('--xml_path', default='/mnt/data/datasets/annotations/default.xml')
     parser.add_argument('--data_dir', default='/mnt/data/datasets/processed_data')
     parser.add_argument('--image_dir', default='/mnt/data/datasets/images')
+    parser.add_argument('--test_split', default=20, type=int)
     args = parser.parse_args()
     main(args)

From 3e05a4b46ca5344b3d1008ec1ba69f7b7a482280 Mon Sep 17 00:00:00 2001
From: Savan Visalpara
Date: Mon, 9 Nov 2020 16:43:32 -0600
Subject: [PATCH 15/70] update logic for dataset split

---
 prepare_data.py | 7 +++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/prepare_data.py b/prepare_data.py
index 9006914470..6697653ced 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -13,13 +13,16 @@ def main(args):
     for label in root.iter('label'):
         os.makedirs(os.path.join(args.data_dir, 'train', label.find('name').text))
os.makedirs(os.path.join(args.data_dir, 'test', label.find('name').text)) - + images_len = len(list(root.iter('tag'))) + test_len = (images_len * args.test_split )// 100 + count = 0 for img in root.iter('image'): #move image lbl = img.find('tag').attrib['label'] if lbl: - if random.randrange(100) < args.test_split: + if bool(random.getrandbits(1)) and count <= test_len : shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'test', lbl, img.attrib['name'])) + count += 1 else: shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'train', lbl, img.attrib['name'])) From d96cd131fd18a93a9b9293161405b8dc9bafd19b Mon Sep 17 00:00:00 2001 From: Savan Visalpara Date: Mon, 9 Nov 2020 17:03:53 -0600 Subject: [PATCH 16/70] add support for vgg and alexnet --- examples/trials/pytorch-classifier/main.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 1a01193de5..702f52b7a4 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -40,7 +40,17 @@ def build_model(model_type, num_classes): elif model_type == "resnet50": model = models.resnet50(pretrained=True) in_features = 2048 - model.fc = nn.Sequential(nn.Linear(in_features, num_classes), + elif model_type == "alexnet": + model = models.alexnet(pretrained=True) + in_features = 4096 + elif model_type == "vgg19": + model = models.alexnet(pretrained=True) + in_features = 4096 + if model_type in ['alexnet', 'vgg19']: + model.classifier._modules['6'] = nn.Sequential(nn.Linear(in_features, num_classes), + nn.LogSoftmax(dim=1)) + else: + model.fc = nn.Sequential(nn.Linear(in_features, num_classes), nn.LogSoftmax(dim=1)) return model From 1a99b518f80f153e69e8f265f0d10f1c04940767 Mon Sep 17 00:00:00 2001 From: Savan Visalpara Date: Mon, 9 Nov 2020 17:54:36 -0600 Subject: [PATCH 17/70] resolve indentation and subscription issue --- examples/trials/pytorch-classifier/main.py | 12 ++++++------ examples/trials/pytorch-classifier/train_model.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 702f52b7a4..a08eac344a 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -34,12 +34,12 @@ def build_model(model_type, num_classes): - if model_type == "googlenet": - model = models.googlenet(pretrained=True) - in_features = 1024 - elif model_type == "resnet50": - model = models.resnet50(pretrained=True) - in_features = 2048 + if model_type == "googlenet": + model = models.googlenet(pretrained=True) + in_features = 1024 + elif model_type == "resnet50": + model = models.resnet50(pretrained=True) + in_features = 2048 elif model_type == "alexnet": model = models.alexnet(pretrained=True) in_features = 4096 diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index efa9403427..bc99af1072 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -157,7 +157,7 @@ def get_params(): if __name__ == '__main__': try: - params = get_params() + params = vars(get_params()) print("Current Parameters:\n") print(params) train(params) From fb23a16f58f9a32d2abe9ac090224fc35bb0fe08 Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 9 Nov 2020 19:59:10 -0600 Subject: 
From fb23a16f58f9a32d2abe9ac090224fc35bb0fe08 Mon Sep 17 00:00:00 2001
From: savan
Date: Mon, 9 Nov 2020 19:59:10 -0600
Subject: [PATCH 18/70] add alexnet and vgg support for specific param training

---
 examples/trials/pytorch-classifier/main.py        |  8 ++---
 .../trials/pytorch-classifier/train_model.py      | 32 ++++++++++++-------
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py
index a08eac344a..de2d591467 100644
--- a/examples/trials/pytorch-classifier/main.py
+++ b/examples/trials/pytorch-classifier/main.py
@@ -50,9 +50,9 @@ def build_model(model_type, num_classes):
         model.classifier._modules['6'] = nn.Sequential(nn.Linear(in_features, num_classes),
                                 nn.LogSoftmax(dim=1))
     else:
-        model.fc = nn.Sequential(nn.Linear(in_features, num_classes),
+        model.fc = nn.Sequential(nn.Linear(in_features, num_classes), 
                          nn.LogSoftmax(dim=1))
-    return model
+    return model 
@@ -105,8 +105,8 @@ def train(args):
     kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
     train_loader = torch.utils.data.DataLoader(
-        ImageFolder(root=args['train_dir'], transform=transforms.Compose([
-            transforms.ToTensor(),
+        ImageFolder(root=args['train_dir'], transform=transforms.Compose([ 
+            transforms.ToTensor(), 
             # add Normlize with mean and std
         ])),
         batch_size=args['batch_size'], shuffle=True, **kwargs)
diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py
index bc99af1072..9edf6ebdf4 100644
--- a/examples/trials/pytorch-classifier/train_model.py
+++ b/examples/trials/pytorch-classifier/train_model.py
@@ -32,15 +32,25 @@ def build_model(model_type, num_classes):
-    if model_type == "googlenet":
-        model = models.googlenet(pretrained=True)
-        in_features = 1024
-    elif model_type == "resnet50":
-        model = models.resnet50(pretrained=True)
-        in_features = 2048
-        model.fc = nn.Sequential(nn.Linear(in_features, num_classes),
+    if model_type == "googlenet":
+        model = models.googlenet(pretrained=True)
+        in_features = 1024
+    elif model_type == "resnet50":
+        model = models.resnet50(pretrained=True)
+        in_features = 2048
+    elif model_type == "alexnet":
+        model = models.alexnet(pretrained=True)
+        in_features = 4096
+    elif model_type == "vgg19":
+        model = models.vgg19(pretrained=True)
+        in_features = 4096
+    if model_type in ['alexnet', 'vgg19']:
+        model.classifier._modules['6'] = nn.Sequential(nn.Linear(in_features, num_classes),
+                                nn.LogSoftmax(dim=1))
+    else:
+        model.fc = nn.Sequential(nn.Linear(in_features, num_classes),
                          nn.LogSoftmax(dim=1))
-    return model
+    return model 
@@ -92,14 +102,14 @@ def train(args):
     kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
     train_loader = torch.utils.data.DataLoader(
-        ImageFolder(root=args['train_dir'], transform=transforms.Compose([
-            transforms.ToTensor(),
+        ImageFolder(root=args['train_dir'], transform=transforms.Compose([ 
+            transforms.ToTensor(), 
             # add Normlize with mean and std
         ])),
         batch_size=args['batch_size'], shuffle=True, **kwargs)
     test_loader = torch.utils.data.DataLoader(
         ImageFolder(root=args['test_dir'], transform=transforms.Compose([
-            transforms.ToTensor(),
+            transforms.ToTensor(), 
             # add Normlize with mean and std
         ])),
         batch_size=args['batch_size'], shuffle=True, **kwargs)

From 00ba638f382b716ef88dec4fe9dae915dedd1b05 Mon Sep 17 00:00:00 2001
From: savan
Date: Tue, 10 Nov 2020 16:40:17 -0600
Subject: [PATCH 19/70] changes to persist metrics

---
 examples/nas/enas/search.py | 6 ++--
examples/trials/pytorch-classifier/main.py | 9 ++--- .../trials/pytorch-classifier/template.yml | 12 +++++-- .../trials/pytorch-classifier/train_model.py | 33 ++++++++++++------- nni/algorithms/nas/pytorch/enas/trainer.py | 1 + nni/nas/pytorch/trainer.py | 2 +- 6 files changed, 42 insertions(+), 21 deletions(-) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index d4e2b03443..c1ac75cdff 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -2,9 +2,8 @@ # Licensed under the MIT license. import logging -import time from argparse import ArgumentParser - +import json import torch import torch.nn as nn @@ -63,3 +62,6 @@ if args.visualization: trainer.enable_visualization() trainer.train() + metrics = [{'name':'accuracy', 'value':trainer.val_model_summary['acc1'].avg}, {'name':'loss', 'value':trainer.val_model_summary['loss'].avg}] + with open('/tmp/sys-metrics.json', 'w') as f: + json.dump(metrics, f) \ No newline at end of file diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index de2d591467..46c32bbb0c 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -93,7 +93,7 @@ def test(args, model, device, test_loader): logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( test_loss, correct, len(test_loader.dataset), accuracy)) - return accuracy + return accuracy, test_loss def train(args): @@ -124,7 +124,7 @@ def train(args): for epoch in range(1, args['epochs'] + 1): train_one_epoch(args, model, device, train_loader, optimizer, epoch) - test_acc = test(args, model, device, test_loader) + test_acc, test_loss = test(args, model, device, test_loader) # report intermediate result nni.report_intermediate_result(test_acc) @@ -133,7 +133,8 @@ def train(args): # report final result nni.report_final_result(test_acc) - logger.debug('Final result is %g', test_acc) + print(test_acc, test_loss) + logger.debug('Final result is %g and loss is %g', test_acc, test_loss) logger.debug('Send final result done.') @@ -145,7 +146,7 @@ def get_params(): parser.add_argument("--test_dir", type=str, default='/home/savan/Documents/test_data', help="test data directory") parser.add_argument("--model_type", type=str, - default='googlenet', help="model to train") + default='alexnet', help="model to train") parser.add_argument('--batch_size', type=int, default=1, metavar='N', help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index 6ed90c9258..9b5c32946d 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -30,7 +30,7 @@ arguments: - name: model-type displayName: Model type visibility: public - value: googlenet + value: alexnet options: - name: 'GoogleNet' value: 'googlenet' @@ -152,6 +152,14 @@ templates: - name: compare-models template: compare-models dependencies: [neural-architecture-search, hyperparameter-tuning, train-model] + arguments: + artifacts: + - name: nas-metrics + from: "{{tasks.neural-architecture-search.outputs.artifacts.sys-metrics}}" + - name: hyperop-metrics + from: "{{tasks.hyperparameter-tuning.outputs.artifacts.sys-metrics}}" + - name: singlemodel-metrics + from: "{{tasks.model-param.outputs.artifacts.sys-metrics}}" - name: pytorch inputs: artifacts: @@ -249,7 +257,7 @@ templates: git clone 
--single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 setup.py install && \ - python3 examples/trials/pytorch-classifier/train_main.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ + python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}} workingDir: /mnt volumeMounts: diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 9edf6ebdf4..61e08e099a 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -4,8 +4,9 @@ import argparse import logging -import nni +import json import torch +import os import torch.nn as nn import torch.nn.functional as F import torch.optim as optim @@ -90,7 +91,7 @@ def test(args, model, device, test_loader): logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( test_loss, correct, len(test_loader.dataset), accuracy)) - return accuracy + return accuracy, test_loss def train(args): @@ -118,20 +119,20 @@ def train(args): model = build_model(args['model_type'], args['num_classes']).to(device) optimizer = optim.SGD(model.parameters(), lr=args['lr'], momentum=args['momentum']) + + if not os.path.exists('/mnt/output/fixed-params'): + os.makedirs('/mnt/output/fixed-params') for epoch in range(1, args['epochs'] + 1): train_one_epoch(args, model, device, train_loader, optimizer, epoch) - test_acc = test(args, model, device, test_loader) - + test_acc, test_loss = test(args, model, device, test_loader) + torch.save(model, '/mnt/output/fixed-params/fixed-params-model-epochs-{}-acc-{}'.format(epoch, round(test_acc, 2))) # report intermediate result - nni.report_intermediate_result(test_acc) - logger.debug('test accuracy %g', test_acc) - logger.debug('Pipe send intermediate result done.') + print('test accuracy: {} test loss: {}'.format(test_acc, test_loss)) # report final result - nni.report_final_result(test_acc) - logger.debug('Final result is %g', test_acc) - logger.debug('Send final result done.') + print('Final result is ', test_acc) + return test_acc, test_loss def get_params(): @@ -142,7 +143,7 @@ def get_params(): parser.add_argument("--test_dir", type=str, default='/home/savan/Documents/test_data', help="test data directory") parser.add_argument("--model_type", type=str, - default='googlenet', help="model to train") + default='alexnet', help="model to train") parser.add_argument('--batch_size', type=int, default=1, metavar='N', help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) @@ -170,7 +171,15 @@ def get_params(): params = vars(get_params()) print("Current Parameters:\n") print(params) - train(params) + acc, loss = train(params) + metrics = [ + {'name': 'accuracy', 'value': acc}, + {'name': 'loss', 'value': loss}, + ] + + # Write metrics to `/tmp/sys-metrics.json` + with open('/tmp/sys-metrics.json', 'w') as f: + json.dump(metrics, f) except Exception as exception: logger.exception(exception) raise diff --git a/nni/algorithms/nas/pytorch/enas/trainer.py 
b/nni/algorithms/nas/pytorch/enas/trainer.py index f67c38060a..33147b6174 100644 --- a/nni/algorithms/nas/pytorch/enas/trainer.py +++ b/nni/algorithms/nas/pytorch/enas/trainer.py @@ -208,3 +208,4 @@ def validate_one_epoch(self, epoch): logger.info("Test Epoch [%d/%d] Arc [%d/%d] Summary %s", epoch + 1, self.num_epochs, arc_id + 1, self.test_arc_per_epoch, meters.summary()) + return meters \ No newline at end of file diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py index 6024a05a4a..03a093328a 100644 --- a/nni/nas/pytorch/trainer.py +++ b/nni/nas/pytorch/trainer.py @@ -144,7 +144,7 @@ def train(self, validate=True): if validate: # validation _logger.info("Epoch %d Validating", epoch + 1) - self.validate_one_epoch(epoch) + self.val_model_summary = self.validate_one_epoch(epoch) for callback in self.callbacks: callback.on_epoch_end(epoch) From fd00e11e66dad037a1c017eef67ab4c0b9b41ba8 Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 10 Nov 2020 17:33:16 -0600 Subject: [PATCH 20/70] add model comparison script --- compare.py | 18 ++++++++++++++++++ examples/trials/pytorch-classifier/config.yml | 2 +- examples/trials/pytorch-classifier/main.py | 9 +++++++-- .../trials/pytorch-classifier/requirements.txt | 2 +- .../pytorch-classifier/search_space.json | 1 - 5 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 compare.py diff --git a/compare.py b/compare.py new file mode 100644 index 0000000000..253151583c --- /dev/null +++ b/compare.py @@ -0,0 +1,18 @@ +import json + +accuracies = {} +with open('/tmp/nas-metrics.json') as f: + nas = json.load(f) + +with open('/tmp/hyperop-metrics.json') as f: + hyper = json.load(f) + +with open('/tmp/singlemodel-metrics.json') as f: + fm = json.load(f) + +accuracies['nas_acc'] = [i['value'] for i in nas if i['name'] == 'accuracy'][0] +accuracies['hyper_acc'] = [i['value'] for i in hyper if i['name'] == 'accuracy'][0] +accuracies['fm_acc'] = [i['value'] for i in fm if i['name'] == 'accuracy'][0] + +max_acc_name = max(accuracies, key=accuracies.get) +print("Maximum accuracy was {} for {}".format(max(accuracies.values()), max_acc_name)) \ No newline at end of file diff --git a/examples/trials/pytorch-classifier/config.yml b/examples/trials/pytorch-classifier/config.yml index c671dcf3dd..d58b9e133d 100644 --- a/examples/trials/pytorch-classifier/config.yml +++ b/examples/trials/pytorch-classifier/config.yml @@ -2,7 +2,7 @@ authorName: default experimentName: pytorch_classifier trialConcurrency: 1 maxExecDuration: 10h -maxTrialNum: 10 +maxTrialNum: 15 #choice: local, remote, pai trainingServicePlatform: local searchSpacePath: search_space.json diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 46c32bbb0c..896fd9900c 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -103,16 +103,21 @@ def train(args): device = torch.device("cuda" if use_cuda else "cpu") + if args['model_type'] == 'alexnet': + w, h = 256, 256 + else: + w, h = 224, 224 + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch.utils.data.DataLoader( ImageFolder(root=args['train_dir'], transform=transforms.Compose([ - transforms.ToTensor(), + transforms.Resize((w, h)), transforms.ToTensor(), # add Normlize with mean and std ])), batch_size=args['batch_size'], shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader( ImageFolder(root=args['test_dir'], transform=transforms.Compose([ - transforms.ToTensor(), + 
transforms.Resize((w, h)), transforms.ToTensor(), # add Normlize with mean and std ])), batch_size=args['batch_size'], shuffle=True, **kwargs) diff --git a/examples/trials/pytorch-classifier/requirements.txt b/examples/trials/pytorch-classifier/requirements.txt index 01f6b72556..e7ccd30e3d 100644 --- a/examples/trials/pytorch-classifier/requirements.txt +++ b/examples/trials/pytorch-classifier/requirements.txt @@ -1,2 +1,2 @@ torch -torchvision +torchvision \ No newline at end of file diff --git a/examples/trials/pytorch-classifier/search_space.json b/examples/trials/pytorch-classifier/search_space.json index c26cdce369..978497f8fa 100644 --- a/examples/trials/pytorch-classifier/search_space.json +++ b/examples/trials/pytorch-classifier/search_space.json @@ -1,6 +1,5 @@ { "batch_size": {"_type":"choice", "_value": [16, 32, 64, 128]}, - "hidden_size":{"_type":"choice","_value":[128, 256, 512, 1024]}, "lr":{"_type":"choice","_value":[0.0001, 0.001, 0.01, 0.1]}, "momentum":{"_type":"uniform","_value":[0, 1]} } From d76ac418e97b2bad09b9d77b536a589f8e8822ee Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 10 Nov 2020 17:37:16 -0600 Subject: [PATCH 21/70] update template --- examples/trials/pytorch-classifier/template.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index 9b5c32946d..a37b43d7d5 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -276,6 +276,12 @@ templates: path: /mnt/data/datasets/ s3: key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}' + - name: nas-metrics + value: /tmp/nas-metrics.json + - name: hyperop-metrics + value: /tmp/hyperop-metrics.json + - name: singlemodel-metrics + value: /tmp/singlemodel-metrics.json outputs: artifacts: - name: model From cf5a6cd4f252d4fd182c927d0e6dd3d89f9bc946 Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 10 Nov 2020 17:39:47 -0600 Subject: [PATCH 22/70] correct typos in template --- examples/trials/pytorch-classifier/template.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index a37b43d7d5..e9afa5de53 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -159,7 +159,7 @@ templates: - name: hyperop-metrics from: "{{tasks.hyperparameter-tuning.outputs.artifacts.sys-metrics}}" - name: singlemodel-metrics - from: "{{tasks.model-param.outputs.artifacts.sys-metrics}}" + from: "{{tasks.train-model.outputs.artifacts.sys-metrics}}" - name: pytorch inputs: artifacts: @@ -277,11 +277,11 @@ templates: s3: key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}' - name: nas-metrics - value: /tmp/nas-metrics.json + path: /tmp/nas-metrics.json - name: hyperop-metrics - value: /tmp/hyperop-metrics.json + path: /tmp/hyperop-metrics.json - name: singlemodel-metrics - value: /tmp/singlemodel-metrics.json + path: /tmp/singlemodel-metrics.json outputs: artifacts: - name: model From c9a39042ad38253c99ae8705dc043d5a1c33f61b Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 10 Nov 2020 18:14:14 -0600 Subject: [PATCH 23/70] update path for processed data --- examples/trials/pytorch-classifier/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 
896fd9900c..9a5b2eff21 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -147,9 +147,9 @@ def get_params(): # Training settings parser = argparse.ArgumentParser(description='PyTorch Classification Example') parser.add_argument("--train_dir", type=str, - default='/home/savan/Documents/train_data', help="train data directory") + default='/mnt/data/datasets/processed_data/train', help="train data directory") parser.add_argument("--test_dir", type=str, - default='/home/savan/Documents/test_data', help="test data directory") + default='/mnt/data/datasets/processed_data/test', help="test data directory") parser.add_argument("--model_type", type=str, default='alexnet', help="model to train") parser.add_argument('--batch_size', type=int, default=1, metavar='N', From c628e5664de933726a02fff6fe25be3bd0bf630f Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 10 Nov 2020 19:13:27 -0600 Subject: [PATCH 24/70] handle case when loss is NaN --- examples/trials/pytorch-classifier/train_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 61e08e099a..46ae63e03b 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -172,6 +172,8 @@ def get_params(): print("Current Parameters:\n") print(params) acc, loss = train(params) + if loss is None: + loss = 0 metrics = [ {'name': 'accuracy', 'value': acc}, {'name': 'loss', 'value': loss}, From 72f491dde1ac152966596c8fd613dbc1ae16ddfb Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 11 Nov 2020 16:33:16 -0600 Subject: [PATCH 25/70] store model after every epoch --- examples/trials/pytorch-classifier/main.py | 5 +++++ examples/trials/pytorch-classifier/train_model.py | 9 +++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 9a5b2eff21..b3ccacaaab 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -126,12 +126,17 @@ def train(args): model = build_model(args['model_type'], args['num_classes']).to(device) optimizer = optim.SGD(model.parameters(), lr=args['lr'], momentum=args['momentum']) + + if not os.path.exists('/mnt/output/hyper-params'): + os.makedirs('/mnt/output/hyper-params') for epoch in range(1, args['epochs'] + 1): train_one_epoch(args, model, device, train_loader, optimizer, epoch) test_acc, test_loss = test(args, model, device, test_loader) + torch.save(model, '/mnt/output/hyper-params/hyper-params-model-epochs-{}-acc-{}'.format(epoch, round(test_acc, 2))) # report intermediate result + print('test accuracy: {} test loss: {}'.format(test_acc, test_loss)) nni.report_intermediate_result(test_acc) logger.debug('test accuracy %g', test_acc) logger.debug('Pipe send intermediate result done.') diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 46ae63e03b..a6fad08c1e 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -101,16 +101,21 @@ def train(args): device = torch.device("cuda" if use_cuda else "cpu") + if args['model_type'] == 'alexnet': + w, h = 256, 256 + else: + w, h = 224, 224 + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} train_loader = torch.utils.data.DataLoader( 
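# NOTE (a sketch, not part of this patch series): the Compose pipelines built
# just below still carry the "# add Normlize with mean and std" TODO. For the
# ImageNet-pretrained torchvision backbones this script loads, the usual
# choice would be the standard ImageNet statistics, e.g.:
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# added after transforms.ToTensor() in both the train and test pipelines.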
ImageFolder(root=args['train_dir'], transform=transforms.Compose([ - transforms.ToTensor(), + transforms.Resize((w, h)),transforms.ToTensor(), # add Normlize with mean and std ])), batch_size=args['batch_size'], shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader( ImageFolder(root=args['test_dir'], transform=transforms.Compose([ - transforms.ToTensor(), + transforms.Resize((w, h)),transforms.ToTensor(), # add Normlize with mean and std ])), batch_size=args['batch_size'], shuffle=True, **kwargs) From d899e4cee104e6d330d7d47b3554930a9a09fd43 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 11 Nov 2020 17:48:03 -0600 Subject: [PATCH 26/70] change nas log directory for visualization --- nni/nas/pytorch/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py index 03a093328a..08c1384bf3 100644 --- a/nni/nas/pytorch/trainer.py +++ b/nni/nas/pytorch/trainer.py @@ -92,7 +92,7 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs, self.batch_size = batch_size self.workers = workers self.log_frequency = log_frequency - self.log_dir = os.path.join("/mnt/output", str(time.time())) + self.log_dir = "/mnt/output/naslogs" os.makedirs(self.log_dir, exist_ok=True) self.status_writer = open(os.path.join(self.log_dir, "log"), "w") self.callbacks = callbacks if callbacks is not None else [] From f085f9eac093e7e5e60b6bfa0fffbe11759568a7 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 11 Nov 2020 18:31:46 -0600 Subject: [PATCH 27/70] get best parameter for hyper param tuning --- examples/trials/pytorch-classifier/main.py | 3 ++- nni/trial.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index b3ccacaaab..126cd5fbc6 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -143,7 +143,8 @@ def train(args): # report final result nni.report_final_result(test_acc) - print(test_acc, test_loss) + best_params = nni.get_best_params() + print("Best param and score: ", best_params) logger.debug('Final result is %g and loss is %g', test_acc, test_loss) logger.debug('Send final result done.') diff --git a/nni/trial.py b/nni/trial.py index cdb2b1e683..07fc5cd228 100644 --- a/nni/trial.py +++ b/nni/trial.py @@ -22,6 +22,8 @@ _trial_id = platform.get_trial_id() _sequence_id = platform.get_sequence_id() +#keep track of highest accuracy +_best_score = {'params':None, 'score':0} def get_next_parameter(): """ @@ -139,3 +141,14 @@ def report_final_result(metric): 'value': to_json(metric) }) platform.send_metric(metric) + update_score(metric) + +def update_score(score): + global _best_score + if score > _best_score['score']: + _best_score['score'] = score + _best_score['params'] = get_current_parameter() + +def get_best_params(): + global _best_score + return _best_score \ No newline at end of file From f4ac73548e4a84d1ef07f26254706410510d53c5 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 12 Nov 2020 14:06:07 -0600 Subject: [PATCH 28/70] update node package version --- nni/trial.py | 4 +++- setup_ts.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nni/trial.py b/nni/trial.py index 07fc5cd228..66d690b840 100644 --- a/nni/trial.py +++ b/nni/trial.py @@ -133,6 +133,7 @@ def report_final_result(metric): """ assert _params or trial_env_vars.NNI_PLATFORM is None, \ 'nni.get_next_parameter() needs to be called before 
report_final_result' + update_score(metric) metric = to_json({ 'parameter_id': _params['parameter_id'] if _params else None, 'trial_job_id': trial_env_vars.NNI_TRIAL_JOB_ID, @@ -141,7 +142,7 @@ def report_final_result(metric): 'value': to_json(metric) }) platform.send_metric(metric) - update_score(metric) + def update_score(score): global _best_score @@ -151,4 +152,5 @@ def update_score(score): def get_best_params(): global _best_score + print("Best Score", _best_score) return _best_score \ No newline at end of file diff --git a/setup_ts.py b/setup_ts.py index 5872ce7bbd..5eb3cba7f0 100644 --- a/setup_ts.py +++ b/setup_ts.py @@ -22,7 +22,7 @@ from zipfile import ZipFile -node_version = 'v10.22.1' +node_version = 'v14.15.0' yarn_version = 'v1.22.10' @@ -59,7 +59,7 @@ def clean(clean_all=False): if sys.platform == 'linux' or sys.platform == 'darwin': node_executable = 'node' node_spec = f'node-{node_version}-{sys.platform}-x64' - node_download_url = f'https://nodejs.org/dist/latest-v10.x/{node_spec}.tar.xz' + node_download_url = f'https://nodejs.org/dist/{node_version}/{node_spec}.tar.xz' node_extractor = lambda data: tarfile.open(fileobj=BytesIO(data), mode='r:xz') node_executable_in_tarball = 'bin/node' From 918f773ac6eee284a282a2518b21fcb50dd32c69 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 12 Nov 2020 16:42:20 -0600 Subject: [PATCH 29/70] fixed issue with metrics dumping of tuner --- examples/trials/pytorch-classifier/main.py | 6 +++ .../trials/pytorch-classifier/train_model.py | 12 +++++- nni/trial.py | 41 ++++++++++++++----- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 126cd5fbc6..a836e97f2a 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -6,6 +6,7 @@ import argparse import logging import nni +import json import torch import torch.nn as nn import torch.nn.functional as F @@ -145,6 +146,11 @@ def train(args): nni.report_final_result(test_acc) best_params = nni.get_best_params() print("Best param and score: ", best_params) + metrics = [ + {'name': 'accuracy', 'value': best_params['score']}, + ] + with open('/tmp/sys-metrics.json', 'w') as f: + json.dump(metrics, f) logger.debug('Final result is %g and loss is %g', test_acc, test_loss) logger.debug('Send final result done.') diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index a6fad08c1e..56b85ba104 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -13,7 +13,8 @@ from torchvision.datasets import ImageFolder import torchvision.models as models from torchvision import datasets, transforms - +from torch.utils.tensorboard import SummaryWriter +writer = SummaryWriter("/mnt/output/fixed_param__tb") logger = logging.getLogger('pytorch_classifier') @@ -63,6 +64,11 @@ def train_one_epoch(args, model, device, train_loader, optimizer, epoch): optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) + pred = output.argmax(dim=1, keepdim=True) + correct += pred.eq(target.view_as(pred)).sum().item() + accuracy = 100. 
* correct / len(train_loader.dataset)
+        writer.add_scalar("Loss/train", loss, batch_idx + (epoch * 10))
+        writer.add_scalar("Accuracy/train", accuracy, batch_idx + (epoch * 10))
         loss.backward()
         optimizer.step()
         if batch_idx % args['log_interval'] == 0:
@@ -105,7 +111,7 @@ def train(args):
         w, h = 256, 256
     else:
         w, h = 224, 224
-    
+
     kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
     train_loader = torch.utils.data.DataLoader(
         ImageFolder(root=args['train_dir'], transform=transforms.Compose([
@@ -131,6 +137,8 @@ def train(args):
     for epoch in range(1, args['epochs'] + 1):
         train_one_epoch(args, model, device, train_loader, optimizer, epoch)
         test_acc, test_loss = test(args, model, device, test_loader)
+        writer.add_scalar("Loss/test", test_loss, epoch )
+        writer.add_scalar("Accuracy/test", test_acc, epoch )
         torch.save(model, '/mnt/output/fixed-params/fixed-params-model-epochs-{}-acc-{}'.format(epoch, round(test_acc, 2)))
         # report intermediate result
         print('test accuracy: {} test loss: {}'.format(test_acc, test_loss))
diff --git a/nni/trial.py b/nni/trial.py
index 66d690b840..0d24274899 100644
--- a/nni/trial.py
+++ b/nni/trial.py
@@ -4,7 +4,7 @@
 from .utils import to_json
 from .runtime.env_vars import trial_env_vars
 from .runtime import platform
-
+import os, json
 __all__ = [
     'get_next_parameter',
@@ -13,7 +13,8 @@
     'report_final_result',
     'get_experiment_id',
     'get_trial_id',
-    'get_sequence_id'
+    'get_sequence_id',
+    'get_best_params'
 ]
@@ -23,7 +24,8 @@
 _sequence_id = platform.get_sequence_id()

 #keep track of highest accuracy
-_best_score = {'params':None, 'score':0}
+#_best_params = os.getenv('_BEST_PARAMS', None)
+#_best_score = os.getenv('_BEST_SCORE', 0)

 def get_next_parameter():
     """
@@ -144,13 +146,30 @@ def report_final_result(metric):
     platform.send_metric(metric)

-def update_score(score):
-    global _best_score
-    if score > _best_score['score']:
-        _best_score['score'] = score
-        _best_score['params'] = get_current_parameter()
+def update_score(metric):
+
+    #keep track of highest accuracy
+    _sysdir = trial_env_vars.NNI_SYS_DIR
+    _trials = os.path.dirname(_sysdir)
+    if os.path.exists(os.path.join(_trials, 'best_score.json')):
+        with open(os.path.join(_trials, 'best_score.json'), "r") as jsonFile:
+            data = json.load(jsonFile)
+        if float(data['score']) < metric:
+            data['score'] = str(metric)
+            data['params'] = str(get_current_parameter())
+            with open(os.path.join(_trials, 'best_score.json'), "w") as jsonFile2:
+                print("updating json file", data)
+                json.dump(data, jsonFile2)
+    else:
+        params = get_current_parameter()
+        with open(os.path.join(_trials, 'best_score.json'),'w') as f:
+            json.dump({'score':metric, 'params':str(params) } , f)

 def get_best_params():
-    global _best_score
-    print("Best Score", _best_score)
-    return _best_score
\ No newline at end of file
+    _sysdir = trial_env_vars.NNI_SYS_DIR
+    _trials = os.path.dirname(_sysdir)
+    if os.path.exists(os.path.join(_trials, 'best_score.json')):
+        with open(os.path.join(_trials, 'best_score.json'), "r") as jsonFile:
+            data = json.load(jsonFile)
+        return data
+    return None

From e25b239bd58bdf568fc89265d1397f1b00c9bf96 Mon Sep 17 00:00:00 2001
From: savan
Date: Thu, 12 Nov 2020 16:45:11 -0600
Subject: [PATCH 30/70] fixed a typo

---
 examples/trials/pytorch-classifier/train_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py
index 56b85ba104..93435b6163 100644
--- a/examples/trials/pytorch-classifier/train_model.py
+++ 
b/examples/trials/pytorch-classifier/train_model.py @@ -65,7 +65,7 @@ def train_one_epoch(args, model, device, train_loader, optimizer, epoch): output = model(data) loss = F.nll_loss(output, target) pred = output.argmax(dim=1, keepdim=True) - correct += pred.eq(target.view_as(pred)).sum().item() + correct = pred.eq(target.view_as(pred)).sum().item() accuracy = 100. * correct / len(train_loader.dataset) writer.add_scalar("Loss/train", loss, batch_idx + (epoch * 10)) writer.add_scalar("Accuracy/train", accuracy, batch_idx + (epoch * 10)) From 47f49824a4d58c9e3dc987821b96e1f56f146be5 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 12 Nov 2020 16:48:10 -0600 Subject: [PATCH 31/70] fixed a typo in a path --- examples/trials/pytorch-classifier/train_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 93435b6163..d42ad44593 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -14,7 +14,7 @@ import torchvision.models as models from torchvision import datasets, transforms from torch.utils.tensorboard import SummaryWriter -writer = SummaryWriter("/mnt/output/fixed_param__tb") +writer = SummaryWriter("/mnt/output/fixed_param_tb") logger = logging.getLogger('pytorch_classifier') From 5b048d6b1bd1eac12fdda68b45ae4735ff0b9001 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 12 Nov 2020 17:24:35 -0600 Subject: [PATCH 32/70] update template --- .../trials/pytorch-classifier/template.yml | 38 +++++++++++++++---- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index e9afa5de53..e82b326451 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -181,7 +181,7 @@ templates: - | apt-get update && \ apt-get install -y gcc g++ git && \ - python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ + python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 setup.py install && \ @@ -195,6 +195,20 @@ templates: mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' + sidecars: + - name: nni-web-ui + image: 'tensorflow/tensorflow:2.3.0' + command: + - sh + - '-c' + tty: true + args: + - | + pip install nni && \ + nnictl webui nas --logdir /mnt/output/naslogs --port 8888 + ports: + - containerPort: 8888 + name: nni - name: hyperop inputs: @@ -217,12 +231,11 @@ templates: - | apt-get update && \ apt-get install -y gcc g++ git && \ - python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ + python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 setup.py install && \ - python3 examples/trials/pytorch-classifier/main.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ - --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data + nnictl create --config examples/trials/pytorch-classifier/config.yml workingDir: /mnt volumeMounts: - name: data2 @@ 
-253,7 +266,7 @@ templates: - | apt-get update && \ apt-get install -y gcc g++ git && \ - python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ + python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 setup.py install && \ @@ -267,7 +280,18 @@ templates: mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' - + sidecars: + - name: tensorboard + image: 'tensorflow/tensorflow:2.3.0' + command: + - sh + - '-c' + tty: true + args: + - tensorboard --logdir /mnt/output/fixed_param_tb + ports: + - containerPort: 6006 + name: tensorboard - name: compare-models inputs: @@ -333,7 +357,7 @@ templates: - | apt-get update && \ apt-get install -y gcc g++ git && \ - python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ + python3 -m pip install setuptools && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 prepare_data.py --data_dir=/mnt/output/processed_data From 603638ccbb821dfe4cb394f4e7b3ac26ad18afd1 Mon Sep 17 00:00:00 2001 From: savan Date: Sun, 15 Nov 2020 18:55:25 -0600 Subject: [PATCH 33/70] update workflow --- examples/trials/pytorch-classifier/template.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index e82b326451..5b3825e6c2 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -180,11 +180,12 @@ templates: args: - | apt-get update && \ - apt-get install -y gcc g++ git && \ + apt-get install -y gcc g++ git curl && \ python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 setup.py install && \ + python3 examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}} --num-classes {{workflow.parameters.num-classes}} \ --dataset custom_classification --train-data-dir /mnt/data/datasets/processed_data --valid-data-dir /mnt/data/datasets/processed_data workingDir: /mnt @@ -230,11 +231,15 @@ templates: args: - | apt-get update && \ - apt-get install -y gcc g++ git && \ + apt-get install -y gcc g++ git curl wget && \ python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ + python3 -m pip install --upgrade requests git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 setup.py install && \ + curl -sL https://deb.nodesource.com/setup_14.x | bash - && \ + apt-get install -y nodejs && \ + wget https://github.com/onepanelio/nni/releases/download/2.0.0a0/nni-2.1-py3-none-manylinux1_x86_64.whl && \ + python3 -m pip install nni-2.1-py3-none-manylinux1_x86_64.whl && \ nnictl create --config examples/trials/pytorch-classifier/config.yml workingDir: /mnt volumeMounts: From 199aacf347263a71980f76eaffbe5c9d6b7149dd Mon Sep 17 00:00:00 2001 From: savan Date: Sun, 15 Nov 2020 23:37:50 -0600 Subject: [PATCH 34/70] handle case when loss is NaN --- examples/trials/pytorch-classifier/train_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/train_model.py 
b/examples/trials/pytorch-classifier/train_model.py index d42ad44593..438c7e944c 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -14,6 +14,7 @@ import torchvision.models as models from torchvision import datasets, transforms from torch.utils.tensorboard import SummaryWriter +import math writer = SummaryWriter("/mnt/output/fixed_param_tb") logger = logging.getLogger('pytorch_classifier') @@ -185,7 +186,7 @@ def get_params(): print("Current Parameters:\n") print(params) acc, loss = train(params) - if loss is None: + if loss is None or math.isnan(loss): loss = 0 metrics = [ {'name': 'accuracy', 'value': acc}, From b19ef2aeed06d698537e71da2cd8d404c02b2cc3 Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 16 Nov 2020 13:33:58 -0600 Subject: [PATCH 35/70] reduce default epochs for testing --- examples/trials/pytorch-classifier/main.py | 2 +- examples/trials/pytorch-classifier/train_model.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index a836e97f2a..1012e09d9e 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -173,7 +173,7 @@ def get_params(): help='learning rate (default: 0.01)') parser.add_argument('--momentum', type=float, default=0.5, metavar='M', help='SGD momentum (default: 0.5)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', + parser.add_argument('--epochs', type=int, default=1, metavar='N', help='number of epochs to train (default: 10)') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 438c7e944c..182de86d68 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -1,7 +1,6 @@ """ A general purpose classification script using PyTorch. 
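 Example standalone invocation (illustrative paths only, echoing the
 directories used elsewhere in this patch series rather than anything
 this script itself guarantees):
     python3 train_model.py --num_classes 2 \
         --train_dir /mnt/data/datasets/processed_data/train \
         --test_dir /mnt/data/datasets/processed_data/test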
""" - import argparse import logging import json From 3da6fb78fd15bb8e31132b8a2b4e13ad9289c290 Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 16 Nov 2020 15:59:24 -0600 Subject: [PATCH 36/70] shut down process when experiment is finished --- ts/nni_manager/core/nnimanager.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/ts/nni_manager/core/nnimanager.ts b/ts/nni_manager/core/nnimanager.ts index 6ec4d0e21d..dfeb3c1bc9 100644 --- a/ts/nni_manager/core/nnimanager.ts +++ b/ts/nni_manager/core/nnimanager.ts @@ -588,6 +588,7 @@ class NNIManager implements Manager { await this.storeExperimentProfile(); // write this log for travis CI this.log.info('Experiment done.'); + return process.exit(0); } } } else { From 5acfbe4a8d29ef979a29d322ff8cfbbe36a86c0d Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 14:56:46 -0600 Subject: [PATCH 37/70] stop process when experiment is done --- .../trials/pytorch-classifier/template.yml | 62 ++++++++++--------- nni/tools/nnictl/launcher.py | 2 + 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index 5b3825e6c2..fa1d11c0f5 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -46,6 +46,26 @@ arguments: value: macro - name: 'Micro' value: micro + - name: hyperparamtuning-config + value: |- + epochs=1 + displayName: Settings for hyperparameter tuning + visibility: public + type: textarea.textarea + - name: nas-config + value: |- + epochs=1 + displayName: Settings for Neural Architecture Search + visibility: public + type: textarea.textarea + - name: fixedparam-config + value: |- + epochs=1 + momentum=0.5 + lr=0.01 + displayName: Settings for model training + visibility: public + type: textarea.textarea - displayName: Node pool hint: Name of node pool or group to run this workflow task type: select.select @@ -175,17 +195,12 @@ templates: archive: none: {} container: - image: pytorch/pytorch:latest + image: onepanel/nas:0.0.1 command: [sh,-c] args: - | - apt-get update && \ - apt-get install -y gcc g++ git curl && \ - python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 setup.py install && \ - python3 examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}} --num-classes {{workflow.parameters.num-classes}} \ --dataset custom_classification --train-data-dir /mnt/data/datasets/processed_data --valid-data-dir /mnt/data/datasets/processed_data workingDir: /mnt @@ -226,21 +241,13 @@ templates: archive: none: {} container: - image: pytorch/pytorch:latest + image: onepanel/nas:0.0.1 command: [sh,-c] args: - | - apt-get update && \ - apt-get install -y gcc g++ git curl wget && \ - python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ - python3 -m pip install --upgrade requests git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - curl -sL https://deb.nodesource.com/setup_14.x | bash - && \ - apt-get install -y nodejs && \ - wget https://github.com/onepanelio/nni/releases/download/2.0.0a0/nni-2.1-py3-none-manylinux1_x86_64.whl && \ - python3 -m pip install nni-2.1-py3-none-manylinux1_x86_64.whl && \ - nnictl create --config examples/trials/pytorch-classifier/config.yml + nnictl create --config 
examples/trials/pytorch-classifier/config.yml --port 8089 --foreground workingDir: /mnt volumeMounts: - name: data2 @@ -249,6 +256,13 @@ templates: mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' + sidecars: + - name: nni-web-ui + image: 'onepanel/nni-proxy:0.0.1' + tty: true + ports: + - containerPort: 8089 + name: nni - name: model-param inputs: @@ -265,16 +279,12 @@ templates: archive: none: {} container: - image: pytorch/pytorch:latest + image: onepanel/nas:0.0.1 command: [sh,-c] args: - | - apt-get update && \ - apt-get install -y gcc g++ git && \ - python3 -m pip install urllib3==1.12 setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 setup.py install && \ python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}} workingDir: /mnt @@ -319,19 +329,13 @@ templates: archive: none: {} container: - image: pytorch/pytorch:latest + image: onepanel/nas:0.0.1 command: [sh,-c] args: - | - apt-get update && \ - apt-get install -y gcc g++ git && \ - python3 -m pip install setuptools torch==1.4.0 torchvision==0.5.0 tensorboard && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 setup.py install && \ - python3 prepare_data.py && \ - python3 examples/trials/pytorch-classifier/train_main.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ - --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}} + python3 compare.py workingDir: /mnt volumeMounts: - name: data4 diff --git a/nni/tools/nnictl/launcher.py b/nni/tools/nnictl/launcher.py index fa0aa3baab..8a05ae9803 100644 --- a/nni/tools/nnictl/launcher.py +++ b/nni/tools/nnictl/launcher.py @@ -528,6 +528,8 @@ def launch_experiment(args, experiment_config, mode, config_file_name, experimen while True: log_content = rest_process.stdout.readline().strip().decode('utf-8') print(log_content) + if 'Experiment done.' 
in log_content: + sys.exit(0) except KeyboardInterrupt: kill_command(rest_process.pid) print_normal('Stopping experiment...') From 9265381ec67ddf7c7c68dad6c61d5d9b38a3f50d Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 16:15:27 -0600 Subject: [PATCH 38/70] accept settings in a single param --- .../trials/pytorch-classifier/train_model.py | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 182de86d68..8d48553088 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -118,23 +118,23 @@ def train(args): transforms.Resize((w, h)),transforms.ToTensor(), # add Normlize with mean and std ])), - batch_size=args['batch_size'], shuffle=True, **kwargs) + batch_size=int(args['batch_size']), shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader( ImageFolder(root=args['test_dir'], transform=transforms.Compose([ transforms.Resize((w, h)),transforms.ToTensor(), # add Normlize with mean and std ])), - batch_size=args['batch_size'], shuffle=True, **kwargs) + batch_size=int(args['batch_size']), shuffle=True, **kwargs) - model = build_model(args['model_type'], args['num_classes']).to(device) - optimizer = optim.SGD(model.parameters(), lr=args['lr'], - momentum=args['momentum']) + model = build_model(args['model_type'], int(args['num_classes'])).to(device) + optimizer = optim.SGD(model.parameters(), lr=float(args['lr']), + momentum=float(args['momentum'])) if not os.path.exists('/mnt/output/fixed-params'): os.makedirs('/mnt/output/fixed-params') - for epoch in range(1, args['epochs'] + 1): + for epoch in range(1, int(args['epochs']) + 1): train_one_epoch(args, model, device, train_loader, optimizer, epoch) test_acc, test_loss = test(args, model, device, test_loader) writer.add_scalar("Loss/test", test_loss, epoch ) @@ -155,26 +155,26 @@ def get_params(): default='/home/savan/Documents/train_data', help="train data directory") parser.add_argument("--test_dir", type=str, default='/home/savan/Documents/test_data', help="test data directory") - parser.add_argument("--model_type", type=str, - default='alexnet', help="model to train") - parser.add_argument('--batch_size', type=int, default=1, metavar='N', - help='input batch size for training (default: 64)') +# parser.add_argument("--model_type", type=str, +# default='alexnet', help="model to train") +# parser.add_argument('--batch_size', type=int, default=1, metavar='N', +# help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) - parser.add_argument("--num_classes", type=int, default=2, metavar='N', - help='number of classes in the dataset') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') +# parser.add_argument("--num_classes", type=int, default=2, metavar='N', +# help='number of classes in the dataset') +# parser.add_argument('--lr', type=float, default=0.01, metavar='LR', +# help='learning rate (default: 0.01)') +# parser.add_argument('--momentum', type=float, default=0.5, metavar='M', +# help='SGD momentum (default: 0.5)') +# parser.add_argument('--epochs', type=int, default=10, 
metavar='N', +# help='number of epochs to train (default: 10)') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--no_cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--log_interval', type=int, default=1000, metavar='N', help='how many batches to wait before logging training status') - + parser.add_argument('--config', help="hyperparameters or other configs") args, _ = parser.parse_known_args() return args @@ -182,9 +182,14 @@ def get_params(): if __name__ == '__main__': try: params = vars(get_params()) + extras = params['config'].split("\\n") + extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] + config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + config.update(params) + config.pop('config') print("Current Parameters:\n") - print(params) - acc, loss = train(params) + print(config) + acc, loss = train(config) if loss is None or math.isnan(loss): loss = 0 metrics = [ From a89139eb70dd00e1eadbcfab947d1171d8a8ebaf Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 17:30:01 -0600 Subject: [PATCH 39/70] get config args from user --- .../trials/pytorch-classifier/create_yaml.py | 36 +++++++++++++++++++ examples/trials/pytorch-classifier/main.py | 2 +- .../trials/pytorch-classifier/template.yml | 22 ++++++++---- .../trials/pytorch-classifier/train_model.py | 6 ++-- 4 files changed, 56 insertions(+), 10 deletions(-) create mode 100644 examples/trials/pytorch-classifier/create_yaml.py diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py new file mode 100644 index 0000000000..18a68830f2 --- /dev/null +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -0,0 +1,36 @@ +import yaml +import argparse +import json + +def main(args): + stream = open(args['config_path'], 'r') + data = yaml.load(stream) + data['trial']['command'] = "python3 main.py --num_classes {} --epochs {}".format(args['num_classes'], args['epochs']) + + with open(args['output_path'], 'w') as yaml_file: + yaml_file.write( yaml.dump(data, default_flow_style=False)) + mm_list = [int(item) for item in args['momentum_range'].split(',')] + lr_list = [float(item) for item in args['lr_list'].split(',')] + bs_list = [int(item) for item in args['batch_size_list'].split(',')] + with open(args['output_search_space_path'], 'w') as json_file: + json_data = {'batch_size': {'_type':'choice', '_value':bs_list}, 'lr':{"_type":"choice","_value":lr_list} , 'momentum':{"_type":"uniform","_value":mm_list}} + json.dump(json_data, json_file) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='PyTorch Classification Example') + parser.add_argument("--config_path", type=str, + default='/mnt/nni/examples/trials/pytorch-classifier/config.yml', help="train data directory") + parser.add_argument("--output_path", type=str, + default='/mnt/nni/examples/trials/pytorch-classifier/config.yml', help="model to train") + parser.add_argument("--output_search_space_path", type=str, + default='/mnt/nni/examples/trials/pytorch-classifier/search_space.json', help="model to train") + parser.add_argument("--num_classes", type=int, default=2, + help="number of classes in the dataset") + parser.add_argument("--config", default="batch_size_list=16,32,64,128\nlr_list=0.001,0.001\nmomentum_range=0,1\nepochs=10") + args = parser.parse_args() + extras = args.config.split("\n") + extras_processed = [i.split("#")[0].replace(" ","") 
for i in extras if i] + config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + config.update(vars(args)) + print(config) + main(config) \ No newline at end of file diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 1012e09d9e..3f8d54fa07 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -165,7 +165,7 @@ def get_params(): parser.add_argument("--model_type", type=str, default='alexnet', help="model to train") parser.add_argument('--batch_size', type=int, default=1, metavar='N', - help='input batch size for training (default: 64)') + help='input batch size for training (default: 1)') parser.add_argument("--batch_num", type=int, default=None) parser.add_argument("--num_classes", type=int, default=2, metavar='N', help='number of classes in the dataset') diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index fa1d11c0f5..fd07fbd21f 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -63,9 +63,17 @@ arguments: epochs=1 momentum=0.5 lr=0.01 + model_type=alexnet + batch_size=1 displayName: Settings for model training visibility: public type: textarea.textarea + - name: searchspace-config + value: |- + batch_size_list=16,32,64,128 + lr_list=0.0001,0.001,0.01,0.1 + momentum_range=0,1 + epochs=10 - displayName: Node pool hint: Name of node pool or group to run this workflow task type: select.select @@ -195,7 +203,7 @@ templates: archive: none: {} container: - image: onepanel/nas:0.0.1 + image: onepanel/nni:0.0.4 command: [sh,-c] args: - | @@ -241,12 +249,14 @@ templates: archive: none: {} container: - image: onepanel/nas:0.0.1 + image: onepanel/nni:0.0.4 command: [sh,-c] args: - | git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ + python3 examples/trials/pytorch-classifier/create_yaml.py --config {{workflow.parameters.searchspace-config}} \ + --num_classes {{workflow.parameters.num-classes}} nnictl create --config examples/trials/pytorch-classifier/config.yml --port 8089 --foreground workingDir: /mnt volumeMounts: @@ -279,14 +289,14 @@ templates: archive: none: {} container: - image: onepanel/nas:0.0.1 + image: onepanel/nni:0.0.4 command: [sh,-c] args: - | git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 examples/trials/pytorch-classifier/train_model.py --model_type={{workflow.parameters.model-type}} --epochs {{workflow.parameters.epochs}} --num_classes {{workflow.parameters.num-classes}} \ - --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --lr {{workflow.parameters.learning-rate}} --momentum {{workflow.parameters.momentum}} --batch_size {{workflow.parameters.batch-size}} + python3 examples/trials/pytorch-classifier/train_model.py --num_classes {{workflow.parameters.num-classes}} \ + --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --config {{workflow.parameters.fixedparam-config}} workingDir: /mnt volumeMounts: - name: data3 @@ -329,7 +339,7 @@ templates: archive: none: {} container: - image: onepanel/nas:0.0.1 + image: onepanel/nni:0.0.4 command: [sh,-c] args: - | diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 8d48553088..7721b81290 100644 --- 
a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -160,8 +160,8 @@ def get_params(): # parser.add_argument('--batch_size', type=int, default=1, metavar='N', # help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) -# parser.add_argument("--num_classes", type=int, default=2, metavar='N', -# help='number of classes in the dataset') + parser.add_argument("--num_classes", type=int, default=2, metavar='N', + help='number of classes in the dataset') # parser.add_argument('--lr', type=float, default=0.01, metavar='LR', # help='learning rate (default: 0.01)') # parser.add_argument('--momentum', type=float, default=0.5, metavar='M', @@ -182,7 +182,7 @@ def get_params(): if __name__ == '__main__': try: params = vars(get_params()) - extras = params['config'].split("\\n") + extras = params['config'].split("\n") extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} config.update(params) From c6a770601b2748808301c1c9ad6ccb18395fdb1d Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 19:33:23 -0600 Subject: [PATCH 40/70] add log statements --- examples/trials/pytorch-classifier/train_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 7721b81290..e6179d201c 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -182,9 +182,13 @@ def get_params(): if __name__ == '__main__': try: params = vars(get_params()) + print("Older params:", params) extras = params['config'].split("\n") + print("extras", extras) extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] + print("extra processed", extras_processed) config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + print("config", config) config.update(params) config.pop('config') print("Current Parameters:\n") From ed43e9c3ad98e4b1e8652bc425207ad150a7fc45 Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 21:53:40 -0600 Subject: [PATCH 41/70] revert changes related to single param config --- .../trials/pytorch-classifier/train_model.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index e6179d201c..fc7423ccd9 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -182,18 +182,18 @@ def get_params(): if __name__ == '__main__': try: params = vars(get_params()) - print("Older params:", params) - extras = params['config'].split("\n") - print("extras", extras) - extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] - print("extra processed", extras_processed) - config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} - print("config", config) - config.update(params) - config.pop('config') + # print("Older params:", params) + # extras = params['config'].split("\n") + # print("extras", extras) + # extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] + # print("extra processed", extras_processed) + # config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + # print("config", config) + # config.update(params) + # config.pop('config') print("Current Parameters:\n") - 
print(config) - acc, loss = train(config) + print(params) + acc, loss = train(params) if loss is None or math.isnan(loss): loss = 0 metrics = [ From 7e49f77e2dcd71739656bfb2eab0d642f170b50f Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 23:18:24 -0600 Subject: [PATCH 42/70] revert parameter changes --- .../trials/pytorch-classifier/template.yml | 1 + .../trials/pytorch-classifier/train_model.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index fd07fbd21f..fadc1299d4 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -69,6 +69,7 @@ arguments: visibility: public type: textarea.textarea - name: searchspace-config + type: textarea.textarea value: |- batch_size_list=16,32,64,128 lr_list=0.0001,0.001,0.01,0.1 diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index fc7423ccd9..af86510659 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -155,19 +155,19 @@ def get_params(): default='/home/savan/Documents/train_data', help="train data directory") parser.add_argument("--test_dir", type=str, default='/home/savan/Documents/test_data', help="test data directory") -# parser.add_argument("--model_type", type=str, -# default='alexnet', help="model to train") -# parser.add_argument('--batch_size', type=int, default=1, metavar='N', -# help='input batch size for training (default: 64)') + parser.add_argument("--model_type", type=str, + default='alexnet', help="model to train") + parser.add_argument('--batch_size', type=int, default=1, metavar='N', + help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) parser.add_argument("--num_classes", type=int, default=2, metavar='N', help='number of classes in the dataset') -# parser.add_argument('--lr', type=float, default=0.01, metavar='LR', -# help='learning rate (default: 0.01)') -# parser.add_argument('--momentum', type=float, default=0.5, metavar='M', -# help='SGD momentum (default: 0.5)') -# parser.add_argument('--epochs', type=int, default=10, metavar='N', -# help='number of epochs to train (default: 10)') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--no_cuda', action='store_true', default=False, From 2f6f5c7310906f030146e7c79e750b2587fe9b2d Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 17 Nov 2020 23:45:44 -0600 Subject: [PATCH 43/70] convert metrics to float explicitly --- compare.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compare.py b/compare.py index 253151583c..8e036f4872 100644 --- a/compare.py +++ b/compare.py @@ -10,9 +10,9 @@ with open('/tmp/singlemodel-metrics.json') as f: fm = json.load(f) -accuracies['nas_acc'] = [i['value'] for i in nas if i['name'] == 'accuracy'][0] -accuracies['hyper_acc'] = [i['value'] for i in hyper if i['name'] == 'accuracy'][0] -accuracies['fm_acc'] = [i['value'] for i in fm if 
i['name'] == 'accuracy'][0] +accuracies['nas_acc'] = [float(i['value']) for i in nas if i['name'] == 'accuracy'][0] +accuracies['hyper_acc'] = [float(i['value']) for i in hyper if i['name'] == 'accuracy'][0] +accuracies['fm_acc'] = [float(i['value']) for i in fm if i['name'] == 'accuracy'][0] max_acc_name = max(accuracies, key=accuracies.get) print("Maximum accuracy was {} for {}".format(max(accuracies.values()), max_acc_name)) \ No newline at end of file From 6dad848a47aba51b2eca40bc25cc7eacf839087c Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 13:13:19 -0600 Subject: [PATCH 44/70] read parameters from config --- .../trials/pytorch-classifier/train_model.py | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index af86510659..1aa03d19b6 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -155,19 +155,19 @@ def get_params(): default='/home/savan/Documents/train_data', help="train data directory") parser.add_argument("--test_dir", type=str, default='/home/savan/Documents/test_data', help="test data directory") - parser.add_argument("--model_type", type=str, - default='alexnet', help="model to train") - parser.add_argument('--batch_size', type=int, default=1, metavar='N', - help='input batch size for training (default: 64)') + # parser.add_argument("--model_type", type=str, + # default='alexnet', help="model to train") + # parser.add_argument('--batch_size', type=int, default=1, metavar='N', + # help='input batch size for training (default: 64)') parser.add_argument("--batch_num", type=int, default=None) parser.add_argument("--num_classes", type=int, default=2, metavar='N', help='number of classes in the dataset') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') + # parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + # help='learning rate (default: 0.01)') + # parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + # help='SGD momentum (default: 0.5)') + # parser.add_argument('--epochs', type=int, default=10, metavar='N', + # help='number of epochs to train (default: 10)') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--no_cuda', action='store_true', default=False, @@ -182,18 +182,18 @@ def get_params(): if __name__ == '__main__': try: params = vars(get_params()) - # print("Older params:", params) - # extras = params['config'].split("\n") - # print("extras", extras) - # extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] - # print("extra processed", extras_processed) - # config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} - # print("config", config) - # config.update(params) - # config.pop('config') + print("Older params:", params) + extras = params['config'].split("\n") + print("extras", extras) + extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] + print("extra processed", extras_processed) + config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + print("config", config) + config.update(params) + 
config.pop('config') print("Current Parameters:\n") - print(params) - acc, loss = train(params) + print(config) + acc, loss = train(config) if loss is None or math.isnan(loss): loss = 0 metrics = [ From 4454f50c36ab4a4f4fa620a8d57ca6a789e2661e Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 15:14:21 -0600 Subject: [PATCH 45/70] accept config for enas --- compare.py | 3 ++ examples/nas/enas/search.py | 35 ++++++++++++------- .../trials/pytorch-classifier/create_yaml.py | 4 ++- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/compare.py b/compare.py index 8e036f4872..b4ce35929f 100644 --- a/compare.py +++ b/compare.py @@ -3,12 +3,15 @@ accuracies = {} with open('/tmp/nas-metrics.json') as f: nas = json.load(f) + print("Metrics for Neural Architecture Search: ", nas) with open('/tmp/hyperop-metrics.json') as f: hyper = json.load(f) + print("Metrics for hyper parameter optimization: ", hyper) with open('/tmp/singlemodel-metrics.json') as f: fm = json.load(f) + print("Metrics for model trained with fixed parameters: ", fm) accuracies['nas_acc'] = [float(i['value']) for i in nas if i['name'] == 'accuracy'][0] accuracies['hyper_acc'] = [float(i['value']) for i in hyper if i['name'] == 'accuracy'][0] diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index c1ac75cdff..fece3fd380 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -20,25 +20,34 @@ if __name__ == "__main__": parser = ArgumentParser("enas") - parser.add_argument("--batch-size", default=128, type=int) + # parser.add_argument("--batch-size", default=128, type=int) parser.add_argument("--log-frequency", default=10, type=int) parser.add_argument("--num-classes", default=2, type=int) parser.add_argument("--dataset", default="cifar10", choices=["cifar10", "custom_classification"]) - parser.add_argument("--search-for", choices=["macro", "micro"], default="macro") - parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)") + # parser.add_argument("--search-for", choices=["macro", "micro"], default="macro") + # parser.add_argument("--epochs", default=None, type=int, help="Number of epochs (default: macro 310, micro 150)") parser.add_argument("--visualization", default=True, action="store_true") parser.add_argument("--train-data-dir", default="/home/savan/Documents/train_data", help="train dataset for classification") parser.add_argument("--valid-data-dir", default="/home/savan/Documents/test_data", help="validation dataset for classification") + parser.add_argument("--config", default="batch-size=128 \n search-for=macro \n epochs=30") args = parser.parse_args() - dataset_train, dataset_valid = datasets.get_dataset(args.dataset, train_dir=args.train_data_dir, valid_data=args.valid_data_dir) - if args.search_for == "macro": - model = GeneralNetwork(num_classes=args.num_classes) - num_epochs = args.epochs or 310 + extras = args['config'].split("\n") + print("nas extras", extras) + extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] + print("nas extra processed", extras_processed) + config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + print("nas config", config) + config.update(args) + + dataset_train, dataset_valid = datasets.get_dataset(args['dataset'], train_dir=args['train_data_dir'], valid_data=args['valid_data_dir']) + if args['search_for'] == "macro": + model = GeneralNetwork(num_classes=args['num_classes']) + num_epochs = args['epochs'] or 310 mutator = None - elif 
args.search_for == "micro": - model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, num_classes=args.num_classes, use_aux_heads=True) - num_epochs = args.epochs or 150 + elif args['search_for'] == "micro": + model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, num_classes=args['num_classes'], use_aux_heads=True) + num_epochs = args['epochs'] or 150 mutator = enas.EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True) else: raise AssertionError @@ -53,13 +62,13 @@ reward_function=reward_accuracy, optimizer=optimizer, callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("/mnt/output"), ModelCheckpoint("/mnt/output")], - batch_size=args.batch_size, + batch_size=args['batch_size'], num_epochs=num_epochs, dataset_train=dataset_train, dataset_valid=dataset_valid, - log_frequency=args.log_frequency, + log_frequency=args['log_frequency'], mutator=mutator) - if args.visualization: + if args['visualization']: trainer.enable_visualization() trainer.train() metrics = [{'name':'accuracy', 'value':trainer.val_model_summary['acc1'].avg}, {'name':'loss', 'value':trainer.val_model_summary['loss'].avg}] diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py index 18a68830f2..ce641998c9 100644 --- a/examples/trials/pytorch-classifier/create_yaml.py +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -28,9 +28,11 @@ def main(args): help="number of classes in the dataset") parser.add_argument("--config", default="batch_size_list=16,32,64,128\nlr_list=0.001,0.001\nmomentum_range=0,1\nepochs=10") args = parser.parse_args() + print("Arguments: ", args) extras = args.config.split("\n") extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} + print("[Create YAML] Config: ", config) config.update(vars(args)) - print(config) + print("Final Arguments: ", config) main(config) \ No newline at end of file From b4673e7820bcd2008f3b328be78fd8cc1bcc5f1a Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 15:17:06 -0600 Subject: [PATCH 46/70] convert score to float --- examples/trials/pytorch-classifier/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index 3f8d54fa07..b38516e588 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -147,7 +147,7 @@ def train(args): best_params = nni.get_best_params() print("Best param and score: ", best_params) metrics = [ - {'name': 'accuracy', 'value': best_params['score']}, + {'name': 'accuracy', 'value': float(best_params['score'])}, ] with open('/tmp/sys-metrics.json', 'w') as f: json.dump(metrics, f) From 70517e652d9658c737dc10ea1128bef57a47a300 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 15:23:16 -0600 Subject: [PATCH 47/70] add error handling in comparison script --- compare.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/compare.py b/compare.py index b4ce35929f..2783e810ef 100644 --- a/compare.py +++ b/compare.py @@ -1,21 +1,30 @@ import json accuracies = {} -with open('/tmp/nas-metrics.json') as f: - nas = json.load(f) - print("Metrics for Neural Architecture Search: ", nas) -with open('/tmp/hyperop-metrics.json') as f: - hyper = json.load(f) - print("Metrics for hyper parameter optimization: ", hyper) 
+try: + with open('/tmp/nas-metrics.json') as f: + nas = json.load(f) + print("Metrics for Neural Architecture Search: ", nas) + accuracies['nas_acc'] = [float(i['value']) for i in nas if i['name'] == 'accuracy'][0] +except RuntimeError as e: + print("Error occurred while reading metrics for NAS: ", e) -with open('/tmp/singlemodel-metrics.json') as f: - fm = json.load(f) - print("Metrics for model trained with fixed parameters: ", fm) +try: + with open('/tmp/hyperop-metrics.json') as f: + hyper = json.load(f) + print("Metrics for hyper parameter optimization: ", hyper) + accuracies['hyper_acc'] = [float(i['value']) for i in hyper if i['name'] == 'accuracy'][0] +except RuntimeError as e: + print("Error occurred while reading metrics for hyperparameter optimization: ", e) -accuracies['nas_acc'] = [float(i['value']) for i in nas if i['name'] == 'accuracy'][0] -accuracies['hyper_acc'] = [float(i['value']) for i in hyper if i['name'] == 'accuracy'][0] -accuracies['fm_acc'] = [float(i['value']) for i in fm if i['name'] == 'accuracy'][0] +try: + with open('/tmp/singlemodel-metrics.json') as f: + fm = json.load(f) + print("Metrics for model trained with fixed parameters: ", fm) + accuracies['fm_acc'] = [float(i['value']) for i in fm if i['name'] == 'accuracy'][0] +except RuntimeError as e: + print("Error occurred while reading metrics for fixed-param model: ", e) max_acc_name = max(accuracies, key=accuracies.get) print("Maximum accuracy was {} for {}".format(max(accuracies.values()), max_acc_name)) \ No newline at end of file From c54a626c24b0cb3c79bc697ef13edf457cd46fd9 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 16:17:12 -0600 Subject: [PATCH 48/70] remove subscription from argparse var --- examples/nas/enas/search.py | 2 +- .../trials/pytorch-classifier/template.yml | 96 ++++++++----------- 2 files changed, 41 insertions(+), 57 deletions(-) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index fece3fd380..8a0e84b544 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -32,7 +32,7 @@ parser.add_argument("--config", default="batch-size=128 \n search-for=macro \n epochs=30") args = parser.parse_args() - extras = args['config'].split("\n") + extras = args.config.split("\n") print("nas extras", extras) extras_processed = [i.split("#")[0].replace(" ","") for i in extras if i] print("nas extra processed", extras_processed) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index fadc1299d4..af747fbc1a 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -1,8 +1,6 @@ entrypoint: main arguments: parameters: - - name: source - value: https://github.com/onepanelio/nni.git - name: cvat-annotation-path value: annotation-dump/animals/11052020231652 displayName: Dataset path @@ -14,67 +12,44 @@ arguments: - name: num-classes displayName: Number of classes visibility: public - value: 2 - - name: learning-rate - value: 0.01 - displayName: Learning rate + value: '2' + - name: test-split + displayName: Percentage of images to use for testing visibility: public - - name: batch-size - value: 1 - displayName: Batch size - visibility: public - - name: momentum - value: 0.5 - displayName: Momentum - visibility: public - - name: model-type - displayName: Model type - visibility: public - value: alexnet - options: - - name: 'GoogleNet' - value: 'googlenet' - - name: 'ResNet50' - value: 'resnet50' - - name: epochs - value: 1 - - name: 
search-method - value: macro - type: select.select - options: - - name: 'Macro' - value: macro - - name: 'Micro' - value: micro + value: '20' - name: hyperparamtuning-config value: |- - epochs=1 + epochs=10 displayName: Settings for hyperparameter tuning visibility: public type: textarea.textarea + - name: searchspace-config + type: textarea.textarea + displayName: Search space for hyperparameter tuning + value: |- + batch_size_list=16,32,64,128 + lr_list=0.0001,0.001,0.01,0.1 + momentum_range=0,1 + epochs=10 - name: nas-config value: |- - epochs=1 + epochs=20 + batch-size=128 + search-for=macro displayName: Settings for Neural Architecture Search visibility: public type: textarea.textarea - name: fixedparam-config value: |- - epochs=1 momentum=0.5 lr=0.01 model_type=alexnet - batch_size=1 + batch_size=16 + epochs=10 displayName: Settings for model training visibility: public type: textarea.textarea - - name: searchspace-config - type: textarea.textarea - value: |- - batch_size_list=16,32,64,128 - lr_list=0.0001,0.001,0.01,0.1 - momentum_range=0,1 - epochs=10 + - displayName: Node pool hint: Name of node pool or group to run this workflow task type: select.select @@ -208,10 +183,14 @@ templates: command: [sh,-c] args: - | - git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ + git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 examples/nas/enas/search.py --search-for {{workflow.parameters.search-method}} --epochs {{workflow.parameters.epochs}} --num-classes {{workflow.parameters.num-classes}} \ - --dataset custom_classification --train-data-dir /mnt/data/datasets/processed_data --valid-data-dir /mnt/data/datasets/processed_data + python3 examples/nas/enas/search.py \ + --config="{{workflow.parameters.nas-config}}" \ + --num-classes="{{workflow.parameters.num-classes}}" \ + --dataset="custom_classification" \ + --train-data-dir="/mnt/data/datasets/processed_data" \ + --valid-data-dir="/mnt/data/datasets/processed_data" workingDir: /mnt volumeMounts: - name: data @@ -221,7 +200,7 @@ templates: nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' sidecars: - - name: nni-web-ui + - name: nni-nas-ui image: 'tensorflow/tensorflow:2.3.0' command: - sh @@ -254,10 +233,11 @@ templates: command: [sh,-c] args: - | - git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ + git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 examples/trials/pytorch-classifier/create_yaml.py --config {{workflow.parameters.searchspace-config}} \ - --num_classes {{workflow.parameters.num-classes}} + python3 examples/trials/pytorch-classifier/create_yaml.py \ + --config="{{workflow.parameters.searchspace-config}}" \ + --num_classes="{{workflow.parameters.num-classes}}" && \ nnictl create --config examples/trials/pytorch-classifier/config.yml --port 8089 --foreground workingDir: /mnt volumeMounts: @@ -268,7 +248,7 @@ templates: nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' sidecars: - - name: nni-web-ui + - name: nni-hyperparamopt-ui image: 'onepanel/nni-proxy:0.0.1' tty: true ports: @@ -294,10 +274,13 @@ templates: command: [sh,-c] args: - | - git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ + git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 
examples/trials/pytorch-classifier/train_model.py --num_classes {{workflow.parameters.num-classes}} \ - --train_dir /mnt/data/datasets/processed_data --test_dir /mnt/data/datasets/processed_data --config {{workflow.parameters.fixedparam-config}} + python3 examples/trials/pytorch-classifier/train_model.py \ + --num_classes="{{workflow.parameters.num-classes}}" \ + --train_dir="/mnt/data/datasets/processed_data" \ + --test_dir="/mnt/data/datasets/processed_data" \ + --config="{{workflow.parameters.fixedparam-config}}" \ workingDir: /mnt volumeMounts: - name: data3 @@ -380,7 +363,8 @@ templates: python3 -m pip install setuptools && \ git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 prepare_data.py --data_dir=/mnt/output/processed_data + python3 prepare_data.py --data_dir="/mnt/output/processed_data" \ + --test_split="{{workflow.parameters.test-split}}" workingDir: /mnt volumeMounts: - name: data5 From e3a8f93f0a7ba4caac4f9a691b9b6df7c6a138a4 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 16:41:38 -0600 Subject: [PATCH 49/70] convert args to dictionary --- examples/nas/enas/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index 8a0e84b544..701f099d14 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -38,7 +38,7 @@ print("nas extra processed", extras_processed) config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} print("nas config", config) - config.update(args) + config.update(vars(args)) dataset_train, dataset_valid = datasets.get_dataset(args['dataset'], train_dir=args['train_data_dir'], valid_data=args['valid_data_dir']) if args['search_for'] == "macro": From 3fc027beea9a4e30decab7d60e44ec7900a292c3 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 16:59:04 -0600 Subject: [PATCH 50/70] assign config back to args --- examples/nas/enas/search.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index 701f099d14..2e574d8b28 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -39,6 +39,7 @@ config = {i.split('=')[0]:i.split('=')[1] for i in extras_processed} print("nas config", config) config.update(vars(args)) + args = config dataset_train, dataset_valid = datasets.get_dataset(args['dataset'], train_dir=args['train_data_dir'], valid_data=args['valid_data_dir']) if args['search_for'] == "macro": From ccee20fe296c01c370b118cd3ce14721b6e01006 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 17:51:32 -0600 Subject: [PATCH 51/70] convert numerican strings to int --- examples/nas/enas/search.py | 10 +++++----- examples/trials/pytorch-classifier/template.yml | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index 2e574d8b28..234e4b2189 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -43,12 +43,12 @@ dataset_train, dataset_valid = datasets.get_dataset(args['dataset'], train_dir=args['train_data_dir'], valid_data=args['valid_data_dir']) if args['search_for'] == "macro": - model = GeneralNetwork(num_classes=args['num_classes']) - num_epochs = args['epochs'] or 310 + model = GeneralNetwork(num_classes=int(args['num_classes'])) + num_epochs = int(args['epochs']) or 310 mutator = None elif args['search_for'] == "micro": - model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, 
dropout_rate=0.1, num_classes=args['num_classes'], use_aux_heads=True) - num_epochs = args['epochs'] or 150 + model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, num_classes=int(args['num_classes']), use_aux_heads=True) + num_epochs = int(args['epochs']) or 150 mutator = enas.EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True) else: raise AssertionError @@ -63,7 +63,7 @@ reward_function=reward_accuracy, optimizer=optimizer, callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("/mnt/output"), ModelCheckpoint("/mnt/output")], - batch_size=args['batch_size'], + batch_size=int(args['batch_size']), num_epochs=num_epochs, dataset_train=dataset_train, dataset_valid=dataset_valid, diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index af747fbc1a..a4b52ec377 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -34,8 +34,8 @@ arguments: - name: nas-config value: |- epochs=20 - batch-size=128 - search-for=macro + batch_size=128 + search_for=macro displayName: Settings for Neural Architecture Search visibility: public type: textarea.textarea From b890319123e4808094d10e402c44451397abee2c Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 18 Nov 2020 17:56:45 -0600 Subject: [PATCH 52/70] round numbers to 2 decimal points --- examples/nas/enas/search.py | 2 +- examples/trials/pytorch-classifier/main.py | 2 +- examples/trials/pytorch-classifier/train_model.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/nas/enas/search.py b/examples/nas/enas/search.py index 234e4b2189..f6d7ea926e 100644 --- a/examples/nas/enas/search.py +++ b/examples/nas/enas/search.py @@ -72,6 +72,6 @@ if args['visualization']: trainer.enable_visualization() trainer.train() - metrics = [{'name':'accuracy', 'value':trainer.val_model_summary['acc1'].avg}, {'name':'loss', 'value':trainer.val_model_summary['loss'].avg}] + metrics = [{'name':'accuracy', 'value':round(trainer.val_model_summary['acc1'].avg, 2)}, {'name':'loss', 'value':round(trainer.val_model_summary['loss'].avg,2)}] with open('/tmp/sys-metrics.json', 'w') as f: json.dump(metrics, f) \ No newline at end of file diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index b38516e588..a901a7f67f 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -147,7 +147,7 @@ def train(args): best_params = nni.get_best_params() print("Best param and score: ", best_params) metrics = [ - {'name': 'accuracy', 'value': float(best_params['score'])}, + {'name': 'accuracy', 'value': round(float(best_params['score']),2)}, ] with open('/tmp/sys-metrics.json', 'w') as f: json.dump(metrics, f) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index 1aa03d19b6..e63f01cb21 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -197,8 +197,8 @@ def get_params(): if loss is None or math.isnan(loss): loss = 0 metrics = [ - {'name': 'accuracy', 'value': acc}, - {'name': 'loss', 'value': loss}, + {'name': 'accuracy', 'value': round(acc,2)}, + {'name': 'loss', 'value': round(loss,2)}, ] # Write metrics to `/tmp/sys-metrics.json` From 63fb18378e9ff108e52725739780a82becb7e7c8 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 19 Nov 2020 19:22:56 -0600 Subject: [PATCH 
53/70] add a flag to skip the preprocessing --- examples/trials/pytorch-classifier/template.yml | 6 ------ prepare_data.py | 8 +++++++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index a4b52ec377..6f564fb401 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -17,12 +17,6 @@ arguments: displayName: Percentage of images to use for testing visibility: public value: '20' - - name: hyperparamtuning-config - value: |- - epochs=10 - displayName: Settings for hyperparameter tuning - visibility: public - type: textarea.textarea - name: searchspace-config type: textarea.textarea displayName: Search space for hyperparameter tuning diff --git a/prepare_data.py b/prepare_data.py index 6697653ced..c1368eca0a 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -33,5 +33,11 @@ def main(args): parser.add_argument('--data_dir', default='/mnt/data/datasets/processed_data') parser.add_argument('--image_dir', default='/mnt/data/datasets/images') parser.add_argument('--test_split', default=20, type=int) + parser.add_argument('--skip', default=True, type=bool) args = parser.parse_args() - main(args) + if not args.skip: + main(args) + else: + os.makedirs("/mnt/output/processed_data") + for imdir in os.listdir("/mnt/data/datasets/"): + shutil.move(os.path.join("/mnt/data/datasets", imdir), "/mnt/output/processed_data/") From aeeededa9ebc69b4635cc877a794d7faaae53b76 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 19 Nov 2020 19:27:35 -0600 Subject: [PATCH 54/70] change argument type for skip --- prepare_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_data.py b/prepare_data.py index c1368eca0a..b04d51cf31 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -35,7 +35,7 @@ def main(args): parser.add_argument('--test_split', default=20, type=int) parser.add_argument('--skip', default=True, type=bool) args = parser.parse_args() - if not args.skip: + if args.skip == "false": main(args) else: os.makedirs("/mnt/output/processed_data") From d85c27ca8f3c8ad378375bf68dbdae9da5594de9 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 19 Nov 2020 20:25:09 -0600 Subject: [PATCH 55/70] change argument type to string --- prepare_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prepare_data.py b/prepare_data.py index b04d51cf31..8bcb91f80d 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -33,7 +33,7 @@ def main(args): parser.add_argument('--data_dir', default='/mnt/data/datasets/processed_data') parser.add_argument('--image_dir', default='/mnt/data/datasets/images') parser.add_argument('--test_split', default=20, type=int) - parser.add_argument('--skip', default=True, type=bool) + parser.add_argument('--skip', default="false") args = parser.parse_args() if args.skip == "false": main(args) From 198997d27ae4a7fef494e4dd497c96433c165d0b Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 19 Nov 2020 21:09:42 -0600 Subject: [PATCH 56/70] update template --- .../trials/pytorch-classifier/template.yml | 93 +++++++++++-------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index 6f564fb401..eaeecbf570 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -2,44 +2,60 @@ entrypoint: main arguments: parameters: - name: 
cvat-annotation-path - value: annotation-dump/animals/11052020231652 + value: annotation-dump/patch_medical/valid displayName: Dataset path hint: Path to annotated data in default object storage (i.e S3). In CVAT, this parameter will be pre-populated. visibility: private + - name: cvat-output-path value: workflow-data/output/nas/nas-model-comparison visibility: private + - name: num-classes displayName: Number of classes visibility: public value: '2' + hint: 'Number of classes in a dataset' + - name: test-split displayName: Percentage of images to use for testing visibility: public value: '20' + hint: 'Percentage of data to be used for test set' + + - name: skip-preprocessing + displayName: Whether to skip preprocessing data or not + visibility: public + value: 'false' + hint: 'Specify whether to skip preprocessing or not. Skip preprocessing if your dataset is already in a required format.' + - name: searchspace-config type: textarea.textarea displayName: Search space for hyperparameter tuning value: |- - batch_size_list=16,32,64,128 - lr_list=0.0001,0.001,0.01,0.1 - momentum_range=0,1 - epochs=10 + batch_size_list=16,32,64,128 # batch sizes for hyperparameter tuner + lr_list=0.0001,0.001,0.01,0.1 # learning rates for hyperparameter tuner + momentum_range=0,1 # range for momentum for hyperparameter tuner + epochs=10 # epochs to train each model for + hint: 'Define parameters for hyperparameter tuning' + - name: nas-config value: |- - epochs=20 - batch_size=128 - search_for=macro + epochs=20 # epochs to train a model for + batch_size=128 # batch size for training + search_for=macro # macro or micro search technique for ENAS displayName: Settings for Neural Architecture Search visibility: public + hint: 'Define parameters for Neural Architecture Search' + type: textarea.textarea - name: fixedparam-config value: |- - momentum=0.5 - lr=0.01 - model_type=alexnet - batch_size=16 - epochs=10 + momentum=0.5 # momentum to use for training + lr=0.01 # learning rate for training + model_type=alexnet # model to train (i.e alexnet, googlenet) + batch_size=16 # batch size for training + epochs=10 # epochs to train a model for displayName: Settings for model training visibility: public type: textarea.textarea @@ -68,70 +84,70 @@ volumeClaimTemplates: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: name: output spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: data2 + name: hyperparamtuning-data spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: output2 + name: hyperparamtuning-output spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: data3 + name: fixedparam-data spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: output3 + name: fixedparam-output spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: data4 + name: comparemodel-data spec: accessModes: [ "ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi - metadata: - name: output4 + name: comparemodel-output spec: accessModes: [ "ReadWriteOnce" ] resources: requests: storage: 20Gi - metadata: - name: data5 + name: preprocess-data spec: accessModes: [ "ReadWriteOnce" ] resources: requests: storage: 20Gi - metadata: - name: output5 + name: preprocess-output spec: accessModes: [ 
"ReadWriteOnce" ] resources: requests: - storage: 20Gi + storage: 200Gi templates: - name: main dag: @@ -235,9 +251,9 @@ templates: nnictl create --config examples/trials/pytorch-classifier/config.yml --port 8089 --foreground workingDir: /mnt volumeMounts: - - name: data2 + - name: hyperparamtuning-data mountPath: /mnt/data - - name: output2 + - name: hyperparamtuning-output mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' @@ -277,9 +293,9 @@ templates: --config="{{workflow.parameters.fixedparam-config}}" \ workingDir: /mnt volumeMounts: - - name: data3 + - name: fixedparam-data mountPath: /mnt/data - - name: output3 + - name: fixedparam-output mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' @@ -326,9 +342,9 @@ templates: python3 compare.py workingDir: /mnt volumeMounts: - - name: data4 + - name: comparemodel-data mountPath: /mnt/data - - name: output4 + - name: comparemodel-output mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' @@ -358,12 +374,13 @@ templates: git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 prepare_data.py --data_dir="/mnt/output/processed_data" \ - --test_split="{{workflow.parameters.test-split}}" + --test_split="{{workflow.parameters.test-split}}" \ + --skip="{{workflow.parameters.skip-preprocessing}}" workingDir: /mnt volumeMounts: - - name: data5 + - name: preprocess-data mountPath: /mnt/data - - name: output5 + - name: preprocess-output mountPath: /mnt/output nodeSelector: beta.kubernetes.io/instance-type: '{{workflow.parameters.sys-node-pool}}' \ No newline at end of file From 5ad2cfebef10dfecd5c4277a7534922932c648ff Mon Sep 17 00:00:00 2001 From: savan Date: Fri, 20 Nov 2020 12:14:06 -0600 Subject: [PATCH 57/70] handle case when data is already processed --- prepare_data.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/prepare_data.py b/prepare_data.py index 8bcb91f80d..afdf3855a8 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -3,6 +3,7 @@ import shutil import argparse import random +import glob def main(args): @@ -27,6 +28,25 @@ def main(args): shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'train', lbl, img.attrib['name'])) +def train_test_split(args): + + for dir in os.listdir('/mnt/data/datasets'): + os.makedirs(os.path.join(args.data_dir, 'train', dir)) + os.makedirs(os.path.join(args.data_dir, 'test', dir)) + a = glob.glob('/mnt/data/datasets/'+dir+'/*.jpg') + a.extend(glob.glob('/mnt/data/datasets/'+dir+'/*.png')) + test_len = (len(a) * int(args.test_split) )// 100 + count = 0 + for file in a: + print(file) + img_path = os.path.split(file)[-1] + if bool(random.getrandbits(1)) and count <= test_len: + shutil.move(file, os.path.join(args.data_dir, 'test', dir, img_path)) + count += 1 + else: + shutil.move(file, os.path.join(args.data_dir, 'train', dir, img_path)) + + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--xml_path', default='/mnt/data/datasets/annotations/default.xml') @@ -38,6 +58,4 @@ def main(args): if args.skip == "false": main(args) else: - os.makedirs("/mnt/output/processed_data") - for imdir in os.listdir("/mnt/data/datasets/"): - shutil.move(os.path.join("/mnt/data/datasets", imdir), "/mnt/output/processed_data/") + train_test_split(args) \ No newline at end of file From 
5235b99ad4ad8b7c88d525fdb85d76da0f2ee626 Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 23 Nov 2020 14:14:32 -0600 Subject: [PATCH 58/70] add log lines --- prepare_data.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/prepare_data.py b/prepare_data.py index afdf3855a8..6be6079a66 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -30,21 +30,20 @@ def main(args): def train_test_split(args): - for dir in os.listdir('/mnt/data/datasets'): - os.makedirs(os.path.join(args.data_dir, 'train', dir)) - os.makedirs(os.path.join(args.data_dir, 'test', dir)) + for dirn in os.listdir('/mnt/data/datasets'): + os.makedirs(os.path.join(args.data_dir, 'train', dirn)) + os.makedirs(os.path.join(args.data_dir, 'test', dirn)) a = glob.glob('/mnt/data/datasets/'+dir+'/*.jpg') a.extend(glob.glob('/mnt/data/datasets/'+dir+'/*.png')) test_len = (len(a) * int(args.test_split) )// 100 count = 0 for file in a: - print(file) img_path = os.path.split(file)[-1] if bool(random.getrandbits(1)) and count <= test_len: - shutil.move(file, os.path.join(args.data_dir, 'test', dir, img_path)) + shutil.move(file, os.path.join(args.data_dir, 'test', dirn, img_path)) count += 1 else: - shutil.move(file, os.path.join(args.data_dir, 'train', dir, img_path)) + shutil.move(file, os.path.join(args.data_dir, 'train', dirn, img_path)) if __name__ == '__main__': @@ -56,6 +55,8 @@ def train_test_split(args): parser.add_argument('--skip', default="false") args = parser.parse_args() if args.skip == "false": + print("Processing data...") main(args) else: + print("Moving files to appropriate directories...") train_test_split(args) \ No newline at end of file From d68b5f728637547c7338eec6d7ca5bc3a18b217c Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 23 Nov 2020 15:30:49 -0600 Subject: [PATCH 59/70] correct typo in var name --- prepare_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prepare_data.py b/prepare_data.py index 6be6079a66..4eac1e55ac 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -33,8 +33,8 @@ def train_test_split(args): for dirn in os.listdir('/mnt/data/datasets'): os.makedirs(os.path.join(args.data_dir, 'train', dirn)) os.makedirs(os.path.join(args.data_dir, 'test', dirn)) - a = glob.glob('/mnt/data/datasets/'+dir+'/*.jpg') - a.extend(glob.glob('/mnt/data/datasets/'+dir+'/*.png')) + a = glob.glob('/mnt/data/datasets/'+dirn+'/*.jpg') + a.extend(glob.glob('/mnt/data/datasets/'+dirn+'/*.png')) test_len = (len(a) * int(args.test_split) )// 100 count = 0 for file in a: From 50b9016fd37f89aa77bdb188c993de4442e6f3c9 Mon Sep 17 00:00:00 2001 From: savan Date: Mon, 23 Nov 2020 17:32:37 -0600 Subject: [PATCH 60/70] replace logger with print --- examples/trials/pytorch-classifier/train_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/train_model.py b/examples/trials/pytorch-classifier/train_model.py index e63f01cb21..67e2d0c816 100644 --- a/examples/trials/pytorch-classifier/train_model.py +++ b/examples/trials/pytorch-classifier/train_model.py @@ -72,7 +72,7 @@ def train_one_epoch(args, model, device, train_loader, optimizer, epoch): loss.backward() optimizer.step() if batch_idx % args['log_interval'] == 0: - logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_loader.dataset), 100. 
* batch_idx / len(train_loader), loss.item())) From 56df7a8542955069c34d7cc85fca2b14fd7d6047 Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 24 Nov 2020 13:46:17 -0600 Subject: [PATCH 61/70] update template --- .../trials/pytorch-classifier/template.yml | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index eaeecbf570..b82467eb58 100644 --- a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -2,7 +2,7 @@ entrypoint: main arguments: parameters: - name: cvat-annotation-path - value: annotation-dump/patch_medical/valid + value: annotation-dump/patch_medical/compressed_valid displayName: Dataset path hint: Path to annotated data in default object storage (i.e S3). In CVAT, this parameter will be pre-populated. visibility: private @@ -26,7 +26,7 @@ arguments: - name: skip-preprocessing displayName: Whether to skip preprocessing data or not visibility: public - value: 'false' + value: 'true' hint: 'Specify whether to skip preprocessing or not. Skip preprocessing if your dataset is already in a required format.' - name: searchspace-config @@ -189,18 +189,20 @@ templates: archive: none: {} container: - image: onepanel/nni:0.0.4 + image: onepanel/nni:1.0.0 command: [sh,-c] args: - | + mv /mnt/data/datasets/processed_data.zip ./ && \ + unzip processed_data.zip -d /mnt/data/datasets/ && \ git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 examples/nas/enas/search.py \ --config="{{workflow.parameters.nas-config}}" \ --num-classes="{{workflow.parameters.num-classes}}" \ --dataset="custom_classification" \ - --train-data-dir="/mnt/data/datasets/processed_data" \ - --valid-data-dir="/mnt/data/datasets/processed_data" + --train-data-dir="/mnt/data/datasets/processed_data/train" \ + --valid-data-dir="/mnt/data/datasets/processed_data/test" workingDir: /mnt volumeMounts: - name: data @@ -239,10 +241,12 @@ templates: archive: none: {} container: - image: onepanel/nni:0.0.4 + image: onepanel/nni:1.0.0 command: [sh,-c] args: - | + mv /mnt/data/datasets/processed_data.zip ./ && \ + unzip processed_data.zip -d /mnt/data/datasets/ && \ git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 examples/trials/pytorch-classifier/create_yaml.py \ @@ -280,17 +284,20 @@ templates: archive: none: {} container: - image: onepanel/nni:0.0.4 + image: onepanel/nni:1.0.0 command: [sh,-c] args: - | + mv /mnt/data/datasets/processed_data.zip ./ && \ + unzip processed_data.zip -d /mnt/data/datasets/ && \ git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 examples/trials/pytorch-classifier/train_model.py \ --num_classes="{{workflow.parameters.num-classes}}" \ - --train_dir="/mnt/data/datasets/processed_data" \ - --test_dir="/mnt/data/datasets/processed_data" \ + --train_dir="/mnt/data/datasets/processed_data/train" \ + --test_dir="/mnt/data/datasets/processed_data/test" \ --config="{{workflow.parameters.fixedparam-config}}" \ + --log_interval=1 workingDir: /mnt volumeMounts: - name: fixedparam-data @@ -315,10 +322,6 @@ templates: - name: compare-models inputs: artifacts: - - name: data - path: /mnt/data/datasets/ - s3: - key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}' - name: nas-metrics path: /tmp/nas-metrics.json - name: 
hyperop-metrics @@ -333,11 +336,11 @@ templates: archive: none: {} container: - image: onepanel/nni:0.0.4 + image: onepanel/nni:1.0.0 command: [sh,-c] args: - | - git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ + git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ python3 compare.py workingDir: /mnt @@ -356,6 +359,7 @@ templates: path: /mnt/data/datasets/ s3: key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}' + outputs: artifacts: - name: model @@ -369,13 +373,18 @@ templates: args: - | apt-get update && \ - apt-get install -y gcc g++ git && \ + apt-get install -y gcc g++ git unzip zip && \ python3 -m pip install setuptools && \ - git clone --single-branch --branch dev https://github.com/onepanelio/nni.git && \ + cd /mnt/data/datasets && \ + unzip processed_data.zip && \ + rm -f processed_data.zip && \ + cd /mnt && \ + git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ cd nni/ && \ - python3 prepare_data.py --data_dir="/mnt/output/processed_data" \ + python3 prepare_data.py --data_dir="./processed_data" \ --test_split="{{workflow.parameters.test-split}}" \ - --skip="{{workflow.parameters.skip-preprocessing}}" + --skip="{{workflow.parameters.skip-preprocessing}}" && \ + zip -r /mnt/output/processed_data.zip ./processed_data workingDir: /mnt volumeMounts: - name: preprocess-data From 9c6fb6f16d24b4e2df228d07b255e9af4b3cf092 Mon Sep 17 00:00:00 2001 From: savan Date: Sun, 29 Nov 2020 16:09:04 -0600 Subject: [PATCH 62/70] handle case when search params aren;t provided --- .../trials/pytorch-classifier/create_yaml.py | 18 +++++-- .../trials/pytorch-classifier/template.yml | 47 ++++++++++++++++--- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py index ce641998c9..8e28111c2b 100644 --- a/examples/trials/pytorch-classifier/create_yaml.py +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -8,12 +8,20 @@ def main(args): data['trial']['command'] = "python3 main.py --num_classes {} --epochs {}".format(args['num_classes'], args['epochs']) with open(args['output_path'], 'w') as yaml_file: - yaml_file.write( yaml.dump(data, default_flow_style=False)) - mm_list = [int(item) for item in args['momentum_range'].split(',')] - lr_list = [float(item) for item in args['lr_list'].split(',')] - bs_list = [int(item) for item in args['batch_size_list'].split(',')] + yaml_file.write(yaml.dump(data, default_flow_style=False)) + + json_data = {} + if 'momentum_range' in args: + mm_list = [int(item) for item in args['momentum_range'].split(',')] + json_data['momentum'] = {"_type":"uniform","_value":mm_list} + if 'lr_list' in args: + lr_list = [float(item) for item in args['lr_list'].split(',')] + json_data['lr'] = {"_type":"choice","_value":lr_list} + if 'batch_size_list' in args: + bs_list = [int(item) for item in args['batch_size_list'].split(',')] + json_data['batch_size'] = {'_type':'choice', '_value':bs_list} + with open(args['output_search_space_path'], 'w') as json_file: - json_data = {'batch_size': {'_type':'choice', '_value':bs_list}, 'lr':{"_type":"choice","_value":lr_list} , 'momentum':{"_type":"uniform","_value":mm_list}} json.dump(json_data, json_file) if __name__ == "__main__": diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml index b82467eb58..a4d962deac 100644 --- 
a/examples/trials/pytorch-classifier/template.yml +++ b/examples/trials/pytorch-classifier/template.yml @@ -335,14 +335,47 @@ templates: optional: true archive: none: {} - container: + script: image: onepanel/nni:1.0.0 - command: [sh,-c] - args: - - | - git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \ - cd nni/ && \ - python3 compare.py + command: [python3, '-u'] + source: | + import json + + accuracies = {} + try: + with open('/tmp/nas-metrics.json') as f: + nas = json.load(f) + print("Metrics for Neural Architecture Search: ", nas) + accuracies['nas_acc'] = [float(i['value']) for i in nas if i['name'] == 'accuracy'][0] + except RuntimeError as e: + print("Error occurred while reading metrics for NAS: ", e) + + try: + with open('/tmp/hyperop-metrics.json') as f: + hyper = json.load(f) + print("Metrics for hyper parameter optimization: ", hyper) + accuracies['hyperparam_acc'] = [float(i['value']) for i in hyper if i['name'] == 'accuracy'][0] + except RuntimeError as e: + print("Error occurred while reading metrics for hyperparameter optimization: ", e) + + try: + with open('/tmp/singlemodel-metrics.json') as f: + fm = json.load(f) + print("Metrics for model trained with fixed parameters: ", fm) + accuracies['fixedparam_acc'] = [float(i['value']) for i in fm if i['name'] == 'accuracy'][0] + except RuntimeError as e: + print("Error occurred while reading metrics for fixed-param model: ", e) + + max_acc_name = max(accuracies, key=accuracies.get) + print("Maximum accuracy was {} for {}".format(max(accuracies.values()), max_acc_name)) + + metrics = [{'name': mac_acc_name, 'value': round(max(accuracies.values()),2)}] + try: + with open('/tmp/sys-metrics.json') as f: + json.dump(metrics, f) + except: + pass + workingDir: /mnt volumeMounts: - name: comparemodel-data From 1e2f39eb9b91d8182e99f14e7f5f2d2bd5632fdb Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 1 Dec 2020 16:13:01 -0600 Subject: [PATCH 63/70] update maximum trials --- examples/trials/pytorch-classifier/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/config.yml b/examples/trials/pytorch-classifier/config.yml index d58b9e133d..ff6e8ff5cf 100644 --- a/examples/trials/pytorch-classifier/config.yml +++ b/examples/trials/pytorch-classifier/config.yml @@ -2,7 +2,7 @@ authorName: default experimentName: pytorch_classifier trialConcurrency: 1 maxExecDuration: 10h -maxTrialNum: 15 +maxTrialNum: 1 #choice: local, remote, pai trainingServicePlatform: local searchSpacePath: search_space.json From 24088cdf0afe306ba76d873f0e716edb8a593c89 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 2 Dec 2020 10:25:05 -0600 Subject: [PATCH 64/70] update data directory for preprocessing --- prepare_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/prepare_data.py b/prepare_data.py index 4eac1e55ac..343a01a10f 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -30,11 +30,11 @@ def main(args): def train_test_split(args): - for dirn in os.listdir('/mnt/data/datasets'): + for dirn in os.listdir(args.image_dir): os.makedirs(os.path.join(args.data_dir, 'train', dirn)) os.makedirs(os.path.join(args.data_dir, 'test', dirn)) - a = glob.glob('/mnt/data/datasets/'+dirn+'/*.jpg') - a.extend(glob.glob('/mnt/data/datasets/'+dirn+'/*.png')) + a = glob.glob(args.image_dir+'/'+dirn+'/*.jpg') + a.extend(glob.glob(args.image_dir+'/'+dirn+'/*.png')) test_len = (len(a) * int(args.test_split) )// 100 count = 0 for file in a: From 
2d29f3f364f6325b5eb7ea0f03a8a2499841f237 Mon Sep 17 00:00:00 2001 From: savan Date: Wed, 2 Dec 2020 11:58:50 -0600 Subject: [PATCH 65/70] delete lost+found directories --- prepare_data.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/prepare_data.py b/prepare_data.py index 343a01a10f..158001e55c 100644 --- a/prepare_data.py +++ b/prepare_data.py @@ -59,4 +59,11 @@ def train_test_split(args): main(args) else: print("Moving files to appropriate directories...") - train_test_split(args) \ No newline at end of file + train_test_split(args) + # clean up, lost+found directory causes PyTorch to think there are three classes + # so, remove it + try: + shutil.rmtree(os.path.join(args.data_dir, 'train', 'lost+found')) + shutil.rmtree(os.path.join(args.data_dir, 'test', 'lost+found')) + except: + pass \ No newline at end of file From 0bf71940b0cd57e7bad330f16e42efeb141bbf29 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 3 Dec 2020 14:22:38 -0600 Subject: [PATCH 66/70] pass model_type to main script --- examples/trials/pytorch-classifier/create_yaml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py index 8e28111c2b..9905c0b7af 100644 --- a/examples/trials/pytorch-classifier/create_yaml.py +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -5,7 +5,7 @@ def main(args): stream = open(args['config_path'], 'r') data = yaml.load(stream) - data['trial']['command'] = "python3 main.py --num_classes {} --epochs {}".format(args['num_classes'], args['epochs']) + data['trial']['command'] = "python3 main.py --num_classes {} --epochs {} --model_type {}".format(args['num_classes'], args['epochs'], args['model_type']) with open(args['output_path'], 'w') as yaml_file: yaml_file.write(yaml.dump(data, default_flow_style=False)) From 686a1b56137873013a7579c186d9e41cbb638367 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 3 Dec 2020 18:17:29 -0600 Subject: [PATCH 67/70] allow user to update settings in config.yml --- examples/trials/pytorch-classifier/create_yaml.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py index 9905c0b7af..9d6c2bc53d 100644 --- a/examples/trials/pytorch-classifier/create_yaml.py +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -6,6 +6,17 @@ def main(args): stream = open(args['config_path'], 'r') data = yaml.load(stream) data['trial']['command'] = "python3 main.py --num_classes {} --epochs {} --model_type {}".format(args['num_classes'], args['epochs'], args['model_type']) + # update config settings + if 'max_trial_num' in args: + data['maxTrialNum'] = int(args['max_trial_num']) + if 'max_exec_duration' in args: + data['maxExecDuration'] = args['max_exec_duration'] + if 'trial_concurrency' in args: + data['trialConcurrency'] = int(args['trial_concurrency']) + if 'use_annotation' in args: + data['useAnnotation'] = args['use_annotation'] + if 'tuner' in args: + data['builtinTunerName'] = args['tuner'] with open(args['output_path'], 'w') as yaml_file: yaml_file.write(yaml.dump(data, default_flow_style=False)) From 48baeb6720c7d16042fbac3d1893105de1ebb176 Mon Sep 17 00:00:00 2001 From: savan Date: Thu, 3 Dec 2020 18:33:42 -0600 Subject: [PATCH 68/70] resolve key access error --- examples/trials/pytorch-classifier/create_yaml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py
index 9d6c2bc53d..a89eced5ab 100644
--- a/examples/trials/pytorch-classifier/create_yaml.py
+++ b/examples/trials/pytorch-classifier/create_yaml.py
@@ -16,7 +16,7 @@ def main(args):
     if 'use_annotation' in args:
         data['useAnnotation'] = args['use_annotation']
     if 'tuner' in args:
-        data['builtinTunerName'] = args['tuner']
+        data['tuner']['builtinTunerName'] = args['tuner']
 
     with open(args['output_path'], 'w') as yaml_file:
         yaml_file.write(yaml.dump(data, default_flow_style=False))

From 1a840b5d444ee8f95f2a6090f7401ba1f1f3ecad Mon Sep 17 00:00:00 2001
From: savan
Date: Mon, 7 Dec 2020 21:58:47 -0600
Subject: [PATCH 69/70] add comments

---
 compare.py                                   |  2 +
 .../trials/pytorch-classifier/template.yml   | 72 +++++++++++++------
 nni/algorithms/nas/pytorch/enas/trainer.py   |  1 +
 nni/nas/pytorch/trainer.py                   |  2 +
 nni/trial.py                                 | 17 +++--
 prepare_data.py                              | 11 ++-
 6 files changed, 74 insertions(+), 31 deletions(-)

diff --git a/compare.py b/compare.py
index 2783e810ef..7e62fc923d 100644
--- a/compare.py
+++ b/compare.py
@@ -1,3 +1,5 @@
+# Read sys-metrics files and find the one with the highest accuracy.
+
 import json
 
 accuracies = {}
diff --git a/examples/trials/pytorch-classifier/template.yml b/examples/trials/pytorch-classifier/template.yml
index a4d962deac..a5f099441f 100644
--- a/examples/trials/pytorch-classifier/template.yml
+++ b/examples/trials/pytorch-classifier/template.yml
@@ -33,10 +33,13 @@ arguments:
     type: textarea.textarea
     displayName: Search space for hyperparameter tuning
     value: |-
-      batch_size_list=16,32,64,128 # batch sizes for hyperparameter tuner
-      lr_list=0.0001,0.001,0.01,0.1 # learning rates for hyperparameter tuner
-      momentum_range=0,1 # range for momentum for hyperparameter tuner
-      epochs=10 # epochs to train each model for
+      model_type=alexnet # any model type supported by torchvision
+      batch_size_list=16,32,64,128 # batch sizes for hyperparameter tuner
+      lr_list=0.0001,0.001,0.01,0.1 # learning rates for hyperparameter tuner
+      momentum_range=0,1 # range for momentum for hyperparameter tuner
+      epochs=10 # epochs to train each model for
+      tuner=TPE # any tuner supported by NNI
+      max_trial_num=2 # max number of trials to run
     hint: 'Define parameters for hyperparameter tuning'
 
   - name: nas-config
@@ -189,7 +192,7 @@ templates:
           archive:
             none: {}
     container:
-      image: onepanel/nni:1.0.0
+      image: onepanel/nni:1.0.2
       command: [sh,-c]
       args:
        - |
@@ -241,7 +244,7 @@ templates:
          archive:
            none: {}
    container:
-      image: onepanel/nni:1.0.0
+      image: onepanel/nni:1.0.2
      command: [sh,-c]
      args:
       - |
@@ -284,12 +287,14 @@ templates:
           archive:
             none: {}
     container:
-      image: onepanel/nni:1.0.0
+      image: onepanel/nni:1.0.2
       command: [sh,-c]
       args:
        - |
         mv /mnt/data/datasets/processed_data.zip ./ && \
         unzip processed_data.zip -d /mnt/data/datasets/ && \
+        ls /mnt/data/datasets/ && \
+        ls /mnt/data/datasets/processed_data/train/ && \
         git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \
         cd nni/ && \
         python3 examples/trials/pytorch-classifier/train_model.py \
@@ -336,10 +341,15 @@ templates:
           archive:
             none: {}
     script:
-      image: onepanel/nni:1.0.0
+      image: onepanel/nni:1.0.2
       command: [python3, '-u']
       source: |
         import json
+        import os
+        import onepanel.core.api
+        from onepanel.core.api.models.metric import Metric
+        from onepanel.core.api.rest import ApiException
+        from onepanel.core.api.models import Parameter
 
         accuracies = {}
         try:
@@ -368,14 +378,32 @@ templates:
         max_acc_name = max(accuracies, key=accuracies.get)
         print("Maximum accuracy was {} for {}".format(max(accuracies.values()), max_acc_name))
 
+        metrics = [{'name': 'accuracy', 'value': round(max(accuracies.values()),2)}]
+        with open('/var/run/secrets/kubernetes.io/serviceaccount/token') as f:
+            token = f.read()
+
+        # Configure API authorization
+        configuration = onepanel.core.api.Configuration(
+            host = os.getenv('ONEPANEL_API_URL'),
+            api_key = {
+                'authorization': token
+            }
+        )
+        configuration.api_key_prefix['authorization'] = 'Bearer'
 
-        metrics = [{'name': max_acc_name, 'value': round(max(accuracies.values()),2)}]
-        try:
-            with open('/tmp/sys-metrics.json', 'w') as f:
-                json.dump(metrics, f)
-        except OSError:
-            pass
-
+        # Call SDK method to save metrics
+        with onepanel.core.api.ApiClient(configuration) as api_client:
+            api_instance = onepanel.core.api.WorkflowServiceApi(api_client)
+            namespace = '{{workflow.namespace}}'
+            uid = '{{workflow.name}}'
+            body = onepanel.core.api.AddWorkflowExecutionsMetricsRequest()
+            body.metrics = metrics
+            try:
+                api_response = api_instance.add_workflow_execution_metrics(namespace, uid, body)
+                print('Metrics added.')
+            except ApiException as e:
+                print("Exception when calling WorkflowServiceApi->add_workflow_execution_metrics: %s\n" % e)
+
       workingDir: /mnt
       volumeMounts:
         - name: comparemodel-data
@@ -389,9 +417,9 @@ templates:
     inputs:
       artifacts:
         - name: data
-          path: /mnt/data/datasets/
-          s3:
-            key: '{{workflow.namespace}}/{{workflow.parameters.cvat-annotation-path}}'
+          path: /mnt/data/patch_medical_valid.zip
+          http:
+            url: https://github.com/onepanelio/templates/releases/download/v0.2.0/patch_medical_valid.zip
     outputs:
       artifacts:
         apt-get update && \
         apt-get install -y gcc g++ git unzip zip && \
         python3 -m pip install setuptools && \
-        cd /mnt/data/datasets && \
-        unzip processed_data.zip && \
-        rm -f processed_data.zip && \
+        ls /mnt/data && \
+        cd /mnt/data/ && \
+        unzip patch_medical_valid.zip && \
+        rm -f patch_medical_valid.zip && \
         cd /mnt && \
         git clone --single-branch --branch fix/config_param https://github.com/onepanelio/nni.git && \
         cd nni/ && \
         python3 prepare_data.py --data_dir="./processed_data" \
                                 --test_split="{{workflow.parameters.test-split}}" \
+                                --image_dir="/mnt/data" \
                                 --skip="{{workflow.parameters.skip-preprocessing}}" && \
         zip -r /mnt/output/processed_data.zip ./processed_data
       workingDir: /mnt
diff --git a/nni/algorithms/nas/pytorch/enas/trainer.py b/nni/algorithms/nas/pytorch/enas/trainer.py
index 33147b6174..dc45d6da43 100644
--- a/nni/algorithms/nas/pytorch/enas/trainer.py
+++ b/nni/algorithms/nas/pytorch/enas/trainer.py
@@ -208,4 +208,5 @@ def validate_one_epoch(self, epoch):
             logger.info("Test Epoch [%d/%d] Arc [%d/%d] Summary %s",
                         epoch + 1, self.num_epochs, arc_id + 1, self.test_arc_per_epoch,
                         meters.summary())
+        # return metrics so that they can be saved later
         return meters
\ No newline at end of file
diff --git a/nni/nas/pytorch/trainer.py b/nni/nas/pytorch/trainer.py
index 08c1384bf3..7f49e0c059 100644
--- a/nni/nas/pytorch/trainer.py
+++ b/nni/nas/pytorch/trainer.py
@@ -92,6 +92,7 @@ def __init__(self, model, mutator, loss, metrics, optimizer, num_epochs,
         self.batch_size = batch_size
         self.workers = workers
         self.log_frequency = log_frequency
+        # update log dir for Onepanel
         self.log_dir = "/mnt/output/naslogs"
         os.makedirs(self.log_dir, exist_ok=True)
         self.status_writer = open(os.path.join(self.log_dir, "log"), "w")
@@ -144,6 +145,7 @@ def train(self, validate=True):
             if validate:
                 # validation
                 _logger.info("Epoch %d Validating", epoch + 1)
+                # keep track of metrics so that they can be used later
                 self.val_model_summary = self.validate_one_epoch(epoch)
 
             for callback in self.callbacks:
diff --git a/nni/trial.py b/nni/trial.py
index 0d24274899..13b6e3289e 100644
--- a/nni/trial.py
+++ b/nni/trial.py
@@ -23,10 +23,6 @@
 _trial_id = platform.get_trial_id()
 _sequence_id = platform.get_sequence_id()
 
-#keep track of highest accuracy
-#_best_params = os.getenv('_BEST_PARAMS', None)
-#_best_score = os.getenv('_BEST_SCORE', 0)
-
 def get_next_parameter():
     """
     Get the hyper paremeters generated by tuner. For a multiphase experiment, it returns a new group of hyper
@@ -147,25 +143,28 @@ def report_final_result(metric):
 
 
 def update_score(metric):
-
-    #keep track of highest accuracy
+    """
+    Keep track of metrics over trials. Maintain the highest accuracy so far.
+    """
     _sysdir = trial_env_vars.NNI_SYS_DIR
     _trials = os.path.dirname(_sysdir)
     if os.path.exists(os.path.join(_trials, 'best_score.json')):
         with open(os.path.join(_trials, 'best_score.json'), "r") as jsonFile:
             data = json.load(jsonFile)
-        if float(data['score']) < metric:
+        if float(data['score']) < metric: # new accuracy is higher than the previous one
             data['score'] = str(metric)
-
+            # update the best_score file with the new score
             with open(os.path.join(_trials, 'best_score.json'), "w") as jsonFile2:
                 print("updating json file", data)
                 json.dump(data, jsonFile2)
-    else:
+    else: # first trial, create a new best_score file
         params = get_current_parameter()
         with open(os.path.join(_trials, 'best_score.json'),'w') as f:
             json.dump({'score':metric, 'params':str(params) } , f)
 
 def get_best_params():
+    """ Read best_score.json and return the highest score (i.e. accuracy)
+    """
     _sysdir = trial_env_vars.NNI_SYS_DIR
     _trials = os.path.dirname(_sysdir)
     if os.path.exists(os.path.join(_trials, 'best_score.json')):
diff --git a/prepare_data.py b/prepare_data.py
index 158001e55c..eba66deb28 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -6,7 +6,12 @@
 import glob
 
 def main(args):
+    """ CVAT's XML export is a single xml file that contains annotations and paths to images.
+    The training script requires the data to be in PyTorch's ImageFolder layout, with one
+    directory per class, so this function creates that layout.
+    It also splits the data into train and test sets.
+    """
     tree = ET.parse(args.xml_path)
     root = tree.getroot()
@@ -14,6 +19,7 @@ def main(args):
     for label in root.iter('label'):
         os.makedirs(os.path.join(args.data_dir, 'train', label.find('name').text))
         os.makedirs(os.path.join(args.data_dir, 'test', label.find('name').text))
+
     images_len = len(list(root.iter('tag')))
     test_len = (images_len * args.test_split )// 100
     count = 0
@@ -22,6 +28,7 @@ def main(args):
         lbl = img.find('tag').attrib['label']
         if lbl:
             if bool(random.getrandbits(1)) and count <= test_len :
+                # randomly put image into test or train dir
                 shutil.move(os.path.join(args.image_dir, img.attrib['name']), os.path.join(args.data_dir, 'test', lbl, img.attrib['name']))
                 count += 1
             else:
@@ -29,7 +36,9 @@ def main(args):
 
 
 def train_test_split(args):
-
+    """
+    If images are already in ImageFolder format, then just split them into train and test.
+ """ for dirn in os.listdir(args.image_dir): os.makedirs(os.path.join(args.data_dir, 'train', dirn)) os.makedirs(os.path.join(args.data_dir, 'test', dirn)) From 346eee190d923b2d3a5f4e4b48a898b12680b79d Mon Sep 17 00:00:00 2001 From: savan Date: Tue, 8 Dec 2020 14:59:58 -0600 Subject: [PATCH 70/70] add comments and support for new models --- .../trials/pytorch-classifier/create_yaml.py | 4 ++ examples/trials/pytorch-classifier/main.py | 42 +++++++++++-------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/examples/trials/pytorch-classifier/create_yaml.py b/examples/trials/pytorch-classifier/create_yaml.py index a89eced5ab..e415e2bed7 100644 --- a/examples/trials/pytorch-classifier/create_yaml.py +++ b/examples/trials/pytorch-classifier/create_yaml.py @@ -1,3 +1,4 @@ +# script to generate config yaml file dynamically import yaml import argparse import json @@ -5,6 +6,7 @@ def main(args): stream = open(args['config_path'], 'r') data = yaml.load(stream) + # update command based on args data['trial']['command'] = "python3 main.py --num_classes {} --epochs {} --model_type {}".format(args['num_classes'], args['epochs'], args['model_type']) # update config settings if 'max_trial_num' in args: @@ -21,6 +23,8 @@ def main(args): with open(args['output_path'], 'w') as yaml_file: yaml_file.write(yaml.dump(data, default_flow_style=False)) + # update search space for hyperparam tuning + # script needs to be updated for each new param json_data = {} if 'momentum_range' in args: mm_list = [int(item) for item in args['momentum_range'].split(',')] diff --git a/examples/trials/pytorch-classifier/main.py b/examples/trials/pytorch-classifier/main.py index a901a7f67f..6369dc4db4 100644 --- a/examples/trials/pytorch-classifier/main.py +++ b/examples/trials/pytorch-classifier/main.py @@ -18,22 +18,6 @@ logger = logging.getLogger('pytorch_classifier') - -# mean = 0.0 -# for images, _ in loader: -# batch_samples = images.size(0) -# images = images.view(batch_samples, images.size(1), -1) -# mean += images.mean(2).sum(0) -# mean = mean / len(loader.dataset) - -# var = 0.0 -# for images, _ in loader: -# batch_samples = images.size(0) -# images = images.view(batch_samples, images.size(1), -1) -# var += ((images - mean.unsqueeze(1))**2).sum([0,2]) -# std = torch.sqrt(var / (len(loader.dataset)*224*224)) - - def build_model(model_type, num_classes): if model_type == "googlenet": model = models.googlenet(pretrained=True) @@ -41,12 +25,36 @@ def build_model(model_type, num_classes): elif model_type == "resnet50": model = models.resnet50(pretrained=True) in_features = 2048 + elif model_type == "resnet18": + model = models.resnet18(pretrained=True) + in_features = 512 elif model_type == "alexnet": model = models.alexnet(pretrained=True) in_features = 4096 elif model_type == "vgg19": - model = models.alexnet(pretrained=True) + model = models.vgg19(pretrained=True) + in_features = 4096 + elif model_type == "vgg16": + model = models.vgg16(pretrained=True) in_features = 4096 + elif model_type == "mobilenet_v2": + model = models.mobilenet_v2(pretrained=True) + model.classifier[1] = nn.Linear(1280, num_classes) + return model + elif model_type == "inception_v3": + model = models.inception_v3(pretrained=True) + model.fc = nn.Linear(2048, num_classes) + return model + elif model_type == "densenet161": + model = models.densenet161(pretrained=True) # other variants are 121, 169, 201 + model.classifier = nn.Linear(2208, num_classes) + return model + elif model_type == "squeezenet": + # squeezenet has diff architecture, 
+        model = models.squeezenet1_0(pretrained=True)
+        model.classifier[1] = nn.Conv2d(512, num_classes, 1)
+        return model
+
-    if model_type in ['alexnet', 'vgg19']:
+    if model_type in ['alexnet', 'vgg19', 'vgg16']:
         model.classifier._modules['6'] = nn.Sequential(nn.Linear(in_features, num_classes), nn.LogSoftmax(dim=1))