0.5.6 (#284)
Linux-cpp-lisp authored Dec 20, 2022
2 parents a65b3eb + 5a365e0 commit dceaf49
Showing 52 changed files with 1,388 additions and 909 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/tests.yml
@@ -15,8 +15,8 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7, 3.9]
torch-version: [1.10.0, 1.11.0]
python-version: [3.9]
torch-version: [1.11.0, 1.12.1]

steps:
- uses: actions/checkout@v2
@@ -44,4 +44,4 @@ jobs:
- name: Test with pytest
run: |
# See https://github.com/pytest-dev/pytest/issues/1075
PYTHONHASHSEED=0 pytest -n auto --ignore=docs/ .
PYTHONHASHSEED=0 pytest -n auto tests/
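The pinned `PYTHONHASHSEED=0` works around the nondeterministic test collection described in the linked pytest issue: without a fixed seed, Python randomizes string hashing per process. A standalone sketch of the effect (the probe string here is arbitrary, not anything nequip-specific):

```python
import os
import subprocess
import sys

def string_hash(seed):
    # launch a fresh interpreter so the hash seed actually takes effect;
    # hashing is fixed once per process at startup
    env = {**os.environ, "PYTHONHASHSEED": seed}
    out = subprocess.run(
        [sys.executable, "-c", "print(hash('nequip'))"],
        env=env, capture_output=True, text=True,
    )
    return out.stdout.strip()

# with a fixed seed, string hashes are reproducible across processes,
# which makes hash-order-dependent behavior (like test collection) stable
assert string_hash("0") == string_hash("0")
```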
4 changes: 2 additions & 2 deletions .github/workflows/tests_develop.yml
@@ -16,7 +16,7 @@ jobs:
strategy:
matrix:
python-version: [3.9]
torch-version: [1.11.0]
torch-version: [1.12.1]

steps:
- uses: actions/checkout@v2
@@ -44,4 +44,4 @@ jobs:
- name: Test with pytest
run: |
# See https://github.com/pytest-dev/pytest/issues/1075
PYTHONHASHSEED=0 pytest -n auto --ignore=docs/ .
PYTHONHASHSEED=0 pytest -n auto tests/
15 changes: 14 additions & 1 deletion CHANGELOG.md
@@ -7,7 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
Most recent change on the bottom.


## [Unreleased] - 0.5.6
## [0.5.6] - 2022-12-19
### Added
- `nequip-benchmark` and `nequip-train` report the number of weights and the number of trainable weights
- `nequip-benchmark --no-compile`, `--verbose`, and `--memory-summary`
- `nequip-benchmark --pdb` for debugging model (builder) errors
- More information in `nequip-deploy info`

### Changed
- Minimum e3nn version is now 0.4.4
- `--equivariance-test` now prints much more information, especially when there is a failure

### Fixed
- Git utilities when installed as a ZIPed `.egg` (#264)

### Removed
- `sklearn` dependency

## [0.5.5] - 2022-06-20
### Added
28 changes: 22 additions & 6 deletions configs/full.yaml
@@ -17,7 +17,20 @@ default_dtype: float32
allow_tf32: false # whether to use TensorFloat32 if it is available
# device: cuda # which device to use. Default: automatically detected cuda or "cpu"

# network
# == network ==

# `model_builders` defines a series of functions that will be called to construct the model
# each model builder has the opportunity to update the model, the config, or both
# model builders from other packages are allowed (see mir-group/allegro for an example); those from `nequip.model` don't require a prefix
# these are the default model builders:
model_builders:
- SimpleIrrepsConfig # update the config with all the irreps for the network if using the simplified `l_max` / `num_features` / `parity` syntax
- EnergyModel # build a full NequIP model
- PerSpeciesRescale # add per-atom / per-species scaling and shifting to the NequIP model before the total energy sum
- ForceOutput # wrap the energy model in a module that uses autodifferentiation to compute the forces
- RescaleEnergyEtc # wrap the entire model in the appropriate global rescaling of the energy, forces, etc.
# ^ global rescaling blocks must always go last!
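The builder chain described above can be illustrated with a toy sketch of the pattern (the builder names and bodies below are hypothetical stand-ins, not the actual `nequip.model` implementations): each builder receives the config and the model built so far, and may update the config, build a model, or wrap it.

```python
def build_model(config, builders):
    # call each builder in order; a builder may update the config,
    # construct a new model, or wrap the model built so far
    model = None
    for build in builders:
        model = build(config, model)
    return model

# toy stand-ins for builders like SimpleIrrepsConfig / EnergyModel / RescaleEnergyEtc
def fill_defaults(config, model):
    config.setdefault("num_features", 16)  # updates only the config
    return model

def make_core(config, model):
    return {"core": f"{config['num_features']} features"}  # builds the model

def wrap_rescale(config, model):
    return {"rescale": model}  # wraps the model; must come last

model = build_model({}, [fill_defaults, make_core, wrap_rescale])
assert model == {"rescale": {"core": "16 features"}}
```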

r_max: 4.0 # cutoff radius in length units, here Angstrom; this is an important hyperparameter to scan
num_layers: 4 # number of interaction blocks, we find 3-5 to work best

@@ -198,6 +211,8 @@ loss_coeffs:
total_energy:
- 1
- PerAtomMSELoss
# note that the ratio between the force and energy loss terms matters for training. A 1:1 ratio with PerAtomMSELoss is a reasonable starting point. If the energy loss still dominates the total loss in the initial epochs, lowering the energy loss weight can help training considerably.
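A minimal pure-Python sketch of what a per-atom energy MSE computes (illustrative only, not the actual `PerAtomMSELoss` implementation): each total energy is normalized by its structure's atom count before squaring, so large and small structures contribute on the same footing.

```python
def per_atom_energy_mse(pred_E, true_E, n_atoms):
    # normalize each total energy by the structure's atom count,
    # then take the mean squared error over structures
    errs = [(p / n - t / n) ** 2 for p, t, n in zip(pred_E, true_E, n_atoms)]
    return sum(errs) / len(errs)

# a 100-atom and a 10-atom structure with the same 1 eV total-energy error
# contribute very differently once normalized per atom
loss = per_atom_energy_mse([101.0, 11.0], [100.0, 10.0], [100, 10])
```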


# # the default loss function is MSELoss; the name has to exactly match one in torch.nn
# the only supported targets are forces and total_energy
@@ -302,10 +317,10 @@ per_species_rescale_scales: dataset_forces_rms
# If not provided, defaults to dataset_per_species_force_rms or dataset_per_atom_total_energy_std, depending on whether forces are being trained.
# per_species_rescale_kwargs:
# total_energy:
# alpha: 0.1
# alpha: 0.001
# max_iteration: 20
# stride: 100
# keywords for GP decomposition of per specie energy. Optional. Defaults to 0.1
# keywords for ridge regression decomposition of per-species energy. Optional. Defaults to 0.001. The value should be in the range 1e-3 to 1e-2
# per_species_rescale_arguments_in_dataset_units: True
# if explicit numbers are given for the shifts/scales, this parameter must specify whether the given numbers are unitless shifts/scales or are in the units of the dataset. If ``True``, any global rescalings will correctly be applied to the per-species values.
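The ridge-regression decomposition mentioned above can be sketched in pure Python for a two-species case (an illustrative sketch, not the actual nequip solver): given per-frame species counts C and total energies E, solve the regularized normal equations (C^T C + alpha I) x = C^T E for the per-species energies.

```python
def ridge_species_energies(counts, energies, alpha=0.001):
    """Estimate per-species energy shifts for two species.

    counts:   list of (n_species0, n_species1) pairs, one per frame
    energies: total energy per frame
    alpha:    ridge regularization strength (cf. the default above)
    """
    # build the 2x2 normal matrix C^T C + alpha * I
    a = sum(c[0] * c[0] for c in counts) + alpha
    b = sum(c[0] * c[1] for c in counts)
    d = sum(c[1] * c[1] for c in counts) + alpha
    # right-hand side C^T E
    r0 = sum(c[0] * e for c, e in zip(counts, energies))
    r1 = sum(c[1] * e for c, e in zip(counts, energies))
    # solve the 2x2 system by explicit inversion
    det = a * d - b * b
    return (d * r0 - b * r1) / det, (a * r1 - b * r0) / det

# frames: a single A atom, a single B atom, and an A+B dimer
e_A, e_B = ridge_species_energies([(1, 0), (0, 1), (1, 1)], [-1.0, -2.0, -3.0])
```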

@@ -329,9 +344,10 @@ global_rescale_scale_trainable: false
# global_rescale_shift_trainable: false
# global_rescale_scale: dataset_forces_rms
# global_rescale_scale_trainable: false
# per_species_rescale_trainable: true
# per_species_rescale_shifts: dataset_per_atom_total_energy_mean
# per_species_rescale_scales: dataset_per_atom_total_energy_std
# per_species_rescale_shifts_trainable: false
# per_species_rescale_scales_trainable: true
# per_species_rescale_shifts: dataset_per_species_total_energy_mean
# per_species_rescale_scales: dataset_per_species_forces_rms

# # full block needed for global rescale
# global_rescale_shift: dataset_total_energy_mean
13 changes: 6 additions & 7 deletions configs/minimal_toy_emt.yaml
@@ -6,15 +6,18 @@ dataset_seed: 456

# network
model_builders:
- SimpleIrrepsConfig
- EnergyModel
- PerSpeciesRescale
- StressForceOutput
- RescaleEnergyEtc

num_basis: 8
r_max: 4.0
irreps_edge_sh: 0e + 1o
conv_to_output_hidden_irreps_out: 16x0e
feature_irreps_hidden: 16x0o + 16x0e + 16x1o + 16x1e
l_max: 1
parity: true
num_features: 16
num_layers: 4
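The simplified `l_max` / `parity` / `num_features` keys above replace the explicit `feature_irreps_hidden` line removed in this diff; a hypothetical sketch of how that expansion works with `parity: true` (not the actual `SimpleIrrepsConfig` code):

```python
def expand_feature_irreps(l_max, num_features):
    # with parity: true, include both odd (o) and even (e) irreps
    # for every rotation order l up to l_max
    return " + ".join(
        f"{num_features}x{l}{p}"
        for l in range(l_max + 1)
        for p in ("o", "e")
    )

# reproduces the removed explicit line for l_max=1, num_features=16
assert expand_feature_irreps(1, 16) == "16x0o + 16x0e + 16x1o + 16x1e"
```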

# data set
dataset: EMTTest # type of data set, can be npz or ase
@@ -23,10 +26,6 @@ dataset_num_frames: 100
chemical_symbols:
- Cu

global_rescale_scale: dataset_total_energy_std
per_species_rescale_shifts: dataset_per_atom_total_energy_mean
per_species_rescale_scales: dataset_per_atom_total_energy_std

# logging
wandb: false
# verbose: debug
3 changes: 2 additions & 1 deletion docs/api/nequip.rst
@@ -3,4 +3,5 @@ Python API

.. toctree::

data
data
trainer
10 changes: 10 additions & 0 deletions docs/api/trainer.rst
@@ -0,0 +1,10 @@
nequip.trainer
==============

.. automodule:: nequip.train.trainer
:members:
:imported-members:

.. automodule:: nequip.train.trainer_wandb
:members:
:imported-members:
3 changes: 3 additions & 0 deletions docs/cite.rst
@@ -0,0 +1,3 @@
Citing NequIP
=============

132 changes: 132 additions & 0 deletions docs/commandline/commands.rst
@@ -0,0 +1,132 @@
Command-line Executables
========================

``nequip-train``
----------------

.. code ::
usage: nequip-train [-h] [--equivariance-test] [--model-debug-mode] [--grad-anomaly-mode] [--log LOG] config
Train (or restart training of) a NequIP model.

positional arguments:
config YAML file configuring the model, dataset, and other options

optional arguments:
-h, --help show this help message and exit
--equivariance-test test the model's equivariance before training
--model-debug-mode enable model debug mode, which can sometimes give much more useful error messages at the
cost of some speed. Do not use for production training!
--grad-anomaly-mode enable PyTorch autograd anomaly mode to debug NaN gradients. Do not use for production
training!
--log LOG log file to store all the screen logging

``nequip-evaluate``
-------------------

.. code ::
usage: nequip-evaluate [-h] [--train-dir TRAIN_DIR] [--model MODEL] [--dataset-config DATASET_CONFIG]
[--metrics-config METRICS_CONFIG] [--test-indexes TEST_INDEXES] [--batch-size BATCH_SIZE]
[--device DEVICE] [--output OUTPUT] [--log LOG]
Compute the error of a model on a test set using various metrics. The model, metrics, dataset, etc. can be specified
in individual YAML config files, or a training session can be indicated with ``--train-dir``. In order of priority,
the global settings (dtype, TensorFloat32, etc.) are taken from: (1) the model config (for a training session), (2)
the dataset config (for a deployed model), or (3) the defaults. Prints only the final result in ``name = num`` format
to stdout; all other information is ``logging.debug``ed to stderr. WARNING: Please note that results of CUDA models
are rarely exactly reproducible, and that even CPU models can be nondeterministic.
optional arguments:
-h, --help show this help message and exit
--train-dir TRAIN_DIR
Path to a working directory from a training session.
--model MODEL A deployed or pickled NequIP model to load. If omitted, defaults to `best_model.pth` in
`train_dir`.
--dataset-config DATASET_CONFIG
A YAML config file specifying the dataset to load test data from. If omitted, `config.yaml`
in `train_dir` will be used
--metrics-config METRICS_CONFIG
A YAML config file specifying the metrics to compute. If omitted, `config.yaml` in
`train_dir` will be used. If the config does not specify `metrics_components`, the default
is to logging.debug MAEs and RMSEs for all fields given in the loss function. If the
literal string `None`, no metrics will be computed.
--test-indexes TEST_INDEXES
Path to a file containing the indexes in the dataset that make up the test set. If omitted,
all data frames *not* used as training or validation data in the training session
`train_dir` will be used.
--batch-size BATCH_SIZE
Batch size to use. Larger is usually faster on GPU.
--device DEVICE Device to run the model on. If not provided, defaults to CUDA if available and CPU
otherwise.
--output OUTPUT XYZ file to write out the test set and model predicted forces, energies, etc. to.
--log LOG log file to store all the metrics and screen logging.debug
``nequip-deploy``
-----------------

.. code ::
usage: nequip-deploy [-h] {info,build} ...
Deploy and view information about previously deployed NequIP models.

optional arguments:
-h, --help show this help message and exit

commands:
{info,build}
info Get information from a deployed model file
build Build a deployment model

``nequip-deploy info``
~~~~~~~~~~~~~~~~~~~~~~

.. code ::
usage: nequip-deploy info [-h] model_path
positional arguments:
model_path Path to a deployed model file.

optional arguments:
-h, --help show this help message and exit


``nequip-deploy build``
~~~~~~~~~~~~~~~~~~~~~~~

.. code ::
usage: nequip-deploy build [-h] train_dir out_file
positional arguments:
train_dir Path to a working directory from a training session.
out_file Output file for deployed model.

optional arguments:
-h, --help show this help message and exit


``nequip-benchmark``
--------------------

.. code ::
usage: nequip-benchmark [-h] [--profile PROFILE] [--device DEVICE] [-n N] [--n-data N_DATA] [--timestep TIMESTEP]
config
Benchmark the approximate MD performance of a given model configuration / dataset pair.

positional arguments:
config configuration file

optional arguments:
-h, --help show this help message and exit
--profile PROFILE Profile instead of timing, creating and outputting a Chrome trace JSON to the given path.
--device DEVICE Device to run the model on. If not provided, defaults to CUDA if available and CPU
otherwise.
-n N Number of trials.
--n-data N_DATA Number of frames to use.
--timestep TIMESTEP MD timestep for ns/day estimation, in fs. Defaults to 1 fs.
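The ns/day figure follows directly from the timestep and the measured wall-clock time per model call; a sketch of the arithmetic (illustrative, not the tool's exact code):

```python
def ns_per_day(timestep_fs, seconds_per_step):
    # simulated time per wall-clock day: (steps/day) * (fs/step), converted to ns
    steps_per_day = 86400.0 / seconds_per_step
    return steps_per_day * timestep_fs * 1e-6  # 1 ns = 1e6 fs

# e.g. a model taking 86.4 ms per call with the default 1 fs timestep
rate = ns_per_day(1.0, 0.0864)
```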
13 changes: 2 additions & 11 deletions docs/guide/FAQ.rst → docs/errors/errors.rst
Original file line number Diff line number Diff line change
@@ -1,14 +1,5 @@
FAQ
===

How do I...
-----------

... continue to train a model that reached a stopping condition?
There will be an answer here.

1. Reload the model trained with version 0.3.3 to the code in 0.4.
check out the migration note at :ref:`migration_note`.
Errors
======

Common errors
-------------
14 changes: 14 additions & 0 deletions docs/faq/FAQ.rst
@@ -0,0 +1,14 @@
FAQ
===

How do I...
-----------

... continue to train a model that reached a stopping condition?
There will be an answer here.

1. Reload a model trained with version 0.3.3 into the 0.4 code:
check out the migration note at :ref:`migration_note`.

2. Specify my dataset for `nequip-train` and `nequip-evaluate`: see :ref:`dataset_note`.

9 changes: 0 additions & 9 deletions docs/guide/guide.rst

This file was deleted.

4 changes: 0 additions & 4 deletions docs/guide/intro.rst

This file was deleted.

9 changes: 0 additions & 9 deletions docs/guide/irreps.rst

This file was deleted.

File renamed without changes.
