diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index dea57ca1..1f835e90 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -15,8 +15,8 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7, 3.9]
-        torch-version: [1.10.0, 1.11.0]
+        python-version: [3.9]
+        torch-version: [1.11.0, 1.12.1]
 
     steps:
     - uses: actions/checkout@v2
@@ -44,4 +44,4 @@ jobs:
     - name: Test with pytest
       run: |
         # See https://github.com/pytest-dev/pytest/issues/1075
-        PYTHONHASHSEED=0 pytest -n auto --ignore=docs/ .
+        PYTHONHASHSEED=0 pytest -n auto tests/
diff --git a/.github/workflows/tests_develop.yml b/.github/workflows/tests_develop.yml
index bae5795e..2c23350c 100644
--- a/.github/workflows/tests_develop.yml
+++ b/.github/workflows/tests_develop.yml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         python-version: [3.9]
-        torch-version: [1.11.0]
+        torch-version: [1.12.1]
 
     steps:
     - uses: actions/checkout@v2
@@ -44,4 +44,4 @@ jobs:
     - name: Test with pytest
       run: |
         # See https://github.com/pytest-dev/pytest/issues/1075
-        PYTHONHASHSEED=0 pytest -n auto --ignore=docs/ .
+        PYTHONHASHSEED=0 pytest -n auto tests/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f13a642..cf50972d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,7 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 Most recent change on the bottom.
 
-## [Unreleased] - 0.5.6
+## [0.5.6] - 2022-12-19
+### Added
+- `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights
+- `nequip-benchmark --no-compile` and `--verbose` and `--memory-summary`
+- `nequip-benchmark --pdb` for debugging model (builder) errors
+- More information in `nequip-deploy info`
+
+### Changed
+- sklearn dependency removed
+- Minimum e3nn is now 0.4.4
+- `--equivariance-test` now prints much more information, especially when there is a failure
+
+### Fixed
+- Git utilities when installed as ZIPed `.egg` (#264)
 
 ## [0.5.5] - 2022-06-20
 ### Added
diff --git a/configs/full.yaml b/configs/full.yaml
index 1b8a3a2c..2f98164e 100644
--- a/configs/full.yaml
+++ b/configs/full.yaml
@@ -17,7 +17,20 @@ default_dtype: float32
 allow_tf32: false  # whether to use TensorFloat32 if it is available
 # device: cuda  # which device to use. Default: automatically detected cuda or "cpu"
 
-# network
+# == network ==
+
+# `model_builders` defines a series of functions that will be called to construct the model
+# each model builder has the opportunity to update the model, the config, or both
+# model builders from other packages are allowed (see mir-group/allegro for an example); those from `nequip.model` don't require a prefix
+# these are the default model builders:
+model_builders:
+  - SimpleIrrepsConfig  # update the config with all the irreps for the network if using the simplified `l_max` / `num_features` / `parity` syntax
+  - EnergyModel  # build a full NequIP model
+  - PerSpeciesRescale  # add per-atom / per-species scaling and shifting to the NequIP model before the total energy sum
+  - ForceOutput  # wrap the energy model in a module that uses autodifferentiation to compute the forces
+  - RescaleEnergyEtc  # wrap the entire model in the appropriate global rescaling of the energy, forces, etc.
+# ^ global rescaling blocks must always go last!
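+#
+# a hypothetical sketch of overriding the defaults (the package and builder
+# names below are invented for illustration): a downstream package can swap in
+# its own energy model while reusing the rest of the default pipeline
+# model_builders:
+#   - mypkg.model.MyEnergyModel   # external builders use fully qualified names
+#   - PerSpeciesRescale
+#   - ForceOutput
+#   - RescaleEnergyEtc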
+
 r_max: 4.0                                                                       # cutoff radius in length units, here Angstrom, this is an important hyperparameter to scan
 num_layers: 4                                                                    # number of interaction blocks, we find 3-5 to work best
@@ -198,6 +211,8 @@ loss_coeffs:
   total_energy:
     - 1
     - PerAtomMSELoss
+# note that the ratio between the force and energy loss matters for training. One may consider using 1:1 with PerAtomMSELoss. If the energy loss still significantly dominates the total loss in the initial epochs, lowering the energy loss weight helps training considerably.
+
 # # default loss function is MSELoss, the name has to be exactly the same as those in torch.nn.
 # the only supported targets are forces and total_energy
@@ -302,10 +317,10 @@ per_species_rescale_scales: dataset_forces_rms
 # If not provided, defaults to dataset_per_species_force_rms or dataset_per_atom_total_energy_std, depending on whether forces are being trained.
 # per_species_rescale_kwargs:
 #   total_energy:
-#     alpha: 0.1
+#     alpha: 0.001
 #     max_iteration: 20
 #     stride: 100
-# keywords for GP decomposition of per specie energy. Optional. Defaults to 0.1
+# keywords for ridge regression decomposition of per-species energy. Optional. Defaults to 0.001. The value should be in the range of 1e-3 to 1e-2
 # per_species_rescale_arguments_in_dataset_units: True
 # if explicit numbers are given for the shifts/scales, this parameter must specify whether the given numbers are unitless shifts/scales or are in the units of the dataset. If ``True``, any global rescalings will correctly be applied to the per-species values.
@@ -329,9 +344,10 @@ global_rescale_scale_trainable: false
 # global_rescale_shift_trainable: false
 # global_rescale_scale: dataset_forces_rms
 # global_rescale_scale_trainable: false
-# per_species_rescale_trainable: true
-# per_species_rescale_shifts: dataset_per_atom_total_energy_mean
-# per_species_rescale_scales: dataset_per_atom_total_energy_std
+# per_species_rescale_shifts_trainable: false
+# per_species_rescale_scales_trainable: true
+# per_species_rescale_shifts: dataset_per_species_total_energy_mean
+# per_species_rescale_scales: dataset_per_species_forces_rms
 
 # # full block needed for global rescale
 # global_rescale_shift: dataset_total_energy_mean
diff --git a/configs/minimal_toy_emt.yaml b/configs/minimal_toy_emt.yaml
index c9c904d1..38b7f95d 100644
--- a/configs/minimal_toy_emt.yaml
+++ b/configs/minimal_toy_emt.yaml
@@ -6,15 +6,18 @@ dataset_seed: 456
 
 # network
 model_builders:
+  - SimpleIrrepsConfig
   - EnergyModel
   - PerSpeciesRescale
   - StressForceOutput
   - RescaleEnergyEtc
+
 num_basis: 8
 r_max: 4.0
-irreps_edge_sh: 0e + 1o
-conv_to_output_hidden_irreps_out: 16x0e
-feature_irreps_hidden: 16x0o + 16x0e + 16x1o + 16x1e
+l_max: 1
+parity: true
+num_features: 16
+num_layers: 4
 
 # data set
 dataset: EMTTest                                                                       # type of data set, can be npz or ase
@@ -23,10 +26,6 @@ dataset_num_frames: 100
 chemical_symbols:
   - Cu
 
-global_rescale_scale: dataset_total_energy_std
-per_species_rescale_shifts: dataset_per_atom_total_energy_mean
-per_species_rescale_scales: dataset_per_atom_total_energy_std
-
 # logging
 wandb: false
 # verbose: debug
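Editor's note on the hunk above: per the ``SimpleIrrepsConfig`` comment in ``configs/full.yaml``, the simplified keys are expanded into the explicit irreps options; the sketch below pairs the new simplified syntax with the explicit values that the removed lines specified.

.. code:: yaml

  # simplified syntax (the new lines above)
  l_max: 1
  parity: true
  num_features: 16
  # equivalent explicit irreps options (the removed lines above)
  feature_irreps_hidden: 16x0o + 16x0e + 16x1o + 16x1e
  irreps_edge_sh: 0e + 1o
  conv_to_output_hidden_irreps_out: 16x0e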
diff --git a/docs/api/nequip.rst b/docs/api/nequip.rst
index 13bc37ca..6f6250cf 100644
--- a/docs/api/nequip.rst
+++ b/docs/api/nequip.rst
@@ -3,4 +3,5 @@ Python API
 
  .. toctree::
 
-    data
\ No newline at end of file
+    data
+    trainer
diff --git a/docs/api/trainer.rst b/docs/api/trainer.rst
new file mode 100644
index 00000000..983e6f6b
--- /dev/null
+++ b/docs/api/trainer.rst
@@ -0,0 +1,10 @@
+nequip.trainer
+==============
+
+ .. automodule:: nequip.train.trainer
+    :members:
+    :imported-members:
+
+ .. automodule:: nequip.train.trainer_wandb
+    :members:
+    :imported-members:
diff --git a/docs/cite.rst b/docs/cite.rst
new file mode 100644
index 00000000..9f8296cc
--- /dev/null
+++ b/docs/cite.rst
@@ -0,0 +1,3 @@
+Citing NequIP
+=============
+
diff --git a/docs/commandline/commands.rst b/docs/commandline/commands.rst
new file mode 100644
index 00000000..b58c87ab
--- /dev/null
+++ b/docs/commandline/commands.rst
@@ -0,0 +1,132 @@
+Command-line Executables
+========================
+
+``nequip-train``
+----------------
+
+ .. code ::
+
+    usage: nequip-train [-h] [--equivariance-test] [--model-debug-mode] [--grad-anomaly-mode] [--log LOG] config
+
+Train (or restart training of) a NequIP model.
+
+positional arguments:
+  config                YAML file configuring the model, dataset, and other options
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --equivariance-test   test the model's equivariance before training
+  --model-debug-mode    enable model debug mode, which can sometimes give much more useful error messages at the
+                        cost of some speed. Do not use for production training!
+  --grad-anomaly-mode   enable PyTorch autograd anomaly mode to debug NaN gradients. Do not use for production
+                        training!
+  --log LOG             log file to store all the screen logging
+
+``nequip-evaluate``
+-------------------
+
+ .. code ::
+
+    usage: nequip-evaluate [-h] [--train-dir TRAIN_DIR] [--model MODEL] [--dataset-config DATASET_CONFIG]
+                           [--metrics-config METRICS_CONFIG] [--test-indexes TEST_INDEXES] [--batch-size BATCH_SIZE]
+                           [--device DEVICE] [--output OUTPUT] [--log LOG]
+
+Compute the error of a model on a test set using various metrics. The model, metrics, dataset, etc. can be specified
+in individual YAML config files, or a training session can be indicated with ``--train-dir``. In order of priority,
+the global settings (dtype, TensorFloat32, etc.) are taken from: (1) the model config (for a training session), (2)
+the dataset config (for a deployed model), or (3) the defaults. Prints only the final result in ``name = num`` format
+to stdout; all other information is ``logging.debug``ed to stderr. WARNING: Please note that results of CUDA models
+are rarely exactly reproducible, and that even CPU models can be nondeterministic.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --train-dir TRAIN_DIR
+                        Path to a working directory from a training session.
+  --model MODEL         A deployed or pickled NequIP model to load. If omitted, defaults to `best_model.pth` in
+                        `train_dir`.
+  --dataset-config DATASET_CONFIG
+                        A YAML config file specifying the dataset to load test data from. If omitted, `config.yaml`
+                        in `train_dir` will be used.
+  --metrics-config METRICS_CONFIG
+                        A YAML config file specifying the metrics to compute. If omitted, `config.yaml` in
+                        `train_dir` will be used. If the config does not specify `metrics_components`, the default
+                        is to logging.debug MAEs and RMSEs for all fields given in the loss function. If the
+                        literal string `None`, no metrics will be computed.
+  --test-indexes TEST_INDEXES
+                        Path to a file containing the indexes in the dataset that make up the test set. If omitted,
+                        all data frames *not* used as training or validation data in the training session
+                        `train_dir` will be used.
+  --batch-size BATCH_SIZE
+                        Batch size to use. Larger is usually faster on GPU.
+  --device DEVICE       Device to run the model on. If not provided, defaults to CUDA if available and CPU
+                        otherwise.
+  --output OUTPUT       XYZ file to write out the test set and model predicted forces, energies, etc. to.
+  --log LOG             log file to store all the metrics and screen logging.debug
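+
+As an illustrative sketch only (treat the exact ``metrics_components`` syntax
+as an assumption borrowed from the project's example configs), a minimal
+metrics config passed via ``--metrics-config`` might look like:
+
+ .. code :: yaml
+
+    metrics_components:
+      - - forces        # field to evaluate
+        - mae           # metric
+      - - total_energy
+        - rmse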
+
+``nequip-deploy``
+-----------------
+
+ .. code ::
+
+    usage: nequip-deploy [-h] {info,build} ...
+
+Deploy and view information about previously deployed NequIP models.
+
+optional arguments:
+  -h, --help    show this help message and exit
+
+commands:
+  {info,build}
+    info        Get information from a deployed model file
+    build       Build a deployment model
+
+``nequip-deploy info``
+~~~~~~~~~~~~~~~~~~~~~~
+
+ .. code ::
+
+    usage: nequip-deploy info [-h] model_path
+
+positional arguments:
+  model_path  Path to a deployed model file.
+
+optional arguments:
+  -h, --help  show this help message and exit
+
+
+``nequip-deploy build``
+~~~~~~~~~~~~~~~~~~~~~~~
+
+ .. code ::
+
+    usage: nequip-deploy build [-h] train_dir out_file
+
+positional arguments:
+  train_dir   Path to a working directory from a training session.
+  out_file    Output file for deployed model.
+
+optional arguments:
+  -h, --help  show this help message and exit
+
+
+``nequip-benchmark``
+--------------------
+
+ .. code ::
+
+    usage: nequip-benchmark [-h] [--profile PROFILE] [--device DEVICE] [-n N] [--n-data N_DATA] [--timestep TIMESTEP]
+                            config
+
+Benchmark the approximate MD performance of a given model configuration / dataset pair.
+
+positional arguments:
+  config               configuration file
+
+optional arguments:
+  -h, --help           show this help message and exit
+  --profile PROFILE    Profile instead of timing, creating and outputting a Chrome trace JSON to the given path.
+  --device DEVICE      Device to run the model on. If not provided, defaults to CUDA if available and CPU
+                       otherwise.
+  -n N                 Number of trials.
+  --n-data N_DATA      Number of frames to use.
+  --timestep TIMESTEP  MD timestep for ns/day estimation, in fs. Defaults to 1 fs.
diff --git a/docs/guide/FAQ.rst b/docs/errors/errors.rst
similarity index 56%
rename from docs/guide/FAQ.rst
rename to docs/errors/errors.rst
index 92ac758e..576e553d 100644
--- a/docs/guide/FAQ.rst
+++ b/docs/errors/errors.rst
@@ -1,14 +1,5 @@
-FAQ
-===
-
-How do I...
------------
-
-... continue to train a model that reached a stopping condition?
-    There will be an answer here.
-
-1. Reload the model trained with version 0.3.3 to the code in 0.4.
-    check out the migration note at :ref:`migration_note`.
+Errors
+======
 
 Common errors
 -------------
diff --git a/docs/faq/FAQ.rst b/docs/faq/FAQ.rst
new file mode 100644
index 00000000..411e77c1
--- /dev/null
+++ b/docs/faq/FAQ.rst
@@ -0,0 +1,14 @@
+FAQ
+===
+
+How do I...
+-----------
+
+... continue to train a model that reached a stopping condition?
+    There will be an answer here.
+
+1. Reload the model trained with version 0.3.3 to the code in 0.4.
+    check out the migration note at :ref:`migration_note`.
+
+2. Specify my dataset for `nequip-train` and `nequip-evaluate`: see :ref:`dataset_note`.
+
diff --git a/docs/guide/guide.rst b/docs/guide/guide.rst
deleted file mode 100644
index 6def3859..00000000
--- a/docs/guide/guide.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-NequIP User Guide
-=================
-
- .. toctree::
-
-    intro
-    irreps
-    conventions
-    FAQ
\ No newline at end of file
diff --git a/docs/guide/intro.rst b/docs/guide/intro.rst
deleted file mode 100644
index 7afa4132..00000000
--- a/docs/guide/intro.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Tutorial: Introduction to NequIP
-================================
-
-TODO
\ No newline at end of file
diff --git a/docs/guide/irreps.rst b/docs/guide/irreps.rst
deleted file mode 100644
index 5f9b2735..00000000
--- a/docs/guide/irreps.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Irreps
-======
-
-.. _Irreps:
-
-Syntax to specify irreps
-------------------------
-
-TODO: descripe irreps syntax here
\ No newline at end of file
diff --git a/docs/guide/conventions.rst b/docs/howto/conventions.rst
similarity index 100%
rename from docs/guide/conventions.rst
rename to docs/howto/conventions.rst
diff --git a/docs/howto/dataset.rst b/docs/howto/dataset.rst
new file mode 100644
index 00000000..2b5267e7
--- /dev/null
+++ b/docs/howto/dataset.rst
@@ -0,0 +1,156 @@
+.. _dataset_note:
+
+How to prepare a training dataset
+=================================
+
+What NequIP does behind the scenes
+----------------------------------
+
+NequIP uses the ``AtomicDataset`` class to store atomic configurations.
+When an ``AtomicDataset`` object is initialized,
+NequIP reads the atomic structures from the dataset and
+computes the neighbor list and the other data structures needed by the GNN,
+converting the raw data into a list of ``AtomicData`` objects.
+
+The computed results are then cached on disk in the ``root/processed_hashkey`` folder.
+The hash is based on all the metadata provided for the dataset,
+including the file name, the cutoff radius, the floating-point precision, etc.
+When multiple training/evaluation runs use the same dataset,
+the neighbor list is only computed in the first NequIP run;
+later runs load the ``AtomicDataset`` object directly from the cache file to save computation time.
+
+Note: be careful with the cached file. If you update your raw data file but keep using the same filename,
+NequIP will not automatically update the cached data.
+
+Key concepts
+------------
+
+fixed_fields
+~~~~~~~~~~~~
+Fixed fields are quantities that are shared among all the configurations in the dataset.
+For example, if the dataset is a trajectory from an NVT MD simulation, the supercell and the atomic species
+are constant matrices/vectors throughout the whole dataset.
+In this case, instead of repeating the same values many times,
+we specify the cell and species as fixed fields and provide them only once.
+
+yaml interface
+~~~~~~~~~~~~~~
+``nequip-train`` and ``nequip-evaluate`` automatically construct the ``AtomicDataset`` based on the yaml arguments.
+Later sections offer a couple of different examples.
+
+If the training and validation datasets come from different raw files, the arguments for each set
+can be defined with the ``dataset`` prefix and the ``validation_dataset`` prefix, respectively.
+
+For example, ``dataset_file_name`` is used for the training data and ``validation_dataset_file_name``
+for the validation data, as in the sketch below.
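+
+A minimal sketch of this convention (the file names here are placeholders):
+
+.. code:: yaml
+
+  dataset: ase                                   # training set
+  dataset_file_name: train.xyz
+  validation_dataset: ase                        # validation set, same options with the prefix
+  validation_dataset_file_name: validation.xyz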
+
+Python interface
+~~~~~~~~~~~~~~~~
+See ``nequip.data.dataset.AtomicInMemoryDataset``.
+
+Prepare the dataset and specify it in the yaml config
+-----------------------------------------------------
+
+ASE format
+~~~~~~~~~~
+
+NequIP accepts any format that can be parsed by the `ase.io.read` function.
+We recommend `extxyz`.
+
+Example: given atomic data stored in ``H2.extxyz`` that looks like this:
+
+.. code:: extxyz
+
+  2
+  Properties=species:S:1:pos:R:3 energy=-10 user_label=2.0 pbc="F F F"
+  H       0.00000000       0.00000000       0.00000000
+  H       0.00000000       0.00000000       1.02000000
+
+the yaml input should be
+
+.. code:: yaml
+
+  dataset: ase
+  dataset_file_name: H2.extxyz
+  ase_args:
+    format: extxyz
+  include_keys:
+    - user_label
+  key_mapping:
+    user_label: label0
+  chemical_symbol_to_type:
+    H: 0
+
+For formats other than `extxyz`, be careful with the ASE parsers; they may behave differently from the extxyz parser.
+For example, the ASE VASP parser stores the potential energy under `free_energy` instead of `energy`.
+Because our code is optimized for the `extxyz` parser, NequIP cannot load the `total_energy` labels from such files on its own;
+additional keys are needed to tell NequIP about the situation.
+The workaround is to use key mapping (see also the note below). Here's an example for a VASP OUTCAR:
+
+.. code:: yaml
+
+  dataset: ase
+  dataset_file_name: OUTCAR
+  ase_args:
+    format: vasp-out
+  key_mapping:
+    free_energy: total_energy
+  chemical_symbol_to_type:
+    H: 0
+
+NPZ format
+~~~~~~~~~~
+
+If your dataset consists of configurations that always have the same number of atoms, the npz data format can be an option.
+
+In the npz file, the first dimension of every array should equal the number of configurations.
+For example, the force array for 36 atomic configurations of an N-atom system should have the shape (36, N, 3),
+and the total_energy array should have the shape (36,).
+
+Below is an example of the yaml specification.
+
+.. code:: yaml
+
+  dataset: npz
+  dataset_file_name: example.npz
+  include_keys:
+    - user_label1
+    - user_label2
+  npz_fixed_field_keys:
+    - cell
+    - atomic_numbers
+  key_mapping:
+    position: pos
+    force: forces
+    energy: total_energy
+    Z: atomic_numbers
+
+
+Note on key mapping
+~~~~~~~~~~~~~~~~~~~
+
+NequIP has default key names for energy, force, cell, etc. (defined in nequip.data._keys).
+Unlike the ASE format, where this information is parsed automatically,
+in the npz data format the correct key names have to be provided.
+Common key names are: `total_energy`, `forces`, `atomic_numbers`, `pos`, `cell`, `pbc`.
+The `key_mapping` converts user-defined names (keys) to NequIP default names (values).
+
+
+Advanced options
+----------------
+
+skip frames during data processing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The `include_frames` argument can be specified in the yaml to skip certain frames in the raw data file;
+a short sketch is given at the end of this page.
+It has to be a list or another python iterable.
+
+register user-defined graph, node, and edge fields
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Graph, node, and edge fields are quantities that belong to
+the whole graph, to each atom, and to each edge, respectively.
+Example graph fields include cell, pbc, and total_energy;
+example node fields include pos and forces.
+
+To help NequIP properly assemble batched data, any graph quantity other than
+cell, pbc, and total_energy should be registered.
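+
+A short sketch of frame skipping, as promised above (the file name and the
+frame indices are placeholders):
+
+.. code:: yaml
+
+  dataset: ase
+  dataset_file_name: trajectory.xyz
+  include_frames: [0, 2, 4, 6, 8]   # use only these frames from the raw file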
diff --git a/docs/howto/howto.rst b/docs/howto/howto.rst
new file mode 100644
index 00000000..07e84e84
--- /dev/null
+++ b/docs/howto/howto.rst
@@ -0,0 +1,7 @@
+How-to Tutorials
+================
+
+ .. toctree::
+
+    dataset
+    migrate
diff --git a/docs/guide/migrate.rst b/docs/howto/migrate.rst
similarity index 100%
rename from docs/guide/migrate.rst
rename to docs/howto/migrate.rst
diff --git a/docs/index.rst b/docs/index.rst
index dc6ecd43..d2edd1a6 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -9,12 +9,20 @@ NequIP
 NequIP is an open-source package for creating, training, and using E(3)-equivariant machine learning interatomic potentials.
 
 .. toctree::
-   :maxdepth: 3
+   :maxdepth: 2
    :caption: Contents:
 
-   guide/guide
+   introduction/intro
+   cite
+   installation/install
+   yaml/yaml
+   howto/howto
+   faq/FAQ
+   commandline/commands
+   lammps/all
    options/options
    api/nequip
+   errors/errors
diff --git a/docs/installation/install.rst b/docs/installation/install.rst
new file mode 100644
index 00000000..3e946815
--- /dev/null
+++ b/docs/installation/install.rst
@@ -0,0 +1,39 @@
+Installation
+============
+
+NequIP requires:
+
+ * Python >= 3.6
+ * PyTorch >= 1.8, <=1.12.*. PyTorch can be installed following the `instructions from their documentation <https://pytorch.org/get-started/locally/>`_. Note that neither ``torchvision`` nor ``torchaudio``, included in the default install command, are needed for NequIP.
+
+To install:
+
+ * We use `Weights&Biases <https://wandb.ai>`_ to keep track of experiments. This is not a strict requirement — you can use our package without it — but it may make your life easier. If you want to use it, create an account `here <https://wandb.ai>`_ and install the Python package::
+
+    pip install wandb
+
+ * Install the latest stable NequIP::
+
+    pip install https://github.com/mir-group/nequip/archive/main.zip
+
+To install previous versions of NequIP, please clone the repository from GitHub and check out the appropriate tag (for example ``v0.3.3`` for version 0.3.3).
+
+To install the current **unstable** development version of NequIP, please clone our repository and check out the ``develop`` branch.
+
+Installation Issues
+-------------------
+
+The easiest way to check if your installation is working is to train a *toy* model::
+
+    nequip-train configs/minimal.yaml
+
+If you suspect something is wrong, encounter errors, or just want to confirm that everything is in working order, you can also run the unit tests::
+
+    pip install pytest
+    pytest tests/unit/
+
+To run the full tests, including a set of longer/more intensive integration tests, run::
+
+    pytest tests/
+
+If a GPU is present, the unit tests will use it.
\ No newline at end of file
diff --git a/docs/introduction/intro.rst b/docs/introduction/intro.rst
new file mode 100644
index 00000000..e0dcc32c
--- /dev/null
+++ b/docs/introduction/intro.rst
@@ -0,0 +1,4 @@
+Overview
+========
+
+TODO
diff --git a/docs/lammps/all.rst b/docs/lammps/all.rst
new file mode 100644
index 00000000..9faac07e
--- /dev/null
+++ b/docs/lammps/all.rst
@@ -0,0 +1,7 @@
+Integration with LAMMPS and ASE
+===============================
+
+ .. toctree::
+
+    lammps
+    ase
diff --git a/docs/lammps/ase.rst b/docs/lammps/ase.rst
new file mode 100644
index 00000000..3729cde3
--- /dev/null
+++ b/docs/lammps/ase.rst
@@ -0,0 +1,2 @@
+ASE
+===
diff --git a/docs/lammps/lammps.rst b/docs/lammps/lammps.rst
new file mode 100644
index 00000000..f9d0ba9f
--- /dev/null
+++ b/docs/lammps/lammps.rst
@@ -0,0 +1,2 @@
+LAMMPS
+======
diff --git a/docs/options/dataset.rst b/docs/options/dataset.rst
index 54b39fc9..f3ca194c 100644
--- a/docs/options/dataset.rst
+++ b/docs/options/dataset.rst
@@ -33,7 +33,7 @@ key_mapping
     | Type: dict
     | Default: ``{'positions': 'pos', 'energy': 'total_energy', 'force': 'forces', 'forces': 'forces', 'Z': 'atomic_numbers', 'atomic_number': 'atomic_numbers'}``
 
-npz_keys
-^^^^^^^^
+include_keys
+^^^^^^^^^^^^
     | Type: list
     | Default: ``[]``
@@ -68,5 +68,11 @@ include_frames
     | Type: NoneType
     | Default: ``None``
 
+ase_args
+^^^^^^^^
+    | Type: dict
+    | Default: ``{}``
+
 Advanced
----------
\ No newline at end of file
+--------
+See the tutorial at :ref:`dataset_note`.
diff --git a/docs/yaml/yaml.rst b/docs/yaml/yaml.rst
new file mode 100644
index 00000000..fd804436
--- /dev/null
+++ b/docs/yaml/yaml.rst
@@ -0,0 +1,4 @@
+YAML input
+==========
+
+TODO
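+
+As a stopgap while this page is a TODO, a minimal config sketch
+(editor's illustration; the values are arbitrary and only a few of the many
+available options are shown, see ``configs/full.yaml`` for the full set):
+
+.. code:: yaml
+
+  root: results/toy
+  run_name: example-run
+  seed: 123
+  r_max: 4.0
+  num_layers: 4
+  dataset: ase
+  dataset_file_name: my_data.xyz
+  chemical_symbols:
+    - H
+  wandb: false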
diff --git a/nequip/_version.py b/nequip/_version.py
index 91faf40b..b02164d2 100644
--- a/nequip/_version.py
+++ b/nequip/_version.py
@@ -2,4 +2,4 @@
 # See Python packaging guide
 # https://packaging.python.org/guides/single-sourcing-package-version/
 
-__version__ = "0.5.5"
+__version__ = "0.5.6"
diff --git a/nequip/data/AtomicData.py b/nequip/data/AtomicData.py
index 3f2e348b..728c260b 100644
--- a/nequip/data/AtomicData.py
+++ b/nequip/data/AtomicData.py
@@ -48,6 +48,7 @@
     AtomicDataDict.EDGE_LENGTH_KEY,
     AtomicDataDict.EDGE_ATTRS_KEY,
     AtomicDataDict.EDGE_EMBEDDING_KEY,
+    AtomicDataDict.EDGE_FEATURES_KEY,
 }
 _DEFAULT_GRAPH_FIELDS: Set[str] = {
     AtomicDataDict.TOTAL_ENERGY_KEY,
@@ -773,7 +774,7 @@ def neighbor_list_and_relative_vec(
         keep_edge = ~bad_edge
         if not np.any(keep_edge):
             raise ValueError(
-                "After eliminating self edges, no edges remain in this system."
+                f"Every single atom has no neighbors within the cutoff r_max={r_max} (after eliminating self edges, no edges remain in this system)"
             )
         first_idex = first_idex[keep_edge]
         second_idex = second_idex[keep_edge]
diff --git a/nequip/data/_keys.py b/nequip/data/_keys.py
index c0535edd..54b66ce3 100644
--- a/nequip/data/_keys.py
+++ b/nequip/data/_keys.py
@@ -44,6 +44,7 @@
 EDGE_ATTRS_KEY: Final[str] = "edge_attrs"
 # [n_edge, dim] invariant embedding of the edges
 EDGE_EMBEDDING_KEY: Final[str] = "edge_embedding"
+EDGE_FEATURES_KEY: Final[str] = "edge_features"
 
 NODE_FEATURES_KEY: Final[str] = "node_features"
 NODE_ATTRS_KEY: Final[str] = "node_attrs"
diff --git a/nequip/data/dataset.py b/nequip/data/dataset.py
index 847b3795..c38b8eae 100644
--- a/nequip/data/dataset.py
+++ b/nequip/data/dataset.py
@@ -295,7 +295,13 @@ def process(self):
             # type conversion
             _process_dict(fixed_fields, ignore_fields=["r_max"])
 
-            logging.info(f"Loaded data: {data}")
+            total_MBs = sum(item.numel() * item.element_size() for _, item in data) / (
+                1024 * 1024
+            )
+            logging.info(
+                f"Loaded data: {data}\n    processed data size: ~{total_MBs:.2f} MB"
+            )
+            del total_MBs
 
         # use atomic writes to avoid race conditions between
         # different trainings that use the same dataset
@@ -635,7 +641,7 @@ class NpzDataset(AtomicInMemoryDataset):
     """Load data from an npz file.
 
     To avoid loading unneeded data, keys are ignored by default unless they are in ``key_mapping``, ``include_keys``,
-    ``npz_fixed_fields`` or ``extra_fixed_fields``.
+    ``npz_fixed_field_keys`` or ``extra_fixed_fields``.
 
     Args:
         key_mapping (Dict[str, str]): mapping of npz keys to ``AtomicData`` keys. Optional
diff --git a/nequip/data/transforms.py b/nequip/data/transforms.py
index f2c7ec32..4f6331b7 100644
--- a/nequip/data/transforms.py
+++ b/nequip/data/transforms.py
@@ -121,11 +121,13 @@ def transform(self, atomic_numbers):
                 f"Data included atomic numbers {bad_set} that are not part of the atomic number -> type mapping!"
             )
 
-        return self._Z_to_index[atomic_numbers - self._min_Z]
+        return self._Z_to_index.to(device=atomic_numbers.device)[
+            atomic_numbers - self._min_Z
+        ]
 
     def untransform(self, atom_types):
         """Transform atom types back into atomic numbers"""
-        return self._index_to_Z[atom_types]
+        return self._index_to_Z[atom_types].to(device=atom_types.device)
 
     @property
     def has_chemical_symbols(self) -> bool:
diff --git a/nequip/model/_build.py b/nequip/model/_build.py
index 0fe4e21d..7e1a63fd 100644
--- a/nequip/model/_build.py
+++ b/nequip/model/_build.py
@@ -8,7 +8,10 @@
 
 def model_from_config(
-    config, initialize: bool = False, dataset: Optional[AtomicDataset] = None
+    config,
+    initialize: bool = False,
+    dataset: Optional[AtomicDataset] = None,
+    deploy: bool = False,
 ) -> GraphModuleMixin:
     """Build a model based on `config`.
 
@@ -17,11 +20,13 @@
     - ``model``: the model produced by the previous builder. Cannot be requested by the first builder, must be requested by subsequent ones.
     - ``initialize``: whether to initialize the model
     - ``dataset``: if ``initialize`` is True, the dataset
+    - ``deploy``: whether the model object is for deployment / inference
 
     Args:
         config
-        initialize (bool): if True (default False), ``model_initializers`` will also be run.
+        initialize (bool): whether ``model_builders`` should be instructed to initialize the model
         dataset: dataset for initializers if ``initialize`` is True.
+        deploy (bool): whether ``model_builders`` should be told the model is for deployment / inference
 
     Returns:
         The built model.
@@ -61,6 +66,8 @@
             params = {}
             if "initialize" in pnames:
                 params["initialize"] = initialize
+            if "deploy" in pnames:
+                params["deploy"] = deploy
             if "config" in pnames:
                 params["config"] = config
             if "dataset" in pnames:
diff --git a/nequip/model/_scaling.py b/nequip/model/_scaling.py
index f5554d25..8a7ffa46 100644
--- a/nequip/model/_scaling.py
+++ b/nequip/model/_scaling.py
@@ -12,7 +12,10 @@
 
 def RescaleEnergyEtc(
-    model: GraphModuleMixin, config, dataset: AtomicDataset, initialize: bool
+    model: GraphModuleMixin,
+    config,
+    initialize: bool,
+    dataset: Optional[AtomicDataset] = None,
 ):
     return GlobalRescale(
         model=model,
@@ -34,7 +37,6 @@ def GlobalRescale(
 def GlobalRescale(
     model: GraphModuleMixin,
     config,
-    dataset: AtomicDataset,
     initialize: bool,
     module_prefix: str,
     default_scale: Union[str, float, list],
@@ -43,6 +45,7 @@ def GlobalRescale(
     default_shift_keys: list,
     default_related_scale_keys: list,
     default_related_shift_keys: list,
+    dataset: Optional[AtomicDataset] = None,
 ):
     """Add global rescaling for energy(-based quantities).
@@ -75,11 +78,12 @@ def GlobalRescale( raise ValueError(f"Invalid global scale `{value}`") # = Compute shifts and scales = - computed_stats = _compute_stats( - str_names=str_names, - dataset=dataset, - stride=config.dataset_statistics_stride, - ) + if len(str_names) > 0: + computed_stats = _compute_stats( + str_names=str_names, + dataset=dataset, + stride=config.dataset_statistics_stride, + ) if isinstance(global_scale, str): s = global_scale @@ -129,8 +133,8 @@ def GlobalRescale( def PerSpeciesRescale( model: GraphModuleMixin, config, - dataset: AtomicDataset, initialize: bool, + dataset: Optional[AtomicDataset] = None, ): """Add global rescaling for energy(-based quantities). @@ -199,12 +203,13 @@ def PerSpeciesRescale( ], "Requested to set either the shifts or scales of the per_species_rescale using dataset values, but chose to provide the other in non-dataset units. Please give the explictly specified shifts/scales in dataset units and set per_species_rescale_arguments_in_dataset_units" # = Compute shifts and scales = - computed_stats = _compute_stats( - str_names=str_names, - dataset=dataset, - stride=config.dataset_statistics_stride, - kwargs=config.get(module_prefix + "_kwargs", {}), - ) + if len(str_names) > 0: + computed_stats = _compute_stats( + str_names=str_names, + dataset=dataset, + stride=config.dataset_statistics_stride, + kwargs=config.get(module_prefix + "_kwargs", {}), + ) if isinstance(scales, str): s = scales diff --git a/nequip/nn/_grad_output.py b/nequip/nn/_grad_output.py index ffc13140..673f8ff0 100644 --- a/nequip/nn/_grad_output.py +++ b/nequip/nn/_grad_output.py @@ -315,10 +315,10 @@ def forward(self, data: AtomicDataDict.Type) -> AtomicDataDict.Type: torch.cross(cell[:, 1, :], cell[:, 2, :], dim=1), ).unsqueeze(-1) stress = virial / volume.view(-1, 1, 1) - data[AtomicDataDict.STRESS_KEY] = stress data[AtomicDataDict.CELL_KEY] = orig_cell else: stress = self._empty # torchscript + data[AtomicDataDict.STRESS_KEY] = stress # see discussion in https://github.com/libAtoms/QUIP/issues/227 about sign convention # they say the standard convention is virial = -stress x volume diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index 579e60ea..1deb0de2 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -3,6 +3,10 @@ import tempfile import itertools import time +import logging +import sys +import pdb +import traceback import torch from torch.utils.benchmark import Timer, Measurement @@ -11,9 +15,10 @@ from e3nn.util.jit import script from nequip.utils import Config +from nequip.utils.test import assert_AtomicData_equivariant from nequip.data import AtomicData, AtomicDataDict, dataset_from_config from nequip.model import model_from_config -from nequip.scripts.deploy import _compile_for_deploy +from nequip.scripts.deploy import _compile_for_deploy, load_deployed_model from nequip.scripts.train import default_config, check_code_version from nequip.utils._global_options import _set_global_options @@ -25,12 +30,23 @@ def main(args=None): ) ) parser.add_argument("config", help="configuration file") + parser.add_argument( + "--model", + help="A deployed model to load instead of building a new one from `config`. 
", + type=str, + default=None, + ) parser.add_argument( "--profile", help="Profile instead of timing, creating and outputing a Chrome trace JSON to the given path.", type=str, default=None, ) + parser.add_argument( + "--equivariance-test", + help="test the model's equivariance on `--n-data` frames.", + action="store_true", + ) parser.add_argument( "--device", help="Device to run the model on. If not provided, defaults to CUDA if available and CPU otherwise.", @@ -55,11 +71,33 @@ def main(args=None): type=float, default=1, ) - - # TODO: option to show memory use + parser.add_argument( + "--no-compile", + help="Don't compile the model to TorchScript", + action="store_true", + ) + parser.add_argument( + "--memory-summary", + help="Print torch.cuda.memory_summary() after running the model", + action="store_true", + ) + parser.add_argument( + "--verbose", help="Logging verbosity level", type=str, default="error" + ) + parser.add_argument( + "--pdb", + help="Run model builders and model under debugger to easily drop to debugger to investigate errors.", + action="store_true", + ) # Parse the args args = parser.parse_args(args=args) + if args.pdb: + assert args.profile is None + + root_logger = logging.getLogger() + root_logger.setLevel(getattr(logging, args.verbose.upper())) + root_logger.handlers = [logging.StreamHandler(sys.stderr)] if args.device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -80,12 +118,12 @@ def main(args=None): print(f" loading dataset took {dataset_time:.4f}s") dataset_rng = torch.Generator() dataset_rng.manual_seed(config.get("dataset_seed", config.get("seed", 12345))) - datas = [ + datas_list = [ AtomicData.to_AtomicDataDict(dataset[i].to(device)) for i in torch.randperm(len(dataset), generator=dataset_rng)[: args.n_data] ] - n_atom: int = len(datas[0]["pos"]) - if not all(len(d["pos"]) == n_atom for d in datas): + n_atom: int = len(datas_list[0]["pos"]) + if not all(len(d["pos"]) == n_atom for d in datas_list): raise NotImplementedError( "nequip-benchmark does not currently handle benchmarking on data frames with variable number of atoms" ) @@ -97,7 +135,7 @@ def main(args=None): print(f" number of atoms: {n_atom}") print(f" number of types: {dataset.type_mapper.num_types}") print( - f" avg. num edges: {sum(d[AtomicDataDict.EDGE_INDEX_KEY].shape[1] for d in datas) / len(datas)}" + f" avg. num edges: {sum(d[AtomicDataDict.EDGE_INDEX_KEY].shape[1] for d in datas_list) / len(datas_list)}" ) avg_edges_per_atom = torch.mean( torch.cat( @@ -106,14 +144,14 @@ def main(args=None): d[AtomicDataDict.EDGE_INDEX_KEY][0], minlength=d[AtomicDataDict.POSITIONS_KEY].shape[0], ).float() - for d in datas + for d in datas_list ] ) ).item() print(f" avg. neigh/atom: {avg_edges_per_atom}") # cycle over the datas we loaded - datas = itertools.cycle(datas) + datas = itertools.cycle(datas_list) # short circut if args.n == 0: @@ -121,29 +159,64 @@ def main(args=None): return # Load model: - print("Building model... ") - model_time = time.time() - model = model_from_config(config, initialize=True, dataset=dataset) - model_time = time.time() - model_time - print(f" building model took {model_time:.4f}s") - print("Compile...") - # "Deploy" it + if args.model is None: + print("Building model... 
") + model_time = time.time() + try: + model = model_from_config( + config, initialize=True, dataset=dataset, deploy=True + ) + except: # noqa: E722 + if args.pdb: + traceback.print_exc() + pdb.post_mortem() + else: + raise + model_time = time.time() - model_time + print(f" building model took {model_time:.4f}s") + else: + print("Loading model...") + model, metadata = load_deployed_model(args.model, device=device, freeze=False) + print(" deployed model has metadata:") + print( + "\n".join( + " %s: %s" % e for e in metadata.items() if e[0] != "config" + ) + ) + print(f" model has {sum(p.numel() for p in model.parameters())} weights") + print( + f" model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable weights" + ) + print( + f" model weights and buffers take {sum(p.numel() * p.element_size() for p in itertools.chain(model.parameters(), model.buffers())) / (1024 * 1024):.2f} MB" + ) + model.eval() - compile_time = time.time() - model = script(model) - model = _compile_for_deploy(model) - compile_time = time.time() - compile_time - print(f" compilation took {compile_time:.4f}s") - - # save and reload to avoid bugs - with tempfile.NamedTemporaryFile() as f: - torch.jit.save(model, f.name) - model = torch.jit.load(f.name, map_location=device) - # freeze like in the LAMMPS plugin - model = torch.jit.freeze(model) - # and reload again just to avoid bugs - torch.jit.save(model, f.name) - model = torch.jit.load(f.name, map_location=device) + if args.equivariance_test: + args.no_compile = True + if args.model is not None: + raise RuntimeError("Can't equivariance test a deployed model.") + + if args.no_compile: + model = model.to(device) + else: + print("Compile...") + # "Deploy" it + compile_time = time.time() + model = script(model) + model = _compile_for_deploy(model) + compile_time = time.time() - compile_time + print(f" compilation took {compile_time:.4f}s") + + # save and reload to avoid bugs + with tempfile.NamedTemporaryFile() as f: + torch.jit.save(model, f.name) + model = torch.jit.load(f.name, map_location=device) + # freeze like in the LAMMPS plugin + model = torch.jit.freeze(model) + # and reload again just to avoid bugs + torch.jit.save(model, f.name) + model = torch.jit.load(f.name, map_location=device) # Make sure we're warm past compilation warmup = config["_jit_bailout_depth"] + 4 # just to be safe... 
@@ -154,7 +227,7 @@ def trace_handler(p): p.export_chrome_trace(args.profile) print(f"Wrote profiling trace to `{args.profile}`") - print("Starting...") + print("Starting profiling...") with torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, @@ -168,6 +241,34 @@ def trace_handler(p): for _ in range(1 + warmup + args.n): model(next(datas).copy()) p.step() + elif args.pdb: + print("Running model under debugger...") + try: + for _ in range(args.n): + model(next(datas).copy()) + except: # noqa: E722 + traceback.print_exc() + pdb.post_mortem() + print("Done.") + elif args.equivariance_test: + print("Warmup...") + warmup_time = time.time() + for _ in range(warmup): + model(next(datas).copy()) + warmup_time = time.time() - warmup_time + print(f" {warmup} calls of warmup took {warmup_time:.4f}s") + print("Running equivariance test...") + errstr = assert_AtomicData_equivariant(model, datas_list) + print( + " Equivariance test passed; equivariance errors:\n" + " Errors are in real units, where relevant.\n" + " Please note that the large scale of the typical\n" + " shifts to the (atomic) energy can cause\n" + " catastrophic cancellation and give incorrectly\n" + " the equivariance error as zero for those fields.\n" + f"{errstr}" + ) + del errstr else: print("Warmup...") warmup_time = time.time() @@ -176,13 +277,17 @@ def trace_handler(p): warmup_time = time.time() - warmup_time print(f" {warmup} calls of warmup took {warmup_time:.4f}s") - print("Starting...") + print("Benchmarking...") # just time t = Timer( stmt="model(next(datas).copy())", globals={"model": model, "datas": datas} ) perloop: Measurement = t.timeit(args.n) + if args.memory_summary and torch.cuda.is_available(): + print("Memory usage summary:") + print(torch.cuda.memory_summary()) + print(" -- Results --") print( f"PLEASE NOTE: these are speeds for the MODEL, evaluated on --n-data={args.n_data} configurations kept in memory." diff --git a/nequip/scripts/deploy.py b/nequip/scripts/deploy.py index 8185ab75..394c0005 100644 --- a/nequip/scripts/deploy.py +++ b/nequip/scripts/deploy.py @@ -9,6 +9,7 @@ import pathlib import logging import yaml +import itertools # This is a weird hack to avoid Intel MKL issues on the cluster when this is called as a subprocess of a process that has itself initialized PyTorch. # Since numpy gets imported later anyway for dataset stuff, this shouldn't affect performance. @@ -129,7 +130,7 @@ def load_deployed_model( def main(args=None): parser = argparse.ArgumentParser( - description="Create and view information about deployed NequIP potentials." + description="Deploy and view information about previously deployed NequIP models." 
 )
 # backward compat for 3.6
 if sys.version_info[1] > 6:
@@ -146,6 +147,11 @@
         help="Path to a deployed model file.",
         type=pathlib.Path,
     )
+    info_parser.add_argument(
+        "--print-config",
+        help="Print the full config of the model.",
+        action="store_true",
+    )
 
     build_parser = subparsers.add_parser("build", help="Build a deployment model")
     build_parser.add_argument(
@@ -169,13 +175,25 @@
         logging.basicConfig(level=getattr(logging, args.verbose.upper()))
 
     if args.command == "info":
-        model, metadata = load_deployed_model(args.model_path, set_global_options=False)
-        del model
+        model, metadata = load_deployed_model(
+            args.model_path, set_global_options=False, freeze=False
+        )
         config = metadata.pop(CONFIG_KEY)
-        metadata_str = "\n".join("  %s: %s" % e for e in metadata.items())
-        logging.info(f"Loaded TorchScript model with metadata:\n{metadata_str}\n")
-        logging.info("Model was built with config:")
-        print(config)
+        if args.print_config:
+            print(config)
+        else:
+            metadata_str = "\n".join("  %s: %s" % e for e in metadata.items())
+            logging.info(f"Loaded TorchScript model with metadata:\n{metadata_str}\n")
+            logging.info(
+                f"Model has {sum(p.numel() for p in model.parameters())} weights"
+            )
+            logging.info(
+                f"Model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable weights"
+            )
+            logging.info(
+                f"Model weights and buffers take {sum(p.numel() * p.element_size() for p in itertools.chain(model.parameters(), model.buffers())) / (1024 * 1024):.2f} MB"
+            )
+            logging.debug(f"Model had config:\n{config}")
 
     elif args.command == "build":
         if args.model and args.train_dir:
@@ -198,7 +216,7 @@
             args.train_dir, model_name="best_model.pth", device="cpu"
         )
     elif args.model is not None:
-        model = model_from_config(config)
+        model = model_from_config(config, deploy=True)
     else:
         raise AssertionError
diff --git a/nequip/scripts/evaluate.py b/nequip/scripts/evaluate.py
index d67d750f..f7dfa12b 100644
--- a/nequip/scripts/evaluate.py
+++ b/nequip/scripts/evaluate.py
@@ -30,13 +30,13 @@ def main(args=None, running_as_script: bool = True):
         description=textwrap.dedent(
             """Compute the error of a model on a test set using various metrics.
 
-            The model, metrics, dataset, etc. can specified individually, or a training session can be indicated with `--train-dir`.
+            The model, metrics, dataset, etc. can be specified in individual YAML config files, or a training session can be indicated with `--train-dir`.
             In order of priority, the global settings (dtype, TensorFloat32, etc.) are taken from:
-              1. The model config (for a training session)
-              2. The dataset config (for a deployed model)
-              3. The defaults
+            (1) the model config (for a training session),
+            (2) the dataset config (for a deployed model),
+            or (3) the defaults.
 
-            Prints only the final result in `name = num` format to stdout; all other information is logging.debuged to stderr.
+            Prints only the final result in `name = num` format to stdout; all other information is `logging.debug`ed to stderr.
 
             WARNING: Please note that results of CUDA models are rarely exactly reproducible, and that even CPU models can be nondeterministic.
             """
@@ -74,7 +74,7 @@ def main(args=None, running_as_script: bool = True):
     parser.add_argument(
         "--batch-size",
-        help="Batch size to use. Larger is usually faster on GPU. If you run out of memory, lower this.",
+        help="Batch size to use. Larger is usually faster on GPU. If you run out of memory, lower this. You can also try to raise this for faster evaluation.
Default: 50.", type=int, default=50, ) diff --git a/nequip/scripts/train.py b/nequip/scripts/train.py index c6aa7785..88b55f7e 100644 --- a/nequip/scripts/train.py +++ b/nequip/scripts/train.py @@ -81,8 +81,12 @@ def main(args=None, running_as_script: bool = True): def parse_command_line(args=None): - parser = argparse.ArgumentParser(description="Train a NequIP model.") - parser.add_argument("config", help="configuration file") + parser = argparse.ArgumentParser( + description="Train (or restart training of) a NequIP model." + ) + parser.add_argument( + "config", help="YAML file configuring the model, dataset, and other options" + ) parser.add_argument( "--equivariance-test", help="test the model's equivariance before training on n (default 1) random frames from the dataset", diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 7ce4d4e1..55efec32 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -706,6 +706,9 @@ def init(self): self.num_weights = sum(p.numel() for p in self.model.parameters()) self.logger.info(f"Number of weights: {self.num_weights}") + self.logger.info( + f"Number of trainable weights: {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}" + ) self.rescale_layers = [] outer_layer = self.model @@ -1177,7 +1180,9 @@ def set_dataset( if self.n_train > len(dataset): raise ValueError("Not enough data in dataset for requested n_train") if self.n_val > len(validation_dataset): - raise ValueError("Not enough data in dataset for requested n_train") + raise ValueError( + "Not enough data in validation dataset for requested n_val" + ) if self.train_val_split == "random": self.train_idcs = torch.randperm( len(dataset), generator=self.dataset_rng diff --git a/nequip/utils/git.py b/nequip/utils/git.py index a78a87fc..a5fbe7f3 100644 --- a/nequip/utils/git.py +++ b/nequip/utils/git.py @@ -8,7 +8,14 @@ def get_commit(module: str) -> Optional[str]: module = import_module(module) - path = str(Path(module.__file__).parents[0] / "..") + package = Path(module.__file__).parents[0] + if package.is_file(): + # We're installed as a ZIP .egg file, + # which means there's no git information + # and looking for the parent would fail anyway + # https://github.com/mir-group/nequip/issues/264 + return None + path = str(package / "..") retcode = subprocess.run( "git show --oneline --abbrev=40 -s".split(), diff --git a/nequip/utils/regressor.py b/nequip/utils/regressor.py index 3d23cf84..76d140bc 100644 --- a/nequip/utils/regressor.py +++ b/nequip/utils/regressor.py @@ -1,181 +1,76 @@ import logging import torch -import numpy as np -from typing import Optional -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import DotProduct, Kernel, Hyperparameter +from torch import matmul +from torch.linalg import solve, inv +from typing import Optional, Sequence +from opt_einsum import contract -def solver(X, y, regressor: Optional[str] = "NormalizedGaussianProcess", **kwargs): - if regressor == "GaussianProcess": - return gp(X, y, **kwargs) - elif regressor == "NormalizedGaussianProcess": - return normalized_gp(X, y, **kwargs) - else: - raise NotImplementedError(f"{regressor} is not implemented") +def solver(X, y, alpha: Optional[float] = 0.001, stride: Optional[int] = 1, **kwargs): + # results are in the same "units" as y, so same dtype too: + dtype_out = y.dtype + # always solve in float64 for numerical stability + dtype = torch.float64 + X = X[::stride].to(dtype) + y = y[::stride].to(dtype) + + X, y = 
down_sampling_by_composition(X, y) + + X_norm = torch.sum(X) + + X = X / X_norm + y = y / X_norm -def normalized_gp(X, y, **kwargs): - feature_rms = 1.0 / np.sqrt(np.average(X**2, axis=0)) - feature_rms = np.nan_to_num(feature_rms, 1) y_mean = torch.sum(y) / torch.sum(X) - mean, std = base_gp( - X, - y - (torch.sum(X, axis=1) * y_mean).reshape(y.shape), - NormalizedDotProduct, - {"diagonal_elements": feature_rms}, - **kwargs, - ) - return mean + y_mean, std + feature_rms = torch.sqrt(torch.mean(X**2, axis=0)) -def gp(X, y, **kwargs): - return base_gp( - X, y, DotProduct, {"sigma_0": 0, "sigma_0_bounds": "fixed"}, **kwargs - ) + alpha_mat = torch.diag(feature_rms) * alpha * alpha + + A = matmul(X.T, X) + alpha_mat + dy = y - (torch.sum(X, axis=1, keepdim=True) * y_mean).reshape(y.shape) + Xy = matmul(X.T, dy) + + mean = solve(A, Xy) + + sigma2 = torch.var(matmul(X, mean) - dy) + Ainv = inv(A) + cov = torch.sqrt(sigma2 * contract("ij,kj,kl,li->i", Ainv, X, X, Ainv)) + mean = mean + y_mean.reshape([-1]) -def base_gp( - X, - y, - kernel, - kernel_kwargs, - alpha: Optional[float] = 0.1, - max_iteration: int = 20, - stride: Optional[int] = None, + logging.debug(f"Ridge Regression, residue {sigma2}") + + return mean.to(dtype_out), cov.to(dtype_out) + + +def down_sampling_by_composition( + X: torch.Tensor, y: torch.Tensor, percentage: Sequence = [0.25, 0.5, 0.75] ): - if len(y.shape) == 1: - y = y.reshape([-1, 1]) - - if stride is not None: - X = X[::stride] - y = y[::stride] - - not_fit = True - iteration = 0 - mean = None - std = None - while not_fit: - logging.debug(f"GP fitting iteration {iteration} {alpha}") - try: - _kernel = kernel(**kernel_kwargs) - gpr = GaussianProcessRegressor(kernel=_kernel, random_state=0, alpha=alpha) - gpr = gpr.fit(X, y) - - vec = torch.diag(torch.ones(X.shape[1])) - mean, std = gpr.predict(vec, return_std=True) - - mean = torch.as_tensor(mean, dtype=torch.get_default_dtype()).reshape([-1]) - # ignore all the off-diagonal terms - std = torch.as_tensor(std, dtype=torch.get_default_dtype()).reshape([-1]) - likelihood = gpr.log_marginal_likelihood() - - res = torch.sqrt( - torch.square(torch.matmul(X, mean.reshape([-1, 1])) - y).mean() - ) - - logging.debug( - f"GP fitting: alpha {alpha}:\n" - f" residue {res}\n" - f" mean {mean} std {std}\n" - f" log marginal likelihood {likelihood}" - ) - not_fit = False - - except Exception as e: - logging.info(f"GP fitting failed for alpha={alpha} and {e.args}") - if alpha == 0 or alpha is None: - logging.info("try a non-zero alpha") - not_fit = False - raise ValueError( - f"Please set the {alpha} to non-zero value. \n" - "The dataset energy is rank deficient to be solved with GP" - ) - else: - alpha = alpha * 2 - iteration += 1 - logging.debug(f" increase alpha to {alpha}") - - if iteration >= max_iteration or not_fit is False: - raise ValueError( - "Please set the per species shift and scale to zeros and ones. \n" - "The dataset energy is to diverge to be solved with GP" - ) - - return mean, std - - -class NormalizedDotProduct(Kernel): - r"""Dot-Product kernel. - .. math:: - k(x_i, x_j) = x_i \cdot A \cdot x_j - """ - - def __init__(self, diagonal_elements): - # TO DO: check shape - self.diagonal_elements = diagonal_elements - self.A = np.diag(diagonal_elements) - - def __call__(self, X, Y=None, eval_gradient=False): - """Return the kernel k(X, Y) and optionally its gradient. 
- Parameters - ---------- - X : ndarray of shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y) - Y : ndarray of shape (n_samples_Y, n_features), default=None - Right argument of the returned kernel k(X, Y). If None, k(X, X) - if evaluated instead. - eval_gradient : bool, default=False - Determines whether the gradient with respect to the log of - the kernel hyperparameter is computed. - Only supported when Y is None. - Returns - ------- - K : ndarray of shape (n_samples_X, n_samples_Y) - Kernel k(X, Y) - K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ - optional - The gradient of the kernel k(X, X) with respect to the log of the - hyperparameter of the kernel. Only returned when `eval_gradient` - is True. - """ - X = np.atleast_2d(X) - if Y is None: - K = (X.dot(self.A)).dot(X.T) - else: - if eval_gradient: - raise ValueError("Gradient can only be evaluated when Y is None.") - K = (X.dot(self.A)).dot(Y.T) - - if eval_gradient: - return K, np.empty((X.shape[0], X.shape[0], 0)) - else: - return K - - def diag(self, X): - """Returns the diagonal of the kernel k(X, X). - The result of this method is identical to np.diag(self(X)); however, - it can be evaluated more efficiently since only the diagonal is - evaluated. - Parameters - ---------- - X : ndarray of shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y). - Returns - ------- - K_diag : ndarray of shape (n_samples_X,) - Diagonal of kernel k(X, X). - """ - return np.einsum("ij,ij,jj->i", X, X, self.A) - - def __repr__(self): - return "" - - def is_stationary(self): - """Returns whether the kernel is stationary.""" - return False - - @property - def hyperparameter_diagonal_elements(self): - return Hyperparameter("diagonal_elements", "numeric", "fixed") + unique_comps, comp_ids = torch.unique(X, dim=0, return_inverse=True) + + n_types = torch.max(comp_ids) + 1 + + sort_by = torch.argsort(comp_ids) + + # find out the block for each composition + d_icomp = comp_ids[sort_by] + d_icomp = d_icomp[:-1] - d_icomp[1:] + node_icomp = torch.where(d_icomp != 0)[0] + id_start = torch.cat((torch.as_tensor([0]), node_icomp + 1)) + id_end = torch.cat((node_icomp + 1, torch.as_tensor([len(sort_by)]))) + + n_points = len(percentage) + new_X = torch.zeros( + (n_types * n_points, X.shape[1]), dtype=X.dtype, device=X.device + ) + new_y = torch.zeros((n_types * n_points), dtype=y.dtype, device=y.device) + for i in range(n_types): + ids = sort_by[id_start[i] : id_end[i]] + for j, p in enumerate(percentage): + new_y[i * n_points + j] = torch.quantile(y[ids], p, interpolation="linear") + new_X[i * n_points + j] = unique_comps[i] + + return new_X, new_y diff --git a/nequip/utils/test.py b/nequip/utils/test.py index edf2c1e8..60e68730 100644 --- a/nequip/utils/test.py +++ b/nequip/utils/test.py @@ -28,7 +28,7 @@ def assert_permutation_equivariant( data_in: AtomicDataDict.Type, tolerance: Optional[float] = None, raise_error: bool = True, -): +) -> str: r"""Test the permutation equivariance of ``func``. Standard fields are assumed to be equivariant to node or edge permutations according to their standard interpretions; all other fields are assumed to be invariant to all permutations. Non-standard fields can be registered as node/edge permutation equivariant using ``register_fields``. 
@@ -93,38 +93,42 @@ def assert_permutation_equivariant( out_perm.keys() ), "Permutation changed the set of fields returned by model" - problems = [] + messages = [] + num_problems: int = 0 for k in out_orig.keys(): if k in node_permute_fields: - if not torch.allclose(out_orig[k][node_perm], out_perm[k], atol=atol): - err = (out_orig[k][node_perm] - out_perm[k]).abs().max() - problems.append( - f"node permutation equivariance violated for field {k}; maximum componentwise error: {err:e}" - ) + err = (out_orig[k][node_perm] - out_perm[k]).abs().max() + fail = not torch.allclose(out_orig[k][node_perm], out_perm[k], atol=atol) + if fail: + num_problems += 1 + messages.append( + f" node permutation equivariance of field {k:20} -> max error={err:.3e}{' FAIL' if fail else ''}" + ) elif k in edge_permute_fields: - if not torch.allclose(out_orig[k][edge_perm], out_perm[k], atol=atol): - err = (out_orig[k][edge_perm] - out_perm[k]).abs().max() - problems.append( - f"edge permutation equivariance violated for field {k}; maximum componentwise error: {err:e}" - ) + err = (out_orig[k][edge_perm] - out_perm[k]).abs().max() + fail = not torch.allclose(out_orig[k][edge_perm], out_perm[k], atol=atol) + if fail: + num_problems += 1 + messages.append( + f" edge permutation equivariance of field {k:20} -> max error={err:.3e}{' FAIL' if fail else ''}" + ) elif k == AtomicDataDict.EDGE_INDEX_KEY: pass else: # Assume invariant if out_orig[k].dtype == torch.bool: - if not torch.all(out_orig[k] == out_perm[k]): - problems.append( - f"edge/node permutation invariance violated for field {k} ({k} was assumed to be invariant, should it have been marked as equivariant?)" - ) + err = (out_orig[k] != out_perm[k]).max() else: - if not torch.allclose(out_orig[k], out_perm[k], atol=atol): - err = (out_orig[k] - out_perm[k]).abs().max() - problems.append( - f"edge/node permutation invariance violated for field {k}; maximum componentwise error: {err:e}. (`{k}` was assumed to be invariant, should it have been marked as equivariant?)" - ) - msg = "\n".join(problems) - if len(problems) == 0: - return + err = (out_orig[k] - out_perm[k]).abs().max() + fail = not torch.allclose(out_orig[k], out_perm[k], atol=atol) + if fail: + num_problems += 1 + messages.append( + f" edge & node permutation invariance for field {k:20} -> max error={err:.3e}{' FAIL' if fail else ''}" + ) + msg = "\n".join(messages) + if num_problems == 0: + return msg else: if raise_error: raise AssertionError(msg) @@ -169,7 +173,7 @@ def assert_AtomicData_equivariant( # == Test permutation of graph nodes == # since permutation is discrete and should not be data dependent, run only on one frame. 
- permutation_problems = assert_permutation_equivariant( + permutation_message = assert_permutation_equivariant( func, data_in[0], tolerance=permutation_tolerance, raise_error=False ) @@ -255,53 +259,23 @@ def wrapper(*args): if o3_tolerance is None: o3_tolerance = FLOAT_TOLERANCE[torch.get_default_dtype()] - anerr = next(iter(errs.values())) - if isinstance(anerr, float) or anerr.ndim == 0: - # old e3nn doesn't report which key - problems = {k: v for k, v in errs.items() if v > o3_tolerance} - - def _describe(errors): - return ( - permutation_problems + "\n" if permutation_problems is not None else "" - ) + "\n".join( - "(parity_k={:d}, did_translate={}) -> max error={:.3e}".format( - int(k[0]), - bool(k[1]), - float(v), - ) - for k, v in errors.items() - ) - - if len(problems) > 0 or permutation_problems is not None: - raise AssertionError( - "Equivariance test failed for cases:" + _describe(problems) - ) - - return _describe(errs) - else: - # it's newer and tells us which is which - all_errs = [] - for case, err in errs.items(): - for key, this_err in zip(irreps_out.keys(), err): - all_errs.append(case + (key, this_err)) - problems = [e for e in all_errs if e[-1] > o3_tolerance] - - def _describe(errors): - return ( - permutation_problems + "\n" if permutation_problems is not None else "" - ) + "\n".join( - " (parity_k={:1d}, did_translate={:5}, field={:20}) -> max error={:.3e}".format( - int(k[0]), str(bool(k[1])), str(k[2]), float(k[3]) - ) - for k in errors - ) + all_errs = [] + for case, err in errs.items(): + for key, this_err in zip(irreps_out.keys(), err): + all_errs.append(case + (key, this_err)) + is_problem = [e[-1] > o3_tolerance for e in all_errs] + + message = (permutation_message + "\n") + "\n".join( + " (parity_k={:1d}, did_translate={:5}, field={:20}) -> max error={:.3e}".format( + int(k[0]), str(bool(k[1])), str(k[2]), float(k[3]) + ) + for k, prob in zip(all_errs, is_problem) + ) - if len(problems) > 0 or permutation_problems is not None: - raise AssertionError( - "Equivariance test failed for cases:\n" + _describe(problems) - ) + if sum(is_problem) > 0 or "FAIL" in permutation_message: + raise AssertionError(f"Equivariance test failed for cases:\n{message}") - return _describe(all_errs) + return message _DEBUG_HOOKS = None diff --git a/nequip/utils/unittests/__init__.py b/nequip/utils/unittests/__init__.py new file mode 100644 index 00000000..2309cb02 --- /dev/null +++ b/nequip/utils/unittests/__init__.py @@ -0,0 +1,3 @@ +import pathlib + +CONFTEST_PATH = pathlib.Path(__file__).parent / "conftest.py" diff --git a/nequip/utils/unittests/conftest.py b/nequip/utils/unittests/conftest.py new file mode 100644 index 00000000..4cfa98ff --- /dev/null +++ b/nequip/utils/unittests/conftest.py @@ -0,0 +1,154 @@ +from typing import List, Tuple +import numpy as np +import pathlib +import pytest +import tempfile +import os + +from ase.atoms import Atoms +from ase.build import molecule +from ase.calculators.singlepoint import SinglePointCalculator +from ase.io import write + +import torch + +from nequip.utils.test import set_irreps_debug +from nequip.data import AtomicData, ASEDataset +from nequip.data.transforms import TypeMapper +from nequip.utils.torch_geometric import Batch +from nequip.utils._global_options import _set_global_options +from nequip.utils.misc import dtype_from_name + +if "NEQUIP_NUM_TASKS" not in os.environ: + # Test parallelization, but don't waste time spawning tons of workers if lots of cores available + os.environ["NEQUIP_NUM_TASKS"] = "2" + +# The default 
float tolerance
+FLOAT_TOLERANCE = {
+    t: torch.as_tensor(v, dtype=dtype_from_name(t))
+    for t, v in {"float32": 1e-3, "float64": 1e-10}.items()
+}
+
+
+@pytest.fixture(scope="session", autouse=True, params=["float32", "float64"])
+def float_tolerance(request):
+    """Run all tests with various PyTorch default dtypes.
+
+    This is a session-wide, autouse fixture — you only need to request it explicitly if a test needs to know the tolerance for the current default dtype.
+
+    Returns
+    -------
+    A precision threshold to use for closeness tests.
+    """
+    old_dtype = torch.get_default_dtype()
+    dtype = request.param
+    _set_global_options({"default_dtype": dtype})
+    yield FLOAT_TOLERANCE[dtype]
+    _set_global_options(
+        {
+            "default_dtype": {torch.float32: "float32", torch.float64: "float64"}[
+                old_dtype
+            ]
+        }
+    )
+
+
+# - Ampere and TF32 -
+# Many of the tests for NequIP involve numerically checking
+# algebraic properties: normalization, equivariance,
+# continuity, etc.
+# With the added numerical noise of TF32, some of those tests fail
+# with the current (and usually generous) thresholds.
+#
+# Thus we go on the assumption that PyTorch + NVIDIA got everything
+# right, that this setting DOES NOT AFFECT the model outputs except
+# for increased numerical noise, and only test without it.
+#
+# TODO: consider running tests with and without
+# TODO: check how much thresholds have to be changed to accommodate TF32
+torch.backends.cuda.matmul.allow_tf32 = False
+torch.backends.cudnn.allow_tf32 = False
+
+
+@pytest.fixture(scope="session")
+def BENCHMARK_ROOT():
+    return pathlib.Path(__file__).parent / "../benchmark_data/"
+
+
+@pytest.fixture(scope="session")
+def temp_data(float_tolerance):
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        yield tmpdirname
+
+
+@pytest.fixture(scope="session")
+def CH3CHO(CH3CHO_no_typemap) -> Tuple[Atoms, AtomicData]:
+    atoms, data = CH3CHO_no_typemap
+    tm = TypeMapper(chemical_symbol_to_type={"C": 0, "O": 1, "H": 2})
+    data = tm(data)
+    return atoms, data
+
+
+@pytest.fixture(scope="session")
+def CH3CHO_no_typemap(float_tolerance) -> Tuple[Atoms, AtomicData]:
+    atoms = molecule("CH3CHO")
+    data = AtomicData.from_ase(atoms, r_max=2.0)
+    return atoms, data
+
+
+@pytest.fixture(scope="session")
+def molecules() -> List[Atoms]:
+    atoms_list = []
+    for i in range(8):
+        atoms = molecule("CH3CHO" if i % 2 == 0 else "H2")
+        atoms.rattle()
+        atoms.calc = SinglePointCalculator(
+            energy=np.random.random(),
+            forces=np.random.random((len(atoms), 3)),
+            stress=None,
+            magmoms=None,
+            atoms=atoms,
+        )
+        atoms_list.append(atoms)
+    return atoms_list
+
+
+@pytest.fixture(scope="session")
+def nequip_dataset(molecules, temp_data, float_tolerance):
+    with tempfile.NamedTemporaryFile(suffix=".xyz") as fp:
+        for atoms in molecules:
+            write(fp.name, atoms, format="extxyz", append=True)
+        a = ASEDataset(
+            file_name=fp.name,
+            root=temp_data,
+            extra_fixed_fields={"r_max": 3.0},
+            ase_args=dict(format="extxyz"),
+            type_mapper=TypeMapper(chemical_symbol_to_type={"H": 0, "C": 1, "O": 2}),
+        )
+        yield a
+
+
+@pytest.fixture(scope="session")
+def atomic_batch(nequip_dataset):
+    return Batch.from_data_list([nequip_dataset[0], nequip_dataset[1]])
+
+
+@pytest.fixture(scope="function")
+def per_species_set():
+    dtype = torch.get_default_dtype()
+    rng = torch.Generator().manual_seed(127)
+    mean_min = 1
+    mean_max = 100
+    std = 20
+    n_sample = 1000
+    n_species = 9
+    ref_mean = torch.rand((n_species), generator=rng) * (mean_max - mean_min) + mean_min
+    t_mean = torch.ones((n_sample, 1)) 
* ref_mean.reshape([1, -1]) + ref_std = torch.rand((n_species), generator=rng) * std + t_std = torch.ones((n_sample, 1)) * ref_std.reshape([1, -1]) + E = torch.normal(t_mean, t_std, generator=rng) + return ref_mean.to(dtype), ref_std.to(dtype), E.to(dtype), n_sample, n_species + + +# Use debug mode +set_irreps_debug(True) diff --git a/tests/unit/model/test_eng_force.py b/nequip/utils/unittests/model_tests.py similarity index 56% rename from tests/unit/model/test_eng_force.py rename to nequip/utils/unittests/model_tests.py index 0adcd4c9..2b6a8b63 100644 --- a/tests/unit/model/test_eng_force.py +++ b/nequip/utils/unittests/model_tests.py @@ -1,150 +1,87 @@ import pytest -import logging import tempfile import functools import torch import numpy as np -from e3nn import o3 from e3nn.util.jit import script -from nequip.data import AtomicDataDict, AtomicData, Collater +from nequip.data import ( + AtomicDataDict, + AtomicData, + Collater, + _GRAPH_FIELDS, + _NODE_FIELDS, + _EDGE_FIELDS, +) from nequip.data.transforms import TypeMapper -from nequip.model import model_from_config, uniform_initialize_FCs -from nequip.nn import GraphModuleMixin, AtomwiseLinear +from nequip.model import model_from_config +from nequip.nn import GraphModuleMixin from nequip.utils.test import assert_AtomicData_equivariant -logging.basicConfig(level=logging.DEBUG) - -COMMON_CONFIG = { - "num_types": 3, - "types_names": ["H", "C", "O"], - "avg_num_neighbors": None, -} -r_max = 3 -minimal_config1 = dict( - irreps_edge_sh="0e + 1o", - r_max=4, - feature_irreps_hidden="4x0e + 4x1o", - num_layers=2, - num_basis=8, - PolynomialCutoff_p=6, - nonlinearity_type="norm", - **COMMON_CONFIG -) -minimal_config2 = dict( - irreps_edge_sh="0e + 1o", - r_max=4, - chemical_embedding_irreps_out="8x0e + 8x0o + 8x1e + 8x1o", - irreps_mid_output_block="2x0e", - feature_irreps_hidden="4x0e + 4x1o", - **COMMON_CONFIG -) -minimal_config3 = dict( - irreps_edge_sh="0e + 1o", - r_max=4, - feature_irreps_hidden="4x0e + 4x1o", - num_layers=2, - num_basis=8, - PolynomialCutoff_p=6, - nonlinearity_type="gate", - **COMMON_CONFIG -) -minimal_config4 = dict( - irreps_edge_sh="0e + 1o + 2e", - r_max=4, - feature_irreps_hidden="2x0e + 2x1o + 2x2e", - num_layers=2, - num_basis=3, - PolynomialCutoff_p=6, - nonlinearity_type="gate", - # test custom nonlinearities - nonlinearity_scalars={"e": "silu", "o": "tanh"}, - nonlinearity_gates={"e": "silu", "o": "abs"}, - **COMMON_CONFIG -) - - -@pytest.fixture( - scope="module", - params=[minimal_config1, minimal_config2, minimal_config3, minimal_config4], -) -def config(request): - return request.param +# see https://github.com/pytest-dev/pytest/issues/421#issuecomment-943386533 +# to allow external packages to import tests through subclassing +class BaseModelTests: + @pytest.fixture(scope="class") + def config(self): + """Implemented by subclasses. 
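+
+        The config must at least provide the keys the shared tests read, such as
+        ``model_builders`` and ``r_max``; ``num_types`` and ``types_names`` are
+        filled in by ``make_model``.
+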
+ Return a tuple of config, out_field + """ + raise NotImplementedError -@pytest.fixture( - params=[ - ( - ["EnergyModel", "ForceOutput"], - AtomicDataDict.FORCE_KEY, + @pytest.fixture( + scope="class", + params=( + [torch.device("cuda"), torch.device("cpu")] + if torch.cuda.is_available() + else [torch.device("cpu")] ), - ( - ["EnergyModel"], - AtomicDataDict.TOTAL_ENERGY_KEY, - ), - ( - ["EnergyModel", "StressForceOutput"], - AtomicDataDict.STRESS_KEY, - ), - ] -) -def model(request, config): - torch.manual_seed(0) - np.random.seed(0) - builder, out_field = request.param - config = config.copy() - config["model_builders"] = builder - return model_from_config(config), out_field - - -@pytest.fixture( - scope="module", - params=( - [torch.device("cuda"), torch.device("cpu")] - if torch.cuda.is_available() - else [torch.device("cpu")] - ), -) -def device(request): - return request.param - + ) + def device(self, request): + return request.param + + @staticmethod + def make_model(config, device, initialize: bool = True, deploy: bool = False): + torch.manual_seed(127) + np.random.seed(193) + config = config.copy() + config.update( + { + "num_types": 3, + "types_names": ["H", "C", "O"], + } + ) + model = model_from_config(config, initialize=initialize, deploy=deploy) + model = model.to(device) + return model -class TestWorkflow: - """ - test class methods - """ + @pytest.fixture(scope="class") + def model(self, config, device): + config, out_fields = config + model = self.make_model(config, device=device) + return model, out_fields + # == common tests for all models == def test_init(self, model): instance, _ = model assert isinstance(instance, GraphModuleMixin) - def test_weight_init(self, model, atomic_batch, device): - instance, out_field = model - data = AtomicData.to_AtomicDataDict(atomic_batch.to(device=device)) - instance = instance.to(device=device) - - out_orig = instance(data)[out_field] - - instance = uniform_initialize_FCs(instance, initialize=True) - - out_unif = instance(data)[out_field] - assert not torch.allclose(out_orig, out_unif) - def test_jit(self, model, atomic_batch, device): - instance, out_field = model + instance, out_fields = model data = AtomicData.to_AtomicDataDict(atomic_batch.to(device=device)) instance = instance.to(device=device) model_script = script(instance) - assert torch.allclose( - instance(data)[out_field], - model_script(data)[out_field], - atol=1e-6, - ) + for out_field in out_fields: + assert torch.allclose( + instance(data)[out_field], + model_script(data)[out_field], + atol=1e-6, + ) # - Try saving, loading in another process, and running - with tempfile.TemporaryDirectory() as tmpdir: @@ -163,39 +100,25 @@ def test_jit(self, model, atomic_batch, device): torch.float64: 1e-10, }[torch.get_default_dtype()] - assert torch.allclose( - model_script(data)[out_field], - load_model(load_dat)[out_field], - atol=atol, - ) - - def test_submods(self): - config = minimal_config2.copy() - config["model_builders"] = ["EnergyModel"] - model = model_from_config(config=config, initialize=True) - assert isinstance(model.chemical_embedding, AtomwiseLinear) - true_irreps = o3.Irreps(minimal_config2["chemical_embedding_irreps_out"]) - assert ( - model.chemical_embedding.irreps_out[model.chemical_embedding.out_field] - == true_irreps - ) - # Make sure it propagates - assert ( - model.layer0_convnet.irreps_in[model.chemical_embedding.out_field] - == true_irreps - ) + for out_field in out_fields: + assert torch.allclose( + model_script(data)[out_field], + 
load_model(load_dat)[out_field], + atol=atol, + ) def test_forward(self, model, atomic_batch, device): - instance, out_field = model + instance, out_fields = model instance.to(device) data = atomic_batch.to(device) output = instance(AtomicData.to_AtomicDataDict(data)) - assert out_field in output + for out_field in out_fields: + assert out_field in output def test_batch(self, model, atomic_batch, device, float_tolerance): """Confirm that the results for individual examples are the same regardless of whether they are batched.""" allclose = functools.partial(torch.allclose, atol=float_tolerance) - instance, out_field = model + instance, out_fields = model instance.to(device) data = atomic_batch.to(device) data1 = data.get_example(0) @@ -203,33 +126,186 @@ def test_batch(self, model, atomic_batch, device, float_tolerance): output1 = instance(AtomicData.to_AtomicDataDict(data1)) output2 = instance(AtomicData.to_AtomicDataDict(data2)) output = instance(AtomicData.to_AtomicDataDict(data)) - if out_field in (AtomicDataDict.TOTAL_ENERGY_KEY, AtomicDataDict.STRESS_KEY): - assert allclose( - output1[out_field], - output[out_field][0], - ) - assert allclose( - output2[out_field], - output[out_field][1], - ) - elif out_field in (AtomicDataDict.FORCE_KEY,): - assert allclose( - output1[out_field], - output[out_field][output[AtomicDataDict.BATCH_KEY] == 0], - ) - assert allclose( - output2[out_field], - output[out_field][output[AtomicDataDict.BATCH_KEY] == 1], - ) + for out_field in out_fields: + if out_field in _GRAPH_FIELDS: + assert allclose( + output1[out_field], + output[out_field][0], + ) + assert allclose( + output2[out_field], + output[out_field][1], + ) + elif out_field in _NODE_FIELDS: + assert allclose( + output1[out_field], + output[out_field][output[AtomicDataDict.BATCH_KEY] == 0], + ) + assert allclose( + output2[out_field], + output[out_field][output[AtomicDataDict.BATCH_KEY] == 1], + ) + elif out_field in _EDGE_FIELDS: + assert allclose( + output1[out_field], + output[out_field][ + output[AtomicDataDict.BATCH_KEY][ + output[AtomicDataDict.EDGE_INDEX_KEY][0] + ] + == 0 + ], + ) + assert allclose( + output2[out_field], + output[out_field][ + output[AtomicDataDict.BATCH_KEY][ + output[AtomicDataDict.EDGE_INDEX_KEY][0] + ] + == 1 + ], + ) + else: + raise NotImplementedError + + def test_equivariance(self, model, atomic_batch, device): + instance, out_fields = model + instance = instance.to(device=device) + atomic_batch = atomic_batch.to(device=device) + assert_AtomicData_equivariant(func=instance, data_in=atomic_batch) + + def test_embedding_cutoff(self, model, config, device): + instance, out_fields = model + config, out_fields = config + r_max = config["r_max"] + + # make a synthetic three atom example + data = AtomicData( + atom_types=np.random.choice([0, 1, 2], size=3), + pos=np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), + edge_index=np.array([[0, 1, 0, 2], [1, 0, 2, 0]]), + ) + data = data.to(device) + edge_embed = instance(AtomicData.to_AtomicDataDict(data)) + if AtomicDataDict.EDGE_FEATURES_KEY in edge_embed: + key = AtomicDataDict.EDGE_FEATURES_KEY else: - raise NotImplementedError + key = AtomicDataDict.EDGE_EMBEDDING_KEY + edge_embed = edge_embed[key] + data.pos[2, 1] = r_max # put it past the cutoff + edge_embed2 = instance(AtomicData.to_AtomicDataDict(data))[key] + if key == AtomicDataDict.EDGE_EMBEDDING_KEY: + # we can only check that other edges are unaffected if we know it's an embedding + # For example, an Allegro edge feature is many body so will be affected 
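+            # in this geometry edges 0 and 1 connect atoms 0 and 1, which did not
+            # move, while edges 2 and 3 involve atom 2, which now sits exactly at
+            # the cutoff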
+            assert torch.allclose(edge_embed[:2], edge_embed2[:2])
+        assert edge_embed[2:].abs().sum() > 1e-6  # some nonzero terms
+        assert torch.allclose(edge_embed2[2:], torch.zeros(1, device=device))
+
+        # test gradients
+        in_dict = AtomicData.to_AtomicDataDict(data)
+        in_dict[AtomicDataDict.POSITIONS_KEY].requires_grad_(True)
+
+        with torch.autograd.set_detect_anomaly(True):
+            out = instance(in_dict)
+
+        # is the edge embedding of the cutoff length edge unchanged at the cutoff?
+        grads = torch.autograd.grad(
+            outputs=out[key][2:].sum(),
+            inputs=in_dict[AtomicDataDict.POSITIONS_KEY],
+            retain_graph=True,
+        )[0]
+        assert torch.allclose(grads, torch.zeros(1, device=device))
+
+        if AtomicDataDict.PER_ATOM_ENERGY_KEY in out:
+            # are the first two atoms' energies unaffected by the atom at the cutoff?
+            grads = torch.autograd.grad(
+                outputs=out[AtomicDataDict.PER_ATOM_ENERGY_KEY][:2].sum(),
+                inputs=in_dict[AtomicDataDict.POSITIONS_KEY],
+            )[0]
+            print(grads)
+            # only care about the gradient wrt the moved atom
+            assert grads.shape == (3, 3)
+            assert torch.allclose(grads[2], torch.zeros(1, device=device))
+
+
+class BaseEnergyModelTests(BaseModelTests):
+    def test_large_separation(self, model, config, molecules, device):
+        atol = {torch.float32: 1e-4, torch.float64: 1e-10}[torch.get_default_dtype()]
+        instance, _ = model
+        instance.to(device)
+        config, out_fields = config
+        r_max = config["r_max"]
+        atoms1 = molecules[0].copy()
+        atoms2 = molecules[1].copy()
+        # translate atoms2 far away
+        atoms2.positions += 40.0 + np.random.randn(3)
+        atoms_both = atoms1.copy()
+        atoms_both.extend(atoms2)
+        tm = TypeMapper(chemical_symbols=["H", "C", "O"])
+        data1 = tm(AtomicData.from_ase(atoms1, r_max=r_max).to(device=device))
+        data2 = tm(AtomicData.from_ase(atoms2, r_max=r_max).to(device=device))
+        data_both = tm(AtomicData.from_ase(atoms_both, r_max=r_max).to(device=device))
+        assert (
+            data_both[AtomicDataDict.EDGE_INDEX_KEY].shape[1]
+            == data1[AtomicDataDict.EDGE_INDEX_KEY].shape[1]
+            + data2[AtomicDataDict.EDGE_INDEX_KEY].shape[1]
+        )
+
+        out1 = instance(AtomicData.to_AtomicDataDict(data1))
+        out2 = instance(AtomicData.to_AtomicDataDict(data2))
+        out_both = instance(AtomicData.to_AtomicDataDict(data_both))
+
+        assert torch.allclose(
+            out1[AtomicDataDict.TOTAL_ENERGY_KEY]
+            + out2[AtomicDataDict.TOTAL_ENERGY_KEY],
+            out_both[AtomicDataDict.TOTAL_ENERGY_KEY],
+            atol=atol,
+        )
+
+        atoms_both2 = atoms1.copy()
+        atoms3 = atoms2.copy()
+        atoms3.positions += np.random.randn(3)
+        atoms_both2.extend(atoms3)
+        data_both2 = tm(AtomicData.from_ase(atoms_both2, r_max=r_max).to(device=device))
+        out_both2 = instance(AtomicData.to_AtomicDataDict(data_both2))
+        assert torch.allclose(
+            out_both2[AtomicDataDict.TOTAL_ENERGY_KEY],
+            out_both[AtomicDataDict.TOTAL_ENERGY_KEY],
+            atol=atol,
+        )
+        assert torch.allclose(
+            out_both2[AtomicDataDict.PER_ATOM_ENERGY_KEY],
+            out_both[AtomicDataDict.PER_ATOM_ENERGY_KEY],
+            atol=atol,
+        )
+
+    def test_cross_frame_grad(self, model, device, nequip_dataset):
+        c = Collater.for_dataset(nequip_dataset)
+        batch = c([nequip_dataset[i] for i in range(len(nequip_dataset))])
+        energy_model, out_fields = model
+        energy_model.to(device)
+        data = AtomicData.to_AtomicDataDict(batch.to(device))
+        data[AtomicDataDict.POSITIONS_KEY].requires_grad = True
+
+        output = 
energy_model(data) + grads = torch.autograd.grad( + outputs=output[AtomicDataDict.TOTAL_ENERGY_KEY][-1], + inputs=data[AtomicDataDict.POSITIONS_KEY], + allow_unused=True, + )[0] + + last_frame_n_atom = batch.ptr[-1] - batch.ptr[-2] + + in_frame_grad = grads[-last_frame_n_atom:] + cross_frame_grad = grads[:-last_frame_n_atom] + + assert cross_frame_grad.abs().max().item() == 0 + assert in_frame_grad.abs().max().item() > 0 + + def test_numeric_gradient(self, model, atomic_batch, device): + model, out_fields = model + if AtomicDataDict.FORCE_KEY not in out_fields: + pytest.skip() model.to(device) data = atomic_batch.to(device) output = model(AtomicData.to_AtomicDataDict(data)) @@ -256,16 +332,15 @@ def test_numeric_gradient(self, config, atomic_batch, device, float_tolerance): numeric, analytical, rtol=5e-2 ) - def test_partial_forces(self, atomic_batch, device): - config = minimal_config1.copy() - config["model_builders"] = [ - "EnergyModel", - "ForceOutput", - ] + def test_partial_forces(self, config, atomic_batch, device, strict_locality): + config, out_fields = config + if "ForceOutput" not in config["model_builders"]: + pytest.skip() + config = config.copy() partial_config = config.copy() partial_config["model_builders"] = [ - "EnergyModel", - "PartialForceOutput", + "PartialForceOutput" if b == "ForceOutput" else b + for b in partial_config["model_builders"] ] model = model_from_config(config=config, initialize=True) partial_model = model_from_config(config=partial_config, initialize=True) @@ -284,7 +359,7 @@ def test_partial_forces(self, atomic_batch, device): assert torch.allclose( output[k], output_partial[k], - atol=1e-6 if k == AtomicDataDict.FORCE_KEY else 1e-8, + atol=1e-8 if k == AtomicDataDict.TOTAL_ENERGY_KEY else 1e-6, ) else: assert torch.equal(output[k], output_partial[k]) @@ -293,152 +368,17 @@ def test_partial_forces(self, atomic_batch, device): assert partial_forces.shape == (n_at, n_at, 3) # confirm that sparsity matches graph topology: edge_index = data[AtomicDataDict.EDGE_INDEX_KEY] - adjacency = torch.zeros(n_at, n_at, dtype=torch.bool) - strict_locality = False + adjacency = torch.zeros( + n_at, n_at, dtype=torch.bool, device=partial_forces.device + ) if strict_locality: # only adjacent for nonzero deriv to neighbors adjacency[edge_index[0], edge_index[1]] = True - adjacency[ - torch.arange(n_at), torch.arange(n_at) - ] = True # diagonal is ofc True + arange = torch.arange(n_at, device=partial_forces.device) + adjacency[arange, arange] = True # diagonal is ofc True else: # technically only adjacent to n-th degree neighbor, but in this tiny test system that is same as all-to-all and easier to program adjacency = data[AtomicDataDict.BATCH_KEY].view(-1, 1) == data[ AtomicDataDict.BATCH_KEY ].view(1, -1) assert torch.equal(adjacency, torch.any(partial_forces != 0, dim=-1)) - - -class TestAutoGradient: - def test_cross_frame_grad(self, config, nequip_dataset): - c = Collater.for_dataset(nequip_dataset) - batch = c([nequip_dataset[i] for i in range(len(nequip_dataset))]) - device = "cpu" - config = config.copy() - config["model_builders"] = ["EnergyModel"] - energy_model = model_from_config(config=config, initialize=True) - energy_model.to(device) - data = AtomicData.to_AtomicDataDict(batch.to(device)) - data[AtomicDataDict.POSITIONS_KEY].requires_grad = True - - output = energy_model(data) - grads = torch.autograd.grad( - outputs=output[AtomicDataDict.TOTAL_ENERGY_KEY][-1], - inputs=data[AtomicDataDict.POSITIONS_KEY], - allow_unused=True, - )[0] - - last_frame_n_atom = 
batch.ptr[-1] - batch.ptr[-2] - - in_frame_grad = grads[-last_frame_n_atom:] - cross_frame_grad = grads[:-last_frame_n_atom] - - assert cross_frame_grad.abs().max().item() == 0 - assert in_frame_grad.abs().max().item() > 0 - - -class TestEquivariance: - def test_forward(self, model, atomic_batch, device): - instance, out_field = model - instance = instance.to(device=device) - atomic_batch = atomic_batch.to(device=device) - assert_AtomicData_equivariant(func=instance, data_in=atomic_batch) - - -class TestCutoff: - def test_large_separation(self, model, config, molecules): - atol = {torch.float32: 1e-4, torch.float64: 1e-10}[torch.get_default_dtype()] - instance, _ = model - r_max = config["r_max"] - atoms1 = molecules[0].copy() - atoms2 = molecules[1].copy() - # translate atoms2 far away - atoms2.positions += 40.0 + np.random.randn(3) - atoms_both = atoms1.copy() - atoms_both.extend(atoms2) - tm = TypeMapper(chemical_symbols=["H", "C", "O"]) - data1 = tm(AtomicData.from_ase(atoms1, r_max=r_max)) - data2 = tm(AtomicData.from_ase(atoms2, r_max=r_max)) - data_both = tm(AtomicData.from_ase(atoms_both, r_max=r_max)) - assert ( - data_both[AtomicDataDict.EDGE_INDEX_KEY].shape[1] - == data1[AtomicDataDict.EDGE_INDEX_KEY].shape[1] - + data2[AtomicDataDict.EDGE_INDEX_KEY].shape[1] - ) - - out1 = instance(AtomicData.to_AtomicDataDict(data1)) - out2 = instance(AtomicData.to_AtomicDataDict(data2)) - out_both = instance(AtomicData.to_AtomicDataDict(data_both)) - - assert torch.allclose( - out1[AtomicDataDict.TOTAL_ENERGY_KEY] - + out2[AtomicDataDict.TOTAL_ENERGY_KEY], - out_both[AtomicDataDict.TOTAL_ENERGY_KEY], - atol=atol, - ) - - atoms_both2 = atoms1.copy() - atoms3 = atoms2.copy() - atoms3.positions += np.random.randn(3) - atoms_both2.extend(atoms3) - data_both2 = tm(AtomicData.from_ase(atoms_both2, r_max=r_max)) - out_both2 = instance(AtomicData.to_AtomicDataDict(data_both2)) - assert torch.allclose( - out_both2[AtomicDataDict.TOTAL_ENERGY_KEY], - out_both[AtomicDataDict.TOTAL_ENERGY_KEY], - atol=atol, - ) - assert torch.allclose( - out_both2[AtomicDataDict.PER_ATOM_ENERGY_KEY], - out_both[AtomicDataDict.PER_ATOM_ENERGY_KEY], - atol=atol, - ) - - def test_embedding_cutoff(self, config): - config = config.copy() - config["model_builders"] = ["EnergyModel"] - instance = model_from_config(config=config, initialize=True) - r_max = config["r_max"] - - # make a synthetic three atom example - data = AtomicData( - atom_types=np.random.choice([0, 1, 2], size=3), - pos=np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), - edge_index=np.array([[0, 1, 0, 2], [1, 0, 2, 0]]), - ) - edge_embed = instance(AtomicData.to_AtomicDataDict(data))[ - AtomicDataDict.EDGE_EMBEDDING_KEY - ] - data.pos[2, 1] = r_max # put it past the cutoff - edge_embed2 = instance(AtomicData.to_AtomicDataDict(data))[ - AtomicDataDict.EDGE_EMBEDDING_KEY - ] - - assert torch.allclose(edge_embed[:2], edge_embed2[:2]) - assert edge_embed[2:].abs().sum() > 1e-6 # some nonzero terms - assert torch.allclose(edge_embed2[2:], torch.zeros(1)) - - # test gradients - in_dict = AtomicData.to_AtomicDataDict(data) - in_dict[AtomicDataDict.POSITIONS_KEY].requires_grad_(True) - - with torch.autograd.set_detect_anomaly(True): - out = instance(in_dict) - - # is the edge embedding of the cutoff length edge unchanged at the cutoff? 
- grads = torch.autograd.grad( - outputs=out[AtomicDataDict.EDGE_EMBEDDING_KEY][2:].sum(), - inputs=in_dict[AtomicDataDict.POSITIONS_KEY], - retain_graph=True, - )[0] - assert torch.allclose(grads, torch.zeros(1)) - - # are the first two atom's energies unaffected by atom at the cutoff? - grads = torch.autograd.grad( - outputs=out[AtomicDataDict.PER_ATOM_ENERGY_KEY][:2].sum(), - inputs=in_dict[AtomicDataDict.POSITIONS_KEY], - )[0] - print(grads) - # only care about gradient wrt moved atom - assert grads.shape == (3, 3) - assert torch.allclose(grads[2], torch.zeros(1)) diff --git a/setup.py b/setup.py index 8c977e0a..d7a5b465 100644 --- a/setup.py +++ b/setup.py @@ -29,15 +29,14 @@ "numpy", "ase", "tqdm", - "torch>=1.8,<=1.12,!=1.9.0", # torch.fx added in 1.8 - "e3nn>=0.3.5,<0.6.0", + "torch>=1.10.0,<1.13,!=1.9.0", + "e3nn>=0.4.4,<0.6.0", "pyyaml", "contextlib2;python_version<'3.7'", # backport of nullcontext 'contextvars;python_version<"3.7"', # backport of contextvars for savenload "typing_extensions;python_version<'3.8'", # backport of Final "torch-runstats>=0.2.0", "torch-ema>=0.3.0", - "scikit_learn<=1.0.1", # for GaussianProcess for per-species statistics; 1.0.2 has a bug! ], zip_safe=True, ) diff --git a/tests/conftest.py b/tests/conftest.py index 060e5e7b..e9719bcd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,137 +1,5 @@ -from typing import List, Tuple -import numpy as np -import pathlib -import pytest -import tempfile -import os +from nequip.utils.unittests import CONFTEST_PATH -from ase.atoms import Atoms -from ase.build import molecule -from ase.calculators.singlepoint import SinglePointCalculator -from ase.io import write - -import torch - -from nequip.utils.test import set_irreps_debug -from nequip.data import AtomicData, ASEDataset -from nequip.data.transforms import TypeMapper -from nequip.utils.torch_geometric import Batch -from nequip.utils._global_options import _set_global_options -from nequip.utils.misc import dtype_from_name - -if "NEQUIP_NUM_TASKS" not in os.environ: - # Test parallelization, but don't waste time spawning tons of workers if lots of cores available - os.environ["NEQUIP_NUM_TASKS"] = "2" - -# The default float tolerance -FLOAT_TOLERANCE = { - t: torch.as_tensor(v, dtype=dtype_from_name(t)) - for t, v in {"float32": 1e-3, "float64": 1e-10}.items() -} - - -@pytest.fixture(scope="session", autouse=True, params=["float32", "float64"]) -def float_tolerance(request): - """Run all tests with various PyTorch default dtypes. - - This is a session-wide, autouse fixture — you only need to request it explicitly if a test needs to know the tolerance for the current default dtype. - - Returns - -------- - A precision threshold to use for closeness tests. - """ - old_dtype = torch.get_default_dtype() - dtype = request.param - _set_global_options({"default_dtype": dtype}) - yield FLOAT_TOLERANCE[dtype] - _set_global_options( - { - "default_dtype": {torch.float32: "float32", torch.float64: "float64"}[ - old_dtype - ] - } - ) - - -# - Ampere and TF32 - -# Many of the tests for NequIP involve numerically checking -# algebraic properties— normalization, equivariance, -# continuity, etc. -# With the added numerical noise of TF32, some of those tests fail -# with the current (and usually generous) thresholds. -# -# Thus we go on the assumption that PyTorch + NVIDIA got everything -# right, that this setting DOES NOT AFFECT the model outputs except -# for increased numerical noise, and only test without it. 
-# -# TODO: consider running tests with and without -# TODO: check how much thresholds have to be changed to accomidate TF32 -torch.backends.cuda.matmul.allow_tf32 = False -torch.backends.cudnn.allow_tf32 = False - - -@pytest.fixture(scope="session") -def BENCHMARK_ROOT(): - return pathlib.Path(__file__).parent / "../benchmark_data/" - - -@pytest.fixture(scope="session") -def temp_data(float_tolerance): - with tempfile.TemporaryDirectory() as tmpdirname: - yield tmpdirname - - -@pytest.fixture(scope="session") -def CH3CHO(CH3CHO_no_typemap) -> Tuple[Atoms, AtomicData]: - atoms, data = CH3CHO_no_typemap - tm = TypeMapper(chemical_symbol_to_type={"C": 0, "O": 1, "H": 2}) - data = tm(data) - return atoms, data - - -@pytest.fixture(scope="session") -def CH3CHO_no_typemap(float_tolerance) -> Tuple[Atoms, AtomicData]: - atoms = molecule("CH3CHO") - data = AtomicData.from_ase(atoms, r_max=2.0) - return atoms, data - - -@pytest.fixture(scope="session") -def molecules() -> List[Atoms]: - atoms_list = [] - for i in range(8): - atoms = molecule("CH3CHO" if i % 2 == 0 else "H2") - atoms.rattle() - atoms.calc = SinglePointCalculator( - energy=np.random.random(), - forces=np.random.random((len(atoms), 3)), - stress=None, - magmoms=None, - atoms=atoms, - ) - atoms_list.append(atoms) - return atoms_list - - -@pytest.fixture(scope="session") -def nequip_dataset(molecules, temp_data, float_tolerance): - with tempfile.NamedTemporaryFile(suffix=".xyz") as fp: - for atoms in molecules: - write(fp.name, atoms, format="extxyz", append=True) - a = ASEDataset( - file_name=fp.name, - root=temp_data, - extra_fixed_fields={"r_max": 3.0}, - ase_args=dict(format="extxyz"), - type_mapper=TypeMapper(chemical_symbol_to_type={"H": 0, "C": 1, "O": 2}), - ) - yield a - - -@pytest.fixture(scope="session") -def atomic_batch(nequip_dataset): - return Batch.from_data_list([nequip_dataset[0], nequip_dataset[1]]) - - -# Use debug mode -set_irreps_debug(True) +# like `source` in bash +with open(CONFTEST_PATH) as f: + exec(f.read()) diff --git a/tests/unit/data/test_dataset.py b/tests/unit/data/test_dataset.py index f45e0ca8..95cfe48d 100644 --- a/tests/unit/data/test_dataset.py +++ b/tests/unit/data/test_dataset.py @@ -31,7 +31,7 @@ def ase_file(molecules): MAX_ATOMIC_NUMBER: int = 5 -NATOMS = 3 +NATOMS = 10 @pytest.fixture(scope="function") @@ -231,8 +231,8 @@ def test_per_graph_field(self, npz_dataset, fixed_field, subset, key, dim): if npz_dataset is None: return - torch.manual_seed(0) - E = torch.rand((npz_dataset.len(),) + dim) + rng = torch.Generator().manual_seed(454) + E = torch.rand((npz_dataset.len(),) + dim, generator=rng) ref_mean = torch.mean(E / NATOMS, dim=0) ref_std = torch.std(E / NATOMS, dim=0) @@ -277,16 +277,11 @@ def test_per_node_field(self, npz_dataset, fixed_field, mode, subset): ) print(result) - @pytest.mark.parametrize("alpha", [1e-5, 1e-3, 0.1, 0.5]) + @pytest.mark.parametrize("alpha", [0, 1e-3, 0.01]) @pytest.mark.parametrize("fixed_field", [True, False]) @pytest.mark.parametrize("full_rank", [True, False]) @pytest.mark.parametrize("subset", [True, False]) - @pytest.mark.parametrize( - "regressor", ["NormalizedGaussianProcess", "GaussianProcess"] - ) - def test_per_graph_field( - self, npz_dataset, alpha, fixed_field, full_rank, regressor, subset - ): + def test_per_graph_field(self, npz_dataset, alpha, fixed_field, full_rank, subset): if alpha <= 1e-4 and not full_rank: return @@ -308,10 +303,7 @@ def test_per_graph_field( del n_spec del Ns - if alpha == 1e-5: - ref_mean, ref_std, E = generate_E(N, 
100, 1000, 0.0) - else: - ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.5) + ref_mean, ref_std, E = generate_E(N, 100, 1000, 10) if subset: E_orig_order = torch.zeros_like( @@ -333,7 +325,6 @@ def test_per_graph_field( AtomicDataDict.TOTAL_ENERGY_KEY + "per_species_mean_std": { "alpha": alpha, - "regressor": regressor, "stride": 1, } }, @@ -341,21 +332,18 @@ def test_per_graph_field( res = torch.matmul(N, mean.reshape([-1, 1])) - E.reshape([-1, 1]) res2 = torch.sum(torch.square(res)) - print("residue", alpha, res2 - ref_res2) + print("alpha, residue, actual residue", alpha, res2, ref_res2) print("mean", mean, ref_mean) print("diff in mean", mean - ref_mean) print("std", std, ref_std) + tolerance = torch.max(ref_std) * 4 if full_rank: - if alpha == 1e-5: - assert torch.allclose(mean, ref_mean, rtol=1e-1) - else: - assert torch.allclose(mean, ref_mean, rtol=1) - assert torch.allclose(std, torch.zeros_like(ref_mean), atol=alpha * 100) - elif regressor == "NormalizedGaussianProcess": - assert torch.std(mean).numpy() == 0 + assert torch.allclose(mean, ref_mean, atol=tolerance) + # assert torch.allclose(std, torch.zeros_like(ref_mean), atol=alpha * 100) else: - assert mean[0] == mean[1] * 2 + assert torch.allclose(mean, mean[0], atol=tolerance) + # assert torch.std(mean).numpy() == 0 class TestReload: @@ -449,12 +437,14 @@ def test_from_atoms(self, molecules): def generate_E(N, mean_min, mean_max, std): - torch.manual_seed(0) - ref_mean = torch.rand((N.shape[1])) * (mean_max - mean_min) + mean_min + rng = torch.Generator().manual_seed(568) + ref_mean = ( + torch.rand((N.shape[1]), generator=rng) * (mean_max - mean_min) + mean_min + ) t_mean = torch.ones((N.shape[0], 1)) * ref_mean.reshape([1, -1]) - ref_std = torch.rand((N.shape[1])) * std + ref_std = torch.rand((N.shape[1]), generator=rng) * std t_std = torch.ones((N.shape[0], 1)) * ref_std.reshape([1, -1]) - E = torch.normal(t_mean, t_std) + E = torch.normal(t_mean, t_std, generator=rng) return ref_mean, ref_std, (N * E).sum(axis=-1) diff --git a/tests/unit/model/test_nequip_model.py b/tests/unit/model/test_nequip_model.py new file mode 100644 index 00000000..2aa82e15 --- /dev/null +++ b/tests/unit/model/test_nequip_model.py @@ -0,0 +1,122 @@ +import pytest + +from e3nn import o3 + +from nequip.data import AtomicDataDict +from nequip.model import model_from_config +from nequip.nn import AtomwiseLinear +from nequip.utils.unittests.model_tests import BaseEnergyModelTests + +COMMON_CONFIG = { + "avg_num_neighbors": None, + "num_types": 3, + "types_names": ["H", "C", "O"], +} +r_max = 3 +minimal_config1 = dict( + irreps_edge_sh="0e + 1o", + r_max=4, + feature_irreps_hidden="4x0e + 4x1o", + num_layers=2, + num_basis=8, + PolynomialCutoff_p=6, + nonlinearity_type="norm", + **COMMON_CONFIG +) +minimal_config2 = dict( + irreps_edge_sh="0e + 1o", + r_max=4, + chemical_embedding_irreps_out="8x0e + 8x0o + 8x1e + 8x1o", + irreps_mid_output_block="2x0e", + feature_irreps_hidden="4x0e + 4x1o", + **COMMON_CONFIG +) +minimal_config3 = dict( + irreps_edge_sh="0e + 1o", + r_max=4, + feature_irreps_hidden="4x0e + 4x1o", + num_layers=2, + num_basis=8, + PolynomialCutoff_p=6, + nonlinearity_type="gate", + **COMMON_CONFIG +) +minimal_config4 = dict( + irreps_edge_sh="0e + 1o + 2e", + r_max=4, + feature_irreps_hidden="2x0e + 2x1o + 2x2e", + num_layers=2, + num_basis=3, + PolynomialCutoff_p=6, + nonlinearity_type="gate", + # test custom nonlinearities + nonlinearity_scalars={"e": "silu", "o": "tanh"}, + nonlinearity_gates={"e": "silu", "o": "abs"}, + 
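+    # of the four test configs, only this one exercises l=2 (2e) irreps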
**COMMON_CONFIG +) + + +class TestNequIPModel(BaseEnergyModelTests): + @pytest.fixture + def strict_locality(self): + return False + + @pytest.fixture( + params=[minimal_config1, minimal_config2, minimal_config3, minimal_config4], + scope="class", + ) + def base_config(self, request): + return request.param + + @pytest.fixture( + params=[ + ( + ["EnergyModel", "ForceOutput"], + [ + AtomicDataDict.TOTAL_ENERGY_KEY, + AtomicDataDict.PER_ATOM_ENERGY_KEY, + AtomicDataDict.FORCE_KEY, + ], + ), + ( + ["EnergyModel"], + [ + AtomicDataDict.TOTAL_ENERGY_KEY, + AtomicDataDict.PER_ATOM_ENERGY_KEY, + ], + ), + ( + ["EnergyModel", "StressForceOutput"], + [ + AtomicDataDict.TOTAL_ENERGY_KEY, + AtomicDataDict.PER_ATOM_ENERGY_KEY, + AtomicDataDict.FORCE_KEY, + AtomicDataDict.STRESS_KEY, + AtomicDataDict.VIRIAL_KEY, + ], + ), + ], + scope="class", + ) + def config(self, request, base_config): + config = base_config.copy() + builder, out_fields = request.param + config = config.copy() + config["model_builders"] = builder + return config, out_fields + + def test_submods(self): + config = minimal_config2.copy() + config["model_builders"] = ["EnergyModel"] + model = model_from_config(config=config, initialize=True) + assert isinstance(model.chemical_embedding, AtomwiseLinear) + true_irreps = o3.Irreps(minimal_config2["chemical_embedding_irreps_out"]) + assert ( + model.chemical_embedding.irreps_out[model.chemical_embedding.out_field] + == true_irreps + ) + # Make sure it propagates + assert ( + model.layer0_convnet.irreps_in[model.chemical_embedding.out_field] + == true_irreps + ) diff --git a/tests/unit/trainer/test_trainer.py b/tests/unit/trainer/test_trainer.py index c8169fda..860be357 100644 --- a/tests/unit/trainer/test_trainer.py +++ b/tests/unit/trainer/test_trainer.py @@ -106,26 +106,6 @@ def test_save(self, trainer, format, suffix): assert isfile(file_name), "fail to save to file" assert suffix in file_name - @pytest.mark.parametrize("append", [True]) # , False]) - def test_from_dict(self, trainer, append): - - # torch.save(trainer.model, trainer.best_model_path) - - dictionary = trainer.as_dict(state_dict=True, training_progress=True) - trainer1 = Trainer.from_dict(dictionary, append=append) - - for key in [ - "best_model_path", - "last_model_path", - "logfile", - "epoch_log", - "batch_log", - "workdir", - ]: - v1 = getattr(trainer, key, None) - v2 = getattr(trainer1, key, None) - assert append == (v1 == v2) - @pytest.mark.parametrize("append", [True]) # , False]) def test_from_file(self, trainer, append): diff --git a/tests/unit/utils/test_gp.py b/tests/unit/utils/test_gp.py deleted file mode 100644 index 4792b9d2..00000000 --- a/tests/unit/utils/test_gp.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -import pytest - -from nequip.utils.regressor import base_gp -from sklearn.gaussian_process.kernels import DotProduct - - -# @pytest.mark.parametrize("full_rank", [True, False]) -@pytest.mark.parametrize("full_rank", [False]) -@pytest.mark.parametrize("alpha", [0, 1e-3, 0.1, 1]) -def test_random(full_rank, alpha): - - if alpha == 0 and not full_rank: - return - - torch.manual_seed(0) - n_samples = 10 - n_dim = 3 - - if full_rank: - X = torch.randint(low=1, high=10, size=(n_samples, n_dim)) - else: - X = torch.randint(low=1, high=10, size=(n_samples, 1)) * torch.ones( - (n_samples, n_dim) - ) - - ref_mean = torch.rand((n_dim, 1)) - y = torch.matmul(X, ref_mean) - - mean, std = base_gp( - X, y, DotProduct, {"sigma_0": 0, "sigma_0_bounds": "fixed"}, alpha=0.1 - ) - - if full_rank: - assert 
torch.allclose(ref_mean, mean, rtol=0.5) - else: - assert torch.allclose(mean, mean[0], rtol=1e-3) diff --git a/tests/unit/utils/test_solver.py b/tests/unit/utils/test_solver.py new file mode 100644 index 00000000..de78cbd8 --- /dev/null +++ b/tests/unit/utils/test_solver.py @@ -0,0 +1,38 @@ +import torch +import pytest + +from nequip.utils.regressor import solver + + +@pytest.mark.parametrize("full_rank", [True, False]) +@pytest.mark.parametrize("alpha", [0, 1e-3, 1e-2]) +def test_random(full_rank, alpha, per_species_set): + + if alpha == 0 and not full_rank: + return + + rng = torch.Generator().manual_seed(343) + + ref_mean, ref_std, E, n_samples, n_dim = per_species_set + + X = torch.randint(low=1, high=10, size=(n_samples, n_dim), generator=rng).to( + torch.get_default_dtype() + ) + if not full_rank: + X[:, n_dim - 2] = X[:, n_dim - 1] * 2 + y = (X * E).sum(axis=-1) + + mean, std = solver(X, y, alpha=alpha) + + tolerance = torch.max(ref_std) + + print("tolerance", tolerance) + print("solution", mean, std) + print("diff", mean - ref_mean) + + if full_rank: + assert torch.allclose(ref_mean, mean, atol=tolerance) + else: + assert torch.allclose(mean[n_dim - 1], mean[n_dim - 2], atol=tolerance) + + assert torch.max(std) < tolerance
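+
+
+# For reference: `solver` (nequip/utils/regressor.py) performs the ridge
+# regression decomposition of per-species energies used for dataset statistics.
+# Given a composition matrix X of shape (n_samples, n_species) and total
+# energies y, it returns the per-species means and an uncertainty estimate.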