Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Qcarchive update #187

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
88c66cb
updated qcharcive code to fetch OpenFF Full Optimization Benchmark 1 …
chrisiacovella Sep 20, 2023
6f8198e
Updated torsiondrive parsing. I'm not sure this has sufficient testing.
chrisiacovella Sep 21, 2023
3e5a58f
Adding in some testing of the torsion function
chrisiacovella Sep 21, 2023
94add0c
Adding in some testing of the torsion function.
chrisiacovella Sep 21, 2023
908c35f
Updated collection type name.
chrisiacovella Sep 21, 2023
2afcd1d
fixed import issue in test
chrisiacovella Sep 21, 2023
aa41891
fixed parsing of the schema.
chrisiacovella Sep 21, 2023
dc7b10c
fixing a typo that was causing failure of torsion test.
chrisiacovella Sep 21, 2023
bc3e983
Merge branch 'main' into qcarchive_update
mikemhenry Sep 21, 2023
9cb3189
Slight change to code to add in a function that uses the iterate_reco…
chrisiacovella Sep 21, 2023
59dd1a8
Merge remote-tracking branch 'origin/qcarchive_update' into qcarchive…
chrisiacovella Sep 21, 2023
cb28a48
merged with updated dgl update; removing qcportal pinning to the old …
chrisiacovella Sep 21, 2023
f48ae7c
Made spec_name be a variable
chrisiacovella Sep 21, 2023
5f638fd
Adding in some docstrings
chrisiacovella Sep 21, 2023
86042f3
Addressed Mike's comment.s
chrisiacovella Sep 21, 2023
3861bd7
Added additional basic docstring.
chrisiacovella Sep 21, 2023
4821aad
Added additional basic docstring for torsion parsing
chrisiacovella Sep 21, 2023
a6a4cdf
Fixed bug in iterate function; added test to catch that bug .
chrisiacovella Sep 22, 2023
54ce464
Added support for singlepoint datasets
chrisiacovella Sep 22, 2023
a05bff7
fixing error in test.
chrisiacovella Sep 22, 2023
87e9d5a
Changing the dataset for singlepoint testing as we need to ensure the…
chrisiacovella Sep 22, 2023
384bcc1
Changing the dataset for singlepoint testing as we need to ensure the…
chrisiacovella Sep 22, 2023
cac1776
Move the schema conversion to after checking if a dataset is supporte…
chrisiacovella Sep 22, 2023
01c9de5
Removed support for singlepoint dataset, as openff.molecule cannot pa…
chrisiacovella Sep 22, 2023
3cf37d0
Removed support for singlepoint dataset, as openff.molecule cannot pa…
chrisiacovella Sep 23, 2023
3f32536
Merge branch 'main' into qcarchive_update
mikemhenry Oct 8, 2024
3a3700c
Merge branch 'main' into qcarchive_update
mikemhenry Oct 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions devtools/conda-envs/espaloma.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ dependencies:
- tqdm
- pydantic <2 # We need our deps to fix this
- dgl =1.1.2
- qcportal =0.15.8
- qcportal>=0.5
# Testing
- pytest
- pytest-cov
Expand All @@ -30,6 +30,5 @@ dependencies:
- nose
- nose-timer
- coverage
- qcportal>=0.15.0
- sphinx
- sphinx_rtd_theme
194 changes: 144 additions & 50 deletions espaloma/data/qcarchive_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Tuple

import numpy as np
import qcportal as ptl
import qcportal
import torch
from openmm import unit
from openmm.unit import Quantity
Expand All @@ -21,35 +21,69 @@
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================
def get_client():
return ptl.FractalClient()
def get_client(url: str = "api.qcarchive.molssi.org") -> qcportal.client.PortalClient:
"""
Returns a instance of the qcportal client.

Parameters
----------
url: str, default="api.qcarchive.molssi.org"
qcportal instance to connect

Returns
-------
qcportal.client.PortalClient
qcportal client instance.
"""
# Note, this may need to be modified to include username/password for non-public servers
return qcportal.PortalClient(url)


def get_collection(
client,
collection_type="OptimizationDataset",
name="OpenFF Full Optimization Benchmark 1",
client,
collection_type="optimization",
name="OpenFF Full Optimization Benchmark 1",
):
collection = client.get_collection(
collection_type,
name,
"""
Connects to a specific dataset on qcportal

Parameters
----------
client: qcportal.client, required
The qcportal client instance
collection_type: str, default="optimization"
The type of qcarchive collection, options are
"torsiondrive", "optimization", "gridoptimization", "reaction", "singlepoint" "manybody"
name: str, default="OpenFF Full Optimization Benchmark 1"
Name of the dataset

Returns
-------
(qcportal dataset, list(str))
Tuple with an instance of qcportal dataset and list of record names

"""
collection = client.get_dataset(
dataset_type=collection_type,
dataset_name=name,
)

record_names = list(collection.data.records)
record_names = collection.entry_names

return collection, record_names


def get_graph(collection, record_name):
# get record and trajectory
record = collection.get_record(record_name, specification="default")
entry = collection.get_entry(record_name)
def process_record(record, entry):
"""
Processes a given record/entry pair from a dataset and returns the graph
"""

from openff.toolkit.topology import Molecule

mol = Molecule.from_qcschema(entry)
mol = Molecule.from_qcschema(entry.dict())

try:
trajectory = record.get_trajectory()
trajectory = record.trajectory
except:
chrisiacovella marked this conversation as resolved.
Show resolved Hide resolved
return None

Expand All @@ -62,7 +96,7 @@ def get_graph(collection, record_name):
g.nodes["g"].data["u_ref"] = torch.tensor(
[
Quantity(
snapshot.properties.scf_total_energy,
snapshot.properties["scf_total_energy"],
esp.units.HARTREE_PER_PARTICLE,
).value_in_unit(esp.units.ENERGY_UNIT)
for snapshot in trajectory
Expand All @@ -74,7 +108,7 @@ def get_graph(collection, record_name):
np.stack(
[
Quantity(
snapshot.get_molecule().geometry,
snapshot.molecule.geometry,
unit.bohr,
).value_in_unit(esp.units.DISTANCE_UNIT)
for snapshot in trajectory
Expand All @@ -89,7 +123,7 @@ def get_graph(collection, record_name):
[
torch.tensor(
Quantity(
snapshot.dict()["return_result"],
np.array(snapshot.properties["return_result"]).reshape((-1, 3)),
esp.units.HARTREE_PER_PARTICLE / unit.bohr,
).value_in_unit(esp.units.FORCE_UNIT),
dtype=torch.get_default_dtype(),
Expand All @@ -102,21 +136,100 @@ def get_graph(collection, record_name):
return g


def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord):
final_molecules = record.get_final_molecules()
final_results = record.get_final_results()
def get_graph(collection, record_name, spec_name="default"):
"""
Processes the qcportal data for a given record name.

Parameters
----------
collection, qcportal dataset, required
The instance of the qcportal dataset
record_name, str, required
The name of a give record
spec_name, str, default="default"
Retrieve data for a given qcportal specification.
Returns
-------
Graph
"""
# get record and trajectory
record = collection.get_record(record_name, specification_name=spec_name)
entry = collection.get_entry(record_name)

g = process_record(record, entry)

return g


angle_keys = list(final_molecules.keys())
def get_graphs(collection, record_names, spec_name="default"):
"""
Processes the qcportal data for a given set of record names.
This uses the qcportal iteration functions which are faster than processing
records one at a time

Parameters
----------
collection, qcportal dataset, required
The instance of the qcportal dataset
record_name, str, required
The name of a give record
spec_name, str, default="default"
Retrieve data for a given qcportal specification.
Returns
-------
list(graph)
Returns a list of the corresponding graph for each record name
"""
g_list = []
for record, entry in zip(
collection.iterate_records(record_names, specification_names=[spec_name]),
collection.iterate_entries(record_names),
):
# note interate records returns a tuple of lenth 3 (name, spec_name, actual record)
g = process_record(record[2], entry)
chrisiacovella marked this conversation as resolved.
Show resolved Hide resolved
g_list.append(g)

return g_list


def fetch_td_record(record: qcportal.torsiondrive.record_models.TorsiondriveRecord):
"""
Fetches configuration, energy, and gradients for a given torsiondrive record as a function of different angles.

Parameters
----------
record: qcportal.torsiondrive.record_models.TorsiondriveRecord, required
Torsiondrive record of interest
Returns
-------
tuple, ( numpy.array, numpy.array, numpy.array,numpy.array)
Returned data is a tuple of numpy arrays.
The first index contains angles and subsequent arrays represent
molecule coordinate, energy and gradients associated with each angle.

"""
molecule_optimization = record.optimizations

angle_keys = list(molecule_optimization.keys())

xyzs = []
energies = []
gradients = []

for angle in angle_keys:
result = final_results[angle]
mol = final_molecules[angle]
# NOTE: this is calling the first index of the optimization array
# this gives the same value as the prior implementation.
# however it seems to be that this contains multiple different initial configurations
# that have been optimized. Should all conformers and energies/gradients be considered?
mol = molecule_optimization[angle][0].final_molecule
result = molecule_optimization[angle][0].trajectory[-1].properties

"""Note: force = - gradient"""

e, g = get_energy_and_gradient(result)
# TODO: attach units here? or later?

e = result["current energy"]
g = np.array(result["current gradient"]).reshape(-1, 3)

xyzs.append(mol.geometry)
energies.append(e)
Expand Down Expand Up @@ -146,22 +259,6 @@ def fetch_td_record(record: ptl.models.torsiondrive.TorsionDriveRecord):
return flat_angles, xyz_in_order, energies_in_order, gradients_in_order


def get_energy_and_gradient(
snapshot: ptl.models.records.ResultRecord,
) -> Tuple[float, np.ndarray]:
"""Note: force = - gradient"""

# TODO: attach units here? or later?

d = snapshot.dict()
qcvars = d["extras"]["qcvars"]
energy = qcvars["CURRENT ENERGY"]
flat_gradient = np.array(qcvars["CURRENT GRADIENT"])
num_atoms = len(flat_gradient) // 3
gradient = flat_gradient.reshape((num_atoms, 3))
return energy, gradient


MolWithTargets = namedtuple(
"MolWithTargets", ["offmol", "xyz", "energies", "gradients"]
)
Expand All @@ -184,9 +281,9 @@ def get_smiles(x):
g = esp.Graph(mol_ref)

u_ref = np.concatenate(group["energies"].values)
u_ref_prime = np.concatenate(
group["gradients"].values, axis=0
).transpose(1, 0, 2)
u_ref_prime = np.concatenate(group["gradients"].values, axis=0).transpose(
1, 0, 2
)
xyz = np.concatenate(group["xyz"].values, axis=0).transpose(1, 0, 2)

assert u_ref_prime.shape[0] == xyz.shape[0] == mol_ref.n_atoms
Expand Down Expand Up @@ -229,7 +326,7 @@ def breakdown_along_time_axis(g, batch_size=32):

shuffle(idxs)
chunks = [
idxs[_idx * batch_size : (_idx + 1) * batch_size]
idxs[_idx * batch_size: (_idx + 1) * batch_size]
for _idx in range(n_snapshots // batch_size)
]

Expand Down Expand Up @@ -259,10 +356,7 @@ def make_batch_size_consistent(ds, batch_size=32):
return esp.data.dataset.GraphDataset(
list(
itertools.chain.from_iterable(
[
breakdown_along_time_axis(g, batch_size=batch_size)
for g in ds
]
[breakdown_along_time_axis(g, batch_size=batch_size) for g in ds]
)
)
)
Expand Down
69 changes: 69 additions & 0 deletions espaloma/data/tests/test_qcarchive.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,72 @@ def test_get_graph():
collection, record_names = qcarchive_utils.get_collection(client)
record_name = record_names[0]
graph = qcarchive_utils.get_graph(collection, record_name)
assert graph is not None

graphs = qcarchive_utils.get_graphs(collection, record_names[0:2])
assert len(graphs) == 2
assert graphs[0] is not None


def test_get_torsiondrive():
chrisiacovella marked this conversation as resolved.
Show resolved Hide resolved
from espaloma.data import qcarchive_utils
import numpy as np

record_name = "[h]c1c(c(c(c([c:1]1[n:2]([c:3](=[o:4])c(=c([h])[h])[h])c([h])([h])[h])[h])[h])n(=o)=o)[h]"

# example dataset
name = "OpenFF Amide Torsion Set v1.0"
collection_type = "torsiondrive"

collection, record_names = qcarchive_utils.get_collection(
qcarchive_utils.get_client(), collection_type, name
)
record_info = collection.get_record(record_name, specification_name="default")

(
flat_angles,
xyz_in_order,
energies_in_order,
gradients_in_order,
) = qcarchive_utils.fetch_td_record(record_info)

assert flat_angles.shape == (24,)
assert energies_in_order.shape == (24,)
assert gradients_in_order.shape == (24, 25, 3)
assert xyz_in_order.shape == (24, 25, 3)

assert np.isclose(energies_in_order[0], -722.2850260791969)
assert np.all(
flat_angles
== np.array(
[
-165,
-150,
-135,
-120,
-105,
-90,
-75,
-60,
-45,
-30,
-15,
0,
15,
30,
45,
60,
75,
90,
105,
120,
135,
150,
165,
180,
]
)
)
assert np.all(
xyz_in_order[0][0] == np.array([-0.66407807, -8.59922225, -0.02685972])
)
Loading