Support local paths for InputDataset.source #30
Changes from all commits: 4fe22bd, 7e24a5a, 1a47dbf, d0ba139, b032dcc
@@ -11,5 +11,5 @@ dependencies:
   - compilers
   - netcdf-fortran
   - mpich
-  - nco
-  - ncview
+  - xarray
+  - netCDF4
This file was deleted.
@@ -4,6 +4,7 @@
 import datetime as dt
 import dateutil.parser
 from typing import Optional
+from cstar_ocean.utils import _get_source_type
 from cstar_ocean.base_model import BaseModel
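The helper `_get_source_type` imported above is not shown in this diff; it is called later in `__init__` as `_get_source_type(source) == "path"`. A minimal sketch of what such a helper could look like, purely as an assumption consistent with that call (the scheme check below is not the actual cstar_ocean.utils implementation):

from urllib.parse import urlparse

def _get_source_type(source: str) -> str:
    # Hypothetical sketch: treat anything with an http/https/ftp scheme as a URL,
    # and everything else as a local filesystem path.
    return "url" if urlparse(source).scheme in ("http", "https", "ftp") else "path"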
@@ -16,13 +17,13 @@ class InputDataset:
     base_model: BaseModel
         The base model with which this input dataset is associated
     source: str
-        URL pointing to the netCDF file containing this input dataset
+        local path or URL pointing to the netCDF file containing this input dataset
     file_hash: str
         The 256 bit SHA sum associated with the file for verifying downloads
     exists_locally: bool, default None
-        True if the input dataset has been fetched to the local machine, set when `check_exists_locally()` method is called
+        True if the input dataset exists on the local machine, set by `check_exists_locally()` method if source is a URL
     local_path: str, default None
-        The path to where the input dataset has been fetched locally, set when `get()` method is called
+        The path where the input dataset exists locally, set when `get()` is called if source is a URL
 
     Methods:
     --------
@@ -48,17 +49,23 @@ def __init__(
         base_model: BaseModel
             The base model with which this input dataset is associated
         source: str
-            URL pointing to the netCDF file containing this input dataset
+            URL or path pointing to the netCDF file containing this input dataset
         file_hash: str
-            The 256 bit SHA sum associated with the file for verifying downloads
+            The 256 bit SHA sum associated with the file for verification
 
         """
 
         self.base_model: BaseModel = base_model
 
         self.source: str = source
         self.file_hash: str = file_hash
 
+        self.exists_locally: Optional[bool] = None
+        self.local_path: Optional[str] = None
+        if _get_source_type(source) == "path":
+            self.exists_locally = True
+            self.local_path = source
+
         self.start_date = start_date
         self.end_date = end_date
         if isinstance(start_date, str):
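A hedged usage sketch of the new constructor behaviour (object names, paths, and hashes below are placeholders, not from this PR): a local-path source sets `exists_locally` and `local_path` immediately, while a URL leaves them unset until `get()` or `check_exists_locally()` runs.

# Illustrative only; my_base_model is an existing BaseModel instance, and
# start_date/end_date are omitted here (pass them if the full signature requires them).
local_ds = InputDataset(
    base_model=my_base_model,
    source="/data/roms/roms_grd.nc",    # local path
    file_hash="abc123...",              # SHA-256 checksum of the file
)
print(local_ds.exists_locally, local_ds.local_path)    # True /data/roms/roms_grd.nc

remote_ds = InputDataset(
    base_model=my_base_model,
    source="https://example.com/roms_frc.nc",  # URL
    file_hash="def456...",
)
print(remote_ds.exists_locally, remote_ds.local_path)  # None None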
@@ -76,7 +83,7 @@ def __str__(self):
         base_str += "\n" + "-" * (len(name) + 7)
 
         base_str += f"\nBase model: {self.base_model.name}"
-        base_str += f"\nRemote path URL: {self.source}"
+        base_str += f"\nsource: {self.source}"
         if self.start_date is not None:
             base_str += f"\nstart_date: {self.start_date}"
         if self.end_date is not None:
@@ -91,35 +98,59 @@ def __str__(self):
     def __repr__(self):
         return self.__str__()
 
-    def get(self, local_path: str):
+    def get(self, local_dir: str):
         """
-        Fetch the file containing this input dataset and save it to `local_path` using Pooch.
+        Make the file containing this input dataset available in `local_dir/input_datasets`
 
-        This method updates the `local_path` attribute of the calling InputDataset object
+        If InputDataset.source is...
+           - ...a local path: create a symbolic link to the file in `local_dir/input_datasets`.
+           - ...a URL: fetch the file to `local_dir/input_datasets` using Pooch
+             (updating the `local_path` attribute of the calling InputDataset)
Comment on lines +105 to +108: Yeah this is another argument for abstracting out the concept of a "source" into a standalone class / type. This is a similar idea to using …
 
         Parameters:
         -----------
-        local_path: str
-            The local path where this input dataset will be saved.
+        local_dir: str
+            The local directory in which this input dataset will be saved.
 
         """
 
-        tgt_dir = local_path + "/input_datasets/" + self.base_model.name + "/"
+        tgt_dir = local_dir + "/input_datasets/" + self.base_model.name + "/"
         os.makedirs(tgt_dir, exist_ok=True)
+        tgt_path = tgt_dir + os.path.basename(self.source)
 
-        # NOTE: default timeout was leading to a lot of timeouterrors
-        downloader = pooch.HTTPDownloader(timeout=120)
-        to_fetch = pooch.create(
-            path=tgt_dir,
-            base_url=os.path.dirname(self.source),
-            registry={os.path.basename(self.source): self.file_hash},
-        )
-
-        to_fetch.fetch(os.path.basename(self.source), downloader=downloader)
-        self.exists_locally = True
-        self.local_path = tgt_dir + "/" + os.path.basename(self.source)
+        # If the file is somewhere else on the system, make a symbolic link where we want it
Comment: This could all go behind a …
+        if self.exists_locally:
+            assert (
+                self.local_path is not None
+            ), "local_path should always be set when exists_locally is True"
+            if os.path.abspath(self.local_path) != os.path.abspath(tgt_path):
+                if os.path.exists(tgt_path):
+                    raise FileExistsError(
+                        f"A file by the name of {os.path.basename(self.source)}"
+                        + f"already exists at {tgt_dir}."
+                    )
+                    # TODO maybe this should check the hash and just `return` if it matches?
+                else:
+                    os.symlink(self.local_path, tgt_path)
+                return
+            else:
+                # nothing to do as file is already at tgt_path
+                return
 
-    def check_exists_locally(self, local_path: str) -> bool:
+        else:
+            # Otherwise, download the file
+            # NOTE: default timeout was leading to a lot of timeouterrors
+            downloader = pooch.HTTPDownloader(timeout=120)
+            to_fetch = pooch.create(
+                path=tgt_dir,
+                base_url=os.path.dirname(self.source),
+                registry={os.path.basename(self.source): self.file_hash},
+            )
+
+            to_fetch.fetch(os.path.basename(self.source), downloader=downloader)
+            self.exists_locally = True
+            self.local_path = tgt_dir + "/" + os.path.basename(self.source)
 
+    def check_exists_locally(self, local_dir: str) -> bool:
         """
         Checks whether this InputDataset has already been fetched to the local machine
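For example (paths hypothetical, using the local_ds / remote_ds placeholders sketched earlier), the new `get()` either symlinks a local file or downloads a remote one:

# Hypothetical paths; behaviour as described in the docstring above.
local_ds.get("/home/user/my_case")
# -> /home/user/my_case/input_datasets/<base_model.name>/roms_grd.nc is created as a
#    symlink to /data/roms/roms_grd.nc (or a FileExistsError is raised if a different
#    file of that name is already there).

remote_ds.get("/home/user/my_case")
# -> the file is fetched with pooch (timeout=120) into the same directory, verified
#    against file_hash, and exists_locally / local_path are updated.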
@@ -128,38 +159,38 @@ def check_exists_locally(self, local_path: str) -> bool:
 
         Parameters:
         -----------
-        local_path (str):
-            The local path to check for the existence of this input dataset
+        local_dir (str):
+            The local directory in which to check for the existence of this input dataset
 
         Returns:
         --------
         exists_locally (bool):
             True if the method has verified the local existence of the dataset
         """
-        tgt_dir = local_path + "/input_datasets/" + self.base_model.name + "/"
-        fpath = tgt_dir + os.path.basename(self.source)
-        if os.path.exists(fpath):
-            sha256_hash = hashlib.sha256()
-            with open(fpath, "rb") as f:
-                for chunk in iter(lambda: f.read(4096), b""):
-                    sha256_hash.update(chunk)
-
-            hash_hex = sha256_hash.hexdigest()
-            if self.file_hash != hash_hex:
-                raise ValueError(
-                    f"{fpath} exists locally but the local file hash {hash_hex}"
-                    + "does not match that associated with this InputDataset object"
-                    + f"{self.file_hash}"
-                )
-
-            self.exists_locally = True
-
-        if self.exists_locally:
-            self.local_path = tgt_dir
-            return True
-        else:
-            self.exists_locally = False
-            return False
+        if self.exists_locally is None:
+            tgt_dir = local_dir + "/input_datasets/" + self.base_model.name + "/"
+            fpath = tgt_dir + os.path.basename(self.source)
+            if os.path.exists(fpath):
+                sha256_hash = hashlib.sha256()
+                with open(fpath, "rb") as f:
+                    for chunk in iter(lambda: f.read(4096), b""):
+                        sha256_hash.update(chunk)
+
+                hash_hex = sha256_hash.hexdigest()
+                if self.file_hash != hash_hex:
+                    raise ValueError(
+                        f"{fpath} exists locally but the local file hash {hash_hex}"
+                        + "does not match that associated with this InputDataset object"
+                        + f"{self.file_hash}"
+                    )
+                else:
+                    self.exists_locally = True
+                    self.local_path = tgt_dir
+            else:
+                self.exists_locally = False
 
+        return self.exists_locally
 
 
 class ModelGrid(InputDataset):
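A short sketch of how the check above might be used (directory name illustrative): because the result is cached in `self.exists_locally`, the filesystem and hash work only happens the first time.

# Hypothetical usage with the remote_ds placeholder from earlier.
if not remote_ds.check_exists_locally("/home/user/my_case"):
    remote_ds.get("/home/user/my_case")
# A ValueError is raised if a file with the expected name but the wrong SHA-256
# hash is found in local_dir/input_datasets/<base_model.name>/.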
Comment: Again I wonder if this could all be abstracted behind a class, e.g. …, then later use `self.source.exists_locally` and `self.source.path`.
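A minimal sketch of the kind of wrapper being suggested here (the class name, attributes, and scheme check are assumptions for illustration, not code from this PR):

from dataclasses import dataclass
from typing import Optional
from urllib.parse import urlparse

@dataclass
class DataSource:
    # Hypothetical wrapper: callers ask the source where it lives instead of
    # re-deriving that from a bare string in every method.
    location: str

    @property
    def source_type(self) -> str:
        return "url" if urlparse(self.location).scheme in ("http", "https", "ftp") else "path"

    @property
    def exists_locally(self) -> bool:
        return self.source_type == "path"

    @property
    def path(self) -> Optional[str]:
        return self.location if self.exists_locally else None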
Reply: Yes, I began implementing this for this PR but it got hairier than expected. I think it requires a lot of design considerations and should be returned to once the core feature set is complete. `AdditionalCode` and `BaseModel` also have a `source_repo` attribute (that I think is about to be renamed `source` to accommodate local files) that could be replaced with an instance of whatever this class will be. I think this could be part of the tidy-up that brings in `pathlib`. I'll raise an issue for now.