Make HNS check and client instantiation more reliable (#478)
* Fix HNS check and credentials passing

* add default credential test

* add azure-identity for tests

* Add config for environment credentials in azure

* Use root dir to check HNS fallback

* Update history + prep for release

* mypy pass

* Make history more explicit about issue fixed
pjbull authored Oct 18, 2024
1 parent 8207b3d commit 68774bf
Showing 8 changed files with 88 additions and 22 deletions.
6 changes: 6 additions & 0 deletions .env.example
@@ -30,3 +30,9 @@ LIVE_GS_BUCKET=a-bucket-you-can-access
# Custom S3, e.g. MinIO
CUSTOM_S3_BUCKET=a-bucket-you-can-access
CUSTOM_S3_ENDPOINT=your_custom_s3_endpoint

# From a registered Azure App as a service principal; currently used just for
# live tests with DefaultAzureCredential
AZURE_CLIENT_ID=
AZURE_TENANT_ID=
AZURE_CLIENT_SECRET=
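
A minimal sketch (not part of the diff; the account URL is a placeholder) of how these variables are consumed: DefaultAzureCredential reads AZURE_CLIENT_ID, AZURE_TENANT_ID, and AZURE_CLIENT_SECRET from the environment and authenticates as the service principal.

from azure.identity import DefaultAzureCredential
from cloudpathlib import AzureBlobClient

# DefaultAzureCredential picks up the three AZURE_* variables automatically
credential = DefaultAzureCredential()
client = AzureBlobClient(
    account_url="https://<account>.blob.core.windows.net",  # placeholder account URL
    credential=credential,
)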
3 changes: 3 additions & 0 deletions .github/workflows/tests.yml
@@ -103,6 +103,9 @@ jobs:
LIVE_AZURE_CONTAINER: ${{ secrets.LIVE_AZURE_CONTAINER }}
AZURE_STORAGE_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_CONNECTION_STRING }}
AZURE_STORAGE_GEN2_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_GEN2_CONNECTION_STRING }}
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
LIVE_GS_BUCKET: ${{ secrets.LIVE_GS_BUCKET }}
LIVE_S3_BUCKET: ${{ secrets.LIVE_S3_BUCKET }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
3 changes: 2 additions & 1 deletion HISTORY.md
@@ -1,8 +1,9 @@
# cloudpathlib Changelog

## UNRELEASED
## v0.20.0 (2024-10-18)

- Added support for custom schemes in CloudPath and Client subclasses. (Issue [#466](https://github.com/drivendataorg/cloudpathlib/issues/466), PR [#467](https://github.com/drivendataorg/cloudpathlib/pull/467))
- Fixed `ResourceNotFoundError` on Azure gen2 storage accounts with HNS enabled, and an issue where some Azure credentials do not have `account_name`. (Issue [#470](https://github.com/drivendataorg/cloudpathlib/issues/470), Issue [#476](https://github.com/drivendataorg/cloudpathlib/issues/476), PR [#478](https://github.com/drivendataorg/cloudpathlib/pull/478))

## v0.19.0 (2024-08-29)

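
For illustration, a minimal sketch (assumed account URL) of the credential scenario this entry describes: constructing an AzureBlobClient from a BlobServiceClient whose credential is a token credential, which has no account_name or account_key and so cannot be converted to an AzureNamedKeyCredential.

from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient
from cloudpathlib import AzureBlobClient

bsc = BlobServiceClient(
    account_url="https://<account>.blob.core.windows.net",  # placeholder account URL
    credential=DefaultAzureCredential(),
)
# the token credential is now passed through to the DataLakeServiceClient as-is
client = AzureBlobClient(blob_service_client=bsc)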
78 changes: 59 additions & 19 deletions cloudpathlib/azure/azblobclient.py
@@ -15,6 +15,7 @@
try:
from azure.core.exceptions import ResourceNotFoundError
from azure.core.credentials import AzureNamedKeyCredential

from azure.storage.blob import (
BlobPrefix,
BlobSasPermissions,
@@ -24,7 +25,15 @@
generate_blob_sas,
)

from azure.storage.blob._shared.authentication import (
SharedKeyCredentialPolicy as BlobSharedKeyCredentialPolicy,
)

from azure.storage.filedatalake import DataLakeServiceClient, FileProperties
from azure.storage.filedatalake._shared.authentication import (
SharedKeyCredentialPolicy as DataLakeSharedKeyCredentialPolicy,
)

except ModuleNotFoundError:
implementation_registry["azure"].dependencies_loaded = False

@@ -104,19 +113,29 @@ def __init__(
if connection_string is None:
connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING", None)

self.data_lake_client = None # only needs to end up being set if HNS is enabled
self.data_lake_client: Optional[DataLakeServiceClient] = (
None # only needs to end up being set if HNS is enabled
)

if blob_service_client is not None:
self.service_client = blob_service_client

# create from blob service client if not passed
if data_lake_client is None:
self.data_lake_client = DataLakeServiceClient(
account_url=self.service_client.url.replace(".blob.", ".dfs.", 1),
credential=AzureNamedKeyCredential(
credential = (
blob_service_client.credential
if not isinstance(
blob_service_client.credential, BlobSharedKeyCredentialPolicy
)
else AzureNamedKeyCredential(
blob_service_client.credential.account_name,
blob_service_client.credential.account_key,
),
)
)

self.data_lake_client = DataLakeServiceClient(
account_url=self.service_client.url.replace(".blob.", ".dfs.", 1),
credential=credential,
)
else:
self.data_lake_client = data_lake_client
@@ -125,12 +144,21 @@ def __init__(
self.data_lake_client = data_lake_client

if blob_service_client is None:
self.service_client = BlobServiceClient(
account_url=self.data_lake_client.url.replace(".dfs.", ".blob.", 1),
credential=AzureNamedKeyCredential(

credential = (
data_lake_client.credential
if not isinstance(
data_lake_client.credential, DataLakeSharedKeyCredentialPolicy
)
else AzureNamedKeyCredential(
data_lake_client.credential.account_name,
data_lake_client.credential.account_key,
),
)
)

self.service_client = BlobServiceClient(
account_url=self.data_lake_client.url.replace(".dfs.", ".blob.", 1),
credential=credential,
)

elif connection_string is not None:
@@ -167,19 +195,31 @@ def __init__(
"Credentials are required; see docs for options."
)

self._hns_enabled = None
self._hns_enabled: Optional[bool] = None

def _check_hns(self) -> Optional[bool]:
def _check_hns(self, cloud_path: AzureBlobPath) -> Optional[bool]:
if self._hns_enabled is None:
account_info = self.service_client.get_account_information() # type: ignore
self._hns_enabled = account_info.get("is_hns_enabled", False) # type: ignore
try:
account_info = self.service_client.get_account_information() # type: ignore
self._hns_enabled = account_info.get("is_hns_enabled", False) # type: ignore
except ResourceNotFoundError:
# get_account_information() is not supported with this credential; we have to fall back to
# checking if the root directory exists and has metadata {'hdi_isfolder': 'true'}
root_dir = self.service_client.get_blob_client(
container=cloud_path.container, blob="/"
)
self._hns_enabled = (
root_dir.exists()
and root_dir.get_blob_properties().metadata.get("hdi_isfolder", False)
== "true"
)

return self._hns_enabled

def _get_metadata(
self, cloud_path: AzureBlobPath
) -> Union["BlobProperties", "FileProperties", Dict[str, Any]]:
if self._check_hns():
if self._check_hns(cloud_path):

# works on both files and directories
fsc = self.data_lake_client.get_file_system_client(cloud_path.container) # type: ignore
@@ -292,7 +332,7 @@ def _list_dir(
if prefix and not prefix.endswith("/"):
prefix += "/"

if self._check_hns():
if self._check_hns(cloud_path):
file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container) # type: ignore
paths = file_system_client.get_paths(path=cloud_path.blob, recursive=recursive)

@@ -334,7 +374,7 @@ def _move_file(
)

# we can use rename API when the same account on adls gen2
elif remove_src and (src.client is dst.client) and self._check_hns():
elif remove_src and (src.client is dst.client) and self._check_hns(src):
fsc = self.data_lake_client.get_file_system_client(src.container) # type: ignore

if src.is_dir():
@@ -358,7 +398,7 @@ def _move_file(
def _mkdir(
self, cloud_path: AzureBlobPath, parents: bool = False, exist_ok: bool = False
) -> None:
if self._check_hns():
if self._check_hns(cloud_path):
file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container) # type: ignore
directory_client = file_system_client.get_directory_client(cloud_path.blob)

@@ -379,7 +419,7 @@ def _mkdir(
def _remove(self, cloud_path: AzureBlobPath, missing_ok: bool = True) -> None:
file_or_dir = self._is_file_or_dir(cloud_path)
if file_or_dir == "dir":
if self._check_hns():
if self._check_hns(cloud_path):
_hns_rmtree(self.data_lake_client, cloud_path.container, cloud_path.blob)
return

@@ -432,7 +472,7 @@ def _generate_presigned_url(
self, cloud_path: AzureBlobPath, expire_seconds: int = 60 * 60
) -> str:
sas_token = generate_blob_sas(
self.service_client.account_name,
self.service_client.account_name, # type: ignore[arg-type]
container_name=cloud_path.container,
blob_name=cloud_path.blob,
account_key=self.service_client.credential.account_key,
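
As a standalone sketch of the fallback above (assumed account URL and container name): on an HNS-enabled account the container root is exposed as a blob "/" whose metadata includes 'hdi_isfolder': 'true', which can be checked even when get_account_information() is not permitted for the credential.

from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient

bsc = BlobServiceClient(
    account_url="https://<account>.blob.core.windows.net",  # placeholder account URL
    credential=DefaultAzureCredential(),
)
root = bsc.get_blob_client(container="my-container", blob="/")  # hypothetical container
hns_enabled = (
    root.exists()
    and root.get_blob_properties().metadata.get("hdi_isfolder") == "true"
)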
2 changes: 1 addition & 1 deletion cloudpathlib/azure/azblobpath.py
@@ -91,7 +91,7 @@ def replace(self, target: "AzureBlobPath") -> "AzureBlobPath":

# we can rename directories on ADLS Gen2
except CloudPathIsADirectoryError:
if self.client._check_hns():
if self.client._check_hns(self):
return self.client._move_file(self, target)
else:
raise
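
A brief usage sketch (hypothetical container and paths) of what this change enables: on an HNS-enabled gen2 account, replace() on a directory now renames it via the Data Lake API instead of raising.

from cloudpathlib import CloudPath

src = CloudPath("az://my-container/old_dir")  # hypothetical directory
dst = CloudPath("az://my-container/new_dir")  # hypothetical target
src.replace(dst)  # renamed server-side when HNS is enabled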
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"

[project]
name = "cloudpathlib"
version = "0.19.0"
version = "0.20.0"
description = "pathlib-style classes for cloud storage services."
readme = "README.md"
authors = [{ name = "DrivenData", email = "info@drivendata.org" }]
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -1,5 +1,6 @@
-e .[all]

azure-identity
black[jupyter]>=24.1.0;python_version>='3.8'
build
flake8
15 changes: 15 additions & 0 deletions tests/test_azure_specific.py
@@ -1,6 +1,7 @@
import os

from azure.core.credentials import AzureNamedKeyCredential
from azure.identity import DefaultAzureCredential
from azure.storage.blob import (
BlobServiceClient,
StorageStreamDownloader,
@@ -134,6 +135,20 @@ def _check_access(az_client, gen2=False):
)
_check_access(cl, gen2=azure_rigs.is_adls_gen2)

# discover and use service principal credentials via the environment variables:
# AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID
credential = DefaultAzureCredential()
cl: AzureBlobClient = azure_rigs.client_class(credential=credential, account_url=bsc.url)
_check_access(cl, gen2=azure_rigs.is_adls_gen2)

# add basic checks for gen2 to exercise limited-privilege access scenarios
p = azure_rigs.create_cloud_path("new_dir/new_file.txt", client=cl)
assert cl._check_hns(p) == azure_rigs.is_adls_gen2
p.write_text("Hello")
assert p.exists()
assert p.read_text() == "Hello"
assert list(p.parent.iterdir()) == [p]


def test_adls_gen2_mkdir(azure_gen2_rig):
"""Since directories can be created on gen2, we should test mkdir, rmdir, rmtree, and unlink
