diff --git a/.env.example b/.env.example
index 7e6fbfaf..c081e8bc 100644
--- a/.env.example
+++ b/.env.example
@@ -30,3 +30,9 @@ LIVE_GS_BUCKET=a-bucket-you-can-access
 # Custom S3, e.g. MinIO
 CUSTOM_S3_BUCKET=a-bucket-you-can-access
 CUSTOM_S3_ENDPOINT=your_custom_s3_endpoint
+
+# From a registered Azure App as a service principal; currently used just for
+# live tests with DefaultCredentials
+AZURE_CLIENT_ID=
+AZURE_TENANT_ID=
+AZURE_CLIENT_SECRET=
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index edd56b28..b81ca408 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -103,6 +103,9 @@ jobs:
           LIVE_AZURE_CONTAINER: ${{ secrets.LIVE_AZURE_CONTAINER }}
           AZURE_STORAGE_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_CONNECTION_STRING }}
           AZURE_STORAGE_GEN2_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_GEN2_CONNECTION_STRING }}
+          AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
+          AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
+          AZURE_CLIENT_SECRET: ${{ secrets.AZURE_CLIENT_SECRET }}
           LIVE_GS_BUCKET: ${{ secrets.LIVE_GS_BUCKET }}
           LIVE_S3_BUCKET: ${{ secrets.LIVE_S3_BUCKET }}
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
diff --git a/HISTORY.md b/HISTORY.md
index 0deb48db..66b2baad 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,8 +1,9 @@
 # cloudpathlib Changelog
 
-## UNRELEASED
+## v0.20.0 (2024-10-18)
 
 - Added support for custom schemes in CloudPath and Client subclases. (Issue [#466](https://github.com/drivendataorg/cloudpathlib/issues/466), PR [#467](https://github.com/drivendataorg/cloudpathlib/pull/467))
+- Fixed `ResourceNotFoundError` on Azure gen2 storage accounts with HNS enabled and issue that some Azure credentials do not have `account_name`. (Issue [#470](https://github.com/drivendataorg/cloudpathlib/issues/470), Issue [#476](https://github.com/drivendataorg/cloudpathlib/issues/476), PR [#478](https://github.com/drivendataorg/cloudpathlib/pull/478))
 
 ## v0.19.0 (2024-08-29)
 
diff --git a/cloudpathlib/azure/azblobclient.py b/cloudpathlib/azure/azblobclient.py
index 289eb236..80a4a92b 100644
--- a/cloudpathlib/azure/azblobclient.py
+++ b/cloudpathlib/azure/azblobclient.py
@@ -15,6 +15,7 @@
 try:
     from azure.core.exceptions import ResourceNotFoundError
     from azure.core.credentials import AzureNamedKeyCredential
+
     from azure.storage.blob import (
         BlobPrefix,
         BlobSasPermissions,
@@ -24,7 +25,15 @@
         generate_blob_sas,
     )
 
+    from azure.storage.blob._shared.authentication import (
+        SharedKeyCredentialPolicy as BlobSharedKeyCredentialPolicy,
+    )
+
     from azure.storage.filedatalake import DataLakeServiceClient, FileProperties
+    from azure.storage.filedatalake._shared.authentication import (
+        SharedKeyCredentialPolicy as DataLakeSharedKeyCredentialPolicy,
+    )
+
 
 except ModuleNotFoundError:
     implementation_registry["azure"].dependencies_loaded = False
@@ -104,19 +113,29 @@ def __init__(
         if connection_string is None:
             connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING", None)
 
-        self.data_lake_client = None  # only needs to end up being set if HNS is enabled
+        self.data_lake_client: Optional[DataLakeServiceClient] = (
+            None  # only needs to end up being set if HNS is enabled
+        )
 
         if blob_service_client is not None:
             self.service_client = blob_service_client
 
             # create from blob service client if not passed
             if data_lake_client is None:
-                self.data_lake_client = DataLakeServiceClient(
-                    account_url=self.service_client.url.replace(".blob.", ".dfs.", 1),
-                    credential=AzureNamedKeyCredential(
+                credential = (
+                    blob_service_client.credential
+                    if not isinstance(
+                        blob_service_client.credential, BlobSharedKeyCredentialPolicy
+                    )
+                    else AzureNamedKeyCredential(
                         blob_service_client.credential.account_name,
                         blob_service_client.credential.account_key,
-                    ),
+                    )
+                )
+
+                self.data_lake_client = DataLakeServiceClient(
+                    account_url=self.service_client.url.replace(".blob.", ".dfs.", 1),
+                    credential=credential,
                 )
             else:
                 self.data_lake_client = data_lake_client
@@ -125,12 +144,21 @@ def __init__(
             self.data_lake_client = data_lake_client
 
             if blob_service_client is None:
-                self.service_client = BlobServiceClient(
-                    account_url=self.data_lake_client.url.replace(".dfs.", ".blob.", 1),
-                    credential=AzureNamedKeyCredential(
+
+                credential = (
+                    data_lake_client.credential
+                    if not isinstance(
+                        data_lake_client.credential, DataLakeSharedKeyCredentialPolicy
+                    )
+                    else AzureNamedKeyCredential(
                         data_lake_client.credential.account_name,
                         data_lake_client.credential.account_key,
-                    ),
+                    )
+                )
+
+                self.service_client = BlobServiceClient(
+                    account_url=self.data_lake_client.url.replace(".dfs.", ".blob.", 1),
+                    credential=credential,
                 )
 
         elif connection_string is not None:
@@ -167,19 +195,31 @@ def __init__(
                 "Credentials are required; see docs for options."
             )
 
-        self._hns_enabled = None
+        self._hns_enabled: Optional[bool] = None
 
-    def _check_hns(self) -> Optional[bool]:
+    def _check_hns(self, cloud_path: AzureBlobPath) -> Optional[bool]:
         if self._hns_enabled is None:
-            account_info = self.service_client.get_account_information()  # type: ignore
-            self._hns_enabled = account_info.get("is_hns_enabled", False)  # type: ignore
+            try:
+                account_info = self.service_client.get_account_information()  # type: ignore
+                self._hns_enabled = account_info.get("is_hns_enabled", False)  # type: ignore
+            except ResourceNotFoundError:
+                # get_account_information() not supported with this credential; we have to fallback to
+                # checking if the root directory exists and is a has 'metadata': {'hdi_isfolder': 'true'}
+                root_dir = self.service_client.get_blob_client(
+                    container=cloud_path.container, blob="/"
+                )
+                self._hns_enabled = (
+                    root_dir.exists()
+                    and root_dir.get_blob_properties().metadata.get("hdi_isfolder", False)
+                    == "true"
+                )
 
         return self._hns_enabled
 
     def _get_metadata(
         self, cloud_path: AzureBlobPath
     ) -> Union["BlobProperties", "FileProperties", Dict[str, Any]]:
-        if self._check_hns():
+        if self._check_hns(cloud_path):
             # works on both files and directories
             fsc = self.data_lake_client.get_file_system_client(cloud_path.container)  # type: ignore
 
@@ -292,7 +332,7 @@ def _list_dir(
         if prefix and not prefix.endswith("/"):
             prefix += "/"
 
-        if self._check_hns():
+        if self._check_hns(cloud_path):
             file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container)  # type: ignore
 
             paths = file_system_client.get_paths(path=cloud_path.blob, recursive=recursive)
@@ -334,7 +374,7 @@ def _move_file(
             )
 
         # we can use rename API when the same account on adls gen2
-        elif remove_src and (src.client is dst.client) and self._check_hns():
+        elif remove_src and (src.client is dst.client) and self._check_hns(src):
             fsc = self.data_lake_client.get_file_system_client(src.container)  # type: ignore
 
             if src.is_dir():
@@ -358,7 +398,7 @@ def _move_file(
     def _mkdir(
         self, cloud_path: AzureBlobPath, parents: bool = False, exist_ok: bool = False
    ) -> None:
-        if self._check_hns():
+        if self._check_hns(cloud_path):
             file_system_client = self.data_lake_client.get_file_system_client(cloud_path.container)  # type: ignore
 
             directory_client = file_system_client.get_directory_client(cloud_path.blob)
@@ -379,7 +419,7 @@ def _mkdir(
     def _remove(self, cloud_path: AzureBlobPath, missing_ok: bool = True) -> None:
         file_or_dir = self._is_file_or_dir(cloud_path)
         if file_or_dir == "dir":
-            if self._check_hns():
+            if self._check_hns(cloud_path):
                 _hns_rmtree(self.data_lake_client, cloud_path.container, cloud_path.blob)
                 return
 
@@ -432,7 +472,7 @@ def _generate_presigned_url(
         self, cloud_path: AzureBlobPath, expire_seconds: int = 60 * 60
     ) -> str:
         sas_token = generate_blob_sas(
-            self.service_client.account_name,
+            self.service_client.account_name,  # type: ignore[arg-type]
             container_name=cloud_path.container,
             blob_name=cloud_path.blob,
             account_key=self.service_client.credential.account_key,
diff --git a/cloudpathlib/azure/azblobpath.py b/cloudpathlib/azure/azblobpath.py
index 265cfd81..282ea18d 100644
--- a/cloudpathlib/azure/azblobpath.py
+++ b/cloudpathlib/azure/azblobpath.py
@@ -91,7 +91,7 @@ def replace(self, target: "AzureBlobPath") -> "AzureBlobPath":
 
         # we can rename directories on ADLS Gen2
         except CloudPathIsADirectoryError:
-            if self.client._check_hns():
+            if self.client._check_hns(self):
                 return self.client._move_file(self, target)
             else:
                 raise
diff --git a/pyproject.toml b/pyproject.toml
index 74604c77..ab8241e3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
 
 [project]
 name = "cloudpathlib"
-version = "0.19.0"
+version = "0.20.0"
 description = "pathlib-style classes for cloud storage services."
 readme = "README.md"
 authors = [{ name = "DrivenData", email = "info@drivendata.org" }]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 7c526692..ac0544f5 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,5 +1,6 @@
 -e .[all]
 
+azure-identity
 black[jupyter]>=24.1.0;python_version>='3.8'
 build
 flake8
diff --git a/tests/test_azure_specific.py b/tests/test_azure_specific.py
index 474525e2..78b53bf3 100644
--- a/tests/test_azure_specific.py
+++ b/tests/test_azure_specific.py
@@ -1,6 +1,7 @@
 import os
 
 from azure.core.credentials import AzureNamedKeyCredential
+from azure.identity import DefaultAzureCredential
 from azure.storage.blob import (
     BlobServiceClient,
     StorageStreamDownloader,
@@ -134,6 +135,20 @@ def _check_access(az_client, gen2=False):
     )
     _check_access(cl, gen2=azure_rigs.is_adls_gen2)
 
+    # discover and use credentials for service principal by having set:
+    # AZURE_CLIENT_ID, AZURE_CLIENT_SECRET, AZURE_TENANT_ID
+    credential = DefaultAzureCredential()
+    cl: AzureBlobClient = azure_rigs.client_class(credential=credential, account_url=bsc.url)
+    _check_access(cl, gen2=azure_rigs.is_adls_gen2)
+
+    # add basic checks for gen2 to exercise limited-privilege access scenarios
+    p = azure_rigs.create_cloud_path("new_dir/new_file.txt", client=cl)
+    assert cl._check_hns(p) == azure_rigs.is_adls_gen2
+    p.write_text("Hello")
+    assert p.exists()
+    assert p.read_text() == "Hello"
+    assert list(p.parent.iterdir()) == [p]
+
 
 def test_adls_gen2_mkdir(azure_gen2_rig):
     """Since directories can be created on gen2, we should test mkdir, rmdir, rmtree, and unlink