drivendataorg · pjbull · Sep 1, 2024 · Sep 1, 2024 · Sep 16, 2024 · Sep 17, 2024
diff --git a/cloudpathlib/__init__.py b/cloudpathlib/__init__.py
@@ -4,9 +4,11 @@
 from .azure.azblobclient import AzureBlobClient
 from .azure.azblobpath import AzureBlobPath
 from .cloudpath import CloudPath, implementation_registry
-from .s3.s3client import S3Client
-from .gs.gspath import GSPath
 from .gs.gsclient import GSClient
+from .gs.gspath import GSPath
+from .http.httpclient import HttpClient
+from .http.httppath import HttpPath
+from .s3.s3client import S3Client
 from .s3.s3path import S3Path
 
 
@@ -27,6 +29,8 @@
     "implementation_registry",
     "GSClient",
     "GSPath",
+    "HttpClient",
+    "HttpPath",
     "S3Client",
     "S3Path",
 ]
diff --git a/cloudpathlib/cloudpath.py b/cloudpathlib/cloudpath.py
@@ -27,7 +27,6 @@
     Generator,
     List,
     Optional,
-    Sequence,
     Tuple,
     Type,
     TYPE_CHECKING,
@@ -286,11 +285,11 @@ def __setstate__(self, state: Dict[str, Any]) -> None:
 
     @property
     def _no_prefix(self) -> str:
-        return self._str[len(self.cloud_prefix) :]
+        return self._str[len(self.anchor) :]
 
     @property
     def _no_prefix_no_drive(self) -> str:
-        return self._str[len(self.cloud_prefix) + len(self.drive) :]
+        return self._str[len(self.anchor) + len(self.drive) :]
 
     @overload
     @classmethod
@@ -881,9 +880,9 @@ def relative_to(self, other: Self, walk_up: bool = False) -> PurePosixPath:
         # absolute)
         if not isinstance(other, CloudPath):
             raise ValueError(f"{self} is a cloud path, but {other} is not")
-        if self.cloud_prefix != other.cloud_prefix:
+        if self.anchor != other.anchor:
             raise ValueError(
-                f"{self} is a {self.cloud_prefix} path, but {other} is a {other.cloud_prefix} path"
+                f"{self} is a {self.anchor} path, but {other} is a {other.anchor} path"
             )
 
         kwargs = dict(walk_up=walk_up)
@@ -921,7 +920,7 @@ def parent(self) -> Self:
         return self._dispatch_to_path("parent")
 
     @property
-    def parents(self) -> Sequence[Self]:
+    def parents(self) -> Tuple[Self, ...]:
         return self._dispatch_to_path("parents")
 
     @property
@@ -1176,7 +1175,7 @@ def copytree(self, destination, force_overwrite_to_cloud=None, ignore=None):
                 )
             elif subpath.is_dir():
                 subpath.copytree(
-                    destination / subpath.name,
+                    destination / (subpath.name + ("" if subpath.name.endswith("/") else "/")),
                     force_overwrite_to_cloud=force_overwrite_to_cloud,
                     ignore=ignore,
                 )
@@ -1210,8 +1209,8 @@ def _new_cloudpath(self, path: Union[str, os.PathLike]) -> Self:
             path = path[1:]
 
         # add prefix/anchor if it is not already
-        if not path.startswith(self.cloud_prefix):
-            path = f"{self.cloud_prefix}{path}"
+        if not path.startswith(self.anchor):
+            path = f"{self.anchor}{path}"
 
         return self.client.CloudPath(path)
 

diff --git a/cloudpathlib/http/__init__.py b/cloudpathlib/http/__init__.py
@@ -0,0 +1,7 @@
+from .httpclient import HttpClient
+from .httppath import HttpPath
+
+# Names exported by `from cloudpathlib.http import *`
+__all__ = [
+    "HttpClient",
+    "HttpPath",
+]
diff --git a/cloudpathlib/http/httpclient.py b/cloudpathlib/http/httpclient.py
@@ -0,0 +1,173 @@
+from datetime import datetime
+import os
+import re
+import urllib.request
+import urllib.parse
+import urllib.error
+from pathlib import Path
+from typing import Iterable, Optional, Tuple, Union, Callable
+import shutil
+import mimetypes
+import urllib.response
+
+import pytz
+
+from cloudpathlib.client import Client, register_client_class
+from cloudpathlib.enums import FileCacheMode
+
+from .httppath import HttpPath
+
+
+@register_client_class("http")
+class HttpClient(Client):
+    """Client for HTTP paths, built on the standard library's `urllib`.
+
+    Every request goes through a single `urllib.request` opener created in the constructor.
+    Directory listings are derived by extracting links from the page that a URL returns.
+    """
+
+    def __init__(
+        self,
+        file_cache_mode: Optional[Union[str, FileCacheMode]] = None,
+        local_cache_dir: Optional[Union[str, os.PathLike]] = None,
+        content_type_method: Optional[Callable] = mimetypes.guess_type,
+        auth: Optional[urllib.request.BaseHandler] = None,
+        custom_list_page_parser: Optional[Callable[[str], Iterable[str]]] = None,
+        custom_dir_matcher: Optional[Callable[[str], bool]] = None,
+    ):
+        """Class constructor. Builds the `urllib.request` opener used for every request.
+
+        Args:
+            file_cache_mode (Optional[Union[str, FileCacheMode]]): Passed through to
+                `Client.__init__`.
+            local_cache_dir (Optional[Union[str, os.PathLike]]): Passed through to
+                `Client.__init__`.
+            content_type_method (Optional[Callable]): Passed through to `Client.__init__`.
+                `_upload_file` calls `self.content_type_method` with the local path and sends
+                the first element of the result as the `Content-Type` header. Defaults to
+                `mimetypes.guess_type`.
+            auth (Optional[urllib.request.BaseHandler]): Handler passed to
+                `urllib.request.build_opener`. If None, the opener is built without
+                extra handlers.
+            custom_list_page_parser (Optional[Callable[[str], Iterable[str]]]): Function that
+                receives the decoded body of a listing page and returns the links found in it.
+                If None, the double-quoted `href` values of `<a>` tags are used.
+            custom_dir_matcher (Optional[Callable[[str], bool]]): Function that receives a link
+                and returns True if it refers to a directory. If None, links ending in `/`
+                are treated as directories.
+        """
+        super().__init__(file_cache_mode, local_cache_dir, content_type_method)
+        self.auth = auth
+
+        # only install the auth handler when one was supplied
+        handlers = [] if auth is None else [auth]
+        self.opener = urllib.request.build_opener(*handlers)
+
+        self.custom_list_page_parser = custom_list_page_parser
+
+        # by default, a link is a directory if it ends with a slash
+        self.dir_matcher = (
+            custom_dir_matcher if custom_dir_matcher is not None else lambda x: x.endswith("/")
+        )
+
+    def _get_metadata(self, cloud_path: HttpPath) -> dict:
+        """Open the URL and build a metadata dictionary from the response headers.
+
+        Returns:
+            dict: `size` (the `Content-Length` header as an int, 0 if absent),
+                `last_modified` (UTC-aware datetime parsed from `Last-Modified`, or None if
+                absent), and `content_type` (the `Content-Type` header, or None if absent).
+        """
+        with self.opener.open(cloud_path.as_url()) as response:
+            last_modified = response.headers.get("Last-Modified", None)
+
+            if last_modified is not None:
+                # per https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified
+                last_modified = datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
+
+                # should always be utc https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified#gmt
+                last_modified = last_modified.replace(tzinfo=pytz.UTC)
+
+            return {
+                "size": int(response.headers.get("Content-Length", 0)),
+                "last_modified": last_modified,
+                "content_type": response.headers.get("Content-Type", None),
+            }
+
+    def _download_file(self, cloud_path: HttpPath, local_path: Union[str, os.PathLike]) -> Path:
+        """Stream the response body for `cloud_path` into `local_path` and return that path."""
+        local_path = Path(local_path)
+        with self.opener.open(cloud_path.as_url()) as response:
+            with open(local_path, "wb") as out_file:
+                shutil.copyfileobj(response, out_file)
+        return local_path
+
+    def _exists(self, cloud_path: HttpPath) -> bool:
+        """Check for the resource with a `HEAD` request.
+
+        Returns:
+            bool: True if the response status is 200. False for any other non-error status,
+                for a 404, or for a `URLError` that is not an `HTTPError`.
+
+        Raises:
+            urllib.error.HTTPError: For HTTP error responses other than 404.
+        """
+        request = urllib.request.Request(cloud_path.as_url(), method="HEAD")
+        try:
+            with self.opener.open(request) as response:
+                return response.status == 200
+        except urllib.error.HTTPError as e:
+            # HTTPError is a subclass of URLError, so it has to be handled first; otherwise
+            # every HTTP error would be reported as "does not exist"
+            if e.code == 404:
+                return False
+            raise
+        except urllib.error.URLError:
+            # no HTTP response at all
+            return False
+
+    def _move_file(self, src: HttpPath, dst: HttpPath, remove_src: bool = True) -> HttpPath:
+        """Upload `src` to `dst`, then delete `src` unless `remove_src` is False."""
+        # NOTE(review): `src` is an HttpPath but `_upload_file` expects a local path; this
+        # presumably relies on the path being convertible by `Path(...)` - confirm
+        self._upload_file(src, dst)
+        if remove_src:
+            self._remove(src)
+        return dst
+
+    def _remove(self, cloud_path: HttpPath, missing_ok: bool = True) -> None:
+        """Delete the resource with a `DELETE` request.
+
+        Raises:
+            Exception: If the server answers with a status other than 200, 202 or 204.
+            FileNotFoundError: If the server answers with an HTTP error, unless that error
+                is a 404 and `missing_ok` is True.
+        """
+        request = urllib.request.Request(cloud_path.as_url(), method="DELETE")
+        try:
+            with self.opener.open(request) as response:
+                # 200, 202 and 204 are all valid success responses to DELETE
+                if response.status not in (200, 202, 204):
+                    raise Exception(f"Failed to delete {cloud_path}.")
+        except urllib.error.HTTPError as e:
+            # a missing resource is only acceptable when missing_ok is set
+            if e.code == 404 and missing_ok:
+                return
+            raise FileNotFoundError(f"Failed to delete {cloud_path}.") from e
+
+    def _list_dir(self, cloud_path: HttpPath, recursive: bool) -> Iterable[Tuple[HttpPath, bool]]:
+        """Yield `(path, is_dir)` for each link found on the page at `cloud_path`.
+
+        With `recursive=True`, the contents of every entry flagged as a directory are
+        yielded right after the entry itself.
+
+        Raises:
+            NotImplementedError: If any exception is raised while fetching or parsing the
+                page; the original exception is attached as the cause.
+        """
+        try:
+            with self.opener.open(cloud_path.as_url()) as response:
+                # Parse the directory listing
+                for path, is_dir in self._parse_list_dir_response(
+                    response.read().decode(), base_url=str(cloud_path)
+                ):
+                    yield path, is_dir
+
+                    # If it's a directory and recursive is True, list the contents of the directory
+                    if recursive and is_dir:
+                        yield from self._list_dir(path, recursive=True)
+
+        except Exception as e:  # noqa E722
+            raise NotImplementedError(
+                f"Unable to parse response as a listing of files; please provide a custom parser as `custom_list_page_parser`. Error raised: {e}"
+            ) from e
+
+    def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: HttpPath) -> HttpPath:
+        """Send the contents of `local_path` to `cloud_path` with a `PUT` request.
+
+        The `Content-Type` header comes from `self.content_type_method` when one is set and
+        it returns a type; otherwise `application/octet-stream` is sent.
+
+        Raises:
+            Exception: If the server answers with a status other than 200, 201 or 204.
+        """
+        local_path = Path(local_path)
+
+        # must be bound even when no content_type_method is configured
+        content_type = None
+        if self.content_type_method is not None:
+            content_type, _ = self.content_type_method(local_path)
+
+        headers = {"Content-Type": content_type or "application/octet-stream"}
+
+        with open(local_path, "rb") as file_data:
+            request = urllib.request.Request(
+                cloud_path.as_url(), data=file_data.read(), method="PUT", headers=headers
+            )
+            with self.opener.open(request) as response:
+                # 200, 201 and 204 are all valid success responses to PUT
+                if response.status not in (200, 201, 204):
+                    raise Exception(f"Failed to upload {local_path} to {cloud_path}.")
+        return cloud_path
+
+    def _get_public_url(self, cloud_path: HttpPath) -> str:
+        """Return `cloud_path.as_url()`."""
+        return cloud_path.as_url()
+
+    def _generate_presigned_url(self, cloud_path: HttpPath, expire_seconds: int = 60 * 60) -> str:
+        """Always raises `NotImplementedError`; presigned URLs are not supported."""
+        raise NotImplementedError("Presigned URLs are not supported using urllib.")
+
+    def _parse_list_dir_response(
+        self, response: str, base_url: str
+    ) -> Iterable[Tuple[HttpPath, bool]]:
+        """Turn the text of a listing page into `(path, is_dir)` tuples.
+
+        Args:
+            response (str): Decoded body of the listing page.
+            base_url (str): URL the page was fetched from; each link is joined onto it.
+
+        Returns:
+            Iterable[Tuple[HttpPath, bool]]: One tuple per link returned by the parser; the
+                bool is the result of `self.dir_matcher` for the link as found in the page.
+        """
+        # Ensure base_url ends with a trailing slash so joining works
+        if not base_url.endswith("/"):
+            base_url += "/"
+
+        def _simple_links(html: str) -> Iterable[str]:
+            # default parser: double-quoted href values that directly follow `<a`
+            return re.findall(r'<a\s+href="([^"]+)"', html)
+
+        parser: Callable[[str], Iterable[str]] = (
+            self.custom_list_page_parser
+            if self.custom_list_page_parser is not None
+            else _simple_links
+        )
+
+        for match in parser(response):
+            yield self.CloudPath(urllib.parse.urljoin(base_url, match)), self.dir_matcher(match)
+
+    def request(self, url: HttpPath, method: str, **kwargs) -> None:
+        """Send a request with an arbitrary HTTP method to `url`.
+
+        Args:
+            url (HttpPath): Path whose `as_url()` value is requested.
+            method (str): HTTP method to use.
+            **kwargs: Passed on to `urllib.request.Request`.
+
+        Returns:
+            The response object produced by the opener.
+        """
+        # NOTE(review): the signature says `-> None` but the response is returned, and it is
+        # returned after the `with` block has exited, i.e. already closed - confirm intent
+        request = urllib.request.Request(url.as_url(), method=method, **kwargs)
+        with self.opener.open(request) as response:
+            return response
+
+
+# Alias `HttpClient.CloudPath` as `HttpClient.HttpPath`
+HttpClient.HttpPath = HttpClient.CloudPath  # type: ignore
+
+
+@register_client_class("https")
+class HttpsClient(HttpClient):
+    """Client registered under the `https` key; all behavior is inherited from `HttpClient`."""
+
+
+# Alias `HttpsClient.CloudPath` as `HttpsClient.HttpsPath`
+HttpsClient.HttpsPath = HttpsClient.CloudPath  # type: ignore