From fdf762ef44713dc21d0ea0ada86bdb05bf1f594b Mon Sep 17 00:00:00 2001 From: Daniel Lea Date: Fri, 21 Jul 2023 19:17:52 +0100 Subject: [PATCH 01/18] Generic getting from git remotes (#1) * clear out old logic full pulling and updating * `doMaterializeRepo` pulls repo * git pull on update * git effective checkout * tidy up `fetch` * tidy first if block in `guess_git_repo_params` * remove separate enum types for git remote providers * tidy up `find_git_repo_in_uri` * attempt to generalise git fetcher * tests * accommodate ssh * handle tags * test subdirectory syntax * test file based git urls * tidy up test * test None when no git repo indicated * docstring update * run black on test code * remove now unused functions * put back old git checkout logic * put back old fetch --- tests/fetchers/test_git.py | 125 +++++++++++++++ wfexs_backend/common.py | 4 +- wfexs_backend/fetchers/git.py | 280 ++++++---------------------------- 3 files changed, 174 insertions(+), 235 deletions(-) create mode 100644 tests/fetchers/test_git.py diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py new file mode 100644 index 00000000..8efd7927 --- /dev/null +++ b/tests/fetchers/test_git.py @@ -0,0 +1,125 @@ +import pytest +import logging +from wfexs_backend.common import RemoteRepo, RepoType +from wfexs_backend.fetchers.git import guess_git_repo_params + + +@pytest.mark.parametrize( + ["url", "expected"], + [ + ( + "https://github.com/inab/WfExS-backend.git", + RemoteRepo( + repo_url="https://github.com/inab/WfExS-backend.git", + repo_type=RepoType.Git, + ), + ), + ( + "git+https://github.com/inab/WfExS-backend.git", + RemoteRepo( + repo_url="https://github.com/inab/WfExS-backend.git", + repo_type=RepoType.Git, + ), + ), + ( + "https://github.com/inab/WfExS-backend.git@0.1.2", + RemoteRepo( + repo_url="https://github.com/inab/WfExS-backend.git", + repo_type=RepoType.Git, + tag="0.1.2", + ), + ), + ( + 
"https://github.com/inab/WfExS-backend.git#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + RemoteRepo( + repo_url="https://github.com/inab/WfExS-backend.git", + repo_type=RepoType.Git, + rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + ), + ), + ( + "ssh://git@github.com:inab/WfExS-backend.git", + RemoteRepo( + repo_url="git@github.com:inab/WfExS-backend.git", + repo_type=RepoType.Git, + ), + ), + ( + "git+ssh://git@github.com:inab/WfExS-backend.git", + RemoteRepo( + repo_url="git@github.com:inab/WfExS-backend.git", + repo_type=RepoType.Git, + ), + ), + ( + "ssh://git@github.com:inab/WfExS-backend.git@0.1.2", + RemoteRepo( + repo_url="git@github.com:inab/WfExS-backend.git", + repo_type=RepoType.Git, + tag="0.1.2", + ), + ), + ( + "ssh://git@github.com:inab/WfExS-backend.git#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + RemoteRepo( + repo_url="git@github.com:inab/WfExS-backend.git", + repo_type=RepoType.Git, + rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + ), + ), + ( + "file:///inab/WfExS-backend/.git", + RemoteRepo( + repo_url="file:///inab/WfExS-backend/.git", + repo_type=RepoType.Git, + ), + ), + ( + "git+file:///inab/WfExS-backend/.git", + RemoteRepo( + repo_url="file:///inab/WfExS-backend/.git", + repo_type=RepoType.Git, + ), + ), + ( + "file:///inab/WfExS-backend/.git@0.1.2", + RemoteRepo( + repo_url="file:///inab/WfExS-backend/.git", + repo_type=RepoType.Git, + tag="0.1.2", + ), + ), + ( + "file:///inab/WfExS-backend/.git#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + RemoteRepo( + repo_url="file:///inab/WfExS-backend/.git", + repo_type=RepoType.Git, + rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + ), + ), + ( + "github.com/inab/WfExS-backend.git", + None, + ), + ( + "git@github.com:inab/WfExS-backend.git", + None, + ), + ( + "ssh://git@github.com:inab/WfExS-backend", + None, + ), + ( + "https://github.com/inab/WfExS-backend", + None, + ), + ( + 
"file:///inab/WfExS-backend", + None, + ), + ], +) +def test_guess_git_repo_params(url, expected): + logger = logging.Logger("name") + output = guess_git_repo_params(url, logger=logger) + assert output == expected diff --git a/wfexs_backend/common.py b/wfexs_backend/common.py index 77f6631c..c1b04184 100644 --- a/wfexs_backend/common.py +++ b/wfexs_backend/common.py @@ -639,9 +639,7 @@ def _value_fixes(cls) -> "Mapping[str, Optional[str]]": class RepoType(enum.Enum): - GitHub = "github" - GitLab = "gitlab" - BitBucket = "bitbucket" + Git = "git" Raw = "raw" Other = "other" SoftwareHeritage = "swh" diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index dcd8573a..9b026b0f 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -66,6 +66,7 @@ from urllib import parse, request import dulwich.porcelain +import git from . import ( AbstractRepoFetcher, @@ -124,8 +125,9 @@ def doMaterializeRepo( ) -> "Tuple[AbsPath, RepoDesc, Sequence[URIWithMetadata]]": """ - :param repoURL: - :param repoTag: + :param repoURL: The URL to the repository. + :param repoTag: The tag or branch to checkout. + By default, checkout the repository's default branch. :param doUpdate: :return: """ @@ -358,6 +360,7 @@ def fetch( HEAD_LABEL = b"HEAD" REFS_HEADS_PREFIX = b"refs/heads/" REFS_TAGS_PREFIX = b"refs/tags/" +GIT_SCHEMES = ["https", "git+https", "ssh", "git+ssh", "file", "git+file"] def guess_git_repo_params( @@ -365,10 +368,21 @@ def guess_git_repo_params( logger: "logging.Logger", fail_ok: "bool" = False, ) -> "Optional[RemoteRepo]": + """Extract the parameters for a git repo from the given URL. If an invalid URL is passed, + this function returns `None`. + + The acceptable form for the URL can be found [here](https://pip.pypa.io/en/stable/topics/vcs-support/#git). + + :param wf_url: The URL to the repo. + :param logger: A `logging.Logger` instance for debugging purposes. + :param fail_ok: _description_, defaults to False. 
Deprecated, ignored. + :return: A `RemoteRepo` instance containing parameters of the git repo or `None` + if no repo was found. + """ repoURL = None repoTag = None repoRelPath = None - repoType: "Optional[RepoType]" = None + repoType: "Optional[RepoType]" = RepoType.Git # Deciding which is the input if isinstance(wf_url, parse.ParseResult): @@ -376,140 +390,40 @@ def guess_git_repo_params( else: parsed_wf_url = parse.urlparse(wf_url) - # These are the usual URIs which can be understood by pip - # See https://pip.pypa.io/en/stable/cli/pip_install/#git - found_params: "Optional[Tuple[RemoteRepo, Sequence[str], Sequence[RepoTag]]]" = None - try: - if parsed_wf_url.scheme in GitFetcher.GetSchemeHandlers(): - # Getting the scheme git is going to understand - if len(parsed_wf_url.scheme) >= len(GitFetcher.GIT_PROTO_PREFIX): - gitScheme = parsed_wf_url.scheme[len(GitFetcher.GIT_PROTO_PREFIX) :] - else: - gitScheme = parsed_wf_url.scheme + # Return None if no scheme in URL. Can't choose how to proceed + if not parsed_wf_url.scheme: + logger.debug( + f"No scheme in repo URL. Choices are: {', '.join(GIT_SCHEMES)}" + ) + return None + + # Return None if no scheme in URL. Can't choose how to proceed + if not ".git" in parsed_wf_url.path: + logger.debug( + f"URL does not seem to point to a git repo." 
+ ) + return None - # Getting the tag or branch - if "@" in parsed_wf_url.path: - gitPath, repoTag = parsed_wf_url.path.split("@", 1) - else: - gitPath = parsed_wf_url.path - - # Getting the repoRelPath (if available) - if len(parsed_wf_url.fragment) > 0: - frag_qs = parse.parse_qs(parsed_wf_url.fragment) - subDirArr = frag_qs.get("subdirectory", []) - if len(subDirArr) > 0: - repoRelPath = subDirArr[0] - - # Now, reassemble the repoURL - repoURL = parse.urlunparse( - (gitScheme, parsed_wf_url.netloc, gitPath, "", "", "") - ) - found_params = find_git_repo_in_uri(cast("URIType", repoURL)) - - elif parsed_wf_url.scheme == GITHUB_SCHEME: - repoType = RepoType.GitHub - - gh_path_split = parsed_wf_url.path.split("/") - gh_path = "/".join(gh_path_split[:2]) - gh_post_path = list(map(parse.unquote_plus, gh_path_split[2:])) - if len(gh_post_path) > 0: - repoTag = gh_post_path[0] - if len(gh_post_path) > 1: - repoRelPath = "/".join(gh_post_path[1:]) - - repoURL = parse.urlunparse( - parse.ParseResult( - scheme="https", - netloc=GITHUB_NETLOC, - path=gh_path, - params="", - query="", - fragment="", - ) - ) - found_params = find_git_repo_in_uri(cast("URIType", repoURL)) - - elif parsed_wf_url.netloc == GITHUB_NETLOC: - found_params = find_git_repo_in_uri(parsed_wf_url) - repoURL = found_params[0].repo_url - - # And now, guessing the tag and the relative path - # WARNING! 
This code can have problems with tags which contain slashes - wf_path = found_params[1] - repo_branches_tags = found_params[2] - if len(wf_path) > 1 and (wf_path[0] in ("blob", "tree")): - wf_path_tag = list(map(parse.unquote_plus, wf_path[1:])) - - tag_relpath = "/".join(wf_path_tag) - for repo_branch_tag in repo_branches_tags: - if repo_branch_tag == tag_relpath or tag_relpath.startswith( - repo_branch_tag + "/" - ): - repoTag = repo_branch_tag - if len(tag_relpath) > len(repo_branch_tag): - tag_relpath = tag_relpath[len(repo_branch_tag) + 1 :] - if len(tag_relpath) > 0: - repoRelPath = tag_relpath - break - else: - # Fallback - repoTag = wf_path_tag[0] - if len(wf_path_tag) > 0: - repoRelPath = "/".join(wf_path_tag[1:]) - elif parsed_wf_url.netloc == "raw.githubusercontent.com": - wf_path = list(map(parse.unquote_plus, parsed_wf_url.path.split("/"))) - if len(wf_path) >= 3: - # Rebuilding it - repoGitPath = wf_path[:3] - repoGitPath[-1] += ".git" - - # Rebuilding repo git path - repoURL = parse.urlunparse( - ("https", GITHUB_NETLOC, "/".join(repoGitPath), "", "", "") - ) + # Getting the scheme git is going to understand + git_scheme = parsed_wf_url.scheme.removeprefix("git+") - # And now, guessing the tag/checkout and the relative path - # WARNING! 
This code can have problems with tags which contain slashes - found_params = find_git_repo_in_uri(cast("URIType", repoURL)) - if len(wf_path) >= 4: - repo_branches_tags = found_params[2] - # Validate against existing branch and tag names - tag_relpath = "/".join(wf_path[3:]) - for repo_branch_tag in repo_branches_tags: - if repo_branch_tag == tag_relpath or tag_relpath.startswith( - repo_branch_tag + "/" - ): - repoTag = repo_branch_tag - if len(tag_relpath) > len(repo_branch_tag): - tag_relpath = tag_relpath[len(repo_branch_tag) + 1 :] - if len(tag_relpath) > 0: - repoRelPath = tag_relpath - break - else: - # Fallback - repoTag = wf_path[3] - if len(wf_path) > 4: - repoRelPath = "/".join(wf_path[4:]) - else: - repoType = RepoType.GitHub - # TODO handling other popular cases, like bitbucket - else: - found_params = find_git_repo_in_uri(parsed_wf_url) + # Getting the tag or branch + gitPath = parsed_wf_url.path + if "@" in parsed_wf_url.path: + gitPath, repoTag = parsed_wf_url.path.split("@", 1) - except RepoGuessException as gge: - if not fail_ok: - raise FetcherException( - f"FIXME: Unsupported http(s) git repository {wf_url} (see cascade exception)" - ) from gge - - if found_params is not None: - if repoTag is None: - repoTag = found_params[0].tag - repoType = found_params[0].repo_type - elif not fail_ok: - raise FetcherException( - "FIXME: Unsupported http(s) git repository {}".format(wf_url) - ) + # Getting the repoRelPath (if available) + if parsed_wf_url.fragment: + frag_qs = parse.parse_qs(parsed_wf_url.fragment) + subDirArr = frag_qs.get("subdirectory", []) + if subDirArr: + repoRelPath = subDirArr[0] + + # Now, reassemble the repoURL + if git_scheme == "ssh": + repoURL = parsed_wf_url.netloc + gitPath + else: + repoURL = parse.urlunparse((git_scheme, parsed_wf_url.netloc, gitPath, "", "", "")) logger.debug( "From {} was derived (type {}) {} {} {}".format( @@ -517,107 +431,9 @@ def guess_git_repo_params( ) ) - if repoURL is None: - return None - return 
RemoteRepo( repo_url=cast("RepoURL", repoURL), tag=cast("Optional[RepoTag]", repoTag), rel_path=cast("Optional[RelPath]", repoRelPath), repo_type=repoType, ) - - -def find_git_repo_in_uri( - remote_file: "Union[URIType, parse.ParseResult]", -) -> "Tuple[RemoteRepo, Sequence[str], Sequence[RepoTag]]": - if isinstance(remote_file, parse.ParseResult): - parsedInputURL = remote_file - else: - parsedInputURL = parse.urlparse(remote_file) - sp_path = parsedInputURL.path.split("/") - - shortest_pre_path: "Optional[URIType]" = None - longest_post_path: "Optional[Sequence[str]]" = None - repo_type: "Optional[RepoType]" = None - the_remote_uri: "Optional[str]" = None - b_default_repo_tag: "Optional[str]" = None - repo_branches: "Optional[MutableSequence[RepoTag]]" = None - for pos in range(len(sp_path), 0, -1): - pre_path = "/".join(sp_path[:pos]) - if pre_path == "": - pre_path = "/" - remote_uri_anc = parse.urlunparse(parsedInputURL._replace(path=pre_path)) - - remote_refs_dict: "Mapping[bytes, bytes]" - try: - remote_refs_dict = dulwich.porcelain.ls_remote(remote_uri_anc) - except dulwich.errors.NotGitRepository as ngr: - # Skip and continue - continue - - the_remote_uri = remote_uri_anc - - head_remote_ref = remote_refs_dict[HEAD_LABEL] - repo_branches = [] - b_default_repo_tag = None - for remote_label, remote_ref in remote_refs_dict.items(): - if remote_label.startswith(REFS_HEADS_PREFIX): - b_repo_tag = remote_label[len(REFS_HEADS_PREFIX) :].decode( - "utf-8", errors="continue" - ) - repo_branches.append(cast("RepoTag", b_repo_tag)) - if b_default_repo_tag is None and remote_ref == head_remote_ref: - b_default_repo_tag = b_repo_tag - - # It is considered a git repo! - shortest_pre_path = cast("URIType", pre_path) - longest_post_path = sp_path[pos:] - if repo_type is None: - # Metadata is all we really need - repo_type = RepoType.Raw - req = request.Request(remote_uri_anc, method="HEAD") - try: - with request.urlopen(req) as resp: - # Is it gitlab? 
- if list( - filter( - lambda c: "gitlab" in c, - resp.headers.get_all("Set-Cookie"), - ) - ): - repo_type = RepoType.GitLab - elif list( - filter( - lambda c: GITHUB_NETLOC in c, - resp.headers.get_all("Set-Cookie"), - ) - ): - repo_type = RepoType.GitHub - elif list( - filter( - lambda c: "bitbucket" in c, - resp.headers.get_all("X-View-Name"), - ) - ): - repo_type = RepoType.BitBucket - except Exception as e: - pass - - if repo_type is None: - raise RepoGuessException(f"Unable to identify {remote_file} as a git repo") - - if b_default_repo_tag is None: - raise RepoGuessException( - f"No tag was obtained while getting default branch name from {remote_file}" - ) - - assert longest_post_path is not None - assert repo_branches is not None - - repo = RemoteRepo( - repo_url=cast("RepoURL", the_remote_uri), - tag=cast("RepoTag", b_default_repo_tag), - repo_type=repo_type, - ) - return repo, longest_post_path, repo_branches From 45563a6985fe0c33d752de7094a061e6ddc723d4 Mon Sep 17 00:00:00 2001 From: Daniel Lea Date: Mon, 31 Jul 2023 17:44:32 +0100 Subject: [PATCH 02/18] Not calling changed code (#2) * fix behavour for urls; should not require ".git" * remove git import * assume default bracnh/tag is main --- tests/fetchers/test_git.py | 27 ++++++++++++++++++++++++--- wfexs_backend/fetchers/git.py | 10 +--------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py index 8efd7927..f55d58cf 100644 --- a/tests/fetchers/test_git.py +++ b/tests/fetchers/test_git.py @@ -12,6 +12,7 @@ RemoteRepo( repo_url="https://github.com/inab/WfExS-backend.git", repo_type=RepoType.Git, + tag="main", ), ), ( @@ -19,6 +20,7 @@ RemoteRepo( repo_url="https://github.com/inab/WfExS-backend.git", repo_type=RepoType.Git, + tag="main", ), ), ( @@ -35,6 +37,7 @@ repo_url="https://github.com/inab/WfExS-backend.git", repo_type=RepoType.Git, rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + tag="main", ), ), ( @@ -42,6 
+45,7 @@ RemoteRepo( repo_url="git@github.com:inab/WfExS-backend.git", repo_type=RepoType.Git, + tag="main", ), ), ( @@ -49,6 +53,7 @@ RemoteRepo( repo_url="git@github.com:inab/WfExS-backend.git", repo_type=RepoType.Git, + tag="main", ), ), ( @@ -65,6 +70,7 @@ repo_url="git@github.com:inab/WfExS-backend.git", repo_type=RepoType.Git, rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + tag="main", ), ), ( @@ -72,6 +78,7 @@ RemoteRepo( repo_url="file:///inab/WfExS-backend/.git", repo_type=RepoType.Git, + tag="main", ), ), ( @@ -79,6 +86,7 @@ RemoteRepo( repo_url="file:///inab/WfExS-backend/.git", repo_type=RepoType.Git, + tag="main", ), ), ( @@ -95,6 +103,7 @@ repo_url="file:///inab/WfExS-backend/.git", repo_type=RepoType.Git, rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + tag="main", ), ), ( @@ -107,15 +116,27 @@ ), ( "ssh://git@github.com:inab/WfExS-backend", - None, + RemoteRepo( + repo_url="git@github.com:inab/WfExS-backend", + repo_type=RepoType.Git, + tag="main", + ), ), ( "https://github.com/inab/WfExS-backend", - None, + RemoteRepo( + repo_url="https://github.com/inab/WfExS-backend", + repo_type=RepoType.Git, + tag="main", + ), ), ( "file:///inab/WfExS-backend", - None, + RemoteRepo( + repo_url="file:///inab/WfExS-backend", + repo_type=RepoType.Git, + tag="main", + ), ), ], ) diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index 9b026b0f..d9c32ab1 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -66,7 +66,6 @@ from urllib import parse, request import dulwich.porcelain -import git from . import ( AbstractRepoFetcher, @@ -380,7 +379,7 @@ def guess_git_repo_params( if no repo was found. """ repoURL = None - repoTag = None + repoTag = "main" repoRelPath = None repoType: "Optional[RepoType]" = RepoType.Git @@ -396,13 +395,6 @@ def guess_git_repo_params( f"No scheme in repo URL. Choices are: {', '.join(GIT_SCHEMES)}" ) return None - - # Return None if no scheme in URL. 
Can't choose how to proceed - if not ".git" in parsed_wf_url.path: - logger.debug( - f"URL does not seem to point to a git repo." - ) - return None # Getting the scheme git is going to understand git_scheme = parsed_wf_url.scheme.removeprefix("git+") From 4038c0a7d1532874797b4e6d7724725e05f5098b Mon Sep 17 00:00:00 2001 From: Daniel Lea Date: Tue, 1 Aug 2023 14:17:22 +0100 Subject: [PATCH 03/18] Dont require default to main (#3) * test putting back tag = None * push some debug info * try adding type hint * remove type hint, didn't help * type hint but no cast * no hint and no cast * put back cast * shitty hack * a little more debug info * fix if statement * missing check * fix checks again * or to and * fix tests * debug info * remove dumb hack * more debug info * more debug * check version type * try to find out why version is str None not NoneType * try new check * remove debug clutter * tidy up --- tests/fetchers/test_git.py | 12 ------------ wfexs_backend/fetchers/git.py | 2 +- wfexs_backend/workflow.py | 2 +- 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py index f55d58cf..07a937c6 100644 --- a/tests/fetchers/test_git.py +++ b/tests/fetchers/test_git.py @@ -12,7 +12,6 @@ RemoteRepo( repo_url="https://github.com/inab/WfExS-backend.git", repo_type=RepoType.Git, - tag="main", ), ), ( @@ -20,7 +19,6 @@ RemoteRepo( repo_url="https://github.com/inab/WfExS-backend.git", repo_type=RepoType.Git, - tag="main", ), ), ( @@ -37,7 +35,6 @@ repo_url="https://github.com/inab/WfExS-backend.git", repo_type=RepoType.Git, rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", - tag="main", ), ), ( @@ -45,7 +42,6 @@ RemoteRepo( repo_url="git@github.com:inab/WfExS-backend.git", repo_type=RepoType.Git, - tag="main", ), ), ( @@ -53,7 +49,6 @@ RemoteRepo( repo_url="git@github.com:inab/WfExS-backend.git", repo_type=RepoType.Git, - tag="main", ), ), ( @@ -70,7 +65,6 @@ 
repo_url="git@github.com:inab/WfExS-backend.git", repo_type=RepoType.Git, rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", - tag="main", ), ), ( @@ -78,7 +72,6 @@ RemoteRepo( repo_url="file:///inab/WfExS-backend/.git", repo_type=RepoType.Git, - tag="main", ), ), ( @@ -86,7 +79,6 @@ RemoteRepo( repo_url="file:///inab/WfExS-backend/.git", repo_type=RepoType.Git, - tag="main", ), ), ( @@ -103,7 +95,6 @@ repo_url="file:///inab/WfExS-backend/.git", repo_type=RepoType.Git, rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", - tag="main", ), ), ( @@ -119,7 +110,6 @@ RemoteRepo( repo_url="git@github.com:inab/WfExS-backend", repo_type=RepoType.Git, - tag="main", ), ), ( @@ -127,7 +117,6 @@ RemoteRepo( repo_url="https://github.com/inab/WfExS-backend", repo_type=RepoType.Git, - tag="main", ), ), ( @@ -135,7 +124,6 @@ RemoteRepo( repo_url="file:///inab/WfExS-backend", repo_type=RepoType.Git, - tag="main", ), ), ], diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index d9c32ab1..052d3f5a 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -379,7 +379,7 @@ def guess_git_repo_params( if no repo was found. 
""" repoURL = None - repoTag = "main" + repoTag = None repoRelPath = None repoType: "Optional[RepoType]" = RepoType.Git diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 83778348..01f9bdb1 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -415,7 +415,7 @@ def __init__( self.creds_config = creds_config self.id = str(workflow_id) - self.version_id = str(version_id) + self.version_id = None if version_id is None else str(version_id) self.descriptor_type = descriptor_type self.params = params self.placeholders = placeholders From 50faa28fddf1a282472ad50ac24364c27956548d Mon Sep 17 00:00:00 2001 From: Daniel Lea Date: Thu, 3 Aug 2023 15:30:23 +0100 Subject: [PATCH 04/18] Fix podman (#4) * debug info * test hack * try no container * remove no-container * comment out disable pull * uncomment disable-pull * put back lame hack --- wfexs_backend/cwl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/cwl_engine.py b/wfexs_backend/cwl_engine.py index 8eb1ec0b..82983443 100644 --- a/wfexs_backend/cwl_engine.py +++ b/wfexs_backend/cwl_engine.py @@ -1059,7 +1059,7 @@ def launchWorkflow( else: cmd_arr.extend( [ - "--disable-pull", + # "--disable-pull", "--podman", ] ) From b7e3aec58b9296570d403eba4c04b597c0db12f6 Mon Sep 17 00:00:00 2001 From: Daniel Lea Date: Tue, 22 Aug 2023 12:59:40 +0100 Subject: [PATCH 05/18] add podman key in ro_crate.py --- wfexs_backend/ro_crate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index afc74dae..165bfe1e 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -340,6 +340,7 @@ class WorkflowRunROCrate: ContainerTypeIds: "Final[Mapping[ContainerType, str]]" = { ContainerType.Singularity: "https://apptainer.org/", ContainerType.Docker: "https://www.docker.com/", + ContainerType.Podman: "https://podman.io/" } def __init__( From 88c393338d530af250e1dfe665f6b4c6e5db4746 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 24 Aug 2023 15:16:00 +0200 Subject: [PATCH 06/18] Added new pytest dependency --- dev-requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/dev-requirements.txt b/dev-requirements.txt index 460a0515..03765682 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,7 @@ wheel pylint < 2.14.0 ; python_version == '3.6' pylint >= 2.15.5 ; python_version >= '3.7' +pytest pyflakes >= 2.5.0 flake8 < 6.0.0 ; python_version < '3.8' flake8 >= 6.0.0 ; python_version >= '3.8' From af3bcb34b6017ac777674fae97b05ee45079ee9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 24 Aug 2023 15:20:44 +0200 Subject: [PATCH 07/18] Added Daniel Lea as contributor to CITATION.cff --- CITATION.cff | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CITATION.cff b/CITATION.cff index aac84cff..d47a27fb 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -11,6 +11,9 @@ authors: - family-names: Iborra given-names: Paula orcid: "https://orcid.org/0000-0003-0504-3029" + - family-names: Lea + given-names: Daniel + orcid: "https://orcid.org/0000-0001-8152-0398" cff-version: 1.2.0 date-released: "2023-08-01" identifiers: From beb0f48f32f465981ef2b6914658936f04efc050 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Thu, 24 Aug 2023 17:06:02 +0200 Subject: [PATCH 08/18] Cosmetic changes in tests so `pre-commit run -a` is happy. --- .pre-commit-config.yaml | 2 +- tests/fetchers/test_git.py | 65 ++++++++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0d355323..2ba7fb43 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,7 +10,7 @@ repos: types: - python exclude: "^[^/]*env/|dev-[^/]*/" - entry: pylint -j 4 + entry: pylint -j 4 --source-roots . 
- id: mypy name: Local MyPy language: system diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py index 07a937c6..6154eb72 100644 --- a/tests/fetchers/test_git.py +++ b/tests/fetchers/test_git.py @@ -1,5 +1,18 @@ import pytest import logging +from typing import ( + cast, + TYPE_CHECKING, +) + +if TYPE_CHECKING: + from wfexs_backend.common import ( + RelPath, + RepoTag, + RepoURL, + URIType, + ) + from wfexs_backend.common import RemoteRepo, RepoType from wfexs_backend.fetchers.git import guess_git_repo_params @@ -10,91 +23,97 @@ ( "https://github.com/inab/WfExS-backend.git", RemoteRepo( - repo_url="https://github.com/inab/WfExS-backend.git", + repo_url=cast("RepoURL", "https://github.com/inab/WfExS-backend.git"), repo_type=RepoType.Git, ), ), ( "git+https://github.com/inab/WfExS-backend.git", RemoteRepo( - repo_url="https://github.com/inab/WfExS-backend.git", + repo_url=cast("RepoURL", "https://github.com/inab/WfExS-backend.git"), repo_type=RepoType.Git, ), ), ( "https://github.com/inab/WfExS-backend.git@0.1.2", RemoteRepo( - repo_url="https://github.com/inab/WfExS-backend.git", + repo_url=cast("RepoURL", "https://github.com/inab/WfExS-backend.git"), repo_type=RepoType.Git, - tag="0.1.2", + tag=cast("RepoTag", "0.1.2"), ), ), ( "https://github.com/inab/WfExS-backend.git#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", RemoteRepo( - repo_url="https://github.com/inab/WfExS-backend.git", + repo_url=cast("RepoURL", "https://github.com/inab/WfExS-backend.git"), repo_type=RepoType.Git, - rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + rel_path=cast( + "RelPath", "workflow_examples/ipc/cosifer_test1_cwl.wfex.stage" + ), ), ), ( "ssh://git@github.com:inab/WfExS-backend.git", RemoteRepo( - repo_url="git@github.com:inab/WfExS-backend.git", + repo_url=cast("RepoURL", "git@github.com:inab/WfExS-backend.git"), repo_type=RepoType.Git, ), ), ( "git+ssh://git@github.com:inab/WfExS-backend.git", RemoteRepo( - 
repo_url="git@github.com:inab/WfExS-backend.git", + repo_url=cast("RepoURL", "git@github.com:inab/WfExS-backend.git"), repo_type=RepoType.Git, ), ), ( "ssh://git@github.com:inab/WfExS-backend.git@0.1.2", RemoteRepo( - repo_url="git@github.com:inab/WfExS-backend.git", + repo_url=cast("RepoURL", "git@github.com:inab/WfExS-backend.git"), repo_type=RepoType.Git, - tag="0.1.2", + tag=cast("RepoTag", "0.1.2"), ), ), ( "ssh://git@github.com:inab/WfExS-backend.git#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", RemoteRepo( - repo_url="git@github.com:inab/WfExS-backend.git", + repo_url=cast("RepoURL", "git@github.com:inab/WfExS-backend.git"), repo_type=RepoType.Git, - rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + rel_path=cast( + "RelPath", "workflow_examples/ipc/cosifer_test1_cwl.wfex.stage" + ), ), ), ( "file:///inab/WfExS-backend/.git", RemoteRepo( - repo_url="file:///inab/WfExS-backend/.git", + repo_url=cast("RepoURL", "file:///inab/WfExS-backend/.git"), repo_type=RepoType.Git, ), ), ( "git+file:///inab/WfExS-backend/.git", RemoteRepo( - repo_url="file:///inab/WfExS-backend/.git", + repo_url=cast("RepoURL", "file:///inab/WfExS-backend/.git"), repo_type=RepoType.Git, ), ), ( "file:///inab/WfExS-backend/.git@0.1.2", RemoteRepo( - repo_url="file:///inab/WfExS-backend/.git", + repo_url=cast("RepoURL", "file:///inab/WfExS-backend/.git"), repo_type=RepoType.Git, - tag="0.1.2", + tag=cast("RepoTag", "0.1.2"), ), ), ( "file:///inab/WfExS-backend/.git#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", RemoteRepo( - repo_url="file:///inab/WfExS-backend/.git", + repo_url=cast("RepoURL", "file:///inab/WfExS-backend/.git"), repo_type=RepoType.Git, - rel_path="workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", + rel_path=cast( + "RelPath", "workflow_examples/ipc/cosifer_test1_cwl.wfex.stage" + ), ), ), ( @@ -108,27 +127,27 @@ ( "ssh://git@github.com:inab/WfExS-backend", RemoteRepo( - repo_url="git@github.com:inab/WfExS-backend", + 
repo_url=cast("RepoURL", "git@github.com:inab/WfExS-backend"), repo_type=RepoType.Git, ), ), ( "https://github.com/inab/WfExS-backend", RemoteRepo( - repo_url="https://github.com/inab/WfExS-backend", + repo_url=cast("RepoURL", "https://github.com/inab/WfExS-backend"), repo_type=RepoType.Git, ), ), ( "file:///inab/WfExS-backend", RemoteRepo( - repo_url="file:///inab/WfExS-backend", + repo_url=cast("RepoURL", "file:///inab/WfExS-backend"), repo_type=RepoType.Git, ), ), ], ) -def test_guess_git_repo_params(url, expected): +def test_guess_git_repo_params(url: "str", expected: "RemoteRepo") -> "None": logger = logging.Logger("name") - output = guess_git_repo_params(url, logger=logger) + output = guess_git_repo_params(cast("URIType", url), logger=logger) assert output == expected From 31b55958b370ee2628e5ad25fa7493657876680b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 25 Aug 2023 00:17:49 +0200 Subject: [PATCH 09/18] Now podman support was fixed in a separate branch, `--disable-pull` flag can be re-enabled --- wfexs_backend/cwl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/cwl_engine.py b/wfexs_backend/cwl_engine.py index ce0e6b66..28f09032 100644 --- a/wfexs_backend/cwl_engine.py +++ b/wfexs_backend/cwl_engine.py @@ -1069,7 +1069,7 @@ def launchWorkflow( else: cmd_arr.extend( [ - # "--disable-pull", + "--disable-pull", "--podman", ] ) From fff7206b93f13fcc5b967aa8b02a6398d52a0643 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 25 Aug 2023 18:28:24 +0200 Subject: [PATCH 10/18] Added needed __init__.py files which make pytest usable from command line. 
Without those files, pytest had to be run as `python -m pytest` --- tests/__init__.py | 1 + tests/fetchers/__init__.py | 0 2 files changed, 1 insertion(+) create mode 100644 tests/__init__.py create mode 100644 tests/fetchers/__init__.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..b8a47f34 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for WfExS-backend""" diff --git a/tests/fetchers/__init__.py b/tests/fetchers/__init__.py new file mode 100644 index 00000000..e69de29b From b661b713f2affb7751c38438fe424ad7512f4131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 25 Aug 2023 18:29:43 +0200 Subject: [PATCH 11/18] Added minimal pytest configuration file --- .pytest.ini | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .pytest.ini diff --git a/.pytest.ini b/.pytest.ini new file mode 100644 index 00000000..5ee64771 --- /dev/null +++ b/.pytest.ini @@ -0,0 +1,2 @@ +[pytest] +testpaths = tests From 33e0459390de0bee211c6e58a98b2126bd6a5714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Fri, 25 Aug 2023 19:57:41 +0200 Subject: [PATCH 12/18] Added a couple of pytest plugins --- dev-requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev-requirements.txt b/dev-requirements.txt index 03765682..c4e4b1d0 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -2,6 +2,8 @@ wheel pylint < 2.14.0 ; python_version == '3.6' pylint >= 2.15.5 ; python_version >= '3.7' pytest +pytest-cov +pytest-xdist pyflakes >= 2.5.0 flake8 < 6.0.0 ; python_version < '3.8' flake8 >= 6.0.0 ; python_version >= '3.8' From ab4d4f3da0388f685870dcb30ab4422ca637eac8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 28 Aug 2023 21:50:25 +0200 Subject: [PATCH 13/18] cwltool now has the needed fixes for Podman in upstream --- wfexs_backend/cwl_engine.py | 5 +++-- 1 file changed, 3 insertions(+), 2 
deletions(-) diff --git a/wfexs_backend/cwl_engine.py b/wfexs_backend/cwl_engine.py index 28f09032..e0fb26f4 100644 --- a/wfexs_backend/cwl_engine.py +++ b/wfexs_backend/cwl_engine.py @@ -123,7 +123,8 @@ class CWLWorkflowEngine(WorkflowEngine): CWL_REPO = "https://github.com/common-workflow-language/" CWLTOOL_REPO = CWL_REPO + CWLTOOL_PYTHON_PACKAGE - DEVEL_CWLTOOL_REPO = "https://github.com/jmfernandez/" + CWLTOOL_PYTHON_PACKAGE + # DEVEL_CWLTOOL_REPO = "https://github.com/jmfernandez/" + CWLTOOL_PYTHON_PACKAGE + DEVEL_CWLTOOL_REPO = CWLTOOL_REPO CWL_UTILS_REPO = CWL_REPO + CWL_UTILS_PYTHON_PACKAGE DEFAULT_CWLTOOL_VERSION = cast("EngineVersion", "3.1.20230719185429") @@ -132,7 +133,7 @@ class CWLWorkflowEngine(WorkflowEngine): DEVEL_CWLTOOL_PACKAGE = f"git+{DEVEL_CWLTOOL_REPO}.git" # Set this constant to something meaningful only when a hotfix # between releases is needed - DEVEL_CWLTOOL_VERSION = "191841853da642f2446b7298bc814db81e7149a7" + DEVEL_CWLTOOL_VERSION = "509ffb9d6802c837ec2a818b799ef4c332c34d04" # DEVEL_CWLTOOL_VERSION = None # DEFAULT_CWL_UTILS_VERSION = 'v0.10' From 6e6957b466dd686b662f7e23073de071cf8e0961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 28 Aug 2023 21:59:18 +0200 Subject: [PATCH 14/18] Cosmetic comment fix --- wfexs_backend/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/workflow.py b/wfexs_backend/workflow.py index 37c0841a..de230613 100644 --- a/wfexs_backend/workflow.py +++ b/wfexs_backend/workflow.py @@ -1141,7 +1141,7 @@ def fetchWorkflow( Fetch the whole workflow description based on the data obtained from the TRS where it is being published. - If the workflow id is an URL, it is supposed to be a git repository, + If the workflow id is an URL, it is supposed to be a repository (git, swh, ...), and the version will represent either the branch, tag or specific commit. So, the whole TRS fetching machinery is bypassed. 
""" From 47233a2b5d631a6d6d3ff55dc8acdab976e9c5e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Mon, 28 Aug 2023 22:20:21 +0200 Subject: [PATCH 15/18] Now it is possible to refer workflows using trs URI. This is additional to the original decomposed form. --- wfexs_backend/wfexs_backend.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/wfexs_backend/wfexs_backend.py b/wfexs_backend/wfexs_backend.py index 6f80ca9a..07609e65 100644 --- a/wfexs_backend/wfexs_backend.py +++ b/wfexs_backend/wfexs_backend.py @@ -130,6 +130,7 @@ ) from .fetchers.trs_files import ( + TRS_SCHEME_PREFIX, INTERNAL_TRS_SCHEME_PREFIX, ) @@ -1452,7 +1453,8 @@ def cacheWorkflow( and the version will represent either the branch, tag or specific commit. So, the whole TRS fetching machinery is bypassed. """ - parsedRepoURL = urllib.parse.urlparse(str(workflow_id)) + putative_repo_url = str(workflow_id) + parsedRepoURL = urllib.parse.urlparse(putative_repo_url) # It is not an absolute URL, so it is being an identifier in the workflow i_workflow: "Optional[IdentifiedWorkflow]" = None @@ -1461,7 +1463,30 @@ def cacheWorkflow( repoDir: "Optional[AbsPath]" = None putative: "bool" = False cached_putative_path: "Optional[AbsPath]" = None - if parsedRepoURL.scheme == "": + if parsedRepoURL.scheme in ("", TRS_SCHEME_PREFIX): + # Extracting the TRS endpoint details from the parsedRepoURL + if parsedRepoURL.scheme == TRS_SCHEME_PREFIX: + # Duplication of code borrowed from trs_files.py + path_steps: "Sequence[str]" = parsedRepoURL.path.split("/") + if len(path_steps) < 3 or path_steps[0] != "": + raise WfExSBackendException( + f"Ill-formed TRS CURIE {putative_repo_url}. 
It should be in the format of {TRS_SCHEME_PREFIX}://id/version or {TRS_SCHEME_PREFIX}://prefix-with-slashes/id/version" + ) + trs_steps = cast("MutableSequence[str]", path_steps[0:-2]) + trs_steps.extend(["ga4gh", "trs", "v2", "tools"]) + trs_endpoint = urllib.parse.urlunparse( + urllib.parse.ParseResult( + scheme="https", + netloc=parsedRepoURL.netloc, + path="/".join(trs_steps), + params="", + query="", + fragment="", + ) + ) + + workflow_id = urllib.parse.unquote(path_steps[-2]) + version_id = urllib.parse.unquote(path_steps[-1]) if (trs_endpoint is not None) and len(trs_endpoint) > 0: i_workflow, repoDir = self.getWorkflowRepoFromTRS( trs_endpoint, From 3617932782df1c7a42b16be97e9f578f95316c21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 29 Aug 2023 21:56:41 +0200 Subject: [PATCH 16/18] Cosmetic "black" fix --- wfexs_backend/ro_crate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wfexs_backend/ro_crate.py b/wfexs_backend/ro_crate.py index 6ab55341..abcbbdc0 100644 --- a/wfexs_backend/ro_crate.py +++ b/wfexs_backend/ro_crate.py @@ -412,7 +412,7 @@ class WorkflowRunROCrate: ContainerTypeIds: "Final[Mapping[ContainerType, str]]" = { ContainerType.Singularity: "https://apptainer.org/", ContainerType.Docker: "https://www.docker.com/", - ContainerType.Podman: "https://podman.io/" + ContainerType.Podman: "https://podman.io/", } def __init__( From 69135925faa5c239068790b0495995cffcfcc418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 29 Aug 2023 21:57:46 +0200 Subject: [PATCH 17/18] Added the concept of RepoGuessFlavor, which can be used to track down whether the repo "URL" was a legit one or a guessed one from common, known patterns. 
--- wfexs_backend/common.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/wfexs_backend/common.py b/wfexs_backend/common.py index 7e4f0c7b..aa0ccb52 100644 --- a/wfexs_backend/common.py +++ b/wfexs_backend/common.py @@ -667,6 +667,12 @@ class RepoType(enum.Enum): TRS = "trs" +class RepoGuessFlavor(enum.Enum): + GitHub = "github" + GitLab = "gitlab" + BitBucket = "bitbucket" + + class RemoteRepo(NamedTuple): """ Remote repository description @@ -677,6 +683,7 @@ class RemoteRepo(NamedTuple): rel_path: "Optional[RelPath]" = None repo_type: "Optional[RepoType]" = None web_url: "Optional[URIType]" = None + guess_flavor: "Optional[RepoGuessFlavor]" = None class IdentifiedWorkflow(NamedTuple): From 3d9a030a9b8e038f8a1543e8df2ebe7545267468 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20M=C2=AA=20Fern=C3=A1ndez?= Date: Tue, 29 Aug 2023 21:59:28 +0200 Subject: [PATCH 18/18] Restored previous "guessing" behaviours. Method `find_git_repo_in_uri` has been rescued, and its integration with `guess_git_repo_params` has been improved, so previous scenarios work again. 
--- tests/fetchers/test_git.py | 32 ++- wfexs_backend/fetchers/git.py | 355 ++++++++++++++++++++++++++++++---- 2 files changed, 342 insertions(+), 45 deletions(-) diff --git a/tests/fetchers/test_git.py b/tests/fetchers/test_git.py index 6154eb72..5e383ecc 100644 --- a/tests/fetchers/test_git.py +++ b/tests/fetchers/test_git.py @@ -13,7 +13,11 @@ URIType, ) -from wfexs_backend.common import RemoteRepo, RepoType +from wfexs_backend.common import ( + RepoGuessFlavor, + RemoteRepo, + RepoType, +) from wfexs_backend.fetchers.git import guess_git_repo_params @@ -24,6 +28,8 @@ "https://github.com/inab/WfExS-backend.git", RemoteRepo( repo_url=cast("RepoURL", "https://github.com/inab/WfExS-backend.git"), + tag=cast("RepoTag", "main"), + guess_flavor=RepoGuessFlavor.GitHub, repo_type=RepoType.Git, ), ), @@ -31,6 +37,7 @@ "git+https://github.com/inab/WfExS-backend.git", RemoteRepo( repo_url=cast("RepoURL", "https://github.com/inab/WfExS-backend.git"), + tag=cast("RepoTag", "main"), repo_type=RepoType.Git, ), ), @@ -47,6 +54,7 @@ RemoteRepo( repo_url=cast("RepoURL", "https://github.com/inab/WfExS-backend.git"), repo_type=RepoType.Git, + tag=cast("RepoTag", "main"), rel_path=cast( "RelPath", "workflow_examples/ipc/cosifer_test1_cwl.wfex.stage" ), @@ -55,21 +63,23 @@ ( "ssh://git@github.com:inab/WfExS-backend.git", RemoteRepo( - repo_url=cast("RepoURL", "git@github.com:inab/WfExS-backend.git"), + repo_url=cast("RepoURL", "ssh://git@github.com/inab/WfExS-backend.git"), + tag=cast("RepoTag", "main"), repo_type=RepoType.Git, ), ), ( "git+ssh://git@github.com:inab/WfExS-backend.git", RemoteRepo( - repo_url=cast("RepoURL", "git@github.com:inab/WfExS-backend.git"), + repo_url=cast("RepoURL", "ssh://git@github.com/inab/WfExS-backend.git"), + tag=cast("RepoTag", "main"), repo_type=RepoType.Git, ), ), ( "ssh://git@github.com:inab/WfExS-backend.git@0.1.2", RemoteRepo( - repo_url=cast("RepoURL", "git@github.com:inab/WfExS-backend.git"), + repo_url=cast("RepoURL", 
"ssh://git@github.com/inab/WfExS-backend.git"), repo_type=RepoType.Git, tag=cast("RepoTag", "0.1.2"), ), @@ -77,8 +87,9 @@ ( "ssh://git@github.com:inab/WfExS-backend.git#subdirectory=workflow_examples/ipc/cosifer_test1_cwl.wfex.stage", RemoteRepo( - repo_url=cast("RepoURL", "git@github.com:inab/WfExS-backend.git"), + repo_url=cast("RepoURL", "ssh://git@github.com/inab/WfExS-backend.git"), repo_type=RepoType.Git, + tag=cast("RepoTag", "main"), rel_path=cast( "RelPath", "workflow_examples/ipc/cosifer_test1_cwl.wfex.stage" ), @@ -122,12 +133,17 @@ ), ( "git@github.com:inab/WfExS-backend.git", - None, + RemoteRepo( + repo_url=cast("RepoURL", "ssh://git@github.com/inab/WfExS-backend.git"), + tag=cast("RepoTag", "main"), + repo_type=RepoType.Git, + ), ), ( "ssh://git@github.com:inab/WfExS-backend", RemoteRepo( - repo_url=cast("RepoURL", "git@github.com:inab/WfExS-backend"), + repo_url=cast("RepoURL", "ssh://git@github.com/inab/WfExS-backend"), + tag=cast("RepoTag", "main"), repo_type=RepoType.Git, ), ), @@ -135,7 +151,9 @@ "https://github.com/inab/WfExS-backend", RemoteRepo( repo_url=cast("RepoURL", "https://github.com/inab/WfExS-backend"), + guess_flavor=RepoGuessFlavor.GitHub, repo_type=RepoType.Git, + tag=cast("RepoTag", "main"), ), ), ( diff --git a/wfexs_backend/fetchers/git.py b/wfexs_backend/fetchers/git.py index aac1a6fa..34d0a186 100644 --- a/wfexs_backend/fetchers/git.py +++ b/wfexs_backend/fetchers/git.py @@ -78,6 +78,7 @@ ContentKind, ProtocolFetcherReturn, RemoteRepo, + RepoGuessFlavor, RepoType, URIWithMetadata, ) @@ -107,8 +108,10 @@ def GetSchemeHandlers(cls) -> "Mapping[str, Type[AbstractStatefulFetcher]]": # These are de-facto schemes supported by pip and git client return { cls.GIT_PROTO: cls, + cls.GIT_PROTO_PREFIX + "file": cls, cls.GIT_PROTO_PREFIX + "https": cls, cls.GIT_PROTO_PREFIX + "http": cls, + cls.GIT_PROTO_PREFIX + "ssh": cls, cls.GITHUB_SCHEME: cls, } @@ -389,7 +392,7 @@ def fetch( HEAD_LABEL = b"HEAD" REFS_HEADS_PREFIX = b"refs/heads/" 
REFS_TAGS_PREFIX = b"refs/tags/" -GIT_SCHEMES = ["https", "git+https", "ssh", "git+ssh", "file", "git+file"] +GIT_SCHEMES = ["https", "git", "ssh", "file"] def guess_git_repo_params( @@ -397,21 +400,11 @@ def guess_git_repo_params( logger: "logging.Logger", fail_ok: "bool" = False, ) -> "Optional[RemoteRepo]": - """Extract the parameters for a git repo from the given URL. If an invalid URL is passed, - this function returns `None`. - - The acceptable form for the URL can be found [here](https://pip.pypa.io/en/stable/topics/vcs-support/#git). - - :param wf_url: The URL to the repo. - :param logger: A `logging.Logger` instance for debugging purposes. - :param fail_ok: _description_, defaults to False. Deprecated, ignored. - :return: A `RemoteRepo` instance containing parameters of the git repo or `None` - if no repo was found. - """ repoURL = None repoTag = None repoRelPath = None - repoType: "Optional[RepoType]" = RepoType.Git + repoType: "Optional[RepoType]" = None + guessedRepoFlavor: "Optional[RepoGuessFlavor]" = None web_url: "Optional[URIType]" = None # Deciding which is the input @@ -420,44 +413,330 @@ def guess_git_repo_params( else: parsed_wf_url = parse.urlparse(wf_url) - # Return None if no scheme in URL. Can't choose how to proceed - if not parsed_wf_url.scheme: - logger.debug( - f"No scheme in repo URL. 
Choices are: {', '.join(GIT_SCHEMES)}" - ) - return None + # These are the usual URIs which can be understood by pip + # See https://pip.pypa.io/en/stable/cli/pip_install/#git + found_params: "Optional[Tuple[RemoteRepo, Sequence[str], Sequence[RepoTag]]]" = None + try: + if parsed_wf_url.scheme == GitFetcher.GITHUB_SCHEME: + repoType = RepoType.Git + guessedRepoFlavor = RepoGuessFlavor.GitHub + + gh_path_split = parsed_wf_url.path.split("/") + gh_path = "/".join(gh_path_split[:2]) + gh_post_path = list(map(parse.unquote_plus, gh_path_split[2:])) + if len(gh_post_path) > 0: + repoTag = gh_post_path[0] + if len(gh_post_path) > 1: + repoRelPath = "/".join(gh_post_path[1:]) + + repoURL = parse.urlunparse( + parse.ParseResult( + scheme="https", + netloc=GITHUB_NETLOC, + path=gh_path, + params="", + query="", + fragment="", + ) + ) + found_params = find_git_repo_in_uri(cast("URIType", repoURL)) + + elif ( + parsed_wf_url.scheme in ("http", "https") + and parsed_wf_url.netloc == GITHUB_NETLOC + and "@" not in parsed_wf_url.path + and parsed_wf_url.fragment == "" + ): + found_params = find_git_repo_in_uri(parsed_wf_url) + repoURL = found_params[0].repo_url + repoType = RepoType.Git + guessedRepoFlavor = RepoGuessFlavor.GitHub + + # And now, guessing the tag and the relative path + # WARNING! 
This code can have problems with tags which contain slashes + wf_path = found_params[1] + repo_branches_tags = found_params[2] + if len(wf_path) > 1 and (wf_path[0] in ("blob", "tree")): + wf_path_tag = list(map(parse.unquote_plus, wf_path[1:])) + + tag_relpath = "/".join(wf_path_tag) + for repo_branch_tag in repo_branches_tags: + if repo_branch_tag == tag_relpath or tag_relpath.startswith( + repo_branch_tag + "/" + ): + repoTag = repo_branch_tag + if len(tag_relpath) > len(repo_branch_tag): + tag_relpath = tag_relpath[len(repo_branch_tag) + 1 :] + if len(tag_relpath) > 0: + repoRelPath = tag_relpath + break + else: + # Fallback + repoTag = wf_path_tag[0] + if len(wf_path_tag) > 0: + repoRelPath = "/".join(wf_path_tag[1:]) + elif ( + parsed_wf_url.scheme in ("http", "https") + and parsed_wf_url.netloc == "raw.githubusercontent.com" + ): + repoType = RepoType.Git + guessedRepoFlavor = RepoGuessFlavor.GitHub + wf_path = list(map(parse.unquote_plus, parsed_wf_url.path.split("/"))) + if len(wf_path) >= 3: + # Rebuilding it + repoGitPath = wf_path[:3] + repoGitPath[-1] += ".git" + + # Rebuilding repo git path + repoURL = parse.urlunparse( + ("https", GITHUB_NETLOC, "/".join(repoGitPath), "", "", "") + ) - # Getting the scheme git is going to understand - git_scheme = parsed_wf_url.scheme.removeprefix("git+") + # And now, guessing the tag/checkout and the relative path + # WARNING! 
This code can have problems with tags which contain slashes + found_params = find_git_repo_in_uri(cast("URIType", repoURL)) + if len(wf_path) >= 4: + repo_branches_tags = found_params[2] + # Validate against existing branch and tag names + tag_relpath = "/".join(wf_path[3:]) + for repo_branch_tag in repo_branches_tags: + if repo_branch_tag == tag_relpath or tag_relpath.startswith( + repo_branch_tag + "/" + ): + repoTag = repo_branch_tag + if len(tag_relpath) > len(repo_branch_tag): + tag_relpath = tag_relpath[len(repo_branch_tag) + 1 :] + if len(tag_relpath) > 0: + repoRelPath = tag_relpath + break + else: + # Fallback + repoTag = wf_path[3] + if len(wf_path) > 4: + repoRelPath = "/".join(wf_path[4:]) + elif ( + parsed_wf_url.scheme == "" + or (parsed_wf_url.scheme in GitFetcher.GetSchemeHandlers()) + or (parsed_wf_url.scheme in GIT_SCHEMES) + ): + if parsed_wf_url.scheme == "": + # It could be a checkout uri in the form of 'git@github.com:inab/WfExS-backend.git' + if ( + parsed_wf_url.netloc == "" + and ("@" in parsed_wf_url.path) + and (":" in parsed_wf_url.path) + ): + gitScheme = "ssh" + parsed_wf_url = parse.urlparse( + f"{gitScheme}://" + + parse.urlunparse(parsed_wf_url).replace(":", "/") + ) + else: + logger.debug( + f"No scheme in repo URL. Choices are: {', '.join(GIT_SCHEMES)}" + ) + return None + # Getting the scheme git is going to understand + elif parsed_wf_url.scheme.startswith(GitFetcher.GIT_PROTO_PREFIX): + gitScheme = parsed_wf_url.scheme[len(GitFetcher.GIT_PROTO_PREFIX) :] + denorm_parsed_wf_url = parsed_wf_url._replace(scheme=gitScheme) + parsed_wf_url = parse.urlparse(parse.urlunparse(denorm_parsed_wf_url)) + else: + gitScheme = parsed_wf_url.scheme - # Getting the tag or branch - gitPath = parsed_wf_url.path - if "@" in parsed_wf_url.path: - gitPath, repoTag = parsed_wf_url.path.split("@", 1) + if gitScheme not in GIT_SCHEMES: + logger.debug( + f"Unknown scheme {gitScheme} in repo URL. 
Choices are: {', '.join(GIT_SCHEMES)}" + ) + return None + + # Beware ssh protocol!!!! I has a corner case with URLs like + # ssh://git@github.com:inab/WfExS-backend.git' + if parsed_wf_url.scheme == "ssh" and ":" in parsed_wf_url.netloc: + new_netloc = parsed_wf_url.netloc + # Translating it to something better + colon_pos = new_netloc.rfind(":") + new_netloc = new_netloc[:colon_pos] + "/" + new_netloc[colon_pos + 1 :] + denorm_parsed_wf_url = parsed_wf_url._replace(netloc=new_netloc) + parsed_wf_url = parse.urlparse(parse.urlunparse(denorm_parsed_wf_url)) + + # Getting the tag or branch + if "@" in parsed_wf_url.path: + gitPath, repoTag = parsed_wf_url.path.split("@", 1) + else: + gitPath = parsed_wf_url.path + + # Getting the repoRelPath (if available) + if len(parsed_wf_url.fragment) > 0: + frag_qs = parse.parse_qs(parsed_wf_url.fragment) + subDirArr = frag_qs.get("subdirectory", []) + if len(subDirArr) > 0: + repoRelPath = subDirArr[0] + + # Now, reassemble the repoURL + repoURL = parse.urlunparse( + (gitScheme, parsed_wf_url.netloc, gitPath, "", "", "") + ) + found_params = find_git_repo_in_uri(cast("URIType", repoURL)) + guessedRepoFlavor = found_params[0].guess_flavor + # TODO handling other popular cases, like bitbucket + else: + found_params = find_git_repo_in_uri(parsed_wf_url) - # Getting the repoRelPath (if available) - if parsed_wf_url.fragment: - frag_qs = parse.parse_qs(parsed_wf_url.fragment) - subDirArr = frag_qs.get("subdirectory", []) - if subDirArr: - repoRelPath = subDirArr[0] + except RepoGuessException as gge: + if not fail_ok: + import traceback - # Now, reassemble the repoURL - if git_scheme == "ssh": - repoURL = parsed_wf_url.netloc + gitPath - else: - repoURL = parse.urlunparse((git_scheme, parsed_wf_url.netloc, gitPath, "", "", "")) + traceback.print_exc() + raise FetcherException( + f"FIXME: Unsupported http(s) git repository {wf_url} (see cascade exception)" + ) from gge + + if found_params is not None: + if repoTag is None: + repoTag 
= found_params[0].tag + repoType = found_params[0].repo_type + if guessedRepoFlavor is None: + guessedRepoFlavor = found_params[0].guess_flavor + elif not fail_ok: + raise FetcherException( + f"FIXME: Unsupported git repository {wf_url}. (Is it really a git repo???)" + ) logger.debug( - "From {} was derived (type {}) {} {} {}".format( - wf_url, repoType, repoURL, repoTag, repoRelPath + "From {} was derived (type {}, flavor {}) {} {} {}".format( + wf_url, repoType, guessedRepoFlavor, repoURL, repoTag, repoRelPath ) ) + if repoURL is None: + return None + + # if repoType == RepoType.GitHub: + # wf_entrypoint_path = [ + # + # ] + # web_url = urllib.parse.urlunparse( + # ( + # "https", + # "raw.githubusercontent.com", + # "/".join(wf_entrypoint_path), + # "", + # "", + # "", + # ) + # ) + return RemoteRepo( repo_url=cast("RepoURL", repoURL), tag=cast("Optional[RepoTag]", repoTag), rel_path=cast("Optional[RelPath]", repoRelPath), repo_type=repoType, + guess_flavor=guessedRepoFlavor, web_url=web_url, ) + + +def find_git_repo_in_uri( + remote_file: "Union[URIType, parse.ParseResult]", +) -> "Tuple[RemoteRepo, Sequence[str], Sequence[RepoTag]]": + if isinstance(remote_file, parse.ParseResult): + parsedInputURL = remote_file + else: + parsedInputURL = parse.urlparse(remote_file) + sp_path = parsedInputURL.path.split("/") + + shortest_pre_path: "Optional[URIType]" = None + longest_post_path: "Optional[Sequence[str]]" = None + repo_type: "Optional[RepoType]" = None + guessed_repo_flavor: "Optional[RepoGuessFlavor]" = None + the_remote_uri: "Optional[str]" = None + b_default_repo_tag: "Optional[str]" = None + repo_branches: "Optional[MutableSequence[RepoTag]]" = None + for pos in range(len(sp_path), 0, -1): + pre_path = "/".join(sp_path[:pos]) + if pre_path == "": + pre_path = "/" + remote_uri_anc = parse.urlunparse(parsedInputURL._replace(path=pre_path)) + + remote_refs_dict: "Mapping[bytes, bytes]" + try: + # Dulwich works both with file, ssh, git and http(s) protocols + 
remote_refs_dict = dulwich.porcelain.ls_remote(remote_uri_anc) + repo_type = RepoType.Git + except ( + dulwich.errors.NotGitRepository, + dulwich.errors.GitProtocolError, + ) as ngr: + # Skip and continue + continue + + the_remote_uri = remote_uri_anc + + head_remote_ref = remote_refs_dict[HEAD_LABEL] + repo_branches = [] + b_default_repo_tag = None + for remote_label, remote_ref in remote_refs_dict.items(): + if remote_label.startswith(REFS_HEADS_PREFIX): + b_repo_tag = remote_label[len(REFS_HEADS_PREFIX) :].decode( + "utf-8", errors="continue" + ) + repo_branches.append(cast("RepoTag", b_repo_tag)) + if b_default_repo_tag is None and remote_ref == head_remote_ref: + b_default_repo_tag = b_repo_tag + + # It is considered a git repo! + shortest_pre_path = cast("URIType", pre_path) + longest_post_path = sp_path[pos:] + if repo_type is None: + # Metadata is all we really need + repo_type = RepoType.Raw + req = request.Request(remote_uri_anc, method="HEAD") + try: + with request.urlopen(req) as resp: + # Is it gitlab? 
+ if list( + filter( + lambda c: "gitlab" in c, + resp.headers.get_all("Set-Cookie"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.GitLab + elif list( + filter( + lambda c: GITHUB_NETLOC in c, + resp.headers.get_all("Set-Cookie"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.GitHub + elif list( + filter( + lambda c: "bitbucket" in c, + resp.headers.get_all("X-View-Name"), + ) + ): + repo_type = RepoType.Git + guessed_repo_flavor = RepoGuessFlavor.BitBucket + except Exception as e: + pass + + if repo_type is None: + raise RepoGuessException(f"Unable to identify {remote_file} as a git repo") + + if b_default_repo_tag is None: + raise RepoGuessException( + f"No tag was obtained while getting default branch name from {remote_file}" + ) + + assert longest_post_path is not None + assert repo_branches is not None + + repo = RemoteRepo( + repo_url=cast("RepoURL", the_remote_uri), + tag=cast("RepoTag", b_default_repo_tag), + repo_type=repo_type, + guess_flavor=guessed_repo_flavor, + ) + return repo, longest_post_path, repo_branches