From 7586285c578b980b596ab4b63053c7f57724f0d7 Mon Sep 17 00:00:00 2001 From: John Turner Date: Fri, 6 Sep 2024 12:11:43 -0400 Subject: [PATCH 1/4] Attempting to get reusable workflow to call checkout properly --- .github/workflows/check-links-self.yaml | 12 ++ .github/workflows/check-links.yaml | 33 +++++ .github/workflows/check_links.py | 168 ++++++++++++++++++++++ .github/workflows/lint.py | 53 +++++++ .github/workflows/notebook-lint-self.yaml | 14 ++ .github/workflows/notebook-lint.yaml | 49 +++++++ .github/workflows/requirements.txt | 1 + 7 files changed, 330 insertions(+) create mode 100644 .github/workflows/check-links-self.yaml create mode 100644 .github/workflows/check-links.yaml create mode 100644 .github/workflows/check_links.py create mode 100644 .github/workflows/lint.py create mode 100644 .github/workflows/notebook-lint-self.yaml create mode 100644 .github/workflows/notebook-lint.yaml create mode 100644 .github/workflows/requirements.txt diff --git a/.github/workflows/check-links-self.yaml b/.github/workflows/check-links-self.yaml new file mode 100644 index 0000000..755f049 --- /dev/null +++ b/.github/workflows/check-links-self.yaml @@ -0,0 +1,12 @@ +name: 'Check Links' +on: + workflow_dispatch: + push: + pull_request: + +jobs: + link_check: + name: 'Link Check' + uses: NIGMS/NIGMS-Sandbox/.github/workflows/check-links.yaml@reusable_workflow + with: + repo_link_ignore_list: "" \ No newline at end of file diff --git a/.github/workflows/check-links.yaml b/.github/workflows/check-links.yaml new file mode 100644 index 0000000..07869b4 --- /dev/null +++ b/.github/workflows/check-links.yaml @@ -0,0 +1,33 @@ +name: 'Check Links' +on: + workflow_call: + inputs: + directory: + required: false + type: string + repo_link_ignore_list: + required: true + type: string + secrets: + PAT: + required: false +jobs: + link_check: + name: 'Link Check' + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Checkout + uses: actions/checkout@v4 + with: + repository: NIGMS/NIGMS-Sandbox/.github/workflows/notebook-lint.yaml@main + path: reusable-workflow-repo + + - name: Link Check + run: | + python3 ${GITHUB_WORKSPACE}/reusable-workflow-repo/.github/workflows/check_links.py + env: + LINK_IGNORE_LIST: https://www.sciencedirect.com,https://portlandpress.com + PAT: ${{ secrets.PAT }} \ No newline at end of file diff --git a/.github/workflows/check_links.py b/.github/workflows/check_links.py new file mode 100644 index 0000000..967fe7f --- /dev/null +++ b/.github/workflows/check_links.py @@ -0,0 +1,168 @@ +import http.client +import urllib.request, urllib.error +import os +import sys +import re + + + +# set some default variables +remove_characters = ['**', '\\n'] + +# text that tends to be at the end of the url that we need truncate everything past them +end_characters = [')',",","'",'`',"\"",'','',"\\",">","]"] + +big_regex = re.compile('|'.join(map(re.escape, remove_characters))) + +# if there are any URLs to ignore add here +link_ignore_list = [] +link_ignore_list_env = os.getenv("LINK_IGNORE_LIST") +if link_ignore_list_env and len(link_ignore_list_env) > 0: + link_ignore_list = link_ignore_list_env.split(',') + +# Add any repo specific ignores +link_ignore_list_env_2 = os.getenv("inputs.repo_link_ignore_list") +if link_ignore_list_env_2 and len(link_ignore_list_env_2) > 0: + link_ignore_list.extend(link_ignore_list_env.split(',')) + +print_valid = os.getenv("print_valid_links") is not None + +# If we are given a directory then use it, otherwise assume path is current directory +path = "." +if len(sys.argv) >1 and os.path.exists(sys.argv[1]): + path = sys.argv[1] + +# directory environment overrides the system arguments and default. +directory_env = os.getenv("inputs.directory") +if directory_env and len(directory_env) > 0: + path = directory_env + +pat_env = os.getenv("INPUT_PAT") +if directory_env and len(directory_env) > 0: + path = directory_env + +# list which stores all links to check +links_to_check = [] +link_file_map = {} +# Get the response code of the url to see if it exists +def getResponseCode(url): + content = None + try: + req = urllib.request.Request(url, + headers={'User-Agent': 'Mozilla/5.0'}) + conn = urllib.request.urlopen(req) + # Only get HTML if we have a potential anchor link + if "#" in url and "pdf" not in url: + content = conn.read().decode("utf-8") + except urllib.error.HTTPError as e: + return [e.code, content] + except urllib.error.URLError as e: + return [404, content] + except http.client.InvalidURL: + return [200, content] + return [conn.getcode(), content] + +def clean_link(link): + if link.endswith("."): + link = link[:link.rfind(".")] + if link.endswith("'"): + link = link[:link.rfind("'")] + if link.endswith("\""): + link = link[:link.rfind("\"")] + link_stripped = big_regex.sub("", link.strip()) + for end_c in end_characters: + end_index = link_stripped.find(end_c) + if end_index != -1: + link_stripped = link_stripped[:end_index] + return link_stripped + +def add_link(loc,link): + # this is a command being ran so difficult to validate in this script, skip it + if '$(uname' in link: + return False + + # get just from the http portion if there was more in from of the string we grabbed + link = link[link.find("http"):] + + # if there is a period at the end, truncate to that period. Other periods may be valid + # strip various characters that may be in the string + link_stripped = clean_link(link) + while link_stripped != link: + link = link_stripped + link_stripped = clean_link(link) + + # add link to be checked + links_to_check.append(link_stripped) + + # store where the link is so we can fix it + link_file_map[link_stripped] = loc +def check_link(link): + # try and get the url, if its 404 or 500 then its invalid, let us know and trigger the error flag + code = getResponseCode(link) + loc =link_file_map[link] + if code[0] in [404, 403, 500]: + + # If the link failed, but we are ignoring it then just mention that + for ignored_link in link_ignore_list: + if ignored_link in link: + print( + loc + ", " + link + ", Ignored") + return False + + # print(file+" Code:"+str(code[0])+" Line "+str(line_num)+"("+str(char)+"):"+item_stripped) + print( + loc + ", " + link + ", Failed") + return True + + # check for missing anchors + elif "#" in link and \ + code[1] is not None \ + and 'href=\"' + link[link.find("#"):] + '\"' not in \ + code[1]: + print( + loc + ", " + link + ", Failed - Anchor") + # print(file + " Missing Anchor Line " + str( + # line_num) + "(" + str( + # char) + "):" + item_stripped) + elif print_valid: + print( + loc + ", " + link + ", Valid") + return True + + +if __name__ == "__main__": + err = 0 + print("Directory is "+path) + # Loop through all files in path + for root, dirs, files in os.walk(path): + for file in files: + # only read file that match template ( txt, md or python notebook) + if file.endswith(".md") or file.endswith(".txt") or file.endswith( + ".ipynb"): + + # get content and separate into lines and then separate by spaces + raw_content = open(os.path.join(root, file), "r").read() + content = raw_content.split("\n") + content = [x.split(" ") for x in content] + loc = os.path.join(root, file) + # have an incrementer for line number later export + for line in content: + for item in line: + + if "https://" in item or "http://" in item: + if "](" in item: + add_link(loc,item[item.find("]"):]) + # if we get any error then add it + if item[item.find("("):] == item[item.find("]"):]: + continue + add_link(loc,item[item.find("("):]) + else: + add_link(loc,item) + + for link in set(links_to_check): + # if we get any error then add to err variable + err = check_link(link) + err + # if the error is > 1 then set it to 1 to error as 1 + if err > 1: + err = 1 + exit(err) diff --git a/.github/workflows/lint.py b/.github/workflows/lint.py new file mode 100644 index 0000000..5808266 --- /dev/null +++ b/.github/workflows/lint.py @@ -0,0 +1,53 @@ +import os +import shutil +import nbformat +from nbformat.v4 import new_notebook + +def clean_notebook(file_path): + with open(file_path, 'r', encoding='utf-8') as f: + notebook = nbformat.read(f, as_version=4) + + # Clean cells + for cell in notebook.cells: + if 'outputs' in cell: + cell['outputs'] = [] + if 'execution_count' in cell: + cell['execution_count'] = None + if 'metadata' in cell: + cell['metadata'] = {} + + # Clean notebook metadata + if 'metadata' in notebook: + notebook['metadata'] = {} + + with open(file_path, 'w', encoding='utf-8') as f: + nbformat.write(notebook, f) + +def delete_checkpoints_dirs(root_dir): + # Walk through the directory tree + for dirpath, dirnames, filenames in os.walk(root_dir): + for dirname in dirnames: + # Check if the directory name is 'checkpoints' + if dirname == '.ipynb_checkpoints': + # Construct the full path to the directory + dir_to_delete = os.path.join(dirpath, dirname) + # Delete the directory + shutil.rmtree(dir_to_delete) + print(f'Deleted {dir_to_delete}') + print('Consider adding .ipynb_checkpoints to your .gitignore file!') + + +if __name__ == "__main__": + # Change this to the directory containing your notebooks + notebook_dir = '../../' + + for root, dirs, files in os.walk(notebook_dir): + for file in files: + if file.endswith('.ipynb'): + file_path = os.path.join(root, file) + clean_notebook(file_path) + print(f'Cleaned {file_path}') + + # Delete all 'checkpoints' directories + delete_checkpoints_dirs(notebook_dir) + diff --git a/.github/workflows/notebook-lint-self.yaml b/.github/workflows/notebook-lint-self.yaml new file mode 100644 index 0000000..035647c --- /dev/null +++ b/.github/workflows/notebook-lint-self.yaml @@ -0,0 +1,14 @@ +name: 'Lint Notebook' +on: + push: + workflow_dispatch: +permissions: + contents: write + id-token: write + +jobs: + lint: + name: 'Linting' + uses: NIGMS/NIGMS-Sandbox/.github/workflows/notebook-lint.yaml@reusable_workflow + with: + directory: . diff --git a/.github/workflows/notebook-lint.yaml b/.github/workflows/notebook-lint.yaml new file mode 100644 index 0000000..4268d13 --- /dev/null +++ b/.github/workflows/notebook-lint.yaml @@ -0,0 +1,49 @@ +name: 'Lint Notebook' +on: + workflow_call: + inputs: + directory: + required: false + type: string +permissions: + contents: write + id-token: write + +jobs: + lint: + name: 'Linting' + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Checkout + uses: actions/checkout@v4 + with: + repository: NIGMS/NIGMS-Sandbox/.github/workflows/notebook-lint.yaml@main + path: reusable-workflow-repo + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + + - name: Install requirements.txt + working-directory: ${GITHUB_WORKSPACE}/reusable-workflow-repo/.github/workflows/ + run: | + python3 -m pip install --upgrade pip + pip3 install nbformat + + - name: Notebook Linting + working-directory: ${GITHUB_WORKSPACE}/reusable-workflow-repo/.github/workflows/ + run: | + python3 lint.py + + - name: Commit changes + uses: EndBug/add-and-commit@v9 + with: + author_name: github-action + author_email: cbiit-github-action@github.com + message: 'Github Action: Refresh stats' diff --git a/.github/workflows/requirements.txt b/.github/workflows/requirements.txt new file mode 100644 index 0000000..d0537a8 --- /dev/null +++ b/.github/workflows/requirements.txt @@ -0,0 +1 @@ +nbformat==5.10.4 From 0c4850c08dd2a1a9ec44ce3ddcd45e68fa489873 Mon Sep 17 00:00:00 2001 From: John Turner Date: Fri, 6 Sep 2024 12:14:14 -0400 Subject: [PATCH 2/4] fixed repo link and added ref --- .github/workflows/check-links.yaml | 3 ++- .github/workflows/notebook-lint.yaml | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/check-links.yaml b/.github/workflows/check-links.yaml index 07869b4..af9c395 100644 --- a/.github/workflows/check-links.yaml +++ b/.github/workflows/check-links.yaml @@ -22,8 +22,9 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - repository: NIGMS/NIGMS-Sandbox/.github/workflows/notebook-lint.yaml@main + repository: NIGMS/NIGMS-Sandbox path: reusable-workflow-repo + ref: reusable_workflow - name: Link Check run: | diff --git a/.github/workflows/notebook-lint.yaml b/.github/workflows/notebook-lint.yaml index 4268d13..bd58904 100644 --- a/.github/workflows/notebook-lint.yaml +++ b/.github/workflows/notebook-lint.yaml @@ -21,7 +21,8 @@ jobs: - name: Checkout uses: actions/checkout@v4 with: - repository: NIGMS/NIGMS-Sandbox/.github/workflows/notebook-lint.yaml@main + repository: NIGMS/NIGMS-Sandbox + ref: reusable_workflow path: reusable-workflow-repo - name: Set up Python @@ -31,13 +32,13 @@ jobs: cache: 'pip' - name: Install requirements.txt - working-directory: ${GITHUB_WORKSPACE}/reusable-workflow-repo/.github/workflows/ + working-directory: reusable-workflow-repo/.github/workflows/ run: | python3 -m pip install --upgrade pip pip3 install nbformat - name: Notebook Linting - working-directory: ${GITHUB_WORKSPACE}/reusable-workflow-repo/.github/workflows/ + working-directory: reusable-workflow-repo/.github/workflows/ run: | python3 lint.py From 610c38f581176a5176e131a406999c7847530b52 Mon Sep 17 00:00:00 2001 From: github-action Date: Fri, 6 Sep 2024 16:16:24 +0000 Subject: [PATCH 3/4] Github Action: Refresh stats --- reusable-workflow-repo | 1 + 1 file changed, 1 insertion(+) create mode 160000 reusable-workflow-repo diff --git a/reusable-workflow-repo b/reusable-workflow-repo new file mode 160000 index 0000000..0c4850c --- /dev/null +++ b/reusable-workflow-repo @@ -0,0 +1 @@ +Subproject commit 0c4850c08dd2a1a9ec44ce3ddcd45e68fa489873 From 7b5e357290ca270eba989eb5cc1326d0704088a3 Mon Sep 17 00:00:00 2001 From: John Turner Date: Fri, 6 Sep 2024 12:21:01 -0400 Subject: [PATCH 4/4] fixed repo link and added ref --- .github/workflows/check-links-self.yaml | 2 +- .github/workflows/notebook-lint-self.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/check-links-self.yaml b/.github/workflows/check-links-self.yaml index 755f049..be3315f 100644 --- a/.github/workflows/check-links-self.yaml +++ b/.github/workflows/check-links-self.yaml @@ -7,6 +7,6 @@ on: jobs: link_check: name: 'Link Check' - uses: NIGMS/NIGMS-Sandbox/.github/workflows/check-links.yaml@reusable_workflow + uses: NIGMS/NIGMS-Sandbox/.github/workflows/check-links.yaml@main with: repo_link_ignore_list: "" \ No newline at end of file diff --git a/.github/workflows/notebook-lint-self.yaml b/.github/workflows/notebook-lint-self.yaml index 035647c..2038a72 100644 --- a/.github/workflows/notebook-lint-self.yaml +++ b/.github/workflows/notebook-lint-self.yaml @@ -9,6 +9,6 @@ permissions: jobs: lint: name: 'Linting' - uses: NIGMS/NIGMS-Sandbox/.github/workflows/notebook-lint.yaml@reusable_workflow + uses: NIGMS/NIGMS-Sandbox/.github/workflows/notebook-lint.yaml@main with: directory: .