Merge pull request #28 from NIGMS/reusable_workflow
Reusable workflow
kyleoconnell-NIH authored Sep 6, 2024
2 parents 8f327f6 + 7b5e357 commit bbbc543
Showing 8 changed files with 333 additions and 0 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/check-links-self.yaml
@@ -0,0 +1,12 @@
name: 'Check Links'
on:
  workflow_dispatch:
  push:
  pull_request:

jobs:
  link_check:
    name: 'Link Check'
    uses: NIGMS/NIGMS-Sandbox/.github/workflows/check-links.yaml@main
    with:
      repo_link_ignore_list: ""
34 changes: 34 additions & 0 deletions .github/workflows/check-links.yaml
@@ -0,0 +1,34 @@
name: 'Check Links'
on:
  workflow_call:
    inputs:
      directory:
        required: false
        type: string
      repo_link_ignore_list:
        required: true
        type: string
    secrets:
      PAT:
        required: false
jobs:
  link_check:
    name: 'Link Check'
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Checkout
        uses: actions/checkout@v4
        with:
          repository: NIGMS/NIGMS-Sandbox
          path: reusable-workflow-repo
          ref: reusable_workflow

      - name: Link Check
        run: |
          python3 ${GITHUB_WORKSPACE}/reusable-workflow-repo/.github/workflows/check_links.py
        env:
          LINK_IGNORE_LIST: https://www.sciencedirect.com,https://portlandpress.com
          PAT: ${{ secrets.PAT }}
168 changes: 168 additions & 0 deletions .github/workflows/check_links.py
@@ -0,0 +1,168 @@
import http.client
import urllib.request, urllib.error
import os
import sys
import re


# set some default variables
remove_characters = ['**', '\\n']

# characters that tend to trail the url; truncate everything from them onward
end_characters = [')', ",", "'", '`', "\"", '</a>', '</div>', "\\", ">", "]"]

big_regex = re.compile('|'.join(map(re.escape, remove_characters)))

# if there are any URLs to ignore, add them here
link_ignore_list = []
link_ignore_list_env = os.getenv("LINK_IGNORE_LIST")
if link_ignore_list_env and len(link_ignore_list_env) > 0:
    link_ignore_list = link_ignore_list_env.split(',')

# add any repo-specific ignores
link_ignore_list_env_2 = os.getenv("inputs.repo_link_ignore_list")
if link_ignore_list_env_2 and len(link_ignore_list_env_2) > 0:
    link_ignore_list.extend(link_ignore_list_env_2.split(','))

print_valid = os.getenv("print_valid_links") is not None

# If we are given a directory then use it, otherwise assume the current directory
path = "."
if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
    path = sys.argv[1]

# the directory environment variable overrides the command-line argument and the default
directory_env = os.getenv("inputs.directory")
if directory_env and len(directory_env) > 0:
    path = directory_env

# personal access token, if one was supplied to the workflow
pat_env = os.getenv("INPUT_PAT")

# list which stores all links to check, plus a map back to the file each link came from
links_to_check = []
link_file_map = {}

# Get the response code of the url to see if it exists
def getResponseCode(url):
    content = None
    try:
        req = urllib.request.Request(url,
                                     headers={'User-Agent': 'Mozilla/5.0'})
        conn = urllib.request.urlopen(req)
        # Only get HTML if we have a potential anchor link
        if "#" in url and "pdf" not in url:
            content = conn.read().decode("utf-8")
    except urllib.error.HTTPError as e:
        return [e.code, content]
    except urllib.error.URLError as e:
        return [404, content]
    except http.client.InvalidURL:
        return [200, content]
    return [conn.getcode(), content]

def clean_link(link):
    if link.endswith("."):
        link = link[:link.rfind(".")]
    if link.endswith("'"):
        link = link[:link.rfind("'")]
    if link.endswith("\""):
        link = link[:link.rfind("\"")]
    link_stripped = big_regex.sub("", link.strip())
    for end_c in end_characters:
        end_index = link_stripped.find(end_c)
        if end_index != -1:
            link_stripped = link_stripped[:end_index]
    return link_stripped

def add_link(loc, link):
    # this is a command being run, so it is difficult to validate in this script; skip it
    if '$(uname' in link:
        return False

    # keep only the portion from "http" onward, in case extra text was grabbed in front of it
    link = link[link.find("http"):]

    # strip trailing punctuation and other characters that may be stuck to the url;
    # repeat until the link stops changing
    link_stripped = clean_link(link)
    while link_stripped != link:
        link = link_stripped
        link_stripped = clean_link(link)

    # add link to be checked
    links_to_check.append(link_stripped)

    # store where the link is so we can fix it
    link_file_map[link_stripped] = loc

def check_link(link):
    # try to get the url; 404, 403 or 500 means it is invalid, so report it and flag an error
    code = getResponseCode(link)
    loc = link_file_map[link]
    if code[0] in [404, 403, 500]:

        # if the link failed but we are ignoring it, just mention that
        for ignored_link in link_ignore_list:
            if ignored_link in link:
                print(loc + ", " + link + ", Ignored")
                return False

        print(loc + ", " + link + ", Failed")
        return True

    # check for missing anchors
    elif "#" in link and \
            code[1] is not None \
            and 'href="' + link[link.find("#"):] + '"' not in \
            code[1]:
        print(loc + ", " + link + ", Failed - Anchor")
        return True
    elif print_valid:
        print(loc + ", " + link + ", Valid")
    return False


if __name__ == "__main__":
    err = 0
    print("Directory is " + path)
    # loop through all files in the path
    for root, dirs, files in os.walk(path):
        for file in files:
            # only read files of the types we care about (md, txt or python notebook)
            if file.endswith(".md") or file.endswith(".txt") or file.endswith(
                    ".ipynb"):

                # get the content, split it into lines, then split each line on spaces
                raw_content = open(os.path.join(root, file), "r").read()
                content = raw_content.split("\n")
                content = [x.split(" ") for x in content]
                loc = os.path.join(root, file)
                for line in content:
                    for item in line:

                        if "https://" in item or "http://" in item:
                            if "](" in item:
                                # markdown-style link
                                add_link(loc, item[item.find("]"):])
                                if item[item.find("("):] == item[item.find("]"):]:
                                    continue
                                add_link(loc, item[item.find("("):])
                            else:
                                add_link(loc, item)

    for link in set(links_to_check):
        # count every link that failed
        err = check_link(link) + err
    # collapse the error count down to a single exit status
    if err > 1:
        err = 1
    exit(err)
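
For reference, the checker can be exercised locally before pushing, using the same environment variable the workflow sets. The sketch below is illustrative and not part of the commit; the ignore list and the path to check_links.py are assumptions about a typical checkout, run from the repository root.

# local_link_check.py — a minimal sketch, assuming check_links.py sits at .github/workflows/
import os
import subprocess

env = dict(os.environ)
# mirror the LINK_IGNORE_LIST the workflow sets; adjust as needed
env["LINK_IGNORE_LIST"] = "https://www.sciencedirect.com,https://portlandpress.com"

# the script takes an optional directory argument and exits 0 on success, 1 if any link failed
result = subprocess.run(
    ["python3", ".github/workflows/check_links.py", "."],
    env=env,
)
print("link check exit code:", result.returncode)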
53 changes: 53 additions & 0 deletions .github/workflows/lint.py
@@ -0,0 +1,53 @@
import os
import shutil
import nbformat
from nbformat.v4 import new_notebook

def clean_notebook(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        notebook = nbformat.read(f, as_version=4)

    # Clean cells
    for cell in notebook.cells:
        if 'outputs' in cell:
            cell['outputs'] = []
        if 'execution_count' in cell:
            cell['execution_count'] = None
        if 'metadata' in cell:
            cell['metadata'] = {}

    # Clean notebook metadata
    if 'metadata' in notebook:
        notebook['metadata'] = {}

    with open(file_path, 'w', encoding='utf-8') as f:
        nbformat.write(notebook, f)

def delete_checkpoints_dirs(root_dir):
    # Walk through the directory tree
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for dirname in dirnames:
            # Check if the directory name is '.ipynb_checkpoints'
            if dirname == '.ipynb_checkpoints':
                # Construct the full path to the directory
                dir_to_delete = os.path.join(dirpath, dirname)
                # Delete the directory
                shutil.rmtree(dir_to_delete)
                print(f'Deleted {dir_to_delete}')
                print('Consider adding .ipynb_checkpoints to your .gitignore file!')


if __name__ == "__main__":
    # Change this to the directory containing your notebooks
    notebook_dir = '../../'

    for root, dirs, files in os.walk(notebook_dir):
        for file in files:
            if file.endswith('.ipynb'):
                file_path = os.path.join(root, file)
                clean_notebook(file_path)
                print(f'Cleaned {file_path}')

    # Delete all '.ipynb_checkpoints' directories
    delete_checkpoints_dirs(notebook_dir)
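
As a quick sanity check of the cleaning logic, the sketch below (illustrative, not part of the commit) builds a one-cell notebook with nbformat, runs clean_notebook on it, and confirms that outputs, execution counts, and metadata are stripped. It assumes lint.py is importable from the working directory (e.g. run from .github/workflows/), and the file name demo.ipynb is arbitrary.

import nbformat
from nbformat.v4 import new_notebook, new_code_cell, new_output
from lint import clean_notebook  # assumes lint.py is on the import path

# build a notebook whose single cell carries an output, an execution count, and metadata
cell = new_code_cell(source="print('hi')", execution_count=3, metadata={"tags": ["demo"]})
cell.outputs = [new_output("stream", name="stdout", text="hi\n")]
nb = new_notebook(cells=[cell], metadata={"kernelspec": {"name": "python3", "display_name": "Python 3"}})

with open("demo.ipynb", "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

clean_notebook("demo.ipynb")

cleaned = nbformat.read("demo.ipynb", as_version=4)
print(cleaned.cells[0]["outputs"])          # expected: []
print(cleaned.cells[0]["execution_count"])  # expected: None
print(cleaned.metadata)                     # expected: {}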

14 changes: 14 additions & 0 deletions .github/workflows/notebook-lint-self.yaml
@@ -0,0 +1,14 @@
name: 'Lint Notebook'
on:
  push:
  workflow_dispatch:
permissions:
  contents: write
  id-token: write

jobs:
  lint:
    name: 'Linting'
    uses: NIGMS/NIGMS-Sandbox/.github/workflows/notebook-lint.yaml@main
    with:
      directory: .
50 changes: 50 additions & 0 deletions .github/workflows/notebook-lint.yaml
@@ -0,0 +1,50 @@
name: 'Lint Notebook'
on:
  workflow_call:
    inputs:
      directory:
        required: false
        type: string
permissions:
  contents: write
  id-token: write

jobs:
  lint:
    name: 'Linting'
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Checkout
        uses: actions/checkout@v4
        with:
          repository: NIGMS/NIGMS-Sandbox
          ref: reusable_workflow
          path: reusable-workflow-repo

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip'

      - name: Install requirements.txt
        working-directory: reusable-workflow-repo/.github/workflows/
        run: |
          python3 -m pip install --upgrade pip
          pip3 install nbformat
      - name: Notebook Linting
        working-directory: reusable-workflow-repo/.github/workflows/
        run: |
          python3 lint.py
      - name: Commit changes
        uses: EndBug/add-and-commit@v9
        with:
          author_name: github-action
          author_email: cbiit-github-action@github.com
          message: 'Github Action: Refresh stats'
1 change: 1 addition & 0 deletions .github/workflows/requirements.txt
@@ -0,0 +1 @@
nbformat==5.10.4
1 change: 1 addition & 0 deletions reusable-workflow-repo
Submodule reusable-workflow-repo added at 0c4850
