Update to v0.9.1
HubTou authored May 18, 2024
1 parent 3300cd4 commit b8c8d05
Showing 11 changed files with 146 additions and 125 deletions.
6 changes: 2 additions & 4 deletions LIBGH.1.md
@@ -10,7 +10,6 @@ libgh - GitHub scraping tool
\[--from\]
\[--json|-j\]
\[--prune|-p\]
-\[--repo\|-r\]
\[--topics\]
\[--xml|-x\]
\[--debug\]
@@ -23,8 +22,8 @@ The alias **lgh** is also available to shorten the command name.

## DESCRIPTION
The **libgh** command-line utility scrapes data from a list of GitHub
-accounts (either personal or organizational) or repositories if the
-*--repo|-r* option is used.
+accounts (either personal or organizational) or repositories (in
+account/repository form).

By default this data is returned as pretty-printed text, or JSON data
if the *--json|-j* option is used, or XML data if the *--xml|-x* option
@@ -59,7 +58,6 @@ Options | Use
--from|Load repositories when forked_from is blank
--json\|-j|Switch to JSON output instead of plain text
--prune\|-p|Prune cache items older than DAYS and cache index
---repo\|-r|Process repositories instead of accounts
--topics|Load repositories when there are missing topics
--xml\|-x|Switch to XML output instead of plain text
--debug|Enable debug mode
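
The removed *--repo|-r* option is no longer needed: accounts and repositories can now be mixed freely on the command line, and arguments containing a `/` are treated as `account/repository` pairs. As a rough illustration, a minimal library-level sketch of the same split, assuming `load_accounts` and `load_repositories` are importable as they are called in `src/libgh/main.py` below (the module paths and the 7-day cache value are assumptions, not taken from this commit):

```python
# Hypothetical sketch only: module paths are assumed, not documented here.
from libgh.accounts import load_accounts            # assumed location
from libgh.repositories import load_repositories    # assumed location

accounts = load_accounts(["HubTou"], 7)             # a plain account argument
repos = load_repositories(["HubTou/libgh"], 7)      # an account/repository argument
```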
2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@
-[![Servier Inspired](https://raw.githubusercontent.com/servierhub/.github/main/badges/inspired.svg)](https://github.com/ServierHub/)
[![PyPI package](https://repology.org/badge/version-for-repo/pypi/python:pnu-libgh.svg)](https://repology.org/project/python:pnu-libgh/versions)
+[![Servier Inspired](https://raw.githubusercontent.com/servierhub/.github/main/badges/inspired.svg)](https://github.com/ServierHub/)

# libGH(1), libGH(3) - GitHub scraping tool and library

11 changes: 9 additions & 2 deletions TODO.md
@@ -5,8 +5,6 @@ Feel free to [submit your own ideas!](https://github.com/HubTou/libgh/discussion
* Move libpnu2.py code into [HubTou/libpnu](https://github.com/HubTou/libpnu)

## Probable evolutions
-* libpnu2/get_url_bdata:
-  * Load browser signature from a file at first call
* Fetch:
  * Accounts from:
    * Followers
@@ -16,6 +14,8 @@ Feel free to [submit your own ideas!](https://github.com/HubTou/libgh/discussion
  * Repos from:
    * Forks
  * Instructions to do so will be passed with the "complete" parameter.
+* libpnu2/get_url_bdata:
+  * Load browser signature from a file at first call

## Possible evolutions
* Fetch:
@@ -26,6 +26,13 @@ Feel free to [submit your own ideas!](https://github.com/HubTou/libgh/discussion
* Stars
* Sponsoring
* Add GitHub authenticated mode
+* libpnu2/get_url_bdata:
+  * Always create an index.txt file for the cache
+  * Replace the *cache_index* parameter by a *private* parameter
+    indicating if source URLs are to be mentioned
+  * Write a cache expiration number of days in each line
+  * Make the *cache_days* parameter optional in *prune_cache()*
+    using the previous cache expiration number of days by default

## Improbable evolutions
* A *--all|-a* option to load all repositories individually
11 changes: 3 additions & 8 deletions man/libgh.1
@@ -1,4 +1,4 @@
-.Dd May 7, 2024
+.Dd May 18, 2024
.Dt LIBGH 1
.Os
.Sh NAME
@@ -11,7 +11,6 @@
.Op Fl \-from
.Op Fl \-json|\-j
.Op Fl \-prune|\-p
-.Op Fl \-repo|\-r
.Op Fl \-topics
.Op Fl \-xml|\-x
.Op Fl \-debug
@@ -28,9 +27,8 @@ is also available to shorten the command name.
The
.Nm
command\-line utility scrapes data from a list of GitHub
-accounts (either personal or organizational) or repositories if the
-.Op Fl \-repo|\-r
-option is used.
+accounts (either personal or organizational) or repositories (in
+account/repository form).
.Pp
By default this data is returned as pretty\-printed text, or JSON data
if the
@@ -85,9 +83,6 @@ Switch to JSON output instead of plain text
.Op Fl \-prune|\-p
Prune cache items older than DAYS and cache index
.Pp
-.Op Fl \-repo|\-r
-Process repositories instead of accounts
-.Pp
.Op Fl \-topics
Load repositories when there are missing topics
.Pp
2 changes: 1 addition & 1 deletion setup.cfg
@@ -3,7 +3,7 @@ name = pnu-libgh
description = GitHub scraping library and tool
long_description = file: README.md
long_description_content_type = text/markdown
-version = 0.9.0
+version = 0.9.1
license = BSD 3-Clause License
license_files = LICENSE
author = Hubert Tournier
4 changes: 2 additions & 2 deletions src/libgh/accounts.py
@@ -29,11 +29,11 @@ def load_account(account_name, cache_days, force_fetch=False, complete=[]):
            max_per_hour=REQUESTS_PER_HOUR
        )
    except (LookupError, PermissionError) as error:
-        logging.error("libGH: %s", error)
+        logging.error("libgh: %s", error)
        return account
    for item in response:
        if item[0].startswith("x-ratelimit"):
-            logging.debug("libGH: HTTP response: %s=%s", item[0], item[1])
+            logging.debug("libgh: HTTP response: %s=%s", item[0], item[1])

    soup = BeautifulSoup(data, "html.parser")

20 changes: 16 additions & 4 deletions src/libgh/libpnu2.py
@@ -36,7 +36,7 @@ def _count_from(current_time, requests_times):
    one_hour = 60 * 60 # seconds
    one_minute = 60 # seconds

-    # The requests_times is not necessarily ordered by ascending timestamps
+    # The requests_times is not necessarily ordered by ascending timestamps...
    for request_time in requests_times:
        time_diff = current_time - request_time
        if time_diff <= one_day:
@@ -141,13 +141,24 @@ def get_url_bdata(
        last_hour,
        last_minute
    )
+    slowing_down = False
    if last_day >= max_per_day > 0:
-        raise PermissionError(f"Max requests per day reached for '{website}'")
+        logging.debug("libpnu/get_url_bdata: Max requests per day reached. Sleeping for 1 day")
+        time.sleep(24 * 60 * 60)
+        slowing_down = True
    if last_hour >= max_per_hour > 0:
-        raise PermissionError(f"Max requests per hour reached for '{website}'")
+        logging.debug(
+            "libpnu/get_url_bdata: Max requests per hour reached. Sleeping for 1 hour"
+        )
+        time.sleep(60 * 60)
+        slowing_down = True
    if last_minute >= max_per_minute > 0:
-        logging.debug("libpnu/get_url_bdata: Slowing down URL fetching. Sleeping for 1 minute")
+        logging.debug(
+            "libpnu/get_url_bdata: Max requests per minute reached. Sleeping for 1 minute"
+        )
        time.sleep(60)
+        slowing_down = True
+    if slowing_down:
        current_time = time.time()
        current_date = datetime.datetime.fromtimestamp(current_time)
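
For readers skimming the diff: the old fail-fast behaviour (raising `PermissionError` as soon as a request budget was exhausted) is replaced by sleep-and-retry throttling. A standalone sketch of the same pattern, with illustrative names rather than the library's actual API:

```python
import logging
import time

# Window lengths in seconds, mirroring the day/hour/minute checks above.
WINDOWS = {"day": 24 * 60 * 60, "hour": 60 * 60, "minute": 60}

def throttle(counts, limits):
    """Sleep out any exhausted request budget instead of raising.

    counts and limits map window names to requests already made and the
    allowed maximum; a limit of 0 means unlimited, matching the
    `>= max > 0` guards in the diff above.
    """
    slowed_down = False
    for window, seconds in WINDOWS.items():
        if counts.get(window, 0) >= limits.get(window, 0) > 0:
            logging.debug("Max requests per %s reached. Sleeping for 1 %s", window, window)
            time.sleep(seconds)
            slowed_down = True
    return slowed_down  # when True, the caller should refresh its timestamps
```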

@@ -245,6 +256,7 @@ def prune_cache(cache_dir, cache_days):

        # Load the index file if it exists
        index_name = f"{directory}{os.sep}index.txt"
+        lines = []
        if os.path.isfile(index_name):
            with open(index_name, encoding="utf-8", errors="ignore") as file:
                lines = file.read().splitlines()
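
The added `lines = []` fixes a latent bug: when a cache directory has no index.txt yet, the old code left `lines` unbound for whatever follows this block (presumably code that iterates over it), raising `NameError`. The pattern in isolation:

```python
import os

def read_index(index_name):
    lines = []  # initialize first, so a missing index file yields an empty list
    if os.path.isfile(index_name):
        with open(index_name, encoding="utf-8", errors="ignore") as file:
            lines = file.read().splitlines()
    return lines  # safe to use whether or not the file existed
```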
84 changes: 41 additions & 43 deletions src/libgh/main.py
@@ -19,11 +19,10 @@
from .libpnu2 import get_url_bdata, get_url_data, prune_cache, collection2xml

# Version string used by the what(1) and ident(1) commands:
-ID = "@(#) $Id: libgh - GitHub scraping tool v0.9.0 (May 7, 2024) by Hubert Tournier $"
+ID = "@(#) $Id: libgh - GitHub scraping tool v0.9.1 (May 18, 2024) by Hubert Tournier $"

# Default parameters. Can be overcome by environment variables, then command line options
parameters = {
-    "Process repositories": False,
    "Prune cache": False,
    "Force fetching URL": False,
    "Cache days": CACHE_DAYS,
@@ -38,7 +37,7 @@ def _display_help():
""" Display usage and help """
#pylint: disable=C0301
print("usage: libgh [--debug] [--help|-?] [--version]", file=sys.stderr)
print(" [--from] [--json|-j] [--repo|-r] [--topics] [--xml|-x]", file=sys.stderr)
print(" [--from] [--json|-j] [--topics] [--xml|-x]", file=sys.stderr)
print(" [--days|-d DAYS] [--force|-f] [--prune|-p]", file=sys.stderr)
print(" [--] account_or_repo [...]", file=sys.stderr)
print(" ------------------ --------------------------------------------------", file=sys.stderr)
@@ -47,7 +46,6 @@
print(" --from Load repositories when forked_from is blank", file=sys.stderr)
print(" --json|-j Switch to JSON output instead of plain text", file=sys.stderr)
print(" --prune|-p Prune cache items olday than DAYS and cache index", file=sys.stderr)
print(" --repo|-r Process repositories instead of accounts", file=sys.stderr)
print(" --topics Load repositories when there are missing topics", file=sys.stderr)
print(" --xml|-x Switch to XML output instead of plain text", file=sys.stderr)
print(" --debug Enable debug mode", file=sys.stderr)
@@ -67,7 +65,7 @@ def _process_command_line():

    # option letters followed by : expect an argument
    # same for option strings followed by =
-    character_options = "d:fjprx?"
+    character_options = "d:fjpx?"
    string_options = [
        "days=",
        "debug",
@@ -76,7 +74,6 @@
"help",
"json",
"prune",
"repo",
"topics",
"version",
"xml",
@@ -126,11 +123,6 @@

        elif option in ("--prune", "-p"):
            parameters["Prune cache"] = True
-            parameters["Process repositories"] = False
-
-        elif option in ("--repo", "-r"):
-            parameters["Process repositories"] = True
-            parameters["Prune cache"] = False

        elif option == "--topics":
            parameters["Complete partial"].append("topics")
@@ -154,44 +146,50 @@ def main():
if parameters["Prune cache"]:
prune_cache(CACHE_DIR, parameters["Cache days"])

elif parameters["Process repositories"]:
if not arguments:
_display_help()
else:
repositories = load_repositories(
arguments,
parameters["Cache days"],
force_fetch=parameters["Force fetching URL"],
)
if parameters["JSON output"]:
json.dump(repositories, sys.stdout)
print()
elif parameters["XML output"]:
xml = collection2xml(repositories, name="repositories")
for line in xml:
print(line)
else:
pprint.pprint(repositories, sort_dicts=False)
if not arguments:
_display_help()

else:
if not arguments:
_display_help()
else:
accounts = load_accounts(
arguments,
accounts = []
repositories = []
for argument in arguments:
if '/' in argument:
repositories.append(argument)
else:
accounts.append(argument)

data = {}
if accounts:
data = load_accounts(
accounts,
parameters["Cache days"],
force_fetch=parameters["Force fetching URL"],
complete=parameters["Complete partial"],
)
if parameters["JSON output"]:
json.dump(accounts, sys.stdout)
print()
elif parameters["XML output"]:
xml = collection2xml(accounts, name="accounts")
for line in xml:
print(line)
else:
pprint.pprint(accounts, sort_dicts=False)
if repositories:
data2 = load_repositories(
repositories,
parameters["Cache days"],
force_fetch=parameters["Force fetching URL"],
)

# Performing nested dictionary update
for account, account_value in data2.items():
if account in data:
for repository, repository_value in account_value["repositories"].items():
data[account]["repositories"][repository] = repository_value
else:
data[account] = account_value

if parameters["JSON output"]:
json.dump(data, sys.stdout)
print()
elif parameters["XML output"]:
xml = collection2xml(data, name="GitHub")
for line in xml:
print(line)
else:
pprint.pprint(data, sort_dicts=False)

sys.exit(0)
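
The "nested dictionary update" above grafts repository-level results onto any account entry that was already loaded, so a call like `libgh HubTou HubTou/libgh` yields a single tree rather than two disjoint ones. A small worked example of that merge, with made-up data (field names follow the diff, values are illustrative):

```python
# Trimmed stand-ins for load_accounts() and load_repositories() results.
data = {"HubTou": {"repositories": {"libgh": {"language": "Python"}}}}
data2 = {
    "HubTou": {"repositories": {"libpnu": {"language": "Python"}}},
    "other": {"repositories": {"tool": {}}},
}

for account, account_value in data2.items():
    if account in data:  # merge repositories into the existing account entry
        for repository, repository_value in account_value["repositories"].items():
            data[account]["repositories"][repository] = repository_value
    else:                # unseen account: adopt the whole entry as-is
        data[account] = account_value

assert set(data["HubTou"]["repositories"]) == {"libgh", "libpnu"}
assert "other" in data
```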

14 changes: 7 additions & 7 deletions src/libgh/organization_account.py
@@ -37,11 +37,11 @@ def load_org_repositories(
            max_per_hour=REQUESTS_PER_HOUR
        )
    except (LookupError, PermissionError) as error:
-        logging.error("libGH: %s", error)
+        logging.error("libgh: %s", error)
        return repos
    for item in response:
        if item[0].startswith("x-ratelimit"):
-            logging.debug("libGH: HTTP response: %s=%s", item[0], item[1])
+            logging.debug("libgh: HTTP response: %s=%s", item[0], item[1])

    soup = BeautifulSoup(data, "html.parser")

@@ -61,10 +61,6 @@
            repos[name] = {}
            uncomplete = False

-            # archived?
-            if repos_type == "archived":
-                repos[name]["archived"] = True
-
            # forked from
            if repository["isFork"]:
                repos[name]["forked_from"] = "" # The original location is not provided
@@ -111,6 +107,10 @@
            # last updated
            repos[name]["last_updated"] = repository["lastUpdated"]["timestamp"]

+            # archived?
+            if repos_type == "archived":
+                repos[name]["archived"] = True
+
        # is there a next page?
        next_page = soup.select_one('[aria-label="Next Page"]')
        if next_page is not None:
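
The `aria-label="Next Page"` selector is how the scraper walks GitHub's paginated repository listings. A minimal helper built on the same selector (BeautifulSoup is already imported in this module; the helper itself is illustrative, not part of the library):

```python
from bs4 import BeautifulSoup

def next_page_url(html):
    """Return the href of the "Next Page" link, or None on the last page."""
    soup = BeautifulSoup(html, "html.parser")
    next_page = soup.select_one('[aria-label="Next Page"]')
    return next_page.get("href") if next_page is not None else None
```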
@@ -279,7 +279,7 @@ def load_org_account(account_name, soup, cache_days, force_fetch=False, complete

    if len(account["repositories"]) != account["repositories_count"]:
        logging.warning(
-            "libGH: Loaded %d/%d repositories",
+            "libgh: Loaded %d/%d repositories",
            len(account["repositories"]),
            account["repositories_count"]
        )