diff --git a/LIBGH.1.md b/LIBGH.1.md
index a2683cd..6bd5bf3 100644
--- a/LIBGH.1.md
+++ b/LIBGH.1.md
@@ -10,7 +10,6 @@ libgh - GitHub scraping tool
 \[--from\]
 \[--json|-j\]
 \[--prune|-p\]
-\[--repo|-r\]
 \[--topics\]
 \[--xml|-x\]
 \[--debug\]
@@ -23,8 +22,8 @@ The alias **lgh** is also available to shorten the command name.
 ## DESCRIPTION
 
 The **libgh** command-line utility scrapes data from a list of GitHub
-accounts (either personal or organizational) or repositories if the
-*--repo|-r* option is used.
+accounts (either personal or organizational) or repositories (in
+account/repository form).
 
 By default this data is returned as pretty-printed text, or JSON data
 if the *--json|-j* option is used, or XML data if the *--xml|-x* option
@@ -59,7 +58,6 @@ Options | Use
 --from|Load repositories when forked_from is blank
 --json\|-j|Switch to JSON output instead of plain text
 --prune\|-p|Prune cache items older than DAYS and cache index
---repo\|-r|Process repositories instead of accounts
 --topics|Load repositories when there are missing topics
 --xml\|-x|Switch to XML output instead of plain text
 --debug|Enable debug mode
diff --git a/README.md b/README.md
index eb738c6..da5974e 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-[![Servier Inspired](https://raw.githubusercontent.com/servierhub/.github/main/badges/inspired.svg)](https://github.com/ServierHub/)
 [![PyPI package](https://repology.org/badge/version-for-repo/pypi/python:pnu-libgh.svg)](https://repology.org/project/python:pnu-libgh/versions)
+[![Servier Inspired](https://raw.githubusercontent.com/servierhub/.github/main/badges/inspired.svg)](https://github.com/ServierHub/)
 
 # libGH(1), libGH(3) - GitHub scraping tool and library
diff --git a/TODO.md b/TODO.md
index c22bfd0..99aa56c 100644
--- a/TODO.md
+++ b/TODO.md
@@ -5,8 +5,6 @@ Feel free to [submit your own ideas!](https://github.com/HubTou/libgh/discussions)
 * Move libpnu2.py code into [HubTou/libpnu](https://github.com/HubTou/libpnu)
 
 ## Probable evolutions
-* libpnu2/get_url_bdata:
-  * Load browser signature from a file at first call
 * Fetch:
   * Accounts from:
     * Followers
@@ -16,6 +14,8 @@ Feel free to [submit your own ideas!](https://github.com/HubTou/libgh/discussions)
   * Repos from:
     * Forks
   * Instructions to do so will be passed with the "complete" parameter.
+* libpnu2/get_url_bdata:
+  * Load browser signature from a file at first call
 
 ## Possible evolutions
 * Fetch:
@@ -26,6 +26,13 @@ Feel free to [submit your own ideas!](https://github.com/HubTou/libgh/discussions)
     * Stars
   * Sponsoring
 * Add GitHub authenticated mode
+* libpnu2/get_url_bdata:
+  * Always create an index.txt file for the cache
+  * Replace the *cache_index* parameter with a *private* parameter
+    indicating if source URLs are to be mentioned
+  * Write a cache expiration delay (in days) on each line
+  * Make the *cache_days* parameter optional in *prune_cache()*,
+    using that per-line cache expiration delay by default
 
 ## Improbable evolutions
 * A *--all|-a* option to load all repositories individually
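The last block of *libpnu2/get_url_bdata* ideas in TODO.md above sketches a richer cache index. Purely as a hypothetical illustration of how those items could fit together (the tab-separated URL/timestamp/expiration-days line format and every name below are invented for the example, not taken from the current code):

```python
import os
import time

def parse_index_line(line):
    """Split a hypothetical 'URL<TAB>timestamp<TAB>expiration days' index line"""
    url, timestamp, days = line.split("\t")
    return url, float(timestamp), int(days)

def prune_cache_sketch(cache_dir, cache_days=None):
    """Prune index entries, defaulting to each line's own expiration delay"""
    index_name = os.path.join(cache_dir, "index.txt")
    if not os.path.isfile(index_name):
        return
    now = time.time()
    kept = []
    with open(index_name, encoding="utf-8", errors="ignore") as file:
        for line in file.read().splitlines():
            url, timestamp, days = parse_index_line(line)
            # An explicit cache_days argument overrides the per-line delay
            max_age = (cache_days if cache_days is not None else days) * 24 * 60 * 60
            if now - timestamp <= max_age:
                kept.append(line)
    with open(index_name, "w", encoding="utf-8") as file:
        file.write("\n".join(kept) + "\n")
```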
diff --git a/man/libgh.1 b/man/libgh.1
index 6080825..9920aa8 100644
--- a/man/libgh.1
+++ b/man/libgh.1
@@ -1,4 +1,4 @@
-.Dd May 7, 2024
+.Dd May 18, 2024
 .Dt LIBGH 1
 .Os
 .Sh NAME
@@ -11,7 +11,6 @@
 .Op Fl \-from
 .Op Fl \-json|\-j
 .Op Fl \-prune|\-p
-.Op Fl \-repo|\-r
 .Op Fl \-topics
 .Op Fl \-xml|\-x
 .Op Fl \-debug
@@ -28,9 +27,8 @@ is also available to shorten the command name.
 The
 .Nm
 command\-line utility scrapes data from a list of GitHub
-accounts (either personal or organizational) or repositories if the
-.Op Fl \-repo|\-r
-option is used.
+accounts (either personal or organizational) or repositories (in
+account/repository form).
 .Pp
 By default this data is returned as pretty\-printed text, or JSON data
 if the
@@ -85,9 +83,6 @@ Switch to JSON output instead of plain text
 .Op Fl \-prune|\-p
 Prune cache items older than DAYS and cache index
 .Pp
-.Op Fl \-repo|\-r
-Process repositories instead of accounts
-.Pp
 .Op Fl \-topics
 Load repositories when there are missing topics
 .Pp
diff --git a/setup.cfg b/setup.cfg
index 6ae9bd7..0101c53 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -3,7 +3,7 @@ name = pnu-libgh
 description = GitHub scraping library and tool
 long_description = file: README.md
 long_description_content_type = text/markdown
-version = 0.9.0
+version = 0.9.1
 license = BSD 3-Clause License
 license_files = LICENSE
 author = Hubert Tournier
diff --git a/src/libgh/accounts.py b/src/libgh/accounts.py
index f65eb46..73da117 100644
--- a/src/libgh/accounts.py
+++ b/src/libgh/accounts.py
@@ -29,11 +29,11 @@ def load_account(account_name, cache_days, force_fetch=False, complete=[]):
             max_per_hour=REQUESTS_PER_HOUR
         )
     except (LookupError, PermissionError) as error:
-        logging.error("libGH: %s", error)
+        logging.error("libgh: %s", error)
         return account
 
     for item in response:
         if item[0].startswith("x-ratelimit"):
-            logging.debug("libGH: HTTP response: %s=%s", item[0], item[1])
+            logging.debug("libgh: HTTP response: %s=%s", item[0], item[1])
 
     soup = BeautifulSoup(data, "html.parser")
diff --git a/src/libgh/libpnu2.py b/src/libgh/libpnu2.py
index 7e23bef..5fd2f3e 100644
--- a/src/libgh/libpnu2.py
+++ b/src/libgh/libpnu2.py
@@ -36,7 +36,7 @@ def _count_from(current_time, requests_times):
     one_hour = 60 * 60 # seconds
     one_minute = 60 # seconds
 
-    # The requests_times is not necessarily ordered by ascending timestamps
+    # The requests_times list is not necessarily ordered by ascending timestamps...
     for request_time in requests_times:
         time_diff = current_time - request_time
         if time_diff <= one_day:
@@ -141,13 +141,24 @@ def get_url_bdata(
         last_hour,
         last_minute
     )
+    slowing_down = False
     if last_day >= max_per_day > 0:
-        raise PermissionError(f"Max requests per day reached for '{website}'")
+        logging.debug("libpnu/get_url_bdata: Max requests per day reached. Sleeping for 1 day")
+        time.sleep(24 * 60 * 60)
+        slowing_down = True
     if last_hour >= max_per_hour > 0:
-        raise PermissionError(f"Max requests per hour reached for '{website}'")
+        logging.debug(
+            "libpnu/get_url_bdata: Max requests per hour reached. Sleeping for 1 hour"
+        )
+        time.sleep(60 * 60)
+        slowing_down = True
     if last_minute >= max_per_minute > 0:
-        logging.debug("libpnu/get_url_bdata: Slowing down URL fetching. Sleeping for 1 minute")
+        logging.debug(
+            "libpnu/get_url_bdata: Max requests per minute reached. Sleeping for 1 minute"
+        )
         time.sleep(60)
+        slowing_down = True
+    if slowing_down:
         current_time = time.time()
         current_date = datetime.datetime.fromtimestamp(current_time)
@@ -245,6 +256,7 @@ def prune_cache(cache_dir, cache_days):
         # Load the index file if it exists
         index_name = f"{directory}{os.sep}index.txt"
+        lines = []
         if os.path.isfile(index_name):
            with open(index_name, encoding="utf-8", errors="ignore") as file:
                lines = file.read().splitlines()
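The get_url_bdata() changes above turn hard rate-limit failures into waits: where the function previously raised PermissionError once the per-day or per-hour request budget was spent, it now sleeps for the corresponding period, notes that it slowed down, and recomputes its request counters over the shifted time windows before fetching. A condensed, self-contained sketch of that pattern (the function name and the plain list of timestamps are simplifications of the real bookkeeping, not the library's API):

```python
import time

def respect_budgets(request_times, max_per_minute=0, max_per_hour=0, max_per_day=0):
    """Sleep instead of failing when a request budget is exhausted.

    request_times is a list of epoch timestamps of past requests.
    Returns True when a sleep occurred, so the caller knows to
    recompute its counters over the shifted time windows.
    """
    def requests_within(seconds):
        now = time.time()
        return sum(1 for t in request_times if now - t <= seconds)

    slowing_down = False
    if max_per_day > 0 and requests_within(24 * 60 * 60) >= max_per_day:
        time.sleep(24 * 60 * 60)  # previously: raise PermissionError
        slowing_down = True
    if max_per_hour > 0 and requests_within(60 * 60) >= max_per_hour:
        time.sleep(60 * 60)  # previously: raise PermissionError
        slowing_down = True
    if max_per_minute > 0 and requests_within(60) >= max_per_minute:
        time.sleep(60)  # this one-minute wait already existed
        slowing_down = True
    return slowing_down
```

A budget of zero means "no limit", mirroring the chained comparisons such as `last_day >= max_per_day > 0` in the diff.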
diff --git a/src/libgh/main.py b/src/libgh/main.py
index dfcca8d..3a8294f 100644
--- a/src/libgh/main.py
+++ b/src/libgh/main.py
@@ -19,11 +19,10 @@ from .libpnu2 import get_url_bdata, get_url_data, prune_cache, collection2xml
 
 # Version string used by the what(1) and ident(1) commands:
-ID = "@(#) $Id: libgh - GitHub scraping tool v0.9.0 (May 7, 2024) by Hubert Tournier $"
+ID = "@(#) $Id: libgh - GitHub scraping tool v0.9.1 (May 18, 2024) by Hubert Tournier $"
 
 # Default parameters. Can be overridden by environment variables, then command line options
 parameters = {
-    "Process repositories": False,
     "Prune cache": False,
     "Force fetching URL": False,
     "Cache days": CACHE_DAYS,
@@ -38,7 +37,7 @@ def _display_help():
     """ Display usage and help """
     #pylint: disable=C0301
     print("usage: libgh [--debug] [--help|-?] [--version]", file=sys.stderr)
-    print("             [--from] [--json|-j] [--repo|-r] [--topics] [--xml|-x]", file=sys.stderr)
+    print("             [--from] [--json|-j] [--topics] [--xml|-x]", file=sys.stderr)
     print("             [--days|-d DAYS] [--force|-f] [--prune|-p]", file=sys.stderr)
     print("             [--] account_or_repo [...]", file=sys.stderr)
     print("  ------------------  --------------------------------------------------", file=sys.stderr)
@@ -47,7 +46,6 @@ def _display_help():
     print("  --from              Load repositories when forked_from is blank", file=sys.stderr)
     print("  --json|-j           Switch to JSON output instead of plain text", file=sys.stderr)
     print("  --prune|-p          Prune cache items older than DAYS and cache index", file=sys.stderr)
-    print("  --repo|-r           Process repositories instead of accounts", file=sys.stderr)
     print("  --topics            Load repositories when there are missing topics", file=sys.stderr)
     print("  --xml|-x            Switch to XML output instead of plain text", file=sys.stderr)
     print("  --debug             Enable debug mode", file=sys.stderr)
@@ -67,7 +65,7 @@ def _process_command_line():
     # option letters followed by : expect an argument
     # same for option strings followed by =
-    character_options = "d:fjprx?"
+    character_options = "d:fjpx?"
     string_options = [
         "days=",
         "debug",
         "force",
         "from",
         "help",
         "json",
         "prune",
-        "repo",
         "topics",
         "version",
         "xml",
@@ -126,11 +123,6 @@ def _process_command_line():
         elif option in ("--prune", "-p"):
             parameters["Prune cache"] = True
-            parameters["Process repositories"] = False
-
-        elif option in ("--repo", "-r"):
-            parameters["Process repositories"] = True
-            parameters["Prune cache"] = False
 
         elif option == "--topics":
             parameters["Complete partial"].append("topics")
@@ -154,44 +146,50 @@ def main():
     if parameters["Prune cache"]:
         prune_cache(CACHE_DIR, parameters["Cache days"])
 
-    elif parameters["Process repositories"]:
-        if not arguments:
-            _display_help()
-        else:
-            repositories = load_repositories(
-                arguments,
-                parameters["Cache days"],
-                force_fetch=parameters["Force fetching URL"],
-            )
-            if parameters["JSON output"]:
-                json.dump(repositories, sys.stdout)
-                print()
-            elif parameters["XML output"]:
-                xml = collection2xml(repositories, name="repositories")
-                for line in xml:
-                    print(line)
-            else:
-                pprint.pprint(repositories, sort_dicts=False)
+    if not arguments:
+        _display_help()
     else:
-        if not arguments:
-            _display_help()
-        else:
-            accounts = load_accounts(
-                arguments,
+        accounts = []
+        repositories = []
+        for argument in arguments:
+            if '/' in argument:
+                repositories.append(argument)
+            else:
+                accounts.append(argument)
+
+        data = {}
+        if accounts:
+            data = load_accounts(
+                accounts,
                 parameters["Cache days"],
                 force_fetch=parameters["Force fetching URL"],
                 complete=parameters["Complete partial"],
             )
-            if parameters["JSON output"]:
-                json.dump(accounts, sys.stdout)
-                print()
-            elif parameters["XML output"]:
-                xml = collection2xml(accounts, name="accounts")
-                for line in xml:
-                    print(line)
-            else:
-                pprint.pprint(accounts, sort_dicts=False)
+        if repositories:
+            data2 = load_repositories(
+                repositories,
+                parameters["Cache days"],
+                force_fetch=parameters["Force fetching URL"],
+            )
+
+            # Perform a nested dictionary update
+            for account, account_value in data2.items():
+                if account in data:
+                    for repository, repository_value in account_value["repositories"].items():
+                        data[account]["repositories"][repository] = repository_value
+                else:
+                    data[account] = account_value
+
+        if parameters["JSON output"]:
+            json.dump(data, sys.stdout)
+            print()
+        elif parameters["XML output"]:
+            xml = collection2xml(data, name="GitHub")
+            for line in xml:
+                print(line)
+        else:
+            pprint.pprint(data, sort_dicts=False)
 
     sys.exit(0)
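With this restructuring, account names and account/repository arguments can be mixed freely on the command line: main() splits them on the presence of a '/', loads each group, then folds the per-repository results into the per-account results. A minimal, runnable sketch of that nested update, with hypothetical placeholder data standing in for what load_accounts() and load_repositories() really return (the ServierHub/demo entry is invented for the example):

```python
# Hypothetical sample shaped like load_accounts() output
data = {"HubTou": {"repositories": {"libgh": {"description": "..."}}}}

# Hypothetical sample shaped like load_repositories() output
data2 = {
    "HubTou": {"repositories": {"libpnu": {"description": "..."}}},
    "ServierHub": {"repositories": {"demo": {"description": "..."}}},
}

# Nested dictionary update, as in main(): merge repositories into
# already-loaded accounts, and add whole new accounts otherwise
for account, account_value in data2.items():
    if account in data:
        for repository, repository_value in account_value["repositories"].items():
            data[account]["repositories"][repository] = repository_value
    else:
        data[account] = account_value

assert set(data["HubTou"]["repositories"]) == {"libgh", "libpnu"}
assert "ServierHub" in data
```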
diff --git a/src/libgh/organization_account.py b/src/libgh/organization_account.py
index f3a46ae..ce5486c 100644
--- a/src/libgh/organization_account.py
+++ b/src/libgh/organization_account.py
@@ -37,11 +37,11 @@ def load_org_repositories(
             max_per_hour=REQUESTS_PER_HOUR
         )
     except (LookupError, PermissionError) as error:
-        logging.error("libGH: %s", error)
+        logging.error("libgh: %s", error)
         return repos
 
     for item in response:
         if item[0].startswith("x-ratelimit"):
-            logging.debug("libGH: HTTP response: %s=%s", item[0], item[1])
+            logging.debug("libgh: HTTP response: %s=%s", item[0], item[1])
 
     soup = BeautifulSoup(data, "html.parser")
@@ -61,10 +61,6 @@ def load_org_repositories(
         repos[name] = {}
         uncomplete = False
 
-        # archived?
-        if repos_type == "archived":
-            repos[name]["archived"] = True
-
         # forked from
         if repository["isFork"]:
             repos[name]["forked_from"] = "" # The original location is not provided
@@ -111,6 +107,10 @@ def load_org_repositories(
         # last updated
         repos[name]["last_updated"] = repository["lastUpdated"]["timestamp"]
 
+        # archived?
+        if repos_type == "archived":
+            repos[name]["archived"] = True
+
     # is there a next page?
     next_page = soup.select_one('[aria-label="Next Page"]')
     if next_page is not None:
@@ -279,7 +279,7 @@ def load_org_account(account_name, soup, cache_days, force_fetch=False, complete
     if len(account["repositories"]) != account["repositories_count"]:
         logging.warning(
-            "libGH: Loaded %d/%d repositories",
+            "libgh: Loaded %d/%d repositories",
             len(account["repositories"]),
             account["repositories_count"]
         )
diff --git a/src/libgh/personal_account.py b/src/libgh/personal_account.py
index 304c271..7737212 100644
--- a/src/libgh/personal_account.py
+++ b/src/libgh/personal_account.py
@@ -11,6 +11,7 @@
 from .common import GITHUB_URL, CACHE_DIR, REQUESTS_MIN_DELAY, REQUESTS_PER_HOUR
 from .libpnu2 import get_url_data
+from .repositories import load_repository
 
 ####################################################################################################
 def load_user_repositories(user_name, url, page, cache_days, force_fetch=False, complete=[]):
@@ -28,11 +29,11 @@ def load_user_repositories(user_name, url, page, cache_days, force_fetch=False,
             max_per_hour=REQUESTS_PER_HOUR
         )
     except (LookupError, PermissionError) as error:
-        logging.error("libGH: %s", error)
+        logging.error("libgh: %s", error)
         return repos
 
     for item in response:
         if item[0].startswith("x-ratelimit"):
-            logging.debug("libGH: HTTP response: %s=%s", item[0], item[1])
+            logging.debug("libgh: HTTP response: %s=%s", item[0], item[1])
 
     soup = BeautifulSoup(data, "html.parser")
@@ -45,21 +46,6 @@ def load_user_repositories(user_name, url, page, cache_days, force_fetch=False,
         name = html.text.strip()
         repos[name] = {}
 
-        # archived?
-        li_class = item.get("class")
-        if "archived" in li_class:
-            repos[name]["archived"] = True
-
-        # forked from
-        html = item.select_one('[class="Link--muted Link--inTextBlock"]')
-        if html is not None:
-            repos[name]["forked_from"] = html.text.strip()
-
-        # description
-        html = item.select_one('[itemprop="description"]')
-        if html is not None:
-            repos[name]["description"] = html.text.strip()
-
         # topics
         repos[name]["topics"] = []
         html = item.select('[data-octo-click="topic_click"]')
@@ -67,34 +53,55 @@ def load_user_repositories(user_name, url, page, cache_days, force_fetch=False,
             for html2 in html:
                 repos[name]["topics"].append(html2.text.strip())
 
-        # programming language
-        html = item.select_one('[itemprop="programmingLanguage"]')
-        if html is not None:
-            repos[name]["programming_language"] = html.text.strip()
+        # when there are 7 topics, it's probable that GitHub is only showing the first ones
+        # (can also happen with fewer when the topics' character length is large)
+        if "topics" in repos[name] and len(repos[name]["topics"]) == 7 and "topics" in complete:
+            repos[name] = load_repository(user_name, name, cache_days, force_fetch)
 
-        # stargazers
-        repos[name]["stargazers"] = {}
-        html = item.select_one('[class="octicon octicon-star"]')
-        if html is None:
-            repos[name]["stargazers_count"] = 0
-        else:
-            html2 = html.next_sibling
-            repos[name]["stargazers_count"] = int(html2.text.strip().replace(',', ''))
-
-        # forks
-        repos[name]["forks"] = {}
-        html = item.select_one('[class="octicon octicon-repo-forked"]')
-        if html is None:
-            repos[name]["forks_count"] = 0
         else:
-            html2 = html.next_sibling
-            repos[name]["forks_count"] = int(html2.text.strip().replace(',', ''))
+            # archived?
+            li_class = item.get("class")
+            if "archived" in li_class:
+                repos[name]["archived"] = True
+
+            # forked from
+            html = item.select_one('[class="Link--muted Link--inTextBlock"]')
+            if html is not None:
+                repos[name]["forked_from"] = html.text.strip()
+
+            # description
+            html = item.select_one('[itemprop="description"]')
+            if html is not None:
+                repos[name]["description"] = html.text.strip()
+
+            # programming language
+            html = item.select_one('[itemprop="programmingLanguage"]')
+            if html is not None:
+                repos[name]["programming_language"] = html.text.strip()
+
+            # stargazers
+            repos[name]["stargazers"] = {}
+            html = item.select_one('[class="octicon octicon-star"]')
+            if html is None:
+                repos[name]["stargazers_count"] = 0
+            else:
+                html2 = html.next_sibling
+                repos[name]["stargazers_count"] = int(html2.text.strip().replace(',', ''))
+
+            # forks
+            repos[name]["forks"] = {}
+            html = item.select_one('[class="octicon octicon-repo-forked"]')
+            if html is None:
+                repos[name]["forks_count"] = 0
+            else:
+                html2 = html.next_sibling
+                repos[name]["forks_count"] = int(html2.text.strip().replace(',', ''))
 
-        # license
-        html = item.select_one('[class*="octicon octicon-law"]')
-        if html is not None:
-            html2 = html.next_sibling
-            repos[name]["license"] = html2.text.strip()
+            # license
+            html = item.select_one('[class*="octicon octicon-law"]')
+            if html is not None:
+                html2 = html.next_sibling
+                repos[name]["license"] = html2.text.strip()
 
         # updated
         html = item.select_one('relative-time')
@@ -358,7 +365,7 @@ def load_user_account(account_name, soup, cache_days, force_fetch=False, complet
     if len(account["repositories"]) != account["repositories_count"]:
         logging.warning(
-            "libGH: Loaded %d/%d repositories",
+            "libgh: Loaded %d/%d repositories",
             len(account["repositories"]),
             account["repositories_count"]
         )
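The new branch in load_user_repositories() works around a listing limit: a GitHub account page shows only the first few topics of each repository, so when exactly 7 topics were scraped and the caller asked for complete topics (the --topics option), the repository's own page is fetched via load_repository() instead. A trimmed illustration of the trigger condition (the repos content and topic names below are hypothetical):

```python
# Hypothetical scraped entry: exactly 7 topics suggests truncation
repos = {"libgh": {"topics": ["cli", "github", "library", "pnu",
                              "python", "scraping", "tool"]}}
complete = ["topics"]  # set when the --topics option is used

for name, repo in repos.items():
    if len(repo.get("topics", [])) == 7 and "topics" in complete:
        # Fall back to the repository page, which lists all topics; the real
        # code calls load_repository(user_name, name, cache_days, force_fetch)
        print(f"would refetch {name} to get the full topic list")
```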
diff --git a/src/libgh/repositories.py b/src/libgh/repositories.py
index 0fd9951..2d4b458 100644
--- a/src/libgh/repositories.py
+++ b/src/libgh/repositories.py
@@ -27,11 +27,11 @@ def load_repository(account_name, repository_name, cache_days, force_fetch=False
             max_per_hour=REQUESTS_PER_HOUR
         )
     except (LookupError, PermissionError) as error:
-        logging.error("libGH: %s", error)
+        logging.error("libgh: %s", error)
         return repository
 
     for item in response:
         if item[0].startswith("x-ratelimit"):
-            logging.debug("libGH: HTTP response: %s=%s", item[0], item[1])
+            logging.debug("libgh: HTTP response: %s=%s", item[0], item[1])
 
     soup = BeautifulSoup(data, "html.parser")
@@ -205,22 +205,26 @@ def load_repository(account_name, repository_name, cache_days, force_fetch=False
 ####################################################################################################
 def load_repositories(repositories_list, cache_days, force_fetch=False):
     """ Returns a dictionary of repository information """
-    repositories = {}
+    accounts = {}
     for item in repositories_list:
         if item.count('/') == 1:
-            account_name = item.split('/')[0]
-            repository_name = item.split('/')[1]
-            repositories[item] = load_repository(
-                account_name,
-                repository_name,
+            account = item.split('/')[0]
+            repository = item.split('/')[1]
+
+            if account not in accounts:
+                accounts[account] = {"repositories": {}}
+
+            accounts[account]["repositories"][repository] = load_repository(
+                account,
+                repository,
                 cache_days,
                 force_fetch=force_fetch
            )
         else:
             logging.error(
-                "libGH: Repositories parameters must be in 'account/repo' form. '%s' discarded",
+                "libgh: Repository parameters must be in 'account/repo' form. '%s' discarded",
                 item
             )
 
-    return repositories
+    return accounts
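After this change, load_repositories() no longer returns a flat dictionary keyed by 'account/repo' strings; it groups results per account under a "repositories" key, the same shape load_accounts() produces, which is what lets main() merge the two. A small sketch of the new grouping, with a placeholder standing in for the scraped repository fields:

```python
# Input arguments in account/repository form
repositories_list = ["HubTou/libgh", "HubTou/libpnu"]

# Before: {"HubTou/libgh": {...}, "HubTou/libpnu": {...}}
# After, as in the new code (a placeholder dict stands in for the
# fields load_repository() actually scrapes):
accounts = {}
for item in repositories_list:
    account, repository = item.split('/')
    if account not in accounts:
        accounts[account] = {"repositories": {}}
    accounts[account]["repositories"][repository] = {"description": "..."}

assert list(accounts) == ["HubTou"]
assert sorted(accounts["HubTou"]["repositories"]) == ["libgh", "libpnu"]
```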