Skip to content

Commit

Permalink
Fixed #29: test remote entries URLs early
Browse files Browse the repository at this point in the history
  • Loading branch information
rgaudin committed May 13, 2024
1 parent 8cc68d0 commit 8cdf5c1
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 1 deletion.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ as of 1.3.0.
- Add urls support to collections. (#59)
- Add archiveless collection.json support. (#60)

### Changed

- URL entries are checked early so that the scraper exits right away if access fails (#29)

### Fixed

- Header link to home was leading to template (#68)
Expand Down
40 changes: 39 additions & 1 deletion src/nautiluszim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
MAXIMUM_DESCRIPTION_METADATA_LENGTH,
MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH,
)
from zimscraperlib.download import save_large_file
from zimscraperlib.download import requests, save_large_file
from zimscraperlib.i18n import _, get_language_details, setlocale
from zimscraperlib.image.convertion import create_favicon
from zimscraperlib.image.probing import get_colors, is_hex_color
Expand Down Expand Up @@ -139,6 +139,9 @@ def run(self):
# fail early if supplied branding files are missing
self.check_branding_values()

# fail early if remote entries URLs are not OK
self.test_all_urls()

# download archive
if self.archive:
self.download_archive()
Expand Down Expand Up @@ -344,6 +347,41 @@ def load_collection(self):
nb_files = sum([len(i.get("files", [])) for i in self.json_collection])
logger.info(f"Collection loaded. {nb_items} items, {nb_files} files")

def test_all_urls(self, *, timeout=30):
    """Check that all URL entries in the collection respond successfully.

    Calls ``self.load_collection()`` and then issues one GET request for
    every file entry of ``self.json_collection`` that is a dict with a
    non-empty ``url`` key. Every failing URL is logged so that all
    problems are reported in a single run before giving up.

    Args:
        timeout: seconds to wait for each remote server before treating
            the URL as unreachable. Without it a silent host would block
            the scraper indefinitely.

    Raises:
        ValueError: if at least one URL is not an HTTP(S) URL, cannot be
            reached, or answers with an HTTP error status.
    """
    self.load_collection()
    failed = False

    for entry in self.json_collection:
        for file in entry.get("files") or []:
            if not isinstance(file, dict) or not file.get("url"):
                continue
            url = file["url"]

            # require a full scheme: a bare "http" prefix would also match
            # things like "httpfoo", and a non-string value cannot be fetched
            if not isinstance(url, str) or not url.startswith(
                ("http://", "https://")
            ):
                logger.error(f"- Not a valid HTTP URL: {url}")
                failed = True
                continue

            try:
                # stream=True: only status and headers are needed, the body
                # is never read
                resp = requests.get(url, stream=True, timeout=timeout)
            except requests.RequestException as exc:
                logger.error(f"- Connection Error: {url} ({exc})")
                failed = True
                continue

            # a streamed response keeps its connection open until closed
            with resp:
                try:
                    resp.raise_for_status()
                except requests.HTTPError as exc:
                    logger.error(f"- HTTP {resp.status_code}: {url} ({exc})")
                    failed = True

    if failed:
        raise ValueError("Remote entries failed access test")

def test_archiveless_collection(self):
"""Test the collection.json without archive file"""
self.load_collection()
Expand Down

0 comments on commit 8cdf5c1

Please sign in to comment.