Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Many fixes for reliability of the scraper #78

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions .github/workflows/PublishDockerDevImage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ on:
push:
branches:
- main
- reliability

jobs:
publish:
Expand Down
71 changes: 56 additions & 15 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
import re
import threading
from io import BytesIO
from typing import NamedTuple

import backoff
from kiwixstorage import KiwixStorage, NotFoundError
from pif import get_public_ip
from PIL import Image
from requests import HTTPError
from requests.exceptions import RequestException
from zimscraperlib.download import stream_file
from zimscraperlib.image.optimization import optimize_webp
from zimscraperlib.image.presets import WebpMedium
from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath
from zimscraperlib.zim import Creator

from mindtouch2zim.constants import logger, web_session
from mindtouch2zim.constants import KNOWN_BAD_ASSETS_REGEX, logger, web_session
from mindtouch2zim.errors import KnownBadAssetFailedError
from mindtouch2zim.utils import backoff_hdlr

SUPPORTED_IMAGE_MIME_TYPES = {
Expand Down Expand Up @@ -47,9 +49,22 @@

class AssetProcessor:

def __init__(self, s3_url_with_credentials: str | None) -> None:
def __init__(
self,
s3_url_with_credentials: str | None,
bad_assets_regex: str | None,
bad_assets_threshold: int,
) -> None:
self.s3_url_with_credentials = s3_url_with_credentials

bad_assets_regex = f"{bad_assets_regex}|{KNOWN_BAD_ASSETS_REGEX}"
self.bad_assets_regex = (

Check warning on line 61 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L60-L61

Added lines #L60 - L61 were not covered by tests
re.compile(bad_assets_regex, re.IGNORECASE) if bad_assets_regex else None
)
self.bad_assets_threshold = bad_assets_threshold

Check warning on line 64 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L64

Added line #L64 was not covered by tests
self._setup_s3()
self.bad_assets_count = 0
self.lock = threading.Lock()

Check warning on line 67 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L66-L67

Added lines #L66 - L67 were not covered by tests

def process_asset(
self,
Expand All @@ -62,12 +77,6 @@
asset_path=asset_path, asset_details=asset_details, creator=creator
)

@backoff.on_exception(
backoff.expo,
RequestException,
max_time=16,
on_backoff=backoff_hdlr,
)
def _process_asset_internal(
self,
asset_path: ZimPath,
Expand All @@ -89,11 +98,28 @@
content=asset_content.getvalue(),
)
break # file found and added
except HTTPError as exc:
# would make more sense to be a warning, but this is just too
# verbose, at least on geo.libretexts.org many assets are just
# missing
logger.debug(f"Ignoring {asset_path.value} due to {exc}")
except KnownBadAssetFailedError as exc:
logger.debug(f"Ignoring known bad asset for {asset_url.value}: {exc}")
except RequestException as exc:

Check warning on line 103 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L102-L103

Added lines #L102 - L103 were not covered by tests
with self.lock:
self.bad_assets_count += 1

Check warning on line 105 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L105

Added line #L105 was not covered by tests
if (
self.bad_assets_threshold >= 0
and self.bad_assets_count > self.bad_assets_threshold
):
logger.error(

Check warning on line 110 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L110

Added line #L110 was not covered by tests
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)
raise Exception( # noqa: B904

Check warning on line 114 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L114

Added line #L114 was not covered by tests
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should use more qualified exceptions such as OSError

f"Asset failure threshold ({self.bad_assets_threshold}) "
"reached, stopping execution"
)
else:
logger.warning(

Check warning on line 119 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L119

Added line #L119 was not covered by tests
f"Exception while processing asset for {asset_url.value}: "
f"{exc}"
)

def _get_header_data_for(self, url: HttpUrl) -> HeaderData:
"""Get details from headers for a given url
Expand Down Expand Up @@ -204,6 +230,12 @@
)
return asset_content

@backoff.on_exception(
backoff.expo,
RequestException,
max_time=30, # secs
on_backoff=backoff_hdlr,
)
def get_asset_content(
self, asset_path: ZimPath, asset_url: HttpUrl, *, always_fetch_online: bool
) -> BytesIO:
Expand All @@ -222,7 +254,16 @@
else:
logger.debug(f"Not optimizing, unsupported mime type: {mime_type}")

return self._download_from_online(asset_url=asset_url)
try:
return self._download_from_online(asset_url=asset_url)
except RequestException as exc:

Check warning on line 259 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L257-L259

Added lines #L257 - L259 were not covered by tests
# check if the failing download match known bad assets regex early, and if
# so raise a custom exception to escape backoff (always important to try
# once even if asset is expected to not work, but no need to loose time on
# retrying assets which are expected to be bad)
if self.bad_assets_regex and self.bad_assets_regex.findall(asset_url.value):
raise KnownBadAssetFailedError() from exc
raise

Check warning on line 266 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L265-L266

Added lines #L265 - L266 were not covered by tests

def _setup_s3(self):
if not self.s3_url_with_credentials:
Expand Down
3 changes: 2 additions & 1 deletion scraper/src/mindtouch2zim/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
HTTP_TIMEOUT_NORMAL_SECONDS = 15
HTTP_TIMEOUT_LONG_SECONDS = 30

HTML_ISSUES_WARN_ONLY = False
# Loading the CSS leads to many bad assets at these URLs, we just ignore them
KNOWN_BAD_ASSETS_REGEX = r"https?:\/\/a\.mtstatic\.com/@(cache|style)"

logger = getLogger(NAME, level=logging.DEBUG, log_format=DEFAULT_FORMAT_WITH_THREADS)

Expand Down
24 changes: 16 additions & 8 deletions scraper/src/mindtouch2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,13 +226,20 @@
)

parser.add_argument(
"--html-issues-warn-only",
help="[dev] Only log a warning when unexpected HTML is encountered. Use with "
"caution because activating this option means that ZIM HTML will probably lead "
"to online resources without user noticing it.",
action="store_true",
default=False,
dest="html_issues_warn_only",
"--bad-assets-regex",
help="Regular expression of asset URLs known to not be available. "
"Case insensitive.",
dest="bad_assets_regex",
)

parser.add_argument(

Check warning on line 235 in scraper/src/mindtouch2zim/entrypoint.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/entrypoint.py#L235

Added line #L235 was not covered by tests
"--bad-assets-threshold",
type=int,
help="[dev] Number of assets allowed to fail to download before failing the"
" scraper. Assets already excluded with --bad-assets-regex are not counted for"
" this threshold. Defaults to 10 assets.",
default=10,
dest="bad_assets_threshold",
)

args = parser.parse_args()
Expand Down Expand Up @@ -272,7 +279,8 @@
illustration_url=args.illustration_url,
s3_url_with_credentials=args.s3_url_with_credentials,
assets_workers=args.assets_workers,
html_issues_warn_only=args.html_issues_warn_only,
bad_assets_regex=args.bad_assets_regex,
bad_assets_threshold=args.bad_assets_threshold,
).run()
except SystemExit:
logger.error("Generation failed, exiting")
Expand Down
14 changes: 4 additions & 10 deletions scraper/src/mindtouch2zim/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,13 @@ class InvalidFormatError(Exception):
pass


class UnsupportedTagError(Exception):
"""An exception raised when an HTML tag is not expected to be encountered"""

pass


class UnsupportedHrefSrcError(Exception):
"""An exception raised when an href or src is not expected to be encountered"""
class NoIllustrationFoundError(Exception):
"""An exception raised when no suitable illustration has been found"""

pass


class NoIllustrationFoundError(Exception):
"""An exception raised when no suitable illustration has been found"""
class KnownBadAssetFailedError(Exception):
"""An exception raised when an asset known to be failing, failed as expected"""

pass
58 changes: 18 additions & 40 deletions scraper/src/mindtouch2zim/html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,8 @@
ZimPath,
)

import mindtouch2zim.constants
from mindtouch2zim.client import LibraryPage
from mindtouch2zim.constants import logger
from mindtouch2zim.errors import UnsupportedHrefSrcError, UnsupportedTagError
from mindtouch2zim.utils import is_better_srcset_descriptor
from mindtouch2zim.vimeo import get_vimeo_thumbnail_url

Expand All @@ -29,20 +27,20 @@


@html_rules.rewrite_attribute()
def rewrite_href_src_attributes(
def rewrite_href_src_srcset_attributes(
tag: str,
attr_name: str,
attr_value: str | None,
url_rewriter: ArticleUrlRewriter,
base_href: str | None,
):
"""Rewrite href and src attributes"""
if attr_name not in ("href", "src") or not attr_value:
if attr_name not in ("href", "src", "srcset") or not attr_value:
return
if not isinstance(url_rewriter, HtmlUrlsRewriter):
raise Exception("Expecting HtmlUrlsRewriter")
new_attr_value = None
if tag == "a":
if tag in ["a", "area"]:
rewrite_result = url_rewriter(
attr_value, base_href=base_href, rewrite_all_url=False
)
Expand All @@ -53,36 +51,17 @@
if rewrite_result.rewriten_url.startswith(url_rewriter.library_path.value)
else rewrite_result.rewriten_url
)
if not new_attr_value:
# we do not (yet) support other tags / attributes so we fail the scraper
msg = (
else:
# we remove the src/href/srcset which is not supported, to ensure we won't load
# external assets
new_attr_value = ""
logger.warning(
f"Unsupported '{attr_name}' encountered in '{tag}' tag (value: "
f"'{attr_value}') while rewriting {rewriting_context}"
)
if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
raise UnsupportedHrefSrcError(msg)
else:
logger.warning(msg)
return
return (attr_name, new_attr_value)


@html_rules.rewrite_tag()
def refuse_unsupported_tags(tag: str):
"""Stop scraper if unsupported tag is encountered"""
if tag not in ["picture"]:
return
msg = (
f"Tag {tag} is not yet supported in this scraper, found while rewriting "
f"{rewriting_context}"
)
if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
raise UnsupportedTagError(msg)
else:
logger.warning(msg)
return


YOUTUBE_IFRAME_RE = re.compile(r".*youtube(?:-\w+)*\.\w+\/embed\/(?P<id>.*?)(?:\?.*)*$")
VIMEO_IFRAME_RE = re.compile(r".*vimeo(?:-\w+)*\.\w+\/video\/(?:.*?)(?:\?.*)*$")

Expand All @@ -101,15 +80,8 @@
raise Exception("Expecting HtmlUrlsRewriter")
src = get_attr_value_from(attrs=attrs, name="src")
if not src:
msg = (
"Unsupported empty src in iframe, found while rewriting "
f"{rewriting_context}"
)
if not mindtouch2zim.constants.HTML_ISSUES_WARN_ONLY:
raise UnsupportedTagError(msg)
else:
logger.warning(msg)
return
logger.warning(f"Empty src found in iframe while rewriting {rewriting_context}")
return
image_rewriten_url = None
try:
if ytb_match := YOUTUBE_IFRAME_RE.match(src):
Expand All @@ -127,9 +99,15 @@
url_rewriter.add_item_to_download(rewrite_result)
image_rewriten_url = rewrite_result.rewriten_url
else:
logger.debug(f"iframe pointing to {src} will not have any preview")
logger.debug(
f"iframe pointing to {src} in {rewriting_context} will not "
"have any preview"
)
except Exception as exc:
logger.warning(f"Failed to rewrite iframe with src {src}", exc_info=exc)
logger.warning(

Check warning on line 107 in scraper/src/mindtouch2zim/html_rewriting.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/html_rewriting.py#L107

Added line #L107 was not covered by tests
f"Failed to rewrite iframe with src {src} in {rewriting_context}",
exc_info=exc,
)

if image_rewriten_url:
return (
Expand Down
Loading