From 4e59c8aeb36e49c58b85979a802b7009c397970d Mon Sep 17 00:00:00 2001
From: Adrien Barbaresi
Date: Fri, 29 Nov 2024 17:38:11 +0100
Subject: [PATCH] tests: extend coverage (#753)
* tests: extend coverage
* fix tests
* fix cli test
* update actions
---
.github/workflows/tests.yml | 4 ++--
tests/cli_tests.py | 15 ++++++++++++++-
tests/unit_tests.py | 16 ++++++++++++++--
tests/xml_tei_tests.py | 29 ++++++++++++++++++++++++++++-
trafilatura/external.py | 2 +-
trafilatura/readability_lxml.py | 28 ++++++++++------------------
trafilatura/utils.py | 2 +-
7 files changed, 70 insertions(+), 26 deletions(-)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index e990ae57..ffef9b99 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -98,7 +98,7 @@ jobs:
run: python -m pip install -e ".[all]"
- name: Type checking
- if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
+ if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
run: |
mypy -p trafilatura
@@ -110,7 +110,7 @@ jobs:
# coverage
- name: Upload coverage to Codecov
- if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
+ if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index aeba580a..8d094445 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -225,6 +225,14 @@ def test_sysoutput():
options = settings.args_to_extractor(args)
assert options.format == "markdown" and options.formatting is True
assert cli_utils.process_result("DADIDA", args, -1, options) == -1
+
+ # with counter
+ with open(
+ path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+ ) as f:
+ teststring = f.read()
+ assert cli_utils.process_result(teststring, args, 1, options) == 2
+
# test keeping dir structure
testargs = ["", "-i", "myinputdir/", "-o", "test/", "--keep-dirs"]
with patch.object(sys, "argv", testargs):
@@ -377,6 +385,9 @@ def test_cli_pipeline():
def test_file_processing():
"Test file processing pipeline on actual directories."
+ backup = settings.MAX_FILES_PER_DIRECTORY
+ settings.MAX_FILES_PER_DIRECTORY = 0
+
# dry-run file processing pipeline
testargs = ["", "--parallel", "1", "--input-dir", "/dev/null"]
with patch.object(sys, "argv", testargs):
@@ -393,6 +404,8 @@ def test_file_processing():
for f in cli_utils.generate_filelist(args.input_dir):
cli_utils.file_processing(f, args, options=options)
+ settings.MAX_FILES_PER_DIRECTORY = backup
+
def test_cli_config_file():
"Test if the configuration file is loaded correctly from the CLI."
@@ -509,7 +522,7 @@ def test_crawling():
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
- cli_utils.cli_crawler(args)
+ cli.process_args(args)
assert f.getvalue() == "https://httpbun.com/html\n"
spider.URL_STORE = UrlStore(compressed=False, strict=False)
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index e7c12dc0..9990a107 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -23,7 +23,7 @@
import trafilatura.htmlprocessing
from trafilatura import bare_extraction, extract, xml
from trafilatura.core import Extractor
-from trafilatura.external import sanitize_tree, try_justext
+from trafilatura.external import sanitize_tree, try_justext, try_readability
from trafilatura.main_extractor import (handle_formatting, handle_image,
handle_lists, handle_paragraphs, handle_quotes,
handle_table, handle_textelem)
@@ -815,7 +815,8 @@ def test_htmlprocessing():
def test_extraction_options():
'''Test the different parameters available in extract() and bare_extraction()'''
- my_html = ''
+ my_html = ''
+
with pytest.raises(ValueError) as err:
extract(my_html, output_format="python")
assert extract(my_html, config=NEW_CONFIG) is None
@@ -824,9 +825,20 @@ def test_extraction_options():
assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', fast=True, config=ZERO_CONFIG) is None
+
+ # justext hardening
assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
+ assert etree.tostring(try_justext(None, None, 'de')) == b'<body/>'
+
+ my_html = '<html><body><p>' + 'Text. '*10 + '</p></body></html>'
+ result = etree.tostring(try_readability(html.fromstring(my_html)))
+ assert len(result) > 10 and b'Text' in result
+ my_html = '<html><body><p class="Test">' + 'Text. '*10 + '</p></body></html>'
+ result = etree.tostring(try_readability(html.fromstring(my_html)))
+ assert b'Test' not in result
+
my_html = '<html><head/><body>' + '<p>ABC def ghi jkl.</p>'*1000 + '<p>Posted on 1st Dec 2019.</p></body></html>'
assert bare_extraction(my_html, config=ZERO_CONFIG, with_metadata=True).date is not None
assert bare_extraction(my_html, config=NEW_CONFIG, with_metadata=True).date is None
diff --git a/tests/xml_tei_tests.py b/tests/xml_tei_tests.py
index 5a36b4bd..160c151b 100644
--- a/tests/xml_tei_tests.py
+++ b/tests/xml_tei_tests.py
@@ -6,7 +6,7 @@
from lxml.etree import Element, SubElement, XMLParser, fromstring, tostring
from trafilatura.metadata import Document
-from trafilatura.xml import (check_tei, write_fullheader,
+from trafilatura.xml import (check_tei, replace_element_text, write_fullheader,
_handle_unwanted_tails, _move_element_one_level_up,
_wrap_unwanted_siblings_of_div)
@@ -472,9 +472,36 @@ def test_handling_of_text_content_in_div():
assert cleaned.find(".//p").text == "tail"
+def test_replace_element_text():
+ elem = Element("head")
+ elem.text = "Title"
+ elem.set("rend", "h1")
+ assert replace_element_text(elem, True) == "# Title"
+
+ elem = Element("hi")
+ elem.text = "Text"
+ elem.set("rend", "#b")
+ assert replace_element_text(elem, True) == "**Text**"
+
+ elem = Element("item")
+ elem.text = "Test text"
+ elem.tag = "item"
+ assert replace_element_text(elem, True) == "- Test text\n"
+
+ elem = Element("ref")
+ elem.text = "Link"
+ elem.set("target", "https://example.com")
+ assert replace_element_text(elem, True) == "[Link](https://example.com)"
+
+ elem = Element("ref")
+ elem.text = "Link"
+ assert replace_element_text(elem, True) == "[Link]"
+
+
if __name__ == "__main__":
test_publisher_added_before_availability_in_publicationStmt()
test_unwanted_siblings_of_div_removed()
test_tail_on_p_like_elements_removed()
test_head_with_children_converted_to_ab()
test_ab_with_p_parent_resolved()
+ test_replace_element_text()
diff --git a/trafilatura/external.py b/trafilatura/external.py
index 49801869..4dd1f090 100644
--- a/trafilatura/external.py
+++ b/trafilatura/external.py
@@ -138,7 +138,7 @@ def try_justext(tree: HtmlElement, url: str, target_language: str) -> _Element:
# extract
try:
paragraphs = custom_justext(tree, justext_stoplist)
- except ValueError as err: # not an XML element: HtmlComment
+ except Exception as err:
LOGGER.error('justext %s %s', err, url)
else:
for paragraph in paragraphs:
diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index 96742bd0..48f651c1 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -355,7 +355,7 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
)
elem.drop_tree()
elif elem.text_content().count(",") < 10:
- to_remove = False
+ to_remove = True
counts = {
kind: len(elem.findall(f".//{kind}")) for kind in TEXT_CLEAN_ELEMS
}
@@ -376,41 +376,32 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
# continue
if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
reason = f'too many images ({counts["img"]})'
- to_remove = True
elif counts["li"] > counts["p"] and elem.tag not in LIST_TAGS:
reason = "more <li>s than <p>s"
- to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
- to_remove = True
elif content_length < self.min_text_length and counts["img"] == 0:
reason = f"too short content length {content_length} without a single image"
- to_remove = True
elif content_length < self.min_text_length and counts["img"] > 2:
reason = (
f"too short content length {content_length} and too many images"
)
- to_remove = True
elif weight < 25 and link_density > 0.2:
reason = (
f"too many links {link_density:.3f} for its weight {weight}"
)
- to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = (
f"too many links {link_density:.3f} for its weight {weight}"
)
- to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts[
"embed"
] > 1:
reason = (
"