diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index e990ae57..ffef9b99 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -98,7 +98,7 @@ jobs:
         run: python -m pip install -e ".[all]"
 
       - name: Type checking
-        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
+        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
         run: |
           mypy -p trafilatura
 
@@ -110,7 +110,7 @@ jobs:
 
       # coverage
       - name: Upload coverage to Codecov
-        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
+        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
         uses: codecov/codecov-action@v4
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index aeba580a..8d094445 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -225,6 +225,14 @@ def test_sysoutput():
     options = settings.args_to_extractor(args)
     assert options.format == "markdown" and options.formatting is True
     assert cli_utils.process_result("DADIDA", args, -1, options) == -1
+
+    # with counter
+    with open(
+        path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+    ) as f:
+        teststring = f.read()
+    assert cli_utils.process_result(teststring, args, 1, options) == 2
+
     # test keeping dir structure
     testargs = ["", "-i", "myinputdir/", "-o", "test/", "--keep-dirs"]
     with patch.object(sys, "argv", testargs):
@@ -377,6 +385,9 @@ def test_cli_pipeline():
 
 def test_file_processing():
     "Test file processing pipeline on actual directories."
+    backup = settings.MAX_FILES_PER_DIRECTORY
+    settings.MAX_FILES_PER_DIRECTORY = 0
+
     # dry-run file processing pipeline
     testargs = ["", "--parallel", "1", "--input-dir", "/dev/null"]
     with patch.object(sys, "argv", testargs):
@@ -393,6 +404,8 @@
     for f in cli_utils.generate_filelist(args.input_dir):
         cli_utils.file_processing(f, args, options=options)
 
+    settings.MAX_FILES_PER_DIRECTORY = backup
+
 
 def test_cli_config_file():
     "Test if the configuration file is loaded correctly from the CLI."
@@ -509,7 +522,7 @@ def test_crawling():
     args = cli.parse_args(testargs)
     f = io.StringIO()
     with redirect_stdout(f):
-        cli_utils.cli_crawler(args)
+        cli.process_args(args)
     assert f.getvalue() == "https://httpbun.com/html\n"
 
     spider.URL_STORE = UrlStore(compressed=False, strict=False)
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index e7c12dc0..9990a107 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -23,7 +23,7 @@ import trafilatura.htmlprocessing
 
 from trafilatura import bare_extraction, extract, xml
 from trafilatura.core import Extractor
-from trafilatura.external import sanitize_tree, try_justext
+from trafilatura.external import sanitize_tree, try_justext, try_readability
 from trafilatura.main_extractor import (handle_formatting, handle_image,
                                         handle_lists, handle_paragraphs,
                                         handle_quotes, handle_table, handle_textelem)
@@ -815,7 +815,8 @@ def test_htmlprocessing():
 
 def test_extraction_options():
     '''Test the different parameters available in extract() and bare_extraction()'''
-    my_html = '<html><body><p>Text.</p></body></html>'
+    my_html = '<html><head><meta http-equiv="content-type" content="text/html; charset=UTF-8"/></head><body><p>Text.</p></body></html>'
+
     with pytest.raises(ValueError) as err:
         extract(my_html, output_format="python")
     assert extract(my_html, config=NEW_CONFIG) is None
@@ -824,9 +825,20 @@
     assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
     assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
     assert extract(my_html, target_language='de', fast=True, config=ZERO_CONFIG) is None
+
+    # justext hardening
     assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
+    assert etree.tostring(try_justext(None, None, 'de')) == b'<body/>'
     # assert extract(my_html) is None
 
+    # readability
+    my_html = '<html><body><p>' + 'Text. '*10 + '</p></body></html>'
+    result = etree.tostring(try_readability(html.fromstring(my_html)))
+    assert len(result) > 10 and b'Text' in result
+    my_html = '<html><body><p>' + 'Text. '*10 + '<a href="test.html">Test</a></p></body></html>'
+    result = etree.tostring(try_readability(html.fromstring(my_html)))
+    assert b'Test' not in result
+
     my_html = '<html><body>' + '<p>ABC def ghi jkl.</p>'*1000 + '<p>Posted on 1st Dec 2019<.</p></body></html>'
     assert bare_extraction(my_html, config=ZERO_CONFIG, with_metadata=True).date is not None
     assert bare_extraction(my_html, config=NEW_CONFIG, with_metadata=True).date is None
diff --git a/tests/xml_tei_tests.py b/tests/xml_tei_tests.py
index 5a36b4bd..160c151b 100644
--- a/tests/xml_tei_tests.py
+++ b/tests/xml_tei_tests.py
@@ -6,7 +6,7 @@
 from lxml.etree import Element, SubElement, XMLParser, fromstring, tostring
 
 from trafilatura.metadata import Document
-from trafilatura.xml import (check_tei, write_fullheader,
+from trafilatura.xml import (check_tei, replace_element_text, write_fullheader,
                              _handle_unwanted_tails, _move_element_one_level_up,
                              _wrap_unwanted_siblings_of_div)
 
@@ -472,9 +472,36 @@ def test_handling_of_text_content_in_div():
     assert cleaned.find(".//p").text == "tail"
 
 
+def test_replace_element_text():
+    elem = Element("head")
+    elem.text = "Title"
+    elem.set("rend", "h1")
+    assert replace_element_text(elem, True) == "# Title"
+
+    elem = Element("hi")
+    elem.text = "Text"
+    elem.set("rend", "#b")
+    assert replace_element_text(elem, True) == "**Text**"
+
+    elem = Element("item")
+    elem.text = "Test text"
+    elem.tag = "item"
+    assert replace_element_text(elem, True) == "- Test text\n"
+
+    elem = Element("ref")
+    elem.text = "Link"
+    elem.set("target", "https://example.com")
+    assert replace_element_text(elem, True) == "[Link](https://example.com)"
+
+    elem = Element("ref")
+    elem.text = "Link"
+    assert replace_element_text(elem, True) == "[Link]"
+
+
 if __name__ == "__main__":
     test_publisher_added_before_availability_in_publicationStmt()
     test_unwanted_siblings_of_div_removed()
     test_tail_on_p_like_elements_removed()
     test_head_with_children_converted_to_ab()
     test_ab_with_p_parent_resolved()
+    test_replace_element_text()
diff --git a/trafilatura/external.py b/trafilatura/external.py
index 49801869..4dd1f090 100644
--- a/trafilatura/external.py
+++ b/trafilatura/external.py
@@ -138,7 +138,7 @@ def try_justext(tree: HtmlElement, url: str, target_language: str) -> _Element:
     # extract
     try:
         paragraphs = custom_justext(tree, justext_stoplist)
-    except ValueError as err:  # not an XML element: HtmlComment
+    except Exception as err:
         LOGGER.error('justext %s %s', err, url)
     else:
         for paragraph in paragraphs:
diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index 96742bd0..48f651c1 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -355,7 +355,7 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
                 )
                 elem.drop_tree()
             elif elem.text_content().count(",") < 10:
-                to_remove = False
+                to_remove = True
                 counts = {
                     kind: len(elem.findall(f".//{kind}")) for kind in TEXT_CLEAN_ELEMS
                 }
@@ -376,41 +376,32 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
                 #    continue
                 if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
                     reason = f'too many images ({counts["img"]})'
-                    to_remove = True
                 elif counts["li"] > counts["p"] and elem.tag not in LIST_TAGS:
                     reason = "more <li>s than <p>s"
-                    to_remove = True
                 elif counts["input"] > (counts["p"] / 3):
                     reason = "less than 3x <p>s than <input>s"
-                    to_remove = True
                 elif content_length < self.min_text_length and counts["img"] == 0:
                     reason = f"too short content length {content_length} without a single image"
-                    to_remove = True
                 elif content_length < self.min_text_length and counts["img"] > 2:
                     reason = (
                         f"too short content length {content_length} and too many images"
                     )
-                    to_remove = True
                 elif weight < 25 and link_density > 0.2:
                     reason = (
                         f"too many links {link_density:.3f} for its weight {weight}"
                     )
-                    to_remove = True
                 elif weight >= 25 and link_density > 0.5:
                     reason = (
                         f"too many links {link_density:.3f} for its weight {weight}"
                     )
-                    to_remove = True
                 elif (counts["embed"] == 1 and content_length < 75) or counts[
                     "embed"
                 ] > 1:
                     reason = (
                         "<embed>s with too short content length, or too many <embed>s"
                     )
-                    to_remove = True
                 elif not content_length:
                     reason = "no content"
-                    to_remove = True
 
                 # find x non empty preceding and succeeding siblings
                 siblings = []
@@ -430,17 +421,18 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
                 if siblings and sum(siblings) > 1000:
                     to_remove = False
                     allowed.update(elem.iter("table", "ul", "div", "section"))
+                else:
+                    to_remove = False
 
                 if to_remove:
                     elem.drop_tree()
-                    if LOGGER.isEnabledFor(logging.DEBUG):
-                        LOGGER.debug(
-                            "Removed %6.3f %s with weight %s cause it has %s.",
-                            score,
-                            elem.tag,
-                            weight,
-                            reason or "",
-                        )
+                    LOGGER.debug(
+                        "Removed %6.3f %s with weight %s cause it has %s.",
+                        score,
+                        elem.tag,
+                        weight,
+                        reason or "",
+                    )
 
         self.doc = node
         return _tostring(self.doc)
diff --git a/trafilatura/utils.py b/trafilatura/utils.py
index aae37d7f..7db53889 100644
--- a/trafilatura/utils.py
+++ b/trafilatura/utils.py
@@ -405,7 +405,7 @@ def language_classifier(temp_text: str, temp_comments: str) -> Optional[str]:
             if len(temp_text) > len(temp_comments)
             else py3langid.classify(temp_comments)
         )
-    else:
+    else:  # pragma: no cover
        LOGGER.warning('Language detector not installed, skipping detection')
        result = None
    return result  # type: ignore[no-any-return]
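
Note on the test_file_processing() hunks: the backup/restore of
settings.MAX_FILES_PER_DIRECTORY is done by hand, so the original value
leaks into later tests if an assertion fails before the restore line runs.
A minimal exception-safe sketch, reusing the patch.object helper that
cli_tests.py already imports (illustrative only, not part of the patch):

    from unittest.mock import patch

    from trafilatura import settings

    def test_file_processing_no_limit():
        # the setting is restored when the block exits, even on failure
        with patch.object(settings, "MAX_FILES_PER_DIRECTORY", 0):
            assert settings.MAX_FILES_PER_DIRECTORY == 0
            # ... run generate_filelist() / file_processing() as in the patch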
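
Note on the new test_replace_element_text(): the expected strings document
the TEI-to-Markdown mapping in trafilatura.xml.replace_element_text: a
<head rend="h1"> element becomes an ATX heading, <hi rend="#b"> bold text,
an <item> a list entry, and a <ref> a link. A quick check mirroring one of
the assertions added above (same function and arguments as in the test):

    from lxml.etree import Element

    from trafilatura.xml import replace_element_text

    ref = Element("ref")
    ref.text = "Link"
    ref.set("target", "https://example.com")
    print(replace_element_text(ref, True))  # [Link](https://example.com)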