diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index e990ae57..ffef9b99 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -98,7 +98,7 @@ jobs:
         run: python -m pip install -e ".[all]"
 
       - name: Type checking
-        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
+        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
         run: |
           mypy -p trafilatura
 
@@ -110,7 +110,7 @@ jobs:
 
       # coverage
       - name: Upload coverage to Codecov
-        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
+        if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
         uses: codecov/codecov-action@v4
         env:
           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/tests/cli_tests.py b/tests/cli_tests.py
index aeba580a..8d094445 100644
--- a/tests/cli_tests.py
+++ b/tests/cli_tests.py
@@ -225,6 +225,14 @@ def test_sysoutput():
     options = settings.args_to_extractor(args)
     assert options.format == "markdown" and options.formatting is True
     assert cli_utils.process_result("DADIDA", args, -1, options) == -1
+
+    # with counter
+    with open(
+        path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
+    ) as f:
+        teststring = f.read()
+    assert cli_utils.process_result(teststring, args, 1, options) == 2
+
     # test keeping dir structure
     testargs = ["", "-i", "myinputdir/", "-o", "test/", "--keep-dirs"]
     with patch.object(sys, "argv", testargs):
@@ -377,6 +385,9 @@ def test_cli_pipeline():
 
 def test_file_processing():
     "Test file processing pipeline on actual directories."
+    backup = settings.MAX_FILES_PER_DIRECTORY
+    settings.MAX_FILES_PER_DIRECTORY = 0
+
     # dry-run file processing pipeline
     testargs = ["", "--parallel", "1", "--input-dir", "/dev/null"]
     with patch.object(sys, "argv", testargs):
@@ -393,6 +404,8 @@ def test_file_processing():
     for f in cli_utils.generate_filelist(args.input_dir):
         cli_utils.file_processing(f, args, options=options)
 
+    settings.MAX_FILES_PER_DIRECTORY = backup
+
 
 def test_cli_config_file():
     "Test if the configuration file is loaded correctly from the CLI."
@@ -509,7 +522,7 @@ def test_crawling():
     args = cli.parse_args(testargs)
     f = io.StringIO()
     with redirect_stdout(f):
-        cli_utils.cli_crawler(args)
+        cli.process_args(args)
     assert f.getvalue() == "https://httpbun.com/html\n"
 
     spider.URL_STORE = UrlStore(compressed=False, strict=False)
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index e7c12dc0..9990a107 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -23,7 +23,7 @@
 import trafilatura.htmlprocessing
 from trafilatura import bare_extraction, extract, xml
 from trafilatura.core import Extractor
-from trafilatura.external import sanitize_tree, try_justext
+from trafilatura.external import sanitize_tree, try_justext, try_readability
 from trafilatura.main_extractor import (handle_formatting, handle_image,
                                         handle_lists, handle_paragraphs, handle_quotes,
                                         handle_table, handle_textelem)
@@ -815,7 +815,13 @@ def test_htmlprocessing():
 
 def test_extraction_options():
     '''Test the different parameters available in extract() and bare_extraction()'''
-    my_html = '<html><body><p>Text.</p><p>Text.</p></body></html>'
+    my_html = '<html><body><p>' + 'Text. '*10 + '</p></body></html>'
+    result = etree.tostring(try_readability(html.fromstring(my_html)))
+    assert len(result) > 10 and b'Text' in result
+    my_html = '<html><body><p class="Test">' + 'Text. '*10 + '</p></body></html>'
+    result = etree.tostring(try_readability(html.fromstring(my_html)))
+    assert b'Test' not in result
+    my_html = '<html><body><p>' + 'ABC def ghi jkl.</p><p>'*1000 + 'Posted on 1st Dec 2019.</p></body></html>'
     assert bare_extraction(my_html, config=ZERO_CONFIG, with_metadata=True).date is not None
     assert bare_extraction(my_html, config=NEW_CONFIG, with_metadata=True).date is None
 
diff --git a/tests/xml_tei_tests.py b/tests/xml_tei_tests.py
index 5a36b4bd..160c151b 100644
--- a/tests/xml_tei_tests.py
+++ b/tests/xml_tei_tests.py
@@ -6,7 +6,7 @@
 from lxml.etree import Element, SubElement, XMLParser, fromstring, tostring
 
 from trafilatura.metadata import Document
-from trafilatura.xml import (check_tei, write_fullheader,
+from trafilatura.xml import (check_tei, replace_element_text, write_fullheader,
                              _handle_unwanted_tails, _move_element_one_level_up,
                              _wrap_unwanted_siblings_of_div)
 
@@ -472,9 +472,36 @@ def test_handling_of_text_content_in_div():
     assert cleaned.find(".//p").text == "tail"
 
 
+def test_replace_element_text():
+    elem = Element("head")
+    elem.text = "Title"
+    elem.set("rend", "h1")
+    assert replace_element_text(elem, True) == "# Title"
+
+    elem = Element("hi")
+    elem.text = "Text"
+    elem.set("rend", "#b")
+    assert replace_element_text(elem, True) == "**Text**"
+
+    elem = Element("item")
+    elem.text = "Test text"
+    elem.tag = "item"
+    assert replace_element_text(elem, True) == "- Test text\n"
+
+    elem = Element("ref")
+    elem.text = "Link"
+    elem.set("target", "https://example.com")
+    assert replace_element_text(elem, True) == "[Link](https://example.com)"
+
+    elem = Element("ref")
+    elem.text = "Link"
+    assert replace_element_text(elem, True) == "[Link]"
+
+
 if __name__ == "__main__":
     test_publisher_added_before_availability_in_publicationStmt()
     test_unwanted_siblings_of_div_removed()
     test_tail_on_p_like_elements_removed()
     test_head_with_children_converted_to_ab()
     test_ab_with_p_parent_resolved()
+    test_replace_element_text()
diff --git a/trafilatura/external.py b/trafilatura/external.py
index 49801869..4dd1f090 100644
--- a/trafilatura/external.py
+++ b/trafilatura/external.py
@@ -138,7 +138,7 @@ def try_justext(tree: HtmlElement, url: str, target_language: str) -> _Element:
     # extract
     try:
         paragraphs = custom_justext(tree, justext_stoplist)
-    except ValueError as err:  # not an XML element: HtmlComment
+    except Exception as err:
         LOGGER.error('justext %s %s', err, url)
     else:
         for paragraph in paragraphs:
diff --git a/trafilatura/readability_lxml.py b/trafilatura/readability_lxml.py
index 96742bd0..48f651c1 100644
--- a/trafilatura/readability_lxml.py
+++ b/trafilatura/readability_lxml.py
@@ -355,7 +355,7 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
                 )
                 elem.drop_tree()
             elif elem.text_content().count(",") < 10:
-                to_remove = False
+                to_remove = True
                 counts = {
                     kind: len(elem.findall(f".//{kind}")) for kind in TEXT_CLEAN_ELEMS
                 }
@@ -376,41 +376,32 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
                 # continue
                 if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
                     reason = f'too many images ({counts["img"]})'
-                    to_remove = True
                 elif counts["li"] > counts["p"] and elem.tag not in LIST_TAGS:
                     reason = "more <li>s than <p>s"
-                    to_remove = True
                 elif counts["input"] > (counts["p"] / 3):
                     reason = "less than 3x <p>s than <input>s"
-                    to_remove = True
                 elif content_length < self.min_text_length and counts["img"] == 0:
                     reason = f"too short content length {content_length} without a single image"
-                    to_remove = True
                 elif content_length < self.min_text_length and counts["img"] > 2:
                     reason = (
                         f"too short content length {content_length} and too many images"
                     )
-                    to_remove = True
                 elif weight < 25 and link_density > 0.2:
                     reason = (
                         f"too many links {link_density:.3f} for its weight {weight}"
                     )
-                    to_remove = True
                 elif weight >= 25 and link_density > 0.5:
                     reason = (
                         f"too many links {link_density:.3f} for its weight {weight}"
                     )
-                    to_remove = True
                 elif (counts["embed"] == 1 and content_length < 75) or counts[
                     "embed"
                 ] > 1:
                     reason = (
                         "<embed>s with too short content length, or too many <embed>s"
                     )
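
Reviewer notes on two of the hunks above.

The readability_lxml.py change inverts the control flow in sanitize(): to_remove now starts out True, so every per-branch "to_remove = True" assignment becomes redundant and is dropped, and an else branch past the point where this excerpt is cut off presumably resets the flag to False when no removal reason matches. A minimal sketch of the inverted control flow under that assumption, with `counts` and two sample thresholds standing in for the real locals:

    # Illustrative sketch only, not the actual sanitize() body.
    def removal_reason(counts: dict) -> tuple[bool, str]:
        to_remove, reason = True, ""  # removal is now the default
        if counts.get("img", 0) > 1 + counts.get("p", 0) * 1.3:
            reason = f"too many images ({counts.get('img', 0)})"
        elif counts.get("input", 0) > counts.get("p", 0) / 3:
            reason = "less than 3x <p>s than <input>s"
        else:
            to_remove = False  # no reason found: keep the element
        return to_remove, reason

The new test in xml_tei_tests.py pins down the TEI-to-Markdown mapping performed by replace_element_text. A compact sketch of that mapping exactly as the assertions describe it; the helper name and signature below are hypothetical, since trafilatura's real function operates on lxml elements rather than plain strings:

    # Hypothetical helper mirroring the behaviour asserted in
    # test_replace_element_text.
    def to_markdown(tag: str, text: str, rend: str = "", target: str = "") -> str:
        if tag == "head" and rend == "h1":
            return f"# {text}"  # heading
        if tag == "hi" and rend == "#b":
            return f"**{text}**"  # bold
        if tag == "item":
            return f"- {text}\n"  # list item
        if tag == "ref":
            # link, with or without a target
            return f"[{text}]({target})" if target else f"[{text}]"
        return text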