Skip to content

Commit

Permalink
tests: extend coverage (#753)
Browse files Browse the repository at this point in the history
* tests: extend coverage

* fix tests

* fix cli test

* update actions
  • Loading branch information
adbar authored Nov 29, 2024
1 parent 117ba8b commit 4e59c8a
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 26 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ jobs:
run: python -m pip install -e ".[all]"

- name: Type checking
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
run: |
mypy -p trafilatura
Expand All @@ -110,7 +110,7 @@ jobs:

# coverage
- name: Upload coverage to Codecov
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.12' }}
if: ${{ matrix.env.MINIMAL == 'false' && matrix.python-version == '3.13' }}
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
Expand Down
15 changes: 14 additions & 1 deletion tests/cli_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,14 @@ def test_sysoutput():
options = settings.args_to_extractor(args)
assert options.format == "markdown" and options.formatting is True
assert cli_utils.process_result("DADIDA", args, -1, options) == -1

# with counter
with open(
path.join(RESOURCES_DIR, "httpbin_sample.html"), "r", encoding="utf-8"
) as f:
teststring = f.read()
assert cli_utils.process_result(teststring, args, 1, options) == 2

# test keeping dir structure
testargs = ["", "-i", "myinputdir/", "-o", "test/", "--keep-dirs"]
with patch.object(sys, "argv", testargs):
Expand Down Expand Up @@ -377,6 +385,9 @@ def test_cli_pipeline():

def test_file_processing():
"Test file processing pipeline on actual directories."
backup = settings.MAX_FILES_PER_DIRECTORY
settings.MAX_FILES_PER_DIRECTORY = 0

# dry-run file processing pipeline
testargs = ["", "--parallel", "1", "--input-dir", "/dev/null"]
with patch.object(sys, "argv", testargs):
Expand All @@ -393,6 +404,8 @@ def test_file_processing():
for f in cli_utils.generate_filelist(args.input_dir):
cli_utils.file_processing(f, args, options=options)

settings.MAX_FILES_PER_DIRECTORY = backup


def test_cli_config_file():
"Test if the configuration file is loaded correctly from the CLI."
Expand Down Expand Up @@ -509,7 +522,7 @@ def test_crawling():
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
cli.process_args(args)
assert f.getvalue() == "https://httpbun.com/html\n"

spider.URL_STORE = UrlStore(compressed=False, strict=False)
Expand Down
16 changes: 14 additions & 2 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import trafilatura.htmlprocessing
from trafilatura import bare_extraction, extract, xml
from trafilatura.core import Extractor
from trafilatura.external import sanitize_tree, try_justext
from trafilatura.external import sanitize_tree, try_justext, try_readability
from trafilatura.main_extractor import (handle_formatting, handle_image,
handle_lists, handle_paragraphs, handle_quotes,
handle_table, handle_textelem)
Expand Down Expand Up @@ -815,7 +815,8 @@ def test_htmlprocessing():

def test_extraction_options():
'''Test the different parameters available in extract() and bare_extraction()'''
my_html = '<html><head><meta http-equiv="content-language" content="EN"/></head><body><div="article-body"><p>Text.<!-- comment --></p></div></body></html>'
my_html = '<html><head><meta http-equiv="content-language" content="EN"/></head><body><div="article-body"><p>Text.<!-- comment --><?php echo "This is a PHP processing instruction"; ?></p></div></body></html>'

with pytest.raises(ValueError) as err:
extract(my_html, output_format="python")
assert extract(my_html, config=NEW_CONFIG) is None
Expand All @@ -824,9 +825,20 @@ def test_extraction_options():
assert extract(my_html, only_with_metadata=True, output_format='xml', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', config=ZERO_CONFIG) is None
assert extract(my_html, target_language='de', fast=True, config=ZERO_CONFIG) is None

# justext hardening
assert etree.tostring(try_justext(html.fromstring(my_html), None, 'de')) == b'<body/>'
assert etree.tostring(try_justext(None, None, 'de')) == b'<body/>'
# assert extract(my_html) is None

# readability
my_html = '<html><body><p>' + 'Text. '*10 + '</p></body></html>'
result = etree.tostring(try_readability(html.fromstring(my_html)))
assert len(result) > 10 and b'Text' in result
my_html = '<html><body><p>' + 'Text. '*10 + '<embed>Test</embed></p></body></html>'
result = etree.tostring(try_readability(html.fromstring(my_html)))
assert b'Test' not in result

my_html = '<html><head/><body>' + '<p>ABC def ghi jkl.</p>'*1000 + '<p>Posted on 1st Dec 2019<.</p></body></html>'
assert bare_extraction(my_html, config=ZERO_CONFIG, with_metadata=True).date is not None
assert bare_extraction(my_html, config=NEW_CONFIG, with_metadata=True).date is None
Expand Down
29 changes: 28 additions & 1 deletion tests/xml_tei_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from lxml.etree import Element, SubElement, XMLParser, fromstring, tostring

from trafilatura.metadata import Document
from trafilatura.xml import (check_tei, write_fullheader,
from trafilatura.xml import (check_tei, replace_element_text, write_fullheader,
_handle_unwanted_tails, _move_element_one_level_up,
_wrap_unwanted_siblings_of_div)

Expand Down Expand Up @@ -472,9 +472,36 @@ def test_handling_of_text_content_in_div():
assert cleaned.find(".//p").text == "tail"


def test_replace_element_text():
    "Check markdown rendering of individual TEI elements."
    # <head rend="h1"> becomes a level-1 markdown heading
    elem = Element("head")
    elem.text = "Title"
    elem.set("rend", "h1")
    assert replace_element_text(elem, True) == "# Title"

    # <hi rend="#b"> becomes bold text
    elem = Element("hi")
    elem.text = "Text"
    elem.set("rend", "#b")
    assert replace_element_text(elem, True) == "**Text**"

    # <item> becomes a list bullet with a trailing newline
    # (note: the original redundantly re-assigned elem.tag = "item",
    # a no-op since Element("item") already carries that tag)
    elem = Element("item")
    elem.text = "Test text"
    assert replace_element_text(elem, True) == "- Test text\n"

    # <ref target="..."> becomes a full markdown link
    elem = Element("ref")
    elem.text = "Link"
    elem.set("target", "https://example.com")
    assert replace_element_text(elem, True) == "[Link](https://example.com)"

    # <ref> without a target keeps only the bracketed link text
    elem = Element("ref")
    elem.text = "Link"
    assert replace_element_text(elem, True) == "[Link]"


if __name__ == "__main__":
    # Allow running this test module as a plain script (without pytest):
    # invoke each test function in sequence; any failed assert aborts the run.
    test_publisher_added_before_availability_in_publicationStmt()
    test_unwanted_siblings_of_div_removed()
    test_tail_on_p_like_elements_removed()
    test_head_with_children_converted_to_ab()
    test_ab_with_p_parent_resolved()
    test_replace_element_text()
2 changes: 1 addition & 1 deletion trafilatura/external.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def try_justext(tree: HtmlElement, url: str, target_language: str) -> _Element:
# extract
try:
paragraphs = custom_justext(tree, justext_stoplist)
except ValueError as err: # not an XML element: HtmlComment
except Exception as err:
LOGGER.error('justext %s %s', err, url)
else:
for paragraph in paragraphs:
Expand Down
28 changes: 10 additions & 18 deletions trafilatura/readability_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
)
elem.drop_tree()
elif elem.text_content().count(",") < 10:
to_remove = False
to_remove = True
counts = {
kind: len(elem.findall(f".//{kind}")) for kind in TEXT_CLEAN_ELEMS
}
Expand All @@ -376,41 +376,32 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
# continue
if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
reason = f'too many images ({counts["img"]})'
to_remove = True
elif counts["li"] > counts["p"] and elem.tag not in LIST_TAGS:
reason = "more <li>s than <p>s"
to_remove = True
elif counts["input"] > (counts["p"] / 3):
reason = "less than 3x <p>s than <input>s"
to_remove = True
elif content_length < self.min_text_length and counts["img"] == 0:
reason = f"too short content length {content_length} without a single image"
to_remove = True
elif content_length < self.min_text_length and counts["img"] > 2:
reason = (
f"too short content length {content_length} and too many images"
)
to_remove = True
elif weight < 25 and link_density > 0.2:
reason = (
f"too many links {link_density:.3f} for its weight {weight}"
)
to_remove = True
elif weight >= 25 and link_density > 0.5:
reason = (
f"too many links {link_density:.3f} for its weight {weight}"
)
to_remove = True
elif (counts["embed"] == 1 and content_length < 75) or counts[
"embed"
] > 1:
reason = (
"<embed>s with too short content length, or too many <embed>s"
)
to_remove = True
elif not content_length:
reason = "no content"
to_remove = True

# find x non empty preceding and succeeding siblings
siblings = []
Expand All @@ -430,17 +421,18 @@ def sanitize(self, node: HtmlElement, candidates: Dict[HtmlElement, Candidate])
if siblings and sum(siblings) > 1000:
to_remove = False
allowed.update(elem.iter("table", "ul", "div", "section"))
else:
to_remove = False

if to_remove:
elem.drop_tree()
if LOGGER.isEnabledFor(logging.DEBUG):
LOGGER.debug(
"Removed %6.3f %s with weight %s cause it has %s.",
score,
elem.tag,
weight,
reason or "",
)
LOGGER.debug(
"Removed %6.3f %s with weight %s cause it has %s.",
score,
elem.tag,
weight,
reason or "",
)

self.doc = node
return _tostring(self.doc)
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ def language_classifier(temp_text: str, temp_comments: str) -> Optional[str]:
if len(temp_text) > len(temp_comments)
else py3langid.classify(temp_comments)
)
else:
else: # pragma: no cover
LOGGER.warning('Language detector not installed, skipping detection')
result = None
return result # type: ignore[no-any-return]
Expand Down

0 comments on commit 4e59c8a

Please sign in to comment.