diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 00046804..e7c12dc0 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1022,7 +1022,7 @@ def test_table_processing(): you buy they buy ''' in my_result - assert extract(htmlstring, fast=True, output_format='txt').startswith("Present Tense | I buy | you buy |") + assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense | I buy | you buy |") # table with links # todo: further tests and adjustments htmlstring = '
' + 'ABCD'*100 + '
' @@ -1112,12 +1112,12 @@ def test_table_processing(): assert "1" in result and "2" in result # table headers in non-XML formats htmlstring = '
head 1head 2
12
' - assert "---|---|" in extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) + assert "|---|---|" in extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) # remove new lines in table cells in text format htmlstring = '
cell
1
cell

2

' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "cell 1 | cell 2 |" in result + assert "| cell 1 | cell 2 |" in result # only one header row is allowed in text format htmlstring = '
ab
cd
' @@ -1127,15 +1127,15 @@ def test_table_processing(): # handle colspan by appending columns in text format htmlstring = '
ab
cde
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "a | b | |" in result + assert "| a | b | |" in result htmlstring = '
ab
cde
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "a | b | |" in result + assert "| a | b | |" in result htmlstring = '
ab
cde
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "a | b | |" in result + assert "| a | b | |" in result # MemoryError: https://github.com/adbar/trafilatura/issues/657 htmlstring = '
ab
cde
' @@ -1149,16 +1149,16 @@ def test_table_processing(): # wrong span info htmlstring = '
ab
cde
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "a | b | |" in result + assert "| a | b | |" in result htmlstring = '
ab
cde
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert "a | b | |" in result + assert "| a | b | |" in result # links: this gets through (for now) htmlstring = '
a
' result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True) - assert result == "a |" + assert result == "| a |" # link: this is filtered out htmlstring = f'
{"abc"*100}
' diff --git a/trafilatura/xml.py b/trafilatura/xml.py index 646c8426..953a5f9f 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -287,7 +287,10 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str: # cells if element.tag == "cell" and elem_text and len(element) > 0: if element[0].tag == 'p': - elem_text = f"{elem_text} " + elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} " + elif element.tag == 'cell' and elem_text: + # add | before first cell + elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}" # lists elif element.tag == "item" and elem_text: elem_text = f"- {elem_text}\n" @@ -324,7 +327,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting returnlist.append(f'{"|" * (max_span - cell_count)}\n') # if this is a head row, draw the separator below if element.xpath("./cell[@role='head']"): - returnlist.append(f'\n{"---|" * max_span}\n') + returnlist.append(f'\n|{"---|" * max_span}\n') else: returnlist.append("\n") elif element.tag != "cell": @@ -337,7 +340,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting # Common elements (Now processes end-tag logic correctly) if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"): # spacing hack - returnlist.append("\n\u2424\n" if include_formatting else "\n") + returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n") elif element.tag == "cell": returnlist.append(" | ") elif element.tag not in SPECIAL_FORMATTING: