diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 00046804..e7c12dc0 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -1022,7 +1022,7 @@ def test_table_processing():
you buy |
they buy |
''' in my_result
- assert extract(htmlstring, fast=True, output_format='txt').startswith("Present Tense | I buy | you buy |")
+ assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense | I buy | you buy |")
# table with links
# todo: further tests and adjustments
htmlstring = '
'
@@ -1112,12 +1112,12 @@ def test_table_processing():
assert "1" in result and "2" in result
# table headers in non-XML formats
htmlstring = ''
- assert "---|---|" in extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+ assert "|---|---|" in extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
# remove new lines in table cells in text format
htmlstring = ''
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
- assert "cell 1 | cell 2 |" in result
+ assert "| cell 1 | cell 2 |" in result
# only one header row is allowed in text format
htmlstring = ''
@@ -1127,15 +1127,15 @@ def test_table_processing():
# handle colspan by appending columns in text format
htmlstring = ''
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
- assert "a | b | |" in result
+ assert "| a | b | |" in result
htmlstring = ''
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
- assert "a | b | |" in result
+ assert "| a | b | |" in result
htmlstring = ''
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
- assert "a | b | |" in result
+ assert "| a | b | |" in result
# MemoryError: https://github.com/adbar/trafilatura/issues/657
htmlstring = ''
@@ -1149,16 +1149,16 @@ def test_table_processing():
# wrong span info
htmlstring = ''
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
- assert "a | b | |" in result
+ assert "| a | b | |" in result
htmlstring = ''
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
- assert "a | b | |" in result
+ assert "| a | b | |" in result
# links: this gets through (for now)
htmlstring = ''
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
- assert result == "a |"
+ assert result == "| a |"
# link: this is filtered out
htmlstring = f''
diff --git a/trafilatura/xml.py b/trafilatura/xml.py
index 646c8426..953a5f9f 100644
--- a/trafilatura/xml.py
+++ b/trafilatura/xml.py
@@ -287,7 +287,10 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
# cells
if element.tag == "cell" and elem_text and len(element) > 0:
if element[0].tag == 'p':
- elem_text = f"{elem_text} "
+ elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
+ elif element.tag == 'cell' and elem_text:
+ # add | before first cell
+ elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
@@ -324,7 +327,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(f'{"|" * (max_span - cell_count)}\n')
# if this is a head row, draw the separator below
if element.xpath("./cell[@role='head']"):
- returnlist.append(f'\n{"---|" * max_span}\n')
+ returnlist.append(f'\n|{"---|" * max_span}\n')
else:
returnlist.append("\n")
elif element.tag != "cell":
@@ -337,7 +340,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
# Common elements (Now processes end-tag logic correctly)
if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"):
# spacing hack
- returnlist.append("\n\u2424\n" if include_formatting else "\n")
+ returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
elif element.tag == "cell":
returnlist.append(" | ")
elif element.tag not in SPECIAL_FORMATTING: