Skip to content

Commit

Permalink
refine table markdown output (#752)
Browse files (browse the repository at this point in the history)
* refine table markdown output

* fix unit tests

---------

Co-authored-by: CodyInnowhere <lostcody@CodyInnowheredeMacBook-Pro.local>
  • Loading branch information
unsleepy22 and CodyInnowhere authored Nov 29, 2024
1 parent 0ad8c3d commit 117ba8b
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 12 deletions.
18 changes: 9 additions & 9 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,7 +1022,7 @@ def test_table_processing():
<cell>you buy</cell>
<cell>they buy</cell>
</row>''' in my_result
assert extract(htmlstring, fast=True, output_format='txt').startswith("Present Tense | I buy | you buy |")
assert extract(htmlstring, fast=True, output_format='txt').startswith("| Present Tense | I buy | you buy |")
# table with links
# todo: further tests and adjustments
htmlstring = '<html><body><article><table><tr><td><a href="test.html">' + 'ABCD'*100 + '</a></td></tr></table></article></body></html>'
Expand Down Expand Up @@ -1112,12 +1112,12 @@ def test_table_processing():
assert "1" in result and "2" in result
# table headers in non-XML formats
htmlstring = '<html><body><article><table><tr><th>head 1</th><th>head 2</th></tr><tr><td>1</td><td>2</td></tr></table></article></body></html>'
assert "---|---|" in extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "|---|---|" in extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)

# remove new lines in table cells in text format
htmlstring = '<html><body><article><table><tr><td>cell<br>1</td><td>cell<p>2</p></td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "cell 1 | cell 2 |" in result
assert "| cell 1 | cell 2 |" in result

# only one header row is allowed in text format
htmlstring = '<html><body><article><table><tr><th>a</th><th>b</th></tr><tr><th>c</th><th>d</th></tr></table></article></body></html>'
Expand All @@ -1127,15 +1127,15 @@ def test_table_processing():
# handle colspan by appending columns in text format
htmlstring = '<html><body><article><table><tr><td colspan="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "a | b | |" in result
assert "| a | b | |" in result

htmlstring = '<html><body><article><table><tr><td span="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "a | b | |" in result
assert "| a | b | |" in result

htmlstring = '<html><body><article><table><tr><td span="2.1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "a | b | |" in result
assert "| a | b | |" in result

# MemoryError: https://github.com/adbar/trafilatura/issues/657
htmlstring = '<html><body><article><table><tr><td colspan="9007199254740991">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
Expand All @@ -1149,16 +1149,16 @@ def test_table_processing():
# wrong span info
htmlstring = '<html><body><article><table><tr><td span="-1">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "a | b | |" in result
assert "| a | b | |" in result

htmlstring = '<html><body><article><table><tr><td span="abc">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "a | b | |" in result
assert "| a | b | |" in result

# links: this gets through (for now)
htmlstring = '<html><body><article><table><tr><td><a href="link.html">a</a></td></tr></table></article></body></html>'
result = extract(htmlstring, fast=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result == "a |"
assert result == "| a |"

# link: this is filtered out
htmlstring = f'<html><body><article><table><tr><td><a href="link.html">{"abc"*100}</a></td></tr></table></article></body></html>'
Expand Down
9 changes: 6 additions & 3 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,10 @@ def replace_element_text(element: _Element, include_formatting: bool) -> str:
# cells
if element.tag == "cell" and elem_text and len(element) > 0:
if element[0].tag == 'p':
elem_text = f"{elem_text} "
elem_text = f"{elem_text} " if element.getprevious() is not None else f"| {elem_text} "
elif element.tag == 'cell' and elem_text:
# add | before first cell
elem_text = f"{elem_text}" if element.getprevious() is not None else f"| {elem_text}"
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
Expand Down Expand Up @@ -324,7 +327,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
returnlist.append(f'{"|" * (max_span - cell_count)}\n')
# if this is a head row, draw the separator below
if element.xpath("./cell[@role='head']"):
returnlist.append(f'\n{"---|" * max_span}\n')
returnlist.append(f'\n|{"---|" * max_span}\n')
else:
returnlist.append("\n")
elif element.tag != "cell":
Expand All @@ -337,7 +340,7 @@ def process_element(element: _Element, returnlist: List[str], include_formatting
# Common elements (Now processes end-tag logic correctly)
if element.tag in NEWLINE_ELEMS and not element.xpath("ancestor::cell"):
# spacing hack
returnlist.append("\n\u2424\n" if include_formatting else "\n")
returnlist.append("\n\u2424\n" if include_formatting and element.tag != 'row' else "\n")
elif element.tag == "cell":
returnlist.append(" | ")
elif element.tag not in SPECIAL_FORMATTING:
Expand Down

0 comments on commit 117ba8b

Please sign in to comment.