Skip to content

Commit

Permalink
mupdf: fix getPageText implementation (#1975)
Browse files Browse the repository at this point in the history
Ignore empty word and line boxes, including those equal to `fz_empty_rect` (whose value changed from
`(0, 0, 0, 0)` to `(FZ_MAX_INF_RECT, FZ_MAX_INF_RECT, FZ_MIN_INF_RECT, FZ_MIN_INF_RECT)`
after the MuPDF update from 1.13.0 to 1.24.2).
  • Loading branch information
benoit-pierre authored Nov 20, 2024
1 parent a7c7329 commit 6d59b03
Showing 1 changed file with 14 additions and 11 deletions.
25 changes: 14 additions & 11 deletions ffi/mupdf.lua
Original file line number Diff line number Diff line change
Expand Up @@ -547,12 +547,14 @@ function page_mt.__index:getPageText()
ch = ch.next
end
-- add word to line
table.insert(line, {
word = ffi.string(textbuf, textlen),
x0 = word_bbox.x0, y0 = word_bbox.y0,
x1 = word_bbox.x1, y1 = word_bbox.y1,
})
size = size + 5 * 8 + textlen
if word_bbox.x0 < word_bbox.x1 and word_bbox.y0 < word_bbox.y1 then
table.insert(line, {
word = ffi.string(textbuf, textlen),
x0 = word_bbox.x0, y0 = word_bbox.y0,
x1 = word_bbox.x1, y1 = word_bbox.y1,
})
size = size + 5 * 8 + textlen
end

if ch == nil then
break
Expand All @@ -561,11 +563,12 @@ function page_mt.__index:getPageText()
ch = ch.next
end

line.x0, line.y0 = line_bbox.x0, line_bbox.y0
line.x1, line.y1 = line_bbox.x1, line_bbox.y1
size = size + 5 * 8

table.insert(lines, line)
if line_bbox.x0 < line_bbox.x1 and line_bbox.y0 < line_bbox.y1 then
line.x0, line.y0 = line_bbox.x0, line_bbox.y0
line.x1, line.y1 = line_bbox.x1, line_bbox.y1
size = size + 5 * 8
table.insert(lines, line)
end
end

mupdf_line = mupdf_line.next
Expand Down

0 comments on commit 6d59b03

Please sign in to comment.