From c82a04c49b0efdd2028aa3fa4b22d92a35a4b00c Mon Sep 17 00:00:00 2001 From: modesty Date: Sat, 4 Oct 2025 13:14:51 -0700 Subject: [PATCH] fix: #408: fix text block coordinates --- base/display/canvas.js | 7 +++---- package.json | 2 +- test/_test_.cjs | 43 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 6 deletions(-) diff --git a/base/display/canvas.js b/base/display/canvas.js index b48cae6..4c473db 100755 --- a/base/display/canvas.js +++ b/base/display/canvas.js @@ -1269,10 +1269,6 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { // info(nodeUtil.inspect(glyphs)); // } - - ctx.restore(); - } - // Text rendering for regular fonts (Type3 fonts are handled in their own context above) if (str && !font.disableFontFace && !font.coded) { var curFontSize = fontSize * scale * textHScale + 3; @@ -1293,6 +1289,9 @@ var CanvasGraphics = (function CanvasGraphicsClosure() { } } + ctx.restore(); + } + return canvasWidth; }, showSpacedText: function CanvasGraphics_showSpacedText(arr) { diff --git a/package.json b/package.json index beafd7c..d005e5a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pdf2json", - "version": "3.2.2", + "version": "3.2.3", "description": "PDF file parser that converts PDF binaries to JSON and text, powered by porting a fork of PDF.JS to Node.js", "keywords": [ "pdf", diff --git a/test/_test_.cjs b/test/_test_.cjs index c2f507a..ad1cc51 100644 --- a/test/_test_.cjs +++ b/test/_test_.cjs @@ -163,6 +163,46 @@ function checkResult_pageContent(Pages, fileName) { }); } +function checkResult_textCoordinates(Pages, fileName) { + // Verify text block coordinates are not all identical and are valid numbers (issue #408 regression test) + Pages.forEach((page, pageIndex) => { + const texts = page.Texts || []; + if (texts.length === 0) return; // Skip pages with no text + + // Collect all coordinates + const coords = texts.map(t => ({ x: t.x, y: t.y })); + + // Create unique coordinate strings + const uniqueCoords = new
Set(coords.map(c => `${c.x},${c.y}`)); + + // Check that we have more than one unique coordinate if the page has more than 5 text elements + // This prevents the regression where all text elements had identical coordinates (-0.25, 48.75) + if (texts.length > 5) { + assert( + uniqueCoords.size > 1, + fileName + " page " + pageIndex + + " : all " + texts.length + " text elements have identical coordinates. " + + "This is a regression of issue #408. Found only " + uniqueCoords.size + + " unique coordinate(s): " + Array.from(uniqueCoords).slice(0, 3).join(", ") + ); + } + + // Verify every x and y coordinate is a number and not NaN + texts.forEach((text, textIndex) => { + assert( + typeof text.x === 'number' && !isNaN(text.x), + fileName + " page " + pageIndex + " text " + textIndex + + " : has invalid x coordinate: " + text.x + ); + assert( + typeof text.y === 'number' && !isNaN(text.y), + fileName + " page " + pageIndex + " text " + textIndex + + " : has invalid y coordinate: " + text.y + ); + }); + }); +} + async function parseAndVerifyOnePDF(fileName, fromBuffer, pageCount) { let timeoutId; let pdfParser = null; @@ -203,12 +243,13 @@ async function parseAndVerifyOnePDF(fileName, fromBuffer, pageCount) { }); const evtData = await pdfParserDataReady; - + expect(evtData).toBeDefined(); checkResult_parseStatus(null, evtData, fileName); checkResult_mainFields(evtData, fileName); checkResult_pageCount(evtData.Pages, pageCount, fileName); checkResult_pageContent(evtData.Pages, fileName); + checkResult_textCoordinates(evtData.Pages, fileName); } catch (error) { console.error(`Error parsing PDF ${fileName}: `, error); throw error; // Re-throw to ensure Jest knows the test failed