@@ -4,11 +4,12 @@ import { calcWordCharMetrics } from '../utils/fontUtils.js';
44import { FontCont } from '../containers/fontContainer.js' ;
55
// Font family requested from `FontCont.getFont` when the font is loaded in `convertPageText`.
const FONT_FAMILY = 'Times New Roman';
// Font size passed to `getTextWidth` and used to scale the font's ascender/descender to page units.
const FONT_SIZE = 14;
// Extra space added between adjacent characters of a word (added once per inter-character gap in `getTextWidth`).
const CHAR_SPACING = 0;
// Extra space added to the measured width of whitespace tokens.
const WORD_SPACING = 0;
// Vertical distance the cursor advances for each line of text.
// NOTE(review): 14.4 is 1.2 x 12, but FONT_SIZE is 14 -- confirm whether this should scale with FONT_SIZE.
const LINE_HEIGHT = 14.4;
// Space reserved at the top and bottom of each page.
const MARGIN_VERTICAL = 30;
// Space reserved at the left and right of each page.
const MARGIN_HORIZONTAL = 20;

// Font object used for text measurement; assigned in `convertPageText` from `FontCont.getFont(...)`.
/** @type {?opentype.Font} */
let fontOpentype = null;
@@ -19,15 +20,24 @@ let fontOpentype = null;
1920 * @param {number } size
2021 * @param {opentype.Font } font
2122 */
function getTextWidth(text, size, font) {
  // Measures the visible width of `text` at `size`: summed advances and kerning, minus the
  // blank space before the first glyph and after the last glyph, plus inter-character spacing.

  // An empty string has no glyphs to measure. Without this guard `text.at(-1)` and `text[0]`
  // are `undefined`, the glyph lookup below fails, and the spacing term becomes `-CHAR_SPACING`.
  if (!text) return 0;

  const { advanceArr, kerningArr } = calcWordCharMetrics(text, font);

  const advanceTotal = advanceArr.reduce((a, b) => a + b, 0);
  const kerningTotal = kerningArr.reduce((a, b) => a + b, 0);

  const wordLastGlyphMetrics = font.charToGlyph(text.at(-1)).getMetrics();
  const wordFirstGlyphMetrics = font.charToGlyph(text[0]).getMetrics();

  // The `leftSideBearing`/`rightSideBearing` numbers reported by Opentype.js are not accurate for mono-spaced fonts, so `xMin`/`xMax` are used instead.
  const wordLeftBearing = wordFirstGlyphMetrics.xMin || 0;
  const lastGlyphMax = wordLastGlyphMetrics.xMax || 0;
  // Space between the right edge of the last glyph's outline and the end of its advance.
  const wordRightBearing = advanceArr[advanceArr.length - 1] - lastGlyphMax;

  // Width in font units, excluding the leading and trailing bearings.
  const wordWidth1 = advanceTotal + kerningTotal - wordLeftBearing - wordRightBearing;
  // Scale from font units to the units of `size`.
  const wordWidth1Px = wordWidth1 * (size / font.unitsPerEm);
  // `CHAR_SPACING` applies once per gap between characters, not after the last one.
  const spacingTotalPx = (text.length - 1) * CHAR_SPACING;

  return wordWidth1Px + spacingTotalPx;
}
@@ -76,6 +86,9 @@ export async function convertPageText({ textStr, pageDims = null }) {
7686 fontOpentype = ( await FontCont . getFont ( { font : FONT_FAMILY } ) ) . opentype ;
7787 }
7888
89+ const ASCENDER_HEIGHT = fontOpentype . ascender * ( FONT_SIZE / fontOpentype . unitsPerEm ) ;
90+ const DESCENDER_HEIGHT = fontOpentype . descender * ( FONT_SIZE / fontOpentype . unitsPerEm ) ;
91+
7992 const lines = textStr . split ( / \r ? \n / ) ;
8093
8194 if ( ! pageDims ) {
@@ -97,39 +110,38 @@ export async function convertPageText({ textStr, pageDims = null }) {
97110
98111 let tablesPage = new LayoutDataTablePage ( 0 ) ;
99112 const pagesOut = [ { pageObj, dataTables : tablesPage } ] ;
100- const margin = 20 ;
101- const availableWidth = pageDims . width - margin * 2 ;
113+ const availableWidth = pageDims . width - MARGIN_HORIZONTAL * 2 ;
102114
103- let currentY = margin + ASCENDER_HEIGHT ;
115+ let currentY = MARGIN_VERTICAL + LINE_HEIGHT / 2 ;
104116
105117 for ( let lineIndex = 0 ; lineIndex < lines . length ; lineIndex ++ ) {
106118 const lineText = lines [ lineIndex ] ;
107119
108120 if ( lineText . length === 0 || lineText . trim ( ) . length === 0 ) {
109121 currentY += LINE_HEIGHT ;
110- if ( currentY + DESCENDER_HEIGHT > pageDims . height - margin ) {
122+ if ( currentY + FONT_SIZE > pageDims . height - MARGIN_VERTICAL ) {
111123 pageIndex ++ ;
112124 const newPage = new ocr . OcrPage ( pageIndex , pageDims ) ;
113125 newPage . textSource = 'text' ;
114126 const newTables = new LayoutDataTablePage ( 0 ) ;
115127 pagesOut . push ( { pageObj : newPage , dataTables : newTables } ) ;
116128 pageObj = newPage ;
117129 tablesPage = newTables ;
118- currentY = margin + ASCENDER_HEIGHT ;
130+ currentY = MARGIN_VERTICAL + LINE_HEIGHT / 2 ;
119131 }
120132 continue ;
121133 }
122134
123135 const wordTokens = splitIntoWords ( lineText ) ;
124136
125137 const parLines = [ ] ;
126- let parRight = margin ;
138+ let parRight = MARGIN_HORIZONTAL ;
127139
128140 for ( let idx = 0 ; idx < wordTokens . length ; ) {
129- if ( currentY + DESCENDER_HEIGHT > pageDims . height - margin ) {
141+ if ( currentY + FONT_SIZE > pageDims . height - MARGIN_VERTICAL ) {
130142 if ( parLines . length > 0 ) {
131143 const parBbox = {
132- left : margin ,
144+ left : MARGIN_HORIZONTAL ,
133145 top : parLines [ 0 ] . bbox . top ,
134146 right : parRight ,
135147 bottom : parLines [ parLines . length - 1 ] . bbox . bottom ,
@@ -139,7 +151,7 @@ export async function convertPageText({ textStr, pageDims = null }) {
139151 for ( const ln of parLines ) ln . par = parObj ;
140152 pageObj . pars . push ( parObj ) ;
141153 parLines . length = 0 ;
142- parRight = margin ;
154+ parRight = MARGIN_HORIZONTAL ;
143155 }
144156 pageIndex ++ ;
145157 const newPage = new ocr . OcrPage ( pageIndex , pageDims ) ;
@@ -148,34 +160,35 @@ export async function convertPageText({ textStr, pageDims = null }) {
148160 pagesOut . push ( { pageObj : newPage , dataTables : newTables } ) ;
149161 pageObj = newPage ;
150162 tablesPage = newTables ;
151- currentY = margin + ASCENDER_HEIGHT ;
163+ currentY = MARGIN_VERTICAL + LINE_HEIGHT / 2 ;
152164 }
153165
154166 const baseline = [ 0 , DESCENDER_HEIGHT ] ;
155167 const lineTop = Math . round ( currentY - ASCENDER_HEIGHT ) ;
156168 const lineBottom = Math . round ( currentY + DESCENDER_HEIGHT ) ;
157169
158- let currentX = margin ;
170+ let currentX = MARGIN_HORIZONTAL ;
159171 let widthSoFar = 0 ;
160172
161173 const lineBbox = {
162- left : margin ,
174+ left : MARGIN_HORIZONTAL ,
163175 top : lineTop ,
164- right : margin ,
176+ right : MARGIN_HORIZONTAL ,
165177 bottom : lineBottom ,
166178 } ;
167179 const lineObj = new ocr . OcrLine (
168180 pageObj ,
169181 lineBbox ,
170182 baseline ,
171183 ASCENDER_HEIGHT ,
172- ASCENDER_HEIGHT - DESCENDER_HEIGHT ,
184+ null ,
173185 ) ;
174186
175187 let lastConsumed = idx ;
176188 for ( let j = idx ; j < wordTokens . length ; j ++ ) {
177189 const tok = wordTokens [ j ] ;
178- const tokWidth = getTextAdvance ( tok . text , FONT_SIZE , fontOpentype ) ;
190+ let tokWidth = getTextWidth ( tok . text , FONT_SIZE , fontOpentype ) ;
191+ if ( tok . isWhitespace ) tokWidth += WORD_SPACING ;
179192
180193 if ( tok . isWhitespace ) {
181194 if ( lineObj . words . length === 0 ) {
@@ -218,7 +231,7 @@ export async function convertPageText({ textStr, pageDims = null }) {
218231 if ( lineObj . words . length === 0 ) {
219232 const nextTok = wordTokens [ idx ] ;
220233 if ( nextTok && ! nextTok . isWhitespace ) {
221- const tokWidth = getTextAdvance ( nextTok . text , FONT_SIZE , fontOpentype ) ;
234+ const tokWidth = getTextWidth ( nextTok . text , FONT_SIZE , fontOpentype ) ;
222235 const wordBbox = {
223236 left : Math . round ( currentX ) ,
224237 top : lineTop ,
@@ -258,7 +271,7 @@ export async function convertPageText({ textStr, pageDims = null }) {
258271
259272 if ( parLines . length > 0 ) {
260273 const parBbox = {
261- left : margin ,
274+ left : MARGIN_HORIZONTAL ,
262275 top : parLines [ 0 ] . bbox . top ,
263276 right : parRight ,
264277 bottom : parLines [ parLines . length - 1 ] . bbox . bottom ,
0 commit comments