Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 23 additions & 5 deletions src/readability/shared/analysis-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,28 @@ async function analyzeTextReadability(
}
}

/**
 * Collects the elements of a document whose text is worth analyzing.
 *
 * Candidates are every <p>, <blockquote>, and <li> matched by
 * `doc.querySelectorAll`, kept in the order that call returns them.
 * A candidate is discarded when:
 *   - it is an <li> with a <header> or <footer> ancestor, or
 *   - its trimmed text content is empty or shorter than MIN_TEXT_LENGTH.
 *
 * @param {Document} doc - The DOM Document to scan for text elements.
 * @returns {Element[]} The elements that passed both checks.
 */
const getMeaningfulElements = (doc) => {
  const candidates = Array.from(doc.querySelectorAll('p, blockquote, li'));
  const meaningful = [];

  for (const candidate of candidates) {
    // Only list items are subject to the header/footer exclusion;
    // <p> and <blockquote> are never rejected by their position.
    const isListItem = candidate.tagName.toLowerCase() === 'li';
    if (isListItem && candidate.closest('header, footer')) {
      continue;
    }

    // Whitespace-only or missing text yields a falsy value and is dropped.
    const trimmedText = candidate.textContent?.trim();
    if (trimmedText && trimmedText.length >= MIN_TEXT_LENGTH) {
      meaningful.push(candidate);
    }
  }

  return meaningful;
};

/**
* Analyzes readability for a single page's content
*/
Expand All @@ -138,7 +160,7 @@ export async function analyzePageContent(rawBody, pageUrl, traffic, log) {
const doc = new JSDOM(rawBody).window.document;

// Get all paragraph, blockquote, and list item elements (same as preflight)
const textElements = Array.from(doc.querySelectorAll('p, div, li'));
const textElements = getMeaningfulElements(doc);

const detectedLanguages = new Set();

Expand Down Expand Up @@ -166,10 +188,6 @@ export async function analyzePageContent(rawBody, pageUrl, traffic, log) {
});

return !hasBlockChildren;
})
.filter(({ element }) => {
const textContent = element.textContent?.trim();
return textContent && textContent.length >= MIN_TEXT_LENGTH && /\s/.test(textContent);
});

// Process each element and collect analysis promises
Expand Down
2 changes: 1 addition & 1 deletion src/readability/shared/constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ export const TOP_PAGES_LIMIT = 10;
export const TARGET_READABILITY_SCORE = 30;

// Minimum character length for text chunks to be considered for readability analysis
export const MIN_TEXT_LENGTH = 100;
export const MIN_TEXT_LENGTH = 150;

// Maximum characters to display in the audit report
export const MAX_CHARACTERS_DISPLAY = 200;