fix: sites-36290 (#1650)

miakobchuk · web-flow · commit c262dc79d148 · 2025-11-26T09:53:25.000-06:00
fix https://jira.corp.adobe.com/browse/SITES-36290
diff --git a/src/backlinks/handler.js b/src/backlinks/handler.js
@@ -18,6 +18,7 @@ import calculateKpiMetrics from './kpi-metrics.js';
 import { convertToOpportunity } from '../common/opportunity.js';
 import { createOpportunityData } from './opportunity-data-mapper.js';
 import { syncSuggestions } from '../utils/data-access.js';
+import { filterByAuditScope, extractPathPrefix } from '../internal-links/subpath-filter.js';
 
 const { AUDIT_STEP_DESTINATIONS } = Audit;
 
@@ -108,16 +109,23 @@ export async function runAuditAndImportTopPages(context) {
 
 export async function submitForScraping(context) {
   const {
-    site, dataAccess, audit,
+    site, dataAccess, audit, log,
   } = context;
   const { SiteTopPage } = dataAccess;
   const auditResult = audit.getAuditResult();
   if (auditResult.success === false) {
     throw new Error('Audit failed, skipping scraping and suggestions generation');
   }
   const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'ahrefs', 'global');
+
+  // Filter top pages by audit scope (subpath/locale) if baseURL has a subpath
+  const baseURL = site.getBaseURL();
+  const filteredTopPages = filterByAuditScope(topPages, baseURL, { urlProperty: 'getUrl' }, log);
+
+  log.info(`Found ${topPages.length} top pages, ${filteredTopPages.length} within audit scope`);
+
   return {
-    urls: topPages.map((topPage) => ({ url: topPage.getUrl() })),
+    urls: filteredTopPages.map((topPage) => ({ url: topPage.getUrl() })),
     siteId: site.getId(),
     type: 'broken-backlinks',
   };
@@ -183,24 +191,78 @@ export const generateSuggestionData = async (context) => {
     opportunity.getId(),
     SuggestionModel.STATUSES.NEW,
   );
+
+  // Build broken links array
+  const brokenLinks = suggestions
+    .map((suggestion) => ({
+      urlFrom: suggestion?.getData()?.url_from,
+      urlTo: suggestion?.getData()?.url_to,
+      suggestionId: suggestion?.getId(),
+    }))
+    .filter((link) => link.urlFrom && link.urlTo && link.suggestionId); // Filter invalid entries
+
+  // Get top pages and filter by audit scope
   const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'ahrefs', 'global');
+  const baseURL = site.getBaseURL();
+  const filteredTopPages = filterByAuditScope(topPages, baseURL, { urlProperty: 'getUrl' }, log);
+
+  // Filter alternatives by locales/subpaths present in broken links
+  // This limits suggestions to relevant locales only
+  const allTopPageUrls = filteredTopPages.map((page) => page.getUrl());
+
+  // Extract unique locales/subpaths from broken links
+  const brokenLinkLocales = new Set();
+  brokenLinks.forEach((link) => {
+    const locale = extractPathPrefix(link.urlTo);
+    if (locale) {
+      brokenLinkLocales.add(locale);
+    }
+  });
+
+  // Keep only alternatives that match a broken link's locale or have no locale prefix
+  // If no locales found (no subpath), include all alternatives
+  // Always ensure alternativeUrls is an array (even if empty)
+  let alternativeUrls = [];
+  if (brokenLinkLocales.size > 0) {
+    alternativeUrls = allTopPageUrls.filter((url) => {
+      const urlLocale = extractPathPrefix(url);
+      // Include if URL matches one of the broken links' locales, or has no locale
+      return !urlLocale || brokenLinkLocales.has(urlLocale);
+    });
+  } else {
+    // No locale prefixes found, include all alternatives
+    alternativeUrls = allTopPageUrls;
+  }
+
+  // Validate before sending to Mystique
+  if (brokenLinks.length === 0) {
+    log.warn('No valid broken links to send to Mystique. Skipping message.');
+    return {
+      status: 'complete',
+    };
+  }
+
+  if (alternativeUrls.length === 0) {
+    log.warn('No alternative URLs available. Cannot generate suggestions. Skipping message to Mystique.');
+    return {
+      status: 'complete',
+    };
+  }
+
   const message = {
     type: 'guidance:broken-links',
     siteId: site.getId(),
     auditId: audit.getId(),
     deliveryType: site.getDeliveryType(),
     time: new Date().toISOString(),
     data: {
-      alternativeUrls: topPages.map((page) => page.getUrl()),
+      alternativeUrls,
       opportunityId: opportunity?.getId(),
-      brokenLinks: suggestions.map((suggestion) => ({
-        urlFrom: suggestion?.getData()?.url_from,
-        urlTo: suggestion?.getData()?.url_to,
-        suggestionId: suggestion?.getId(),
-      })),
+      brokenLinks,
     },
   };
   await sqs.sendMessage(env.QUEUE_SPACECAT_TO_MYSTIQUE, message);
+  log.debug(`Message sent to Mystique: ${JSON.stringify(message)}`);
   return {
     status: 'complete',
   };
diff --git a/src/internal-links/handler.js b/src/internal-links/handler.js
@@ -266,7 +266,7 @@ export const opportunityAndSuggestionsStep = async (context) => {
     // Extract unique locales/subpaths from broken links
     const brokenLinkLocales = new Set();
     brokenLinks.forEach((link) => {
-      const locale = extractPathPrefix(link.urlTo) || extractPathPrefix(link.urlFrom);
+      const locale = extractPathPrefix(link.urlTo);
       if (locale) {
         brokenLinkLocales.add(locale);
       }
diff --git a/test/audits/backlinks.test.js b/test/audits/backlinks.test.js
@@ -52,6 +52,10 @@ describe('Backlinks Tests', function () {
     { getUrl: () => 'https://example.com/blog/page1' },
     { getUrl: () => 'https://example.com/blog/page2' },
   ];
+  const topPagesNoPrefix = [
+    { getUrl: () => 'https://example.com/page1' },
+    { getUrl: () => 'https://example.com/page2' },
+  ];
   const auditUrl = 'https://audit.url';
   const audit = {
     getId: () => auditDataMock.id,
@@ -173,11 +177,37 @@ describe('Backlinks Tests', function () {
 
     const result = await submitForScraping(context);
 
+    // filterByAuditScope returns all items when baseURL has no subpath
     expect(result).to.deep.equal({
       siteId: contextSite.getId(),
       type: 'broken-backlinks',
       urls: topPages.map((topPage) => ({ url: topPage.getUrl() })),
     });
+    expect(context.log.info).to.have.been.calledWith(sinon.match(/Found.*top pages.*within audit scope/));
+  });
+
+  it('should filter top pages by audit scope when baseURL has subpath', async () => {
+    context.audit.getAuditResult.returns({ success: true });
+    const siteWithSubpath = {
+      ...contextSite,
+      getBaseURL: () => 'https://example.com/uk',
+    };
+    context.site = siteWithSubpath;
+
+    const topPagesWithSubpaths = [
+      { getUrl: () => 'https://example.com/uk/page1' },
+      { getUrl: () => 'https://example.com/uk/page2' },
+      { getUrl: () => 'https://example.com/fr/page1' }, // Should be filtered out
+    ];
+    context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo.resolves(topPagesWithSubpaths);
+
+    const result = await submitForScraping(context);
+
+    // Should only include URLs within /uk subpath
+    expect(result.urls).to.have.length(2);
+    expect(result.urls.map((u) => u.url)).to.include('https://example.com/uk/page1');
+    expect(result.urls.map((u) => u.url)).to.include('https://example.com/uk/page2');
+    expect(result.urls.map((u) => u.url)).to.not.include('https://example.com/fr/page1');
   });
 
   it('should not submit urls for scraping step when audit was not successful', async () => {
@@ -333,26 +363,161 @@ describe('Backlinks Tests', function () {
       brokenBacklinksOpportunity.getSuggestions.returns([]);
       brokenBacklinksOpportunity.addSuggestions.returns(brokenBacklinksSuggestions);
 
+      // Mock calculateKpiMetrics S3 calls (needed for the function to complete)
+      context.s3Client.send.onCall(0).resolves(null); // No RUM traffic data
+      context.s3Client.send.onCall(1).resolves(null); // No organic traffic data
+
+      // Mock suggestions with broken link that has root-level URL (no path prefix)
+      // This ensures alternatives with any prefix or no prefix will be included
+      // IMPORTANT: Match the exact structure from the original test that works
+      const suggestionsWithRootUrl = [
+        {
+          opportunityId: 'test-opportunity-id',
+          getId: () => 'test-suggestion-1',
+          type: 'REDIRECT_UPDATE',
+          rank: 550000,
+          getData: () => ({
+            url_from: 'https://from.com/from-2',
+            url_to: 'https://example.com', // Root-level URL - extractPathPrefix returns ''
+          }),
+        },
+      ];
+      // Create new stub like internal links test does - MUST be set before generateSuggestionData is called
+      // The stub needs to accept opportunityId and status as parameters
+      context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sandbox.stub()
+        .withArgs('opportunity-id', sinon.match.any)
+        .resolves(suggestionsWithRootUrl);
+
+      // Use top pages with any prefix - since broken link has no prefix, all will be included
+      context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sandbox.stub()
+        .resolves(topPages);
+
       const result = await generateSuggestionData(context);
 
-      // 4x for headers + 4x for each page
       expect(result.status).to.deep.equal('complete');
-      expect(context.sqs.sendMessage).to.have.been.calledWithMatch('test-queue', {
-        type: 'guidance:broken-links',
-        siteId: 'site-id',
-        auditId: 'audit-id',
-        deliveryType: 'aem_cs',
-        time: sinon.match.any,
-        data: {
-          opportunityId: 'opportunity-id',
-          alternativeUrls: topPages.map((page) => page.getUrl()),
-          brokenLinks: [{
-            urlFrom: 'https://from.com/from-2',
-            urlTo: 'https://foo.com/redirects-throws-error',
-            suggestionId: 'test-suggestion-1',
-          }],
+      
+      // Verify neither skip warning was logged (both brokenLinks and alternativeUrls have items)
+      expect(context.log.warn).to.not.have.been.calledWith('No valid broken links to send to Mystique. Skipping message.');
+      expect(context.log.warn).to.not.have.been.calledWith('No alternative URLs available. Cannot generate suggestions. Skipping message to Mystique.');
+      
+      // Verify message was sent with correct structure
+      expect(context.sqs.sendMessage).to.have.been.calledOnce;
+      const sentMessage = context.sqs.sendMessage.getCall(0).args[1];
+      expect(sentMessage.type).to.equal('guidance:broken-links');
+      expect(sentMessage.siteId).to.equal('site-id');
+      expect(sentMessage.auditId).to.equal('audit-id');
+      expect(sentMessage.deliveryType).to.equal('aem_cs');
+      expect(sentMessage.data.opportunityId).to.equal('opportunity-id');
+      expect(sentMessage.data.alternativeUrls).to.deep.equal(topPages.map((page) => page.getUrl()));
+      expect(sentMessage.data.brokenLinks).to.be.an('array');
+      expect(sentMessage.data.brokenLinks.length).to.equal(1);
+      expect(sentMessage.data.brokenLinks[0]).to.deep.include({
+        urlFrom: 'https://from.com/from-2',
+        urlTo: 'https://example.com',
+        suggestionId: 'test-suggestion-1',
+      });
+      
+      expect(context.log.debug).to.have.been.calledWith(sinon.match(/Message sent to Mystique/));
+    });
+
+    it('should filter alternative URLs by locale when broken links have locales', async () => {
+      configuration.isHandlerEnabledForSite.returns(true);
+      context.audit.getAuditResult.returns({
+        success: true,
+        brokenBacklinks: auditDataMock.auditResult.brokenBacklinks,
+      });
+      brokenBacklinksOpportunity.getSuggestions.returns([]);
+      brokenBacklinksOpportunity.addSuggestions.returns(brokenBacklinksSuggestions);
+
+      // Mock suggestions with locale-specific broken links
+      const suggestionsWithLocale = [
+        {
+          getId: () => 'test-suggestion-1',
+          getData: () => ({
+            url_from: 'https://from.com/from-1',
+            url_to: 'https://example.com/uk/en/old-page',
+          }),
         },
+      ];
+      context.dataAccess.Suggestion.allByOpportunityIdAndStatus.resolves(suggestionsWithLocale);
+
+      // Mock top pages with different locales
+      const topPagesWithLocales = [
+        { getUrl: () => 'https://example.com/uk/en/page1' },
+        { getUrl: () => 'https://example.com/uk/en/page2' },
+        { getUrl: () => 'https://example.com/fr/page1' }, // Should be filtered out
+        { getUrl: () => 'https://example.com/de/page1' }, // Should be filtered out
+      ];
+      context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo.resolves(topPagesWithLocales);
+
+      const result = await generateSuggestionData(context);
+
+      expect(result.status).to.deep.equal('complete');
+      const sentMessage = context.sqs.sendMessage.getCall(0).args[1];
+      expect(sentMessage.data.alternativeUrls).to.have.length(2);
+      expect(sentMessage.data.alternativeUrls).to.include('https://example.com/uk/en/page1');
+      expect(sentMessage.data.alternativeUrls).to.include('https://example.com/uk/en/page2');
+      expect(sentMessage.data.alternativeUrls).to.not.include('https://example.com/fr/page1');
+      expect(sentMessage.data.alternativeUrls).to.not.include('https://example.com/de/page1');
+    });
+
+    it('should skip sending message when no valid broken links', async () => {
+      configuration.isHandlerEnabledForSite.returns(true);
+      context.audit.getAuditResult.returns({
+        success: true,
+        brokenBacklinks: auditDataMock.auditResult.brokenBacklinks,
+      });
+      brokenBacklinksOpportunity.getSuggestions.returns([]);
+      brokenBacklinksOpportunity.addSuggestions.returns(brokenBacklinksSuggestions);
+
+      // Mock suggestions with invalid data (missing fields)
+      const invalidSuggestions = [
+        {
+          getId: () => 'test-suggestion-1',
+          getData: () => ({
+            url_from: '', // Invalid - empty
+            url_to: 'https://example.com/page',
+          }),
+        },
+      ];
+      context.dataAccess.Suggestion.allByOpportunityIdAndStatus.resolves(invalidSuggestions);
+
+      const result = await generateSuggestionData(context);
+
+      expect(result.status).to.deep.equal('complete');
+      expect(context.sqs.sendMessage).to.not.have.been.called;
+      expect(context.log.warn).to.have.been.calledWith('No valid broken links to send to Mystique. Skipping message.');
+    });
+
+    it('should skip sending message when no alternative URLs available', async () => {
+      configuration.isHandlerEnabledForSite.returns(true);
+      context.audit.getAuditResult.returns({
+        success: true,
+        brokenBacklinks: auditDataMock.auditResult.brokenBacklinks,
       });
+      brokenBacklinksOpportunity.getSuggestions.returns([]);
+      brokenBacklinksOpportunity.addSuggestions.returns(brokenBacklinksSuggestions);
+
+      // Mock suggestions
+      const validSuggestions = [
+        {
+          getId: () => 'test-suggestion-1',
+          getData: () => ({
+            url_from: 'https://from.com/from-1',
+            url_to: 'https://example.com/uk/en/old-page',
+          }),
+        },
+      ];
+      context.dataAccess.Suggestion.allByOpportunityIdAndStatus.resolves(validSuggestions);
+
+      // Mock an empty top-pages result so no alternative URLs remain after filtering
+      context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo.resolves([]);
+
+      const result = await generateSuggestionData(context);
+
+      expect(result.status).to.deep.equal('complete');
+      expect(context.sqs.sendMessage).to.not.have.been.called;
+      expect(context.log.warn).to.have.been.calledWith('No alternative URLs available. Cannot generate suggestions. Skipping message to Mystique.');
     });
   });
 

Original file line number	Diff line number	Diff line change
`@@ -266,7 +266,7 @@ export const opportunityAndSuggestionsStep = async (context) => {`
`266`	`266`	`// Extract unique locales/subpaths from broken links`
`267`	`267`	`const brokenLinkLocales = new Set();`
`268`	`268`	`brokenLinks.forEach((link) => {`
`269`		`- const locale = extractPathPrefix(link.urlTo) \|\| extractPathPrefix(link.urlFrom);`
	`269`	`+ const locale = extractPathPrefix(link.urlTo);`
`270`	`270`	`if (locale) {`
`271`	`271`	`brokenLinkLocales.add(locale);`
`272`	`272`	`}`