Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
0275a48
testing: commit to test a few things
LeChef318 Aug 14, 2025
4c82e06
fix: update dependencies for scrapeClient and data access with new URLs
LeChef318 Aug 14, 2025
b239e02
test: comment out invalid step destination test case
LeChef318 Aug 14, 2025
7296fb6
fix: update step destination for scraping to use SCRAPE_CLIENT
LeChef318 Aug 14, 2025
d9d5497
fix: simplify URL structure in handler.js by returning finalUrls dire…
LeChef318 Aug 14, 2025
56572fd
fix: update metatag test cases to return URLs as strings instead of o…
LeChef318 Aug 14, 2025
7eef64c
fix: modify URL structure to return objects in handler.js and add Mes…
LeChef318 Aug 15, 2025
d6d02a4
fix: update metatag test cases to return URLs as objects instead of s…
LeChef318 Aug 15, 2025
006e040
fix: update package-lock.json to reflect changes in integrity hashes …
LeChef318 Aug 15, 2025
67a3530
fix: refactor fetchAndProcessPageObject to accept URL and key parameters
LeChef318 Aug 15, 2025
8164136
fix: update metatag test cases to use block comments for better reada…
LeChef318 Aug 15, 2025
0c2db3f
fix: update integrity hash for spacecat-shared-scrape-client and add …
LeChef318 Aug 15, 2025
a1817c5
fix: log scrape result paths for better debugging
LeChef318 Aug 15, 2025
090ca66
fix: await scrape result paths retrieval in step-audit.js
LeChef318 Aug 15, 2025
acbd243
fix: update fetchAndProcessPageObject parameters for clarity
LeChef318 Aug 15, 2025
09f9539
fix: update integrity hash for spacecat-shared-scrape-client
LeChef318 Aug 15, 2025
86aaf74
fix: destructure parameters in fetchAndProcessPageObject mapping
LeChef318 Aug 15, 2025
26a6221
feat: implement ScrapeClient integration for stepped audits
LeChef318 Aug 20, 2025
ff507e8
Merge branch 'refs/heads/main' into feat-implement-scrapeClient-stepp…
LeChef318 Aug 20, 2025
116f374
Merge branch 'main' into feat-implement-scrapeClient-stepped-audits
LeChef318 Aug 20, 2025
5ab779b
Merge branch 'main' into feat-implement-scrapeClient-stepped-audits
LeChef318 Aug 20, 2025
e8a5208
fix: rebuilt package-lock.json (again)
LeChef318 Aug 20, 2025
e8fc6a4
chore: update @adobe/spacecat-shared-data-access to 2.55.0 and @adobe…
LeChef318 Aug 20, 2025
f023819
Merge branch 'main' into feat-implement-scrapeClient-stepped-audits
LeChef318 Aug 20, 2025
19ef307
refactor: clean up unused code and improve error logging in handler.js
LeChef318 Aug 20, 2025
d1755da
fix: update error message in metatags test for clarity
LeChef318 Aug 20, 2025
7527bbe
feat: add scrapeResultPaths and update payload structure for SCRAPE_C…
LeChef318 Aug 27, 2025
6d998f7
Merge branch 'main' into feat-implement-scrapeClient-stepped-audits
LeChef318 Aug 27, 2025
51d8a4a
fix: package-lock.json errors
LeChef318 Aug 27, 2025
b2b0564
fix: package-lock.json errors, again
LeChef318 Aug 27, 2025
c8972fd
Merge branch 'main' of github.com:adobe/spacecat-audit-worker into fe…
LeChef318 Aug 28, 2025
fa232fb
Merge branch 'main' of github.com:adobe/spacecat-audit-worker into fe…
LeChef318 Sep 2, 2025
8883d06
fix: update preflight/metatags audit to use URL to S3 key mapping
LeChef318 Sep 3, 2025
af06f60
fix: update metatagsAutoDetect to use pagesMap for S3 key mapping
LeChef318 Sep 3, 2025
3ea0781
fix: log start of meta tags audit with new scraper data format
LeChef318 Sep 3, 2025
faca2c9
Merge branch 'main' into feat-implement-scrapeClient-stepped-audits
LeChef318 Sep 5, 2025
76ebad3
Merge branch 'main' into feat-implement-scrapeClient-stepped-audits
LeChef318 Sep 8, 2025
3b9b25d
Merge branch 'main' into feat-implement-scrapeClient-stepped-audits
LeChef318 Sep 8, 2025
c934ba7
Merge branch 'main' into feat-implement-scrapeClient-stepped-audits
LeChef318 Sep 8, 2025
10b5a4b
Merge branch 'main' into feat-implement-scrapeClient-stepped-audits
LeChef318 Sep 10, 2025
c29a6af
Merge branch 'main' of github.com:adobe/spacecat-audit-worker into fe…
LeChef318 Sep 12, 2025
ede2cb6
Merge branch 'main' into feat-implement-scrapeClient-stepped-audits
LeChef318 Sep 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 12 additions & 41 deletions src/metatags/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,12 @@
import RUMAPIClient from '@adobe/spacecat-shared-rum-api-client';
import { Audit } from '@adobe/spacecat-shared-data-access';
import { calculateCPCValue } from '../support/utils.js';
import { getObjectFromKey, getObjectKeysUsingPrefix } from '../utils/s3-utils.js';
import { getObjectFromKey } from '../utils/s3-utils.js';
import SeoChecks from './seo-checks.js';
import { AuditBuilder } from '../common/audit-builder.js';
import { wwwUrlResolver } from '../common/index.js';
import metatagsAutoSuggest from './metatags-auto-suggest.js';
import { convertToOpportunity } from '../common/opportunity.js';
import { getTopPagesForSiteId } from '../canonical/handler.js';
import { getIssueRanking, getBaseUrl } from './opportunity-utils.js';
import {
DESCRIPTION,
Expand Down Expand Up @@ -90,7 +89,7 @@ export async function opportunityAndSuggestions(finalUrl, auditData, context) {
log.info(`Successfully synced Opportunity And Suggestions for site: ${auditData.siteId} and ${auditType} audit type.`);
}

export async function fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log) {
export async function fetchAndProcessPageObject(s3Client, bucketName, url, key, log) {
const object = await getObjectFromKey(s3Client, bucketName, key, log);
if (!object?.scrapeResult?.tags || typeof object.scrapeResult.tags !== 'object') {
log.error(`No Scraped tags found in S3 ${key} object`);
Expand All @@ -102,12 +101,9 @@ export async function fetchAndProcessPageObject(s3Client, bucketName, key, prefi
return null;
}

let pageUrl = object.finalUrl ? new URL(object.finalUrl).pathname
: key.slice(prefix.length - 1).replace('/scrape.json', ''); // Remove the prefix and scrape.json suffix
const pageUrl = object.finalUrl ? new URL(object.finalUrl).pathname
: new URL(url).pathname;
// handling for homepage
if (pageUrl === '') {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check was added since the home page had no path (empty). Will the page URL being computed above be '/' for the home page now?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

new URL(url).pathname will resolve to '/' for the home page

pageUrl = '/';
}
return {
[pageUrl]: {
title: object.scrapeResult.tags.title,
Expand Down Expand Up @@ -206,24 +202,21 @@ async function calculateProjectedTraffic(context, site, detectedTags, log) {
}
}

export async function metatagsAutoDetect(site, pagesSet, context) {
export async function metatagsAutoDetect(site, pagesMap, context) {
const { log, s3Client } = context;
// Fetch site's scraped content from S3
const bucketName = context.env.S3_SCRAPER_BUCKET_NAME;
const prefix = `scrapes/${site.getId()}/`;
const scrapedObjectKeys = await getObjectKeysUsingPrefix(s3Client, bucketName, prefix, log);
const extractedTags = {};
const pageMetadataResults = await Promise.all(scrapedObjectKeys
.filter((key) => pagesSet.has(key))
.map((key) => fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log)));
const pageMetadataResults = await Promise.all([...pagesMap]
.map(([url, path]) => fetchAndProcessPageObject(s3Client, bucketName, url, path, log)));
pageMetadataResults.forEach((pageMetadata) => {
if (pageMetadata) {
Object.assign(extractedTags, pageMetadata);
}
});
const extractedTagsCount = Object.entries(extractedTags).length;
if (extractedTagsCount === 0) {
log.error(`Failed to extract tags from scraped content for bucket ${bucketName} and prefix ${prefix}`);
log.error(`Failed to extract tags from scraped content for bucket ${bucketName}`);
}

// Perform SEO checks
Expand All @@ -242,38 +235,16 @@ export async function metatagsAutoDetect(site, pagesSet, context) {
};
}

/**
* Transforms a URL into a scrape.json path for a given site
* @param {string} url - The URL to transform
* @param {string} siteId - The site ID
* @returns {string} The path to the scrape.json file
*/
function getScrapeJsonPath(url, siteId) {
const pathname = new URL(url).pathname.replace(/\/$/, '');
return `scrapes/${siteId}${pathname}/scrape.json`;
}

export async function runAuditAndGenerateSuggestions(context) {
const {
site, audit, finalUrl, log, dataAccess,
site, audit, finalUrl, log, scrapeResultPaths,
} = context;
// Get top pages for a site
const siteId = site.getId();
const topPages = await getTopPagesForSiteId(dataAccess, siteId, context, log);
const includedURLs = await site?.getConfig()?.getIncludedURLs('meta-tags') || [];

// Transform URLs into scrape.json paths and combine them into a Set
const topPagePaths = topPages.map((page) => getScrapeJsonPath(page.url, siteId));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this code is now removed, will the scrapeResultPaths coming from context ensure it only contain the latest top-pages scrapes, and no older scrapes (from previous top-pages imports)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

scrapeResultPaths will only contain URLs that were originally submitted for scraping by this audit. The new scrapeClient workflow will ensure that.

const includedUrlPaths = includedURLs.map((url) => getScrapeJsonPath(url, siteId));
Copy link
Contributor

@dipratap dipratap Sep 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this code is now removed, how are we incorporating the includedUrls in the meta-tags audit?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We use all URLs that were successfully scraped. In the submitForScraping step, the included URLs will be sent with the top pages to the scraper (see here).
Only the URLs that were submitted by this audit will be returned (no old scrapes or other pages that were not submitted).

const totalPagesSet = new Set([...topPagePaths, ...includedUrlPaths]);

log.info(`Received topPages: ${topPagePaths.length}, includedURLs: ${includedUrlPaths.length}, totalPages to process after removing duplicates: ${totalPagesSet.size}`);

log.info(scrapeResultPaths);
const {
seoChecks,
detectedTags,
extractedTags,
} = await metatagsAutoDetect(site, totalPagesSet, context);
} = await metatagsAutoDetect(site, scrapeResultPaths, context);

// Calculate projected traffic lost
const {
Expand Down Expand Up @@ -355,6 +326,6 @@ export async function submitForScraping(context) {
export default new AuditBuilder()
.withUrlResolver((site) => site.getBaseURL())
.addStep('submit-for-import-top-pages', importTopPages, AUDIT_STEP_DESTINATIONS.IMPORT_WORKER)
.addStep('submit-for-scraping', submitForScraping, AUDIT_STEP_DESTINATIONS.CONTENT_SCRAPER)
.addStep('submit-for-scraping', submitForScraping, AUDIT_STEP_DESTINATIONS.SCRAPE_CLIENT)
.addStep('run-audit-and-generate-suggestions', runAuditAndGenerateSuggestions)
.build();
11 changes: 9 additions & 2 deletions src/preflight/metatags.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ export default async function metatags(context, auditContext) {
step,
audits,
auditsResult,
s3Keys,
timeExecutionBreakdown,
} = auditContext;
if (!checks || checks.includes(PREFLIGHT_METATAGS)) {
Expand All @@ -39,11 +38,19 @@ export default async function metatags(context, auditContext) {
pageResult.audits.push({ name: PREFLIGHT_METATAGS, type: 'seo', opportunities: [] });
});

// Workaround for the updated meta-tags audit which requires a map of URL to S3 key
// TODO: change as soon as preflight is migrated to the ScrapeClient
const pageMap = new Map(previewUrls.map((url) => {
const s3Key = `scrapes/${site.getId()}${new URL(url).pathname.replace(/\/$/, '')}/scrape.json`;
return [url, s3Key];
}));
log.info('[preflight-audit] Starting meta tags audit with new scraper data format');

const {
seoChecks,
detectedTags,
extractedTags,
} = await metatagsAutoDetect(site, s3Keys, context);
} = await metatagsAutoDetect(site, pageMap, context);
try {
const tagCollection = step === 'suggest'
? await metatagsAutoSuggest({
Expand Down
129 changes: 92 additions & 37 deletions test/audits/metatags.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -364,8 +364,8 @@ describe('Meta Tags', () => {
const result = await fetchAndProcessPageObject(
s3ClientStub,
'test-bucket',
'www.test-site.com/page1',
'scrapes/site-id/page1/scrape.json',
'scrapes/site-id/',
logStub,
);

Expand Down Expand Up @@ -401,8 +401,8 @@ describe('Meta Tags', () => {
const result = await fetchAndProcessPageObject(
s3ClientStub,
'test-bucket',
'https://www.test-site.com',
'scrapes/site-id/scrape.json',
'scrapes/site-id/',
logStub,
);

Expand All @@ -427,8 +427,8 @@ describe('Meta Tags', () => {
const result = await fetchAndProcessPageObject(
s3ClientStub,
'test-bucket',
'https://www.test-site.com/page1',
'scrapes/site-id/page1/scrape.json',
'scrapes/site-id/',
logStub,
);

Expand Down Expand Up @@ -461,8 +461,8 @@ describe('Meta Tags', () => {
const result = await fetchAndProcessPageObject(
s3ClientStub,
'test-bucket',
'https://www.test-site.com/404',
'scrapes/site-id/404/scrape.json',
'scrapes/site-id/',
logStub,
);

Expand Down Expand Up @@ -495,8 +495,8 @@ describe('Meta Tags', () => {
const result = await fetchAndProcessPageObject(
s3ClientStub,
'test-bucket',
'https://www.test-site.com/valid-page',
'scrapes/site-id/valid-page/scrape.json',
'scrapes/site-id/',
logStub,
);

Expand Down Expand Up @@ -1062,6 +1062,13 @@ describe('Meta Tags', () => {
ContentType: 'application/json',
};

const scrapeResultPaths = new Map([
['https://www.test-site.com/blog/page1', 'scrapes/site-id/blog/page1/scrape.json'],
['https://www.test-site.com/blog/page2', 'scrapes/site-id/blog/page2/scrape.json'],
['https://www.test-site.com/blog/page3', 'scrapes/site-id/blog/page3/scrape.json'],
['https://www.test-site.com/', 'scrapes/site-id/scrape.json'],
]);

// Setup S3 client responses
s3ClientStub.send = sinon.stub();
s3ClientStub.send
Expand Down Expand Up @@ -1106,6 +1113,7 @@ describe('Meta Tags', () => {
env: {
S3_SCRAPER_BUCKET_NAME: 'test-bucket',
},
scrapeResultPaths,
};
});

Expand All @@ -1114,42 +1122,57 @@ describe('Meta Tags', () => {
});

it('should successfully run audit and generate suggestions', async () => {
const mockGetRUMDomainkey = sinon.stub().resolves('mockedDomainKey');
const mockCalculateCPCValue = sinon.stub().resolves(5000);
const mockGetRUMDomainkey = sinon.stub()
.resolves('mockedDomainKey');
const mockCalculateCPCValue = sinon.stub()
.resolves(5000);
const auditStub = await esmock('../../src/metatags/handler.js', {
'../../src/support/utils.js': { getRUMDomainkey: mockGetRUMDomainkey, calculateCPCValue: mockCalculateCPCValue },
'../../src/support/utils.js': {
getRUMDomainkey: mockGetRUMDomainkey,
calculateCPCValue: mockCalculateCPCValue,
},
'@adobe/spacecat-shared-rum-api-client': RUMAPIClientStub,
'../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() },
'../../src/metatags/metatags-auto-suggest.js': sinon.stub().resolves({
'/blog/page1': {
title: {
aiSuggestion: 'AI Suggested Title 1',
aiRationale: 'AI Rationale 1',
'../../src/metatags/metatags-auto-suggest.js': sinon.stub()
.resolves({
'/blog/page1': {
title: {
aiSuggestion: 'AI Suggested Title 1',
aiRationale: 'AI Rationale 1',
},
},
},
'/blog/page2': {
title: {
aiSuggestion: 'AI Suggested Title 2',
aiRationale: 'AI Rationale 2',
'/blog/page2': {
title: {
aiSuggestion: 'AI Suggested Title 2',
aiRationale: 'AI Rationale 2',
},
},
},
}),
}),
});
const result = await auditStub.runAuditAndGenerateSuggestions(context);

expect(result).to.deep.equal({ status: 'complete' });
expect(result)
.to
.deep
.equal({ status: 'complete' });
expect(s3ClientStub.send).to.have.been.called;
expect(metatagsOppty.save).to.have.been.called;
});

it('should handle case when no tags are extracted', async () => {
const mockGetRUMDomainkey = sinon.stub().resolves('mockedDomainKey');
const mockCalculateCPCValue = sinon.stub().resolves(2);
const mockGetRUMDomainkey = sinon.stub()
.resolves('mockedDomainKey');
const mockCalculateCPCValue = sinon.stub()
.resolves(2);
const auditStub = await esmock('../../src/metatags/handler.js', {
'../../src/support/utils.js': { getRUMDomainkey: mockGetRUMDomainkey, calculateCPCValue: mockCalculateCPCValue },
'../../src/support/utils.js': {
getRUMDomainkey: mockGetRUMDomainkey,
calculateCPCValue: mockCalculateCPCValue,
},
'@adobe/spacecat-shared-rum-api-client': RUMAPIClientStub,
'../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() },
'../../src/metatags/metatags-auto-suggest.js': sinon.stub().resolves({}),
'../../src/metatags/metatags-auto-suggest.js': sinon.stub()
.resolves({}),
});

// Override all S3 responses to have null tags
Expand All @@ -1168,27 +1191,55 @@ describe('Meta Tags', () => {

const result = await auditStub.runAuditAndGenerateSuggestions(context);

expect(result).to.deep.equal({ status: 'complete' });
expect(logStub.error).to.have.been.calledWith('No Scraped tags found in S3 scrapes/site-id/blog/page3/scrape.json object');
expect(logStub.error).to.have.been.calledWith('Failed to extract tags from scraped content for bucket test-bucket and prefix scrapes/site-id/');
}).timeout(10000);
expect(result)
.to
.deep
.equal({ status: 'complete' });
expect(logStub.error)
.to
.have
.been
.calledWith('No Scraped tags found in S3 scrapes/site-id/blog/page3/scrape.json object');
expect(logStub.error)
.to
.have
.been
.calledWith('Failed to extract tags from scraped content for bucket test-bucket');
})
.timeout(10000);

it('should handle RUM API errors gracefully', async () => {
const mockGetRUMDomainkey = sinon.stub().resolves('mockedDomainKey');
const mockCalculateCPCValue = sinon.stub().resolves(2);
const mockGetRUMDomainkey = sinon.stub()
.resolves('mockedDomainKey');
const mockCalculateCPCValue = sinon.stub()
.resolves(2);
const auditStub = await esmock('../../src/metatags/handler.js', {
'../../src/support/utils.js': { getRUMDomainkey: mockGetRUMDomainkey, calculateCPCValue: mockCalculateCPCValue },
'../../src/support/utils.js': {
getRUMDomainkey:
mockGetRUMDomainkey,
calculateCPCValue: mockCalculateCPCValue,
},
'@adobe/spacecat-shared-rum-api-client': RUMAPIClientStub,
'../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() },
'../../src/metatags/metatags-auto-suggest.js': sinon.stub().resolves({}),
'../../src/metatags/metatags-auto-suggest.js': sinon.stub()
.resolves({}),
});
// Override RUM API response to simulate error
RUMAPIClientStub.createFrom().query.rejects(new Error('RUM API Error'));
RUMAPIClientStub.createFrom()
.query
.rejects(new Error('RUM API Error'));

const result = await auditStub.runAuditAndGenerateSuggestions(context);

expect(result).to.deep.equal({ status: 'complete' });
expect(logStub.warn).to.have.been.calledWith('Error while calculating projected traffic for site-id', sinon.match.instanceOf(Error));
expect(result)
.to
.deep
.equal({ status: 'complete' });
expect(logStub.warn)
.to
.have
.been
.calledWith('Error while calculating projected traffic for site-id', sinon.match.instanceOf(Error));
});

it('should submit top pages for scraping when getIncludedURLs returns null', async () => {
Expand All @@ -1199,7 +1250,11 @@ describe('Meta Tags', () => {
});
context.site.getConfig = getConfigStub;
const auditStub = await esmock('../../src/metatags/handler.js', {
'../../src/support/utils.js': { getRUMDomainkey: mockGetRUMDomainkey, calculateCPCValue: mockCalculateCPCValue },
'../../src/support/utils.js': {
getRUMDomainkey:
mockGetRUMDomainkey,
calculateCPCValue: mockCalculateCPCValue,
},
'@adobe/spacecat-shared-rum-api-client': RUMAPIClientStub,
'../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() },
'../../src/metatags/metatags-auto-suggest.js': sinon.stub().resolves({}),
Expand Down