diff --git a/package-lock.json b/package-lock.json index 291124750..4777b2376 100644 --- a/package-lock.json +++ b/package-lock.json @@ -56,7 +56,7 @@ "@adobe/semantic-release-coralogix": "1.1.38", "@adobe/semantic-release-skms-cmr": "1.1.5", "@babel/core": "7.28.3", - "@babel/eslint-parser": "7.28.0", + "@babel/eslint-parser": "^7.28.0", "@babel/plugin-syntax-import-assertions": "7.27.1", "@eslint/config-helpers": "0.3.1", "@redocly/cli": "2.0.8", diff --git a/src/structured-data/handler.js b/src/structured-data/handler.js index 8cb9ec1ab..ba32e258a 100644 --- a/src/structured-data/handler.js +++ b/src/structured-data/handler.js @@ -10,7 +10,7 @@ * governing permissions and limitations under the License. */ /* eslint-disable no-continue, no-await-in-loop */ -import { isNonEmptyArray } from '@adobe/spacecat-shared-utils'; + import { Audit } from '@adobe/spacecat-shared-data-access'; import { Suggestion as SuggestionModel } from '@adobe/spacecat-shared-data-access/src/models/suggestion/index.js'; @@ -162,18 +162,24 @@ export async function submitForScraping(context) { site, dataAccess, log, - finalUrl, } = context; const { SiteTopPage } = dataAccess; const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'ahrefs', 'global'); - if (topPages.length === 0) { - throw new Error('No top pages found for site'); - } - log.info(`SDA: Submitting for scraping ${topPages.length} top pages for site ${site.getId()}, finalUrl: ${finalUrl}`); + const topPagesUrls = topPages.map((page) => page.getUrl()); + // Combine includedURLs and topPages URLs to scrape + // just for testing will replace again by structured data + const includedURLs = await site?.getConfig()?.getIncludedURLs('meta-tags') || []; + + const finalUrls = [...new Set([...topPagesUrls, ...includedURLs])]; + log.info(`SDA: Total top pages: ${topPagesUrls.length}, Total included URLs: ${includedURLs.length}, Final URLs to scrape after removing duplicates: ${finalUrls.length}`); + + if (finalUrls.length === 0) { + throw new Error('No URLs found for site neither top pages nor included URLs'); + } return { - urls: topPages.map((topPage) => ({ url: topPage.getUrl() })), + urls: finalUrls.map((url) => ({ url })), siteId: site.getId(), type: 'structured-data', }; @@ -192,19 +198,28 @@ export async function runAuditAndGenerateSuggestions(context) { const scrapeCache = new Map(); try { - let topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(siteId, 'ahrefs', 'global'); - if (!isNonEmptyArray(topPages)) { - log.error(`SDA: No top pages for site ID ${siteId} found. Ensure that top pages were imported.`); - throw new Error(`No top pages for site ID ${siteId} found.`); - } else { - topPages = topPages.map((page) => ({ url: page.getUrl() })); + const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(siteId, 'ahrefs', 'global'); + const topPagesUrls = topPages.map((page) => page.getUrl()); + + // Combine includedURLs and topPages URLs to process + // just for testing will replace again by structured data + const includedURLs = await site?.getConfig()?.getIncludedURLs('meta-tags') || []; + const finalUrls = [...new Set([...topPagesUrls, ...includedURLs])]; + log.info(`SDA: Total top pages: ${topPagesUrls.length}, Total included URLs: ${includedURLs.length}, Final URLs to process after removing duplicates: ${finalUrls.length}`); + + if (finalUrls.length === 0) { + log.error(`SDA: No URLs found for site ID ${siteId} (neither top pages nor included URLs). Ensure that top pages were imported or included URLs are configured.`); + throw new Error(`No URLs found for site ID ${siteId} (neither top pages nor included URLs).`); } - // Filter out files from the top pages as these are not scraped + // Convert URLs to the format expected by processStructuredData + let urlsToProcess = finalUrls.map((url) => ({ url })); + + // Filter out files from the URLs as these are not scraped const dataTypesToIgnore = ['pdf', 'ps', 'dwf', 'kml', 'kmz', 'xls', 'xlsx', 'ppt', 'pptx', 'doc', 'docx', 'rtf', 'swf']; - topPages = topPages.filter((page) => !dataTypesToIgnore.some((dataType) => page.url.endsWith(`.${dataType}`))); + urlsToProcess = urlsToProcess.filter((page) => !dataTypesToIgnore.some((dataType) => page.url.endsWith(`.${dataType}`))); - const auditResult = await processStructuredData(finalUrl, context, topPages, scrapeCache); + const auditResult = await processStructuredData(finalUrl, context, urlsToProcess, scrapeCache); // Create opportunities and suggestions const oppAndAudit = await opportunityAndSuggestions(finalUrl, { diff --git a/src/structured-data/lib.js b/src/structured-data/lib.js index 018885af0..7b904438d 100644 --- a/src/structured-data/lib.js +++ b/src/structured-data/lib.js @@ -206,8 +206,13 @@ export async function getIssuesFromScraper(context, pages, scrapeCache) { const waeResult = scrapeResult?.scrapeResult?.structuredData; + // DEBUG: Log what we extracted + log.info(`SDA DEBUG: Page ${page} - Extracted structured data:`, JSON.stringify(waeResult, null, 2)); + log.info(`SDA DEBUG: Page ${page} - waeResult type: ${typeof waeResult}, isArray: ${Array.isArray(waeResult)}`); + // If scrape contains old format of structured data, skip if (isNonEmptyArray(waeResult)) { + log.info(`SDA DEBUG: Page ${page} - Skipping old format (array)`); return; } @@ -221,13 +226,21 @@ export async function getIssuesFromScraper(context, pages, scrapeCache) { const validator = new StructuredDataValidator(schemaOrgJson); let validatorIssues = []; try { - validatorIssues = (await validator.validate(waeResult)) + const rawValidatorIssues = await validator.validate(waeResult); + log.info(`SDA DEBUG: Page ${page} - Raw validator found ${rawValidatorIssues.length} issues`); + rawValidatorIssues.forEach((issue, index) => { + log.info(`SDA DEBUG: Page ${page} - Issue ${index}: ${issue.severity} - ${issue.rootType} - ${issue.issueMessage}`); + }); + + validatorIssues = rawValidatorIssues // For now, ignore issues with severity lower than ERROR // and suppress unnecessary issues for AEM customers .filter((issue) => includeIssue(context, issue, imageObjectFlag)); + log.info(`SDA DEBUG: Page ${page} - After filtering: ${validatorIssues.length} issues remain`); } catch (e) { log.error(`SDA: Failed to validate structured data for ${page}.`, e); } + log.info(`SDA DEBUG: Page ${page} - Final issues to add: ${validatorIssues.length}`); for (const issue of validatorIssues) { // Only add if same issue for the same source does not exist already. // This can happen e.g. if a field is missing for every item in a list. diff --git a/test/audits/structured-data/structured-data.test.js b/test/audits/structured-data/structured-data.test.js index a94545644..e10a0cc17 100644 --- a/test/audits/structured-data/structured-data.test.js +++ b/test/audits/structured-data/structured-data.test.js @@ -55,6 +55,7 @@ describe('Structured Data Audit', () => { let mockConfiguration; let s3ClientStub; let auditStub; + let defaultGetIncludedURLs; const finalUrl = 'https://www.example.com'; @@ -67,10 +68,17 @@ describe('Structured Data Audit', () => { getObject: sinon.stub(), }; + defaultGetIncludedURLs = sinon.spy((auditType) => { + if (auditType === 'meta-tags') { + return ['https://example.com/product/1', 'https://example.com/product/2', 'https://example.com/product/3']; + } + return []; + }); + siteStub = { getId: () => '123', getConfig: () => ({ - getIncludedURLs: () => ['https://example.com/product/1', 'https://example.com/product/2', 'https://example.com/product/3'], + getIncludedURLs: defaultGetIncludedURLs, }), getDeliveryType: () => 'other', }; @@ -130,19 +138,50 @@ describe('Structured Data Audit', () => { }); describe('runAuditAndGenerateSuggestions', () => { - it('throws an error if no top pages are available', async () => { + it('throws an error if no top pages and no included URLs are available', async () => { context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]); + siteStub.getConfig = () => ({ + getIncludedURLs: () => [], + }); const result = await runAuditAndGenerateSuggestions(context); expect(result).to.deep.equal({ fullAuditRef: 'https://www.example.com', auditResult: { - error: 'No top pages for site ID 123 found.', + error: 'No URLs found for site ID 123 (neither top pages nor included URLs).', success: false, }, }); }); + it('works with only included URLs when no top pages are available', async () => { + context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]); + siteStub.getConfig = () => ({ + getIncludedURLs: (auditType) => (auditType === 'meta-tags' ? ['https://example.com/product/1'] : []), + }); + + context.dataAccess.Opportunity.allBySiteIdAndStatus + .resolves([context.dataAccess.Opportunity]); + context.dataAccess.Opportunity.getSuggestions.resolves([]); + context.dataAccess.Opportunity.getId.returns('opportunity-id'); + context.dataAccess.Opportunity.getType.returns('structured-data'); + context.dataAccess.Opportunity.addSuggestions.resolves(structuredDataSuggestions); + + s3ClientStub.send.resolves(createS3ObjectStub(JSON.stringify({ + scrapeResult: { + rawBody: '
', + structuredData: { + jsonld: {}, + errors: [], + }, + }, + }))); + context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]); + + const result = await runAuditAndGenerateSuggestions(context); + expect(result.auditResult.success).to.equal(true); + }); + it('filters out files from top pages', async () => { context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([ createPageStub('https://example.com/product/1'), @@ -204,6 +243,100 @@ describe('Structured Data Audit', () => { await runAuditAndGenerateSuggestions(context); expect(context.sqs.sendMessage).to.have.been.calledOnce; + // Verify that getIncludedURLs was called with 'meta-tags' + expect(defaultGetIncludedURLs).to.have.been.calledWith('meta-tags'); + }); + + it('combines top pages and included URLs for audit processing', async () => { + context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([ + createPageStub('https://example.com/top/1'), + ]); + const getIncludedURLsSpy = sinon.spy((auditType) => (auditType === 'meta-tags' ? ['https://example.com/included/1'] : [])); + siteStub.getConfig = () => ({ + getIncludedURLs: getIncludedURLsSpy, + }); + + context.dataAccess.Opportunity.allBySiteIdAndStatus + .resolves([context.dataAccess.Opportunity]); + context.dataAccess.Opportunity.getSuggestions.resolves([]); + context.dataAccess.Opportunity.getId.returns('opportunity-id'); + context.dataAccess.Opportunity.getType.returns('structured-data'); + context.dataAccess.Opportunity.addSuggestions.resolves(structuredDataSuggestions); + + s3ClientStub.send.resolves(createS3ObjectStub(JSON.stringify({ + scrapeResult: { + rawBody: '
', + structuredData: { + jsonld: {}, + errors: [], + }, + }, + }))); + context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]); + + const result = await runAuditAndGenerateSuggestions(context); + expect(result.auditResult.success).to.equal(true); + // Verify that getIncludedURLs was called with 'meta-tags' + expect(getIncludedURLsSpy).to.have.been.calledWith('meta-tags'); + // Verify that both top pages and included URLs are processed + expect(s3ClientStub.send.callCount).to.be.at.least(1); // At least one call should be made + }); + + it('handles null site gracefully in runAuditAndGenerateSuggestions', async () => { + context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([ + createPageStub('https://example.com/top/1'), + ]); + // Create a site stub that has getId but getConfig returns null + siteStub.getConfig = () => null; + + context.dataAccess.Opportunity.allBySiteIdAndStatus + .resolves([context.dataAccess.Opportunity]); + context.dataAccess.Opportunity.getSuggestions.resolves([]); + context.dataAccess.Opportunity.getId.returns('opportunity-id'); + context.dataAccess.Opportunity.getType.returns('structured-data'); + context.dataAccess.Opportunity.addSuggestions.resolves(structuredDataSuggestions); + + s3ClientStub.send.resolves(createS3ObjectStub(JSON.stringify({ + scrapeResult: { + rawBody: '
', + structuredData: { + jsonld: {}, + errors: [], + }, + }, + }))); + context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]); + + const result = await runAuditAndGenerateSuggestions(context); + expect(result.auditResult.success).to.equal(true); + }); + + it('handles null getConfig gracefully in runAuditAndGenerateSuggestions', async () => { + context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([ + createPageStub('https://example.com/top/1'), + ]); + siteStub.getConfig = () => null; // Set getConfig to return null + + context.dataAccess.Opportunity.allBySiteIdAndStatus + .resolves([context.dataAccess.Opportunity]); + context.dataAccess.Opportunity.getSuggestions.resolves([]); + context.dataAccess.Opportunity.getId.returns('opportunity-id'); + context.dataAccess.Opportunity.getType.returns('structured-data'); + context.dataAccess.Opportunity.addSuggestions.resolves(structuredDataSuggestions); + + s3ClientStub.send.resolves(createS3ObjectStub(JSON.stringify({ + scrapeResult: { + rawBody: '
', + structuredData: { + jsonld: {}, + errors: [], + }, + }, + }))); + context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]); + + const result = await runAuditAndGenerateSuggestions(context); + expect(result.auditResult.success).to.equal(true); }); }); @@ -479,12 +612,92 @@ describe('Structured Data Audit', () => { { url: 'https://example.com/product/3' }, ], }); + // Verify that getIncludedURLs was called with 'meta-tags' + expect(defaultGetIncludedURLs).to.have.been.calledWith('meta-tags'); }); - it('throws error if no top pages are found when sending scraping request', async () => { + it('throws error if no top pages and no included URLs are found when sending scraping request', async () => { context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]); + siteStub.getConfig = () => ({ + getIncludedURLs: () => [], + }); + + await expect(submitForScraping(context)).to.be.rejectedWith('No URLs found for site neither top pages nor included URLs'); + }); + + it('works with only included URLs when no top pages are available for scraping', async () => { + context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]); + siteStub.getConfig = () => ({ + getIncludedURLs: (auditType) => (auditType === 'meta-tags' ? ['https://example.com/included/1', 'https://example.com/included/2'] : []), + }); + + const result = await submitForScraping(context); + expect(result).to.deep.equal({ + siteId: '123', + type: 'structured-data', + urls: [ + { url: 'https://example.com/included/1' }, + { url: 'https://example.com/included/2' }, + ], + }); + }); - expect(submitForScraping(context)).to.be.rejectedWith('No top pages for site ID 123 found.'); + it('combines top pages and included URLs for scraping', async () => { + context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([ + createPageStub('https://example.com/top/1'), + createPageStub('https://example.com/top/2'), + ]); + const getIncludedURLsSpy = sinon.spy((auditType) => (auditType === 'meta-tags' ? ['https://example.com/included/1', 'https://example.com/included/2'] : [])); + siteStub.getConfig = () => ({ + getIncludedURLs: getIncludedURLsSpy, + }); + + const result = await submitForScraping(context); + expect(result).to.deep.equal({ + siteId: '123', + type: 'structured-data', + urls: [ + { url: 'https://example.com/top/1' }, + { url: 'https://example.com/top/2' }, + { url: 'https://example.com/included/1' }, + { url: 'https://example.com/included/2' }, + ], + }); + // Verify that getIncludedURLs was called with 'meta-tags' + expect(getIncludedURLsSpy).to.have.been.calledWith('meta-tags'); + }); + + it('handles null site gracefully in submitForScraping', async () => { + context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([ + createPageStub('https://example.com/top/1'), + ]); + // Create a site stub that has getId but getConfig returns null + siteStub.getConfig = () => null; + + const result = await submitForScraping(context); + expect(result).to.deep.equal({ + siteId: '123', + type: 'structured-data', + urls: [ + { url: 'https://example.com/top/1' }, + ], + }); + }); + + it('handles null getConfig gracefully in submitForScraping', async () => { + context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([ + createPageStub('https://example.com/top/1'), + ]); + siteStub.getConfig = () => null; // Set getConfig to return null + + const result = await submitForScraping(context); + expect(result).to.deep.equal({ + siteId: '123', + type: 'structured-data', + urls: [ + { url: 'https://example.com/top/1' }, + ], + }); }); }); });