diff --git a/package-lock.json b/package-lock.json
index 291124750..4777b2376 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -56,7 +56,7 @@
"@adobe/semantic-release-coralogix": "1.1.38",
"@adobe/semantic-release-skms-cmr": "1.1.5",
"@babel/core": "7.28.3",
- "@babel/eslint-parser": "7.28.0",
+ "@babel/eslint-parser": "^7.28.0",
"@babel/plugin-syntax-import-assertions": "7.27.1",
"@eslint/config-helpers": "0.3.1",
"@redocly/cli": "2.0.8",
diff --git a/src/structured-data/handler.js b/src/structured-data/handler.js
index 8cb9ec1ab..ba32e258a 100644
--- a/src/structured-data/handler.js
+++ b/src/structured-data/handler.js
@@ -10,7 +10,7 @@
* governing permissions and limitations under the License.
*/
/* eslint-disable no-continue, no-await-in-loop */
-import { isNonEmptyArray } from '@adobe/spacecat-shared-utils';
+
import { Audit } from '@adobe/spacecat-shared-data-access';
import { Suggestion as SuggestionModel } from '@adobe/spacecat-shared-data-access/src/models/suggestion/index.js';
@@ -162,18 +162,24 @@ export async function submitForScraping(context) {
site,
dataAccess,
log,
- finalUrl,
} = context;
const { SiteTopPage } = dataAccess;
const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'ahrefs', 'global');
- if (topPages.length === 0) {
- throw new Error('No top pages found for site');
- }
- log.info(`SDA: Submitting for scraping ${topPages.length} top pages for site ${site.getId()}, finalUrl: ${finalUrl}`);
+ const topPagesUrls = topPages.map((page) => page.getUrl());
+ // Combine the top-page URLs with the site's configured included URLs to scrape.
+ // TODO: the 'meta-tags' key below is for testing only and is meant to be replaced by the structured-data one again.
+ const includedURLs = await site?.getConfig()?.getIncludedURLs('meta-tags') || [];
+
+ const finalUrls = [...new Set([...topPagesUrls, ...includedURLs])];
+ log.info(`SDA: Total top pages: ${topPagesUrls.length}, Total included URLs: ${includedURLs.length}, Final URLs to scrape after removing duplicates: ${finalUrls.length}`);
+
+ if (finalUrls.length === 0) {
+ throw new Error('No URLs found for site neither top pages nor included URLs');
+ }
return {
- urls: topPages.map((topPage) => ({ url: topPage.getUrl() })),
+ urls: finalUrls.map((url) => ({ url })),
siteId: site.getId(),
type: 'structured-data',
};
@@ -192,19 +198,28 @@ export async function runAuditAndGenerateSuggestions(context) {
const scrapeCache = new Map();
try {
- let topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(siteId, 'ahrefs', 'global');
- if (!isNonEmptyArray(topPages)) {
- log.error(`SDA: No top pages for site ID ${siteId} found. Ensure that top pages were imported.`);
- throw new Error(`No top pages for site ID ${siteId} found.`);
- } else {
- topPages = topPages.map((page) => ({ url: page.getUrl() }));
+ const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(siteId, 'ahrefs', 'global');
+ const topPagesUrls = topPages.map((page) => page.getUrl());
+
+ // Combine the top-page URLs with the site's configured included URLs to process.
+ // TODO: the 'meta-tags' key below is for testing only and is meant to be replaced by the structured-data one again.
+ const includedURLs = await site?.getConfig()?.getIncludedURLs('meta-tags') || [];
+ const finalUrls = [...new Set([...topPagesUrls, ...includedURLs])];
+ log.info(`SDA: Total top pages: ${topPagesUrls.length}, Total included URLs: ${includedURLs.length}, Final URLs to process after removing duplicates: ${finalUrls.length}`);
+
+ if (finalUrls.length === 0) {
+ log.error(`SDA: No URLs found for site ID ${siteId} (neither top pages nor included URLs). Ensure that top pages were imported or included URLs are configured.`);
+ throw new Error(`No URLs found for site ID ${siteId} (neither top pages nor included URLs).`);
}
- // Filter out files from the top pages as these are not scraped
+ // Convert URLs to the format expected by processStructuredData
+ let urlsToProcess = finalUrls.map((url) => ({ url }));
+
+ // Filter out files from the URLs as these are not scraped
const dataTypesToIgnore = ['pdf', 'ps', 'dwf', 'kml', 'kmz', 'xls', 'xlsx', 'ppt', 'pptx', 'doc', 'docx', 'rtf', 'swf'];
- topPages = topPages.filter((page) => !dataTypesToIgnore.some((dataType) => page.url.endsWith(`.${dataType}`)));
+ urlsToProcess = urlsToProcess.filter((page) => !dataTypesToIgnore.some((dataType) => page.url.endsWith(`.${dataType}`)));
- const auditResult = await processStructuredData(finalUrl, context, topPages, scrapeCache);
+ const auditResult = await processStructuredData(finalUrl, context, urlsToProcess, scrapeCache);
// Create opportunities and suggestions
const oppAndAudit = await opportunityAndSuggestions(finalUrl, {
diff --git a/src/structured-data/lib.js b/src/structured-data/lib.js
index 018885af0..7b904438d 100644
--- a/src/structured-data/lib.js
+++ b/src/structured-data/lib.js
@@ -206,8 +206,13 @@ export async function getIssuesFromScraper(context, pages, scrapeCache) {
const waeResult = scrapeResult?.scrapeResult?.structuredData;
+ // DEBUG: Log what we extracted
+ log.info(`SDA DEBUG: Page ${page} - Extracted structured data:`, JSON.stringify(waeResult, null, 2));
+ log.info(`SDA DEBUG: Page ${page} - waeResult type: ${typeof waeResult}, isArray: ${Array.isArray(waeResult)}`);
+
// If scrape contains old format of structured data, skip
if (isNonEmptyArray(waeResult)) {
+ log.info(`SDA DEBUG: Page ${page} - Skipping old format (array)`);
return;
}
@@ -221,13 +226,21 @@ export async function getIssuesFromScraper(context, pages, scrapeCache) {
const validator = new StructuredDataValidator(schemaOrgJson);
let validatorIssues = [];
try {
- validatorIssues = (await validator.validate(waeResult))
+ const rawValidatorIssues = await validator.validate(waeResult);
+ log.info(`SDA DEBUG: Page ${page} - Raw validator found ${rawValidatorIssues.length} issues`);
+ rawValidatorIssues.forEach((issue, index) => {
+ log.info(`SDA DEBUG: Page ${page} - Issue ${index}: ${issue.severity} - ${issue.rootType} - ${issue.issueMessage}`);
+ });
+
+ validatorIssues = rawValidatorIssues
// For now, ignore issues with severity lower than ERROR
// and suppress unnecessary issues for AEM customers
.filter((issue) => includeIssue(context, issue, imageObjectFlag));
+ log.info(`SDA DEBUG: Page ${page} - After filtering: ${validatorIssues.length} issues remain`);
} catch (e) {
log.error(`SDA: Failed to validate structured data for ${page}.`, e);
}
+ log.info(`SDA DEBUG: Page ${page} - Final issues to add: ${validatorIssues.length}`);
for (const issue of validatorIssues) {
// Only add if same issue for the same source does not exist already.
// This can happen e.g. if a field is missing for every item in a list.
diff --git a/test/audits/structured-data/structured-data.test.js b/test/audits/structured-data/structured-data.test.js
index a94545644..e10a0cc17 100644
--- a/test/audits/structured-data/structured-data.test.js
+++ b/test/audits/structured-data/structured-data.test.js
@@ -55,6 +55,7 @@ describe('Structured Data Audit', () => {
let mockConfiguration;
let s3ClientStub;
let auditStub;
+ let defaultGetIncludedURLs;
const finalUrl = 'https://www.example.com';
@@ -67,10 +68,17 @@ describe('Structured Data Audit', () => {
getObject: sinon.stub(),
};
+ defaultGetIncludedURLs = sinon.spy((auditType) => {
+ if (auditType === 'meta-tags') {
+ return ['https://example.com/product/1', 'https://example.com/product/2', 'https://example.com/product/3'];
+ }
+ return [];
+ });
+
siteStub = {
getId: () => '123',
getConfig: () => ({
- getIncludedURLs: () => ['https://example.com/product/1', 'https://example.com/product/2', 'https://example.com/product/3'],
+ getIncludedURLs: defaultGetIncludedURLs,
}),
getDeliveryType: () => 'other',
};
@@ -130,19 +138,50 @@ describe('Structured Data Audit', () => {
});
describe('runAuditAndGenerateSuggestions', () => {
- it('throws an error if no top pages are available', async () => {
+ it('throws an error if no top pages and no included URLs are available', async () => {
context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]);
+ siteStub.getConfig = () => ({
+ getIncludedURLs: () => [],
+ });
const result = await runAuditAndGenerateSuggestions(context);
expect(result).to.deep.equal({
fullAuditRef: 'https://www.example.com',
auditResult: {
- error: 'No top pages for site ID 123 found.',
+ error: 'No URLs found for site ID 123 (neither top pages nor included URLs).',
success: false,
},
});
});
+ it('works with only included URLs when no top pages are available', async () => {
+ context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]);
+ siteStub.getConfig = () => ({
+ getIncludedURLs: (auditType) => (auditType === 'meta-tags' ? ['https://example.com/product/1'] : []),
+ });
+
+ context.dataAccess.Opportunity.allBySiteIdAndStatus
+ .resolves([context.dataAccess.Opportunity]);
+ context.dataAccess.Opportunity.getSuggestions.resolves([]);
+ context.dataAccess.Opportunity.getId.returns('opportunity-id');
+ context.dataAccess.Opportunity.getType.returns('structured-data');
+ context.dataAccess.Opportunity.addSuggestions.resolves(structuredDataSuggestions);
+
+ s3ClientStub.send.resolves(createS3ObjectStub(JSON.stringify({
+ scrapeResult: {
+ rawBody: '',
+ structuredData: {
+ jsonld: {},
+ errors: [],
+ },
+ },
+ })));
+ context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]);
+
+ const result = await runAuditAndGenerateSuggestions(context);
+ expect(result.auditResult.success).to.equal(true);
+ });
+
it('filters out files from top pages', async () => {
context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
createPageStub('https://example.com/product/1'),
@@ -204,6 +243,100 @@ describe('Structured Data Audit', () => {
await runAuditAndGenerateSuggestions(context);
expect(context.sqs.sendMessage).to.have.been.calledOnce;
+ // Verify that getIncludedURLs was called with 'meta-tags'
+ expect(defaultGetIncludedURLs).to.have.been.calledWith('meta-tags');
+ });
+
+ it('combines top pages and included URLs for audit processing', async () => {
+ context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
+ createPageStub('https://example.com/top/1'),
+ ]);
+ const getIncludedURLsSpy = sinon.spy((auditType) => (auditType === 'meta-tags' ? ['https://example.com/included/1'] : []));
+ siteStub.getConfig = () => ({
+ getIncludedURLs: getIncludedURLsSpy,
+ });
+
+ context.dataAccess.Opportunity.allBySiteIdAndStatus
+ .resolves([context.dataAccess.Opportunity]);
+ context.dataAccess.Opportunity.getSuggestions.resolves([]);
+ context.dataAccess.Opportunity.getId.returns('opportunity-id');
+ context.dataAccess.Opportunity.getType.returns('structured-data');
+ context.dataAccess.Opportunity.addSuggestions.resolves(structuredDataSuggestions);
+
+ s3ClientStub.send.resolves(createS3ObjectStub(JSON.stringify({
+ scrapeResult: {
+ rawBody: '',
+ structuredData: {
+ jsonld: {},
+ errors: [],
+ },
+ },
+ })));
+ context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]);
+
+ const result = await runAuditAndGenerateSuggestions(context);
+ expect(result.auditResult.success).to.equal(true);
+ // Verify that getIncludedURLs was called with 'meta-tags'
+ expect(getIncludedURLsSpy).to.have.been.calledWith('meta-tags');
+ // Sanity check only: the S3 client stub must have been called (this alone does not prove both URLs were processed)
+ expect(s3ClientStub.send.callCount).to.be.at.least(1); // At least one call should be made
+ });
+
+ it('handles null site gracefully in runAuditAndGenerateSuggestions', async () => {
+ context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
+ createPageStub('https://example.com/top/1'),
+ ]);
+ // The site itself is defined (getId still works); only getConfig() returns null
+ siteStub.getConfig = () => null;
+
+ context.dataAccess.Opportunity.allBySiteIdAndStatus
+ .resolves([context.dataAccess.Opportunity]);
+ context.dataAccess.Opportunity.getSuggestions.resolves([]);
+ context.dataAccess.Opportunity.getId.returns('opportunity-id');
+ context.dataAccess.Opportunity.getType.returns('structured-data');
+ context.dataAccess.Opportunity.addSuggestions.resolves(structuredDataSuggestions);
+
+ s3ClientStub.send.resolves(createS3ObjectStub(JSON.stringify({
+ scrapeResult: {
+ rawBody: '',
+ structuredData: {
+ jsonld: {},
+ errors: [],
+ },
+ },
+ })));
+ context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]);
+
+ const result = await runAuditAndGenerateSuggestions(context);
+ expect(result.auditResult.success).to.equal(true);
+ });
+
+ it('handles null getConfig gracefully in runAuditAndGenerateSuggestions', async () => {
+ context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
+ createPageStub('https://example.com/top/1'),
+ ]);
+ siteStub.getConfig = () => null; // Set getConfig to return null
+
+ context.dataAccess.Opportunity.allBySiteIdAndStatus
+ .resolves([context.dataAccess.Opportunity]);
+ context.dataAccess.Opportunity.getSuggestions.resolves([]);
+ context.dataAccess.Opportunity.getId.returns('opportunity-id');
+ context.dataAccess.Opportunity.getType.returns('structured-data');
+ context.dataAccess.Opportunity.addSuggestions.resolves(structuredDataSuggestions);
+
+ s3ClientStub.send.resolves(createS3ObjectStub(JSON.stringify({
+ scrapeResult: {
+ rawBody: '',
+ structuredData: {
+ jsonld: {},
+ errors: [],
+ },
+ },
+ })));
+ context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]);
+
+ const result = await runAuditAndGenerateSuggestions(context);
+ expect(result.auditResult.success).to.equal(true);
});
});
@@ -479,12 +612,92 @@ describe('Structured Data Audit', () => {
{ url: 'https://example.com/product/3' },
],
});
+ // Verify that getIncludedURLs was called with 'meta-tags'
+ expect(defaultGetIncludedURLs).to.have.been.calledWith('meta-tags');
});
- it('throws error if no top pages are found when sending scraping request', async () => {
+ it('throws error if no top pages and no included URLs are found when sending scraping request', async () => {
context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]);
+ siteStub.getConfig = () => ({
+ getIncludedURLs: () => [],
+ });
+
+ await expect(submitForScraping(context)).to.be.rejectedWith('No URLs found for site neither top pages nor included URLs');
+ });
+
+ it('works with only included URLs when no top pages are available for scraping', async () => {
+ context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]);
+ siteStub.getConfig = () => ({
+ getIncludedURLs: (auditType) => (auditType === 'meta-tags' ? ['https://example.com/included/1', 'https://example.com/included/2'] : []),
+ });
+
+ const result = await submitForScraping(context);
+ expect(result).to.deep.equal({
+ siteId: '123',
+ type: 'structured-data',
+ urls: [
+ { url: 'https://example.com/included/1' },
+ { url: 'https://example.com/included/2' },
+ ],
+ });
+ });
- expect(submitForScraping(context)).to.be.rejectedWith('No top pages for site ID 123 found.');
+ it('combines top pages and included URLs for scraping', async () => {
+ context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
+ createPageStub('https://example.com/top/1'),
+ createPageStub('https://example.com/top/2'),
+ ]);
+ const getIncludedURLsSpy = sinon.spy((auditType) => (auditType === 'meta-tags' ? ['https://example.com/included/1', 'https://example.com/included/2'] : []));
+ siteStub.getConfig = () => ({
+ getIncludedURLs: getIncludedURLsSpy,
+ });
+
+ const result = await submitForScraping(context);
+ expect(result).to.deep.equal({
+ siteId: '123',
+ type: 'structured-data',
+ urls: [
+ { url: 'https://example.com/top/1' },
+ { url: 'https://example.com/top/2' },
+ { url: 'https://example.com/included/1' },
+ { url: 'https://example.com/included/2' },
+ ],
+ });
+ // Verify that getIncludedURLs was called with 'meta-tags'
+ expect(getIncludedURLsSpy).to.have.been.calledWith('meta-tags');
+ });
+
+ it('handles null site gracefully in submitForScraping', async () => {
+ context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
+ createPageStub('https://example.com/top/1'),
+ ]);
+ // The site itself is defined (getId still works); only getConfig() returns null
+ siteStub.getConfig = () => null;
+
+ const result = await submitForScraping(context);
+ expect(result).to.deep.equal({
+ siteId: '123',
+ type: 'structured-data',
+ urls: [
+ { url: 'https://example.com/top/1' },
+ ],
+ });
+ });
+
+ it('handles null getConfig gracefully in submitForScraping', async () => {
+ context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
+ createPageStub('https://example.com/top/1'),
+ ]);
+ siteStub.getConfig = () => null; // Set getConfig to return null
+
+ const result = await submitForScraping(context);
+ expect(result).to.deep.equal({
+ siteId: '123',
+ type: 'structured-data',
+ urls: [
+ { url: 'https://example.com/top/1' },
+ ],
+ });
});
});
});