Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

47 changes: 31 additions & 16 deletions src/structured-data/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
* governing permissions and limitations under the License.
*/
/* eslint-disable no-continue, no-await-in-loop */
import { isNonEmptyArray } from '@adobe/spacecat-shared-utils';

import { Audit } from '@adobe/spacecat-shared-data-access';

import { Suggestion as SuggestionModel } from '@adobe/spacecat-shared-data-access/src/models/suggestion/index.js';
Expand Down Expand Up @@ -162,18 +162,24 @@ export async function submitForScraping(context) {
site,
dataAccess,
log,
finalUrl,
} = context;
const { SiteTopPage } = dataAccess;
const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(site.getId(), 'ahrefs', 'global');
if (topPages.length === 0) {
throw new Error('No top pages found for site');
}

log.info(`SDA: Submitting for scraping ${topPages.length} top pages for site ${site.getId()}, finalUrl: ${finalUrl}`);
const topPagesUrls = topPages.map((page) => page.getUrl());
// Combine includedURLs and topPages URLs to scrape
// just for testing will replace again by structured data
const includedURLs = await site?.getConfig()?.getIncludedURLs('meta-tags') || [];

const finalUrls = [...new Set([...topPagesUrls, ...includedURLs])];
log.info(`SDA: Total top pages: ${topPagesUrls.length}, Total included URLs: ${includedURLs.length}, Final URLs to scrape after removing duplicates: ${finalUrls.length}`);

if (finalUrls.length === 0) {
throw new Error('No URLs found for site neither top pages nor included URLs');
}

return {
urls: topPages.map((topPage) => ({ url: topPage.getUrl() })),
urls: finalUrls.map((url) => ({ url })),
siteId: site.getId(),
type: 'structured-data',
};
Expand All @@ -192,19 +198,28 @@ export async function runAuditAndGenerateSuggestions(context) {
const scrapeCache = new Map();

try {
let topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(siteId, 'ahrefs', 'global');
if (!isNonEmptyArray(topPages)) {
log.error(`SDA: No top pages for site ID ${siteId} found. Ensure that top pages were imported.`);
throw new Error(`No top pages for site ID ${siteId} found.`);
} else {
topPages = topPages.map((page) => ({ url: page.getUrl() }));
const topPages = await SiteTopPage.allBySiteIdAndSourceAndGeo(siteId, 'ahrefs', 'global');
const topPagesUrls = topPages.map((page) => page.getUrl());

// Combine includedURLs and topPages URLs to process
// just for testing will replace again by structured data
const includedURLs = await site?.getConfig()?.getIncludedURLs('meta-tags') || [];
const finalUrls = [...new Set([...topPagesUrls, ...includedURLs])];
log.info(`SDA: Total top pages: ${topPagesUrls.length}, Total included URLs: ${includedURLs.length}, Final URLs to process after removing duplicates: ${finalUrls.length}`);

if (finalUrls.length === 0) {
log.error(`SDA: No URLs found for site ID ${siteId} (neither top pages nor included URLs). Ensure that top pages were imported or included URLs are configured.`);
throw new Error(`No URLs found for site ID ${siteId} (neither top pages nor included URLs).`);
}

// Filter out files from the top pages as these are not scraped
// Convert URLs to the format expected by processStructuredData
let urlsToProcess = finalUrls.map((url) => ({ url }));

// Filter out files from the URLs as these are not scraped
const dataTypesToIgnore = ['pdf', 'ps', 'dwf', 'kml', 'kmz', 'xls', 'xlsx', 'ppt', 'pptx', 'doc', 'docx', 'rtf', 'swf'];
topPages = topPages.filter((page) => !dataTypesToIgnore.some((dataType) => page.url.endsWith(`.${dataType}`)));
urlsToProcess = urlsToProcess.filter((page) => !dataTypesToIgnore.some((dataType) => page.url.endsWith(`.${dataType}`)));

const auditResult = await processStructuredData(finalUrl, context, topPages, scrapeCache);
const auditResult = await processStructuredData(finalUrl, context, urlsToProcess, scrapeCache);

// Create opportunities and suggestions
const oppAndAudit = await opportunityAndSuggestions(finalUrl, {
Expand Down
15 changes: 14 additions & 1 deletion src/structured-data/lib.js
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,13 @@ export async function getIssuesFromScraper(context, pages, scrapeCache) {

const waeResult = scrapeResult?.scrapeResult?.structuredData;

// DEBUG: Log what we extracted
log.info(`SDA DEBUG: Page ${page} - Extracted structured data:`, JSON.stringify(waeResult, null, 2));
log.info(`SDA DEBUG: Page ${page} - waeResult type: ${typeof waeResult}, isArray: ${Array.isArray(waeResult)}`);

// If scrape contains old format of structured data, skip
if (isNonEmptyArray(waeResult)) {
log.info(`SDA DEBUG: Page ${page} - Skipping old format (array)`);
return;
}

Expand All @@ -221,13 +226,21 @@ export async function getIssuesFromScraper(context, pages, scrapeCache) {
const validator = new StructuredDataValidator(schemaOrgJson);
let validatorIssues = [];
try {
validatorIssues = (await validator.validate(waeResult))
const rawValidatorIssues = await validator.validate(waeResult);
log.info(`SDA DEBUG: Page ${page} - Raw validator found ${rawValidatorIssues.length} issues`);
rawValidatorIssues.forEach((issue, index) => {
log.info(`SDA DEBUG: Page ${page} - Issue ${index}: ${issue.severity} - ${issue.rootType} - ${issue.issueMessage}`);
});

validatorIssues = rawValidatorIssues
// For now, ignore issues with severity lower than ERROR
// and suppress unnecessary issues for AEM customers
.filter((issue) => includeIssue(context, issue, imageObjectFlag));
log.info(`SDA DEBUG: Page ${page} - After filtering: ${validatorIssues.length} issues remain`);
} catch (e) {
log.error(`SDA: Failed to validate structured data for ${page}.`, e);
}
log.info(`SDA DEBUG: Page ${page} - Final issues to add: ${validatorIssues.length}`);
for (const issue of validatorIssues) {
// Only add if same issue for the same source does not exist already.
// This can happen e.g. if a field is missing for every item in a list.
Expand Down
223 changes: 218 additions & 5 deletions test/audits/structured-data/structured-data.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ describe('Structured Data Audit', () => {
let mockConfiguration;
let s3ClientStub;
let auditStub;
let defaultGetIncludedURLs;

const finalUrl = 'https://www.example.com';

Expand All @@ -67,10 +68,17 @@ describe('Structured Data Audit', () => {
getObject: sinon.stub(),
};

defaultGetIncludedURLs = sinon.spy((auditType) => {
if (auditType === 'meta-tags') {
return ['https://example.com/product/1', 'https://example.com/product/2', 'https://example.com/product/3'];
}
return [];
});

siteStub = {
getId: () => '123',
getConfig: () => ({
getIncludedURLs: () => ['https://example.com/product/1', 'https://example.com/product/2', 'https://example.com/product/3'],
getIncludedURLs: defaultGetIncludedURLs,
}),
getDeliveryType: () => 'other',
};
Expand Down Expand Up @@ -130,19 +138,50 @@ describe('Structured Data Audit', () => {
});

describe('runAuditAndGenerateSuggestions', () => {
it('throws an error if no top pages are available', async () => {
it('throws an error if no top pages and no included URLs are available', async () => {
context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]);
siteStub.getConfig = () => ({
getIncludedURLs: () => [],
});

const result = await runAuditAndGenerateSuggestions(context);
expect(result).to.deep.equal({
fullAuditRef: 'https://www.example.com',
auditResult: {
error: 'No top pages for site ID 123 found.',
error: 'No URLs found for site ID 123 (neither top pages nor included URLs).',
success: false,
},
});
});

it('works with only included URLs when no top pages are available', async () => {
  // Arrange: no top pages; included URLs exist only for the 'meta-tags' audit type.
  const { dataAccess } = context;
  dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]);
  siteStub.getConfig = () => ({
    getIncludedURLs: (auditType) => {
      if (auditType === 'meta-tags') {
        return ['https://example.com/product/1'];
      }
      return [];
    },
  });

  // Arrange: an existing structured-data opportunity without suggestions.
  const opportunity = dataAccess.Opportunity;
  opportunity.allBySiteIdAndStatus.resolves([opportunity]);
  opportunity.getSuggestions.resolves([]);
  opportunity.getId.returns('opportunity-id');
  opportunity.getType.returns('structured-data');
  opportunity.addSuggestions.resolves(structuredDataSuggestions);

  // Arrange: every S3 read yields a scrape with empty structured data.
  const scrapeBody = JSON.stringify({
    scrapeResult: {
      rawBody: '<main></main>',
      structuredData: {
        jsonld: {},
        errors: [],
      },
    },
  });
  s3ClientStub.send.resolves(createS3ObjectStub(scrapeBody));
  dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]);

  // Act + assert: the audit succeeds on included URLs alone.
  const { auditResult } = await runAuditAndGenerateSuggestions(context);
  expect(auditResult.success).to.equal(true);
});

it('filters out files from top pages', async () => {
context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
createPageStub('https://example.com/product/1'),
Expand Down Expand Up @@ -204,6 +243,100 @@ describe('Structured Data Audit', () => {

await runAuditAndGenerateSuggestions(context);
expect(context.sqs.sendMessage).to.have.been.calledOnce;
// Verify that getIncludedURLs was called with 'meta-tags'
expect(defaultGetIncludedURLs).to.have.been.calledWith('meta-tags');
});

// Checks that the audit succeeds when URLs come from both sources:
// one ahrefs top page plus one included URL configured under 'meta-tags'.
it('combines top pages and included URLs for audit processing', async () => {
// One top page returned by the data access layer.
context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
createPageStub('https://example.com/top/1'),
]);
// Spy so the audit type passed to getIncludedURLs can be asserted below.
const getIncludedURLsSpy = sinon.spy((auditType) => (auditType === 'meta-tags' ? ['https://example.com/included/1'] : []));
siteStub.getConfig = () => ({
getIncludedURLs: getIncludedURLsSpy,
});

// An existing structured-data opportunity with no suggestions yet.
context.dataAccess.Opportunity.allBySiteIdAndStatus
.resolves([context.dataAccess.Opportunity]);
context.dataAccess.Opportunity.getSuggestions.resolves([]);
context.dataAccess.Opportunity.getId.returns('opportunity-id');
context.dataAccess.Opportunity.getType.returns('structured-data');
context.dataAccess.Opportunity.addSuggestions.resolves(structuredDataSuggestions);

// Every S3 read resolves to a scrape with empty structured data.
s3ClientStub.send.resolves(createS3ObjectStub(JSON.stringify({
scrapeResult: {
rawBody: '<main></main>',
structuredData: {
jsonld: {},
errors: [],
},
},
})));
context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]);

const result = await runAuditAndGenerateSuggestions(context);
expect(result.auditResult.success).to.equal(true);
// Verify that getIncludedURLs was called with 'meta-tags'
expect(getIncludedURLsSpy).to.have.been.calledWith('meta-tags');
// Only checks that at least one scrape was fetched from S3.
// NOTE(review): this does not prove that BOTH the top page and the included URL
// were processed; consider asserting the exact call count or the requested keys
// once the fetch behaviour of processStructuredData is confirmed.
expect(s3ClientStub.send.callCount).to.be.at.least(1); // At least one call should be made
});

// Covers the `|| []` fallback applied to the included URLs in
// runAuditAndGenerateSuggestions: the site config exists, but getIncludedURLs
// yields nothing. (The previous version of this test claimed to cover a null
// site while actually stubbing getConfig() to return null, which duplicated the
// dedicated null-getConfig test.)
it('handles getIncludedURLs returning undefined gracefully in runAuditAndGenerateSuggestions', async () => {
  context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
    createPageStub('https://example.com/top/1'),
  ]);
  // Config is present, but no included URLs are configured for any audit type.
  siteStub.getConfig = () => ({
    getIncludedURLs: () => undefined,
  });

  // An existing structured-data opportunity with no suggestions yet.
  context.dataAccess.Opportunity.allBySiteIdAndStatus
    .resolves([context.dataAccess.Opportunity]);
  context.dataAccess.Opportunity.getSuggestions.resolves([]);
  context.dataAccess.Opportunity.getId.returns('opportunity-id');
  context.dataAccess.Opportunity.getType.returns('structured-data');
  context.dataAccess.Opportunity.addSuggestions.resolves(structuredDataSuggestions);

  // Every S3 read resolves to a scrape with empty structured data.
  s3ClientStub.send.resolves(createS3ObjectStub(JSON.stringify({
    scrapeResult: {
      rawBody: '<main></main>',
      structuredData: {
        jsonld: {},
        errors: [],
      },
    },
  })));
  context.dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]);

  // The audit must still succeed using the top page alone.
  const result = await runAuditAndGenerateSuggestions(context);
  expect(result.auditResult.success).to.equal(true);
});

it('handles null getConfig gracefully in runAuditAndGenerateSuggestions', async () => {
  // Arrange: a single top page and a site whose getConfig() returns null.
  const { dataAccess } = context;
  dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
    createPageStub('https://example.com/top/1'),
  ]);
  siteStub.getConfig = () => null;

  // Arrange: an existing structured-data opportunity without suggestions.
  const opportunity = dataAccess.Opportunity;
  opportunity.allBySiteIdAndStatus.resolves([opportunity]);
  opportunity.getSuggestions.resolves([]);
  opportunity.getId.returns('opportunity-id');
  opportunity.getType.returns('structured-data');
  opportunity.addSuggestions.resolves(structuredDataSuggestions);

  // Arrange: every S3 read yields a scrape with empty structured data.
  const scrapeBody = JSON.stringify({
    scrapeResult: {
      rawBody: '<main></main>',
      structuredData: {
        jsonld: {},
        errors: [],
      },
    },
  });
  s3ClientStub.send.resolves(createS3ObjectStub(scrapeBody));
  dataAccess.Suggestion.allByOpportunityIdAndStatus = sinon.stub().resolves([]);

  // Act + assert: a missing config must not break the audit.
  const { auditResult } = await runAuditAndGenerateSuggestions(context);
  expect(auditResult.success).to.equal(true);
});
});

Expand Down Expand Up @@ -479,12 +612,92 @@ describe('Structured Data Audit', () => {
{ url: 'https://example.com/product/3' },
],
});
// Verify that getIncludedURLs was called with 'meta-tags'
expect(defaultGetIncludedURLs).to.have.been.calledWith('meta-tags');
});

it('throws error if no top pages are found when sending scraping request', async () => {
it('throws error if no top pages and no included URLs are found when sending scraping request', async () => {
context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]);
siteStub.getConfig = () => ({
getIncludedURLs: () => [],
});

await expect(submitForScraping(context)).to.be.rejectedWith('No URLs found for site neither top pages nor included URLs');
});

it('works with only included URLs when no top pages are available for scraping', async () => {
  // Arrange: no top pages; two included URLs exposed only for 'meta-tags'.
  context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([]);
  siteStub.getConfig = () => ({
    getIncludedURLs: (auditType) => {
      if (auditType !== 'meta-tags') {
        return [];
      }
      return ['https://example.com/included/1', 'https://example.com/included/2'];
    },
  });

  // Act
  const scrapeRequest = await submitForScraping(context);

  // Assert: the scrape request consists of the included URLs only.
  const expectedUrls = [
    { url: 'https://example.com/included/1' },
    { url: 'https://example.com/included/2' },
  ];
  expect(scrapeRequest).to.deep.equal({
    siteId: '123',
    type: 'structured-data',
    urls: expectedUrls,
  });
});

expect(submitForScraping(context)).to.be.rejectedWith('No top pages for site ID 123 found.');
it('combines top pages and included URLs for scraping', async () => {
  // Arrange: two top pages plus two included URLs exposed only for 'meta-tags'.
  const topPageStubs = [
    createPageStub('https://example.com/top/1'),
    createPageStub('https://example.com/top/2'),
  ];
  context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves(topPageStubs);

  const getIncludedURLsSpy = sinon.spy((auditType) => {
    if (auditType === 'meta-tags') {
      return ['https://example.com/included/1', 'https://example.com/included/2'];
    }
    return [];
  });
  siteStub.getConfig = () => ({ getIncludedURLs: getIncludedURLsSpy });

  // Act
  const scrapeRequest = await submitForScraping(context);

  // Assert: top pages come first, followed by the included URLs.
  const expectedUrls = [
    { url: 'https://example.com/top/1' },
    { url: 'https://example.com/top/2' },
    { url: 'https://example.com/included/1' },
    { url: 'https://example.com/included/2' },
  ];
  expect(scrapeRequest).to.deep.equal({
    siteId: '123',
    type: 'structured-data',
    urls: expectedUrls,
  });
  // The included URLs must have been looked up under the 'meta-tags' audit type.
  expect(getIncludedURLsSpy).to.have.been.calledWith('meta-tags');
});

// Covers the `|| []` fallback applied to the included URLs in submitForScraping:
// the site config exists, but getIncludedURLs yields nothing. (The previous
// version of this test claimed to cover a null site while actually stubbing
// getConfig() to return null, which duplicated the dedicated null-getConfig test.)
it('handles getIncludedURLs returning undefined gracefully in submitForScraping', async () => {
  context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves([
    createPageStub('https://example.com/top/1'),
  ]);
  // Config is present, but no included URLs are configured for any audit type.
  siteStub.getConfig = () => ({
    getIncludedURLs: () => undefined,
  });

  // Only the top page should be submitted for scraping.
  const result = await submitForScraping(context);
  expect(result).to.deep.equal({
    siteId: '123',
    type: 'structured-data',
    urls: [
      { url: 'https://example.com/top/1' },
    ],
  });
});

it('handles null getConfig gracefully in submitForScraping', async () => {
  // Arrange: a single top page and a site whose getConfig() returns null.
  const topPageStubs = [createPageStub('https://example.com/top/1')];
  context.dataAccess.SiteTopPage.allBySiteIdAndSourceAndGeo = sinon.stub().resolves(topPageStubs);
  siteStub.getConfig = () => null;

  // Act
  const scrapeRequest = await submitForScraping(context);

  // Assert: only the top page is submitted; the missing config is tolerated.
  const expectedRequest = {
    siteId: '123',
    type: 'structured-data',
    urls: [{ url: 'https://example.com/top/1' }],
  };
  expect(scrapeRequest).to.deep.equal(expectedRequest);
});
});
});
Loading