-
Notifications
You must be signed in to change notification settings - Fork 13
fix(meta-tags): add workaround for Preflight-Audit #1216
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
0275a48
4c82e06
b239e02
7296fb6
d9d5497
56572fd
7eef64c
d6d02a4
006e040
67a3530
8164136
0c2db3f
a1817c5
090ca66
acbd243
09f9539
86aaf74
26a6221
ff507e8
116f374
5ab779b
e8a5208
e8fc6a4
f023819
19ef307
d1755da
7527bbe
6d998f7
51d8a4a
b2b0564
c8972fd
fa232fb
8883d06
af06f60
3ea0781
faca2c9
76ebad3
3b9b25d
c934ba7
10b5a4b
c29a6af
ede2cb6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,13 +13,12 @@ | |
import RUMAPIClient from '@adobe/spacecat-shared-rum-api-client'; | ||
import { Audit } from '@adobe/spacecat-shared-data-access'; | ||
import { calculateCPCValue } from '../support/utils.js'; | ||
import { getObjectFromKey, getObjectKeysUsingPrefix } from '../utils/s3-utils.js'; | ||
import { getObjectFromKey } from '../utils/s3-utils.js'; | ||
import SeoChecks from './seo-checks.js'; | ||
import { AuditBuilder } from '../common/audit-builder.js'; | ||
import { wwwUrlResolver } from '../common/index.js'; | ||
import metatagsAutoSuggest from './metatags-auto-suggest.js'; | ||
import { convertToOpportunity } from '../common/opportunity.js'; | ||
import { getTopPagesForSiteId } from '../canonical/handler.js'; | ||
import { getIssueRanking, getBaseUrl } from './opportunity-utils.js'; | ||
import { | ||
DESCRIPTION, | ||
|
@@ -90,7 +89,7 @@ export async function opportunityAndSuggestions(finalUrl, auditData, context) { | |
log.info(`Successfully synced Opportunity And Suggestions for site: ${auditData.siteId} and ${auditType} audit type.`); | ||
} | ||
|
||
export async function fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log) { | ||
export async function fetchAndProcessPageObject(s3Client, bucketName, url, key, log) { | ||
const object = await getObjectFromKey(s3Client, bucketName, key, log); | ||
if (!object?.scrapeResult?.tags || typeof object.scrapeResult.tags !== 'object') { | ||
log.error(`No Scraped tags found in S3 ${key} object`); | ||
|
@@ -102,12 +101,9 @@ export async function fetchAndProcessPageObject(s3Client, bucketName, key, prefi | |
return null; | ||
} | ||
|
||
let pageUrl = object.finalUrl ? new URL(object.finalUrl).pathname | ||
: key.slice(prefix.length - 1).replace('/scrape.json', ''); // Remove the prefix and scrape.json suffix | ||
const pageUrl = object.finalUrl ? new URL(object.finalUrl).pathname | ||
: new URL(url).pathname; | ||
// handling for homepage | ||
if (pageUrl === '') { | ||
pageUrl = '/'; | ||
} | ||
return { | ||
[pageUrl]: { | ||
title: object.scrapeResult.tags.title, | ||
|
@@ -206,24 +202,21 @@ async function calculateProjectedTraffic(context, site, detectedTags, log) { | |
} | ||
} | ||
|
||
export async function metatagsAutoDetect(site, pagesSet, context) { | ||
export async function metatagsAutoDetect(site, pagesMap, context) { | ||
const { log, s3Client } = context; | ||
// Fetch site's scraped content from S3 | ||
const bucketName = context.env.S3_SCRAPER_BUCKET_NAME; | ||
const prefix = `scrapes/${site.getId()}/`; | ||
const scrapedObjectKeys = await getObjectKeysUsingPrefix(s3Client, bucketName, prefix, log); | ||
const extractedTags = {}; | ||
const pageMetadataResults = await Promise.all(scrapedObjectKeys | ||
.filter((key) => pagesSet.has(key)) | ||
.map((key) => fetchAndProcessPageObject(s3Client, bucketName, key, prefix, log))); | ||
const pageMetadataResults = await Promise.all([...pagesMap] | ||
.map(([url, path]) => fetchAndProcessPageObject(s3Client, bucketName, url, path, log))); | ||
pageMetadataResults.forEach((pageMetadata) => { | ||
if (pageMetadata) { | ||
Object.assign(extractedTags, pageMetadata); | ||
} | ||
}); | ||
const extractedTagsCount = Object.entries(extractedTags).length; | ||
if (extractedTagsCount === 0) { | ||
log.error(`Failed to extract tags from scraped content for bucket ${bucketName} and prefix ${prefix}`); | ||
log.error(`Failed to extract tags from scraped content for bucket ${bucketName}`); | ||
} | ||
|
||
// Perform SEO checks | ||
|
@@ -242,38 +235,16 @@ export async function metatagsAutoDetect(site, pagesSet, context) { | |
}; | ||
} | ||
|
||
/** | ||
* Transforms a URL into a scrape.json path for a given site | ||
* @param {string} url - The URL to transform | ||
* @param {string} siteId - The site ID | ||
* @returns {string} The path to the scrape.json file | ||
*/ | ||
function getScrapeJsonPath(url, siteId) { | ||
const pathname = new URL(url).pathname.replace(/\/$/, ''); | ||
return `scrapes/${siteId}${pathname}/scrape.json`; | ||
} | ||
|
||
export async function runAuditAndGenerateSuggestions(context) { | ||
const { | ||
site, audit, finalUrl, log, dataAccess, | ||
site, audit, finalUrl, log, scrapeResultPaths, | ||
} = context; | ||
// Get top pages for a site | ||
const siteId = site.getId(); | ||
const topPages = await getTopPagesForSiteId(dataAccess, siteId, context, log); | ||
const includedURLs = await site?.getConfig()?.getIncludedURLs('meta-tags') || []; | ||
|
||
// Transform URLs into scrape.json paths and combine them into a Set | ||
const topPagePaths = topPages.map((page) => getScrapeJsonPath(page.url, siteId)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since this code is now removed, will the scrapeResultPaths coming from context ensure it only contain the latest top-pages scrapes, and no older scrapes (from previous top-pages imports)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. scrapeResultPaths will only contain URLs that were originally submitted for scraping by this audit. the new scrapeClient workflow will ensure that. |
||
const includedUrlPaths = includedURLs.map((url) => getScrapeJsonPath(url, siteId)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since this code is now removed, how are we incorporating the includedUrls in the meta-tags audit? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we use all URLs that were successfully scraped. in the |
||
const totalPagesSet = new Set([...topPagePaths, ...includedUrlPaths]); | ||
|
||
log.info(`Received topPages: ${topPagePaths.length}, includedURLs: ${includedUrlPaths.length}, totalPages to process after removing duplicates: ${totalPagesSet.size}`); | ||
|
||
log.info(scrapeResultPaths); | ||
const { | ||
seoChecks, | ||
detectedTags, | ||
extractedTags, | ||
} = await metatagsAutoDetect(site, totalPagesSet, context); | ||
} = await metatagsAutoDetect(site, scrapeResultPaths, context); | ||
|
||
// Calculate projected traffic lost | ||
const { | ||
|
@@ -355,6 +326,6 @@ export async function submitForScraping(context) { | |
export default new AuditBuilder() | ||
.withUrlResolver((site) => site.getBaseURL()) | ||
.addStep('submit-for-import-top-pages', importTopPages, AUDIT_STEP_DESTINATIONS.IMPORT_WORKER) | ||
.addStep('submit-for-scraping', submitForScraping, AUDIT_STEP_DESTINATIONS.CONTENT_SCRAPER) | ||
.addStep('submit-for-scraping', submitForScraping, AUDIT_STEP_DESTINATIONS.SCRAPE_CLIENT) | ||
.addStep('run-audit-and-generate-suggestions', runAuditAndGenerateSuggestions) | ||
.build(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This check was added since the home page had no path(empty). Will the page url being computed above be '/' for home page now?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`new URL(url).pathname`
will resolve to `'/'`
for the home page