
Page *page-name* or one/multiple of its page sections failed to store properly. Page has been marked with null checksum to indicate that it needs to be re-generated. #46

@jackf723

Description

This happens when running `pnpm run embeddings` locally and when deploying to Vercel. `NEXT_PUBLIC_SUPABASE_ANON_KEY`, `SUPABASE_SERVICE_ROLE_KEY`, and `OPENAI_KEY` are set in both `.env` and the Vercel environment variables. The relevant error output:

```text
  details: 'TypeError: fetch failed\n' +
    '    at Object.fetch (node:internal/deps/undici/undici:11576:11)\n' +
    '    at generateEmbeddings (lib\generate-embeddings.ts:319:61)\n' +
    '    at main (lib\generate-embeddings.ts:487:3)',
  hint: '',
  code: ''
}
```
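
For reference, the script skips generation only when `NEXT_PUBLIC_SUPABASE_URL`, `SUPABASE_SERVICE_ROLE_KEY`, or `OPENAI_KEY` is missing entirely, so a quick check that dotenv is actually loading the values can rule that part out. This is a throwaway sketch, not part of the repo:

```ts
// check-env.ts (hypothetical) – run with a TS runner, e.g. `npx tsx check-env.ts`
import dotenv from 'dotenv'

dotenv.config()

const required = [
  'NEXT_PUBLIC_SUPABASE_URL',
  'NEXT_PUBLIC_SUPABASE_ANON_KEY',
  'SUPABASE_SERVICE_ROLE_KEY',
  'OPENAI_KEY',
]

for (const name of required) {
  // Report presence only – never print the secret itself
  console.log(`${name}: ${process.env[name] ? 'set' : 'MISSING'}`)
}
```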
Here's `lib/generate-embeddings.ts`, for reference.
```ts
import { createClient } from '@supabase/supabase-js'
import { createHash } from 'crypto'
import dotenv from 'dotenv'
import { ObjectExpression } from 'estree'
import { readdir, readFile, stat } from 'fs/promises'
import GithubSlugger from 'github-slugger'
import { Content, Root } from 'mdast'
import { fromMarkdown } from 'mdast-util-from-markdown'
import { mdxFromMarkdown, MdxjsEsm } from 'mdast-util-mdx'
import { toMarkdown } from 'mdast-util-to-markdown'
import { toString } from 'mdast-util-to-string'
import { mdxjs } from 'micromark-extension-mdxjs'
import { Configuration, OpenAIApi } from 'openai'
import { basename, dirname, join } from 'path'
import { u } from 'unist-builder'
import { filter } from 'unist-util-filter'
import { inspect } from 'util'
import yargs from 'yargs'

dotenv.config()

const ignoredFiles = ['pages/404.mdx']

/**
 * Extracts ES literals from an estree ObjectExpression
 * into a plain JavaScript object.
 */
function getObjectFromExpression(node: ObjectExpression) {
  return node.properties.reduce<
    Record<string, string | number | bigint | true | RegExp | undefined>
  >((object, property) => {
    if (property.type !== 'Property') {
      return object
    }

    const key = (property.key.type === 'Identifier' && property.key.name) || undefined
    const value = (property.value.type === 'Literal' && property.value.value) || undefined

    if (!key) {
      return object
    }

    return {
      ...object,
      [key]: value,
    }
  }, {})
}

/**
 * Extracts the meta ESM export from the MDX file.
 * This info is akin to frontmatter.
 */
function extractMetaExport(mdxTree: Root) {
  const metaExportNode = mdxTree.children.find((node): node is MdxjsEsm => {
    return (
      node.type === 'mdxjsEsm' &&
      node.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' &&
      node.data.estree.body[0].declaration?.type === 'VariableDeclaration' &&
      node.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' &&
      node.data.estree.body[0].declaration.declarations[0].id.name === 'meta'
    )
  })

  if (!metaExportNode) {
    return undefined
  }

  const objectExpression =
    (metaExportNode.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' &&
      metaExportNode.data.estree.body[0].declaration?.type === 'VariableDeclaration' &&
      metaExportNode.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' &&
      metaExportNode.data.estree.body[0].declaration.declarations[0].id.name === 'meta' &&
      metaExportNode.data.estree.body[0].declaration.declarations[0].init?.type ===
        'ObjectExpression' &&
      metaExportNode.data.estree.body[0].declaration.declarations[0].init) ||
    undefined

  if (!objectExpression) {
    return undefined
  }

  return getObjectFromExpression(objectExpression)
}

/**
 * Splits a mdast tree into multiple trees based on
 * a predicate function. Will include the splitting node
 * at the beginning of each tree.
 *
 * Useful to split a markdown file into smaller sections.
 */
function splitTreeBy(tree: Root, predicate: (node: Content) => boolean) {
  return tree.children.reduce<Root[]>((trees, node) => {
    const [lastTree] = trees.slice(-1)

    if (!lastTree || predicate(node)) {
      const tree: Root = u('root', [node])
      return trees.concat(tree)
    }

    lastTree.children.push(node)
    return trees
  }, [])
}

type Meta = ReturnType<typeof extractMetaExport>

type Section = {
  content: string
  heading?: string
  slug?: string
}

type ProcessedMdx = {
  checksum: string
  meta: Meta
  sections: Section[]
}

/**
 * Processes MDX content for search indexing.
 * It extracts metadata, strips it of all JSX,
 * and splits it into sub-sections based on criteria.
 */
function processMdxForSearch(content: string): ProcessedMdx {
  const checksum = createHash('sha256').update(content).digest('base64')

  const mdxTree = fromMarkdown(content, {
    extensions: [mdxjs()],
    mdastExtensions: [mdxFromMarkdown()],
  })

  const meta = extractMetaExport(mdxTree)

  // Remove all MDX elements from markdown
  const mdTree = filter(
    mdxTree,
    (node) =>
      ![
        'mdxjsEsm',
        'mdxJsxFlowElement',
        'mdxJsxTextElement',
        'mdxFlowExpression',
        'mdxTextExpression',
      ].includes(node.type)
  )

  if (!mdTree) {
    return {
      checksum,
      meta,
      sections: [],
    }
  }

  const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading')

  const slugger = new GithubSlugger()

  const sections = sectionTrees.map((tree) => {
    const [firstNode] = tree.children

    const heading = firstNode.type === 'heading' ? toString(firstNode) : undefined
    const slug = heading ? slugger.slug(heading) : undefined

    return {
      content: toMarkdown(tree),
      heading,
      slug,
    }
  })

  return {
    checksum,
    meta,
    sections,
  }
}

type WalkEntry = {
  path: string
  parentPath?: string
}

// Recursively collects files under `dir`, pairing each file with the path of
// its parent doc (a sibling `.mdx` named after the directory), sorted by path.
async function walk(dir: string, parentPath?: string): Promise<WalkEntry[]> {
  const immediateFiles = await readdir(dir)

  const recursiveFiles = await Promise.all(
    immediateFiles.map(async (file) => {
      const path = join(dir, file)
      const stats = await stat(path)
      if (stats.isDirectory()) {
        // Keep track of document hierarchy (if this dir has corresponding doc file)
        const docPath = `${basename(path)}.mdx`

        return walk(
          path,
          immediateFiles.includes(docPath) ? join(dirname(path), docPath) : parentPath
        )
      } else if (stats.isFile()) {
        return [
          {
            path: path,
            parentPath,
          },
        ]
      } else {
        return []
      }
    })
  )

  const flattenedFiles = recursiveFiles.reduce(
    (all, folderContents) => all.concat(folderContents),
    []
  )

  return flattenedFiles.sort((a, b) => a.path.localeCompare(b.path))
}

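// Base class for embeddable sources: tracks a content checksum (for change
// detection), the extracted meta, and the sections that will be embedded.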
abstract class BaseEmbeddingSource {
  checksum?: string
  meta?: Meta
  sections?: Section[]

  constructor(public source: string, public path: string, public parentPath?: string) {}

  abstract load(): Promise<{
    checksum: string
    meta?: Meta
    sections: Section[]
  }>
}

class MarkdownEmbeddingSource extends BaseEmbeddingSource {
  type: 'markdown' = 'markdown'

  constructor(source: string, public filePath: string, public parentFilePath?: string) {
    const path = filePath.replace(/^pages/, '').replace(/\.mdx?$/, '')
    const parentPath = parentFilePath?.replace(/^pages/, '').replace(/\.mdx?$/, '')

    super(source, path, parentPath)
  }

  async load() {
    const contents = await readFile(this.filePath, 'utf8')

    const { checksum, meta, sections } = processMdxForSearch(contents)

    this.checksum = checksum
    this.meta = meta
    this.sections = sections

    return {
      checksum,
      meta,
      sections,
    }
  }
}

type EmbeddingSource = MarkdownEmbeddingSource

async function generateEmbeddings() {
  const argv = await yargs.option('refresh', {
    alias: 'r',
    description: 'Refresh data',
    type: 'boolean',
  }).argv

  const shouldRefresh = argv.refresh

  if (
    !process.env.NEXT_PUBLIC_SUPABASE_URL ||
    !process.env.SUPABASE_SERVICE_ROLE_KEY ||
    !process.env.OPENAI_KEY
  ) {
    return console.log(
      'Environment variables NEXT_PUBLIC_SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, and OPENAI_KEY are required: skipping embeddings generation'
    )
  }

  const supabaseClient = createClient(
    process.env.NEXT_PUBLIC_SUPABASE_URL,
    process.env.SUPABASE_SERVICE_ROLE_KEY,
    {
      auth: {
        persistSession: false,
        autoRefreshToken: false,
      },
    }
  )

  const embeddingSources: EmbeddingSource[] = [
    ...(await walk('pages'))
      .filter(({ path }) => /\.mdx?$/.test(path))
      .filter(({ path }) => !ignoredFiles.includes(path))
      .map((entry) => new MarkdownEmbeddingSource('guide', entry.path)),
  ]

  console.log(`Discovered ${embeddingSources.length} pages`)

  if (!shouldRefresh) {
    console.log('Checking which pages are new or have changed')
  } else {
    console.log('Refresh flag set, re-generating all pages')
  }

  for (const embeddingSource of embeddingSources) {
    const { type, source, path, parentPath } = embeddingSource

    try {
      const { checksum, meta, sections } = await embeddingSource.load()

      // Check for existing page in DB and compare checksums
      const { error: fetchPageError, data: existingPage } = await supabaseClient
        .from('nods_page')
        .select('id, path, checksum, parentPage:parent_page_id(id, path)')
        .filter('path', 'eq', path)
        .limit(1)
        .maybeSingle()

      if (fetchPageError) {
        throw fetchPageError
      }

      type Singular<T> = T extends any[] ? unknown : T

      // We use checksum to determine if this page & its sections need to be regenerated
      if (!shouldRefresh && existingPage !== null && existingPage.checksum === checksum) {
        const existingParentPage =
          existingPage.parentPage?.length === 1 ? existingPage.parentPage[0] : undefined

        // If parent page changed, update it
        if (
          typeof existingParentPage?.path === 'string' &&
          existingParentPage?.path !== parentPath
        ) {
          console.log(`[${path}] Parent page has changed. Updating to '${parentPath}'...`)
          const { error: fetchParentPageError, data: parentPage } = await supabaseClient
            .from('nods_page')
            .select()
            .filter('path', 'eq', parentPath)
            .limit(1)
            .maybeSingle()

          if (fetchParentPageError) {
            throw fetchParentPageError
          }

          const { error: updatePageError } = await supabaseClient
            .from('nods_page')
            .update({ parent_page_id: parentPage?.id })
            .filter('id', 'eq', existingPage.id)

          if (updatePageError) {
            throw updatePageError
          }
        }
        continue
      }

      if (existingPage) {
        if (!shouldRefresh) {
          console.log(
            `[${path}] Docs have changed, removing old page sections and their embeddings`
          )
        } else {
          console.log(`[${path}] Refresh flag set, removing old page sections and their embeddings`)
        }

        const { error: deletePageSectionError } = await supabaseClient
          .from('nods_page_section')
          .delete()
          .filter('page_id', 'eq', existingPage.id)

        if (deletePageSectionError) {
          throw deletePageSectionError
        }
      }

      const { error: fetchParentPageError, data: parentPage } = await supabaseClient
        .from('nods_page')
        .select()
        .filter('path', 'eq', parentPath)
        .limit(1)
        .maybeSingle()

      if (fetchParentPageError) {
        throw fetchParentPageError
      }

      // Create/update page record. Intentionally clear checksum until we
      // have successfully generated all page sections.
      const { error: upsertPageError, data: page } = await supabaseClient
        .from('nods_page')
        .upsert(
          {
            checksum: null,
            path,
            type,
            source,
            meta,
            parent_page_id: parentPage?.id,
          },
          { onConflict: 'path' }
        )
        .select()
        .limit(1)
        .single()

      if (upsertPageError) {
        throw upsertPageError
      }

      console.log(`[${path}] Adding ${sections.length} page sections (with embeddings)`)
      for (const { slug, heading, content } of sections) {
        // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
        const input = content.replace(/\n/g, ' ')

        try {
          const configuration = new Configuration({
            apiKey: process.env.OPENAI_KEY,
          })
          const openai = new OpenAIApi(configuration)

          const embeddingResponse = await openai.createEmbedding({
            model: 'text-embedding-ada-002',
            input,
          })

          if (embeddingResponse.status !== 200) {
            throw new Error(inspect(embeddingResponse.data, false, 2))
          }

          const [responseData] = embeddingResponse.data.data

          const { error: insertPageSectionError, data: pageSection } = await supabaseClient
            .from('nods_page_section')
            .insert({
              page_id: page.id,
              slug,
              heading,
              content,
              token_count: embeddingResponse.data.usage.total_tokens,
              embedding: responseData.embedding,
            })
            .select()
            .limit(1)
            .single()

          if (insertPageSectionError) {
            throw insertPageSectionError
          }
        } catch (err) {
          // TODO: decide how to better handle failed embeddings
          console.error(
            `Failed to generate embeddings for '${path}' page section starting with '${input.slice(
              0,
              40
            )}...'`
          )

          throw err
        }
      }

      // Set page checksum so that we know this page was stored successfully
      const { error: updatePageError } = await supabaseClient
        .from('nods_page')
        .update({ checksum })
        .filter('id', 'eq', page.id)

      if (updatePageError) {
        throw updatePageError
      }
    } catch (err) {
      console.error(
        `Page '${path}' or one/multiple of its page sections failed to store properly. Page has been marked with null checksum to indicate that it needs to be re-generated.`
      )
      console.error(err)
    }
  }

  console.log('Embedding generation complete')
}

async function main() {
  await generateEmbeddings()
}

main().catch((err) => console.error(err))

```
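
The `TypeError: fetch failed` comes from undici, i.e. the underlying HTTP request never completed. A one-off query with the same client setup can narrow down whether the Supabase side is reachable at all. This is a minimal sketch under the same env assumptions as the script, not a fix; it only exercises Supabase, so a similar one-off `createEmbedding` call would be needed to cover the OpenAI side:

```ts
// repro.ts (hypothetical) – if this also throws 'TypeError: fetch failed',
// the problem is reaching the Supabase URL (networking / URL value),
// not the embeddings logic.
import { createClient } from '@supabase/supabase-js'
import dotenv from 'dotenv'

dotenv.config()

const supabaseClient = createClient(
  process.env.NEXT_PUBLIC_SUPABASE_URL!,
  process.env.SUPABASE_SERVICE_ROLE_KEY!,
  { auth: { persistSession: false, autoRefreshToken: false } }
)

async function main() {
  // Same table the failing script queries first
  const { data, error } = await supabaseClient
    .from('nods_page')
    .select('id, path, checksum')
    .limit(1)

  console.log({ data, error })
}

main().catch((err) => console.error(err))
```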
