import { resolve } from 'path';
import {
existsSync,
mkdirSync,
readFileSync,
writeFileSync,
createReadStream,
} from 'fs';
import { parse } from '@fast-csv/parse';
import { sync } from 'glob';
// A text's title plus every canonical ref string found in (or linked to) it.
type RefsFile = { title: string; refs: string[] };
// Root of the local Sefaria-Export checkout, sibling of this script's folder.
const sefaria = resolve(`${import.meta.dirname}/../Sefaria-Export/`);
// CSV dumps of inter-text links (links0.csv, links1.csv, ...).
const linkFiles = sync(`${sefaria}/links/links[0-9]*.csv`);
// One merged.json per text, holding that text's full body.
const merged = sync(`${sefaria}/json/**/merged.json`);
// Output root for the indices this script writes.
const refsDir = resolve(`${import.meta.dirname}/../refs`);
const refsByTitleDir = `${refsDir}/texts`; // refs extracted from each text
const linksByTitleDir = `${refsDir}/links`; // refs cited by the links CSVs
mkdirSync(refsByTitleDir, { recursive: true });
mkdirSync(linksByTitleDir, { recursive: true });
// Build an in-memory cache of every schema file, keyed by lowercased title.
console.time('\nbuilding schema cache');
const metaByTitle: Record<string, any> = {};
const schemaFiles = sync(`${sefaria}/schemas/*.json`);
for (const schemaFile of schemaFiles) {
  const schema = readJson<any>(schemaFile);
  // Skip unreadable files and schemas without a title to key on.
  if (schema?.title) {
    metaByTitle[schema.title.toLowerCase()] = schema;
  }
}
console.timeEnd('\nbuilding schema cache');
console.log();
// Step 1: walk every merged.json and persist each text's refs to refs/texts.
console.time('\norganizing refs by title');
organizeTitleRefs();
console.timeEnd('\norganizing refs by title');
console.log();
// Step 2: scan the links CSVs and group cited refs by target title.
console.time('\ngrouping links by title');
await groupLinksByTitle();
console.timeEnd('\ngrouping links by title');
console.log();
// Step 3: verify every linked ref exists in the target text; report bad ones.
// Fix: the timer label previously said 'flattening title refs', which
// describes a different operation than checkTitleLinks() and made the
// timing output misleading.
console.time('\nchecking title links');
checkTitleLinks();
console.timeEnd('\nchecking title links');
console.log();
// #region Functions
/**
 * Reads and JSON-parses a file, treating the path case-insensitively
 * (all files in this project are written with lowercased paths; see
 * writeJson). Returns null when the file is missing or empty.
 */
function readJson<T>(file: string) {
  const location = file.toLowerCase();
  if (!existsSync(location)) return null;
  const raw = readFileSync(location, 'utf8');
  if (!raw) return null;
  return JSON.parse(raw) as T;
}
/**
 * Serializes data as pretty-printed (2-space) JSON to the lowercased path,
 * mirroring readJson's case-insensitive convention.
 */
function writeJson(file: string, data: any) {
  const contents = JSON.stringify(data, null, 2);
  writeFileSync(file.toLowerCase(), contents);
}
/**
 * Renders a zero-based section index as a human-readable section number.
 *
 * For 'Daf' (Talmud folio) sections, indices map pairwise onto folio sides:
 * 2 -> '2a', 3 -> '2b', 4 -> '3a', 5 -> '3b', ...
 * For every other section type the index is simply shifted to one-based.
 */
function format(number: number, type: string) {
  if (type !== 'Daf') return `${number + 1}`;
  const page = 2 + Math.floor((number - 2) / 2);
  const side = number % 2 ? 'b' : 'a';
  return `${page}${side}`;
}
/**
 * Converts a numeric index path into a text (e.g. ['0', '1']) into a
 * human-readable ref suffix (e.g. '1:2'), using the text's schema to decide
 * section formatting at each depth. Recurses through named nodes for
 * complex schemas. Returns '' when the schema or a node can't be resolved.
 *
 * NOTE(review): when index_offsets_by_depth applies, this mutates `path`
 * in place. All callers in this file pass freshly-built arrays, so the
 * mutation does not leak — but keep that invariant in mind when reusing.
 */
function createRefPath(schema: any, path: string[], depthOffset = 0): string {
  if (!schema) {
    console.warn(`No schema or nodes found for path: ${path}`);
    return '';
  }
  const { sectionNames, index_offsets_by_depth, nodes } = schema;
  if (index_offsets_by_depth) {
    for (let i = 0; i < path.length; i++) {
      const offset = index_offsets_by_depth[i + depthOffset];
      if (offset) {
        // assumes offsets at depth i are keyed by the previous path segment;
        // at i === 0 this reads offset[undefined], which would yield NaN —
        // TODO confirm the data never carries an offset at depth 0.
        path[i] = `${+path[i] + 1 + offset[path[i - 1]]}`;
      }
    }
  }
  if (sectionNames) {
    // Simple schema with direct sectionNames (e.g., Chapter and Verse)
    return path.map((num, i) => format(+num, sectionNames[i])).join(':');
  } else {
    // Complex schema with nodes, find the node for this path
    const node = nodes.find((node: any) => path[0] === node.title);
    if (!node) {
      console.warn(`No node found for ${schema.title} - ${path.join(' > ')}`);
      return '';
    }
    // Recurse into the matched node for the rest of the path, one level deeper.
    const rest = createRefPath(node, path.slice(1), depthOffset + 1);
    const title = node.title;
    // Numeric continuations attach with a space only; named ones get a comma.
    const comma = /^\d/.test(rest) ? '' : ',';
    return !title ? rest : `${title}${comma} ${rest}`;
  }
}
/**
 * Walks every merged.json text, derives a canonical ref string for each leaf
 * segment via the text's schema, and writes the accumulated refs to
 * refs/texts/<title>.json (merged with any refs already on disk, so reruns
 * are additive).
 */
function organizeTitleRefs() {
  console.log(`Organizing ${merged.length} texts by title`);
  let index = 0;
  for (const file of merged) {
    // Progress heartbeat every 1000 texts.
    if (++index % 1000 === 0) console.log(`Processed ${index} texts`);
    const { title, text } = readJson<any>(file)!;
    const schema = metaByTitle[title.toLowerCase()]?.schema;
    if (!schema) {
      console.log(`No schema for ${title}`);
      continue;
    }
    const location = `${refsByTitleDir}/${title}.json`;
    // Seed with previously-written refs, if any, then dedupe via the Set.
    const refsFile = readJson<RefsFile>(location) || { title, refs: [] };
    const allRefs = new Set<string>(refsFile.refs);
    // Depth-first walk of the text structure; `path` collects array indices
    // and node keys down to each leaf, which createRefPath renders as a ref.
    // NOTE: this local shadows Node's global `process` inside the closure.
    const process = (section: any, path: string[] = []) => {
      if (Array.isArray(section)) {
        for (const [index, item] of Object.entries(section)) {
          process(item, [...path, `${index}`]);
        }
      } else if (typeof section === 'object') {
        for (const [key, value] of Object.entries(section)) {
          process(value, [...path, key]);
        }
      } else {
        // Leaf segment: record its rendered ref.
        const refPath = createRefPath(schema, path);
        allRefs.add(refPath);
      }
    };
    process(text);
    const updateFile: RefsFile = { title, refs: Array.from(allRefs) };
    writeJson(location, updateFile);
  }
}
/**
 * Streams every links CSV, attributing each citation (both endpoints of a
 * link) to a known title, then writes one deduplicated refs file per title
 * into refs/links.
 */
async function groupLinksByTitle() {
  const linksByTitle: Record<string, string[]> = {};
  // Phase 1: scan every CSV row and bucket citations by title.
  for (const csvFile of linkFiles) {
    console.log(`Processing ${csvFile}`);
    const rows = createReadStream(csvFile).pipe(parse({ headers: true }));
    for await (const row of rows) {
      processCitation(row[`Citation 1`], linksByTitle);
      processCitation(row[`Citation 2`], linksByTitle);
    }
  }
  // Phase 2: dedupe each bucket and persist it.
  for (const [title, refs] of Object.entries(linksByTitle)) {
    const uniqueRefs = [...new Set(refs)];
    writeJson(`${linksByTitleDir}/${title}.json`, { title, refs: uniqueRefs });
  }
}
/**
 * Splits a citation like 'Genesis 1:1' into its work title and ref, using
 * the schema cache to recognize titles, and appends the ref to that title's
 * bucket. Logs when no known title prefix matches.
 */
function processCitation(
  citation: string,
  linksByTitle: Record<string, string[]>
) {
  // The capturing split keeps separators: tokens sit at even indices,
  // separators (' ' or ', ') at odd indices.
  const parts = citation.split(/( |(?:, ))/);
  // Try the longest possible title prefix first, shrinking token by token.
  let stop = parts.length;
  while (stop > 0) {
    const candidate = parts.slice(0, stop).join('');
    if (metaByTitle[candidate.toLowerCase()]) {
      // Everything after the matched prefix (minus its separator) is the ref.
      const ref = parts.slice(stop + 1).join('');
      (linksByTitle[candidate] ??= []).push(ref);
      return;
    }
    stop -= 2;
  }
  console.log(`No title found for ${citation}`);
}
/**
 * Validates every ref cited in refs/links against the refs actually present
 * in the corresponding refs/texts file. Bad refs are written to
 * refs/bad-refs.json as a readable one-title-per-line JSON object.
 *
 * Fixes:
 * - metaByTitle lookup now lowercases the key (the cache is keyed by
 *   lowercased title everywhere else), so actualTitle resolves correctly.
 * - Removed the non-null assertion on the refs-file read; readJson can
 *   return null and the very next line checks for that.
 * - The Set spread is hoisted out of the per-link loop instead of being
 *   rebuilt for every link.
 */
function checkTitleLinks() {
  const linkFiles = sync(`${linksByTitleDir}/*.json`);
  const badRefsByTitle: Record<string, string[]> = {};
  console.log(`Checking ${linkFiles.length} texts for bad links`);
  let index = 0;
  const noLinkRefs: string[] = [];
  for (const file of linkFiles) {
    // Progress heartbeat every 1000 texts.
    if (++index % 1000 === 0) {
      const found = Object.values(badRefsByTitle).flat().length;
      console.log(`Checked ${index} texts, found ${found} bad refs so far`);
    }
    const { title, refs: links } = readJson<RefsFile>(file)!;
    const refsByTitleLocation = `${refsByTitleDir}/${title}.json`;
    // null when the title has links but no organized text refs on disk.
    const refsFile = readJson<RefsFile>(refsByTitleLocation);
    if (!refsFile) {
      noLinkRefs.push(title);
      continue;
    }
    const allRefs = new Set(refsFile.refs);
    const refList = [...allRefs]; // hoisted: avoid re-spreading per link
    for (const link of links) {
      // For ranged refs like '2a-2b', checking the start is sufficient.
      const firstRange = link.split('-')[0];
      const found =
        allRefs.has(link) ||
        allRefs.has(firstRange) ||
        refList.some((ref) => ref.startsWith(firstRange));
      if (!found) {
        badRefsByTitle[title] = badRefsByTitle[title] || [];
        badRefsByTitle[title].push(link);
      }
    }
  }
  // Hand-build the JSON so each title's bad refs stay on a single line.
  const lines = Object.entries(badRefsByTitle).map(([title, refs]) => {
    const actualTitle = metaByTitle[title.toLowerCase()]?.title || title;
    const bads = Array.from(new Set(refs));
    return `  ${JSON.stringify(actualTitle)}: ${JSON.stringify(bads)}`;
  });
  const nicerBadRefString = `{\n${lines.join(',\n')}\n}`;
  writeFileSync(`${refsDir}/bad-refs.json`, nicerBadRefString);
  console.log(`No link refs found for ${JSON.stringify(noLinkRefs)}`);
  const found = Object.values(badRefsByTitle).flat().length;
  console.log(`Finished checking ${index} texts, found ${found} bad refs`);
}
// #endregion
node scripts/find-bad-refs.ts
building schema cache: 736.84ms
Organizing 7063 texts by title
No schema for Tosefta Tohorot
Processed 1000 texts
Processed 2000 texts
Processed 3000 texts
No schema for Terumot
No node found for English Explanation of Mishnah Makhshirin - intro > 0
No node found for English Explanation of Mishnah Makhshirin - intro > 1
No node found for English Explanation of Mishnah Makhshirin - intro > 2
No node found for English Explanation of Mishnah Makhshirin - intro > 3
Processed 4000 texts
No schema for Keter Malchut
Processed 5000 texts
Processed 6000 texts
Processed 7000 texts
organizing refs by title: 20.982s
Processing ~/link-finder/Sefaria-Export/links/links9.csv
Processing ~/link-finder/Sefaria-Export/links/links8.csv
Processing ~/link-finder/Sefaria-Export/links/links7.csv
Processing ~/link-finder/Sefaria-Export/links/links6.csv
Processing ~/link-finder/Sefaria-Export/links/links5.csv
Processing ~/link-finder/Sefaria-Export/links/links4.csv
Processing ~/link-finder/Sefaria-Export/links/links3.csv
Processing ~/link-finder/Sefaria-Export/links/links2.csv
Processing ~/link-finder/Sefaria-Export/links/links11.csv
Processing ~/link-finder/Sefaria-Export/links/links10.csv
Processing ~/link-finder/Sefaria-Export/links/links1.csv
Processing ~/link-finder/Sefaria-Export/links/links0.csv
grouping links by title: 28.763s
Checking 5759 texts for bad links
Checked 1000 texts, found 2679 bad refs so far
Checked 2000 texts, found 30521 bad refs so far
Checked 3000 texts, found 34662 bad refs so far
Checked 4000 texts, found 36663 bad refs so far
Checked 5000 texts, found 38135 bad refs so far
No link refs found for ["The Five Books of Moses, by Everett Fox","Sefer HaShorashim","Klein Dictionary","Jastrow","BDB","BDB Aramaic","Animadversions by Elias Levita on Sefer HaShorashim"]
Finished checking 5759 texts, found 38328 bad refs
flattening title refs: 7.320s
Execution time: 59 s.
Hi there. I found a bunch of invalid links in the
`links\d+.csv` files.
Script to find the invalid links:
Here is the script output:
I've attached
`bad-refs.json` as a file, as it was too large to be inlined in this comment.