Skip to content

Commit c314bf8

Browse files
committed
code cleanup
1 parent e0b0698 commit c314bf8

9 files changed

+125
-115
lines changed

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"notion-download": "node dist/index.js",
1515
"cmdhelp": "ts-node --compiler-options \"{\\\"module\\\": \\\"commonjs\\\"}\" src/index.ts",
1616
"// test out with a private sample notion db": "",
17-
"pull-test": "cross-var ts-node --compiler-options \"{\\\"module\\\": \\\"commonjs\\\"}\" src/index.ts -n %DOCU_NOTION_INTEGRATION_TOKEN% -r %DOCU_NOTION_TEST_ROOT_PAGE%",
17+
"pull-test": "cross-var ts-node --compiler-options \"{\\\"module\\\": \\\"commonjs\\\"}\" src/index.ts -n %DOCU_NOTION_INTEGRATION_TOKEN% -r %DOCU_NOTION_TEST_ROOT_PAGE% --log-level debug",
1818
"// test with a semi-stable/public site:": "",
1919
"pull-sample": "cross-var ts-node --compiler-options \"{\\\"module\\\": \\\"commonjs\\\"}\" src/index.ts -n %DOCU_NOTION_INTEGRATION_TOKEN% -r %DOCU_NOTION_SAMPLE_ROOT_PAGE% -m ./sample --locales en,es,fr,de --log-level verbose",
2020
"pull-sample-with-paths": "cross-var ts-node --compiler-options \"{\\\"module\\\": \\\"commonjs\\\"}\" src/index.ts -n %DOCU_NOTION_INTEGRATION_TOKEN% -r %DOCU_NOTION_SAMPLE_ROOT_PAGE% -m ./sample --img-output-path ./sample_img"

src/HierarchicalNamedLayoutStrategy.ts

+4-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,10 @@ export class HierarchicalNamedLayoutStrategy extends LayoutStrategy {
3535
sanitize(page.nameOrTitle) +
3636
extensionWithDot;
3737

38-
path = path.replace("//", "/");
38+
path = path
39+
.replaceAll("//", "/")
40+
.replaceAll("%20", "-")
41+
.replaceAll(" ", "-");
3942
// console.log(
4043
// `getPathForPage(${context}, ${pageId}, ${title}) with root ${this.rootDirectory} --> ${path}`
4144
// );

src/LayoutStrategy.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ export abstract class LayoutStrategy {
3434

3535
public getLinkPathForPage(page: NotionPage): string {
3636
// the url we return starts with a "/", meaning it is relative to the markdown root (e.g. the /docs root in Docusaurus)
37-
return this.getPathForPage(page, ".md").replace(this.rootDirectory, "");
37+
return this.getPathForPage(page, "").replace(this.rootDirectory, "");
3838
}
3939

4040
public pageWasSeen(page: NotionPage): void {

src/MakeImagePersistencePlan.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { ImageSet } from "./NotionImage";
1+
import { ImageSet } from "./images";
22
import * as Path from "path";
33
import { error } from "./log";
44

src/NotionImage-CaptionReading.spec.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { parseImageBlock } from "./NotionImage";
1+
import { parseImageBlock } from "./images";
22

33
const kPrimaryImageUrl =
44
"https://s3.us-west-2.amazonaws.com/primaryImage.png?Blah=foo";

src/NotionImage.ts renamed to src/images.ts

+21-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import fetch from "node-fetch";
44
import * as Path from "path";
55
import { makeImagePersistencePlan } from "./MakeImagePersistencePlan";
66
import { logDebug, verbose, info } from "./log";
7+
import { ListBlockChildrenResponse } from "@notionhq/client/build/src/api-endpoints";
78

89
let existingImagesNotSeenYetInPull: string[] = [];
910
let imageOutputPath = ""; // default to putting in the same directory as the document referring to it.
@@ -58,6 +59,25 @@ export async function initImageHandling(
5859
}
5960
}
6061

62+
/**
 * Walks `blocks` in order and passes every block that has an `image`
 * property to `processImageBlock`, together with the two path arguments.
 *
 * Each call is awaited before the next block is examined, so image blocks
 * are handled strictly one at a time.
 *
 * @param blocks - Blocks to scan; blocks without an `image` property are skipped.
 * @param fullPathToDirectoryContainingMarkdown - Forwarded unchanged to `processImageBlock`.
 * @param relativePathToThisPage - Forwarded unchanged to `processImageBlock`.
 */
export async function outputImages(
  blocks: (
    | ListBlockChildrenResponse
    | /* not avail in types: BlockObjectResponse so we use any*/ any
  )[],
  fullPathToDirectoryContainingMarkdown: string,
  relativePathToThisPage: string
): Promise<void> {
  for (const block of blocks) {
    // Anything that is not an image block is none of our business.
    if (!("image" in block)) continue;

    await processImageBlock(
      block,
      fullPathToDirectoryContainingMarkdown,
      relativePathToThisPage
    );
  }
}
80+
6181
async function readPrimaryImage(imageSet: ImageSet) {
6282
const response = await fetch(imageSet.primaryUrl);
6383
const arrayBuffer = await response.arrayBuffer();
@@ -151,7 +171,7 @@ export function parseImageBlock(b: any): ImageSet {
151171

152172
// Download the image if we don't have it, give it a good name, and
153173
// change the src to point to our copy of the image.
154-
export async function processImageBlock(
174+
async function processImageBlock(
155175
b: any,
156176
pathToParentDocument: string,
157177
relativePathToThisPage: string

src/links.ts

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import { LayoutStrategy } from "./LayoutStrategy";
2+
import { verbose, warning } from "./log";
3+
import { NotionPage } from "./NotionPage";
4+
5+
/**
 * Rewrites the links in `markdown` that point at one of the known `pages`
 * so that they use the path chosen by `layoutStrategy`.
 *
 * Every link that `transformLinks` finds is looked up with
 * `NotionPage.matchesLinkId`. When a page matches, the link target becomes
 * `layoutStrategy.getLinkPathForPage(page)`. When nothing matches, a warning
 * is logged and the url is handed back unchanged.
 *
 * @param markdown - The markdown text whose links should be converted.
 * @param pages - All pages that a link may legitimately point at.
 * @param layoutStrategy - Supplies the final link path for a matched page.
 * @returns The markdown with convertible links rewritten.
 */
export function convertInternalLinks(
  markdown: string,
  pages: NotionPage[],
  layoutStrategy: LayoutStrategy
): string {
  return transformLinks(markdown, (url: string) => {
    const targetPage = pages.find(candidate => candidate.matchesLinkId(url));

    if (!targetPage) {
      warning(
        `Could not find the target of this link. Note that links to outline sections are not supported. ${url}`
      );
      return url;
    }

    // Compute the path once and use it for both the log line and the result.
    const linkPath = layoutStrategy.getLinkPathForPage(targetPage);
    verbose(`Converting Link ${url} --> ${linkPath}`);
    return linkPath;
  });
}
30+
// function convertInternalLinks(
31+
// blocks: (
32+
// | ListBlockChildrenResponse
33+
// | /* not avail in types: BlockObjectResponse so we use any*/ any
34+
// )[]
35+
// ): void {
36+
// // Note. Waiting on https://github.com/souvikinator/notion-to-md/issues/31 before we can get at raw links to other pages.
37+
// // But we can do the conversion now... they just won't actually make it out to the markdown until that gets fixed.
38+
// // blocks
39+
// // .filter((b: any) => b.type === "link_to_page")
40+
// // .forEach((b: any) => {
41+
// // const targetId = b.link_to_page.page_id;
42+
// // });
43+
44+
// blocks
45+
// .filter((b: any) => b.paragraph.rich_text. === "link_to_page")
46+
// .forEach((b: any) => {
47+
// const targetId = b.text.link.url;
48+
// });
49+
// }
50+
51+
/**
 * Runs `transform` over the href of every markdown link in `input` that the
 * link pattern below recognises, and returns the text with those links rewritten.
 *
 * Only hrefs made of a single segment (optionally preceded by one "/") are
 * matched; an href containing any further "/" (e.g. "https://...") is left alone.
 *
 * For each matched link:
 *  - if `transform(url)` returns a non-empty string, the link is re-emitted as
 *    `[text](replacement)`. The optional leading "/" of the original href is
 *    not part of `url` and is not re-added;
 *  - otherwise the link is left exactly as it was and a verbose note is logged.
 *
 * @param input - Markdown text to scan.
 * @param transform - Maps the captured href (without leading "/") to its new value.
 * @returns `input` with every matched link rewritten in place.
 */
function transformLinks(
  input: string,
  transform: (url: string) => string
): string {
  // Note: from notion (or notion-md?) we get slightly different hrefs depending on whether the link is "inline"
  // (has some other text that's been turned into a link) or "raw".
  // Raw links come in without a leading slash, e.g. [link_to_page](4a6de8c0-b90b-444b-8a7b-d534d6ec71a4)
  // Inline links come in with a leading slash, e.g. [pointer to the introduction](/4a6de8c0b90b444b8a7bd534d6ec71a4)
  // NOTE(review): inside the last character class the "," and the second "^" are literal
  // exclusions, which may not have been intended. The pattern is kept as-is so that
  // exactly the same links are matched as before.
  const linkRegExp = /\[([^\]]+)?\]\(\/?([^),^/]+)\)/g;

  verbose(`transformLinks ${input}`);

  // A single pass with a function replacer rewrites each match at its own position.
  // This avoids two problems of searching the output for the matched text and
  // replacing its first occurrence:
  //  * "$" sequences in the link text or new url (e.g. "$$", "$&") being interpreted
  //    as special replacement patterns, and
  //  * an already-rewritten link being rewritten again because it happens to equal
  //    the text of a later link.
  return input.replace(
    linkRegExp,
    (wholeLink: string, linkText: string | undefined, url: string) => {
      const replacement = transform(url);

      if (!replacement) {
        verbose(
          `Maybe problem with link ${JSON.stringify([wholeLink, linkText, url])}`
        );
        return wholeLink;
      }

      return `[${linkText || ""}](${replacement})`;
    }
  );
}

src/makeImagePersistencePlan.spec.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { makeImagePersistencePlan } from "./MakeImagePersistencePlan";
2-
import { ImageSet } from "./NotionImage";
2+
import { ImageSet } from "./images";
33

44
test("primary file with explicit file output path and prefix", () => {
55
const imageSet: ImageSet = {

src/pull.ts

+17-108
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,16 @@
11
import * as fs from "fs-extra";
22

33
import { NotionToMarkdown } from "notion-to-md";
4-
import { ListBlockChildrenResponse } from "@notionhq/client/build/src/api-endpoints";
54
import { HierarchicalNamedLayoutStrategy } from "./HierarchicalNamedLayoutStrategy";
65
import { LayoutStrategy } from "./LayoutStrategy";
76
import { initNotionClient, NotionPage, PageType } from "./NotionPage";
8-
import {
9-
initImageHandling,
10-
processImageBlock,
11-
cleanupOldImages,
12-
} from "./NotionImage";
7+
import { initImageHandling, cleanupOldImages, outputImages } from "./images";
138

149
import { tweakForDocusaurus } from "./DocusaurusTweaks";
1510
import { setupCustomTransformers } from "./CustomTranformers";
1611
import * as Path from "path";
17-
import { error, info, verbose, warning } from "./log";
18-
19-
//import { FlatGuidLayoutStrategy } from "./FlatGuidLayoutStrategy";
12+
import { error, info, logDebug, verbose, warning } from "./log";
13+
import { convertInternalLinks } from "./links";
2014

2115
export type Options = {
2216
notionToken: string;
@@ -29,7 +23,6 @@ export type Options = {
2923
};
3024

3125
let options: Options;
32-
3326
let currentSidebarPosition = 0;
3427
let layoutStrategy: LayoutStrategy;
3528
let notionToMarkdown: NotionToMarkdown;
@@ -56,12 +49,11 @@ export async function notionPull(incomingOptions: Options): Promise<void> {
5649
notionToMarkdown = new NotionToMarkdown({ notionClient });
5750
setupCustomTransformers(notionToMarkdown, notionClient);
5851
layoutStrategy = new HierarchicalNamedLayoutStrategy();
59-
//layoutStrategy = new FlatGuidLayoutStrategy();
6052

6153
await fs.mkdir(options.markdownOutputPath, { recursive: true });
6254
layoutStrategy.setRootDirectoryForMarkdown(options.markdownOutputPath);
6355

64-
console.log("Connecting to Notion...");
56+
info("Connecting to Notion...");
6557
// About the complication here of getting all the pages first and then output
6658
// them all. It would be simpler to just do it all in one pass, however the
6759
// two passes are required in order to change links between
@@ -71,8 +63,7 @@ export async function notionPull(incomingOptions: Options): Promise<void> {
7163
// do this link fixing until we've already seen all the pages and
7264
// figured out what their eventual relative url will be.
7365
await getPagesRecursively("", options.rootPage, true);
74-
// console.log("***Pages***");
75-
// console.log(JSON.stringify(pages, null, 2));
66+
logDebug("getPagesRecursively", JSON.stringify(pages, null, 2));
7667
await outputPages(pages);
7768
await layoutStrategy.cleanupOldFiles();
7869
await cleanupOldImages();
@@ -81,20 +72,15 @@ export async function notionPull(incomingOptions: Options): Promise<void> {
8172
async function outputPages(pages: Array<NotionPage>) {
8273
for (const page of pages) {
8374
await outputPage(page);
84-
// if (page.type === PageType.DatabasePage) await processDatabasePage(page);
85-
// if (page.type === PageType.Simple) await processSimplePage(page);
8675
}
8776
}
8877

8978
// This walks the "Outline" page and creates a list of all the nodes that will
9079
// be in the sidebar, including the directories, the pages that are linked to
9180
// that are parented in from the "Database", and any pages we find in the
92-
// outline that contain content (which we call "Simple" pages).
93-
// It does not generate any files. Later, we can
94-
// then step through this list creating the directories and files we need, and,
95-
// crucially, be able to figure out what the url will be for any links between
96-
// content pages.
97-
// FIX comment above: actually, the HierarchicalNamedLayoutStrategy does create directories.
81+
// outline that contain content (which we call "Simple" pages). Later, we can
82+
// then step through this list creating the files we need, and, crucially, be
83+
// able to figure out what the url will be for any links between content pages.
9884
async function getPagesRecursively(
9985
incomingContext: string,
10086
pageId: string,
@@ -117,10 +103,16 @@ async function getPagesRecursively(
117103
}
118104
if (!rootLevel && pageInfo.hasParagraphs) {
119105
pages.push(pageInTheOutline);
120-
if (pageInfo.linksPages)
106+
107+
// The best practice is to keep content pages in the "database" (kanban), but we do allow people to make pages in the outline directly.
108+
// So how can we tell the difference between a page that is supposed to be content and one that is meant to form the sidebar? If it
109+
// has just links, then it's a page for forming the sidebar. If it has contents and no links, then it's a content page. But what if
110+
// it has both? Well then we assume it's a content page.
111+
if (pageInfo.linksPages?.length) {
121112
warning(
122-
`Ambiguity: The page "${pageInTheOutline.nameOrTitle}" is in the outline, has content, and also points at other pages. It will be treated as a simple content page.`
113+
`Note: The page "${pageInTheOutline.nameOrTitle}" is in the outline, has content, and also points at other pages. It will be treated as a simple content page. This is no problem, unless you intended to have all your content pages in the database (kanban workflow) section.`
123114
);
115+
}
124116
}
125117
// a normal outline page that exists just to create the level, pointing at database pages that belong in this level
126118
else if (pageInfo.childPages.length || pageInfo.linksPages.length) {
@@ -195,93 +187,10 @@ async function outputPage(page: NotionPage) {
195187
frontmatter += "---\n";
196188

197189
let markdown = notionToMarkdown.toMarkdownString(mdBlocks);
198-
markdown = convertInternalLinks(markdown);
190+
markdown = convertInternalLinks(markdown, pages, layoutStrategy);
199191

200192
const { body, imports } = tweakForDocusaurus(markdown);
201193
const output = `${frontmatter}\n${imports}\n${body}`;
202194

203195
fs.writeFileSync(mdPath, output, {});
204196
}
205-
206-
async function outputImages(
207-
blocks: (
208-
| ListBlockChildrenResponse
209-
| /* not avail in types: BlockObjectResponse so we use any*/ any
210-
)[],
211-
fullPathToDirectoryContainingMarkdown: string,
212-
relativePathToThisPage: string
213-
): Promise<void> {
214-
for (const b of blocks) {
215-
if ("image" in b) {
216-
await processImageBlock(
217-
b,
218-
fullPathToDirectoryContainingMarkdown,
219-
relativePathToThisPage
220-
);
221-
}
222-
}
223-
}
224-
225-
function convertInternalLinks(markdown: string): string {
226-
//console.log(JSON.stringify(pages, null, 2));
227-
228-
return transformLinks(markdown, (url: string) => {
229-
const p = pages.find(p => {
230-
return p.matchesLinkId(url);
231-
});
232-
if (p) {
233-
verbose(
234-
`Convering Link ${url} --> ${layoutStrategy.getLinkPathForPage(p)}`
235-
);
236-
return layoutStrategy.getLinkPathForPage(p);
237-
}
238-
239-
warning(
240-
`Could not find the target of this link. Note that links to outline sections are not supported. ${url}`
241-
);
242-
243-
return url;
244-
});
245-
}
246-
// function convertInternalLinks(
247-
// blocks: (
248-
// | ListBlockChildrenResponse
249-
// | /* not avail in types: BlockObjectResponse so we use any*/ any
250-
// )[]
251-
// ): void {
252-
// // Note. Waiting on https://github.com/souvikinator/notion-to-md/issues/31 before we can get at raw links to other pages.
253-
// // But we can do the conversion now... they just won't actually make it out to the markdown until that gets fixed.
254-
// // blocks
255-
// // .filter((b: any) => b.type === "link_to_page")
256-
// // .forEach((b: any) => {
257-
// // const targetId = b.link_to_page.page_id;
258-
// // });
259-
260-
// blocks
261-
// .filter((b: any) => b.paragraph.rich_text. === "link_to_page")
262-
// .forEach((b: any) => {
263-
// const targetId = b.text.link.url;
264-
// });
265-
// }
266-
267-
function transformLinks(input: string, transform: (url: string) => string) {
268-
const linkRegExp = /\[([^\]]+)?\]\(\/([^),^/]+)\)/g;
269-
let output = input;
270-
let match;
271-
272-
// The key to understanding this while is that linkRegExp actually has state, and
273-
// it gives you a new one each time. https://stackoverflow.com/a/1520853/723299
274-
while ((match = linkRegExp.exec(input)) !== null) {
275-
const string = match[0];
276-
const text = match[1] || "";
277-
const url = match[2];
278-
279-
const replacement = transform(url);
280-
281-
if (replacement) {
282-
output = output.replace(string, `[${text}](${replacement})`);
283-
}
284-
}
285-
286-
return output;
287-
}

0 commit comments

Comments
 (0)