diff --git a/.env.example b/.env.example index a9693586786..49c5ef6d22b 100644 --- a/.env.example +++ b/.env.example @@ -185,3 +185,27 @@ # VERCEL_ENV= # NEXT_PUBLIC_VERSION= # NEXT_BUILD_STANDALONE= + +# ---------- notion-i18n-translator(CLI 翻译脚本) ---------- +# 仅供 `yarn translate` 系列脚本使用,详见 scripts/translate/README.md。 +# 真实值请放入 .env.local(已被 gitignore);本文件仅作模板。 +# +# Notion 集成 token(https://www.notion.so/my-integrations);下方两个数据库 +# 必须均与该集成连接(在数据库页右上角 ••• → Connections 添加)。 +# NOTION_TOKEN= +# 两个语言数据库的 ID — 形如 xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx,可从数据库 URL 中提取。 +# NOTION_DB_EN_ID= +# NOTION_DB_ZH_ID= +# +# 翻译提供方:deepseek(默认)或 glm +# TRANSLATOR_PROVIDER=deepseek +# DEEPSEEK_API_KEY= +# DEEPSEEK_MODEL=deepseek-chat +# DEEPSEEK_BASE_URL=https://api.deepseek.com/v1 +# GLM_API_KEY= +# GLM_MODEL=glm-4-plus +# GLM_BASE_URL=https://open.bigmodel.cn/api/paas/v4 +# +# 并发与安全 +# TRANSLATOR_CONCURRENCY=8 +# TRANSLATOR_BUDGET_TOKENS_PER_RUN=500000 diff --git a/.gitignore b/.gitignore index 162ead61b7b..aaee050dadd 100644 --- a/.gitignore +++ b/.gitignore @@ -56,4 +56,4 @@ package-lock.json .tmp/ .perf/ tsconfig.tsbuildinfo -webpack-internal:/ \ No newline at end of file +webpack-internal:/ diff --git a/__tests__/scripts/translate/block-mapper.test.js b/__tests__/scripts/translate/block-mapper.test.js new file mode 100644 index 00000000000..2b3c5237103 --- /dev/null +++ b/__tests__/scripts/translate/block-mapper.test.js @@ -0,0 +1,87 @@ +const { translateBlock, richTextToString, rebuildRichText } = require('../../../scripts/translate/block-mapper') + +const fakeCtx = { + translateText: async (text, opts) => { + if (opts?.hint === 'mermaid') return { text: text.replace(/中国/g, 'China') } + return { text: '[' + text + ']' } + } +} + +describe('translate/block-mapper', () => { + test('paragraph rich_text gets translated', async () => { + const block = { + type: 'paragraph', + paragraph: { + rich_text: [ + { + type: 'text', + text: { content: 'hi' }, + annotations: { bold: true }, + 
plain_text: 'hi' + } + ] + } + } + const out = await translateBlock(block, fakeCtx) + expect(out.paragraph.rich_text[0].text.content).toBe('[hi]') + expect(out.paragraph.rich_text[0].annotations.bold).toBe(true) + }) + + test('non-mermaid code blocks are preserved verbatim', async () => { + const block = { + type: 'code', + code: { + rich_text: [{ type: 'text', text: { content: 'x = 1' }, plain_text: 'x = 1' }], + language: 'python' + } + } + const out = await translateBlock(block, fakeCtx) + expect(out.code.rich_text[0].text.content).toBe('x = 1') + expect(out.code.language).toBe('python') + }) + + test('mermaid code blocks translate labels with a hint', async () => { + const block = { + type: 'code', + code: { + rich_text: [{ type: 'text', text: { content: 'pie title 国家\n "中国" : 50' }, plain_text: 'pie title 国家\n "中国" : 50' }], + language: 'mermaid' + } + } + const out = await translateBlock(block, fakeCtx) + expect(out.code.rich_text[0].text.content).toContain('China') + expect(out.code.language).toBe('mermaid') + }) + + test('column_list returns null (skipped)', async () => { + const out = await translateBlock({ type: 'column_list', column_list: {} }, fakeCtx) + expect(out).toBeNull() + }) + + test('image blocks pass through without translation', async () => { + const block = { + type: 'image', + image: { type: 'external', external: { url: 'https://example.com/x.png' } } + } + const out = await translateBlock(block, fakeCtx) + expect(out.type).toBe('image') + expect(out.image.external.url).toBe('https://example.com/x.png') + }) + + test('rebuildRichText preserves single-segment annotations', () => { + const original = [ + { type: 'text', text: { content: 'hi' }, annotations: { italic: true }, plain_text: 'hi' } + ] + const out = rebuildRichText('你好', original) + expect(out[0].text.content).toBe('你好') + expect(out[0].annotations.italic).toBe(true) + }) + + test('richTextToString concatenates all segments', () => { + const rich = [ + { plain_text: 'foo ' }, + { 
// Builds a minimal paragraph block fixture whose only rich-text segment
// carries the given plain_text.
const para = text => ({
  type: 'paragraph',
  paragraph: { rich_text: [{ plain_text: text }] }
})

// Unit tests for scripts/translate/state: the content hash helper and the
// block-type classification sets exported by that module.
describe('translate/state', () => {
  // Two structurally identical block lists must hash to the same digest.
  test('sha256OfBlocks is deterministic for identical inputs', () => {
    const a = [para('hello world'), { type: 'divider' }]
    const b = [para('hello world'), { type: 'divider' }]
    expect(sha256OfBlocks(a)).toBe(sha256OfBlocks(b))
  })

  // A one-character change in the text must change the digest.
  test('sha256OfBlocks changes when content changes', () => {
    const a = [para('hello world')]
    const b = [para('hello world!')]
    expect(sha256OfBlocks(a)).not.toBe(sha256OfBlocks(b))
  })

  // No block type may appear in more than one of the three classification sets.
  test('block-type classifications are disjoint', () => {
    for (const t of TRANSLATABLE_BLOCK_TYPES) {
      expect(COPY_AS_IS_BLOCK_TYPES.has(t)).toBe(false)
      expect(SKIP_BLOCK_TYPES.has(t)).toBe(false)
    }
    for (const t of COPY_AS_IS_BLOCK_TYPES) {
      expect(SKIP_BLOCK_TYPES.has(t)).toBe(false)
    }
  })

  // Container-style blocks are expected to be in the skip set.
  test('column_list and table are skipped (require inlined children on create)', () => {
    expect(SKIP_BLOCK_TYPES.has('column_list')).toBe(true)
    expect(SKIP_BLOCK_TYPES.has('table')).toBe(true)
    expect(SKIP_BLOCK_TYPES.has('synced_block')).toBe(true)
  })

  // Mermaid code blocks must be registered as translatable diagram languages.
  test('mermaid is in TRANSLATABLE_CODE_LANGUAGES', () => {
    expect(TRANSLATABLE_CODE_LANGUAGES.has('mermaid')).toBe(true)
  })
})
if (!window.location.pathname === '/') { + if (window.location.pathname !== '/') { return } // 没有开启多语言 diff --git a/package.json b/package.json index 56ccb9c3d91..2ffa1e109ff 100644 --- a/package.json +++ b/package.json @@ -54,6 +54,11 @@ "perf:lighthouse": "lhci autorun", "perf:audit:themes": "node scripts/audit-theme-performance.js", "perf:compress-theme-previews": "node scripts/compress-theme-previews.js", + "translate": "node scripts/translate/index.js", + "translate:all": "node scripts/translate/index.js --batch", + "translate:check": "node scripts/translate/index.js --check-drift", + "translate:backfill": "node scripts/translate/index.js --backfill", + "translate:diagnose": "node scripts/translate/index.js --diagnose", "postinstall": "patch-package" }, "dependencies": { @@ -87,6 +92,7 @@ "devDependencies": { "@lhci/cli": "^0.15.1", "@netlify/plugin-nextjs": "^5.15.8", + "@notionhq/client": "^2.2.15", "@testing-library/jest-dom": "^6.1.4", "@testing-library/react": "^14.1.2", "@testing-library/user-event": "^14.5.1", diff --git a/scripts/translate/README.md b/scripts/translate/README.md new file mode 100644 index 00000000000..10bc5dc16d2 --- /dev/null +++ b/scripts/translate/README.md @@ -0,0 +1,142 @@ +# notion-i18n-translator(中英双数据库自动翻译) + +为 NotionNext 博客提供两个语言数据库(中文库与英文库)之间的自动双向翻译。源语言由页面所在数据库决定,无需在每篇文章上额外维护 `lang` 字段。 + +默认语言对:**`zh-CN` ↔ `en-US`**。已在 DeepSeek(V3+)与 GLM-4 上验证通过。翻译提供方接口很小(`{ text, sourceLang, targetLang, glossary, hint } → { text, inputTokens, outputTokens }`),可在 `providers/` 下扩展其他模型。 + +## 工作原理 + +1. 维护两个 Notion 数据库,每个对应一种语言。它们分别填入 `blog.config.js` 的 `NOTION_PAGE_ID`(多语言形式 `id1,en:id2`)。 +2. 翻译时,脚本从源数据库读取页面,将每个可翻译块送入大模型,并在另一语言的数据库中创建或更新对应的「孪生页面」。 +3. 通过两端页面上的 `paired_with` 文本属性记录配对关系(双向 UUID)。目标页面同时记录源内容的 SHA-256(`source_hash`),未变化时跳过翻译,幂等可重入。 +4. 代码块、公式、嵌入、文件、链接、书签等会原样保留。Mermaid / PlantUML 代码块例外:仅翻译其中可见的标签文本,严格保留语法结构。 +5. 
已人工修改过的目标页面,可勾选 `translation_locked` 锁定,后续即使源页面发生漂移也不会被覆盖。 + +## 数据库 schema 要求 + +请在**两个数据库中**分别新增以下属性(名称需保持一致): + +| 属性 | 类型 | 用途 | +|---|---|---| +| `paired_with` | Text | 对端语言数据库中孪生页面的 UUID | +| `translation_locked` | Checkbox | 勾选后,重新同步时不覆盖该页面 | +| `source_hash` | Text | 源页面可翻译内容的 SHA-256,由脚本写入 | + +NotionNext 已有的属性(`title`、`slug`、`status`、`type`、`category`、`tags`、`date`、`summary`、`icon`、`password`)会被读取使用,但不会被翻译脚本修改。 + +## 配置 + +翻译脚本从项目根目录的 `.env.local`(已被 gitignore,与 Next.js 应用读取的是同一份)中读取配置;不再在 `scripts/` 下保留独立 env 文件。所有变量已写入根目录 `.env.example` 的 `notion-i18n-translator` 段落。 + +```bash +# 1. 在已有的 .env.local 末尾追加翻译相关变量 +cat >> .env.local <<'EOF' +NOTION_TOKEN=secret_xxx +NOTION_DB_EN_ID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +NOTION_DB_ZH_ID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx +DEEPSEEK_API_KEY=sk-xxx +EOF + +# 2. 安装依赖(脚本仅新增 @notionhq/client,作为 devDependency) +yarn install +``` + +请到 https://www.notion.so/my-integrations 创建一个 Notion 集成,然后**将两个数据库都与该集成连接**(在每个数据库页面右上角 `•••` → Connections → Add connections,选择刚创建的集成)。 + +随后可运行诊断检查权限是否到位: + +```bash +yarn translate:diagnose +``` + +## 用法 + +```bash +# 翻译单个页面(自动从源数据库定位到对端数据库) +yarn translate <页面 id 或 URL> + +# 批量翻译两库中所有 Published 状态、尚未配对的文章 +yarn translate:all + +# 限制源语言 +yarn translate:all --from zh-CN # 仅中→英 +yarn translate:all --from en-US # 仅英→中 + +# 包含 Draft +yarn translate:all --include-drafts + +# 强制重新翻译已配对的页面(用于内容漂移修复) +yarn translate:all --include-paired --force + +# 交互式将已有的人工翻译进行配对(不调用翻译 API,免费) +yarn translate:backfill + +# 检查与上次同步相比已发生漂移的页面 +yarn translate:check + +# 即使 source_hash 一致也强制重译 +yarn translate <页面 id> --force + +# 干跑:只打印将要进行的操作和目标位置,不调用翻译 API、不写入 Notion +yarn translate:all --dry-run +``` + +## 翻译提供方 + +通过 `.env.local` 中的 `TRANSLATOR_PROVIDER` 切换,默认 `deepseek`。 + +| 提供方 | 模型环境变量 | 默认值 | 价格(约) | +|---|---|---|---| +| `deepseek` | `DEEPSEEK_MODEL` | `deepseek-chat` | $0.27/M 输入,$1.10/M 输出 | +| `glm` | `GLM_MODEL` | `glm-4-plus` | 以官网为准 | + +`deepseek-chat` 是平台级别名,会自动指向当前最新对话模型;如手动指定具体版本(如 `deepseek-v3`)后返回 404,可改回 `deepseek-chat`。 + 
+## 并发、预算与重试 + +| 环境变量 | 默认值 | 说明 | +|---|---|---| +| `TRANSLATOR_CONCURRENCY` | `8` | 单页面内块翻译的并发度(瓶颈在 LLM,不在 Notion) | +| `TRANSLATOR_BUDGET_TOKENS_PER_RUN` | `500000` | 单次运行的硬性 token 上限,超出立即抛错终止 | + +所有 Notion API 调用统一封装了指数退避重试(429 / 5xx / 超时),偶发的 502 不会中断长批量任务。 + +## 词表与分类映射 + +- `glossary.json` — 翻译时原样保留的术语列表(如 `Notion`、`LLM`、`React`),可按需追加。 +- `category-map.json` — `category`(select)与 `tags`(multi-select)属性的双向映射;未登记的值在写入目标库时会被跳过并在控制台警告,因此初次使用可以保持空映射。 + +## 成本与耗时 + +一篇 1500 字的博客大约 5K 输入 + 10K 输出 token,使用 DeepSeek 约 **$0.02 一篇**。并发 8 时,~30 块的页面约 60–90 秒完成。 + +## 块处理规则 + +| 块类型 | 行为 | +|---|---| +| `paragraph`、`heading_1/2/3`、`bulleted_list_item`、`numbered_list_item`、`quote`、`callout`、`toggle`、`to_do` | 翻译 rich_text;保留加粗/斜体/链接等格式注释 | +| `code`(mermaid / plantuml) | 仅翻译标签,保留语法 | +| `code`(其他语言)、`equation`、`image`、`video`、`file`、`embed`、`divider`、`table_of_contents`、`bookmark`、`breadcrumb`、`link_preview`、`link_to_page`、`child_page`、`child_database` | 原样保留 | +| `column_list`、`column`、`table`、`table_row`、`synced_block`、`unsupported` | 跳过 — Notion 创建接口要求这些块在 body 中携带子节点,而扁平拉取无法包含嵌套子节点 | + +## 文件结构 + +``` +scripts/translate/ + index.js CLI 入口(yarn translate / translate:all 等) + pipeline.js 主流程:读取 → 比对哈希 → 翻译 → 写入 → 双向链接 + config.js 语言↔数据库 解析、分类/标签映射工具函数 + notion-client.js 封装 @notionhq/client,附带重试逻辑 + block-mapper.js 按块类型的翻译与字段净化 + state.js SHA-256 与块类型分类集合 + backfill.js 交互式跨库配对工具 + diagnose.js 列出集成可访问的所有数据库,定位权限问题 + glossary.json 不翻译术语清单 + category-map.json select / multi_select 双向映射 + load-env.js 零依赖 .env 加载器(读取项目根目录 .env.local / .env) + providers/ + index.js 提供方选择器(按 TRANSLATOR_PROVIDER) + deepseek.js DeepSeek(OpenAI 兼容) + glm.js 智谱 GLM + _http.js 带超时的 fetch 封装 +``` diff --git a/scripts/translate/backfill.js b/scripts/translate/backfill.js new file mode 100644 index 00000000000..ece25a59c94 --- /dev/null +++ b/scripts/translate/backfill.js @@ -0,0 +1,98 @@ +const readline = require('readline') +const notion = require('./notion-client') +const { envIds, langFromDb } = 
/**
 * Breaks a title or slug into lowercase word tokens.
 *
 * A space is inserted wherever a Latin letter/digit touches a CJK ideograph so
 * mixed strings split into separate words. Tokens shorter than two characters
 * are dropped.
 *
 * @param {string} s - Text to tokenize; falsy input yields an empty list.
 * @returns {string[]} Tokens in order of appearance.
 */
function tokenize(s) {
  if (!s) {
    return []
  }
  const lowered = s.toLowerCase()
  // Separate Latin/digit runs from adjacent CJK runs (both directions).
  const spaced = lowered
    .replace(/([a-z0-9])([一-鿿])/g, '$1 $2')
    .replace(/([一-鿿])([a-z0-9])/g, '$1 $2')
  const pieces = spaced.split(/[\s\-_/.,!?,。!?]+/u)
  return pieces.filter(piece => piece.length >= 2)
}

/**
 * Scores how alike two strings are, from 0 to 1.
 *
 * Strings that are equal ignoring case and surrounding whitespace score 1;
 * otherwise the score is the Jaccard index of their token sets.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} 0 when either side is empty or has no usable tokens.
 */
function similarity(a, b) {
  if (!a || !b) return 0
  const sameText = a.toLowerCase().trim() === b.toLowerCase().trim()
  if (sameText) return 1

  const left = new Set(tokenize(a))
  const right = new Set(tokenize(b))
  if (left.size === 0 || right.size === 0) return 0

  const shared = [...left].filter(token => right.has(token)).length
  return shared / (left.size + right.size - shared)
}
// Notion rejects a rich-text object whose text.content is longer than this.
const NOTION_TEXT_CONTENT_LIMIT = 2000

/**
 * Splits text into consecutive chunks of at most `limit` UTF-16 code units,
 * never cutting between the two halves of a surrogate pair.
 *
 * @param {string} text - Text to split.
 * @param {number} [limit] - Maximum chunk length in UTF-16 code units.
 * @returns {string[]} Chunks that concatenate back to `text`.
 */
function splitTextForNotion(text, limit = NOTION_TEXT_CONTENT_LIMIT) {
  const chunks = []
  let start = 0
  while (start < text.length) {
    let end = Math.min(start + limit, text.length)
    if (end < text.length) {
      const lastUnit = text.charCodeAt(end - 1)
      const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff
      // Back off by one unit so the pair stays together in the next chunk.
      if (endsOnHighSurrogate && end - 1 > start) end -= 1
    }
    chunks.push(text.slice(start, end))
    start = end
  }
  return chunks
}

/**
 * Builds the rich_text array for a translated block.
 *
 * The translation is emitted as plain `text` segments that inherit the
 * annotations of the first original segment. When the original had exactly one
 * segment its link/href are kept as well; with several segments the per-segment
 * links cannot be re-aligned with the translated text and are dropped.
 *
 * Translations longer than Notion's per-object content limit are split into
 * several segments so the create/append request is not rejected.
 *
 * @param {string} translated - Translated text; falsy yields an empty array.
 * @param {Array<object>} originalRich - rich_text array of the source block.
 * @returns {Array<object>} rich_text array for the target block.
 */
function rebuildRichText(translated, originalRich) {
  if (!translated) return []
  const source = Array.isArray(originalRich) ? originalRich : []
  const first = source[0]
  const single = source.length === 1
  const link = single ? first.text?.link || null : null
  const href = single ? first.href || null : null
  const annotations = first?.annotations || {}

  return splitTextForNotion(String(translated)).map(chunk => ({
    type: 'text',
    text: { content: chunk, link },
    annotations,
    plain_text: chunk,
    href
  }))
}
/**
 * Converts one source block into a block payload for the target page,
 * translating its text where applicable.
 *
 * The checks run in a fixed order: skip set, diagram code blocks, copy-as-is
 * set, translatable set, and finally a pass-through fallback for any type that
 * is in none of the sets.
 *
 * @param {object} block - Source block; `block[block.type]` holds its payload.
 * @param {{ translateText: Function }} ctx - Translation context. `translateText(text, opts?)`
 *   must resolve to an object with a `text` field.
 * @returns {Promise<object|null>} Cleaned block payload, or null when the block type is skipped.
 */
async function translateBlock(block, ctx) {
  const type = block.type
  const data = block[type]

  if (SKIP_BLOCK_TYPES.has(type)) return null

  // Special case: mermaid / plantuml code blocks mix translatable labels with strict syntax.
  // The whole source is sent to the model with the language as a hint, so the model is
  // expected to replace only the text and keep the syntax intact.
  // NOTE(review): this must stay ahead of the copy-as-is check, presumably because
  // `code` is also listed in COPY_AS_IS_BLOCK_TYPES — confirm against ./state.
  if (type === 'code' && data?.language && TRANSLATABLE_CODE_LANGUAGES.has(data.language)) {
    const newData = { ...data }
    if (data.rich_text && shouldTranslateRichText(data.rich_text)) {
      const sourceText = richTextToString(data.rich_text)
      const result = await ctx.translateText(sourceText, { hint: data.language })
      newData.rich_text = rebuildRichText(result.text, data.rich_text)
    }
    return cloneBlockForCreate({ ...block, code: newData })
  }

  // Blocks that are carried over unchanged (only cleaned for creation).
  if (COPY_AS_IS_BLOCK_TYPES.has(type)) {
    return cloneBlockForCreate(block)
  }

  if (TRANSLATABLE_BLOCK_TYPES.has(type)) {
    const newData = { ...data }
    if (data.rich_text && shouldTranslateRichText(data.rich_text)) {
      const sourceText = richTextToString(data.rich_text)
      const result = await ctx.translateText(sourceText)
      newData.rich_text = rebuildRichText(result.text, data.rich_text)
    }
    // Inline children (when present on the payload) are translated recursively,
    // one at a time; children that resolve to null are left out.
    if (Array.isArray(data.children) && data.children.length) {
      newData.children = []
      for (const child of data.children) {
        const t = await translateBlock(child, ctx)
        if (t) newData.children.push(t)
      }
    }
    return cloneBlockForCreate({ ...block, [type]: newData })
  }

  // Unclassified types fall through and are copied without translation.
  return cloneBlockForCreate(block)
}
/**
 * Reads the two language-database ids from the environment.
 *
 * @returns {{ en: string, zh: string }} Ids of the English and Chinese databases.
 * @throws {Error} When either NOTION_DB_EN_ID or NOTION_DB_ZH_ID is missing.
 */
function envIds() {
  const { NOTION_DB_EN_ID: en, NOTION_DB_ZH_ID: zh } = process.env
  if (en && zh) return { en, zh }
  throw new Error('NOTION_DB_EN_ID 与 NOTION_DB_ZH_ID 均需在项目根目录 .env.local 中配置')
}

/**
 * Canonical form of an id for comparison: dashes removed, lowercased.
 *
 * @param {*} id - Id in dashed or compact form.
 * @returns {string} Comparable id string.
 */
function normalize(id) {
  return String(id).split('-').join('').toLowerCase()
}

/**
 * Determines which language a database id stands for.
 *
 * @param {string} dbId - Database id in dashed or compact form.
 * @returns {'en-US'|'zh-CN'|null} The language, or null for an unknown database.
 * @throws {Error} When the database ids are not configured.
 */
function langFromDb(dbId) {
  const ids = envIds()
  const wanted = normalize(dbId)
  // The English database is checked first, so it wins if both ids are equal.
  const candidates = [
    ['en-US', ids.en],
    ['zh-CN', ids.zh]
  ]
  const hit = candidates.find(([, id]) => normalize(id) === wanted)
  return hit ? hit[0] : null
}

/**
 * Returns the configured database id for a language.
 *
 * @param {string} lang - 'en-US' or 'zh-CN'.
 * @returns {string} The database id exactly as configured.
 * @throws {Error} When the ids are not configured or the language is unsupported.
 */
function dbForLang(lang) {
  const ids = envIds()
  switch (lang) {
    case 'en-US':
      return ids.en
    case 'zh-CN':
      return ids.zh
    default:
      throw new Error(`不支持的语言: ${lang}`)
  }
}
// Lazily loaded contents of category-map.json.
let _categoryMap = null

/**
 * Loads category-map.json from this script's directory on first use and
 * caches it for the rest of the process.
 *
 * @returns {object} Parsed mapping file.
 */
function categoryMap() {
  if (!_categoryMap) {
    _categoryMap = JSON.parse(
      fs.readFileSync(path.join(__dirname, 'category-map.json'), 'utf8')
    )
  }
  return _categoryMap
}

/**
 * Picks the mapping table for one property kind and translation direction.
 *
 * @param {string} kind - Top-level key in category-map.json (e.g. 'category', 'tags').
 * @param {string} sourceLang - 'en-US' selects en_to_zh; anything else selects zh_to_en.
 * @returns {object} The table, or an empty object when none is registered.
 */
function directionTable(kind, sourceLang) {
  const direction = sourceLang === 'en-US' ? 'en_to_zh' : 'zh_to_en'
  const table = categoryMap()[kind]?.[direction]
  return table && typeof table === 'object' ? table : {}
}

/**
 * Reads an entry from a mapping table, ignoring inherited properties so that
 * option names such as "constructor" or "toString" are not treated as mapped.
 *
 * @param {object} table - Mapping table.
 * @param {string} key - Source option name.
 * @returns {*} The registered target value, or undefined.
 */
function lookupOwn(table, key) {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined
}

/**
 * Maps a single select value to its counterpart in the other language.
 *
 * @param {string} kind - Property kind, e.g. 'category'.
 * @param {string} value - Option name in the source database.
 * @param {string} sourceLang - Language of the source database.
 * @returns {string|null} The mapped option name, or null when unregistered or empty.
 */
function mapSelect(kind, value, sourceLang) {
  if (!value) return null
  return lookupOwn(directionTable(kind, sourceLang), value) || null
}

/**
 * Maps a list of multi-select values to the other language.
 *
 * Always returns the same shape; non-array input yields two empty lists
 * (previously a bare empty array, which broke destructuring by callers).
 *
 * @param {string} kind - Property kind, e.g. 'tags'.
 * @param {string[]} values - Option names in the source database.
 * @param {string} sourceLang - Language of the source database.
 * @returns {{ mapped: string[], dropped: string[] }} Mapped names and the
 *   source names that had no registered mapping.
 */
function mapMultiSelect(kind, values, sourceLang) {
  const mapped = []
  const dropped = []
  if (!Array.isArray(values)) return { mapped, dropped }
  const table = directionTable(kind, sourceLang)
  for (const v of values) {
    const target = lookupOwn(table, v)
    if (target) mapped.push(target)
    else dropped.push(v)
  }
  return { mapped, dropped }
}
cursor + do { + const res = await client.search({ + start_cursor: cursor, + page_size: 100, + filter: { property: 'object', value: 'database' } + }) + dbs.push(...res.results) + cursor = res.has_more ? res.next_cursor : null + } while (cursor) + + if (!dbs.length) { + console.log(' → 该集成尚未获得任何数据库的访问权限。请先在 Notion 中将两个数据库与该集成连接。') + return + } + + const norm = id => String(id || '').replace(/-/g, '').toLowerCase() + console.log(`可访问数据库 (${dbs.length}):`) + for (const db of dbs) { + const title = (db.title || []).map(t => t.plain_text).join('') || '(无标题)' + const isEn = norm(db.id) === norm(enId) + const isZh = norm(db.id) === norm(zhId) + const tag = isEn ? '[EN]' : isZh ? '[ZH]' : ' ' + console.log(` ${tag} ${db.id} ${title}`) + } + console.log('') + + const enOk = dbs.some(db => norm(db.id) === norm(enId)) + const zhOk = dbs.some(db => norm(db.id) === norm(zhId)) + + if (enOk && zhOk) { + console.log('✓ 两个数据库均可访问,翻译脚本应能正常运行。') + } else { + if (!enOk) console.log(`✗ 英文库 ${enId} 无法访问 — 请在 Notion 中将其与集成连接。`) + if (!zhOk) console.log(`✗ 中文库 ${zhId} 无法访问 — 请在 Notion 中将其与集成连接。`) + } +} + +module.exports = { runDiagnose } + +if (require.main === module) { + require('./load-env').loadEnv() + runDiagnose().catch(err => { + console.error('FATAL:', err.message) + process.exit(1) + }) +} diff --git a/scripts/translate/glossary.json b/scripts/translate/glossary.json new file mode 100644 index 00000000000..03c2dbfc3df --- /dev/null +++ b/scripts/translate/glossary.json @@ -0,0 +1,32 @@ +{ + "comment": "此列表中的术语在翻译时原样保留。如遇到误译,请将相关术语补充到 preserve 数组中。", + "preserve": [ + "Notion", + "NotionNext", + "LLM", + "API", + "SDK", + "CLI", + "React", + "Next.js", + "Remix", + "Hooks", + "TypeScript", + "JavaScript", + "Python", + "GitHub", + "Vercel", + "Cloudflare", + "Algolia", + "Giscus", + "Tailwind", + "DeepSeek", + "GLM", + "Claude", + "GPT", + "Anthropic", + "OpenAI" + ], + "zh_to_en": {}, + "en_to_zh": {} +} diff --git a/scripts/translate/index.js b/scripts/translate/index.js new 
// Switches that never take a value. Listing them keeps a following positional
// argument (e.g. the page id in `--force <page id>`) from being swallowed.
const BOOLEAN_FLAGS = new Set([
  'dry-run',
  'force',
  'include-drafts',
  'include-paired',
  'verbose',
  'yes',
  'help',
  'batch',
  'check-drift',
  'backfill',
  'diagnose'
])

/**
 * Parses command-line arguments into positionals and `--flags`.
 *
 * Supported forms:
 *   --switch            known boolean switch, stored as true
 *   --switch=false|0    known boolean switch, explicitly turned off
 *   --key value         value flag; stored as true when no value follows
 *   --key=value         value flag written with an equals sign
 *   anything else       positional, collected in `_`
 *
 * Flags not listed in BOOLEAN_FLAGS keep the previous behaviour: they take the
 * next argument as their value unless it is missing or starts with `--`.
 *
 * @param {string[]} argv - Arguments without the node binary and script path.
 * @returns {{ _: string[], flags: Object<string, string|boolean> }} Parsed arguments.
 */
function parseArgs(argv) {
  const args = { _: [], flags: {} }
  for (let i = 0; i < argv.length; i++) {
    const a = argv[i]
    if (!a.startsWith('--')) {
      args._.push(a)
      continue
    }

    const body = a.slice(2)
    const eq = body.indexOf('=')
    if (eq > 0) {
      const key = body.slice(0, eq)
      const value = body.slice(eq + 1)
      args.flags[key] = BOOLEAN_FLAGS.has(key)
        ? value !== 'false' && value !== '0'
        : value
      continue
    }

    if (BOOLEAN_FLAGS.has(body)) {
      args.flags[body] = true
      continue
    }

    const next = argv[i + 1]
    if (!next || next.startsWith('--')) {
      args.flags[body] = true
    } else {
      args.flags[body] = next
      i++
    }
  }
  return args
}
flags.from : null + const includePaired = Boolean(flags['include-paired']) + const { eligible, skipped } = await findUntranslated({ includeDrafts, fromLang, includePaired }) + console.log(`[batch] 共扫描两个数据库下 ${skipped.total} 篇 Post`) + console.log(` ${eligible.length} 篇待翻译`) + console.log(` ${skipped.alreadyPaired} 篇已配对(如需重译,运行 yarn translate:check 检查漂移)`) + if (!includeDrafts) { + console.log(` ${skipped.notPublished} 篇非 Published(如需包含草稿,加上 --include-drafts)`) + } + if (!eligible.length) { + console.log('\n无需处理。') + return + } + for (const item of eligible) { + try { + await translateOnePage(item.page.id, opts) + } catch (err) { + console.error(`[err] ${item.page.id}: ${err.message}`) + } + } + return + } + + if (flags['check-drift']) { + const drifted = await checkDrift() + if (!drifted.length) { + console.log('[check] 未检测到内容漂移') + } else { + console.log(`[check] 共 ${drifted.length} 篇页面的源内容已发生变化:`) + for (const d of drifted) { + console.log(` ${d.pageId} [${d.lang}] ${d.title}`) + } + } + return + } + + if (flags.backfill) { + return runBackfill({ autoYes: Boolean(flags.yes) }) + } + + if (flags.diagnose) { + return runDiagnose() + } + + if (!positional.length) { + help() + process.exit(1) + } + + const pageId = notion.extractIdFromInput(positional[0]) + await translateOnePage(pageId, opts) +} + +main().catch(err => { + console.error('FATAL:', err.message || err) + if (process.env.DEBUG) console.error(err.stack) + process.exit(1) +}) diff --git a/scripts/translate/load-env.js b/scripts/translate/load-env.js new file mode 100644 index 00000000000..f8bbf404090 --- /dev/null +++ b/scripts/translate/load-env.js @@ -0,0 +1,32 @@ +const fs = require('fs') +const path = require('path') + +// 极简零依赖 .env 加载器,与 Next.js 应用运行时的行为一致: +// 从项目根目录读取 .env.local(优先)与 .env,写入 process.env, +// 但不会覆盖已有的环境变量(例如 Vercel 注入或 shell 中显式 export 的值)。 +// 两个文件都不存在时静默退出,不报错。 +function loadEnv() { + const projectRoot = path.resolve(__dirname, '..', '..') + for (const name of ['.env.local', 
// 32 contiguous hex digits: the compact id form used in Notion URLs.
const COMPACT_ID_RE = /[0-9a-fA-F]{32}/
// Canonical dashed UUID (8-4-4-4-12).
const DASHED_UUID_RE = /[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}/
// Legacy fallback: any 36-character run of hex digits and dashes.
const LOOSE_DASHED_RE = /[0-9a-fA-F-]{36}/

/**
 * Converts a Notion id to the dashed 8-4-4-4-12 form.
 *
 * @param {string} id - Id with or without dashes.
 * @returns {string} Dashed id; character case is left as given.
 * @throws {Error} When the id does not contain exactly 32 characters after
 *   removing dashes.
 */
function normalizeId(id) {
  const stripped = String(id).replace(/-/g, '')
  if (stripped.length !== 32) throw new Error(`无效的 Notion id: ${id}`)
  return `${stripped.slice(0, 8)}-${stripped.slice(8, 12)}-${stripped.slice(12, 16)}-${stripped.slice(16, 20)}-${stripped.slice(20)}`
}

/**
 * Extracts a Notion id from a raw id or a URL.
 *
 * Tries, in order: a compact 32-hex id, a canonical dashed UUID, and finally
 * the legacy loose pattern. The canonical pattern is tried before the loose
 * one because the loose pattern takes the first 36 hex/dash characters of a
 * run, which mis-cuts a UUID that follows other hex or dash characters
 * (for example a slug prefix such as "cafe-").
 *
 * @param {string} input - Page id or URL supplied by the user.
 * @returns {string} Dashed id.
 * @throws {Error} When the input is empty or contains no recognisable id.
 */
function extractIdFromInput(input) {
  if (!input) throw new Error('未提供页面 id')
  const text = String(input)
  const match =
    text.match(COMPACT_ID_RE) ||
    text.match(DASHED_UUID_RE) ||
    text.match(LOOSE_DASHED_RE)
  if (!match) throw new Error(`无法从输入中解析 Notion id: ${input}`)
  return normalizeId(match[0])
}
/**
 * Reads a page property and converts it to a plain JavaScript value.
 *
 * @param {object} page - Page object carrying a `properties` map.
 * @param {string} name - Property name.
 * @param {string} type - How to read the property: 'title', 'rich_text',
 *   'select', 'multi_select', 'checkbox' or 'date'. Any other value returns the
 *   raw property object.
 * @returns {*} Joined plain text for title/rich_text, option name or null for
 *   select, array of option names for multi_select, boolean for checkbox, the
 *   date object or null for date; null when the property does not exist.
 */
function getProp(page, name, type) {
  const prop = page.properties?.[name]
  if (!prop) return null

  // Concatenates the plain_text of every segment; a missing list reads as ''.
  const joinPlain = segments => (segments || []).map(seg => seg.plain_text).join('')

  if (type === 'title') return joinPlain(prop.title)
  if (type === 'rich_text') return joinPlain(prop.rich_text)
  if (type === 'select') return prop.select?.name || null
  if (type === 'multi_select') return (prop.multi_select || []).map(option => option.name)
  if (type === 'checkbox') return Boolean(prop.checkbox)
  if (type === 'date') return prop.date || null
  return prop
}
require('fs') +const pLimit = require('p-limit').default || require('p-limit') + +const notion = require('./notion-client') +const { sha256OfBlocks } = require('./state') +const { translateBlock } = require('./block-mapper') +const { getProvider } = require('./providers') +const { envIds, langFromDb, dbForLang, flipLang, mapSelect, mapMultiSelect } = require('./config') + +const GLOSSARY = JSON.parse( + fs.readFileSync(path.join(__dirname, 'glossary.json'), 'utf8') +) + +function buildBudget() { + const cap = parseInt(process.env.TRANSLATOR_BUDGET_TOKENS_PER_RUN || '500000', 10) + let usedIn = 0 + let usedOut = 0 + return { + add(inT, outT) { + usedIn += inT + usedOut += outT + if (usedIn + usedOut > cap) { + throw new Error(`Token 预算超限: ${usedIn}+${usedOut}=${usedIn + usedOut} > ${cap}`) + } + }, + summary() { + return { input: usedIn, output: usedOut } + } + } +} + +async function translateOnePage(pageId, opts = {}) { + const dryRun = Boolean(opts.dryRun) + const force = Boolean(opts.force) + const verbose = Boolean(opts.verbose) + const log = verbose ? 
console.log : () => {} + + const provider = getProvider(opts.provider) + const budget = buildBudget() + + log(`[fetch] page ${pageId}`) + const page = await notion.fetchPage(pageId) + const sourceDbId = page.parent?.database_id + if (!sourceDbId) throw new Error(`页面 ${pageId} 不属于任何数据库`) + const sourceLang = langFromDb(sourceDbId) + if (!sourceLang) { + throw new Error(`页面 ${pageId} 所在数据库 ${sourceDbId} 既不是 NOTION_DB_EN_ID 也不是 NOTION_DB_ZH_ID`) + } + const targetLang = flipLang(sourceLang) + const targetDbId = dbForLang(targetLang) + + const blocks = await notion.fetchAllBlocks(pageId) + const sourceTitle = notion.getProp(page, 'title', 'title') || '' + const sourceSummary = notion.getProp(page, 'summary', 'rich_text') || '' + const sourceSlug = notion.getProp(page, 'slug', 'rich_text') || '' + const sourceCategory = notion.getProp(page, 'category', 'select') + const sourceTags = notion.getProp(page, 'tags', 'multi_select') || [] + const pairedWith = notion.getProp(page, 'paired_with', 'rich_text') || '' + const status = notion.getProp(page, 'status', 'select') || 'Draft' + const type = notion.getProp(page, 'type', 'select') || 'Post' + const date = notion.getProp(page, 'date', 'date') + + const currentHash = sha256OfBlocks(blocks) + log(`[hash] ${currentHash.slice(0, 12)}… (${sourceLang} → ${targetLang})`) + + let targetPageId = pairedWith || null + let targetExisting = null + if (targetPageId) { + try { + targetExisting = await notion.fetchPage(targetPageId) + } catch { + log(`[warn] 配对页面 ${targetPageId} 无法读取,将创建新页面`) + targetPageId = null + } + } + + if (targetExisting && !force) { + const targetSourceHash = notion.getProp(targetExisting, 'source_hash', 'rich_text') || '' + if (targetSourceHash === currentHash) { + console.log(`[skip] ${pageId} — 目标 ${targetPageId} 已是最新`) + return { skipped: true } + } + if (notion.getProp(targetExisting, 'translation_locked', 'checkbox')) { + console.log(`[skip] ${pageId} — 目标 ${targetPageId} 已锁定(translation_locked)`) + return { 
skipped: true, locked: true } + } + } + + async function translateText(text, opts = {}) { + if (!text || !text.trim()) return { text } + if (dryRun) return { text } // dry-run 模式下不调用翻译 API + const result = await provider.translate({ + text, + sourceLang, + targetLang, + glossary: GLOSSARY, + hint: opts.hint + }) + budget.add(result.inputTokens || 0, result.outputTokens || 0) + return result + } + + if (dryRun) { + const blockCount = blocks.length + console.log(`[dry-run] ${pageId} (${sourceLang} → ${targetLang})`) + console.log(` title: ${sourceTitle}`) + console.log(` blocks: ${blockCount}`) + console.log(` target: ${targetPageId ? `update ${targetPageId}` : `create in DB ${targetDbId}`}`) + return { dryRun: true, blocks: blockCount } + } + + const translatedTitle = sourceTitle.trim() + ? (await translateText(sourceTitle)).text + : sourceTitle + const translatedSummary = sourceSummary.trim() + ? (await translateText(sourceSummary)).text + : sourceSummary + + log(`[translate] title="${translatedTitle.slice(0, 60)}"`) + + const concurrency = parseInt(process.env.TRANSLATOR_CONCURRENCY || '5', 10) + const limit = pLimit(concurrency) + const t0 = Date.now() + const translated = await Promise.all( + blocks.map(b => limit(() => translateBlock(b, { translateText }))) + ) + const newBlocks = translated.filter(Boolean) + log(`[translate] ${newBlocks.length} 个块, 用时 ${((Date.now() - t0) / 1000).toFixed(1)}s (并发=${concurrency})`) + + const mappedCategory = mapSelect('category', sourceCategory, sourceLang) + const mappedTagsResult = mapMultiSelect('tags', sourceTags, sourceLang) + const mappedTags = mappedTagsResult.mapped || [] + if (mappedTagsResult.dropped?.length) { + log(`[warn] 以下标签缺少映射,已跳过: ${mappedTagsResult.dropped.join(', ')}`) + } + if (sourceCategory && !mappedCategory) { + log(`[warn] 分类缺少映射,已跳过: ${sourceCategory}`) + } + + const props = { + title: { title: [{ type: 'text', text: { content: translatedTitle } }] }, + summary: { rich_text: [{ type: 'text', text: { 
content: translatedSummary } }] }, + paired_with: { rich_text: [{ type: 'text', text: { content: pageId } }] }, + source_hash: { rich_text: [{ type: 'text', text: { content: currentHash } }] }, + type: { select: { name: type } }, + status: { select: { name: status } } + } + if (mappedCategory) props.category = { select: { name: mappedCategory } } + if (mappedTags.length) props.tags = { multi_select: mappedTags.map(name => ({ name })) } + if (sourceSlug) props.slug = { rich_text: [{ type: 'text', text: { content: sourceSlug } }] } + if (date && date.start) { + props.date = { date: { start: date.start, end: date.end || null } } + } + + // Notion 创建页面时不接受内部托管的文件作为封面或图标,只允许 external + // URL 或 emoji;file 与 file_upload 类型直接忽略。 + const safeCover = page.cover?.type === 'external' ? page.cover : undefined + const safeIcon = + page.icon?.type === 'emoji' || page.icon?.type === 'external' + ? page.icon + : undefined + + if (targetPageId) { + log(`[update] 更新目标页面 ${targetPageId}`) + await notion.deleteAllChildBlocks(targetPageId) + await notion.appendBlocks(targetPageId, newBlocks) + await notion.updatePageProperties(targetPageId, props) + } else { + log(`[create] 在数据库 ${targetDbId} 中新建目标页面`) + const created = await notion.createPage({ + parent: { database_id: targetDbId }, + properties: props, + children: newBlocks.slice(0, 100), + cover: safeCover, + icon: safeIcon + }) + targetPageId = created.id + if (newBlocks.length > 100) { + await notion.appendBlocks(targetPageId, newBlocks.slice(100)) + } + } + + await notion.updatePageProperties(pageId, { + paired_with: { rich_text: [{ type: 'text', text: { content: targetPageId } }] }, + source_hash: { rich_text: [{ type: 'text', text: { content: currentHash } }] } + }) + + console.log(`[ok] ${pageId} → ${targetPageId} 耗用=${JSON.stringify(budget.summary())}`) + return { ok: true, targetPageId, budget: budget.summary() } +} + +async function checkDrift() { + const { en, zh } = envIds() + const drifted = [] + for (const dbId of [en, 
zh]) { + const pages = await notion.queryDatabase(dbId, { + and: [ + { property: 'type', select: { equals: 'Post' } }, + { property: 'paired_with', rich_text: { is_not_empty: true } } + ] + }) + for (const page of pages) { + const storedHash = notion.getProp(page, 'source_hash', 'rich_text') + const blocks = await notion.fetchAllBlocks(page.id) + const currentHash = sha256OfBlocks(blocks) + if (currentHash !== storedHash) { + drifted.push({ + pageId: page.id, + title: notion.getProp(page, 'title', 'title'), + lang: langFromDb(dbId) + }) + } + } + } + return drifted +} + +async function findUntranslated({ includeDrafts = false, fromLang = null, includePaired = false } = {}) { + const { en, zh } = envIds() + const skipped = { alreadyPaired: 0, notPublished: 0, wrongLang: 0, total: 0 } + const eligible = [] + + const dbsToScan = [] + if (!fromLang || fromLang === 'en-US') dbsToScan.push(en) + if (!fromLang || fromLang === 'zh-CN') dbsToScan.push(zh) + + for (const dbId of dbsToScan) { + const pages = await notion.queryDatabase(dbId, { + property: 'type', + select: { equals: 'Post' } + }) + for (const p of pages) { + skipped.total++ + const paired = notion.getProp(p, 'paired_with', 'rich_text') + const status = notion.getProp(p, 'status', 'select') + if (paired && !includePaired) { skipped.alreadyPaired++; continue } + if (!includeDrafts && status !== 'Published') { skipped.notPublished++; continue } + eligible.push({ page: p, sourceLang: langFromDb(dbId) }) + } + } + return { eligible, skipped } +} + +module.exports = { translateOnePage, checkDrift, findUntranslated } diff --git a/scripts/translate/providers/_http.js b/scripts/translate/providers/_http.js new file mode 100644 index 00000000000..a4f4219ebe1 --- /dev/null +++ b/scripts/translate/providers/_http.js @@ -0,0 +1,19 @@ +// 简易 HTTP 客户端:在原生 fetch 之上增加超时与外部 signal 串联。 +async function request({ url, method = 'GET', headers = {}, body, signal, timeoutMs = 60000 }) { + const controller = new AbortController() + 
const t = setTimeout(() => controller.abort(new Error('请求超时')), timeoutMs) + if (signal) signal.addEventListener('abort', () => controller.abort(signal.reason)) + + try { + const res = await fetch(url, { method, headers, body, signal: controller.signal }) + const text = await res.text() + if (!res.ok) { + throw new Error(`${method} ${url} → ${res.status}: ${text.slice(0, 300)}`) + } + return { status: res.status, body: text } + } finally { + clearTimeout(t) + } +} + +module.exports = { request } diff --git a/scripts/translate/providers/deepseek.js b/scripts/translate/providers/deepseek.js new file mode 100644 index 00000000000..35acfaf783b --- /dev/null +++ b/scripts/translate/providers/deepseek.js @@ -0,0 +1,71 @@ +const { request } = require('./_http') + +// DeepSeek 翻译适配器;OpenAI 兼容接口,默认模型 deepseek-chat。 +async function translate({ text, sourceLang, targetLang, glossary, hint, signal }) { + const apiKey = process.env.DEEPSEEK_API_KEY + if (!apiKey) throw new Error('DEEPSEEK_API_KEY 未设置') + + const baseUrl = process.env.DEEPSEEK_BASE_URL || 'https://api.deepseek.com/v1' + const model = process.env.DEEPSEEK_MODEL || 'deepseek-chat' + + const preserveList = (glossary?.preserve || []).join(', ') + const langName = { 'zh-CN': 'Simplified Chinese', 'en-US': 'English' } + const sourceName = langName[sourceLang] || sourceLang + const targetName = langName[targetLang] || targetLang + + const hintLine = + hint === 'mermaid' + ? 'This input is Mermaid diagram code. Translate ONLY the human-readable labels, titles, and node text inside quotes or after `title`. Preserve every keyword, arrow, bracket, color directive, and identifier exactly.' + : hint === 'plantuml' + ? 'This input is PlantUML code. Translate ONLY the human-readable labels and titles. Preserve every keyword, arrow, and identifier exactly.' 
+ : '' + + const system = [ + `You are a professional bilingual translator for technical blog posts.`, + `Translate from ${sourceName} to ${targetName}.`, + `Preserve markdown, code, URLs, technical terms exactly.`, + `Match the original tone (formal/casual).`, + preserveList + ? `Always keep these terms verbatim: ${preserveList}.` + : '', + hintLine, + `Output ONLY the translation. No commentary, no quotes, no explanations.`, + `If the input is empty or only whitespace, output the input unchanged.` + ] + .filter(Boolean) + .join(' ') + + const body = { + model, + messages: [ + { role: 'system', content: system }, + { role: 'user', content: text } + ], + temperature: 0.3, + stream: false + } + + const response = await request({ + url: `${baseUrl}/chat/completions`, + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json' + }, + body: JSON.stringify(body), + signal + }) + + const data = JSON.parse(response.body) + if (!data.choices?.[0]?.message?.content) { + throw new Error(`DeepSeek 返回异常: ${response.body.slice(0, 200)}`) + } + + return { + text: data.choices[0].message.content.trim(), + inputTokens: data.usage?.prompt_tokens || 0, + outputTokens: data.usage?.completion_tokens || 0 + } +} + +module.exports = { translate } diff --git a/scripts/translate/providers/glm.js b/scripts/translate/providers/glm.js new file mode 100644 index 00000000000..00737010af0 --- /dev/null +++ b/scripts/translate/providers/glm.js @@ -0,0 +1,68 @@ +const { request } = require('./_http') + +// 智谱 GLM 翻译适配器;与 DeepSeek 对应的备选实现。 +async function translate({ text, sourceLang, targetLang, glossary, hint, signal }) { + const apiKey = process.env.GLM_API_KEY + if (!apiKey) throw new Error('GLM_API_KEY 未设置') + + const baseUrl = process.env.GLM_BASE_URL || 'https://open.bigmodel.cn/api/paas/v4' + const model = process.env.GLM_MODEL || 'glm-4-plus' + + const preserveList = (glossary?.preserve || []).join(', ') + const langName = { 'zh-CN': 
'Simplified Chinese', 'en-US': 'English' } + const sourceName = langName[sourceLang] || sourceLang + const targetName = langName[targetLang] || targetLang + + const hintLine = + hint === 'mermaid' + ? 'This input is Mermaid diagram code. Translate ONLY human-readable labels/titles/node text. Preserve every keyword, arrow, bracket, color, and identifier exactly.' + : hint === 'plantuml' + ? 'This input is PlantUML code. Translate ONLY labels/titles. Preserve all keywords/arrows/identifiers exactly.' + : '' + + const system = [ + `You are a professional bilingual translator for technical blog posts.`, + `Translate from ${sourceName} to ${targetName}.`, + `Preserve markdown, code, URLs, technical terms exactly.`, + `Match the original tone.`, + preserveList ? `Always keep these terms verbatim: ${preserveList}.` : '', + hintLine, + `Output ONLY the translation. No commentary.` + ] + .filter(Boolean) + .join(' ') + + const body = { + model, + messages: [ + { role: 'system', content: system }, + { role: 'user', content: text } + ], + temperature: 0.3, + stream: false + } + + const response = await request({ + url: `${baseUrl}/chat/completions`, + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json' + }, + body: JSON.stringify(body), + signal + }) + + const data = JSON.parse(response.body) + if (!data.choices?.[0]?.message?.content) { + throw new Error(`GLM 返回异常: ${response.body.slice(0, 200)}`) + } + + return { + text: data.choices[0].message.content.trim(), + inputTokens: data.usage?.prompt_tokens || 0, + outputTokens: data.usage?.completion_tokens || 0 + } +} + +module.exports = { translate } diff --git a/scripts/translate/providers/index.js b/scripts/translate/providers/index.js new file mode 100644 index 00000000000..7c645625b37 --- /dev/null +++ b/scripts/translate/providers/index.js @@ -0,0 +1,12 @@ +// 翻译提供方选择器:通过 TRANSLATOR_PROVIDER 环境变量或参数指定使用哪个适配器。 +const deepseek = require('./deepseek') +const glm = 
require('./glm') + +function getProvider(name) { + const which = (name || process.env.TRANSLATOR_PROVIDER || 'deepseek').toLowerCase() + if (which === 'deepseek') return deepseek + if (which === 'glm') return glm + throw new Error(`未知的 TRANSLATOR_PROVIDER: ${which}`) +} + +module.exports = { getProvider } diff --git a/scripts/translate/state.js b/scripts/translate/state.js new file mode 100644 index 00000000000..8d9c81d90aa --- /dev/null +++ b/scripts/translate/state.js @@ -0,0 +1,84 @@ +const crypto = require('crypto') + +function sha256OfBlocks(blocks) { + const normalized = blocks.map(normalizeForHash).filter(Boolean) + const hash = crypto.createHash('sha256') + hash.update(JSON.stringify(normalized)) + return hash.digest('hex') +} + +function normalizeForHash(block) { + const type = block.type + const data = block[type] + if (!data) return { type } + if (data.rich_text) { + return { + type, + text: data.rich_text.map(rt => rt.plain_text || '').join('') + } + } + if (type === 'code') { + return { type, text: (data.rich_text || []).map(rt => rt.plain_text).join(''), language: data.language } + } + if (type === 'image' || type === 'video' || type === 'file') { + const file = data.file || data.external + return { type, url: file?.url || '' } + } + return { type } +} + +const TRANSLATABLE_BLOCK_TYPES = new Set([ + 'paragraph', + 'heading_1', + 'heading_2', + 'heading_3', + 'bulleted_list_item', + 'numbered_list_item', + 'quote', + 'callout', + 'toggle', + 'to_do' +]) + +// 此白名单内的代码块(按 language 字段判断)虽然属于代码,但其内容中 +// 含有需要翻译的可读文本(如图表标签、节点说明),翻译时仅替换文字, +// 严格保留语法结构。不在此列表中的代码块统一原样保留。 +const TRANSLATABLE_CODE_LANGUAGES = new Set(['mermaid', 'plantuml']) + +const COPY_AS_IS_BLOCK_TYPES = new Set([ + 'code', + 'equation', + 'image', + 'video', + 'file', + 'pdf', + 'embed', + 'divider', + 'table_of_contents', + 'bookmark', + 'breadcrumb', + 'link_preview', + 'link_to_page', + 'child_page', + 'child_database' +]) + +// 这些块类型在创建时必须携带 children 字段,但我们采用扁平方式获取 +// 
块列表,不包含嵌套子节点。直接丢弃以避免提交无效请求;页面其余内容 +// 不受影响。(多列布局、表格、同步块在博客文章中较少使用。) +const SKIP_BLOCK_TYPES = new Set([ + 'synced_block', + 'column_list', + 'column', + 'table', + 'table_row', + 'unsupported' +]) + +module.exports = { + sha256OfBlocks, + TRANSLATABLE_BLOCK_TYPES, + COPY_AS_IS_BLOCK_TYPES, + SKIP_BLOCK_TYPES, + TRANSLATABLE_CODE_LANGUAGES +} diff --git a/yarn.lock b/yarn.lock index 955f8b1905e..54e530952dd 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1151,6 +1151,14 @@ resolved "https://registry.npmmirror.com/@nolyfill/is-core-module/-/is-core-module-1.0.39.tgz" integrity sha512-nn5ozdjYQpUCZlWGuxcJY/KpxkWQs4DcbMCmKojjyrYDEAGy4Ce19NN4v5MduafTwJlbKc99UA8YhSVqq9yPZA== +"@notionhq/client@^2.2.15": + version "2.3.0" + resolved "https://registry.npmmirror.com/@notionhq/client/-/client-2.3.0.tgz#4feecb012ebcd4116df14a9e2c77afed7eeb73cd" + integrity sha512-l7WqTCpQqC+HibkB9chghONQTYcxNQT0/rOJemBfmuKQRTu2vuV8B3yA395iKaUdDo7HI+0KvQaz9687Xskzkw== + dependencies: + "@types/node-fetch" "^2.5.10" + node-fetch "^2.6.1" + "@paulirish/trace_engine@0.0.53": version "0.0.53" resolved "https://registry.npmmirror.com/@paulirish/trace_engine/-/trace_engine-0.0.53.tgz" @@ -1486,6 +1494,14 @@ resolved "https://registry.npmmirror.com/@types/json5/-/json5-0.0.29.tgz" integrity sha512-dRLjCWHYg4oaA77cxO64oO+7JwCwnIzkZPdrrC71jQmQtlhM556pwKo5bUzqvZndkVbeFLIIi+9TC40JNF5hNQ== +"@types/node-fetch@^2.5.10": + version "2.6.13" + resolved "https://registry.npmmirror.com/@types/node-fetch/-/node-fetch-2.6.13.tgz#e0c9b7b5edbdb1b50ce32c127e85e880872d56ee" + integrity sha512-QGpRVpzSaUs30JBSGPjOg4Uveu384erbHBoT1zeONvyCfwQxIkUshLAOqN/k9EjGviPRmWTTe6aH2qySWKTVSw== + dependencies: + "@types/node" "*" + form-data "^4.0.4" + "@types/node@*", "@types/node@22.15.3": version "22.15.3" resolved "https://registry.npmmirror.com/@types/node/-/node-22.15.3.tgz"