Skip to content

Commit c88c4a4

Browse files
authored
feat: enhance parser_sitemap.js for pdf (#2252)
* chore: update gpt translate
* feat: enhance parser_sitemap.js for pdf
1 parent 6e06ade commit c88c4a4

File tree

3 files changed

+72
-11214
lines changed

3 files changed

+72
-11214
lines changed

scripts/parser_sitemap.js

Lines changed: 70 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1,51 +1,90 @@
1+
// https://www.princexml.com/ download princexml first
12
const fs = require("fs");
2-
const xml2js = require("xml2js");
33
const path = require("path");
4+
const https = require("https");
5+
const xml2js = require("xml2js");
6+
47
const site = process.env.site;
8+
9+
if (!site || !["cn", "en"].includes(site)) {
10+
process.exit(1);
11+
}
12+
513
const domain = site === "cn" ? "cn" : "com";
14+
const sitemapUrl = `https://docs.databend.${domain}/sitemap.xml`;
615
const sitemapPath = path.join(__dirname, `sitemap-${site}.xml`);
7-
816
const outputFilePath = path.join("./", "pdf", `docs.databend.${site}-sql.txt`);
917

18+
const excludeUrls = [
19+
`https://docs.databend.${domain}/`,
20+
`https://docs.databend.${domain}/download/`,
21+
`https://docs.databend.${domain}/search`,
22+
];
23+
1024
if (!fs.existsSync(path.dirname(outputFilePath))) {
1125
fs.mkdirSync(path.dirname(outputFilePath), { recursive: true });
1226
}
1327

1428
if (fs.existsSync(outputFilePath)) {
15-
console.log(`file ${outputFilePath} existed, delete it...`);
29+
console.log(`File ${outputFilePath} existed, delete it...`);
1630
fs.unlinkSync(outputFilePath);
1731
}
1832

19-
const excludeUrls = [
20-
`https://docs.databend.${domain}/`,
21-
`https://docs.databend.${domain}/download/`,
22-
`https://docs.databend.${domain}/search`,
23-
];
24-
25-
try {
26-
const xmlData = fs.readFileSync(sitemapPath, "utf8");
27-
28-
xml2js.parseString(xmlData, (err, result) => {
29-
if (err) {
30-
console.error("XML pasrder errors", err);
31-
return;
33+
function downloadSitemap(url, destPath) {
34+
return new Promise((resolve, reject) => {
35+
if (fs.existsSync(destPath)) {
36+
return resolve();
3237
}
3338

34-
const urls = result.urlset.url
35-
.map((entry) => entry.loc[0]?.trim())
36-
.filter((url) => !excludeUrls.includes(url))
37-
.sort((a, b) => {
38-
const aHasGuides = a.includes("/guides/");
39-
const bHasGuides = b.includes("/guides/");
40-
if (aHasGuides && !bHasGuides) return -1; // a comes first
41-
if (!aHasGuides && bHasGuides) return 1; // b comes first
42-
return 0; // maintain original order for others
43-
});
44-
45-
fs.writeFileSync(outputFilePath, urls.join("\n"), "utf8");
39+
const file = fs.createWriteStream(destPath);
40+
https
41+
.get(url, (response) => {
42+
if (response.statusCode !== 200) {
43+
reject(new Error(`Failed, error code:${response.statusCode}`));
44+
return;
45+
}
4646

47-
console.log(`${urls.length}, ${outputFilePath}`);
47+
response.pipe(file);
48+
file.on("finish", () => {
49+
file.close(resolve);
50+
});
51+
})
52+
.on("error", (err) => {
53+
fs.unlink(destPath, () => reject(err));
54+
});
4855
});
49-
} catch (error) {
50-
console.error("file read errors:", error);
5156
}
57+
58+
(async () => {
59+
try {
60+
console.log(`sitemap:${site} (${sitemapUrl})`);
61+
62+
await downloadSitemap(sitemapUrl, sitemapPath);
63+
console.log(`Finished:${sitemapPath}`);
64+
65+
const xmlData = fs.readFileSync(sitemapPath, "utf8");
66+
67+
xml2js.parseString(xmlData, (err, result) => {
68+
if (err) {
69+
console.error("❌", err);
70+
return;
71+
}
72+
73+
const urls = result.urlset.url
74+
.map((entry) => entry.loc[0]?.trim())
75+
.filter((url) => !excludeUrls.includes(url))
76+
.sort((a, b) => {
77+
const aHasGuides = a.includes("/guides/");
78+
const bHasGuides = b.includes("/guides/");
79+
if (aHasGuides && !bHasGuides) return -1;
80+
if (!aHasGuides && bHasGuides) return 1;
81+
return 0;
82+
});
83+
84+
fs.writeFileSync(outputFilePath, urls.join("\n"), "utf8");
85+
console.log(`✅ Successed ${urls.length}, ${outputFilePath}`);
86+
});
87+
} catch (error) {
88+
console.error("❌ Failed:", error);
89+
}
90+
})();

0 commit comments

Comments
 (0)