|
| 1 | +// Prerequisite: download PrinceXML first (https://www.princexml.com/). |
1 | 2 | const fs = require("fs");
|
2 |
| -const xml2js = require("xml2js"); |
3 | 3 | const path = require("path");
|
| 4 | +const https = require("https"); |
| 5 | +const xml2js = require("xml2js"); |
| 6 | + |
// Target documentation site, selected through the `site` environment variable.
const site = process.env.site;

// Only "cn" and "en" are supported. Report the problem before exiting so the
// caller can see why the script stopped instead of getting a bare exit code.
if (!site || !["cn", "en"].includes(site)) {
  console.error(
    `Invalid or missing "site" environment variable (received: ${site}); expected "cn" or "en".`
  );
  process.exit(1);
}

// "cn" is served from the .cn domain; "en" is served from .com.
const domain = site === "cn" ? "cn" : "com";
const sitemapUrl = `https://docs.databend.${domain}/sitemap.xml`;
// Local copy of the sitemap, stored next to this script.
const sitemapPath = path.join(__dirname, `sitemap-${site}.xml`);
// Destination of the generated URL list (one URL per line).
const outputFilePath = path.join("./", "pdf", `docs.databend.${site}-sql.txt`);

// Pages that must not appear in the generated URL list.
const excludeUrls = [
  `https://docs.databend.${domain}/`,
  `https://docs.databend.${domain}/download/`,
  `https://docs.databend.${domain}/search`,
];

// `recursive: true` makes mkdirSync a no-op when the directory already exists,
// so no separate existence check is needed.
fs.mkdirSync(path.dirname(outputFilePath), { recursive: true });

// Start from a clean output file on every run.
if (fs.existsSync(outputFilePath)) {
  console.log(`File ${outputFilePath} existed, delete it...`);
  fs.unlinkSync(outputFilePath);
}
|
18 | 32 |
|
19 |
| -const excludeUrls = [ |
20 |
| - `https://docs.databend.${domain}/`, |
21 |
| - `https://docs.databend.${domain}/download/`, |
22 |
| - `https://docs.databend.${domain}/search`, |
23 |
| -]; |
24 |
| - |
25 |
| -try { |
26 |
| - const xmlData = fs.readFileSync(sitemapPath, "utf8"); |
27 |
| - |
28 |
| - xml2js.parseString(xmlData, (err, result) => { |
29 |
| - if (err) { |
30 |
| - console.error("XML pasrder errors", err); |
31 |
| - return; |
/**
 * Download `url` over HTTPS into `destPath`.
 *
 * If `destPath` already exists the download is skipped and the existing file
 * is reused. The destination file is only created once a 200 response has
 * arrived, and it is removed again if the transfer fails, so a failed run
 * never leaves behind an empty or partial file that a later run would
 * mistake for a valid cached sitemap.
 *
 * @param {string} url - HTTPS URL to fetch.
 * @param {string} destPath - File path the response body is written to.
 * @returns {Promise<void>} Resolves when the file is fully written (or was
 *   already present); rejects on a non-200 status, a request error, or a
 *   stream/file-system error.
 */
function downloadSitemap(url, destPath) {
  return new Promise((resolve, reject) => {
    if (fs.existsSync(destPath)) {
      return resolve();
    }

    let file = null;
    let settled = false;

    // Reject exactly once; if a destination file was opened, close it and
    // delete the partial output before reporting the error.
    const fail = (err) => {
      if (settled) {
        return;
      }
      settled = true;
      if (file === null) {
        reject(err);
        return;
      }
      // Wait for the descriptor to be released before unlinking.
      file.once("close", () => {
        fs.unlink(destPath, () => reject(err));
      });
      file.destroy();
    };

    https
      .get(url, (response) => {
        if (response.statusCode !== 200) {
          // Drain the body so the socket is released.
          response.resume();
          fail(new Error(`Failed, error code:${response.statusCode}`));
          return;
        }

        // Open the destination only after the status check succeeded.
        file = fs.createWriteStream(destPath);
        file.on("error", fail);
        // pipe() does not forward source errors to the destination.
        response.on("error", fail);
        file.on("finish", () => {
          file.close((closeErr) => {
            if (settled) {
              return;
            }
            settled = true;
            if (closeErr) {
              reject(closeErr);
            } else {
              resolve();
            }
          });
        });

        response.pipe(file);
      })
      .on("error", fail);
  });
}
|
| 57 | + |
/**
 * Entry point: fetch the sitemap for the selected site, extract its page
 * URLs, drop the excluded pages, move "/guides/" pages to the front, and
 * write the list (one URL per line) to `outputFilePath`.
 *
 * Any failure is logged and turned into a non-zero exit code so that callers
 * (shell scripts, CI) can detect it.
 */
(async () => {
  try {
    console.log(`sitemap:${site} (${sitemapUrl})`);

    await downloadSitemap(sitemapUrl, sitemapPath);
    console.log(`Finished:${sitemapPath}`);

    const xmlData = fs.readFileSync(sitemapPath, "utf8");

    // Adapt the callback API to a promise so that parse errors and any
    // exception raised while processing the result reach the catch below
    // instead of being thrown from inside the parser callback.
    const result = await new Promise((resolve, reject) => {
      xml2js.parseString(xmlData, (err, parsed) => {
        if (err) {
          reject(err);
        } else {
          resolve(parsed);
        }
      });
    });

    const entries = result?.urlset?.url;
    if (!Array.isArray(entries)) {
      throw new Error(`No <url> entries found in ${sitemapPath}`);
    }

    const urls = entries
      .map((entry) => entry.loc?.[0])
      // Skip entries without a textual <loc>; they would otherwise crash the
      // comparator below and end up as "undefined" lines in the output.
      .filter((loc) => typeof loc === "string")
      .map((loc) => loc.trim())
      .filter((url) => url !== "" && !excludeUrls.includes(url))
      .sort((a, b) => {
        const aHasGuides = a.includes("/guides/");
        const bHasGuides = b.includes("/guides/");
        if (aHasGuides && !bHasGuides) return -1; // guides first
        if (!aHasGuides && bHasGuides) return 1;
        return 0; // otherwise keep sitemap order
      });

    fs.writeFileSync(outputFilePath, urls.join("\n"), "utf8");
    console.log(`✅ Successed ${urls.length}, ${outputFilePath}`);
  } catch (error) {
    console.error("❌ Failed:", error);
    // Signal failure without cutting off pending output.
    process.exitCode = 1;
  }
})();
0 commit comments