Skip to content

Commit

Permalink
chore: make cli execution default
Browse files (browse the repository at this point in the history)
  • Loading branch information
marcelovicentegc committed Nov 21, 2023
1 parent 54fc5ff commit ade2157
Show file tree
Hide file tree
Showing 8 changed files with 96 additions and 110 deletions.
51 changes: 0 additions & 51 deletions Dockerfile

This file was deleted.

5 changes: 2 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@
"scripts": {
"preinstall": "bunx playwright install",
"start": "bun run start:dev",
"start:cli": "NODE_ENV=development bun run build && node dist/src/cli.js",
"start:prod": "node dist/src/main.js",
"start:dev": "bun run build && node --no-warnings=ExperimentalWarning dist/src/main.js",
"start:dev": "NODE_ENV=development bun run build && node dist/cli.js",
"start:prod": "node dist/main.js",
"build": "tsc"
},
"author": "It's not you it's me",
Expand Down
104 changes: 69 additions & 35 deletions src/cli.ts
Original file line number Diff line number Diff line change
@@ -1,47 +1,77 @@
#!/usr/bin/env node

import { program } from "commander";
import { Config } from "../config.js";
import { Config } from "./config.js";
import { crawl, write } from "./core.js";
import { createRequire } from "node:module";
import inquirer from "inquirer";

const require = createRequire(import.meta.url);
const { version, description } = require("../../package.json");
const { version, description } = require("../package.json");

async function handler(options: any) {
/**
 * Human-readable question text for each crawler setting, keyed by setting name.
 *
 * In the code visible here, the `url`, `match` and `selector` entries are used
 * as the `message` of the interactive prompt questions, and all five entries
 * are passed as the description argument of the matching `program.option(...)`
 * call.
 */
const messages = {
url: "What is the first URL of the website you want to crawl?",
match: "What is the URL pattern you want to match?",
selector: "What is the CSS selector you want to match?",
maxPagesToCrawl: "How many pages do you want to crawl?",
outputFileName: "What is the name of the output file?",
};

async function handler(options: Config) {
try {
const {
url,
match,
selector,
maxPagesToCrawl: maxPagesToCrawlStr,
outputFileName,
} = options;

// @ts-ignore
const maxPagesToCrawl = parseInt(maxPagesToCrawlStr, 10);

let config: Config = {
url: options.url,
match: options.match,
selector: options.selector,
maxPagesToCrawl: 50,
outputFileName: options.outputFileName ?? "output.json",
url,
match,
selector,
maxPagesToCrawl,
outputFileName,
};

if (!config.url || !config.match || !config.selector) {
const { url, match, selector } = await inquirer
.prompt([
{
type: "input",
name: "url",
message: "What is the URL of the website you want to crawl?",
},
{
type: "input",
name: "match",
message: "What is the URL pattern you want to match?",
},
{
type: "input",
name: "selector",
message: "What is the CSS selector you want to match?",
},
]);

config.url = url;
config.match = match;
config.selector = selector;
const questions = [];

if (!config.url) {
questions.push({
type: "input",
name: "url",
message: messages.url,
});
}

if (!config.match) {
questions.push({
type: "input",
name: "match",
message: messages.match,
});
}

if (!config.selector) {
questions.push({
type: "input",
name: "selector",
message: messages.selector,
});
}

const answers = await inquirer
.prompt(questions);

config = {
...config,
...answers,
};
}

await crawl(config);
Expand All @@ -56,11 +86,15 @@ program
.description(description);

// Registers the command-line flags and binds `handler` as the action callback.
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// lost. Judging by the hunk header (11 old lines vs. 15 new lines), the five
// bare `.option("-x, --name")` calls look like the pre-change registrations and
// the calls with `<string>`/`<number>` placeholders look like their
// replacements — confirm before reading this as a single chain.
program
.option("-u, --url")
.option("-m, --match")
.option("-s, --selector")
.option("-m, --maxPagesToCrawl")
.option("-o, --outputFileName")
// Each replacement passes: flags with a value placeholder, the description
// taken from `messages`, and a default value.
.option("-u, --url <string>", messages.url, "")
.option("-m, --match <string>", messages.match, "")
.option("-s, --selector <string>", messages.selector, "")
// NOTE(review): "-m" is already the short flag of --match above, so the short
// form is ambiguous here — consider a distinct letter, and verify how the
// installed commander version resolves (or rejects) the clash.
// The default is the string "50"; the handler converts the value with
// parseInt(..., 10).
.option("-m, --maxPagesToCrawl <number>", messages.maxPagesToCrawl, "50")
.option(
"-o, --outputFileName <string>",
messages.outputFileName,
"output.json",
)
.action(handler);

program.parse();
28 changes: 23 additions & 5 deletions config.ts → src/config.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,33 @@
import { Page } from "playwright";

export type Config = {
/** URL to start the crawl */
/**
* URL to start the crawl
* @example "https://www.builder.io/c/docs/developers"
* @default ""
*/
url: string;
/** Pattern to match against for links on a page to subsequently crawl */
/**
* Pattern to match against for links on a page to subsequently crawl
* @example "https://www.builder.io/c/docs/**"
* @default ""
*/
match: string | string[];
/** Selector to grab the inner text from */
/**
* Selector to grab the inner text from
* @example ".docs-builder-container"
* @default ""
*/
selector: string;
/** Don't crawl more than this many pages */
/**
* Don't crawl more than this many pages
* @default 50
*/
maxPagesToCrawl: number;
/** File name for the finished data */
/**
* File name for the finished data
* @default "output.json"
*/
outputFileName: string;
/** Optional cookie to be set. E.g. for Cookie Consent */
cookie?: { name: string; value: string };
Expand Down
2 changes: 1 addition & 1 deletion src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import { PlaywrightCrawler } from "crawlee";
import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { Config } from "../config";
import { Config } from "./config";
import { Page } from "playwright";

let pageCounter = 0;
Expand Down
13 changes: 0 additions & 13 deletions src/hardcoded.ts

This file was deleted.

1 change: 0 additions & 1 deletion src/main.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
export * from "./core.js";
export * from "./cli.js";
export * from "./hardcoded.js";
2 changes: 1 addition & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@
"skipLibCheck": true,
"lib": ["DOM"]
},
"include": ["./src/**/*", "./config.ts"]
"include": ["./src/**/*", "src/config.ts"]
}

0 comments on commit ade2157

Please sign in to comment.