Skip to content

Commit

Permalink
For migrating WordPress blog. Tabling, for re-rework.
Browse files Browse the repository at this point in the history
  • Loading branch information
renoirb committed Mar 2, 2020
1 parent 83a9ea1 commit 87d0861
Show file tree
Hide file tree
Showing 13 changed files with 286 additions and 97 deletions.
14 changes: 14 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"type": "node",
"request": "launch",
"name": "Launch Program",
"program": "${workspaceFolder}/example.js"
}
]
}
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"git.ignoreLimitWarning": true
}
12 changes: 12 additions & 0 deletions .vscode/tasks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
// See https://go.microsoft.com/fwlink/?LinkId=733558
// for the documentation about the tasks.json format
"version": "2.0.0",
"tasks": [
{
"type": "npm",
"script": "start",
"problemMatcher": []
}
]
}
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,14 @@ test:

.PHONY: archive
archive:
yarn archive
bin/archivator archive

.PHONY: analyze
analyze:
yarn analyze
bin/archivator analyze

markdownify:
yarn markdownify

dist:
yarn dist
Expand Down
24 changes: 24 additions & 0 deletions bin/archivator-markdownify.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import {
catcher,
iterateIntoArchivable
} from '../src/module';
import {
read,
handle,
write
} from '../src/markdownify';

const URL_LIST = 'archive/index.csv';

/**
 * Script entry point: takes each item produced by
 * `iterateIntoArchivable(URL_LIST)` through `read`, then `handle`, then
 * `write`, one item at a time.
 *
 * The three steps of one item share a single try/catch: the first failure is
 * passed to `catcher` once and the remaining steps for that item are skipped,
 * so a later step never receives the error handler's return value as input.
 * The loop then moves on to the next item.
 *
 * The trailing `.catch(catcher)` covers anything raised outside the per-item
 * guard (for example while creating or advancing the iterator), so the
 * promise returned by this function is never left unhandled.
 *
 * TODO: items are still processed sequentially; the intended rework is to run
 * each step through Promise.all(...).
 */
(async () => {
  for (const archivable of iterateIntoArchivable(URL_LIST)) {
    try {
      const contents = await read(archivable);
      const handled = await handle(contents);
      await write(handled);
    } catch (error) {
      // Awaited in case `catcher` itself returns a promise.
      await catcher(error);
    }
  }
})().catch(catcher);
58 changes: 29 additions & 29 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,51 @@
"name": "archivator",
"version": "1.0.0",
"description": "Fetch and stream body contents for URLs",
"repository": {
"type": "git",
"url": "https://github.com/renoirb/archivator.git"
},
"bugs": {
"url": "https://github.com/renoirb/archivator/issues"
},
"bin": {
"archivator-archive": "./src/archive.js",
"archivator-analyze": "./src/analyze.js"
"repository": {
"type": "git",
"url": "https://github.com/renoirb/archivator.git"
},
"license": "MIT",
"author": "Renoir Boulanger <[email protected]> (http://renoirb.com/)",
"main": "index.js",
"module": "module.js",
"bin": {
"archivator-analyze": "./dist/analyze.js",
"archivator-archive": "./dist/archive.js"
},
"scripts": {
"analyze": "node_modules/.bin/babel-node src/analyze.js",
"archive": "node_modules/.bin/babel-node src/archive.js",
"dev": "node_modules/.bin/babel src/ -d dist/ -w -s",
"dist": "node_modules/.bin/babel src/ --minified -d dist/ -s",
"test": "node_modules/.bin/mocha --compilers js:babel-core/register",
"lint": "node_modules/.bin/xo",
"dev": "node_modules/.bin/babel src/ -d dist/ -w -s",
"archive": "node_modules/.bin/babel-node src/archive.js",
"analyze": "node_modules/.bin/babel-node src/analyze.js",
"lint:fix": "node_modules/.bin/xo --fix",
"markdownify": "node_modules/.bin/babel-node src/markdownify.js",
"lint:fix": "node_modules/.bin/xo --fix"
"test": "node_modules/.bin/mocha --compilers js:babel-core/register"
},
"xo": {
"esnext": true,
"ignores": [
"dist/**",
"example.js"
],
"plugins": [
"unicorn"
],
"rules": {
"func-names": 0
},
"space": 2
},
"author": "Renoir Boulanger <[email protected]> (http://renoirb.com/)",
"license": "MIT",
"dependencies": {
"async-file": "^2.0.2",
"babel-polyfill": "^6.23.0",
"cheerio": "^0.22.0",
"elasticsearch": "^12.1.3",
"esm": "^3.0.81",
"esm": "^3.2.25",
"gen-readlines": "^0.1.3",
"html-md-2": "^3.0.0",
"node-fetch": "^1.6.3",
Expand All @@ -52,19 +66,5 @@
"eslint-plugin-unicorn": "^1.0.0",
"mocha": "^3.2.0",
"xo": "^0.17.1"
},
"xo": {
"esnext": true,
"space": 2,
"plugins": [
"unicorn"
],
"ignores": [
"dist/**",
"example.js"
],
"rules": {
"func-names": 0
}
}
}
22 changes: 11 additions & 11 deletions src/analyze.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,22 @@ import {
cheerioLoad
} from './common';

import dictionary from './lists/stopwords.en';

/**
* https://www.ranks.nl/stopwords
* http://xpo6.com/list-of-english-stop-words/
*/
const stopWords = new Set(dictionary);
import dictionary from './lists/stopwords.fr';
import dictionaryEn from './lists/stopwords.en';

/**
 * Reduce a raw token to the bare, lower-case form used for word comparison.
 *
 * Steps, in order:
 *  1. `null` / `undefined` yield an empty string (instead of the literal
 *     words "null" / "undefined" that `String()` would produce).
 *  2. Accented letters are folded to their base letter ("é" -> "e") by
 *     decomposing to NFD and dropping the combining marks, so "Février"
 *     becomes "fevrier" rather than losing the accented letter entirely.
 *  3. Everything that is not an ASCII letter, digit or whitespace is removed;
 *     underscores are removed too.
 *  4. The result is lower-cased.
 *
 * @param {*} input - Value to normalize; non-strings are converted with `String()`.
 * @returns {string} The normalized token, possibly empty.
 */
function normalize(input) {
  if (input === null || input === undefined) {
    return '';
  }

  return String(input)
    .normalize('NFD')
    // U+0300–U+036F is the "Combining Diacritical Marks" block left behind by NFD.
    .replace(/[\u0300-\u036F]/g, '')
    .replace(/[^\w\s]|_/g, '')
    .toLowerCase();
}

async function extractWords(recv, archivable) {
const loaded = cheerioLoad(recv);
return loaded.then(shard => {
/**
* https://www.ranks.nl/stopwords
* http://xpo6.com/list-of-english-stop-words/
*/
const stopWordsSet = new Set([...dictionary, ...dictionaryEn]);

return cheerioLoad(recv).then(shard => {
const truncate = archivable.truncate;
shard(truncate).remove();
const text = shard.text().split(' ');
Expand All @@ -31,7 +31,7 @@ async function extractWords(recv, archivable) {
for (let i = 0; i < text.length; i++) {
const word = normalize(text[i]);
const withinCharRange = /^[a-zA-ZÀ-ÖØ-öø-ÿ]+$/.test(word);
const isNotStopWord = stopWords.has(word) === false;
const isNotStopWord = stopWordsSet.has(word) === false;
const hasAtLeastTwo = word.length > 1;
if (withinCharRange && isNotStopWord && hasAtLeastTwo) {
if (foundOnce.has(word) === false) {
Expand Down Expand Up @@ -117,7 +117,7 @@ async function write(file, data = {}, boolOverwrite = true) {
export default async archivable => {
const slug = archivable.slug;
const path = `archive/${slug}`;
const cacheFile = `${path}/document.html`;
const cacheFile = `${path}/cache.html`;
const file = `${path}/analyze.json`;
return Promise.resolve(cacheFile)
.then(cacheFile => analyze(cacheFile, archivable))
Expand Down
6 changes: 4 additions & 2 deletions src/common.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ function readCachedError(errorObj) {
}

// TODO: Make it possible to run extractLinks, markdownify, ... in parallel.
async function cheerioLoad(recv, configObj = {}) {
return new Promise(resolve => resolve(cheerio.load(recv, configObj)));
/**
 * Load markup with cheerio and deliver the result through a Promise.
 *
 * Declared `async` on purpose: if `cheerio.load` throws, the error surfaces
 * as a rejected Promise that callers can handle with `.catch()` / `await`,
 * instead of being thrown synchronously at the call site.
 *
 * @param {*} recv - Passed unchanged as the first argument of `cheerio.load`.
 * @param {Object} [configObj={}] - Passed unchanged as the second argument of `cheerio.load`.
 * @returns {Promise<*>} Resolves with whatever `cheerio.load` returns.
 */
async function cheerioLoad(recv, configObj = {}) {
  return cheerio.load(recv, configObj);
}

const urlNotInBlacklist = u => {
Expand Down
16 changes: 14 additions & 2 deletions src/lists/stopwords.en.json
Original file line number Diff line number Diff line change
Expand Up @@ -563,5 +563,17 @@
"yourself",
"yourselves",
"youve",
"zero"
]
"zero",
"january",
"february",
"march",
"april",
"may",
"june",
"july",
"august",
"september",
"october",
"november",
"december"
]
46 changes: 46 additions & 0 deletions src/lists/stopwords.fr.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
[
"a",
"avec",
"ce",
"dans",
"de",
"de",
"des",
"du",
"en",
"est",
"jai",
"je",
"la",
"le",
"les",
"long",
"mon",
"non",
"off",
"oh",
"ok",
"or",
"pas",
"possible",
"pour",
"que",
"qui",
"sensible",
"sont",
"sur",
"une",
"zero",
"janvier",
"fevrier",
"mars",
"avril",
"mai",
"juin",
"juillet",
"aout",
"septembre",
"octobre",
"novembre",
"decembre"
]
Loading

0 comments on commit 87d0861

Please sign in to comment.