en.js
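// en.js — builds an English word-frequency list: merges counts from
// ./data/en/frequency-alpha-gcide.txt with the en_2016_50k.txt and
// en_2018_50k.txt lists, lemmatizes each word, keeps the 50,000 most
// frequent entries, and writes ./dist/en.json and ./dist/en.csv.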
const fs = require('fs-extra');
const { orderBy, includes } = require('lodash');
const extract = require('extract-lemmatized-nonstop-words');
const createCsvWriter = require('csv-writer').createObjectCsvWriter;
// Aggregated counts, keyed by lemmatized word.
const words = new Map();
// Tokens to exclude: contraction fragments, proper names, and other non-words.
const blackList = [
  'don',
  'didn',
  'doesn',
  'isn',
  'wasn',
  'wouldn',
  'haven',
  'couldn',
  'shouldn',
  'weren',
  'hasn',
  'hadn',
  'goin',
  // Proper names
  'john',
  'jeff',
  'jeffrey',
  'google',
  'mumbai',
  // ?
  'huh',
  'ces',
];
// Add `count` occurrences of `word` to the tally, skipping blacklisted tokens.
function addWord(word, count) {
  if (includes(blackList, word)) return;
  const record = words.get(word);
  if (record) {
    words.set(word, record + count);
  } else {
    words.set(word, count);
  }
}
(async () => {
  // Merge counts from the GCIDE-based frequency list.
  let data = await fs.readFile('./data/en/frequency-alpha-gcide.txt', 'utf8');
  data.split('\n').forEach(line => {
    // [RANKING, WORD, COUNT, PERCENT, CUMULATIVE]
    let [rank, word, count, percent, cumulative] = line.split(/\s+/);
    if (!word || !count) return; // skip empty or malformed lines
    const parsed = extract(word);
    if (parsed.length !== 1) return;
    word = parsed[0].vocabulary;
    count = parseInt(count.replace(/,/g, ''), 10);
    if (Number.isNaN(count)) return; // skip non-numeric rows (e.g. a header line)
    addWord(word, count);
  });
  // Merge counts from the 2016 and 2018 50k word lists.
  data = await fs.readFile('./data/en/en_2016_50k.txt', 'utf8');
  data += '\n' + await fs.readFile('./data/en/en_2018_50k.txt', 'utf8');
  data.split('\n').forEach(line => {
    // [WORD, COUNT]
    let [word, count] = line.split(/\s+/);
    if (!word || !count) return; // skip empty or malformed lines
    const parsed = extract(word);
    if (parsed.length !== 1) return;
    word = parsed[0].vocabulary;
    // Scale the 50k-list counts to the same scale as the frequency-alpha-gcide.txt counts.
    count = parseInt(count, 10) * (348412387855 / 653789027);
    addWord(word, count);
  });
  // Keep the 50,000 most frequent words, then compute each entry's rank,
  // percent of the total count, and cumulative percent.
  let list = orderBy(
    Array.from(words).map(record => ({ word: record[0], count: record[1] })),
    'count',
    'desc'
  ).slice(0, 50000);
  const totalCount = list.reduce((total, item) => total + item.count, 0);
  let rank = 1;
  list.reduce((cumulative, item) => {
    item.percent = item.count * 100 / totalCount;
    cumulative += item.percent;
    item.rank = rank++;
    item.cumulative = cumulative;
    delete item.count;
    return cumulative;
  }, 0);
  try {
    // Save as JSON.
    await fs.writeJSON('./dist/en.json', list);
    // Save as CSV with the same fields.
    const csvWriter = createCsvWriter({
      header: [
        { id: 'rank', title: 'Rank' },
        { id: 'word', title: 'Word' },
        { id: 'percent', title: 'Percent' },
        { id: 'cumulative', title: 'Cumulative' },
      ],
      path: './dist/en.csv',
    });
    await csvWriter.writeRecords(list);
  } catch (error) {
    console.log(error);
  }
  console.log('...Done');
})();
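A rough usage sketch (not part of the original script, and the file name print-top.js is hypothetical): once en.js has been run and ./dist/en.json exists, the generated list can be read back and inspected like this. The rank, word, and percent fields match what the script above writes.

// print-top.js — hypothetical consumer of the generated ./dist/en.json
const fs = require('fs-extra');

(async () => {
  const list = await fs.readJSON('./dist/en.json');
  // Each entry has the shape { rank, word, percent, cumulative }.
  for (const { rank, word, percent } of list.slice(0, 10)) {
    console.log(`${rank}. ${word} (${percent.toFixed(4)}%)`);
  }
})();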