-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrape.js
87 lines (77 loc) · 2.24 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
// Prefix of the Redis list keys used as work queues; the functions below
// append `-<level>` to it to get one list per directory depth.
const REDIS_QUEUE='url'
// Index of the Redis logical database selected on the client below.
const REDIS_DB=2
// CouchDB base URL: taken from the COUCHDB_URL environment variable when it
// is set (and non-empty), otherwise the local default.
const COUCHDB_URL=process.env['COUCHDB_URL'] || 'http://localhost:5984'
// HTTP client used to download the directory listings
const axios = require('axios')
// Redis client used for the URL work queues
const redis = require('redis')
// No options are passed, so the library's default connection settings apply.
var client = redis.createClient()
client.select(REDIS_DB);
// Promise-returning wrappers around the callback-style Redis list commands.
const {promisify} = require('util');
const rpushAsync = promisify(client.rpush).bind(client);
// NOTE: despite its name this wraps LPOP, not RPOP. Combined with RPUSH above,
// entries are removed from the opposite end to the one they were added on,
// i.e. each list is consumed in insertion (FIFO) order.
const rpopAsync = promisify(client.lpop).bind(client);
// CouchDB handle for the database that stores one document per listing entry
const nano = require('nano')(COUCHDB_URL);
const db = nano.db.use('dd-meteo-gc-ca');
/**
 * Parse a single line of a directory-listing page into a document.
 *
 * Only lines that start with `<img`, contain an `<a>…</a>` link, and carry a
 * `YYYY-MM-DD HH:MM` timestamp after the link are treated as entries. Any
 * other line (headers, footers, rows without a timestamp) yields `null`
 * instead of throwing, so one odd line cannot abort the whole directory.
 *
 * @param {string} path - Directory path the listing was fetched from.
 * @param {string} line - One line of the listing HTML.
 * @returns {?{_id: string, filepath: string, mtime: string, size: string, directory: boolean}}
 *   The parsed entry, or `null` when the line is not a listing entry.
 */
function parseListingLine(path, line) {
  if (!line.startsWith('<img')) return null;

  const anchor = line.match(/<a[^>]*>([^<]*)<\/a>/);
  if (!anchor) return null;

  // Text between the first and second `</a>`; it exists because the anchor
  // matched above.
  const trailer = line.split('</a>')[1];
  const meta = trailer.match(/(\d{4}-\d{2}-\d{2} \d{2}:\d{2})\s*(.*)/);
  if (!meta) return null;

  // The link text (not the href) is appended to the parent path.
  const filepath = `${path}${anchor[1]}`;
  return {
    _id: filepath,
    filepath,
    mtime: meta[1],
    size: meta[2].trim(),
    // Entries whose name ends in '/' are sub-directories.
    directory: filepath.endsWith('/'),
  };
}

/**
 * Download the listing for `path`, store every entry in CouchDB, and queue
 * the sub-directories for a later visit.
 *
 * Sub-directory paths are pushed onto the Redis list
 * `${REDIS_QUEUE}-<n>`, where `<n>` is the number of '/'-separated segments
 * in `path`, so every directory depth gets its own queue.
 *
 * @param {string} path - Path below https://dd.meteo.gc.ca to list.
 * @returns {Promise<boolean>} Resolves to `true` once the entries are stored
 *   and queued; rejects if the HTTP request, the bulk insert, or the Redis
 *   push fails.
 */
function walk(path) {
  return axios
    .get(`https://dd.meteo.gc.ca${path}`)
    .then((response) => {
      const docs = response.data
        .split('\n')
        .map((line) => parseListingLine(path, line))
        .filter((doc) => doc !== null);
      return db.bulk({ docs }).then(() => docs);
    })
    .then((docs) => {
      const toQueue = docs
        .filter((doc) => doc.directory)
        .map((doc) => doc.filepath);
      // RPUSH needs at least one value, so skip it when nothing was found.
      if (toQueue.length === 0) return undefined;
      return rpushAsync(`${REDIS_QUEUE}-${path.split('/').length}`, toQueue);
    })
    .then(() => true);
}
/**
 * Build a pass-through step for a promise chain that pauses before handing
 * its input on.
 *
 * @param {number} [duration] - Pause length in milliseconds; a missing or
 *   falsy value means 0.
 * @returns {function(*): Promise<*>} Function that takes a value and returns
 *   a promise resolving to that same value once the pause has elapsed.
 */
function delay(duration) {
  const waitMs = duration || 0;
  // setTimeout forwards its extra arguments to the callback, so `resolve`
  // is called with `value` when the timer fires.
  return (value) =>
    new Promise((resolve) => {
      setTimeout(resolve, waitMs, value);
    });
}
/**
 * Take one URL from the queue for `level`, crawl it, then schedule the next
 * round.
 *
 * When the queue for the current level is empty the level is incremented;
 * once it passes 100 the process exits with status 0. Any other failure
 * (Redis, HTTP, CouchDB) is only logged and the loop carries on with the
 * same level. The follow-up call is started without being returned, so each
 * round forms its own promise chain and nothing is returned to the caller.
 *
 * @param {number} level - Queue level to read from (`${REDIS_QUEUE}-<level>`).
 * @returns {void}
 */
function processNext(level) {
  const EMPTY_QUEUE = 'Empty queue';
  const queueKey = `${REDIS_QUEUE}-${level}`;
  let currentLevel = level;

  // Crawl the popped URL, or signal an empty queue by rejecting with the marker.
  const crawl = (url) => {
    if (!url) throw EMPTY_QUEUE;
    console.log(`Processing url: ${url}`);
    return walk(url).then(delay(100));
  };

  // Log every failure; only the empty-queue marker moves on to the next level.
  const onFailure = (reason) => {
    console.log(reason);
    if (reason !== EMPTY_QUEUE) return;

    currentLevel += 1;
    console.log(`Moving to level ${currentLevel}`);
    if (currentLevel > 100) {
      console.log('Exceeded max number of levels');
      process.exit(0);
    }
  };

  rpopAsync(queueKey)
    .then(crawl)
    .catch(onFailure)
    .then(() => {
      processNext(currentLevel);
    });
}
// Start the crawl loop at queue level 0; processNext moves on to higher
// levels by itself whenever the current level's queue is empty.
processNext(0)