forked from dobomode/coursera-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
344 lines (317 loc) · 13.7 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
const axios = require('axios').default;
const inquirer = require('inquirer');
const Configstore = require('configstore');
const path = require('path');
const chalk = require('chalk');
const figlet = require('figlet');
const Downloader = require('nodejs-file-downloader');
/**
 * Local configuration store that persists the course ID and CAUTH value between runs.
 * The contents of `config_default.json` are passed as the store's defaults, and the store
 * file is `config.json` in the same directory as this script.
 */
const config = new Configstore('coursera-asset-scraper', require('./config_default.json'), {
configPath: `${path.join(__dirname, 'config.json')}`,
});
const app = (() => {
/**
 * The course ID, which can be found in the Coursera URL.
 * E.g. 'convolutional-neural-networks' from https://www.coursera.org/learn/convolutional-neural-networks
 * Set by `getCourseAndCauth()` from the user's answer.
 */
let _cid;
/**
 * The CAUTH value, which is part of the Coursera cookie once the user authenticates on the platform.
 * You can find this value in Chrome by opening the inspector and going to
 * Application => Cookies => 'www.coursera.org' => CAUTH
 * Set by `getCourseAndCauth()` and sent as the `Cookie` header on every API request.
 */
let _cauth;
/**
 * The ID of the authenticated user, set by `getUserId()`.
 */
let _uid;
/**
 * The course details as returned by the `guidedCourseWeekCards.v1` API (the first element of
 * the response), set by `getCourseDetails()`.
 */
let _courseDetails;
/**
 * The object returned from the enclosing closure; `main` is its only member.
 */
const api = {
main,
};
/**
 * A shortcut to the console.log function
 */
const { log } = console;
/**
 * Renders `i` as a string that is at least two characters wide, filling on the left with '0'.
 * Values whose string form already has two or more characters come back unchanged.
 * @param {int} i Integer to pad
 * @returns {string} Zero-padded string value
 */
function padZero(i) {
    const digits = String(i);
    return digits.padStart(2, '0');
}
/**
 * Asks the user for the course ID, offering the `cid` value from the configuration store as
 * the default answer. Empty input is rejected by the validator.
 * @returns {Promise<string>} The course ID entered by the user
 */
async function promptCourse() {
    const question = {
        name: 'cid',
        type: 'input',
        message: "Enter course ID (e.g. 'neural-networks-deep-learning'):",
        // Getter: the stored value is looked up only when the default is actually read.
        get default() {
            return config.get('cid');
        },
        validate: (answer) => !!answer,
    };
    const answers = await inquirer.prompt(question);
    return answers.cid;
}
/**
 * Asks the user for the CAUTH value found in the Coursera cookie, offering the `cauth` value
 * from the configuration store as the default answer. Empty input is rejected by the validator.
 * @returns {Promise<string>} The CAUTH value entered by the user
 */
async function promptCauth() {
    const question = {
        name: 'cauth',
        type: 'input',
        message: 'Enter CAUTH value from Coursera cookie:',
        // Getter: the stored value is looked up only when the default is actually read.
        get default() {
            return config.get('cauth');
        },
        validate: (answer) => !!answer,
    };
    const answers = await inquirer.prompt(question);
    return answers.cauth;
}
/**
 * Collects the two user inputs, CAUTH first and then the course ID. Each answer is kept in
 * the closure state and written to the local configuration store before the next question
 * is asked.
 */
async function getCourseAndCauth() {
    const cauthAnswer = await promptCauth();
    _cauth = cauthAnswer;
    config.set('cauth', cauthAnswer);

    const courseAnswer = await promptCourse();
    _cid = courseAnswer;
    config.set('cid', courseAnswer);
}
/**
 * Gets the ID of the user authenticated by the CAUTH value, using the `adminUserPermissions.v1` API,
 * and stores it in `_uid`.
 * @returns {Promise<string>} The user ID
 * @throws {string} A message explaining that authentication failed, when the request fails or
 * the response carries no user ID. Errors are thrown as strings because `main()` logs them as-is.
 */
async function getUserId() {
    const authFailure = '\nUnable to authenticate. Make sure you set CAUTH value correctly.\n';
    const res = await axios
        .get('https://www.coursera.org/api/adminUserPermissions.v1?q=my', {
            headers: { Cookie: `CAUTH=${_cauth}` },
        })
        .catch((err) => {
            throw err + authFailure;
        });
    // Walk the response defensively: an empty or missing `elements` list must surface as the
    // authentication message below, not as a TypeError from indexing into `undefined`.
    const elements = res && res.data && res.data.elements;
    const firstElement = elements && elements[0];
    _uid = firstElement && firstElement.id;
    if (!_uid) {
        throw authFailure;
    }
    return _uid;
}
/**
 * Gets the course details for the course ID, using CAUTH to authenticate, via the
 * `guidedCourseWeekCards.v1` API. The first element of the response is stored in `_courseDetails`.
 * @returns {Promise<object>} An object containing the course details.
 * @throws {string} A message explaining that the course could not be fetched, when the request
 * fails or the response has no element. Errors are thrown as strings because `main()` logs them as-is.
 */
async function getCourseDetails() {
    const fetchFailure =
        '\nUnable to fetch course details. Make sure you set the course ID and CAUTH value correctly and that you have access to this course.\n';
    const res = await axios
        .get(`https://www.coursera.org/api/guidedCourseWeekCards.v1?ids=${_uid}~${_cid}&fields=courseId,id,weeks`, {
            headers: { Cookie: `CAUTH=${_cauth}` },
        })
        .catch((err) => {
            // Same message as the empty-response case (the original had a stray "n" here).
            throw err + fetchFailure;
        });
    // A missing `elements` list is reported with the message above rather than a TypeError.
    const elements = res && res.data && res.data.elements;
    _courseDetails = elements && elements[0];
    if (!_courseDetails) {
        throw fetchFailure;
    }
    return _courseDetails;
}
/**
 * Downloads the source file for the given module asset. Assets whose `typeName` is 'url' are
 * skipped (only a log line is written). For every other asset the metadata is fetched from the
 * `assets.v1` API and the file is saved under
 * `./<course id>/Week <##>/<## - module name>/<## - asset name>`.
 * @param {int} assetNum The numerical sequence of the asset (i.e. 01, 02, 03, ...)
 * @param {object} asset The asset object extracted from the response of `onDemandLectureAssets.v1` API
 * @param {int} moduleNum The numerical sequence of the module (i.e. 01, 02, 03, ...)
 * @param {object} module The module object extracted from the course details (its `name` is used)
 * @param {int} weekNum The numerical sequence of the week (i.e. 01, 02, 03, ...)
 * @param {object} week The week object extracted from the course details (not used here)
 * @throws {string} A message ending in 'Unable to download asset.' when the metadata request or
 * the file download fails, or when the metadata response carries no download URL.
 */
async function scrapeAsset(assetNum, asset, moduleNum, module, weekNum, week) {
    const sequence = padZero(assetNum);
    if (asset && asset.typeName === 'url') {
        log(` ${chalk.white('Asset')} ${chalk.yellow(`#${sequence} - Skipping URL ${asset.definition.name}`)}`);
        return;
    }
    log(` ${chalk.white('Asset')} ${chalk.yellow(`#${sequence} - Downloading ${asset.definition.name}`)}`);

    const resAsset = await axios
        .get(`https://www.coursera.org/api/assets.v1/${asset.definition.assetId}?fields=fileExtension`, {
            headers: { Cookie: `CAUTH=${_cauth}` },
        })
        .catch((err) => {
            throw err + '\nUnable to download asset.\n';
        });

    // The download link is read from `elements[0].url.url`; report a clear message if any
    // level of that path is absent instead of failing with a TypeError.
    const elements = resAsset && resAsset.data && resAsset.data.elements;
    const element = elements && elements[0];
    if (!element || !element.url || !element.url.url) {
        throw `\nNo download URL returned for '${asset.definition.name}'.\nUnable to download asset.\n`;
    }
    const { url } = element.url;

    const fileName = `${sequence} - ${element.name}`;
    // Replace characters that are not allowed in directory names on common file systems.
    const moduleName = module.name.replace(/[\/\:*?"<>|+]/g, ' ');
    const directory = path.join('.', _cid, 'Week ' + padZero(weekNum), padZero(moduleNum) + ' - ' + moduleName);
    const downloader = new Downloader({ url, directory, fileName, cloneFiles: false, timeout: 300000 });
    // Give download failures the same kind of context that scrapeVideo() adds.
    await downloader.download().catch((err) => {
        throw err + '\nUnable to download asset.\n';
    });
    log(` ${chalk.white('Asset')} ${chalk.green(`#${sequence} - Saved '${fileName}'`)}`);
}
/**
 * Saves the 720p mp4 rendition of a lecture video as
 * `./<course id>/Week <##>/<## - module name>/<## - Lecture video (720p).mp4>`.
 * @param {int} videoNum The numerical sequence of the video (i.e. 01, 02, 03, ...)
 * @param {object} video The video object extracted from the response of `onDemandLectureVideos.v1` API
 * @param {int} moduleNum The numerical sequence of the module (i.e. 01, 02, 03, ...)
 * @param {object} module The module object extracted from the course details (its `name` is used)
 * @param {int} weekNum The numerical sequence of the week (i.e. 01, 02, 03, ...)
 * @param {object} week The week object extracted from the course details (not used here)
 * @throws {string} The download error followed by 'Unable to download video.' when the download fails
 */
async function scrapeVideo(videoNum, video, moduleNum, module, weekNum, week) {
    const sequence = padZero(videoNum);
    const tag = chalk.white('Video');
    log(` ${tag} ${chalk.yellow(`#${sequence} - Downloading 720p lecture video`)}`);

    const url = video.sources.byResolution['720p'].mp4VideoUrl;
    const fileName = `${sequence} - Lecture video (720p).mp4`;
    // Replace characters that are not allowed in directory names on common file systems.
    const moduleName = module.name.replace(/[\/\:*?"<>|+]/g, ' ');
    const weekFolder = `Week ${padZero(weekNum)}`;
    const moduleFolder = `${padZero(moduleNum)} - ${moduleName}`;
    const directory = path.join('.', _cid, weekFolder, moduleFolder);

    const failWithContext = (err) => {
        throw err + '\nUnable to download video.\n';
    };
    const downloader = new Downloader({ url, directory, fileName, cloneFiles: false, timeout: 300000 });
    await downloader.download().catch(failWithContext);

    log(` ${chalk.white('Video')} ${chalk.green(`#${sequence} - Saved '${fileName}'`)}`);
}
/**
 * Scrapes one module. The asset list (`onDemandLectureAssets.v1`) and the video details
 * (`onDemandLectureVideos.v1`) are requested at the same time. Once both have settled, the
 * video (if any) is handed to `scrapeVideo()` and each asset to `scrapeAsset()`, and all of
 * those downloads are awaited together. The video takes sequence number 1 when present; the
 * assets follow.
 *
 * A response error whose message starts with 'Wrong content type for item StoredItem' is
 * treated as "nothing to download" for that request; any other request error is rethrown
 * with context.
 * @param {int} moduleNum The numerical sequence of the module (i.e. 01, 02, 03, ...)
 * @param {object} module The module object extracted from the course details (`id` and `name` are used)
 * @param {int} weekNum The numerical sequence of the week (i.e. 01, 02, 03, ...)
 * @param {object} week The week object extracted from the course details
 */
async function scrapeModule(moduleNum, module, weekNum, week) {
    log(`\n ${chalk.white('Module')} ${chalk.yellow(`#${padZero(moduleNum)} - ${module.name}`)}`);

    // Builds a rejection handler: print `notice` for the tolerated error, otherwise rethrow
    // the error with `failure` appended.
    const tolerateWrongContentType = (notice, failure) => (err) => {
        const message = err && err.response && err.response.data && err.response.data.message;
        if (message && message.startsWith('Wrong content type for item StoredItem')) {
            console.log(notice);
            return;
        }
        throw err + failure;
    };
    const withAuth = () => ({ headers: { Cookie: `CAUTH=${_cauth}` } });

    const assetsRequest = axios
        .get(
            `https://www.coursera.org/api/onDemandLectureAssets.v1/${_courseDetails.courseId}~${module.id}/?includes=openCourseAssets`,
            withAuth()
        )
        .catch(
            tolerateWrongContentType(
                ' Module does not have any downloadable assets.',
                '\nUnable to fetch lecture assets.\n'
            )
        );
    const videosRequest = axios
        .get(
            `https://www.coursera.org/api/onDemandLectureVideos.v1/${_courseDetails.courseId}~${module.id}?includes=video&fields=onDemandVideos.v1(sources%2Csubtitles%2CsubtitlesVtt%2CsubtitlesTxt)`,
            withAuth()
        )
        .catch(
            tolerateWrongContentType(
                ' Module does not have any downloadable videos.',
                '\nUnable to fetch lecture video.\n'
            )
        );

    const [assetsResponse, videosResponse] = await Promise.all([assetsRequest, videosRequest]);

    const downloads = [];
    let sequence = 0;
    // A response is undefined when its request hit the tolerated error above.
    if (videosResponse) {
        const video = videosResponse.data.linked['onDemandVideos.v1'][0];
        sequence += 1;
        downloads.push(scrapeVideo(sequence, video, moduleNum, module, weekNum, week));
    }
    if (assetsResponse) {
        for (const asset of assetsResponse.data.linked['openCourseAssets.v1']) {
            sequence += 1;
            downloads.push(scrapeAsset(sequence, asset, moduleNum, module, weekNum, week));
        }
    }
    await Promise.all(downloads);
}
/**
 * Scrapes one week: walks the items of the week's first `modules` entry in order and runs
 * `scrapeModule()` on each, one at a time, numbering them from 1.
 * @param {int} weekNum The numerical sequence of the week (i.e. 01, 02, 03, ...)
 * @param {object} week The week object extracted from the course details
 */
async function scrapeWeek(weekNum, week) {
    const heading = `${chalk.white('Week')} ${chalk.yellow(`#${padZero(weekNum)}`)}`;
    log(`\n ${heading}`);

    let position = 0;
    for (const item of week.modules[0].items) {
        position += 1;
        await scrapeModule(position, item, weekNum, week);
    }
}
/**
 * Scrapes the whole course: walks `_courseDetails.weeks` in order and runs `scrapeWeek()` on
 * each, one at a time, numbering the weeks from 1.
 */
async function scrapeCourse() {
    const title = `${chalk.white('Course')} '${chalk.yellow(_cid)}'`;
    log(`\n${title}`);

    let position = 0;
    for (const entry of _courseDetails.weeks) {
        position += 1;
        await scrapeWeek(position, entry);
    }
}
/**
 * Main app logic. Prints the banner, then runs these steps in order:
 * 1) Get the course ID and CAUTH value by prompting the user
 * 2) Authenticate and get the user ID via `adminUserPermissions.v1` API
 * 3) Get the course details via `guidedCourseWeekCards.v1` API
 * 4) Scrape the course and download & save all course assets & videos
 *
 * An error thrown by any step stops the remaining steps and is logged in red.
 *
 * All assets and videos are saved in a hierarchical directory structure as follows:
 * <course id>/Week <##>/<## - module name>/<## - asset / video>
 *
 * For example, for the `neural-networks-deep-learning` course, this looks like this:
 *
 * neural-networks-deep-learning
 *   Week 01
 *     01 - Welcome
 *       01 - Lecture video (720p).mp4
 *       02 - Welcome_merged.doc
 *       03 - 3287059-Welcome-extended-description-mixed (1).mp4
 *     02 - What is a Neural Network?
 *       01 - Lecture video (720p).mp4
 *       02 - What is a NN?.pptx
 *       03 - What_is_Neural_Network.pdf
 *   ...
 */
async function main() {
    const banner = figlet.textSync('cscraper', { font: 'Standard', horizontalLayout: 'full' });
    log(chalk.yellow(banner));
    log();

    try {
        const steps = [getCourseAndCauth, getUserId, getCourseDetails, scrapeCourse];
        for (const step of steps) {
            await step();
        }
    } catch (error) {
        log(chalk.red(error));
    }
}
return api;
})();
app.main();