|
| 1 | +request = require 'request' |
| 2 | +jsdom = require 'jsdom' |
| 3 | +Models = require '../models/models' # Schemas Container |
# Schema wrappers used by the crawler, pulled out of the models container.
{subjects: Subjects, meta: Meta, courses: Courses} = Models

# Root URL that getCourseList builds its class-list request from.
baseUrl = 'https://webapp4.asu.edu/catalog'

# Script list handed to jsdom.env so scraped pages can be queried with jQuery.
jQueryUrl = ['https://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js']
| 9 | +class Crawler |
| 10 | + @jsession |
| 11 | + |
# Opens a catalog session and hands the caller a cookie jar that carries it.
#
# Seeds a jar with the `onlineCampusSelection=C` cookie (path `/catalog`),
# requests the catalog root without following redirects, copies the
# JSESSIONID value found in the jar into a fresh cookie, and remembers that
# value on the instance as `@jsession`.
#
# cb - Function invoked with the populated cookie jar.
#
# NOTE(review): the indentation of this listing was lost; `cb` is assumed to
# be reached only on the success path -- confirm against the real file.
getJSession: (cb) ->
  jar = request.jar()

  campusCookie = request.cookie 'onlineCampusSelection=C'
  campusCookie.value = 'C'
  campusCookie.path = '/catalog'
  jar.add campusCookie

  requestOptions =
    url: 'https://webapp4.asu.edu/catalog/'
    jar: jar
    followRedirect: false

  # Fat arrow keeps `@` pointing at the crawler inside the callback.
  request requestOptions, (error, response, body) =>
    if error?
      console.log "Error: #{error}"
      return

    # No early exit: if several JSESSIONID entries exist, the last one wins.
    sessionId = ''
    for jarCookie in jar.cookies when jarCookie.name == 'JSESSIONID'
      sessionId = jarCookie.value

    sessionCookie = request.cookie 'JSESSIONID=' + sessionId
    sessionCookie.value = sessionId
    jar.add sessionCookie
    @jsession = sessionId

    cb jar
| 41 | + |
# Scrapes the catalog's term tooltip and stores the current term id(s) in
# every Meta document.
#
# When the first listed term is not a summer term, its id is used alone.
# Otherwise the first four entries are scanned and every summer term whose
# two-digit year equals the current calendar year is collected.
#
# Nothing is written when the page cannot be fetched or parsed, or when no
# term id is found, so a failed scrape does not wipe the stored value.
#
# Takes no arguments; outcomes are reported through console.log only.
updateCurrentTerm: () ->
  summerTermRegex = /(Summer|Sum)/i

  # First match of `pattern` in a link's href; null when the link has no
  # href or the pattern does not match.
  hrefMatch = (link, pattern) ->
    link.attr('href')?.match(pattern)?[0] ? null

  request
    url: "https://webapp4.asu.edu/catalog/TooltipTerms.ext"
  , (error, response, body) ->
    # Without a response body there is nothing for jsdom to parse.
    if error?
      console.log "Error: #{error}"
      return

    jsdom.env body, jQueryUrl, (errors, window) ->
      unless window?.jQuery?
        console.log "Error: #{errors}"
        return

      $ = window.jQuery
      termList = $('#termList').find('span a')
      currentTerm = []

      if termList.length > 0
        if summerTermRegex.test termList.eq(0).text()
          # Since it is summer, check the first node and the next 3 nodes
          # for additional summer terms.
          currentYr = new Date().getFullYear().toString().match(/\d{2}$/)[0]

          for i in [0..3]
            link = termList.eq(i)
            continue unless summerTermRegex.test link.text()

            summerTerm = hrefMatch link, /\d{4}[A-Za-z]?/
            continue unless summerTerm?

            # Digits 2-3 of the term id are compared with the two-digit year.
            summerTermYr = summerTerm.match(/\d(\d{2})\d[A-Za-z]?/)[1]
            currentTerm.push summerTerm if summerTermYr == currentYr
        else
          # First node is not summer: just use its trailing numeric id.
          termId = hrefMatch termList.eq(0), /\d+?$/
          currentTerm.push termId if termId?

      if currentTerm.length is 0
        console.log "Error: no current term found"
        return

      MetaModel = Meta.model()
      MetaModel.update {}, {currentTerm: currentTerm}, {multi: true}, (err, numAffected) ->
        if err?
          console.log "Error: #{err}"
        else
          console.log "Row(s) affected: #{numAffected}"
| 83 | + |
# Scrapes the catalog's subject index and stores every subject that is not
# already in the Subjects collection.
#
# The Mongo connection is closed once every row has been either skipped,
# saved, or has failed. It is also closed when the page cannot be fetched
# or parsed, or when it lists no subjects.
#
# Takes no arguments; progress and errors are written with console.log.
getSubjects: () ->
  # Closes the shared Mongo connection, then reports it.
  closeConnection = ->
    Models.close()
    console.log "MongoDB connection closed... @ #{new Date()}"

  @getJSession (cookieJar) ->
    request
      url: "https://webapp4.asu.edu/catalog/Subjects.html"
      jar: cookieJar
    , (error, response, body) ->
      if error?
        console.log "Error: #{error}"
        return closeConnection()

      jsdom.env body, jQueryUrl, (errors, window) ->
        unless window?.jQuery?
          console.log "Error: #{errors}"
          return closeConnection()

        $ = window.jQuery
        subjectsNodes = $('#subjectDivs').find('.row')

        pending = subjectsNodes.length
        return closeConnection() if pending is 0

        # Called exactly once per row, from whichever callback ends its work.
        rowDone = -> closeConnection() if --pending is 0

        subjectsNodes.each () ->
          subject = $(this).find('div.subject').text()
          name = $(this).find('div.subjectTitle').text()

          # read-only model instance
          SubjectsModel = Subjects.model()

          # check if subject already exists
          SubjectsModel.findOne
            subject: subject
          , ['subject']
          , (err, doc) ->
            if err?
              console.log "Error: #{err}"
              rowDone()
            else if doc?
              console.log "Skipping... #{doc.subject} @ #{new Date()}"
              rowDone()
            else
              console.log "Creating doc for... #{subject} @ #{new Date()}"

              SubjectsInstance = Subjects.model(true)
              SubjectsInstance.subject = subject
              SubjectsInstance.name = name

              SubjectsInstance.save (err, result) ->
                if err? then console.log "Error: #{err}"
                rowDone()

          # jQuery stops iterating when a callback returns false, so do not
          # leak the query's return value.
          return
| 130 | + |
# Scrapes the class list for one subject and term and records seat
# availability for every listed class in the Courses collection.
#
# Classes already stored get their latest open-seat count appended together
# with an opened/closed timestamp and a status; unseen classes are inserted.
# The Mongo connection is closed once every row has been handled, and also
# when the page cannot be fetched or parsed or when it lists no classes.
#
# subject - Value placed in the `s` query parameter and stored on new docs.
# termID  - Value placed in the `t` query parameter; also prefixed to each
#           stored courseId.
getCourseList: (subject, termID) ->
  # Closes the shared Mongo connection, then reports it.
  closeConnection = ->
    Models.close()
    console.log "MongoDB connection closed... @ #{new Date()}"

  # Trimmed text of whatever `selector` matches beneath `node`.
  textOf = (node, selector) -> node.find(selector).text().trim()

  # Builds the Mongo update for a stored class from its previous and current
  # open-seat counts. Returns null when either count is not a non-negative
  # number, so no meaningless update is sent.
  seatUpdate = (prevOpen, openSeats) ->
    return null unless prevOpen >= 0 and openSeats >= 0

    isOpen = openSeats > 0
    wasOpen = prevOpen > 0
    status = switch
      when isOpen and wasOpen then 'Open'
      when isOpen then 'Just opened'
      when wasOpen then 'Just closed'
      else 'Closed'

    pushed = openSeats: openSeats
    pushed[if isOpen then 'lastOpened' else 'lastClosed'] = new Date()
    {$push: pushed, $set: {status: status}}

  @getJSession (cookieJar) ->
    request
      url: "#{baseUrl}/classlist?s=#{subject}&t=#{termID}&e=all"
      jar: cookieJar
    , (error, response, body) ->
      if error?
        console.log "Error: #{error}"
        return closeConnection()

      jsdom.env body, jQueryUrl, (errors, window) ->
        unless window?.jQuery?
          console.log "Error: #{errors}"
          return closeConnection()

        $ = window.jQuery
        courseNodes = $('#CatalogList > tbody > tr')

        pending = courseNodes.length
        return closeConnection() if pending is 0

        # Called exactly once per row, from whichever callback ends its work.
        rowDone = -> closeConnection() if --pending is 0

        courseNodes.each () ->
          # initial parsed values
          courseNode = $(this)
          classNbr = textOf courseNode, '.classNbrColumnValue a'
          number = textOf(courseNode, '.subjectNumberColumnValue').split(/\s/)[1]
          title = textOf courseNode, '.titleColumnValue a'
          units = textOf courseNode, '.hoursColumnValue'

          # The date cell reads "<start> - <end>"; rows without the
          # separator get an empty end date instead of throwing.
          dateRange = textOf(courseNode, '.startDateColumnValue a').split(/\s-\s/)
          startDate = dateRange[0]
          endDate = (dateRange[1] ? '').replace(/\(C\)/g, '')

          days = textOf courseNode, '.dayListColumnValue'
          startTime = textOf courseNode, '.startTimeDateColumnValue'
          endTime = textOf courseNode, '.endTimeDateColumnValue'
          gstudy = textOf courseNode, '.tooltipRqDesDescrColumnValue .gstip'

          seatCells = courseNode.find('.availableSeatsColumnValue')
          openSeatsText = seatCells.find('table> tr > td:eq(0)').text().trim()
          maxSeats = seatCells.find('table> tr > td:eq(2)').text().trim()
          openSeats = parseInt openSeatsText, 10

          # TODO(review): '.locationBuildingColumnValue' is available on the
          # row but no location field is stored -- confirm that is intended.

          # Prefer the name in the link tooltip ("...|<name>"); fall back to
          # the visible text when there is no tooltip.
          instructors = []
          courseNode.find('.instructorListColumnValue > span > span').each () ->
            tooltip = $(this).find('span > span > a').attr('title')
            if tooltip?
              instructors.push tooltip.split('|')[1]
            else
              instructors.push $(this).text().trim()
            return

          honors = /Honor/i.test title

          # CourseID given in Y{termID}Y{classId} format to help ensure
          # it is unique.
          courseId = "Y#{termID}Y#{classNbr}"

          # read-only
          CoursesModel = Courses.model()

          CoursesModel.findOne
            courseId: courseId
          , ['courseId', 'openSeats']
          , (err, course) ->
            if err?
              console.log "Error: #{err}"
              return rowDone()

            if course?
              prevOpen = parseInt course.openSeats[course.openSeats.length - 1], 10
              update = seatUpdate prevOpen, openSeats

              unless update?
                console.log "Error updating: seat count unavailable for #{courseId}"
                return rowDone()

              CoursesModel.update {courseId: courseId}, update, {}, (err, nAffected) ->
                if err?
                  console.log "Error updating: #{err}"
                else
                  console.log "Updated #{courseId} @ #{new Date()}"
                rowDone()
            else
              console.log "New #{courseId.trim()} @ #{new Date()}"

              CoursesInst = Courses.model(true)
              CoursesInst.courseId = courseId
              CoursesInst.subject = subject
              CoursesInst.number = number
              CoursesInst.title = title
              CoursesInst.units = units
              CoursesInst.startDate = startDate
              CoursesInst.endDate = endDate
              CoursesInst.days = days
              CoursesInst.startTime = startTime
              CoursesInst.endTime = endTime
              CoursesInst.gstudy = gstudy
              CoursesInst.instructor = instructors
              CoursesInst.honors = honors
              CoursesInst.openSeats = [openSeatsText]
              CoursesInst.maxSeats = maxSeats

              if openSeats == 0
                CoursesInst.lastClosed = [new Date()]
                CoursesInst.lastOpened = []
                CoursesInst.status = 'Closed'
              else
                CoursesInst.lastClosed = []
                CoursesInst.lastOpened = [new Date()]
                CoursesInst.status = 'Open'

              CoursesInst.save (err) ->
                if err? then console.log "Error: #{err}"
                rowDone()

          # jQuery stops iterating when a callback returns false, so do not
          # leak the query's return value.
          return
| 290 | + |
# Placeholder for batch processing: it only logs the session id stored on
# the instance by getJSession, followed by a progress message.
#
# batch - Not used yet.
asyncBatch: (batch) ->
  # Double quotes are required here: CoffeeScript does not interpolate
  # #{...} inside single-quoted strings.
  console.log "Using #{@jsession}"
  console.log 'working on the batch...'
| 294 | + |
| 295 | +module.exports = Crawler |
0 commit comments