Cache rewrite (nytimes#92)
* remove redirect from cache logic (nytimes#91)

* Fix README language

* remove cache url redirect

* update tests

* remove instead of commenting out

* update to remove redirectUrl

* First pass: move byline processing to formatter

* Move byline tests to htmlProcessing, stub formatter for tests

* First pass: cache only document body, authors, sections

* Switch to docId for cache key

* Remove unnecessary move logic, may scrap all of move

* Remove much of purge logic, quiet reading history errors

* Remove time logic from cache purge

* some test reworking

* rework cache tests to run through new cache handling

* Playlist fixes

* Remove file move button, add notes

* Fix html processing tests

* Remove unnecessary elseif stmt, quieter logs

* Rewrite redirect logic, remove file moving (nytimes#93)

* Add redirect logic

* Remove move-file, test/lint fixes

* Unit test cache redirect

* New redirect middleware, serve at end of middleware chain

* Add check to not set redirects on new file additions

Co-authored-by: Shawn Gao <[email protected]>
Co-authored-by: Isaac White <[email protected]>
3 people authored Feb 9, 2020
1 parent a4f9e42 commit 668f734
Showing 23 changed files with 311 additions and 699 deletions.
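Taken together, the diffs below replace URL-keyed cache entries that stored rendered pages and redirect URLs with docId-keyed entries holding only the processed document attributes. A rough sketch of the entry shape implied by the changes in server/cache.js and server/docs.js — the document id and field values here are made up for illustration:

// Hypothetical cache entry after this rewrite, keyed by the Drive docId rather than the URL.
const entry = {
  id: 'example-drive-doc-id',               // hypothetical document id, used as the cache key
  modified: '2020-02-09T00:00:00.000Z',     // modifiedTime of the doc's original revision
  content: {                                // only processed attributes are cached
    html: '<h1>Example</h1>',               // processed document body
    byline: 'Example Author',               // byline pulled out by the formatter
    createdBy: 'Example Creator',           // document creator
    sections: []                            // nested h1/h2 section list for navigation
  }
}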
2 changes: 1 addition & 1 deletion .eslintrc.json
@@ -13,7 +13,6 @@
"afterEach": true
},
"rules": {
"arrow-parens": 0,
"react/prop-types": 0,
"no-var": ["error"],
"arrow-parens": ["error", "always"],
@@ -22,6 +21,7 @@
"anonymous": "always",
"named": "never"
}],
"object-curly-spacing": ["error", "never"],
"prefer-const": ["error"]
}
}
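For reference, the duplicate numeric arrow-parens entry is dropped because the ["error", "always"] form below it already covers it, and the new object-curly-spacing rule forbids padding inside braces. A quick illustration of code that satisfies the updated rules (example only, not part of the diff):

// Passes the updated config: parenthesized arrow arguments, no spaces inside braces.
const pick = (data) => {
  const {html, modified} = data || {}
  return {html, modified}
}
// Would now be flagged: `data => …` (missing parens) and `const { html } = data` (spaces inside braces).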
5 changes: 0 additions & 5 deletions config/strings.yaml
@@ -67,11 +67,6 @@ playlist:
previous: <
next: '>'

# pages/move-file
moveFile:
title: !!js/function (title) => `Move ${title}`
prompt: !!js/function (title) => `Choose a folder to move '${title}' to`

# errors
error:
403:
18 changes: 0 additions & 18 deletions layouts/pages/move-file.ejs

This file was deleted.

1 change: 0 additions & 1 deletion layouts/partials/footer.ejs
@@ -7,7 +7,6 @@
<% } %>
<% if (locals.editLink) { %>
<a class="button btn-footer" href="<%- editLink %>" id="edit-button" target="_blank"><%- template('footer.buttons.edit') %></a>
<a class="button btn-footer move-file-button" href="/move-file?id=<%- id %>"><%- template('footer.buttons.move') %></a>
<% } %>
<% if (locals.parentId) { %>
<a class="button btn-footer" href="https://docs.google.com/document/u/0/create?usp=drive_web&folder=<%- parentId %>" %> target="_blank"><%- template('footer.buttons.create') %></a>
108 changes: 29 additions & 79 deletions server/cache.js
@@ -5,6 +5,7 @@ const middlewareRouter = require('express-promise-router')()

const log = require('./logger')
const {requireWithFallback} = require('./utils')
const {parseUrl} = require('./urlParser')

const cache = requireWithFallback('cache/store')

@@ -13,15 +14,18 @@ const noCacheDelay = parseInt(process.env.EDIT_CACHE_DELAY, 10) || 60 * 60

exports.get = cache.get // expose the ability to retrieve cache data internally

// detects purge requests and serves cached responses when available
// detects purge requests in the query string
middlewareRouter.use(async (req, res) => {
// handle the purge request if purge or edit params are present
const {purge, edit, ignore} = req.query

if (purge || edit) {
const {email} = edit ? req.userInfo : {}
const overrides = ignore ? ignore.split(',') : null
const {meta} = await parseUrl(req.path)

return purgeCache({
url: req.path,
id: meta.id,
editEmail: email,
ignore: overrides
}).then(() => res.end('OK')).catch((err) => {
@@ -30,117 +34,63 @@
})
}

// otherwise consult cache for stored html
const data = await cache.get(req.path)

const {html, redirectUrl, id} = data || {}
if (redirectUrl) return res.redirect(redirectUrl)

// if no html was returned proceed to next middleware
if (!html) return 'next'

// attach doc id to the request for reading history tracking
res.locals.docId = id
log.info(`CACHE HIT ${req.path}.`)
res.end(html)
return 'next'
})

exports.middleware = middlewareRouter

exports.add = async (id, newModified, path, html) => {
exports.add = async (id, newModified, content) => {
if (!newModified) throw new Error('Refusing to store new item without modified time.')

const data = await cache.get(path)
const {modified, noCache, html: oldHtml} = data || {}
const data = await cache.get(id)
const {modified, noCache, content: oldContent} = data || {}
// don't store any items over noCache entries
if (noCache) return // refuse to cache any items that are being edited
// if there was previous data and it is not older than the new data, don't do anything
if (oldHtml && modified && !isNewer(modified, newModified)) return // nothing to do if data is current
if (oldContent && modified && !isNewer(modified, newModified)) return // nothing to do if data is current
// store new data in the cache
return cache.set(path, {html, modified: newModified, id})
}

// redirects when a url changes
// should we expose a cb here for testing?
exports.redirect = async (path, newPath, modified) => {
const data = await cache.get(path)
const {noCache, redirectUrl} = data || {}

// since we run multiple pods, we don't need to set the redirect more than once
if (redirectUrl === newPath) throw new Error('Already configured that redirect')

log.info(`ADDING REDIRECT: ${path} => ${newPath}`)

await cache.set(path, {redirectUrl: newPath}).catch((err) => {
if (err) log.warn(`Failed setting redirect for ${path} => ${newPath}`, err)
return err
})

const preventCacheReason = noCache ? 'redirect_detected' : null
return purgeCache({
url: newPath,
modified,
editEmail: preventCacheReason,
ignore: ['redirect', 'missing', 'modified']
}).catch((err) => {
if (err && err.message !== 'Not found') log.warn(`Failed purging redirect destination ${newPath}`, err)
throw err
})
return cache.set(id, {content, modified: newModified, id})
}

// expose the purgeCache method externally so that list can call while building tree
exports.purge = purgeCache

async function purgeCache({url, modified, editEmail, ignore}) {
async function purgeCache({id, modified, editEmail, ignore}) {
modified = modified || moment().subtract(1, 'hour').utc().format('YYYY-MM-DDTHH:mm:ss.SSS[Z]')

const overrides = ignore && Array.isArray(ignore) ? new Set(ignore) : new Set().add(ignore)
const shouldIgnore = (type) => overrides.has(type) || overrides.has('all') || overrides.has('1')

if (!url) throw new Error(`Can't purge cache without url! Given url was ${url}`)
if (!id) throw new Error(`Can't purge cache without a document id! Given id was ${id}`)

const data = await cache.get(url)
// compare current cache entry data vs this request
const {redirectUrl, noCache, html, modified: oldModified, purgeId: lastPurgeId} = data || {}
const data = await cache.get(id)

if (redirectUrl && !shouldIgnore('redirect')) throw new Error('Unauthorized')
// edit is considered its own override for everything but redirect
if (!data) return // if no currently cached item, don't attempt to purge

// compare current cache entry data vs this request
const {noCache, content, purgeId: lastPurgeId} = data || {}
const html = content && content.html

// FIXME: this should be more robust
if (editEmail && editEmail.includes('@')) {
log.info(`CACHE PURGE PERSIST for ${noCacheDelay}s (${editEmail}): ${url}`)
return cache.set(url, {noCache: true}, {ttl: noCacheDelay})
log.info(`CACHE PURGE PERSIST for ${noCacheDelay}s (${editEmail}): ${id}`)
return cache.set(id, {noCache: true}, {ttl: noCacheDelay})
}

const purgeId = `${modified}-${editEmail || ''}-${ignore}`

// if attempting to purge /trash but nothing has changed, skip.
if (purgeId === lastPurgeId && url === '/trash') return

// const isTrashed = url.split('/')[1] === 'trash'
// try and dedupe extra requests from multiple pods (tidier logs)
if (purgeId === lastPurgeId && !shouldIgnore('all')) throw new Error(`Same purge id as previous request ${purgeId} for ${url}`)
if (purgeId === lastPurgeId && !shouldIgnore('all')) throw new Error(`Same purge id as previous request ${purgeId} for docId ${id}`)

// by default, don't try to purge empty
if (!html && !shouldIgnore('missing')) throw new Error('Not found')

// by default, don't purge a noCache entry
if (noCache && !shouldIgnore('editing')) throw new Error('Unauthorized')
// by default, don't purge when the modification time is not fresher than previous
if (!isNewer(oldModified, modified) && !shouldIgnore('modified')) throw new Error(`No purge of fresh content for ${url}`)

// if we passed all the checks, determine all ancestor links and purge
const segments = url.split('/').map((segment, i, segments) => {
return segments.slice(0, i).concat([segment]).join('/')
}).filter((s) => s.length) // don't allow purging empty string

// call the callback when all segments have been purged
return Promise.all(
segments.map((path) => {
log.info(`CACHE PURGE ${path} FROM CHANGE AT ${url}`)
// there is an edge here where a homepage upstream was being edited and already not in cache.
// we need to get the cache entries for all of these in case and not purge them to account for that edge
cache.set(path, {modified, purgeId})
})
)

// if all checks pass, purge
log.info(`CACHE PURGE ${id}`)
cache.set(id, {modified, purgeId})
}

function isNewer(oldModified, newModified) {
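To summarize the reworked surface of server/cache.js above: entries are keyed by document id, exports.add stores only a content payload plus its modified time, and purges are driven by id rather than by URL path segments. A minimal usage sketch under those assumptions — the id, timestamp, and payload are placeholders, and error handling is omitted:

const cache = require('./cache')

async function refreshDoc(docId, modifiedTime, attributes) {
  // Store the processed attributes under the docId; cache.add skips stale
  // data and noCache entries on its own.
  await cache.add(docId, modifiedTime, attributes)

  // Reads go straight to the underlying store via the re-exported getter.
  const entry = await cache.get(docId)

  // Purges are id-based as well; the HTTP route only supplies ?purge=1 or ?edit=1
  // and the middleware resolves the id through parseUrl before calling this.
  // The freshness check is skipped here purely for the example.
  await cache.purge({id: docId, ignore: ['modified']})

  return entry
}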
88 changes: 18 additions & 70 deletions server/docs.js
@@ -5,11 +5,10 @@ const cheerio = require('cheerio')
const slugify = require('slugify')
const xlsx = require('xlsx')

const {getAuth} = require('./auth')
const log = require('./logger')
const {stringTemplate} = require('./utils')

const cache = require('./cache')
const formatter = require('./formatter')
const log = require('./logger')
const {getAuth} = require('./auth')

const supportedTypes = new Set(['document', 'spreadsheet', 'text/html'])

@@ -31,41 +30,23 @@ exports.slugify = (text = '') => {
}

exports.fetchDoc = async (id, resourceType, req) => {
const data = await cache.get(id)
if (data && data.content) {
log.info(`CACHE HIT ${req.path}`)
return data.content
}

const auth = await getAuth()

const [html, originalRevision] = await fetch({id, resourceType, req}, auth)
const processedHtml = formatter.getProcessedHtml(html)
const sections = getSections(html)
// maybe we should pull out headers here
return {html: processedHtml, originalRevision, sections, template: stringTemplate}
}
const driveDoc = await fetch({id, resourceType, req}, auth)
const originalRevision = driveDoc[1]

exports.fetchByline = (html, creatorOfDoc) => {
let byline = creatorOfDoc
const $ = cheerio.load(html)

// Iterates through all p tags to find byline
$('p').each((index, p) => {
// don't search any empty p tags
if (p.children.length < 1) return

// regex that checks for byline
const r = /^by.+[^.\n]$/mig
if (r.test(p.children[0].data)) {
byline = p.children[0].data
// Removes the word "By"
byline = byline.slice(3)
$(p).remove()
}

// only check the first p tag
return false
})
const {html, byline, createdBy, sections} = formatter.getProcessedDocAttributes(driveDoc)
const payload = {html, byline, createdBy, sections}

return {
byline,
html: $.html()
}
// cache only information from document body
cache.add(id, originalRevision.data.modifiedTime, payload)
return payload
}

async function fetchHTMLForId(id, resourceType, req, drive) {
Expand Down Expand Up @@ -94,15 +75,15 @@ async function fetchOriginalRevisions(id, resourceType, req, drive) {

if (!revisionSupported.has(resourceType)) {
log.info(`Revision data not supported for ${resourceType}:${id}`)
return {data: { lastModifyingUser: {} }} // return mock/empty revision object
return {data: {lastModifyingUser: {}}} // return mock/empty revision object
}
return drive.revisions.get({
fileId: id,
revisionId: '1',
fields: '*'
}).catch((err) => {
log.warn(`Failed retrieving revision data for ${resourceType}:${id}. Error was:`, err)
return {data: { lastModifyingUser: {} }} // return mock/empty revision object
return {data: {lastModifyingUser: {}}} // return mock/empty revision object
})
}

Expand Down Expand Up @@ -169,36 +150,3 @@ async function fetchHTML(drive, id) {
})
return data
}

function getSections(html) {
const $ = cheerio.load(html)
const headers = ['h1', 'h2']
.map((h) => `body ${h}`)
.join(', ')

const ordered = $(headers).map((i, el) => {
const tag = el.name
const $el = $(el)
const name = $el.text()
const url = `#${$el.attr('id')}`
return {
name,
url,
level: parseInt(tag.slice(-1), 10)
}
}).toArray()

// take our ordered sections and turn them into appropriately nested headings
const nested = ordered.reduce((memo, heading) => {
const tail = memo.slice(-1)[0]
const extended = Object.assign({}, heading, {subsections: []})
if (!tail || heading.level <= tail.level) {
return memo.concat(extended)
}

tail.subsections.push(heading)
return memo
}, [])

return nested
}
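fetchDoc is now the only place in docs.js that touches the cache: it returns the cached content when present, otherwise fetches the Drive document, hands the raw HTML and original revision to formatter.getProcessedDocAttributes (which, per the commit message, has absorbed the byline and section extraction removed above), caches the result keyed by id, and returns it. A sketch of a caller under the new contract — the route path, view name, and id resolution are invented for illustration:

const promiseRouter = require('express-promise-router')
const {fetchDoc} = require('./docs')

const router = promiseRouter()

// Hypothetical route: resolve a doc id elsewhere, then render the processed payload.
router.get('/:docId', async (req, res) => {
  const {html, byline, createdBy, sections} = await fetchDoc(req.params.docId, 'document', req)
  res.render('pages/doc', {html, byline, createdBy, sections}) // view name is illustrative
})

module.exports = router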