Skip to content

Commit 8e47976

Browse files
committed
more parsing hardware for 6266
1 parent ba15ca9 commit 8e47976

File tree

1 file changed

+127
-40
lines changed

1 file changed

+127
-40
lines changed

web-server-lib/web-server/http/parse-content-disposition.rkt

+127-40
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
(or/c
1111
(list/c 'parsefail string?)
1212
(list/c bytes?
13-
(listof (list/c bytes? bytes?)))))]))
13+
(listof (list/c bytes? string?)))))]))
1414

1515
(struct parsefail exn ())
1616

@@ -152,6 +152,7 @@
152152
(define SEMI (rx-matcher/const #px#"^;" 'SEMI))
153153
(define EQ (rx-matcher/const #px#"^=" 'EQ))
154154
(define DQ (rx-matcher/const #px#"^\"" 'DQ))
155+
(define SQ (rx-matcher/const #px"^'" 'SQ))
155156

156157
;; a quoted string. a quote followed by any character from 32-255 not
157158
;; including backslash or quote, but optionally a backslash followed
@@ -166,21 +167,29 @@
166167
(postproc (seq DQ (kstar (orparse CLEANCHARSEQ QDESCAPED)) DQ)
167168
(λ (v) (list 'quoted (cadr v)))))
168169
(define TOKEN (rx-matcher/raw #px#"^([-!#-'*-+.0-9A-Z^-z|~]+)"))
169-
(define VALUE (orparse TOKEN QTDSTR))
170+
(define VALUE (postproc (orparse TOKEN QTDSTR) (λ (x) (list 'val x))))
171+
172+
(define ISO-8859-1-TOKEN (rx-matcher/const #px"^[iI][sS][oO]-8859-1" 'iso-8559-1))
173+
(define UTF-8-TOKEN (rx-matcher/const #px"^[uU][tT][fF]-8" 'utf-8))
174+
(define LANG-TOKEN (rx-matcher/const #px"^[-a-zA-Z0-9]*" 'LANG-TAG))
175+
(define PCT-ENCODED
176+
(postproc (rx-matcher/raw #px"^%[0-9a-fA-F][0-9a-fA-F]")
177+
(λ (x) (list 'pct (string->number
178+
(bytes->string/utf-8 (subbytes x 1 3))
179+
16)))))
180+
(define ATTR-CHARS (rx-matcher/raw #px"^[-A-Za-z0-9!#$&+.^_`|~]+"))
181+
(define EXT-VALUE-CHARS (kstar (orparse PCT-ENCODED ATTR-CHARS)))
182+
(define EXT-VALUE
183+
(postproc (seq (orparse ISO-8859-1-TOKEN UTF-8-TOKEN)
184+
SQ LANG-TOKEN SQ EXT-VALUE-CHARS)
185+
(λ (x) (list 'extval x))))
186+
187+
170188

171189
;; give up if we see a token ending with a star; these signal
172190
;; RFC5987 ext-values, and we don't handle them correctly.
173191
(define CLAUSE
174-
(postproc
175-
(seq/ws TOKEN EQ VALUE)
176-
(λ (v)
177-
(when (regexp-match #px#"\\*$" (car v))
178-
(raise
179-
(parsefail
180-
(format "token ending with * indicates unsupported ext-value: ~e"
181-
(car v))
182-
(current-continuation-marks))))
183-
v)))
192+
(seq/ws TOKEN EQ (orparse VALUE EXT-VALUE)))
184193

185194
(define content-disposition-parser
186195
(seq/ws TOKEN (kstar (seq/ws SEMI CLAUSE))))
@@ -200,7 +209,7 @@
200209
(list ty (for/list ([c (in-list clauses)])
201210
(match c
202211
[(list 'SEMI (list tok 'EQ val))
203-
(list tok (val-cleanup val))]
212+
(clause-postproc tok val)]
204213
[other (error
205214
'parse-content-disposition-header
206215
"internal error, unexpected parse shape: ~e"
@@ -217,16 +226,62 @@
217226
"no RFC5987 ext-values, got: ~e")
218227
rhs))])))
219228

229+
;; clean up a clause by undoing escaping and joining strings
230+
(define (clause-postproc token val)
231+
(define token-ends-with-star?
232+
(regexp-match? #px"\\*$" token))
233+
(define cleaned-val
234+
(match val
235+
[(list 'extval v)
236+
(cond [token-ends-with-star? (extval-cleanup v)]
237+
[else
238+
(raise
239+
(parsefail
240+
"illegal extended value attached to non-asterisk token: ~e"
241+
token))])]
242+
[(list 'val v) (val-cleanup v)]))
243+
(list token cleaned-val))
244+
220245
;; clean up a quoted string by removing the quotes and undoing escaping
221246
(define (val-cleanup val)
222247
(match val
223248
[(? bytes? b) b]
224249
[(list 'quoted l)
225-
(apply bytes-append (for/list ([chunk (in-list l)])
226-
(match chunk
227-
[(? bytes? b) b]
228-
[(list 'escaped eseq)
229-
(subbytes eseq 1 2)])))]))
250+
;; quoted strings are supposed to be interpreted using
251+
;; iso-8859-1, often known as latin-1.
252+
;;
253+
;; Here's a frightening passage from RFC2612, concerning the
254+
;; definition of TEXT, the stuff in between the quotes:
255+
#|Words
256+
of *TEXT MAY contain characters from character sets other than ISO-
257+
8859-1 [22] only when encoded according to the rules of RFC 2047
258+
[14].|#
259+
;; ... which leaves open the possibility that interpreting these
260+
;; strictly as ISO-8859-1 strings may be incorrect. However, given
261+
;; the existence of ext-values, I think that no provider would
262+
;; use this mechanims. Famous last words. Lemme ask.
263+
(bytes->string/latin-1
264+
(apply bytes-append
265+
(for/list ([chunk (in-list l)])
266+
(match chunk
267+
[(? bytes? b) b]
268+
[(list 'escaped eseq)
269+
(subbytes eseq 1 2)]))))]))
270+
271+
;; clean up an extval by unescaping pct-encoded strings
272+
(define (extval-cleanup extval)
273+
(match extval
274+
[(list encoding _ _ _ pieces)
275+
(define unencoder
276+
(match encoding
277+
['utf-8 bytes->string/utf-8]
278+
['iso-8559-1 bytes->string/latin-1]))
279+
(define bstrs
280+
(for/list ([p (in-list pieces)])
281+
(match p
282+
[(list 'pct n) (bytes n)]
283+
[other other])))
284+
(unencoder (apply bytes-append bstrs))]))
230285

231286
(module+ test
232287
(require rackunit)
@@ -241,19 +296,36 @@
241296
(escaped #"\\\"")
242297
#"def"))
243298
#""))
299+
300+
;; move down later
301+
(check-equal? (EXT-VALUE #"UTF-8'en-li-SS'abcd")
302+
'((extval (utf-8 SQ LANG-TAG SQ (#"abcd"))) #""))
303+
(check-equal? (EXT-VALUE #"UTF-8'en-li-SS'abcd%20%5c")
304+
'((extval
305+
(utf-8 SQ LANG-TAG SQ (#"abcd" (pct #x20) (pct #x5c))))
306+
#""))
307+
308+
244309

245-
(check-equal?
246-
(parse-content-disposition-header
247-
#" form-data ;name=\"abcz\"; filename=\"abc\\\"d\"")
248-
'(#"form-data"
249-
((#"name" #"abcz")
250-
(#"filename" #"abc\"d"))))
310+
(check-equal?
311+
(parse-content-disposition-header
312+
#" form-data ;name=\"abcz\"; filename=\"abc\\\"d\"")
313+
'(#"form-data"
314+
((#"name" "abcz")
315+
(#"filename" "abc\"d"))))
316+
317+
;; try a high latin-1 character:
318+
(check-equal?
319+
(parse-content-disposition-header
320+
#" form-data;filename=\"ab\330cd\"")
321+
'(#"form-data"
322+
((#"filename" "abØcd"))))
251323

252324
(check-equal?
253325
(parse-content-disposition-header
254326
#" attachment; filename=\"\\\\foo.html\"\n")
255327
'(#"attachment"
256-
((#"filename" #"\\foo.html"))))
328+
((#"filename" "\\foo.html"))))
257329

258330
(check-equal? (TOKEN #"form-data ;")
259331
(list #"form-data" #" ;"))
@@ -280,11 +352,11 @@
280352
#" form-data ;name=\"abcz\"; filename=\"abc\\\"d\"\r
281353
; zokbar=abc24")
282354
(list `(#"form-data"
283-
((SEMI (#"name" EQ (quoted (#"abcz"))))
284-
(SEMI (#"filename" EQ (quoted (#"abc"
285-
(escaped #"\\\"")
286-
#"d"))))
287-
(SEMI (#"zokbar" EQ #"abc24"))))
355+
((SEMI (#"name" EQ (val (quoted (#"abcz")))))
356+
(SEMI (#"filename" EQ (val (quoted (#"abc"
357+
(escaped #"\\\"")
358+
#"d")))))
359+
(SEMI (#"zokbar" EQ (val #"abc24")))))
288360
#""))
289361

290362
(check-equal? (QTDSTR #"\"filename=\"")
@@ -296,31 +368,33 @@
296368
(content-disposition-parser
297369
#"form-data; name=\"filename=\"; zokbar=\"dingo\"; filename=\"wallaby\"")
298370
(list `(#"form-data"
299-
((SEMI (#"name" EQ (quoted (#"filename="))))
300-
(SEMI (#"zokbar" EQ (quoted (#"dingo"))))
301-
(SEMI (#"filename" EQ (quoted (#"wallaby"))))))
371+
((SEMI (#"name" EQ (val (quoted (#"filename=")))))
372+
(SEMI (#"zokbar" EQ (val (quoted (#"dingo")))))
373+
(SEMI (#"filename" EQ (val (quoted (#"wallaby")))))))
302374
#""))
303375

304376
(check-equal?
305377
(content-disposition-parser
306378
#" form-data; name=\"filename=\"; zokbar=\"dingo\"; filename=\"wallaby\"")
307379
(list `(#"form-data"
308-
((SEMI (#"name" EQ (quoted (#"filename="))))
309-
(SEMI (#"zokbar" EQ (quoted (#"dingo"))))
310-
(SEMI (#"filename" EQ (quoted (#"wallaby"))))))
380+
((SEMI (#"name" EQ (val (quoted (#"filename=")))))
381+
(SEMI (#"zokbar" EQ (val (quoted (#"dingo")))))
382+
(SEMI (#"filename" EQ (val (quoted (#"wallaby")))))))
311383
#""))
312384

313385
(check-match
314386
(parse-content-disposition-header
315387
#"form-data; name=\"filen\"ame=\"; zokbar=\"dingo\"; filename=\"wallaby\"")
316388
(list 'parsefail (regexp #px"expected: byte string matching RFC6266")))
317389

318-
(check-match
390+
(check-equal?
319391
(parse-content-disposition-header
320-
#"form-data; name=\"filename=\"; zokbar*=\"dingo\"; filename=\"wallaby\"")
321-
(list 'parsefail (regexp #px"token ending with *")))
392+
#" attachment; filename=\"foo-ae.html\"; filename*=UTF-8''foo-%c3%a4.html\n")
393+
'(#"attachment" (#"filename" "foo-ae.html")
394+
(#"filename*" "foo-ä.html")))
322395

323396
)
397+
324398

325399
;; this code was used to generate the regexp for tokens. In principle,
326400
;; you shouldn't need this code unless you need to re-generate this
@@ -401,4 +475,17 @@
401475
ch)]
402476
[else
403477
(check-pred (λ (ch) (regexp-match? token-regexp-bstr (string ch)))
404-
ch)])))
478+
ch)]))
479+
480+
481+
#|attr-char = ALPHA / DIGIT
482+
/ "!" / "#" / "$" / "&" / "+" / "-" / "."
483+
/ "^" / "_" / "`" / "|" / "~"
484+
; token except ( "*" / "'" / "%" )
485+
|#
486+
487+
)
488+
489+
490+
491+

0 commit comments

Comments
 (0)