10
10
(or/c
11
11
(list/c 'parsefail string?)
12
12
(list/c bytes?
13
- (listof (list/c bytes? bytes ?)))))]))
13
+ (listof (list/c bytes? string ?)))))]))
14
14
15
15
(struct parsefail exn ())
16
16
152
152
(define SEMI (rx-matcher/const #px#"^; " 'SEMI ))
153
153
(define EQ (rx-matcher/const #px#"^= " 'EQ ))
154
154
(define DQ (rx-matcher/const #px#"^\" " 'DQ ))
155
+ (define SQ (rx-matcher/const #px"^' " 'SQ ))
155
156
156
157
;; a quoted string. a quote followed by any character from 32-255 not
157
158
;; including backslash or quote, but optionally a backslash followed
166
167
(postproc (seq DQ (kstar (orparse CLEANCHARSEQ QDESCAPED)) DQ)
167
168
(λ (v) (list 'quoted (cadr v)))))
168
169
(define TOKEN (rx-matcher/raw #px#"^([-!#-'*-+.0-9A-Z^-z|~]+) " ))
169
- (define VALUE (orparse TOKEN QTDSTR))
170
+ (define VALUE (postproc (orparse TOKEN QTDSTR) (λ (x) (list 'val x))))
171
+
172
+ (define ISO-8859-1-TOKEN (rx-matcher/const #px"^[iI][sS][oO]-8859-1 " 'iso-8559-1 ))
173
+ (define UTF-8-TOKEN (rx-matcher/const #px"^[uU][tT][fF]-8 " 'utf-8 ))
174
+ (define LANG-TOKEN (rx-matcher/const #px"^[-a-zA-Z0-9]* " 'LANG-TAG ))
175
+ (define PCT-ENCODED
176
+ (postproc (rx-matcher/raw #px"^%[0-9a-fA-F][0-9a-fA-F] " )
177
+ (λ (x) (list 'pct (string->number
178
+ (bytes->string/utf-8 (subbytes x 1 3 ))
179
+ 16 )))))
180
+ (define ATTR-CHARS (rx-matcher/raw #px"^[-A-Za-z0-9!#$&+.^_`|~]+ " ))
181
+ (define EXT-VALUE-CHARS (kstar (orparse PCT-ENCODED ATTR-CHARS)))
182
+ (define EXT-VALUE
183
+ (postproc (seq (orparse ISO-8859-1-TOKEN UTF-8-TOKEN)
184
+ SQ LANG-TOKEN SQ EXT-VALUE-CHARS)
185
+ (λ (x) (list 'extval x))))
186
+
187
+
170
188
171
189
;; give up if we see a token ending with a star; these signal
172
190
;; RFC5987 ext-values, and we don't handle them correctly.
173
191
(define CLAUSE
174
- (postproc
175
- (seq/ws TOKEN EQ VALUE)
176
- (λ (v)
177
- (when (regexp-match #px#"\\*$ " (car v))
178
- (raise
179
- (parsefail
180
- (format "token ending with * indicates unsupported ext-value: ~e "
181
- (car v))
182
- (current-continuation-marks))))
183
- v)))
192
+ (seq/ws TOKEN EQ (orparse VALUE EXT-VALUE)))
184
193
185
194
(define content-disposition-parser
186
195
(seq/ws TOKEN (kstar (seq/ws SEMI CLAUSE))))
200
209
(list ty (for/list ([c (in-list clauses)])
201
210
(match c
202
211
[(list 'SEMI (list tok 'EQ val))
203
- (list tok ( val-cleanup val) )]
212
+ (clause-postproc tok val)]
204
213
[other (error
205
214
'parse-content-disposition-header
206
215
"internal error, unexpected parse shape: ~e "
217
226
"no RFC5987 ext-values, got: ~e " )
218
227
rhs))])))
219
228
229
+ ;; clean up a clause by undoing escaping and joining strings
230
+ (define (clause-postproc token val)
231
+ (define token-ends-with-star?
232
+ (regexp-match? #px"\\*$ " token))
233
+ (define cleaned-val
234
+ (match val
235
+ [(list 'extval v)
236
+ (cond [token-ends-with-star? (extval-cleanup v)]
237
+ [else
238
+ (raise
239
+ (parsefail
240
+ "illegal extended value attached to non-asterisk token: ~e "
241
+ token))])]
242
+ [(list 'val v) (val-cleanup v)]))
243
+ (list token cleaned-val))
244
+
220
245
;; clean up a quoted string by removing the quotes and undoing escaping
221
246
(define (val-cleanup val)
222
247
(match val
223
248
[(? bytes? b) b]
224
249
[(list 'quoted l)
225
- (apply bytes-append (for/list ([chunk (in-list l)])
226
- (match chunk
227
- [(? bytes? b) b]
228
- [(list 'escaped eseq)
229
- (subbytes eseq 1 2 )])))]))
250
+ ;; quoted strings are supposed to be interpreted using
251
+ ;; iso-8859-1, often known as latin-1.
252
+ ;;
253
+ ;; Here's a frightening passage from RFC2612, concerning the
254
+ ;; definition of TEXT, the stuff in between the quotes:
255
+ #|Words
256
+ of *TEXT MAY contain characters from character sets other than ISO-
257
+ 8859-1 [22] only when encoded according to the rules of RFC 2047
258
+ [14]. |#
259
+ ;; ... which leaves open the possibility that interpreting these
260
+ ;; strictly as ISO-8859-1 strings may be incorrect. However, given
261
+ ;; the existence of ext-values, I think that no provider would
262
+ ;; use this mechanims. Famous last words. Lemme ask.
263
+ (bytes->string/latin-1
264
+ (apply bytes-append
265
+ (for/list ([chunk (in-list l)])
266
+ (match chunk
267
+ [(? bytes? b) b]
268
+ [(list 'escaped eseq)
269
+ (subbytes eseq 1 2 )]))))]))
270
+
271
+ ;; clean up an extval by unescaping pct-encoded strings
272
+ (define (extval-cleanup extval)
273
+ (match extval
274
+ [(list encoding _ _ _ pieces)
275
+ (define unencoder
276
+ (match encoding
277
+ ['utf-8 bytes->string/utf-8]
278
+ ['iso-8559-1 bytes->string/latin-1]))
279
+ (define bstrs
280
+ (for/list ([p (in-list pieces)])
281
+ (match p
282
+ [(list 'pct n) (bytes n)]
283
+ [other other])))
284
+ (unencoder (apply bytes-append bstrs))]))
230
285
231
286
(module+ test
232
287
(require rackunit)
241
296
(escaped #"\\\" " )
242
297
#"def " ))
243
298
#"" ))
299
+
300
+ ;; move down later
301
+ (check-equal? (EXT-VALUE #"UTF-8'en-li-SS'abcd " )
302
+ '((extval (utf-8 SQ LANG-TAG SQ (#"abcd " ))) #"" ))
303
+ (check-equal? (EXT-VALUE #"UTF-8'en-li-SS'abcd%20%5c " )
304
+ '((extval
305
+ (utf-8 SQ LANG-TAG SQ (#"abcd " (pct #x20 ) (pct #x5c ))))
306
+ #"" ))
307
+
308
+
244
309
245
- (check-equal?
246
- (parse-content-disposition-header
247
- #" form-data ;name=\"abcz\"; filename=\"abc\\\"d\" " )
248
- '(#"form-data "
249
- ((#"name " #"abcz " )
250
- (#"filename " #"abc\"d " ))))
310
+ (check-equal?
311
+ (parse-content-disposition-header
312
+ #" form-data ;name=\"abcz\"; filename=\"abc\\\"d\" " )
313
+ '(#"form-data "
314
+ ((#"name " "abcz " )
315
+ (#"filename " "abc\"d " ))))
316
+
317
+ ;; try a high latin-1 character:
318
+ (check-equal?
319
+ (parse-content-disposition-header
320
+ #" form-data;filename=\"ab\330cd\" " )
321
+ '(#"form-data "
322
+ ((#"filename " "abØcd " ))))
251
323
252
324
(check-equal?
253
325
(parse-content-disposition-header
254
326
#" attachment; filename=\"\\\\foo.html\"\n " )
255
327
'(#"attachment "
256
- ((#"filename " # "\\foo.html " ))))
328
+ ((#"filename " "\\foo.html " ))))
257
329
258
330
(check-equal? (TOKEN #"form-data ; " )
259
331
(list #"form-data " #" ; " ))
280
352
#" form-data ;name=\"abcz\"; filename=\"abc\\\"d\"\r
281
353
; zokbar=abc24 " )
282
354
(list `(#"form-data "
283
- ((SEMI (#"name " EQ (quoted (#"abcz " ))))
284
- (SEMI (#"filename " EQ (quoted (#"abc "
285
- (escaped #"\\\" " )
286
- #"d " ))))
287
- (SEMI (#"zokbar " EQ #"abc24 " ))))
355
+ ((SEMI (#"name " EQ (val ( quoted (#"abcz " ) ))))
356
+ (SEMI (#"filename " EQ (val ( quoted (#"abc "
357
+ (escaped #"\\\" " )
358
+ #"d " ) ))))
359
+ (SEMI (#"zokbar " EQ (val #"abc24 " ) ))))
288
360
#"" ))
289
361
290
362
(check-equal? (QTDSTR #"\"filename=\" " )
296
368
(content-disposition-parser
297
369
#"form-data; name=\"filename=\"; zokbar=\"dingo\"; filename=\"wallaby\" " )
298
370
(list `(#"form-data "
299
- ((SEMI (#"name " EQ (quoted (#"filename= " ))))
300
- (SEMI (#"zokbar " EQ (quoted (#"dingo " ))))
301
- (SEMI (#"filename " EQ (quoted (#"wallaby " ))))))
371
+ ((SEMI (#"name " EQ (val ( quoted (#"filename= " ) ))))
372
+ (SEMI (#"zokbar " EQ (val ( quoted (#"dingo " ) ))))
373
+ (SEMI (#"filename " EQ (val ( quoted (#"wallaby " ) ))))))
302
374
#"" ))
303
375
304
376
(check-equal?
305
377
(content-disposition-parser
306
378
#" form-data; name=\"filename=\"; zokbar=\"dingo\"; filename=\"wallaby\" " )
307
379
(list `(#"form-data "
308
- ((SEMI (#"name " EQ (quoted (#"filename= " ))))
309
- (SEMI (#"zokbar " EQ (quoted (#"dingo " ))))
310
- (SEMI (#"filename " EQ (quoted (#"wallaby " ))))))
380
+ ((SEMI (#"name " EQ (val ( quoted (#"filename= " ) ))))
381
+ (SEMI (#"zokbar " EQ (val ( quoted (#"dingo " ) ))))
382
+ (SEMI (#"filename " EQ (val ( quoted (#"wallaby " ) ))))))
311
383
#"" ))
312
384
313
385
(check-match
314
386
(parse-content-disposition-header
315
387
#"form-data; name=\"filen\"ame=\"; zokbar=\"dingo\"; filename=\"wallaby\" " )
316
388
(list 'parsefail (regexp #px"expected: byte string matching RFC6266 " )))
317
389
318
- (check-match
390
+ (check-equal?
319
391
(parse-content-disposition-header
320
- #"form-data; name=\"filename=\"; zokbar*=\"dingo\"; filename=\"wallaby\" " )
321
- (list 'parsefail (regexp #px"token ending with * " )))
392
+ #" attachment; filename=\"foo-ae.html\"; filename*=UTF-8''foo-%c3%a4.html\n " )
393
+ '(#"attachment " (#"filename " "foo-ae.html " )
394
+ (#"filename* " "foo-ä.html " )))
322
395
323
396
)
397
+
324
398
325
399
;; this code was used to generate the regexp for tokens. In principle,
326
400
;; you shouldn't need this code unless you need to re-generate this
401
475
ch)]
402
476
[else
403
477
(check-pred (λ (ch) (regexp-match? token-regexp-bstr (string ch)))
404
- ch)])))
478
+ ch)]))
479
+
480
+
481
+ #|attr-char = ALPHA / DIGIT
482
+ / "!" / "#" / "$" / "&" / "+" / "-" / "."
483
+ / "^" / "_" / "`" / "|" / "~"
484
+ ; token except ( "*" / "'" / "%" )
485
+ |#
486
+
487
+ )
488
+
489
+
490
+
491
+
0 commit comments