From 6159257c9da3664c516a26fad1ee942068fbbdd2 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Tue, 1 Oct 2019 22:07:32 +0530 Subject: email: Tolerate invalid charset. * email/email.scm (post-process-content-type): If charset is invalid, assume default UTF-8 as charset. * tests/email.scm ("tolerate invalid charset"): New test. Reported-by: Ricardo Wurmus --- email/email.scm | 51 ++++++++++++++++++++++++++++++++++----------------- tests/email.scm | 10 ++++++++++ 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/email/email.scm b/email/email.scm index a791a0e..666bd3e 100644 --- a/email/email.scm +++ b/email/email.scm @@ -643,28 +643,45 @@ values. The returned headers is a string and body is a bytevector." (define (post-process-content-transfer-encoding _ value) (list 'content-transfer-encoding (string->lcase-symbol value))) -(define post-process-content-type - (match-lambda* +(define (post-process-content-type . args) + (define (valid-charset? charset) + (catch #t + (lambda () + (bytevector->string (make-bytevector 0 0) charset) + #t) + (const #f))) + + (match args (`(content (type ,type) (subtype ,subtype) . ,parameters) (let ((type (string->lcase-symbol type)) - (subtype (string->lcase-symbol subtype))) + (subtype (string->lcase-symbol subtype)) + (parameters + (map (match-lambda + (`(parameter (attribute ,attribute) + (value ,value)) + (cons (string->lcase-symbol attribute) value))) + (flatten-and-filter '(parameter) parameters)))) `(content-type - ,(acons* 'type type - 'subtype subtype - (let ((parameters - (map (match-lambda - (`(parameter (attribute ,attribute) - (value ,value)) - (cons (string->lcase-symbol attribute) value))) - (flatten-and-filter '(parameter) parameters)))) - ;; RFC6657 specifies UTF-8 as the default charset - ;; for text/* media types. - (if (and (eq? type 'text) - (not (assoc-ref parameters 'charset))) - (acons 'charset "utf-8" parameters) - parameters)))))))) + ,(acons* + 'type type + 'subtype subtype + (cond + ((and (eq? type 'text) + (or + ;; RFC6657 specifies UTF-8 as the default charset for + ;; text/* media types. + (not (assoc-ref parameters 'charset)) + ;; RFC2045 recommends that the default be assumed + ;; when a syntactically invalid Content-Type header + ;; is encountered. In this implementation, we are + ;; only checking for the validity of the + ;; charset. Should we check for the validity of the + ;; entire Content-Type header? If so, how? + (not (valid-charset? (assoc-ref parameters 'charset))))) + (acons 'charset "utf-8" parameters)) + (else parameters)))))))) (define post-process-content-disposition (match-lambda* diff --git a/tests/email.scm b/tests/email.scm index ca0ebc3..1a0ec34 100644 --- a/tests/email.scm +++ b/tests/email.scm @@ -312,6 +312,16 @@ body" "iso-8859-1")) (address . "foo@bar.org")))) "body")) +(test-equal "tolerate invalid charset" + (parse-email-headers + "Content-Type: text/plain; charset=foo +") + `((content-transfer-encoding . 7bit) + (content-type (type . text) + (subtype . plain) + (charset . "utf-8") + (charset . "foo")))) + (test-equal "parse name-addr email address" (parse-email-address "Foo ") '((name . "Foo") (address . "foo@example.org"))) -- cgit v1.2.3