email: Tolerate invalid charset.

* email/email.scm (post-process-content-type): If charset is invalid, assume default UTF-8 as charset. * tests/email.scm ("tolerate invalid charset"): New test. Reported-by: Ricardo Wurmus <rekado@elephly.net>
author: Arun Isaac 2019-10-01 22:07:32 +0530
committer: Arun Isaac 2019-10-01 22:07:32 +0530
commit: 6159257c9da3664c516a26fad1ee942068fbbdd2 (patch)
tree: 3504d2a434d0e8989bae9084063df1d0c89d63ce
parent: 9d82904011516530b6ef1bcd53cef220db485e7a (diff)
download: guile-email-6159257c9da3664c516a26fad1ee942068fbbdd2.tar.gz
guile-email-6159257c9da3664c516a26fad1ee942068fbbdd2.tar.lz
guile-email-6159257c9da3664c516a26fad1ee942068fbbdd2.zip
2 files changed, 44 insertions, 17 deletions
diff --git a/email/email.scm b/email/email.scm
index a791a0e..666bd3e 100644
--- a/email/email.scm
+++ b/email/email.scm
@@ -643,28 +643,45 @@ values. The returned headers is a string and body is a bytevector."
 (define (post-process-content-transfer-encoding _ value)
   (list 'content-transfer-encoding (string->lcase-symbol value)))
 
-(define post-process-content-type
-  (match-lambda*
+(define (post-process-content-type . args)
+  (define (valid-charset? charset)
+    (catch #t
+      (lambda ()
+        (bytevector->string (make-bytevector 0 0) charset)
+        #t)
+      (const #f)))
+
+  (match args
     (`(content (type ,type)
                (subtype ,subtype)
                . ,parameters)
      (let ((type (string->lcase-symbol type))
-           (subtype (string->lcase-symbol subtype)))
+           (subtype (string->lcase-symbol subtype))
+           (parameters
+            (map (match-lambda
+                   (`(parameter (attribute ,attribute)
+                                (value ,value))
+                    (cons (string->lcase-symbol attribute) value)))
+                 (flatten-and-filter '(parameter) parameters))))
        `(content-type
-         ,(acons* 'type type
-                  'subtype subtype
-                  (let ((parameters
-                         (map (match-lambda
-                                (`(parameter (attribute ,attribute)
-                                             (value ,value))
-                                 (cons (string->lcase-symbol attribute) value)))
-                              (flatten-and-filter '(parameter) parameters))))
-                    ;; RFC6657 specifies UTF-8 as the default charset
-                    ;; for text/* media types.
-                    (if (and (eq? type 'text)
-                             (not (assoc-ref parameters 'charset)))
-                        (acons 'charset "utf-8" parameters)
-                        parameters))))))))
+         ,(acons*
+           'type type
+           'subtype subtype
+           (cond
+            ((and (eq? type 'text)
+                  (or
+                   ;; RFC6657 specifies UTF-8 as the default charset for
+                   ;; text/* media types.
+                   (not (assoc-ref parameters 'charset))
+                   ;; RFC2045 recommends that the default be assumed
+                   ;; when a syntactically invalid Content-Type header
+                   ;; is encountered. In this implementation, we are
+                   ;; only checking for the validity of the
+                   ;; charset. Should we check for the validity of the
+                   ;; entire Content-Type header? If so, how?
+                   (not (valid-charset? (assoc-ref parameters 'charset)))))
+             (acons 'charset "utf-8" parameters))
+            (else parameters))))))))
 
 (define post-process-content-disposition
   (match-lambda*
diff --git a/tests/email.scm b/tests/email.scm
index ca0ebc3..1a0ec34 100644
--- a/tests/email.scm
+++ b/tests/email.scm
@@ -312,6 +312,16 @@ body" "iso-8859-1"))
             (address . "foo@bar.org"))))
    "body"))
 
+(test-equal "tolerate invalid charset"
+  (parse-email-headers
+   "Content-Type: text/plain; charset=foo
+")
+  `((content-transfer-encoding . 7bit)
+    (content-type (type . text)
+                  (subtype . plain)
+                  (charset . "utf-8")
+                  (charset . "foo"))))
+
 (test-equal "parse name-addr email address"
   (parse-email-address "Foo <foo@example.org>")
   '((name . "Foo") (address . "foo@example.org")))
author	Arun Isaac	2019-10-01 22:07:32 +0530
committer	Arun Isaac	2019-10-01 22:07:32 +0530
commit	6159257c9da3664c516a26fad1ee942068fbbdd2 (patch)
tree	3504d2a434d0e8989bae9084063df1d0c89d63ce
parent	9d82904011516530b6ef1bcd53cef220db485e7a (diff)
download	guile-email-6159257c9da3664c516a26fad1ee942068fbbdd2.tar.gz guile-email-6159257c9da3664c516a26fad1ee942068fbbdd2.tar.lz guile-email-6159257c9da3664c516a26fad1ee942068fbbdd2.zip