From ac83c2a00c13702bc365cd0f3074239fa63d743f Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Fri, 26 Jul 2019 01:53:22 +0530 Subject: email: Support email with mixed encoding of characters. Prior to this, parse-email would accept email in the form of a string. A string is constrained to use the same encoding for all its characters whereas an email can have characters encoded using different encoding schemes. Therefore, it is more correct that parse-email deals with bytevectors instead of strings. * email/utils.scm (read-bytes-till): New function. * email/email.scm (body->mime-entities, email->headers+body, decode-body): Deal with emails as bytevectors instead of strings. (parse-mime-entity): Rename text argument to bv. (parse-email, parse-email-body): Overload to handle input in the form of a string or bytevector. * doc/guile-email.texi (Parsing e-mail): Document overloading of parse-email and parse-email-body. * tests/email.scm ("handle truncated multipart message gracefully"): Deal in bytevectors instead of strings. ("email with 8 bit encoding and non UTF-8 charset", "multipart email with a 8 bit encoding and non UTF-8 charset part"): New tests. * tests/email-with-8bit-encoding-and-non-utf8-charset, tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part: New files. 
Reported-by: Jack Hill --- .../email-with-8bit-encoding-and-non-utf8-charset | 9 ++++ tests/email.scm | 52 +++++++++++++++++++--- ...-with-a-8bit-encoding-and-non-utf8-charset-part | 13 ++++++ 3 files changed, 69 insertions(+), 5 deletions(-) create mode 100644 tests/email-with-8bit-encoding-and-non-utf8-charset create mode 100644 tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part (limited to 'tests') diff --git a/tests/email-with-8bit-encoding-and-non-utf8-charset b/tests/email-with-8bit-encoding-and-non-utf8-charset new file mode 100644 index 0000000..a4f4a6e --- /dev/null +++ b/tests/email-with-8bit-encoding-and-non-utf8-charset @@ -0,0 +1,9 @@ +From: John Doe <jdoe@machine.example> +To: Mary Smith <mary@example.net> +Subject: Saying Hello +Date: Fri, 21 Nov 1997 09:55:06 -0600 +Message-ID: <1234@local.machine.example> +Content-Type: text/plain; charset=ISO-8859-7 +Content-Transfer-Encoding: 8bit + +Hello Foo¢. \ No newline at end of file diff --git a/tests/email.scm b/tests/email.scm index ab2a408..856a5b9 100644 --- a/tests/email.scm +++ b/tests/email.scm @@ -1,6 +1,6 @@ ;;; guile-email --- Guile email parser ;;; Copyright © 2017 Ricardo Wurmus -;;; Copyright © 2018 Arun Isaac +;;; Copyright © 2018, 2019 Arun Isaac ;;; ;;; This file was adapted from guile-debbugs and is part of guile-email. ;;; @@ -51,6 +51,8 @@ ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (use-modules (email email) + (ice-9 binary-ports) + (ice-9 iconv) (srfi srfi-19) (srfi srfi-64)) @@ -100,19 +102,59 @@ Content-Length: 4349 (x-mailer . "FooMail 4.0 4.03 (SMT460B92F)") (content-length . "4349"))) +(test-equal "email with 8 bit encoding and non UTF-8 charset" + (call-with-input-file "tests/email-with-8bit-encoding-and-non-utf8-charset" + (compose parse-email get-bytevector-all)) + (make-email + `((from ((name . "John Doe") + (address . "jdoe@machine.example"))) + (to ((name . "Mary Smith") + (address . "mary@example.net"))) + (subject . "Saying Hello") + (date . 
,(make-date 0 6 55 9 21 11 1997 -21600)) + (message-id . "1234@local.machine.example") + (content-type (type . text) + (subtype . plain) + (charset . "ISO-8859-7")) + (content-transfer-encoding . 8bit)) + "Hello Foo’.")) + +(test-equal "multipart email with a 8 bit encoding and non UTF-8 charset part" + (call-with-input-file "tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part" + (compose parse-email get-bytevector-all)) + (make-email + `((content-transfer-encoding . 7bit) + (from ((name . "John Doe") + (address . "jdoe@machine.example"))) + (to ((name . "Mary Smith") + (address . "mary@example.net"))) + (subject . "Saying Hello") + (date . ,(make-date 0 6 55 9 21 11 1997 -21600)) + (message-id . "1234@local.machine.example") + (content-type (type . multipart) + (subtype . mixed) + (boundary . "boundary"))) + (list (make-mime-entity + `((content-type (type . text) + (subtype . plain) + (charset . "ISO-8859-7")) + (content-transfer-encoding . 8bit)) + "Hello Foo’.")))) + (test-equal "handle truncated multipart message gracefully" ((module-ref (resolve-module '(email email)) 'body->mime-entities) - "--boundary + (string->bytevector + "--boundary Content-Type: text/plain foo -" +" "utf-8") "boundary") - (list "Content-Type: text/plain + (list (string->bytevector "Content-Type: text/plain foo -")) +" "utf-8"))) (test-equal "parse name-addr email address" (parse-email-address "Foo ") diff --git a/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part b/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part new file mode 100644 index 0000000..de340f4 --- /dev/null +++ b/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part @@ -0,0 +1,13 @@ +From: John Doe <jdoe@machine.example> +To: Mary Smith <mary@example.net> +Subject: Saying Hello +Date: Fri, 21 Nov 1997 09:55:06 -0600 +Message-ID: <1234@local.machine.example> +Content-Type: multipart/mixed; boundary="boundary" + +--boundary +Content-Type: text/plain; charset=ISO-8859-7 +Content-Transfer-Encoding: 
8bit + +Hello Foo¢. +--boundary-- \ No newline at end of file -- cgit v1.2.3