From ac83c2a00c13702bc365cd0f3074239fa63d743f Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Fri, 26 Jul 2019 01:53:22 +0530 Subject: email: Support email with mixed encoding of characters. Prior to this, parse-email would accept email in the form of a string. A string is constrained to use the same encoding for all its characters whereas an email can have characters encoded using different encoding schemes. Therefore, it is more correct that parse-email deals with bytevectors instead of strings. * email/utils.scm (read-bytes-till): New function. * email/email.scm (body->mime-entities, email->headers+body, decode-body): Deal with emails as bytevectors instead of strings. (parse-mime-entity): Rename text argument to bv. (parse-email, parse-email-body): Overload to handle input in the form of a string or bytevector. * doc/guile-email.texi (Parsing e-mail): Document overloading of parse-email and parse-email-body. * tests/email.scm ("handle truncated multipart message gracefully"): Deal in bytevectors instead of strings. ("email with 8 bit encoding and non UTF-8 charset", "multipart email with a 8 bit encoding and non UTF-8 charset part"): New tests. * tests/email-with-8bit-encoding-and-non-utf8-charset, tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part: New files. Reported-by: Jack Hill --- doc/guile-email.texi | 34 +++++++-- email/email.scm | 85 +++++++++++++--------- email/utils.scm | 32 +++++++- .../email-with-8bit-encoding-and-non-utf8-charset | 9 +++ tests/email.scm | 52 +++++++++++-- ...-with-a-8bit-encoding-and-non-utf8-charset-part | 13 ++++ 6 files changed, 178 insertions(+), 47 deletions(-) create mode 100644 tests/email-with-8bit-encoding-and-non-utf8-charset create mode 100644 tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part diff --git a/doc/guile-email.texi b/doc/guile-email.texi index b606021..70a4e28 100644 --- a/doc/guile-email.texi +++ b/doc/guile-email.texi @@ -53,22 +53,42 @@ RF2047 and RFC2049. @node Parsing e-mail @chapter Parsing e-mail -@deffn {Scheme Procedure} parse-email email -Parse string @var{email} and return result as an record. -@end deffn +@deftypefn {Scheme Procedure} parse-email (bytevector @var{email}) +@deftypefnx {Scheme Procedure} parse-email (string @var{email}) +Parse bytevector @var{email} and return result as an @code{} +record. + +Parse string @var{email} and return result as an @code{} +record. +@end deftypefn @deffn {Scheme Procedure} parse-email-headers headers Parse string @var{headers} as email headers and return an association list of header keys and values. @end deffn -@deffn {Scheme Procedure} parse-email-body headers body -Parse @var{body} as email body where @var{headers} is an association -list of header keys and values as returned by +@deftypefn {Scheme Procedure} parse-email-body (string @var{headers}) (bytevector @var{body}) +@deftypefnx {Scheme Procedure} parse-email-body (string @var{headers}) (string @var{body}) +Parse bytevector @var{body} as email body where @var{headers} is an +association list of header keys and values as returned by @code{parse-email-headers}. Return a list of records if the body is a multipart message. Else, return a single record. -@end deffn + +Parse string @var{body} as email body where @var{headers} is an +association list of header keys and values as returned by +@code{parse-email-headers}. Return a list of records if +the body is a multipart message. Else, return a single +record. +@end deftypefn + +Note that while an email can have characters encoded using different +schemes, a string is constrained to have all characters encoded using +the same scheme. Therefore, passing a string to @code{parse-email} or +@code{parse-email-body} will not always produce correct results. Hence, +this variant of @code{parse-email} and @code{parse-email-body} will be +deprecated in the future. This variant is only provided in the interest +of backward compatibility. @node Encoding and Decoding @chapter Encoding and Decoding diff --git a/email/email.scm b/email/email.scm index 7e613ff..381c81a 100644 --- a/email/email.scm +++ b/email/email.scm @@ -18,10 +18,13 @@ ;;; . (define-module (email email) + #:use-module (ice-9 binary-ports) #:use-module (ice-9 iconv) #:use-module (ice-9 match) #:use-module (ice-9 peg) #:use-module (ice-9 regex) + #:use-module (rnrs bytevectors) + #:use-module ((rnrs io ports) #:select (call-with-port)) #:use-module (rnrs io simple) #:use-module (srfi srfi-1) #:use-module (srfi srfi-2) @@ -570,11 +573,15 @@ 'post)) (define (body->mime-entities body boundary) - "Split BODY into a list of mime entities separated by BOUNDARY (as -explained in RFC2045), and return that list." + "Split bytevector BODY into a list of mime entities separated by +BOUNDARY (as explained in RFC2045), and return that list." (define (read-till-boundary port) - (read-while port get-line-with-delimiter - (negate (cut string-prefix? (string-append "--" boundary) <>)))) + (read-bytes-till + port + ;; TODO: Look for the boundary sequence only at the beginning of + ;; a line. + ;; Boundary is always an ASCII string. + (string->bytevector (string-append "--" boundary) "us-ascii"))) (define (read-mime-entity port) (let ((line (get-line-with-delimiter port))) @@ -586,22 +593,25 @@ explained in RFC2045), and return that list." (eof-object) (read-till-boundary port)))) - (call-with-input-string body - (lambda (port) - (read-till-boundary port) - (read-objects read-mime-entity port)))) + (call-with-port + (open-bytevector-input-port body) + (lambda (port) + (read-till-boundary port) + (read-objects read-mime-entity port)))) (define (email->headers+body email) - "Split EMAIL into headers and body. Return as multiple values." - (call-with-input-string email + "Split EMAIL bytevector into headers and body. Return as multiple +values. The returned headers is a string and body is a bytevector." + (call-with-port + (open-bytevector-input-port email) (lambda (port) + (set-port-encoding! port "us-ascii") (let ((headers (read-while port get-line-with-delimiter (lambda (line) (not (or (string= line "\n") (string= line "\r\n"))))))) (get-line-with-delimiter port) - (values headers - (read-while port get-line-with-delimiter identity)))))) + (values headers (get-bytevector-all port)))))) (define (post-process-mailbox . args) (define process-name @@ -714,16 +724,20 @@ For example, header keys and values as returned by parse-email-headers. Return a list of records if the body is a multipart message. Else, return a single record." - (let ((content-type (assoc-ref headers 'content-type))) - (case (assoc-ref content-type 'type) - ((multipart) - (map parse-mime-entity - (body->mime-entities body (assoc-ref content-type 'boundary)))) - ((text) - (string-trim-both - (decode-body body (assoc-ref headers 'content-transfer-encoding) - (assoc-ref content-type 'charset)))) - (else (decode-body body (assoc-ref headers 'content-transfer-encoding)))))) + (match body + ((? string? body) + (parse-email-body headers (string->bytevector body "utf-8"))) + ((? bytevector? body) + (let ((content-type (assoc-ref headers 'content-type))) + (case (assoc-ref content-type 'type) + ((multipart) + (map parse-mime-entity + (body->mime-entities body (assoc-ref content-type 'boundary)))) + ((text) + (string-trim-both + (decode-body body (assoc-ref headers 'content-transfer-encoding) + (assoc-ref content-type 'charset)))) + (else (decode-body body (assoc-ref headers 'content-transfer-encoding)))))))) (define (add-default-headers headers) ;; Default Content-Type and Content-Transfer-Encoding headers as @@ -742,8 +756,8 @@ message. Else, return a single record." default-headers) headers))) -(define (parse-mime-entity text) - (let-values (((headers body) (email->headers+body text))) +(define (parse-mime-entity bv) + (let-values (((headers body) (email->headers+body bv))) (let ((headers (pre-post-order (peg:tree @@ -766,9 +780,13 @@ message. Else, return a single record." (define (parse-email email) "Parse string EMAIL and return result as an record." - (let-values (((headers body) (email->headers+body email))) - (let ((headers-alist (parse-email-headers headers))) - (make-email headers-alist (parse-email-body headers-alist body))))) + (match email + ((? string? email) + (parse-email (string->bytevector email "utf-8"))) + ((? bytevector? email) + (let-values (((headers body) (email->headers+body email))) + (let ((headers-alist (parse-email-headers headers))) + (make-email headers-alist (parse-email-body headers-alist body))))))) (define (parse-email-headers headers) "Parse string HEADERS as email headers and return an association @@ -824,7 +842,7 @@ list of header keys and values." (*default* . ,(lambda tree tree))))) (define* (decode-body body encoding #:optional charset) - (let ((octets + (let ((decoded-octets (case encoding ((base64) (base64-decode @@ -834,15 +852,14 @@ list of header keys and values." (ucs-range->char-set (char->integer #\A) (1+ (char->integer #\Z))) (ucs-range->char-set (char->integer #\0) (1+ (char->integer #\9))) (char-set #\+ #\/ #\=)) - body))) - ((quoted-printable) (quoted-printable-decode body)) + (bytevector->string body "us-ascii")))) + ((quoted-printable) (quoted-printable-decode + (bytevector->string body "us-ascii"))) ((#{7bit}# #{8bit}# binary) body) (else (error "Body decoding failed. Unknown encoding" encoding))))) (if charset - (case encoding - ((base64 quoted-printable) (bytevector->string octets charset)) - ((#{7bit}# #{8bit}# binary) octets)) - octets))) + (bytevector->string decoded-octets charset) + decoded-octets))) (define (read-next-email-in-mbox port) ;; Read and discard From_ line diff --git a/email/utils.scm b/email/utils.scm index 35a96d8..2040b21 100644 --- a/email/utils.scm +++ b/email/utils.scm @@ -1,5 +1,5 @@ ;;; guile-email --- Guile email parser -;;; Copyright © 2018 Arun Isaac +;;; Copyright © 2018, 2019 Arun Isaac ;;; ;;; This file is part of guile-email. ;;; @@ -20,11 +20,17 @@ (define-module (email utils) #:use-module (ice-9 match) #:use-module (ice-9 peg codegen) + #:use-module (ice-9 binary-ports) #:use-module (ice-9 textual-ports) + #:use-module (rnrs bytevectors) + #:use-module ((rnrs io ports) + #:select (call-with-bytevector-output-port)) #:use-module (rnrs io simple) + #:use-module (srfi srfi-26) #:export (get-line-with-delimiter read-objects read-while + read-bytes-till acons* alist-delete*)) @@ -52,6 +58,30 @@ string returned by READ-PROC as argument." (let ((str (call-with-output-string read-while-loop))) (if (string-null? str) (eof-object) str))) +(define (read-bytes-till port sequence) + "Read bytes from PORT until byte SEQUENCE is seen or end-of-file is +reached. If SEQUENCE is seen, unget it to PORT and return." + (define (read-bytes-and-write-till in out sequence) + (let ((octet (get-u8 in))) + (cond + ((eof-object? octet) octet) + ;; If octet read matches first octet of sequence, try matching + ;; the full sequence. + ((= octet (bytevector-u8-ref sequence 0)) + (unget-bytevector in sequence 0 1) + (let ((bv (get-bytevector-n in (bytevector-length sequence)))) + (cond + ((bytevector=? bv sequence) (unget-bytevector in bv)) + (else (unget-bytevector in bv 1) + (put-u8 out octet) + (read-bytes-and-write-till in out sequence))))) + ;; Else, output the octet and continue reading. + (else (put-u8 out octet) + (read-bytes-and-write-till in out sequence))))) + + (call-with-bytevector-output-port + (cut read-bytes-and-write-till port <> sequence))) + (define (get-line-with-delimiter port) "Read a line from PORT and return it as a string including the delimiting linefeed character." diff --git a/tests/email-with-8bit-encoding-and-non-utf8-charset b/tests/email-with-8bit-encoding-and-non-utf8-charset new file mode 100644 index 0000000..a4f4a6e --- /dev/null +++ b/tests/email-with-8bit-encoding-and-non-utf8-charset @@ -0,0 +1,9 @@ +From: John Doe +To: Mary Smith +Subject: Saying Hello +Date: Fri, 21 Nov 1997 09:55:06 -0600 +Message-ID: <1234@local.machine.example> +Content-Type: text/plain; charset=ISO-8859-7 +Content-Transfer-Encoding: 8bit + +Hello Foo¢. \ No newline at end of file diff --git a/tests/email.scm b/tests/email.scm index ab2a408..856a5b9 100644 --- a/tests/email.scm +++ b/tests/email.scm @@ -1,6 +1,6 @@ ;;; guile-email --- Guile email parser ;;; Copyright © 2017 Ricardo Wurmus -;;; Copyright © 2018 Arun Isaac +;;; Copyright © 2018, 2019 Arun Isaac ;;; ;;; This file was adapted from guile-debbugs and is part of guile-email. ;;; @@ -51,6 +51,8 @@ ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. (use-modules (email email) + (ice-9 binary-ports) + (ice-9 iconv) (srfi srfi-19) (srfi srfi-64)) @@ -100,19 +102,59 @@ Content-Length: 4349 (x-mailer . "FooMail 4.0 4.03 (SMT460B92F)") (content-length . "4349"))) +(test-equal "email with 8 bit encoding and non UTF-8 charset" + (call-with-input-file "tests/email-with-8bit-encoding-and-non-utf8-charset" + (compose parse-email get-bytevector-all)) + (make-email + `((from ((name . "John Doe") + (address . "jdoe@machine.example"))) + (to ((name . "Mary Smith") + (address . "mary@example.net"))) + (subject . "Saying Hello") + (date . ,(make-date 0 6 55 9 21 11 1997 -21600)) + (message-id . "1234@local.machine.example") + (content-type (type . text) + (subtype . plain) + (charset . "ISO-8859-7")) + (content-transfer-encoding . 8bit)) + "Hello Foo’.")) + +(test-equal "multipart email with a 8 bit encoding and non UTF-8 charset part" + (call-with-input-file "tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part" + (compose parse-email get-bytevector-all)) + (make-email + `((content-transfer-encoding . 7bit) + (from ((name . "John Doe") + (address . "jdoe@machine.example"))) + (to ((name . "Mary Smith") + (address . "mary@example.net"))) + (subject . "Saying Hello") + (date . ,(make-date 0 6 55 9 21 11 1997 -21600)) + (message-id . "1234@local.machine.example") + (content-type (type . multipart) + (subtype . mixed) + (boundary . "boundary"))) + (list (make-mime-entity + `((content-type (type . text) + (subtype . plain) + (charset . "ISO-8859-7")) + (content-transfer-encoding . 8bit)) + "Hello Foo’.")))) + (test-equal "handle truncated multipart message gracefully" ((module-ref (resolve-module '(email email)) 'body->mime-entities) - "--boundary + (string->bytevector + "--boundary Content-Type: text/plain foo -" +" "utf-8") "boundary") - (list "Content-Type: text/plain + (list (string->bytevector "Content-Type: text/plain foo -")) +" "utf-8"))) (test-equal "parse name-addr email address" (parse-email-address "Foo ") diff --git a/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part b/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part new file mode 100644 index 0000000..de340f4 --- /dev/null +++ b/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part @@ -0,0 +1,13 @@ +From: John Doe +To: Mary Smith +Subject: Saying Hello +Date: Fri, 21 Nov 1997 09:55:06 -0600 +Message-ID: <1234@local.machine.example> +Content-Type: multipart/mixed; boundary="boundary" + +--boundary +Content-Type: text/plain; charset=ISO-8859-7 +Content-Transfer-Encoding: 8bit + +Hello Foo¢. +--boundary-- \ No newline at end of file -- cgit v1.2.3