aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArun Isaac2019-07-26 01:53:22 +0530
committerArun Isaac2019-07-28 12:13:43 +0530
commitac83c2a00c13702bc365cd0f3074239fa63d743f (patch)
treed5055a15da9dba36033bb207a541ee890098f8fe
parent36474696eaa6187e7077f73d02daeb0138e03124 (diff)
downloadguile-email-ac83c2a00c13702bc365cd0f3074239fa63d743f.tar.gz
guile-email-ac83c2a00c13702bc365cd0f3074239fa63d743f.tar.lz
guile-email-ac83c2a00c13702bc365cd0f3074239fa63d743f.zip
email: Support email with mixed encoding of characters.
Prior to this, parse-email would accept email in the form of a string. A string is constrained to use the same encoding for all its characters whereas an email can have characters encoded using different encoding schemes. Therefore, it is more correct that parse-email deals with bytevectors instead of strings. * email/utils.scm (read-bytes-till): New function. * email/email.scm (body->mime-entities, email->headers+body, decode-body): Deal with emails as bytevectors instead of strings. (parse-mime-entity): Rename text argument to bv. (parse-email, parse-email-body): Overload to handle input in the form of a string or bytevector. * doc/guile-email.texi (Parsing e-mail): Document overloading of parse-email and parse-email-body. * tests/email.scm ("handle truncated multipart message gracefully"): Deal in bytevectors instead of strings. ("email with 8 bit encoding and non UTF-8 charset", "multipart email with a 8 bit encoding and non UTF-8 charset part"): New tests. * tests/email-with-8bit-encoding-and-non-utf8-charset, tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part: New files. Reported-by: Jack Hill <jackhill@jackhill.us>
-rw-r--r--doc/guile-email.texi34
-rw-r--r--email/email.scm85
-rw-r--r--email/utils.scm32
-rw-r--r--tests/email-with-8bit-encoding-and-non-utf8-charset9
-rw-r--r--tests/email.scm52
-rw-r--r--tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part13
6 files changed, 178 insertions, 47 deletions
diff --git a/doc/guile-email.texi b/doc/guile-email.texi
index b606021..70a4e28 100644
--- a/doc/guile-email.texi
+++ b/doc/guile-email.texi
@@ -53,22 +53,42 @@ RF2047 and RFC2049.
@node Parsing e-mail
@chapter Parsing e-mail
-@deffn {Scheme Procedure} parse-email email
-Parse string @var{email} and return result as an <email> record.
-@end deffn
+@deftypefn {Scheme Procedure} parse-email (bytevector @var{email})
+@deftypefnx {Scheme Procedure} parse-email (string @var{email})
+Parse bytevector @var{email} and return result as an @code{<email>}
+record.
+
+Parse string @var{email} and return result as an @code{<email>}
+record.
+@end deftypefn
@deffn {Scheme Procedure} parse-email-headers headers
Parse string @var{headers} as email headers and return an association
list of header keys and values.
@end deffn
-@deffn {Scheme Procedure} parse-email-body headers body
-Parse @var{body} as email body where @var{headers} is an association
-list of header keys and values as returned by
+@deftypefn {Scheme Procedure} parse-email-body (string @var{headers}) (bytevector @var{body})
+@deftypefnx {Scheme Procedure} parse-email-body (string @var{headers}) (string @var{body})
+Parse bytevector @var{body} as email body where @var{headers} is an
+association list of header keys and values as returned by
@code{parse-email-headers}. Return a list of <mime-entity> records if
the body is a multipart message. Else, return a single <mime-entity>
record.
-@end deffn
+
+Parse string @var{body} as email body where @var{headers} is an
+association list of header keys and values as returned by
+@code{parse-email-headers}. Return a list of <mime-entity> records if
+the body is a multipart message. Else, return a single <mime-entity>
+record.
+@end deftypefn
+
+Note that while an email can have characters encoded using different
+schemes, a string is constrained to have all characters encoded using
+the same scheme. Therefore, passing a string to @code{parse-email} or
+@code{parse-email-body} will not always produce correct results. Hence,
+this variant of @code{parse-email} and @code{parse-email-body} will be
+deprecated in the future. This variant is only provided in the interest
+of backward compatibility.
@node Encoding and Decoding
@chapter Encoding and Decoding
diff --git a/email/email.scm b/email/email.scm
index 7e613ff..381c81a 100644
--- a/email/email.scm
+++ b/email/email.scm
@@ -18,10 +18,13 @@
;;; <http://www.gnu.org/licenses/>.
(define-module (email email)
+ #:use-module (ice-9 binary-ports)
#:use-module (ice-9 iconv)
#:use-module (ice-9 match)
#:use-module (ice-9 peg)
#:use-module (ice-9 regex)
+ #:use-module (rnrs bytevectors)
+ #:use-module ((rnrs io ports) #:select (call-with-port))
#:use-module (rnrs io simple)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-2)
@@ -570,11 +573,15 @@
'post))
(define (body->mime-entities body boundary)
- "Split BODY into a list of mime entities separated by BOUNDARY (as
-explained in RFC2045), and return that list."
+ "Split bytevector BODY into a list of mime entities separated by
+BOUNDARY (as explained in RFC2045), and return that list."
(define (read-till-boundary port)
- (read-while port get-line-with-delimiter
- (negate (cut string-prefix? (string-append "--" boundary) <>))))
+ (read-bytes-till
+ port
+ ;; TODO: Look for the boundary sequence only at the beginning of
+ ;; a line.
+ ;; Boundary is always an ASCII string.
+ (string->bytevector (string-append "--" boundary) "us-ascii")))
(define (read-mime-entity port)
(let ((line (get-line-with-delimiter port)))
@@ -586,22 +593,25 @@ explained in RFC2045), and return that list."
(eof-object)
(read-till-boundary port))))
- (call-with-input-string body
- (lambda (port)
- (read-till-boundary port)
- (read-objects read-mime-entity port))))
+ (call-with-port
+ (open-bytevector-input-port body)
+ (lambda (port)
+ (read-till-boundary port)
+ (read-objects read-mime-entity port))))
(define (email->headers+body email)
- "Split EMAIL into headers and body. Return as multiple values."
- (call-with-input-string email
+ "Split EMAIL bytevector into headers and body. Return as multiple
+values. The returned headers is a string and body is a bytevector."
+ (call-with-port
+ (open-bytevector-input-port email)
(lambda (port)
+ (set-port-encoding! port "us-ascii")
(let ((headers (read-while port get-line-with-delimiter
(lambda (line)
(not (or (string= line "\n")
(string= line "\r\n")))))))
(get-line-with-delimiter port)
- (values headers
- (read-while port get-line-with-delimiter identity))))))
+ (values headers (get-bytevector-all port))))))
(define (post-process-mailbox . args)
(define process-name
@@ -714,16 +724,20 @@ For example,
header keys and values as returned by parse-email-headers. Return a
list of <mime-entity> records if the body is a multipart
message. Else, return a single <mime-entity> record."
- (let ((content-type (assoc-ref headers 'content-type)))
- (case (assoc-ref content-type 'type)
- ((multipart)
- (map parse-mime-entity
- (body->mime-entities body (assoc-ref content-type 'boundary))))
- ((text)
- (string-trim-both
- (decode-body body (assoc-ref headers 'content-transfer-encoding)
- (assoc-ref content-type 'charset))))
- (else (decode-body body (assoc-ref headers 'content-transfer-encoding))))))
+ (match body
+ ((? string? body)
+ (parse-email-body headers (string->bytevector body "utf-8")))
+ ((? bytevector? body)
+ (let ((content-type (assoc-ref headers 'content-type)))
+ (case (assoc-ref content-type 'type)
+ ((multipart)
+ (map parse-mime-entity
+ (body->mime-entities body (assoc-ref content-type 'boundary))))
+ ((text)
+ (string-trim-both
+ (decode-body body (assoc-ref headers 'content-transfer-encoding)
+ (assoc-ref content-type 'charset))))
+ (else (decode-body body (assoc-ref headers 'content-transfer-encoding))))))))
(define (add-default-headers headers)
;; Default Content-Type and Content-Transfer-Encoding headers as
@@ -742,8 +756,8 @@ message. Else, return a single <mime-entity> record."
default-headers)
headers)))
-(define (parse-mime-entity text)
- (let-values (((headers body) (email->headers+body text)))
+(define (parse-mime-entity bv)
+ (let-values (((headers body) (email->headers+body bv)))
(let ((headers
(pre-post-order
(peg:tree
@@ -766,9 +780,13 @@ message. Else, return a single <mime-entity> record."
(define (parse-email email)
"Parse string EMAIL and return result as an <email> record."
- (let-values (((headers body) (email->headers+body email)))
- (let ((headers-alist (parse-email-headers headers)))
- (make-email headers-alist (parse-email-body headers-alist body)))))
+ (match email
+ ((? string? email)
+ (parse-email (string->bytevector email "utf-8")))
+ ((? bytevector? email)
+ (let-values (((headers body) (email->headers+body email)))
+ (let ((headers-alist (parse-email-headers headers)))
+ (make-email headers-alist (parse-email-body headers-alist body)))))))
(define (parse-email-headers headers)
"Parse string HEADERS as email headers and return an association
@@ -824,7 +842,7 @@ list of header keys and values."
(*default* . ,(lambda tree tree)))))
(define* (decode-body body encoding #:optional charset)
- (let ((octets
+ (let ((decoded-octets
(case encoding
((base64)
(base64-decode
@@ -834,15 +852,14 @@ list of header keys and values."
(ucs-range->char-set (char->integer #\A) (1+ (char->integer #\Z)))
(ucs-range->char-set (char->integer #\0) (1+ (char->integer #\9)))
(char-set #\+ #\/ #\=))
- body)))
- ((quoted-printable) (quoted-printable-decode body))
+ (bytevector->string body "us-ascii"))))
+ ((quoted-printable) (quoted-printable-decode
+ (bytevector->string body "us-ascii")))
((#{7bit}# #{8bit}# binary) body)
(else (error "Body decoding failed. Unknown encoding" encoding)))))
(if charset
- (case encoding
- ((base64 quoted-printable) (bytevector->string octets charset))
- ((#{7bit}# #{8bit}# binary) octets))
- octets)))
+ (bytevector->string decoded-octets charset)
+ decoded-octets)))
(define (read-next-email-in-mbox port)
;; Read and discard From_ line
diff --git a/email/utils.scm b/email/utils.scm
index 35a96d8..2040b21 100644
--- a/email/utils.scm
+++ b/email/utils.scm
@@ -1,5 +1,5 @@
;;; guile-email --- Guile email parser
-;;; Copyright © 2018 Arun Isaac <arunisaac@systemreboot.net>
+;;; Copyright © 2018, 2019 Arun Isaac <arunisaac@systemreboot.net>
;;;
;;; This file is part of guile-email.
;;;
@@ -20,11 +20,17 @@
(define-module (email utils)
#:use-module (ice-9 match)
#:use-module (ice-9 peg codegen)
+ #:use-module (ice-9 binary-ports)
#:use-module (ice-9 textual-ports)
+ #:use-module (rnrs bytevectors)
+ #:use-module ((rnrs io ports)
+ #:select (call-with-bytevector-output-port))
#:use-module (rnrs io simple)
+ #:use-module (srfi srfi-26)
#:export (get-line-with-delimiter
read-objects
read-while
+ read-bytes-till
acons*
alist-delete*))
@@ -52,6 +58,30 @@ string returned by READ-PROC as argument."
(let ((str (call-with-output-string read-while-loop)))
(if (string-null? str) (eof-object) str)))
+(define (read-bytes-till port sequence)
+ "Read bytes from PORT until byte SEQUENCE is seen or end-of-file is
+reached. If SEQUENCE is seen, unget it to PORT and return."
+ (define (read-bytes-and-write-till in out sequence)
+ (let ((octet (get-u8 in)))
+ (cond
+ ((eof-object? octet) octet)
+ ;; If octet read matches first octet of sequence, try matching
+ ;; the full sequence.
+ ((= octet (bytevector-u8-ref sequence 0))
+ (unget-bytevector in sequence 0 1)
+ (let ((bv (get-bytevector-n in (bytevector-length sequence))))
+ (cond
+ ((bytevector=? bv sequence) (unget-bytevector in bv))
+ (else (unget-bytevector in bv 1)
+ (put-u8 out octet)
+ (read-bytes-and-write-till in out sequence)))))
+ ;; Else, output the octet and continue reading.
+ (else (put-u8 out octet)
+ (read-bytes-and-write-till in out sequence)))))
+
+ (call-with-bytevector-output-port
+ (cut read-bytes-and-write-till port <> sequence)))
+
(define (get-line-with-delimiter port)
"Read a line from PORT and return it as a string including the
delimiting linefeed character."
diff --git a/tests/email-with-8bit-encoding-and-non-utf8-charset b/tests/email-with-8bit-encoding-and-non-utf8-charset
new file mode 100644
index 0000000..a4f4a6e
--- /dev/null
+++ b/tests/email-with-8bit-encoding-and-non-utf8-charset
@@ -0,0 +1,9 @@
+From: John Doe <jdoe@machine.example>
+To: Mary Smith <mary@example.net>
+Subject: Saying Hello
+Date: Fri, 21 Nov 1997 09:55:06 -0600
+Message-ID: <1234@local.machine.example>
+Content-Type: text/plain; charset=ISO-8859-7
+Content-Transfer-Encoding: 8bit
+
+Hello Foo¢. \ No newline at end of file
diff --git a/tests/email.scm b/tests/email.scm
index ab2a408..856a5b9 100644
--- a/tests/email.scm
+++ b/tests/email.scm
@@ -1,6 +1,6 @@
;;; guile-email --- Guile email parser
;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
-;;; Copyright © 2018 Arun Isaac <arunisaac@systemreboot.net>
+;;; Copyright © 2018, 2019 Arun Isaac <arunisaac@systemreboot.net>
;;;
;;; This file was adapted from guile-debbugs and is part of guile-email.
;;;
@@ -51,6 +51,8 @@
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(use-modules (email email)
+ (ice-9 binary-ports)
+ (ice-9 iconv)
(srfi srfi-19)
(srfi srfi-64))
@@ -100,19 +102,59 @@ Content-Length: 4349
(x-mailer . "FooMail 4.0 4.03 (SMT460B92F)")
(content-length . "4349")))
+(test-equal "email with 8 bit encoding and non UTF-8 charset"
+ (call-with-input-file "tests/email-with-8bit-encoding-and-non-utf8-charset"
+ (compose parse-email get-bytevector-all))
+ (make-email
+ `((from ((name . "John Doe")
+ (address . "jdoe@machine.example")))
+ (to ((name . "Mary Smith")
+ (address . "mary@example.net")))
+ (subject . "Saying Hello")
+ (date . ,(make-date 0 6 55 9 21 11 1997 -21600))
+ (message-id . "1234@local.machine.example")
+ (content-type (type . text)
+ (subtype . plain)
+ (charset . "ISO-8859-7"))
+ (content-transfer-encoding . 8bit))
+ "Hello Foo’."))
+
+(test-equal "multipart email with a 8 bit encoding and non UTF-8 charset part"
+ (call-with-input-file "tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part"
+ (compose parse-email get-bytevector-all))
+ (make-email
+ `((content-transfer-encoding . 7bit)
+ (from ((name . "John Doe")
+ (address . "jdoe@machine.example")))
+ (to ((name . "Mary Smith")
+ (address . "mary@example.net")))
+ (subject . "Saying Hello")
+ (date . ,(make-date 0 6 55 9 21 11 1997 -21600))
+ (message-id . "1234@local.machine.example")
+ (content-type (type . multipart)
+ (subtype . mixed)
+ (boundary . "boundary")))
+ (list (make-mime-entity
+ `((content-type (type . text)
+ (subtype . plain)
+ (charset . "ISO-8859-7"))
+ (content-transfer-encoding . 8bit))
+ "Hello Foo’."))))
+
(test-equal "handle truncated multipart message gracefully"
((module-ref (resolve-module '(email email))
'body->mime-entities)
- "--boundary
+ (string->bytevector
+ "--boundary
Content-Type: text/plain
foo
-"
+" "utf-8")
"boundary")
- (list "Content-Type: text/plain
+ (list (string->bytevector "Content-Type: text/plain
foo
-"))
+" "utf-8")))
(test-equal "parse name-addr email address"
(parse-email-address "Foo <foo@example.org>")
diff --git a/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part b/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part
new file mode 100644
index 0000000..de340f4
--- /dev/null
+++ b/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part
@@ -0,0 +1,13 @@
+From: John Doe <jdoe@machine.example>
+To: Mary Smith <mary@example.net>
+Subject: Saying Hello
+Date: Fri, 21 Nov 1997 09:55:06 -0600
+Message-ID: <1234@local.machine.example>
+Content-Type: multipart/mixed; boundary="boundary"
+
+--boundary
+Content-Type: text/plain; charset=ISO-8859-7
+Content-Transfer-Encoding: 8bit
+
+Hello Foo¢.
+--boundary-- \ No newline at end of file