summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--doc/guile-email.texi34
-rw-r--r--email/email.scm85
-rw-r--r--email/utils.scm32
-rw-r--r--tests/email-with-8bit-encoding-and-non-utf8-charset9
-rw-r--r--tests/email.scm52
-rw-r--r--tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part13
6 files changed, 178 insertions, 47 deletions
diff --git a/doc/guile-email.texi b/doc/guile-email.texi
index b606021..70a4e28 100644
--- a/doc/guile-email.texi
+++ b/doc/guile-email.texi
@@ -53,22 +53,42 @@ RF2047 and RFC2049.
 @node Parsing e-mail
 @chapter Parsing e-mail
 
-@deffn {Scheme Procedure} parse-email email
-Parse string @var{email} and return result as an <email> record.
-@end deffn
+@deftypefn {Scheme Procedure} parse-email (bytevector @var{email})
+@deftypefnx {Scheme Procedure} parse-email (string @var{email})
+Parse bytevector @var{email} and return result as an @code{<email>}
+record.
+
+Parse string @var{email} and return result as an @code{<email>}
+record.
+@end deftypefn
 
 @deffn {Scheme Procedure} parse-email-headers headers
 Parse string @var{headers} as email headers and return an association
 list of header keys and values.
 @end deffn
 
-@deffn {Scheme Procedure} parse-email-body headers body
-Parse @var{body} as email body where @var{headers} is an association
-list of header keys and values as returned by
+@deftypefn {Scheme Procedure} parse-email-body (string @var{headers}) (bytevector @var{body})
+@deftypefnx {Scheme Procedure} parse-email-body (string @var{headers}) (string @var{body})
+Parse bytevector @var{body} as email body where @var{headers} is an
+association list of header keys and values as returned by
 @code{parse-email-headers}. Return a list of <mime-entity> records if
 the body is a multipart message. Else, return a single <mime-entity>
 record.
-@end deffn
+
+Parse string @var{body} as email body where @var{headers} is an
+association list of header keys and values as returned by
+@code{parse-email-headers}. Return a list of <mime-entity> records if
+the body is a multipart message. Else, return a single <mime-entity>
+record.
+@end deftypefn
+
+Note that while an email can have characters encoded using different
+schemes, a string is constrained to have all characters encoded using
+the same scheme. Therefore, passing a string to @code{parse-email} or
+@code{parse-email-body} will not always produce correct results. Hence,
+this variant of @code{parse-email} and @code{parse-email-body} will be
+deprecated in the future. This variant is only provided in the interest
+of backward compatibility.
 
 @node Encoding and Decoding
 @chapter Encoding and Decoding
diff --git a/email/email.scm b/email/email.scm
index 7e613ff..381c81a 100644
--- a/email/email.scm
+++ b/email/email.scm
@@ -18,10 +18,13 @@
 ;;; <http://www.gnu.org/licenses/>.
 
 (define-module (email email)
+  #:use-module (ice-9 binary-ports)
   #:use-module (ice-9 iconv)
   #:use-module (ice-9 match)
   #:use-module (ice-9 peg)
   #:use-module (ice-9 regex)
+  #:use-module (rnrs bytevectors)
+  #:use-module ((rnrs io ports) #:select (call-with-port))
   #:use-module (rnrs io simple)
   #:use-module (srfi srfi-1)
   #:use-module (srfi srfi-2)
@@ -570,11 +573,15 @@
    'post))
 
 (define (body->mime-entities body boundary)
-  "Split BODY into a list of mime entities separated by BOUNDARY (as
-explained in RFC2045), and return that list."
+  "Split bytevector BODY into a list of mime entities separated by
+BOUNDARY (as explained in RFC2045), and return that list."
   (define (read-till-boundary port)
-    (read-while port get-line-with-delimiter
-                (negate (cut string-prefix? (string-append "--" boundary) <>))))
+    (read-bytes-till
+     port
+     ;; TODO: Look for the boundary sequence only at the beginning of
+     ;; a line.
+     ;; Boundary is always an ASCII string.
+     (string->bytevector (string-append "--" boundary) "us-ascii")))
 
   (define (read-mime-entity port)
     (let ((line (get-line-with-delimiter port)))
@@ -586,22 +593,25 @@ explained in RFC2045), and return that list."
           (eof-object)
           (read-till-boundary port))))
 
-  (call-with-input-string body
-    (lambda (port)
-      (read-till-boundary port)
-      (read-objects read-mime-entity port))))
+  (call-with-port
+   (open-bytevector-input-port body)
+   (lambda (port)
+     (read-till-boundary port)
+     (read-objects read-mime-entity port))))
 
 (define (email->headers+body email)
-  "Split EMAIL into headers and body. Return as multiple values."
-  (call-with-input-string email
+  "Split EMAIL bytevector into headers and body. Return as multiple
+values. The returned headers is a string and body is a bytevector."
+  (call-with-port
+   (open-bytevector-input-port email)
     (lambda (port)
+      (set-port-encoding! port "us-ascii")
       (let ((headers (read-while port get-line-with-delimiter
                                  (lambda (line)
                                    (not (or (string= line "\n")
                                             (string= line "\r\n")))))))
         (get-line-with-delimiter port)
-        (values headers
-                (read-while port get-line-with-delimiter identity))))))
+        (values headers (get-bytevector-all port))))))
 
 (define (post-process-mailbox . args)
   (define process-name
@@ -714,16 +724,20 @@ For example,
 header keys and values as returned by parse-email-headers. Return a
 list of <mime-entity> records if the body is a multipart
 message. Else, return a single <mime-entity> record."
-  (let ((content-type (assoc-ref headers 'content-type)))
-    (case (assoc-ref content-type 'type)
-      ((multipart)
-       (map parse-mime-entity
-            (body->mime-entities body (assoc-ref content-type 'boundary))))
-      ((text)
-       (string-trim-both
-        (decode-body body (assoc-ref headers 'content-transfer-encoding)
-                     (assoc-ref content-type 'charset))))
-      (else (decode-body body (assoc-ref headers 'content-transfer-encoding))))))
+  (match body
+    ((? string? body)
+     (parse-email-body headers (string->bytevector body "utf-8")))
+    ((? bytevector? body)
+     (let ((content-type (assoc-ref headers 'content-type)))
+       (case (assoc-ref content-type 'type)
+         ((multipart)
+          (map parse-mime-entity
+               (body->mime-entities body (assoc-ref content-type 'boundary))))
+         ((text)
+          (string-trim-both
+           (decode-body body (assoc-ref headers 'content-transfer-encoding)
+                        (assoc-ref content-type 'charset))))
+         (else (decode-body body (assoc-ref headers 'content-transfer-encoding))))))))
 
 (define (add-default-headers headers)
   ;; Default Content-Type and Content-Transfer-Encoding headers as
@@ -742,8 +756,8 @@ message. Else, return a single <mime-entity> record."
                            default-headers)
             headers)))
 
-(define (parse-mime-entity text)
-  (let-values (((headers body) (email->headers+body text)))
+(define (parse-mime-entity bv)
+  (let-values (((headers body) (email->headers+body bv)))
     (let ((headers
            (pre-post-order
             (peg:tree
@@ -766,9 +780,13 @@ message. Else, return a single <mime-entity> record."
 
 (define (parse-email email)
   "Parse string EMAIL and return result as an <email> record."
-  (let-values (((headers body) (email->headers+body email)))
-    (let ((headers-alist (parse-email-headers headers)))
-      (make-email headers-alist (parse-email-body headers-alist body)))))
+  (match email
+    ((? string? email)
+     (parse-email (string->bytevector email "utf-8")))
+    ((? bytevector? email)
+     (let-values (((headers body) (email->headers+body email)))
+       (let ((headers-alist (parse-email-headers headers)))
+         (make-email headers-alist (parse-email-body headers-alist body)))))))
 
 (define (parse-email-headers headers)
   "Parse string HEADERS as email headers and return an association
@@ -824,7 +842,7 @@ list of header keys and values."
      (*default* . ,(lambda tree tree)))))
 
 (define* (decode-body body encoding #:optional charset)
-  (let ((octets
+  (let ((decoded-octets
          (case encoding
            ((base64)
             (base64-decode 
@@ -834,15 +852,14 @@ list of header keys and values."
                (ucs-range->char-set (char->integer #\A) (1+ (char->integer #\Z)))
                (ucs-range->char-set (char->integer #\0) (1+ (char->integer #\9)))
                (char-set #\+ #\/ #\=))
-              body)))
-           ((quoted-printable) (quoted-printable-decode body))
+              (bytevector->string body "us-ascii"))))
+           ((quoted-printable) (quoted-printable-decode
+                                (bytevector->string body "us-ascii")))
            ((#{7bit}# #{8bit}# binary) body)
            (else (error "Body decoding failed. Unknown encoding" encoding)))))
     (if charset
-        (case encoding
-          ((base64 quoted-printable) (bytevector->string octets charset))
-          ((#{7bit}# #{8bit}# binary) octets))
-        octets)))
+        (bytevector->string decoded-octets charset)
+        decoded-octets)))
 
 (define (read-next-email-in-mbox port)
   ;; Read and discard From_ line
diff --git a/email/utils.scm b/email/utils.scm
index 35a96d8..2040b21 100644
--- a/email/utils.scm
+++ b/email/utils.scm
@@ -1,5 +1,5 @@
 ;;; guile-email --- Guile email parser
-;;; Copyright © 2018 Arun Isaac <arunisaac@systemreboot.net>
+;;; Copyright © 2018, 2019 Arun Isaac <arunisaac@systemreboot.net>
 ;;;
 ;;; This file is part of guile-email.
 ;;;
@@ -20,11 +20,17 @@
 (define-module (email utils)
   #:use-module (ice-9 match)
   #:use-module (ice-9 peg codegen)
+  #:use-module (ice-9 binary-ports)
   #:use-module (ice-9 textual-ports)
+  #:use-module (rnrs bytevectors)
+  #:use-module ((rnrs io ports)
+                #:select (call-with-bytevector-output-port))
   #:use-module (rnrs io simple)
+  #:use-module (srfi srfi-26)
   #:export (get-line-with-delimiter
             read-objects
             read-while
+            read-bytes-till
             acons*
             alist-delete*))
 
@@ -52,6 +58,30 @@ string returned by READ-PROC as argument."
   (let ((str (call-with-output-string read-while-loop)))
     (if (string-null? str) (eof-object) str)))
 
+(define (read-bytes-till port sequence)
+  "Read bytes from PORT until byte SEQUENCE is seen or end-of-file is
+reached. If SEQUENCE is seen, unget it to PORT and return."
+  (define (read-bytes-and-write-till in out sequence)
+    (let ((octet (get-u8 in)))
+      (cond
+       ((eof-object? octet) octet)
+       ;; If octet read matches first octet of sequence, try matching
+       ;; the full sequence.
+       ((= octet (bytevector-u8-ref sequence 0))
+        (unget-bytevector in sequence 0 1)
+        (let ((bv (get-bytevector-n in (bytevector-length sequence))))
+          (cond
+           ((bytevector=? bv sequence) (unget-bytevector in bv))
+           (else (unget-bytevector in bv 1)
+                 (put-u8 out octet)
+                 (read-bytes-and-write-till in out sequence)))))
+       ;; Else, output the octet and continue reading.
+       (else (put-u8 out octet)
+             (read-bytes-and-write-till in out sequence)))))
+
+  (call-with-bytevector-output-port
+   (cut read-bytes-and-write-till port <> sequence)))
+
 (define (get-line-with-delimiter port)
   "Read a line from PORT and return it as a string including the
 delimiting linefeed character."
diff --git a/tests/email-with-8bit-encoding-and-non-utf8-charset b/tests/email-with-8bit-encoding-and-non-utf8-charset
new file mode 100644
index 0000000..a4f4a6e
--- /dev/null
+++ b/tests/email-with-8bit-encoding-and-non-utf8-charset
@@ -0,0 +1,9 @@
+From: John Doe <jdoe@machine.example>
+To: Mary Smith <mary@example.net>
+Subject: Saying Hello
+Date: Fri, 21 Nov 1997 09:55:06 -0600
+Message-ID: <1234@local.machine.example>
+Content-Type: text/plain; charset=ISO-8859-7
+Content-Transfer-Encoding: 8bit
+
+Hello Foo¢.
\ No newline at end of file
diff --git a/tests/email.scm b/tests/email.scm
index ab2a408..856a5b9 100644
--- a/tests/email.scm
+++ b/tests/email.scm
@@ -1,6 +1,6 @@
 ;;; guile-email --- Guile email parser
 ;;; Copyright © 2017 Ricardo Wurmus <rekado@elephly.net>
-;;; Copyright © 2018 Arun Isaac <arunisaac@systemreboot.net>
+;;; Copyright © 2018, 2019 Arun Isaac <arunisaac@systemreboot.net>
 ;;;
 ;;; This file was adapted from guile-debbugs and is part of guile-email.
 ;;;
@@ -51,6 +51,8 @@
 ;;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 (use-modules (email email)
+             (ice-9 binary-ports)
+             (ice-9 iconv)
              (srfi srfi-19)
              (srfi srfi-64))
 
@@ -100,19 +102,59 @@ Content-Length: 4349
     (x-mailer . "FooMail 4.0 4.03 (SMT460B92F)")
     (content-length . "4349")))
 
+(test-equal "email with 8 bit encoding and non UTF-8 charset"
+  (call-with-input-file "tests/email-with-8bit-encoding-and-non-utf8-charset"
+    (compose parse-email get-bytevector-all))
+  (make-email
+   `((from ((name . "John Doe")
+            (address . "jdoe@machine.example")))
+     (to ((name . "Mary Smith")
+          (address . "mary@example.net")))
+     (subject . "Saying Hello")
+     (date . ,(make-date 0 6 55 9 21 11 1997 -21600))
+     (message-id . "1234@local.machine.example")
+     (content-type (type . text)
+                   (subtype . plain)
+                   (charset . "ISO-8859-7"))
+     (content-transfer-encoding . 8bit))
+   "Hello Foo’."))
+
+(test-equal "multipart email with a 8 bit encoding and non UTF-8 charset part"
+  (call-with-input-file "tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part"
+    (compose parse-email get-bytevector-all))
+  (make-email
+   `((content-transfer-encoding . 7bit)
+     (from ((name . "John Doe")
+            (address . "jdoe@machine.example")))
+     (to ((name . "Mary Smith")
+          (address . "mary@example.net")))
+     (subject . "Saying Hello")
+     (date . ,(make-date 0 6 55 9 21 11 1997 -21600))
+     (message-id . "1234@local.machine.example")
+     (content-type (type . multipart)
+                   (subtype . mixed)
+                   (boundary . "boundary")))
+   (list (make-mime-entity
+          `((content-type (type . text)
+                          (subtype . plain)
+                          (charset . "ISO-8859-7"))
+            (content-transfer-encoding . 8bit))
+          "Hello Foo’."))))
+
 (test-equal "handle truncated multipart message gracefully"
   ((module-ref (resolve-module '(email email))
                'body->mime-entities)
-   "--boundary
+   (string->bytevector
+    "--boundary
 Content-Type: text/plain
 
 foo
-"
+" "utf-8")
    "boundary")
-  (list "Content-Type: text/plain
+  (list (string->bytevector "Content-Type: text/plain
 
 foo
-"))
+" "utf-8")))
 
 (test-equal "parse name-addr email address"
   (parse-email-address "Foo <foo@example.org>")
diff --git a/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part b/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part
new file mode 100644
index 0000000..de340f4
--- /dev/null
+++ b/tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part
@@ -0,0 +1,13 @@
+From: John Doe <jdoe@machine.example>
+To: Mary Smith <mary@example.net>
+Subject: Saying Hello
+Date: Fri, 21 Nov 1997 09:55:06 -0600
+Message-ID: <1234@local.machine.example>
+Content-Type: multipart/mixed; boundary="boundary"
+
+--boundary
+Content-Type: text/plain; charset=ISO-8859-7
+Content-Transfer-Encoding: 8bit
+
+Hello Foo¢.
+--boundary--
\ No newline at end of file