about summary refs log tree commit diff
path: root/email/utils.scm
diff options
context:
space:
mode:
authorArun Isaac2019-07-26 01:53:22 +0530
committerArun Isaac2019-07-28 12:13:43 +0530
commitac83c2a00c13702bc365cd0f3074239fa63d743f (patch)
treed5055a15da9dba36033bb207a541ee890098f8fe /email/utils.scm
parent36474696eaa6187e7077f73d02daeb0138e03124 (diff)
downloadguile-email-ac83c2a00c13702bc365cd0f3074239fa63d743f.tar.gz
guile-email-ac83c2a00c13702bc365cd0f3074239fa63d743f.tar.lz
guile-email-ac83c2a00c13702bc365cd0f3074239fa63d743f.zip
email: Support email with mixed encoding of characters.
Prior to this, parse-email would accept email in the form of a
string. A string is constrained to use the same encoding for all its
characters whereas an email can have characters encoded using
different encoding schemes. Therefore, it is more correct that
parse-email deals with bytevectors instead of strings.

* email/utils.scm (read-bytes-till): New function.
* email/email.scm (body->mime-entities, email->headers+body,
decode-body): Deal with emails as bytevectors instead of strings.
(parse-mime-entity): Rename text argument to bv.
(parse-email, parse-email-body): Overload to handle input in the form
of a string or bytevector.
* doc/guile-email.texi (Parsing e-mail): Document overloading of
parse-email and parse-email-body.
* tests/email.scm ("handle truncated multipart message gracefully"):
Deal in bytevectors instead of strings.
("email with 8 bit encoding and non UTF-8 charset", "multipart email
with a 8 bit encoding and non UTF-8 charset part"): New tests.
* tests/email-with-8bit-encoding-and-non-utf8-charset,
tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part:
New files.

Reported-by: Jack Hill <jackhill@jackhill.us>
Diffstat (limited to 'email/utils.scm')
-rw-r--r--email/utils.scm32
1 files changed, 31 insertions, 1 deletions
diff --git a/email/utils.scm b/email/utils.scm
index 35a96d8..2040b21 100644
--- a/email/utils.scm
+++ b/email/utils.scm
@@ -1,5 +1,5 @@
 ;;; guile-email --- Guile email parser
-;;; Copyright © 2018 Arun Isaac <arunisaac@systemreboot.net>
+;;; Copyright © 2018, 2019 Arun Isaac <arunisaac@systemreboot.net>
 ;;;
 ;;; This file is part of guile-email.
 ;;;
@@ -20,11 +20,17 @@
 (define-module (email utils)
   #:use-module (ice-9 match)
   #:use-module (ice-9 peg codegen)
+  #:use-module (ice-9 binary-ports)
   #:use-module (ice-9 textual-ports)
+  #:use-module (rnrs bytevectors)
+  #:use-module ((rnrs io ports)
+                #:select (call-with-bytevector-output-port))
   #:use-module (rnrs io simple)
+  #:use-module (srfi srfi-26)
   #:export (get-line-with-delimiter
             read-objects
             read-while
+            read-bytes-till
             acons*
             alist-delete*))
 
@@ -52,6 +58,30 @@ string returned by READ-PROC as argument."
   (let ((str (call-with-output-string read-while-loop)))
     (if (string-null? str) (eof-object) str)))
 
+(define (read-bytes-till port sequence)
+  "Read bytes from PORT until byte SEQUENCE is seen or end-of-file is
+reached. If SEQUENCE is seen, unget it to PORT and return."
+  (define (read-bytes-and-write-till in out sequence)
+    (let ((octet (get-u8 in)))
+      (cond
+       ((eof-object? octet) octet)
+       ;; If octet read matches first octet of sequence, try matching
+       ;; the full sequence.
+       ((= octet (bytevector-u8-ref sequence 0))
+        (unget-bytevector in sequence 0 1)
+        (let ((bv (get-bytevector-n in (bytevector-length sequence))))
+          (cond
+           ((bytevector=? bv sequence) (unget-bytevector in bv))
+           (else (unget-bytevector in bv 1)
+                 (put-u8 out octet)
+                 (read-bytes-and-write-till in out sequence)))))
+       ;; Else, output the octet and continue reading.
+       (else (put-u8 out octet)
+             (read-bytes-and-write-till in out sequence)))))
+
+  (call-with-bytevector-output-port
+   (cut read-bytes-and-write-till port <> sequence)))
+
 (define (get-line-with-delimiter port)
   "Read a line from PORT and return it as a string including the
 delimiting linefeed character."