From ac83c2a00c13702bc365cd0f3074239fa63d743f Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Fri, 26 Jul 2019 01:53:22 +0530 Subject: email: Support email with mixed encoding of characters. Prior to this, parse-email would accept email in the form of a string. A string is constrained to use the same encoding for all its characters whereas an email can have characters encoded using different encoding schemes. Therefore, it is more correct that parse-email deals with bytevectors instead of strings. * email/utils.scm (read-bytes-till): New function. * email/email.scm (body->mime-entities, email->headers+body, decode-body): Deal with emails as bytevectors instead of strings. (parse-mime-entity): Rename text argument to bv. (parse-email, parse-email-body): Overload to handle input in the form of a string or bytevector. * doc/guile-email.texi (Parsing e-mail): Document overloading of parse-email and parse-email-body. * tests/email.scm ("handle truncated multipart message gracefully"): Deal in bytevectors instead of strings. ("email with 8 bit encoding and non UTF-8 charset", "multipart email with a 8 bit encoding and non UTF-8 charset part"): New tests. * tests/email-with-8bit-encoding-and-non-utf8-charset, tests/multipart-email-with-a-8bit-encoding-and-non-utf8-charset-part: New files. Reported-by: Jack Hill --- email/utils.scm | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'email/utils.scm') diff --git a/email/utils.scm b/email/utils.scm index 35a96d8..2040b21 100644 --- a/email/utils.scm +++ b/email/utils.scm @@ -1,5 +1,5 @@ ;;; guile-email --- Guile email parser -;;; Copyright © 2018 Arun Isaac +;;; Copyright © 2018, 2019 Arun Isaac ;;; ;;; This file is part of guile-email. ;;; @@ -20,11 +20,17 @@ (define-module (email utils) #:use-module (ice-9 match) #:use-module (ice-9 peg codegen) + #:use-module (ice-9 binary-ports) #:use-module (ice-9 textual-ports) + #:use-module (rnrs bytevectors) + #:use-module ((rnrs io ports) + #:select (call-with-bytevector-output-port)) #:use-module (rnrs io simple) + #:use-module (srfi srfi-26) #:export (get-line-with-delimiter read-objects read-while + read-bytes-till acons* alist-delete*)) @@ -52,6 +58,30 @@ string returned by READ-PROC as argument." (let ((str (call-with-output-string read-while-loop))) (if (string-null? str) (eof-object) str))) +(define (read-bytes-till port sequence) + "Read bytes from PORT until byte SEQUENCE is seen or end-of-file is +reached. If SEQUENCE is seen, unget it to PORT and return." + (define (read-bytes-and-write-till in out sequence) + (let ((octet (get-u8 in))) + (cond + ((eof-object? octet) octet) + ;; If octet read matches first octet of sequence, try matching + ;; the full sequence. + ((= octet (bytevector-u8-ref sequence 0)) + (unget-bytevector in sequence 0 1) + (let ((bv (get-bytevector-n in (bytevector-length sequence)))) + (cond + ((bytevector=? bv sequence) (unget-bytevector in bv)) + (else (unget-bytevector in bv 1) + (put-u8 out octet) + (read-bytes-and-write-till in out sequence))))) + ;; Else, output the octet and continue reading. + (else (put-u8 out octet) + (read-bytes-and-write-till in out sequence))))) + + (call-with-bytevector-output-port + (cut read-bytes-and-write-till port <> sequence))) + (define (get-line-with-delimiter port) "Read a line from PORT and return it as a string including the delimiting linefeed character." -- cgit v1.2.3