;;; guile-email --- Guile email parser
;;; Copyright © 2018, 2019, 2020, 2021, 2023, 2025 Arun Isaac <arunisaac@systemreboot.net>
;;; Copyright © 2021 Mathieu Othacehe <othacehe@gnu.org>
;;; Copyright © 2023 Andrew Whatson <whatson@tailcall.au>
;;;
;;; This file is part of guile-email.
;;;
;;; guile-email is free software; you can redistribute it and/or modify
;;; it under the terms of the GNU Affero General Public License as
;;; published by the Free Software Foundation; either version 3 of the
;;; License, or (at your option) any later version.
;;;
;;; guile-email is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; Affero General Public License for more details.
;;;
;;; You should have received a copy of the GNU Affero General Public
;;; License along with guile-email. If not, see
;;; <http://www.gnu.org/licenses/>.
(define-module (email email)
#:use-module (ice-9 binary-ports)
#:use-module (ice-9 iconv)
#:use-module (ice-9 match)
#:use-module (ice-9 peg)
#:use-module (ice-9 regex)
#:use-module (rnrs bytevectors)
#:use-module ((rnrs io ports) #:select (call-with-port))
#:use-module (rnrs io simple)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-2)
#:use-module (srfi srfi-9)
#:use-module (srfi srfi-11)
#:use-module (srfi srfi-19)
#:use-module (srfi srfi-26)
#:use-module (sxml transform)
#:use-module (email base64)
#:use-module (email quoted-printable)
#:use-module (email utils)
#:export (<email>
make-email
email?
email-headers
email-body
email-from
email-to
email-cc
email-bcc
email-subject
email-date
email-message-id
email-content-type
<mime-entity>
make-mime-entity
mime-entity?
mime-entity-headers
mime-entity-body
email->headers+body
parse-email
parse-email-headers
parse-email-body
parse-email-address
interpret-address
mbox->emails))
(define (flatten-and-filter terms tree)
(filter list? (keyword-flatten terms tree)))
(define-record-type <email>
(make-email headers body)
email?
(headers email-headers)
(body email-body))
(define-record-type <mime-entity>
(make-mime-entity headers body)
mime-entity?
(headers mime-entity-headers)
(body mime-entity-body))
(define (validate-field email field)
"Validate @var{field} value in @var{email}. Return @code{#f} if value
is invalid. Else, return the value unchanged."
(let ((field-value (assq-ref (email-headers email)
field)))
;; Invalid fields show up as strings in the parse tree. Discard
;; them and return #f.
(and (not (string? field-value))
field-value)))
(define (email-from email)
"Return list of From addresses in @var{email}."
(or (validate-field email 'from)
(list)))
(define (email-to email)
"Return list of To addresses in @var{email}."
(or (validate-field email 'to)
(list)))
(define (email-cc email)
"Return list of Cc addresses in @var{email}."
(or (validate-field email 'cc)
(list)))
(define (email-bcc email)
"Return list of Bcc addresses in @var{email}."
(or (validate-field email 'bcc)
(list)))
(define (email-subject email)
"Return Subject of @var{email}."
(assq-ref (email-headers email)
'subject))
(define (email-date email)
"Return Date of @var{email}."
(validate-field email 'date))
(define (email-message-id email)
"Return Message-ID of @var{email}."
(assq-ref (email-headers email)
'message-id))
(define (email-content-type email)
"Return Content-Type of @var{email}."
(assq-ref (email-headers email)
'content-type))
(define string->lcase-symbol
(compose string->symbol string-downcase))
;;; PEG parser implementing the ABNF grammar specified in RFC5322
;;; (Internet Message Format), RFC6854 (Update to Internet Message
;;; Format to Allow Group Syntax in the "From:" and "Sender:" Header
;;; Fields)
;;; Obsolete syntax has not been implemented.
;;; Core ABNF rules from RFC5234
(define-peg-pattern alpha body
(or (range #\A #\Z) (range #\a #\z)))
;; Though line endings should be crlf (\r\n), we also tolerate bare
;; line feeds (\n)
(define-peg-pattern crlf none
(or "\r\n" "\n"))
(define-peg-pattern digit body
(range #\0 #\9))
(define-peg-pattern dquote body
"\"")
;; Printable ASCII characters and UTF-8 characters > \x7f (RFC6532)
(define-peg-pattern vchar body
(and (not-followed-by (or (range #\Nul #\Space)
"\x7f"))
peg-any))
(define-peg-pattern wsp body
(or " " "\t"))
(define-peg-pattern lwsp body
(* (or wsp (and crlf wsp))))
(define-syntax-rule (define-printable-ascii-character-pattern name . exceptions)
(define-peg-pattern name body
(and (not-followed-by (or . exceptions))
vchar)))
(define-syntax-rule (define-printable-ascii-character-pattern-with-obsolete
name obsolete-pattern . exceptions)
(define-peg-pattern name body
(or (and (not-followed-by (or . exceptions))
vchar)
obsolete-pattern)))
;;; Quoted characters
(define-peg-pattern obs-qp body
(and "\\" (or "\x00" obs-no-ws-ctl "\n" "\r")))
(define-peg-pattern quoted-pair body
(or (and (ignore "\\") (or vchar wsp))
obs-qp))
;;; Folding white space and comments
(define-peg-pattern obs-fws body
(and (+ wsp) (* (and crlf (+ wsp)))))
(define-peg-pattern fws body
(or (and (? (and (* wsp) crlf)) (+ wsp))
obs-fws))
(define-peg-pattern obs-no-ws-ctl body
(and (not-followed-by (or "\t" "\n" "\r"))
(or (range #\soh #\us)
"\x7f")))
(define-peg-pattern obs-ctext body
obs-no-ws-ctl)
(define-printable-ascii-character-pattern-with-obsolete ctext obs-ctext
"(" ")" "\\")
(define-syntax-rule (define-comment-pattern name capture-type)
(define-peg-pattern name capture-type
(and (ignore "(") (* (and (? fws) ccontent)) (ignore ")"))))
(define-comment-pattern comment none)
(define-peg-pattern ccontent body
(or ctext quoted-pair comment))
(define-syntax-rule (define-cfws-pattern name comment)
(define-peg-pattern name body
(or (and (+ (and (? fws) comment))
(? fws))
fws)))
(define-cfws-pattern cfws comment)
;;; Atom
(define-printable-ascii-character-pattern atext
"\"" "(" ")" "," "." ":" ";" "<" ">" "@" "[" "\\" "]")
(define-syntax define-atom-pattern
(syntax-rules ()
((define-atom-pattern name)
(define-peg-pattern name body
(and (ignore (? cfws)) (+ atext) (ignore (? cfws)))))
((define-atom-pattern name cfws)
(define-peg-pattern name body
(and (? cfws) (+ atext) (? cfws))))
((define-atom-pattern name cfws atext)
(define-peg-pattern name body
(and (? cfws) (+ atext) (? cfws))))))
(define-atom-pattern atom)
(define-peg-pattern dot-atom-text body
(and (+ atext) (* (and "." (+ atext)))))
(define-syntax-rule (define-dot-atom-pattern name cfws)
(define-peg-pattern name body
(and (ignore (? cfws)) dot-atom-text (ignore (? cfws)))))
(define-dot-atom-pattern dot-atom cfws)
;;; Quoted strings
(define-peg-pattern obs-qtext body
obs-no-ws-ctl)
(define-printable-ascii-character-pattern-with-obsolete qtext obs-qtext
"\\" "\"")
(define-peg-pattern qcontent body
(or qtext quoted-pair))
;; TODO: Remove workaround guile peg bug for ignore
(define-peg-pattern quoted-string body
(and (? cfws) (ignore (and dquote))
(* (and (? fws) qcontent))
(? fws)
(ignore (and dquote)) (? cfws)))
;;; Miscellaneous tokens
;; According to RFC5322§3.2.3, the leading and trailing cfws in atom
;; are semantically not part of it, and should be ignored. But, to
;; support obs-phrase, we need to capture it. Hence, we define a set
;; of cfws-captured-* patterns needed by obs-phrase.
(define-syntax-rule (define-word-pattern name atom)
(define-peg-pattern name body
(or atom quoted-string)))
(define-word-pattern word atom)
(define-atom-pattern cfws-captured-atom cfws)
(define-word-pattern cfws-captured-word cfws-captured-atom)
(define-syntax-rule (define-phrase-pattern name word)
(define-peg-pattern name body
(and word (* (or word "." cfws)))))
(define-phrase-pattern obs-phrase cfws-captured-word)
;; We set phrase to be the same as obs-phrase since, according to
;; their definitions in RFC5322, all phrases are obs-phrases.
(define-peg-pattern phrase body
obs-phrase)
(define-peg-pattern obs-phrase-list body
(and (? (or phrase cfws))
(* (and "," (or phrase cfws)))))
(define-peg-pattern obs-utext body
(or "\x00" obs-no-ws-ctl vchar))
(define-peg-pattern obs-unstruct body
(* (or (and (* "\n") (* "\r")
(* (and obs-utext (* "\n") (* "\r"))))
fws)))
;; ABNF modified to ignore leading whitespace
;; ABNF modified to allow for blank lines in folded field
(define-peg-pattern unstructured body
(or (and (ignore (? fws))
(* (and (? fws) (? vchar)))
(ignore (* wsp)))
obs-unstruct))
;;; Date and time specification
(define-peg-pattern day-name body
(or "Mon" "Tue" "Wed" "Thu" "Fri" "Sat" "Sun"))
(define-peg-pattern obs-day-of-week body
(and (ignore (? cfws)) day-name (ignore (? cfws))))
;; We set day-of-week to be the same as obs-day-of-week since,
;; according to their definitions in RFC5322, all days-of-week are
;; obs-days-of-week.
(define-peg-pattern day-of-week all
obs-day-of-week)
(define-peg-pattern obs-day body
(and (ignore (? cfws)) digit (? digit) (ignore (? cfws))))
;; We set day to be the same as obs-day since, according to their
;; definitions in RFC5322, all days are obs-days.
(define-peg-pattern day all
obs-day)
(define-peg-pattern month all
(or "Jan" "Feb" "Mar" "Apr" "May" "Jun"
"Jul" "Aug" "Sep" "Oct" "Nov" "Dec"))
(define-peg-pattern obs-year body
(and (ignore (? cfws)) (and digit digit) (* digit) (ignore (? cfws))))
;; We set year to be the same as obs-year since, according to their
;; definitions in RFC5322, all years are obs-years.
(define-peg-pattern year all
obs-year)
(define-peg-pattern date all
(and day month year))
(define-peg-pattern obs-hour body
(and (ignore (? cfws)) digit digit (ignore (? cfws))))
;; We set hours to be the same as obs-hour since, according to their
;; definitions in RFC5322, all hours are obs-hours.
(define-peg-pattern hours all
obs-hour)
(define-peg-pattern obs-minute body
(and (ignore (? cfws)) digit digit (ignore (? cfws))))
;; We set minutes to be the same as obs-minute since, according to
;; their definitions in RFC5322, all minutes are obs-minutes.
(define-peg-pattern minutes all
obs-minute)
(define-peg-pattern obs-second body
(and (ignore (? cfws)) digit digit (ignore (? cfws))))
;; We set seconds to be the same as obs-second since, according to
;; their definitions in RFC5322, all seconds are obs-seconds.
(define-peg-pattern seconds all
obs-second)
(define-peg-pattern time-of-day all
(and hours (ignore ":") minutes (? (and (ignore ":") seconds))))
;; zone-sign, zone-hours and zone-minutes do not exist in the ABNF
;; specified in RFC5322. But, we have it here since it eases parsing
;; the zone and converting it to a number of seconds as required by
;; the make-date procedure of SRFI-19.
(define-peg-pattern zone-sign all
(or "+" "-"))
(define-peg-pattern zone-hours all
(and digit digit))
(define-peg-pattern zone-minutes all
(and digit digit))
(define-peg-pattern obs-zone body
(or "UT" "GMT"
"EST" "EDT" "CST" "CDT"
"MST" "MDT" "PST" "PDT"
(and (not-followed-by (or "j" "J"))
(or (range #\a #\z)
(range #\A #\Z)))))
(define-peg-pattern zone all
(or (and zone-sign zone-hours zone-minutes)
obs-zone))
(define-peg-pattern time all
(and time-of-day zone))
(define-peg-pattern date-time all
(and (? (and day-of-week (ignore ","))) date time (? cfws)))
;;; Address specification
;; People tend to put strange characters in their display names, and
;; MUAs tend to pass on these non-standard names without any
;; quoting. Tolerate such names.
;; Tolerate non-standard (, ), [ and ] in atext.
(define-peg-pattern liberal-atext body
(or atext "(" ")" "[" "]"))
(define-atom-pattern liberal-cfws-captured-atom cfws liberal-atext)
(define-word-pattern liberal-cfws-captured-word liberal-cfws-captured-atom)
(define-phrase-pattern liberal-phrase liberal-cfws-captured-word)
(define-peg-pattern display-name all
liberal-phrase)
(define-peg-pattern obs-local-part body
(and word (* (and "." word))))
(define-peg-pattern local-part body
(or obs-local-part dot-atom quoted-string))
(define-peg-pattern obs-dtext body
(or obs-no-ws-ctl quoted-pair))
(define-printable-ascii-character-pattern-with-obsolete dtext obs-dtext "[" "]" "\\")
(define-peg-pattern domain-literal body
(and (? cfws) "[" (* (and (? fws) dtext)) (? fws) "]" (? cfws)))
(define-syntax-rule (define-obs-domain-pattern name atom)
(define-peg-pattern name body
(and atom (* (and "." atom)))))
(define-obs-domain-pattern obs-domain atom)
(define-syntax-rule (define-domain-pattern name obs-domain dot-atom)
(define-peg-pattern name body
(or obs-domain dot-atom domain-literal)))
(define-domain-pattern domain obs-domain dot-atom)
(define-syntax-rule (define-addr-spec-pattern name domain)
(define-peg-pattern name body
(and local-part "@" domain)))
(define-addr-spec-pattern addr-spec domain)
(define-peg-pattern obs-domain-list body
(and (* (or cfws ",")) "@" domain
(* (and "," (? cfws) (? (and "@" domain))))))
(define-peg-pattern obs-route none
(and obs-domain-list ":"))
(define-peg-pattern obs-angle-addr body
(and (ignore (and (? cfws) "<"))
obs-route addr-spec
(ignore (and ">" (? cfws)))))
(define-syntax-rule (define-angle-addr-pattern
name opening-bracket closing-bracket)
(define-peg-pattern name body
(or (and (ignore (? cfws))
opening-bracket addr-spec closing-bracket
(ignore (? cfws)))
obs-angle-addr)))
(define-angle-addr-pattern angle-addr "<" ">")
;; When handling email addresses, we need to drop the angle
;; brackets. But in the Received field we need to include the angle
;; brackes. So, we define unbracketed-angle-addr, a variant of
;; angle-addr in which the angle brackets are ignored.
(define-angle-addr-pattern
unbracketed-angle-addr (ignore "<") (ignore ">"))
(define-peg-pattern name-addr body
(and (? display-name) unbracketed-angle-addr))
;; In order to support emacs message mode parens style email
;; addresses, we need to capture comments in address fields and
;; process them as names. But, we need to ignore comments in other
;; parts of the email. In order to facilitate this, we define a set of
;; captured-* patterns which are used in address fields.
(define-atom-pattern captured-atom captured-cfws)
(define-comment-pattern captured-comment all)
(define-cfws-pattern captured-cfws captured-comment)
(define-dot-atom-pattern captured-dot-atom captured-cfws)
(define-obs-domain-pattern captured-obs-domain captured-atom)
(define-domain-pattern captured-domain captured-obs-domain captured-dot-atom)
(define-addr-spec-pattern captured-addr-spec captured-domain)
(define-peg-pattern mailbox all
(or name-addr captured-addr-spec))
(define-peg-pattern obs-mbox-list body
(and (* (and (? cfws) ",")) mailbox
(* (and "," (? (or mailbox cfws))))))
;; We set mailbox-list to be the same as obs-mbox-list since,
;; according to their definitions in RFC5322, all mailbox-lists are
;; obs-mbox-lists.
(define-peg-pattern mailbox-list all
obs-mbox-list)
(define-peg-pattern group all
(and display-name (ignore ":")
(? group-list) (ignore ";") (? cfws)))
(define-peg-pattern obs-group-list body
(and (+ (and (? cfws) ","))
(? cfws)))
(define-peg-pattern group-list all
(or mailbox-list cfws obs-group-list))
(define-peg-pattern address body
(or mailbox group))
(define-peg-pattern obs-addr-list body
(and (* (and (? cfws) ",")) address
(* (and "," (? (or address cfws))))))
;; We set address-list to be the same as obs-addr-list since,
;; according to their definitions in RFC5322, all address-lists are
;; obs-address-lists.
(define-peg-pattern address-list all
obs-addr-list)
;;; Fields
;; We have compressed the current field pattern and the obsolete field
;; pattern into a single PEG pattern.
(define-syntax define-field-pattern
(syntax-rules ()
((define-field-pattern name header pattern)
(define-peg-pattern name all
(and (ignore (string-ci header))
(ignore (and (* wsp) ":"))
pattern crlf)))
((define-field-pattern name header pattern obsolete-pattern)
(define-peg-pattern name all
(and (ignore (string-ci header))
(ignore (and (* wsp) ":"))
(or pattern obsolete-pattern) crlf)))))
;;; Origination date field
(define-field-pattern orig-date "Date" date-time)
;;; Originator fields
(define-field-pattern from "From" mailbox-list (or mailbox-list address-list))
(define-field-pattern sender "Sender" mailbox (or mailbox address))
(define-field-pattern reply-to "Reply-To" address-list)
;; Destination address fields
(define-field-pattern to "To" address-list)
(define-field-pattern cc "Cc" address-list)
(define-field-pattern bcc "Bcc"
(? (or address-list cfws))
(or address-list (and (* (and (? cfws) ",")) (? cfws))))
;;; Identification fields
(define-peg-pattern no-fold-literal body
(and "[" (* dtext) "]"))
(define-peg-pattern obs-id-left body
local-part)
(define-peg-pattern id-left body
(or obs-id-left dot-atom-text))
(define-peg-pattern obs-id-right body
domain)
(define-peg-pattern id-right body
(or obs-id-right dot-atom-text no-fold-literal))
(define-peg-pattern msg-id all
(and (ignore (? cfws)) (ignore "<")
id-left "@" id-right
(ignore ">") (ignore (? cfws))))
(define-field-pattern message-id "Message-ID" msg-id)
(define-field-pattern in-reply-to "In-Reply-To"
(+ msg-id)
(* (or phrase msg-id)))
(define-field-pattern references "References"
(+ msg-id)
(* (or phrase msg-id)))
;;; Informational fields
(define-field-pattern subject "Subject" unstructured)
(define-field-pattern comments "Comments" unstructured)
(define-field-pattern keywords "Keywords"
(and phrase (* (and "," phrase)))
obs-phrase-list)
;;; Resent fields
(define-field-pattern resent-date "Resent-Date" date-time)
(define-field-pattern resent-from "Resent-From"
(or mailbox-list address-list)
mailbox-list)
(define-field-pattern resent-sender "Resent-Sender"
(or mailbox address)
mailbox)
(define-field-pattern resent-to "Resent-To" address-list)
(define-field-pattern resent-cc "Resent-Cc" address-list)
(define-field-pattern resent-bcc "Resent-Bcc"
(? (or address-list cfws))
(or address-list
(and (* (and (? cfws) ",")) (? cfws))))
(define-field-pattern resent-msg-id "Resent-Message-ID" msg-id)
(define-field-pattern obs-resent-rply "Resent-Reply-To" address-list)
;;; Trace fields
;; word is given last priority in the ordered choice
(define-peg-pattern received-token all
(or angle-addr addr-spec domain word))
(define-field-pattern received "Received"
(and (* received-token) (ignore ";") date-time)
(* received-token))
(define-peg-pattern path body
(or angle-addr (and (? cfws) (ignore "<") (? cfws) (ignore ">") (? cfws))))
(define-field-pattern return "Return-Path" path)
(define-peg-pattern trace all
(and (? return) (+ received)))
;;; Optional fields
(define-printable-ascii-character-pattern ftext ":")
(define-peg-pattern field-name all
(+ ftext))
(define-peg-pattern obs-optional body
(and field-name (ignore (and (* wsp) ":")) unstructured crlf))
(define-peg-pattern optional-field all
(or (and field-name (ignore ":") unstructured crlf)
obs-optional))
;;; MIME version
;; I have prepended optional cfws to account for leading whitespace.
(define-field-pattern mime-version "MIME-Version"
(and (ignore (? cfws)) (+ digit) "." (+ digit)))
;;; Content type
;; TODO: What is <"> in RFC2045?
(define-peg-pattern tspecials body
(or "(" ")" "<" ">" "@"
"," ";" ":" "\\"
"/" "[" "]" "?" "="))
;; TODO: What is a CTL in RFC2045?
(define-peg-pattern token body
(+ (and (not-followed-by (or " " crlf tspecials))
peg-any)))
;; TODO: Implement iana-token, ietf-token and x-token
(define-peg-pattern iana-token body
token)
(define-peg-pattern ietf-token body
token)
(define-peg-pattern x-token body
token)
(define-peg-pattern extension-token body
(or ietf-token x-token))
(define-peg-pattern discrete-type body
(or "text" "image" "audio" "video"
"application" extension-token))
(define-peg-pattern composite-type body
(or "message" "multipart" extension-token))
(define-peg-pattern type all
(or discrete-type composite-type))
(define-peg-pattern subtype all
(or extension-token iana-token))
(define-peg-pattern attribute all
token)
;; quoted-string is given higher precedence
(define-peg-pattern value all
(or quoted-string token))
(define-peg-pattern parameter all
(and attribute (ignore "=") value))
;; I have prepended optional cfws to account for leading whitespace.
(define-field-pattern content "Content-Type"
(and (ignore (? cfws)) type (ignore "/") subtype
(* (and (ignore ";") (ignore (? cfws)) parameter))))
;;; Content Disposition as defined in RFC2183
(define-peg-pattern disposition-type body
(or "inline" "attachment" extension-token))
(define-peg-pattern quoted-date-time all
quoted-string)
(define-syntax-rule (define-parameter-pattern name header pattern)
(define-peg-pattern name all
(and (ignore header) (ignore "=") pattern)))
(define-parameter-pattern filename-parm "filename" value)
(define-parameter-pattern creation-date-parm "creation-date" quoted-date-time)
(define-parameter-pattern modification-date-parm "modification-date" quoted-date-time)
(define-parameter-pattern read-date-parm "read-date" quoted-date-time)
(define-parameter-pattern size-parm "size" (+ digit))
(define-peg-pattern disposition-parm body
(or filename-parm creation-date-parm modification-date-parm
read-date-parm size-parm parameter))
;; I have prepended optional cfws to account for leading whitespace.
(define-field-pattern disposition "Content-Disposition"
(and (ignore (? cfws)) disposition-type
(* (and (ignore ";") (ignore (? cfws)) disposition-parm))))
;;; Content transfer encoding
(define-peg-pattern mechanism body
(or "7bit" "8bit" "binary"
"quoted-printable" "base64"
ietf-token x-token))
(define-field-pattern encoding "Content-Transfer-Encoding"
(and (ignore (? cfws)) mechanism))
;;; Fields
;; The ABNF specified for fields in RFC5322 does not make sense. With
;; it, all headers are eaten up by optional-field. So, as a temporary
;; workaround, I am going with the following much simpler (but
;; possibly incorrect) ABNF.
;; TODO: Try to understand and implement the actual ABNF specified by
;; RFC5322.
(define-peg-pattern fields all
(* (or trace
resent-date
resent-from
resent-sender
resent-to
resent-cc
resent-bcc
resent-msg-id
obs-resent-rply
orig-date
from
sender
reply-to
to
cc
bcc
message-id
in-reply-to
references
subject
comments
keywords
mime-version
content
disposition
encoding
optional-field)))
(define-peg-pattern mime-extension-field-name all
(and "Content-" (+ ftext)))
(define-peg-pattern mime-extension-field all
(and mime-extension-field-name (ignore ":") unstructured crlf))
(define-peg-pattern mime-entity-fields all
(* (or content
disposition
encoding
mime-extension-field
optional-field)))
(define (decode-mime-encoded-word word)
(regexp-substitute/global
#f "=\\?([^?]*)\\?([^?]*)\\?([^?]*)\\?=" word
'pre (lambda (match-record)
(let ((charset (match:substring match-record 1))
(encoding (string->lcase-symbol (match:substring match-record 2)))
(encoded-text (match:substring match-record 3)))
(bytevector->string
((case encoding
((b) base64-decode)
((q) q-encoding-decode)
(else (error "Encoding of MIME word unknown" word)))
encoded-text)
charset 'substitute)))
'post))
(define (body->mime-entities body boundary)
"Split bytevector BODY into a list of mime entities separated by
BOUNDARY (as explained in RFC2045), and return that list."
(define (read-till-boundary port)
(read-bytes-till
port
;; TODO: Look for the boundary sequence only at the beginning of
;; a line.
;; Boundary is always an ASCII string.
(string->bytevector (string-append "--" boundary) "us-ascii")))
(define (read-mime-entity port)
(let ((line (get-line-with-delimiter port)))
;; The eof-object? check returns #t only when the message is
;; prematurely truncated. It is invoked only to handle truncated
;; messages gracefully without raising an error.
(if (or (eof-object? line)
(string-prefix? (string-append "--" boundary "--") line))
(eof-object)
(read-till-boundary port))))
(call-with-port (open-bytevector-input-port body)
(lambda (port)
(read-till-boundary port)
(read-objects read-mime-entity port))))
(define (email->headers+body email)
"Split EMAIL bytevector into headers and body. Return as multiple
values. The returned headers is a string and body is a bytevector."
(call-with-port (open-bytevector-input-port email)
(lambda (port)
;; Email headers must strictly be ASCII characters. But for the
;; sake of supporting Emacs message mode parens style addresses
;; that may use non-ASCII characters, typically for the full
;; name, we relax this requirement. We assume an encoding of
;; UTF-8, and hope that everything turns out fine. Since UTF-8
;; is a superset of ASCII, this should not affect standards
;; conforming headers. If encoding is neither UTF-8 nor ASCII,
;; we use the substitute conversion strategy and proceed without
;; raising an error.
(set-port-encoding! port "utf-8")
(set-port-conversion-strategy! port 'substitute)
(let ((headers (read-while port get-line-with-delimiter
(lambda (line)
(not (or (string= line "\n")
(string= line "\r\n")))))))
(get-line-with-delimiter port)
(values (if (eof-object? headers) "" headers)
(get-bytevector-all port))))))
(define (post-process-mailbox . args)
(define process-name
(compose decode-mime-encoded-word string-trim-both))
(match args
;; name-addr email address
(`(mailbox (display-name ,name) ,address)
`((name . ,(process-name name))
(address . ,(string-trim-both address))))
;; addr-spec email address
(`(mailbox ,(? string? address))
`((address . ,(string-trim-both address))))
;; emacs message mode parens style email address
(`(mailbox ,(? list? address-parts))
`((name . ,(match-let ((`((captured-comment ,name))
(flatten-and-filter '(captured-comment) address-parts)))
(process-name name)))
(address . ,(string-trim-both
(string-join (drop-right address-parts 1) "")))))
(_ (error "Failed to parse mailbox"))))
(define (post-process-content-transfer-encoding _ value)
(list 'content-transfer-encoding (string->lcase-symbol value)))
(define (post-process-content-type . args)
(define (valid-charset? charset)
(catch #t
(lambda ()
(bytevector->string (make-bytevector 1 0) charset)
#t)
(const #f)))
(match args
(`(content (type ,type)
(subtype ,subtype)
. ,parameters)
(let ((type (string->lcase-symbol type))
(subtype (string->lcase-symbol subtype))
(parameters
(map (match-lambda
(`(parameter (attribute ,attribute)
(value ,value))
(cons (string->lcase-symbol attribute) value)))
(flatten-and-filter '(parameter) parameters))))
`(content-type
,(acons*
'type type
'subtype subtype
(cond
((and (eq? type 'text)
(or
;; RFC6657 specifies UTF-8 as the default charset for
;; text/* media types.
(not (assoc-ref parameters 'charset))
;; RFC2045 recommends that the default be assumed
;; when a syntactically invalid Content-Type header
;; is encountered. In this implementation, we are
;; only checking for the validity of the
;; charset. Should we check for the validity of the
;; entire Content-Type header? If so, how?
(not (valid-charset? (assoc-ref parameters 'charset)))))
(alist-combine parameters (acons* 'charset "utf-8")))
(else parameters))))))))
(define post-process-content-disposition
(match-lambda*
(`(disposition ,type . ,parameters)
`(content-disposition
,(acons 'type (string->lcase-symbol type)
(map (match-lambda
(('filename-parm ('value filename))
(cons 'filename (basename filename)))
(((? (lambda (date-parm)
(member date-parm '(creation-date-parm modification-date-parm read-date-parm)))
date-parm) value)
;; TODO: Convert to SRFI-19 datetime
(cons date-parm value))
(('size-parm value)
(cons 'size (string->number value)))
(`(parameter (attribute ,attribute)
(value ,value))
(cons (string->lcase-symbol attribute) value)))
(flatten-and-filter
'(filename-parm creation-date-parm modification-date-parm
read-date-parm size-parm parameter)
parameters)))))))
(define post-process-optional-field
(match-lambda*
(`(optional-field
(field-name ,field-name)
,field-value)
(list (string->lcase-symbol field-name)
field-value))
(`(optional-field (field-name ,field-name))
(list field-name ""))))
(define* (macro-process-address-list _ . addresses)
(flatten-and-filter '(address mailbox) addresses))
(define (post-process-fields fields)
(map (match-lambda
((field value)
(cons field value))
((field . values)
(cons field values))
;; If the Subject header is blank, treat it as having the
;; null string as value.
('subject '(subject . ""))
(_ #f))
fields))
(define (parse-email-address address)
"Parse ADDRESS as an email address and return an association list
with keys being the symbols name and address, and values being the
display-name and addr-spec respectively. display-name and addr-spec
are as defined in RFC5322.
For example,
(parse-email-address \"Foo <foo@example.org>\")
=> ((name . \"Foo\") (address . \"foo@example.org\"))
(parse-email-address \"foo@example.org\")
=> ((address . \"foo@example.org\"))"
(pre-post-order
(peg:tree (match-pattern mailbox address))
`((mailbox . ,post-process-mailbox)
(*text* . ,(lambda (_ text) text))
(*default* . ,(lambda tree tree)))))
(define interpret-address
(match-lambda
((('name . name)
('address . address))
(string-append
;; Quote display names with illegal characters.
(let ((char-set:atext (char-set-intersection
char-set:ascii
(char-set-union char-set:letter
char-set:digit
(char-set #\! #\# #\$ #\%
#\& #\' #\* #\+
#\- #\/ #\= #\?
#\^ #\_ #\` #\{
#\| #\} #\~)))))
(if (string-every char-set:atext name)
name
(string-append "\"" name "\"")))
" <" address ">"))
((('address . address)) address)))
(define (parse-email-body headers body)
"Parse BODY as email body where HEADERS is an association list of
header keys and values as returned by parse-email-headers. Return a
list of <mime-entity> records if the body is a multipart
message. Else, return a single <mime-entity> record."
(match body
((? string? body)
(parse-email-body headers (string->bytevector body "utf-8")))
((? bytevector? body)
(let ((content-type (assoc-ref headers 'content-type)))
(case (assoc-ref content-type 'type)
((multipart)
(map (cut parse-mime-entity headers <>)
(body->mime-entities body (assoc-ref content-type 'boundary))))
((text)
(string-trim-both
(decode-body body (assoc-ref headers 'content-transfer-encoding)
(assoc-ref content-type 'charset))))
(else (decode-body body (assoc-ref headers 'content-transfer-encoding))))))))
(define (add-default-headers headers)
;; Default Content-Type and Content-Transfer-Encoding headers as
;; specified in RFC2045
(alist-combine (acons* 'content-type '((type . text)
(subtype . plain)
;; UTF-8 is specified as the default
;; charset in RFC6657
(charset . "utf-8"))
'content-transfer-encoding '#{7bit}#)
headers))
(define (handle-invalid-headers headers)
;; §6.4 of RFC2045 specifies that any entity with an unrecognized
;; Content-Transfer-Encoding must be treated as if it has a
;; Content-Type of "application/octet-stream", regardless of what
;; the Content-Type header field actually says.
(if (memq (assq-ref headers 'content-transfer-encoding)
(list '7bit '8bit 'binary 'quoted-printable 'base64))
headers
(alist-combine headers
'((content-type (type . application)
(subtype . octet-stream))
(content-transfer-encoding . binary)))))
(define (add-default-mime-entity-headers parent-headers headers)
;; Default Content-Type and Content-Transfer-Encoding headers as
;; specified in RFC2045 and RFC2046
(let ((parent-content-type (assoc-ref parent-headers 'content-type)))
(alist-combine
(acons* 'content-type
`(,@(if (and (eq? (assoc-ref parent-content-type 'type) 'multipart)
(eq? (assoc-ref parent-content-type 'subtype) 'digest))
'((type . message)
(subtype . rfc822))
'((type . text)
(subtype . plain)))
;; UTF-8 is specified as the default
;; charset in RFC6657
(charset . "utf-8"))
'content-transfer-encoding '#{7bit}#)
headers)))
(define (parse-mime-entity parent-headers bv)
(let-values (((headers body) (email->headers+body bv)))
(let ((headers
(pre-post-order
(peg:tree
(match-pattern mime-entity-fields headers))
`((content . ,post-process-content-type)
(encoding . ,post-process-content-transfer-encoding)
(disposition . ,post-process-content-disposition)
(optional-field . ,post-process-optional-field)
(mime-entity-fields . ,(lambda (_ . mime-entity-fields)
(add-default-mime-entity-headers
parent-headers
(post-process-fields mime-entity-fields))))
(*text* . ,(match-lambda*
;; Handle MIME entities that have no
;; headers.
((_ 'mime-entity-fields)
(add-default-mime-entity-headers parent-headers '()))
((_ text) text)))
(*default* . ,(lambda tree tree))))))
(make-mime-entity headers (parse-email-body headers body)))))
(define (parse-email email)
"Parse string EMAIL and return result as an <email> record."
(match email
((? string? email)
(parse-email (string->bytevector email "utf-8")))
((? bytevector? email)
(let-values (((headers body) (email->headers+body email)))
(let ((headers-alist (parse-email-headers headers)))
(make-email headers-alist (parse-email-body headers-alist body)))))))
(define (parse-email-headers headers)
"Parse string HEADERS as email headers and return an association
list of header keys and values."
(pre-post-order
(peg:tree
(match-pattern fields headers))
`((received . ,(match-lambda*
(('received tokens (? date? timestamp))
(list 'received (string-join tokens) timestamp))
(('received tokens ...)
(list 'received (string-join tokens)))))
(received-token . ,(match-lambda*
(`(received-token ,token) token)))
(date-time . ,(lambda node
(match-let*
(;; Seconds are optional; provide a default
;; binding which will be shadowed by match.
(seconds "0")
((('day day) ('month month) ('year year)
('hours hours) ('minutes minutes)
. (or (('seconds seconds) ('zone . zone))
(('zone . zone))))
(flatten-and-filter
'(day month year hours minutes seconds zone)
node)))
(make-date 0
(string->number seconds)
(string->number minutes)
(string->number hours)
(string->number day)
(1+ (list-index
(cut equal? <> month)
(list "Jan" "Feb" "Mar" "Apr" "May" "Jun"
"Jul" "Aug" "Sep" "Oct" "Nov" "Dec")))
(let ((year (string->number year)))
(cond
((and (>= year 0) (<= year 49))
(+ year 2000))
((and (>= year 50) (<= year 999))
(+ year 1900))
(else year)))
(match zone
(((or "UT" "GMT")) 0)
(("EDT") (* -4 3600))
(((or "EST" "CDT")) (* -5 3600))
(((or "CST" "MDT")) (* -6 3600))
(((or "MST" "PDT")) (* -7 3600))
(("PST") (* -8 3600))
(`((zone-sign ,zone-sign)
(zone-hours ,zone-hours)
(zone-minutes ,zone-minutes))
(* (case (string->symbol zone-sign)
((+) 1)
((-) -1))
(+ (* 60 60 (string->number zone-hours))
(* 60 (string->number zone-minutes))))))))))
(orig-date . ,(lambda (_ date) (list 'date date)))
(mailbox . ,post-process-mailbox)
(address-list *macro* . ,macro-process-address-list)
(mailbox-list *macro* . ,macro-process-address-list)
(optional-field . ,post-process-optional-field)
(msg-id . ,(match-lambda* (`(msg-id ,msg-id) msg-id)))
(content . ,post-process-content-type)
(encoding . ,post-process-content-transfer-encoding)
(disposition . ,post-process-content-disposition)
(subject . ,(match-lambda* (`(subject ,subject)
`(subject ,(decode-mime-encoded-word subject)))))
(keywords . ,(lambda (_ value)
(cons 'keywords (string-split value #\,))))
(fields . ,(lambda (_ . fields)
(handle-invalid-headers
(add-default-headers (post-process-fields fields)))))
(*text* . ,(lambda (_ value) value))
(*default* . ,(lambda tree tree)))))
(define* (decode-body body encoding #:optional charset)
(let ((decoded-octets
(case encoding
((base64) (base64-decode body))
((quoted-printable) (quoted-printable-decode
(bytevector->string body "us-ascii")))
((#{7bit}# #{8bit}# binary) body)
(else (error "Body decoding failed. Unknown encoding" encoding)))))
(if charset
(bytevector->string decoded-octets charset 'substitute)
decoded-octets)))
(define (read-next-email-in-mbox port)
"Read next email from PORT and return as a bytevector. PORT is an
input port reading an mbox file."
;; Read and discard From_ line
(get-line-with-delimiter port)
;; Read the actual email
(let ((email (read-bytes-till
port (string->bytevector "\nFrom " "us-ascii"))))
(read-char port)
email))
(define (mbox->emails port)
"Read all emails from PORT and return as a list of bytevectors.
PORT is an input port reading an mbox file."
(read-objects read-next-email-in-mbox port))