aboutsummaryrefslogtreecommitdiff
;;; guile-email --- Guile email parser
;;; Copyright © 2018, 2019, 2020, 2021, 2023 Arun Isaac <arunisaac@systemreboot.net>
;;; Copyright © 2021 Mathieu Othacehe <othacehe@gnu.org>
;;; Copyright © 2023 Andrew Whatson <whatson@tailcall.au>
;;;
;;; This file is part of guile-email.
;;;
;;; guile-email is free software; you can redistribute it and/or modify
;;; it under the terms of the GNU Affero General Public License as
;;; published by the Free Software Foundation; either version 3 of the
;;; License, or (at your option) any later version.
;;;
;;; guile-email is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; Affero General Public License for more details.
;;;
;;; You should have received a copy of the GNU Affero General Public
;;; License along with guile-email.  If not, see
;;; <http://www.gnu.org/licenses/>.

(define-module (email email)
  #:use-module (ice-9 binary-ports)
  #:use-module (ice-9 iconv)
  #:use-module (ice-9 match)
  #:use-module (ice-9 peg)
  #:use-module (ice-9 regex)
  #:use-module (rnrs bytevectors)
  #:use-module ((rnrs io ports) #:select (call-with-port))
  #:use-module (rnrs io simple)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-2)
  #:use-module (srfi srfi-9)
  #:use-module (srfi srfi-11)
  #:use-module (srfi srfi-19)
  #:use-module (srfi srfi-26)
  #:use-module (sxml transform)
  #:use-module (email base64)
  #:use-module (email quoted-printable)
  #:use-module (email utils)
  #:export (<email>
            make-email
            email?
            email-headers
            email-body
            <mime-entity>
            make-mime-entity
            mime-entity?
            mime-entity-headers
            mime-entity-body
            email->headers+body
            parse-email
            parse-email-headers
            parse-email-body
            parse-email-address
            interpret-address
            mbox->emails))

(define (flatten-and-filter terms tree)
  (filter list? (keyword-flatten terms tree)))

(define-record-type <email>
  (make-email headers body)
  email?
  (headers email-headers)
  (body email-body))

(define-record-type <mime-entity>
  (make-mime-entity headers body)
  mime-entity?
  (headers mime-entity-headers)
  (body mime-entity-body))

(define string->lcase-symbol
  (compose string->symbol string-downcase))

;;; PEG parser implementing the ABNF grammar specified in RFC5322
;;; (Internet Message Format), RFC6854 (Update to Internet Message
;;; Format to Allow Group Syntax in the "From:" and "Sender:" Header
;;; Fields)

;;; Obsolete syntax has not been implemented.

;;; Core ABNF rules from RFC5234

(define-peg-pattern alpha body
  (or (range #\A #\Z) (range #\a #\z)))

;; Though line endings should be crlf (\r\n), we also tolerate bare
;; line feeds (\n)
(define-peg-pattern crlf none
  (or "\r\n" "\n"))

(define-peg-pattern digit body
  (range #\0 #\9))

(define-peg-pattern dquote body
  "\"")

;; Printable ASCII characters and UTF-8 characters > \x7f (RFC6532)
(define-peg-pattern vchar body
  (and (not-followed-by (or (range #\Nul #\Space)
                            "\x7f"))
       peg-any))

(define-peg-pattern wsp body
  (or " " "\t"))

(define-peg-pattern lwsp body
  (* (or wsp (and crlf wsp))))

(define-syntax-rule (define-printable-ascii-character-pattern name . exceptions)
  (define-peg-pattern name body
    (and (not-followed-by (or . exceptions))
         vchar)))

(define-syntax-rule (define-printable-ascii-character-pattern-with-obsolete
                      name obsolete-pattern . exceptions)
  (define-peg-pattern name body
    (or (and (not-followed-by (or . exceptions))
             vchar)
        obsolete-pattern)))

;;; Quoted characters

(define-peg-pattern obs-qp body
  (and "\\" (or "\x00" obs-no-ws-ctl "\n" "\r")))

(define-peg-pattern quoted-pair body
  (or (and (ignore "\\") (or vchar wsp))
      obs-qp))

;;; Folding white space and comments

(define-peg-pattern obs-fws body
  (and (+ wsp) (* (and crlf (+ wsp)))))

(define-peg-pattern fws body
  (or (and (? (and (* wsp) crlf)) (+ wsp))
      obs-fws))

(define-peg-pattern obs-no-ws-ctl body
  (and (not-followed-by (or "\t" "\n" "\r"))
       (or (range #\soh #\us)
           "\x7f")))

(define-peg-pattern obs-ctext body
  obs-no-ws-ctl)

(define-printable-ascii-character-pattern-with-obsolete ctext obs-ctext
  "(" ")" "\\")

(define-syntax-rule (define-comment-pattern name capture-type)
  (define-peg-pattern name capture-type
    (and (ignore "(") (* (and (? fws) ccontent)) (ignore ")"))))

(define-comment-pattern comment none)

(define-peg-pattern ccontent body
  (or ctext quoted-pair comment))

(define-syntax-rule (define-cfws-pattern name comment)
  (define-peg-pattern name body
    (or (and (+ (and (? fws) comment))
             (? fws))
        fws)))

(define-cfws-pattern cfws comment)

;;; Atom

(define-printable-ascii-character-pattern atext
  "\"" "(" ")" "," "." ":" ";" "<" ">" "@" "[" "\\" "]")

(define-syntax define-atom-pattern
  (syntax-rules ()
    ((define-atom-pattern name)
     (define-peg-pattern name body
       (and (ignore (? cfws)) (+ atext) (ignore (? cfws)))))
    ((define-atom-pattern name cfws)
     (define-peg-pattern name body
       (and (? cfws) (+ atext) (? cfws))))
    ((define-atom-pattern name cfws atext)
     (define-peg-pattern name body
       (and (? cfws) (+ atext) (? cfws))))))

(define-atom-pattern atom)

(define-peg-pattern dot-atom-text body
  (and (+ atext) (* (and "." (+ atext)))))

(define-syntax-rule (define-dot-atom-pattern name cfws)
  (define-peg-pattern name body
    (and (ignore (? cfws)) dot-atom-text (ignore (? cfws)))))

(define-dot-atom-pattern dot-atom cfws)

;;; Quoted strings

(define-peg-pattern obs-qtext body
  obs-no-ws-ctl)

(define-printable-ascii-character-pattern-with-obsolete qtext obs-qtext
  "\\" "\"")

(define-peg-pattern qcontent body
  (or qtext quoted-pair))

;; TODO: Remove workaround guile peg bug for ignore
(define-peg-pattern quoted-string body
  (and (? cfws) (ignore (and dquote))
       (* (and (? fws) qcontent))
       (? fws)
       (ignore (and dquote)) (? cfws)))

;;; Miscellaneous tokens

;; According to RFC5322§3.2.3, the leading and trailing cfws in atom
;; are semantically not part of it, and should be ignored. But, to
;; support obs-phrase, we need to capture it. Hence, we define a set
;; of cfws-captured-* patterns needed by obs-phrase.

(define-syntax-rule (define-word-pattern name atom)
  (define-peg-pattern name body
    (or atom quoted-string)))

(define-word-pattern word atom)
(define-atom-pattern cfws-captured-atom cfws)
(define-word-pattern cfws-captured-word cfws-captured-atom)

(define-syntax-rule (define-phrase-pattern name word)
  (define-peg-pattern name body
    (and word (* (or word "." cfws)))))

(define-phrase-pattern obs-phrase cfws-captured-word)

;; We set phrase to be the same as obs-phrase since, according to
;; their definitions in RFC5322, all phrases are obs-phrases.
(define-peg-pattern phrase body
  obs-phrase)

(define-peg-pattern obs-phrase-list body
  (and (? (or phrase cfws))
       (* (and "," (or phrase cfws)))))

(define-peg-pattern obs-utext body
  (or "\x00" obs-no-ws-ctl vchar))

(define-peg-pattern obs-unstruct body
  (* (or (and (* "\n") (* "\r")
              (* (and obs-utext (* "\n") (* "\r"))))
         fws)))

;; ABNF modified to ignore leading whitespace
;; ABNF modified to allow for blank lines in folded field
(define-peg-pattern unstructured body
  (or (and (ignore (? fws))
           (* (and (? fws) (? vchar)))
           (ignore (* wsp)))
      obs-unstruct))

;;; Date and time specification

(define-peg-pattern day-name body
  (or "Mon" "Tue" "Wed" "Thu" "Fri" "Sat" "Sun"))

(define-peg-pattern obs-day-of-week body
  (and (ignore (? cfws)) day-name (ignore (? cfws))))

;; We set day-of-week to be the same as obs-day-of-week since,
;; according to their definitions in RFC5322, all days-of-week are
;; obs-days-of-week.
(define-peg-pattern day-of-week all
  obs-day-of-week)

(define-peg-pattern obs-day body
  (and (ignore (? cfws)) digit (? digit) (ignore (? cfws))))

;; We set day to be the same as obs-day since, according to their
;; definitions in RFC5322, all days are obs-days.
(define-peg-pattern day all
  obs-day)

(define-peg-pattern month all
  (or "Jan" "Feb" "Mar" "Apr" "May" "Jun"
      "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"))

(define-peg-pattern obs-year body
  (and (ignore (? cfws)) (and digit digit) (* digit) (ignore (? cfws))))

;; We set year to be the same as obs-year since, according to their
;; definitions in RFC5322, all years are obs-years.
(define-peg-pattern year all
  obs-year)

(define-peg-pattern date all
  (and day month year))

(define-peg-pattern obs-hour body
  (and (ignore (? cfws)) digit digit (ignore (? cfws))))

;; We set hours to be the same as obs-hour since, according to their
;; definitions in RFC5322, all hours are obs-hours.
(define-peg-pattern hours all
  obs-hour)

(define-peg-pattern obs-minute body
  (and (ignore (? cfws)) digit digit (ignore (? cfws))))

;; We set minutes to be the same as obs-minute since, according to
;; their definitions in RFC5322, all minutes are obs-minutes.
(define-peg-pattern minutes all
  obs-minute)

(define-peg-pattern obs-second body
  (and (ignore (? cfws)) digit digit (ignore (? cfws))))

;; We set seconds to be the same as obs-second since, according to
;; their definitions in RFC5322, all seconds are obs-seconds.
(define-peg-pattern seconds all
  obs-second)

(define-peg-pattern time-of-day all
  (and hours (ignore ":") minutes (? (and (ignore ":") seconds))))

;; zone-sign, zone-hours and zone-minutes do not exist in the ABNF
;; specified in RFC5322. But, we have it here since it eases parsing
;; the zone and converting it to a number of seconds as required by
;; the make-date procedure of SRFI-19.
(define-peg-pattern zone-sign all
  (or "+" "-"))

(define-peg-pattern zone-hours all
  (and digit digit))

(define-peg-pattern zone-minutes all
  (and digit digit))

(define-peg-pattern obs-zone body
  (or "UT" "GMT"
      "EST" "EDT" "CST" "CDT"
      "MST" "MDT" "PST" "PDT"
      (and (not-followed-by (or "j" "J"))
           (or (range #\a #\z)
               (range #\A #\Z)))))

(define-peg-pattern zone all
  (or (and zone-sign zone-hours zone-minutes)
      obs-zone))

(define-peg-pattern time all
  (and time-of-day zone))

(define-peg-pattern date-time all
  (and (? (and day-of-week (ignore ","))) date time (? cfws)))

;;; Address specification

;; People tend to put strange characters in their display names, and
;; MUAs tend to pass on these non-standard names without any
;; quoting. Tolerate such names.

;; Tolerate non-standard ( and ) in atext.
(define-peg-pattern liberal-atext body
  (or atext "(" ")"))

(define-atom-pattern liberal-cfws-captured-atom cfws liberal-atext)
(define-word-pattern liberal-cfws-captured-word liberal-cfws-captured-atom)
(define-phrase-pattern liberal-phrase liberal-cfws-captured-word)

(define-peg-pattern display-name all
  liberal-phrase)

(define-peg-pattern obs-local-part body
  (and word (* (and "." word))))

(define-peg-pattern local-part body
  (or obs-local-part dot-atom quoted-string))

(define-peg-pattern obs-dtext body
  (or obs-no-ws-ctl quoted-pair))

(define-printable-ascii-character-pattern-with-obsolete dtext obs-dtext "[" "]" "\\")

(define-peg-pattern domain-literal body
  (and (? cfws) "[" (* (and (? fws) dtext)) (? fws) "]" (? cfws)))

(define-syntax-rule (define-obs-domain-pattern name atom)
  (define-peg-pattern name body
    (and atom (* (and "." atom)))))

(define-obs-domain-pattern obs-domain atom)

(define-syntax-rule (define-domain-pattern name obs-domain dot-atom)
  (define-peg-pattern name body
    (or obs-domain dot-atom domain-literal)))

(define-domain-pattern domain obs-domain dot-atom)

(define-syntax-rule (define-addr-spec-pattern name domain)
  (define-peg-pattern name body
    (and local-part "@" domain)))

(define-addr-spec-pattern addr-spec domain)

(define-peg-pattern obs-domain-list body
  (and (* (or cfws ",")) "@" domain
       (* (and "," (? cfws) (? (and "@" domain))))))

(define-peg-pattern obs-route none
  (and obs-domain-list ":"))

(define-peg-pattern obs-angle-addr body
  (and (ignore (and (? cfws) "<"))
       obs-route addr-spec
       (ignore (and ">" (? cfws)))))

(define-syntax-rule (define-angle-addr-pattern
                      name opening-bracket closing-bracket)
  (define-peg-pattern name body
    (or (and (ignore (? cfws))
             opening-bracket addr-spec closing-bracket
             (ignore (? cfws)))
        obs-angle-addr)))

(define-angle-addr-pattern angle-addr "<" ">")

;; When handling email addresses, we need to drop the angle
;; brackets. But in the Received field we need to include the angle
;; brackes. So, we define unbracketed-angle-addr, a variant of
;; angle-addr in which the angle brackets are ignored.

(define-angle-addr-pattern
  unbracketed-angle-addr (ignore "<") (ignore ">"))

(define-peg-pattern name-addr body
  (and (? display-name) unbracketed-angle-addr))

;; In order to support emacs message mode parens style email
;; addresses, we need to capture comments in address fields and
;; process them as names. But, we need to ignore comments in other
;; parts of the email. In order to facilitate this, we define a set of
;; captured-* patterns which are used in address fields.

(define-atom-pattern captured-atom captured-cfws)
(define-comment-pattern captured-comment all)
(define-cfws-pattern captured-cfws captured-comment)
(define-dot-atom-pattern captured-dot-atom captured-cfws)
(define-obs-domain-pattern captured-obs-domain captured-atom)
(define-domain-pattern captured-domain captured-obs-domain captured-dot-atom)
(define-addr-spec-pattern captured-addr-spec captured-domain)

(define-peg-pattern mailbox all
  (or name-addr captured-addr-spec))

(define-peg-pattern obs-mbox-list body
  (and (* (and (? cfws) ",")) mailbox
       (* (and "," (? (or mailbox cfws))))))

;; We set mailbox-list to be the same as obs-mbox-list since,
;; according to their definitions in RFC5322, all mailbox-lists are
;; obs-mbox-lists.
(define-peg-pattern mailbox-list all
  obs-mbox-list)

(define-peg-pattern group all
  (and display-name (ignore ":")
       (? group-list) (ignore ";") (? cfws)))

(define-peg-pattern obs-group-list body
  (and (+ (and (? cfws) ","))
       (? cfws)))

(define-peg-pattern group-list all
  (or mailbox-list cfws obs-group-list))

(define-peg-pattern address body
  (or mailbox group))

(define-peg-pattern obs-addr-list body
  (and (* (and (? cfws) ",")) address
       (* (and "," (? (or address cfws))))))

;; We set address-list to be the same as obs-addr-list since,
;; according to their definitions in RFC5322, all address-lists are
;; obs-address-lists.
(define-peg-pattern address-list all
  obs-addr-list)

;;; Fields

;; We have compressed the current field pattern and the obsolete field
;; pattern into a single PEG pattern.
(define-syntax define-field-pattern
  (syntax-rules ()
    ((define-field-pattern name header pattern)
     (define-peg-pattern name all
       (and (ignore (string-ci header))
            (ignore (and (* wsp) ":"))
            pattern crlf)))
    ((define-field-pattern name header pattern obsolete-pattern)
     (define-peg-pattern name all
       (and (ignore (string-ci header))
            (ignore (and (* wsp) ":"))
            (or pattern obsolete-pattern) crlf)))))

;;; Origination date field

(define-field-pattern orig-date "Date" date-time)

;;; Originator fields

(define-field-pattern from "From" mailbox-list (or mailbox-list address-list))
(define-field-pattern sender "Sender" mailbox (or mailbox address))
(define-field-pattern reply-to "Reply-To" address-list)

;; Destination address fields

(define-field-pattern to "To" address-list)
(define-field-pattern cc "Cc" address-list)
(define-field-pattern bcc "Bcc"
  (? (or address-list cfws))
  (or address-list (and (* (and (? cfws) ",")) (? cfws))))

;;; Identification fields

(define-peg-pattern no-fold-literal body
  (and "[" (* dtext) "]"))

(define-peg-pattern obs-id-left body
  local-part)

(define-peg-pattern id-left body
  (or obs-id-left dot-atom-text))

(define-peg-pattern obs-id-right body
  domain)

(define-peg-pattern id-right body
  (or obs-id-right dot-atom-text no-fold-literal))

(define-peg-pattern msg-id all
  (and (ignore (? cfws)) (ignore "<")
       id-left "@" id-right
       (ignore ">") (ignore (? cfws))))

(define-field-pattern message-id "Message-ID" msg-id)
(define-field-pattern in-reply-to "In-Reply-To"
  (+ msg-id)
  (* (or phrase msg-id)))
(define-field-pattern references "References"
  (+ msg-id)
  (* (or phrase msg-id)))

;;; Informational fields

(define-field-pattern subject "Subject" unstructured)
(define-field-pattern comments "Comments" unstructured)
(define-field-pattern keywords "Keywords"
  (and phrase (* (and "," phrase)))
  obs-phrase-list)

;;; Resent fields

(define-field-pattern resent-date "Resent-Date" date-time)
(define-field-pattern resent-from "Resent-From"
  (or mailbox-list address-list)
  mailbox-list)
(define-field-pattern resent-sender "Resent-Sender"
  (or mailbox address)
  mailbox)
(define-field-pattern resent-to "Resent-To" address-list)
(define-field-pattern resent-cc "Resent-Cc" address-list)
(define-field-pattern resent-bcc "Resent-Bcc"
  (? (or address-list cfws))
  (or address-list
      (and (* (and (? cfws) ",")) (? cfws))))
(define-field-pattern resent-msg-id "Resent-Message-ID" msg-id)
(define-field-pattern obs-resent-rply "Resent-Reply-To" address-list)

;;; Trace fields

;; word is given last priority in the ordered choice
(define-peg-pattern received-token all
  (or angle-addr addr-spec domain word))

(define-field-pattern received "Received"
  (and (* received-token) (ignore ";") date-time)
  (* received-token))

(define-peg-pattern path body
  (or angle-addr (and (? cfws) (ignore "<") (? cfws) (ignore ">") (? cfws))))

(define-field-pattern return "Return-Path" path)

(define-peg-pattern trace all
  (and (? return) (+ received)))

;;; Optional fields

(define-printable-ascii-character-pattern ftext ":")

(define-peg-pattern field-name all
  (+ ftext))

(define-peg-pattern obs-optional body
  (and field-name (ignore (and (* wsp) ":")) unstructured crlf))

(define-peg-pattern optional-field all
  (or (and field-name (ignore ":") unstructured crlf)
      obs-optional))

;;; MIME version

;; I have prepended optional cfws to account for leading whitespace.
(define-field-pattern mime-version "MIME-Version"
  (and (ignore (? cfws)) (+ digit) "." (+ digit)))

;;; Content type

;; TODO: What is <"> in RFC2045?
(define-peg-pattern tspecials body
  (or "(" ")" "<" ">" "@"
      "," ";" ":" "\\"
      "/" "[" "]" "?" "="))

;; TODO: What is a CTL in RFC2045?
(define-peg-pattern token body
  (+ (and (not-followed-by (or " " crlf tspecials))
          peg-any)))

;; TODO: Implement iana-token, ietf-token and x-token
(define-peg-pattern iana-token body
  token)

(define-peg-pattern ietf-token body
  token)

(define-peg-pattern x-token body
  token)

(define-peg-pattern extension-token body
  (or ietf-token x-token))

(define-peg-pattern discrete-type body
  (or "text" "image" "audio" "video"
      "application" extension-token))

(define-peg-pattern composite-type body
  (or "message" "multipart" extension-token))

(define-peg-pattern type all
  (or discrete-type composite-type))

(define-peg-pattern subtype all
  (or extension-token iana-token))

(define-peg-pattern attribute all
  token)

;; quoted-string is given higher precedence
(define-peg-pattern value all
  (or quoted-string token))

(define-peg-pattern parameter all
  (and attribute (ignore "=") value))

;; I have prepended optional cfws to account for leading whitespace.
(define-field-pattern content "Content-Type"
  (and (ignore (? cfws)) type (ignore "/") subtype
       (* (and (ignore ";") (ignore (? cfws)) parameter))))

;;; Content Disposition as defined in RFC2183

(define-peg-pattern disposition-type body
  (or "inline" "attachment" extension-token))

(define-peg-pattern quoted-date-time all
  quoted-string)

(define-syntax-rule (define-parameter-pattern name header pattern)
  (define-peg-pattern name all
    (and (ignore header) (ignore "=") pattern)))

(define-parameter-pattern filename-parm "filename" value)
(define-parameter-pattern creation-date-parm "creation-date" quoted-date-time)
(define-parameter-pattern modification-date-parm "modification-date" quoted-date-time)
(define-parameter-pattern read-date-parm "read-date" quoted-date-time)
(define-parameter-pattern size-parm "size" (+ digit))

(define-peg-pattern disposition-parm body
  (or filename-parm creation-date-parm modification-date-parm
      read-date-parm size-parm parameter))

;; I have prepended optional cfws to account for leading whitespace.
(define-field-pattern disposition "Content-Disposition"
  (and (ignore (? cfws)) disposition-type
       (* (and (ignore ";") (ignore (? cfws)) disposition-parm))))

;;; Content transfer encoding

(define-peg-pattern mechanism body
  (or "7bit" "8bit" "binary"
      "quoted-printable" "base64"
      ietf-token x-token))

(define-field-pattern encoding "Content-Transfer-Encoding"
  (and (ignore (? cfws)) mechanism))

;;; Fields

;; The ABNF specified for fields in RFC5322 does not make sense. With
;; it, all headers are eaten up by optional-field. So, as a temporary
;; workaround, I am going with the following much simpler (but
;; possibly incorrect) ABNF.

;; TODO: Try to understand and implement the actual ABNF specified by
;; RFC5322.
(define-peg-pattern fields all
  (* (or trace
         resent-date
         resent-from
         resent-sender
         resent-to
         resent-cc
         resent-bcc
         resent-msg-id
         obs-resent-rply
         orig-date
         from
         sender
         reply-to
         to
         cc
         bcc
         message-id
         in-reply-to
         references
         subject
         comments
         keywords
         mime-version
         content
         disposition
         encoding
         optional-field)))

(define-peg-pattern mime-extension-field-name all
  (and "Content-" (+ ftext)))

(define-peg-pattern mime-extension-field all
  (and mime-extension-field-name (ignore ":") unstructured crlf))

(define-peg-pattern mime-entity-fields all
  (* (or content
         disposition
         encoding
         mime-extension-field
         optional-field)))

(define (decode-mime-encoded-word word)
  (regexp-substitute/global
   #f "=\\?([^?]*)\\?([^?]*)\\?([^?]*)\\?=" word
   'pre (lambda (match-record)
          (let ((charset (match:substring match-record 1))
                (encoding (string->lcase-symbol (match:substring match-record 2)))
                (encoded-text (match:substring match-record 3)))
            (bytevector->string
             ((case encoding
                ((b) base64-decode)
                ((q) q-encoding-decode)
                (else (error "Encoding of MIME word unknown" word)))
              encoded-text)
             charset 'substitute)))
   'post))

(define (body->mime-entities body boundary)
  "Split bytevector BODY into a list of mime entities separated by
BOUNDARY (as explained in RFC2045), and return that list."
  (define (read-till-boundary port)
    (read-bytes-till
     port
     ;; TODO: Look for the boundary sequence only at the beginning of
     ;; a line.
     ;; Boundary is always an ASCII string.
     (string->bytevector (string-append "--" boundary) "us-ascii")))

  (define (read-mime-entity port)
    (let ((line (get-line-with-delimiter port)))
      ;; The eof-object? check returns #t only when the message is
      ;; prematurely truncated. It is invoked only to handle truncated
      ;; messages gracefully without raising an error.
      (if (or (eof-object? line)
              (string-prefix? (string-append "--" boundary "--") line))
          (eof-object)
          (read-till-boundary port))))

  (call-with-port (open-bytevector-input-port body)
    (lambda (port)
      (read-till-boundary port)
      (read-objects read-mime-entity port))))

(define (email->headers+body email)
  "Split EMAIL bytevector into headers and body. Return as multiple
values. The returned headers is a string and body is a bytevector."
  (call-with-port (open-bytevector-input-port email)
    (lambda (port)
      ;; Email headers must strictly be ASCII characters. But for the
      ;; sake of supporting Emacs message mode parens style addresses
      ;; that may use non-ASCII characters, typically for the full
      ;; name, we relax this requirement. We assume an encoding of
      ;; UTF-8, and hope that everything turns out fine. Since UTF-8
      ;; is a superset of ASCII, this should not affect standards
      ;; conforming headers. If encoding is neither UTF-8 nor ASCII,
      ;; we use the substitute conversion strategy and proceed without
      ;; raising an error.
      (set-port-encoding! port "utf-8")
      (set-port-conversion-strategy! port 'substitute)
      (let ((headers (read-while port get-line-with-delimiter
                                 (lambda (line)
                                   (not (or (string= line "\n")
                                            (string= line "\r\n")))))))
        (get-line-with-delimiter port)
        (values (if (eof-object? headers) "" headers)
                (get-bytevector-all port))))))

(define (post-process-mailbox . args)
  (define process-name
    (compose decode-mime-encoded-word string-trim-both))

  (match args
    ;; name-addr email address
    (`(mailbox (display-name ,name) ,address)
     `((name . ,(process-name name))
       (address . ,(string-trim-both address))))
    ;; addr-spec email address
    (`(mailbox ,(? string? address))
     `((address . ,(string-trim-both address))))
    ;; emacs message mode parens style email address
    (`(mailbox ,(? list? address-parts))
     `((name . ,(match-let ((`((captured-comment ,name))
                             (flatten-and-filter '(captured-comment) address-parts)))
                  (process-name name)))
       (address . ,(string-trim-both
                    (string-join (drop-right address-parts 1) "")))))
    (_ (error "Failed to parse mailbox"))))

(define (post-process-content-transfer-encoding _ value)
  (list 'content-transfer-encoding (string->lcase-symbol value)))

(define (post-process-content-type . args)
  (define (valid-charset? charset)
    (catch #t
      (lambda ()
        (bytevector->string (make-bytevector 1 0) charset)
        #t)
      (const #f)))

  (match args
    (`(content (type ,type)
               (subtype ,subtype)
               . ,parameters)
     (let ((type (string->lcase-symbol type))
           (subtype (string->lcase-symbol subtype))
           (parameters
            (map (match-lambda
                   (`(parameter (attribute ,attribute)
                                (value ,value))
                    (cons (string->lcase-symbol attribute) value)))
                 (flatten-and-filter '(parameter) parameters))))
       `(content-type
         ,(acons*
           'type type
           'subtype subtype
           (cond
            ((and (eq? type 'text)
                  (or
                   ;; RFC6657 specifies UTF-8 as the default charset for
                   ;; text/* media types.
                   (not (assoc-ref parameters 'charset))
                   ;; RFC2045 recommends that the default be assumed
                   ;; when a syntactically invalid Content-Type header
                   ;; is encountered. In this implementation, we are
                   ;; only checking for the validity of the
                   ;; charset. Should we check for the validity of the
                   ;; entire Content-Type header? If so, how?
                   (not (valid-charset? (assoc-ref parameters 'charset)))))
             (alist-combine parameters (acons* 'charset "utf-8")))
            (else parameters))))))))

(define post-process-content-disposition
  (match-lambda*
    (`(disposition ,type . ,parameters)
     `(content-disposition
       ,(acons 'type (string->lcase-symbol type)
               (map (match-lambda
                      (('filename-parm ('value filename))
                       (cons 'filename (basename filename)))
                      (((? (lambda (date-parm)
                             (member date-parm '(creation-date-parm modification-date-parm read-date-parm)))
                           date-parm) value)
                       ;; TODO: Convert to SRFI-19 datetime
                       (cons date-parm value))
                      (('size-parm value)
                       (cons 'size (string->number value)))
                      (`(parameter (attribute ,attribute)
                                   (value ,value))
                       (cons (string->lcase-symbol attribute) value)))
                    (flatten-and-filter
                     '(filename-parm creation-date-parm modification-date-parm
                                     read-date-parm size-parm parameter)
                     parameters)))))))

(define post-process-optional-field
  (match-lambda*
    (`(optional-field
       (field-name ,field-name)
       ,field-value)
     (list (string->lcase-symbol field-name)
           field-value))
    (`(optional-field (field-name ,field-name))
     (list field-name ""))))

(define* (macro-process-address-list _ . addresses)
  (flatten-and-filter '(address mailbox) addresses))

(define (post-process-fields fields)
  (map (match-lambda
         ((field value)
          (cons field value))
         ((field . values)
          (cons field values))
         ;; If the Subject header is blank, treat it as having the
         ;; null string as value.
         ('subject '(subject . ""))
         (_ #f))
       fields))

(define (parse-email-address address)
  "Parse ADDRESS as an email address and return an association list
with keys being the symbols name and address, and values being the
display-name and addr-spec respectively. display-name and addr-spec
are as defined in RFC5322.

For example,
(parse-email-address \"Foo <foo@example.org>\")
=> ((name . \"Foo\") (address . \"foo@example.org\"))
(parse-email-address \"foo@example.org\")
=> ((address . \"foo@example.org\"))"
  (pre-post-order
   (peg:tree (match-pattern mailbox address))
   `((mailbox . ,post-process-mailbox)
     (*text* . ,(lambda (_ text) text))
     (*default* . ,(lambda tree tree)))))

(define interpret-address
  (match-lambda
    ((('name . name)
      ('address . address))
     (format #f "~a <~a>" name address))
    ((('address . address)) address)))

(define (parse-email-body headers body)
  "Parse BODY as email body where HEADERS is an association list of
header keys and values as returned by parse-email-headers. Return a
list of <mime-entity> records if the body is a multipart
message. Else, return a single <mime-entity> record."
  (match body
    ((? string? body)
     (parse-email-body headers (string->bytevector body "utf-8")))
    ((? bytevector? body)
     (let ((content-type (assoc-ref headers 'content-type)))
       (case (assoc-ref content-type 'type)
         ((multipart)
          (map (cut parse-mime-entity headers <>)
               (body->mime-entities body (assoc-ref content-type 'boundary))))
         ((text)
          (string-trim-both
           (decode-body body (assoc-ref headers 'content-transfer-encoding)
                        (assoc-ref content-type 'charset))))
         (else (decode-body body (assoc-ref headers 'content-transfer-encoding))))))))

(define (add-default-headers headers)
  ;; Default Content-Type and Content-Transfer-Encoding headers as
  ;; specified in RFC2045
  (alist-combine (acons* 'content-type '((type . text)
                                         (subtype . plain)
                                         ;; UTF-8 is specified as the default
                                         ;; charset in RFC6657
                                         (charset . "utf-8"))
                         'content-transfer-encoding '#{7bit}#)
                 headers))

(define (handle-invalid-headers headers)
  ;; §6.4 of RFC2045 specifies that any entity with an unrecognized
  ;; Content-Transfer-Encoding must be treated as if it has a
  ;; Content-Type of "application/octet-stream", regardless of what
  ;; the Content-Type header field actually says.
  (if (memq (assq-ref headers 'content-transfer-encoding)
            (list '7bit '8bit 'binary 'quoted-printable 'base64))
      headers
      (alist-combine headers
                     '((content-type (type . application)
                                     (subtype . octet-stream))
                       (content-transfer-encoding . binary)))))

(define (add-default-mime-entity-headers parent-headers headers)
  ;; Default Content-Type and Content-Transfer-Encoding headers as
  ;; specified in RFC2045 and RFC2046
  (let ((parent-content-type (assoc-ref parent-headers 'content-type)))
    (alist-combine
     (acons* 'content-type
             `(,@(if (and (eq? (assoc-ref parent-content-type 'type) 'multipart)
                          (eq? (assoc-ref parent-content-type 'subtype) 'digest))
                     '((type . message)
                       (subtype . rfc822))
                     '((type . text)
                       (subtype . plain)))
               ;; UTF-8 is specified as the default
               ;; charset in RFC6657
               (charset . "utf-8"))
             'content-transfer-encoding '#{7bit}#)
     headers)))

(define (parse-mime-entity parent-headers bv)
  (let-values (((headers body) (email->headers+body bv)))
    (let ((headers
           (pre-post-order
            (peg:tree
             (match-pattern mime-entity-fields headers))
            `((content . ,post-process-content-type)
              (encoding . ,post-process-content-transfer-encoding)
              (disposition . ,post-process-content-disposition)
              (optional-field . ,post-process-optional-field)
              (mime-entity-fields . ,(lambda (_ . mime-entity-fields)
                                       (add-default-mime-entity-headers
                                        parent-headers
                                        (post-process-fields mime-entity-fields))))
              (*text* . ,(match-lambda*
                           ;; Handle MIME entities that have no
                           ;; headers.
                           ((_ 'mime-entity-fields)
                            (add-default-mime-entity-headers parent-headers '()))
                           ((_ text) text)))
              (*default* . ,(lambda tree tree))))))
      (make-mime-entity headers (parse-email-body headers body)))))

(define (parse-email email)
  "Parse string EMAIL and return result as an <email> record."
  (match email
    ((? string? email)
     (parse-email (string->bytevector email "utf-8")))
    ((? bytevector? email)
     (let-values (((headers body) (email->headers+body email)))
       (let ((headers-alist (parse-email-headers headers)))
         (make-email headers-alist (parse-email-body headers-alist body)))))))

(define (parse-email-headers headers)
  "Parse string HEADERS as email headers and return an association
list of header keys and values."
  (pre-post-order
   (peg:tree
    (match-pattern fields headers))
   `((received . ,(match-lambda*
                    (('received tokens (? date? timestamp))
                     (list 'received (string-join tokens) timestamp))
                    (('received tokens ...)
                     (list 'received (string-join tokens)))))
     (received-token . ,(match-lambda*
                          (`(received-token ,token) token)))
     (date-time . ,(lambda node
                     (match-let*
                         (;; Seconds are optional; provide a default
                          ;; binding which will be shadowed by match.
                          (seconds "0")
                          ((('day day) ('month month) ('year year)
                            ('hours hours) ('minutes minutes)
                            . (or (('seconds seconds) ('zone . zone))
                                  (('zone . zone))))
                           (flatten-and-filter
                            '(day month year hours minutes seconds zone)
                            node)))
                       (make-date 0
                                  (string->number seconds)
                                  (string->number minutes)
                                  (string->number hours)
                                  (string->number day)
                                  (1+ (list-index
                                       (cut equal? <> month)
                                       (list "Jan" "Feb" "Mar" "Apr" "May" "Jun"
                                             "Jul" "Aug" "Sep" "Oct" "Nov" "Dec")))
                                  (let ((year (string->number year)))
                                    (cond
                                     ((and (>= year 0) (<= year 49))
                                      (+ year 2000))
                                     ((and (>= year 50) (<= year 999))
                                      (+ year 1900))
                                     (else year)))
                                  (match zone
                                    (((or "UT" "GMT")) 0)
                                    (("EDT") (* -4 3600))
                                    (((or "EST" "CDT")) (* -5 3600))
                                    (((or "CST" "MDT")) (* -6 3600))
                                    (((or "MST" "PDT")) (* -7 3600))
                                    (("PST") (* -8 3600))
                                    (`((zone-sign ,zone-sign)
                                       (zone-hours ,zone-hours)
                                       (zone-minutes ,zone-minutes))
                                     (* (case (string->symbol zone-sign)
                                          ((+) 1)
                                          ((-) -1))
                                        (+ (* 60 60 (string->number zone-hours))
                                           (* 60 (string->number zone-minutes))))))))))
     (orig-date . ,(lambda (_ date) (list 'date date)))
     (mailbox . ,post-process-mailbox)
     (address-list *macro* . ,macro-process-address-list)
     (mailbox-list *macro* . ,macro-process-address-list)
     (optional-field . ,post-process-optional-field)
     (msg-id . ,(match-lambda* (`(msg-id ,msg-id) msg-id)))
     (content . ,post-process-content-type)
     (encoding . ,post-process-content-transfer-encoding)
     (disposition . ,post-process-content-disposition)
     (subject . ,(match-lambda* (`(subject ,subject)
                                 `(subject ,(decode-mime-encoded-word subject)))))
     (keywords . ,(lambda (_ value)
                    (cons 'keywords (string-split value #\,))))
     (fields . ,(lambda (_ . fields)
                  (handle-invalid-headers
                   (add-default-headers (post-process-fields fields)))))
     (*text* . ,(lambda (_ value) value))
     (*default* . ,(lambda tree tree)))))

(define* (decode-body body encoding #:optional charset)
  (let ((decoded-octets
         (case encoding
           ((base64) (base64-decode body))
           ((quoted-printable) (quoted-printable-decode
                                (bytevector->string body "us-ascii")))
           ((#{7bit}# #{8bit}# binary) body)
           (else (error "Body decoding failed. Unknown encoding" encoding)))))
    (if charset
        (bytevector->string decoded-octets charset 'substitute)
        decoded-octets)))

(define (read-next-email-in-mbox port)
  "Read next email from PORT and return as a bytevector.  PORT is an
input port reading an mbox file."
  ;; Read and discard From_ line
  (get-line-with-delimiter port)
  ;; Read the actual email
  (let ((email (read-bytes-till
                port (string->bytevector "\nFrom " "us-ascii"))))
    (read-char port)
    email))

(define (mbox->emails port)
  "Read all emails from PORT and return as a list of bytevectors.
PORT is an input port reading an mbox file."
  (read-objects read-next-email-in-mbox port))