;;; guile-email --- Guile email parser ;;; Copyright © 2018, 2019, 2020, 2021 Arun Isaac ;;; ;;; This file is part of guile-email. ;;; ;;; guile-email is free software; you can redistribute it and/or modify ;;; it under the terms of the GNU Affero General Public License as ;;; published by the Free Software Foundation; either version 3 of the ;;; License, or (at your option) any later version. ;;; ;;; guile-email is distributed in the hope that it will be useful, but ;;; WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;; Affero General Public License for more details. ;;; ;;; You should have received a copy of the GNU Affero General Public ;;; License along with guile-email. If not, see ;;; . (define-module (email email) #:use-module (ice-9 binary-ports) #:use-module (ice-9 iconv) #:use-module (ice-9 match) #:use-module (ice-9 peg) #:use-module (ice-9 regex) #:use-module (rnrs bytevectors) #:use-module ((rnrs io ports) #:select (call-with-port)) #:use-module (rnrs io simple) #:use-module (srfi srfi-1) #:use-module (srfi srfi-2) #:use-module (srfi srfi-9) #:use-module (srfi srfi-11) #:use-module (srfi srfi-19) #:use-module (srfi srfi-26) #:use-module (sxml transform) #:use-module (email base64) #:use-module (email quoted-printable) #:use-module (email utils) #:export ( make-email email? email-headers email-body make-mime-entity mime-entity? mime-entity-headers mime-entity-body email->headers+body parse-email parse-email-headers parse-email-body parse-email-address interpret-address mbox->emails)) (define (flatten-and-filter terms tree) (filter list? (keyword-flatten terms tree))) (define-record-type (make-email headers body) email? (headers email-headers) (body email-body)) (define-record-type (make-mime-entity headers body) mime-entity? (headers mime-entity-headers) (body mime-entity-body)) (define string->lcase-symbol (compose string->symbol string-downcase)) ;;; PEG parser implementing the ABNF grammar specified in RFC5322 ;;; (Internet Message Format), RFC6854 (Update to Internet Message ;;; Format to Allow Group Syntax in the "From:" and "Sender:" Header ;;; Fields) ;;; Obsolete syntax has not been implemented. ;;; Core ABNF rules from RFC5234 (define-peg-pattern alpha body (or (range #\A #\Z) (range #\a #\z))) ;; Though line endings should be crlf (\r\n), we also tolerate bare ;; line feeds (\n) (define-peg-pattern crlf none (or "\r\n" "\n")) (define-peg-pattern digit body (range #\0 #\9)) (define-peg-pattern dquote body "\"") ;; Printable ASCII characters and UTF-8 characters > \x7f (RFC6532) (define-peg-pattern vchar body (and (not-followed-by (or (range #\Nul #\Space) "\x7f")) peg-any)) (define-peg-pattern wsp body (or " " "\t")) (define-peg-pattern lwsp body (* (or wsp (and crlf wsp)))) (define-syntax-rule (define-printable-ascii-character-pattern name . exceptions) (define-peg-pattern name body (and (not-followed-by (or . exceptions)) vchar))) (define-syntax-rule (define-printable-ascii-character-pattern-with-obsolete name obsolete-pattern . exceptions) (define-peg-pattern name body (or (and (not-followed-by (or . exceptions)) vchar) obsolete-pattern))) ;;; Quoted characters (define-peg-pattern obs-qp body (and "\\" (or "\x00" obs-no-ws-ctl "\n" "\r"))) (define-peg-pattern quoted-pair body (or (and (ignore "\\") (or vchar wsp)) obs-qp)) ;;; Folding white space and comments (define-peg-pattern obs-fws body (and (+ wsp) (* (and crlf (+ wsp))))) (define-peg-pattern fws body (or (and (? (and (* wsp) crlf)) (+ wsp)) obs-fws)) (define-peg-pattern obs-no-ws-ctl body (and (not-followed-by (or "\t" "\n" "\r")) (or (range #\soh #\us) "\x7f"))) (define-peg-pattern obs-ctext body obs-no-ws-ctl) (define-printable-ascii-character-pattern-with-obsolete ctext obs-ctext "(" ")" "\\") (define-syntax-rule (define-comment-pattern name capture-type) (define-peg-pattern name capture-type (and (ignore "(") (* (and (? fws) ccontent)) (ignore ")")))) (define-comment-pattern comment none) (define-peg-pattern ccontent body (or ctext quoted-pair comment)) (define-syntax-rule (define-cfws-pattern name comment) (define-peg-pattern name body (or (and (+ (and (? fws) comment)) (? fws)) fws))) (define-cfws-pattern cfws comment) ;;; Atom (define-printable-ascii-character-pattern atext "\"" "(" ")" "," "." ":" ";" "<" ">" "@" "[" "\\" "]") (define-syntax define-atom-pattern (syntax-rules () ((define-atom-pattern name) (define-peg-pattern name body (and (ignore (? cfws)) (+ atext) (ignore (? cfws))))) ((define-atom-pattern name cfws) (define-peg-pattern name body (and (? cfws) (+ atext) (? cfws)))))) (define-atom-pattern atom) (define-peg-pattern dot-atom-text body (and (+ atext) (* (and "." (+ atext))))) (define-syntax-rule (define-dot-atom-pattern name cfws) (define-peg-pattern name body (and (ignore (? cfws)) dot-atom-text (ignore (? cfws))))) (define-dot-atom-pattern dot-atom cfws) ;;; Quoted strings (define-peg-pattern obs-qtext body obs-no-ws-ctl) (define-printable-ascii-character-pattern-with-obsolete qtext obs-qtext "\\" "\"") (define-peg-pattern qcontent body (or qtext quoted-pair)) ;; TODO: Remove workaround guile peg bug for ignore (define-peg-pattern quoted-string body (and (? cfws) (ignore (and dquote)) (* (and (? fws) qcontent)) (? fws) (ignore (and dquote)) (? cfws))) ;;; Miscellaneous tokens ;; According to RFC5322§3.2.3, the leading and trailing cfws in atom ;; are semantically not part of it, and should be ignored. But, to ;; support obs-phrase, we need to capture it. Hence, we define a set ;; of cfws-captured-* patterns needed by obs-phrase. (define-syntax-rule (define-word-pattern name atom) (define-peg-pattern name body (or atom quoted-string))) (define-word-pattern word atom) (define-atom-pattern cfws-captured-atom cfws) (define-word-pattern cfws-captured-word cfws-captured-atom) (define-peg-pattern obs-phrase body (and cfws-captured-word (* (or cfws-captured-word "." cfws)))) ;; We set phrase to be the same as obs-phrase since, according to ;; their definitions in RFC5322, all phrases are obs-phrases. (define-peg-pattern phrase body obs-phrase) (define-peg-pattern obs-phrase-list body (and (? (or phrase cfws)) (* (and "," (or phrase cfws))))) (define-peg-pattern obs-utext body (or "\x00" obs-no-ws-ctl vchar)) (define-peg-pattern obs-unstruct body (* (or (and (* "\n") (* "\r") (* (and obs-utext (* "\n") (* "\r")))) fws))) ;; ABNF modified to ignore leading whitespace ;; ABNF modified to allow for blank lines in folded field (define-peg-pattern unstructured body (or (and (ignore (? fws)) (* (and (? fws) (? vchar))) (ignore (* wsp))) obs-unstruct)) ;;; Date and time specification (define-peg-pattern day-name body (or "Mon" "Tue" "Wed" "Thu" "Fri" "Sat" "Sun")) (define-peg-pattern obs-day-of-week body (and (ignore (? cfws)) day-name (ignore (? cfws)))) ;; We set day-of-week to be the same as obs-day-of-week since, ;; according to their definitions in RFC5322, all days-of-week are ;; obs-days-of-week. (define-peg-pattern day-of-week all obs-day-of-week) (define-peg-pattern obs-day body (and (ignore (? cfws)) digit (? digit) (ignore (? cfws)))) ;; We set day to be the same as obs-day since, according to their ;; definitions in RFC5322, all days are obs-days. (define-peg-pattern day all obs-day) (define-peg-pattern month all (or "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec")) (define-peg-pattern obs-year body (and (ignore (? cfws)) (and digit digit) (* digit) (ignore (? cfws)))) ;; We set year to be the same as obs-year since, according to their ;; definitions in RFC5322, all years are obs-years. (define-peg-pattern year all obs-year) (define-peg-pattern date all (and day month year)) (define-peg-pattern obs-hour body (and (ignore (? cfws)) digit digit (ignore (? cfws)))) ;; We set hours to be the same as obs-hour since, according to their ;; definitions in RFC5322, all hours are obs-hours. (define-peg-pattern hours all obs-hour) (define-peg-pattern obs-minute body (and (ignore (? cfws)) digit digit (ignore (? cfws)))) ;; We set minutes to be the same as obs-minute since, according to ;; their definitions in RFC5322, all minutes are obs-minutes. (define-peg-pattern minutes all obs-minute) (define-peg-pattern obs-second body (and (ignore (? cfws)) digit digit (ignore (? cfws)))) ;; We set seconds to be the same as obs-second since, according to ;; their definitions in RFC5322, all seconds are obs-seconds. (define-peg-pattern seconds all obs-second) (define-peg-pattern time-of-day all (and hours (ignore ":") minutes (? (and (ignore ":") seconds)))) ;; zone-sign, zone-hours and zone-minutes do not exist in the ABNF ;; specified in RFC5322. But, we have it here since it eases parsing ;; the zone and converting it to a number of seconds as required by ;; the make-date procedure of SRFI-19. (define-peg-pattern zone-sign all (or "+" "-")) (define-peg-pattern zone-hours all (and digit digit)) (define-peg-pattern zone-minutes all (and digit digit)) (define-peg-pattern obs-zone body (or "UT" "GMT" "EST" "EDT" "CST" "CDT" "MST" "MDT" "PST" "PDT" (and (not-followed-by (or "j" "J")) (or (range #\a #\z) (range #\A #\Z))))) (define-peg-pattern zone all (or (and zone-sign zone-hours zone-minutes) obs-zone)) (define-peg-pattern time all (and time-of-day zone)) (define-peg-pattern date-time all (and (? (and day-of-week (ignore ","))) date time (? cfws))) ;;; Address specification (define-peg-pattern display-name all phrase) (define-peg-pattern obs-local-part body (and word (* (and "." word)))) (define-peg-pattern local-part body (or obs-local-part dot-atom quoted-string)) (define-peg-pattern obs-dtext body (or obs-no-ws-ctl quoted-pair)) (define-printable-ascii-character-pattern-with-obsolete dtext obs-dtext "[" "]" "\\") (define-peg-pattern domain-literal body (and (? cfws) "[" (* (and (? fws) dtext)) (? fws) "]" (? cfws))) (define-syntax-rule (define-obs-domain-pattern name atom) (define-peg-pattern name body (and atom (* (and "." atom))))) (define-obs-domain-pattern obs-domain atom) (define-syntax-rule (define-domain-pattern name obs-domain dot-atom) (define-peg-pattern name body (or obs-domain dot-atom domain-literal))) (define-domain-pattern domain obs-domain dot-atom) (define-syntax-rule (define-addr-spec-pattern name domain) (define-peg-pattern name body (and local-part "@" domain))) (define-addr-spec-pattern addr-spec domain) (define-peg-pattern obs-domain-list body (and (* (or cfws ",")) "@" domain (* (and "," (? cfws) (? (and "@" domain)))))) (define-peg-pattern obs-route none (and obs-domain-list ":")) (define-peg-pattern obs-angle-addr body (and (ignore (and (? cfws) "<")) obs-route addr-spec (ignore (and ">" (? cfws))))) (define-syntax-rule (define-angle-addr-pattern name opening-bracket closing-bracket) (define-peg-pattern name body (or (and (ignore (? cfws)) opening-bracket addr-spec closing-bracket (ignore (? cfws))) obs-angle-addr))) (define-angle-addr-pattern angle-addr "<" ">") ;; When handling email addresses, we need to drop the angle ;; brackets. But in the Received field we need to include the angle ;; brackes. So, we define unbracketed-angle-addr, a variant of ;; angle-addr in which the angle brackets are ignored. (define-angle-addr-pattern unbracketed-angle-addr (ignore "<") (ignore ">")) (define-peg-pattern name-addr body (and (? display-name) unbracketed-angle-addr)) ;; In order to support emacs message mode parens style email ;; addresses, we need to capture comments in address fields and ;; process them as names. But, we need to ignore comments in other ;; parts of the email. In order to facilitate this, we define a set of ;; captured-* patterns which are used in address fields. (define-atom-pattern captured-atom captured-cfws) (define-comment-pattern captured-comment all) (define-cfws-pattern captured-cfws captured-comment) (define-dot-atom-pattern captured-dot-atom captured-cfws) (define-obs-domain-pattern captured-obs-domain captured-atom) (define-domain-pattern captured-domain captured-obs-domain captured-dot-atom) (define-addr-spec-pattern captured-addr-spec captured-domain) (define-peg-pattern mailbox all (or name-addr captured-addr-spec)) (define-peg-pattern obs-mbox-list body (and (* (and (? cfws) ",")) mailbox (* (and "," (? (or mailbox cfws)))))) ;; We set mailbox-list to be the same as obs-mbox-list since, ;; according to their definitions in RFC5322, all mailbox-lists are ;; obs-mbox-lists. (define-peg-pattern mailbox-list all obs-mbox-list) (define-peg-pattern group all (and display-name (ignore ":") (? group-list) (ignore ";") (? cfws))) (define-peg-pattern obs-group-list body (and (+ (and (? cfws) ",")) (? cfws))) (define-peg-pattern group-list all (or mailbox-list cfws obs-group-list)) (define-peg-pattern address body (or mailbox group)) (define-peg-pattern obs-addr-list body (and (* (and (? cfws) ",")) address (* (and "," (? (or address cfws)))))) ;; We set address-list to be the same as obs-addr-list since, ;; according to their definitions in RFC5322, all address-lists are ;; obs-address-lists. (define-peg-pattern address-list all obs-addr-list) ;;; Fields ;; We have compressed the current field pattern and the obsolete field ;; pattern into a single PEG pattern. (define-syntax define-field-pattern (syntax-rules () ((define-field-pattern name header pattern) (define-peg-pattern name all (and (ignore (string-ci header)) (ignore (and (* wsp) ":")) pattern crlf))) ((define-field-pattern name header pattern obsolete-pattern) (define-peg-pattern name all (and (ignore (string-ci header)) (ignore (and (* wsp) ":")) (or pattern obsolete-pattern) crlf))))) ;;; Origination date field (define-field-pattern orig-date "Date" date-time) ;;; Originator fields (define-field-pattern from "From" mailbox-list (or mailbox-list address-list)) (define-field-pattern sender "Sender" mailbox (or mailbox address)) (define-field-pattern reply-to "Reply-To" address-list) ;; Destination address fields (define-field-pattern to "To" address-list) (define-field-pattern cc "Cc" address-list) (define-field-pattern bcc "Bcc" (? (or address-list cfws)) (or address-list (and (* (and (? cfws) ",")) (? cfws)))) ;;; Identification fields (define-peg-pattern no-fold-literal body (and "[" (* dtext) "]")) (define-peg-pattern obs-id-left body local-part) (define-peg-pattern id-left body (or obs-id-left dot-atom-text)) (define-peg-pattern obs-id-right body domain) (define-peg-pattern id-right body (or obs-id-right dot-atom-text no-fold-literal)) (define-peg-pattern msg-id all (and (ignore (? cfws)) (ignore "<") id-left "@" id-right (ignore ">") (ignore (? cfws)))) (define-field-pattern message-id "Message-ID" msg-id) (define-field-pattern in-reply-to "In-Reply-To" (+ msg-id) (* (or phrase msg-id))) (define-field-pattern references "References" (+ msg-id) (* (or phrase msg-id))) ;;; Informational fields (define-field-pattern subject "Subject" unstructured) (define-field-pattern comments "Comments" unstructured) (define-field-pattern keywords "Keywords" (and phrase (* (and "," phrase))) obs-phrase-list) ;;; Resent fields (define-field-pattern resent-date "Resent-Date" date-time) (define-field-pattern resent-from "Resent-From" (or mailbox-list address-list) mailbox-list) (define-field-pattern resent-sender "Resent-Sender" (or mailbox address) mailbox) (define-field-pattern resent-to "Resent-To" address-list) (define-field-pattern resent-cc "Resent-Cc" address-list) (define-field-pattern resent-bcc "Resent-Bcc" (? (or address-list cfws)) (or address-list (and (* (and (? cfws) ",")) (? cfws)))) (define-field-pattern resent-msg-id "Resent-Message-ID" msg-id) (define-field-pattern obs-resent-rply "Resent-Reply-To" address-list) ;;; Trace fields ;; word is given last priority in the ordered choice (define-peg-pattern received-token all (or angle-addr addr-spec domain word)) (define-field-pattern received "Received" (and (* received-token) (ignore ";") date-time) (* received-token)) (define-peg-pattern path body (or angle-addr (and (? cfws) (ignore "<") (? cfws) (ignore ">") (? cfws)))) (define-field-pattern return "Return-Path" path) (define-peg-pattern trace all (and (? return) (+ received))) ;;; Optional fields (define-printable-ascii-character-pattern ftext ":") (define-peg-pattern field-name all (+ ftext)) (define-peg-pattern obs-optional body (and field-name (ignore (and (* wsp) ":")) unstructured crlf)) (define-peg-pattern optional-field all (or (and field-name (ignore ":") unstructured crlf) obs-optional)) ;;; MIME version ;; I have prepended optional cfws to account for leading whitespace. (define-field-pattern mime-version "MIME-Version" (and (ignore (? cfws)) (+ digit) "." (+ digit))) ;;; Content type ;; TODO: What is <"> in RFC2045? (define-peg-pattern tspecials body (or "(" ")" "<" ">" "@" "," ";" ":" "\\" "/" "[" "]" "?" "=")) ;; TODO: What is a CTL in RFC2045? (define-peg-pattern token body (+ (and (not-followed-by (or " " crlf tspecials)) peg-any))) ;; TODO: Implement iana-token, ietf-token and x-token (define-peg-pattern iana-token body token) (define-peg-pattern ietf-token body token) (define-peg-pattern x-token body token) (define-peg-pattern extension-token body (or ietf-token x-token)) (define-peg-pattern discrete-type body (or "text" "image" "audio" "video" "application" extension-token)) (define-peg-pattern composite-type body (or "message" "multipart" extension-token)) (define-peg-pattern type all (or discrete-type composite-type)) (define-peg-pattern subtype all (or extension-token iana-token)) (define-peg-pattern attribute all token) ;; quoted-string is given higher precedence (define-peg-pattern value all (or quoted-string token)) (define-peg-pattern parameter all (and attribute (ignore "=") value)) ;; I have prepended optional cfws to account for leading whitespace. (define-field-pattern content "Content-Type" (and (ignore (? cfws)) type (ignore "/") subtype (* (and (ignore ";") (ignore (? cfws)) parameter)))) ;;; Content Disposition as defined in RFC2183 (define-peg-pattern disposition-type body (or "inline" "attachment" extension-token)) (define-peg-pattern quoted-date-time all quoted-string) (define-syntax-rule (define-parameter-pattern name header pattern) (define-peg-pattern name all (and (ignore header) (ignore "=") pattern))) (define-parameter-pattern filename-parm "filename" value) (define-parameter-pattern creation-date-parm "creation-date" quoted-date-time) (define-parameter-pattern modification-date-parm "modification-date" quoted-date-time) (define-parameter-pattern read-date-parm "read-date" quoted-date-time) (define-parameter-pattern size-parm "size" (+ digit)) (define-peg-pattern disposition-parm body (or filename-parm creation-date-parm modification-date-parm read-date-parm size-parm parameter)) ;; I have prepended optional cfws to account for leading whitespace. (define-field-pattern disposition "Content-Disposition" (and (ignore (? cfws)) disposition-type (* (and (ignore ";") (ignore (? cfws)) disposition-parm)))) ;;; Content transfer encoding (define-peg-pattern mechanism body (or "7bit" "8bit" "binary" "quoted-printable" "base64" ietf-token x-token)) (define-field-pattern encoding "Content-Transfer-Encoding" (and (ignore (? cfws)) mechanism)) ;;; Fields ;; The ABNF specified for fields in RFC5322 does not make sense. With ;; it, all headers are eaten up by optional-field. So, as a temporary ;; workaround, I am going with the following much simpler (but ;; possibly incorrect) ABNF. ;; TODO: Try to understand and implement the actual ABNF specified by ;; RFC5322. (define-peg-pattern fields all (* (or trace resent-date resent-from resent-sender resent-to resent-cc resent-bcc resent-msg-id obs-resent-rply orig-date from sender reply-to to cc bcc message-id in-reply-to references subject comments keywords mime-version content disposition encoding optional-field))) (define-peg-pattern mime-extension-field-name all (and "Content-" (+ ftext))) (define-peg-pattern mime-extension-field all (and mime-extension-field-name (ignore ":") unstructured crlf)) (define-peg-pattern mime-entity-fields all (* (or content disposition encoding mime-extension-field optional-field))) (define (decode-mime-encoded-word word) (regexp-substitute/global #f "=\\?([^?]*)\\?([^?]*)\\?([^?]*)\\?=" word 'pre (lambda (match-record) (let ((charset (match:substring match-record 1)) (encoding (string->lcase-symbol (match:substring match-record 2))) (encoded-text (match:substring match-record 3))) (bytevector->string ((case encoding ((b) base64-decode) ((q) q-encoding-decode) (else (error "Encoding of MIME word unknown" word))) encoded-text) charset 'substitute))) 'post)) (define (body->mime-entities body boundary) "Split bytevector BODY into a list of mime entities separated by BOUNDARY (as explained in RFC2045), and return that list." (define (read-till-boundary port) (read-bytes-till port ;; TODO: Look for the boundary sequence only at the beginning of ;; a line. ;; Boundary is always an ASCII string. (string->bytevector (string-append "--" boundary) "us-ascii"))) (define (read-mime-entity port) (let ((line (get-line-with-delimiter port))) ;; The eof-object? check returns #t only when the message is ;; prematurely truncated. It is invoked only to handle truncated ;; messages gracefully without raising an error. (if (or (eof-object? line) (string-prefix? (string-append "--" boundary "--") line)) (eof-object) (read-till-boundary port)))) (call-with-port (open-bytevector-input-port body) (lambda (port) (read-till-boundary port) (read-objects read-mime-entity port)))) (define (email->headers+body email) "Split EMAIL bytevector into headers and body. Return as multiple values. The returned headers is a string and body is a bytevector." (call-with-port (open-bytevector-input-port email) (lambda (port) ;; Email headers must strictly be ASCII characters. But for the ;; sake of supporting Emacs message mode parens style addresses ;; that may use non-ASCII characters, typically for the full ;; name, we relax this requirement. We assume an encoding of ;; UTF-8, and hope that everything turns out fine. Since UTF-8 ;; is a superset of ASCII, this should not affect standards ;; conforming headers. If encoding is neither UTF-8 nor ASCII, ;; we use the substitute conversion strategy and proceed without ;; raising an error. (set-port-encoding! port "utf-8") (set-port-conversion-strategy! port 'substitute) (let ((headers (read-while port get-line-with-delimiter (lambda (line) (not (or (string= line "\n") (string= line "\r\n"))))))) (get-line-with-delimiter port) (values (if (eof-object? headers) "" headers) (get-bytevector-all port)))))) (define (post-process-mailbox . args) (define process-name (compose decode-mime-encoded-word string-trim-both)) (match args ;; name-addr email address (`(mailbox (display-name ,name) ,address) `((name . ,(process-name name)) (address . ,(string-trim-both address)))) ;; addr-spec email address (`(mailbox ,(? string? address)) `((address . ,(string-trim-both address)))) ;; emacs message mode parens style email address (`(mailbox ,(? list? address-parts)) `((name . ,(match-let ((`((captured-comment ,name)) (flatten-and-filter '(captured-comment) address-parts))) (process-name name))) (address . ,(string-trim-both (string-join (drop-right address-parts 1) ""))))) (_ (error "Failed to parse mailbox")))) (define (post-process-content-transfer-encoding _ value) (list 'content-transfer-encoding (string->lcase-symbol value))) (define (post-process-content-type . args) (define (valid-charset? charset) (catch #t (lambda () (bytevector->string (make-bytevector 0 0) charset) #t) (const #f))) (match args (`(content (type ,type) (subtype ,subtype) . ,parameters) (let ((type (string->lcase-symbol type)) (subtype (string->lcase-symbol subtype)) (parameters (map (match-lambda (`(parameter (attribute ,attribute) (value ,value)) (cons (string->lcase-symbol attribute) value))) (flatten-and-filter '(parameter) parameters)))) `(content-type ,(acons* 'type type 'subtype subtype (cond ((and (eq? type 'text) (or ;; RFC6657 specifies UTF-8 as the default charset for ;; text/* media types. (not (assoc-ref parameters 'charset)) ;; RFC2045 recommends that the default be assumed ;; when a syntactically invalid Content-Type header ;; is encountered. In this implementation, we are ;; only checking for the validity of the ;; charset. Should we check for the validity of the ;; entire Content-Type header? If so, how? (not (valid-charset? (assoc-ref parameters 'charset))))) (alist-combine parameters (acons* 'charset "utf-8"))) (else parameters)))))))) (define post-process-content-disposition (match-lambda* (`(disposition ,type . ,parameters) `(content-disposition ,(acons 'type (string->lcase-symbol type) (map (match-lambda (('filename-parm ('value filename)) (cons 'filename (basename filename))) (((? (lambda (date-parm) (member date-parm '(creation-date-parm modification-date-parm read-date-parm))) date-parm) value) ;; TODO: Convert to SRFI-19 datetime (cons date-parm value)) (('size-parm value) (cons 'size (string->number value))) (`(parameter (attribute ,attribute) (value ,value)) (cons (string->lcase-symbol attribute) value))) (flatten-and-filter '(filename-parm creation-date-parm modification-date-parm read-date-parm size-parm parameter) parameters))))))) (define post-process-optional-field (match-lambda* (`(optional-field (field-name ,field-name) ,field-value) (list (string->lcase-symbol field-name) field-value)) (`(optional-field (field-name ,field-name)) (list field-name "")))) (define* (macro-process-address-list _ . addresses) (flatten-and-filter '(address mailbox) addresses)) (define (post-process-fields fields) (map (match-lambda ((field value) (cons field value)) ((field . values) (cons field values)) ;; If the Subject header is blank, treat it as having the ;; null string as value. ('subject '(subject . "")) (_ #f)) fields)) (define (parse-email-address address) "Parse ADDRESS as an email address and return an association list with keys being the symbols name and address, and values being the display-name and addr-spec respectively. display-name and addr-spec are as defined in RFC5322. For example, (parse-email-address \"Foo \") => ((name . \"Foo\") (address . \"foo@example.org\")) (parse-email-address \"foo@example.org\") => ((address . \"foo@example.org\"))" (pre-post-order (peg:tree (match-pattern mailbox address)) `((mailbox . ,post-process-mailbox) (*text* . ,(lambda (_ text) text)) (*default* . ,(lambda tree tree))))) (define interpret-address (match-lambda ((('name . name) ('address . address)) (format #f "~a <~a>" name address)) ((('address . address)) address))) (define (parse-email-body headers body) "Parse BODY as email body where HEADERS is an association list of header keys and values as returned by parse-email-headers. Return a list of records if the body is a multipart message. Else, return a single record." (match body ((? string? body) (parse-email-body headers (string->bytevector body "utf-8"))) ((? bytevector? body) (let ((content-type (assoc-ref headers 'content-type))) (case (assoc-ref content-type 'type) ((multipart) (map (cut parse-mime-entity headers <>) (body->mime-entities body (assoc-ref content-type 'boundary)))) ((text) (string-trim-both (decode-body body (assoc-ref headers 'content-transfer-encoding) (assoc-ref content-type 'charset)))) (else (decode-body body (assoc-ref headers 'content-transfer-encoding)))))))) (define (add-default-headers headers) ;; Default Content-Type and Content-Transfer-Encoding headers as ;; specified in RFC2045 (alist-combine (acons* 'content-type '((type . text) (subtype . plain) ;; UTF-8 is specified as the default ;; charset in RFC6657 (charset . "utf-8")) 'content-transfer-encoding '#{7bit}#) headers)) (define (add-default-mime-entity-headers parent-headers headers) ;; Default Content-Type and Content-Transfer-Encoding headers as ;; specified in RFC2045 and RFC2046 (let ((parent-content-type (assoc-ref parent-headers 'content-type))) (alist-combine (acons* 'content-type `(,@(if (and (eq? (assoc-ref parent-content-type 'type) 'multipart) (eq? (assoc-ref parent-content-type 'subtype) 'digest)) '((type . message) (subtype . rfc822)) '((type . text) (subtype . plain))) ;; UTF-8 is specified as the default ;; charset in RFC6657 (charset . "utf-8")) 'content-transfer-encoding '#{7bit}#) headers))) (define (parse-mime-entity parent-headers bv) (let-values (((headers body) (email->headers+body bv))) (let ((headers (pre-post-order (peg:tree (match-pattern mime-entity-fields headers)) `((content . ,post-process-content-type) (encoding . ,post-process-content-transfer-encoding) (disposition . ,post-process-content-disposition) (optional-field . ,post-process-optional-field) (mime-entity-fields . ,(lambda (_ . mime-entity-fields) (add-default-mime-entity-headers parent-headers (post-process-fields mime-entity-fields)))) (*text* . ,(match-lambda* ;; Handle MIME entities that have no ;; headers. ((_ 'mime-entity-fields) (add-default-mime-entity-headers parent-headers '())) ((_ text) text))) (*default* . ,(lambda tree tree)))))) (make-mime-entity headers (parse-email-body headers body))))) (define (parse-email email) "Parse string EMAIL and return result as an record." (match email ((? string? email) (parse-email (string->bytevector email "utf-8"))) ((? bytevector? email) (let-values (((headers body) (email->headers+body email))) (let ((headers-alist (parse-email-headers headers))) (make-email headers-alist (parse-email-body headers-alist body))))))) (define (parse-email-headers headers) "Parse string HEADERS as email headers and return an association list of header keys and values." (pre-post-order (peg:tree (match-pattern fields headers)) `((received . ,(match-lambda* (`(received ,tokens ,timestamp) (list 'received (string-join tokens) timestamp)) (('received tokens ...) (list 'received (string-join tokens))))) (received-token . ,(match-lambda* (`(received-token ,token) token))) (date-time . ,(lambda node (match-let ((`((day ,day) (month ,month) (year ,year) (hours ,hours) (minutes ,minutes) (seconds ,seconds) (zone . ,zone)) (flatten-and-filter '(day month year hours minutes seconds zone) node))) (make-date 0 (string->number seconds) (string->number minutes) (string->number hours) (string->number day) (1+ (list-index (cut equal? <> month) (list "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"))) (let ((year (string->number year))) (cond ((and (>= year 0) (<= year 49)) (+ year 2000)) ((and (>= year 50) (<= year 999)) (+ year 1900)) (else year))) (match zone (((or "UT" "GMT")) 0) (("EDT") (* -4 3600)) (((or "EST" "CDT")) (* -5 3600)) (((or "CST" "MDT")) (* -6 3600)) (((or "MST" "PDT")) (* -7 3600)) (("PST") (* -8 3600)) (`((zone-sign ,zone-sign) (zone-hours ,zone-hours) (zone-minutes ,zone-minutes)) (* (case (string->symbol zone-sign) ((+) 1) ((-) -1)) (+ (* 60 60 (string->number zone-hours)) (* 60 (string->number zone-minutes)))))))))) (orig-date . ,(lambda (_ date) (list 'date date))) (mailbox . ,post-process-mailbox) (address-list *macro* . ,macro-process-address-list) (mailbox-list *macro* . ,macro-process-address-list) (optional-field . ,post-process-optional-field) (msg-id . ,(match-lambda* (`(msg-id ,msg-id) msg-id))) (content . ,post-process-content-type) (encoding . ,post-process-content-transfer-encoding) (disposition . ,post-process-content-disposition) (subject . ,(match-lambda* (`(subject ,subject) `(subject ,(decode-mime-encoded-word subject))))) (keywords . ,(lambda (_ value) (cons 'keywords (string-split value #\,)))) (fields . ,(lambda (_ . fields) (add-default-headers (post-process-fields fields)))) (*text* . ,(lambda (_ value) value)) (*default* . ,(lambda tree tree))))) (define* (decode-body body encoding #:optional charset) (let ((decoded-octets (case encoding ((base64) (base64-decode body)) ((quoted-printable) (quoted-printable-decode (bytevector->string body "us-ascii"))) ((#{7bit}# #{8bit}# binary) body) (else (error "Body decoding failed. Unknown encoding" encoding))))) (if charset (bytevector->string decoded-octets charset 'substitute) decoded-octets))) (define (read-next-email-in-mbox port) "Read next email from PORT and return as a bytevector. PORT is an input port reading an mbox file." ;; Read and discard From_ line (get-line-with-delimiter port) ;; Read the actual email (let ((email (read-bytes-till port (string->bytevector "\nFrom " "us-ascii")))) (read-char port) email)) (define (mbox->emails port) "Read all emails from PORT and return as a list of bytevectors. PORT is an input port reading an mbox file." (read-objects read-next-email-in-mbox port))