summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArun Isaac2021-06-15 23:38:47 +0530
committerArun Isaac2021-06-15 23:38:47 +0530
commit512e835cf2752681eda5bbb374c122c037baa6df (patch)
treea2a0295a5e0180f11eb74be2f3028067d83e3b63
parent7a78c23f6f2f1c32abb508563a700cd535b29ad1 (diff)
downloadguile-email-512e835cf2752681eda5bbb374c122c037baa6df.tar.gz
guile-email-512e835cf2752681eda5bbb374c122c037baa6df.tar.lz
guile-email-512e835cf2752681eda5bbb374c122c037baa6df.zip
build-aux: Add corpus pull script.
This is a follow-up to commit 7a78c23f6f2f1c32abb508563a700cd535b29ad1 adding a file that was missed out. * build-aux/pull-corpus.scm: New file.
-rw-r--r--build-aux/pull-corpus.scm103
1 files changed, 103 insertions, 0 deletions
diff --git a/build-aux/pull-corpus.scm b/build-aux/pull-corpus.scm
new file mode 100644
index 0000000..bf4129e
--- /dev/null
+++ b/build-aux/pull-corpus.scm
@@ -0,0 +1,103 @@
+;;; guile-email --- Guile email parser
+;;; Copyright © 2021 Arun Isaac <arunisaac@systemreboot.net>
+;;;
+;;; This file is part of guile-email.
+;;;
+;;; guile-email is free software; you can redistribute it and/or modify
+;;; it under the terms of the GNU Affero General Public License as
+;;; published by the Free Software Foundation; either version 3 of the
+;;; License, or (at your option) any later version.
+;;;
+;;; guile-email is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;; Affero General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU Affero General Public
+;;; License along with guile-email. If not, see
+;;; <http://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;; This script pulls archives of specified GNU mailing lists
+;; (currently guix-commits and guix-patches). If a partial archive
+;; already exists in the current directory, only the newer messages
+;; are downloaded.
+
+;;; Code:
+
+(use-modules (ice-9 format)
+ (rnrs io ports)
+ (srfi srfi-1)
+ (srfi srfi-9 gnu)
+ (srfi srfi-19)
+ (srfi srfi-41)
+ (srfi srfi-71)
+ (web client)
+ (web response))
+
+(define-immutable-record-type <archive>
+ (make-archive mailing-list year month size)
+ archive?
+ (mailing-list archive-mailing-list)
+ (year archive-year set-archive-year)
+ (month archive-month set-archive-month)
+ (size archive-size set-archive-size))
+
+(define (archive->uri archive)
+ (format #f "https://lists.gnu.org/archive/mbox/~a/~a-~2,'0d"
+ (archive-mailing-list archive)
+ (archive-year archive)
+ (archive-month archive)))
+
+(define (initialize-archive-size archive)
+ (let ((response body (http-head (archive->uri archive))))
+ (set-archive-size archive (assoc-ref (response-headers response) 'content-length))))
+
+(define (previous-archive archive)
+ (initialize-archive-size
+ (case (archive-month archive)
+ ((1) (set-archive-year
+ (set-archive-month archive 12)
+ (1- (archive-year archive))))
+ (else (set-archive-month archive (1- (archive-month archive)))))))
+
+(define (archive->filename archive)
+ (format #f "~a/~a-~2,'0d.mbox"
+ (archive-mailing-list archive)
+ (archive-year archive)
+ (archive-month archive)))
+
+(define file-size
+ (compose stat:size stat))
+
+(define (out-of-sync? archive)
+ (let ((filename (archive->filename archive)))
+ (if (file-exists? filename)
+ (< (file-size filename)
+ (archive-size archive))
+ #t)))
+
+(define (pull-archive archive)
+ (let ((file (archive->filename archive)))
+ (format #t "Pulling archive ~a~%" (archive->filename archive))
+ (call-with-port (open-file file "a")
+ (lambda (port)
+ (let ((response body (http-get (archive->uri archive)
+ #:decode-body? #f
+ #:headers `((range . (bytes (,(file-size file) . #f)))))))
+ (put-bytevector port body))))))
+
+(define (pull-mailing-list mailing-list)
+ "Pull archives of MAILING-LIST to a directory of the same name."
+ (define archives
+ (let ((today (current-date)))
+ (stream-unfold
+ identity archive-size previous-archive
+ (initialize-archive-size
+ (make-archive mailing-list (date-year today) (date-month today) #f)))))
+
+ (stream-for-each pull-archive (stream-take-while out-of-sync? archives)))
+
+(pull-mailing-list "guix-commits")
+(pull-mailing-list "guix-patches")