From 512e835cf2752681eda5bbb374c122c037baa6df Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Tue, 15 Jun 2021 23:38:47 +0530 Subject: build-aux: Add corpus pull script. This is a follow-up to commit 7a78c23f6f2f1c32abb508563a700cd535b29ad1 adding a file that was missed out. * build-aux/pull-corpus.scm: New file. --- build-aux/pull-corpus.scm | 103 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 build-aux/pull-corpus.scm (limited to 'build-aux') diff --git a/build-aux/pull-corpus.scm b/build-aux/pull-corpus.scm new file mode 100644 index 0000000..bf4129e --- /dev/null +++ b/build-aux/pull-corpus.scm @@ -0,0 +1,103 @@ +;;; guile-email --- Guile email parser +;;; Copyright © 2021 Arun Isaac +;;; +;;; This file is part of guile-email. +;;; +;;; guile-email is free software; you can redistribute it and/or modify +;;; it under the terms of the GNU Affero General Public License as +;;; published by the Free Software Foundation; either version 3 of the +;;; License, or (at your option) any later version. +;;; +;;; guile-email is distributed in the hope that it will be useful, but +;;; WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;;; Affero General Public License for more details. +;;; +;;; You should have received a copy of the GNU Affero General Public +;;; License along with guile-email. If not, see +;;; . + +;;; Commentary: + +;; This script pulls archives of specified GNU mailing lists +;; (currently guix-commits and guix-patches). If a partial archive +;; already exists in the current directory, only the newer messages +;; are downloaded. + +;;; Code: + +(use-modules (ice-9 format) + (rnrs io ports) + (srfi srfi-1) + (srfi srfi-9 gnu) + (srfi srfi-19) + (srfi srfi-41) + (srfi srfi-71) + (web client) + (web response)) + +(define-immutable-record-type + (make-archive mailing-list year month size) + archive? + (mailing-list archive-mailing-list) + (year archive-year set-archive-year) + (month archive-month set-archive-month) + (size archive-size set-archive-size)) + +(define (archive->uri archive) + (format #f "https://lists.gnu.org/archive/mbox/~a/~a-~2,'0d" + (archive-mailing-list archive) + (archive-year archive) + (archive-month archive))) + +(define (initialize-archive-size archive) + (let ((response body (http-head (archive->uri archive)))) + (set-archive-size archive (assoc-ref (response-headers response) 'content-length)))) + +(define (previous-archive archive) + (initialize-archive-size + (case (archive-month archive) + ((1) (set-archive-year + (set-archive-month archive 12) + (1- (archive-year archive)))) + (else (set-archive-month archive (1- (archive-month archive))))))) + +(define (archive->filename archive) + (format #f "~a/~a-~2,'0d.mbox" + (archive-mailing-list archive) + (archive-year archive) + (archive-month archive))) + +(define file-size + (compose stat:size stat)) + +(define (out-of-sync? archive) + (let ((filename (archive->filename archive))) + (if (file-exists? filename) + (< (file-size filename) + (archive-size archive)) + #t))) + +(define (pull-archive archive) + (let ((file (archive->filename archive))) + (format #t "Pulling archive ~a~%" (archive->filename archive)) + (call-with-port (open-file file "a") + (lambda (port) + (let ((response body (http-get (archive->uri archive) + #:decode-body? #f + #:headers `((range . (bytes (,(file-size file) . #f))))))) + (put-bytevector port body)))))) + +(define (pull-mailing-list mailing-list) + "Pull archives of MAILING-LIST to a directory of the same name." + (define archives + (let ((today (current-date))) + (stream-unfold + identity archive-size previous-archive + (initialize-archive-size + (make-archive mailing-list (date-year today) (date-month today) #f))))) + + (stream-for-each pull-archive (stream-take-while out-of-sync? archives))) + +(pull-mailing-list "guix-commits") +(pull-mailing-list "guix-patches") -- cgit v1.2.3