aboutsummaryrefslogtreecommitdiff
path: root/build-aux/pull-corpus.scm
blob: bf4129e78190ec4a605f5c8d94e73cb2bfd6f96c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
;;; guile-email --- Guile email parser
;;; Copyright © 2021 Arun Isaac <arunisaac@systemreboot.net>
;;;
;;; This file is part of guile-email.
;;;
;;; guile-email is free software; you can redistribute it and/or modify
;;; it under the terms of the GNU Affero General Public License as
;;; published by the Free Software Foundation; either version 3 of the
;;; License, or (at your option) any later version.
;;;
;;; guile-email is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; Affero General Public License for more details.
;;;
;;; You should have received a copy of the GNU Affero General Public
;;; License along with guile-email.  If not, see
;;; <http://www.gnu.org/licenses/>.

;;; Commentary:

;; This script pulls archives of specified GNU mailing lists
;; (currently guix-commits and guix-patches). If a partial archive
;; already exists in the current directory, only the newer messages
;; are downloaded.

;;; Code:

(use-modules (ice-9 format)
             (rnrs io ports)
             (srfi srfi-1)
             (srfi srfi-9 gnu)
             (srfi srfi-19)
             (srfi srfi-41)
             (srfi srfi-71)
             (web client)
             (web response))

(define-immutable-record-type <archive>
  (make-archive mailing-list year month size)
  archive?
  (mailing-list archive-mailing-list)
  (year archive-year set-archive-year)
  (month archive-month set-archive-month)
  (size archive-size set-archive-size))

(define (archive->uri archive)
  (format #f "https://lists.gnu.org/archive/mbox/~a/~a-~2,'0d"
          (archive-mailing-list archive)
          (archive-year archive)
          (archive-month archive)))

(define (initialize-archive-size archive)
  (let ((response body (http-head (archive->uri archive))))
    (set-archive-size archive (assoc-ref (response-headers response) 'content-length))))

(define (previous-archive archive)
  (initialize-archive-size
   (case (archive-month archive)
     ((1) (set-archive-year
           (set-archive-month archive 12)
           (1- (archive-year archive))))
     (else (set-archive-month archive (1- (archive-month archive)))))))

(define (archive->filename archive)
  (format #f "~a/~a-~2,'0d.mbox"
          (archive-mailing-list archive)
          (archive-year archive)
          (archive-month archive)))

(define file-size
  (compose stat:size stat))

(define (out-of-sync? archive)
  (let ((filename (archive->filename archive)))
    (if (file-exists? filename)
        (< (file-size filename)
           (archive-size archive))
        #t)))

(define (pull-archive archive)
  (let ((file (archive->filename archive)))
    (format #t "Pulling archive ~a~%" (archive->filename archive))
    (call-with-port (open-file file "a")
      (lambda (port)
        (let ((response body (http-get (archive->uri archive)
                                       #:decode-body? #f
                                       #:headers `((range . (bytes (,(file-size file) . #f)))))))
          (put-bytevector port body))))))

(define (pull-mailing-list mailing-list)
  "Pull archives of MAILING-LIST to a directory of the same name."
  (define archives
    (let ((today (current-date)))
      (stream-unfold
       identity archive-size previous-archive
       (initialize-archive-size
        (make-archive mailing-list (date-year today) (date-month today) #f)))))

  (stream-for-each pull-archive (stream-take-while out-of-sync? archives)))

(pull-mailing-list "guix-commits")
(pull-mailing-list "guix-patches")