From 4d1e46f50c7c3ae6e9a62fe89d8ed3638c64f9b9 Mon Sep 17 00:00:00 2001
From: Arun Isaac
Date: Mon, 25 May 2020 05:39:16 +0530
Subject: utils: Do not match sequence byte by byte in read-bytes-till.

* email/utils.scm (bytevector-match, bytevector-overlap,
lookahead-bytevector-n): New functions.
(read-bytes-till): Do not match sequence byte by byte. Process blocks
of bytes at a time.
---
 email/utils.scm | 75 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 57 insertions(+), 18 deletions(-)

(limited to 'email')

diff --git a/email/utils.scm b/email/utils.scm
index 97c1b5a..718ad52 100644
--- a/email/utils.scm
+++ b/email/utils.scm
@@ -75,30 +75,69 @@ back into PORT."
             read-result
             str)))))
 
+(define (bytevector-match bv1 start1 bv2 start2)
+  "Return #t if bytevector BV1 starting from START1 is equal to
+bytevector BV2 starting from START2, else return #f. If the
+bytevectors are of unequal length, they are only compared up to the
+shorter of their lengths."
+  (let ((len (min (- (bytevector-length bv1) start1)
+                  (- (bytevector-length bv2) start2))))
+    (let loop ((i 0))
+      (cond
+       ((= i len) #t)
+       ((not (= (bytevector-u8-ref bv1 (+ start1 i))
+                (bytevector-u8-ref bv2 (+ start2 i))))
+        #f)
+       (else (loop (1+ i)))))))
+
+(define (bytevector-overlap bv1 bv2)
+  "Return the index of bytevector BV1 from which it is equal to
+bytevector BV2 in the sense of bytevector-match. If there is no such
+index, then return the length of BV1."
+  ;; TODO: Maybe implement the Boyer-Moore string search algorithm
+  (let loop ((offset 0))
+    (cond
+     ((bytevector-match bv1 offset bv2 0) offset)
+     (else (loop (1+ offset))))))
+
+(define (lookahead-bytevector-n port count)
+  "Look ahead COUNT bytes into port, and return the seen bytevector."
+  (let ((bv (get-bytevector-n port count)))
+    (unget-bytevector port bv)
+    bv))
+
 (define (read-bytes-till port sequence)
   "Read bytes from PORT until byte SEQUENCE is seen or end-of-file is
 reached. If SEQUENCE is seen, unget it to PORT and return."
-  (define (read-bytes-and-write-till in out sequence)
-    (let ((octet (get-u8 in)))
-      (cond
-       ((eof-object? octet) octet)
-       ;; If octet read matches first octet of sequence, try matching
-       ;; the full sequence.
-       ((= octet (bytevector-u8-ref sequence 0))
-        (unget-bytevector in sequence 0 1)
-        (let ((bv (get-bytevector-n in (bytevector-length sequence))))
-          (cond
-           ((bytevector=? bv sequence) (unget-bytevector in bv))
-           (else (unget-bytevector in bv 1)
-                 (put-u8 out octet)
-                 (read-bytes-and-write-till in out sequence)))))
-       ;; Else, output the octet and continue reading.
-       (else (put-u8 out octet)
-             (read-bytes-and-write-till in out sequence)))))
+  (define (read-loop out)
+    (not-end-let (bv (get-bytevector-some port))
+      (let ((offset (bytevector-overlap bv sequence)))
+        ;; Write bytes before match point to output
+        (put-bytevector out bv 0 offset)
+        (cond
+         ;; Matched nothing, continue searching
+         ((= offset (bytevector-length bv))
+          (read-loop out))
+         ;; Matched full sequence, unget bytes from match point and
+         ;; quit searching
+         ((> (- (bytevector-length bv) offset)
+             (bytevector-length sequence))
+          (unget-bytevector port bv offset))
+         ;; Matched partial sequence, try to match the rest
+         (else
+          (unget-bytevector port bv offset)
+          (not-end-let
+              (lookahead (lookahead-bytevector-n
+                          port (bytevector-length sequence)))
+            ;; Failed to match, take back the first matched byte,
+            ;; write it to the output and continue searching
+            (unless (bytevector=? lookahead sequence)
+              (put-u8 out (get-u8 port))
+              (read-loop out))))))))
 
   (call-with-values open-bytevector-output-port
     (lambda (out get-bytevector)
-      (let ((read-result (read-bytes-and-write-till port out sequence))
+      (let ((read-result (read-loop out))
             (bv (get-bytevector)))
         (if (and (eof-object? read-result)
                  (bytevector=? bv #vu8()))
-- 
cgit v1.2.3