From 4d1e46f50c7c3ae6e9a62fe89d8ed3638c64f9b9 Mon Sep 17 00:00:00 2001 From: Arun Isaac Date: Mon, 25 May 2020 05:39:16 +0530 Subject: utils: Do not match sequence byte by byte in read-bytes-till. * email/utils.scm (bytevector-match, bytevector-overlap, lookahead-bytevector-n): New functions. (read-bytes-till): Do not match sequence byte by byte. Process blocks of bytes at a time. --- email/utils.scm | 75 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/email/utils.scm b/email/utils.scm index 97c1b5a..718ad52 100644 --- a/email/utils.scm +++ b/email/utils.scm @@ -75,30 +75,69 @@ back into PORT." read-result str))))) +(define (bytevector-match bv1 start1 bv2 start2) + "Return #t if bytevector BV1 starting from START1 is equal to +bytevector BV2 starting from START2, else return #f. If the +bytevectors are of unequal length, they are only compared up to the +shorter of their lengths." + (let ((len (min (- (bytevector-length bv1) start1) + (- (bytevector-length bv2) start2)))) + (let loop ((i 0)) + (cond + ((= i len) #t) + ((not (= (bytevector-u8-ref bv1 (+ start1 i)) + (bytevector-u8-ref bv2 (+ start2 i)))) + #f) + (else (loop (1+ i))))))) + +(define (bytevector-overlap bv1 bv2) + "Return the index of bytevector BV1 from which it is equal to +bytevector BV2 in the sense of bytevector-match. If there is no such +index, then return the length of BV1." + ;; TODO: Maybe implement the Boyer-Moore string search algorithm + (let loop ((offset 0)) + (cond + ((bytevector-match bv1 offset bv2 0) offset) + (else (loop (1+ offset)))))) + +(define (lookahead-bytevector-n port count) + "Look ahead COUNT bytes into port, and return the seen bytevector." + (let ((bv (get-bytevector-n port count))) + (unget-bytevector port bv) + bv)) + (define (read-bytes-till port sequence) "Read bytes from PORT until byte SEQUENCE is seen or end-of-file is reached. If SEQUENCE is seen, unget it to PORT and return." - (define (read-bytes-and-write-till in out sequence) - (let ((octet (get-u8 in))) - (cond - ((eof-object? octet) octet) - ;; If octet read matches first octet of sequence, try matching - ;; the full sequence. - ((= octet (bytevector-u8-ref sequence 0)) - (unget-bytevector in sequence 0 1) - (let ((bv (get-bytevector-n in (bytevector-length sequence)))) - (cond - ((bytevector=? bv sequence) (unget-bytevector in bv)) - (else (unget-bytevector in bv 1) - (put-u8 out octet) - (read-bytes-and-write-till in out sequence))))) - ;; Else, output the octet and continue reading. - (else (put-u8 out octet) - (read-bytes-and-write-till in out sequence))))) + (define (read-loop out) + (not-end-let (bv (get-bytevector-some port)) + (let ((offset (bytevector-overlap bv sequence))) + ;; Write bytes before match point to output + (put-bytevector out bv 0 offset) + (cond + ;; Matched nothing, continue searching + ((= offset (bytevector-length bv)) + (read-loop out)) + ;; Matched full sequence, unget bytes from match point and + ;; quit searching + ((> (- (bytevector-length bv) offset) + (bytevector-length sequence)) + (unget-bytevector port bv offset)) + ;; Matched partial sequence, try to match the rest + (else + (unget-bytevector port bv offset) + (not-end-let + (lookahead (lookahead-bytevector-n + port (bytevector-length sequence))) + ;; Failed to match, take back the first matched byte, + ;; write it to the output and continue searching + (unless (bytevector=? lookahead sequence) + (put-u8 out (get-u8 port)) + (read-loop out)))))))) (call-with-values open-bytevector-output-port (lambda (out get-bytevector) - (let ((read-result (read-bytes-and-write-till port out sequence)) + (let ((read-result (read-loop out)) (bv (get-bytevector))) (if (and (eof-object? read-result) (bytevector=? bv #vu8())) -- cgit v1.2.3