; SILex - Scheme Implementation of Lex ; Copyright (C) 2001 Danny Dube' ; ; This program is free software; you can redistribute it and/or ; modify it under the terms of the GNU General Public License ; as published by the Free Software Foundation; either version 2 ; of the License, or (at your option) any later version. ; ; This program is distributed in the hope that it will be useful, ; but WITHOUT ANY WARRANTY; without even the implied warranty of ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ; GNU General Public License for more details. ; ; You should have received a copy of the GNU General Public License ; along with this program; if not, write to the Free Software ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ; ; Fonctions auxilliaires du lexer ; (define parse-spec-char (lambda (lexeme line column) (make-tok char-tok lexeme line column newline-ch))) (define parse-digits-char (lambda (lexeme line column) (let* ((num (substring lexeme 1 (string-length lexeme))) (n (string->number num))) (make-tok char-tok lexeme line column n)))) (define parse-quoted-char (lambda (lexeme line column) (let ((c (string-ref lexeme 1))) (make-tok char-tok lexeme line column (char->integer c))))) (define parse-ordinary-char (lambda (lexeme line column) (let ((c (string-ref lexeme 0))) (make-tok char-tok lexeme line column (char->integer c))))) (define string-downcase (lambda (s) (let* ((l (string->list s)) (ld (map char-downcase l))) (list->string ld)))) (define extract-id (lambda (s) (let ((len (string-length s))) (substring s 1 (- len 1))))) (define parse-id (lambda (lexeme line column) (make-tok id-tok lexeme line column (string-downcase lexeme) lexeme))) (define parse-id-ref (lambda (lexeme line column) (let* ((orig-name (extract-id lexeme)) (name (string-downcase orig-name))) (make-tok subst-tok lexeme line column name orig-name)))) (define parse-power-m (lambda (lexeme line column) (let* ((len (string-length lexeme)) (substr (substring lexeme 1 (- len 1))) (m (string->number substr)) (range (cons m m))) (make-tok power-tok lexeme line column range)))) (define parse-power-m-inf (lambda (lexeme line column) (let* ((len (string-length lexeme)) (substr (substring lexeme 1 (- len 2))) (m (string->number substr)) (range (cons m 'inf))) (make-tok power-tok lexeme line column range)))) (define parse-power-m-n (lambda (lexeme line column) (let ((len (string-length lexeme))) (let loop ((comma 2)) (if (char=? (string-ref lexeme comma) #\,) (let* ((sub1 (substring lexeme 1 comma)) (sub2 (substring lexeme (+ comma 1) (- len 1))) (m (string->number sub1)) (n (string->number sub2)) (range (cons m n))) (make-tok power-tok lexeme line column range)) (loop (+ comma 1))))))) ; ; Lexer generique ; (define lexer-raw #f) (define lexer-stack '()) (define lexer-alist #f) (define lexer-buffer #f) (define lexer-buffer-empty? #t) (define lexer-history '()) (define lexer-history-interp #f) (define init-lexer (lambda (port) (let* ((IS (lexer-make-IS 'port port 'all)) (action-lexer (lexer-make-lexer action-tables IS)) (class-lexer (lexer-make-lexer class-tables IS)) (macro-lexer (lexer-make-lexer macro-tables IS)) (regexp-lexer (lexer-make-lexer regexp-tables IS)) (string-lexer (lexer-make-lexer string-tables IS))) (set! lexer-raw #f) (set! lexer-stack '()) (set! lexer-alist (list (cons 'action action-lexer) (cons 'class class-lexer) (cons 'macro macro-lexer) (cons 'regexp regexp-lexer) (cons 'string string-lexer))) (set! lexer-buffer-empty? #t) (set! lexer-history '())))) ; Lexer brut ; S'assurer qu'il n'y a pas de risque de changer de ; lexer quand le buffer est rempli (define push-lexer (lambda (name) (set! lexer-stack (cons lexer-raw lexer-stack)) (set! lexer-raw (cdr (assq name lexer-alist))))) (define pop-lexer (lambda () (set! lexer-raw (car lexer-stack)) (set! lexer-stack (cdr lexer-stack)))) ; Traite le "unget" (capacite du unget: 1) (define lexer2 (lambda () (if lexer-buffer-empty? (lexer-raw) (begin (set! lexer-buffer-empty? #t) lexer-buffer)))) (define lexer2-unget (lambda (tok) (set! lexer-buffer tok) (set! lexer-buffer-empty? #f))) ; Traite l'historique (define lexer (lambda () (let* ((tok (lexer2)) (tok-lexeme (get-tok-lexeme tok)) (hist-lexeme (if lexer-history-interp (blank-translate tok-lexeme) tok-lexeme))) (set! lexer-history (cons hist-lexeme lexer-history)) tok))) (define lexer-unget (lambda (tok) (set! lexer-history (cdr lexer-history)) (lexer2-unget tok))) (define lexer-set-blank-history (lambda (b) (set! lexer-history-interp b))) (define blank-translate (lambda (s) (let ((ss (string-copy s))) (let loop ((i (- (string-length ss) 1))) (cond ((< i 0) ss) ((char=? (string-ref ss i) (integer->char tab-ch)) (loop (- i 1))) ((char=? (string-ref ss i) #\newline) (loop (- i 1))) (else (string-set! ss i #\space) (loop (- i 1)))))))) (define lexer-get-history (lambda () (let* ((rightlist (reverse lexer-history)) (str (apply string-append rightlist)) (strlen (string-length str)) (str2 (if (and (> strlen 0) (char=? (string-ref str (- strlen 1)) #\newline)) str (string-append str (string #\newline))))) (set! lexer-history '()) str2))) ; ; Traitement des listes de tokens ; (define de-anchor-tokens (let ((not-anchor-toks (make-dispatch-table number-of-tokens (list (cons caret-tok #f) (cons dollar-tok #f) (cons <>-tok #f) (cons <>-tok #f)) #t))) (lambda (tok-list) (if (null? tok-list) '() (let* ((tok (car tok-list)) (tok-type (get-tok-type tok)) (toks (cdr tok-list)) (new-toks (de-anchor-tokens toks))) (cond ((vector-ref not-anchor-toks tok-type) (cons tok new-toks)) ((or (= tok-type caret-tok) (= tok-type dollar-tok)) (let* ((line (get-tok-line tok)) (column (get-tok-column tok)) (attr (if (= tok-type caret-tok) caret-ch dollar-ch)) (new-tok (make-tok char-tok "" line column attr))) (cons new-tok new-toks))) ((= tok-type <>-tok) (lex-error (get-tok-line tok) (get-tok-column tok) "the <> anchor must be used alone" " and only after %%.")) ((= tok-type <>-tok) (lex-error (get-tok-line tok) (get-tok-column tok) "the <> anchor must be used alone" " and only after %%.")))))))) (define strip-end (lambda (l) (if (null? (cdr l)) '() (cons (car l) (strip-end (cdr l)))))) (define extract-anchors (lambda (tok-list) (let* ((tok1 (car tok-list)) (line (get-tok-line tok1)) (tok1-type (get-tok-type tok1))) (cond ((and (= tok1-type <>-tok) (null? (cdr tok-list))) (make-rule line #t #f #f #f '() #f)) ((and (= tok1-type <>-tok) (null? (cdr tok-list))) (make-rule line #f #t #f #f '() #f)) (else (let* ((bol? (= tok1-type caret-tok)) (tok-list2 (if bol? (cdr tok-list) tok-list))) (if (null? tok-list2) (make-rule line #f #f bol? #f tok-list2 #f) (let* ((len (length tok-list2)) (tok2 (list-ref tok-list2 (- len 1))) (tok2-type (get-tok-type tok2)) (eol? (= tok2-type dollar-tok)) (tok-list3 (if eol? (strip-end tok-list2) tok-list2))) (make-rule line #f #f bol? eol? tok-list3 #f))))))))) (define char-list->conc (lambda (char-list) (if (null? char-list) (make-re epsilon-re) (let loop ((cl char-list)) (let* ((c (car cl)) (cl2 (cdr cl))) (if (null? cl2) (make-re char-re c) (make-re conc-re (make-re char-re c) (loop cl2)))))))) (define parse-tokens-atom (let ((action-table (make-dispatch-table number-of-tokens (list (cons lpar-tok (lambda (tok tok-list macros) (parse-tokens-sub tok-list macros))) (cons dot-tok (lambda (tok tok-list macros) (cons (make-re class-re dot-class) (cdr tok-list)))) (cons subst-tok (lambda (tok tok-list macros) (let* ((name (get-tok-attr tok)) (ass (assoc name macros))) (if ass (cons (cdr ass) (cdr tok-list)) (lex-error (get-tok-line tok) (get-tok-column tok) "unknown macro \"" (get-tok-2nd-attr tok) "\"."))))) (cons char-tok (lambda (tok tok-list macros) (let ((c (get-tok-attr tok))) (cons (make-re char-re c) (cdr tok-list))))) (cons class-tok (lambda (tok tok-list macros) (let ((class (get-tok-attr tok))) (cons (make-re class-re class) (cdr tok-list))))) (cons string-tok (lambda (tok tok-list macros) (let* ((char-list (get-tok-attr tok)) (re (char-list->conc char-list))) (cons re (cdr tok-list)))))) (lambda (tok tok-list macros) (lex-error (get-tok-line tok) (get-tok-column tok) "syntax error in regular expression."))))) (lambda (tok-list macros) (let* ((tok (car tok-list)) (tok-type (get-tok-type tok)) (action (vector-ref action-table tok-type))) (action tok tok-list macros))))) (define check-power-tok (lambda (tok) (let* ((range (get-tok-attr tok)) (start (car range)) (end (cdr range))) (if (or (eq? 'inf end) (<= start end)) range (lex-error (get-tok-line tok) (get-tok-column tok) "incorrect power specification."))))) (define power->star-plus (lambda (re range) (power->star-plus-rec re (car range) (cdr range)))) (define power->star-plus-rec (lambda (re start end) (cond ((eq? end 'inf) (cond ((= start 0) (make-re star-re re)) ((= start 1) (make-re plus-re re)) (else (make-re conc-re re (power->star-plus-rec re (- start 1) 'inf))))) ((= start 0) (cond ((= end 0) (make-re epsilon-re)) ((= end 1) (make-re question-re re)) (else (make-re question-re (power->star-plus-rec re 1 end))))) ((= start 1) (if (= end 1) re (make-re conc-re re (power->star-plus-rec re 0 (- end 1))))) (else (make-re conc-re re (power->star-plus-rec re (- start 1) (- end 1))))))) (define parse-tokens-fact (let ((not-op-toks (make-dispatch-table number-of-tokens (list (cons question-tok #f) (cons plus-tok #f) (cons star-tok #f) (cons power-tok #f)) #t))) (lambda (tok-list macros) (let* ((result (parse-tokens-atom tok-list macros)) (re (car result)) (tok-list2 (cdr result))) (let loop ((re re) (tok-list3 tok-list2)) (let* ((tok (car tok-list3)) (tok-type (get-tok-type tok))) (cond ((vector-ref not-op-toks tok-type) (cons re tok-list3)) ((= tok-type question-tok) (loop (make-re question-re re) (cdr tok-list3))) ((= tok-type plus-tok) (loop (make-re plus-re re) (cdr tok-list3))) ((= tok-type star-tok) (loop (make-re star-re re) (cdr tok-list3))) ((= tok-type power-tok) (loop (power->star-plus re (check-power-tok tok)) (cdr tok-list3)))))))))) (define parse-tokens-conc (lambda (tok-list macros) (let* ((result1 (parse-tokens-fact tok-list macros)) (re1 (car result1)) (tok-list2 (cdr result1)) (tok (car tok-list2)) (tok-type (get-tok-type tok))) (cond ((or (= tok-type pipe-tok) (= tok-type rpar-tok)) result1) (else ; Autres facteurs (let* ((result2 (parse-tokens-conc tok-list2 macros)) (re2 (car result2)) (tok-list3 (cdr result2))) (cons (make-re conc-re re1 re2) tok-list3))))))) (define parse-tokens-or (lambda (tok-list macros) (let* ((result1 (parse-tokens-conc tok-list macros)) (re1 (car result1)) (tok-list2 (cdr result1)) (tok (car tok-list2)) (tok-type (get-tok-type tok))) (cond ((= tok-type pipe-tok) (let* ((tok-list3 (cdr tok-list2)) (result2 (parse-tokens-or tok-list3 macros)) (re2 (car result2)) (tok-list4 (cdr result2))) (cons (make-re or-re re1 re2) tok-list4))) (else ; rpar-tok result1))))) (define parse-tokens-sub (lambda (tok-list macros) (let* ((tok-list2 (cdr tok-list)) ; Manger le lpar-tok (result (parse-tokens-or tok-list2 macros)) (re (car result)) (tok-list3 (cdr result)) (tok-list4 (cdr tok-list3))) ; Manger le rpar-tok (cons re tok-list4)))) (define parse-tokens-match (lambda (tok-list line) (let loop ((tl tok-list) (count 0)) (if (null? tl) (if (> count 0) (lex-error line #f "mismatched parentheses.")) (let* ((tok (car tl)) (tok-type (get-tok-type tok))) (cond ((= tok-type lpar-tok) (loop (cdr tl) (+ count 1))) ((= tok-type rpar-tok) (if (zero? count) (lex-error line #f "mismatched parentheses.")) (loop (cdr tl) (- count 1))) (else (loop (cdr tl) count)))))))) ; Ne traite pas les anchors (define parse-tokens (lambda (tok-list macros) (if (null? tok-list) (make-re epsilon-re) (let ((line (get-tok-line (car tok-list)))) (parse-tokens-match tok-list line) (let* ((begin-par (make-tok lpar-tok "" line 1)) (end-par (make-tok rpar-tok "" line 1))) (let* ((tok-list2 (append (list begin-par) tok-list (list end-par))) (result (parse-tokens-sub tok-list2 macros))) (car result))))))) ; (cdr result) == () obligatoirement (define tokens->regexp (lambda (tok-list macros) (let ((tok-list2 (de-anchor-tokens tok-list))) (parse-tokens tok-list2 macros)))) (define tokens->rule (lambda (tok-list macros) (let* ((rule (extract-anchors tok-list)) (tok-list2 (get-rule-regexp rule)) (tok-list3 (de-anchor-tokens tok-list2)) (re (parse-tokens tok-list3 macros))) (set-rule-regexp rule re) rule))) ; Retourne une paire: <>-action et vecteur des regles ordinaires (define adapt-rules (lambda (rules) (let loop ((r rules) (revr '()) (<>-action #f) (<>-action #f)) (if (null? r) (cons (or <>-action default-<>-action) (cons (or <>-action default-<>-action) (list->vector (reverse revr)))) (let ((r1 (car r))) (cond ((get-rule-eof? r1) (if <>-action (lex-error (get-rule-line r1) #f "the <> anchor can be " "used at most once.") (loop (cdr r) revr (get-rule-action r1) <>-action))) ((get-rule-error? r1) (if <>-action (lex-error (get-rule-line r1) #f "the <> anchor can be " "used at most once.") (loop (cdr r) revr <>-action (get-rule-action r1)))) (else (loop (cdr r) (cons r1 revr) <>-action <>-action)))))))) ; ; Analyseur de fichier lex ; (define parse-hv-blanks (lambda () (let* ((tok (lexer)) (tok-type (get-tok-type tok))) (if (or (= tok-type hblank-tok) (= tok-type vblank-tok)) (parse-hv-blanks) (lexer-unget tok))))) (define parse-class-range (lambda () (let* ((tok (lexer)) (tok-type (get-tok-type tok))) (cond ((= tok-type char-tok) (let* ((c (get-tok-attr tok)) (tok2 (lexer)) (tok2-type (get-tok-type tok2))) (if (not (= tok2-type minus-tok)) (begin (lexer-unget tok2) (cons c c)) (let* ((tok3 (lexer)) (tok3-type (get-tok-type tok3))) (cond ((= tok3-type char-tok) (let ((c2 (get-tok-attr tok3))) (if (> c c2) (lex-error (get-tok-line tok3) (get-tok-column tok3) "bad range specification in " "character class;" #\newline "the start character is " "higher than the end one.") (cons c c2)))) ((or (= tok3-type rbrack-tok) (= tok3-type minus-tok)) (lex-error (get-tok-line tok3) (get-tok-column tok3) "bad range specification in " "character class; a specification" #\newline "like \"-x\", \"x--\" or \"x-]\" has " "been used.")) ((= tok3-type eof-tok) (lex-error (get-tok-line tok3) #f "eof of file found while parsing " "a character class."))))))) ((= tok-type minus-tok) (lex-error (get-tok-line tok) (get-tok-column tok) "bad range specification in character class; a " "specification" #\newline "like \"-x\", \"x--\" or \"x-]\" has been used.")) ((= tok-type rbrack-tok) #f) ((= tok-type eof-tok) (lex-error (get-tok-line tok) #f "eof of file found while parsing " "a character class.")))))) (define parse-class (lambda (initial-class negative-class? line column) (push-lexer 'class) (let loop ((class initial-class)) (let ((new-range (parse-class-range))) (if new-range (loop (class-union (list new-range) class)) (let ((class (if negative-class? (class-compl class) class))) (pop-lexer) (make-tok class-tok "" line column class))))))) (define parse-string (lambda (line column) (push-lexer 'string) (let ((char-list (let loop () (let* ((tok (lexer)) (tok-type (get-tok-type tok))) (cond ((= tok-type char-tok) (cons (get-tok-attr tok) (loop))) ((= tok-type doublequote-tok) (pop-lexer) '()) (else ; eof-tok (lex-error (get-tok-line tok) #f "end of file found while " "parsing a string."))))))) (make-tok string-tok "" line column char-list)))) (define parse-regexp (let* ((end-action (lambda (tok loop) (lexer-unget tok) (pop-lexer) (lexer-set-blank-history #f) `())) (action-table (make-dispatch-table number-of-tokens (list (cons eof-tok end-action) (cons hblank-tok end-action) (cons vblank-tok end-action) (cons lbrack-tok (lambda (tok loop) (let ((tok1 (parse-class (list) #f (get-tok-line tok) (get-tok-column tok)))) (cons tok1 (loop))))) (cons lbrack-rbrack-tok (lambda (tok loop) (let ((tok1 (parse-class (list (cons rbrack-ch rbrack-ch)) #f (get-tok-line tok) (get-tok-column tok)))) (cons tok1 (loop))))) (cons lbrack-caret-tok (lambda (tok loop) (let ((tok1 (parse-class (list) #t (get-tok-line tok) (get-tok-column tok)))) (cons tok1 (loop))))) (cons lbrack-minus-tok (lambda (tok loop) (let ((tok1 (parse-class (list (cons minus-ch minus-ch)) #f (get-tok-line tok) (get-tok-column tok)))) (cons tok1 (loop))))) (cons doublequote-tok (lambda (tok loop) (let ((tok1 (parse-string (get-tok-line tok) (get-tok-column tok)))) (cons tok1 (loop))))) (cons illegal-tok (lambda (tok loop) (lex-error (get-tok-line tok) (get-tok-column tok) "syntax error in macro reference.")))) (lambda (tok loop) (cons tok (loop)))))) (lambda () (push-lexer 'regexp) (lexer-set-blank-history #t) (parse-hv-blanks) (let loop () (let* ((tok (lexer)) (tok-type (get-tok-type tok)) (action (vector-ref action-table tok-type))) (action tok loop)))))) (define parse-ws1-regexp ; Exige un blanc entre le nom et la RE d'une macro (lambda () (let* ((tok (lexer)) (tok-type (get-tok-type tok))) (cond ((or (= tok-type hblank-tok) (= tok-type vblank-tok)) (parse-regexp)) (else ; percent-percent-tok, id-tok ou illegal-tok (lex-error (get-tok-line tok) (get-tok-column tok) "white space expected.")))))) (define parse-macro (lambda (macros) (push-lexer 'macro) (parse-hv-blanks) (let* ((tok (lexer)) (tok-type (get-tok-type tok))) (cond ((= tok-type id-tok) (let* ((name (get-tok-attr tok)) (ass (assoc name macros))) (if ass (lex-error (get-tok-line tok) (get-tok-column tok) "the macro \"" (get-tok-2nd-attr tok) "\" has already been defined.") (let* ((tok-list (parse-ws1-regexp)) (regexp (tokens->regexp tok-list macros))) (pop-lexer) (cons name regexp))))) ((= tok-type percent-percent-tok) (pop-lexer) #f) ((= tok-type illegal-tok) (lex-error (get-tok-line tok) (get-tok-column tok) "macro name expected.")) ((= tok-type eof-tok) (lex-error (get-tok-line tok) #f "end of file found before %%.")))))) (define parse-macros (lambda () (let loop ((macros '())) (let ((macro (parse-macro macros))) (if macro (loop (cons macro macros)) macros))))) (define parse-action-end (lambda (<>-action? <>-action? action?) (let ((act (lexer-get-history))) (cond (action? act) (<>-action? (string-append act default-<>-action)) (<>-action? (string-append act default-<>-action)) (else (string-append act default-action)))))) (define parse-action (lambda (<>-action? <>-action?) (push-lexer 'action) (let loop ((action? #f)) (let* ((tok (lexer)) (tok-type (get-tok-type tok))) (cond ((= tok-type char-tok) (loop #t)) ((= tok-type hblank-tok) (loop action?)) ((= tok-type vblank-tok) (push-lexer 'regexp) (let* ((tok (lexer)) (tok-type (get-tok-type tok)) (bidon (lexer-unget tok))) (pop-lexer) (if (or (= tok-type hblank-tok) (= tok-type vblank-tok)) (loop action?) (begin (pop-lexer) (parse-action-end <>-action? <>-action? action?))))) (else ; eof-tok (lexer-unget tok) (pop-lexer) (parse-action-end <>-action? <>-action? action?))))))) (define parse-rule (lambda (macros) (let ((tok-list (parse-regexp))) (if (null? tok-list) #f (let* ((rule (tokens->rule tok-list macros)) (action (parse-action (get-rule-eof? rule) (get-rule-error? rule)))) (set-rule-action rule action) rule))))) (define parse-rules (lambda (macros) (parse-action #f #f) (let loop () (let ((rule (parse-rule macros))) (if rule (cons rule (loop)) '()))))) (define parser (lambda (filename) (let* ((port (open-input-file filename)) (port-open? #t)) (lex-unwind-protect (lambda () (if port-open? (close-input-port port)))) (init-lexer port) (let* ((macros (parse-macros)) (rules (parse-rules macros))) (close-input-port port) (set! port-open? #f) (adapt-rules rules)))))