;;; guile-xapian --- Guile bindings for Xapian ;;; Copyright © 2020, 2022 Arun Isaac ;;; Copyright © 2021 Bob131 ;;; ;;; This file is part of guile-xapian. ;;; ;;; guile-xapian is free software: you can redistribute it and/or ;;; modify it under the terms of the GNU General Public License as ;;; published by the Free Software Foundation, either version 2 of the ;;; License, or (at your option) any later version. ;;; ;;; guile-xapian is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;; General Public License for more details. ;;; ;;; You should have received a copy of the GNU General Public License ;;; along with guile-xapian. If not, see ;;; . (define-module (xapian xapian) #:use-module (rnrs arithmetic bitwise) #:use-module (rnrs bytevectors) #:use-module (ice-9 match) #:use-module (srfi srfi-1) #:use-module (srfi srfi-26) #:use-module (htmlprag) #:use-module (xapian wrap) #:export (xapian-open xapian-close call-with-database database-document-count xapian-open-writable xapian-close-writable call-with-writable-database call-with-in-memory-database add-document! replace-document! make-document document-data document-bytes document-set-data! document-set-bytes! document-slot-ref document-slot-ref-bytes document-slot-set! document-slot-set-bytes! make-stem make-term-generator index-text! increase-termpos! parse-query query-and query-or query-filter enquire enquire-mset mset-item-docid mset-item-document mset-item-rank mset-item-weight mset-fold mset-snippet mset-sxml-snippet)) (define xapian-open new-Database) (define xapian-close delete-Database) (define (call-with-database dbpath proc) (let ((db (xapian-open dbpath))) (dynamic-wind noop (cut proc db) (cut xapian-close db)))) (define database-document-count Database-get-doccount) (define xapian-open-writable new-WritableDatabase) (define xapian-close-writable delete-WritableDatabase) (define (call-with-writable-database dbpath proc) (let ((db (xapian-open-writable dbpath))) (dynamic-wind noop (cut proc db) (cut xapian-close-writable db)))) (define (call-with-in-memory-database proc) "Call PROC passing it an in-memory writable database. Only a writable in-memory database can be created, since a read-only in-memory database would always remain empty and is of little use." (let ((db (inmemory-open))) (dynamic-wind noop (cut proc db) (cut xapian-close-writable db)))) (define add-document! WritableDatabase-add-document) (define replace-document! WritableDatabase-replace-document) (define* (make-document #:key data (terms '()) (values '())) (let ((doc (new-Document))) (when data (document-set-data! doc data)) (for-each (match-lambda ((term . wdf-increment) (Document-add-term doc term wdf-increment))) terms) (for-each (match-lambda ((slot . value) (document-slot-set! doc slot value))) values) doc)) (define document-data Document-get-data) (define document-bytes Document-get-data-bytes) (define (document-set-data! document data) "Set data of DOCUMENT to DATA. DATA may be a string or a bytevector." (cond ((string? data) (Document-set-data document data)) ((bytevector? data) (Document-set-data-bytes document data)) (else (error "Invalid document data" data)))) (define (document-set-bytes! doc data) (display "document-set-bytes! is deprecated. document-set-data! now supports setting bytevectors as data. Please use that instead." (current-error-port)) (Document-set-data-bytes doc data)) (define document-slot-ref Document-get-value) (define document-slot-ref-bytes Document-get-value-bytes) (define (document-slot-set! document slot value) "Set SLOT of DOCUMENT to VALUE. VALUE may be a string or a bytevector." (cond ((string? value) (Document-add-value document slot value)) ((bytevector? value) (Document-add-value-bytes document slot value)) (else (error "Invalid document slot value" value)))) (define (document-slot-set-bytes! document slot value) (display "document-slot-set-bytes! is deprecated. document-slot-set! now supports setting bytevectors as slot value. Please use that instead." (current-error-port)) (Document-add-value-bytes document slot value)) (define make-stem new-Stem) (define* (make-term-generator #:key stem document) (let ((term-generator (new-TermGenerator))) (TermGenerator-set-stemmer term-generator stem) (TermGenerator-set-document term-generator document) term-generator)) (define* (index-text! term-generator text #:key (wdf-increment 1) prefix) (apply TermGenerator-index-text term-generator text wdf-increment (if prefix (list prefix) '()))) (define increase-termpos! TermGenerator-increase-termpos) (define* (parse-query querystring #:key stemmer stemming-strategy (prefixes '())) (let ((queryparser (new-QueryParser))) (QueryParser-set-stemmer queryparser stemmer) (when stemming-strategy (QueryParser-set-stemming-strategy queryparser stemming-strategy)) (for-each (match-lambda ((field . prefix) (QueryParser-add-prefix queryparser field prefix))) prefixes) (let ((query (QueryParser-parse-query queryparser querystring))) (delete-QueryParser queryparser) query))) (define* (enquire db query #:key weighting-scheme) (let ((enquire (new-Enquire db))) (Enquire-set-query enquire query) (when weighting-scheme (Enquire-set-weighting-scheme enquire weighting-scheme)) enquire)) (define* (enquire-mset enquire #:key (offset 0) maximum-items) "Run a query using the settings in the ENQUIRE object and return an mset object. OFFSET specifies the number of items to ignore at the beginning of the result set. MAXIMUM-ITEMS specifies the maximum number of items to return. To return all matches, pass the result of calling database-document-count on the database object." (Enquire-get-mset enquire offset maximum-items)) (define mset-item-docid MSetIterator-get-docid) (define mset-item-document MSetIterator-get-document) (define mset-item-rank MSetIterator-get-rank) (define mset-item-weight MSetIterator-get-weight) (define (mset-fold proc init mset) (let loop ((head (MSet-begin mset)) (result init)) (cond ((MSetIterator-equals head (MSet-end mset)) result) (else (let ((result (proc head result))) (MSetIterator-next head) (loop head result)))))) (define (query-combine combine-operator default . queries) (reduce (cut new-Query combine-operator <> <>) default queries)) (define (query-and . queries) "Return a query matching only documents matching all @var{queries}. In a weighted context, the weight is the sum of the weights for all queries." (apply query-combine (Query-OP-AND) (Query-MatchAll) queries)) (define (query-or . queries) "Return a query matching documents which at least one of @var{queries} match. In a weighted context, the weight is the sum of the weights for matching queries." (apply query-combine (Query-OP-OR) (Query-MatchNothing) queries)) (define (query-filter . queries) "Return a query matching only documents matching all @var{queries}, but only take weight from the first of @var{queries}. In a non-weighted context, @code{query-filter} and @code{query-and} are equivalent." (apply query-combine (Query-OP-FILTER) (Query-MatchAll) queries)) (define (get-flag flag-thunk value) (if value (flag-thunk) 0)) ;; TODO: Support cjk-words? (define* (mset-snippet mset text #:key (length 500) (stemmer (make-stem "none")) (highlight-start "") (highlight-end "") (omit "...") (background-model? #t) (exhaustive? #t) (empty-without-match? #t) (cjk-ngram? #t)) "Generate a snippet from @var{text}. @var{mset} is the xapian @code{MSet} object representing a list of search results. @var{length} is the number of bytes of @var{text} to aim to select. The same stemmer used to build the query should be specified as @var{stemmer}. @var{highlight-start} and @var{highlight-end} are inserted respectively before and after the highlit terms. If the chosen snippet seems to start or end mid-sentence, then @var{omit} is prepended or appended to indicate this. If @var{background-model?} is @code{#true}, the relevance of non-query terms are modelled to prefer snippets containing a more interesting background. If @var{exhaustive?} is @code{#true}, exhaustively evaluate candidate snippets. Else, snippet generation will stop once a @emph{good enough} snippet has been found. If @var{empty-without-match?} is @code{#true}, return the empty string if not a single term was found in @var{text}. Else, return a substring of text without any highlit terms. If @var{cjk-ngram?} is @code{#true}, enable generation of n-grams from CJK text. See @code{MSet::snippet} in @file{xapian/mset.h} of the xapian source for more details." (MSet-snippet mset text length stemmer (bitwise-ior (get-flag MSet-SNIPPET-BACKGROUND-MODEL background-model?) (get-flag MSet-SNIPPET-EXHAUSTIVE exhaustive?) (get-flag MSet-SNIPPET-EMPTY-WITHOUT-MATCH empty-without-match?) (get-flag MSet-SNIPPET-CJK-NGRAM cjk-ngram?)) highlight-start highlight-end omit)) (define* (mset-sxml-snippet mset text #:key (length 500) (stemmer (make-stem "none")) (highlight-proc (lambda (text) `(b ,text))) (omit "...") (background-model? #t) (exhaustive? #t) (empty-without-match? #t) (cjk-ngram? #t)) "Generate a snippet in SXML form from @var{text}. @var{highlight-proc} is a function that is passed the highlit text. It should return a SXML tree highlighting that text. Arguments @var{mset}, @var{length}, @var{stemmer}, @var{omit}, @var{background-model?}, @var{exhaustive?}, @var{empty-without-match?} and @var{cjk-ngram?} are the same as in the @code{mset-snippet} function." ;; mset-snippet returns serialized HTML. So, we reverse it with ;; html->sxml. (match (html->sxml (mset-snippet text mset)) (('*TOP* children ...) (append-map (match-lambda ;; Apply highlight-proc if highlit text. (('b text) (highlight-proc text)) ;; Add (br) if end of line. ((? (cut string-suffix? "\n" <>) text) (list (string-trim-right text #\newline) '(br))) ;; Else, return verbatim. (text text)) children))))