;;; guile-xapian --- Guile bindings for Xapian ;;; Copyright © 2020, 2022, 2024 Arun Isaac <arunisaac@systemreboot.net> ;;; Copyright © 2021 Bob131 <bob@bob131.so> ;;; ;;; This file is part of guile-xapian. ;;; ;;; guile-xapian is free software: you can redistribute it and/or ;;; modify it under the terms of the GNU General Public License as ;;; published by the Free Software Foundation, either version 2 of the ;;; License, or (at your option) any later version. ;;; ;;; guile-xapian is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ;;; General Public License for more details. ;;; ;;; You should have received a copy of the GNU General Public License ;;; along with guile-xapian. If not, see ;;; <https://www.gnu.org/licenses/>. (define-module (xapian xapian) #:use-module (rnrs arithmetic bitwise) #:use-module (rnrs bytevectors) #:use-module (ice-9 match) #:use-module (srfi srfi-1) #:use-module (srfi srfi-26) #:use-module (htmlprag) #:use-module (xapian wrap) #:export (xapian-open xapian-close call-with-database database-document-count xapian-open-writable xapian-close-writable call-with-writable-database call-with-in-memory-database add-document! replace-document! make-document document-data document-bytes document-set-data! document-set-bytes! document-slot-ref document-slot-ref-bytes document-slot-set! document-slot-set-bytes! make-stem make-term-generator index-text! increase-termpos! parse-query query query-and query-or query-filter prefixed-range-processor suffixed-range-processor prefixed-date-range-processor suffixed-date-range-processor field-processor enquire enquire-mset mset-item-docid mset-item-document mset-item-rank mset-item-weight mset-fold mset-snippet mset-sxml-snippet)) (define xapian-open new-Database) (define xapian-close delete-Database) (define (call-with-database dbpath proc) (let ((db (xapian-open dbpath))) (dynamic-wind noop (cut proc db) (cut xapian-close db)))) (define database-document-count Database-get-doccount) (define xapian-open-writable new-WritableDatabase) (define xapian-close-writable delete-WritableDatabase) (define (call-with-writable-database dbpath proc) (let ((db (xapian-open-writable dbpath))) (dynamic-wind noop (cut proc db) (cut xapian-close-writable db)))) (define (call-with-in-memory-database proc) "Call PROC passing it an in-memory writable database. Only a writable in-memory database can be created, since a read-only in-memory database would always remain empty and is of little use." (let ((db (inmemory-open))) (dynamic-wind noop (cut proc db) (cut xapian-close-writable db)))) (define add-document! WritableDatabase-add-document) (define replace-document! WritableDatabase-replace-document) (define* (make-document #:key data (terms '()) (values '())) (let ((doc (new-Document))) (when data (document-set-data! doc data)) (for-each (match-lambda ((term . wdf-increment) (Document-add-term doc term wdf-increment))) terms) (for-each (match-lambda ((slot . value) (document-slot-set! doc slot value))) values) doc)) (define document-data Document-get-data) (define document-bytes Document-get-data-bytes) (define (document-set-data! document data) "Set data of DOCUMENT to DATA. DATA may be a string or a bytevector." (cond ((string? data) (Document-set-data document data)) ((bytevector? data) (Document-set-data-bytes document data)) (else (error "Invalid document data" data)))) (define (document-set-bytes! doc data) (display "document-set-bytes! is deprecated. document-set-data! now supports setting bytevectors as data. Please use that instead." (current-error-port)) (Document-set-data-bytes doc data)) (define document-slot-ref Document-get-value) (define document-slot-ref-bytes Document-get-value-bytes) (define (document-slot-set! document slot value) "Set SLOT of DOCUMENT to VALUE. VALUE may be a string or a bytevector." (cond ((string? value) (Document-add-value document slot value)) ((bytevector? value) (Document-add-value-bytes document slot value)) (else (error "Invalid document slot value" value)))) (define (document-slot-set-bytes! document slot value) (display "document-slot-set-bytes! is deprecated. document-slot-set! now supports setting bytevectors as slot value. Please use that instead." (current-error-port)) (Document-add-value-bytes document slot value)) (define make-stem new-Stem) (define* (make-term-generator #:key stem document) (let ((term-generator (new-TermGenerator))) (TermGenerator-set-stemmer term-generator stem) (TermGenerator-set-document term-generator document) term-generator)) (define* (index-text! term-generator text #:key (wdf-increment 1) (prefix "") (positions? #t)) "Index @var{text} using @var{term-generator}. @var{wdf-increment} is the within document frequency increment. @var{prefix} is the term prefix to use. If @var{positions?} is #f, position information is not generated." ((if positions? TermGenerator-index-text TermGenerator-index-text-without-positions) term-generator text wdf-increment prefix)) (define increase-termpos! TermGenerator-increase-termpos) (define* (parse-query querystring #:key stemmer stemming-strategy (prefixes '()) (boolean-prefixes '()) (range-processors '()) (boolean? #t) (phrases? #t) (love-hate? #t) any-case-boolean? wildcard?) "Parse @var{querystring} and return a @code{Query} object. @var{prefixes} and @var{boolean-prefixes} must be association lists mapping fields to prefixes or @code{FieldProcessor} objects. @var{range-processors} is a list of @code{RangeProcessor} objects. When @var{boolean?} is @code{#t}, boolean operators (AND, OR, etc.) and bracketed subexpressions are supported. When @var{phrases?} is @code{#t}, quoted phrases are supported. When @var{love-hate?} is @code{#t}, @samp{+} and @samp{-} are supported. When @var{any-case-boolean?} is @code{#t}, boolean operators are supported even if they are not in capitals. When @var{wildcard?} is @code{#t}, wildcards are supported." (let ((queryparser (new-QueryParser))) (QueryParser-set-stemmer queryparser stemmer) (when stemming-strategy (QueryParser-set-stemming-strategy queryparser stemming-strategy)) (for-each (match-lambda ((field . prefix) (QueryParser-add-prefix queryparser field prefix))) prefixes) (for-each (match-lambda ((field . prefix) (QueryParser-add-boolean-prefix queryparser field prefix))) boolean-prefixes) (for-each (cut QueryParser-add-rangeprocessor queryparser <>) range-processors) (let ((query (QueryParser-parse-query queryparser querystring (bitwise-ior (get-flag QueryParser-FLAG-BOOLEAN boolean?) (get-flag QueryParser-FLAG-PHRASE phrases?) (get-flag QueryParser-FLAG-LOVEHATE love-hate?) (get-flag QueryParser-FLAG-BOOLEAN-ANY-CASE any-case-boolean?) (get-flag QueryParser-FLAG-WILDCARD wildcard?))))) (delete-QueryParser queryparser) query))) (define* (enquire db query #:key weighting-scheme) (let ((enquire (new-Enquire db))) (Enquire-set-query enquire query) (when weighting-scheme (Enquire-set-weighting-scheme enquire weighting-scheme)) enquire)) (define* (enquire-mset enquire #:key (offset 0) maximum-items) "Run a query using the settings in the ENQUIRE object and return an mset object. OFFSET specifies the number of items to ignore at the beginning of the result set. MAXIMUM-ITEMS specifies the maximum number of items to return. To return all matches, pass the result of calling database-document-count on the database object." (Enquire-get-mset enquire offset maximum-items)) (define mset-item-docid MSetIterator-get-docid) (define mset-item-document MSetIterator-get-document) (define mset-item-rank MSetIterator-get-rank) (define mset-item-weight MSetIterator-get-weight) (define (mset-fold proc init mset) (let loop ((head (MSet-begin mset)) (result init)) (cond ((MSetIterator-equals head (MSet-end mset)) result) (else (let ((result (proc head result))) (MSetIterator-next head) (loop head result)))))) (define (query term) "Return a @code{Query} object for @var{term}." (new-Query term)) (define (query-combine combine-operator default . queries) (reduce (cut new-Query combine-operator <> <>) default queries)) (define (query-and . queries) "Return a query matching only documents matching all @var{queries}. In a weighted context, the weight is the sum of the weights for all queries." (apply query-combine (Query-OP-AND) (Query-MatchAll) queries)) (define (query-or . queries) "Return a query matching documents which at least one of @var{queries} match. In a weighted context, the weight is the sum of the weights for matching queries." (apply query-combine (Query-OP-OR) (Query-MatchNothing) queries)) (define (query-filter . queries) "Return a query matching only documents matching all @var{queries}, but only take weight from the first of @var{queries}. In a non-weighted context, @code{query-filter} and @code{query-and} are equivalent." (apply query-combine (Query-OP-FILTER) (Query-MatchAll) queries)) (define* (prefixed-range-processor slot proc #:key (prefix "") repeated?) "Return a @code{RangeProcessor} object that calls @var{proc} to process its range over @var{slot}. @var{proc} is a procedure that, given a begin string and an end string, must return a @code{Query} object. For open-ended ranges, either the begin string or the end string will be @code{#f}. @var{prefix} is a prefix to look for to recognize values as belonging to this range. When @var{repeated?} is @code{#t}, allow @var{prefix} on both ends of the range—@samp{$1..$10}." (new-GuileXapianRangeProcessorWrapper slot prefix (get-flag RP-REPEATED repeated?) proc)) (define* (suffixed-range-processor slot proc #:key suffix repeated?) "Return a @code{RangeProcessor} object that calls @var{proc} to process its range over @var{slot}. @var{proc} is a procedure that, given a begin string and an end string, must return a @code{Query} object. For open-ended ranges, either the begin string or the end string will be @code{#f}. @var{suffix} is a suffix to look for to recognize values as belonging to this range. When @var{repeated?} is @code{#t}, allow @var{suffix} on both ends of the range—@samp{2kg..12kg}." (new-GuileXapianRangeProcessorWrapper slot suffix (bitwise-ior (RP-SUFFIX) (get-flag RP-REPEATED repeated?)) proc)) (define* (prefixed-date-range-processor slot #:key (prefix "") repeated? prefer-mdy? (epoch-year 1970)) "Return a @code{DateRangeProcessor} object that handles date ranges on @var{slot}. @var{prefix} and @var{repeated?} are the same as in @code{prefixed-range-processor}. When @var{prefer-mdy?} is @code{#t}, interpret ambiguous dates as month/day/year rather than day/month/year. @var{epoch-year} is the year to use as the epoch for dates with two-digit years." (new-DateRangeProcessor slot prefix (bitwise-ior (get-flag (RP-REPEATED) repeated?) (get-flag (RP-DATE-PREFER-MDY) prefer-mdy?)) epoch-year)) (define* (suffixed-date-range-processor slot #:key suffix repeated? prefer-mdy? (epoch-year 1970)) "Return a @code{DateRangeProcessor} object that handles date ranges on @var{slot}. @var{suffix} and @var{repeated?} are the same as in @code{suffixed-range-processor}. @var{prefer-mdy?} and @var{epoch-year} are the same as in @code{prefixed-date-range-processor}." (new-DateRangeProcessor slot suffix (bitwise-ior (RP-SUFFIX) (get-flag (RP-REPEATED) repeated?) (get-flag (RP-DATE-PREFER-MDY) prefer-mdy?)) epoch-year)) (define (field-processor proc) "Return a @code{FieldProcessor} object that calls @var{proc} to process its field. @var{proc} is a procedure that, given a string, must return a @code{Query} object." (new-GuileXapianFieldProcessorWrapper proc)) (define (get-flag flag-thunk value) (if value (flag-thunk) 0)) ;; TODO: Support cjk-words? (define* (mset-snippet mset text #:key (length 500) (stemmer (make-stem "none")) (highlight-start "<b>") (highlight-end "</b>") (omit "...") (background-model? #t) (exhaustive? #t) (empty-without-match? #t) (cjk-ngram? #t)) "Generate a snippet from @var{text}. @var{mset} is the xapian @code{MSet} object representing a list of search results. @var{length} is the number of bytes of @var{text} to aim to select. The same stemmer used to build the query should be specified as @var{stemmer}. @var{highlight-start} and @var{highlight-end} are inserted respectively before and after the highlit terms. If the chosen snippet seems to start or end mid-sentence, then @var{omit} is prepended or appended to indicate this. If @var{background-model?} is @code{#true}, the relevance of non-query terms are modelled to prefer snippets containing a more interesting background. If @var{exhaustive?} is @code{#true}, exhaustively evaluate candidate snippets. Else, snippet generation will stop once a @emph{good enough} snippet has been found. If @var{empty-without-match?} is @code{#true}, return the empty string if not a single term was found in @var{text}. Else, return a substring of text without any highlit terms. If @var{cjk-ngram?} is @code{#true}, enable generation of n-grams from CJK text. See @code{MSet::snippet} in @file{xapian/mset.h} of the xapian source for more details." (MSet-snippet mset text length stemmer (bitwise-ior (get-flag MSet-SNIPPET-BACKGROUND-MODEL background-model?) (get-flag MSet-SNIPPET-EXHAUSTIVE exhaustive?) (get-flag MSet-SNIPPET-EMPTY-WITHOUT-MATCH empty-without-match?) (get-flag MSet-SNIPPET-CJK-NGRAM cjk-ngram?)) highlight-start highlight-end omit)) (define* (mset-sxml-snippet mset text #:key (length 500) (stemmer (make-stem "none")) (highlight-proc (lambda (text) `(b ,text))) (omit "...") (background-model? #t) (exhaustive? #t) (empty-without-match? #t) (cjk-ngram? #t)) "Generate a snippet in SXML form from @var{text}. @var{highlight-proc} is a function that is passed the highlit text. It should return a SXML tree highlighting that text. Arguments @var{mset}, @var{length}, @var{stemmer}, @var{omit}, @var{background-model?}, @var{exhaustive?}, @var{empty-without-match?} and @var{cjk-ngram?} are the same as in the @code{mset-snippet} function." ;; mset-snippet returns serialized HTML. So, we reverse it with ;; html->sxml. (match (html->sxml (mset-snippet mset text)) (('*TOP* children ...) (append-map (match-lambda ;; Apply highlight-proc if highlit text. (('b text) (list (highlight-proc text))) ;; Add (br) if end of line. ((? (cut string-suffix? "\n" <>) text) (list (string-trim-right text #\newline) '(br))) ;; Else, return verbatim. (text (list text))) children))))