diff options
Diffstat (limited to 'xapian/xapian.scm')
-rw-r--r-- | xapian/xapian.scm | 242 |
1 files changed, 235 insertions, 7 deletions
diff --git a/xapian/xapian.scm b/xapian/xapian.scm index 75924ae..64042a2 100644 --- a/xapian/xapian.scm +++ b/xapian/xapian.scm @@ -1,5 +1,5 @@ ;;; guile-xapian --- Guile bindings for Xapian -;;; Copyright © 2020, 2022 Arun Isaac <arunisaac@systemreboot.net> +;;; Copyright © 2020, 2022, 2024 Arun Isaac <arunisaac@systemreboot.net> ;;; Copyright © 2021 Bob131 <bob@bob131.so> ;;; ;;; This file is part of guile-xapian. @@ -22,7 +22,9 @@ #:use-module (rnrs arithmetic bitwise) #:use-module (rnrs bytevectors) #:use-module (ice-9 match) + #:use-module (srfi srfi-1) #:use-module (srfi srfi-26) + #:use-module (htmlprag) #:use-module (xapian wrap) #:export (xapian-open xapian-close @@ -48,6 +50,15 @@ index-text! increase-termpos! parse-query + query + query-and + query-or + query-filter + prefixed-range-processor + suffixed-range-processor + prefixed-date-range-processor + suffixed-date-range-processor + field-processor enquire enquire-mset mset-item-docid @@ -55,7 +66,8 @@ mset-item-rank mset-item-weight mset-fold - mset-snippet)) + mset-snippet + mset-sxml-snippet)) (define xapian-open new-Database) (define xapian-close delete-Database) @@ -142,13 +154,49 @@ bytevector." (TermGenerator-set-document term-generator document) term-generator)) -(define* (index-text! term-generator text #:key (wdf-increment 1) prefix) - (apply TermGenerator-index-text term-generator text wdf-increment - (if prefix (list prefix) '()))) +(define* (index-text! term-generator text #:key (wdf-increment 1) (prefix "") (positions? #t)) + "Index @var{text} using @var{term-generator}. @var{wdf-increment} is +the within document frequency increment. @var{prefix} is the term +prefix to use. If @var{positions?} is #f, position information is not +generated." + ((if positions? + TermGenerator-index-text + TermGenerator-index-text-without-positions) + term-generator text wdf-increment prefix)) (define increase-termpos! TermGenerator-increase-termpos) -(define* (parse-query querystring #:key stemmer stemming-strategy (prefixes '())) +(define* (parse-query querystring + #:key + stemmer + stemming-strategy + (prefixes '()) + (boolean-prefixes '()) + (range-processors '()) + (boolean? #t) + (phrases? #t) + (love-hate? #t) + any-case-boolean? + wildcard?) + "Parse @var{querystring} and return a @code{Query} object. + +@var{prefixes} and @var{boolean-prefixes} must be association lists +mapping fields to prefixes or @code{FieldProcessor} +objects. @var{range-processors} is a list of @code{RangeProcessor} +objects. + +When @var{boolean?} is @code{#t}, boolean operators (AND, OR, etc.) +and bracketed subexpressions are supported. + +When @var{phrases?} is @code{#t}, quoted phrases are supported. + +When @var{love-hate?} is @code{#t}, @samp{+} and @samp{-} are +supported. + +When @var{any-case-boolean?} is @code{#t}, boolean operators are +supported even if they are not in capitals. + +When @var{wildcard?} is @code{#t}, wildcards are supported." (let ((queryparser (new-QueryParser))) (QueryParser-set-stemmer queryparser stemmer) (when stemming-strategy @@ -157,7 +205,19 @@ bytevector." ((field . prefix) (QueryParser-add-prefix queryparser field prefix))) prefixes) - (let ((query (QueryParser-parse-query queryparser querystring))) + (for-each (match-lambda + ((field . prefix) + (QueryParser-add-boolean-prefix queryparser field prefix))) + boolean-prefixes) + (for-each (cut QueryParser-add-rangeprocessor queryparser <>) + range-processors) + (let ((query (QueryParser-parse-query queryparser + querystring + (bitwise-ior (get-flag QueryParser-FLAG-BOOLEAN boolean?) + (get-flag QueryParser-FLAG-PHRASE phrases?) + (get-flag QueryParser-FLAG-LOVEHATE love-hate?) + (get-flag QueryParser-FLAG-BOOLEAN-ANY-CASE any-case-boolean?) + (get-flag QueryParser-FLAG-WILDCARD wildcard?))))) (delete-QueryParser queryparser) query))) @@ -194,6 +254,113 @@ on the database object." (MSetIterator-next head) (loop head result)))))) +(define (query term) + "Return a @code{Query} object for @var{term}." + (new-Query term)) + +(define (query-combine combine-operator default . queries) + (reduce (cut new-Query combine-operator <> <>) + default + queries)) + +(define (query-and . queries) + "Return a query matching only documents matching all @var{queries}. + +In a weighted context, the weight is the sum of the weights for all +queries." + (apply query-combine (Query-OP-AND) (Query-MatchAll) queries)) + +(define (query-or . queries) + "Return a query matching documents which at least one of @var{queries} +match. + +In a weighted context, the weight is the sum of the weights for +matching queries." + (apply query-combine (Query-OP-OR) (Query-MatchNothing) queries)) + +(define (query-filter . queries) + "Return a query matching only documents matching all @var{queries}, +but only take weight from the first of @var{queries}. + +In a non-weighted context, @code{query-filter} and @code{query-and} +are equivalent." + (apply query-combine (Query-OP-FILTER) (Query-MatchAll) queries)) + +(define* (prefixed-range-processor slot proc #:key (prefix "") repeated?) + "Return a @code{RangeProcessor} object that calls @var{proc} to process +its range over @var{slot}. + +@var{proc} is a procedure that, given a begin string and an end +string, must return a @code{Query} object. For open-ended ranges, +either the begin string or the end string will be @code{#f}. + +@var{prefix} is a prefix to look for to recognize values as belonging +to this range. When @var{repeated?} is @code{#t}, allow @var{prefix} +on both ends of the range—@samp{$1..$10}." + (new-GuileXapianRangeProcessorWrapper + slot + prefix + (get-flag RP-REPEATED repeated?) + proc)) + +(define* (suffixed-range-processor slot proc #:key suffix repeated?) + "Return a @code{RangeProcessor} object that calls @var{proc} to process +its range over @var{slot}. + +@var{proc} is a procedure that, given a begin string and an end +string, must return a @code{Query} object. For open-ended ranges, +either the begin string or the end string will be @code{#f}. + +@var{suffix} is a suffix to look for to recognize values as belonging +to this range. When @var{repeated?} is @code{#t}, allow @var{suffix} +on both ends of the range—@samp{2kg..12kg}." + (new-GuileXapianRangeProcessorWrapper + slot + suffix + (bitwise-ior (RP-SUFFIX) + (get-flag RP-REPEATED repeated?)) + proc)) + +(define* (prefixed-date-range-processor slot #:key (prefix "") repeated? prefer-mdy? (epoch-year 1970)) + "Return a @code{DateRangeProcessor} object that handles date ranges on +@var{slot}. + +@var{prefix} and @var{repeated?} are the same as in +@code{prefixed-range-processor}. + +When @var{prefer-mdy?} is @code{#t}, interpret ambiguous dates as +month/day/year rather than day/month/year. + +@var{epoch-year} is the year to use as the epoch for dates with +two-digit years." + (new-DateRangeProcessor slot + prefix + (bitwise-ior (get-flag (RP-REPEATED) repeated?) + (get-flag (RP-DATE-PREFER-MDY) prefer-mdy?)) + epoch-year)) + +(define* (suffixed-date-range-processor slot #:key suffix repeated? prefer-mdy? (epoch-year 1970)) + "Return a @code{DateRangeProcessor} object that handles date ranges on +@var{slot}. + +@var{suffix} and @var{repeated?} are the same as in +@code{suffixed-range-processor}. + +@var{prefer-mdy?} and @var{epoch-year} are the same as in +@code{prefixed-date-range-processor}." + (new-DateRangeProcessor slot + suffix + (bitwise-ior (RP-SUFFIX) + (get-flag (RP-REPEATED) repeated?) + (get-flag (RP-DATE-PREFER-MDY) prefer-mdy?)) + epoch-year)) + +(define (field-processor proc) + "Return a @code{FieldProcessor} object that calls +@var{proc} to process its field. @var{proc} is a procedure that, given +a string, must return a @code{Query} object." + (new-GuileXapianFieldProcessorWrapper proc)) + (define (get-flag flag-thunk value) (if value (flag-thunk) 0)) @@ -204,9 +371,70 @@ on the database object." (background-model? #t) (exhaustive? #t) (empty-without-match? #t) (cjk-ngram? #t)) + "Generate a snippet from @var{text}. @var{mset} is the xapian +@code{MSet} object representing a list of search results. + +@var{length} is the number of bytes of @var{text} to aim to select. + +The same stemmer used to build the query should be specified as +@var{stemmer}. + +@var{highlight-start} and @var{highlight-end} are inserted +respectively before and after the highlit terms. + +If the chosen snippet seems to start or end mid-sentence, then +@var{omit} is prepended or appended to indicate this. + +If @var{background-model?} is @code{#true}, the relevance of non-query +terms are modelled to prefer snippets containing a more interesting +background. + +If @var{exhaustive?} is @code{#true}, exhaustively evaluate candidate +snippets. Else, snippet generation will stop once a @emph{good enough} +snippet has been found. + +If @var{empty-without-match?} is @code{#true}, return the empty string +if not a single term was found in @var{text}. Else, return a substring +of text without any highlit terms. + +If @var{cjk-ngram?} is @code{#true}, enable generation of n-grams from +CJK text. + +See @code{MSet::snippet} in @file{xapian/mset.h} of the xapian source +for more details." (MSet-snippet mset text length stemmer (bitwise-ior (get-flag MSet-SNIPPET-BACKGROUND-MODEL background-model?) (get-flag MSet-SNIPPET-EXHAUSTIVE exhaustive?) (get-flag MSet-SNIPPET-EMPTY-WITHOUT-MATCH empty-without-match?) (get-flag MSet-SNIPPET-CJK-NGRAM cjk-ngram?)) highlight-start highlight-end omit)) + +(define* (mset-sxml-snippet mset text #:key (length 500) (stemmer (make-stem "none")) + (highlight-proc (lambda (text) + `(b ,text))) + (omit "...") (background-model? #t) (exhaustive? #t) + (empty-without-match? #t) (cjk-ngram? #t)) + "Generate a snippet in SXML form from @var{text}. + +@var{highlight-proc} is a function that is passed the highlit text. It +should return a SXML tree highlighting that text. + +Arguments @var{mset}, @var{length}, @var{stemmer}, @var{omit}, +@var{background-model?}, @var{exhaustive?}, @var{empty-without-match?} +and @var{cjk-ngram?} are the same as in the @code{mset-snippet} +function." + ;; mset-snippet returns serialized HTML. So, we reverse it with + ;; html->sxml. + (match (html->sxml (mset-snippet mset text)) + (('*TOP* children ...) + (append-map (match-lambda + ;; Apply highlight-proc if highlit text. + (('b text) + (list (highlight-proc text))) + ;; Add (br) if end of line. + ((? (cut string-suffix? "\n" <>) text) + (list (string-trim-right text #\newline) + '(br))) + ;; Else, return verbatim. + (text (list text))) + children)))) |