about summary refs log tree commit diff
path: root/xapian/xapian.scm
diff options
context:
space:
mode:
Diffstat (limited to 'xapian/xapian.scm')
-rw-r--r--xapian/xapian.scm242
1 files changed, 235 insertions, 7 deletions
diff --git a/xapian/xapian.scm b/xapian/xapian.scm
index 75924ae..64042a2 100644
--- a/xapian/xapian.scm
+++ b/xapian/xapian.scm
@@ -1,5 +1,5 @@
 ;;; guile-xapian --- Guile bindings for Xapian
-;;; Copyright © 2020, 2022 Arun Isaac <arunisaac@systemreboot.net>
+;;; Copyright © 2020, 2022, 2024 Arun Isaac <arunisaac@systemreboot.net>
 ;;; Copyright © 2021 Bob131 <bob@bob131.so>
 ;;;
 ;;; This file is part of guile-xapian.
@@ -22,7 +22,9 @@
   #:use-module (rnrs arithmetic bitwise)
   #:use-module (rnrs bytevectors)
   #:use-module (ice-9 match)
+  #:use-module (srfi srfi-1)
   #:use-module (srfi srfi-26)
+  #:use-module (htmlprag)
   #:use-module (xapian wrap)
   #:export (xapian-open
             xapian-close
@@ -48,6 +50,15 @@
             index-text!
             increase-termpos!
             parse-query
+            query
+            query-and
+            query-or
+            query-filter
+            prefixed-range-processor
+            suffixed-range-processor
+            prefixed-date-range-processor
+            suffixed-date-range-processor
+            field-processor
             enquire
             enquire-mset
             mset-item-docid
@@ -55,7 +66,8 @@
             mset-item-rank
             mset-item-weight
             mset-fold
-            mset-snippet))
+            mset-snippet
+            mset-sxml-snippet))
 
 (define xapian-open new-Database)
 (define xapian-close delete-Database)
@@ -142,13 +154,49 @@ bytevector."
     (TermGenerator-set-document term-generator document)
     term-generator))
 
-(define* (index-text! term-generator text #:key (wdf-increment 1) prefix)
-  (apply TermGenerator-index-text term-generator text wdf-increment
-         (if prefix (list prefix) '())))
+(define* (index-text! term-generator text #:key (wdf-increment 1) (prefix "") (positions? #t))
+  "Index @var{text} using @var{term-generator}. @var{wdf-increment} is
+the within document frequency increment. @var{prefix} is the term
+prefix to use. If @var{positions?} is #f, position information is not
+generated."
+  ((if positions?
+       TermGenerator-index-text
+       TermGenerator-index-text-without-positions)
+   term-generator text wdf-increment prefix))
 
 (define increase-termpos! TermGenerator-increase-termpos)
 
-(define* (parse-query querystring #:key stemmer stemming-strategy (prefixes '()))
+(define* (parse-query querystring
+                      #:key
+                      stemmer
+                      stemming-strategy
+                      (prefixes '())
+                      (boolean-prefixes '())
+                      (range-processors '())
+                      (boolean? #t)
+                      (phrases? #t)
+                      (love-hate? #t)
+                      any-case-boolean?
+                      wildcard?)
+  "Parse @var{querystring} and return a @code{Query} object.
+
+@var{prefixes} and @var{boolean-prefixes} must be association lists
+mapping fields to prefixes or @code{FieldProcessor}
+objects. @var{range-processors} is a list of @code{RangeProcessor}
+objects.
+
+When @var{boolean?} is @code{#t}, boolean operators (AND, OR, etc.)
+and bracketed subexpressions are supported.
+
+When @var{phrases?} is @code{#t}, quoted phrases are supported.
+
+When @var{love-hate?} is @code{#t}, @samp{+} and @samp{-} are
+supported.
+
+When @var{any-case-boolean?} is @code{#t}, boolean operators are
+supported even if they are not in capitals.
+
+When @var{wildcard?} is @code{#t}, wildcards are supported."
   (let ((queryparser (new-QueryParser)))
     (QueryParser-set-stemmer queryparser stemmer)
     (when stemming-strategy
@@ -157,7 +205,19 @@ bytevector."
                 ((field . prefix)
                  (QueryParser-add-prefix queryparser field prefix)))
               prefixes)
-    (let ((query (QueryParser-parse-query queryparser querystring)))
+    (for-each (match-lambda
+                ((field . prefix)
+                 (QueryParser-add-boolean-prefix queryparser field prefix)))
+              boolean-prefixes)
+    (for-each (cut QueryParser-add-rangeprocessor queryparser <>)
+              range-processors)
+    (let ((query (QueryParser-parse-query queryparser
+                                          querystring
+                                          (bitwise-ior (get-flag QueryParser-FLAG-BOOLEAN boolean?)
+                                                       (get-flag QueryParser-FLAG-PHRASE phrases?)
+                                                       (get-flag QueryParser-FLAG-LOVEHATE love-hate?)
+                                                       (get-flag QueryParser-FLAG-BOOLEAN-ANY-CASE any-case-boolean?)
+                                                       (get-flag QueryParser-FLAG-WILDCARD wildcard?)))))
       (delete-QueryParser queryparser)
       query)))
 
@@ -194,6 +254,113 @@ on the database object."
              (MSetIterator-next head)
              (loop head result))))))
 
+(define (query term)
+  "Return a @code{Query} object for @var{term}."
+  (new-Query term))
+
+(define (query-combine combine-operator default . queries)
+  (reduce (cut new-Query combine-operator <> <>)
+          default
+          queries))
+
+(define (query-and . queries)
+  "Return a query matching only documents matching all @var{queries}.
+
+In a weighted context, the weight is the sum of the weights for all
+queries."
+  (apply query-combine (Query-OP-AND) (Query-MatchAll) queries))
+
+(define (query-or . queries)
+  "Return a query matching documents which at least one of @var{queries}
+match.
+
+In a weighted context, the weight is the sum of the weights for
+matching queries."
+  (apply query-combine (Query-OP-OR) (Query-MatchNothing) queries))
+
+(define (query-filter . queries)
+  "Return a query matching only documents matching all @var{queries},
+but only take weight from the first of @var{queries}.
+
+In a non-weighted context, @code{query-filter} and @code{query-and}
+are equivalent."
+  (apply query-combine (Query-OP-FILTER) (Query-MatchAll) queries))
+
+(define* (prefixed-range-processor slot proc #:key (prefix "") repeated?)
+  "Return a @code{RangeProcessor} object that calls @var{proc} to process
+its range over @var{slot}.
+
+@var{proc} is a procedure that, given a begin string and an end
+string, must return a @code{Query} object. For open-ended ranges,
+either the begin string or the end string will be @code{#f}.
+
+@var{prefix} is a prefix to look for to recognize values as belonging
+to this range. When @var{repeated?} is @code{#t}, allow @var{prefix}
+on both ends of the range—@samp{$1..$10}."
+  (new-GuileXapianRangeProcessorWrapper
+   slot
+   prefix
+   (get-flag RP-REPEATED repeated?)
+   proc))
+
+(define* (suffixed-range-processor slot proc #:key suffix repeated?)
+  "Return a @code{RangeProcessor} object that calls @var{proc} to process
+its range over @var{slot}.
+
+@var{proc} is a procedure that, given a begin string and an end
+string, must return a @code{Query} object. For open-ended ranges,
+either the begin string or the end string will be @code{#f}.
+
+@var{suffix} is a suffix to look for to recognize values as belonging
+to this range. When @var{repeated?} is @code{#t}, allow @var{suffix}
+on both ends of the range—@samp{2kg..12kg}."
+  (new-GuileXapianRangeProcessorWrapper
+   slot
+   suffix
+   (bitwise-ior (RP-SUFFIX)
+                (get-flag RP-REPEATED repeated?))
+   proc))
+
+(define* (prefixed-date-range-processor slot #:key (prefix "") repeated? prefer-mdy? (epoch-year 1970))
+  "Return a @code{DateRangeProcessor} object that handles date ranges on
+@var{slot}.
+
+@var{prefix} and @var{repeated?} are the same as in
+@code{prefixed-range-processor}.
+
+When @var{prefer-mdy?} is @code{#t}, interpret ambiguous dates as
+month/day/year rather than day/month/year.
+
+@var{epoch-year} is the year to use as the epoch for dates with
+two-digit years."
+  (new-DateRangeProcessor slot
+                          prefix
+                          (bitwise-ior (get-flag (RP-REPEATED) repeated?)
+                                       (get-flag (RP-DATE-PREFER-MDY) prefer-mdy?))
+                          epoch-year))
+
+(define* (suffixed-date-range-processor slot #:key suffix repeated? prefer-mdy? (epoch-year 1970))
+  "Return a @code{DateRangeProcessor} object that handles date ranges on
+@var{slot}.
+
+@var{suffix} and @var{repeated?} are the same as in
+@code{suffixed-range-processor}.
+
+@var{prefer-mdy?} and @var{epoch-year} are the same as in
+@code{prefixed-date-range-processor}."
+  (new-DateRangeProcessor slot
+                          suffix
+                          (bitwise-ior (RP-SUFFIX)
+                                       (get-flag (RP-REPEATED) repeated?)
+                                       (get-flag (RP-DATE-PREFER-MDY) prefer-mdy?))
+                          epoch-year))
+
+(define (field-processor proc)
+  "Return a @code{FieldProcessor} object that calls
+@var{proc} to process its field. @var{proc} is a procedure that, given
+a string, must return a @code{Query} object."
+  (new-GuileXapianFieldProcessorWrapper proc))
+
 (define (get-flag flag-thunk value)
   (if value (flag-thunk) 0))
 
@@ -204,9 +371,70 @@ on the database object."
                        (background-model? #t) (exhaustive? #t)
                        (empty-without-match? #t)
                        (cjk-ngram? #t))
+  "Generate a snippet from @var{text}. @var{mset} is the xapian
+@code{MSet} object representing a list of search results.
+
+@var{length} is the number of bytes of @var{text} to aim to select.
+
+The same stemmer used to build the query should be specified as
+@var{stemmer}.
+
+@var{highlight-start} and @var{highlight-end} are inserted
+respectively before and after the highlit terms.
+
+If the chosen snippet seems to start or end mid-sentence, then
+@var{omit} is prepended or appended to indicate this.
+
+If @var{background-model?} is @code{#true}, the relevance of non-query
+terms are modelled to prefer snippets containing a more interesting
+background.
+
+If @var{exhaustive?} is @code{#true}, exhaustively evaluate candidate
+snippets. Else, snippet generation will stop once a @emph{good enough}
+snippet has been found.
+
+If @var{empty-without-match?} is @code{#true}, return the empty string
+if not a single term was found in @var{text}. Else, return a substring
+of text without any highlit terms.
+
+If @var{cjk-ngram?} is @code{#true}, enable generation of n-grams from
+CJK text.
+
+See @code{MSet::snippet} in @file{xapian/mset.h} of the xapian source
+for more details."
   (MSet-snippet mset text length stemmer
                 (bitwise-ior (get-flag MSet-SNIPPET-BACKGROUND-MODEL background-model?)
                              (get-flag MSet-SNIPPET-EXHAUSTIVE exhaustive?)
                              (get-flag MSet-SNIPPET-EMPTY-WITHOUT-MATCH empty-without-match?)
                              (get-flag MSet-SNIPPET-CJK-NGRAM cjk-ngram?))
                 highlight-start highlight-end omit))
+
+(define* (mset-sxml-snippet mset text #:key (length 500) (stemmer (make-stem "none"))
+                           (highlight-proc (lambda (text)
+                                             `(b ,text)))
+                           (omit "...") (background-model? #t) (exhaustive? #t)
+                           (empty-without-match? #t) (cjk-ngram? #t))
+  "Generate a snippet in SXML form from @var{text}.
+
+@var{highlight-proc} is a function that is passed the highlit text. It
+should return a SXML tree highlighting that text.
+
+Arguments @var{mset}, @var{length}, @var{stemmer}, @var{omit},
+@var{background-model?}, @var{exhaustive?}, @var{empty-without-match?}
+and @var{cjk-ngram?} are the same as in the @code{mset-snippet}
+function."
+  ;; mset-snippet returns serialized HTML. So, we reverse it with
+  ;; html->sxml.
+  (match (html->sxml (mset-snippet mset text))
+    (('*TOP* children ...)
+     (append-map (match-lambda
+                   ;; Apply highlight-proc if highlit text.
+                   (('b text)
+                    (list (highlight-proc text)))
+                   ;; Add (br) if end of line.
+                   ((? (cut string-suffix? "\n" <>) text)
+                    (list (string-trim-right text #\newline)
+                          '(br)))
+                   ;; Else, return verbatim.
+                   (text (list text)))
+                 children))))