aboutsummaryrefslogtreecommitdiff
;;; guile-xapian --- Guile bindings for Xapian
;;; Copyright © 2020, 2022, 2024 Arun Isaac <arunisaac@systemreboot.net>
;;; Copyright © 2021 Bob131 <bob@bob131.so>
;;;
;;; This file is part of guile-xapian.
;;;
;;; guile-xapian is free software: you can redistribute it and/or
;;; modify it under the terms of the GNU General Public License as
;;; published by the Free Software Foundation, either version 2 of the
;;; License, or (at your option) any later version.
;;;
;;; guile-xapian is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with guile-xapian.  If not, see
;;; <https://www.gnu.org/licenses/>.

(define-module (xapian xapian)
  #:use-module (rnrs arithmetic bitwise)
  #:use-module (rnrs bytevectors)
  #:use-module (ice-9 match)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-26)
  #:use-module (htmlprag)
  #:use-module (xapian wrap)
  #:export (xapian-open
            xapian-close
            call-with-database
            database-document-count
            xapian-open-writable
            xapian-close-writable
            call-with-writable-database
            call-with-in-memory-database
            add-document!
            replace-document!
            make-document
            document-data
            document-bytes
            document-set-data!
            document-set-bytes!
            document-slot-ref
            document-slot-ref-bytes
            document-slot-set!
            document-slot-set-bytes!
            make-stem
            make-term-generator
            index-text!
            increase-termpos!
            parse-query
            query
            query-and
            query-or
            query-filter
            prefixed-range-processor
            suffixed-range-processor
            prefixed-date-range-processor
            suffixed-date-range-processor
            field-processor
            enquire
            enquire-mset
            mset-item-docid
            mset-item-document
            mset-item-rank
            mset-item-weight
            mset-fold
            mset-snippet
            mset-sxml-snippet))

(define xapian-open new-Database)
(define xapian-close delete-Database)

(define (call-with-database dbpath proc)
  (let ((db (xapian-open dbpath)))
    (dynamic-wind noop (cut proc db) (cut xapian-close db))))

(define database-document-count Database-get-doccount)

(define xapian-open-writable new-WritableDatabase)
(define xapian-close-writable delete-WritableDatabase)

(define (call-with-writable-database dbpath proc)
  (let ((db (xapian-open-writable dbpath)))
    (dynamic-wind noop (cut proc db) (cut xapian-close-writable db))))

(define (call-with-in-memory-database proc)
  "Call PROC passing it an in-memory writable database. Only a
writable in-memory database can be created, since a read-only
in-memory database would always remain empty and is of little use."
  (let ((db (inmemory-open)))
    (dynamic-wind noop (cut proc db) (cut xapian-close-writable db))))

(define add-document! WritableDatabase-add-document)
(define replace-document! WritableDatabase-replace-document)

(define* (make-document #:key data (terms '()) (values '()))
  (let ((doc (new-Document)))
    (when data
      (document-set-data! doc data))
    (for-each (match-lambda
                ((term . wdf-increment)
                 (Document-add-term doc term wdf-increment)))
              terms)
    (for-each (match-lambda
                ((slot . value)
                 (document-slot-set! doc slot value)))
              values)
    doc))

(define document-data Document-get-data)
(define document-bytes Document-get-data-bytes)

(define (document-set-data! document data)
  "Set data of DOCUMENT to DATA. DATA may be a string or a bytevector."
  (cond
   ((string? data)
    (Document-set-data document data))
   ((bytevector? data)
    (Document-set-data-bytes document data))
   (else
    (error "Invalid document data" data))))

(define (document-set-bytes! doc data)
  (display "document-set-bytes! is deprecated. document-set-data! now supports setting bytevectors as data. Please use that instead."
           (current-error-port))
  (Document-set-data-bytes doc data))

(define document-slot-ref Document-get-value)
(define document-slot-ref-bytes Document-get-value-bytes)

(define (document-slot-set! document slot value)
  "Set SLOT of DOCUMENT to VALUE. VALUE may be a string or a
bytevector."
  (cond
   ((string? value)
    (Document-add-value document slot value))
   ((bytevector? value)
    (Document-add-value-bytes document slot value))
   (else
    (error "Invalid document slot value" value))))

(define (document-slot-set-bytes! document slot value)
  (display "document-slot-set-bytes! is deprecated. document-slot-set! now supports setting bytevectors as slot value. Please use that instead."
           (current-error-port))
  (Document-add-value-bytes document slot value))

(define make-stem new-Stem)

(define* (make-term-generator #:key stem document)
  (let ((term-generator (new-TermGenerator)))
    (TermGenerator-set-stemmer term-generator stem)
    (TermGenerator-set-document term-generator document)
    term-generator))

(define* (index-text! term-generator text #:key (wdf-increment 1) (prefix "") (positions? #t))
  "Index @var{text} using @var{term-generator}. @var{wdf-increment} is
the within document frequency increment. @var{prefix} is the term
prefix to use. If @var{positions?} is #f, position information is not
generated."
  ((if positions?
       TermGenerator-index-text
       TermGenerator-index-text-without-positions)
   term-generator text wdf-increment prefix))

(define increase-termpos! TermGenerator-increase-termpos)

(define* (parse-query querystring
                      #:key
                      stemmer
                      stemming-strategy
                      (prefixes '())
                      (boolean-prefixes '())
                      (range-processors '())
                      (boolean? #t)
                      (phrases? #t)
                      (love-hate? #t)
                      any-case-boolean?
                      wildcard?)
  "Parse @var{querystring} and return a @code{Query} object.

@var{prefixes} and @var{boolean-prefixes} must be association lists
mapping fields to prefixes or @code{FieldProcessor}
objects. @var{range-processors} is a list of @code{RangeProcessor}
objects.

When @var{boolean?} is @code{#t}, boolean operators (AND, OR, etc.)
and bracketed subexpressions are supported.

When @var{phrases?} is @code{#t}, quoted phrases are supported.

When @var{love-hate?} is @code{#t}, @samp{+} and @samp{-} are
supported.

When @var{any-case-boolean?} is @code{#t}, boolean operators are
supported even if they are not in capitals.

When @var{wildcard?} is @code{#t}, wildcards are supported."
  (let ((queryparser (new-QueryParser)))
    (QueryParser-set-stemmer queryparser stemmer)
    (when stemming-strategy
      (QueryParser-set-stemming-strategy queryparser stemming-strategy))
    (for-each (match-lambda
                ((field . prefix)
                 (QueryParser-add-prefix queryparser field prefix)))
              prefixes)
    (for-each (match-lambda
                ((field . prefix)
                 (QueryParser-add-boolean-prefix queryparser field prefix)))
              boolean-prefixes)
    (for-each (cut QueryParser-add-rangeprocessor queryparser <>)
              range-processors)
    (let ((query (QueryParser-parse-query queryparser
                                          querystring
                                          (bitwise-ior (get-flag QueryParser-FLAG-BOOLEAN boolean?)
                                                       (get-flag QueryParser-FLAG-PHRASE phrases?)
                                                       (get-flag QueryParser-FLAG-LOVEHATE love-hate?)
                                                       (get-flag QueryParser-FLAG-BOOLEAN-ANY-CASE any-case-boolean?)
                                                       (get-flag QueryParser-FLAG-WILDCARD wildcard?)))))
      (delete-QueryParser queryparser)
      query)))

(define* (enquire db query #:key weighting-scheme)
  (let ((enquire (new-Enquire db)))
    (Enquire-set-query enquire query)
    (when weighting-scheme
      (Enquire-set-weighting-scheme enquire weighting-scheme))
    enquire))

(define* (enquire-mset enquire #:key (offset 0) maximum-items)
  "Run a query using the settings in the ENQUIRE object and return an
mset object.

OFFSET specifies the number of items to ignore at the beginning of the
result set.

MAXIMUM-ITEMS specifies the maximum number of items to return. To
return all matches, pass the result of calling database-document-count
on the database object."
  (Enquire-get-mset enquire offset maximum-items))

(define mset-item-docid MSetIterator-get-docid)
(define mset-item-document MSetIterator-get-document)
(define mset-item-rank MSetIterator-get-rank)
(define mset-item-weight MSetIterator-get-weight)

(define (mset-fold proc init mset)
  (let loop ((head (MSet-begin mset))
             (result init))
    (cond
     ((MSetIterator-equals head (MSet-end mset)) result)
     (else (let ((result (proc head result)))
             (MSetIterator-next head)
             (loop head result))))))

(define (query term)
  "Return a @code{Query} object for @var{term}."
  (new-Query term))

(define (query-combine combine-operator default . queries)
  (reduce (cut new-Query combine-operator <> <>)
          default
          queries))

(define (query-and . queries)
  "Return a query matching only documents matching all @var{queries}.

In a weighted context, the weight is the sum of the weights for all
queries."
  (apply query-combine (Query-OP-AND) (Query-MatchAll) queries))

(define (query-or . queries)
  "Return a query matching documents which at least one of @var{queries}
match.

In a weighted context, the weight is the sum of the weights for
matching queries."
  (apply query-combine (Query-OP-OR) (Query-MatchNothing) queries))

(define (query-filter . queries)
  "Return a query matching only documents matching all @var{queries},
but only take weight from the first of @var{queries}.

In a non-weighted context, @code{query-filter} and @code{query-and}
are equivalent."
  (apply query-combine (Query-OP-FILTER) (Query-MatchAll) queries))

(define* (prefixed-range-processor slot proc #:key (prefix "") repeated?)
  "Return a @code{RangeProcessor} object that calls @var{proc} to process
its range over @var{slot}.

@var{proc} is a procedure that, given a begin string and an end
string, must return a @code{Query} object. For open-ended ranges,
either the begin string or the end string will be @code{#f}.

@var{prefix} is a prefix to look for to recognize values as belonging
to this range. When @var{repeated?} is @code{#t}, allow @var{prefix}
on both ends of the range—@samp{$1..$10}."
  (new-GuileXapianRangeProcessorWrapper
   slot
   prefix
   (get-flag RP-REPEATED repeated?)
   proc))

(define* (suffixed-range-processor slot proc #:key suffix repeated?)
  "Return a @code{RangeProcessor} object that calls @var{proc} to process
its range over @var{slot}.

@var{proc} is a procedure that, given a begin string and an end
string, must return a @code{Query} object. For open-ended ranges,
either the begin string or the end string will be @code{#f}.

@var{suffix} is a suffix to look for to recognize values as belonging
to this range. When @var{repeated?} is @code{#t}, allow @var{suffix}
on both ends of the range—@samp{2kg..12kg}."
  (new-GuileXapianRangeProcessorWrapper
   slot
   suffix
   (bitwise-ior (RP-SUFFIX)
                (get-flag RP-REPEATED repeated?))
   proc))

(define* (prefixed-date-range-processor slot #:key (prefix "") repeated? prefer-mdy? (epoch-year 1970))
  "Return a @code{DateRangeProcessor} object that handles date ranges on
@var{slot}.

@var{prefix} and @var{repeated?} are the same as in
@code{prefixed-range-processor}.

When @var{prefer-mdy?} is @code{#t}, interpret ambiguous dates as
month/day/year rather than day/month/year.

@var{epoch-year} is the year to use as the epoch for dates with
two-digit years."
  (new-DateRangeProcessor slot
                          prefix
                          (bitwise-ior (get-flag (RP-REPEATED) repeated?)
                                       (get-flag (RP-DATE-PREFER-MDY) prefer-mdy?))
                          epoch-year))

(define* (suffixed-date-range-processor slot #:key suffix repeated? prefer-mdy? (epoch-year 1970))
  "Return a @code{DateRangeProcessor} object that handles date ranges on
@var{slot}.

@var{suffix} and @var{repeated?} are the same as in
@code{suffixed-range-processor}.

@var{prefer-mdy?} and @var{epoch-year} are the same as in
@code{prefixed-date-range-processor}."
  (new-DateRangeProcessor slot
                          suffix
                          (bitwise-ior (RP-SUFFIX)
                                       (get-flag (RP-REPEATED) repeated?)
                                       (get-flag (RP-DATE-PREFER-MDY) prefer-mdy?))
                          epoch-year))

(define (field-processor proc)
  "Return a @code{FieldProcessor} object that calls
@var{proc} to process its field. @var{proc} is a procedure that, given
a string, must return a @code{Query} object."
  (new-GuileXapianFieldProcessorWrapper proc))

(define (get-flag flag-thunk value)
  (if value (flag-thunk) 0))

;; TODO: Support cjk-words?
(define* (mset-snippet mset text
                       #:key (length 500) (stemmer (make-stem "none"))
                       (highlight-start "<b>") (highlight-end "</b>") (omit "...")
                       (background-model? #t) (exhaustive? #t)
                       (empty-without-match? #t)
                       (cjk-ngram? #t))
  "Generate a snippet from @var{text}. @var{mset} is the xapian
@code{MSet} object representing a list of search results.

@var{length} is the number of bytes of @var{text} to aim to select.

The same stemmer used to build the query should be specified as
@var{stemmer}.

@var{highlight-start} and @var{highlight-end} are inserted
respectively before and after the highlit terms.

If the chosen snippet seems to start or end mid-sentence, then
@var{omit} is prepended or appended to indicate this.

If @var{background-model?} is @code{#true}, the relevance of non-query
terms are modelled to prefer snippets containing a more interesting
background.

If @var{exhaustive?} is @code{#true}, exhaustively evaluate candidate
snippets. Else, snippet generation will stop once a @emph{good enough}
snippet has been found.

If @var{empty-without-match?} is @code{#true}, return the empty string
if not a single term was found in @var{text}. Else, return a substring
of text without any highlit terms.

If @var{cjk-ngram?} is @code{#true}, enable generation of n-grams from
CJK text.

See @code{MSet::snippet} in @file{xapian/mset.h} of the xapian source
for more details."
  (MSet-snippet mset text length stemmer
                (bitwise-ior (get-flag MSet-SNIPPET-BACKGROUND-MODEL background-model?)
                             (get-flag MSet-SNIPPET-EXHAUSTIVE exhaustive?)
                             (get-flag MSet-SNIPPET-EMPTY-WITHOUT-MATCH empty-without-match?)
                             (get-flag MSet-SNIPPET-CJK-NGRAM cjk-ngram?))
                highlight-start highlight-end omit))

(define* (mset-sxml-snippet mset text #:key (length 500) (stemmer (make-stem "none"))
                           (highlight-proc (lambda (text)
                                             `(b ,text)))
                           (omit "...") (background-model? #t) (exhaustive? #t)
                           (empty-without-match? #t) (cjk-ngram? #t))
  "Generate a snippet in SXML form from @var{text}.

@var{highlight-proc} is a function that is passed the highlit text. It
should return a SXML tree highlighting that text.

Arguments @var{mset}, @var{length}, @var{stemmer}, @var{omit},
@var{background-model?}, @var{exhaustive?}, @var{empty-without-match?}
and @var{cjk-ngram?} are the same as in the @code{mset-snippet}
function."
  ;; mset-snippet returns serialized HTML. So, we reverse it with
  ;; html->sxml.
  (match (html->sxml (mset-snippet mset text))
    (('*TOP* children ...)
     (append-map (match-lambda
                   ;; Apply highlight-proc if highlit text.
                   (('b text)
                    (list (highlight-proc text)))
                   ;; Add (br) if end of line.
                   ((? (cut string-suffix? "\n" <>) text)
                    (list (string-trim-right text #\newline)
                          '(br)))
                   ;; Else, return verbatim.
                   (text (list text)))
                 children))))