;;; guile-xapian --- Guile bindings for Xapian
;;; Copyright © 2020, 2022, 2024 Arun Isaac <arunisaac@systemreboot.net>
;;; Copyright © 2021 Bob131 <bob@bob131.so>
;;;
;;; This file is part of guile-xapian.
;;;
;;; guile-xapian is free software: you can redistribute it and/or
;;; modify it under the terms of the GNU General Public License as
;;; published by the Free Software Foundation, either version 2 of the
;;; License, or (at your option) any later version.
;;;
;;; guile-xapian is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;;; General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with guile-xapian. If not, see
;;; <https://www.gnu.org/licenses/>.
(define-module (xapian xapian)
#:use-module (rnrs arithmetic bitwise)
#:use-module (rnrs bytevectors)
#:use-module (ice-9 match)
#:use-module (srfi srfi-1)
#:use-module (srfi srfi-26)
#:use-module (htmlprag)
#:use-module (xapian wrap)
#:export (xapian-open
xapian-close
call-with-database
database-document-count
xapian-open-writable
xapian-close-writable
call-with-writable-database
call-with-in-memory-database
add-document!
replace-document!
make-document
document-data
document-bytes
document-set-data!
document-set-bytes!
document-slot-ref
document-slot-ref-bytes
document-slot-set!
document-slot-set-bytes!
make-stem
make-term-generator
index-text!
increase-termpos!
parse-query
query
query-and
query-or
query-filter
prefixed-range-processor
suffixed-range-processor
prefixed-date-range-processor
suffixed-date-range-processor
field-processor
enquire
enquire-mset
mset-item-docid
mset-item-document
mset-item-rank
mset-item-weight
mset-fold
mset-snippet
mset-sxml-snippet))
(define xapian-open new-Database)
(define xapian-close delete-Database)
(define (call-with-database dbpath proc)
(let ((db (xapian-open dbpath)))
(dynamic-wind noop (cut proc db) (cut xapian-close db))))
(define database-document-count Database-get-doccount)
(define xapian-open-writable new-WritableDatabase)
(define xapian-close-writable delete-WritableDatabase)
(define (call-with-writable-database dbpath proc)
(let ((db (xapian-open-writable dbpath)))
(dynamic-wind noop (cut proc db) (cut xapian-close-writable db))))
(define (call-with-in-memory-database proc)
"Call PROC passing it an in-memory writable database. Only a
writable in-memory database can be created, since a read-only
in-memory database would always remain empty and is of little use."
(let ((db (inmemory-open)))
(dynamic-wind noop (cut proc db) (cut xapian-close-writable db))))
(define add-document! WritableDatabase-add-document)
(define replace-document! WritableDatabase-replace-document)
(define* (make-document #:key data (terms '()) (values '()))
(let ((doc (new-Document)))
(when data
(document-set-data! doc data))
(for-each (match-lambda
((term . wdf-increment)
(Document-add-term doc term wdf-increment)))
terms)
(for-each (match-lambda
((slot . value)
(document-slot-set! doc slot value)))
values)
doc))
(define document-data Document-get-data)
(define document-bytes Document-get-data-bytes)
(define (document-set-data! document data)
"Set data of DOCUMENT to DATA. DATA may be a string or a bytevector."
(cond
((string? data)
(Document-set-data document data))
((bytevector? data)
(Document-set-data-bytes document data))
(else
(error "Invalid document data" data))))
(define (document-set-bytes! doc data)
(display "document-set-bytes! is deprecated. document-set-data! now supports setting bytevectors as data. Please use that instead."
(current-error-port))
(Document-set-data-bytes doc data))
(define document-slot-ref Document-get-value)
(define document-slot-ref-bytes Document-get-value-bytes)
(define (document-slot-set! document slot value)
"Set SLOT of DOCUMENT to VALUE. VALUE may be a string or a
bytevector."
(cond
((string? value)
(Document-add-value document slot value))
((bytevector? value)
(Document-add-value-bytes document slot value))
(else
(error "Invalid document slot value" value))))
(define (document-slot-set-bytes! document slot value)
(display "document-slot-set-bytes! is deprecated. document-slot-set! now supports setting bytevectors as slot value. Please use that instead."
(current-error-port))
(Document-add-value-bytes document slot value))
(define make-stem new-Stem)
(define* (make-term-generator #:key stem document)
(let ((term-generator (new-TermGenerator)))
(TermGenerator-set-stemmer term-generator stem)
(TermGenerator-set-document term-generator document)
term-generator))
(define* (index-text! term-generator text #:key (wdf-increment 1) (prefix "") (positions? #t))
"Index @var{text} using @var{term-generator}. @var{wdf-increment} is
the within document frequency increment. @var{prefix} is the term
prefix to use. If @var{positions?} is #f, position information is not
generated."
((if positions?
TermGenerator-index-text
TermGenerator-index-text-without-positions)
term-generator text wdf-increment prefix))
(define increase-termpos! TermGenerator-increase-termpos)
(define* (parse-query querystring
#:key
stemmer
stemming-strategy
(prefixes '())
(boolean-prefixes '())
(range-processors '())
(boolean? #t)
(phrases? #t)
(love-hate? #t)
any-case-boolean?
wildcard?)
"Parse @var{querystring} and return a @code{Query} object.
@var{prefixes} and @var{boolean-prefixes} must be association lists
mapping fields to prefixes or @code{FieldProcessor}
objects. @var{range-processors} is a list of @code{RangeProcessor}
objects.
When @var{boolean?} is @code{#t}, boolean operators (AND, OR, etc.)
and bracketed subexpressions are supported.
When @var{phrases?} is @code{#t}, quoted phrases are supported.
When @var{love-hate?} is @code{#t}, @samp{+} and @samp{-} are
supported.
When @var{any-case-boolean?} is @code{#t}, boolean operators are
supported even if they are not in capitals.
When @var{wildcard?} is @code{#t}, wildcards are supported."
(let ((queryparser (new-QueryParser)))
(QueryParser-set-stemmer queryparser stemmer)
(when stemming-strategy
(QueryParser-set-stemming-strategy queryparser stemming-strategy))
(for-each (match-lambda
((field . prefix)
(QueryParser-add-prefix queryparser field prefix)))
prefixes)
(for-each (match-lambda
((field . prefix)
(QueryParser-add-boolean-prefix queryparser field prefix)))
boolean-prefixes)
(for-each (cut QueryParser-add-rangeprocessor queryparser <>)
range-processors)
(let ((query (QueryParser-parse-query queryparser
querystring
(bitwise-ior (get-flag QueryParser-FLAG-BOOLEAN boolean?)
(get-flag QueryParser-FLAG-PHRASE phrases?)
(get-flag QueryParser-FLAG-LOVEHATE love-hate?)
(get-flag QueryParser-FLAG-BOOLEAN-ANY-CASE any-case-boolean?)
(get-flag QueryParser-FLAG-WILDCARD wildcard?)))))
(delete-QueryParser queryparser)
query)))
(define* (enquire db query #:key weighting-scheme)
(let ((enquire (new-Enquire db)))
(Enquire-set-query enquire query)
(when weighting-scheme
(Enquire-set-weighting-scheme enquire weighting-scheme))
enquire))
(define* (enquire-mset enquire #:key (offset 0) maximum-items)
"Run a query using the settings in the ENQUIRE object and return an
mset object.
OFFSET specifies the number of items to ignore at the beginning of the
result set.
MAXIMUM-ITEMS specifies the maximum number of items to return. To
return all matches, pass the result of calling database-document-count
on the database object."
(Enquire-get-mset enquire offset maximum-items))
(define mset-item-docid MSetIterator-get-docid)
(define mset-item-document MSetIterator-get-document)
(define mset-item-rank MSetIterator-get-rank)
(define mset-item-weight MSetIterator-get-weight)
(define (mset-fold proc init mset)
(let loop ((head (MSet-begin mset))
(result init))
(cond
((MSetIterator-equals head (MSet-end mset)) result)
(else (let ((result (proc head result)))
(MSetIterator-next head)
(loop head result))))))
(define (query term)
"Return a @code{Query} object for @var{term}."
(new-Query term))
(define (query-combine combine-operator default . queries)
(reduce (cut new-Query combine-operator <> <>)
default
queries))
(define (query-and . queries)
"Return a query matching only documents matching all @var{queries}.
In a weighted context, the weight is the sum of the weights for all
queries."
(apply query-combine (Query-OP-AND) (Query-MatchAll) queries))
(define (query-or . queries)
"Return a query matching documents which at least one of @var{queries}
match.
In a weighted context, the weight is the sum of the weights for
matching queries."
(apply query-combine (Query-OP-OR) (Query-MatchNothing) queries))
(define (query-filter . queries)
"Return a query matching only documents matching all @var{queries},
but only take weight from the first of @var{queries}.
In a non-weighted context, @code{query-filter} and @code{query-and}
are equivalent."
(apply query-combine (Query-OP-FILTER) (Query-MatchAll) queries))
(define* (prefixed-range-processor slot proc #:key (prefix "") repeated?)
"Return a @code{RangeProcessor} object that calls @var{proc} to process
its range over @var{slot}.
@var{proc} is a procedure that, given a begin string and an end
string, must return a @code{Query} object. For open-ended ranges,
either the begin string or the end string will be @code{#f}.
@var{prefix} is a prefix to look for to recognize values as belonging
to this range. When @var{repeated?} is @code{#t}, allow @var{prefix}
on both ends of the range—@samp{$1..$10}."
(new-GuileXapianRangeProcessorWrapper
slot
prefix
(get-flag RP-REPEATED repeated?)
proc))
(define* (suffixed-range-processor slot proc #:key suffix repeated?)
"Return a @code{RangeProcessor} object that calls @var{proc} to process
its range over @var{slot}.
@var{proc} is a procedure that, given a begin string and an end
string, must return a @code{Query} object. For open-ended ranges,
either the begin string or the end string will be @code{#f}.
@var{suffix} is a suffix to look for to recognize values as belonging
to this range. When @var{repeated?} is @code{#t}, allow @var{suffix}
on both ends of the range—@samp{2kg..12kg}."
(new-GuileXapianRangeProcessorWrapper
slot
suffix
(bitwise-ior (RP-SUFFIX)
(get-flag RP-REPEATED repeated?))
proc))
(define* (prefixed-date-range-processor slot #:key (prefix "") repeated? prefer-mdy? (epoch-year 1970))
"Return a @code{DateRangeProcessor} object that handles date ranges on
@var{slot}.
@var{prefix} and @var{repeated?} are the same as in
@code{prefixed-range-processor}.
When @var{prefer-mdy?} is @code{#t}, interpret ambiguous dates as
month/day/year rather than day/month/year.
@var{epoch-year} is the year to use as the epoch for dates with
two-digit years."
(new-DateRangeProcessor slot
prefix
(bitwise-ior (get-flag (RP-REPEATED) repeated?)
(get-flag (RP-DATE-PREFER-MDY) prefer-mdy?))
epoch-year))
(define* (suffixed-date-range-processor slot #:key suffix repeated? prefer-mdy? (epoch-year 1970))
"Return a @code{DateRangeProcessor} object that handles date ranges on
@var{slot}.
@var{suffix} and @var{repeated?} are the same as in
@code{suffixed-range-processor}.
@var{prefer-mdy?} and @var{epoch-year} are the same as in
@code{prefixed-date-range-processor}."
(new-DateRangeProcessor slot
suffix
(bitwise-ior (RP-SUFFIX)
(get-flag (RP-REPEATED) repeated?)
(get-flag (RP-DATE-PREFER-MDY) prefer-mdy?))
epoch-year))
(define (field-processor proc)
"Return a @code{FieldProcessor} object that calls
@var{proc} to process its field. @var{proc} is a procedure that, given
a string, must return a @code{Query} object."
(new-GuileXapianFieldProcessorWrapper proc))
(define (get-flag flag-thunk value)
(if value (flag-thunk) 0))
;; TODO: Support cjk-words?
(define* (mset-snippet mset text
#:key (length 500) (stemmer (make-stem "none"))
(highlight-start "<b>") (highlight-end "</b>") (omit "...")
(background-model? #t) (exhaustive? #t)
(empty-without-match? #t)
(cjk-ngram? #t))
"Generate a snippet from @var{text}. @var{mset} is the xapian
@code{MSet} object representing a list of search results.
@var{length} is the number of bytes of @var{text} to aim to select.
The same stemmer used to build the query should be specified as
@var{stemmer}.
@var{highlight-start} and @var{highlight-end} are inserted
respectively before and after the highlit terms.
If the chosen snippet seems to start or end mid-sentence, then
@var{omit} is prepended or appended to indicate this.
If @var{background-model?} is @code{#true}, the relevance of non-query
terms are modelled to prefer snippets containing a more interesting
background.
If @var{exhaustive?} is @code{#true}, exhaustively evaluate candidate
snippets. Else, snippet generation will stop once a @emph{good enough}
snippet has been found.
If @var{empty-without-match?} is @code{#true}, return the empty string
if not a single term was found in @var{text}. Else, return a substring
of text without any highlit terms.
If @var{cjk-ngram?} is @code{#true}, enable generation of n-grams from
CJK text.
See @code{MSet::snippet} in @file{xapian/mset.h} of the xapian source
for more details."
(MSet-snippet mset text length stemmer
(bitwise-ior (get-flag MSet-SNIPPET-BACKGROUND-MODEL background-model?)
(get-flag MSet-SNIPPET-EXHAUSTIVE exhaustive?)
(get-flag MSet-SNIPPET-EMPTY-WITHOUT-MATCH empty-without-match?)
(get-flag MSet-SNIPPET-CJK-NGRAM cjk-ngram?))
highlight-start highlight-end omit))
(define* (mset-sxml-snippet mset text #:key (length 500) (stemmer (make-stem "none"))
(highlight-proc (lambda (text)
`(b ,text)))
(omit "...") (background-model? #t) (exhaustive? #t)
(empty-without-match? #t) (cjk-ngram? #t))
"Generate a snippet in SXML form from @var{text}.
@var{highlight-proc} is a function that is passed the highlit text. It
should return a SXML tree highlighting that text.
Arguments @var{mset}, @var{length}, @var{stemmer}, @var{omit},
@var{background-model?}, @var{exhaustive?}, @var{empty-without-match?}
and @var{cjk-ngram?} are the same as in the @code{mset-snippet}
function."
;; mset-snippet returns serialized HTML. So, we reverse it with
;; html->sxml.
(match (html->sxml (mset-snippet mset text))
(('*TOP* children ...)
(append-map (match-lambda
;; Apply highlight-proc if highlit text.
(('b text)
(list (highlight-proc text)))
;; Add (br) if end of line.
((? (cut string-suffix? "\n" <>) text)
(list (string-trim-right text #\newline)
'(br)))
;; Else, return verbatim.
(text (list text)))
children))))