Contrasting examples of the lucene api versus pythonic lupyne idioms.


Basic indexing and searching example adapted from http://lucene.apache.org/core/4_10_1/core/index.html

import lucene
from org.apache.lucene import analysis, document, index, queryparser, search, store, util
from lupyne import engine

# # # lucene # # #

# PyLucene needs a running JVM before any lucene class is used; start it once.
if not lucene.getVMEnv():
    lucene.initVM()

analyzer = analysis.standard.StandardAnalyzer(util.Version.LUCENE_CURRENT)

# Store the index in memory:
directory = store.RAMDirectory()
# To store an index on disk, use this instead (File being java.io.File):
# directory = store.FSDirectory.open(File("/tmp/testindex"))
config = index.IndexWriterConfig(util.Version.LUCENE_CURRENT, analyzer)
iwriter = index.IndexWriter(directory, config)
doc = document.Document()
text = "This is the text to be indexed."
doc.add(document.Field("fieldname", text, document.TextField.TYPE_STORED))
# The document has to be written, and the writer closed, before a reader can see it.
iwriter.addDocument(doc)
iwriter.close()

# Now search the index:
ireader = index.IndexReader.open(directory)
isearcher = search.IndexSearcher(ireader)
# Parse a simple query that searches for "text":
parser = queryparser.classic.QueryParser(util.Version.LUCENE_CURRENT, "fieldname", analyzer)
query = parser.parse("text")
hits = isearcher.search(query, None, 1000).scoreDocs
assert len(hits) == 1
# Iterate through the results:
for hit in hits:
    hitDoc = isearcher.doc(hit.doc)
    assert hitDoc['fieldname'] == text
# Release the reader and the directory explicitly; lucene does not do it for us.
ireader.close()
directory.close()

# # # lupyne # # #

# Build the index in memory.
# Indexer combines Writer and Searcher; RAMDirectory and StandardAnalyzer are defaults.
indexer = engine.Indexer()
# Settings apply to all documents of the indexer; indexed and tokenized is the default.
indexer.set('fieldname', stored=True)
# Add the document, then commit the changes and refresh the searcher.
indexer.add(fieldname=text)
indexer.commit()

# Search the index; parsing of the query string is handled if necessary.
hits = indexer.search('text', field='fieldname')
assert len(hits) == 1
# Hits support the mapping interface.
assert all(hit['fieldname'] == text for hit in hits)
# closing is handled automatically


Convenient Query creation.

Operator overloading is used for combining boolean clauses.

import lucene
from org.apache.lucene import index, search
from org.apache.lucene.search import spans
from lupyne.engine import Query

# # # lucene # # #

# PyLucene needs a running JVM before any lucene class is used; start it once.
if not lucene.getVMEnv():
    lucene.initVM()

# Boolean query requiring both a term and a phrase.
q1 = search.TermQuery(index.Term('text', 'lucene'))
q2 = search.PhraseQuery()
q2.add(index.Term('text', 'search'))
q2.add(index.Term('text', 'engine'))
q3 = search.BooleanQuery()
q3.add(q1, search.BooleanClause.Occur.MUST)
q3.add(q2, search.BooleanClause.Occur.MUST)
assert str(q3) == '+text:lucene +text:"search engine"'

# Span query: 'hello' within positions [0, 10), excluding where 'hello world' matches in order.
q1 = spans.SpanTermQuery(index.Term('text', 'hello'))
q2 = spans.SpanTermQuery(index.Term('text', 'world'))
q3 = spans.SpanPositionRangeQuery(q1, 0, 10)
q4 = spans.SpanNearQuery([q1, q2], 0, True)
q5 = spans.SpanNotQuery(q3, q4)
assert str(q5) == 'spanNot(spanPosRange(text:hello, 0, 10), spanNear([text:hello, text:world], 0, true), 0, 0)'

# # # lupyne # # #

# '&' combines both operands into one BooleanQuery of required clauses.
term = Query.term('text', 'lucene')
phrase = Query.phrase('text', 'search', 'engine')
q = term & phrase
assert isinstance(q, search.BooleanQuery)
assert str(q) == '+text:lucene +text:"search engine"'

# Slicing limits the span positions; '-' builds the spanNot of the two span queries.
leading = Query.span('text', 'hello')[:10]
adjacent = Query.near('text', 'hello', 'world')
q = leading - adjacent
assert isinstance(q, spans.SpanQuery)
assert str(q) == 'spanNot(spanPosRange(text:hello, 0, 10), spanNear([text:hello, text:world], 0, true), 0, 0)'


Advanced searching with custom fields.

Prefix and Range queries are a potential pitfall in Lucene.
As the queries expand to more terms, the performance drops off precipitously.
A common example is where datetimes are indexed, but a large span of dates is being searched.
The usual workaround is to only index the amount of granularity needed, e.g., just the dates.
But this may not be sufficient, or the datetimes may be necessary for other searches.

The general solution is to index the term values into a prefix tree.
Then each query can expand to only values of the appropriate granularity.
Lucene's NumericFields encode numbers to be sortable, so they are also able to cluster prefixes into the same field.
Whereas Lupyne's NestedField assumes the value is already a sortable string, so different fields must be used to cluster the prefixes.
There are trade-offs to each approach:
 * NumericFields support range queries natively, but must translate prefix queries.
 * NestedFields support prefix queries optimally, but must translate range queries.
 * NumericFields only support numbers, and result in unreadable values in the index.
 * NestedFields support any searchable values, but pollute the field namespace.

Lupyne PointFields and DateTimeFields are implemented as NumericFields since both are easily encoded as numbers.
NestedFields could still be used however, as demonstrated on dates below.

from datetime import date
import lucene
from org.apache.lucene import search
from lupyne import engine

# Sample documents: one dict per city, as indexed by the example below.
docs = [
    {'city': 'San Francisco', 'state': 'CA', 'incorporated': '1850-04-15', 'population': 808976, 'longitude': -122.4192, 'latitude': 37.7752},
    {'city': 'Los Angeles', 'state': 'CA', 'incorporated': '1850-04-04', 'population': 3849378, 'longitude': -118.2434, 'latitude': 34.0521},
    {'city': 'Portland', 'state': 'OR', 'incorporated': '1851-02-08', 'population': 575930, 'longitude': -122.6703, 'latitude': 45.5238},
]

# PyLucene needs a running JVM before an index can be created; start it once.
if not lucene.getVMEnv():
    lucene.initVM()

indexer = engine.Indexer()
indexer.set('city', stored=True, indexed=False)
indexer.set('state', stored=True, indexed=False)
# set method supports custom field types inheriting their default settings
indexer.set('incorporated', engine.DateTimeField)
indexer.set('year-month-day', engine.NestedField, sep='-')
indexer.set('population', engine.NumericField, type=int)
indexer.set('point', engine.PointField, precision=10)
# assigned fields can have a different key from their underlying field name
indexer.fields['location'] = engine.NestedField('state.city')

for doc in docs:
    # keep the raw date string for the nested field before it is popped below
    doc['year-month-day'] = doc['incorporated']
    point = doc.pop('longitude'), doc.pop('latitude')
    location = doc['state'] + '.' + doc['city']
    incorporated = map(int, doc.pop('incorporated').split('-'))
    indexer.add(doc, location=location, incorporated=date(*incorporated), point=[point])
# commit the added documents so that the searches which follow can see them
indexer.commit()

def _cities(query):
    """Return the stored 'city' value of each hit for the query, in result order."""
    return [hit['city'] for hit in indexer.search(query)]


# DateTimeField: prefix and range both produce numeric range queries
incorporated_field = indexer.fields['incorporated']
query = incorporated_field.prefix([1850])
assert query.max.doubleValue() - query.min.doubleValue() == 60 * 60 * 24 * 365
assert _cities(query) == ['San Francisco', 'Los Angeles']
query = incorporated_field.range(date(1850, 4, 10), None)
assert query.max is None
assert _cities(query) == ['San Francisco', 'Portland']

# NestedField over the same dates, stored as sortable strings
nested_field = indexer.fields['year-month-day']
query = nested_field.prefix('1850')
assert str(query) == 'year:1850*'
assert _cities(query) == ['San Francisco', 'Los Angeles']
query = nested_field.range('1850-04-10', None)
assert str(query) == 'year-month-day:[1850-04-10 TO *}'
assert _cities(query) == ['San Francisco', 'Portland']

# NumericField range
query = indexer.fields['population'].range(0, 1000000)
assert str(query) == 'population:[0 TO 1000000}'
assert _cities(query) == ['San Francisco', 'Portland']

# PointField: each larger distance is expected to match one more city
cities = ['San Francisco', 'Los Angeles', 'Portland']
point_field = indexer.fields['point']
for index, distance in enumerate([1e3, 1e5, 2e5, 1e6]):
    query = point_field.within(-122.4, 37.7, distance=distance)
    assert isinstance(query, search.BooleanQuery) and len(query) <= 4
    assert set(_cities(query)) == set(cities[:index])

location_field = indexer.fields['location']
query = location_field.prefix('CA.San')
# works like any prefix query
assert str(query) == 'state.city:CA.San*'
assert _cities(query) == ['San Francisco']
query = location_field.prefix('CA')
# optimized to search the best field
assert str(query) == 'state:CA*'
assert _cities(query) == ['San Francisco', 'Los Angeles']


PyLucene has several pitfalls when collecting or sorting a large query result.
Generally they involve the overhead of traversing the VM in an internal loop.

Lucene also requires supplying a maximum doc count for searches,
and supplying an excessively large count is a poor workaround because the collection heap is pre-allocated.

Finally the custom sorting interface, although well-supported in PyLucene, has horrible performance.
The sort key of every potential doc must realistically be cached,
but the overhead of O(n log n) comparison calls dispatched through the VM is far worse than iterating ScoreDocs.

To mitigate all these problems, Lupyne first provides a unified search interface.
The same Hits type is returned regardless of optional doc count or sorting parameters.
As with lucene, the result is fully evaluated but each individual Hit object will only be loaded on demand.
Internally a CachingCollector is used when all docs are requested.

The search method allows lucene Sort parameters to be passed through, since that's still optimal.
Additionally the hits themselves can be sorted afterwards with any python callable key.
The IndexSearcher.comparator method is convenient for creating a sort key table from indexed fields.
The upshot is custom sorting and sorting large results are both easier and faster.

Custom sorting isn't necessary in the below example of course, just there for demonstration.

import lucene
from org.apache.lucene import search
from org.apache.pylucene.search import PythonFieldComparator, PythonFieldComparatorSource
from lupyne import engine

# PyLucene needs a running JVM before an index can be created; start it once.
if not lucene.getVMEnv():
    lucene.initVM()

colors = 'red', 'green', 'blue', 'cyan', 'magenta', 'yellow'
indexer = engine.Indexer()
indexer.set('color', stored=True, tokenized=False)
# one document per color, in the order listed
for color in colors:
    indexer.add(color=color)
# commit so the index reader used below sees the documents
indexer.commit()

# # # lucene # # #

# Sort all documents on the 'color' field using lucene's built-in string sort.
color_field = search.SortField('color', search.SortField.Type.STRING)
sorter = search.Sort(color_field)
searcher = search.IndexSearcher(indexer.indexReader)
topdocs = searcher.search(search.MatchAllDocsQuery(), None, 10, sorter)
found = [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs]
assert found == sorted(colors)

class ComparatorSource(PythonFieldComparatorSource):
    """Comparator source which sorts docs by the string terms of a field.

    For demonstration only: each comparison is a python call.
    """

    class newComparator(PythonFieldComparator):
        """Field comparator, constructed with (name, numHits, sortPos, reversed).

        The class stands in for the ``newComparator`` factory method of the source.
        """

        def __init__(self, name, numHits, sortPos, reversed):
            # JCC extension classes must initialize their Java base class,
            # otherwise the wrapped Java object is never created.
            PythonFieldComparator.__init__(self)
            self.name = name
            # one cached sort value per slot
            self.values = [None] * numHits
            # value(slot) simply looks up the cached value
            self.value = self.values.__getitem__

        def setNextReader(self, context):
            # load the terms of the field for the reader which is searched next
            self.comparator = search.FieldCache.DEFAULT.getTermsIndex(context.reader(), self.name)
            return self

        def compare(self, slot1, slot2):
            return cmp(self.values[slot1], self.values[slot2])

        def setBottom(self, slot):
            self._bottom = self.values[slot]

        def compareBottom(self, doc):
            return cmp(self._bottom, self.comparator.get(doc).utf8ToString())

        def copy(self, slot, doc):
            self.values[slot] = self.comparator.get(doc).utf8ToString()

# Same search, but sorted through the python comparator source.
custom_field = search.SortField('color', ComparatorSource())
sorter = search.Sort(custom_field)
# still must supply excessive doc count to use the sorter
topdocs = searcher.search(search.MatchAllDocsQuery(), None, 10, sorter)
found = [searcher.doc(scoredoc.doc)['color'] for scoredoc in topdocs.scoreDocs]
assert found == sorted(colors)

# # # lupyne # # #

expected = sorted(colors)

# sort by field name, passed through to lucene
hits = indexer.search(sort='color')
assert [hit['color'] for hit in hits] == expected

# the comparator lists the field values; here in the order the docs were added
comparator = indexer.comparator('color')
assert list(comparator) == list(colors)

# sort the hits afterwards, using the comparator lookup as the key
key = comparator.__getitem__
hits = indexer.search().sorted(key)
assert [hit['color'] for hit in hits] == expected


Grouping and facets.

Lupyne supports lucene's contrib grouping.GroupingSearch interface, but it has some limitations.
GroupingSearch objects only support single-valued strings, and won't find zero-valued facets.
Lupyne also supports grouping hits by an arbitrary function after the original search.
Similar to sorting, the native approach is generally more efficient, proportional to the number of documents culled.

Lupyne also supports using cached filters to compute facet counts.
Although seemingly less efficient, it is significantly faster with small numbers of terms.
It also has no limitations on multiple values, and can be fully customized without reindexing.

import itertools
import lucene
from lupyne import engine

# PyLucene needs a running JVM before an index can be created; start it once.
if not lucene.getVMEnv():
    lucene.initVM()

colors = 'red', 'green', 'blue', 'cyan', 'magenta', 'yellow'
# expected facet counts: red -> 1, green -> 2, ... yellow -> 6
facets = dict(zip(colors, itertools.count(1)))
indexer = engine.Indexer()
indexer.set('color', stored=True, tokenized=False)
# index each color as many times as its expected count
for color in facets:
    for index in range(facets[color]):
        indexer.add(color=color)
# commit so the searches below see the documents
indexer.commit()
query = engine.Query.alldocs()

def _check_groups(groups):
    """Assert each group has the expected count and holds one hit of the group's color."""
    for hits in groups:
        assert facets[hits.value] == hits.count
        hit, = hits
        assert hit['color'] == hits.value


# group using native GroupingSearch
_check_groups(indexer.groupby('color', query))

# group using Hits interface
color_key = indexer.comparator('color').__getitem__
_check_groups(indexer.search(query).groupby(color_key, docs=1))

# facets use a GroupingSearch if no filters are registered
assert indexer.facets(query, 'color')['color'] == facets

# filters allow flexible customizations without any indexing changes
indexer.filters['color'] = {
    'additive': engine.Query.any(color=colors[:3]).filter(),
    'subtractive': engine.Query.any(color=colors[3:]).filter(),
}
# counts are the sums of the grouped colors: 1 + 2 + 3 and 4 + 5 + 6
assert indexer.facets(query, 'color')['color'] == {'additive': 6, 'subtractive': 15}


Parallel indexing.

One of Lucene's shortcomings as a general purpose database is the lack of atomic partial updates.
IndexWriter.updateDocument merely deletes and adds a document in a transaction.
The burden is on the application to handle both the inefficiency and concurrency issues of updating unchanged fields.
This is poorly suited for many scenarios, where there are large static fields (e.g. text) and small volatile fields (e.g. tags).
Thus many applications must keep volatile data in an external database, with poor performance when joining searches across vertical partitions.

Solutions have been discussed for years (https://issues.apache.org/jira/browse/LUCENE-1879) with little progress.
IndexWriters can now update DocValues in-place, but that's only a partial workaround since DocValues aren't indexed.
ParallelReaders allow keeping the volatile fields in a separate index, but require syncing the ephemeral doc nums.
This is essentially useless, as the whole point is that the indices wouldn't be updated with the same frequency.

Lupyne provides another solution: parallel indexing with syncing on a unique indexed field.
The most efficient way to intersect a search with outside data is to use a cached TermsFilter.
Lupyne's TermsFilter provides a set-like interface for managing which unique terms should match.
For simplicity and efficiency a searcher must also be registered with the filter before using it in a search.
The TermsFilter instance manages the thread-safe cache, with optimal incremental updates of both terms and searchers.

Additionally TermsFilters can be registered with IndexSearchers, such that reopening keeps the filter updated.
Finally, for applications which can also keep the volatile data in a separate Lucene index,
a ParallelIndexer will manage the matching terms by mapping the real underlying filters into terms,
keeping the registered TermsFilters updated with every commit.

import lucene
from lupyne import engine

# PyLucene needs a running JVM before an index can be created; start it once.
if not lucene.getVMEnv():
    lucene.initVM()

# setup main index with unique name field
primary = engine.Indexer()
primary.set('name', stored=True, tokenized=False)
# register the large static field as well, with default settings
primary.set('text')
for name in ('alpha', 'bravo'):
    primary.add(name=name, text='large body of text')
# commit so the searches below see the documents
primary.commit()

# setup parallel index with matching unique field and additional volatile field
secondary = engine.ParallelIndexer('name')
field = secondary.set('votes', engine.NumericField)
secondary.add(name='alpha', votes=1)
secondary.add(name='bravo', votes=0)
secondary.add(name='charlie', votes=1)
# registered TermsFilters are kept updated with every commit
secondary.commit()

# automatically create and register TermsFilter, which matches positive votes
real_filter = engine.Query.filter(field.range(1, None), cache=False)
assert str(real_filter) == "votes:[1 TO *}"
auto_filter = secondary.termsfilter(real_filter, primary)

# instead of using parallel index, manually create and register TermsFilter
man_filter = primary.termsfilter('name', ['alpha', 'charlie'])

# in either case: alpha matches, bravo doesn't, charlie doesn't exist (yet)
for termsfilter in (man_filter, auto_filter):
    assert [hit['name'] for hit in primary.search(filter=termsfilter)] == ['alpha']

# update vote counts
secondary.update('alpha', votes=0)
secondary.update('bravo', votes=1)
# the automatic filter is refreshed when the parallel index commits
secondary.commit()

# instead of using parallel index, simulate the updates manually
man_filter.discard('alpha')
man_filter.add('bravo')

# add missing document to main index
primary.add(name='charlie')
primary.commit()

# in either case: alpha no longer matches, bravo now does, charlie now exists
for termsfilter in (man_filter, auto_filter):
    assert [hit['name'] for hit in primary.search(filter=termsfilter)] == ['bravo', 'charlie']


Custom server.

Field settings are assigned directly to the root.
Indexing is done here just to populate the example.

A custom filter and sorter are demonstrated by transforming a date field into a year field.
Filters are also used for faceting; sorters are also used for grouping.

Example queries:
 * http://localhost:8080/search?q=date:17*&group=year
 * http://localhost:8080/search?q=date:17*&group=year&sort=-year
 * http://localhost:8080/search?count=0&facets=year
 * http://localhost:8080/search?q=text:right&count=3&facets=year

import lucene
from lupyne import engine, server
from tests import fixtures

def parse(date):
    """Return the text before the first '-' of a term, converted to an int.

    ``date`` must provide a ``utf8ToString()`` method returning the term's text.
    """
    year, _, _ = date.utf8ToString().partition('-')
    return int(year)

if __name__ == '__main__':
    # PyLucene needs a running JVM before an index can be created; start it once.
    if not lucene.getVMEnv():
        lucene.initVM()
    root = server.WebIndexer()
    # assign field settings
    root.indexer.set('amendment', stored=True, tokenized=False)
    root.indexer.set('date', stored=True, tokenized=False)
    # the example queries search text, so register it with default settings
    root.indexer.set('text')
    # populate index with the amendments only
    for doc in fixtures.constitution():
        if 'amendment' in doc:
            root.indexer.add(doc)
    # commit before the searcher is customized, so it sees the documents
    root.indexer.commit()
    # assign custom filter and sorter based on year
    root.searcher.sorters['year'] = engine.SortField('date', int, parse)
    years = {date.split('-')[0] for date in root.searcher.terms('date')}
    root.searcher.filters['year'] = {year: engine.Query.prefix('date', year).filter() for year in years}
    # start with pretty-printing
    server.start(root, config={'global': {'tools.json_out.indent': 2}})


Output a kml file (for Google Earth) which visualizes a spatial tile search.

Default searches within 1 kilometer of Griffith Observatory.
Reports the number of found tiles, precision level,
final number of grouped tiles, and the ratio of extra area searched.
Experiment with different tile limits to see search accuracy.

import argparse
import itertools
import math
import os
import sys
from lupyne.engine.spatial import Point, Tile

# NOTE(review): the bodies of both templates were truncated in this file; they are
# reconstructed from how they are formatted - confirm against the original example.

# One translucent rectangle per tile; formatted with (north, south, east, west).
overlay = '''<GroundOverlay>
<color>7fff0000</color>
<drawOrder>1</drawOrder>
<LatLonBox>
<north>{}</north>
<south>{}</south>
<east>{}</east>
<west>{}</west>
</LatLonBox>
</GroundOverlay>'''

# Whole document; formatted with (longitude, latitude, joined overlays).
document = '''<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2">
<Document>
<Placemark>
<Point>
<coordinates>
{},{}
</coordinates>
</Point>
</Placemark>
{}
</Document>
</kml>'''

# Command line options; defaults are kept as strings, which argparse converts with ``type``.
parser = argparse.ArgumentParser(description=__doc__)
for _flag, _type, _default, _help in (
    ('--lng', float, '-118.3004', 'longitude [%(default)s]'),
    ('--lat', float, '34.1184', 'latitude [%(default)s]'),
    ('--distance', float, '1000', 'search radius in meters [%(default)s]'),
    ('--tiles', int, '4', 'maximum number of tiles to consider [%(default)s]'),
):
    parser.add_argument(_flag, type=_type, default=_default, help=_help)

if __name__ == '__main__':
    args = parser.parse_args()
    sets = Point(args.lng, args.lat).within(args.distance, 30)
    # keep the last tile set which still fits within the requested tile limit
    tiles = list(itertools.takewhile(lambda tiles: len(tiles) <= args.tiles, sets))[-1]
    # stream writes instead of print statements, so this parses on python 2 and 3 alike
    sys.stderr.write('{0} tiles at precision {1}\n'.format(len(tiles), len(tiles[0])))

    # Repeatedly merge complete groups of 4 sibling tiles into their parent tile;
    # tiles from incomplete groups are final and kept as they are.
    grouped = []
    while tiles:
        remaining = []
        for key, group in itertools.groupby(tiles, key=lambda tile: tile[:-1]):
            group = list(group)
            if len(group) == 4:
                remaining.append(str.__new__(Tile, key))
            else:
                grouped += group
        tiles = remaining

    overlays = []
    area = 0.0
    for tile in grouped:
        points = tile.points
        width, height = (abs(i - j) for i, j in zip(*points))
        area += width * height
        (west, south), (east, north) = (point.coords for point in points)
        overlays.append(overlay.format(north, south, east, west))
    # ratio of the covered tile area to the area of the search circle
    area /= math.pi * (args.distance) ** 2
    sys.stderr.write('{0} grouped tiles covering {1} times the circle area\n'.format(len(grouped), area))
    sys.stdout.write(document.format(args.lng, args.lat, os.linesep.join(overlays)) + '\n')