Merge pull request #1469 from bigchaindb/feat/1462/text-search

Text search support for mongodb backend
This commit is contained in:
Rodolphe Marques 2017-05-29 13:30:45 +02:00 committed by GitHub
commit ac2d65d23d
7 changed files with 239 additions and 2 deletions

View File

@ -353,3 +353,28 @@ def get_unvoted_blocks(conn, node_pubkey):
'votes': False, '_id': False 'votes': False, '_id': False
}} }}
])) ]))
@register_query(MongoDBConnection)
def text_search(conn, search, *, language='english', case_sensitive=False,
                diacritic_sensitive=False, text_score=False, limit=0):
    """Run a ``$text`` query against the ``assets`` collection.

    The matching documents are projected without ``_id`` and with a
    ``score`` field holding the ``textScore`` metadata, and are sorted
    on that same metadata.

    Args:
        conn: Connection used to build and run the query.
        search (str): Value sent as ``$search``.
        language (str, optional): Value sent as ``$language``.
        case_sensitive (bool, optional): Value sent as ``$caseSensitive``.
        diacritic_sensitive (bool, optional): Value sent as
            ``$diacriticSensitive``.
        text_score (bool, optional): If ``True`` the ``score`` field is
            kept in every returned document.
        limit (int, optional): Forwarded unchanged to the query's
            ``limit()`` call.

    Returns:
        The result of ``conn.run`` when ``text_score`` is true, otherwise
        a generator over the same documents with ``score`` removed.
    """
    text_filter = {'$text': {
        '$search': search,
        '$language': language,
        '$caseSensitive': case_sensitive,
        '$diacriticSensitive': diacritic_sensitive}}
    # The score has to be projected so that the sort below can refer to it.
    projection = {'score': {'$meta': 'textScore'}, '_id': False}
    ordering = [('score', {'$meta': 'textScore'})]

    query = (conn.collection('assets')
             .find(text_filter, projection)
             .sort(ordering)
             .limit(limit))
    results = conn.run(query)

    if not text_score:
        # Callers did not ask for the score: strip it lazily, per document.
        return (_remove_text_score(doc) for doc in results)
    return results
def _remove_text_score(asset):
asset.pop('score', None)
return asset

View File

@ -2,7 +2,7 @@
import logging import logging
from pymongo import ASCENDING, DESCENDING from pymongo import ASCENDING, DESCENDING, TEXT
from bigchaindb import backend from bigchaindb import backend
from bigchaindb.common import exceptions from bigchaindb.common import exceptions
@ -113,3 +113,6 @@ def create_assets_secondary_index(conn, dbname):
conn.conn[dbname]['assets'].create_index('id', conn.conn[dbname]['assets'].create_index('id',
name='asset_id', name='asset_id',
unique=True) unique=True)
# full text search index
conn.conn[dbname]['assets'].create_index([('$**', TEXT)], name='text')

View File

@ -2,6 +2,8 @@
from functools import singledispatch from functools import singledispatch
from bigchaindb.backend.exceptions import OperationError
@singledispatch @singledispatch
def write_transaction(connection, signed_transaction): def write_transaction(connection, signed_transaction):
@ -353,3 +355,33 @@ def get_txids_filtered(connection, asset_id, operation=None):
""" """
raise NotImplementedError raise NotImplementedError
@singledispatch
def text_search(conn, search, *, language='english', case_sensitive=False,
                diacritic_sensitive=False, text_score=False, limit=0):
    """Return the assets that match a text search, sorted by text score.

    This generic implementation always raises; a backend has to register
    its own implementation for the query to be usable.

    For details on how text search behaves on MongoDB see
    https://docs.mongodb.com/manual/reference/operator/query/text/#behavior

    Args:
        search (str): Text search string to query the text index.
        language (str, optional): The language for the search and the
            rules for the stemmer and tokenizer. If the language is
            ``None`` text search uses simple tokenization and no stemming.
        case_sensitive (bool, optional): Enable or disable case sensitive
            search.
        diacritic_sensitive (bool, optional): Enable or disable diacritic
            sensitive search.
        text_score (bool, optional): If ``True`` the text score is
            returned with each document.
        limit (int, optional): Limit the number of returned documents.

    Returns:
        iterable of :obj:`dict`: the matching assets.

    Raises:
        OperationError: Always, when no backend specific implementation
            is registered for ``conn``.
    """
    message = ('This query is only supported when running '
               'BigchainDB with MongoDB as the backend.')
    raise OperationError(message)

View File

@ -619,3 +619,14 @@ class Bigchain(object):
the database. the database.
""" """
return backend.query.write_assets(self.connection, assets) return backend.query.write_assets(self.connection, assets)
def text_search(self, search, *, limit=0):
    """Yield the assets matching ``search`` whose transaction is valid.

    Args:
        search (str): Text search string handed to the backend query.
        limit (int, optional): Forwarded to the backend query. It is
            applied before the validity filter, so fewer than ``limit``
            assets may be yielded.

    Yields:
        dict: every asset returned by the backend for which
        ``get_transaction`` reports the status ``TX_VALID``.
    """
    candidates = backend.query.text_search(self.connection, search,
                                           limit=limit)

    # TODO: This is not efficient. There may be a more efficient way to
    #       query by storing block ids with the assets and using fastquery.
    #       See https://github.com/bigchaindb/bigchaindb/issues/1496
    for candidate in candidates:
        # Only the status is needed here; the transaction is discarded.
        # NOTE(review): the second argument presumably asks for the status
        # to be returned alongside the transaction - confirm.
        _, tx_status = self.get_transaction(candidate['id'], True)
        if tx_status != self.TX_VALID:
            continue
        yield candidate

View File

@ -513,3 +513,87 @@ def test_get_assets():
assert cursor.count() == 2 assert cursor.count() == 2
assert list(cursor.sort('id', pymongo.ASCENDING)) == assets[::2] assert list(cursor.sort('id', pymongo.ASCENDING)) == assets[::2]
def test_text_search():
    """Check the backend ``text_search`` query against a fixed asset set."""
    from bigchaindb.backend import connect, query
    conn = connect()

    # Sample data and test cases taken from the MongoDB documentation
    # https://docs.mongodb.com/manual/reference/operator/query/text/
    assets = [
        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
        {'id': 3, 'subject': 'Baking a cake', 'author': 'abc', 'views': 90},
        {'id': 4, 'subject': 'baking', 'author': 'xyz', 'views': 100},
        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
        {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80},
        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
    ]
    by_id = {asset['id']: asset for asset in assets}

    def expected(*ids):
        # Expected result: the original assets, in the given id order.
        return [by_id[asset_id] for asset_id in ids]

    def found(text, **options):
        return list(query.text_search(conn, text, **options))

    # Insert copies; the originals above serve as the expected values.
    conn.db.assets.insert_many(deepcopy(assets), ordered=False)

    # single word
    assert found('coffee') == expected(1, 2, 7)

    # any of the search terms may match
    assert found('bake coffee cake') == expected(3, 1, 4, 2, 7)

    # phrase search
    assert found('\"coffee shop\"') == expected(2)

    # documents containing an excluded term are left out
    assert found('coffee -shop') == expected(1, 7)

    # different language
    assert found('leche', language='es') == expected(5, 8)

    # case and diacritic insensitive by default
    assert found('сы́рники CAFÉS') == expected(6, 5, 8)

    # case sensitive
    assert found('Coffee', case_sensitive=True) == expected(2)

    # diacritic sensitive
    assert found('CAFÉ', diacritic_sensitive=True) == expected(5)

    # the text score is included on request
    scored = [dict(by_id[asset_id], score=score)
              for asset_id, score in ((1, 1.0), (2, 0.75), (7, 0.75))]
    assert found('coffee', text_score=True) == scored

    # limited number of results
    assert found('coffee', limit=2) == expected(1, 2)

View File

@ -33,7 +33,7 @@ def test_init_creates_db_tables_and_indexes():
assert sorted(indexes) == ['_id_', 'block_and_voter'] assert sorted(indexes) == ['_id_', 'block_and_voter']
indexes = conn.conn[dbname]['assets'].index_information().keys() indexes = conn.conn[dbname]['assets'].index_information().keys()
assert sorted(indexes) == ['_id_', 'asset_id'] assert sorted(indexes) == ['_id_', 'asset_id', 'text']
def test_init_database_fails_if_db_exists(): def test_init_database_fails_if_db_exists():

View File

@ -213,6 +213,88 @@ class TestBigchainApi(object):
assert b.get_transaction(tx1.id) is None assert b.get_transaction(tx1.id) is None
assert b.get_transaction(tx2.id) == tx2 assert b.get_transaction(tx2.id) == tx2
@pytest.mark.genesis
def test_text_search(self, b):
    """All assets of a block voted valid are found by ``b.text_search``."""
    from bigchaindb.models import Transaction
    from bigchaindb.backend.exceptions import OperationError
    from bigchaindb.backend.mongodb.connection import MongoDBConnection

    # one CREATE transaction per asset, all signed by this node
    payloads = [
        {'msg': 'BigchainDB 1'},
        {'msg': 'BigchainDB 2'},
        {'msg': 'BigchainDB 3'},
    ]
    signed_txs = [
        Transaction.create([b.me], [([b.me], 1)],
                           asset=payload).sign([b.me_private])
        for payload in payloads
    ]

    # put them all in a single block and vote that block valid
    block = b.create_block(signed_txs)
    b.write_block(block)
    b.write_vote(b.vote(block.id, b.get_last_voted_block().id, True))

    # the query is only implemented for MongoDB; any other backend is
    # expected to raise instead
    try:
        found = list(b.text_search('bigchaindb'))
    except OperationError:
        assert not isinstance(b.connection, MongoDBConnection)
        return

    assert len(found) == 3
@pytest.mark.genesis
def test_text_search_returns_valid_only(self, monkeypatch, b):
    """Assets from a block voted invalid are left out of the results."""
    from bigchaindb.models import Transaction
    from bigchaindb.backend.exceptions import OperationError
    from bigchaindb.backend.mongodb.connection import MongoDBConnection

    kept_asset = {'msg': 'Hello BigchainDB!'}
    dropped_asset = {'msg': 'Goodbye BigchainDB!'}

    # first block, created while time.time is pinned to a fixed value
    monkeypatch.setattr('time.time', lambda: 1000000000)
    kept_tx = Transaction.create([b.me], [([b.me], 1)],
                                 asset=kept_asset).sign([b.me_private])
    kept_block = b.create_block([kept_tx])
    b.write_block(kept_block)

    # second block, created with time.time pinned 20 units later
    monkeypatch.setattr('time.time', lambda: 1000000020)
    dropped_tx = Transaction.create([b.me], [([b.me], 1)],
                                    asset=dropped_asset).sign([b.me_private])
    dropped_block = b.create_block([dropped_tx])
    b.write_block(dropped_block)

    # first block is voted valid, second block invalid
    b.write_vote(b.vote(kept_block.id, b.get_last_voted_block().id, True))
    b.write_vote(b.vote(dropped_block.id, b.get_last_voted_block().id, False))

    # the query is only implemented for MongoDB; any other backend is
    # expected to raise instead
    try:
        found = list(b.text_search('bigchaindb'))
    except OperationError:
        assert not isinstance(b.connection, MongoDBConnection)
    else:
        # only the asset of the valid block is returned
        assert len(found) == 1
        assert found[0] == {
            'data': {'msg': 'Hello BigchainDB!'},
            'id': kept_tx.id
        }
@pytest.mark.usefixtures('inputs') @pytest.mark.usefixtures('inputs')
def test_write_transaction(self, b, user_pk, user_sk): def test_write_transaction(self, b, user_pk, user_sk):
from bigchaindb import Bigchain from bigchaindb import Bigchain