mirror of
https://github.com/bigchaindb/bigchaindb.git
synced 2024-10-13 13:34:05 +00:00
Merge pull request #1469 from bigchaindb/feat/1462/text-search
Text search support for mongodb backend
This commit is contained in:
commit
ac2d65d23d
@ -353,3 +353,28 @@ def get_unvoted_blocks(conn, node_pubkey):
|
|||||||
'votes': False, '_id': False
|
'votes': False, '_id': False
|
||||||
}}
|
}}
|
||||||
]))
|
]))
|
||||||
|
|
||||||
|
|
||||||
|
@register_query(MongoDBConnection)
def text_search(conn, search, *, language='english', case_sensitive=False,
                diacritic_sensitive=False, text_score=False, limit=0):
    """Run a MongoDB ``$text`` query against the ``assets`` collection.

    The matching documents are projected without ``_id`` and with a
    ``score`` field holding the text score, and are sorted by that score.

    Args:
        conn: connection used to build and run the query.
        search (str): text search string passed as ``$search``.
        language (str, optional): passed as ``$language``. ``None`` is
            translated to MongoDB's ``'none'`` language.
        case_sensitive (bool, optional): passed as ``$caseSensitive``.
        diacritic_sensitive (bool, optional): passed as
            ``$diacriticSensitive``.
        text_score (bool, optional): if ``True`` the ``score`` field is
            kept in every returned document.
        limit (int, optional): passed to the cursor's ``limit``.

    Returns:
        The cursor itself when ``text_score`` is ``True``; otherwise a
        generator yielding each document with its ``score`` key removed.
    """
    # MongoDB's keyword for "simple tokenization, no stemming" is the
    # string 'none'. A Python ``None`` would be encoded as BSON null,
    # which is not a string value for ``$language``.
    if language is None:
        language = 'none'

    cursor = conn.run(
        conn.collection('assets')
        .find({'$text': {
            '$search': search,
            '$language': language,
            '$caseSensitive': case_sensitive,
            '$diacriticSensitive': diacritic_sensitive}},
            {'score': {'$meta': 'textScore'}, '_id': False})
        .sort([('score', {'$meta': 'textScore'})])
        .limit(limit))

    if text_score:
        return cursor

    # The score is always projected because the sort needs it; strip it
    # lazily when the caller did not ask for it.
    return (_remove_text_score(asset) for asset in cursor)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_text_score(asset):
    """Remove the ``score`` key from ``asset`` and return the same dict.

    The dict is modified in place. A dict without a ``score`` key is
    returned unchanged.
    """
    asset.pop('score', None)
    return asset
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from pymongo import ASCENDING, DESCENDING
|
from pymongo import ASCENDING, DESCENDING, TEXT
|
||||||
|
|
||||||
from bigchaindb import backend
|
from bigchaindb import backend
|
||||||
from bigchaindb.common import exceptions
|
from bigchaindb.common import exceptions
|
||||||
@ -113,3 +113,6 @@ def create_assets_secondary_index(conn, dbname):
|
|||||||
conn.conn[dbname]['assets'].create_index('id',
|
conn.conn[dbname]['assets'].create_index('id',
|
||||||
name='asset_id',
|
name='asset_id',
|
||||||
unique=True)
|
unique=True)
|
||||||
|
|
||||||
|
# full text search index
|
||||||
|
conn.conn[dbname]['assets'].create_index([('$**', TEXT)], name='text')
|
||||||
|
@ -2,6 +2,8 @@
|
|||||||
|
|
||||||
from functools import singledispatch
|
from functools import singledispatch
|
||||||
|
|
||||||
|
from bigchaindb.backend.exceptions import OperationError
|
||||||
|
|
||||||
|
|
||||||
@singledispatch
|
@singledispatch
|
||||||
def write_transaction(connection, signed_transaction):
|
def write_transaction(connection, signed_transaction):
|
||||||
@ -353,3 +355,33 @@ def get_txids_filtered(connection, asset_id, operation=None):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
@singledispatch
def text_search(conn, search, *, language='english', case_sensitive=False,
                diacritic_sensitive=False, text_score=False, limit=0):
    """Return all the assets that match the text search.

    The results are sorted by text score.
    For more information about the behavior of text search on MongoDB see
    https://docs.mongodb.com/manual/reference/operator/query/text/#behavior

    Args:
        search (str): Text search string to query the text index.
        language (str, optional): The language for the search and the rules
            for stemmer and tokenizer. MongoDB's ``'none'`` language selects
            simple tokenization and no stemming.
        case_sensitive (bool, optional): Enable or disable case sensitive
            search.
        diacritic_sensitive (bool, optional): Enable or disable diacritic
            sensitive search.
        text_score (bool, optional): If ``True`` returns the text score with
            each document.
        limit (int, optional): Limit the number of returned documents.

    Returns:
        An iterable of :obj:`dict`: the matching assets. This generic
        fallback never returns; it always raises.

    Raises:
        OperationError: always, when no backend-specific implementation
            is registered for the type of ``conn``.
    """

    raise OperationError('This query is only supported when running '
                         'BigchainDB with MongoDB as the backend.')
|
||||||
|
@ -619,3 +619,14 @@ class Bigchain(object):
|
|||||||
the database.
|
the database.
|
||||||
"""
|
"""
|
||||||
return backend.query.write_assets(self.connection, assets)
|
return backend.query.write_assets(self.connection, assets)
|
||||||
|
|
||||||
|
def text_search(self, search, *, limit=0):
    """Yield the assets matching ``search`` whose transaction is valid.

    Args:
        search (str): text search string forwarded to the backend query.
        limit (int, optional): forwarded to the backend query. It is
            applied before the validity filter below, so fewer than
            ``limit`` assets may be yielded.

    Yields:
        dict: each asset whose transaction status equals ``TX_VALID``.
    """
    assets = backend.query.text_search(self.connection, search, limit=limit)

    # TODO: This is not efficient. There may be a more efficient way to
    #       query by storing block ids with the assets and using fastquery.
    #       See https://github.com/bigchaindb/bigchaindb/issues/1496
    for asset in assets:
        # Only the status is needed here; the transaction is discarded.
        _, status = self.get_transaction(asset['id'], True)
        if status == self.TX_VALID:
            yield asset
|
||||||
|
@ -513,3 +513,87 @@ def test_get_assets():
|
|||||||
|
|
||||||
assert cursor.count() == 2
|
assert cursor.count() == 2
|
||||||
assert list(cursor.sort('id', pymongo.ASCENDING)) == assets[::2]
|
assert list(cursor.sort('id', pymongo.ASCENDING)) == assets[::2]
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_search():
    """Exercise every option of the backend ``text_search`` query."""
    from bigchaindb.backend import connect, query
    conn = connect()

    # Example data and tests cases taken from the mongodb documentation
    # https://docs.mongodb.com/manual/reference/operator/query/text/
    assets = [
        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
        {'id': 3, 'subject': 'Baking a cake', 'author': 'abc', 'views': 90},
        {'id': 4, 'subject': 'baking', 'author': 'xyz', 'views': 100},
        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
        {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80},
        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
    ]

    # insert the assets (a copy, so the literals above are not modified
    # by the insert)
    conn.db.assets.insert_many(deepcopy(assets), ordered=False)

    # test search single word
    assert list(query.text_search(conn, 'coffee')) == [
        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
    ]

    # match any of the search terms
    assert list(query.text_search(conn, 'bake coffee cake')) == [
        {'author': 'abc', 'id': 3, 'subject': 'Baking a cake', 'views': 90},
        {'author': 'xyz', 'id': 1, 'subject': 'coffee', 'views': 50},
        {'author': 'xyz', 'id': 4, 'subject': 'baking', 'views': 100},
        {'author': 'efg', 'id': 2, 'subject': 'Coffee Shopping', 'views': 5},
        {'author': 'efg', 'id': 7, 'subject': 'coffee and cream', 'views': 10}
    ]

    # search for a phrase
    assert list(query.text_search(conn, '\"coffee shop\"')) == [
        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
    ]

    # exclude documents that contain a term
    assert list(query.text_search(conn, 'coffee -shop')) == [
        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10},
    ]

    # search different language
    assert list(query.text_search(conn, 'leche', language='es')) == [
        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
    ]

    # case and diacritic insensitive search
    assert list(query.text_search(conn, 'сы́рники CAFÉS')) == [
        {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80},
        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
        {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10}
    ]

    # case sensitive search
    assert list(query.text_search(conn, 'Coffee', case_sensitive=True)) == [
        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
    ]

    # diacritic sensitive search
    assert list(query.text_search(conn, 'CAFÉ', diacritic_sensitive=True)) == [
        {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200},
    ]

    # return text score
    assert list(query.text_search(conn, 'coffee', text_score=True)) == [
        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50,
         'score': 1.0},
        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5,
         'score': 0.75},
        {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10,
         'score': 0.75},
    ]

    # limit search result
    assert list(query.text_search(conn, 'coffee', limit=2)) == [
        {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50},
        {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5},
    ]
|
||||||
|
@ -33,7 +33,7 @@ def test_init_creates_db_tables_and_indexes():
|
|||||||
assert sorted(indexes) == ['_id_', 'block_and_voter']
|
assert sorted(indexes) == ['_id_', 'block_and_voter']
|
||||||
|
|
||||||
indexes = conn.conn[dbname]['assets'].index_information().keys()
|
indexes = conn.conn[dbname]['assets'].index_information().keys()
|
||||||
assert sorted(indexes) == ['_id_', 'asset_id']
|
assert sorted(indexes) == ['_id_', 'asset_id', 'text']
|
||||||
|
|
||||||
|
|
||||||
def test_init_database_fails_if_db_exists():
|
def test_init_database_fails_if_db_exists():
|
||||||
|
@ -213,6 +213,88 @@ class TestBigchainApi(object):
|
|||||||
assert b.get_transaction(tx1.id) is None
|
assert b.get_transaction(tx1.id) is None
|
||||||
assert b.get_transaction(tx2.id) == tx2
|
assert b.get_transaction(tx2.id) == tx2
|
||||||
|
|
||||||
|
@pytest.mark.genesis
def test_text_search(self, b):
    """All assets of a block voted valid are found by text search."""
    from bigchaindb.models import Transaction
    from bigchaindb.backend.exceptions import OperationError
    from bigchaindb.backend.mongodb.connection import MongoDBConnection

    # define the assets
    asset1 = {'msg': 'BigchainDB 1'}
    asset2 = {'msg': 'BigchainDB 2'}
    asset3 = {'msg': 'BigchainDB 3'}

    # create the transactions
    tx1 = Transaction.create([b.me], [([b.me], 1)],
                             asset=asset1).sign([b.me_private])
    tx2 = Transaction.create([b.me], [([b.me], 1)],
                             asset=asset2).sign([b.me_private])
    tx3 = Transaction.create([b.me], [([b.me], 1)],
                             asset=asset3).sign([b.me_private])

    # create the block
    block = b.create_block([tx1, tx2, tx3])
    b.write_block(block)

    # vote valid
    vote = b.vote(block.id, b.get_last_voted_block().id, True)
    b.write_vote(vote)

    # get the assets through text search
    # this query only works with MongoDB
    try:
        assets = list(b.text_search('bigchaindb'))
    except OperationError:
        # the error is only acceptable on a non-MongoDB connection
        assert not isinstance(b.connection, MongoDBConnection)
    else:
        assert len(assets) == 3
|
||||||
|
|
||||||
|
@pytest.mark.genesis
def test_text_search_returns_valid_only(self, monkeypatch, b):
    """Assets from a block voted invalid are left out of the results."""
    from bigchaindb.models import Transaction
    from bigchaindb.backend.exceptions import OperationError
    from bigchaindb.backend.mongodb.connection import MongoDBConnection

    asset_valid = {'msg': 'Hello BigchainDB!'}
    asset_invalid = {'msg': 'Goodbye BigchainDB!'}

    # time.time is pinned to a different fixed value before each block
    # is created
    monkeypatch.setattr('time.time', lambda: 1000000000)
    tx1 = Transaction.create([b.me], [([b.me], 1)],
                             asset=asset_valid)
    tx1 = tx1.sign([b.me_private])
    block1 = b.create_block([tx1])
    b.write_block(block1)

    monkeypatch.setattr('time.time', lambda: 1000000020)
    tx2 = Transaction.create([b.me], [([b.me], 1)],
                             asset=asset_invalid)
    tx2 = tx2.sign([b.me_private])
    block2 = b.create_block([tx2])
    b.write_block(block2)

    # vote the first block valid
    vote = b.vote(block1.id, b.get_last_voted_block().id, True)
    b.write_vote(vote)

    # vote the second block invalid
    vote = b.vote(block2.id, b.get_last_voted_block().id, False)
    b.write_vote(vote)

    # get assets with text search
    try:
        assets = list(b.text_search('bigchaindb'))
    except OperationError:
        # the error is only acceptable on a non-MongoDB connection
        assert not isinstance(b.connection, MongoDBConnection)
        return

    # should only return one asset
    assert len(assets) == 1
    # should return the asset created by tx1
    assert assets[0] == {
        'data': {'msg': 'Hello BigchainDB!'},
        'id': tx1.id
    }
|
||||||
|
|
||||||
@pytest.mark.usefixtures('inputs')
|
@pytest.mark.usefixtures('inputs')
|
||||||
def test_write_transaction(self, b, user_pk, user_sk):
|
def test_write_transaction(self, b, user_pk, user_sk):
|
||||||
from bigchaindb import Bigchain
|
from bigchaindb import Bigchain
|
||||||
|
Loading…
x
Reference in New Issue
Block a user