diff --git a/bigchaindb/backend/mongodb/query.py b/bigchaindb/backend/mongodb/query.py index 926bd7d5..eaef721d 100644 --- a/bigchaindb/backend/mongodb/query.py +++ b/bigchaindb/backend/mongodb/query.py @@ -353,3 +353,28 @@ def get_unvoted_blocks(conn, node_pubkey): 'votes': False, '_id': False }} ])) + + +@register_query(MongoDBConnection) +def text_search(conn, search, *, language='english', case_sensitive=False, + diacritic_sensitive=False, text_score=False, limit=0): + cursor = conn.run( + conn.collection('assets') + .find({'$text': { + '$search': search, + '$language': language, + '$caseSensitive': case_sensitive, + '$diacriticSensitive': diacritic_sensitive}}, + {'score': {'$meta': 'textScore'}, '_id': False}) + .sort([('score', {'$meta': 'textScore'})]) + .limit(limit)) + + if text_score: + return cursor + + return (_remove_text_score(asset) for asset in cursor) + + +def _remove_text_score(asset): + asset.pop('score', None) + return asset diff --git a/bigchaindb/backend/mongodb/schema.py b/bigchaindb/backend/mongodb/schema.py index 12b873e0..6c54bfd8 100644 --- a/bigchaindb/backend/mongodb/schema.py +++ b/bigchaindb/backend/mongodb/schema.py @@ -2,7 +2,7 @@ import logging -from pymongo import ASCENDING, DESCENDING +from pymongo import ASCENDING, DESCENDING, TEXT from bigchaindb import backend from bigchaindb.common import exceptions @@ -113,3 +113,6 @@ def create_assets_secondary_index(conn, dbname): conn.conn[dbname]['assets'].create_index('id', name='asset_id', unique=True) + + # full text search index + conn.conn[dbname]['assets'].create_index([('$**', TEXT)], name='text') diff --git a/bigchaindb/backend/query.py b/bigchaindb/backend/query.py index 07101e47..f989c8b1 100644 --- a/bigchaindb/backend/query.py +++ b/bigchaindb/backend/query.py @@ -2,6 +2,8 @@ from functools import singledispatch +from bigchaindb.backend.exceptions import OperationError + @singledispatch def write_transaction(connection, signed_transaction): @@ -353,3 +355,33 @@ def 
get_txids_filtered(connection, asset_id, operation=None): """ raise NotImplementedError + + +@singledispatch +def text_search(conn, search, *, language='english', case_sensitive=False, + diacritic_sensitive=False, text_score=False, limit=0): + """Return all the assets that match the text search. + + The results are sorted by text score. + For more information about the behavior of text search on MongoDB see + https://docs.mongodb.com/manual/reference/operator/query/text/#behavior + + Args: + search (str): Text search string to query the text index + language (str, optional): The language for the search and the rules for + stemmer and tokenizer. If the language is ``'none'``, text search uses + simple tokenization and no stemming. + case_sensitive (bool, optional): Enable or disable case sensitive + search. + diacritic_sensitive (bool, optional): Enable or disable diacritic + sensitive search. + text_score (bool, optional): If ``True`` returns the text score with + each document. + limit (int, optional): Limit the number of returned documents. + + Returns: + :obj:`list` of :obj:`dict`: a list of assets + """ + + raise OperationError('This query is only supported when running ' + 'BigchainDB with MongoDB as the backend.') diff --git a/bigchaindb/core.py b/bigchaindb/core.py index 39039cc0..923cb8a8 100644 --- a/bigchaindb/core.py +++ b/bigchaindb/core.py @@ -619,3 +619,14 @@ class Bigchain(object): the database. """ return backend.query.write_assets(self.connection, assets) + + def text_search(self, search, *, limit=0): + assets = backend.query.text_search(self.connection, search, limit=limit) + + # TODO: This is not efficient. There may be a more efficient way to + # query by storing block ids with the assets and using fastquery. 
+ # See https://github.com/bigchaindb/bigchaindb/issues/1496 + for asset in assets: + tx, status = self.get_transaction(asset['id'], True) + if status == self.TX_VALID: + yield asset diff --git a/tests/backend/mongodb/test_queries.py b/tests/backend/mongodb/test_queries.py index c3792064..897a0f06 100644 --- a/tests/backend/mongodb/test_queries.py +++ b/tests/backend/mongodb/test_queries.py @@ -513,3 +513,87 @@ def test_get_assets(): assert cursor.count() == 2 assert list(cursor.sort('id', pymongo.ASCENDING)) == assets[::2] + + +def test_text_search(): + from bigchaindb.backend import connect, query + conn = connect() + + # Example data and test cases taken from the MongoDB documentation + # https://docs.mongodb.com/manual/reference/operator/query/text/ + assets = [ + {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50}, + {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5}, + {'id': 3, 'subject': 'Baking a cake', 'author': 'abc', 'views': 90}, + {'id': 4, 'subject': 'baking', 'author': 'xyz', 'views': 100}, + {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200}, + {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80}, + {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10}, + {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10} + ] + + # insert the assets + conn.db.assets.insert_many(deepcopy(assets), ordered=False) + + # test search single word + assert list(query.text_search(conn, 'coffee')) == [ + {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50}, + {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5}, + {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10}, + ] + + # match any of the search terms + assert list(query.text_search(conn, 'bake coffee cake')) == [ + {'author': 'abc', 'id': 3, 'subject': 'Baking a cake', 'views': 90}, + {'author': 'xyz', 'id': 1, 'subject': 'coffee', 'views': 50}, + {'author': 'xyz', 'id': 4, 'subject': 
'baking', 'views': 100}, + {'author': 'efg', 'id': 2, 'subject': 'Coffee Shopping', 'views': 5}, + {'author': 'efg', 'id': 7, 'subject': 'coffee and cream', 'views': 10} + ] + + # search for a phrase + assert list(query.text_search(conn, '\"coffee shop\"')) == [ + {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5}, + ] + + # exclude documents that contain a term + assert list(query.text_search(conn, 'coffee -shop')) == [ + {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50}, + {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10}, + ] + + # search different language + assert list(query.text_search(conn, 'leche', language='es')) == [ + {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200}, + {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10} + ] + + # case and diacritic insensitive search + assert list(query.text_search(conn, 'сы́рники CAFÉS')) == [ + {'id': 6, 'subject': 'Сырники', 'author': 'jkl', 'views': 80}, + {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200}, + {'id': 8, 'subject': 'Cafe con Leche', 'author': 'xyz', 'views': 10} + ] + + # case sensitive search + assert list(query.text_search(conn, 'Coffee', case_sensitive=True)) == [ + {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5}, + ] + + # diacritic sensitive search + assert list(query.text_search(conn, 'CAFÉ', diacritic_sensitive=True)) == [ + {'id': 5, 'subject': 'Café Con Leche', 'author': 'abc', 'views': 200}, + ] + + # return text score + assert list(query.text_search(conn, 'coffee', text_score=True)) == [ + {'id': 1, 'subject': 'coffee', 'author': 'xyz', 'views': 50, 'score': 1.0}, + {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5, 'score': 0.75}, + {'id': 7, 'subject': 'coffee and cream', 'author': 'efg', 'views': 10, 'score': 0.75}, + ] + + # limit search result + assert list(query.text_search(conn, 'coffee', limit=2)) == [ + {'id': 1, 'subject': 'coffee', 
'author': 'xyz', 'views': 50}, + {'id': 2, 'subject': 'Coffee Shopping', 'author': 'efg', 'views': 5}, + ] diff --git a/tests/backend/mongodb/test_schema.py b/tests/backend/mongodb/test_schema.py index e3b320bd..e11dbfe8 100644 --- a/tests/backend/mongodb/test_schema.py +++ b/tests/backend/mongodb/test_schema.py @@ -33,7 +33,7 @@ def test_init_creates_db_tables_and_indexes(): assert sorted(indexes) == ['_id_', 'block_and_voter'] indexes = conn.conn[dbname]['assets'].index_information().keys() - assert sorted(indexes) == ['_id_', 'asset_id'] + assert sorted(indexes) == ['_id_', 'asset_id', 'text'] def test_init_database_fails_if_db_exists(): diff --git a/tests/db/test_bigchain_api.py b/tests/db/test_bigchain_api.py index 0371ecbf..05b07bf6 100644 --- a/tests/db/test_bigchain_api.py +++ b/tests/db/test_bigchain_api.py @@ -213,6 +213,88 @@ class TestBigchainApi(object): assert b.get_transaction(tx1.id) is None assert b.get_transaction(tx2.id) == tx2 + @pytest.mark.genesis + def test_text_search(self, b): + from bigchaindb.models import Transaction + from bigchaindb.backend.exceptions import OperationError + from bigchaindb.backend.mongodb.connection import MongoDBConnection + + # define the assets + asset1 = {'msg': 'BigchainDB 1'} + asset2 = {'msg': 'BigchainDB 2'} + asset3 = {'msg': 'BigchainDB 3'} + + # create the transactions + tx1 = Transaction.create([b.me], [([b.me], 1)], + asset=asset1).sign([b.me_private]) + tx2 = Transaction.create([b.me], [([b.me], 1)], + asset=asset2).sign([b.me_private]) + tx3 = Transaction.create([b.me], [([b.me], 1)], + asset=asset3).sign([b.me_private]) + + # create the block + block = b.create_block([tx1, tx2, tx3]) + b.write_block(block) + + # vote valid + vote = b.vote(block.id, b.get_last_voted_block().id, True) + b.write_vote(vote) + + # get the assets through text search + # this query only works with MongoDB + try: + assets = list(b.text_search('bigchaindb')) + except OperationError as exc: + assert not isinstance(b.connection, 
MongoDBConnection) + else: + assert len(assets) == 3 + + @pytest.mark.genesis + def test_text_search_returns_valid_only(self, monkeypatch, b): + from bigchaindb.models import Transaction + from bigchaindb.backend.exceptions import OperationError + from bigchaindb.backend.mongodb.connection import MongoDBConnection + + asset_valid = {'msg': 'Hello BigchainDB!'} + asset_invalid = {'msg': 'Goodbye BigchainDB!'} + + monkeypatch.setattr('time.time', lambda: 1000000000) + tx1 = Transaction.create([b.me], [([b.me], 1)], + asset=asset_valid) + tx1 = tx1.sign([b.me_private]) + block1 = b.create_block([tx1]) + b.write_block(block1) + + monkeypatch.setattr('time.time', lambda: 1000000020) + tx2 = Transaction.create([b.me], [([b.me], 1)], + asset=asset_invalid) + tx2 = tx2.sign([b.me_private]) + block2 = b.create_block([tx2]) + b.write_block(block2) + + # vote the first block valid + vote = b.vote(block1.id, b.get_last_voted_block().id, True) + b.write_vote(vote) + + # vote the second block invalid + vote = b.vote(block2.id, b.get_last_voted_block().id, False) + b.write_vote(vote) + + # get assets with text search + try: + assets = list(b.text_search('bigchaindb')) + except OperationError: + assert not isinstance(b.connection, MongoDBConnection) + return + + # should only return one asset + assert len(assets) == 1 + # should return the asset created by tx1 + assert assets[0] == { + 'data': {'msg': 'Hello BigchainDB!'}, + 'id': tx1.id + } + @pytest.mark.usefixtures('inputs') def test_write_transaction(self, b, user_pk, user_sk): from bigchaindb import Bigchain