diff --git a/changes/5847.feature b/changes/5847.feature new file mode 100644 index 00000000000..7ae46522af4 --- /dev/null +++ b/changes/5847.feature @@ -0,0 +1,9 @@ +Allow configuring datastore full text field indexes with new +ckan.datastore.default_fts_index_field_types config option. + +The default is an empty list, avoiding automatically creating +separate full text indexes for any individual columns. The +whole-row full text index still exists for all tables. + +Use the `ckan datastore fts-index` command to remove existing +column indexes to reclaim database space. diff --git a/ckanext/datastore/backend/postgres.py b/ckanext/datastore/backend/postgres.py index ab742f35460..46289bd339f 100644 --- a/ckanext/datastore/backend/postgres.py +++ b/ckanext/datastore/backend/postgres.py @@ -445,7 +445,7 @@ def _where_clauses( # add full-text search where clause q: Union[dict[str, str], str, Any] = data_dict.get('q') full_text = data_dict.get('full_text') - if q and not full_text: + if q: if isinstance(q, str): ts_query_alias = _ts_query_alias() clause_str = u'_full_text @@ {0}'.format(ts_query_alias) @@ -459,6 +459,7 @@ def _where_clauses( ftyp = fields_types[field] if not datastore_helpers.should_fts_index_field_type(ftyp): + # use general full text search to narrow results clause_str = u'_full_text @@ {0}'.format(query_field) clauses.append((clause_str,)) @@ -468,19 +469,7 @@ def _where_clauses( identifier(field), query_field) clauses.append((clause_str,)) - elif (full_text and not q): - ts_query_alias = _ts_query_alias() - clause_str = u'_full_text @@ {0}'.format(ts_query_alias) - clauses.append((clause_str,)) - - elif full_text and isinstance(q, dict): - ts_query_alias = _ts_query_alias() - clause_str = u'_full_text @@ {0}'.format(ts_query_alias) - clauses.append((clause_str,)) - # update clauses with q dict - _update_where_clauses_on_q_dict(data_dict, fields_types, q, clauses) - - elif full_text and isinstance(q, str): + if full_text: ts_query_alias = _ts_query_alias() clause_str = u'_full_text @@ {0}'.format(ts_query_alias) clauses.append((clause_str,)) @@ -488,29 +477,6 @@ def _where_clauses( return clauses -def _update_where_clauses_on_q_dict( - data_dict: dict[str, str], fields_types: dict[str, str], - q: dict[str, str], - clauses: WhereClauses) -> None: - lang = _fts_lang(data_dict.get('language')) - for field, _ in q.items(): - if field not in fields_types: - continue - query_field = _ts_query_alias(field) - - ftyp = fields_types[field] - if not datastore_helpers.should_fts_index_field_type(ftyp): - clause_str = u'_full_text @@ {0}'.format(query_field) - clauses.append((clause_str,)) - - clause_str = ( - u'to_tsvector({0}, cast({1} as text)) @@ {2}').format( - literal_string(lang), - identifier(field), - query_field) - clauses.append((clause_str,)) - - def _textsearch_query( lang: str, q: Optional[Union[str, dict[str, str], Any]], plain: bool, full_text: Optional[str]) -> tuple[str, dict[str, str]]: @@ -709,6 +675,7 @@ def _build_fts_indexes( data_dict: dict[str, Any], # noqa sql_index_str_method: str, fields: list[dict[str, Any]]): fts_indexes: list[str] = [] + fts_noindexes: list[str] = [] resource_id = data_dict['resource_id'] fts_lang = data_dict.get( 'language', config.get('ckan.datastore.default_fts_lang')) @@ -722,9 +689,6 @@ def cast_as_text(x: str): full_text_field = {'type': 'tsvector', 'id': '_full_text'} for field in [full_text_field] + fields: - if not datastore_helpers.should_fts_index_field_type(field['type']): - continue - field_str = field['id'] if field['type'] not in ['text', 'tsvector']: field_str = cast_as_text(field_str) @@ -732,13 +696,18 @@ def cast_as_text(x: str): field_str = u'"{0}"'.format(field_str) if field['type'] != 'tsvector': field_str = to_tsvector(field_str) + if field['id'] != '_full_text' and not ( + datastore_helpers.should_fts_index_field_type(field['type'])): + fts_noindexes.append(_generate_index_name(resource_id, field_str)) + continue + fts_indexes.append(sql_index_str_method.format( res_id=resource_id, unique='', name=_generate_index_name(resource_id, field_str), method=_get_fts_index_method(), fields=field_str)) - return fts_indexes + return fts_indexes, fts_noindexes def _drop_indexes(context: Context, data_dict: dict[str, Any], @@ -944,9 +913,8 @@ def create_indexes(context: Context, data_dict: dict[str, Any]): field_ids = _pluck('id', fields) json_fields = [x['id'] for x in fields if x['type'] == 'nested'] - fts_indexes = _build_fts_indexes(data_dict, - sql_index_string_method, - fields) + fts_indexes, fts_noindexes = _build_fts_indexes( + data_dict, sql_index_string_method, fields) sql_index_strings = sql_index_strings + fts_indexes if indexes is not None: @@ -986,10 +954,13 @@ def create_indexes(context: Context, data_dict: dict[str, Any]): current_indexes = _get_index_names(context['connection'], data_dict['resource_id']) + + for fts_idx in current_indexes: + if fts_idx in fts_noindexes: + connection.execute(sa.text( + 'DROP INDEX {0} CASCADE'.format(sa.column(fts_idx)))) for sql_index_string in sql_index_strings: - has_index = [c for c in current_indexes - if sql_index_string.find(c) != -1] - if not has_index: + if not any(c in sql_index_string for c in current_indexes): connection.execute(sa.text(sql_index_string)) diff --git a/ckanext/datastore/cli.py b/ckanext/datastore/cli.py index 6162d2cb8e7..fe97152d187 100644 --- a/ckanext/datastore/cli.py +++ b/ckanext/datastore/cli.py @@ -13,12 +13,14 @@ import ckan.logic as logic import ckanext.datastore as datastore_module +from ckanext.datastore.backend import get_all_resources_ids_in_datastore from ckanext.datastore.backend.postgres import ( identifier, literal_string, get_read_engine, get_write_engine, _get_raw_field_info, + _TIMEOUT, ) from ckanext.datastore.blueprint import DUMP_FORMATS, dump_to @@ -137,27 +139,17 @@ def purge(): site_user = logic.get_action('get_site_user')({'ignore_auth': True}, {}) - result = logic.get_action('datastore_search')( - {'user': site_user['name']}, - {'resource_id': '_table_metadata'} - ) - resource_id_list = [] - for record in result['records']: + for resid in get_all_resources_ids_in_datastore(): try: - # ignore 'alias' records (views) as they are automatically - # deleted when the parent resource table is dropped - if record['alias_of']: - continue - logic.get_action('resource_show')( {'user': site_user['name']}, - {'id': record['name']} + {'id': resid} ) except logic.NotFound: - resource_id_list.append(record['name']) + resource_id_list.append(resid) click.echo("Resource '%s' orphaned - queued for drop" % - record[u'name']) + resid) except KeyError: continue @@ -191,22 +183,12 @@ def upgrade(): '''Move field info to _info so that plugins may add private information to each field for their own purposes.''' - site_user = logic.get_action('get_site_user')({'ignore_auth': True}, {}) - - result = logic.get_action('datastore_search')( - {'user': site_user['name']}, - {'resource_id': '_table_metadata'} - ) - count = 0 skipped = 0 noinfo = 0 read_connection = get_read_engine() - for record in result['records']: - if record['alias_of']: - continue - - raw_fields, old = _get_raw_field_info(read_connection, record['name']) + for resid in get_all_resources_ids_in_datastore(): + raw_fields, old = _get_raw_field_info(read_connection, resid) if not old: if not raw_fields: noinfo += 1 @@ -222,7 +204,7 @@ def upgrade(): raw_sql = literal_string(' ' + json.dumps( raw, ensure_ascii=False, separators=(',', ':'))) alter_sql.append(u'COMMENT ON COLUMN {0}.{1} is {2}'.format( - identifier(record['name']), + identifier(resid), identifier(fid), raw_sql)) @@ -236,5 +218,33 @@ def upgrade(): count, skipped, noinfo)) +@datastore.command( + 'fts-index', + short_help='create or remove full-text search indexes after changing ' + 'the ckan.datastore.default_fts_index_field_types setting' +) +@click.option( + '--timeout', metavar='SECONDS', + type=click.FloatRange(0, 2147483.647), # because postgres max int + default=_TIMEOUT / 1000, show_default=True, + help='maximum index creation time in seconds', +) +def fts_index(timeout: float): + '''Use to create or remove full-text search indexes after changing + the ckan.datastore.default_fts_index_field_types setting. + ''' + site_user = logic.get_action('get_site_user')({'ignore_auth': True}, {}) + resource_ids = get_all_resources_ids_in_datastore() + + for i, resid in enumerate(get_all_resources_ids_in_datastore(), 1): + print(f'\r{resid} [{i}/{len(resource_ids)}] ...', end='') + logic.get_action('datastore_create')( + {'user': site_user['name'], + 'query_timeout': int(timeout * 1000)}, # type: ignore + {'resource_id': resid, 'force': True} + ) + print('\x08\x08\x08done') + + def get_commands(): return (set_permissions, dump, purge, upgrade) diff --git a/ckanext/datastore/config_declaration.yaml b/ckanext/datastore/config_declaration.yaml index 5e308037e9a..bec365ee7f9 100644 --- a/ckanext/datastore/config_declaration.yaml +++ b/ckanext/datastore/config_declaration.yaml @@ -99,3 +99,15 @@ groups: The default method used when creating full-text search indexes. Currently it can be "gin" or "gist". Refer to PostgreSQL's documentation to understand the characteristics of each one and pick the best for your instance. + + - key: ckan.datastore.default_fts_index_field_types + type: list + default: '' + example: text tsvector + description: > + A separate full-text search index will be created by default for fields + with these types, and used when searching on fields by passing a + dictionary to the datastore_search q parameter. + + Indexes increase the time and disk space required to load data + into the DataStore. diff --git a/ckanext/datastore/helpers.py b/ckanext/datastore/helpers.py index 343d29859a4..d7cd5b7814a 100644 --- a/ckanext/datastore/helpers.py +++ b/ckanext/datastore/helpers.py @@ -82,7 +82,8 @@ def _strip(s: Any): def should_fts_index_field_type(field_type: str): - return field_type.lower() in ['tsvector', 'text', 'number'] + return field_type in tk.config.get( + 'ckan.datastore.default_fts_index_field_types', []) def get_table_and_function_names_from_sql(context: Context, sql: str): diff --git a/ckanext/datastore/tests/test_create.py b/ckanext/datastore/tests/test_create.py index c9349044cab..36a76b2c4fb 100644 --- a/ckanext/datastore/tests/test_create.py +++ b/ckanext/datastore/tests/test_create.py @@ -159,6 +159,8 @@ def test_create_adds_index_on_full_text_search_when_not_creating_other_indexes( resource_id = result["resource_id"] assert self._has_index_on_field(resource_id, '"_full_text"') + @pytest.mark.ckan_config( + "ckan.datastore.default_fts_index_field_types", "text") def test_create_add_full_text_search_indexes_on_every_text_field(self): package = factories.Dataset() data = { diff --git a/ckanext/datastore/tests/test_db.py b/ckanext/datastore/tests/test_db.py index a646ed8a32e..486998edee6 100644 --- a/ckanext/datastore/tests/test_db.py +++ b/ckanext/datastore/tests/test_db.py @@ -39,13 +39,15 @@ def test_default_fts_index_method_can_be_overwritten_by_config_var(self): "_full_text", connection, resource_id, method="gin" ) + @pytest.mark.ckan_config( + "ckan.datastore.default_fts_index_field_types", "text tsvector") @mock.patch("ckanext.datastore.backend.postgres._get_fields") def test_creates_fts_index_on_all_fields_except_dates_nested_and_arrays_with_english_as_default( self, _get_fields ): _get_fields.return_value = [ {"id": "text", "type": "text"}, - {"id": "number", "type": "number"}, + {"id": "tsvector", "type": "tsvector"}, {"id": "nested", "type": "nested"}, {"id": "date", "type": "date"}, {"id": "text array", "type": "text[]"}, @@ -62,9 +64,37 @@ def test_creates_fts_index_on_all_fields_except_dates_nested_and_arrays_with_eng "text", connection, resource_id, "english" ) self._assert_created_index_on( - "number", connection, resource_id, "english", cast=True + "tsvector", connection, resource_id, ) + @mock.patch("ckanext.datastore.backend.postgres._get_fields") + def test_creates_no_fts_indexes_by_default( + self, _get_fields + ): + _get_fields.return_value = [ + {"id": "text", "type": "text"}, + {"id": "tsvector", "type": "tsvector"}, + {"id": "nested", "type": "nested"}, + {"id": "date", "type": "date"}, + {"id": "text array", "type": "text[]"}, + {"id": "timestamp", "type": "timestamp"}, + ] + connection = mock.MagicMock() + context = {"connection": connection} + resource_id = "resource_id" + data_dict = {"resource_id": resource_id} + + db.create_indexes(context, data_dict) + + self._assert_no_index_created_on( + "text", connection, resource_id, "english" + ) + self._assert_no_index_created_on( + "tsvector", connection, resource_id, + ) + + @pytest.mark.ckan_config( + "ckan.datastore.default_fts_index_field_types", "text tsvector") @pytest.mark.ckan_config("ckan.datastore.default_fts_lang", "simple") @mock.patch("ckanext.datastore.backend.postgres._get_fields") def test_creates_fts_index_on_textual_fields_can_overwrite_lang_with_config_var( @@ -80,6 +110,8 @@ def test_creates_fts_index_on_textual_fields_can_overwrite_lang_with_config_var( self._assert_created_index_on("foo", connection, resource_id, "simple") + @pytest.mark.ckan_config( + "ckan.datastore.default_fts_index_field_types", "text tsvector") @pytest.mark.ckan_config("ckan.datastore.default_fts_lang", "simple") @mock.patch("ckanext.datastore.backend.postgres._get_fields") def test_creates_fts_index_on_textual_fields_can_overwrite_lang_using_lang_param( @@ -127,6 +159,38 @@ def _assert_created_index_on( "called with a string containing '%s'" % sql_str ) + def _assert_no_index_created_on( + self, + field, + connection, + resource_id, + lang=None, + cast=False, + method="gist", + ): + field = u'"{0}"'.format(field) + if cast: + field = u"cast({0} AS text)".format(field) + if lang is not None: + sql_str = ( + u'ON "resource_id" ' + u"USING {method}(to_tsvector('{lang}', {field}))" + ) + sql_str = sql_str.format(method=method, lang=lang, field=field) + else: + sql_str = u"USING {method}({field})".format( + method=method, field=field + ) + + calls = connection.execute.call_args_list + + was_called = any(sql_str in str(call.args[0]) for call in calls) + + assert not was_called, ( + "Expected 'connection.execute' to not have been " + "called with a string containing '%s'" % sql_str + ) + class TestGetAllResourcesIdsInDatastore(object): @pytest.mark.ckan_config(u"ckan.plugins", u"datastore") diff --git a/ckanext/datastore/tests/test_helpers.py b/ckanext/datastore/tests/test_helpers.py index 806c6188f65..1aaaed5eb16 100644 --- a/ckanext/datastore/tests/test_helpers.py +++ b/ckanext/datastore/tests/test_helpers.py @@ -47,8 +47,10 @@ def test_is_single_statement(self): for multiple in multiples: assert postgres_backend.is_single_statement(multiple) is False + @pytest.mark.ckan_config( + "ckan.datastore.default_fts_index_field_types", "text tsvector") def test_should_fts_index_field_type(self): - indexable_field_types = ["tsvector", "text", "number"] + indexable_field_types = ["tsvector", "text"] non_indexable_field_types = [ "nested", diff --git a/ckanext/datastore/tests/test_plugin.py b/ckanext/datastore/tests/test_plugin.py index f8c34928c7c..fa2d2efa14e 100644 --- a/ckanext/datastore/tests/test_plugin.py +++ b/ckanext/datastore/tests/test_plugin.py @@ -113,6 +113,8 @@ def test_ignores_fts_searches_on_inexistent_fields(self): assert result["where"] == [] + @pytest.mark.ckan_config( + "ckan.datastore.default_fts_index_field_types", "text") def test_fts_where_clause_lang_uses_english_by_default(self): expected_where = [ ( @@ -129,6 +131,8 @@ def test_fts_where_clause_lang_uses_english_by_default(self): assert result["where"] == expected_where + @pytest.mark.ckan_config( + "ckan.datastore.default_fts_index_field_types", "text") @pytest.mark.ckan_config("ckan.datastore.default_fts_lang", "simple") def test_fts_where_clause_lang_can_be_overwritten_by_config(self): expected_where = [ @@ -146,6 +150,8 @@ def test_fts_where_clause_lang_can_be_overwritten_by_config(self): assert result["where"] == expected_where + @pytest.mark.ckan_config( + "ckan.datastore.default_fts_index_field_types", "text") @pytest.mark.ckan_config("ckan.datastore.default_fts_lang", "simple") def test_fts_where_clause_lang_can_be_overwritten_using_lang_param(self): expected_where = [