Skip to content

Commit

Permalink
feat(search) BREAKING Support ElasticSearch 7, drop ES5 (#2263)
Browse files Browse the repository at this point in the history
Merges in changes from our ES7 branch, and drops support for ES5.

This is a breaking change due to the upgrade; we have an ES5 branch at the commit before this one.
  • Loading branch information
John Plaisted authored Mar 19, 2021
1 parent 711e023 commit 5e91014
Show file tree
Hide file tree
Showing 29 changed files with 455 additions and 418 deletions.
8 changes: 4 additions & 4 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@ project.ext.externalDependency = [
'commonsLang': 'commons-lang:commons-lang:2.6',
'ebean': 'io.ebean:ebean:11.33.3',
'ebeanAgent': 'io.ebean:ebean-agent:11.27.1',
'elasticSearchRest': 'org.elasticsearch.client:elasticsearch-rest-high-level-client:5.6.8',
'elasticSearchTransport': 'org.elasticsearch.client:transport:5.6.8',
'elasticSearchRest': 'org.elasticsearch.client:elasticsearch-rest-high-level-client:7.9.3',
'elasticSearchTransport': 'org.elasticsearch.client:transport:7.9.3',
'findbugsAnnotations': 'com.google.code.findbugs:annotations:3.0.1',
'gmaCoreModels': "com.linkedin.datahub-gma:core-models-data-template:$gmaVersion",
'gmaDaoApi': "com.linkedin.datahub-gma:dao-api:$gmaVersion",
'gmaDaoApiDataTemplate': "com.linkedin.datahub-gma:dao-api-data-template:$gmaVersion",
'gmaEbeanDao': "com.linkedin.datahub-gma:ebean-dao:$gmaVersion",
'gmaElasticsearchDao': "com.linkedin.datahub-gma:elasticsearch-dao:$gmaVersion",
'gmaElasticsearchIntegTest': "com.linkedin.datahub-gma:elasticsearch-dao-integ-testing-docker:$gmaVersion",
'gmaElasticsearchDao': "com.linkedin.datahub-gma:elasticsearch-dao-7:$gmaVersion",
'gmaElasticsearchIntegTest': "com.linkedin.datahub-gma:elasticsearch-dao-integ-testing-docker-7:$gmaVersion",
'gmaNeo4jDao': "com.linkedin.datahub-gma:neo4j-dao:$gmaVersion",
'gmaRestliResources': "com.linkedin.datahub-gma:restli-resources:$gmaVersion",
'gmaRestliResourcesDataTemplate': "com.linkedin.datahub-gma:restli-resources-data-template:$gmaVersion",
Expand Down
5 changes: 5 additions & 0 deletions contrib/elasticsearch/es7-upgrade/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Image that runs the Elasticsearch index transfer script (transfer.py).
FROM python:3.8

# Use a dedicated working directory so the build context is not copied into
# the image's filesystem root.
WORKDIR /app

# Install dependencies before copying the sources so this layer stays cached
# when only the script changes; --no-cache-dir keeps pip's download cache out
# of the image.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir elasticsearch

# Copy the build context (transfer.py and any accompanying files) into /app.
COPY . .

# Arguments passed to `docker run <image> ...` are forwarded to transfer.py.
ENTRYPOINT ["python", "transfer.py"]
12 changes: 6 additions & 6 deletions contrib/elasticsearch/es7-upgrade/transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
parser = argparse.ArgumentParser(description="Transfers ES indexes between clusters.")
parser.add_argument('-s', '--source', required=True, help='Source cluster URL and port.')
parser.add_argument('-d', '--dest', required=True, help='Destination cluster URL and port.')
parser.add_argument('--source-ssl', required=False, default=True, help='Enables / disables source SSL.')
parser.add_argument('--dest-ssl', required=False, default=True, help='Enables / disables destination SSL.')
parser.add_argument('--disable-source-ssl', required=False, action='store_true', help='If set, disable source SSL.')
parser.add_argument('--disable-dest-ssl', required=False, action='store_true', help='If set, disable destination SSL.')
parser.add_argument('--cert-file', required=False, default=None, help='Cert file to use with SSL.')
parser.add_argument('--key-file', required=False, default=None, help='Key file to use with SSL.')
parser.add_argument('--ca-file', required=False, default=None, help='Certificate authority file to use for SSL.')
parser.add_argument('--create-only', required=False, default=False, help='If true, only create the index (with settings/mappings/aliases).')
parser.add_argument('--create-only', required=False, action='store_true', help='If set, only create the index (with settings/mappings/aliases).')
parser.add_argument('-i', '--indices', required=False, default="*", help='Regular expression for indexes to copy.')
parser.add_argument('--name-override', required=False, default=None, help='destination index name override')

Expand Down Expand Up @@ -207,9 +207,9 @@ def copy_index_data(clients, index, name_override):


def main():
ssl_context=create_ssl_context()
source_ssl_context = ssl_context if args.source_ssl else None
dest_ssl_context = ssl_context if args.dest_ssl else None
ssl_context = create_ssl_context() if not args.disable_source_ssl or not args.disable_dest_ssl else None
source_ssl_context = ssl_context if not args.disable_source_ssl else None
dest_ssl_context = ssl_context if not args.disable_dest_ssl else None
clients = EsClients(create_client(args.source, source_ssl_context), create_client(args.dest, dest_ssl_context))
indices = get_index_settings(clients.source_client, args.indices)

Expand Down
4 changes: 2 additions & 2 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ services:
- schema-registry

elasticsearch:
image: elasticsearch:5.6.8
image: elasticsearch:7.9.3
env_file: elasticsearch/env/docker.env
container_name: elasticsearch
hostname: elasticsearch
Expand All @@ -96,7 +96,7 @@ services:
- esdata:/usr/share/elasticsearch/data

kibana:
image: kibana:5.6.8
image: kibana:7.9.3
env_file: kibana/env/docker.env
container_name: kibana
hostname: kibana
Expand Down
4 changes: 2 additions & 2 deletions docker/elasticsearch-setup/create-indices.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ function create_index {
jq -n \
--slurpfile settings index/$2 \
--slurpfile mappings index/$3 \
'.settings=$settings[0] | .mappings.doc=$mappings[0]' > /tmp/data
'.settings=$settings[0] | .mappings=$mappings[0]' > /tmp/data

curl -XPUT $ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/$1 --data @/tmp/data
curl -XPUT $ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/$1 -H 'Content-Type: application/json' --data @/tmp/data
}

create_index chartdocument chart/settings.json chart/mappings.json
Expand Down
1 change: 1 addition & 0 deletions docs-website/sidebars.js
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ module.exports = {
],
"Advanced Guides": [
"docs/advanced/aspect-versioning",
"docs/advanced/es-7-upgrade",
"docs/advanced/high-cardinality",
"docs/how/scsi-onboarding-guide",
// WIP "docs/advanced/backfilling",
Expand Down
38 changes: 38 additions & 0 deletions docs/advanced/es-7-upgrade.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Elasticsearch upgrade from 5.6.8 to 7.9.3

## Summary of changes
Check out the list of breaking changes for [Elasticsearch 6](https://www.elastic.co/guide/en/elasticsearch/reference/6.8/breaking-changes-6.0.html) and [Elasticsearch 7](https://www.elastic.co/guide/en/elasticsearch/reference/7.x/breaking-changes-7.0.html). The following is a summary of the changes that impact DataHub.

### Search index mapping & settings
- Removal of mapping types (as mentioned [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/removal-of-types.html))
- Specify the maximum allowed difference between `min_gram` and `max_gram` for NGramTokenizer and NGramTokenFilter by adding property `max_ngram_diff` in index settings, particularly if the difference is greater than 1 (as mentioned [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules.html))

### Search query
The following parameters are/were `optional` and hence automatically populated in the search query. Some tests that expect a certain search query to be sent to ES will change with the ES upgrade.
- `disable_coord` parameter of the `bool` and `common_terms` queries has been removed (as mentioned [here](https://www.elastic.co/guide/en/elasticsearch/reference/6.8/breaking-changes-6.0.html))
- `auto_generate_synonyms_phrase_query` parameter in `match` query is added with a default value of `true` (as mentioned [here](https://www.elastic.co/guide/en/elasticsearch/reference/7.x/query-dsl-match-query.html))

### Java High Level Rest Client
- In 7.9.3, a Java High Level REST Client instance is built from a REST low-level client builder. In 5.6.8, the same instance was built from a REST low-level client.
- Document APIs such as the Index API, Delete API, etc. no longer take the doc `type` as an input

## Migration strategy

As mentioned in the docs, indices created in Elasticsearch 5.x are not readable by Elasticsearch 7.x. Running the upgraded elasticsearch container on the existing esdata volume will fail.

For local development, our recommendation is to run the `docker/nuke.sh` script to remove the existing esdata volume before starting up the containers. Note, all data will be lost.

To migrate without losing data, please refer to the Python script and Dockerfile in `contrib/elasticsearch/es7-upgrade`. The script takes the source and destination Elasticsearch cluster URLs and SSL configuration (if applicable) as input. It ports the mappings and settings for all indices in the source cluster to the destination cluster, making the necessary changes stated above. Then it transfers all documents in the source cluster to the destination cluster.

You can run the script in a Docker container as follows:
```
docker build -t migrate-es-7 .
docker run migrate-es-7 -s SOURCE -d DEST [--disable-source-ssl]
[--disable-dest-ssl] [--cert-file CERT_FILE]
[--key-file KEY_FILE] [--ca-file CA_FILE] [--create-only]
[-i INDICES] [--name-override NAME_OVERRIDE]
```

## Plan

We will create an "elasticsearch-5-legacy" branch with the version of master prior to the Elasticsearch 7 upgrade. However, we will not be supporting this branch moving forward, and all future development will be done using Elasticsearch 7.9.3.
Original file line number Diff line number Diff line change
Expand Up @@ -44,47 +44,41 @@ public class RestHighLevelClientFactory {
@Bean(name = "elasticSearchRestHighLevelClient")
@Nonnull
protected RestHighLevelClient createInstance() {
try {
RestClient restClient;
if (useSSL) {
restClient = loadRestHttpsClient(host, port, threadCount, connectionRequestTimeout, sslContext);
} else {
restClient = loadRestHttpClient(host, port, threadCount, connectionRequestTimeout);
}

return new RestHighLevelClient(restClient);
} catch (Exception e) {
throw new RuntimeException("Error: RestClient is not properly initialized. " + e.toString());
RestClientBuilder restClientBuilder;

if (useSSL) {
restClientBuilder = loadRestHttpsClient(host, port, threadCount, connectionRequestTimeout, sslContext);
} else {
restClientBuilder = loadRestHttpClient(host, port, threadCount, connectionRequestTimeout);
}

return new RestHighLevelClient(restClientBuilder);
}

@Nonnull
private static RestClient loadRestHttpClient(@Nonnull String host, int port, int threadCount,
int connectionRequestTimeout) {
private static RestClientBuilder loadRestHttpClient(@Nonnull String host, int port, int threadCount,
int connectionRequestTimeout) {
RestClientBuilder builder = RestClient.builder(new HttpHost(host, port, "http"))
.setHttpClientConfigCallback(httpAsyncClientBuilder ->
httpAsyncClientBuilder.setDefaultIOReactorConfig(IOReactorConfig.custom()
.setIoThreadCount(threadCount).build()));
.setHttpClientConfigCallback(httpAsyncClientBuilder -> httpAsyncClientBuilder.setDefaultIOReactorConfig(
IOReactorConfig.custom().setIoThreadCount(threadCount).build()));

builder.setRequestConfigCallback(requestConfigBuilder -> requestConfigBuilder.
setConnectionRequestTimeout(connectionRequestTimeout));
setConnectionRequestTimeout(connectionRequestTimeout));

return builder.build();
return builder;
}

@Nonnull
private static RestClient loadRestHttpsClient(@Nonnull String host, int port, int threadCount,
int connectionRequestTimeout, @Nonnull SSLContext sslContext) {

private static RestClientBuilder loadRestHttpsClient(@Nonnull String host, int port, int threadCount,
int connectionRequestTimeout, @Nonnull SSLContext sslContext) {
final RestClientBuilder builder = RestClient.builder(new HttpHost(host, port, "https"))
.setHttpClientConfigCallback(httpAsyncClientBuilder -> httpAsyncClientBuilder.setSSLContext(sslContext)
.setSSLHostnameVerifier(new NoopHostnameVerifier())
.setDefaultIOReactorConfig(IOReactorConfig.custom().setIoThreadCount(threadCount).build()));
.setHttpClientConfigCallback(httpAsyncClientBuilder -> httpAsyncClientBuilder.setSSLContext(sslContext)
.setSSLHostnameVerifier(new NoopHostnameVerifier())
.setDefaultIOReactorConfig(IOReactorConfig.custom().setIoThreadCount(threadCount).build()));

builder.setRequestConfigCallback(requestConfigBuilder -> requestConfigBuilder.
setConnectionRequestTimeout(connectionRequestTimeout));
setConnectionRequestTimeout(connectionRequestTimeout));

return builder.build();
return builder;
}
}

1 change: 1 addition & 0 deletions gms/impl/src/main/resources/index/chart/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"index": {
"max_ngram_diff": 17,
"analysis": {
"filter": {
"autocomplete_filter": {
Expand Down
1 change: 1 addition & 0 deletions gms/impl/src/main/resources/index/corp-user/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"index": {
"max_ngram_diff": 17,
"analysis": {
"filter": {
"autocomplete_filter": {
Expand Down
1 change: 1 addition & 0 deletions gms/impl/src/main/resources/index/dashboard/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"index": {
"max_ngram_diff": 17,
"analysis": {
"filter": {
"autocomplete_filter": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"index": {
"max_ngram_diff": 47,
"analysis": {
"filter": {
"autocomplete_filter": {
Expand Down
1 change: 1 addition & 0 deletions gms/impl/src/main/resources/index/dataflow/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"index": {
"max_ngram_diff": 17,
"analysis": {
"filter": {
"autocomplete_filter": {
Expand Down
1 change: 1 addition & 0 deletions gms/impl/src/main/resources/index/datajob/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"index": {
"max_ngram_diff": 17,
"analysis": {
"filter": {
"autocomplete_filter": {
Expand Down
1 change: 1 addition & 0 deletions gms/impl/src/main/resources/index/dataset/settings.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"index": {
"max_ngram_diff": 47,
"analysis": {
"filter": {
"autocomplete_filter": {
Expand Down
Loading

0 comments on commit 5e91014

Please sign in to comment.