diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS deleted file mode 100644 index b3f21c9a43a..00000000000 --- a/.github/CODEOWNERS +++ /dev/null @@ -1,8 +0,0 @@ -# In order to ensure the query microservices are consistent with the wildfly webservice, we need -# to ensure that changes made to QueryExecutorBean match QueryManagementService, and changes made -# to LookupUUIDUtil match LookupService in the Query Service. -QueryExecutorBean.java @jwomeara @ivakegg -LookupUUIDUtil.java @jwomeara @ivakegg -RunningQuery.java @jwomeara @ivakegg -/core/ @jwomeara @ivakegg -/warehouse/query-core/ @jwomeara @ivakegg \ No newline at end of file diff --git a/.github/workflows/build-accumulo.yml b/.github/workflows/build-accumulo.yml new file mode 100644 index 00000000000..c1e15ef3d43 --- /dev/null +++ b/.github/workflows/build-accumulo.yml @@ -0,0 +1,99 @@ +name: Build Accumulo snapshot and update DataWave to use + +on: + workflow_dispatch: + inputs: + accumuloBranch: + required: true + default: "2.1" + description: "Branch name to build. Will be used as image tag." + accumuloRepo: + required: true + default: "apache/accumulo" + description: "Accumulo Repo to use. Expected to be at Github. Example: apache/accumulo" + deployAccumulo: + required: true + default: "false" + description: "Set to false if this accumulo version has already been pushed to Github Packages" + +# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository.lowercase }} + ACCUMULO_JAVA_VERSION: '17' + DATAWAVE_JAVA_VERSION: '11' + JAVA_DISTRIBUTION: 'zulu' #This is the default on v1 of the action for 1.8 + USER_NAME: ${{ secrets.GHCR_WRITE_USER_NAME }} + ACCESS_TOKEN: ${{ secrets.GHCR_WRITE_ACCESS_TOKEN }} + +jobs: + build-and-deploy-accumulo: + runs-on: ubuntu-latest + # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. 
+ permissions: + contents: read + packages: write + # + steps: + - name: Checkout DataWave + uses: actions/checkout@v4 + with: + path: datawave + + - name: Checkout Accumulo + uses: actions/checkout@v4 + id: accumuloCheckout + with: + repository: ${{ github.event.inputs.accumuloRepo }} + path: accumulo + ref: ${{ github.event.inputs.accumuloBranch }} + + - name: Set up JDK ${{env.ACCUMULO_JAVA_VERSION}} + uses: actions/setup-java@v4 + with: + distribution: ${{env.JAVA_DISTRIBUTION}} + java-version: ${{env.ACCUMULO_JAVA_VERSION}} + cache: 'maven' + - run: echo "ACCUMULO_JAVA=$JAVA_HOME" >> $GITHUB_ENV + - name: Set up JDK ${{env.DATAWAVE_JAVA_VERSION}} + uses: actions/setup-java@v4 + with: + distribution: ${{env.JAVA_DISTRIBUTION}} + java-version: ${{env.DATAWAVE_JAVA_VERSION}} + cache: 'maven' + - run: echo "DATAWAVE_JAVA=$JAVA_HOME" >> $GITHUB_ENV + + - name: Get Accumulo Version + id: get-accumulo-version + run: | + export JAVA_HOME="$ACCUMULO_JAVA" + cd "$GITHUB_WORKSPACE/accumulo" + mvn build-helper:parse-version versions:set -DgenerateBackupPoms=false -DnewVersion=\${parsedVersion.majorVersion}.\${parsedVersion.minorVersion}.\${parsedVersion.incrementalVersion}-dwv-$(git rev-parse --short HEAD) + export newVersion=$(mvn -q help:evaluate -DforceStdout -Dexpression=project.version) + echo accumuloVersion=$newVersion >> $GITHUB_OUTPUT + - name: Deploy Accumulo + if: ${{ github.event.inputs.deployAccumulo == 'true'}} + run: | + export JAVA_HOME="$ACCUMULO_JAVA" + cd "$GITHUB_WORKSPACE/accumulo" + mvn -DaltDeploymentRepository=github-datawave::https://maven.pkg.github.com/NationalSecurityAgency/datawave -V -B -e -ntp "-Dstyle.color=always" -DskipTests -T1C clean source:jar deploy -s "$GITHUB_WORKSPACE/datawave/.github/workflows/settings.xml" + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ env.USER_NAME }} + password: ${{ env.ACCESS_TOKEN }} + + - name: Update DataWave Dependency Version + run: | + + export JAVA_HOME="$DATAWAVE_JAVA" + cd "$GITHUB_WORKSPACE/datawave" + mvn -s "$GITHUB_WORKSPACE/datawave/.github/workflows/settings.xml" versions:set-property -Dproperty=version.accumulo -DnewVersion=${{ steps.get-accumulo-version.outputs.accumuloVersion }} -DgenerateBackupPoms=false + - name: Build Web and Ingest Docker Images (Maven) + run: | + export JAVA_HOME="$DATAWAVE_JAVA" + cd "$GITHUB_WORKSPACE/datawave" + mvn -s "$GITHUB_WORKSPACE/datawave/.github/workflows/settings.xml" clean install -Prpm,kubernetes,assemble,deploy-ws -Ddist -Pdocker -DpushImage -Ddocker-release -DskipTests -Ddocker.image.accumulo.tag=${{ steps.get-accumulo-version.outputs.accumuloVersion }} + + diff --git a/.github/workflows/build-images.yml b/.github/workflows/build-images.yml index d36f222ad6b..33ff967db6d 100644 --- a/.github/workflows/build-images.yml +++ b/.github/workflows/build-images.yml @@ -32,9 +32,9 @@ jobs: username: ${{ env.USER_NAME }} password: ${{ env.ACCESS_TOKEN }} - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up JDK ${{env.JAVA_VERSION}} - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: ${{env.JAVA_DISTRIBUTION}} java-version: ${{env.JAVA_VERSION}} diff --git a/.github/workflows/microservice-build-image.yaml b/.github/workflows/microservice-build-image.yaml new file mode 100644 index 00000000000..fe9e199ce50 --- /dev/null +++ b/.github/workflows/microservice-build-image.yaml @@ -0,0 +1,41 @@ +# +name: 
Create and publish a Docker image + +on: + workflow_call: + secrets: + USER_NAME: + description: "User Name for maven pulls" + required: true + ACCESS_TOKEN: + description: "Access token for maven pulls" + required: true + + +jobs: + build-and-push-datawave-images: + runs-on: ubuntu-latest + steps: + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ secrets.USER_NAME }} + password: ${{ secrets.ACCESS_TOKEN }} + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up JDK 11 + uses: actions/setup-java@v4 + with: + distribution: "zulu" + java-version: 11 + cache: 'maven' + - name: Build And Push Docker Image (Maven) + env: + MAVEN_OPTS: "-Dhttps.protocols=TLSv1.2 -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=WARN -Djava.awt.headless=true" + USER_NAME: ${{ secrets.USER_NAME }} + ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} + run: | + mvn -s $GITHUB_WORKSPACE/.github/workflows/settings.xml -V -B -e clean install -Pdocker,exec -Ddocker.image.prefix=ghcr.io/nationalsecurityagency/ -DpushImage + + diff --git a/.github/workflows/microservice-maven-tests.yaml b/.github/workflows/microservice-maven-tests.yaml new file mode 100644 index 00000000000..4f20d49c3b8 --- /dev/null +++ b/.github/workflows/microservice-maven-tests.yaml @@ -0,0 +1,72 @@ +name: Tests + +on: + workflow_call: + secrets: + USER_NAME: + description: "User Name for maven pulls" + required: true + ACCESS_TOKEN: + description: "Access token for maven pulls" + required: true + +env: + MAVEN_OPTS: "-Djansi.force=true -Dhttps.protocols=TLSv1.2 -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=WARN -Djava.awt.headless=true -XX:ThreadStackSize=1m" + +jobs: + # Runs the pom sorter and code formatter to ensure that the code + # is formatted and poms are sorted according to project rules. This + # will fail if the formatter makes any changes. + check-code-formatting: + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up JDK 11 + uses: actions/setup-java@v4 + with: + java-version: 11 + distribution: 'zulu' + - uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-format-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven-format- + ${{ runner.os }}-maven- + - name: Format code + run: | + mvn -s $GITHUB_WORKSPACE/.github/workflows/settings.xml -V -B -e clean formatter:format sortpom:sort -Pautoformat + git status + git diff-index --quiet HEAD || (echo "Error! There are modified files after formatting." && false) + env: + MAVEN_OPTS: "-Dhttps.protocols=TLSv1.2 -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=WARN -Djava.awt.headless=true" + USER_NAME: ${{ secrets.USER_NAME }} + ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} + + # Build the code and run the unit/integration tests. 
+ build-and-test: + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up JDK 11 + uses: actions/setup-java@v4 + with: + java-version: 11 + distribution: 'zulu' + - uses: actions/cache@v4 + with: + path: ~/.m2/repository + key: ${{ runner.os }}-maven-build-${{ hashFiles('**/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven-build- + ${{ runner.os }}-maven-format- + ${{ runner.os }}-maven- + - name: Build and Run Unit Tests + run: mvn -s $GITHUB_WORKSPACE/.github/workflows/settings.xml -V -B -e -Ddist clean verify + env: + MAVEN_OPTS: "-Dhttps.protocols=TLSv1.2 -Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=WARN -Djava.awt.headless=true" + USER_NAME: ${{ secrets.USER_NAME }} + ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} + diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b4461ec42a7..e60d6c94522 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,7 +7,7 @@ on: - 'integration' - 'release/version*' pull_request: - paths-ignore: ['*.md', 'CODEOWNERS', 'LICENSE'] + paths-ignore: ['*.md', 'CODEOWNERS', 'LICENSE', '.github/workflows/microservice*.yaml'] workflow_dispatch: env: @@ -23,9 +23,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up JDK ${{env.JAVA_VERSION}} - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: ${{env.JAVA_DISTRIBUTION}} java-version: ${{env.JAVA_VERSION}} @@ -60,9 +60,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up JDK ${{env.JAVA_VERSION}} - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: ${{env.JAVA_DISTRIBUTION}} java-version: ${{env.JAVA_VERSION}} @@ -88,15 +88,15 @@ jobs: # runs-on: ubuntu-latest # steps: # - name: Checkout Code - # uses: actions/checkout@v3 + # uses: actions/checkout@v4 # with: # submodules: 'recursive' # - name: Set up JDK ${{env.JAVA_VERSION}} - # uses: actions/setup-java@v3 + # uses: actions/setup-java@v4 # with: # distribution: ${{env.JAVA_DISTRIBUTION}} # java-version: ${{env.JAVA_VERSION}} - # - uses: actions/cache@v3 + # - uses: actions/cache@v4 # with: # path: ~/.m2/repository # key: ${{ runner.os }}-maven-build-${{ hashFiles('**/pom.xml') }} @@ -126,9 +126,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up JDK ${{env.JAVA_VERSION}} - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: ${{env.JAVA_DISTRIBUTION}} java-version: ${{env.JAVA_VERSION}} @@ -165,11 +165,11 @@ jobs: sudo rm -rf /usr/local/share/boost sudo rm -rf $AGENT_TOOLSDIRECTORY - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: 'recursive' - name: Set up JDK ${{env.JAVA_VERSION}} - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: ${{env.JAVA_DISTRIBUTION}} java-version: ${{env.JAVA_VERSION}} @@ -234,15 +234,15 @@ jobs: sudo rm -rf /usr/local/share/boost sudo rm -rf $AGENT_TOOLSDIRECTORY - name: Checkout Code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: submodules: 'recursive' - name: Set up JDK ${{env.JAVA_VERSION}} - uses: actions/setup-java@v3 + uses: actions/setup-java@v4 with: distribution: ${{env.JAVA_DISTRIBUTION}} java-version: ${{env.JAVA_VERSION}} - - uses: actions/cache@v3 + - uses: actions/cache@v4 with: 
path: ~/.m2/repository key: ${{ runner.os }}-maven-build-${{ hashFiles('**/pom.xml') }} @@ -256,10 +256,17 @@ jobs: USER_NAME: ${{ secrets.USER_NAME }} ACCESS_TOKEN: ${{ secrets.ACCESS_TOKEN }} run: | - mvn -s $GITHUB_WORKSPACE/.github/workflows/settings.xml -B -V -e -Pcompose -Dmicroservice-docker -Dquickstart-docker -Ddeploy -Dtar -DskipTests clean install + # set some bogus URLs to trigger dependency download via maven + DIST_URLS="-Durl.zookeeper=https://bogus.apache.org/zookeeper/zookeeper-3.7.2/apache-zookeeper-3.7.2-bin.tar.gz.tar.gz \ + -Durl.accumulo=https://bogus.apache.org/accumulo/2.1.3/accumulo-2.1.3-bin.tar.gz \ + -Durl.wildfly=https://bogus.jboss.org/wildfly/17.0.1.Final/wildfly-17.0.1.Final.tar.gz \ + -Durl.hadoop=https://bogus.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz \ + -Durl.maven=https://bogus.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz" + + mvn -s $GITHUB_WORKSPACE/.github/workflows/settings.xml -B -V -e -Pcompose -Dmicroservice-docker -Dquickstart-docker -Dquickstart-maven ${DIST_URLS} -Ddeploy -Dtar -DskipTests -Dmaven.build.cache.enabled=false clean install # free up some space so that we don't run out docker system prune -f - mvn -s $GITHUB_WORKSPACE/.github/workflows/settings.xml -B -V -e -Pcompose -Dmicroservice-docker -Dquickstart-docker -Ddeploy -Dtar -DskipTests clean + mvn -s $GITHUB_WORKSPACE/.github/workflows/settings.xml -B -V -e -Pcompose -Dmicroservice-docker -Dquickstart-docker -Dquickstart-maven ${DIST_URLS} -Ddeploy -Dtar -DskipTests -Dmaven.build.cache.enabled=false clean cd docker ./bootstrap.sh attempt=0 diff --git a/.gitmodules b/.gitmodules index b7832653b95..26ad0ff918c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -82,3 +82,9 @@ [submodule "microservices/starters/cached-results"] path = microservices/starters/cached-results url = git@github.com:NationalSecurityAgency/datawave-spring-boot-starter-cached-results.git +[submodule "microservices/services/map"] + path = microservices/services/map + url = git@github.com:NationalSecurityAgency/datawave-map-service.git +[submodule "microservices/services/file-provider"] + path = microservices/services/file-provider + url = git@github.com:NationalSecurityAgency/datawave-file-provider-service.git diff --git a/.mvn/maven-build-cache-config.xml b/.mvn/maven-build-cache-config.xml index bf5dbbb1766..549ab1212ed 100644 --- a/.mvn/maven-build-cache-config.xml +++ b/.mvn/maven-build-cache-config.xml @@ -23,7 +23,7 @@ --> - true + false SHA-256 true diff --git a/BUILDME.md b/BUILDME.md index dd05b588e6d..fbd6744675f 100644 --- a/BUILDME.md +++ b/BUILDME.md @@ -64,7 +64,7 @@ mvn -Pdev,assemble,rpm -Ddeploy -Dtar -Ddist -DskipTests clean install Datawave web services utilize several microservices at runtime (currently authorization and auditing, although that list will expand soon). Datawave depends on api modules for some of these services, and the dependencies are set in -the parent pom (see `version.microservice.*` properties) to released versions. If you wish to build the microservices +the parent pom (see `version.datawave.*` properties) to released versions. If you wish to build the microservices for some reason, you can simply add `-Dservices` to your maven build command. ### Releasing Microservices @@ -95,7 +95,7 @@ the authorization service API version 1.0 is tagged with `svc_authorization-api_ Note that simply building a new API or service release won't ensure that it is used anywhere. 
You will need to update build properties in either the datawave parent pom or within other service poms (for cross-service dependencies) to -ensure that the new version is used. Look for properties starting with `version.microservice.` to see what to update. +ensure that the new version is used. Look for properties starting with `version.datawave.` to see what to update. If you are updating an API module, you should be careful. In general, the associated service will need to be updated as well to support the API changes. The service should _add_ a new version of the API and continue to support the old version until it can be ensured that there are no more consumers of the old API. diff --git a/common-test/pom.xml b/common-test/pom.xml index 0205f196518..790b6746082 100644 --- a/common-test/pom.xml +++ b/common-test/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-common-test ${project.artifactId} diff --git a/contrib/datawave-quickstart/README.md b/contrib/datawave-quickstart/README.md index 4c7a3f50c6f..eef2444c6f1 100644 --- a/contrib/datawave-quickstart/README.md +++ b/contrib/datawave-quickstart/README.md @@ -23,3 +23,16 @@ provide the exact same capabilities described below, but hosted on a CentOS 7 ba ### Get Started DataWave Quickstart installation instructions are [here](https://code.nsa.gov/datawave/docs/quickstart) + +--- +### A note on building: + +You can specify to use maven to fetch the DataWave, Hadoop, Accumulo, ZooKeeper tar.gz files +by using the `-Dquickstart-maven` flag with your mvn command. This will use maven to +download the tar files to your local repository then copy them to the appropriate directory +after downloading. + +In order to deploy a new dependency, you can use a command similar to the following, using +Acccumulo-2.1.3 as an example. The command must be run from the same directory as the tar file. + +`mvn deploy:deploy-file -DrepositoryId=github-datawave -Durl=https://maven.pkg.github.com/NationalSecurityAgency/datawave -DgroupId=gov.nsa.datawave.quickstart -DartifactId=accumulo -Dversion=2.1.3 -Dfile=accumulo-2.1.3-bin.tar.gz -Dpackaging=tar.gz` diff --git a/contrib/datawave-quickstart/bin/common.sh b/contrib/datawave-quickstart/bin/common.sh index 8c6639f50d8..7ada623a06d 100644 --- a/contrib/datawave-quickstart/bin/common.sh +++ b/contrib/datawave-quickstart/bin/common.sh @@ -117,6 +117,39 @@ function downloadTarball() { fi elif [[ ${uri} == https://* ]] ; then $( cd "${tarballdir}" && wget ${DW_WGET_OPTS} "${uri}" ) + else + return 1 + fi + fi +} + +function downloadMavenTarball() { + local pomFile="${DW_DATAWAVE_SOURCE_DIR:-$( cd "${DW_CLOUD_HOME}/../.." && pwd )}/pom.xml" + local rootProject=":$1" + local group="$2" + local artifact="$3" + local version="$4" + local tarballdir="$5" + tarball="${artifact}-${version}.tar.gz" + if [ ! -f "${tarballdir}/${tarball}" ] ; then + # download from maven repo + output=$( mvn -f "${pomFile}" -pl "${rootProject}" -DremoteRepositories="remote-repo::default::${DW_MAVEN_REPOSITORY}" dependency:get -Dartifact="${group}:${artifact}:${version}" -Dpackaging="tar.gz" ) + retVal=$? + if [ $retVal -ne 0 ]; then + error "Failed to download ${tarball} via maven" + error "$output" + return $retVal + else + info "Downloaded ${artifact} via maven" + fi + + # copy to specified directory + output=$( mvn -f "${pomFile}" -pl "${rootProject}" dependency:copy -Dartifact="${group}:${artifact}:${version}:tar.gz" -DoutputDirectory="${tarballdir}" ) + retVal=$? 
+ if [ $retVal -ne 0 ]; then + error "Failed to copy ${tarball} to ${tarballdir} via maven" + error "$output" + return $retVal fi fi } @@ -273,9 +306,9 @@ function checkBinaries() { local localBinaries=() while IFS= read -r line; do if [[ "${line}" =~ ^Source:.* ]] ; then - sourceBinaries+=("$(cut -d: -f2- <<<${line})") + sourceBinaries+=("$(sed 's/.*: //g' <<<${line})") else - localBinaries+=("$(cut -d: -f2- <<<${line})") + localBinaries+=("$(sed 's/.*: //g' <<<${line})") fi done < <( allDisplayBinaryInfo | grep -E 'Source:|Local:' ) diff --git a/contrib/datawave-quickstart/bin/query.sh b/contrib/datawave-quickstart/bin/query.sh index ced895a3d66..d49aa64b4f1 100644 --- a/contrib/datawave-quickstart/bin/query.sh +++ b/contrib/datawave-quickstart/bin/query.sh @@ -131,21 +131,8 @@ function setQueryIdFromResponse() { } function prettyPrintJson() { - local PY=$( which python ) - if [ -n "${PY}" ] ; then - echo "${1}" | ${PY} -c 'from __future__ import print_function;import sys,json;data=json.loads(sys.stdin.read()); print(json.dumps(data, indent=2, sort_keys=True))' - local exitStatus=$? - echo - if [ "${exitStatus}" != "0" ] ; then - printRawResponse "${1}" - warn "Python encountered error. Printed response without formatting" - echo - fi - else - printRawResponse "${1}" - warn "Couldn't find python in your environment. Json response was printed without formatting" - echo - fi + PY_CMD='from __future__ import print_function; import sys,json; data=json.loads(sys.stdin.read()); print(json.dumps(data, indent=2, sort_keys=True))' + echo "${1}" | ( python3 -c "${PY_CMD}" 2>/dev/null || python2 -c "${PY_CMD}" 2>/dev/null ) || ( warn "Python encountered error. Printed response without formatting" && printRawResponse "${1}" ) } function printRawResponse() { diff --git a/contrib/datawave-quickstart/bin/services/accumulo/bootstrap.sh b/contrib/datawave-quickstart/bin/services/accumulo/bootstrap.sh index 36b71347df3..15d01f2ece7 100644 --- a/contrib/datawave-quickstart/bin/services/accumulo/bootstrap.sh +++ b/contrib/datawave-quickstart/bin/services/accumulo/bootstrap.sh @@ -16,12 +16,13 @@ DW_ACCUMULO_SERVICE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # You may override DW_ZOOKEEPER_DIST_URI in your env ahead of time, and set as file:///path/to/file.tar.gz for local tarball, if needed # DW_ZOOKEEPER_DIST_URI should, if possible, be using https. There are potential security risks by using http. -DW_ZOOKEEPER_DIST_URI="${DW_ZOOKEEPER_DIST_URI:-https://dlcdn.apache.org/zookeeper/zookeeper-3.7.2/apache-zookeeper-3.7.2-bin.tar.gz}" +DW_ZOOKEEPER_VERSION="3.7.2" +DW_ZOOKEEPER_DIST_URI="${DW_ZOOKEEPER_DIST_URI:-https://dlcdn.apache.org/zookeeper/zookeeper-${DW_ZOOKEEPER_VERSION}/apache-zookeeper-${DW_ZOOKEEPER_VERSION}-bin.tar.gz}" # The sha512 checksum for the tarball. Value should be the hash value only and does not include the file name. Cannot be left blank. 
DW_ZOOKEEPER_DIST_SHA512_CHECKSUM="${DW_ZOOKEEPER_DIST_SHA512_CHECKSUM:-6afbfc1afc8b9370281bd9862f37dbb1cb95ec54bb2ed4371831aa5c0f08cfee775050bd57ce5fc0836e61af27eed9f0076f54b98997dd0e15159196056e52ea}" # shellcheck disable=SC2154 # shellcheck disable=SC2034 -DW_ZOOKEEPER_DIST="$( downloadTarball "${DW_ZOOKEEPER_DIST_URI}" "${DW_ACCUMULO_SERVICE_DIR}" && echo "${tarball}" )" +DW_ZOOKEEPER_DIST="$( { downloadTarball "${DW_ZOOKEEPER_DIST_URI}" "${DW_ACCUMULO_SERVICE_DIR}" || downloadMavenTarball "datawave-parent" "gov.nsa.datawave.quickstart" "zookeeper" "${DW_ZOOKEEPER_VERSION}" "${DW_ACCUMULO_SERVICE_DIR}"; } && echo "${tarball}" )" DW_ZOOKEEPER_BASEDIR="zookeeper-install" DW_ZOOKEEPER_SYMLINK="zookeeper" @@ -50,11 +51,12 @@ admin.enableServer=false" # You may override DW_ACCUMULO_DIST_URI in your env ahead of time, and set as file:///path/to/file.tar.gz for local tarball, if needed # DW_ACCUMULO_DIST_URI should, if possible, be using https. There are potential security risks by using http. -DW_ACCUMULO_DIST_URI="${DW_ACCUMULO_DIST_URI:-https://dlcdn.apache.org/accumulo/2.1.3/accumulo-2.1.3-bin.tar.gz}" +DW_ACCUMULO_VERSION="2.1.3" +DW_ACCUMULO_DIST_URI="${DW_ACCUMULO_DIST_URI:-https://dlcdn.apache.org/accumulo/${DW_ACCUMULO_VERSION}/accumulo-${DW_ACCUMULO_VERSION}-bin.tar.gz}" # The sha512 checksum for the tarball. Value should be the hash value only and does not include the file name. Cannot be left blank. DW_ACCUMULO_DIST_SHA512_CHECKSUM="${DW_ACCUMULO_DIST_SHA512_CHECKSUM:-1a27a144dc31f55ccc8e081b6c1bc6cc0362a8391838c53c166cb45291ff8f35867fd8e4729aa7b2c540f8b721f8c6953281bf589fc7fe320e4dc4d20b87abc4}" # shellcheck disable=SC2034 -DW_ACCUMULO_DIST="$( downloadTarball "${DW_ACCUMULO_DIST_URI}" "${DW_ACCUMULO_SERVICE_DIR}" && echo "${tarball}" )" +DW_ACCUMULO_DIST="$( { downloadTarball "${DW_ACCUMULO_DIST_URI}" "${DW_ACCUMULO_SERVICE_DIR}" || downloadMavenTarball "datawave-parent" "gov.nsa.datawave.quickstart" "accumulo" "${DW_ACCUMULO_VERSION}" "${DW_ACCUMULO_SERVICE_DIR}"; } && echo "${tarball}" )" DW_ACCUMULO_BASEDIR="accumulo-install" DW_ACCUMULO_SYMLINK="accumulo" DW_ACCUMULO_INSTANCE_NAME="my-instance-01" @@ -300,15 +302,15 @@ function accumuloPidList() { } function accumuloDisplayBinaryInfo() { - echo "Source: ${DW_ACCUMULO_DIST_URI}" - local tarballName="$(basename "$DW_ACCUMULO_DIST_URI")" + echo "Source: ${DW_ACCUMULO_DIST}" + local tarballName="$(basename "$DW_ACCUMULO_DIST")" if [[ -f "${DW_ACCUMULO_SERVICE_DIR}/${tarballName}" ]]; then echo " Local: ${DW_ACCUMULO_SERVICE_DIR}/${tarballName}" else echo " Local: Not loaded" fi - echo "Source: ${DW_ZOOKEEPER_DIST_URI}" - tarballName="$(basename "$DW_ZOOKEEPER_DIST_URI")" + echo "Source: ${DW_ZOOKEEPER_DIST}" + tarballName="$(basename "$DW_ZOOKEEPER_DIST")" if [[ -f "${DW_ACCUMULO_SERVICE_DIR}/${tarballName}" ]]; then echo " Local: ${DW_ACCUMULO_SERVICE_DIR}/${tarballName}" else diff --git a/contrib/datawave-quickstart/bin/services/datawave/bootstrap-web.sh b/contrib/datawave-quickstart/bin/services/datawave/bootstrap-web.sh index 017b80d2137..a6606321d7f 100644 --- a/contrib/datawave-quickstart/bin/services/datawave/bootstrap-web.sh +++ b/contrib/datawave-quickstart/bin/services/datawave/bootstrap-web.sh @@ -1,10 +1,11 @@ # You may override DW_WILDFLY_DIST_URI in your env ahead of time, and set as file:///path/to/file.tar.gz for local tarball, if needed +DW_WILDFLY_VERSION="17.0.1" # DW_WILDFLY_DIST_URI should, if possible, be using https. There are potential security risks by using http. 
-DW_WILDFLY_DIST_URI="${DW_WILDFLY_DIST_URI:-https://download.jboss.org/wildfly/17.0.1.Final/wildfly-17.0.1.Final.tar.gz}" +DW_WILDFLY_DIST_URI="${DW_WILDFLY_DIST_URI:-https://download.jboss.org/wildfly/${DW_WILDFLY_VERSION}.Final/wildfly-${DW_WILDFLY_VERSION}.Final.tar.gz}" # The sha512 checksum for the tarball. Value should be the hash value only and does not include the file name. Cannot be left blank. DW_WILDFLY_DIST_SHA512_CHECKSUM="${DW_WILDFLY_DIST_SHA512_CHECKSUM:-fcbdff4bc275f478c3bf5f665a83e62468a920e58fcddeaa2710272dd0f1ce3154cdc371d5011763a6be24ae1a5e0bca0218cceea63543edb4b5cf22de60b485}" -DW_WILDFLY_DIST="$( downloadTarball "${DW_WILDFLY_DIST_URI}" "${DW_DATAWAVE_SERVICE_DIR}" && echo "${tarball}" )" +DW_WILDFLY_DIST="$( { downloadTarball "${DW_WILDFLY_DIST_URI}" "${DW_DATAWAVE_SERVICE_DIR}" || downloadMavenTarball "datawave-parent" "gov.nsa.datawave.quickstart" "wildfly" "${DW_WILDFLY_VERSION}" "${DW_DATAWAVE_SERVICE_DIR}"; } && echo "${tarball}" )" DW_WILDFLY_BASEDIR="wildfly-install" DW_WILDFLY_SYMLINK="wildfly" @@ -176,8 +177,8 @@ function datawaveWebDisplayBinaryInfo() { else echo " Local: Not loaded" fi - echo "Source: ${DW_WILDFLY_DIST_URI}" - local tarballName="$(basename "$DW_WILDFLY_DIST_URI")" + echo "Source: ${DW_WILDFLY_DIST}" + local tarballName="$(basename "$DW_WILDFLY_DIST")" if [[ -f "${DW_DATAWAVE_SERVICE_DIR}/${tarballName}" ]]; then echo " Local: ${DW_DATAWAVE_SERVICE_DIR}/${tarballName}" else diff --git a/contrib/datawave-quickstart/bin/services/datawave/ingest-examples/tvmaze-api-query.sh b/contrib/datawave-quickstart/bin/services/datawave/ingest-examples/tvmaze-api-query.sh index af1da19187e..69cf2d47fa5 100755 --- a/contrib/datawave-quickstart/bin/services/datawave/ingest-examples/tvmaze-api-query.sh +++ b/contrib/datawave-quickstart/bin/services/datawave/ingest-examples/tvmaze-api-query.sh @@ -38,10 +38,11 @@ TVMAZE_RESPONSE_STATUS=$( echo ${CURL_RESPONSE} | tr -d '\n' | sed -e 's/.*HTTP_ [ "${TVMAZE_RESPONSE_STATUS}" != "200" ] && error "api.tvmaze.com returned invalid response status: ${TVMAZE_RESPONSE_STATUS}" && exit 1 [ -z "${TVMAZE_RESPONSE_BODY}" ] && error "Response body is empty!" && exit 1 +PY_CMD='from __future__ import print_function; import sys,json; data=json.loads(sys.stdin.read()); print(json.dumps(data, indent=2, sort_keys=True))' if [ "${PRETTY}" == true ] ; then - echo "${TVMAZE_RESPONSE_BODY}" | python -c 'from __future__ import print_function;import sys,json;data=json.loads(sys.stdin.read()); print(json.dumps(data, indent=2, sort_keys=True))' + echo "${TVMAZE_RESPONSE_BODY}" | ( python3 -c "${PY_CMD}" 2>/dev/null || python2 -c "${PY_CMD}" 2>/dev/null ) || ( warn "Unable to pretty print, Python not detected" && echo "${TVMAZE_RESPONSE_BODY}" ) else - echo "${TVMAZE_RESPONSE_BODY}" + echo "${TVMAZE_RESPONSE_BODY}" fi exit 0 \ No newline at end of file diff --git a/contrib/datawave-quickstart/bin/services/hadoop/bootstrap.sh b/contrib/datawave-quickstart/bin/services/hadoop/bootstrap.sh index 5137be4c5a3..9f191b00402 100644 --- a/contrib/datawave-quickstart/bin/services/hadoop/bootstrap.sh +++ b/contrib/datawave-quickstart/bin/services/hadoop/bootstrap.sh @@ -2,12 +2,13 @@ DW_HADOOP_SERVICE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +DW_HADOOP_VERSION="3.3.6" # You may override DW_HADOOP_DIST_URI in your env ahead of time, and set as file:///path/to/file.tar.gz for local tarball, if needed # DW_HADOOP_DIST_URI should, if possible, be using https. There are potential security risks by using http. 
-DW_HADOOP_DIST_URI="${DW_HADOOP_DIST_URI:-https://dlcdn.apache.org/hadoop/common/hadoop-3.3.6/hadoop-3.3.6.tar.gz}" +DW_HADOOP_DIST_URI="${DW_HADOOP_DIST_URI:-https://dlcdn.apache.org/hadoop/common/hadoop-${DW_HADOOP_VERSION}/hadoop-${DW_HADOOP_VERSION}.tar.gz}" # The sha512 checksum for the tarball. Value should be the hash value only and does not include the file name. Cannot be left blank. DW_HADOOP_DIST_SHA512_CHECKSUM="${DW_HADOOP_DIST_SHA512_CHECKSUM:-de3eaca2e0517e4b569a88b63c89fae19cb8ac6c01ff990f1ff8f0cc0f3128c8e8a23db01577ca562a0e0bb1b4a3889f8c74384e609cd55e537aada3dcaa9f8a}" -DW_HADOOP_DIST="$( downloadTarball "${DW_HADOOP_DIST_URI}" "${DW_HADOOP_SERVICE_DIR}" && echo "${tarball}" )" +DW_HADOOP_DIST="$( { downloadTarball "${DW_HADOOP_DIST_URI}" "${DW_HADOOP_SERVICE_DIR}" || downloadMavenTarball "datawave-parent" "gov.nsa.datawave.quickstart" "hadoop" "${DW_HADOOP_VERSION}" "${DW_HADOOP_SERVICE_DIR}"; } && echo "${tarball}" )" DW_HADOOP_BASEDIR="hadoop-install" DW_HADOOP_SYMLINK="hadoop" @@ -223,8 +224,8 @@ function hadoopPidList() { } function hadoopDisplayBinaryInfo() { - echo "Source: ${DW_HADOOP_DIST_URI}" - local tarballName="$(basename "$DW_HADOOP_DIST_URI")" + echo "Source: ${DW_HADOOP_DIST}" + local tarballName="$(basename "$DW_HADOOP_DIST")" if [[ -f "${DW_HADOOP_SERVICE_DIR}/${tarballName}" ]]; then echo " Local: ${DW_HADOOP_SERVICE_DIR}/${tarballName}" else diff --git a/contrib/datawave-quickstart/bin/services/maven/bootstrap.sh b/contrib/datawave-quickstart/bin/services/maven/bootstrap.sh index 081ec46b0c8..61fc360a58c 100644 --- a/contrib/datawave-quickstart/bin/services/maven/bootstrap.sh +++ b/contrib/datawave-quickstart/bin/services/maven/bootstrap.sh @@ -1,16 +1,18 @@ # Sourced by env.sh DW_MAVEN_SERVICE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +DW_MAVEN_VERSION="3.8.8" # You may override DW_MAVEN_DIST_URI in your env ahead of time, and set as file:///path/to/file.tar.gz for local tarball, if needed -DW_MAVEN_DIST_URI="${DW_MAVEN_DIST_URI:-https://dlcdn.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz}" +DW_MAVEN_DIST_URI="${DW_MAVEN_DIST_URI:-https://dlcdn.apache.org/maven/maven-3/${DW_MAVEN_VERSION}/binaries/apache-maven-${DW_MAVEN_VERSION}-bin.tar.gz}" DW_MAVEN_DIST="$( basename "${DW_MAVEN_DIST_URI}" )" DW_MAVEN_BASEDIR="maven-install" DW_MAVEN_SYMLINK="maven" function bootstrapEmbeddedMaven() { - [ ! -f "${DW_MAVEN_SERVICE_DIR}/${DW_MAVEN_DIST}" ] \ - && info "Maven 3.x not detected. Attempting to bootstrap a dedicated install..." \ - && downloadTarball "${DW_MAVEN_DIST_URI}" "${DW_MAVEN_SERVICE_DIR}" + if [ ! -f "${DW_MAVEN_SERVICE_DIR}/${DW_MAVEN_DIST}" ]; then + info "Maven 3.x not detected. Attempting to bootstrap a dedicated install..." 
+ DW_MAVEN_DIST="$( { downloadTarball "${DW_MAVEN_DIST_URI}" "${DW_MAVEN_SERVICE_DIR}" || downloadMavenTarball "datawave-parent" "gov.nsa.datawave.quickstart" "maven" "${DW_MAVEN_VERSION}" "${DW_MAVEN_SERVICE_DIR}"; } && echo "${tarball}" )" + fi export MAVEN_HOME="${DW_CLOUD_HOME}/${DW_MAVEN_SYMLINK}" export M2_HOME="${MAVEN_HOME}" @@ -105,8 +107,8 @@ function mavenPrintenv() { } function mavenDisplayBinaryInfo() { - echo "Source: ${DW_MAVEN_DIST_URI}" - local tarballName="$(basename "$DW_MAVEN_DIST_URI")" + echo "Source: ${DW_MAVEN_DIST}" + local tarballName="$(basename "$DW_MAVEN_DIST")" if [[ -f "${DW_MAVEN_SERVICE_DIR}/${tarballName}" ]]; then echo " Local: ${DW_MAVEN_SERVICE_DIR}/${tarballName}" else diff --git a/contrib/datawave-quickstart/docker/Dockerfile b/contrib/datawave-quickstart/docker/Dockerfile index db84965271f..7396b0c7527 100644 --- a/contrib/datawave-quickstart/docker/Dockerfile +++ b/contrib/datawave-quickstart/docker/Dockerfile @@ -6,6 +6,13 @@ ARG DATAWAVE_JAVA_HOME ARG DATAWAVE_BUILD_PROFILE ARG DATAWAVE_SKIP_INGEST=false ARG DATAWAVE_SKIP_TESTS=false +ARG DATAWAVE_MAVEN_REPO="https://maven.pkg.github.com/NationalSecurityAgency/datawave" + +ARG ACCUMULO_URL +ARG HADOOP_URL +ARG MAVEN_URL +ARG WILDFLY_URL +ARG ZOOKEEPER_URL USER root @@ -28,7 +35,7 @@ COPY . /opt/datawave # Install dependencies, configure password-less/zero-prompt SSH... -RUN dnf -y install gcc-c++ openssl openssh openssh-server openssh-clients openssl-libs which bc wget git java-11-openjdk-devel iproute && \ +RUN dnf -y install gcc-c++ openssl python3 openssh openssh-server openssh-clients openssl-libs which bc wget git java-11-openjdk-devel iproute && \ dnf clean all && \ ssh-keygen -q -N "" -t rsa -f ~/.ssh/id_rsa && \ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \ @@ -51,9 +58,15 @@ RUN rm -f .dockerignore .maven-dockerignore && \ git config user.email "root@localhost.local" && \ git config user.name "Root User" && \ git commit -m "Source Branch :: $DATAWAVE_BRANCH_NAME :: Source Commit :: $DATAWAVE_COMMIT_ID" && \ + echo "export DW_ACCUMULO_DIST_URI=\"$ACCUMULO_URL\"" >> ~/.bashrc && \ + echo "export DW_HADOOP_DIST_URI=\"$HADOOP_URL\"" >> ~/.bashrc && \ + echo "export DW_MAVEN_DIST_URI=\"$MAVEN_URL\"" >> ~/.bashrc && \ + echo "export DW_WILDFLY_DIST_URI=\"$WILDFLY_URL\"" >> ~/.bashrc && \ + echo "export DW_ZOOKEEPER_DIST_URI=\"$ZOOKEEPER_URL\"" >> ~/.bashrc && \ echo "export DW_DATAWAVE_BUILD_PROFILE=\"$DATAWAVE_BUILD_PROFILE\"" >> ~/.bashrc && \ echo "export DW_DATAWAVE_INGEST_TEST_SKIP=\"$DATAWAVE_SKIP_INGEST\"" >> ~/.bashrc && \ - echo "export DW_WGET_OPTS=\"-q\"" >> ~/.bashrc && \ + echo "export DW_MAVEN_REPOSITORY=\"$DATAWAVE_MAVEN_REPO\"" >> ~/.bashrc && \ + echo "export DW_WGET_OPTS=\"-q --no-check-certificate\"" >> ~/.bashrc && \ echo "export JAVA_HOME=\"$DATAWAVE_JAVA_HOME\"" >> ~/.bashrc && \ echo "export PATH=\$JAVA_HOME/bin:\$PATH" >> ~/.bashrc && \ echo "source /opt/datawave/contrib/datawave-quickstart/bin/env.sh" >> ~/.bashrc diff --git a/contrib/datawave-quickstart/docker/pom.xml b/contrib/datawave-quickstart/docker/pom.xml index ceaff8362a3..ef7804f90bb 100644 --- a/contrib/datawave-quickstart/docker/pom.xml +++ b/contrib/datawave-quickstart/docker/pom.xml @@ -4,25 +4,30 @@ gov.nsa.datawave datawave-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT ../../../pom.xml quickstart pom ${project.artifactId} - - accumulo-2.1.3-bin.tar.gz - hadoop-3.3.6.tar.gz - apache-maven-3.8.8-bin.tar.gz - wildfly-17.0.1.Final.tar.gz - apache-zookeeper-3.7.2-bin.tar.gz + 
accumulo-${version.accumulo.tar.gz}-bin.tar.gz + hadoop-${version.hadoop.tar.gz}.tar.gz + apache-maven-${version.maven.tar.gz}-bin.tar.gz + wildfly-${version.wildfly.tar.gz}.Final.tar.gz + apache-zookeeper-${version.zookeeper.tar.gz}-bin.tar.gz false - https://dlcdn.apache.org/accumulo/2.1.3/${dist.accumulo} - https://dlcdn.apache.org/hadoop/common/hadoop-3.3.6/${dist.hadoop} - https://dlcdn.apache.org/maven/maven-3/3.8.8/binaries/${dist.maven} - https://download.jboss.org/wildfly/17.0.1.Final/${dist.wildfly} - https://dlcdn.apache.org/zookeeper/zookeeper-3.7.2/${dist.zookeeper} + https://dlcdn.apache.org/accumulo/${version.accumulo.tar.gz}/${dist.accumulo} + https://dlcdn.apache.org/hadoop/common/hadoop-${version.hadoop.tar.gz}/${dist.hadoop} + https://dlcdn.apache.org/maven/maven-3/${version.maven.tar.gz}/binaries/${dist.maven} + https://download.jboss.org/wildfly/${version.wildfly.tar.gz}.Final/${dist.wildfly} + https://dlcdn.apache.org/zookeeper/zookeeper-${version.zookeeper.tar.gz}/${dist.zookeeper} + + 2.1.3 + 3.3.6 + 3.8.8 + 17.0.1 + 3.7.2 @@ -31,73 +36,29 @@ ${project.version} pom + + gov.nsa.datawave + assemble-datawave + ${project.version} + dist + tar.gz + gov.nsa.datawave.webservices datawave-ws-deploy-application ${project.version} pom + + gov.nsa.datawave.webservices + datawave-ws-deploy-application + ${project.version} + ${build.env} + tar.gz + - - - com.googlecode.maven-download-plugin - download-maven-plugin - 1.6.8 - - - download-zookeeper - - wget - - - ${url.zookeeper} - ${project.build.outputDirectory} - - - - download-accumulo - - wget - - - ${url.accumulo} - ${project.build.outputDirectory} - - - - download-hadoop - - wget - - - ${url.hadoop} - ${project.build.outputDirectory} - - - - download-maven - - wget - - - ${url.maven} - ${project.build.outputDirectory} - - - - download-wildfly - - wget - - - ${url.wildfly} - ${project.build.outputDirectory} - - - - maven-clean-plugin 3.2.0 @@ -114,6 +75,8 @@ **/apache-zookeeper-*-bin.tar.gz **/accumulo-*-bin.tar.gz + **/zookeeper-*.tar.gz + **/accumulo-*.tar.gz @@ -134,101 +97,58 @@ ../bin/services/maven **/apache-maven-*-bin.tar.gz + **/maven-*.tar.gz - maven-resources-plugin + maven-dependency-plugin - copy-accumulo-tarballs + copy-datawave-warehouse - copy-resources - - process-resources - - ${project.basedir}/../bin/services/accumulo - - - ${project.build.outputDirectory} - - ${dist.zookeeper} - ${dist.accumulo} - - - - - - - copy-datawave-tarballs - - copy-resources + copy process-resources + gov.nsa.datawave:assemble-datawave:${project.version}:tar.gz:dist ${project.basedir}/../bin/services/datawave - - - ${project.basedir}/../../../warehouse/assemble/datawave/target/ - - datawave-${build.env}-${project.version}-dist.tar.gz - - - - ${project.basedir}/../../../web-services/deploy/application/target/ - - datawave-ws-deploy-application-${project.version}-${build.env}.tar.gz - - - - ${project.build.outputDirectory} - - ${dist.wildfly} - - - - copy-hadoop-tarballs + copy-datawave-webservice - copy-resources + copy process-resources - ${project.basedir}/../bin/services/hadoop - - - ${project.build.outputDirectory} - - ${dist.hadoop} - - - + gov.nsa.datawave.webservices:datawave-ws-deploy-application:${project.version}:tar.gz:${build.env} + ${project.basedir}/../bin/services/datawave + + + + + maven-antrun-plugin + - copy-maven-tarballs + rename-datawave-warehouse - copy-resources + run process-resources - ${project.basedir}/../bin/services/maven - - - ${project.build.outputDirectory} - - ${dist.maven} - - - + + + + @@ -256,7 
+176,6 @@ none ${project.basedir}/../../../ - none ${project.basedir}/Dockerfile latest @@ -268,6 +187,11 @@ ${build.env} ${skipIngest} true + ${url.accumulo} + ${url.hadoop} + ${url.maven} + ${url.wildfly} + ${url.zookeeper} @@ -323,5 +247,257 @@ + + quickstart-default + + + !quickstart-maven + + + + + + + com.googlecode.maven-download-plugin + download-maven-plugin + 1.6.8 + + + download-zookeeper + + wget + + + ${url.zookeeper} + ${project.build.outputDirectory} + + + + download-accumulo + + wget + + + ${url.accumulo} + ${project.build.outputDirectory} + + + + download-hadoop + + wget + + + ${url.hadoop} + ${project.build.outputDirectory} + + + + download-maven + + wget + + + ${url.maven} + ${project.build.outputDirectory} + + + + download-wildfly + + wget + + + ${url.wildfly} + ${project.build.outputDirectory} + + + + + + maven-resources-plugin + + + copy-accumulo-tarballs + + copy-resources + + process-resources + + ${project.basedir}/../bin/services/accumulo + + + ${project.build.outputDirectory} + + ${dist.zookeeper} + ${dist.accumulo} + + + + + + + copy-hadoop-tarballs + + copy-resources + + process-resources + + ${project.basedir}/../bin/services/hadoop + + + ${project.build.outputDirectory} + + ${dist.hadoop} + + + + + + + copy-maven-tarballs + + copy-resources + + process-resources + + ${project.basedir}/../bin/services/maven + + + ${project.build.outputDirectory} + + ${dist.maven} + + + + + + + copy-wildfly-tarball + + copy-resources + + process-resources + + ${project.basedir}/../bin/services/datawave + + + ${project.build.outputDirectory} + + ${dist.wildfly} + + + + + + + + + + + + quickstart-maven + + + quickstart-maven + + + + + gov.nsa.datawave.quickstart + accumulo + ${version.accumulo.tar.gz} + tar.gz + + + gov.nsa.datawave.quickstart + hadoop + ${version.hadoop.tar.gz} + tar.gz + + + gov.nsa.datawave.quickstart + maven + ${version.maven.tar.gz} + tar.gz + + + gov.nsa.datawave.quickstart + wildfly + ${version.wildfly.tar.gz} + tar.gz + + + gov.nsa.datawave.quickstart + zookeeper + ${version.zookeeper.tar.gz} + tar.gz + + + + + + maven-dependency-plugin + + + copy-accumulo-tarball + + copy + + process-resources + + gov.nsa.datawave.quickstart:accumulo:${version.accumulo.tar.gz}:tar.gz + ${project.basedir}/../bin/services/accumulo + + + + copy-hadoop-tarball + + copy + + process-resources + + gov.nsa.datawave.quickstart:hadoop:${version.hadoop.tar.gz}:tar.gz + ${project.basedir}/../bin/services/hadoop + + + + copy-maven-tarball + + copy + + process-resources + + gov.nsa.datawave.quickstart:maven:${version.maven.tar.gz}:tar.gz + ${project.basedir}/../bin/services/maven + + + + copy-wildfly-tarball + + copy + + process-resources + + gov.nsa.datawave.quickstart:wildfly:${version.wildfly.tar.gz}:tar.gz + ${project.basedir}/../bin/services/datawave + + + + copy-zookeeper-tarball + + copy + + process-resources + + gov.nsa.datawave.quickstart:zookeeper:${version.zookeeper.tar.gz}:tar.gz + ${project.basedir}/../bin/services/accumulo + + + + + + + - + \ No newline at end of file diff --git a/core/base-rest-responses b/core/base-rest-responses index a5fbf0ce72e..cb0f550615e 160000 --- a/core/base-rest-responses +++ b/core/base-rest-responses @@ -1 +1 @@ -Subproject commit a5fbf0ce72e5a1592b702d7913edc4a64d08f89f +Subproject commit cb0f550615ecd1d9fe1db7b17bb671db861301ff diff --git a/core/cached-results/pom.xml b/core/cached-results/pom.xml index 8ef7827516b..f7f8d9f729f 100644 --- a/core/cached-results/pom.xml +++ b/core/cached-results/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core 
datawave-core-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-core-cached-results ${project.artifactId} diff --git a/core/common-util/pom.xml b/core/common-util/pom.xml index 316e69710cc..1314bf529f4 100644 --- a/core/common-util/pom.xml +++ b/core/common-util/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-core-common-util ${project.artifactId} diff --git a/core/common/pom.xml b/core/common/pom.xml index 98426209601..9839f80f8d8 100644 --- a/core/common/pom.xml +++ b/core/common/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-core-common ${project.artifactId} diff --git a/core/connection-pool/pom.xml b/core/connection-pool/pom.xml index 508695b9952..d451af16cca 100644 --- a/core/connection-pool/pom.xml +++ b/core/connection-pool/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-core-connection-pool ${project.artifactId} diff --git a/core/in-memory-accumulo b/core/in-memory-accumulo index da2a22dbbe3..8a9d2f46d20 160000 --- a/core/in-memory-accumulo +++ b/core/in-memory-accumulo @@ -1 +1 @@ -Subproject commit da2a22dbbe368527808a22e5323c00f94f1c4828 +Subproject commit 8a9d2f46d2012d4493baff5e8dc9f08f45f746d5 diff --git a/core/map-reduce/pom.xml b/core/map-reduce/pom.xml index 2a8593d2086..fcdadfa5b0e 100644 --- a/core/map-reduce/pom.xml +++ b/core/map-reduce/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-core-map-reduce ${project.artifactId} diff --git a/core/metrics-reporter b/core/metrics-reporter index cd2fab0762c..992378d6294 160000 --- a/core/metrics-reporter +++ b/core/metrics-reporter @@ -1 +1 @@ -Subproject commit cd2fab0762c3c80ab28c21e1cbc76958b6345eae +Subproject commit 992378d62946730d2ee799606276adca9522e050 diff --git a/core/modification/pom.xml b/core/modification/pom.xml index e7abf8ceb70..d42c12869d9 100644 --- a/core/modification/pom.xml +++ b/core/modification/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-core-modification ${project.artifactId} diff --git a/core/pom.xml b/core/pom.xml index ba7b1f6ff67..cd285f42816 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT gov.nsa.datawave.core datawave-core-parent diff --git a/core/query/pom.xml b/core/query/pom.xml index 83a2731b80c..83130c6a45f 100644 --- a/core/query/pom.xml +++ b/core/query/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-core-query ${project.artifactId} diff --git a/core/query/src/main/java/datawave/core/query/configuration/GenericQueryConfiguration.java b/core/query/src/main/java/datawave/core/query/configuration/GenericQueryConfiguration.java index deadd498218..9368856f9fb 100644 --- a/core/query/src/main/java/datawave/core/query/configuration/GenericQueryConfiguration.java +++ b/core/query/src/main/java/datawave/core/query/configuration/GenericQueryConfiguration.java @@ -77,6 +77,9 @@ public class GenericQueryConfiguration implements Serializable { // either IMMEDIATE or EVENTUAL private Map tableConsistencyLevels = new HashMap<>(); + // provides default scan hints + // NOTE: accumulo reserves the execution hint name 'meta' + // NOTE: datawave reserves the execution hint name 'expansion' for index expansion private Map> tableHints = new HashMap<>(); /** 
diff --git a/core/query/src/main/java/datawave/core/query/logic/composite/CompositeQueryLogic.java b/core/query/src/main/java/datawave/core/query/logic/composite/CompositeQueryLogic.java index a60cc62e0ce..e4a6c022328 100644 --- a/core/query/src/main/java/datawave/core/query/logic/composite/CompositeQueryLogic.java +++ b/core/query/src/main/java/datawave/core/query/logic/composite/CompositeQueryLogic.java @@ -16,7 +16,6 @@ import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.security.Authorizations; -import org.apache.commons.collections4.functors.NOPTransformer; import org.apache.commons.collections4.iterators.TransformIterator; import org.apache.log4j.Logger; @@ -250,8 +249,6 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting StringBuilder logicQueryStringBuilder = new StringBuilder(); if (!getInitializedLogics().isEmpty()) { logicQueryStringBuilder.append(getConfig().getQueryString()); - } else { - logicQueryStringBuilder.append("CompositeQueryLogic: "); } Map exceptions = new HashMap<>(); @@ -266,12 +263,17 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting if (logicQueryStringBuilder.length() > 0) { logicQueryStringBuilder.append(" || "); } - logicQueryStringBuilder.append("( ( logic = '").append(logicName).append("' )").append(" && "); + + logicQueryStringBuilder.append("( "); + logicQueryStringBuilder.append("( logic = '").append(logicName).append("' )"); try { // duplicate the settings for this query Query settingsCopy = settings.duplicate(settings.getQueryName() + " -> " + logicName); + // ensure we use the same query id + settingsCopy.setId(settings.getId()); + // update the query auths and runtime query authorizations for this logic runtimeQueryAuthorizations = updateRuntimeAuthorizationsAndQueryAuths(logic, settingsCopy); @@ -280,9 +282,9 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting // only add this query logic to the initialized logic states if it was not simply filtered out if (logic instanceof FilteredQueryLogic && ((FilteredQueryLogic) logic).isFiltered()) { log.info("Dropping " + logic.getLogicName() + " as it was filtered out"); - logicQueryStringBuilder.append("( filtered = true )"); + logicQueryStringBuilder.append(" && ").append("( filtered = true )"); } else { - logicQueryStringBuilder.append(config.getQueryString()); + logicQueryStringBuilder.append(" && ").append(config.getQueryString()); QueryLogicHolder holder = new QueryLogicHolder(logicName, logic); holder.setSettings(settingsCopy); holder.setMaxResults(logic.getResultLimit(settingsCopy)); @@ -298,7 +300,7 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting } catch (Exception e) { exceptions.put(logicName, e); log.error("Failed to initialize " + logic.getClass().getName(), e); - logicQueryStringBuilder.append("( failure = '").append(e.getMessage()).append("' )"); + logicQueryStringBuilder.append(" && ").append("( failure = '").append(e.getMessage()).append("' )"); failedQueryLogics.put(logicName, logic); } finally { queryLogics.remove(next.getKey()); @@ -427,22 +429,25 @@ public void setConnectionPriority(String priority) { */ @Override public synchronized QueryLogicTransformer getTransformer(Query settings) { - ResultsPage emptyList = new ResultsPage(); - Class responseClass = null; - List delegates = new ArrayList<>(); - for (QueryLogic logic : getQueryLogics().values()) { - QueryLogicTransformer t = logic.getTransformer(settings); - 
delegates.add(t); - BaseResponse refResponse = t.createResponse(emptyList); - if (null == responseClass) { - responseClass = refResponse.getClass(); - } else { - if (!responseClass.equals(refResponse.getClass())) { - throw new RuntimeException("All query logics must use transformers that return the same object type"); + if (this.transformer == null) { + ResultsPage emptyList = new ResultsPage(); + Class responseClass = null; + List delegates = new ArrayList<>(); + for (QueryLogic logic : getQueryLogics().values()) { + QueryLogicTransformer t = logic.getTransformer(settings); + delegates.add(t); + BaseResponse refResponse = t.createResponse(emptyList); + if (null == responseClass) { + responseClass = refResponse.getClass(); + } else { + if (!responseClass.equals(refResponse.getClass())) { + throw new RuntimeException("All query logics must use transformers that return the same object type: " + responseClass + " vs " + + refResponse.getClass()); + } } } + this.transformer = new CompositeQueryLogicTransformer(delegates); } - this.transformer = new CompositeQueryLogicTransformer(delegates); return this.transformer; } @@ -457,8 +462,8 @@ public TransformIterator getTransformIterator(Query settings) { return Iterables.getOnlyElement(queryLogics.values()).getTransformIterator(settings); } else { // The objects put into the pageQueue have already been transformed. - // We will iterate over the pagequeue with the No-Op transformer - return new TransformIterator(results.iterator(), NOPTransformer.nopTransformer()); + // CompositeQueryLogicTransformer will iterate over the pageQueue with no change to the objects + return new TransformIterator(results.iterator(), getTransformer(settings)); } } diff --git a/core/query/src/main/java/datawave/core/query/remote/RemoteQueryService.java b/core/query/src/main/java/datawave/core/query/remote/RemoteQueryService.java index aff8427ac1d..f996f9c83c6 100644 --- a/core/query/src/main/java/datawave/core/query/remote/RemoteQueryService.java +++ b/core/query/src/main/java/datawave/core/query/remote/RemoteQueryService.java @@ -28,6 +28,14 @@ public interface RemoteQueryService { */ GenericResponse createQuery(String queryLogicName, Map> queryParameters, ProxiedUserDetails callerObject) throws QueryException; + /** + * Set the class for the next response. The default is to use the event query response but to make this useful for other query services we need to be able + * to override. 
+ * + * @param nextQueryResponseClass + */ + void setNextQueryResponseClass(Class nextQueryResponseClass); + /** * Call next on a remote query service * diff --git a/core/utils/accumulo-utils b/core/utils/accumulo-utils index bc973961511..638b3eda970 160000 --- a/core/utils/accumulo-utils +++ b/core/utils/accumulo-utils @@ -1 +1 @@ -Subproject commit bc973961511f7f9eaaf9a500fb015234188d3a72 +Subproject commit 638b3eda97016bb66a7d014112b215075aac212e diff --git a/core/utils/common-utils b/core/utils/common-utils index 2810ed2bdd7..c96ed213426 160000 --- a/core/utils/common-utils +++ b/core/utils/common-utils @@ -1 +1 @@ -Subproject commit 2810ed2bdd7733b7ec98fb4bf470cc070443f5bc +Subproject commit c96ed21342666db82b9e92ccf676d0987cb7ff8e diff --git a/core/utils/metadata-utils b/core/utils/metadata-utils index 212507ce17b..6169ce38337 160000 --- a/core/utils/metadata-utils +++ b/core/utils/metadata-utils @@ -1 +1 @@ -Subproject commit 212507ce17b255e855ceed209e48a19c736ed5c3 +Subproject commit 6169ce38337a31e9ef0a68285b544f7459010ab7 diff --git a/core/utils/pom.xml b/core/utils/pom.xml index 396fb42cbcb..ba22ec0dd00 100644 --- a/core/utils/pom.xml +++ b/core/utils/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.core datawave-core-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT gov.nsa.datawave.core datawave-utils-parent diff --git a/core/utils/type-utils b/core/utils/type-utils index 79a07173f3b..55d92d5d99c 160000 --- a/core/utils/type-utils +++ b/core/utils/type-utils @@ -1 +1 @@ -Subproject commit 79a07173f3b359abd068421a5fe113087fc5a834 +Subproject commit 55d92d5d99c6e232ba1d7ad12c210ded9ec240a9 diff --git a/docker/README.md b/docker/README.md index ece924ddc3b..14187956565 100644 --- a/docker/README.md +++ b/docker/README.md @@ -11,6 +11,7 @@ out the [prereqs](#prereqs) at a minimum): git submodule update --init --recursive # build docker images for datawave and all of the microservices +# optionally include '-Dquickstart-maven' to download accumulo/zookeeper/hadoop/maven tarballs from the maven repository mvn -Pcompose -Dmicroservice-docker -Dquickstart-docker -Ddeploy -Dtar -Ddist -DskipTests clean install # bootstrap the services, and bring them up using docker compose @@ -135,6 +136,17 @@ Enabled via the 'dictionary', or 'full' profile. You will need to build the docker image for this service on your local machine following the instructions in the dictionary service README. +### File Provider + +Enabled via the 'file-provider', or 'full' profile. + +This microservice is in development, and can be found in this repo. + +[Datawave File Provider Service](https://github.com/NationalSecurityAgency/datawave-file-provider-service/tree/main) provides file management and access to Datawave and it's services. + +You will need to build the docker image for this service on your local machine following the instructions in the file provider service README. + + ## Usage Please read through these instructions in their entirety before attempting to build or deploy Datawave. @@ -207,7 +219,7 @@ export DW_BIND_HOST=0.0.0.0 This will ensure that Hadoop binds to all interfaces, and that Accumulo binds to the hostname/IP address. This is required to connect to the host Accumulo instance from a docker container. -What follows is a brief description of how to setup and run the Datawave Quickstart. For more detailed information see the [DataWave Quickstart Readme](../../contrib/datawave-quickstart/README.md). +What follows is a brief description of how to setup and run the Datawave Quickstart. 
For more detailed information see the [DataWave Quickstart Readme](../contrib/datawave-quickstart/README.md). ``` # Add the quickstart env.sh to your .bashrc @@ -327,6 +339,10 @@ Start the default services, the kafka services, and the dictionary service: ```docker compose --profile quickstart --profile dictionary --profile kafka up -d``` +Start the default services, and the file provider service: + +```docker compose --profile quickstart --profile file-provider up -d``` + Start all services: ```docker compose --profile quickstart --profile full up -d``` diff --git a/docker/config/application-query.yml b/docker/config/application-query.yml index 552ff8bf433..3b8fc5d024a 100755 --- a/docker/config/application-query.yml +++ b/docker/config/application-query.yml @@ -84,6 +84,9 @@ warehouse: edgeModelName: 'DATAWAVE_EDGE' datawave: + connection: + factory: + defaultPool: 'WAREHOUSE' metadata: all-metadata-auths: - PRIVATE,PUBLIC @@ -91,6 +94,14 @@ datawave: "[datawave.data.type.DateType]": "datawave.data.type.RawDateType" query: + poolLimits: + 'pool1': &defaultPoolLimits + maxQueriesPerExecutor: + 'WAREHOUSE': 40 + 'UUID': 20 + livenessTimeout: 90 + livenessTimeoutUnit: SECONDS + 'pool2': *defaultPoolLimits parser: skipTokenizeUnfieldedFields: - "DOMETA" @@ -516,59 +527,45 @@ datawave: - fieldName: "SOURCE" modelFieldName: "VERTEXA" direction: "REVERSE" - indexOnly: false - fieldName: "SOURCE" modelFieldName: "VERTEXA" direction: "FORWARD" - indexOnly: false - fieldName: "SINK" modelFieldName: "VERTEXB" direction: "REVERSE" - indexOnly: false - fieldName: "SINK" modelFieldName: "VERTEXB" direction: "FORWARD" - indexOnly: false - fieldName: "RELATION" modelFieldName: "RELATION" direction: "REVERSE" - indexOnly: false - fieldName: "RELATION" modelFieldName: "RELATION" direction: "FORWARD" - indexOnly: false - fieldName: "TYPE" modelFieldName: "TYPE" direction: "REVERSE" - indexOnly: false - fieldName: "TYPE" modelFieldName: "TYPE" direction: "FORWARD" - indexOnly: false - fieldName: "ATTRIBUTE1" modelFieldName: "ATTR1" direction: "REVERSE" - indexOnly: false - fieldName: "ATTRIBUTE1" modelFieldName: "ATTR1" direction: "FORWARD" - indexOnly: false - fieldName: "ATTRIBUTE2" modelFieldName: "ATTR2" direction: "REVERSE" - indexOnly: false - fieldName: "ATTRIBUTE2" modelFieldName: "ATTR2" direction: "FORWARD" - indexOnly: false - fieldName: "ATTRIBUTE3" modelFieldName: "ATTR3" direction: "REVERSE" - indexOnly: false - fieldName: "ATTRIBUTE3" modelFieldName: "ATTR3" direction: "FORWARD" - indexOnly: false # Enable additional Hazelcast cluster for use by the query and executor services hazelcast: diff --git a/docker/config/application-querymessaging.yml b/docker/config/application-querymessaging.yml index fd5b0c97e70..8845ce8340c 100755 --- a/docker/config/application-querymessaging.yml +++ b/docker/config/application-querymessaging.yml @@ -7,5 +7,21 @@ datawave: backend: ${messaging.backend} rabbitmq: maxMessageSizeBytes: ${messaging.maxMessageSizeBytes} + # enable the following configuration if you want to use an independent, dedicated rabbitmq cluster for query (i.e. not the default spring one) + useDedicatedInstance: ${USE_DEDICATED_INSTANCE:false} + instanceSettings: + host: ${QUERY_RABBIT_HOST:query-rabbitmq} + port: ${QUERY_RABBIT_PORT:5672} + publisherConfirmType: SIMPLE + # Note - spring doesn't like it when you enable publisherConfirms for the SIMPLE confirm type... 
+ publisherConfirms: false + publisherReturns: true kafka: partitions: 2 + # enable the following configuration if you want to use an independent, dedicated kafka cluster for query (i.e. not the default spring one) + useDedicatedInstance: ${USE_DEDICATED_INSTANCE:false} + instanceSettings: + bootstrapServers: ${QUERY_KAFKA_HOST:query-kafka}:${QUERY_KAFKA_PORT:9092} + autoOffsetReset: earliest + enableAutoCommit: false + allowAutoCreateTopics: false diff --git a/docker/config/executor-pool1.yml b/docker/config/executor-pool1.yml index 8850c4ae636..d77972592bc 100755 --- a/docker/config/executor-pool1.yml +++ b/docker/config/executor-pool1.yml @@ -6,15 +6,3 @@ datawave: swagger: title: "Query Executor Service (Pool 1)" description: "REST API provided by the Query Executor Service" - connection: - factory: - pools: - 'pool1': - zookeepers: '${accumulo.zookeepers}' - instance: '${accumulo.instanceName}' - username: '${accumulo.username}' - password: '${accumulo.password}' - lowPriorityPoolSize: 40 - normalPriorityPoolSize: 40 - highPriorityPoolSize: 40 - adminPriorityPoolSize: 40 diff --git a/docker/config/executor-pool2.yml b/docker/config/executor-pool2.yml index a4757d93235..e643ce1f4dc 100755 --- a/docker/config/executor-pool2.yml +++ b/docker/config/executor-pool2.yml @@ -6,15 +6,3 @@ datawave: swagger: title: "Query Executor Service (Pool 2)" description: "REST API provided by the Query Executor Service" - connection: - factory: - pools: - 'pool2': - zookeepers: '${accumulo.zookeepers}' - instance: '${accumulo.instanceName}' - username: '${accumulo.username}' - password: '${accumulo.password}' - lowPriorityPoolSize: 40 - normalPriorityPoolSize: 40 - highPriorityPoolSize: 40 - adminPriorityPoolSize: 40 diff --git a/docker/config/executor.yml b/docker/config/executor.yml index da54229b34d..a82a46b1e55 100755 --- a/docker/config/executor.yml +++ b/docker/config/executor.yml @@ -26,7 +26,7 @@ datawave: zookeepers: '${accumulo.zookeepers}' tableNames: - '${warehouse.tables.metadata.name}' - poolName: 'default' + poolName: '${datawave.connection.factory.defaultPool}' reloadInterval: 360000 evictionReaperIntervalInSeconds: 360 numLocks: 3 @@ -34,9 +34,8 @@ datawave: reload-crontab: '* * * * * ?' connection: factory: - defaultPool: "default" pools: - "default": + "WAREHOUSE": zookeepers: '${accumulo.zookeepers}' instance: '${accumulo.instanceName}' username: '${accumulo.username}' @@ -45,6 +44,15 @@ datawave: normalPriorityPoolSize: 40 highPriorityPoolSize: 40 adminPriorityPoolSize: 40 + "UUID": + zookeepers: '${accumulo.zookeepers}' + instance: '${accumulo.instanceName}' + username: '${accumulo.username}' + password: '${accumulo.password}' + lowPriorityPoolSize: 20 + normalPriorityPoolSize: 20 + highPriorityPoolSize: 20 + adminPriorityPoolSize: 20 query: executor: pool: "${executor.poolName}" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index c6a0b8ac4b9..a44a40a1859 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -206,6 +206,50 @@ services: configuration: condition: service_started + query-rabbitmq: + profiles: + - queryrabbit + # To enable an additional rabbit cluster, enable the "queryrabbit" profile + # Set an environment variable: USE_DEDICATED_INSTANCE=true. This will force the query and executor service to use a separate "query-rabbitmq" rabbit cluster. 
+ image: docker.io/rabbitmq:3.12.4 + volumes: + - ${RABBITMQ_CONFIG_DIR:-./rabbitmq-query-config}:/etc/rabbitmq + - ./logs:/logs + environment: + - TCP_PORTS=15672,5672 + - RABBITMQ_ERLANG_COOKIE="someothercookie" + ports: + - "15673:15672" + networks: + - demo + depends_on: + consul: + condition: service_started + + # When auto.create.topics.enable is true, this causes deleted topics to be recreated at random. So, leave it disabled. + query-kafka: + profiles: + - querykafka + # To enable an additional rabbit cluster, enable the "querykafka" profile + # Set an environment variable: USE_DEDICATED_INSTANCE=true. This will force the query and executor service to use a separate "query-kafka" kafka cluster. + image: docker.io/bitnami/kafka:3.2 + ports: + - "9095:9095" + networks: + - demo + environment: + - KAFKA_CFG_NODE_ID=1 + - KAFKA_CFG_PROCESS_ROLES=controller,broker + - ALLOW_PLAINTEXT_LISTENER=yes + - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CLIENT:PLAINTEXT,CONTROLLER:PLAINTEXT,EXTERNAL:PLAINTEXT + - KAFKA_CFG_LISTENERS=CLIENT://:9092,CONTROLLER://:9093,EXTERNAL://:9095 + - KAFKA_CFG_ADVERTISED_LISTENERS=CLIENT://query-kafka:9092,EXTERNAL://${DW_HOSTNAME}:9095 + - KAFKA_CFG_CONTROLLER_QUORUM_VOTERS=1@query-kafka:9093 + - KAFKA_CFG_CONTROLLER_LISTENER_NAMES=CONTROLLER + - KAFKA_INTER_BROKER_LISTENER_NAME=CLIENT + - KAFKA_CFG_AUTO_CREATE_TOPICS_ENABLE=false + - KAFKA_CFG_DELETE_TOPICS_ENABLE=true + authorization: entrypoint: [ "java","-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5008","-jar","app.jar" ] image: datawave/authorization-service @@ -369,6 +413,25 @@ services: authorization: condition: service_healthy + file-provider: + profiles: + - file-provider + - full + image: datawave/file-provider-service + command: + - --spring.output.ansi.enabled=ALWAYS + - --spring.profiles.active=consul,compose,remoteauth + - --spring.cloud.consul.host=consul + - --spring.cloud.consul.discovery.instance-id=$${spring.application.name}:$${random.value} + ports: + - "8580:8080" + - "8943:8443" + volumes: + - ${PKI_DIR:-./pki}:/etc/pki:ro + - ./logs:/logs + networks: + - demo + # If you want to test cached results, enable the cachedresults profile mysql: profiles: @@ -402,6 +465,7 @@ services: - "BACKEND=${BACKEND:-rabbitmq}" - CACHED_RESULTS=${CACHED_RESULTS:-false} - QUERY_CACHE=${QUERY_CACHE:-cache} + - USE_DEDICATED_INSTANCE=${USE_DEDICATED_INSTANCE:-false} ports: - "8080:8080" - "8443:8443" @@ -482,6 +546,7 @@ services: - BACKEND=${BACKEND:-rabbitmq} - HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf} - QUERY_CACHE=${QUERY_CACHE:-cache} + - USE_DEDICATED_INSTANCE=${USE_DEDICATED_INSTANCE:-false} # This mapping is required to enable the metrics service to communicate # with host-deployed services like hadoop, zookeeper, and accumulo. # These values are set locally in .env via bootstrap.sh @@ -530,6 +595,7 @@ services: - BACKEND=${BACKEND:-rabbitmq} - HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf} - QUERY_CACHE=${QUERY_CACHE:-cache} + - USE_DEDICATED_INSTANCE=${USE_DEDICATED_INSTANCE:-false} # This mapping is required to enable the metrics service to communicate # with host-deployed services like hadoop, zookeeper, and accumulo. # These values are set locally in .env via bootstrap.sh diff --git a/docker/rabbitmq-query-config/enabled_plugins b/docker/rabbitmq-query-config/enabled_plugins new file mode 100755 index 00000000000..90fdaa378e5 --- /dev/null +++ b/docker/rabbitmq-query-config/enabled_plugins @@ -0,0 +1 @@ +[rabbitmq_management]. 
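The compose comments above describe how the optional dedicated query messaging clusters are switched on; what follows is a minimal sketch of that workflow, assuming the standard quickstart profile is in use and that `kafka` is the accepted value for the existing `BACKEND` variable (the profile names, service names, and `USE_DEDICATED_INSTANCE` come straight from the compose file, everything else is illustrative).

```
# Sketch: route query/executor messaging to a dedicated RabbitMQ cluster.
# USE_DEDICATED_INSTANCE is read by the web service, executor, and
# querymessaging config; the queryrabbit profile adds the query-rabbitmq container.
export USE_DEDICATED_INSTANCE=true
docker compose --profile quickstart --profile queryrabbit up -d

# Kafka variant (assumes BACKEND=kafka selects the kafka messaging backend):
export BACKEND=kafka
docker compose --profile quickstart --profile querykafka up -d
```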
\ No newline at end of file diff --git a/docker/rabbitmq-query-config/rabbitmq.conf b/docker/rabbitmq-query-config/rabbitmq.conf new file mode 100755 index 00000000000..27f4f5b8fa3 --- /dev/null +++ b/docker/rabbitmq-query-config/rabbitmq.conf @@ -0,0 +1,3 @@ +cluster_partition_handling = autoheal +# Enable the guest user +loopback_users.guest = false \ No newline at end of file diff --git a/docker/scripts/common/query.sh b/docker/scripts/common/query.sh index 550159d0050..f46cbb19268 100755 --- a/docker/scripts/common/query.sh +++ b/docker/scripts/common/query.sh @@ -65,7 +65,7 @@ runQuery() { -H "Pool: $POOL" \ ${DATAWAVE_ENDPOINT}/$QUERY_ID/next -o nextResponse_$i.xml -w '%{http_code}\n' >> querySummary.txt - CONTINUE=`grep 'HTTP/2 200' headers_$i.txt` + CONTINUE=`grep 'HTTP/.* 200' headers_$i.txt` if [ -z "$CONTINUE" ]; then i=-1 diff --git a/docs/pom.xml b/docs/pom.xml index 6e7a2b29b05..75a33ea38b8 100644 --- a/docs/pom.xml +++ b/docs/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-docs diff --git a/microservices/microservice-parent b/microservices/microservice-parent index 94e402333c1..6207c9d6576 160000 --- a/microservices/microservice-parent +++ b/microservices/microservice-parent @@ -1 +1 @@ -Subproject commit 94e402333c16767ae91c01b60a9ff66b5aaafda1 +Subproject commit 6207c9d65768c191773099a6f39f2b935aa52acd diff --git a/microservices/microservice-service-parent b/microservices/microservice-service-parent index 65cda7b2c52..8064d20ccf5 160000 --- a/microservices/microservice-service-parent +++ b/microservices/microservice-service-parent @@ -1 +1 @@ -Subproject commit 65cda7b2c526af5a3a9791b85a88dbd2422bf690 +Subproject commit 8064d20ccf5fb48dbf1d309503ffafa8ddaafb6c diff --git a/microservices/pom.xml b/microservices/pom.xml index 5b369fa08c1..722195928eb 100644 --- a/microservices/pom.xml +++ b/microservices/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT gov.nsa.datawave.microservice datawave-microservice-build-parent diff --git a/microservices/services/accumulo b/microservices/services/accumulo index 9a0b8183c7f..7ecb4dd5602 160000 --- a/microservices/services/accumulo +++ b/microservices/services/accumulo @@ -1 +1 @@ -Subproject commit 9a0b8183c7fb0b9f6411426ef407ce0b350a60cc +Subproject commit 7ecb4dd5602395e4f41a48673cd5cc9e3e966f10 diff --git a/microservices/services/audit b/microservices/services/audit index aa90b87636f..2714c97b19f 160000 --- a/microservices/services/audit +++ b/microservices/services/audit @@ -1 +1 @@ -Subproject commit aa90b87636fb9cfbf817cbd619baef7b0d268c4f +Subproject commit 2714c97b19fdc9635fc7f18e3d1489e6d92a017d diff --git a/microservices/services/authorization b/microservices/services/authorization index 87bedba80c7..292be85b633 160000 --- a/microservices/services/authorization +++ b/microservices/services/authorization @@ -1 +1 @@ -Subproject commit 87bedba80c7b9822fa78b7e6826fc9fb169f018d +Subproject commit 292be85b633c6f0f99ae9b3b382a4373ffc720a7 diff --git a/microservices/services/config b/microservices/services/config index fc5c26bd5b1..fc187c0e4b9 160000 --- a/microservices/services/config +++ b/microservices/services/config @@ -1 +1 @@ -Subproject commit fc5c26bd5b155c88bbf297552943d5a93b4d69d7 +Subproject commit fc187c0e4b90c3eaea16dad3ba9ba5330262bcf7 diff --git a/microservices/services/dictionary b/microservices/services/dictionary index e83b2fb9d83..653cf5b5963 160000 --- a/microservices/services/dictionary +++ 
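The relaxed status check in the query.sh hunk above accounts for curl negotiating either HTTP/1.1 or HTTP/2 with the web service, and the two protocols print different status lines; a small illustration (the header file contents below are fabricated for the example):

```
# Status lines curl can write to the dumped headers file, depending on protocol:
printf 'HTTP/1.1 200 OK\n' > headers_1.txt   # HTTP/1.1 keeps the reason phrase
printf 'HTTP/2 200\n'     >> headers_1.txt   # HTTP/2 drops the reason phrase

# The old pattern 'HTTP/2 200' missed HTTP/1.1 responses; the relaxed pattern matches both.
grep 'HTTP/.* 200' headers_1.txt
```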
b/microservices/services/dictionary @@ -1 +1 @@ -Subproject commit e83b2fb9d831b5d2c2e571f8a59762861e39949c +Subproject commit 653cf5b59634151b3eea881133fe604a0050df6e diff --git a/microservices/services/file-provider b/microservices/services/file-provider new file mode 160000 index 00000000000..27080f3b943 --- /dev/null +++ b/microservices/services/file-provider @@ -0,0 +1 @@ +Subproject commit 27080f3b943722f84aaa8af93e4fda7b41b50bd9 diff --git a/microservices/services/hazelcast b/microservices/services/hazelcast index cd332b47623..8abc2ef7e91 160000 --- a/microservices/services/hazelcast +++ b/microservices/services/hazelcast @@ -1 +1 @@ -Subproject commit cd332b47623b86506b07a3a1612be51187115dd3 +Subproject commit 8abc2ef7e91c90bee920b129f67d540c61217f0a diff --git a/microservices/services/map b/microservices/services/map new file mode 160000 index 00000000000..473ec437082 --- /dev/null +++ b/microservices/services/map @@ -0,0 +1 @@ +Subproject commit 473ec437082e661f51132a9254877b6bb27def84 diff --git a/microservices/services/mapreduce-query b/microservices/services/mapreduce-query index 036eb1673ff..22af8f93c88 160000 --- a/microservices/services/mapreduce-query +++ b/microservices/services/mapreduce-query @@ -1 +1 @@ -Subproject commit 036eb1673ffd7c6ad4d5731b5c6bc2d8e342b79e +Subproject commit 22af8f93c887db097c09078a477323be7877b184 diff --git a/microservices/services/modification b/microservices/services/modification index 6cc297b4ff3..768007f3195 160000 --- a/microservices/services/modification +++ b/microservices/services/modification @@ -1 +1 @@ -Subproject commit 6cc297b4ff30f14bf7d66b883228c40436b79dac +Subproject commit 768007f3195f688149666ba8a2a10b3d56d30fbf diff --git a/microservices/services/pom.xml b/microservices/services/pom.xml index 4b12a813c73..209db1ba076 100644 --- a/microservices/services/pom.xml +++ b/microservices/services/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.microservice datawave-microservice-build-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-microservice-service-build-parent pom @@ -66,6 +66,17 @@ dictionary + + submodule-service-file-provider + + + file-provider/pom.xml + + + + file-provider + + submodule-service-hazelcast diff --git a/microservices/services/query b/microservices/services/query index bb943107feb..14c65febd3d 160000 --- a/microservices/services/query +++ b/microservices/services/query @@ -1 +1 @@ -Subproject commit bb943107febc28bc21a28bef4616fc49fe5c24e9 +Subproject commit 14c65febd3d10548b972f8d3d213f82f9fa86918 diff --git a/microservices/services/query-executor b/microservices/services/query-executor index 3171b32192e..4517c4c29ea 160000 --- a/microservices/services/query-executor +++ b/microservices/services/query-executor @@ -1 +1 @@ -Subproject commit 3171b32192e5034d97b5d38f5a56574b18579415 +Subproject commit 4517c4c29ea3c79499a306d7986bb81a00f12517 diff --git a/microservices/services/query-metric b/microservices/services/query-metric index 8281bfc5d3a..4bce0c89219 160000 --- a/microservices/services/query-metric +++ b/microservices/services/query-metric @@ -1 +1 @@ -Subproject commit 8281bfc5d3a608974e131ea304b325200fa7ff8e +Subproject commit 4bce0c89219d8bb7901f1e9c4993460a7bd50452 diff --git a/microservices/starters/audit b/microservices/starters/audit index b2bf281813f..ef18c9e6521 160000 --- a/microservices/starters/audit +++ b/microservices/starters/audit @@ -1 +1 @@ -Subproject commit b2bf281813fc83c15fb0aa14505ce3e65ba15f91 +Subproject commit ef18c9e6521c36a8fa64cb40a266cf6e532b0e64 diff --git 
a/microservices/starters/cache b/microservices/starters/cache index 17c220c186b..30196007910 160000 --- a/microservices/starters/cache +++ b/microservices/starters/cache @@ -1 +1 @@ -Subproject commit 17c220c186bcaf68e42d205b4f45bedb16961634 +Subproject commit 3019600791021114e50b387cc312c97375b979ff diff --git a/microservices/starters/cached-results b/microservices/starters/cached-results index 9f83446ca5d..b22a3d6a17a 160000 --- a/microservices/starters/cached-results +++ b/microservices/starters/cached-results @@ -1 +1 @@ -Subproject commit 9f83446ca5d7392e8643b63a92e249f49afcc5c2 +Subproject commit b22a3d6a17a7f4eddfd5dc8e205e937b294e1c3c diff --git a/microservices/starters/datawave b/microservices/starters/datawave index 2baa5f42a43..081e0a028a8 160000 --- a/microservices/starters/datawave +++ b/microservices/starters/datawave @@ -1 +1 @@ -Subproject commit 2baa5f42a4369ba583c359c5e2cbf8e38f1a59b6 +Subproject commit 081e0a028a85ab562a1fa419a590310bed030a7b diff --git a/microservices/starters/metadata b/microservices/starters/metadata index 8cac03428ac..28337e6c230 160000 --- a/microservices/starters/metadata +++ b/microservices/starters/metadata @@ -1 +1 @@ -Subproject commit 8cac03428ac7c05e8a59f0295824da60d2bb552e +Subproject commit 28337e6c2306b44a888a4f6ba7825f268ab6ff18 diff --git a/microservices/starters/pom.xml b/microservices/starters/pom.xml index b6c4a6d8744..d3c602386bc 100644 --- a/microservices/starters/pom.xml +++ b/microservices/starters/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.microservice datawave-microservice-build-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-microservice-starter-build-parent pom diff --git a/microservices/starters/query b/microservices/starters/query index fd0977cd461..ae730eab061 160000 --- a/microservices/starters/query +++ b/microservices/starters/query @@ -1 +1 @@ -Subproject commit fd0977cd46145abe9d324088c5bc46b5bec141cd +Subproject commit ae730eab0610c414bdfefbcaa0e072ec2751fa72 diff --git a/microservices/starters/query-metric b/microservices/starters/query-metric index 5df8fd573e1..fad3ed7859f 160000 --- a/microservices/starters/query-metric +++ b/microservices/starters/query-metric @@ -1 +1 @@ -Subproject commit 5df8fd573e17f17d60c41a0ce6cbf4960fa332b1 +Subproject commit fad3ed7859fdc612a555ed498bd897aca168cfd9 diff --git a/pom.xml b/pom.xml index dfd2d6ba3db..56b8c966e42 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ 4.0.0 gov.nsa.datawave datawave-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT pom DataWave DataWave is a Java-based ingest and query framework that leverages Apache Accumulo to provide fast, secure access to your data. 
@@ -59,6 +59,19 @@ 1.11.0 5.2.0 5.2.0 + 4.0.0 + 4.0.0 + 4.0.0 + 4.0.0 + 4.0.0 + 3.0.0 + 4.0.1 + 1.0.0 + 4.0.6 + 3.0.0 + 1.0.0 + 4.0.7 + 3.0.3 1.9.0 5.2.0 2.15.0 @@ -99,19 +112,6 @@ 7.5.0 2.5.2 1.6.0 - 4.0.0 - 4.0.0 - 4.0.0 - 4.0.0 - 4.0.0 - 3.0.0 - 4.0.0 - 1.0.0 - 4.0.6 - 3.0.0 - 1.0.0 - 4.0.7 - 3.0.3 1.2 2.23.0 8.0.28 @@ -335,47 +335,47 @@ gov.nsa.datawave.microservice accumulo-api - ${version.microservice.accumulo-api} + ${version.datawave.accumulo-api} gov.nsa.datawave.microservice accumulo-utils - ${version.microservice.accumulo-utils} + ${version.datawave.accumulo-utils} gov.nsa.datawave.microservice audit-api - ${version.microservice.audit-api} + ${version.datawave.audit-api} gov.nsa.datawave.microservice authorization-api - ${version.microservice.authorization-api} + ${version.datawave.authorization-api} gov.nsa.datawave.microservice base-rest-responses - ${version.microservice.base-rest-responses} + ${version.datawave.base-rest-responses} gov.nsa.datawave.microservice common-utils - ${version.microservice.common-utils} + ${version.datawave.common-utils} gov.nsa.datawave.microservice dictionary-api - ${version.microservice.dictionary-api} + ${version.datawave.dictionary-api} gov.nsa.datawave.microservice mapreduce-query-api - ${version.microservice.mapreduce-query-api} + ${version.datawave.mapreduce-query-api} gov.nsa.datawave.microservice metadata-utils - ${version.microservice.metadata-utils} + ${version.datawave.metadata-utils} log4j @@ -394,23 +394,23 @@ gov.nsa.datawave.microservice metrics-reporter - ${version.microservice.metrics-reporter} + ${version.datawave.metrics-reporter} gov.nsa.datawave.microservice query-api - ${version.microservice.query-api} + ${version.datawave.query-api} gov.nsa.datawave.microservice query-api - ${version.microservice.query-api} + ${version.datawave.query-api} jboss gov.nsa.datawave.microservice query-metric-api - ${version.microservice.query-metric-api} + ${version.datawave.query-metric-api} gov.nsa.datawave @@ -445,7 +445,7 @@ gov.nsa.datawave.microservice type-utils - ${version.microservice.type-utils} + ${version.datawave.type-utils} log4j @@ -1135,7 +1135,7 @@ gov.nsa.datawave.microservice base-rest-responses - ${version.microservice.base-rest-responses} + ${version.datawave.base-rest-responses} tests test-jar test diff --git a/properties/kubernetes.properties b/properties/kubernetes.properties index ee3ffc36200..71711b88633 100644 --- a/properties/kubernetes.properties +++ b/properties/kubernetes.properties @@ -3,6 +3,8 @@ RCPT_TO=hadoop@localhost docker.image.prefix=ghcr.io/nationalsecurityagency/ +docker.image.accumulo.tag=2.1.3 + # ingest properties DATAWAVE_INGEST_HOME=/opt/datawave-ingest/current diff --git a/warehouse/accumulo-extensions/pom.xml b/warehouse/accumulo-extensions/pom.xml index 521b3f6e58f..157772709c6 100644 --- a/warehouse/accumulo-extensions/pom.xml +++ b/warehouse/accumulo-extensions/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-accumulo-extensions ${project.artifactId} diff --git a/warehouse/age-off-utils/pom.xml b/warehouse/age-off-utils/pom.xml index dc4300e49f5..1bffff14df2 100644 --- a/warehouse/age-off-utils/pom.xml +++ b/warehouse/age-off-utils/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-age-off-utils ${project.artifactId} diff --git a/warehouse/age-off/pom.xml b/warehouse/age-off/pom.xml index 5f3df941349..5448ff3eeea 100644 --- a/warehouse/age-off/pom.xml +++ 
b/warehouse/age-off/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-age-off ${project.artifactId} diff --git a/warehouse/assemble/datawave/pom.xml b/warehouse/assemble/datawave/pom.xml index 5f617436bb3..4cf533c527d 100644 --- a/warehouse/assemble/datawave/pom.xml +++ b/warehouse/assemble/datawave/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave assemble-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT assemble-datawave jar diff --git a/warehouse/assemble/datawave/src/main/docker/Dockerfile b/warehouse/assemble/datawave/src/main/docker/Dockerfile index 6bc6d4827b2..91b5a37b861 100644 --- a/warehouse/assemble/datawave/src/main/docker/Dockerfile +++ b/warehouse/assemble/datawave/src/main/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM ${docker.image.prefix}datawave-stack-accumulo:2.1.3 +FROM ${docker.image.prefix}datawave-stack-accumulo:${docker.image.accumulo.tag} USER root COPY --from=${docker.image.prefix}datawave-stack-hadoop:3.3.6 /usr/local/hadoop/ /usr/local/hadoop/ diff --git a/warehouse/assemble/pom.xml b/warehouse/assemble/pom.xml index 6154a619165..b11c9868d9c 100644 --- a/warehouse/assemble/pom.xml +++ b/warehouse/assemble/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT assemble-parent pom diff --git a/warehouse/assemble/webservice/pom.xml b/warehouse/assemble/webservice/pom.xml index 03b99376b81..48ee19e1c68 100644 --- a/warehouse/assemble/webservice/pom.xml +++ b/warehouse/assemble/webservice/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave assemble-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT assemble-webservice ${project.artifactId} diff --git a/warehouse/common/pom.xml b/warehouse/common/pom.xml index 2b321645a27..8b60f0d493b 100644 --- a/warehouse/common/pom.xml +++ b/warehouse/common/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-common ${project.artifactId} diff --git a/warehouse/core/pom.xml b/warehouse/core/pom.xml index 0549d7a39db..24acf36eb36 100644 --- a/warehouse/core/pom.xml +++ b/warehouse/core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-core jar diff --git a/warehouse/core/src/main/java/datawave/mr/bulk/BulkInputFormat.java b/warehouse/core/src/main/java/datawave/mr/bulk/BulkInputFormat.java index 67fef0102d2..fd55a67bba5 100644 --- a/warehouse/core/src/main/java/datawave/mr/bulk/BulkInputFormat.java +++ b/warehouse/core/src/main/java/datawave/mr/bulk/BulkInputFormat.java @@ -56,7 +56,7 @@ import org.apache.accumulo.core.security.Authorizations; import org.apache.accumulo.core.security.ColumnVisibility; import org.apache.accumulo.core.security.TablePermission; -import org.apache.accumulo.core.singletons.SingletonReservation; +import org.apache.accumulo.core.singletons.SingletonManager; import org.apache.accumulo.core.util.Pair; import org.apache.accumulo.core.util.format.DateFormatSupplier; import org.apache.accumulo.core.util.format.DefaultFormatter; @@ -1091,7 +1091,8 @@ protected static TabletLocator getTabletLocator(Configuration conf) throws Table Properties props = Accumulo.newClientProperties().to(conf.get(INSTANCE_NAME), conf.get(ZOOKEEPERS)) .as(getUsername(conf), new PasswordToken(getPassword(conf))).build(); ClientInfo info = ClientInfo.from(props); - ClientContext context = new ClientContext(SingletonReservation.noop(), info, ClientConfConverter.toAccumuloConf(info.getProperties()), Threads.UEH); + ClientContext context = new 
ClientContext(SingletonManager.getClientReservation(), info, ClientConfConverter.toAccumuloConf(info.getProperties()), + Threads.UEH); return TabletLocator.getLocator(context, context.getTableId(tableName)); } @@ -1132,8 +1133,8 @@ public List getSplits(JobContext job) throws IOException { // its possible that the cache could contain complete, but old information about a tables tablets... so clear it tl.invalidateCache(); ClientInfo info = ClientInfo.from(cbHelper.newClientProperties()); - ClientContext context = new ClientContext(SingletonReservation.noop(), info, ClientConfConverter.toAccumuloConf(info.getProperties()), - Threads.UEH); + ClientContext context = new ClientContext(SingletonManager.getClientReservation(), info, + ClientConfConverter.toAccumuloConf(info.getProperties()), Threads.UEH); while (!tl.binRanges(context, ranges, binnedRanges).isEmpty()) { if (!(client instanceof InMemoryAccumuloClient)) { if (tableId == null) diff --git a/warehouse/data-dictionary-core/pom.xml b/warehouse/data-dictionary-core/pom.xml index 852c5f85863..622b9fad288 100644 --- a/warehouse/data-dictionary-core/pom.xml +++ b/warehouse/data-dictionary-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-data-dictionary-core jar diff --git a/warehouse/edge-dictionary-core/pom.xml b/warehouse/edge-dictionary-core/pom.xml index 7e03155b73d..1d8873e3824 100644 --- a/warehouse/edge-dictionary-core/pom.xml +++ b/warehouse/edge-dictionary-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-edge-dictionary-core jar diff --git a/warehouse/edge-model-configuration-core/pom.xml b/warehouse/edge-model-configuration-core/pom.xml index 6f830d6cea9..dcca1f90226 100644 --- a/warehouse/edge-model-configuration-core/pom.xml +++ b/warehouse/edge-model-configuration-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-edge-model-configuration-core jar diff --git a/warehouse/index-stats/pom.xml b/warehouse/index-stats/pom.xml index 8c514b7ae3e..7e98a2a43f1 100644 --- a/warehouse/index-stats/pom.xml +++ b/warehouse/index-stats/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-index-stats jar diff --git a/warehouse/ingest-configuration/pom.xml b/warehouse/ingest-configuration/pom.xml index 5017389e239..501b17c77bc 100644 --- a/warehouse/ingest-configuration/pom.xml +++ b/warehouse/ingest-configuration/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ingest-configuration diff --git a/warehouse/ingest-core/pom.xml b/warehouse/ingest-core/pom.xml index d46f4a23f1b..3e2879b41ce 100644 --- a/warehouse/ingest-core/pom.xml +++ b/warehouse/ingest-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ingest-core jar diff --git a/warehouse/ingest-core/src/main/java/datawave/IdentityDataType.java b/warehouse/ingest-core/src/main/java/datawave/IdentityDataType.java index 757d6bedef1..aa51f4acd2d 100644 --- a/warehouse/ingest-core/src/main/java/datawave/IdentityDataType.java +++ b/warehouse/ingest-core/src/main/java/datawave/IdentityDataType.java @@ -25,6 +25,11 @@ public String normalizeRegex(String in) { throw new UnsupportedOperationException(); } + // @Override + public boolean normalizedRegexIsLossy(String in) { + throw new UnsupportedOperationException(); + } + 
@Override public Collection expand(String in) { throw new UnsupportedOperationException(); diff --git a/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDataTypeHandler.java b/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDataTypeHandler.java index 3bab7240bec..58e0b0748db 100644 --- a/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDataTypeHandler.java +++ b/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgeDataTypeHandler.java @@ -617,7 +617,15 @@ public long process(KEYIN key, RawRecordContainer event, Multimap types = TypeRegistry.getTypes(); diff --git a/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/job/reduce/BulkIngestKeyAggregatingReducer.java b/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/job/reduce/BulkIngestKeyAggregatingReducer.java index 1eed6e5f53b..922431869c6 100644 --- a/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/job/reduce/BulkIngestKeyAggregatingReducer.java +++ b/warehouse/ingest-core/src/main/java/datawave/ingest/mapreduce/job/reduce/BulkIngestKeyAggregatingReducer.java @@ -182,15 +182,6 @@ public void doReduce(BulkIngestKey key, Iterable values, TaskInputOutputC } ctx.getCounter(IngestOutput.TIMESTAMP_DUPLICATE).increment(duplicates); } else { - /** - * Aggregator values if ts < 0, it is a by product of the ts deduper (combiner) - * - */ - ts = outKey.getKey().getTimestamp(); - - if (usingCombiner && (ts < 0)) { - outKey.getKey().setTimestamp(-1 * ts * MILLISPERDAY); - } Iterator valueItr = values.iterator(); diff --git a/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgePreconditionTest.java b/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgePreconditionTest.java index 93077389532..e1cab12f9ab 100644 --- a/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgePreconditionTest.java +++ b/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/handler/edge/ProtobufEdgePreconditionTest.java @@ -86,7 +86,7 @@ private RawRecordContainer getEvent(Configuration conf) { @Test public void testUnawarePreconSameGroup() { - // FELINE == 'tabby' + // FELINE =~ 'tabb.*' fields.put("EVENT_DATE", new BaseNormalizedContent("EVENT_DATE", "2022-10-26T01:31:53Z")); fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09")); @@ -141,7 +141,7 @@ public void testUnawarePreconSameGroup() { @Test public void testUnawarePreconSameGroupEarlyActivityDate() { - // FELINE == 'tabby' + // FELINE =~ 'tabb.*' fields.put("EVENT_DATE", new BaseNormalizedContent("EVENT_DATE", "2022-10-26T01:31:53Z")); fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09")); @@ -335,6 +335,37 @@ public void testAwarePreconSameGroup() { } + @Test + public void testAwareTwoNegated() { + // CHEESE != 'apple' AND WINE != 'chianti' + // make sure negations don't take the cross products of groups that each contained things that don't match + + fields.put("EVENT_DATE", new BaseNormalizedContent("EVENT_DATE", "2022-10-26T01:31:53Z")); + fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09")); + fields.put("FRUIT", new NormalizedFieldAndValue("FRUIT", "apple", "FOOD", "0")); + fields.put("FRUIT", new NormalizedFieldAndValue("FRUIT", "pear", "FOOD", "1")); + fields.put("FRUIT", new 
NormalizedFieldAndValue("FRUIT", "orange", "FOOD", "2")); + fields.put("WINE", new NormalizedFieldAndValue("WINE", "pinot noir", "FOOD", "0")); + fields.put("WINE", new NormalizedFieldAndValue("WINE", "chianti", "FOOD", "1")); + fields.put("WINE", new NormalizedFieldAndValue("WINE", "cabernet", "FOOD", "2")); + + ProtobufEdgeDataTypeHandler edgeHandler = new ProtobufEdgeDataTypeHandler<>(); + TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); + edgeHandler.setup(context); + + Set expectedKeys = new HashSet<>(); + expectedKeys.add("cabernet"); + expectedKeys.add("cabernet%00;orange"); + expectedKeys.add("orange"); + expectedKeys.add("orange%00;cabernet"); + + RawRecordContainer myEvent = getEvent(conf); + + EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 4, true, false); + Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet()); + + } + @Test public void testAwareAllNegated() { // CHEESE != 'apple' AND WINE != 'chianti' @@ -342,8 +373,8 @@ public void testAwareAllNegated() { fields.put("EVENT_DATE", new BaseNormalizedContent("EVENT_DATE", "2022-10-26T01:31:53Z")); fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09")); - fields.put("CHEESE", new NormalizedFieldAndValue("FRUIT", "apple", "FOOD", "0")); - fields.put("CHEESE", new NormalizedFieldAndValue("FRUIT", "pear", "FOOD", "1")); + fields.put("FRUIT", new NormalizedFieldAndValue("FRUIT", "apple", "FOOD", "0")); + fields.put("FRUIT", new NormalizedFieldAndValue("FRUIT", "pear", "FOOD", "1")); fields.put("WINE", new NormalizedFieldAndValue("WINE", "pinot noir", "FOOD", "0")); fields.put("WINE", new NormalizedFieldAndValue("WINE", "chianti", "FOOD", "1")); @@ -388,6 +419,34 @@ public void testAwareNegation() { } + @Test + public void testAwareNR() { + // BREAD !~ 'ry.*' + + fields.put("EVENT_DATE", new BaseNormalizedContent("EVENT_DATE", "2022-10-26T01:31:53Z")); + fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09")); + fields.put("BREAD", new NormalizedFieldAndValue("BREAD", "rye", "FOOD", "0")); + fields.put("BREAD", new NormalizedFieldAndValue("BREAD", "bagel", "FOOD", "1")); + fields.put("SANDWICH", new NormalizedFieldAndValue("SANDWICH", "reuben", "FOOD", "0")); + fields.put("SANDWICH", new NormalizedFieldAndValue("SANDWICH", "lox", "FOOD", "1")); + + ProtobufEdgeDataTypeHandler edgeHandler = new ProtobufEdgeDataTypeHandler<>(); + TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); + edgeHandler.setup(context); + + Set expectedKeys = new HashSet<>(); + expectedKeys.add("bagel"); + expectedKeys.add("bagel%00;lox"); + expectedKeys.add("lox"); + expectedKeys.add("lox%00;bagel"); + + RawRecordContainer myEvent = getEvent(conf); + + EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 4, true, false); + Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet()); + + } + @Test public void testAwarePreconDifferentGroup() { // CANINE == 'shepherd' @@ -441,7 +500,37 @@ public void testAwareFieldComparison() { expectedKeys.add("spruce%00;canine"); expectedKeys.add("canine"); expectedKeys.add("spruce"); - ; + + RawRecordContainer myEvent = getEvent(conf); + + EdgeHandlerTestUtil.processEvent(fields, edgeHandler, myEvent, 4, true, false); + Assert.assertEquals(expectedKeys, EdgeHandlerTestUtil.edgeKeyResults.keySet()); + + } + + @Test + public void testAwareERFieldComparison() { + // PERSON =~ METAL + + fields.put("EVENT_DATE", new 
BaseNormalizedContent("EVENT_DATE", "2022-10-26T01:31:53Z")); + fields.put("UUID", new BaseNormalizedContent("UUID", "0016dd72-0000-827d-dd4d-001b2163ba09")); + fields.put("PERSON", new NormalizedFieldAndValue("PERSON", "leader", "PROFESSION", "0")); + fields.put("METAL", new NormalizedFieldAndValue("METAL", "iron", "TOOL", "0")); + fields.put("IMPLEMENT", new NormalizedFieldAndValue("IMPLEMENT", "words", "TOOL", "0")); + + fields.put("PERSON", new NormalizedFieldAndValue("PERSON", "artist", "PROFESSION", "1")); + fields.put("METAL", new NormalizedFieldAndValue("METAL", "lead", "TOOL", "1")); + fields.put("IMPLEMENT", new NormalizedFieldAndValue("IMPLEMENT", "paint", "TOOL", "1")); + + ProtobufEdgeDataTypeHandler edgeHandler = new ProtobufEdgeDataTypeHandler<>(); + TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); + edgeHandler.setup(context); + + Set expectedKeys = new HashSet<>(); + expectedKeys.add("paint%00;leader"); + expectedKeys.add("leader%00;paint"); + expectedKeys.add("paint"); + expectedKeys.add("leader"); RawRecordContainer myEvent = getEvent(conf); diff --git a/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/job/reduce/BulkIngestKeyAggregatingReducerTest.java b/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/job/reduce/BulkIngestKeyAggregatingReducerTest.java index b66faa8c54f..116338c896d 100644 --- a/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/job/reduce/BulkIngestKeyAggregatingReducerTest.java +++ b/warehouse/ingest-core/src/test/java/datawave/ingest/mapreduce/job/reduce/BulkIngestKeyAggregatingReducerTest.java @@ -1,13 +1,6 @@ package datawave.ingest.mapreduce.job.reduce; -import static datawave.ingest.config.TableConfigCache.ACCUMULO_CONFIG_CACHE_PATH_PROPERTY; -import static datawave.ingest.config.TableConfigCache.DEFAULT_ACCUMULO_CONFIG_CACHE_PATH; -import static datawave.ingest.data.config.ingest.AccumuloHelper.INSTANCE_NAME; -import static datawave.ingest.data.config.ingest.AccumuloHelper.PASSWORD; -import static datawave.ingest.data.config.ingest.AccumuloHelper.USERNAME; -import static datawave.ingest.data.config.ingest.AccumuloHelper.ZOOKEEPERS; import static datawave.ingest.mapreduce.job.TableConfigurationUtil.ITERATOR_CLASS_MARKER; -import static datawave.ingest.mapreduce.job.reduce.AggregatingReducer.INGEST_VALUE_DEDUP_AGGREGATION_KEY; import static datawave.ingest.mapreduce.job.reduce.AggregatingReducer.MILLISPERDAY; import static datawave.ingest.mapreduce.job.reduce.AggregatingReducer.USE_AGGREGATOR_PROPERTY; import static datawave.ingest.mapreduce.job.reduce.BulkIngestKeyAggregatingReducer.CONTEXT_WRITER_CLASS; @@ -19,7 +12,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -32,7 +24,6 @@ import org.apache.accumulo.core.iterators.Combiner; import org.apache.accumulo.core.iterators.IteratorEnvironment; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; -import org.apache.commons.math3.analysis.function.Pow; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Counter; @@ -41,13 +32,10 @@ import org.apache.hadoop.mapreduce.TaskInputOutputContext; import org.apache.hadoop.mapreduce.TaskType; import org.apache.hadoop.mapreduce.counters.GenericCounter; -import org.apache.hadoop.yarn.webapp.hamlet2.Hamlet; -import org.easymock.EasyMock; import org.junit.Before; import 
org.junit.Test; import org.junit.runner.RunWith; import org.mockito.Mockito; -import org.powermock.api.easymock.PowerMock; import org.powermock.api.mockito.PowerMockito; import org.powermock.core.classloader.annotations.PrepareForTest; import org.powermock.modules.junit4.PowerMockRunner; @@ -56,9 +44,6 @@ import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; -import datawave.ingest.config.TableConfigCache; -import datawave.ingest.data.config.ConfigurationHelper; -import datawave.ingest.data.config.ingest.AccumuloHelper; import datawave.ingest.mapreduce.job.BulkIngestKey; import datawave.ingest.mapreduce.job.TableConfigurationUtil; import datawave.ingest.mapreduce.job.writer.BulkContextWriter; @@ -88,6 +73,7 @@ public class BulkIngestKeyAggregatingReducerTest { private Counter tab2Counter; private Counter tab3Counter; private Counter combinerCounter; + private Counter negativeTimestampCounter; private Counter dupCounter; private int expectedDuplicateKey; @@ -98,6 +84,7 @@ public class BulkIngestKeyAggregatingReducerTest { private int expectedTab2Counter; private int expectedTab3Counter; private int expectedCombinerCounter; + private int expectedNegativeTimestampCounter; private int expectedDupCounter; private TaskID taskID; @@ -121,6 +108,7 @@ public void setup() throws Exception { tab2Counter = (Counter) new GenericCounter(); tab3Counter = (Counter) new GenericCounter(); combinerCounter = (Counter) new GenericCounter(); + negativeTimestampCounter = (Counter) new GenericCounter(); dupCounter = (Counter) new GenericCounter(); expectedDuplicateKey = 0; @@ -131,6 +119,7 @@ public void setup() throws Exception { expectedTab2Counter = 0; expectedTab3Counter = 0; expectedCombinerCounter = 0; + expectedNegativeTimestampCounter = 0; expectedDupCounter = 0; conf = (Configuration) PowerMockito.mock(Configuration.class); @@ -255,6 +244,7 @@ private void checkCounterValues() { assertEquals(expectedTab2Counter, tab2Counter.getValue()); assertEquals(expectedTab3Counter, tab3Counter.getValue()); assertEquals(expectedCombinerCounter, combinerCounter.getValue()); + assertEquals(expectedNegativeTimestampCounter, negativeTimestampCounter.getValue()); } @Test @@ -554,6 +544,29 @@ public void testUsingCombinerWithVerbosePartitioningCounters() throws Exception assertEquals(expected, output); } + @Test + public void testUsingCombinerWithNegativeTimestamps() throws Exception { + setupUsingCombiner(); + reducer.setup(conf); + + performDoReduce("table1", "r1", 4, -3 * MILLISPERDAY + MILLISPERDAY / 2, ExpectedValueType.COMBINED_VALUES); + performDoReduce("table1", "r2", 3, 3 * MILLISPERDAY + MILLISPERDAY / 3, ExpectedValueType.COMBINED_VALUES); + performDoReduce("table1", "r3", 1, -3 * MILLISPERDAY, ExpectedValueType.COMBINED_VALUES); + performDoReduce("table2", "r1", 2, -2 * MILLISPERDAY + MILLISPERDAY, ExpectedValueType.FIRST_VALUE); + performDoReduce("table2", "r2", 0, -2 * MILLISPERDAY + MILLISPERDAY, ExpectedValueType.ALL_VALUES); + performDoReduce("table2", "r3", 3, -2 * MILLISPERDAY + MILLISPERDAY / 3, ExpectedValueType.FIRST_VALUE); + performDoReduce("table3", "r1", 3, -4 * MILLISPERDAY + MILLISPERDAY / 3, ExpectedValueType.COMBINED_VALUES); + performDoReduce("table3", "r2", 0, -4 * MILLISPERDAY, ExpectedValueType.COMBINED_VALUES); + performDoReduce("table1", "r1", 4, 4 * MILLISPERDAY + MILLISPERDAY / 2, ExpectedValueType.COMBINED_VALUES); + performDoReduce("table1", "r2", 3, 2 * MILLISPERDAY + MILLISPERDAY, ExpectedValueType.COMBINED_VALUES); + + expectedDuplicateKey = 2; 
+ expectedCombinerCounter = 7; + expectedNegativeTimestampCounter = 7; + checkCounterValues(); + assertEquals(expected, output); + } + private void performDoReduce(String table, String row, int numberOfValues) throws Exception { performDoReduce(table, row, numberOfValues, 1L, ExpectedValueType.FIRST_VALUE); } @@ -586,6 +599,10 @@ private void performDoReduce(String table, String row, int numberOfValues, long } reducer.doReduce(bulkIngestKey, values, context); + + if (bulkIngestKey.getKey().getTimestamp() < 0) { + negativeTimestampCounter.increment(1); + } } public static Value combineValues(Iterator iter) { diff --git a/warehouse/ingest-core/src/test/resources/config/EdgeSpringConfigPrecon.xml b/warehouse/ingest-core/src/test/resources/config/EdgeSpringConfigPrecon.xml index 5acd9ac10ec..879f7b91c23 100644 --- a/warehouse/ingest-core/src/test/resources/config/EdgeSpringConfigPrecon.xml +++ b/warehouse/ingest-core/src/test/resources/config/EdgeSpringConfigPrecon.xml @@ -25,7 +25,7 @@ http://www.springframework.org/schema/util/spring-util-4.0.xsd"> - + @@ -153,6 +153,39 @@ http://www.springframework.org/schema/util/spring-util-4.0.xsd"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -165,7 +198,7 @@ http://www.springframework.org/schema/util/spring-util-4.0.xsd"> - + @@ -252,6 +285,39 @@ http://www.springframework.org/schema/util/spring-util-4.0.xsd"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/warehouse/ingest-csv/pom.xml b/warehouse/ingest-csv/pom.xml index 061e6cbaad4..9df7e5d63b6 100644 --- a/warehouse/ingest-csv/pom.xml +++ b/warehouse/ingest-csv/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ingest-csv jar diff --git a/warehouse/ingest-json/pom.xml b/warehouse/ingest-json/pom.xml index ec862a78360..721429994df 100644 --- a/warehouse/ingest-json/pom.xml +++ b/warehouse/ingest-json/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ingest-json jar diff --git a/warehouse/ingest-nyctlc/pom.xml b/warehouse/ingest-nyctlc/pom.xml index a352a811d7b..c647eb322a8 100644 --- a/warehouse/ingest-nyctlc/pom.xml +++ b/warehouse/ingest-nyctlc/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ingest-nyctlc jar diff --git a/warehouse/ingest-scripts/pom.xml b/warehouse/ingest-scripts/pom.xml index aacf2cc4c97..106368403be 100644 --- a/warehouse/ingest-scripts/pom.xml +++ b/warehouse/ingest-scripts/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ingest-scripts ${project.artifactId} diff --git a/warehouse/ingest-ssdeep/pom.xml b/warehouse/ingest-ssdeep/pom.xml index 91b1d231159..40443bf7f40 100644 --- a/warehouse/ingest-ssdeep/pom.xml +++ b/warehouse/ingest-ssdeep/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ingest-ssdeep diff --git a/warehouse/ingest-wikipedia/pom.xml b/warehouse/ingest-wikipedia/pom.xml index f6cf906d187..63bd31b38aa 100644 --- a/warehouse/ingest-wikipedia/pom.xml +++ b/warehouse/ingest-wikipedia/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ingest-wikipedia jar diff --git a/warehouse/metrics-core/pom.xml b/warehouse/metrics-core/pom.xml index 7b76b23d8f5..1e67c228f65 100644 --- a/warehouse/metrics-core/pom.xml +++ b/warehouse/metrics-core/pom.xml @@ -4,7 +4,7 @@ 
gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-metrics-core jar diff --git a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/FileByteSummaryLoader.java b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/FileByteSummaryLoader.java index 0cd6e772c09..940dac61131 100644 --- a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/FileByteSummaryLoader.java +++ b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/FileByteSummaryLoader.java @@ -3,19 +3,19 @@ import java.io.IOException; import java.util.Collections; import java.util.Date; +import java.util.Properties; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.accumulo.core.client.Accumulo; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; -import org.apache.accumulo.core.client.ClientConfiguration; -import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat; -import org.apache.accumulo.core.client.security.tokens.PasswordToken; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.io.Text; @@ -105,11 +105,21 @@ public int run(String[] args) throws Exception { job.setMapOutputKeyClass(Key.class); job.setMapOutputValueClass(Value.class); job.setInputFormatClass(AccumuloInputFormat.class); - AccumuloInputFormat.setConnectorInfo(job, userName, new PasswordToken(password)); - AccumuloInputFormat.setInputTableName(job, inputTable); - AccumuloInputFormat.setScanAuthorizations(job, Authorizations.EMPTY); - AccumuloInputFormat.setZooKeeperInstance(job, ClientConfiguration.loadDefault().withInstance(instance.trim()).withZkHosts(zookeepers.trim())); - AccumuloInputFormat.setRanges(job, Collections.singletonList(dayRange)); + + // @formatter:off + Properties clientProperties = Accumulo.newClientProperties() + .to(instance.trim(), zookeepers.trim()) + .as(userName, password) + .build(); + + AccumuloInputFormat.configure() + .clientProperties(clientProperties) + .table(inputTable) + .auths(Authorizations.EMPTY) + .ranges(Collections.singletonList(dayRange)) + .store(job); + // @formatter:on + // Ensure all data for a day goes to the same reducer so that we aggregate it correctly before sending to Accumulo RowPartitioner.configureJob(job); diff --git a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/IngestMetricsSummaryLoader.java b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/IngestMetricsSummaryLoader.java index 27e6b5692d5..8fe578599d1 100644 --- a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/IngestMetricsSummaryLoader.java +++ b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/IngestMetricsSummaryLoader.java @@ -7,6 +7,7 @@ import java.util.Date; import java.util.HashSet; import java.util.Map; +import java.util.Properties; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.regex.Matcher; @@ -16,15 +17,13 @@ import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; -import 
org.apache.accumulo.core.client.ClientConfiguration; import org.apache.accumulo.core.client.Scanner; import org.apache.accumulo.core.client.TableNotFoundException; -import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat; -import org.apache.accumulo.core.client.security.tokens.PasswordToken; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.io.Text; @@ -277,12 +276,20 @@ public int run(String[] args) throws Exception { job.setMapOutputKeyClass(Key.class); job.setMapOutputValueClass(Value.class); job.setInputFormatClass(AccumuloInputFormat.class); - AccumuloInputFormat.setConnectorInfo(job, userName, new PasswordToken(password)); - AccumuloInputFormat.setZooKeeperInstance(job, ClientConfiguration.loadDefault().withInstance(instance).withZkHosts(zookeepers)); - AccumuloInputFormat.setInputTableName(job, inputTable); - AccumuloInputFormat.setScanAuthorizations(job, Authorizations.EMPTY); - AccumuloInputFormat.setRanges(job, Collections.singletonList(dayRange)); + // @formatter:off + Properties clientProperties = Accumulo.newClientProperties() + .to(instance, zookeepers) + .as(userName, password) + .build(); + + AccumuloInputFormat.configure() + .clientProperties(clientProperties) + .table(inputTable) + .auths(Authorizations.EMPTY) + .ranges(Collections.singletonList(dayRange)) + .store(job); + // @formatter:on // Ensure all data for a day goes to the same reducer so that we aggregate it correctly before sending to Accumulo RowPartitioner.configureJob(job); diff --git a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/QueryMetricsSummaryLoader.java b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/QueryMetricsSummaryLoader.java index 746684e07d6..2eae071041a 100644 --- a/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/QueryMetricsSummaryLoader.java +++ b/warehouse/metrics-core/src/main/java/datawave/metrics/analytic/QueryMetricsSummaryLoader.java @@ -8,20 +8,20 @@ import java.util.Collection; import java.util.HashSet; import java.util.List; +import java.util.Properties; import java.util.concurrent.TimeUnit; +import org.apache.accumulo.core.client.Accumulo; import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.AccumuloException; import org.apache.accumulo.core.client.AccumuloSecurityException; -import org.apache.accumulo.core.client.ClientConfiguration; import org.apache.accumulo.core.client.IteratorSetting; -import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat; -import org.apache.accumulo.core.client.security.tokens.PasswordToken; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.user.RegExFilter; import org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.io.Text; @@ -287,17 +287,26 @@ public int run(String[] args) throws Exception { job.setMapOutputValueClass(Value.class); job.setInputFormatClass(AccumuloInputFormat.class); - AccumuloInputFormat.setConnectorInfo(job, userName, new 
PasswordToken(password)); - AccumuloInputFormat.setZooKeeperInstance(job, ClientConfiguration.loadDefault().withInstance(instance).withZkHosts(zookeepers)); - AccumuloInputFormat.setRanges(job, dayRanges); - AccumuloInputFormat.setAutoAdjustRanges(job, false); - AccumuloInputFormat.setInputTableName(job, inputTable); - AccumuloInputFormat.setScanAuthorizations(job, auths); + // @formatter:off + Properties clientProperties = Accumulo.newClientProperties() + .to(instance, zookeepers) + .as(userName, password) + .build(); + // @formatter:on IteratorSetting regex = new IteratorSetting(50, RegExFilter.class); regex.addOption(RegExFilter.COLF_REGEX, QUERY_METRICS_REGEX); - AccumuloInputFormat.addIterator(job, regex); + // @formatter:off + AccumuloInputFormat.configure() + .clientProperties(clientProperties) + .table(inputTable) + .auths(auths) + .ranges(dayRanges) + .autoAdjustRanges(false) + .addIterator(regex) + .store(job); + // @formatter:on // Ensure all data for a day goes to the same reducer so that we aggregate it correctly before sending to Accumulo RowPartitioner.configureJob(job); diff --git a/warehouse/ops-tools/config-compare/pom.xml b/warehouse/ops-tools/config-compare/pom.xml index 6da250ddbdc..af9185a7b1d 100644 --- a/warehouse/ops-tools/config-compare/pom.xml +++ b/warehouse/ops-tools/config-compare/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-ops-tools-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ops-tools-config-compare diff --git a/warehouse/ops-tools/index-validation/pom.xml b/warehouse/ops-tools/index-validation/pom.xml index bcc768ba93f..8039271e3e1 100644 --- a/warehouse/ops-tools/index-validation/pom.xml +++ b/warehouse/ops-tools/index-validation/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-ops-tools-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ops-tools-index-validation jar diff --git a/warehouse/ops-tools/pom.xml b/warehouse/ops-tools/pom.xml index c90e3790ce5..f142105b8e6 100644 --- a/warehouse/ops-tools/pom.xml +++ b/warehouse/ops-tools/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ops-tools-parent pom diff --git a/warehouse/pom.xml b/warehouse/pom.xml index eadfffc45e4..2cdbeb122f5 100644 --- a/warehouse/pom.xml +++ b/warehouse/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-warehouse-parent pom diff --git a/warehouse/query-core/pom.xml b/warehouse/query-core/pom.xml index eecc22e554e..9b484e98b60 100644 --- a/warehouse/query-core/pom.xml +++ b/warehouse/query-core/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-query-core jar diff --git a/warehouse/query-core/src/main/java/datawave/core/iterators/BoundedRangeExpansionIterator.java b/warehouse/query-core/src/main/java/datawave/core/iterators/BoundedRangeExpansionIterator.java new file mode 100644 index 00000000000..f863a4d0675 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/core/iterators/BoundedRangeExpansionIterator.java @@ -0,0 +1,171 @@ +package datawave.core.iterators; + +import java.io.IOException; +import java.util.Collection; +import java.util.Map; +import java.util.TreeSet; + +import org.apache.accumulo.core.data.ByteSequence; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.IteratorEnvironment; +import org.apache.accumulo.core.iterators.OptionDescriber; +import 
org.apache.accumulo.core.iterators.SortedKeyValueIterator; +import org.apache.accumulo.core.iterators.user.SeekingFilter; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.io.Text; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Splitter; + +import datawave.query.Constants; +import datawave.query.jexl.LiteralRange; + +/** + * A {@link SeekingFilter} that attempts to expand bounded ranges using the global index + *
<p>
+ * The caller is responsible for fetching the appropriate column families. The range is constructed from a {@link LiteralRange}. + *
<p>
+ * The only thing this iterator does is advance through datatypes if a filter is supplied, advance to the start date, and advance to the next row within the + * range. + */ +public class BoundedRangeExpansionIterator extends SeekingFilter implements OptionDescriber { + + private static final Logger log = LoggerFactory.getLogger(BoundedRangeExpansionIterator.class); + + public static final String START_DATE = "start.date"; + public static final String END_DATE = "end.date"; + public static final String DATATYPES_OPT = "dts"; + + private TreeSet datatypes; + private String startDate; + private String endDate; + + private Text prevRow; + + @Override + public void init(SortedKeyValueIterator source, Map options, IteratorEnvironment env) throws IOException { + if (!validateOptions(options)) { + throw new IllegalArgumentException("BoundedRangeExpansionIterator not configured with correct options"); + } + + String opt = options.get(DATATYPES_OPT); + if (StringUtils.isBlank(opt)) { + datatypes = new TreeSet<>(); + } else { + datatypes = new TreeSet<>(Splitter.on(',').splitToList(opt)); + } + + startDate = options.get(START_DATE); + endDate = options.get(END_DATE) + Constants.MAX_UNICODE_STRING; + + super.init(source, options, env); + } + + @Override + public IteratorOptions describeOptions() { + IteratorOptions opts = new IteratorOptions(getClass().getName(), "Expands bounded ranges using the global index", null, null); + opts.addNamedOption(START_DATE, "The start date"); + opts.addNamedOption(END_DATE, "The end date"); + opts.addNamedOption(DATATYPES_OPT, "The set of datatypes used to filter keys (optional)"); + return opts; + } + + @Override + public boolean validateOptions(Map options) { + return options.containsKey(START_DATE) && options.containsKey(END_DATE); + } + + @Override + public FilterResult filter(Key k, Value v) { + log.trace("filter key: {}", k.toStringNoTime()); + + // shard + null + datatype + String cq = k.getColumnQualifier().toString(); + int index = cq.indexOf('\u0000'); + String date = cq.substring(0, index); + + if (date.compareTo(startDate) < 0) { + log.trace("{} is before the start date {}, advancing to start date", date, startDate); + return new FilterResult(false, AdvanceResult.USE_HINT); + } + + if (date.compareTo(endDate) > 0) { + log.trace("{} is past the end date {}, advancing to next row", date, endDate); + return new FilterResult(false, AdvanceResult.NEXT_ROW); + } + + String datatype = cq.substring(index + 1); + if (!datatypes.isEmpty() && !datatypes.contains(datatype)) { + log.trace("datatype {} was filtered out, advancing to next key", datatype); + return new FilterResult(false, AdvanceResult.NEXT); + } + + if (prevRow != null && prevRow.equals(k.getRow())) { + // this iterator should only return a single key per unique row, thus the previous row should never match the current row. 
+ log.warn("should never see a duplicate row -- skip to next row"); + return new FilterResult(false, AdvanceResult.NEXT_ROW); + } + + prevRow = k.getRow(); + return new FilterResult(true, AdvanceResult.NEXT_ROW); + } + + /** + * Hint is only used to seek to the start date + * + * @param k + * a key + * @param v + * a value + * @return the key used to seek + */ + @Override + public Key getNextKeyHint(Key k, Value v) { + log.trace("get next key hint: {}", k.toStringNoTime()); + + // shard + null + datatype + String cq = k.getColumnQualifier().toString(); + int index = cq.indexOf('\u0000'); + String date = cq.substring(0, index); + + if (date.compareTo(startDate) < 0) { + Text columnQualifier; + + if (datatypes.isEmpty()) { + log.trace("seek to start date"); + columnQualifier = new Text(startDate + '\u0000'); + } else { + log.trace("seek to start date and datatype"); + columnQualifier = new Text(startDate + '\u0000' + datatypes.first()); + } + + return new Key(k.getRow(), k.getColumnFamily(), columnQualifier); + } + + log.trace("next hint key was called in a bad state, reverting to no-op"); + return k; + } + + @Override + public void seek(Range range, Collection columnFamilies, boolean inclusive) throws IOException { + if (!range.isStartKeyInclusive()) { + // need to skip to next row + Key skip = new Key(range.getStartKey().getRow().toString() + '\u0000'); + if (skip.compareTo(range.getEndKey()) > 0) { + // handles case of bounded range against single value + // filter key: +cE1 NUM:20150808_0%00;generic [NA] + // skip key would be +cE1 but then the start key is greater than the end key. so we cheat accumulo. + Range skipRange = new Range(range.getEndKey(), true, range.getEndKey(), range.isEndKeyInclusive()); + super.seek(skipRange, columnFamilies, inclusive); + } else { + Range skipRange = new Range(skip, true, range.getEndKey(), range.isEndKeyInclusive()); + super.seek(skipRange, columnFamilies, inclusive); + } + } else { + super.seek(range, columnFamilies, inclusive); + } + } +} diff --git a/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexCachingIteratorJexl.java b/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexCachingIteratorJexl.java index 1dcc248d6f4..f9a7ce19cb8 100644 --- a/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexCachingIteratorJexl.java +++ b/warehouse/query-core/src/main/java/datawave/core/iterators/DatawaveFieldIndexCachingIteratorJexl.java @@ -769,8 +769,8 @@ protected void findTop() throws IOException { // no need to check containership if not returning sorted uids if (!sortedUIDs || this.lastRangeSeeked.contains(key)) { this.topKey = key; - if (log.isDebugEnabled()) { - log.debug("setting as topKey " + topKey); + if (log.isTraceEnabled()) { + log.trace("setting as topKey " + topKey); } break; } @@ -879,6 +879,7 @@ private void fillSortedSets() throws IOException { if (log.isDebugEnabled()) { log.debug("Processing " + boundingFiRanges + " for " + this); } + long startFillSets = System.currentTimeMillis(); TotalResults totalResults = new TotalResults(maxResults); @@ -916,8 +917,11 @@ private void fillSortedSets() throws IOException { } } + long fillSetTiming = System.currentTimeMillis() - startFillSets; + log.info("Filled ivarator sets for " + boundingFiRanges.size() + " ranges took " + fillSetTiming + "ms for " + this); + if (failed) { - log.error("Failed to complete ivarator cache: " + result, exception); + log.error("Failed to complete ivarator cache: " + result + " for " + this, 
exception); throw new IvaratorException("Failed to complete ivarator cache: " + result, exception); } @@ -1102,6 +1106,7 @@ protected Future fillSet(final Range boundingFiRange, final TotalResults tota // create runnable Runnable runnable = () -> { + long startFillSet = System.currentTimeMillis(); if (log.isDebugEnabled()) { log.debug("Starting fillSet(" + boundingFiRange + ')'); } @@ -1210,6 +1215,8 @@ protected Future fillSet(final Range boundingFiRange, final TotalResults tota log.error("Failed to complete fillSet(" + boundingFiRange + ")", e); throw new RuntimeException(e); } finally { + long timing = System.currentTimeMillis() - startFillSet; + log.info("Completed " + boundingFiRange + " ivarator in " + timing + "ms"); // return the ivarator source back to the pool. returnPoolSource(source); if (log.isDebugEnabled()) { @@ -1644,4 +1651,13 @@ public void setCollectTimingDetails(boolean collectTimingDetails) { public void setQuerySpanCollector(QuerySpanCollector querySpanCollector) { this.querySpanCollector = querySpanCollector; } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("DatawaveFieldIndexCachingIteratorJexl (").append(queryId).append(") fName=").append(getFieldName()).append(", fValue=") + .append(getFieldValue()).append(", negated=").append(isNegated()).append("}"); + return builder.toString(); + } + } diff --git a/warehouse/query-core/src/main/java/datawave/mr/bulk/MultiRfileInputformat.java b/warehouse/query-core/src/main/java/datawave/mr/bulk/MultiRfileInputformat.java index 4b7f1011730..1614fe2991f 100644 --- a/warehouse/query-core/src/main/java/datawave/mr/bulk/MultiRfileInputformat.java +++ b/warehouse/query-core/src/main/java/datawave/mr/bulk/MultiRfileInputformat.java @@ -66,6 +66,7 @@ public class MultiRfileInputformat extends RFileInputFormat { private static LoadingCache>>> locationMap = null; protected static final Map dfsUriMap = new ConcurrentHashMap<>(); + protected static final Map dfsDirMap = new ConcurrentHashMap<>(); @Override public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { @@ -147,11 +148,12 @@ public static List computeSplitPoints(AccumuloClient client, Configu /** * Attempt the following 1) try to get the default namespace from accumulo 2) Use the custom config option 3) use default name in the hdfs configuration */ - if (dfsUriMap.get(tableId) == null) { + if (dfsUriMap.get(tableId) == null || dfsDirMap.get(tableId) == null) { synchronized (MultiRfileInputformat.class) { final InstanceOperations instOps = client.instanceOperations(); - dfsUriMap.put(tableId, instOps.getSystemConfiguration().get(Property.INSTANCE_VOLUMES.getKey())); + dfsUriMap.put(tableId, instOps.getSystemConfiguration().get(Property.INSTANCE_DFS_URI.getKey())); + dfsDirMap.put(tableId, instOps.getSystemConfiguration().get(Property.INSTANCE_DFS_DIR.getKey())); } } @@ -165,7 +167,7 @@ public static List computeSplitPoints(AccumuloClient client, Configu } } - basePath = dfsUriMap.get(tableId); + basePath = dfsDirMap.get(tableId); if (StringUtils.isEmpty(basePath)) { basePath = ACCUMULO_BASE_PATH; diff --git a/warehouse/query-core/src/main/java/datawave/query/ancestor/AncestorQueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/ancestor/AncestorQueryIterator.java index 61009d31fba..7528a24e9f6 100644 --- a/warehouse/query-core/src/main/java/datawave/query/ancestor/AncestorQueryIterator.java +++ 
b/warehouse/query-core/src/main/java/datawave/query/ancestor/AncestorQueryIterator.java @@ -40,6 +40,7 @@ import datawave.query.function.RangeProvider; import datawave.query.iterator.NestedQueryIterator; import datawave.query.iterator.QueryIterator; +import datawave.query.iterator.QueryOptions; import datawave.query.iterator.SourcedOptions; import datawave.query.iterator.logic.IndexIterator; import datawave.query.jexl.DatawaveJexlContext; @@ -138,6 +139,31 @@ public EventDataQueryFilter getEvaluationFilter() { return evaluationFilter != null ? evaluationFilter.clone() : null; } + /** + * In the Ancestor case replace the {@link QueryOptions#eventFilter} with an evaluation filter + * + * @return an evaluation filter + */ + public EventDataQueryFilter getEventFilter() { + return getEvaluationFilter(); + } + + @Override + public EventDataQueryFilter getFiEvaluationFilter() { + if (fiEvaluationFilter == null) { + fiEvaluationFilter = getEvaluationFilter(); + } + return fiEvaluationFilter.clone(); + } + + @Override + public EventDataQueryFilter getEventEvaluationFilter() { + if (eventEvaluationFilter == null) { + eventEvaluationFilter = getEvaluationFilter(); + } + return eventEvaluationFilter.clone(); + } + @Override protected JexlEvaluation getJexlEvaluation(NestedQueryIterator documentSource) { return new JexlEvaluation(query, getArithmetic()) { diff --git a/warehouse/query-core/src/main/java/datawave/query/attributes/Document.java b/warehouse/query-core/src/main/java/datawave/query/attributes/Document.java index 2edaa426da2..8bb3d146589 100644 --- a/warehouse/query-core/src/main/java/datawave/query/attributes/Document.java +++ b/warehouse/query-core/src/main/java/datawave/query/attributes/Document.java @@ -210,27 +210,8 @@ public Attribute toDocKeyAttributes(Set docKeys, boolean keepRecordId) { } public void debugDocumentSize(Key docKey) { - long bytes = sizeInBytes(); - // if more than 100M, then error - if (bytes > (ONE_HUNDRED_M)) { - log.error("Document " + docKey + "; size = " + size() + "; bytes = " + bytes); - } - // if more than 10M, then warn - // else if (bytes > (1024l * 1000 * 10)) { - // log.warn("Document " + docKey + "; size = " + size() + "; bytes = " + bytes); - // } - - // if more than 1M, then info - else if (bytes > (ONE_M)) { - log.info("Document " + docKey + "; size = " + size() + "; bytes = " + bytes); - } - // if more than 500K, then debug - else if (bytes > (FIVE_HUNDRED_K) && log.isDebugEnabled()) { - log.debug("Document " + docKey + "; size = " + size() + "; bytes = " + bytes); - } - // trace everything - else if (log.isTraceEnabled()) { - log.trace("Document " + docKey + "; size = " + size() + "; bytes = " + bytes); + if (log.isDebugEnabled()) { + log.debug("Document " + docKey + "; size = " + size() + "; bytes = " + sizeInBytes()); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/config/LookupUUIDTune.java b/warehouse/query-core/src/main/java/datawave/query/config/LookupUUIDTune.java index 7e5dd61b73a..91f084abcf8 100644 --- a/warehouse/query-core/src/main/java/datawave/query/config/LookupUUIDTune.java +++ b/warehouse/query-core/src/main/java/datawave/query/config/LookupUUIDTune.java @@ -46,6 +46,7 @@ public class LookupUUIDTune implements Profile { protected boolean reduceQuery = false; private boolean enforceUniqueTermsWithinExpressions = false; private boolean reduceQueryFields = false; + private boolean seekingEventAggregation; protected List transforms = null; protected Map querySyntaxParsers = null; @@ -64,6 +65,7 @@ public void 
configure(BaseQueryLogic> logic) { rsq.setFiNextSeek(getFiNextSeek()); rsq.setEventFieldSeek(getEventFieldSeek()); rsq.setEventNextSeek(getEventNextSeek()); + rsq.setSeekingEventAggregation(isSeekingEventAggregation()); if (querySyntaxParsers != null) { rsq.setQuerySyntaxParsers(querySyntaxParsers); @@ -136,6 +138,7 @@ public void configure(GenericQueryConfiguration configuration) { rsqc.setFiNextSeek(getFiNextSeek()); rsqc.setEventFieldSeek(getEventFieldSeek()); rsqc.setEventNextSeek(getEventNextSeek()); + rsqc.setSeekingEventAggregation(isSeekingEventAggregation()); // we need this since we've finished the deep copy already rsqc.setSpeculativeScanning(speculativeScanning); @@ -354,4 +357,12 @@ public Map getQuerySyntaxParsers() { public void setQuerySyntaxParsers(Map querySyntaxParsers) { this.querySyntaxParsers = querySyntaxParsers; } + + public boolean isSeekingEventAggregation() { + return seekingEventAggregation; + } + + public void setSeekingEventAggregation(boolean seekingEventAggregation) { + this.seekingEventAggregation = seekingEventAggregation; + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java index 0c29817d43d..94ddc4d5ba7 100644 --- a/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java +++ b/warehouse/query-core/src/main/java/datawave/query/config/ShardQueryConfiguration.java @@ -84,7 +84,7 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement public static final String QUERY_LOGIC_NAME_SOURCE = "queryLogic"; @SuppressWarnings("unused") - private static final long serialVersionUID = -4354990715046146110L; + private static final long serialVersionUID = 2321985989282659247L; private static final Logger log = Logger.getLogger(ShardQueryConfiguration.class); // is this a tld query, explicitly default to false @@ -99,6 +99,8 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement private int maxIndexBatchSize = 1000; private boolean allTermsIndexOnly; private long maxIndexScanTimeMillis = Long.MAX_VALUE; + private long maxAnyFieldScanTimeMillis = Long.MAX_VALUE; + // Allows this query to parse the root uids from TLD uids found in the global shard index. This effectively ignores hits in child documents. private boolean parseTldUids = false; private boolean collapseUids = false; @@ -451,6 +453,11 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement private int tfFieldSeek = -1; private int tfNextSeek = -1; + /** + * Flag that enables a field-based seeking aggregation in the standard event query. Must be used in conjunction with {@link #eventFieldSeek} + */ + private boolean seekingEventAggregation = false; + /** * The maximum weight for entries in the visitor function cache. The weight is calculated as the total number of characters for each key and value in the * cache. Default is 5m characters, which is roughly 10MB @@ -477,24 +484,26 @@ public class ShardQueryConfiguration extends GenericQueryConfiguration implement private boolean pruneQueryOptions = false; /** - * Flag to control gathering field counts from the global index and persisting those to the query iterator. Negated terms and branches are not considered. + * Flag that sorts the query prior to the global index lookup using inferred costs. This step may reduce time spent in the global index depending on + * individual term selectivity. 
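+ * Unlike {@link #sortQueryPreIndexWithFieldCounts}, this option does not open a scanner against the metadata table.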
*/ - private boolean useFieldCounts = false; + private boolean sortQueryPreIndexWithImpliedCounts = false; + /** - * Flag to control gathering term counts from the global index and persisting those to the query iterator. Negated terms and branches are not considered. + * Flag that sorts the query prior to the global index lookup using field counts from the {@link TableName#METADATA} table. This option opens a scanner and + * thus is more expensive than sorting by implied counts, but is potentially more accurate. */ - private boolean useTermCounts = false; + private boolean sortQueryPreIndexWithFieldCounts = false; + /** - * Flag to control sorting a query by inferred default costs prior to the global index lookup. This step may reduce time performing a secondary sort as when - * {@link #sortQueryByCounts} is enabled. + * Flag that sorts the query using field counts gathered as part of the global index lookup. Negated terms and branches are not considered. */ - private boolean sortQueryBeforeGlobalIndex = false; + private boolean sortQueryPostIndexWithFieldCounts = false; /** - * Flag to control if a query is sorted by either field or term counts. Either {@link #useFieldCounts} or {@link #useTermCounts} must be set for this option - * to take effect. + * Flag that sorts the query using term counts gathered as part of the global index lookup. Negated terms and branches are not considered. */ - private boolean sortQueryByCounts = false; + private boolean sortQueryPostIndexWithTermCounts = false; /** * Insert rules for processing the QueryTree to automatically apply hints to queries. Hints will be passed to the ScannerFactory @@ -547,6 +556,7 @@ public void copyFrom(ShardQueryConfiguration other) { this.setMaxIndexBatchSize(other.getMaxIndexBatchSize()); this.setAllTermsIndexOnly(other.isAllTermsIndexOnly()); this.setMaxIndexScanTimeMillis(other.getMaxIndexScanTimeMillis()); + this.setMaxAnyFieldScanTimeMillis(other.getMaxAnyFieldScanTimeMillis()); this.setCollapseUids(other.getCollapseUids()); this.setCollapseUidsThreshold(other.getCollapseUidsThreshold()); this.setEnforceUniqueTermsWithinExpressions(other.getEnforceUniqueTermsWithinExpressions()); @@ -733,6 +743,7 @@ public void copyFrom(ShardQueryConfiguration other) { this.setEventNextSeek(other.getEventNextSeek()); this.setTfFieldSeek(other.getTfFieldSeek()); this.setTfNextSeek(other.getTfNextSeek()); + this.setSeekingEventAggregation(other.isSeekingEventAggregation()); this.setVisitorFunctionMaxWeight(other.getVisitorFunctionMaxWeight()); this.setQueryExecutionForPageTimeout(other.getQueryExecutionForPageTimeout()); this.setLazySetMechanismEnabled(other.isLazySetMechanismEnabled()); @@ -740,10 +751,10 @@ public void copyFrom(ShardQueryConfiguration other) { this.setTfAggregationThresholdMs(other.getTfAggregationThresholdMs()); this.setGroupFields(GroupFields.copyOf(other.getGroupFields())); this.setPruneQueryOptions(other.getPruneQueryOptions()); - this.setUseFieldCounts(other.getUseFieldCounts()); - this.setUseTermCounts(other.getUseTermCounts()); - this.setSortQueryBeforeGlobalIndex(other.isSortQueryBeforeGlobalIndex()); - this.setSortQueryByCounts(other.isSortQueryByCounts()); + this.setSortQueryPreIndexWithImpliedCounts(other.isSortQueryPreIndexWithImpliedCounts()); + this.setSortQueryPreIndexWithFieldCounts(other.isSortQueryPreIndexWithFieldCounts()); + this.setSortQueryPostIndexWithTermCounts(other.isSortQueryPostIndexWithTermCounts()); + this.setSortQueryPostIndexWithFieldCounts(other.isSortQueryPostIndexWithFieldCounts()); 
this.setUseQueryTreeScanHintRules(other.isUseQueryTreeScanHintRules()); this.setQueryTreeScanHintRules(other.getQueryTreeScanHintRules()); this.setFieldIndexHoleMinThreshold(other.getFieldIndexHoleMinThreshold()); @@ -2651,6 +2662,14 @@ public void setTfNextSeek(int tfNextSeek) { this.tfNextSeek = tfNextSeek; } + public boolean isSeekingEventAggregation() { + return seekingEventAggregation; + } + + public void setSeekingEventAggregation(boolean seekingEventAggregation) { + this.seekingEventAggregation = seekingEventAggregation; + } + public long getVisitorFunctionMaxWeight() { return visitorFunctionMaxWeight; } @@ -2759,36 +2778,36 @@ public void setCachePreviouslyExpandedFields(boolean cachePreviouslyExpandedFiel this.cachePreviouslyExpandedFields = cachePreviouslyExpandedFields; } - public boolean getUseTermCounts() { - return useTermCounts; + public boolean isSortQueryPreIndexWithImpliedCounts() { + return sortQueryPreIndexWithImpliedCounts; } - public void setUseTermCounts(boolean useTermCounts) { - this.useTermCounts = useTermCounts; + public void setSortQueryPreIndexWithImpliedCounts(boolean sortQueryPreIndexWithImpliedCounts) { + this.sortQueryPreIndexWithImpliedCounts = sortQueryPreIndexWithImpliedCounts; } - public boolean getUseFieldCounts() { - return useFieldCounts; + public boolean isSortQueryPreIndexWithFieldCounts() { + return sortQueryPreIndexWithFieldCounts; } - public void setUseFieldCounts(boolean useFieldCounts) { - this.useFieldCounts = useFieldCounts; + public void setSortQueryPreIndexWithFieldCounts(boolean sortQueryPreIndexWithFieldCounts) { + this.sortQueryPreIndexWithFieldCounts = sortQueryPreIndexWithFieldCounts; } - public boolean isSortQueryBeforeGlobalIndex() { - return sortQueryBeforeGlobalIndex; + public boolean isSortQueryPostIndexWithFieldCounts() { + return sortQueryPostIndexWithFieldCounts; } - public void setSortQueryBeforeGlobalIndex(boolean sortQueryBeforeGlobalIndex) { - this.sortQueryBeforeGlobalIndex = sortQueryBeforeGlobalIndex; + public void setSortQueryPostIndexWithFieldCounts(boolean sortQueryPostIndexWithFieldCounts) { + this.sortQueryPostIndexWithFieldCounts = sortQueryPostIndexWithFieldCounts; } - public boolean isSortQueryByCounts() { - return sortQueryByCounts; + public boolean isSortQueryPostIndexWithTermCounts() { + return sortQueryPostIndexWithTermCounts; } - public void setSortQueryByCounts(boolean sortQueryByCounts) { - this.sortQueryByCounts = sortQueryByCounts; + public void setSortQueryPostIndexWithTermCounts(boolean sortQueryPostIndexWithTermCounts) { + this.sortQueryPostIndexWithTermCounts = sortQueryPostIndexWithTermCounts; } @Override @@ -2988,16 +3007,17 @@ public boolean equals(Object o) { getEventNextSeek() == that.getEventNextSeek() && getTfFieldSeek() == that.getTfFieldSeek() && getTfNextSeek() == that.getTfNextSeek() && + isSeekingEventAggregation() == that.isSeekingEventAggregation() && getVisitorFunctionMaxWeight() == that.getVisitorFunctionMaxWeight() && getQueryExecutionForPageTimeout() == that.getQueryExecutionForPageTimeout() && isLazySetMechanismEnabled() == that.isLazySetMechanismEnabled() && getDocAggregationThresholdMs() == that.getDocAggregationThresholdMs() && getTfAggregationThresholdMs() == that.getTfAggregationThresholdMs() && getPruneQueryOptions() == that.getPruneQueryOptions() && - getUseFieldCounts() == that.getUseFieldCounts() && - getUseTermCounts() == that.getUseTermCounts() && - isSortQueryBeforeGlobalIndex() == that.isSortQueryBeforeGlobalIndex() && - isSortQueryByCounts() == 
that.isSortQueryByCounts(); + isSortQueryPreIndexWithImpliedCounts() == that.isSortQueryPreIndexWithImpliedCounts() && + isSortQueryPreIndexWithFieldCounts() == that.isSortQueryPreIndexWithFieldCounts() && + isSortQueryPostIndexWithTermCounts() == that.isSortQueryPostIndexWithTermCounts() && + isSortQueryPostIndexWithFieldCounts() == that.isSortQueryPostIndexWithFieldCounts(); // @formatter:on } @@ -3192,16 +3212,18 @@ public int hashCode() { getEventNextSeek(), getTfFieldSeek(), getTfNextSeek(), + isSeekingEventAggregation(), getVisitorFunctionMaxWeight(), getQueryExecutionForPageTimeout(), isLazySetMechanismEnabled(), getDocAggregationThresholdMs(), getTfAggregationThresholdMs(), getPruneQueryOptions(), - getUseFieldCounts(), - getUseTermCounts(), - isSortQueryBeforeGlobalIndex(), - isSortQueryByCounts()); + isSortQueryPreIndexWithImpliedCounts(), + isSortQueryPreIndexWithFieldCounts(), + isSortQueryPostIndexWithTermCounts(), + isSortQueryPostIndexWithFieldCounts() + ); // @formatter:on } @@ -3227,4 +3249,12 @@ public List> getQueryTreeScanHintRules() { public void setQueryTreeScanHintRules(List> queryTreeScanHintRules) { this.queryTreeScanHintRules = queryTreeScanHintRules; } + + public long getMaxAnyFieldScanTimeMillis() { + return maxAnyFieldScanTimeMillis; + } + + public void setMaxAnyFieldScanTimeMillis(long maxAnyFieldScanTimeMillis) { + this.maxAnyFieldScanTimeMillis = maxAnyFieldScanTimeMillis; + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java index ec0987fdb88..3e2c3e21e95 100644 --- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java +++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveredThing.java @@ -3,14 +3,14 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.Objects; +import java.util.StringJoiner; import org.apache.commons.lang.builder.CompareToBuilder; import org.apache.hadoop.io.MapWritable; import org.apache.hadoop.io.VLongWritable; import org.apache.hadoop.io.WritableComparable; -import com.google.common.base.Objects; - import datawave.core.query.configuration.ResultContext; public class DiscoveredThing implements WritableComparable { @@ -86,6 +86,7 @@ public void readFields(DataInput in) throws IOException { @Override public int compareTo(DiscoveredThing o) { + CompareToBuilder cmp = new CompareToBuilder(); if (o == null) { return 1; @@ -96,28 +97,34 @@ public int compareTo(DiscoveredThing o) { cmp.append(getDate(), o.getDate()); cmp.append(getColumnVisibility(), o.getColumnVisibility()); cmp.append(getCount(), o.getCount()); + cmp.append(getCountsByColumnVisibility(), o.getCountsByColumnVisibility()); return cmp.toComparison(); } } @Override public boolean equals(Object o) { - if (!(o instanceof DiscoveredThing)) + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { return false; - DiscoveredThing other = (DiscoveredThing) o; - return Objects.equal(getTerm(), other.getTerm()) && Objects.equal(getField(), other.getField()) && Objects.equal(getType(), other.getType()) - && Objects.equal(getDate(), other.getDate()) && Objects.equal(getColumnVisibility(), other.getColumnVisibility()) - && Objects.equal(getCount(), other.getCount()); + } + DiscoveredThing that = (DiscoveredThing) o; + return Objects.equals(term, that.term) && Objects.equals(field, that.field) && Objects.equals(type, that.type) && Objects.equals(date,
that.date) + && Objects.equals(columnVisibility, that.columnVisibility) && Objects.equals(count, that.count) + && Objects.equals(countsByColumnVisibility, that.countsByColumnVisibility); } @Override public int hashCode() { - return Objects.hashCode(getTerm(), getField(), getType(), getDate(), getColumnVisibility(), getCount()); + return Objects.hash(term, field, type, date, columnVisibility, count, countsByColumnVisibility); } @Override public String toString() { - return "DiscoveredThing [term=" + term + ", field=" + field + ", type=" + type + ", date=" + date + ", columnVisibility=" + columnVisibility - + ", count=" + count + "]"; + return new StringJoiner(", ", DiscoveredThing.class.getSimpleName() + "[", "]").add("term='" + term + "'").add("field='" + field + "'") + .add("type='" + type + "'").add("date='" + date + "'").add("columnVisibility='" + columnVisibility + "'").add("count=" + count) + .add("countsByColumnVisibility=" + countsByColumnVisibility).toString(); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java index 1400308f3c2..404d9c29dda 100644 --- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryIterator.java @@ -1,14 +1,15 @@ package datawave.query.discovery; -import static com.google.common.collect.Collections2.filter; -import static com.google.common.collect.Collections2.transform; -import static com.google.common.collect.Lists.newArrayList; - import java.io.IOException; -import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.function.BiFunction; +import java.util.stream.Collectors; import org.apache.accumulo.core.data.ByteSequence; import org.apache.accumulo.core.data.Key; @@ -17,141 +18,312 @@ import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.IteratorEnvironment; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; -import org.apache.accumulo.core.util.Pair; +import org.apache.accumulo.core.security.ColumnVisibility; import org.apache.hadoop.io.ArrayWritable; -import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.MapWritable; +import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.apache.log4j.Logger; -import com.google.common.base.Predicates; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Multimap; +import com.google.protobuf.InvalidProtocolBufferException; + +import datawave.ingest.protobuf.Uid; +import datawave.marking.MarkingFunctions; +import datawave.query.Constants; public class DiscoveryIterator implements SortedKeyValueIterator { + private static final Logger log = Logger.getLogger(DiscoveryIterator.class); + private static final MarkingFunctions markingFunctions = MarkingFunctions.Factory.createMarkingFunctions(); - private Key tk; - private Value tv; - private SortedKeyValueIterator itr; + private Key key; + private Value value; + private SortedKeyValueIterator iterator; private boolean separateCountsByColVis = false; private boolean showReferenceCount = false; private boolean reverseIndex = false; + private boolean sumCounts = false; @Override public DiscoveryIterator 
deepCopy(IteratorEnvironment env) { - DiscoveryIterator i = new DiscoveryIterator(); - i.itr = itr.deepCopy(env); - return i; + DiscoveryIterator copy = new DiscoveryIterator(); + copy.iterator = iterator.deepCopy(env); + return copy; } @Override public void next() throws IOException { - tk = null; - tv = null; + this.key = null; + this.value = null; - while (itr.hasTop() && tk == null) { - Multimap terms = aggregateDate(); + while (iterator.hasTop() && key == null) { + // Get the entries to aggregate. + Multimap terms = getTermsByDatatype(); if (terms.isEmpty()) { - if (log.isTraceEnabled()) - log.trace("Couldn't aggregate index info; moving onto next date/field/term if data is available."); - continue; + log.trace("Couldn't aggregate index info; moving onto next date/field/term if data is available."); } else { - if (log.isTraceEnabled()) - log.trace("Received term info multimap of size [" + terms.size() + "]"); - ArrayList things = newArrayList( - filter(transform(terms.asMap().values(), new TermInfoAggregation(separateCountsByColVis, showReferenceCount, reverseIndex)), - Predicates.notNull())); - if (log.isTraceEnabled()) - log.trace("After conversion to discovery objects, there are [" + things.size() + "] term info objects."); - if (things.isEmpty()) { - continue; - } else { - Pair top = makeTop(things); - tk = top.getFirst(); - tv = top.getSecond(); + // Aggregate the entries. + List things = terms.asMap().values().stream().map(this::aggregate).filter(Objects::nonNull).collect(Collectors.toList()); + // Establish the next top of this iterator. + if (!things.isEmpty()) { + setTop(things); return; } } } - if (log.isTraceEnabled()) - log.trace("No data found."); + log.trace("No data found."); } - private Multimap aggregateDate() throws IOException { - Multimap terms = ArrayListMultimap.create(); - Key start = new Key(itr.getTopKey()), key = null; - while (itr.hasTop() && start.equals((key = itr.getTopKey()), PartialKey.ROW_COLFAM) && datesMatch(start, key)) { - TermInfo ti = new TermInfo(key, itr.getTopValue()); - if (ti.valid) - terms.put(ti.datatype, ti); + /** + * Return a multimap containing mappings of datatypes to term entries that should be aggregated. + */ + private Multimap getTermsByDatatype() throws IOException { + Multimap terms = ArrayListMultimap.create(); + Key start = new Key(iterator.getTopKey()); + Key key; + // If we should sum up counts, we want to collect the term entries for each date seen for the current field and term of start. Otherwise, we only want + // to collect the term entries for the current field, term, and date of start. + BiFunction dateMatchingFunction = sumCounts ? (first, second) -> true : this::datesMatch; + // Find all matching entries and parse term entries from them. + while (iterator.hasTop() && start.equals((key = iterator.getTopKey()), PartialKey.ROW_COLFAM) && dateMatchingFunction.apply(start, key)) { + TermEntry termEntry = new TermEntry(key, iterator.getTopValue()); + if (termEntry.isValid()) + terms.put(termEntry.getDatatype(), termEntry); else { - if (log.isTraceEnabled()) - log.trace("Received invalid term info from key: " + key); + if (log.isTraceEnabled()) { + log.trace("Received invalid term entry from key: " + key); + } } - itr.next(); + iterator.next(); } return terms; } - private static boolean datesMatch(Key reference, Key test) { - ByteSequence a = reference.getColumnQualifierData(), b = test.getColumnQualifierData(); + /** + * Return true if the dates for the two keys match, or false otherwise. 
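+ * Only the first eight bytes of each column qualifier (the yyyyMMdd date portion) are compared.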
+ */ + private boolean datesMatch(Key left, Key right) { + ByteSequence leftBytes = left.getColumnQualifierData(); + ByteSequence rightBytes = right.getColumnQualifierData(); for (int i = 0; i < 8; i++) { - if (a.byteAt(i) != b.byteAt(i)) { + if (leftBytes.byteAt(i) != rightBytes.byteAt(i)) { return false; } } return true; } - private Pair makeTop(List things) { - Writable[] returnedThings = new Writable[things.size()]; - for (int i = 0; i < returnedThings.length; ++i) - returnedThings[i] = things.get(i); - ArrayWritable aw = new ArrayWritable(DiscoveredThing.class); - aw.set(returnedThings); + /** + * Return the given term entries aggregated into a single {@link DiscoveredThing} if possible, or return null if any issues occurred. + */ + private DiscoveredThing aggregate(Collection termEntries) { + if (termEntries.isEmpty()) { + return null; + } else { + TermEntry first = termEntries.iterator().next(); + String term = reverseIndex ? new StringBuilder(first.getTerm()).reverse().toString() : first.getTerm(); + String date = sumCounts ? "" : first.date; + + Set visibilities = new HashSet<>(); + Map visibilityToCounts = new HashMap<>(); + long count = 0L; + + // Aggregate the counts and visibilities from each entry. + for (TermEntry termEntry : termEntries) { + // Fetch the count to aggregate based of whether we should show the term count or the reference count. + long currentCount = this.showReferenceCount ? termEntry.getUidListSize() : termEntry.getUidCount(); + try { + // Track the distinct visibilities seen. + visibilities.add(termEntry.getVisibility()); + // If counts by visibility should be tracked, do so. + if (this.separateCountsByColVis) { + String visibility = new String(termEntry.getVisibility().flatten()); + visibilityToCounts.compute(visibility, (k, v) -> v == null ? currentCount : v + currentCount); + } + } catch (Exception e) { + // If an exception occurs, skip to the next entry. + log.trace(e); + continue; + } + // Increment the overall count. + count += currentCount; + } + + // If we do not have a count greater than 0, return null. + if (count <= 0) { + if (log.isTraceEnabled()) { + log.trace("Did not aggregate any counts for [" + first.getTerm() + "][" + first.getField() + "][" + first.getDatatype() + "][" + + first.getDate() + "]. Returning null"); + } + return null; + } else { + // Otherwise, combine the visibilities, and return the aggregated result. + try { + ColumnVisibility visibility = markingFunctions.combine(visibilities); + MapWritable countsByVis = new MapWritable(); + visibilityToCounts.forEach((key, value) -> countsByVis.put(new Text(key), new LongWritable(value))); + return new DiscoveredThing(term, first.getField(), first.getDatatype(), date, new String(visibility.flatten()), count, countsByVis); + } catch (Exception e) { + if (log.isTraceEnabled()) { + log.warn("Invalid column visibilities after combining " + visibilities); + } + return null; + } + } + } + } + /** + * Set the top {@link Key} and {@link Value} of this iterator, created from the given list of {@link DiscoveredThing} instances. + */ + private void setTop(List things) { + // We want the key to be the last possible key for this date. Return the key as it is in the index (reversed if necessary) to ensure the keys are + // consistent with the initial seek range. DiscoveredThing thing = things.get(0); - // we want the key to be the last possible key for this date. Return the key as it is in the index (reversed if necessary) to - // ensure the keys are consistent with the initial seek range. 
- String row = (reverseIndex ? new StringBuilder().append(thing.getTerm()).reverse().toString() : thing.getTerm()); - return new Pair<>(new Key(row, thing.getField(), thing.getDate() + '\uffff'), new Value(WritableUtils.toByteArray(aw))); + String row = (this.reverseIndex ? new StringBuilder().append(thing.getTerm()).reverse().toString() : thing.getTerm()); + Key newKey = new Key(row, thing.getField(), thing.getDate() + "\uffff"); + + // Create a value from the list of things. + ArrayWritable thingArray = new ArrayWritable(DiscoveredThing.class, things.toArray(new DiscoveredThing[0])); + Value newValue = new Value(WritableUtils.toByteArray(thingArray)); + + this.key = newKey; + this.value = newValue; } @Override public void seek(Range range, Collection columnFamilies, boolean inclusive) throws IOException { - - itr.seek(range, columnFamilies, inclusive); - if (log.isTraceEnabled()) - log.trace("My source " + (itr.hasTop() ? "does" : "does not") + " have a top."); + this.iterator.seek(range, columnFamilies, inclusive); + if (log.isTraceEnabled()) { + log.trace("My source " + (this.iterator.hasTop() ? "does" : "does not") + " have a top."); + } next(); } @Override public void init(SortedKeyValueIterator source, Map options, IteratorEnvironment env) throws IOException { - itr = source; - separateCountsByColVis = Boolean.parseBoolean(options.get(DiscoveryLogic.SEPARATE_COUNTS_BY_COLVIS)); - showReferenceCount = Boolean.parseBoolean(options.get(DiscoveryLogic.SHOW_REFERENCE_COUNT)); - reverseIndex = Boolean.parseBoolean(options.get(DiscoveryLogic.REVERSE_INDEX)); + this.iterator = source; + this.separateCountsByColVis = Boolean.parseBoolean(options.get(DiscoveryLogic.SEPARATE_COUNTS_BY_COLVIS)); + this.showReferenceCount = Boolean.parseBoolean(options.get(DiscoveryLogic.SHOW_REFERENCE_COUNT)); + this.reverseIndex = Boolean.parseBoolean(options.get(DiscoveryLogic.REVERSE_INDEX)); + this.sumCounts = Boolean.parseBoolean(options.get(DiscoveryLogic.SUM_COUNTS)); if (log.isTraceEnabled()) { - log.trace("My source is a " + source.getClass().getName()); - log.trace("Separate counts by column visibility = " + separateCountsByColVis); - log.trace("Show reference count only = " + showReferenceCount); + log.trace("Source: " + source.getClass().getName()); + log.trace("Separate counts by column visibility: " + this.separateCountsByColVis); + log.trace("Show reference counts only: " + this.showReferenceCount); + log.trace("Reverse index: " + this.reverseIndex); + log.trace("Sum counts: " + this.sumCounts); } } @Override public boolean hasTop() { - return tk != null; + return key != null; } @Override public Key getTopKey() { - return tk; + return key; } @Override public Value getTopValue() { - return tv; + return value; + } + + /** + * Represents term information parsed from a {@link Key}, {@link Value} pair. + */ + private static class TermEntry { + + private final String term; + private final String field; + private String date; + private String datatype; + private ColumnVisibility visibility; + private long uidCount; + private long uidListSize; + private boolean valid; + + public TermEntry(Key key, Value value) { + term = key.getRow().toString(); + field = key.getColumnFamily().toString(); + + String colq = key.getColumnQualifier().toString(); + int firstSeparatorPos = colq.indexOf(Constants.NULL_BYTE_STRING); + if (firstSeparatorPos != -1) { + int lastSeparatorPos = colq.lastIndexOf(Constants.NULL_BYTE_STRING); + // If multiple separators are present, this is a task datatype entry. 
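+ // The two layouts handled here are datatype\0yyyyMMdd\0... (old knowledge entry) and shardId\0datatype; each branch below parses the date and datatype accordingly.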
+ if (firstSeparatorPos != lastSeparatorPos) { + // Ensure that we at least have yyyyMMdd. + if ((lastSeparatorPos - firstSeparatorPos) < 9) { + return; + } + // The form is datatype\0date\0task status (old knowledge entry). + date = colq.substring(firstSeparatorPos + 1, firstSeparatorPos + 9); + datatype = colq.substring(0, firstSeparatorPos); + } else { + // Ensure that we at least have yyyyMMdd. + if (firstSeparatorPos < 8) { + return; + } + // The form is shardId\0datatype. + date = colq.substring(0, 8); + datatype = colq.substring(firstSeparatorPos + 1); + } + + // Parse the UID.List object from the value. + try { + Uid.List uidList = Uid.List.parseFrom(value.get()); + if (uidList != null) { + uidCount = uidList.getCOUNT(); + uidListSize = uidList.getUIDList().size(); + } + } catch (InvalidProtocolBufferException e) { + // Don't add UID information. At least we know what shard it's located in. + } + + visibility = new ColumnVisibility(key.getColumnVisibility()); + + // This is now considered a valid entry for aggregation. + valid = true; + } + } + + public String getTerm() { + return term; + } + + public String getField() { + return field; + } + + public String getDate() { + return date; + } + + public String getDatatype() { + return datatype; + } + + public ColumnVisibility getVisibility() { + return visibility; + } + + public long getUidCount() { + return uidCount; + } + + public long getUidListSize() { + return uidListSize; + } + + public boolean isValid() { + return valid; + } } } diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java index 91424b1afb1..315a0b5343e 100644 --- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java +++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryLogic.java @@ -51,6 +51,7 @@ import datawave.core.query.configuration.QueryData; import datawave.data.type.Type; import datawave.microservice.query.Query; +import datawave.microservice.query.QueryImpl; import datawave.query.Constants; import datawave.query.QueryParameters; import datawave.query.discovery.FindLiteralsAndPatternsVisitor.QueryValues; @@ -72,18 +73,45 @@ public class DiscoveryLogic extends ShardIndexQueryTable { private static final Logger log = Logger.getLogger(DiscoveryLogic.class); + /** + * Used to specify if counts should be separated by column visibility. + */ public static final String SEPARATE_COUNTS_BY_COLVIS = "separate.counts.by.colvis"; + + /** + * Used to specify if reference counts should be shown instead of term counts. + */ public static final String SHOW_REFERENCE_COUNT = "show.reference.count"; + + /** + * Used to specify whether to sum up the counts instead of returning counts per date. + */ + public static final String SUM_COUNTS = "sum.counts"; + + /** + * Used to specify whether to search against the reversed index. + */ public static final String REVERSE_INDEX = "reverse.index"; + private DiscoveryQueryConfiguration config; private MetadataHelper metadataHelper; + /** + * Basic constructor. + */ public DiscoveryLogic() { super(); } + /** + * Copy constructor. 
+ * + * @param other + * the other logic to copy + */ public DiscoveryLogic(DiscoveryLogic other) { super(other); + this.config = new DiscoveryQueryConfiguration(other.config); this.metadataHelper = other.metadataHelper; } @@ -92,7 +120,6 @@ public DiscoveryQueryConfiguration getConfig() { if (this.config == null) { this.config = DiscoveryQueryConfiguration.create(); } - return this.config; } @@ -111,56 +138,51 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting log.debug("Query parameters set to " + settings.getParameters()); } - // Check if the default modelName and modelTableNames have been overriden by custom parameters. - if (null != settings.findParameter(QueryParameters.PARAMETER_MODEL_NAME) - && !settings.findParameter(QueryParameters.PARAMETER_MODEL_NAME).getParameterValue().trim().isEmpty()) { - setModelName(settings.findParameter(QueryParameters.PARAMETER_MODEL_NAME).getParameterValue().trim()); - } - if (null != settings.findParameter(QueryParameters.PARAMETER_MODEL_TABLE_NAME) - && !settings.findParameter(QueryParameters.PARAMETER_MODEL_TABLE_NAME).getParameterValue().trim().isEmpty()) { - setModelTableName(settings.findParameter(QueryParameters.PARAMETER_MODEL_TABLE_NAME).getParameterValue().trim()); - } + // Check if the default model name and model table name have been overridden. + setModelName(getOrDefault(settings, QueryParameters.PARAMETER_MODEL_NAME, getConfig().getModelName())); + setModelTableName(getOrDefault(settings, QueryParameters.PARAMETER_MODEL_TABLE_NAME, getConfig().getModelTableName())); - // Check if user would like counts separated by column visibility - if (null != settings.findParameter(SEPARATE_COUNTS_BY_COLVIS) - && !settings.findParameter(SEPARATE_COUNTS_BY_COLVIS).getParameterValue().trim().isEmpty()) { - boolean separateCountsByColVis = Boolean.valueOf(settings.findParameter(SEPARATE_COUNTS_BY_COLVIS).getParameterValue().trim()); - getConfig().setSeparateCountsByColVis(separateCountsByColVis); - } + // Check if counts should be separated by column visibility. + setSeparateCountsByColVis(getOrDefaultBoolean(settings, SEPARATE_COUNTS_BY_COLVIS, getSeparateCountsByColVis())); - // Check if user would like to show reference counts instead of term counts - if (null != settings.findParameter(SHOW_REFERENCE_COUNT) && !settings.findParameter(SHOW_REFERENCE_COUNT).getParameterValue().trim().isEmpty()) { - boolean showReferenceCount = Boolean.valueOf(settings.findParameter(SHOW_REFERENCE_COUNT).getParameterValue().trim()); - getConfig().setShowReferenceCount(showReferenceCount); - } + // Check if reference counts should be shown. + setShowReferenceCount(getOrDefaultBoolean(settings, SHOW_REFERENCE_COUNT, getShowReferenceCount())); + + // Check if counts should be summed. + setSumCounts(getOrDefaultBoolean(settings, SUM_COUNTS, getSumCounts())); + + // Check if any datatype filters were specified. + getConfig().setDatatypeFilter(getOrDefaultSet(settings, QueryParameters.DATATYPE_FILTER_SET, getConfig().getDatatypeFilter())); + + // Update the query model. 
setQueryModel(metadataHelper.getQueryModel(getModelTableName(), getModelName(), null)); - // get the data type filter set if any - if (null != settings.findParameter(QueryParameters.DATATYPE_FILTER_SET) - && !settings.findParameter(QueryParameters.DATATYPE_FILTER_SET).getParameterValue().trim().isEmpty()) { - Set dataTypeFilter = new HashSet<>(Arrays.asList(StringUtils - .split(settings.findParameter(QueryParameters.DATATYPE_FILTER_SET).getParameterValue().trim(), Constants.PARAM_VALUE_SEP))); - getConfig().setDatatypeFilter(dataTypeFilter); - if (log.isDebugEnabled()) { - log.debug("Data type filter set to " + dataTypeFilter); - } - } - // Set the connector + // Set the currently indexed fields + getConfig().setIndexedFields(metadataHelper.getIndexedFields(Collections.emptySet())); + + // Set the connector. getConfig().setClient(client); - // Set the auths + + // Set the auths. getConfig().setAuthorizations(auths); - // Get the ranges + // Get the ranges. getConfig().setBeginDate(settings.getBeginDate()); getConfig().setEndDate(settings.getEndDate()); - if (null == getConfig().getBeginDate() || null == getConfig().getEndDate()) { - getConfig().setBeginDate(new Date(0)); + // If a begin date was not specified, default to the earliest date. + if (getConfig().getBeginDate() == null) { + getConfig().setBeginDate(new Date(0L)); + log.warn("Begin date not specified, using earliest begin date."); + } + + // If an end date was not specified, default to the latest date. + if (getConfig().getEndDate() == null) { getConfig().setEndDate(new Date(Long.MAX_VALUE)); - log.warn("Dates not specified, using entire date range"); + log.warn("End date not specified, using latest end date."); } - // start with a trimmed version of the query, converted to JEXL + // Start with a trimmed version of the query, converted to JEXL LuceneToJexlQueryParser parser = new LuceneToJexlQueryParser(); parser.setAllowLeadingWildCard(isAllowLeadingWildcard()); QueryNode node = parser.parse(settings.getQuery().trim()); @@ -173,9 +195,9 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting // Parse & flatten the query ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(getConfig().getQueryString()); + CaseSensitivityVisitor.upperCaseIdentifiers(getConfig(), metadataHelper, script); - script = CaseSensitivityVisitor.upperCaseIdentifiers(getConfig(), metadataHelper, script); - + // Apply the query model. Set dataTypes = getConfig().getDatatypeFilter(); Set allFields; allFields = metadataHelper.getAllFields(dataTypes); @@ -183,14 +205,13 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting QueryValues literalsAndPatterns = FindLiteralsAndPatternsVisitor.find(script); Stopwatch timer = Stopwatch.createStarted(); - // no caching for getAllNormalizers, so try some magic with getFields... + // No caching for getAllNormalizers, so try some magic with getFields... Multimap> dataTypeMap = ArrayListMultimap.create(metadataHelper.getFieldsToDatatypes(getConfig().getDatatypeFilter())); - /* - * we have a mapping of FIELD->DataType, but not a mapping of ANYFIELD->DataType which should be all dataTypes - */ - dataTypeMap.putAll(Constants.ANY_FIELD, uniqueByType(dataTypeMap.values())); + // We have a mapping of FIELD->DataType, but not a mapping of ANYFIELD->DataType which should be all datatypes. 
+ dataTypeMap.putAll(Constants.ANY_FIELD, getUniqueTypes(dataTypeMap.values())); timer.stop(); log.debug("Took " + timer.elapsed(TimeUnit.MILLISECONDS) + "ms to get all the dataTypes."); + getConfig().setLiterals(normalize(new LiteralNormalization(), literalsAndPatterns.getLiterals(), dataTypeMap)); getConfig().setPatterns(normalize(new PatternNormalization(), literalsAndPatterns.getPatterns(), dataTypeMap)); getConfig().setRanges(normalizeRanges(new LiteralNormalization(), literalsAndPatterns.getRanges(), dataTypeMap)); @@ -199,44 +220,143 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting log.debug("Normalized Patterns = " + getConfig().getPatterns()); } + // Set the planned queries to execute. getConfig().setQueries(createQueries(getConfig())); return getConfig(); } - public List createQueries(DiscoveryQueryConfiguration config) throws QueryException, TableNotFoundException, IOException, ExecutionException { - final List queries = Lists.newLinkedList(); + /** + * If present, return the value of the given parameter from the given settings, or return the default value otherwise. + */ + private String getOrDefault(Query settings, String parameterName, String defaultValue) { + String value = getTrimmedParameter(settings, parameterName); + return StringUtils.isBlank(value) ? defaultValue : value; + } - Set familiesToSeek = Sets.newHashSet(); - Pair,Set> seekRanges = makeRanges(getConfig(), familiesToSeek, metadataHelper); - Collection forward = seekRanges.getValue0(); + /** + * If present, return the value of the given parameter from the given settings as a boolean, or return the default value otherwise. + */ + private boolean getOrDefaultBoolean(Query settings, String parameterName, boolean defaultValue) { + String value = getTrimmedParameter(settings, parameterName); + log.debug("Trimmed value for " + parameterName + ": " + value); + return StringUtils.isBlank(value) ? defaultValue : Boolean.parseBoolean(value); + } - if (!forward.isEmpty()) { - List settings = getIteratorSettingsForDiscovery(getConfig(), getConfig().getLiterals(), getConfig().getPatterns(), - getConfig().getRanges(), false); - if (isCheckpointable()) { - // if checkpointable, then only one range per query data so that the whole checkpointing thing works correctly - for (Range range : forward) { - queries.add(new QueryData(config.getIndexTableName(), null, Collections.singleton(range), familiesToSeek, settings)); + /** + * If present, return the value of the given parameter from the given settings as a set, or return the default value otherwise. + */ + private Set getOrDefaultSet(Query settings, String parameterName, Set defaultValue) { + String value = getTrimmedParameter(settings, parameterName); + return StringUtils.isBlank(value) ? defaultValue : new HashSet<>(Arrays.asList(StringUtils.split(value, Constants.PARAM_VALUE_SEP))); + } + + /** + * Return the trimmed value of the given parameter from the given settings, or null if a value is not present. + */ + private String getTrimmedParameter(Query settings, String parameterName) { + QueryImpl.Parameter parameter = settings.findParameter(parameterName); + return parameter != null ? parameter.getParameterValue().trim() : null; + } + + /** + * Given a sequence of objects of type T, this method will return a single object for every unique type passed in. This is used to dedupe normalizer + * instances by their type, so that we only get 1 instance per type of normalizer. 
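+ * For example, several LcNoDiacriticsType instances and one NumberType would be reduced to a single instance of each.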
+ */ + private Collection> getUniqueTypes(Iterable> things) { + Map,Type> map = Maps.newHashMap(); + for (Type t : things) { + map.put(t.getClass(), t); + } + return map.values(); + } + + /** + * This attempts to normalize all of the {@code } tuples with the corresponding {@code } tuple. The Normalization object + * will determine whether a regex or literal is being normalized. + * + * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations. + * + * @param normalization + * the normalizer object + * @param valuesToFields + * mapping of values to fields + * @param dataTypeMap + * the data type map + * @return a mapping of the normalized tuples + */ + private Multimap normalize(Normalization normalization, Multimap valuesToFields, Multimap> dataTypeMap) { + Multimap normalizedValuesToFields = HashMultimap.create(); + for (Entry valueAndField : valuesToFields.entries()) { + String value = valueAndField.getKey(), field = valueAndField.getValue(); + for (Type dataType : dataTypeMap.get(field)) { + try { + log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]"); + String normalized = normalization.normalize(dataType, field, value); + normalizedValuesToFields.put(normalized, field); + log.debug("Normalization succeeded!"); + } catch (Exception exception) { + log.debug("Normalization failed."); } - } else { - queries.add(new QueryData(config.getIndexTableName(), null, forward, familiesToSeek, settings)); } } + return normalizedValuesToFields; + } - Collection reverse = seekRanges.getValue1(); - if (!reverse.isEmpty()) { - List settings = getIteratorSettingsForDiscovery(getConfig(), getConfig().getLiterals(), getConfig().getPatterns(), - getConfig().getRanges(), true); - if (isCheckpointable()) { - // if checkpointable, then only one range per query data so that the whole checkpointing thing works correctly - for (Range range : reverse) { - queries.add(new QueryData(config.getReverseIndexTableName(), null, Collections.singleton(range), familiesToSeek, settings)); + /** + * This attempts to normalize all of the {@code } tuples with the corresponding {@code } tuple. The Normalization object + * will determine whether a regex or literal is being normalized. + * + * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations. 
+ * + * @param normalization + * the normalizer object + * @param valuesToFields + * mapping of values to fields + * @param dataTypeMap + * the data type map + * @return a mapping of the normalized ranges + */ + private Multimap> normalizeRanges(Normalization normalization, Multimap> valuesToFields, + Multimap> dataTypeMap) { + Multimap> normalizedValuesToFields = HashMultimap.create(); + for (Entry> valueAndField : valuesToFields.entries()) { + String field = valueAndField.getKey(); + LiteralRange value = valueAndField.getValue(); + for (Type dataType : dataTypeMap.get(field)) { + try { + log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]"); + String normalizedLower = normalization.normalize(dataType, field, value.getLower().toString()); + String normalizedUpper = normalization.normalize(dataType, field, value.getUpper().toString()); + normalizedValuesToFields.put(field, new LiteralRange<>(normalizedLower, value.isLowerInclusive(), normalizedUpper, value.isUpperInclusive(), + value.getFieldName(), value.getNodeOperand())); + log.debug("Normalization succeeded!"); + } catch (Exception exception) { + log.debug("Normalization failed."); } - } else { - queries.add(new QueryData(config.getReverseIndexTableName(), null, reverse, familiesToSeek, settings)); } } + return normalizedValuesToFields; + } + + /** + * Create and return a list of planned queries. + * + * @param config + * the config + * @return the list of query data + */ + private List createQueries(DiscoveryQueryConfiguration config) throws TableNotFoundException, ExecutionException { + final List queries = Lists.newLinkedList(); + + Set familiesToSeek = Sets.newHashSet(); // This will be populated by createRanges(). + Pair,Set> seekRanges = createRanges(config, familiesToSeek, metadataHelper); + + // Create the forward queries. + queries.addAll(createQueriesFromRanges(config, seekRanges.getValue0(), familiesToSeek, false)); + + // Create the reverse queries. + queries.addAll(createQueriesFromRanges(config, seekRanges.getValue1(), familiesToSeek, true)); if (log.isDebugEnabled()) { log.debug("Created ranges: " + queries); @@ -245,67 +365,161 @@ public List createQueries(DiscoveryQueryConfiguration config) throws return queries; } - @Override - public void setupQuery(GenericQueryConfiguration genericConfig) throws QueryException, TableNotFoundException, IOException, ExecutionException { - if (!genericConfig.getClass().getName().equals(DiscoveryQueryConfiguration.class.getName())) { - throw new QueryException("Did not receive a DiscoveryQueryConfiguration instance!!"); + /** + * Create planned queries for the given ranges. + * + * @param config + * the config + * @param ranges + * the ranges + * @param familiesToSeek + * the families to seek + * @param reversed + * whether the ranges are for the reversed index + * @return the queries + */ + private List createQueriesFromRanges(DiscoveryQueryConfiguration config, Set ranges, Set familiesToSeek, boolean reversed) { + List queries = new ArrayList<>(); + if (!ranges.isEmpty()) { + List settings = getIteratorSettings(config, reversed); + String tableName = reversed ? 
config.getReverseIndexTableName() : config.getIndexTableName(); + if (isCheckpointable()) { + for (Range range : ranges) { + queries.add(new QueryData(tableName, null, Collections.singleton(range), familiesToSeek, settings)); + } + } else { + queries.add(new QueryData(tableName, null, ranges, familiesToSeek, settings)); + } } - this.config = (DiscoveryQueryConfiguration) genericConfig; - final List> iterators = Lists.newArrayList(); + return queries; + } - for (QueryData qd : config.getQueries()) { - if (log.isDebugEnabled()) { - log.debug("Creating scanner for " + qd); + /** + * Creates two collections of ranges: one for the forward index (value0) and one for the reverse index (value1). If a literal has a field name, then the + * Range for that term will include the column family. If there are multiple fields, then multiple ranges are created. + * + * @param config + * the discovery config + * @param familiesToSeek + * the families to seek + * @param metadataHelper + * a metadata helper + * @return a pair of ranges + * @throws TableNotFoundException + * if the table is not found + * @throws ExecutionException + * for execution exceptions + */ + private Pair,Set> createRanges(DiscoveryQueryConfiguration config, Set familiesToSeek, MetadataHelper metadataHelper) + throws TableNotFoundException, ExecutionException { + Set forwardRanges = new HashSet<>(); + Set reverseRanges = new HashSet<>(); + + // Evaluate the literals. + for (Entry literalAndField : config.getLiterals().entries()) { + String literal = literalAndField.getKey(), field = literalAndField.getValue(); + // If the field is _ANYFIELD_, use null when making the range. + field = Constants.ANY_FIELD.equals(field) ? null : field; + // Mark the field as a family to seek if not null. + if (field != null) { + familiesToSeek.add(field); } - // scan the table - BatchScanner bs = scannerFactory.newScanner(qd.getTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery()); + forwardRanges.add(ShardIndexQueryTableStaticMethods.getLiteralRange(field, literal)); + } - bs.setRanges(qd.getRanges()); - for (IteratorSetting setting : qd.getSettings()) { - bs.addScanIterator(setting); + // Evaluate the ranges. + for (Entry> rangeEntry : config.getRanges().entries()) { + LiteralRange range = rangeEntry.getValue(); + String field = rangeEntry.getKey(); + // If the field is _ANYFIELD_, use null when making the range. + field = Constants.ANY_FIELD.equals(field) ? null : field; + // Mark the field as a family to seek if not null. + if (field != null) { + familiesToSeek.add(field); } - for (String cf : qd.getColumnFamilies()) { - bs.fetchColumnFamily(new Text(cf)); + try { + forwardRanges.add(ShardIndexQueryTableStaticMethods.getBoundedRangeRange(range)); + } catch (IllegalRangeArgumentException e) { + log.error("Error using range [" + range + "]", e); } + } - iterators.add(transformScanner(bs, qd)); + // Evaluate the patterns. + for (Entry patternAndField : config.getPatterns().entries()) { + String pattern = patternAndField.getKey(), field = patternAndField.getValue(); + // If the field is _ANYFIELD_, use null when making the range. + field = Constants.ANY_FIELD.equals(field) ? null : field; + // Mark the field as a family to seek if not null. 
+ if (field != null) { + familiesToSeek.add(field); + } + ShardIndexQueryTableStaticMethods.RefactoredRangeDescription description; + try { + description = ShardIndexQueryTableStaticMethods.getRegexRange(field, pattern, false, metadataHelper, config); + } catch (JavaRegexParseException e) { + log.error("Error parsing pattern [" + pattern + "]", e); + continue; + } + if (description.isForReverseIndex) { + reverseRanges.add(description.range); + } else { + forwardRanges.add(description.range); + } } - this.iterator = concat(iterators.iterator()); - } - public static List getIteratorSettingsForDiscovery(DiscoveryQueryConfiguration config, Multimap literals, - Multimap patterns, Multimap> ranges, boolean reverseIndex) { + return Pair.with(forwardRanges, reverseRanges); + } + /** + * Return the set of iterator settings that should be applied to queries for the given configuration. + * + * @param config + * the config + * @param reverseIndex + * whether the iterator settings should be configured for a reversed index + * @return the iterator settings + */ + private List getIteratorSettings(DiscoveryQueryConfiguration config, boolean reverseIndex) { List settings = Lists.newLinkedList(); - // The begin date from the query may be down to the second, for doing lookups in the index we want to use the day because - // the times in the index table have been truncated to the day. + + // Add a date range filter. + // The begin date from the query may be down to the second, for doing look-ups in the index we want to use the day because the times in the index table + // have been truncated to the day. Date begin = DateUtils.truncate(config.getBeginDate(), Calendar.DAY_OF_MONTH); - // we don't need to bump up the end date any more because it's not apart of the range set on the scanner + // we don't need to bump up the end date any more because it's not a part of the range set on the scanner. Date end = config.getEndDate(); - LongRange dateRange = new LongRange(begin.getTime(), end.getTime()); - settings.add(ShardIndexQueryTableStaticMethods.configureGlobalIndexDateRangeFilter(config, dateRange)); + + // Add a datatype filter. settings.add(ShardIndexQueryTableStaticMethods.configureGlobalIndexDataTypeFilter(config, config.getDatatypeFilter())); - IteratorSetting matchingIterator = configureIndexMatchingIterator(config, literals, patterns, ranges, reverseIndex); + // Add an iterator to match literals, patterns, and ranges against the index. + IteratorSetting matchingIterator = configureIndexMatchingIterator(config, reverseIndex); if (matchingIterator != null) { settings.add(matchingIterator); } - IteratorSetting discoveryIteratorSetting = new IteratorSetting(config.getBaseIteratorPriority() + 50, DiscoveryIterator.class); - discoveryIteratorSetting.addOption(REVERSE_INDEX, Boolean.toString(reverseIndex)); - discoveryIteratorSetting.addOption(SEPARATE_COUNTS_BY_COLVIS, config.getSeparateCountsByColVis().toString()); - if (config.getShowReferenceCount()) { - discoveryIteratorSetting.addOption(SHOW_REFERENCE_COUNT, config.getShowReferenceCount().toString()); - } - settings.add(discoveryIteratorSetting); + // Add an iterator to create the actual DiscoveryThings. + settings.add(configureDiscoveryIterator(config, reverseIndex)); return settings; } - public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQueryConfiguration config, Multimap literals, - Multimap patterns, Multimap> ranges, boolean reverseIndex) { + /** + * Return a {@link IteratorSetting} for an {@link IndexMatchingIterator}. 
+ * + * @param config + * the config + * @param reverseIndex + * whether searching against the reversed index. + * @return the iterator setting + */ + private IteratorSetting configureIndexMatchingIterator(DiscoveryQueryConfiguration config, boolean reverseIndex) { + Multimap literals = config.getLiterals(); + Multimap patterns = config.getPatterns(); + Multimap> ranges = config.getRanges(); + if ((literals == null || literals.isEmpty()) && (patterns == null || patterns.isEmpty()) && (ranges == null || ranges.isEmpty())) { return null; } @@ -314,6 +528,7 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer IteratorSetting cfg = new IteratorSetting(config.getBaseIteratorPriority() + 23, "termMatcher", IndexMatchingIterator.class); IndexMatchingIterator.Configuration conf = new IndexMatchingIterator.Configuration(); + // Add literals. if (literals != null) { for (Entry literal : literals.entries()) { if (Constants.ANY_FIELD.equals(literal.getValue())) { @@ -323,6 +538,7 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer } } } + // Add patterns. if (patterns != null) { for (Entry pattern : patterns.entries()) { if (Constants.ANY_FIELD.equals(pattern.getValue())) { @@ -332,6 +548,7 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer } } } + // Add ranges. if (ranges != null) { for (Entry> range : ranges.entries()) { if (Constants.ANY_FIELD.equals(range.getKey())) { @@ -343,25 +560,73 @@ public static final IteratorSetting configureIndexMatchingIterator(DiscoveryQuer } cfg.addOption(IndexMatchingIterator.CONF, IndexMatchingIterator.gson().toJson(conf)); - cfg.addOption(IndexMatchingIterator.REVERSE_INDEX, Boolean.toString(reverseIndex)); return cfg; } + /** + * Return an {@link IteratorSetting} for an {@link DiscoveryIterator}. + * + * @param config + * the config + * @param reverseIndex + * whether searching against the reversed index. 
+ * @return the iterator setting + */ + private IteratorSetting configureDiscoveryIterator(DiscoveryQueryConfiguration config, boolean reverseIndex) { + IteratorSetting setting = new IteratorSetting(config.getBaseIteratorPriority() + 50, DiscoveryIterator.class); + setting.addOption(REVERSE_INDEX, Boolean.toString(reverseIndex)); + setting.addOption(SEPARATE_COUNTS_BY_COLVIS, Boolean.toString(config.getSeparateCountsByColVis())); + setting.addOption(SHOW_REFERENCE_COUNT, Boolean.toString(config.getShowReferenceCount())); + setting.addOption(SUM_COUNTS, Boolean.toString(config.getSumCounts())); + return setting; + } + + @Override + public void setupQuery(GenericQueryConfiguration genericConfig) throws QueryException, TableNotFoundException, IOException, ExecutionException { + if (!genericConfig.getClass().getName().equals(DiscoveryQueryConfiguration.class.getName())) { + throw new QueryException("Did not receive a DiscoveryQueryConfiguration instance!!"); + } + this.config = (DiscoveryQueryConfiguration) genericConfig; + final List> iterators = Lists.newArrayList(); + + for (QueryData qd : config.getQueries()) { + if (log.isDebugEnabled()) { + log.debug("Creating scanner for " + qd); + } + // scan the table + BatchScanner bs = scannerFactory.newScanner(qd.getTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery()); + + bs.setRanges(qd.getRanges()); + for (IteratorSetting setting : qd.getSettings()) { + bs.addScanIterator(setting); + } + for (String cf : qd.getColumnFamilies()) { + bs.fetchColumnFamily(new Text(cf)); + } + + iterators.add(transformScanner(bs, qd, config.getIndexedFields())); + } + this.iterator = concat(iterators.iterator()); + } + @Override public ShardIndexQueryTable clone() { return new DiscoveryLogic(this); } /** - * Takes in a batch scanner and returns an iterator over the DiscoveredThing objects contained in the value. + * Takes in a batch scanner, removes all DiscoveredThings that do not have an indexed field, and returns an iterator over the DiscoveredThing objects + * contained in the value. * * @param scanner * a batch scanner + * @param indexedFields + * set of currently indexed fields * @return iterator for discoveredthings */ - public static Iterator transformScanner(final BatchScanner scanner, final QueryData queryData) { + private Iterator transformScanner(final BatchScanner scanner, final QueryData queryData, Set indexedFields) { return concat(transform(scanner.iterator(), new Function,Iterator>() { DataInputBuffer in = new DataInputBuffer(); @@ -379,190 +644,47 @@ public Iterator apply(Entry from) { } ArrayList thangs = Lists.newArrayListWithCapacity(aw.get().length); for (Writable w : aw.get()) { - thangs.add((DiscoveredThing) w); + // Check to see if the field is currently indexed, if it's not, we should NOT be adding it to 'thangs' + if (indexedFields.contains(((DiscoveredThing) w).getField())) { + thangs.add((DiscoveredThing) w); + } else { + log.debug(((DiscoveredThing) w).getField() + " was NOT found in IndexedFields"); + } } return thangs.iterator(); } })); } - /** - * Makes two collections of ranges: one for the forward index (value0) and one for the reverse index (value1). - * - * If a literal has a field name, then the Range for that term will include the column family. If there are multiple fields, then multiple ranges are - * created. 
- * - * @param config - * the discovery config - * @param familiesToSeek - * the families to seek - * @param metadataHelper - * a metadata helper - * @return a pair of ranges - * @throws TableNotFoundException - * if the table is not found - * @throws ExecutionException - * for execution exceptions - */ - @SuppressWarnings("unchecked") - public static Pair,Set> makeRanges(DiscoveryQueryConfiguration config, Set familiesToSeek, MetadataHelper metadataHelper) - throws TableNotFoundException, ExecutionException { - Set forwardRanges = new HashSet<>(); - for (Entry literalAndField : config.getLiterals().entries()) { - String literal = literalAndField.getKey(), field = literalAndField.getValue(); - // if we're _ANYFIELD_, then use null when making the literal range - field = Constants.ANY_FIELD.equals(field) ? null : field; - if (field != null) { - familiesToSeek.add(field); - } - forwardRanges.add(ShardIndexQueryTableStaticMethods.getLiteralRange(field, literal)); - } - for (Entry> rangeEntry : config.getRanges().entries()) { - LiteralRange range = rangeEntry.getValue(); - String field = rangeEntry.getKey(); - // if we're _ANYFIELD_, then use null when making the literal range - field = Constants.ANY_FIELD.equals(field) ? null : field; - if (field != null) { - familiesToSeek.add(field); - } - try { - forwardRanges.add(ShardIndexQueryTableStaticMethods.getBoundedRangeRange(range)); - } catch (IllegalRangeArgumentException e) { - log.error("Error using range [" + range + "]", e); - continue; - } - } - Set reverseRanges = new HashSet<>(); - for (Entry patternAndField : config.getPatterns().entries()) { - String pattern = patternAndField.getKey(), field = patternAndField.getValue(); - // if we're _ANYFIELD_, then use null when making the literal range - field = Constants.ANY_FIELD.equals(field) ? null : field; - ShardIndexQueryTableStaticMethods.RefactoredRangeDescription description; - try { - if (field != null) { - familiesToSeek.add(field); - } - description = ShardIndexQueryTableStaticMethods.getRegexRange(field, pattern, false, metadataHelper, config); - } catch (JavaRegexParseException e) { - log.error("Error parsing pattern [" + pattern + "]", e); - continue; - } - if (description.isForReverseIndex) { - reverseRanges.add(description.range); - } else { - forwardRanges.add(description.range); - } - } - return Pair.with(forwardRanges, reverseRanges); - } - - /** - * This attempts to normalize all of the {@code } tuples with the corresponding {@code } tuple. The Normalization object - * will determine whether or not a regex or literal is being normalized. - * - * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations. 
- * - * @param normalization - * the normalizer object - * @param valuesToFields - * mapping of values to fields - * @param dataTypeMap - * the data type map - * @return a mapping of the noramlized tuples - */ - public static Multimap normalize(Normalization normalization, Multimap valuesToFields, Multimap> dataTypeMap) { - Multimap normalizedValuesToFields = HashMultimap.create(); - for (Entry valueAndField : valuesToFields.entries()) { - String value = valueAndField.getKey(), field = valueAndField.getValue(); - for (Type dataType : dataTypeMap.get(field)) { - try { - log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]"); - String normalized = normalization.normalize(dataType, field, value); - normalizedValuesToFields.put(normalized, field); - log.debug("Normalization succeeded!"); - } catch (Exception exception) { - log.debug("Normalization failed."); - } - } - } - return normalizedValuesToFields; - } - - /** - * This attempts to normalize all of the {@code } tuples with the corresponding {@code } tuple. The Normalization object - * will determine whether or not a regex or literal is being normalized. - * - * See the {@link PatternNormalization} and {@link LiteralNormalization} implementations. - * - * @param normalization - * the normalizer object - * @param valuesToFields - * mapping of values to fields - * @param dataTypeMap - * the data type map - * @return a mapping of the normalized ranges - */ - public static Multimap> normalizeRanges(Normalization normalization, Multimap> valuesToFields, - Multimap> dataTypeMap) { - Multimap> normalizedValuesToFields = HashMultimap.create(); - for (Entry> valueAndField : valuesToFields.entries()) { - String field = valueAndField.getKey(); - LiteralRange value = valueAndField.getValue(); - for (Type dataType : dataTypeMap.get(field)) { - try { - log.debug("Attempting to normalize [" + value + "] with [" + dataType.getClass() + "]"); - String normalizedLower = normalization.normalize(dataType, field, value.getLower().toString()); - String normalizedUpper = normalization.normalize(dataType, field, value.getUpper().toString()); - normalizedValuesToFields.put(field, new LiteralRange<>(normalizedLower, value.isLowerInclusive(), normalizedUpper, value.isUpperInclusive(), - value.getFieldName(), value.getNodeOperand())); - log.debug("Normalization succeeded!"); - } catch (Exception exception) { - log.debug("Normalization failed."); - } - } - } - return normalizedValuesToFields; - } - - /** - * Given a sequence of objects of type T, this method will return a single object for every unique type passed in. This is used to dedupe normalizer - * instances by their type, so that we only get 1 instance per type of normalizer. 
- * - * @param things - * iterable list of objects - * @param - * type of the objects - * @return an object for each type passed in - */ - public static Collection uniqueByType(Iterable things) { - Map,T> map = Maps.newHashMap(); - for (T t : things) { - map.put(t.getClass(), t); - } - return map.values(); - } - @Override public Set getOptionalQueryParameters() { Set params = super.getOptionalQueryParameters(); params.add(SEPARATE_COUNTS_BY_COLVIS); + params.add(SUM_COUNTS); return params; } - public Boolean getSeparateCountsByColVis() { + public boolean getSeparateCountsByColVis() { return getConfig().getSeparateCountsByColVis(); } - public void setSeparateCountsByColVis(Boolean separateCountsByColVis) { + public void setSeparateCountsByColVis(boolean separateCountsByColVis) { getConfig().setSeparateCountsByColVis(separateCountsByColVis); } - public Boolean getShowReferenceCount() { + public boolean getShowReferenceCount() { return getConfig().getShowReferenceCount(); } - public void setShowReferenceCount(Boolean showReferenceCount) { + public void setShowReferenceCount(boolean showReferenceCount) { getConfig().setShowReferenceCount(showReferenceCount); } + public boolean getSumCounts() { + return getConfig().getSumCounts(); + } + + public void setSumCounts(boolean sumCounts) { + getConfig().setSumCounts(sumCounts); + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java index 13c8fa25d75..59d09666450 100644 --- a/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java +++ b/warehouse/query-core/src/main/java/datawave/query/discovery/DiscoveryQueryConfiguration.java @@ -3,6 +3,7 @@ import java.io.Serializable; import java.util.Collection; import java.util.Objects; +import java.util.StringJoiner; import com.google.common.collect.Multimap; @@ -17,8 +18,9 @@ public class DiscoveryQueryConfiguration extends ShardIndexQueryConfiguration implements Serializable { private Multimap literals, patterns; private Multimap> ranges; - private Boolean separateCountsByColVis = false; - private Boolean showReferenceCount = false; + private boolean separateCountsByColVis = false; + private boolean showReferenceCount = false; + private boolean sumCounts = false; public DiscoveryQueryConfiguration() {} @@ -116,23 +118,31 @@ public void setPatterns(Multimap patterns) { this.patterns = patterns; } - public Boolean getSeparateCountsByColVis() { + public boolean getSeparateCountsByColVis() { return separateCountsByColVis; } - public Boolean getShowReferenceCount() { + public boolean getShowReferenceCount() { return showReferenceCount; } + public boolean getSumCounts() { + return sumCounts; + } + public void setSeparateCountsByColVis(boolean separateCountsByColVis) { this.separateCountsByColVis = separateCountsByColVis; } - public void setShowReferenceCount(Boolean showReferenceCount) { + public void setShowReferenceCount(boolean showReferenceCount) { this.showReferenceCount = showReferenceCount; } + public void setSumCounts(boolean sumCounts) { + this.sumCounts = sumCounts; + } + @Override public DiscoveryQueryConfiguration checkpoint() { // Create a new config that only contains what is needed to execute the specified ranges @@ -156,4 +166,11 @@ public boolean equals(Object o) { public int hashCode() { return Objects.hash(super.hashCode(), literals, patterns, ranges, separateCountsByColVis, showReferenceCount); } + + @Override 
+ public String toString() { + return new StringJoiner(", ", DiscoveryQueryConfiguration.class.getSimpleName() + "[", "]").add("literals=" + literals).add("patterns=" + patterns) + .add("ranges=" + ranges).add("separateCountsByColVis=" + separateCountsByColVis).add("showReferenceCount=" + showReferenceCount) + .add("sumCounts=" + sumCounts).toString(); + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/exceptions/DatawaveAsyncOperationException.java b/warehouse/query-core/src/main/java/datawave/query/exceptions/DatawaveAsyncOperationException.java new file mode 100644 index 00000000000..d8296f1e68f --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/exceptions/DatawaveAsyncOperationException.java @@ -0,0 +1,32 @@ +package datawave.query.exceptions; + +import datawave.query.planner.DefaultQueryPlanner; + +/** + * An exception thrown when the {@link DefaultQueryPlanner} encounters a problem during an async operation like fetching field sets or serializing iterator + * options in another thread + */ +public class DatawaveAsyncOperationException extends RuntimeException { + + private static final long serialVersionUID = -5455973957749708049L; + + public DatawaveAsyncOperationException() { + super(); + } + + public DatawaveAsyncOperationException(String message, Throwable cause) { + super(message, cause); + } + + public DatawaveAsyncOperationException(String message) { + super(message); + } + + public DatawaveAsyncOperationException(Throwable cause) { + super(cause); + } + + protected DatawaveAsyncOperationException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/function/DescendantCountFunction.java b/warehouse/query-core/src/main/java/datawave/query/function/DescendantCountFunction.java index 1aa09e1753d..e21d78aa7de 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/DescendantCountFunction.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/DescendantCountFunction.java @@ -26,6 +26,8 @@ import org.apache.hadoop.io.WritableComparable; import org.apache.log4j.Logger; +import com.google.common.collect.Lists; + import datawave.data.hash.UID; import datawave.data.hash.UIDConstants; import datawave.query.Constants; @@ -75,9 +77,9 @@ public class DescendantCountFunction implements SourcedFunction columnFamilies = KeyToDocumentData.columnFamilies; + private Collection columnFamilies = Lists.newArrayList(new ArrayByteSequence("tf"), new ArrayByteSequence("d")); - private boolean inclusive = KeyToDocumentData.inclusive; + private boolean inclusive = false; private Text indexCf; @@ -133,7 +135,7 @@ public DescendantCount apply(final Tuple3>> tupl long timestamp = key.getTimestamp(); boolean hasChildren = count.hasDescendants(); final Key hasChildrenKey = new Key(key.getRow(), key.getColumnFamily(), - new Text(QueryOptions.DEFAULT_HAS_CHILDREN_FIELDNAME + '\0' + Boolean.toString(hasChildren)), visibility, timestamp); + new Text(QueryOptions.DEFAULT_HAS_CHILDREN_FIELDNAME + '\0' + hasChildren), visibility, timestamp); countKeys.add(hasChildrenKey); } @@ -142,7 +144,7 @@ public DescendantCount apply(final Tuple3>> tupl long timestamp = key.getTimestamp(); int numChildren = count.getFirstGenerationCount(); final Key childCountKey = new Key(key.getRow(), key.getColumnFamily(), - new Text(QueryOptions.DEFAULT_CHILD_COUNT_FIELDNAME + '\0' + 
Integer.toString(numChildren)), visibility, timestamp); + new Text(QueryOptions.DEFAULT_CHILD_COUNT_FIELDNAME + '\0' + numChildren), visibility, timestamp); countKeys.add(childCountKey); } @@ -152,9 +154,9 @@ public DescendantCount apply(final Tuple3>> tupl int numDescendants = count.getAllGenerationsCount(); final Text text; if (count.skippedDescendants()) { - text = new Text(QueryOptions.DEFAULT_DESCENDANT_COUNT_FIELDNAME + '\0' + Integer.toString(numDescendants - 1) + '+'); + text = new Text(QueryOptions.DEFAULT_DESCENDANT_COUNT_FIELDNAME + '\0' + (numDescendants - 1) + '+'); } else { - text = new Text(QueryOptions.DEFAULT_DESCENDANT_COUNT_FIELDNAME + '\0' + Integer.toString(numDescendants)); + text = new Text(QueryOptions.DEFAULT_DESCENDANT_COUNT_FIELDNAME + '\0' + numDescendants); } final Key descendantCountKey = new Key(key.getRow(), key.getColumnFamily(), text, visibility, timestamp); @@ -266,7 +268,7 @@ private int getCountByEventScan(final Range seekRange, final Text row, final Str Key endKey = new Key(row, new Text(dataType + '\0' + baseUid + Constants.MAX_UNICODE_STRING)); Range range = new Range(startKey, true, endKey, false); - // seek too the new range + // seek to the new range Set emptyCfs = Collections.emptySet(); this.source.seek(range, emptyCfs, false); @@ -388,7 +390,7 @@ private CountResult getCountByFieldIndexScan(final Range seekRange, final Text r } else { // If configured, past an exceptionally large number of irrelevant grandchildren. // Although this would potentially throw off the descendant count, it may be necessary - // if a given event has thousands (or millions) of grandchildren and we're mainly interested + // if a given event has thousands (or millions) of grandchildren, and we're mainly interested // in the number of 1st generation children. nonMatchingDescendants++; if ((this.skipThreshold > 0) && (nonMatchingDescendants >= this.skipThreshold)) { diff --git a/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java b/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java index 8628206161f..6ff31065b71 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/IndexOnlyKeyToDocumentData.java @@ -38,9 +38,9 @@ * Fetches index-only tf key/values and outputs them as "standard" field key/value pairs */ public class IndexOnlyKeyToDocumentData extends KeyToDocumentData implements Iterator> { - private static final Collection COLUMN_FAMILIES = Lists. 
newArrayList(new ArrayByteSequence("d")); + private static final Collection COLUMN_FAMILIES = Lists.newArrayList(new ArrayByteSequence("d")); - private static Logger LOG = Logger.getLogger(IndexOnlyKeyToDocumentData.class); + private static final Logger LOG = Logger.getLogger(IndexOnlyKeyToDocumentData.class); private static final Entry INVALID_COLUMNQUALIFIER_FORMAT_KEY = Maps.immutableEntry(new Key("INVALID_COLUMNQUALIFIER_FORMAT_KEY"), EMPTY_VALUE); @@ -159,7 +159,7 @@ public Entry apply(final Entry from) { // get the document key Key docKey = getDocKey(from.getKey()); - // Ensure that we have a non-empty colqual + // Ensure that we have a non-empty column qualifier final Key stopKey = new Key(from.getKey().getRow().toString(), from.getKey().getColumnFamily().toString(), from.getKey().getColumnQualifier().toString() + '\u0000' + '\uffff'); diff --git a/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java b/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java index 5e1e147b5f3..fabd48d09e1 100644 --- a/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java +++ b/warehouse/query-core/src/main/java/datawave/query/function/KeyToDocumentData.java @@ -54,8 +54,7 @@ public class KeyToDocumentData implements Function,Entry columnFamilies = Lists.newArrayList(new ArrayByteSequence("tf"), new ArrayByteSequence("d")); - protected static final boolean inclusive = false; + protected final Collection columnFamilies = Lists.newArrayList(new ArrayByteSequence("tf"), new ArrayByteSequence("d")); private final DescendantCountFunction countFunction; @@ -153,7 +152,7 @@ public List> appendHierarchyFields(final List> @Override public Entry apply(Entry from) { - // We want to ensure that we have a non-empty colqual + // We want to ensure that we have a non-empty column qualifier if (null == from || null == from.getKey() || null == from.getValue()) { return null; } @@ -162,7 +161,7 @@ public Entry apply(Entry from) { try { logStart(); - source.seek(keyRange, columnFamilies, inclusive); + source.seek(keyRange, columnFamilies, false); if (log.isDebugEnabled()) log.debug(source.hasTop() + " Key range is " + keyRange); @@ -201,32 +200,6 @@ public Entry apply(Entry from) { * for issues with read/write */ public List> collectDocumentAttributes(final Key documentStartKey, final Set docKeys, final Range keyRange) throws IOException { - return collectAttributesForDocumentKey(documentStartKey, source, equality, filter, docKeys, keyRange); - } - - /** - * Given a Key pointing to the start of an document to aggregate, construct a Range that should encapsulate the "document" to be aggregated together. Also - * checks to see if data was found for the constructed Range before returning. 
- * - * @param documentStartKey - * A Key of the form "bucket type\x00uid: " - * @param keyRange - * the Range used to initialize source with seek() - * @param source - * a source - * @param equality - * an equality - * @param docKeys - * set of keys - * @param filter - * a query filter - * @return the attributes - * @throws IOException - * for issues with read/write - */ - private static List> collectAttributesForDocumentKey(Key documentStartKey, SortedKeyValueIterator source, Equality equality, - EventDataQueryFilter filter, Set docKeys, Range keyRange) throws IOException { - // set up the document key we are filtering for on the EventDataQueryFilter if (filter != null) { filter.startNewDocument(documentStartKey); @@ -256,7 +229,7 @@ private static List> collectAttributesForDocumentKey(Key docume // request a seek range from the filter Range seekRange = filter.getSeekRange(docAttrKey.get(), keyRange.getEndKey(), keyRange.isEndKeyInclusive()); if (seekRange != null) { - source.seek(seekRange, columnFamilies, inclusive); + source.seek(seekRange, columnFamilies, false); seeked = true; } } @@ -292,7 +265,7 @@ public static Key getDocKey(Key key) { private static List> appendHierarchyFields(List> documentAttributes, Key key, Range seekRange, DescendantCountFunction function, boolean includeParent) { - if ((null != function) || includeParent) { + if (function != null || includeParent) { // get the minimal timestamp and majority visibility from the // attributes @@ -329,7 +302,7 @@ private static List> appendHierarchyFields(List> appendHierarchyFields(List> documentAttributes, final String visibility, long timestamp) { int basicChildCount = 0; - if ((null != function) && (null != key)) { - // Count the descendants, generating keys based on query options and - // document attributes - final Tuple3>> tuple = new Tuple3<>(range, key, documentAttributes); - final DescendantCount count = function.apply(tuple); - - // No need to do any more work if there aren't any descendants - if ((null != count) && count.hasDescendants()) { - // Extract the basic, first-generation count - basicChildCount = count.getFirstGenerationCount(); - - // Get any generated keys, apply any specified visibility, and - // add to the document attributes - final List keys = count.getKeys(); - if ((null != documentAttributes) && !documentAttributes.isEmpty() && !keys.isEmpty()) { - // Create a Text for the Keys' visibility - Text appliedVis; - if ((null != visibility) && !visibility.isEmpty()) { - appliedVis = new Text(visibility); - } else { - appliedVis = new Text(); - } + if (null == function || null == key) { + return basicChildCount; + } - // Conditionally adjust visibility and timestamp - for (final Key childCountKey : keys) { - final Text appliedRow = childCountKey.getRow(); - final Text appliedCf = childCountKey.getColumnFamily(); - final Text appliedCq = childCountKey.getColumnQualifier(); - if ((null == visibility) || visibility.isEmpty()) { - childCountKey.getColumnVisibility(appliedVis); - } - if (timestamp <= 0) { - timestamp = childCountKey.getTimestamp(); - } + // Count the descendants, generating keys based on query options and + // document attributes + final Tuple3>> tuple = new Tuple3<>(range, key, documentAttributes); + final DescendantCount count = function.apply(tuple); + + // No need to do any more work if there aren't any descendants + if (count != null && count.hasDescendants()) { + // Extract the basic, first-generation count + basicChildCount = count.getFirstGenerationCount(); + + // Get any generated 
keys, apply any specified visibility, and + // add to the document attributes + final List keys = count.getKeys(); + if (documentAttributes != null && !documentAttributes.isEmpty() && !keys.isEmpty()) { + // Create a Text for the Keys' visibility + Text appliedVis; + if (visibility != null && !visibility.isEmpty()) { + appliedVis = new Text(visibility); + } else { + appliedVis = new Text(); + } - final Key appliedKey = new Key(appliedRow, appliedCf, appliedCq, appliedVis, timestamp); - documentAttributes.add(Maps.immutableEntry(appliedKey, EMPTY_VALUE)); + // Conditionally adjust visibility and timestamp + for (final Key childCountKey : keys) { + final Text appliedRow = childCountKey.getRow(); + final Text appliedCf = childCountKey.getColumnFamily(); + final Text appliedCq = childCountKey.getColumnQualifier(); + if (visibility == null || visibility.isEmpty()) { + childCountKey.getColumnVisibility(appliedVis); } + if (timestamp <= 0) { + timestamp = childCountKey.getTimestamp(); + } + + final Key appliedKey = new Key(appliedRow, appliedCf, appliedCq, appliedVis, timestamp); + documentAttributes.add(Maps.immutableEntry(appliedKey, EMPTY_VALUE)); } } } diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java index 19bec2cb83b..bc2a0dc8781 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/IndexInfo.java @@ -328,11 +328,17 @@ public IndexInfo union(IndexInfo o, List delayedNodes) { merged.count = merged.uids.size(); } - merged.setFieldCounts(this.getFieldCounts()); - merged.mergeFieldCounts(o.getFieldCounts()); + if (this == o) { + // handle idiosyncrasy of the peeking iterator where the first term is merged with itself + merged.setFieldCounts(o.getFieldCounts()); + merged.setTermCounts(o.getTermCounts()); + } else { + merged.setFieldCounts(getFieldCounts()); + merged.setTermCounts(getTermCounts()); - merged.setTermCounts(this.getTermCounts()); - merged.mergeTermCounts(o.getTermCounts()); + merged.mergeFieldCounts(o.getFieldCounts()); + merged.mergeTermCounts(o.getTermCounts()); + } /* * If there are multiple levels within a union we could have an ASTOrNode. 
We cannot prune OrNodes as we would with an intersection, so propagate the diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java index 0eb3fe6b144..9d1f2951563 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/RangeStream.java @@ -157,8 +157,8 @@ public RangeStream(ShardQueryConfiguration config, ScannerFactory scanners, Meta streamExecutor = new ThreadPoolExecutor(executeLookupMin, maxLookup, 100, TimeUnit.MILLISECONDS, runnables); fieldDataTypes = config.getQueryFieldsDatatypes(); collapseUids = config.getCollapseUids(); - fieldCounts = config.getUseFieldCounts(); - termCounts = config.getUseTermCounts(); + fieldCounts = config.isSortQueryPostIndexWithFieldCounts(); + termCounts = config.isSortQueryPostIndexWithTermCounts(); try { Set ioFields = metadataHelper.getIndexOnlyFields(null); if (null != ioFields) { @@ -264,8 +264,8 @@ public Iterator iterator() { this.itr = filter(concat(transform(queryStream, new TupleToRange(config.getShardTableName(), queryStream.currentNode(), config))), getEmptyPlanPruner()); - if (config.isSortQueryByCounts() && (config.getUseFieldCounts() || config.getUseTermCounts())) { - this.itr = transform(itr, new OrderingTransform(config.getUseFieldCounts(), config.getUseTermCounts())); + if (config.isSortQueryPostIndexWithFieldCounts() || config.isSortQueryPostIndexWithTermCounts()) { + this.itr = transform(itr, new OrderingTransform(config.isSortQueryPostIndexWithFieldCounts(), config.isSortQueryPostIndexWithTermCounts())); } } } finally { @@ -362,7 +362,7 @@ public QueryPlan apply(QueryPlan plan) { Map counts = plan.getTermCounts().getCounts(); OrderByCostVisitor.orderByTermCount(plan.getQueryTree(), counts); } else if (useFieldCounts) { - Map counts = plan.getTermCounts().getCounts(); + Map counts = plan.getFieldCounts().getCounts(); OrderByCostVisitor.orderByFieldCount(plan.getQueryTree(), counts); } return plan; @@ -602,6 +602,10 @@ public ScannerStream visit(ASTEQNode node, Object data) { String queryString = fieldName + "=='" + literal + "'"; options.addScanIterator(QueryScannerHelper.getQueryInfoIterator(config.getQuery(), false, queryString)); + // easier to apply hints to new options than deal with copying existing hints between + options.applyExecutionHints(config.getIndexTableName(), config.getTableHints()); + options.applyConsistencyLevel(config.getIndexTableName(), config.getTableConsistencyLevels()); + scannerSession.setOptions(options); scannerSession.setMaxResults(config.getMaxIndexBatchSize()); scannerSession.setExecutor(streamExecutor); diff --git a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java index 6681b9ebd0b..4972d173f5f 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardLimitingIterator.java @@ -9,7 +9,7 @@ import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.util.PeekingIterator; +import org.apache.commons.collections4.iterators.PeekingIterator; import org.apache.hadoop.io.Text; import org.apache.log4j.Logger; diff --git 
a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java index 2b437ea61c5..55cf0b0dff0 100644 --- a/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java +++ b/warehouse/query-core/src/main/java/datawave/query/index/lookup/ShardRangeStream.java @@ -1,5 +1,6 @@ package datawave.query.index.lookup; +import java.io.IOException; import java.util.Collections; import java.util.Date; import java.util.Iterator; @@ -12,15 +13,17 @@ import org.apache.accumulo.core.data.PartialKey; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.util.PeekingIterator; +import org.apache.commons.collections4.iterators.PeekingIterator; import org.apache.commons.jexl3.parser.JexlNode; import com.google.common.base.Function; +import com.google.common.collect.HashMultimap; import com.google.common.collect.Iterators; +import com.google.common.collect.Multimap; +import datawave.data.type.Type; import datawave.query.CloseableIterable; import datawave.query.config.ShardQueryConfiguration; -import datawave.query.exceptions.DatawaveQueryException; import datawave.query.index.lookup.IndexStream.StreamContext; import datawave.query.iterator.FieldIndexOnlyQueryIterator; import datawave.query.iterator.QueryOptions; @@ -65,7 +68,7 @@ public CloseableIterable streamPlans(JexlNode node) { DefaultQueryPlanner.addOption(cfg, QueryOptions.DATATYPE_FILTER, config.getDatatypeFilterAsString(), false); DefaultQueryPlanner.addOption(cfg, QueryOptions.END_TIME, Long.toString(config.getEndDate().getTime()), false); - DefaultQueryPlanner.configureTypeMappings(config, cfg, metadataHelper, true); + configureTypeMappings(config, cfg, metadataHelper); scanner.setRanges(Collections.singleton(rangeForTerm(null, null, config))); @@ -97,7 +100,7 @@ public CloseableIterable streamPlans(JexlNode node) { } - } catch (TableNotFoundException | DatawaveQueryException e) { + } catch (TableNotFoundException e) { throw new RuntimeException(e); } finally { // shut down the executor as all threads have completed @@ -134,4 +137,29 @@ public QueryPlan apply(Entry entry) { // @formatter:on } } + + /** + * Lift and shift from DefaultQueryPlanner to avoid reliance on static methods + */ + private void configureTypeMappings(ShardQueryConfiguration config, IteratorSetting cfg, MetadataHelper metadataHelper) { + DefaultQueryPlanner.addOption(cfg, QueryOptions.QUERY_MAPPING_COMPRESS, Boolean.toString(true), false); + + Multimap> nonIndexedQueryFieldsDatatypes = HashMultimap.create(config.getQueryFieldsDatatypes()); + nonIndexedQueryFieldsDatatypes.keySet().removeAll(config.getIndexedFields()); + String nonIndexedTypes = QueryOptions.buildFieldNormalizerString(nonIndexedQueryFieldsDatatypes); + DefaultQueryPlanner.addOption(cfg, QueryOptions.NON_INDEXED_DATATYPES, nonIndexedTypes, false); + + try { + String serializedTypeMetadata = metadataHelper.getTypeMetadata(config.getDatatypeFilter()).toString(); + DefaultQueryPlanner.addOption(cfg, QueryOptions.TYPE_METADATA, serializedTypeMetadata, false); + + String requiredAuthsString = metadataHelper.getUsersMetadataAuthorizationSubset(); + requiredAuthsString = QueryOptions.compressOption(requiredAuthsString, QueryOptions.UTF8); + DefaultQueryPlanner.addOption(cfg, QueryOptions.TYPE_METADATA_AUTHS, requiredAuthsString, false); + } catch (TableNotFoundException | IOException e) { + throw new RuntimeException(e); + 
} + + DefaultQueryPlanner.addOption(cfg, QueryOptions.METADATA_TABLE_NAME, config.getMetadataTableName(), false); + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/ParentQueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/ParentQueryIterator.java index a5e5c948814..63a612f7b3e 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/ParentQueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/ParentQueryIterator.java @@ -69,6 +69,31 @@ public EventDataQueryFilter getEvaluationFilter() { return evaluationFilter != null ? evaluationFilter.clone() : null; } + /** + * In the Parent case replace the {@link QueryOptions#eventFilter} with an evaluation filter + * + * @return an evaluation filter + */ + public EventDataQueryFilter getEventFilter() { + return getEvaluationFilter(); + } + + @Override + public EventDataQueryFilter getFiEvaluationFilter() { + if (fiEvaluationFilter == null) { + fiEvaluationFilter = getEvaluationFilter(); + } + return fiEvaluationFilter.clone(); + } + + @Override + public EventDataQueryFilter getEventEvaluationFilter() { + if (eventEvaluationFilter == null) { + eventEvaluationFilter = getEvaluationFilter(); + } + return eventEvaluationFilter.clone(); + } + @Override public Iterator> mapDocument(SortedKeyValueIterator deepSourceCopy, Iterator> documents, CompositeMetadata compositeMetadata) { diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java index 1d4dc14dcb5..6509b471db3 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/QueryIterator.java @@ -759,22 +759,25 @@ public Iterator> createDocumentPipeline(SortedKeyValueIterat if (log.isTraceEnabled()) { log.trace("isFieldIndexSatisfyingQuery"); } - docMapper = new Function,Entry>() { + docMapper = new Function<>() { @Nullable @Override public Entry apply(@Nullable Entry input) { - Entry entry = null; if (input != null) { - entry = Maps.immutableEntry(new DocumentData(input.getKey(), Collections.singleton(input.getKey()), Collections.EMPTY_LIST, true), + entry = Maps.immutableEntry(new DocumentData(input.getKey(), Collections.singleton(input.getKey()), Collections.emptyList(), true), input.getValue()); } return entry; } }; } else { - docMapper = new KeyToDocumentData(deepSourceCopy, myEnvironment, documentOptions, getEquality(), getEvaluationFilter(), this.includeHierarchyFields, - this.includeHierarchyFields).withRangeProvider(getRangeProvider()).withAggregationThreshold(getDocAggregationThresholdMs()); + // @formatter:off + docMapper = new KeyToDocumentData(deepSourceCopy, myEnvironment, documentOptions, getEquality(), getEventEvaluationFilter(), this.includeHierarchyFields, + this.includeHierarchyFields) + .withRangeProvider(getRangeProvider()) + .withAggregationThreshold(getDocAggregationThresholdMs()); + // @formatter:on } Iterator> sourceIterator = Iterators.transform(documentSpecificSource, from -> { @@ -787,7 +790,7 @@ public Entry apply(@Nullable Entry input) { // which do not fall within the expected time range Iterator> documents = null; Aggregation a = new Aggregation(this.getTimeFilter(), this.typeMetadataWithNonIndexed, compositeMetadata, this.isIncludeGroupingContext(), - this.includeRecordId, this.disableIndexOnlyDocuments(), getEvaluationFilter(), isTrackSizes()); + this.includeRecordId, 
this.disableIndexOnlyDocuments(), getEventEvaluationFilter(), isTrackSizes()); if (gatherTimingDetails()) { documents = Iterators.transform(sourceIterator, new EvaluationTrackingFunction<>(QuerySpan.Stage.Aggregation, trackingSpan, a)); } else { @@ -1094,15 +1097,18 @@ protected Iterator> mapDocument(SortedKeyValueIterator> mappedDocuments = Iterators.transform(documents, new GetDocument(docMapper, new Aggregation(this.getTimeFilter(), typeMetadataWithNonIndexed, compositeMetadata, this.isIncludeGroupingContext(), this.includeRecordId, this.disableIndexOnlyDocuments(), - getEvaluationFilter(), isTrackSizes()))); + getEventEvaluationFilter(), isTrackSizes()))); Iterator> retDocuments = Iterators.transform(mappedDocuments, new TupleToEntry<>()); // Inject the document permutations if required @@ -1422,6 +1428,8 @@ protected IteratorBuildingVisitor createIteratorBuildingVisitor(Class fileSystemCache = CacheBuilder.newBuilder().concurrencyLevel(10).maximumSize(100).build(); - public static final Charset UTF8 = StandardCharsets.UTF_8; public static final String DEBUG_MULTITHREADED_SOURCES = "debug.multithreaded.sources"; @@ -273,6 +267,8 @@ public class QueryOptions implements OptionDescriber { public static final String TF_FIELD_SEEK = "tf.field.seek"; public static final String TF_NEXT_SEEK = "tf.next.seek"; + public static final String SEEKING_EVENT_AGGREGATION = "seeking.event.aggregation"; + public static final String DOC_AGGREGATION_THRESHOLD_MS = "doc.agg.threshold"; public static final String TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS = "tf.agg.threshold"; @@ -321,7 +317,12 @@ public class QueryOptions implements OptionDescriber { protected FieldIndexAggregator fiAggregator; protected Equality equality; + // filter for any key type (fi, event, tf) protected EventDataQueryFilter evaluationFilter; + protected EventDataQueryFilter fiEvaluationFilter; + protected EventDataQueryFilter eventEvaluationFilter; + // filter specifically for event keys. required when performing a seeking aggregation + protected EventDataQueryFilter eventFilter; protected int maxEvaluationPipelines = 25; protected int maxPipelineCachedResults = 25; @@ -335,7 +336,7 @@ public class QueryOptions implements OptionDescriber { protected List documentPermutationClasses = new ArrayList<>(); protected List documentPermutations = null; - protected long startTime = 0l; + protected long startTime = 0L; protected long endTime = System.currentTimeMillis(); protected TimeFilter timeFilter = null; @@ -438,6 +439,8 @@ public class QueryOptions implements OptionDescriber { private int tfFieldSeek = -1; private int tfNextSeek = -1; + private boolean seekingEventAggregation = false; + // aggregation thresholds private int docAggregationThresholdMs = -1; private int tfAggregationThresholdMs = -1; @@ -502,6 +505,8 @@ public void deepCopy(QueryOptions other) { this.getDocumentKey = other.getDocumentKey; this.equality = other.equality; this.evaluationFilter = other.evaluationFilter; + this.fiEvaluationFilter = other.fiEvaluationFilter; + this.eventEvaluationFilter = other.eventEvaluationFilter; this.ivaratorCacheDirConfigs = (other.ivaratorCacheDirConfigs == null) ? 
null : new ArrayList<>(other.ivaratorCacheDirConfigs); this.hdfsSiteConfigURLs = other.hdfsSiteConfigURLs; @@ -552,6 +557,8 @@ public void deepCopy(QueryOptions other) { this.tfFieldSeek = other.tfFieldSeek; this.tfNextSeek = other.tfNextSeek; + this.seekingEventAggregation = other.seekingEventAggregation; + this.docAggregationThresholdMs = other.docAggregationThresholdMs; this.tfAggregationThresholdMs = other.tfAggregationThresholdMs; @@ -772,7 +779,7 @@ public void setArithmetic(JexlArithmetic arithmetic) { */ public FieldIndexAggregator getFiAggregator() { if (fiAggregator == null) { - this.fiAggregator = new IdentityAggregator(getNonEventFields(), getEvaluationFilter(), getEventNextSeek()); + this.fiAggregator = new IdentityAggregator(getNonEventFields(), getFiEvaluationFilter(), getEventNextSeek()); } return fiAggregator; } @@ -781,10 +788,96 @@ public EventDataQueryFilter getEvaluationFilter() { return evaluationFilter != null ? evaluationFilter.clone() : null; } + public EventDataQueryFilter getFiEvaluationFilter() { + return fiEvaluationFilter != null ? fiEvaluationFilter.clone() : null; + } + + public EventDataQueryFilter getEventEvaluationFilter() { + return eventEvaluationFilter != null ? eventEvaluationFilter.clone() : null; + } + public void setEvaluationFilter(EventDataQueryFilter evaluationFilter) { this.evaluationFilter = evaluationFilter; } + public void setFiEvaluationFilter(EventDataQueryFilter fiEvaluationFilter) { + this.fiEvaluationFilter = fiEvaluationFilter; + } + + public void setEventEvaluationFilter(EventDataQueryFilter eventEvaluationFilter) { + this.eventEvaluationFilter = eventEvaluationFilter; + } + + /** + * Return or build a field filter IFF this query is projecting results + * + * @return a field filter, or null if results are not projected + */ + public EventDataQueryFilter getEventFilter() { + + if (!useAllowListedFields || allowListedFields instanceof UniversalSet || !isSeekingEventAggregation()) { + return null; + } + + if (eventFilter == null) { + + Set fields = getEventFieldsToRetain(); + if (fields.contains(Constants.ANY_FIELD)) { + return null; + } + + // @formatter:off + eventFilter = new EventDataQueryFieldFilter() + .withFields(fields) + .withMaxNextCount(getEventNextSeek()); + // @formatter:on + } + + return eventFilter == null ? 
null : eventFilter.clone(); + } + + /** + * Get the event fields to retain + * + * @return the set of event fields + */ + private Set getEventFieldsToRetain() { + Set fields = getQueryFields(); + + if (!allowListedFields.isEmpty()) { + fields.addAll(allowListedFields); + } + + if (groupFields != null) { + fields.addAll(groupFields.getGroupByFields()); + } + + if (!indexOnlyFields.isEmpty()) { + // index only fields are not present in the event column + fields.removeAll(indexOnlyFields); + } + + // add composite components + if (compositeMetadata != null && !compositeMetadata.isEmpty()) { + Collection> entries = compositeMetadata.getCompositeFieldMapByType().values(); + for (Multimap entry : entries) { + fields.addAll(entry.values()); + } + } + + return fields; + } + + private Set getQueryFields() { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + return JexlASTHelper.getIdentifierNames(script); + } catch (ParseException e) { + // ignore + throw new FatalBeanException("Could not parse query"); + } + } + public TimeFilter getTimeFilter() { return timeFilter; } @@ -1409,6 +1502,8 @@ public boolean validateOptions(Map options) { } this.evaluationFilter = null; + this.fiEvaluationFilter = null; + this.eventEvaluationFilter = null; this.getDocumentKey = GetStartKey.instance(); this.mustUseFieldIndex = false; @@ -1468,6 +1563,10 @@ public boolean validateOptions(Map options) { this.tfNextSeek = Integer.parseInt(options.get(TF_NEXT_SEEK)); } + if (options.containsKey(SEEKING_EVENT_AGGREGATION)) { + this.seekingEventAggregation = Boolean.parseBoolean(options.get(SEEKING_EVENT_AGGREGATION)); + } + if (options.containsKey(DOC_AGGREGATION_THRESHOLD_MS)) { this.docAggregationThresholdMs = Integer.parseInt(options.get(DOC_AGGREGATION_THRESHOLD_MS)); } @@ -2259,4 +2358,8 @@ public Equality getEquality() { } return equality; } + + public boolean isSeekingEventAggregation() { + return seekingEventAggregation; + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/facets/DynamicFacetIterator.java b/warehouse/query-core/src/main/java/datawave/query/iterator/facets/DynamicFacetIterator.java index bbdb9b5255a..ef112e6b259 100644 --- a/warehouse/query-core/src/main/java/datawave/query/iterator/facets/DynamicFacetIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/iterator/facets/DynamicFacetIterator.java @@ -168,7 +168,7 @@ public Iterator> getDocumentIterator(Range range, Collection Iterator> documents = null; if (!configuration.getFacetedFields().isEmpty()) { - projection = new EventDataQueryFieldFilter(configuration.getFacetedFields(), Projection.ProjectionType.INCLUDES); + projection = new EventDataQueryFieldFilter().withFields(configuration.getFacetedFields()); } if (!configuration.hasFieldLimits() || projection != null) { diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java index 82d9e9b24fa..96452c57741 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/functions/ContentFunctionsDescriptor.java @@ -15,6 +15,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.commons.jexl3.parser.ASTAndNode; @@ -37,6 +38,7 @@ import 
com.google.common.collect.Lists; import com.google.common.collect.PeekingIterator; import com.google.common.collect.Sets; +import com.google.common.collect.Streams; import datawave.query.attributes.AttributeFactory; import datawave.query.config.ShardQueryConfiguration; @@ -104,19 +106,18 @@ public JexlNode getIndexQuery(Set termFrequencyFields, Set index // get the cartesian product of all the fields and terms MutableBoolean oredFields = new MutableBoolean(); - Set[] fieldsAndTerms = fieldsAndTerms(termFrequencyFields, indexedFields, contentFields, oredFields, true); - if (!fieldsAndTerms[0].isEmpty()) { + FieldTerms fieldsAndTerms = fieldsAndTerms(termFrequencyFields, indexedFields, contentFields, oredFields, true); + Set fields = fieldsAndTerms.getFields(); + if (!fields.isEmpty()) { final JexlNode eq = new ASTEQNode(ParserTreeConstants.JJTEQNODE); - - for (String field : fieldsAndTerms[0]) { - nodes.add(JexlNodeFactory.createNodeTreeFromFieldValues(ContainerType.AND_NODE, eq, null, field, fieldsAndTerms[1])); - } + Set terms = fieldsAndTerms.getTerms(); + fields.forEach(field -> nodes.add(JexlNodeFactory.createNodeTreeFromFieldValues(ContainerType.AND_NODE, eq, null, field, terms))); } - if (fieldsAndTerms[0].size() == 0) { + if (fields.isEmpty()) { log.warn("No fields found for content function, will not expand index query"); return new ASTTrueNode(ParserTreeConstants.JJTTRUENODE); - } else if (fieldsAndTerms[0].size() == 1) { + } else if (fields.size() == 1) { // A single field needs no wrapper node. return nodes.iterator().next(); } else if (oredFields.booleanValue()) { @@ -194,7 +195,7 @@ public Set fieldsForNormalization(MetadataHelper helper, Set dat public Set fields(MetadataHelper helper, Set datatypeFilter) { try { return fieldsAndTerms(helper.getTermFrequencyFields(datatypeFilter), helper.getIndexedFields(datatypeFilter), - helper.getContentFields(datatypeFilter), null)[0]; + helper.getContentFields(datatypeFilter), null).getFields(); } catch (TableNotFoundException e) { QueryException qe = new QueryException(DatawaveErrorCode.METADATA_TABLE_FETCH_ERROR, e); throw new DatawaveFatalQueryException(qe); @@ -206,15 +207,15 @@ public Set fields(MetadataHelper helper, Set datatypeFilter) { public Set> fieldSets(MetadataHelper helper, Set datatypeFilter) { try { MutableBoolean oredFields = new MutableBoolean(); - Set[] fieldsAndTerms = fieldsAndTerms(helper.getTermFrequencyFields(datatypeFilter), helper.getIndexedFields(datatypeFilter), + FieldTerms fieldsAndTerms = fieldsAndTerms(helper.getTermFrequencyFields(datatypeFilter), helper.getIndexedFields(datatypeFilter), helper.getContentFields(datatypeFilter), oredFields); Set> fieldSets = new HashSet<>(); if (oredFields.booleanValue()) { - for (String field : fieldsAndTerms[0]) { + for (String field : fieldsAndTerms.getFields()) { fieldSets.add(Collections.singleton(field)); } } else { - fieldSets.add(fieldsAndTerms[0]); + fieldSets.add(fieldsAndTerms.getFields()); } return fieldSets; } catch (TableNotFoundException e) { @@ -224,174 +225,200 @@ public Set> fieldSets(MetadataHelper helper, Set datatypeFil } - public Set[] fieldsAndTerms(Set termFrequencyFields, Set indexedFields, Set contentFields, MutableBoolean oredFields) { + public FieldTerms fieldsAndTerms(Set termFrequencyFields, Set indexedFields, Set contentFields, MutableBoolean oredFields) { return fieldsAndTerms(termFrequencyFields, indexedFields, contentFields, oredFields, false); } @SuppressWarnings("unchecked") - public Set[] fieldsAndTerms(Set termFrequencyFields, Set 
indexedFields, Set contentFields, MutableBoolean oredFields, + public FieldTerms fieldsAndTerms(Set termFrequencyFields, Set indexedFields, Set contentFields, MutableBoolean oredFields, boolean validateFields) { + if (this.args.isEmpty()) { + NotFoundQueryException qe = new NotFoundQueryException(DatawaveErrorCode.JEXL_NODES_MISSING, + MessageFormat.format("Class: {0}, Namespace: {1}, Function: {2}", this.getClass().getSimpleName(), this.namespace, this.name)); + throw new IllegalArgumentException(qe); + } - final String funcName = name; - - PeekingIterator args = Iterators.peekingIterator(this.args.iterator()); - - Set termFreqFields = Sets.newHashSet(termFrequencyFields); - Set fields = Sets.newHashSetWithExpectedSize(termFreqFields.size()); - Set terms = Sets.newHashSetWithExpectedSize(this.args.size() - 1); - Iterator itr = termFreqFields.iterator(); // Can any one of the fields satisfy the query? Always true unless the zone is specified in an AND clause. if (oredFields != null) { oredFields.setValue(true); } - while (itr.hasNext()) { - String field = itr.next(); - if (indexedFields.contains(field) && (contentFields.isEmpty() || contentFields.contains(field))) { - fields.add(field); - } - } - - if (args.hasNext()) { - JexlNode termOffsetMap = null; - if (CONTENT_ADJACENT_FUNCTION_NAME.equals(funcName)) { - JexlNode firstArg = args.next(); + PeekingIterator argsIterator = Iterators.peekingIterator(this.args.iterator()); + FieldTerms fieldTerms = new FieldTerms(); + JexlNode termOffsetMap; - // we override the zones if the first argument is a string - if (firstArg instanceof ASTStringLiteral) { - fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(firstArg)); - termOffsetMap = args.next(); - } else { - JexlNode nextArg = args.peek(); - - // The zones may (more likely) be specified as an idenfifier - if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) { - if (oredFields != null && firstArg instanceof ASTAndNode) { - oredFields.setValue(false); - } - - fields = JexlASTHelper.getIdentifierNames(firstArg); - termOffsetMap = args.next(); - } else { - termOffsetMap = firstArg; - } - } - } else if (CONTENT_PHRASE_FUNCTION_NAME.equals(funcName)) { - JexlNode firstArg = args.next(); + switch (this.name) { + case CONTENT_ADJACENT_FUNCTION_NAME: + termOffsetMap = examineContentAdjacentFunction(argsIterator, fieldTerms, oredFields); + break; + case CONTENT_PHRASE_FUNCTION_NAME: + termOffsetMap = examineContentPhraseFunction(argsIterator, fieldTerms, oredFields); + break; + case CONTENT_SCORED_PHRASE_FUNCTION_NAME: + termOffsetMap = examineContentScoredPhraseFunction(argsIterator, fieldTerms, oredFields); + break; + case CONTENT_WITHIN_FUNCTION_NAME: + termOffsetMap = examineContentWithinFunction(argsIterator, fieldTerms, oredFields); + break; + default: + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.FUNCTION_ARGUMENTS_MISSING); + throw new IllegalArgumentException(qe); + } - // we override the zones if the first argument is a string - if (firstArg instanceof ASTStringLiteral) { - fields = Collections.singleton(((ASTStringLiteral) firstArg).getLiteral()); + // Verify that a term offset map with terms were specified. 
+ validateTermsOffsetMapAndTermsPresent(termOffsetMap, argsIterator); - termOffsetMap = args.next(); - } else { - JexlNode nextArg = args.peek(); + // If the fields were not established above, ensure that the fields at least contain any term frequency fields that are indexed and, if any content + // fields were specified, present within there as well. + if (fieldTerms.fields == null) { + Set fields = termFrequencyFields.stream() + .filter(f -> indexedFields.contains(f) && (contentFields.isEmpty() || contentFields.contains(f))).collect(Collectors.toSet()); + fieldTerms.fields = fields; + } - // The zones may (more likely) be specified as an identifier - if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) { - if (oredFields != null && firstArg instanceof ASTAndNode) { - oredFields.setValue(false); - } - - fields = JexlASTHelper.getIdentifierNames(firstArg); - termOffsetMap = args.next(); - } else { - termOffsetMap = firstArg; - } + // Moving this validation later in the call stack, since it requires other processing (i.e. apply query model) + if (validateFields) { + for (String field : fieldTerms.fields) { + // Deconstruct & upcase the fieldname for testing in case we have not normalized the field names yet. Return the unnormalized fieldname. + if (!termFrequencyFields.contains(JexlASTHelper.deconstructIdentifier(field.toUpperCase()))) { + PreConditionFailedQueryException qe = new PreConditionFailedQueryException(DatawaveErrorCode.FIELD_PHRASE_QUERY_NOT_INDEXED, + MessageFormat.format("Field: {0}", field)); + throw new IllegalArgumentException(qe); } - } else if (CONTENT_SCORED_PHRASE_FUNCTION_NAME.equals(funcName)) { - JexlNode arg = args.next(); + } + } - if (arg instanceof ASTNumberLiteral || arg instanceof ASTUnaryMinusNode) { - // if the first argument is a number, then no field exists - // for example, content:scoredPhrase(-1.5, termOffsetMap, 'value') - termOffsetMap = args.next(); - } else { - if (arg instanceof ASTIdentifier) { - // single field case - // for example, content:scoredPhrase(FIELD, -1.5, termOffsetMap, 'value') - fields = Collections.singleton(String.valueOf(JexlASTHelper.getIdentifier(arg))); - } else { - // multi field case - // for example, content:scoredPhrase((FIELD_A || FIELD_B), -1.5, termOffsetMap, 'value') - Set identifiers = JexlASTHelper.getIdentifierNames(arg); - if (!identifiers.isEmpty()) { - fields = identifiers; - - if (oredFields != null && arg instanceof ASTAndNode) { - oredFields.setValue(false); - } - } - } + // Now take the remaining string literals in the arguments as terms. + Set terms = Sets.newHashSetWithExpectedSize(this.args.size() - 1); + // @formatter:off + Streams.stream(argsIterator) + .filter(ASTStringLiteral.class::isInstance) + .map(JexlNodes::getIdentifierOrLiteralAsString) + .forEach(terms::add); + // @formatter:on + fieldTerms.terms = terms; + + return fieldTerms; + } - // skip score because it is not needed when gathering just the fields and values from a function - args.next(); + // Finds and sets the fields for a content:adjacent functions, and returns the anticpatated terms offset map node. 
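Editorial aside, not part of the patch: for readers tracing this refactor, the Set array returned by the old fieldsAndTerms signature maps onto the new FieldTerms accessors as sketched below. The helper method, its name, and its arguments are hypothetical; only fieldsAndTerms, FieldTerms, getFields and getTerms come from the patch, and FieldTerms is assumed to be nested under ContentFunctionsDescriptor.

    // Editorial sketch only: a hypothetical caller of the refactored fieldsAndTerms(...)
    static Set<String> contentFunctionFields(ContentFunctionsDescriptor.ContentJexlArgumentDescriptor descriptor,
                    Set<String> tfFields, Set<String> indexedFields, Set<String> contentFields) {
        MutableBoolean oredFields = new MutableBoolean();
        ContentFunctionsDescriptor.FieldTerms fieldTerms =
                        descriptor.fieldsAndTerms(tfFields, indexedFields, contentFields, oredFields, false);
        // fields were previously fieldsAndTerms[0]; terms (previously fieldsAndTerms[1]) are now fieldTerms.getTerms()
        return fieldTerms.getFields();
    }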
+ private JexlNode examineContentAdjacentFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) { + JexlNode firstArg = argsIterator.next(); + if (firstArg instanceof ASTStringLiteral) { + fieldTerms.fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(firstArg)); + return argsIterator.next(); + } else { + JexlNode nextArg = argsIterator.peek(); + // The zones may (more likely) be specified as an idenfifier + if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) { + if (oredFields != null && firstArg instanceof ASTAndNode) { + oredFields.setValue(false); + } + fieldTerms.fields = JexlASTHelper.getIdentifierNames(firstArg); + return argsIterator.next(); + } else { + return firstArg; + } + } + } - termOffsetMap = args.next(); + // Finds and sets the fields for a content:phrase functions, and returns the anticpatated terms offset map node. + private JexlNode examineContentPhraseFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) { + JexlNode firstArg = argsIterator.next(); + // we override the zones if the first argument is a string + if (firstArg instanceof ASTStringLiteral) { + fieldTerms.fields = Collections.singleton(((ASTStringLiteral) firstArg).getLiteral()); + return argsIterator.next(); + } else { + JexlNode nextArg = argsIterator.peek(); + // The zones may (more likely) be specified as an identifier + if (!JexlASTHelper.getIdentifiers(firstArg).isEmpty() && !JexlASTHelper.getIdentifiers(nextArg).isEmpty()) { + if (oredFields != null && firstArg instanceof ASTAndNode) { + oredFields.setValue(false); } - } else if (CONTENT_WITHIN_FUNCTION_NAME.equals(funcName)) { - JexlNode arg = args.next(); + fieldTerms.fields = JexlASTHelper.getIdentifierNames(firstArg); + return argsIterator.next(); + } else { + return firstArg; + } + } + } - // we override the zones if the first argument is a string or identifier - if (arg instanceof ASTStringLiteral) { - fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(arg)); - arg = args.next(); - } else if (!JexlASTHelper.getIdentifiers(arg).isEmpty()) { - if (oredFields != null && arg instanceof ASTAndNode) { + // Finds and sets the fields for a content:scoredPhrase functions, and returns the anticpatated terms offset map node. 
+ private JexlNode examineContentScoredPhraseFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) { + JexlNode firstArg = argsIterator.next(); + if (firstArg instanceof ASTNumberLiteral || firstArg instanceof ASTUnaryMinusNode) { + // if the first argument is a number, then no field exists + // for example, content:scoredPhrase(-1.5, termOffsetMap, 'value') + return argsIterator.next(); + } else { + if (firstArg instanceof ASTIdentifier) { + // single field case + // for example, content:scoredPhrase(FIELD, -1.5, termOffsetMap, 'value') + fieldTerms.fields = Collections.singleton(String.valueOf(JexlASTHelper.getIdentifier(firstArg))); + } else { + // multi field case + // for example, content:scoredPhrase((FIELD_A || FIELD_B), -1.5, termOffsetMap, 'value') + Set identifiers = JexlASTHelper.getIdentifierNames(firstArg); + if (!identifiers.isEmpty()) { + fieldTerms.fields = identifiers; + if (oredFields != null && firstArg instanceof ASTAndNode) { oredFields.setValue(false); } - - fields = JexlASTHelper.getIdentifierNames(arg); - arg = args.next(); } + } - // we can trash the distance - if (!(arg instanceof ASTNumberLiteral || arg instanceof ASTUnaryMinusNode)) { - BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.NUMERIC_DISTANCE_ARGUMENT_MISSING); - throw new IllegalArgumentException(qe); - } + // skip score because it is not needed when gathering just the fields and values from a function + argsIterator.next(); + return argsIterator.next(); + } + } - termOffsetMap = args.next(); - } else { - BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.FUNCTION_ARGUMENTS_MISSING); - throw new IllegalArgumentException(qe); + // Finds and sets the fields for a content:within functions, and returns the anticpatated terms offset map node. + private JexlNode examineContentWithinFunction(PeekingIterator argsIterator, FieldTerms fieldTerms, MutableBoolean oredFields) { + JexlNode arg = argsIterator.next(); + // we override the zones if the first argument is a string or identifier + if (arg instanceof ASTStringLiteral) { + fieldTerms.fields = Collections.singleton(JexlNodes.getIdentifierOrLiteralAsString(arg)); + arg = argsIterator.next(); + } else if (!JexlASTHelper.getIdentifiers(arg).isEmpty()) { + if (oredFields != null && arg instanceof ASTAndNode) { + oredFields.setValue(false); } - if (null == termOffsetMap || !(termOffsetMap instanceof ASTIdentifier)) { - BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMOFFSETMAP_AND_TERMS_MISSING); - throw new IllegalArgumentException(qe); - } + fieldTerms.fields = JexlASTHelper.getIdentifierNames(arg); + arg = argsIterator.next(); + } - if (!args.hasNext()) { - BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMS_MISSING); - throw new IllegalArgumentException(qe); - } + // we can trash the distance + if (!(arg instanceof ASTNumberLiteral || arg instanceof ASTUnaryMinusNode)) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.NUMERIC_DISTANCE_ARGUMENT_MISSING); + throw new IllegalArgumentException(qe); + } - // moving this validation later in the call stack, since it requires other processing (i.e. apply query model) - if (validateFields) { - for (String field : fields) { - // deconstruct & upcase the fieldname for testing in case we have not normalized the field names yet. Return the unnormalized fieldname. 
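Editorial aside, not part of the patch: assembled from the inline examples in the comments above, the argument shapes these examine* helpers handle look roughly like the following illustrative JEXL fragments (field names, distance, and terms are made up). When the zone is a union such as (FIELD_A || FIELD_B), any single field may satisfy the function and oredFields stays true; a zone list joined with AND sets oredFields to false.

    content:adjacent(FIELD, termOffsetMap, 'quick', 'brown')
    content:phrase((FIELD_A || FIELD_B), termOffsetMap, 'quick', 'brown')
    content:scoredPhrase(-1.5, termOffsetMap, 'quick')
    content:within(FIELD, 3, termOffsetMap, 'quick', 'brown')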
- if (!termFreqFields.contains(JexlASTHelper.deconstructIdentifier(field.toUpperCase()))) { - PreConditionFailedQueryException qe = new PreConditionFailedQueryException(DatawaveErrorCode.FIELD_PHRASE_QUERY_NOT_INDEXED, - MessageFormat.format("Field: {0}", field)); - throw new IllegalArgumentException(qe); - } - } - } + return argsIterator.next(); + } - // now take the remaining string literals as terms - Iterator termsItr = Iterators.transform(Iterators.filter(args, new StringLiteralsOnly()), new GetImage()); - while (termsItr.hasNext()) { - terms.add(termsItr.next()); - } + /** + * Throws a {@link BadRequestQueryException} if termsOffsetMap is not an instance of {@link ASTIdentifier} or if there are no more nodes in the + * iterator. + * + * @param termOffsetMap + * the terms offset map node + * @param argsIterator + * the iterator of arguments + */ + private void validateTermsOffsetMapAndTermsPresent(JexlNode termOffsetMap, PeekingIterator argsIterator) { + if (!(termOffsetMap instanceof ASTIdentifier)) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMOFFSETMAP_AND_TERMS_MISSING); + throw new IllegalArgumentException(qe); + } - } else { - NotFoundQueryException qe = new NotFoundQueryException(DatawaveErrorCode.JEXL_NODES_MISSING, - MessageFormat.format("Class: {0}, Namespace: {1}, Function: {2}", this.getClass().getSimpleName(), namespace, funcName)); + if (!argsIterator.hasNext()) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.TERMS_MISSING); throw new IllegalArgumentException(qe); } - return new Set[] {fields, terms}; } /** @@ -616,6 +643,29 @@ public boolean allowIvaratorFiltering() { } } + public static class FieldTerms { + + private Set fields; + private Set terms; + + public FieldTerms() { + fields = null; + terms = null; + } + + public Set getFields() { + return fields; + } + + public int totalFields() { + return fields.size(); + } + + public Set getTerms() { + return terms; + } + } + @Override public ContentJexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node) { FunctionJexlNodeVisitor fvis = new FunctionJexlNodeVisitor(); @@ -636,5 +686,4 @@ public ContentJexlArgumentDescriptor getArgumentDescriptor(ASTFunctionNode node) return new ContentJexlArgumentDescriptor(node, fvis.namespace(), fvis.name(), fvis.args()); } - } diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/AsyncIndexLookup.java b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/AsyncIndexLookup.java index 137d21e5c3b..46b6f94005a 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/AsyncIndexLookup.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/AsyncIndexLookup.java @@ -6,6 +6,7 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicLong; import org.apache.log4j.Logger; @@ -41,7 +42,7 @@ protected long getRemainingTimeMillis(long startTimeMillis) { return Math.max(0L, config.getMaxIndexScanTimeMillis() - (System.currentTimeMillis() - startTimeMillis)); } - protected void timedScanWait(Future future, CountDownLatch startedLatch, CountDownLatch stoppedLatch, long startTimeMillis, long timeout) { + protected void timedScanWait(Future future, CountDownLatch startedLatch, CountDownLatch stoppedLatch, AtomicLong startTimeMillis, long timeout) { // this ensures that we don't wait for the future response until the task has started if (startedLatch 
!= null) { try { @@ -68,7 +69,7 @@ protected void timedScanWait(Future future, CountDownLatch startedLatch // timeout exception and except ( a max lookup specified ) 3) we receive a value under timeout and we break while (!execService.isShutdown() && !execService.isTerminated()) { try { - future.get((swallowTimeout) ? maxLookup : getRemainingTimeMillis(startTimeMillis), TimeUnit.MILLISECONDS); + future.get((swallowTimeout) ? maxLookup : getRemainingTimeMillis(startTimeMillis.get()), TimeUnit.MILLISECONDS); } catch (TimeoutException e) { if (swallowTimeout) { continue; diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java index a3b352f2a98..4300300771a 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/BoundedRangeIndexLookup.java @@ -1,15 +1,16 @@ package datawave.query.jexl.lookups; -import java.io.IOException; +import static datawave.query.jexl.lookups.ShardIndexQueryTableStaticMethods.EXPANSION_HINT_KEY; + import java.text.MessageFormat; import java.util.Collections; import java.util.Iterator; import java.util.Map.Entry; -import java.util.SortedMap; import java.util.concurrent.Callable; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicLong; import org.apache.accumulo.core.client.BatchScanner; import org.apache.accumulo.core.client.IteratorSetting; @@ -17,15 +18,15 @@ import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; -import org.apache.accumulo.core.iterators.user.WholeRowIterator; import org.apache.hadoop.io.Text; import org.apache.log4j.Logger; import org.springframework.util.StringUtils; +import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import datawave.core.common.logging.ThreadConfigurableLogger; -import datawave.core.iterators.ColumnQualifierRangeIterator; +import datawave.core.iterators.BoundedRangeExpansionIterator; import datawave.core.iterators.CompositeSeekingIterator; import datawave.core.iterators.TimeoutExceptionIterator; import datawave.core.iterators.TimeoutIterator; @@ -50,7 +51,7 @@ public class BoundedRangeIndexLookup extends AsyncIndexLookup { private final LiteralRange literalRange; protected Future timedScanFuture; - protected long lookupStartTimeMillis = Long.MAX_VALUE; + protected AtomicLong lookupStartTimeMillis = new AtomicLong(Long.MAX_VALUE); protected CountDownLatch lookupStartedLatch; protected CountDownLatch lookupStoppedLatch; @@ -125,25 +126,25 @@ public synchronized void submit() { log.debug("Range: " + range); bs = null; try { - bs = scannerFactory.newScanner(config.getIndexTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery()); + // the 'newScanner' method in the ScannerFactory has no knowledge about the 'expansion' hint, so determine hint here + String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? 
EXPANSION_HINT_KEY : config.getIndexTableName(); + + bs = scannerFactory.newScanner(config.getIndexTableName(), config.getAuthorizations(), config.getNumQueryThreads(), config.getQuery(), hintKey); bs.setRanges(Collections.singleton(range)); bs.fetchColumnFamily(new Text(literalRange.getFieldName())); - // set up the GlobalIndexRangeSamplingIterator - - IteratorSetting cfg = new IteratorSetting(config.getBaseIteratorPriority() + 50, "WholeRowIterator", WholeRowIterator.class); - bs.addScanIterator(cfg); - - cfg = new IteratorSetting(config.getBaseIteratorPriority() + 48, "DateFilter", ColumnQualifierRangeIterator.class); - // search from 20YYddMM to 20ZZddMM\uffff to ensure we encompass all of the current day - String end = endDay + Constants.MAX_UNICODE_STRING; - cfg.addOption(ColumnQualifierRangeIterator.RANGE_NAME, ColumnQualifierRangeIterator.encodeRange(new Range(startDay, end))); - - bs.addScanIterator(cfg); + IteratorSetting setting = new IteratorSetting(config.getBaseIteratorPriority() + 20, "BoundedRangeExpansionIterator", + BoundedRangeExpansionIterator.class); + setting.addOption(BoundedRangeExpansionIterator.START_DATE, startDay); + setting.addOption(BoundedRangeExpansionIterator.END_DATE, endDay); + if (!config.getDatatypeFilter().isEmpty()) { + setting.addOption(BoundedRangeExpansionIterator.DATATYPES_OPT, Joiner.on(',').join(config.getDatatypeFilter())); + } + bs.addScanIterator(setting); // If this is a composite field, with multiple terms, we need to setup our query to filter based on each component of the composite range - if (config.getCompositeToFieldMap().get(literalRange.getFieldName()) != null) { + if (!config.getCompositeToFieldMap().get(literalRange.getFieldName()).isEmpty()) { String compositeSeparator = null; if (config.getCompositeFieldSeparators() != null) @@ -168,8 +169,8 @@ public synchronized void submit() { } if (null != fairnessIterator) { - cfg = new IteratorSetting(config.getBaseIteratorPriority() + 100, TimeoutExceptionIterator.class); - bs.addScanIterator(cfg); + IteratorSetting timeoutSetting = new IteratorSetting(config.getBaseIteratorPriority() + 100, TimeoutExceptionIterator.class); + bs.addScanIterator(timeoutSetting); } timedScanFuture = execService.submit(createTimedCallable(bs.iterator())); @@ -179,10 +180,6 @@ public synchronized void submit() { log.error(qe); throw new DatawaveFatalQueryException(qe); - } catch (IOException e) { - QueryException qe = new QueryException(DatawaveErrorCode.RANGE_CREATE_ERROR, e, MessageFormat.format("{0}", this.literalRange)); - log.debug(qe); - throw new IllegalRangeArgumentException(qe); } } } @@ -211,7 +208,7 @@ protected Callable createTimedCallable(final Iterator> return () -> { try { - lookupStartTimeMillis = System.currentTimeMillis(); + lookupStartTimeMillis.set(System.currentTimeMillis()); lookupStartedLatch.countDown(); Text holder = new Text(); @@ -236,61 +233,22 @@ protected Callable createTimedCallable(final Iterator> k.getRow(holder); String uniqueTerm = holder.toString(); - SortedMap keymap = WholeRowIterator.decodeRow(entry.getKey(), entry.getValue()); - - String field = null; - - boolean foundDataType = false; - - for (Key topKey : keymap.keySet()) { - if (null == field) { - topKey.getColumnFamily(holder); - field = holder.toString(); - } - // Get the column qualifier from the key. 
It - // contains the datatype and normalizer class - - if (null != topKey.getColumnQualifier()) { - if (null != config.getDatatypeFilter() && !config.getDatatypeFilter().isEmpty()) { - - String colq = topKey.getColumnQualifier().toString(); - int idx = colq.indexOf(Constants.NULL); - - if (idx != -1) { - String type = colq.substring(idx + 1); - - // If types are specified and this type - // is not in the list, skip it. - if (config.getDatatypeFilter().contains(type)) { - if (log.isTraceEnabled()) { - log.trace(config.getDatatypeFilter() + " contains " + type); - } - - foundDataType = true; - break; - } - } - } else { - foundDataType = true; - } - } - } - if (foundDataType) { + k.getColumnFamily(holder); + String field = holder.toString(); - // obtaining the size of a map can be expensive, - // instead - // track the count of each unique item added. - indexLookupMap.put(field, uniqueTerm); + // safety check... + Preconditions.checkState(field.equals(literalRange.getFieldName()), + "Got an unexpected field name when expanding range" + field + " " + literalRange.getFieldName()); - // safety check... - Preconditions.checkState(field.equals(literalRange.getFieldName()), - "Got an unexpected field name when expanding range" + field + " " + literalRange.getFieldName()); + // obtaining the size of a map can be expensive, + // instead + // track the count of each unique item added. + indexLookupMap.put(field, uniqueTerm); - // If this range expands into to many values, we can - // stop - if (indexLookupMap.get(field).isThresholdExceeded()) { - return true; - } + // If this range expands into to many values, we can + // stop + if (indexLookupMap.get(field).isThresholdExceeded()) { + return true; } } } catch (Exception e) { diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java index ec1e7a50ed6..b40001d5fd2 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/FieldNameIndexLookup.java @@ -1,5 +1,7 @@ package datawave.query.jexl.lookups; +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; import java.util.Collection; import java.util.Collections; import java.util.HashSet; @@ -10,8 +12,8 @@ import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicLong; -import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; @@ -37,7 +39,7 @@ public class FieldNameIndexLookup extends AsyncIndexLookup { protected Set terms; protected Future timedScanFuture; - protected long lookupStartTimeMillis = Long.MAX_VALUE; + protected AtomicLong lookupStartTimeMillis = new AtomicLong(Long.MAX_VALUE); protected CountDownLatch lookupStartedLatch; protected CountDownLatch lookupStoppedLatch; @@ -75,7 +77,7 @@ public void submit() { Iterator> iter = Collections.emptyIterator(); - ScannerSession bs; + ScannerSession bs = null; try { if (!fields.isEmpty()) { @@ -104,9 +106,13 @@ public void submit() { } timedScanFuture = execService.submit(createTimedCallable(iter)); - } catch (TableNotFoundException e) { + } catch (IOException | InvocationTargetException | NoSuchMethodException | InstantiationException | 
IllegalAccessException | RuntimeException e) { log.error(e); - } catch (Exception e) { + // ensure the scanner is cleaned up if no longer listening + if (bs != null) { + bs.close(); + sessions.remove(bs); + } throw new RuntimeException(e); } } @@ -117,7 +123,10 @@ public synchronized IndexLookupMap lookup() { if (!sessions.isEmpty()) { try { // for field name lookups, we wait indefinitely - timedScanWait(timedScanFuture, lookupStartedLatch, lookupStoppedLatch, lookupStartTimeMillis, Long.MAX_VALUE); + // TODO consider if this really should be Long.MAX_VALUE or some time less. Other index scanners are set to config.getMaxIndexScanTimeMillis(). + // However the code currently can't handle a failure here, where other index lookup failures can conditionally still allow the query to be + // executed. See UnfieldedIndexExpansionVisitor.expandUnfielded() + timedScanWait(timedScanFuture, lookupStartedLatch, lookupStoppedLatch, lookupStartTimeMillis, config.getMaxAnyFieldScanTimeMillis()); } finally { for (ScannerSession sesh : sessions) { scannerFactory.close(sesh); @@ -135,13 +144,18 @@ protected Callable createTimedCallable(final Iterator> return () -> { try { - lookupStartTimeMillis = System.currentTimeMillis(); + lookupStartTimeMillis.set(System.currentTimeMillis()); lookupStartedLatch.countDown(); final Text holder = new Text(); try { while (iter.hasNext()) { + // check for interrupt which may be triggered by closing the batch scanner + if (Thread.interrupted()) { + throw new InterruptedException(); + } + Entry entry = iter.next(); if (log.isTraceEnabled()) { log.trace("Index entry: " + entry.getKey()); diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/RegexIndexLookup.java b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/RegexIndexLookup.java index 96ec7dab0d9..7c5e9fc0e9c 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/RegexIndexLookup.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/RegexIndexLookup.java @@ -11,6 +11,7 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicLong; import org.apache.accumulo.core.client.IteratorSetting; import org.apache.accumulo.core.client.TableNotFoundException; @@ -266,6 +267,10 @@ protected Callable createTimedCallable(final Iterator> } while (iter.hasNext()) { + // check if interrupted which may be triggered by closing a batch scanner + if (Thread.interrupted()) { + throw new InterruptedException(); + } Entry entry = iter.next(); @@ -359,7 +364,7 @@ private static class RegexLookupData { private Future timedScanFuture; private CountDownLatch lookupStartedLatch; private CountDownLatch lookupStoppedLatch; - private long lookupStartTimeMillis = Long.MAX_VALUE; + private AtomicLong lookupStartTimeMillis = new AtomicLong(Long.MAX_VALUE); public Collection getSessions() { return sessions; @@ -393,12 +398,12 @@ public void setLookupStoppedLatch(CountDownLatch lookupStoppedLatch) { this.lookupStoppedLatch = lookupStoppedLatch; } - public long getLookupStartTimeMillis() { + public AtomicLong getLookupStartTimeMillis() { return lookupStartTimeMillis; } public void setLookupStartTimeMillis(long lookupStartTimeMillis) { - this.lookupStartTimeMillis = lookupStartTimeMillis; + this.lookupStartTimeMillis.set(lookupStartTimeMillis); } } } diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java 
b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java index 61a827704ed..6c82c7f01f1 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/lookups/ShardIndexQueryTableStaticMethods.java @@ -1,6 +1,7 @@ package datawave.query.jexl.lookups; import java.io.IOException; +import java.lang.reflect.InvocationTargetException; import java.text.MessageFormat; import java.util.Collection; import java.util.List; @@ -67,6 +68,9 @@ public class ShardIndexQueryTableStaticMethods { private static FastDateFormat formatter = FastDateFormat.getInstance("yyyyMMdd"); + // name reserved for executor pools + public static final String EXPANSION_HINT_KEY = "expansion"; + /** * Create an IndexLookup task to find field names give a JexlNode and a set of Types for that node * @@ -438,19 +442,29 @@ public static Range getLiteralRange(String fieldName, String normalizedQueryTerm * @param limitToUniqueTerms * check for limiting unique terms * @return the scanner session - * @throws Exception - * if there are issues + * @throws InvocationTargetException + * if no target exists + * @throws NoSuchMethodException + * if no method exists + * @throws InstantiationException + * if there is a problem initializing + * @throws IllegalAccessException + * if there is an illegal access + * @throws IOException + * dates can't be formatted */ public static ScannerSession configureTermMatchOnly(ShardQueryConfiguration config, ScannerFactory scannerFactory, String tableName, Collection ranges, Collection literals, Collection patterns, boolean reverseIndex, boolean limitToUniqueTerms) - throws Exception { + throws InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException, IOException { // if we have no ranges, then nothing to scan if (ranges.isEmpty()) { return null; } - ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery()); + String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : config.getIndexTableName(); + + ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery(), hintKey); bs.setRanges(ranges); @@ -471,14 +485,16 @@ public static ScannerSession configureTermMatchOnly(ShardQueryConfiguration conf public static ScannerSession configureLimitedDiscovery(ShardQueryConfiguration config, ScannerFactory scannerFactory, String tableName, Collection ranges, Collection literals, Collection patterns, boolean reverseIndex, boolean limitToUniqueTerms) - throws Exception { + throws InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException, IOException { // if we have no ranges, then nothing to scan if (ranges.isEmpty()) { return null; } - ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery()); + String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? 
EXPANSION_HINT_KEY : tableName; + + ScannerSession bs = scannerFactory.newLimitedScanner(AnyFieldScanner.class, tableName, config.getAuthorizations(), config.getQuery(), hintKey); bs.setRanges(ranges); @@ -506,6 +522,13 @@ public static final void configureGlobalIndexDateRangeFilter(ShardQueryConfigura } IteratorSetting cfg = configureGlobalIndexDateRangeFilter(config, dateRange); bs.addScanIterator(cfg); + + // unused method, but we'll still configure execution hints if possible + String executionHintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : config.getIndexTableName(); + + if (config.getTableHints().containsKey(executionHintKey)) { + bs.setExecutionHints(config.getTableHints().get(executionHintKey)); + } } public static final IteratorSetting configureGlobalIndexDateRangeFilter(ShardQueryConfiguration config, LongRange dateRange) { @@ -575,6 +598,16 @@ public static final void configureGlobalIndexTermMatchingIterator(ShardQueryConf bs.addScanIterator(cfg); + // unused method, but we'll still configure execution hints if possible + if (!reverseIndex) { + // only apply hints to the global index + String hintKey = config.getTableHints().containsKey(EXPANSION_HINT_KEY) ? EXPANSION_HINT_KEY : config.getIndexTableName(); + + if (config.getTableHints().containsKey(hintKey)) { + bs.setExecutionHints(config.getTableHints().get(hintKey)); + } + } + setExpansionFields(config, bs, reverseIndex, expansionFields); } diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/AbstractNodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/AbstractNodeCostComparator.java new file mode 100644 index 00000000000..9c2cc475401 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/AbstractNodeCostComparator.java @@ -0,0 +1,108 @@ +package datawave.query.jexl.nodes; + +import java.util.Map; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTOrNode; +import org.apache.commons.jexl3.parser.ASTReference; +import org.apache.commons.jexl3.parser.ASTReferenceExpression; +import org.apache.commons.jexl3.parser.JexlNode; +import org.apache.commons.jexl3.parser.JexlNodes; +import org.apache.commons.jexl3.parser.ParserTreeConstants; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.util.count.CountMap; + +/** + * Class that contains core logic for field and term comparators + */ +public abstract class AbstractNodeCostComparator extends JexlNodeComparator { + private static final long NODE_ID_MULTIPLIER = 5000L; + private static final int SEGMENT = Integer.MAX_VALUE / 48; + + private final DefaultJexlNodeComparator comparator = new DefaultJexlNodeComparator(); + + private final Map counts; + + /** + * Constructor that accepts a {@link CountMap} + * + * @param counts + * the count map + */ + protected AbstractNodeCostComparator(CountMap counts) { + this(counts.getCounts()); + } + + /** + * Constructor that accepts a {@link Map} of counts + * + * @param counts + * the count map + */ + protected AbstractNodeCostComparator(Map counts) { + this.counts = counts; + } + + @Override + public int compare(JexlNode left, JexlNode right) { + left = JexlASTHelper.dereference(left); + right = JexlASTHelper.dereference(right); + + int leftCost = getCostIndex(left); + int rightCost = getCostIndex(right); + + int result = Integer.compare(leftCost, rightCost); + + if (result == 0) { + result = comparator.compare(left, right); + } + + return result; + } + + @Override 
+ public int getCostIndex(JexlNode node) { + if ((node instanceof ASTReference || node instanceof ASTReferenceExpression) && node.jjtGetNumChildren() == 1) { + return getCostIndex(node.jjtGetChild(0)); + } else if (node instanceof ASTOrNode) { + return getCostForUnion(node); + } else if (QueryPropertyMarker.findInstance(node).isAnyType()) { + return Integer.MAX_VALUE; + } else if (node instanceof ASTAndNode) { + return getCostForIntersection(node); + } else { + String key = getNodeKey(node); + long score = counts.getOrDefault(key, getDefaultScore(node)); + if (score > Integer.MAX_VALUE) { + score = Integer.MAX_VALUE; + } + return (int) score; + } + } + + /** + * This method is the only difference between calculating cost based on field or term + * + * @param node + * a JexlNode + * @return the node key + */ + abstract String getNodeKey(JexlNode node); + + private long getDefaultScore(JexlNode node) { + int id = JexlNodes.id(node); + switch (id) { + case ParserTreeConstants.JJTFUNCTIONNODE: + return SEGMENT - 4L; + case ParserTreeConstants.JJTNENODE: + return SEGMENT - 3L; + case ParserTreeConstants.JJTNRNODE: + return SEGMENT - 2L; + case ParserTreeConstants.JJTNOTNODE: + return SEGMENT - 1L; + default: + return id * NODE_ID_MULTIPLIER; + } + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultJexlNodeComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultJexlNodeComparator.java new file mode 100644 index 00000000000..af8a2be45fe --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultJexlNodeComparator.java @@ -0,0 +1,87 @@ +package datawave.query.jexl.nodes; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTOrNode; +import org.apache.commons.jexl3.parser.ASTReference; +import org.apache.commons.jexl3.parser.ASTReferenceExpression; +import org.apache.commons.jexl3.parser.JexlNode; +import org.apache.commons.jexl3.parser.JexlNodes; +import org.apache.commons.jexl3.parser.ParserTreeConstants; + +import datawave.query.jexl.JexlASTHelper; + +/** + * Comparator that enforces default ordering according to implied cost + *

+ * Nodes are sorted by node type, then junction, then lexicographically + */ +public class DefaultJexlNodeComparator extends JexlNodeComparator { + + private static final int SEGMENT = Integer.MAX_VALUE / 48; + + private final JunctionComparator junctionComparator = new JunctionComparator(); + private final LexicographicalNodeComparator lexiComparator = new LexicographicalNodeComparator(); + + @Override + public int compare(JexlNode left, JexlNode right) { + left = JexlASTHelper.dereference(left); + right = JexlASTHelper.dereference(right); + + int result = Integer.compare(getCostIndex(left), getCostIndex(right)); + + // EQ vs. (EQ AND EQ) will match + if (result == 0) { + result = junctionComparator.compare(left, right); + } + + if (result == 0) { + result = lexiComparator.compare(left, right); + } + + return result; + } + + /** + * + * @param node + * an arbitrary JexlNode + * @return the node cost + */ + @Override + protected int getCostIndex(JexlNode node) { + if ((node instanceof ASTReference || node instanceof ASTReferenceExpression) && node.jjtGetNumChildren() == 1) { + return getCostIndex(node.jjtGetChild(0)); + } else if (node instanceof ASTOrNode) { + return getCostForUnion(node); + } else if (QueryPropertyMarker.findInstance(node).isAnyType()) { + return Integer.MAX_VALUE; + } else if (node instanceof ASTAndNode) { + return getCostForIntersection(node); + } else { + return getNodeScore(node); + } + } + + /** + * Wrapper around {@link JexlNodes#id(JexlNode)} so that we can boost the score of negated terms + * + * @param node + * any JexlNode + * @return a score for the node + */ + private int getNodeScore(JexlNode node) { + int id = JexlNodes.id(node); + switch (id) { + case ParserTreeConstants.JJTFUNCTIONNODE: + return SEGMENT - 4; + case ParserTreeConstants.JJTNENODE: + return SEGMENT - 3; + case ParserTreeConstants.JJTNRNODE: + return SEGMENT - 2; + case ParserTreeConstants.JJTNOTNODE: + return SEGMENT - 1; + default: + return id; + } + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultNodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultNodeCostComparator.java deleted file mode 100644 index fa5edcc8db7..00000000000 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/DefaultNodeCostComparator.java +++ /dev/null @@ -1,69 +0,0 @@ -package datawave.query.jexl.nodes; - -import org.apache.commons.jexl3.parser.ASTAndNode; -import org.apache.commons.jexl3.parser.ASTOrNode; -import org.apache.commons.jexl3.parser.ASTReference; -import org.apache.commons.jexl3.parser.ASTReferenceExpression; -import org.apache.commons.jexl3.parser.JexlNode; -import org.apache.commons.jexl3.parser.JexlNodes; -import org.apache.commons.jexl3.parser.ParserTreeConstants; - -/** - * Provides default node cost calculations based on the Jexl node id - */ -public class DefaultNodeCostComparator extends NodeCostComparator { - - /** - * - * @param node - * an arbitrary JexlNode - * @return the node cost - */ - @Override - protected int getCostIndex(JexlNode node) { - if (node.jjtGetNumChildren() == 1 && (node instanceof ASTReference || node instanceof ASTReferenceExpression)) { - QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(node); - if (instance.isAnyType()) { - return Integer.MAX_VALUE - 4; - } - return getCostIndex(node.jjtGetChild(0)); - } else if (node instanceof ASTOrNode) { - int sum = 0; - for (int i = 0; i < node.jjtGetNumChildren(); i++) { - sum += getCostIndex(node.jjtGetChild(i)); - 
} - return sum; - } else if (node instanceof ASTAndNode) { - int lowest = Integer.MAX_VALUE; - for (int i = 0; i < node.jjtGetNumChildren(); i++) { - int cost = getCostIndex(node.jjtGetChild(i)); - if (cost < lowest) - lowest = cost; - } - return lowest; - } else { - return getNodeScore(node); - } - } - - /** - * Wrapper around {@link JexlNodes#id(JexlNode)} so that we can boost the score of negated terms - * - * @param node - * any JexlNode - * @return a score for the node - */ - private int getNodeScore(JexlNode node) { - int id = JexlNodes.id(node); - switch (id) { - case ParserTreeConstants.JJTNENODE: - return Integer.MAX_VALUE - 3; - case ParserTreeConstants.JJTNRNODE: - return Integer.MAX_VALUE - 2; - case ParserTreeConstants.JJTNOTNODE: - return Integer.MAX_VALUE - 1; - default: - return id; - } - } -} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldCostComparator.java new file mode 100644 index 00000000000..eb3d1e2956c --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldCostComparator.java @@ -0,0 +1,55 @@ +package datawave.query.jexl.nodes; + +import java.util.Map; + +import org.apache.commons.jexl3.parser.ASTFunctionNode; +import org.apache.commons.jexl3.parser.ASTNENode; +import org.apache.commons.jexl3.parser.ASTNRNode; +import org.apache.commons.jexl3.parser.ASTNotNode; +import org.apache.commons.jexl3.parser.JexlNode; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.util.count.CountMap; + +/** + * Comparator that operates on field cardinality + */ +public class FieldCostComparator extends AbstractNodeCostComparator { + + /** + * Constructor that accepts a {@link CountMap} + * + * @param counts + * the count map + */ + public FieldCostComparator(CountMap counts) { + this(counts.getCounts()); + } + + /** + * Constructor that accepts a {@link Map} of counts + * + * @param counts + * the count map + */ + public FieldCostComparator(Map counts) { + super(counts); + } + + /** + * The {@link FieldCostComparator} uses a node's identifier to calculate cost + * + * @param node + * a JexlNode + * @return the node key + */ + @Override + public String getNodeKey(JexlNode node) { + if (node instanceof ASTNotNode || node instanceof ASTNENode || node instanceof ASTNRNode || node instanceof ASTFunctionNode) { + // certain node types are always kicked out + return null; + } + return JexlASTHelper.getIdentifier(node); + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldOrTermNodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldOrTermNodeCostComparator.java deleted file mode 100644 index 4e66d8e9599..00000000000 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/FieldOrTermNodeCostComparator.java +++ /dev/null @@ -1,116 +0,0 @@ -package datawave.query.jexl.nodes; - -import java.util.Map; - -import org.apache.commons.jexl3.parser.ASTAndNode; -import org.apache.commons.jexl3.parser.ASTFunctionNode; -import org.apache.commons.jexl3.parser.ASTNENode; -import org.apache.commons.jexl3.parser.ASTNRNode; -import org.apache.commons.jexl3.parser.ASTNotNode; -import org.apache.commons.jexl3.parser.ASTOrNode; -import org.apache.commons.jexl3.parser.ASTReference; -import org.apache.commons.jexl3.parser.ASTReferenceExpression; -import org.apache.commons.jexl3.parser.JexlNode; -import org.apache.commons.jexl3.parser.JexlNodes; -import 
org.apache.commons.jexl3.parser.ParserTreeConstants; - -import datawave.query.jexl.JexlASTHelper; -import datawave.query.jexl.visitors.JexlStringBuildingVisitor; - -/** - * Orders nodes based on field or term counts - */ -public class FieldOrTermNodeCostComparator extends NodeCostComparator { - - private final boolean isFieldCount; - private static final long NODE_ID_MULTIPLIER = 5000; - private final Map counts; - - public FieldOrTermNodeCostComparator(Map counts, boolean isFieldCount) { - this.counts = counts; - this.isFieldCount = isFieldCount; - } - - @Override - int getCostIndex(JexlNode node) { - if (node.jjtGetNumChildren() == 1 && (node instanceof ASTReference || node instanceof ASTReferenceExpression)) { - return getCostIndex(node.jjtGetChild(0)); - } else if (node instanceof ASTOrNode) { - int sum = 0; - for (int i = 0; i < node.jjtGetNumChildren(); i++) { - sum += getCostIndex(node.jjtGetChild(i)); - } - return sum; - } else if (QueryPropertyMarker.findInstance(node).isAnyType()) { - return Integer.MAX_VALUE; - } else if (node instanceof ASTAndNode) { - int lowest = Integer.MAX_VALUE; - for (int i = 0; i < node.jjtGetNumChildren(); i++) { - int cost = getCostIndex(node.jjtGetChild(i)); - if (cost < lowest) { - lowest = cost; - } - } - return lowest; - } else { - return getCostForLeaf(node); - } - } - - /** - * Get the cost for a leaf according to the count map. - *

- * The extra code to handle integer overflows is due to term counts in the global index being a Long but Java's {@link Comparable#compareTo(Object)} returns - * an integer. - * - * @param node - * a JexlNode - * @return an integer used to compare nodes - */ - private int getCostForLeaf(JexlNode node) { - String key = getNodeKey(node); - long value = counts.getOrDefault(key, getNodeScore(node)); - if (value > Integer.MAX_VALUE) { - value = Integer.MAX_VALUE; - } - return (int) value; - } - - /** - * Generate a key for the count map. It's either the field, or the whole node. - * - * @param node - * a JexlNode - * @return a node key - */ - private String getNodeKey(JexlNode node) { - if (node instanceof ASTNotNode || node instanceof ASTNENode || node instanceof ASTNRNode || node instanceof ASTFunctionNode) { - return "NO_KEY"; - } else if (isFieldCount) { - return JexlASTHelper.getIdentifier(node); - } else { - return JexlStringBuildingVisitor.buildQueryWithoutParse(node); - } - } - - /** - * Wrapper around {@link JexlNodes#id(JexlNode)} so that we can boost the score of negated terms - * - * @param node - * any JexlNode - * @return a score for the node - */ - private long getNodeScore(JexlNode node) { - int id = JexlNodes.id(node); - switch (id) { - case ParserTreeConstants.JJTNENODE: - return Integer.MAX_VALUE - 3L; - case ParserTreeConstants.JJTNRNODE: - return Integer.MAX_VALUE - 2L; - case ParserTreeConstants.JJTNOTNODE: - return Integer.MAX_VALUE - 1L; - default: - return id * NODE_ID_MULTIPLIER; - } - } -} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JexlNodeComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JexlNodeComparator.java new file mode 100644 index 00000000000..4796e20b5a6 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JexlNodeComparator.java @@ -0,0 +1,72 @@ +package datawave.query.jexl.nodes; + +import java.util.Comparator; + +import org.apache.commons.jexl3.parser.JexlNode; + +import datawave.query.jexl.JexlASTHelper; + +/** + * Comparator for JexlNodes. + *

+ * Implementing classes may prioritize different features for sorting. For example, sorting leaves before junctions, EQ nodes before ER nodes, or sorting + * lexicographically by field and value + *

+ * EQ < ER < Functions + */ +public abstract class JexlNodeComparator implements Comparator { + + @Override + public int compare(JexlNode left, JexlNode right) { + int leftCost = getCostIndex(JexlASTHelper.dereference(left)); + int rightCost = getCostIndex(JexlASTHelper.dereference(right)); + + return Integer.compare(leftCost, rightCost); + } + + /** + * Calculates a cost for the provided node + * + * @param node + * an arbitrary JexlNode + * @return the integer cost + */ + abstract int getCostIndex(JexlNode node); + + /** + * Get the cost for a union by summing the cost of each child + * + * @param node + * the union + * @return the cost + */ + protected int getCostForUnion(JexlNode node) { + int cost = 0; + for (int i = 0; i < node.jjtGetNumChildren(); i++) { + cost += getCostIndex(node.jjtGetChild(i)); + // check for overflows + if (cost == Integer.MAX_VALUE || cost < 0) { + return Integer.MAX_VALUE; + } + } + return cost; + } + + /** + * Get the cost for an intersection by taking the lowest cost of all children + * + * @param node + * the intersection + * @return the cost + */ + protected int getCostForIntersection(JexlNode node) { + int cost = Integer.MAX_VALUE; + for (int i = 0; i < node.jjtGetNumChildren(); i++) { + int childCost = getCostIndex(node.jjtGetChild(i)); + if (childCost < cost) { + cost = childCost; + } + } + return cost; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JunctionComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JunctionComparator.java new file mode 100644 index 00000000000..859d117700c --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/JunctionComparator.java @@ -0,0 +1,24 @@ +package datawave.query.jexl.nodes; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTOrNode; +import org.apache.commons.jexl3.parser.JexlNode; + +/** + * Comparator that pushes single leaf nodes to the left and junctions to the right + *

+ * Note: should only be used to break ties in other comparators. + */ +public class JunctionComparator extends JexlNodeComparator { + + @Override + public int getCostIndex(JexlNode node) { + if (node instanceof ASTAndNode && !QueryPropertyMarker.findInstance(node).isAnyType()) { + return 3; + } else if (node instanceof ASTOrNode) { + return 2; + } else { + return 1; + } + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/LexicographicalNodeComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/LexicographicalNodeComparator.java new file mode 100644 index 00000000000..37e183c46bf --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/LexicographicalNodeComparator.java @@ -0,0 +1,25 @@ +package datawave.query.jexl.nodes; + +import org.apache.commons.jexl3.parser.JexlNode; + +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; + +/** + * Sorts nodes according to the node string. + *

+ * Note: this comparator is intended to break ties between nodes of similar type or cost. Running this comparator in isolation will produce unexpected results. + */ +public class LexicographicalNodeComparator extends JexlNodeComparator { + + @Override + public int compare(JexlNode left, JexlNode right) { + String leftQuery = JexlStringBuildingVisitor.buildQuery(left); + String rightQuery = JexlStringBuildingVisitor.buildQuery(right); + return leftQuery.compareTo(rightQuery); + } + + @Override + public int getCostIndex(JexlNode node) { + throw new IllegalStateException("Not implemented"); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/NodeCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/NodeCostComparator.java deleted file mode 100644 index a238e5c6007..00000000000 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/NodeCostComparator.java +++ /dev/null @@ -1,40 +0,0 @@ -package datawave.query.jexl.nodes; - -import java.util.Comparator; - -import org.apache.commons.jexl3.parser.JexlNode; - -import datawave.query.jexl.visitors.JexlStringBuildingVisitor; - -/** - * Compare nodes based on arbitrary cost. - *

- * EQ < ER < Functions - */ -public abstract class NodeCostComparator implements Comparator { - - @Override - public int compare(JexlNode left, JexlNode right) { - int leftCost = getCostIndex(left); - int rightCost = getCostIndex(right); - - int result = Integer.compare(leftCost, rightCost); - if (result == 0) { - // if comparing by field cost (same field) provide an opportunity to sort alphabetically - result = JexlStringBuildingVisitor.buildQuery(left).compareTo(JexlStringBuildingVisitor.buildQuery(right)); - } - - return result; - } - - // Evaluate OR nodes last, then And nodes, then nodes by node id - - /** - * Calculates a cost for the provided node - * - * @param node - * an arbitrary JexlNode - * @return the integer cost - */ - abstract int getCostIndex(JexlNode node); -} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/TermCostComparator.java b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/TermCostComparator.java new file mode 100644 index 00000000000..ae3b62c2273 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/nodes/TermCostComparator.java @@ -0,0 +1,46 @@ +package datawave.query.jexl.nodes; + +import java.util.Map; + +import org.apache.commons.jexl3.parser.JexlNode; + +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; +import datawave.query.util.count.CountMap; + +/** + * Comparator that operates on term cardinality + */ +public class TermCostComparator extends AbstractNodeCostComparator { + + /** + * Constructor that accepts a {@link CountMap} + * + * @param counts + * the count map + */ + public TermCostComparator(CountMap counts) { + this(counts.getCounts()); + } + + /** + * Constructor that accepts a {@link Map} of counts + * + * @param counts + * the count map + */ + public TermCostComparator(Map counts) { + super(counts); + } + + /** + * The {@link TermCostComparator} uses the whole node string to calculate cost + * + * @param node + * a JexlNode + * @return the node key + */ + public String getNodeKey(JexlNode node) { + return JexlStringBuildingVisitor.buildQuery(node); + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java index 6dd428cb411..8b576303a01 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IngestTypeVisitor.java @@ -391,7 +391,7 @@ private Set getFieldsForFunctionNode(ASTFunctionNode node) { if (visitor.namespace().equals(CONTENT_FUNCTION_NAMESPACE)) { // all content function fields are added ContentFunctionsDescriptor.ContentJexlArgumentDescriptor contentDescriptor = new ContentFunctionsDescriptor().getArgumentDescriptor(node); - return contentDescriptor.fieldsAndTerms(Collections.emptySet(), Collections.emptySet(), Collections.emptySet(), null)[0]; + return contentDescriptor.fieldsAndTerms(Collections.emptySet(), Collections.emptySet(), Collections.emptySet(), null).getFields(); } else { JexlArgumentDescriptor descriptor = JexlFunctionArgumentDescriptorFactory.F.getArgumentDescriptor(node); if (descriptor == null) { diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java index 4f6a8c6fb2c..49dbba0a2c9 100644 --- 
a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IsNotNullPruningVisitor.java @@ -217,17 +217,25 @@ private JexlNode pruneNode(JexlNode node, Set fields) { * @return the original node, or null if it is pruned */ private JexlNode pruneUnion(JexlNode node, Set fields) { + // if there is a isNotNull in the union, and we know we have an equality node involving one of the isNotNull nodes, + // we have the means to prune the entire union. + boolean willPrune = false; + for (int i = 0; i < node.jjtGetNumChildren(); i++) { JexlNode deref = JexlASTHelper.dereference(node.jjtGetChild(i)); - if (!isIsNotNullFunction(deref)) { - return node; + if (isIsNotNullFunction(deref) && !willPrune) { + String field = fieldForNode(deref); + if (fields.contains(field)) { + willPrune = true; + } } - String field = fieldForNode(deref); - if (!fields.contains(field)) { - return node; - } } + + if (!willPrune) { + return node; + } + return null; } diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IteratorBuildingVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IteratorBuildingVisitor.java index 185f0c074a4..e2caf83fa8d 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IteratorBuildingVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IteratorBuildingVisitor.java @@ -162,6 +162,8 @@ public class IteratorBuildingVisitor extends BaseVisitor { protected TypeMetadata typeMetadata; protected EventDataQueryFilter attrFilter; + protected EventDataQueryFilter fiAttrFilter; + protected EventDataQueryFilter eventAttrFilter; protected Set fieldsToAggregate = Collections.emptySet(); protected Set termFrequencyFields = Collections.emptySet(); protected boolean allowTermFrequencyLookup = true; @@ -458,7 +460,9 @@ private NestedIterator buildExceededFromTermFrequency(String identifier, Je builder.setTypeMetadata(typeMetadata); builder.setFieldsToAggregate(fieldsToAggregate); builder.setTimeFilter(timeFilter); - builder.setAttrFilter(attrFilter); + // this code path is only executed in the context of a document range. This optimization scans + // the TF directly instead of the FI. 
+ builder.setAttrFilter(eventAttrFilter); builder.setEnv(env); builder.setTermFrequencyAggregator(getTermFrequencyAggregator(identifier, sourceNode, attrFilter, tfNextSeek)); builder.setNode(rootNode); @@ -1566,6 +1570,16 @@ public IteratorBuildingVisitor setAttrFilter(EventDataQueryFilter attrFilter) { return this; } + public IteratorBuildingVisitor setFiAttrFilter(EventDataQueryFilter fiAttrFilter) { + this.fiAttrFilter = fiAttrFilter; + return this; + } + + public IteratorBuildingVisitor setEventAttrFilter(EventDataQueryFilter eventAttrFilter) { + this.eventAttrFilter = eventAttrFilter; + return this; + } + public IteratorBuildingVisitor setDatatypeFilter(Predicate datatypeFilter) { this.datatypeFilter = datatypeFilter; return this; diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IvaratorRequiredVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IvaratorRequiredVisitor.java index d6bf8782b60..e282beea3bc 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IvaratorRequiredVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/IvaratorRequiredVisitor.java @@ -32,7 +32,7 @@ public Object visit(ASTAndNode and, Object data) { QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(and); if (instance.isType(EVALUATION_ONLY)) { return data; - } else if (instance.isAnyTypeOf(EXCEEDED_OR, EXCEEDED_VALUE)) { + } else if (instance.isIvarator()) { ivaratorRequired = true; } else if (!instance.isAnyTypeOf()) { super.visit(and, data); diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java index b2821874110..1b4f0cddff3 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/order/OrderByCostVisitor.java @@ -10,30 +10,34 @@ import org.apache.commons.jexl3.parser.JexlNode; import org.apache.commons.jexl3.parser.JexlNodes; import org.apache.commons.jexl3.parser.ParseException; +import org.apache.commons.jexl3.parser.ParserTreeConstants; import org.apache.log4j.Logger; import datawave.query.jexl.JexlASTHelper; -import datawave.query.jexl.nodes.DefaultNodeCostComparator; -import datawave.query.jexl.nodes.FieldOrTermNodeCostComparator; -import datawave.query.jexl.nodes.NodeCostComparator; +import datawave.query.jexl.nodes.DefaultJexlNodeComparator; +import datawave.query.jexl.nodes.FieldCostComparator; +import datawave.query.jexl.nodes.JexlNodeComparator; import datawave.query.jexl.nodes.QueryPropertyMarker; +import datawave.query.jexl.nodes.TermCostComparator; import datawave.query.jexl.visitors.BaseVisitor; import datawave.query.jexl.visitors.JexlStringBuildingVisitor; /** - * Orders query nodes by cost. + * Orders query nodes by cost using one or more {@link JexlNodeComparator}s. *

- Cost is calculated based on field counts, term counts, or a default cost based on the node id {@link org.apache.commons.jexl3.parser.ParserTreeConstants}. + The {@link DefaultJexlNodeComparator} orders a query based on the implied cost via the node id; see {@link ParserTreeConstants}. In general an EQ node is + faster to resolve than an ER node or a Marker node. *

- In general an EQ node is faster to resolve than an ER node. + The {@link FieldCostComparator} orders a query based on the field cardinality. This cardinality can be gathered from the metadata table across the entire + date range of the query, or the cardinality can be gathered from the global index and applied on a per-shard basis. *
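(Illustrative usage sketch, not part of this changeset: the query strings, field names, and counts below are made up, and the count map is assumed to be the Map<String,Long> returned by MetadataHelper#getCountsForFieldsInDateRange as used elsewhere in this diff.)
// Implied-cost ordering: EQ terms are expected to sort ahead of regex and marker terms.
String ordered = OrderByCostVisitor.order("FOO =~ 'ba.*' || FOO == 'bar'");
// expected to resemble: FOO == 'bar' || FOO =~ 'ba.*'
// Field-count ordering: the lower-cardinality field is expected to sort first.
Map<String,Long> counts = Map.of("FOO", 100_000L, "BAR", 250L);
ASTJexlScript sorted = OrderByCostVisitor.orderByFieldCount(script, counts); // 'script' is an already-parsed ASTJexlScript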

- * In general an ER node is faster to resolve than a function node. + * The {@link TermCostComparator} orders a query based on the term cardinality. This is gathered from the global index and applied on a per-shard basis. */ public class OrderByCostVisitor extends BaseVisitor { private static final Logger log = Logger.getLogger(OrderByCostVisitor.class); - private NodeCostComparator costComparator; + private JexlNodeComparator comparator; private final boolean isFieldMap; private final Map countMap; @@ -50,8 +54,7 @@ public static String order(String query) { script = order(script); return JexlStringBuildingVisitor.buildQueryWithoutParse(script); } catch (ParseException e) { - log.error("Could not order query by cost: " + query); - e.printStackTrace(); + log.error("Could not order query by cost: " + query, e); } return null; } @@ -182,7 +185,7 @@ private Object visitJunction(JexlNode node, Object data) { QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(node); if (!instance.isAnyType()) { JexlNode[] children = JexlNodes.getChildren(node); - Arrays.sort(children, getCostComparator()); + Arrays.sort(children, getComparator()); JexlNodes.setChildren(node, children); node.childrenAccept(this, data); @@ -190,15 +193,19 @@ private Object visitJunction(JexlNode node, Object data) { return data; } - private NodeCostComparator getCostComparator() { - if (costComparator == null) { + private JexlNodeComparator getComparator() { + if (comparator == null) { if (countMap != null) { - costComparator = new FieldOrTermNodeCostComparator(countMap, isFieldMap); + if (isFieldMap) { + comparator = new FieldCostComparator(countMap); + } else { + comparator = new TermCostComparator(countMap); + } } else { - costComparator = new DefaultNodeCostComparator(); + comparator = new DefaultJexlNodeComparator(); } } - return costComparator; + return comparator; } } diff --git a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java index d601d49b8c9..3a06f4ac411 100644 --- a/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java +++ b/warehouse/query-core/src/main/java/datawave/query/language/functions/jexl/NoExpansion.java @@ -4,26 +4,27 @@ import java.util.ArrayList; import java.util.List; +import datawave.query.jexl.functions.QueryFunctions; import datawave.query.language.functions.QueryFunction; import datawave.webservice.query.exception.BadRequestQueryException; import datawave.webservice.query.exception.DatawaveErrorCode; /** * This function accepts a comma separated list of fields to be excluded from QueryModel expansion. The purpose is to provide users with an easy way to avoid - * undesirable query model expansions. - * - * Note: The exclude is only applied to the fields in the original query. An original field can be expanded into an excluded field. + * undesirable query model expansions.
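(Hypothetical example, not from this changeset; the field names are made up.) A caller who wants FIELD1 and FIELD2 left untouched by model expansion might submit a query whose JEXL form contains a fragment along the lines of f:noExpansion(FIELD1, FIELD2) && FIELD1 == 'value'; the function arguments name the fields to exclude, and validate() now requires at least one argument.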
+ * Note: The exclusion is only applied to the fields in the original query. An original field can be expanded into an excluded field. */ public class NoExpansion extends JexlQueryFunction { public NoExpansion() { - super("noExpansion", new ArrayList<>()); + super(QueryFunctions.NO_EXPANSION, new ArrayList<>()); } @Override public void validate() throws IllegalArgumentException { - if (this.parameterList.size() != 1) { - BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, MessageFormat.format("{0}", this.name)); + if (this.parameterList.isEmpty()) { + BadRequestQueryException qe = new BadRequestQueryException(DatawaveErrorCode.INVALID_FUNCTION_ARGUMENTS, + MessageFormat.format("{0} requires at least one argument", this.name)); throw new IllegalArgumentException(qe); } } @@ -35,7 +36,19 @@ public QueryFunction duplicate() { @Override public String toString() { - List params = getParameterList(); - return "f:noExpansion(" + String.join("", params) + ")"; + StringBuilder sb = new StringBuilder(); + + sb.append(QueryFunctions.QUERY_FUNCTION_NAMESPACE).append(':').append(QueryFunctions.NO_EXPANSION); + if (parameterList.isEmpty()) { + sb.append("()"); + } else { + char separator = '('; + for (String param : parameterList) { + sb.append(separator).append(escapeString(param)); + separator = ','; + } + sb.append(')'); + } + return sb.toString(); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/language/processor/lucene/CustomAnalyzerQueryNodeProcessor.java b/warehouse/query-core/src/main/java/datawave/query/language/processor/lucene/CustomAnalyzerQueryNodeProcessor.java index 82a30c5be99..948107b9609 100644 --- a/warehouse/query-core/src/main/java/datawave/query/language/processor/lucene/CustomAnalyzerQueryNodeProcessor.java +++ b/warehouse/query-core/src/main/java/datawave/query/language/processor/lucene/CustomAnalyzerQueryNodeProcessor.java @@ -21,8 +21,10 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; +import java.util.LinkedList; import java.util.List; import java.util.Set; +import java.util.TreeSet; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; @@ -53,16 +55,17 @@ *

* Applies tokenization to {@link TextableQueryNode} objects using a configured Lucene {@link Analyzer}. *

- * * Uses the {@link Analyzer} specified in the the {@link ConfigurationKeys#ANALYZER} attribute of the {@link QueryConfigHandler} to process non-wildcard * {@link FieldQueryNode}s for fields listed in tokenizedFields. - * + *

* (Nodes that are {@link WildcardQueryNode}, {@link FuzzyQueryNode} or {@link RegexpQueryNode} or are part of a {@link TermRangeQueryNode} are NOT processed by * this processor.) - * + *

+ *

* The text of each {@link TextableQueryNode} is processed using the {@link Analyzer} to generate tokens. If the analyzer returns one or more terms that are not * identical to the input, the processor generates an {@link OrQueryNode} containing the original query node and a new {@link QuotedFieldQueryNode} or * {@link SlopQueryNode} depending on the nature of the original query node and whether useSlopForTokenizedTerms is false. + *
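(Illustrative rewrite, not taken from this changeset: the field name and analyzer output are assumed.) A single-term node such as TOKFIELD:wi-fi whose analyzer emits the tokens "wi" and "fi" would be kept and OR'd with its tokenized form, yielding something like (TOKFIELD:wi-fi OR TOKFIELD:"wi fi"~2), with the slop reflecting the number of token positions produced.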

*

* There are three primary cases where tokenization will be applied to input query terms - single terms (e.g: wi-fi), phrases (e.g: "portable wi-fi"), and * phrases with slop (e.g: "portable wi-fi"~3). In the case of single term input, tokenization will produce a phrase with slop equals to the number of positions @@ -250,9 +253,7 @@ protected List setChildrenOrder(List children) throws Quer return children; /* no-op */ } - private QueryNode tokenizeNode(QueryNode node, final String text, final String field) throws QueryNodeException { - CachingTokenFilter buffer = null; - + private QueryNode tokenizeNode(final QueryNode node, final String text, final String field) throws QueryNodeException { if (analyzer == null) { if (logger.isDebugEnabled()) { logger.debug("Skipping tokenization of node: '" + node + "'; no analyzer is set"); @@ -266,125 +267,154 @@ private QueryNode tokenizeNode(QueryNode node, final String text, final String f if (logger.isDebugEnabled()) { logger.debug("Skipping processed query node: " + node.toString()); } - return node; - } else { - // mark the original node processed. - node.setTag(NODE_PROCESSED, Boolean.TRUE); } - try { - // Take a pass over the tokens and buffer them in the caching token filter. - TokenStream source = this.analyzer.tokenStream(field, new StringReader(text)); - source.reset(); - - buffer = new CachingTokenFilter(source); + node.setTag(NODE_PROCESSED, Boolean.TRUE); // mark this node as processed, so we don't process it again. - PositionIncrementAttribute posIncrAtt = null; - int numTokens = 0; + try (TokenStream buffer = this.analyzer.tokenStream(field, new StringReader(text))) { - if (buffer.hasAttribute(PositionIncrementAttribute.class)) { - posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class); - } - - while (buffer.incrementToken()) { - numTokens++; - } - - // rewind the buffer stream + // prepare the source for reading. buffer.reset(); - // close original stream - all tokens buffered - source.close(); - if (!buffer.hasAttribute(CharTermAttribute.class) || numTokens == 0) { - // no terms found, return unmodified node. - return node; + if (!buffer.hasAttribute(CharTermAttribute.class)) { + return node; // tokenizer can't produce terms, return unmodified query node. } final CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class); + final PositionIncrementAttribute posIncrAtt = buffer.hasAttribute(PositionIncrementAttribute.class) + ? buffer.getAttribute(PositionIncrementAttribute.class) + : null; + + // the variant builder will maintain multiple versions of the tokenized query as we find tokens + // that have multiple variants in the same position - e.g., stems, roots or lemmas. + final VariantBuilder b = new VariantBuilder(); - StringBuilder b = new StringBuilder(); - int slopRange = 0; + // build the new query strings from the tokenizer output while tracking cases where we've dropped words + // and will need to adjust the phrase slop for the query as a result. + int positionCount = 0; - String term; while (buffer.incrementToken()) { - term = termAtt.toString(); - b.append(term).append(" "); - - // increment the slop range for the tokenized text based on the - // positionIncrement attribute if available, otherwise one position - // per token. - if (posIncrAtt != null && this.positionIncrementsEnabled) { - slopRange += posIncrAtt.getPositionIncrement(); - } else { - slopRange++; - } + String token = termAtt.toString(); + final int positionIncrement = posIncrAtt != null ? 
posIncrAtt.getPositionIncrement() : 1; + positionCount += positionIncrementsEnabled ? positionIncrement : 1; + b.append(token, positionIncrement == 0); } - b.setLength(b.length() - 1); // trim trailing whitespace - - if (b.length() > 0) { - final String tokenizedText = b.toString(); + if (b.hasNoVariants()) { + return node; // If we didn't produce anything from the tokenizer, return unmodified query node. + } - // Check to see that the tokenizer produced output that was different from the original query node. - // If so avoid creating an OR clause. We compare the 'escaped' string of the original query so that we - // do not mistreat things like spaces. - if (TextableQueryNode.class.isAssignableFrom(node.getClass())) { - final CharSequence c = ((TextableQueryNode) node).getText(); - final String cmp = UnescapedCharSequence.class.isAssignableFrom(c.getClass()) ? toStringEscaped((UnescapedCharSequence) c) : c.toString(); - if (tokenizedText.equalsIgnoreCase(cmp)) { - return node; - } + // calculate the amount of slop we need to add based on the original query and the number of positions observed + int slopNeeded = calculateSlopNeeded(node, text, positionCount); + + // process each of the 'variants' to ensure they are different from the base query, if so, potentially + // create a new query node and add it to the set of OR clauses. Variants are guaranteed unique, so no + // need to deduplicate there. + final String baseQueryText = getEscapedBaseQueryText(node); + final LinkedList clauses = new LinkedList<>(); + for (String tokenizedText : b.getVariants()) { + if (tokenizedText.equalsIgnoreCase(baseQueryText)) { + continue; // skip this variant - it adds nothing new over the base query. } + QueryNode newQueryNode = createNewQueryNode(field, tokenizedText, slopNeeded); + clauses.add(newQueryNode); + } - QueryNode n = new QuotedFieldQueryNode(field, new UnescapedCharSequence(tokenizedText), -1, -1); - // mark the derived node processed so we don't process it again later. - n.setTag(NODE_PROCESSED, Boolean.TRUE); - - // Adjust the slop based on the difference between the original - // slop minus the original token count (based on whitespace) - int originalSlop = 0; - if (node.getTag(ORIGINAL_SLOP) != null) { - originalSlop = (Integer) node.getTag(ORIGINAL_SLOP); - final int delta = originalSlop - text.split("\\s+").length; - slopRange += delta; - } + if (clauses.isEmpty()) { + return node; + } - // Only add slop if the original had slop, or the original was not a phrase and slop is enabled. - // Using slop for non-quoted terms is a workaround until the phrase function will accept multiple - // terms in the same position as a valid match. - boolean originalWasQuoted = QuotedFieldQueryNode.class.isAssignableFrom(node.getClass()); - if ((useSlopForTokenizedTerms && !originalWasQuoted) || originalSlop > 0) { - n = new SlopQueryNode(n, slopRange); - } + // If we made it here, the tokenizer produced output that was different from the original query node, and + // we want to build an 'OR' clause that will match either query string. + clauses.addFirst(possiblyWrapOriginalQueryNode(node)); + return new GroupQueryNode(new OrQueryNode(clauses)); + } catch (IOException e) { + throw new QueryNodeException(e); + } + } - // The tokenizer produced output that was different from the original query node, wrap the original - // node and the tokenizer produced node in a OR query. To do this properly, we need to wrap the - // original node in a slop query node if it was originally in a slop query node. 
- if (originalSlop > 0) { - // restore the original slop wrapper to the base node if it was present originally. - node = new SlopQueryNode(node, originalSlop); - } + /** + * Create a new query node for the specified field and tokenize text, optionally wrapping it in a SlopQueryNode if we've determined that slop is needed + * (either due to tokens being removed or there being slop on the original query we need to account for. + * + * @param field + * the field for the query node + * @param tokenizedText + * the text for the query node + * @param slopNeeded + * whether slop is needed. + * @return a new QuotedFieldQueryNode or possibly a SlopQueryNode containing the new clause. Both of these nodes will be marked as 'PROCESSED'. + */ + public QueryNode createNewQueryNode(String field, String tokenizedText, int slopNeeded) { + QueryNode newQueryNode = new QuotedFieldQueryNode(field, new UnescapedCharSequence(tokenizedText), -1, -1); + newQueryNode.setTag(NODE_PROCESSED, Boolean.TRUE); // don't process this node again. + if (slopNeeded > 0) { + newQueryNode = new SlopQueryNode(newQueryNode, slopNeeded); + newQueryNode.setTag(NODE_PROCESSED, Boolean.TRUE); // don't process this node again. + } + return newQueryNode; + } - final List clauses = new ArrayList<>(); - clauses.add(node); - clauses.add(n); + /** + * Calculate the amount of slop we need to add to a new query node for tokenized text. This is based on the based on the number of positions observed in the + * tokenized text and the difference between the slop in the original query minus the original token count. + * + * @param node + * the original query node from which the tokenized text originated. + * @param text + * the text of the original query. + * @param positionsObserved + * the number of positions observed in the tokenized text. + * @return the amount of slop we need to add to our new query clauses. + */ + private int calculateSlopNeeded(QueryNode node, String text, int positionsObserved) { + int slopNeeded = positionsObserved; - node = new GroupQueryNode(new OrQueryNode(clauses)); - } - } catch (IOException e) { - throw new QueryNodeException(e); - } finally { - if (buffer != null) { - try { - buffer.close(); - } catch (IOException ex) { - logger.warn("Exception closing caching token filter: ", ex); - } + final boolean originalWasQuoted = QuotedFieldQueryNode.class.isAssignableFrom(node.getClass()); + final int originalSlop = node.getTag(ORIGINAL_SLOP) != null ? (Integer) node.getTag(ORIGINAL_SLOP) : 0; + + if ((useSlopForTokenizedTerms && !originalWasQuoted) || originalSlop > 0) { + // Adjust the slop needed based on the slop in the original query. + final int delta = originalSlop - text.split("\\s+").length; + if (delta > 0) { + slopNeeded += delta; } + } else { + slopNeeded = 0; } + return slopNeeded; + } - return node; + /** + * If the original query node was nested in a SlopQueryNode, that fact has been stored in the ORIGINAL_SLOP tag, and we'll need to re-create that slop node. + * Otherwise, return the original node unchanged. + * + * @param node + * the node to process. + * @return the node wrapped in a SlopQueryNode, if the input node originally had slop. + */ + private static QueryNode possiblyWrapOriginalQueryNode(QueryNode node) { + final int originalSlop = node.getTag(ORIGINAL_SLOP) != null ? (Integer) node.getTag(ORIGINAL_SLOP) : 0; + final QueryNode originalQueryNode = originalSlop > 0 ? 
new SlopQueryNode(node, originalSlop) : node; + originalQueryNode.setTag(NODE_PROCESSED, Boolean.TRUE); + return originalQueryNode; + } + + /** + * If a query node was something that has text, get the text. If the query node was already unescaped, convert it to it's escaped version. This way it can + * be compared to other nodes with escapes in place. + * + * @param node + * the node to extract text from + * @return the escaped version of the text from the node, null if the node had no text. + */ + private static String getEscapedBaseQueryText(QueryNode node) { + if (TextableQueryNode.class.isAssignableFrom(node.getClass())) { + final CharSequence c = ((TextableQueryNode) node).getText(); + return UnescapedCharSequence.class.isAssignableFrom(c.getClass()) ? toStringEscaped((UnescapedCharSequence) c) : c.toString(); + } + return null; } /** @@ -394,7 +424,7 @@ private QueryNode tokenizeNode(QueryNode node, final String text, final String f * string value * @return unescaped string */ - private String toStringEscaped(UnescapedCharSequence unescaped) { + private static String toStringEscaped(UnescapedCharSequence unescaped) { // non efficient implementation final StringBuilder result = new StringBuilder(); final int len = unescaped.length(); @@ -408,4 +438,68 @@ private String toStringEscaped(UnescapedCharSequence unescaped) { } return result.toString(); } + + /** + * Maintains one or more buffers for tokenized queries. During standard operation, works like a StringBuilder. If the tokenizer encounters a variant (e.g., + * zero position offset, same start and end as the previous token) appendVariant will start building a second buffer containing that variant. + */ + public static class VariantBuilder { + List> variants = new ArrayList<>(); + + public VariantBuilder append(String input, boolean appendVariant) { + return appendVariant ? appendVariant(input) : append(input); + } + + public VariantBuilder append(String input) { + if (variants.isEmpty()) { + variants.add(new ArrayList<>()); + } + + for (List b : variants) { + b.add(input); + } + + return this; + } + + public VariantBuilder appendVariant(String input) { + if (variants.isEmpty()) { + append(input); + } else { + + List> newVariants = new ArrayList<>(); + + for (List b : variants) { + // create a new variant of all the existing strings, replacing the + List newVariant = new ArrayList<>(b); + newVariant.set(newVariant.size() - 1, input); + newVariants.add(newVariant); + } + + variants.addAll(newVariants); + } + + return this; + } + + public boolean hasNoVariants() { + boolean hasNoVariants = true; + for (List b : variants) { + if (!b.isEmpty()) { + // at least one of the variant buffers has something. 
+ hasNoVariants = false; + break; + } + } + return hasNoVariants; + } + + public Set getVariants() { + Set result = new TreeSet<>(); + for (List b : variants) { + result.add(String.join(" ", b)); + } + return result; + } + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index 91980c1db2e..8a0adb9ce04 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -20,6 +20,7 @@ import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Objects; import java.util.Set; import java.util.TimeZone; import java.util.TreeSet; @@ -28,6 +29,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.regex.PatternSyntaxException; import java.util.stream.Collectors; @@ -80,6 +82,7 @@ import datawave.query.config.ScanHintRule; import datawave.query.config.ShardQueryConfiguration; import datawave.query.exceptions.CannotExpandUnfieldedTermFatalException; +import datawave.query.exceptions.DatawaveAsyncOperationException; import datawave.query.exceptions.DatawaveFatalQueryException; import datawave.query.exceptions.DatawaveQueryException; import datawave.query.exceptions.DoNotPerformOptimizedQueryException; @@ -158,6 +161,15 @@ import datawave.query.jexl.visitors.order.OrderByCostVisitor; import datawave.query.jexl.visitors.whindex.WhindexVisitor; import datawave.query.model.QueryModel; +import datawave.query.planner.async.AbstractQueryPlannerCallable; +import datawave.query.planner.async.FetchCompositeMetadata; +import datawave.query.planner.async.FetchContentExpansionFields; +import datawave.query.planner.async.FetchIndexOnlyFields; +import datawave.query.planner.async.FetchIndexedFields; +import datawave.query.planner.async.FetchNonEventFields; +import datawave.query.planner.async.FetchTermFrequencyFields; +import datawave.query.planner.async.FetchTypeMetadata; +import datawave.query.planner.async.SerializeIvaratorCacheDirs; import datawave.query.planner.comparator.DefaultQueryPlanComparator; import datawave.query.planner.comparator.GeoWaveQueryPlanComparator; import datawave.query.planner.pushdown.PushDownVisitor; @@ -272,10 +284,46 @@ public class DefaultQueryPlanner extends QueryPlanner implements Cloneable { protected String rangeStreamClass = RangeStream.class.getCanonicalName(); - protected ExecutorService builderThread = null; + protected ExecutorService executor = null; + + protected AbstractQueryPlannerCallable compositeMetadataCallable; + protected AbstractQueryPlannerCallable typeMetadataCallable; + protected AbstractQueryPlannerCallable contentExpansionFieldsCallable; + protected AbstractQueryPlannerCallable ivaratorCacheDirCallable; + protected AbstractQueryPlannerCallable> indexedFieldsCallable; + protected AbstractQueryPlannerCallable> indexOnlyFieldsCallable; + protected AbstractQueryPlannerCallable> nonEventFieldsCallable; + protected AbstractQueryPlannerCallable> termFrequencyFieldsCallable; + + protected Future compositeMetadataFuture; + protected Future typeMetadataFuture; + protected Future contentExpansionFieldsFuture; + protected Future ivaratorCacheDirFuture; + protected Future> indexedFieldsFuture; + protected Future> indexOnlyFieldsFuture; + protected Future> 
nonEventFieldsFuture; + protected Future> termFrequencyFieldsFuture; + + protected CompositeMetadata compositeMetadata; + protected TypeMetadata typeMetadata; + protected String contentExpansionFields; + protected String serializedIvaratorDirs; + protected Set indexedFields; + protected Set indexOnlyFields; + protected Set nonEventFields; + protected Set termFrequencyFields; protected Future settingFuture = null; + private boolean logConcurrentStageExecution = false; + private int concurrentTimeoutMillis = 10_000; // 10 second default + + // tracks time spent in various stages that may not be covered in the other query stopwatches + protected QueryStopwatch stageStopWatch = new QueryStopwatch(); + + // tracks time saved via concurrent task execution + protected QueryStopwatch futureStopWatch = new QueryStopwatch(); + protected long maxRangeWaitMillis = 125; /** @@ -394,10 +442,10 @@ public CloseableIterable process(GenericQueryConfiguration genericCon throw new ClassCastException("Config object must be an instance of ShardQueryConfiguration"); } - builderThread = Executors.newSingleThreadExecutor(); - ShardQueryConfiguration config = (ShardQueryConfiguration) genericConfig; + startConcurrentExecution(config); + // lets mark the query as started (used by ivarators at a minimum) try { markQueryStarted(config, settings); @@ -408,6 +456,47 @@ public CloseableIterable process(GenericQueryConfiguration genericCon return process(scannerFactory, getMetadataHelper(config), getDateIndexHelper(config), config, query, settings); } + /** + * This method starts a number of long-running tasks that can be done in parallel. + * + * @param config + * the config + */ + protected void startConcurrentExecution(ShardQueryConfiguration config) { + // iterator setting future + seven futures below make 8, add two for growth/extension + executor = Executors.newFixedThreadPool(10); + + compositeMetadata = null; + typeMetadata = null; + contentExpansionFields = null; + serializedIvaratorDirs = null; + indexedFields = null; + indexOnlyFields = null; + nonEventFields = null; + termFrequencyFields = null; + + // expensive operations are executed in parallel + compositeMetadataCallable = new FetchCompositeMetadata(futureStopWatch, metadataHelper, config.getDatatypeFilter()); + typeMetadataCallable = new FetchTypeMetadata(futureStopWatch, metadataHelper, config.getDatatypeFilter()); + contentExpansionFieldsCallable = new FetchContentExpansionFields(futureStopWatch, metadataHelper, config.getDatatypeFilter()); + ivaratorCacheDirCallable = new SerializeIvaratorCacheDirs(futureStopWatch, this, config); + indexedFieldsCallable = new FetchIndexedFields(futureStopWatch, metadataHelper, config.getDatatypeFilter()); + indexOnlyFieldsCallable = new FetchIndexOnlyFields(futureStopWatch, metadataHelper, config.getDatatypeFilter()); + nonEventFieldsCallable = new FetchNonEventFields(futureStopWatch, metadataHelper, config.getDatatypeFilter()); + termFrequencyFieldsCallable = new FetchTermFrequencyFields(futureStopWatch, metadataHelper, config.getDatatypeFilter()); + + // field sets tend to be needed first, so submit those before others + indexOnlyFieldsFuture = executor.submit(indexOnlyFieldsCallable); + indexedFieldsFuture = executor.submit(indexedFieldsCallable); + nonEventFieldsFuture = executor.submit(nonEventFieldsCallable); + termFrequencyFieldsFuture = executor.submit(termFrequencyFieldsCallable); + + compositeMetadataFuture = executor.submit(compositeMetadataCallable); + typeMetadataFuture = 
executor.submit(typeMetadataCallable); + contentExpansionFieldsFuture = executor.submit(contentExpansionFieldsCallable); + ivaratorCacheDirFuture = executor.submit(ivaratorCacheDirCallable); + } + protected CloseableIterable process(ScannerFactory scannerFactory, MetadataHelper metadataHelper, DateIndexHelper dateIndexHelper, ShardQueryConfiguration config, String query, Query settings) throws DatawaveQueryException { settingFuture = null; @@ -415,7 +504,7 @@ protected CloseableIterable process(ScannerFactory scannerFactory, Me IteratorSetting cfg = null; if (preloadOptions) { - cfg = getQueryIterator(metadataHelper, config, settings, "", false, true); + cfg = getQueryIterator(metadataHelper, config, "", false, true); } try { @@ -470,7 +559,7 @@ protected CloseableIterable process(ScannerFactory scannerFactory, Me if (!config.isGeneratePlanOnly()) { while (null == cfg) { - cfg = getQueryIterator(metadataHelper, config, settings, "", false, false); + cfg = getQueryIterator(metadataHelper, config, "", false, false); } configureIterator(config, cfg, newQueryString, isFullTable); } @@ -482,6 +571,10 @@ protected CloseableIterable process(ScannerFactory scannerFactory, Me this.plannedScript = newQueryString; config.setQueryString(this.plannedScript); + if (logConcurrentStageExecution) { + logTimeSavedViaConcurrentExecution(); + } + if (!config.isGeneratePlanOnly()) { // add the geo query comparator to sort by geo range granularity if this is a geo query List> queryPlanComparators = null; @@ -544,8 +637,9 @@ private void configureIterator(ShardQueryConfiguration config, IteratorSetting c addOption(cfg, QueryOptions.FULL_TABLE_SCAN_ONLY, Boolean.toString(isFullTable), false); addOption(cfg, QueryOptions.TRACK_SIZES, Boolean.toString(config.isTrackSizes()), false); addOption(cfg, QueryOptions.ACTIVE_QUERY_LOG_NAME, config.getActiveQueryLogName(), false); + // Set the start and end dates - configureTypeMappings(config, cfg, metadataHelper, getCompressOptionMappings()); + configureTypeMappings(config, cfg, metadataHelper, getCompressOptionMappings(), false); } /** @@ -575,8 +669,8 @@ public void close(GenericQueryConfiguration genericConfig, Query settings) { log.warn("Config object must be an instance of ShardQueryConfiguration to properly close the DefaultQueryPlanner. 
You gave me a " + genericConfig); } - if (null != builderThread) { - builderThread.shutdown(); + if (null != executor) { + executor.shutdown(); } return; } @@ -590,8 +684,8 @@ public void close(GenericQueryConfiguration genericConfig, Query settings) { log.error("Failed to close query " + settings.getId(), e); } - if (null != builderThread) { - builderThread.shutdown(); + if (null != executor) { + executor.shutdown(); } } @@ -780,7 +874,7 @@ protected ASTJexlScript updateQueryTree(ScannerFactory scannerFactory, MetadataH // | Post Query Model Expansion Clean Up | // +-------------------------------------+ - Set indexOnlyFields = loadIndexedFields(config); + Set indexOnlyFields = getIndexOnlyFields(); if (!indexOnlyFields.isEmpty()) { // filter:includeRegex and filter:excludeRegex functions cannot be run against index-only fields, clean that up @@ -837,7 +931,7 @@ protected ASTJexlScript updateQueryTree(ScannerFactory scannerFactory, MetadataH // check the query for any fields that are term frequencies // if any exist, populate the shard query config with these fields - timedCheckForTokenizedFields(timers, "Check for term frequency (tokenized) fields", config, metadataHelper); + timedCheckForTokenizedFields(timers, "Check for term frequency (tokenized) fields", config); if (reduceQuery) { config.setQueryTree(timedReduce(timers, "Reduce Query Final", config.getQueryTree())); @@ -910,14 +1004,9 @@ protected ASTJexlScript processTree(final ASTJexlScript originalQueryTree, Shard Set indexOnlyFields = null; Set nonEventFields = null; if (config.getMinSelectivity() > 0 || !disableBoundedLookup) { - try { - indexedFields = metadataHelper.getIndexedFields(config.getDatatypeFilter()); - indexOnlyFields = metadataHelper.getIndexOnlyFields(config.getDatatypeFilter()); - nonEventFields = metadataHelper.getNonEventFields(config.getDatatypeFilter()); - } catch (TableNotFoundException te) { - QueryException qe = new QueryException(DatawaveErrorCode.METADATA_ACCESS_ERROR, te); - throw new DatawaveFatalQueryException(qe); - } + indexedFields = getIndexedFields(); + indexOnlyFields = getIndexOnlyFields(); + nonEventFields = getNonEventFields(); } // apply the node transform rules @@ -1166,20 +1255,14 @@ protected void timeScanHintRules(QueryStopwatch timers, String stage, ShardQuery stopwatch.stop(); } - protected void timedCheckForTokenizedFields(QueryStopwatch timers, String stage, ShardQueryConfiguration config, MetadataHelper metadataHelper) { + protected void timedCheckForTokenizedFields(QueryStopwatch timers, String stage, ShardQueryConfiguration config) { TraceStopwatch stopwatch = timers.newStartedStopwatch("DefaultQueryPlanner - " + stage); // Figure out if the query contained any term frequency terms so we know // if we may use the term frequencies instead of the fields index in some cases Set queryTfFields = Collections.emptySet(); - Set termFrequencyFields; - try { - termFrequencyFields = metadataHelper.getTermFrequencyFields(config.getDatatypeFilter()); - } catch (TableNotFoundException e) { - stopwatch.stop(); - QueryException qe = new QueryException(DatawaveErrorCode.TERM_FREQUENCY_FIELDS_RETRIEVAL_ERROR, e); - throw new DatawaveFatalQueryException(qe); - } + Set termFrequencyFields = getTermFrequencyFields(); + if (!termFrequencyFields.isEmpty()) { queryTfFields = SetMembershipVisitor.getMembers(termFrequencyFields, config, config.getQueryTree()); @@ -1216,37 +1299,6 @@ protected QueryModel loadQueryModel(ShardQueryConfiguration config) { return queryModelProvider.getQueryModel(); } - /* - - 
- */ - - protected Set loadIndexedFields(ShardQueryConfiguration config) { - try { - return metadataHelper.getIndexOnlyFields(config.getDatatypeFilter()); - } catch (TableNotFoundException e) { - QueryException qe = new QueryException(DatawaveErrorCode.INDEX_ONLY_FIELDS_RETRIEVAL_ERROR, e); - throw new DatawaveFatalQueryException(qe); - } - } - - /** - * Loads expansion fields filtered by datatype. If an error occurs that error is rethrown as a {@link DatawaveFatalQueryException} - * - * @param config - * a configuration - * @return list of expansion fields - */ - protected Set loadExpansionFields(ShardQueryConfiguration config) { - try { - return metadataHelper.getExpansionFields(config.getDatatypeFilter()); - } catch (TableNotFoundException e) { - QueryException qe = new QueryException(DatawaveErrorCode.METADATA_ACCESS_ERROR, e); - log.info(qe); - throw new DatawaveFatalQueryException(qe); - } - } - /* * Start methods that operate on the query tree */ @@ -1401,7 +1453,7 @@ protected ASTJexlScript timedApplyNodeTransformRules(QueryStopwatch timers, Stri protected ASTJexlScript timedExpandAnyFieldRegexNodes(QueryStopwatch timers, final ASTJexlScript script, ShardQueryConfiguration config, MetadataHelper metadataHelper, ScannerFactory scannerFactory, String query) throws DatawaveQueryException { try { - config.setIndexedFields(metadataHelper.getIndexedFields(config.getDatatypeFilter())); + config.setIndexedFields(getIndexedFields()); config.setReverseIndexedFields(metadataHelper.getReverseIndexedFields(config.getDatatypeFilter())); // @formatter:off @@ -2200,10 +2252,11 @@ protected void configureAdditionalOptions(ShardQueryConfiguration config, Iterat // no-op } - protected Future loadQueryIterator(final MetadataHelper metadataHelper, final ShardQueryConfiguration config, final Query settings, - final String queryString, final Boolean isFullTable, boolean isPreload) throws DatawaveQueryException { + protected Future loadQueryIterator(final MetadataHelper metadataHelper, final ShardQueryConfiguration config, final Boolean isFullTable, + boolean isPreload) { + + return executor.submit(() -> { - return builderThread.submit(() -> { // VersioningIterator is typically set at 20 on the table IteratorSetting cfg = new IteratorSetting(config.getBaseIteratorPriority() + 40, "query", getQueryIteratorClass()); @@ -2230,7 +2283,7 @@ protected Future loadQueryIterator(final MetadataHelper metadat addOption(cfg, QueryOptions.ZOOKEEPER_CONFIG, config.getZookeeperConfig(), false); } if (config.getIvaratorCacheDirConfigs() != null && !config.getIvaratorCacheDirConfigs().isEmpty()) { - addOption(cfg, QueryOptions.IVARATOR_CACHE_DIR_CONFIG, IvaratorCacheDirConfig.toJson(getShuffledIvaratoCacheDirConfigs(config)), false); + addOption(cfg, QueryOptions.IVARATOR_CACHE_DIR_CONFIG, getSerializedIvaratorDirs(), false); } addOption(cfg, QueryOptions.IVARATOR_CACHE_BUFFER_SIZE, Integer.toString(config.getIvaratorCacheBufferSize()), false); addOption(cfg, QueryOptions.IVARATOR_SCAN_PERSIST_THRESHOLD, Long.toString(config.getIvaratorCacheScanPersistThreshold()), false); @@ -2259,27 +2312,17 @@ protected Future loadQueryIterator(final MetadataHelper metadat loadFields(cfg, config, isPreload); configureSeekingOptions(cfg, config); - try { - CompositeMetadata compositeMetadata = metadataHelper.getCompositeMetadata().filter(config.getQueryFieldsDatatypes().keySet()); - if (compositeMetadata != null && !compositeMetadata.isEmpty()) { - addOption(cfg, QueryOptions.COMPOSITE_METADATA, 
java.util.Base64.getEncoder().encodeToString(CompositeMetadata.toBytes(compositeMetadata)), - false); - } - } catch (TableNotFoundException e) { - QueryException qe = new QueryException(DatawaveErrorCode.COMPOSITE_METADATA_CONFIG_ERROR, e); - throw new DatawaveQueryException(qe); + CompositeMetadata compositeMetadata = getCompositeMetadata(); + compositeMetadata = compositeMetadata.filter(config.getQueryFieldsDatatypes().keySet()); + if (compositeMetadata != null && !compositeMetadata.isEmpty()) { + addOption(cfg, QueryOptions.COMPOSITE_METADATA, java.util.Base64.getEncoder().encodeToString(CompositeMetadata.toBytes(compositeMetadata)), + false); } String datatypeFilter = config.getDatatypeFilterAsString(); - addOption(cfg, QueryOptions.DATATYPE_FILTER, datatypeFilter, false); - try { - addOption(cfg, QueryOptions.CONTENT_EXPANSION_FIELDS, Joiner.on(',').join(metadataHelper.getContentFields(config.getDatatypeFilter())), false); - } catch (TableNotFoundException e) { - QueryException qe = new QueryException(DatawaveErrorCode.CONTENT_FIELDS_RETRIEVAL_ERROR, e); - throw new DatawaveQueryException(qe); - } + addOption(cfg, QueryOptions.CONTENT_EXPANSION_FIELDS, getContentExpansionFields(), false); if (config.isDebugMultithreadedSources()) { addOption(cfg, QueryOptions.DEBUG_MULTITHREADED_SOURCES, Boolean.toString(config.isDebugMultithreadedSources()), false); @@ -2312,8 +2355,8 @@ protected Future loadQueryIterator(final MetadataHelper metadat private void loadFields(IteratorSetting cfg, ShardQueryConfiguration config, boolean isPreload) throws DatawaveQueryException { try { Set compositeFields = metadataHelper.getCompositeToFieldMap(config.getDatatypeFilter()).keySet(); - Set indexedFields = metadataHelper.getIndexedFields(config.getDatatypeFilter()); - Set indexOnlyFields = metadataHelper.getIndexOnlyFields(config.getDatatypeFilter()); + Set indexedFields = getIndexedFields(); + Set indexOnlyFields = getIndexOnlyFields(); // only reduce the query fields if planning has occurred if (!isPreload && config.getReduceQueryFields()) { @@ -2360,6 +2403,10 @@ protected void configureSeekingOptions(IteratorSetting cfg, ShardQueryConfigurat if (config.getTfNextSeek() > 0) { addOption(cfg, QueryOptions.TF_NEXT_SEEK, String.valueOf(config.getTfNextSeek()), false); } + + if (config.isSeekingEventAggregation()) { + addOption(cfg, QueryOptions.SEEKING_EVENT_AGGREGATION, String.valueOf(config.isSeekingEventAggregation()), false); + } } /** @@ -2369,7 +2416,7 @@ protected void configureSeekingOptions(IteratorSetting cfg, ShardQueryConfigurat * the shard config * @return a list of ivarator cache dirs */ - private List getShuffledIvaratoCacheDirConfigs(ShardQueryConfiguration config) { + public List getShuffledIvaratoCacheDirConfigs(ShardQueryConfiguration config) { List shuffledIvaratorCacheDirs = new ArrayList<>(); // group the ivarator cache dirs by their priority @@ -2393,8 +2440,6 @@ private List getShuffledIvaratoCacheDirConfigs(ShardQuer * the {@link MetadataHelper} * @param config * the {@link ShardQueryConfiguration} - * @param settings - * the {@link Query} * @param queryString * the raw query string * @param isFullTable @@ -2405,10 +2450,10 @@ private List getShuffledIvaratoCacheDirConfigs(ShardQuer * @throws DatawaveQueryException * if something goes wrong */ - protected IteratorSetting getQueryIterator(MetadataHelper metadataHelper, ShardQueryConfiguration config, Query settings, String queryString, - Boolean isFullTable, boolean isPreload) throws DatawaveQueryException { + protected 
IteratorSetting getQueryIterator(MetadataHelper metadataHelper, ShardQueryConfiguration config, String queryString, Boolean isFullTable, + boolean isPreload) throws DatawaveQueryException { if (null == settingFuture) - settingFuture = loadQueryIterator(metadataHelper, config, settings, queryString, isFullTable, isPreload); + settingFuture = loadQueryIterator(metadataHelper, config, isFullTable, isPreload); if (settingFuture.isDone()) try { return settingFuture.get(); @@ -2419,12 +2464,12 @@ protected IteratorSetting getQueryIterator(MetadataHelper metadataHelper, ShardQ return null; } - public static void configureTypeMappings(ShardQueryConfiguration config, IteratorSetting cfg, MetadataHelper metadataHelper, boolean compressMappings) + public void configureTypeMappings(ShardQueryConfiguration config, IteratorSetting cfg, MetadataHelper metadataHelper, boolean compressMappings) throws DatawaveQueryException { configureTypeMappings(config, cfg, metadataHelper, compressMappings, false); } - public static void configureTypeMappings(ShardQueryConfiguration config, IteratorSetting cfg, MetadataHelper metadataHelper, boolean compressMappings, + public void configureTypeMappings(ShardQueryConfiguration config, IteratorSetting cfg, MetadataHelper metadataHelper, boolean compressMappings, boolean isPreload) throws DatawaveQueryException { try { addOption(cfg, QueryOptions.QUERY_MAPPING_COMPRESS, Boolean.toString(compressMappings), false); @@ -2437,7 +2482,7 @@ public static void configureTypeMappings(ShardQueryConfiguration config, Iterato String nonIndexedTypes = QueryOptions.buildFieldNormalizerString(nonIndexedQueryFieldsDatatypes); String requiredAuthsString = metadataHelper.getUsersMetadataAuthorizationSubset(); - TypeMetadata typeMetadata = metadataHelper.getTypeMetadata(config.getDatatypeFilter()); + TypeMetadata typeMetadata = getTypeMetadata(); if (config.getReduceTypeMetadata() && !isPreload) { Set fieldsToRetain = ReduceFields.getQueryFields(config.getQueryTree()); @@ -2454,12 +2499,13 @@ public static void configureTypeMappings(ShardQueryConfiguration config, Iterato serializedTypeMetadata = QueryOptions.compressOption(serializedTypeMetadata, QueryOptions.UTF8); } } + addOption(cfg, QueryOptions.NON_INDEXED_DATATYPES, nonIndexedTypes, false); addOption(cfg, QueryOptions.TYPE_METADATA, serializedTypeMetadata, false); addOption(cfg, QueryOptions.TYPE_METADATA_AUTHS, requiredAuthsString, false); addOption(cfg, QueryOptions.METADATA_TABLE_NAME, config.getMetadataTableName(), false); - } catch (TableNotFoundException | IOException e) { + } catch (IOException e) { QueryException qe = new QueryException(DatawaveErrorCode.TYPE_MAPPING_CONFIG_ERROR, e); throw new DatawaveQueryException(qe); } @@ -2485,10 +2531,8 @@ public static void addOption(IteratorSetting cfg, String option, String value, b * the config * @param cfg * the iterator configuration - * @throws DatawaveQueryException - * for issues with running the query */ - protected void setCommonIteratorOptions(ShardQueryConfiguration config, IteratorSetting cfg) throws DatawaveQueryException { + protected void setCommonIteratorOptions(ShardQueryConfiguration config, IteratorSetting cfg) { // Applying filtering options, including classnames, whether applied to // post-processing or field index if (config.getUseFilters()) { @@ -2768,8 +2812,10 @@ public Tuple2,Boolean> getQueryRanges(ScannerFactor } } - if (config.isSortQueryBeforeGlobalIndex()) { + if (config.isSortQueryPreIndexWithFieldCounts()) { 
config.setQueryTree(timedSortQueryBeforeGlobalIndex(config, getMetadataHelper())); + } else if (config.isSortQueryPreIndexWithImpliedCounts()) { + config.setQueryTree(timedSortQueryBeforeGlobalIndex(config)); } // if a simple examination of the query has not forced a full table @@ -2864,18 +2910,20 @@ protected ASTJexlScript timedSortQueryBeforeGlobalIndex(ShardQueryConfiguration Map counts = metadataHelper.getCountsForFieldsInDateRange(fields, datatypes, config.getBeginDate(), config.getEndDate()); if (!counts.isEmpty()) { return OrderByCostVisitor.orderByFieldCount(config.getQueryTree(), counts); + } else { + // fall back to sorting by implied cardinality + return OrderByCostVisitor.order(config.getQueryTree()); } } return config.getQueryTree(); }); } - private TypeMetadata getTypeMetadata() { - try { - return metadataHelper.getTypeMetadata(); - } catch (TableNotFoundException e) { - throw new DatawaveFatalQueryException("Could not get TypeMetadata"); - } + protected ASTJexlScript timedSortQueryBeforeGlobalIndex(ShardQueryConfiguration config) throws DatawaveQueryException { + return visitorManager.timedVisit(config.getTimers(), "SortQueryBeforeGlobalIndex", () -> { + // sort by implied cardinality + return OrderByCostVisitor.order(config.getQueryTree()); + }); } /** @@ -3052,7 +3100,7 @@ protected Multimap> configureIndexedAndNormalizedFields(MetadataH Multimap> fieldToDatatypeMap = FetchDataTypesVisitor.fetchDataTypes(metadataHelper, config.getDatatypeFilter(), queryTree, false); try { - return configureIndexedAndNormalizedFields(fieldToDatatypeMap, metadataHelper.getIndexedFields(null), metadataHelper.getReverseIndexedFields(null), + return configureIndexedAndNormalizedFields(fieldToDatatypeMap, getIndexedFields(), metadataHelper.getReverseIndexedFields(null), metadataHelper.getAllNormalized(), config, queryTree); } catch (InstantiationException | IllegalAccessException | TableNotFoundException e) { throw new DatawaveFatalQueryException(e); @@ -3230,8 +3278,211 @@ public static Date getEndDateForIndexLookup(Date endDate) { @Override public void finalize() { - if (null != builderThread) { - builderThread.shutdown(); + if (null != executor) { + executor.shutdown(); } } + + protected CompositeMetadata getCompositeMetadata() { + if (compositeMetadata == null && compositeMetadataCallable != null) { + TraceStopwatch stopwatch = stageStopWatch.newStartedStopwatch(compositeMetadataCallable.stageName()); + try { + while (compositeMetadata == null) { + compositeMetadata = compositeMetadataFuture.get(concurrentTimeoutMillis, TimeUnit.MILLISECONDS); + } + } catch (InterruptedException | ExecutionException | TimeoutException e) { + log.error("Failed to fetch CompositeMetadata", e); + throw new DatawaveAsyncOperationException("Failed to fetch CompositeMetadata", e); + } finally { + stopwatch.stop(); + } + } + return compositeMetadata; + } + + protected TypeMetadata getTypeMetadata() { + if (typeMetadata == null && typeMetadataCallable != null) { + TraceStopwatch stopwatch = stageStopWatch.newStartedStopwatch(typeMetadataCallable.stageName()); + try { + while (typeMetadata == null) { + typeMetadata = typeMetadataFuture.get(concurrentTimeoutMillis, TimeUnit.MILLISECONDS); + } + } catch (InterruptedException | ExecutionException | TimeoutException e) { + log.error("Failed to fetch TypeMetadata", e); + throw new DatawaveAsyncOperationException("Failed to fetch TypeMetadata", e); + } finally { + stopwatch.stop(); + } + } + return typeMetadata; + } + + protected String getContentExpansionFields() { + if 
(contentExpansionFields == null && contentExpansionFieldsCallable != null) { + TraceStopwatch stopwatch = stageStopWatch.newStartedStopwatch(contentExpansionFieldsCallable.stageName()); + try { + while (contentExpansionFields == null) { + contentExpansionFields = contentExpansionFieldsFuture.get(concurrentTimeoutMillis, TimeUnit.MILLISECONDS); + } + } catch (InterruptedException | ExecutionException | TimeoutException e) { + log.error("Failed to fetch Content Expansion fields", e); + throw new DatawaveAsyncOperationException("Failed to fetch Content Expansion fields", e); + } finally { + stopwatch.stop(); + } + } + return contentExpansionFields; + } + + protected String getSerializedIvaratorDirs() { + if (serializedIvaratorDirs == null && ivaratorCacheDirCallable != null) { + TraceStopwatch stopwatch = stageStopWatch.newStartedStopwatch(ivaratorCacheDirCallable.stageName()); + try { + while (serializedIvaratorDirs == null) { + serializedIvaratorDirs = ivaratorCacheDirFuture.get(concurrentTimeoutMillis, TimeUnit.MILLISECONDS); + } + } catch (InterruptedException | ExecutionException | TimeoutException e) { + log.error("Failed to serialize ivarator cache dirs", e); + throw new DatawaveAsyncOperationException("Failed to serialize ivarator cache dirs", e); + } finally { + stopwatch.stop(); + } + } + return serializedIvaratorDirs; + } + + protected Set getIndexedFields() { + if (indexedFields == null && indexedFieldsCallable != null) { + indexedFields = getFieldSet(indexedFieldsCallable.stageName(), indexedFieldsFuture); + } + + return Objects.requireNonNullElse(indexedFields, Collections.emptySet()); + } + + protected Set getIndexOnlyFields() { + if (indexOnlyFields == null && indexOnlyFieldsCallable != null) { + indexOnlyFields = getFieldSet(indexOnlyFieldsCallable.stageName(), indexOnlyFieldsFuture); + } + + return Objects.requireNonNullElse(indexOnlyFields, Collections.emptySet()); + } + + protected Set getNonEventFields() { + if (nonEventFields == null && nonEventFieldsCallable != null) { + nonEventFields = getFieldSet(nonEventFieldsCallable.stageName(), nonEventFieldsFuture); + } + + return Objects.requireNonNullElse(nonEventFields, Collections.emptySet()); + } + + protected Set getTermFrequencyFields() { + if (termFrequencyFields == null && termFrequencyFieldsCallable != null) { + termFrequencyFields = getFieldSet(termFrequencyFieldsCallable.stageName(), termFrequencyFieldsFuture); + } + + return Objects.requireNonNullElse(termFrequencyFields, Collections.emptySet()); + } + + /** + * Common code to fetch a field set or throw an exception + * + * @param stageName + * the stage name associated with the callable + * @param future + * the future + * @return the field set + */ + protected Set getFieldSet(String stageName, Future> future) { + TraceStopwatch stopwatch = stageStopWatch.newStartedStopwatch(stageName); + try { + Set fields = null; + while (fields == null) { + fields = future.get(concurrentTimeoutMillis, TimeUnit.MILLISECONDS); + } + return fields; + } catch (ExecutionException | InterruptedException | TimeoutException e) { + log.error("Stage[" + stageName + "] failed", e); + throw new DatawaveAsyncOperationException("Stage[" + stageName + "] failed", e); + } finally { + stopwatch.stop(); + } + } + + /** + * Log the execution time for each stage and use the time spent waiting on each future to calculate the time saved via concurrent execution. + *
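(Illustrative numbers.) If, say, the FetchTypeMetadata task ran for 800 ms on its executor thread but the planner only blocked for 50 ms when the result was finally requested, the stage is reported as saving roughly 750 ms of planning time (saved = execution - get).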

+ * If a particular stage spent a lot of time waiting on the result, consider refactoring query planning to avoid busy waiting. + */ + protected void logTimeSavedViaConcurrentExecution() { + // timers share stage names, so we can compare the time spent executing the task with + // the time spent waiting on the future to calculate time saved + + long totalExecution = 0L; + long totalGetFuture = 0L; + long totalTimeSaved = 0L; + + log.info("Execution\tGetFuture\tTimeSaved\t\tStage"); + for (String stageName : getStageNames()) { + TraceStopwatch future = futureStopWatch.get(stageName); + TraceStopwatch stage = stageStopWatch.get(stageName); + + if (future == null) { + continue; + } + + long execution = future.elapsed(TimeUnit.NANOSECONDS); + long get; + if (stage == null) { + get = 0; // handles the case where a task was submitted but the result was never used + } else { + get = stage.elapsed(TimeUnit.NANOSECONDS); + } + + long saved = execution - get; + + totalExecution += execution; + totalGetFuture += get; + totalTimeSaved += saved; + + log.info(execution + "\t\t" + get + "\t\t" + saved + "\t\t" + stageName); + } + + log.info("Total concurrent execution time: " + TimeUnit.NANOSECONDS.toMillis(totalExecution) + " ms"); + log.info("Total get future time: " + TimeUnit.NANOSECONDS.toMillis(totalGetFuture) + " ms"); + log.info("Total time saved: " + TimeUnit.NANOSECONDS.toMillis(totalTimeSaved) + " ms"); + } + + /** + * Collect the stage names for all {@link AbstractQueryPlannerCallable}s. + * + * @return a list of stage names + */ + protected List getStageNames() { + List names = new ArrayList<>(); + names.add(compositeMetadataCallable.stageName()); + names.add(typeMetadataCallable.stageName()); + names.add(contentExpansionFieldsCallable.stageName()); + names.add(ivaratorCacheDirCallable.stageName()); + names.add(indexedFieldsCallable.stageName()); + names.add(indexOnlyFieldsCallable.stageName()); + names.add(nonEventFieldsCallable.stageName()); + names.add(termFrequencyFieldsCallable.stageName()); + return names; + } + + public void setLogConcurrentStageExecution(boolean logConcurrentStageExecution) { + this.logConcurrentStageExecution = logConcurrentStageExecution; + } + + public boolean getLogConcurrentStageExecution() { + return logConcurrentStageExecution; + } + + public int getConcurrentTimeoutMillis() { + return concurrentTimeoutMillis; + } + + public void setConcurrentTimeoutMillis(int concurrentTimeoutMillis) { + this.concurrentTimeoutMillis = concurrentTimeoutMillis; + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/FacetedQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/FacetedQueryPlanner.java index 008cfae747a..218cbaaa8d3 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/FacetedQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/FacetedQueryPlanner.java @@ -51,15 +51,15 @@ public FacetedQueryPlanner(final FacetedConfiguration config) { } @Override - public IteratorSetting getQueryIterator(MetadataHelper metadataHelper, ShardQueryConfiguration config, Query settings, String queryString, - Boolean isFullTable, boolean isPreload) throws DatawaveQueryException { + public IteratorSetting getQueryIterator(MetadataHelper metadataHelper, ShardQueryConfiguration config, String queryString, Boolean isFullTable, + boolean isPreload) throws DatawaveQueryException { if (isFullTable) { QueryException qe = new QueryException(DatawaveErrorCode.FULL_TABLE_SCAN_DISALLOWED); throw new 
FullTableScansDisallowedException(qe); } - IteratorSetting cfg = super.getQueryIterator(metadataHelper, config, settings, queryString, isFullTable, isPreload); + IteratorSetting cfg = super.getQueryIterator(metadataHelper, config, queryString, isFullTable, isPreload); if (!usePrecomputedFacets) cfg.setIteratorClass(DynamicFacetIterator.class.getName()); else { diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/FederatedQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/FederatedQueryPlanner.java index c6da616d5f3..a23ce01ef6d 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/FederatedQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/FederatedQueryPlanner.java @@ -13,6 +13,7 @@ import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; +import java.util.UUID; import java.util.stream.Collectors; import org.apache.accumulo.core.client.TableNotFoundException; @@ -34,6 +35,7 @@ import datawave.query.exceptions.EmptyUnfieldedTermExpansionException; import datawave.query.exceptions.NoResultsException; import datawave.query.index.lookup.UidIntersector; +import datawave.query.jexl.lookups.ExpandedFieldCache; import datawave.query.jexl.visitors.QueryFieldsVisitor; import datawave.query.jexl.visitors.UnfieldedIndexExpansionVisitor; import datawave.query.model.FieldIndexHole; @@ -67,6 +69,7 @@ public class FederatedQueryPlanner extends QueryPlanner implements Cloneable { private final Set plans = new LinkedHashSet<>(); private DefaultQueryPlanner queryPlanner; private String plannedScript; + protected ExpandedFieldCache previouslyExpandedFieldCache; /** * Return a new {@link FederatedQueryPlanner} instance with a new {@link DefaultQueryPlanner} inner query planner instance. @@ -94,6 +97,7 @@ public FederatedQueryPlanner(DefaultQueryPlanner queryPlanner) { public FederatedQueryPlanner(FederatedQueryPlanner other) { this.queryPlanner = other.queryPlanner != null ? other.queryPlanner.clone() : null; this.plannedScript = other.plannedScript; + this.previouslyExpandedFieldCache = other.previouslyExpandedFieldCache; } /** @@ -285,6 +289,10 @@ public CloseableIterable process(GenericQueryConfiguration genericCon this.plannedScript = null; this.plans.clear(); + if (previouslyExpandedFieldCache == null) { + this.previouslyExpandedFieldCache = new ExpandedFieldCache(); + } + if (log.isDebugEnabled()) { log.debug("Federated query: " + query); } @@ -320,6 +328,7 @@ public CloseableIterable process(GenericQueryConfiguration genericCon FederatedQueryIterable results = new FederatedQueryIterable(); int totalProcessed = 1; ShardQueryConfiguration firstConfigCopy = null; + UUID queryId = originalConfig.getQuery().getId(); for (Pair dateRange : dateRanges) { // Format the start and end date of the current sub-query to execute. String subStartDate = dateFormat.format(dateRange.getLeft()); @@ -334,10 +343,14 @@ public CloseableIterable process(GenericQueryConfiguration genericCon configCopy.setBeginDate(dateRange.getLeft()); configCopy.setEndDate(dateRange.getRight()); + // we want to make sure the same query id for tracking purposes and execution + configCopy.getQuery().setId(queryId); + // Create a copy of the original default query planner, and process the query with the new date range. 
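// Note: each date range is planned independently. The cloned planner below operates on a config copy whose
// begin/end dates were narrowed above, and its results are merged into the shared FederatedQueryIterable.
// Reusing the parent query id (set above) keeps the sub-queries correlated for tracking.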
DefaultQueryPlanner subPlan = this.queryPlanner.clone(); try { + CloseableIterable queryData = subPlan.process(configCopy, query, settings, scannerFactory); results.addIterable(queryData); } catch (Exception e) { @@ -567,7 +580,7 @@ private Set getFieldsForQuery(ShardQueryConfiguration config, String que try { configCopy.setIndexedFields(metadataHelper.getIndexedFields(config.getDatatypeFilter())); configCopy.setReverseIndexedFields(metadataHelper.getReverseIndexedFields(config.getDatatypeFilter())); - queryTree = UnfieldedIndexExpansionVisitor.expandUnfielded(configCopy, scannerFactory, metadataHelper, queryTree); + queryTree = UnfieldedIndexExpansionVisitor.expandUnfielded(configCopy, scannerFactory, metadataHelper, queryTree, previouslyExpandedFieldCache); } catch (TableNotFoundException e) { QueryException qe = new QueryException(DatawaveErrorCode.METADATA_ACCESS_ERROR, e); throw new DatawaveFatalQueryException(qe); diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/IndexQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/IndexQueryPlanner.java index 9ead8720514..2a29c0c8ebb 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/IndexQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/IndexQueryPlanner.java @@ -33,14 +33,14 @@ public IndexQueryPlanner() { } @Override - public IteratorSetting getQueryIterator(MetadataHelper metadataHelper, ShardQueryConfiguration config, Query settings, String queryString, - Boolean isFullTable, boolean isPreload) throws DatawaveQueryException { + public IteratorSetting getQueryIterator(MetadataHelper metadataHelper, ShardQueryConfiguration config, String queryString, Boolean isFullTable, + boolean isPreload) throws DatawaveQueryException { if (isFullTable) { QueryException qe = new QueryException(DatawaveErrorCode.FULL_TABLE_SCAN_DISALLOWED); throw new FullTableScansDisallowedException(qe); } - IteratorSetting cfg = super.getQueryIterator(metadataHelper, config, settings, queryString, isFullTable, isPreload); + IteratorSetting cfg = super.getQueryIterator(metadataHelper, config, queryString, isFullTable, isPreload); if (null == cfg) { try { cfg = settingFuture.get(); diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/async/AbstractQueryPlannerCallable.java b/warehouse/query-core/src/main/java/datawave/query/planner/async/AbstractQueryPlannerCallable.java new file mode 100644 index 00000000000..c446f3b41b3 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/async/AbstractQueryPlannerCallable.java @@ -0,0 +1,34 @@ +package datawave.query.planner.async; + +import java.util.concurrent.Callable; + +import datawave.query.util.QueryStopwatch; + +/** + * Generic interface that allows stage names to be associated with various tasks. Extending classes may pass in a {@link QueryStopwatch} to capture timing + * details of the operation. 
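+ * A typical usage (illustrative, names assumed): submit a subclass such as {@link FetchTypeMetadata} to an executor during planning and only block on the
+ * returned future when the result is first needed, e.g. {@code Future<TypeMetadata> future = executorService.submit(new FetchTypeMetadata(timer, helper, datatypes));}.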
+ * + * @param + * the object type + */ +public abstract class AbstractQueryPlannerCallable implements Callable { + + protected QueryStopwatch timer; + + /** + * Constructor that supports timing operations + * + * @param timer + * a stop watch + */ + protected AbstractQueryPlannerCallable(QueryStopwatch timer) { + this.timer = timer; + } + + /** + * The stage name used for one or more timers + * + * @return the stage name + */ + public abstract String stageName(); +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchCompositeMetadata.java b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchCompositeMetadata.java new file mode 100644 index 00000000000..b56e3d0a6f0 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchCompositeMetadata.java @@ -0,0 +1,49 @@ +package datawave.query.planner.async; + +import java.util.Set; + +import datawave.query.composite.CompositeMetadata; +import datawave.query.util.MetadataHelper; +import datawave.query.util.QueryStopwatch; +import datawave.util.time.TraceStopwatch; + +/** + * A wrapper around {@link MetadataHelper#getCompositeMetadata(Set)} that allows for concurrent execution of expensive planner steps. + */ +public class FetchCompositeMetadata extends AbstractQueryPlannerCallable { + + private final MetadataHelper helper; + private final Set datatypes; + + private TraceStopwatch stopwatch; + + public FetchCompositeMetadata(MetadataHelper helper, Set datatypes) { + this(null, helper, datatypes); + } + + public FetchCompositeMetadata(QueryStopwatch timer, MetadataHelper helper, Set datatypes) { + super(timer); + this.helper = helper; + this.datatypes = datatypes; + } + + @Override + public CompositeMetadata call() throws Exception { + if (timer != null) { + stopwatch = timer.newStartedStopwatch(stageName()); + } + + CompositeMetadata compositeMetadata = helper.getCompositeMetadata(datatypes); + + if (stopwatch != null) { + stopwatch.stop(); + } + + return compositeMetadata; + } + + @Override + public String stageName() { + return "Fetch CompositeMetadata"; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchContentExpansionFields.java b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchContentExpansionFields.java new file mode 100644 index 00000000000..5f10a3747a8 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchContentExpansionFields.java @@ -0,0 +1,62 @@ +package datawave.query.planner.async; + +import java.util.Set; +import java.util.concurrent.Callable; + +import org.apache.accumulo.core.client.TableNotFoundException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Joiner; + +import datawave.query.util.MetadataHelper; +import datawave.query.util.QueryStopwatch; +import datawave.util.time.TraceStopwatch; + +/** + * A wrapper around the {@link MetadataHelper#getContentFields(Set)} method that allows for concurrent execution of expensive planner steps. 
+ */ +public class FetchContentExpansionFields extends AbstractQueryPlannerCallable { + + private static final Logger log = LoggerFactory.getLogger(FetchContentExpansionFields.class); + + private final MetadataHelper helper; + private final Set datatypes; + + private TraceStopwatch stopwatch; + + public FetchContentExpansionFields(MetadataHelper helper, Set datatypes) { + this(null, helper, datatypes); + } + + public FetchContentExpansionFields(QueryStopwatch timer, MetadataHelper helper, Set datatypes) { + super(timer); + this.helper = helper; + this.datatypes = datatypes; + } + + @Override + public String call() { + try { + if (timer != null) { + stopwatch = timer.newStartedStopwatch(stageName()); + } + + String fields = Joiner.on(',').join(helper.getContentFields(datatypes)); + + if (stopwatch != null) { + stopwatch.stop(); + } + + return fields; + } catch (TableNotFoundException e) { + log.error("Failed to fetch content expansion fields"); + throw new RuntimeException(e); + } + } + + @Override + public String stageName() { + return "Fetch ContentExpansionFields"; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchIndexOnlyFields.java b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchIndexOnlyFields.java new file mode 100644 index 00000000000..fbe3f2cc03b --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchIndexOnlyFields.java @@ -0,0 +1,46 @@ +package datawave.query.planner.async; + +import java.util.Set; + +import datawave.query.util.MetadataHelper; +import datawave.query.util.QueryStopwatch; +import datawave.util.time.TraceStopwatch; + +public class FetchIndexOnlyFields extends AbstractQueryPlannerCallable> { + + private final MetadataHelper helper; + private final Set datatypes; + + private TraceStopwatch stopwatch; + + public FetchIndexOnlyFields(MetadataHelper helper, Set datatypes) { + this(null, helper, datatypes); + } + + public FetchIndexOnlyFields(QueryStopwatch timer, MetadataHelper helper, Set datatypes) { + super(timer); + this.helper = helper; + this.datatypes = datatypes; + } + + @Override + public Set call() throws Exception { + if (timer != null) { + stopwatch = timer.newStartedStopwatch(stageName()); + } + + Set indexOnlyFields = helper.getIndexOnlyFields(datatypes); + + if (stopwatch != null) { + stopwatch.stop(); + } + + return indexOnlyFields; + } + + @Override + public String stageName() { + return "Fetch IndexOnly Fields"; + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchIndexedFields.java b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchIndexedFields.java new file mode 100644 index 00000000000..520d99d0361 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchIndexedFields.java @@ -0,0 +1,46 @@ +package datawave.query.planner.async; + +import java.util.Set; + +import datawave.query.util.MetadataHelper; +import datawave.query.util.QueryStopwatch; +import datawave.util.time.TraceStopwatch; + +public class FetchIndexedFields extends AbstractQueryPlannerCallable> { + + private final MetadataHelper helper; + private final Set datatypes; + + private TraceStopwatch stopwatch; + + public FetchIndexedFields(MetadataHelper helper, Set datatypes) { + this(null, helper, datatypes); + } + + public FetchIndexedFields(QueryStopwatch timer, MetadataHelper helper, Set datatypes) { + super(timer); + this.helper = helper; + this.datatypes = datatypes; + } + + @Override + public Set call() 
throws Exception { + if (timer != null) { + stopwatch = timer.newStartedStopwatch(stageName()); + } + + Set indexedFields = helper.getIndexedFields(datatypes); + + if (stopwatch != null) { + stopwatch.stop(); + } + + return indexedFields; + } + + @Override + public String stageName() { + return "Fetch Indexed Fields"; + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchNonEventFields.java b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchNonEventFields.java new file mode 100644 index 00000000000..7a2e8876829 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchNonEventFields.java @@ -0,0 +1,46 @@ +package datawave.query.planner.async; + +import java.util.Set; + +import datawave.query.util.MetadataHelper; +import datawave.query.util.QueryStopwatch; +import datawave.util.time.TraceStopwatch; + +public class FetchNonEventFields extends AbstractQueryPlannerCallable> { + + private final MetadataHelper helper; + private final Set datatypes; + + private TraceStopwatch stopwatch; + + public FetchNonEventFields(MetadataHelper helper, Set datatypes) { + this(null, helper, datatypes); + } + + public FetchNonEventFields(QueryStopwatch timer, MetadataHelper helper, Set datatypes) { + super(timer); + this.helper = helper; + this.datatypes = datatypes; + } + + @Override + public Set call() throws Exception { + if (timer != null) { + stopwatch = timer.newStartedStopwatch(stageName()); + } + + Set nonEventFields = helper.getNonEventFields(datatypes); + + if (stopwatch != null) { + stopwatch.stop(); + } + + return nonEventFields; + } + + @Override + public String stageName() { + return "Fetch NonEvent Fields"; + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchTermFrequencyFields.java b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchTermFrequencyFields.java new file mode 100644 index 00000000000..942268f618e --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchTermFrequencyFields.java @@ -0,0 +1,45 @@ +package datawave.query.planner.async; + +import java.util.Set; + +import datawave.query.util.MetadataHelper; +import datawave.query.util.QueryStopwatch; +import datawave.util.time.TraceStopwatch; + +public class FetchTermFrequencyFields extends AbstractQueryPlannerCallable> { + + private final MetadataHelper helper; + private final Set datatypes; + + private TraceStopwatch stopwatch; + + public FetchTermFrequencyFields(MetadataHelper helper, Set datatypes) { + this(null, helper, datatypes); + } + + public FetchTermFrequencyFields(QueryStopwatch timer, MetadataHelper helper, Set datatypes) { + super(timer); + this.helper = helper; + this.datatypes = datatypes; + } + + @Override + public Set call() throws Exception { + if (timer != null) { + stopwatch = timer.newStartedStopwatch(stageName()); + } + + Set termFrequencyFields = helper.getTermFrequencyFields(datatypes); + + if (stopwatch != null) { + stopwatch.stop(); + } + + return termFrequencyFields; + } + + @Override + public String stageName() { + return "Fetch TermFrequency Fields"; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchTypeMetadata.java b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchTypeMetadata.java new file mode 100644 index 00000000000..2c884e7af4e --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/async/FetchTypeMetadata.java @@ -0,0 +1,49 @@ +package 
datawave.query.planner.async; + +import java.util.Set; + +import datawave.query.util.MetadataHelper; +import datawave.query.util.QueryStopwatch; +import datawave.query.util.TypeMetadata; +import datawave.util.time.TraceStopwatch; + +/** + * Wrapper around {@link MetadataHelper#getTypeMetadata(Set)} that allows for concurrent execution of expensive planner steps. + */ +public class FetchTypeMetadata extends AbstractQueryPlannerCallable { + + private final MetadataHelper helper; + private final Set datatypes; + + private TraceStopwatch stopwatch; + + public FetchTypeMetadata(MetadataHelper helper, Set datatypes) { + this(null, helper, datatypes); + } + + public FetchTypeMetadata(QueryStopwatch timer, MetadataHelper helper, Set datatypes) { + super(timer); + this.helper = helper; + this.datatypes = datatypes; + } + + @Override + public TypeMetadata call() throws Exception { + if (timer != null) { + stopwatch = timer.newStartedStopwatch(stageName()); + } + + TypeMetadata typeMetadata = helper.getTypeMetadata(datatypes); + + if (stopwatch != null) { + stopwatch.stop(); + } + + return typeMetadata; + } + + @Override + public String stageName() { + return "Fetch TypeMetadata"; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/async/SerializeIvaratorCacheDirs.java b/warehouse/query-core/src/main/java/datawave/query/planner/async/SerializeIvaratorCacheDirs.java new file mode 100644 index 00000000000..40041e35c7a --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/async/SerializeIvaratorCacheDirs.java @@ -0,0 +1,48 @@ +package datawave.query.planner.async; + +import datawave.query.config.ShardQueryConfiguration; +import datawave.query.iterator.ivarator.IvaratorCacheDirConfig; +import datawave.query.planner.DefaultQueryPlanner; +import datawave.query.util.QueryStopwatch; +import datawave.util.time.TraceStopwatch; + +/** + * Wrapper around Ivarator json serialization that allows for asynchronous execution. 
+ */ +public class SerializeIvaratorCacheDirs extends AbstractQueryPlannerCallable { + + private final DefaultQueryPlanner planner; + private final ShardQueryConfiguration config; + + private TraceStopwatch stopwatch; + + public SerializeIvaratorCacheDirs(DefaultQueryPlanner planner, ShardQueryConfiguration config) { + this(null, planner, config); + } + + public SerializeIvaratorCacheDirs(QueryStopwatch timer, DefaultQueryPlanner planner, ShardQueryConfiguration config) { + super(timer); + this.planner = planner; + this.config = config; + } + + @Override + public String call() throws Exception { + if (timer != null) { + stopwatch = timer.newStartedStopwatch(stageName()); + } + + String serialized = IvaratorCacheDirConfig.toJson(planner.getShuffledIvaratoCacheDirConfigs(config)); + + if (stopwatch != null) { + stopwatch.stop(); + } + + return serialized; + } + + @Override + public String stageName() { + return "Serialize IvaratorCacheDirs"; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java index 18bfd3bc275..ab5150b526c 100644 --- a/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java +++ b/warehouse/query-core/src/main/java/datawave/query/postprocessing/tf/DocumentKeysFunction.java @@ -48,7 +48,7 @@ protected void populateContentFunctions(JexlNode node) { ContentFunctionsDescriptor descriptor = new ContentFunctionsDescriptor(); ContentJexlArgumentDescriptor argsDescriptor; - Set[] fieldsAndTerms; + ContentFunctionsDescriptor.FieldTerms fieldsAndTerms; JexlNode parent; String field; @@ -67,12 +67,12 @@ protected void populateContentFunctions(JexlNode node) { // content, tf, and indexed fields are not actually needed to extract fields from the function node fieldsAndTerms = argsDescriptor.fieldsAndTerms(Collections.emptySet(), Collections.emptySet(), Collections.emptySet(), null); - if (fieldsAndTerms[0].size() != 1) { + if (fieldsAndTerms.totalFields() != 1) { throw new IllegalStateException("content function had more than one field"); } - field = JexlASTHelper.deconstructIdentifier(fieldsAndTerms[0].iterator().next()); - ContentFunction contentFunction = new ContentFunction(field, fieldsAndTerms[1]); + field = JexlASTHelper.deconstructIdentifier(fieldsAndTerms.getFields().iterator().next()); + ContentFunction contentFunction = new ContentFunction(field, fieldsAndTerms.getTerms()); contentFunctions.put(contentFunction.getField(), contentFunction); if (isFunctionNegated(f)) { diff --git a/warehouse/query-core/src/main/java/datawave/query/predicate/AncestorEventDataFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/AncestorEventDataFilter.java index 9f2c5b03bd7..a40609e0556 100644 --- a/warehouse/query-core/src/main/java/datawave/query/predicate/AncestorEventDataFilter.java +++ b/warehouse/query-core/src/main/java/datawave/query/predicate/AncestorEventDataFilter.java @@ -36,12 +36,12 @@ public AncestorEventDataFilter(AncestorEventDataFilter other) { * @see datawave.query.function.Filter#accept(org.apache.accumulo.core.data.Key) */ @Override - public boolean apply(Entry input) { + public boolean apply(Entry entry) { // if the base document, then accept em all, otherwise defer to the quey field filter - if (keep(input.getKey())) { + if (keep(entry.getKey())) { return true; } else { - return super.apply(input); + return super.apply(entry); } } diff --git 
a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryExpressionFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryExpressionFilter.java index 52c89bd6559..d9ab1d29e31 100644 --- a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryExpressionFilter.java +++ b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryExpressionFilter.java @@ -72,13 +72,13 @@ protected Map getFilters() { } @Override - public boolean apply(Map.Entry input) { - return apply(input.getKey(), true); + public boolean apply(Map.Entry entry) { + return apply(entry.getKey(), true); } @Override - public boolean peek(Map.Entry input) { - return apply(input.getKey(), false); + public boolean peek(Map.Entry entry) { + return apply(entry.getKey(), false); } public boolean peek(Key key) { diff --git a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java index 29d19c50bcc..daff189087e 100644 --- a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java +++ b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFieldFilter.java @@ -2,82 +2,87 @@ import java.util.Map; import java.util.Set; +import java.util.TreeSet; import javax.annotation.Nullable; import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; -import org.apache.commons.jexl3.parser.ASTIdentifier; -import org.apache.commons.jexl3.parser.ASTJexlScript; - -import com.google.common.collect.Sets; +import org.apache.hadoop.io.Text; +import datawave.query.data.parsers.EventKey; import datawave.query.jexl.JexlASTHelper; -import datawave.query.predicate.EventDataQueryFilter; -import datawave.query.predicate.KeyProjection; -import datawave.query.predicate.Projection; /** - * This filter will filter event data keys by only those fields that are required in the specified query. + * Inclusive filter that ensures only event keys which match the set of fields to retain are kept for evaluation. + *

+ * The fields to retain are built from the fields referenced in the query plus any user-specified return fields (the 'return.fields' parameter). + *

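+ * For example (hypothetical values): a filter built with {@code withFields(Set.of("FOO", "BAR"))} and {@code withMaxNextCount(10)} accepts only FOO and BAR
+ * event keys; after maxNextCount consecutive misses within the same field, {@link #getSeekRange} produces a range that jumps ahead to the next retained field
+ * (or rolls past the current document when none remains).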
+ * This filter only operates on event keys. */ public class EventDataQueryFieldFilter implements EventDataQueryFilter { - private Set nonEventFields; - private KeyProjection keyProjection; + private Key document = null; + // the number of times next is called before issuing a seek + private int maxNextCount = -1; + // track the number of times next is called on the same field + private int nextCount; + // track the current field + private String currentField = null; - public EventDataQueryFieldFilter(EventDataQueryFieldFilter other) { - this.nonEventFields = other.nonEventFields; - if (other.document != null) { - document = new Key(other.document); - } - this.keyProjection = other.getProjection(); + // the set of fields to retain + private TreeSet fields; + private final EventKey parser; + + /** + * Default constructor + */ + public EventDataQueryFieldFilter() { + this.parser = new EventKey(); } /** - * Initialize filter with an empty projection + * Copy constructor used by the {@link #clone()} method * - * @param projections - * the projection - * @param projectionType - * the projection type + * @param other + * an instance of the {@link EventDataQueryFieldFilter} */ - public EventDataQueryFieldFilter(Set projections, Projection.ProjectionType projectionType) { - this.keyProjection = new KeyProjection(projections, projectionType); + public EventDataQueryFieldFilter(EventDataQueryFieldFilter other) { + if (other.document != null) { + this.document = new Key(other.document); + } + this.maxNextCount = other.maxNextCount; + this.fields = other.fields; + this.parser = other.parser; + // do not copy nextCount or currentField because that is internal state + this.nextCount = 0; + this.currentField = null; } /** - * Initiate from a KeyProjection + * Builder-style method used to set the fields to retain * - * @param projection - * the projection + * @param fields + * the fields to retain + * @return the filter */ - public EventDataQueryFieldFilter(KeyProjection projection) { - this.keyProjection = projection; + public EventDataQueryFieldFilter withFields(Set fields) { + this.fields = new TreeSet<>(fields); + return this; } /** - * Initialize the query field filter with all of the fields required to evaluation this query + * Builder-style method used to set the maximum next count * - * @param script - * a script - * @param nonEventFields - * a set of non-event fields + * @param maxNextCount + * the max next count + * @return the filter */ - @Deprecated - public EventDataQueryFieldFilter(ASTJexlScript script, Set nonEventFields) { - this.nonEventFields = nonEventFields; - - Set queryFields = Sets.newHashSet(); - for (ASTIdentifier identifier : JexlASTHelper.getIdentifiers(script)) { - queryFields.add(JexlASTHelper.deconstructIdentifier(identifier)); - } - - this.keyProjection = new KeyProjection(queryFields, Projection.ProjectionType.INCLUDES); - + public EventDataQueryFieldFilter withMaxNextCount(int maxNextCount) { + this.maxNextCount = maxNextCount; + return this; } - protected Key document = null; - @Override public void startNewDocument(Key document) { this.document = document; @@ -93,18 +98,52 @@ public boolean keep(Key k) { return true; } - public KeyProjection getProjection() { - return keyProjection; + @Override + public boolean apply(@Nullable Map.Entry entry) { + if (entry == null) { + return false; + } + return apply(entry.getKey(), true); } @Override - public boolean apply(@Nullable Map.Entry input) { - return keyProjection.apply(input); + public boolean peek(@Nullable Map.Entry 
entry) { + if (entry == null) { + return false; + } + // equivalent to apply in the event column case, simple redirect + return apply(entry.getKey(), false); } - @Override - public boolean peek(@Nullable Map.Entry input) { - return keyProjection.peek(input); + /** + * The field filter applies if the key's field is in the set of fields to retain + * + * @param key + * the key + * @param update + * flag that indicates if the {@link #nextCount} should be incremented + * @return true if the key should be retained + */ + private boolean apply(Key key, boolean update) { + parser.parse(key); + String field = parser.getField(); + field = JexlASTHelper.deconstructIdentifier(field); + + if (fields.contains(field)) { + nextCount = 0; // reset count + return true; + } else if (update) { + if (currentField != null && currentField.equals(field)) { + // only increment the count for consecutive misses within the same field + nextCount++; + } else { + // new field means new count + currentField = field; + nextCount = 0; + } + } + + return false; } /** @@ -120,19 +159,38 @@ public boolean peek(@Nullable Map.Entry input) { */ @Override public Range getSeekRange(Key current, Key endKey, boolean endKeyInclusive) { - // not yet implemented - return null; + if (current == null || maxNextCount == -1 || nextCount < maxNextCount) { + return null; + } + + parser.parse(current); + String higher = fields.higher(parser.getField()); + + Text columnQualifier; + if (higher == null) { + // generate a rollover range + Text columnFamily = new Text(current.getColumnFamilyData().toString() + '\u0000'); + Key start = new Key(current.getRow(), columnFamily); + return new Range(start, false, endKey, endKeyInclusive); + } else { + // seek to next available field + columnQualifier = new Text(higher + '\u0000'); + Key start = new Key(current.getRow(), current.getColumnFamily(), columnQualifier); + return new Range(start, false, endKey, endKeyInclusive); + } } @Override public int getMaxNextCount() { - // not yet implemented - return -1; + // while technically implemented, do not return the max next count here. This method is only used + // by the ChainableEventDataQueryFilter which does NOT guarantee that the filter will exclusively + // be applied to event keys. + throw new UnsupportedOperationException("EventDataQueryFieldFilter should not be chained with other filters"); } @Override public Key transform(Key toLimit) { - // not yet implemented + // not required because the EventDataQueryFieldFilter only operates on event keys return null; } @@ -140,29 +198,4 @@ public Key transform(Key toLimit) { public EventDataQueryFilter clone() { return new EventDataQueryFieldFilter(this); } - - /** - * Configure the delegate {@link Projection} with the fields to exclude - * - * @param excludes - * the set of fields to exclude - * @deprecated This method is deprecated and should no longer be used. - */ - @Deprecated - public void setExcludes(Set excludes) { - this.keyProjection.setExcludes(excludes); - } - - /** - * Set the delegate {@link Projection} with the fields to include - * - * @param includedFields - * the sorted set of fields to include - * @deprecated This method is deprecated and should no longer be used. 
- */ - @Deprecated - public void setIncludes(Set includedFields) { - this.keyProjection.setIncludes(includedFields); - } - } diff --git a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFilter.java index 97fbd92fe80..029fc3ad42b 100644 --- a/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFilter.java +++ b/warehouse/query-core/src/main/java/datawave/query/predicate/EventDataQueryFilter.java @@ -5,12 +5,9 @@ import javax.annotation.Nullable; import org.apache.accumulo.core.data.Key; -import org.apache.accumulo.core.data.Range; - -import datawave.query.attributes.Document; /** - * This filter will filter event data keys by only those fields that are required in the specified query. + * Filters event keys to only the fields required by the specific query. */ public interface EventDataQueryFilter extends PeekingPredicate>, Filter, SeekingFilter, TransformingFilter, Cloneable { @@ -24,21 +21,21 @@ public interface EventDataQueryFilter extends PeekingPredicate var1); + boolean apply(@Nullable Map.Entry entry); @Override - boolean peek(@Nullable Map.Entry var1); + boolean peek(@Nullable Map.Entry entry); /** * The keep method is used to filter out those fields returned from the apply method above that will be returned to the user. diff --git a/warehouse/query-core/src/main/java/datawave/query/predicate/TLDEventDataFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/TLDEventDataFilter.java index 4266ad1d98d..0585883b988 100644 --- a/warehouse/query-core/src/main/java/datawave/query/predicate/TLDEventDataFilter.java +++ b/warehouse/query-core/src/main/java/datawave/query/predicate/TLDEventDataFilter.java @@ -154,18 +154,18 @@ public void startNewDocument(Key document) { * client. If a Key returns true but keep() returns false the document will be used for context evaluation, but will not be returned to the client. If a Key * returns true and keep() returns true the key will be used for context evaluation and returned to the client. * - * @param input + * @param entry * an input * @return true if Key should be added to context, false otherwise */ @Override - public boolean apply(Entry input) { - return apply(input, true); + public boolean apply(Entry entry) { + return apply(entry, true); } @Override - public boolean peek(Entry input) { - return apply(input, false); + public boolean peek(Entry entry) { + return apply(entry, false); } private boolean apply(Entry input, boolean update) { diff --git a/warehouse/query-core/src/main/java/datawave/query/predicate/TLDFieldIndexQueryFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/TLDFieldIndexQueryFilter.java index 88a6a47be33..fb4f9ec02b4 100644 --- a/warehouse/query-core/src/main/java/datawave/query/predicate/TLDFieldIndexQueryFilter.java +++ b/warehouse/query-core/src/main/java/datawave/query/predicate/TLDFieldIndexQueryFilter.java @@ -46,24 +46,24 @@ public void startNewDocument(Key documentKey) { /** * Always returns true. 
* - * @param var1 + * @param entry * an entry of type Key-Value * @return true, always */ @Override - public boolean apply(@Nullable Map.Entry var1) { + public boolean apply(@Nullable Map.Entry entry) { return true; } /** * Always returns true * - * @param var1 + * @param entry * an entry of type Key-Value * @return true, always */ @Override - public boolean peek(@Nullable Map.Entry var1) { + public boolean peek(@Nullable Map.Entry entry) { return true; } diff --git a/warehouse/query-core/src/main/java/datawave/query/predicate/TLDTermFrequencyEventDataQueryFilter.java b/warehouse/query-core/src/main/java/datawave/query/predicate/TLDTermFrequencyEventDataQueryFilter.java index 2ede84e69ed..3df5df28732 100644 --- a/warehouse/query-core/src/main/java/datawave/query/predicate/TLDTermFrequencyEventDataQueryFilter.java +++ b/warehouse/query-core/src/main/java/datawave/query/predicate/TLDTermFrequencyEventDataQueryFilter.java @@ -10,8 +10,8 @@ import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.Range; -import datawave.query.attributes.Document; import datawave.query.data.parsers.DatawaveKey; +import datawave.query.jexl.JexlASTHelper; /** * An EventDataQueryFilter for TermFrequencies, for use in a TLDQuery @@ -19,11 +19,11 @@ public class TLDTermFrequencyEventDataQueryFilter implements EventDataQueryFilter { private final Set indexOnlyFields; - private final EventDataQueryFilter attrFilter; + private final Set fields; - public TLDTermFrequencyEventDataQueryFilter(Set indexOnlyFields, EventDataQueryFilter attrFilter) { + public TLDTermFrequencyEventDataQueryFilter(Set indexOnlyFields, Set fields) { this.indexOnlyFields = indexOnlyFields; - this.attrFilter = attrFilter; + this.fields = fields; } @Override @@ -32,13 +32,13 @@ public void startNewDocument(Key documentKey) { } @Override - public boolean apply(@Nullable Map.Entry var1) { + public boolean apply(@Nullable Map.Entry entry) { // accept all return true; } @Override - public boolean peek(@Nullable Map.Entry var1) { + public boolean peek(@Nullable Map.Entry entry) { // accept all return true; } @@ -53,7 +53,13 @@ public boolean peek(@Nullable Map.Entry var1) { @Override public boolean keep(Key k) { DatawaveKey key = new DatawaveKey(k); - return (!TLDEventDataFilter.isRootPointer(k) || indexOnlyFields.contains(key.getFieldName())) && attrFilter.peek(new SimpleEntry(k, null)); + return (!TLDEventDataFilter.isRootPointer(k) || indexOnlyFields.contains(key.getFieldName())) && fieldMatches(k); + } + + private boolean fieldMatches(Key key) { + DatawaveKey parser = new DatawaveKey(key); + String fieldName = JexlASTHelper.deconstructIdentifier(parser.getFieldName()); + return fields.contains(fieldName); } @Override diff --git a/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java b/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java index 9f168e63e32..ed032b43bdb 100644 --- a/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java +++ b/warehouse/query-core/src/main/java/datawave/query/scheduler/PushdownFunction.java @@ -120,6 +120,10 @@ public List apply(QueryData qd) { options.setQueryConfig(this.config); + String tableName = tableId.canonical(); + options.applyExecutionHints(tableName, config.getTableHints()); + options.applyConsistencyLevel(tableName, config.getTableConsistencyLevels()); + chunks.add(new ScannerChunk(options, plan.getRanges(), qd, server)); } catch (Exception e) { log.error(e); diff --git 
a/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java b/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java index 94a332e9772..1f3623a3ae1 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/RangeStreamScanner.java @@ -28,7 +28,7 @@ import org.apache.accumulo.core.data.Range; import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.security.Authorizations; -import org.apache.accumulo.core.util.PeekingIterator; +import org.apache.commons.collections4.iterators.PeekingIterator; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; import org.apache.log4j.Logger; diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java b/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java index cf53d01206d..b78488a8d0a 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ScannerFactory.java @@ -1,5 +1,6 @@ package datawave.query.tables; +import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -18,7 +19,8 @@ import org.apache.accumulo.core.conf.ClientProperty; import org.apache.accumulo.core.security.Authorizations; import org.apache.hadoop.conf.Configuration; -import org.apache.log4j.Logger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.common.base.Preconditions; @@ -49,10 +51,13 @@ public class ScannerFactory { protected ResourceQueue scanQueue = null; protected ShardQueryConfiguration config = null; - protected Map consistencyByTable = new HashMap<>(); - protected Map> hintsByTable = new HashMap<>(); + // consistency and execution hints can be mapped to table names or functional names + // for example, 'shardIndex' might map to a default executor pool for the shard index table + // while 'expansion' might map to a separate executor pool on the shard index + protected Map consistencyLevelMap = new HashMap<>(); + protected Map> executionHintMap = new HashMap<>(); - private static final Logger log = Logger.getLogger(ScannerFactory.class); + private static final Logger log = LoggerFactory.getLogger(ScannerFactory.class); /** * Preferred constructor, builds scanner factory from configs @@ -93,7 +98,7 @@ public ScannerFactory(AccumuloClient client, int queueSize) { } /** - * Method that allows a ScannerFactory to be updated by a config after initialization + * Method that allows a ScannerFactory to use scan execution and consistency hints from the provided {@link GenericQueryConfiguration}. 
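* For example (illustrative): after {@code factory.updateConfigs(config)}, a call such as {@code factory.newScanner(shardTable, auths, threads, query, "expansion")}
* resolves execution hints and consistency levels under the "expansion" key first and falls back to the table name when no entry exists for that key.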
* * @param genericConfig * a {@link GenericQueryConfiguration} @@ -104,12 +109,12 @@ public void updateConfigs(GenericQueryConfiguration genericConfig) { Map consistencyLevels = genericConfig.getTableConsistencyLevels(); if (consistencyLevels != null && !consistencyLevels.isEmpty()) { - this.consistencyByTable = genericConfig.getTableConsistencyLevels(); + this.consistencyLevelMap = genericConfig.getTableConsistencyLevels(); } Map> hints = genericConfig.getTableHints(); if (hints != null && !hints.isEmpty()) { - this.hintsByTable = genericConfig.getTableHints(); + this.executionHintMap = genericConfig.getTableHints(); } int numThreads = DEFAULT_MAX_THREADS; @@ -131,18 +136,19 @@ public void updateConfigs(GenericQueryConfiguration genericConfig) { } if (log.isDebugEnabled()) { - log.debug("Created ScannerFactory " + System.identityHashCode(this) + " is wrapped ? " + (client instanceof WrappedConnector)); + log.debug("Created ScannerFactory {}, wrapped={}", System.identityHashCode(this), (client instanceof WrappedConnector)); } } public Scanner newSingleScanner(String tableName, Set auths, Query query) throws TableNotFoundException { if (open.get()) { Scanner bs = QueryScannerHelper.createScannerWithoutInfo(client, tableName, auths, query); + applyConfigs(bs, tableName); - log.debug("Created scanner " + System.identityHashCode(bs)); + log.debug("Created scanner {}", System.identityHashCode(bs)); if (log.isTraceEnabled()) { - log.trace("Adding instance " + bs.hashCode()); + log.trace("Adding instance {}", bs.hashCode()); } synchronized (open) { @@ -159,14 +165,51 @@ public Scanner newSingleScanner(String tableName, Set auths, Que } } + /** + * Create a new {@link BatchScanner} using the table name as the execution hint + * + * @param tableName + * the table name + * @param auths + * the set of authorizations + * @param threads + * the number of threads + * @param query + * the Query + * @return a BatchScanner + * @throws TableNotFoundException + * if no table exists + */ public BatchScanner newScanner(String tableName, Set auths, int threads, Query query) throws TableNotFoundException { + return newScanner(tableName, auths, threads, query, tableName); + } + + /** + * Creates a new {@link BatchScanner} with execution hints + * + * @param tableName + * the table name + * @param auths + * the set of authorizations + * @param threads + * the number of threads to use + * @param query + * the Query + * @param hintKey + * the key used to select an execution hint + * @return a BatchScanner + * @throws TableNotFoundException + * if no table exists + */ + public BatchScanner newScanner(String tableName, Set auths, int threads, Query query, String hintKey) throws TableNotFoundException { if (open.get()) { BatchScanner bs = QueryScannerHelper.createBatchScanner(client, tableName, auths, threads, query); - applyConfigs(bs, tableName); - log.debug("Created scanner " + System.identityHashCode(bs)); + applyConfigs(bs, hintKey, tableName); + + log.debug("Created scanner {}", System.identityHashCode(bs)); if (log.isTraceEnabled()) { - log.trace("Adding instance " + bs.hashCode()); + log.trace("Adding instance {}", bs.hashCode()); } synchronized (open) { if (open.get()) { @@ -185,11 +228,12 @@ public BatchScanner newScanner(String tableName, Set auths, int public BatchScanner newScanner(String tableName, Set auths, int threads, Query query, boolean reportErrors) throws TableNotFoundException { if (open.get()) { BatchScanner bs = QueryScannerHelper.createBatchScanner(client, tableName, auths, threads, query, 
reportErrors); + applyConfigs(bs, tableName); - log.debug("Created scanner " + System.identityHashCode(bs)); + log.debug("Created scanner {}", System.identityHashCode(bs)); if (log.isTraceEnabled()) { - log.trace("Adding instance " + bs.hashCode()); + log.trace("Adding instance {}", bs.hashCode()); } synchronized (open) { if (open.get()) { @@ -228,7 +272,28 @@ public BatchScanner newScanner(String tableName, Query query) throws TableNotFou * if there are issues */ public BatchScannerSession newQueryScanner(final String tableName, final Set auths, Query settings) throws Exception { - return newLimitedScanner(BatchScannerSession.class, tableName, auths, settings).setThreads(scanQueue.getCapacity()); + return newQueryScanner(tableName, auths, settings, tableName); + } + + /** + * Builds a new scanner session using a finalized table name and set of authorizations using the previously defined queue. Note that the number of entries + * is hardcoded, below, to 1000, but can be changed + * + * @param tableName + * the table string + * @param auths + * a set of auths + * @param settings + * query settings + * @param executionHintKey + * a key used to select a scan execution hint + * @return a new scanner session + * @throws Exception + * if there are issues + */ + public BatchScannerSession newQueryScanner(final String tableName, final Set auths, Query settings, String executionHintKey) + throws Exception { + return newLimitedScanner(BatchScannerSession.class, tableName, auths, settings, executionHintKey).setThreads(scanQueue.getCapacity()); } /** @@ -246,17 +311,55 @@ public BatchScannerSession newQueryScanner(final String tableName, final Set T newLimitedScanner(Class wrapper, final String tableName, final Set auths, final Query settings) - throws Exception { + throws NoSuchMethodException, InvocationTargetException, InstantiationException, IllegalAccessException { + return newLimitedScanner(wrapper, tableName, auths, settings, tableName); + } + + /** + * Builds a new scanner session using a finalized table name and set of authorizations using the previously defined queue. Note that the number of entries + * is hardcoded, below, to 1000, but can be changed + * + * @param tableName + * the table string + * @param auths + * a set of auths + * @param settings + * query settings + * @param hintKey + * the key used to select an execution hint + * @param + * type of the wrapper + * @param wrapper + * a wrapper class + * @return a new scanner session + * @throws NoSuchMethodException + * in the case of no such method + * @throws InvocationTargetException + * in the case of no invocation target + * @throws InstantiationException + * in the case something fails to instantiate + * @throws IllegalAccessException + * in the case of an illegal access + * + */ + public T newLimitedScanner(Class wrapper, final String tableName, final Set auths, final Query settings, + String hintKey) throws NoSuchMethodException, InvocationTargetException, InstantiationException, IllegalAccessException { Preconditions.checkNotNull(scanQueue); Preconditions.checkNotNull(wrapper); Preconditions.checkArgument(open.get(), "Factory has been locked. 
No New scanners can be created"); - log.debug("Creating limited scanner whose max threads is is " + scanQueue.getCapacity() + " and max capacity is " + maxQueue); + log.debug("Creating limited scanner whose max threads is {} and max capacity is {}", scanQueue.getCapacity(), maxQueue); ScanSessionStats stats = null; if (accrueStats) { @@ -271,11 +374,11 @@ public T newLimitedScanner(Class wrapper, final St .newInstance(new ScannerSession(tableName, auths, scanQueue, maxQueue, settings).applyStats(stats)); } - applyConfigs(session, tableName); + applyConfigs(session, hintKey, tableName); - log.debug("Created session " + System.identityHashCode(session)); + log.debug("Created session {}", System.identityHashCode(session)); if (log.isTraceEnabled()) { - log.trace("Adding instance " + session.hashCode()); + log.trace("Adding instance {}", session.hashCode()); } synchronized (open) { if (open.get()) { @@ -312,7 +415,7 @@ public RangeStreamScanner newRangeScanner(String tableName, Set public boolean close(ScannerBase bs) { try { - log.debug("Closed scanner " + System.identityHashCode(bs)); + log.debug("Closed scanner {}", System.identityHashCode(bs)); if (instances.remove(bs)) { if (log.isTraceEnabled()) { log.trace("Closing instance " + bs.hashCode()); @@ -358,16 +461,16 @@ public boolean lockdown() { public void close(ScannerSession bs) { try { - log.debug("Closed session " + System.identityHashCode(bs)); + log.debug("Closed session {}", System.identityHashCode(bs)); if (sessionInstances.remove(bs)) { if (log.isTraceEnabled()) { - log.trace("Closing instance " + bs.hashCode()); + log.trace("Closing instance {}", bs.hashCode()); } bs.close(); } } catch (Exception e) { // ANY EXCEPTION HERE CAN SAFELY BE IGNORED - log.trace("Exception closing ScannerSession, can be safely ignored: {}", e); + log.trace("Exception closing ScannerSession, can be safely ignored:", e); } } @@ -413,40 +516,93 @@ public ScannerBase newRfileScanner(String tableName, Set auths, } /** - * Apply table-specific scanner configs to the provided scanner base object + * Apply table-specific scanner configs to the provided scanner base object using the table name as the key + * + * @param scannerBase + * a {@link ScannerBase} + * @param tableName + * the secondary hint key + */ + public void applyConfigs(ScannerBase scannerBase, String tableName) { + applyConfigs(scannerBase, tableName, tableName); + } + + /** + * Apply table-specific scanner configs to the provided scanner base object using the provided hint key, falling back to the table name if necessary * * @param scannerBase * a {@link ScannerBase} + * @param hintKey + * the primary hint key * @param tableName - * the table + * the secondary hint key */ - protected void applyConfigs(ScannerBase scannerBase, String tableName) { - if (consistencyByTable != null && consistencyByTable.containsKey(tableName)) { - scannerBase.setConsistencyLevel(consistencyByTable.get(tableName)); + public void applyConfigs(ScannerBase scannerBase, String hintKey, String tableName) { + + if (consistencyLevelMap != null && !consistencyLevelMap.isEmpty()) { + ScannerBase.ConsistencyLevel level = consistencyLevelMap.get(hintKey); + if (level == null) { + level = consistencyLevelMap.get(tableName); + } + + if (level == null) { + log.trace("no consistency level found for table: {} key: {}", tableName, hintKey); + } else { + scannerBase.setConsistencyLevel(level); + } } - if (hintsByTable != null && hintsByTable.containsKey(tableName)) { - scannerBase.setExecutionHints(hintsByTable.get(tableName)); + if 
(executionHintMap != null && !executionHintMap.isEmpty()) { + Map hint = executionHintMap.get(hintKey); + if (hint == null) { + hint = executionHintMap.get(tableName); + } + + if (hint == null) { + log.trace("no execution hint found for table: {} key: {} ", tableName, hintKey); + } else { + scannerBase.setExecutionHints(hint); + } } } /** - * Apply table-specific scanner configs to the provided scanner session + * Apply table-specific scanner configs to the provided scanner session using the provided hint key, falling back to the table name if necessary * * @param scannerSession * the {@link ScannerSession} + * @param hintKey + * the primary hint key * @param tableName - * the table + * used as a secondary hint key */ - protected void applyConfigs(ScannerSession scannerSession, String tableName) { + protected void applyConfigs(ScannerSession scannerSession, String hintKey, String tableName) { SessionOptions options = scannerSession.getOptions(); - if (consistencyByTable != null && consistencyByTable.containsKey(tableName)) { - options.setConsistencyLevel(consistencyByTable.get(tableName)); + if (consistencyLevelMap != null && !consistencyLevelMap.isEmpty()) { + ScannerBase.ConsistencyLevel level = consistencyLevelMap.get(hintKey); + if (level == null) { + level = consistencyLevelMap.get(tableName); + } + + if (level == null) { + log.trace("no consistency level found for table: {} key: {}", tableName, hintKey); + } else { + options.setConsistencyLevel(level); + } } - if (hintsByTable != null && hintsByTable.containsKey(tableName)) { - options.setExecutionHints(hintsByTable.get(tableName)); + if (executionHintMap != null && !executionHintMap.isEmpty()) { + Map hint = executionHintMap.get(hintKey); + if (hint == null) { + hint = executionHintMap.get(tableName); + } + + if (hint == null) { + log.trace("no execution hint found for table: {} key: {} ", tableName, hintKey); + } else { + options.setExecutionHints(hint); + } } scannerSession.setOptions(options); diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ScannerSession.java b/warehouse/query-core/src/main/java/datawave/query/tables/ScannerSession.java index 6cbccce6089..c01da58c973 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ScannerSession.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ScannerSession.java @@ -538,7 +538,7 @@ protected void findTop() throws Exception { } } - protected int scannerInvariant(final Iterator iter) { + protected int scannerInvariant(final Iterator iter) throws InterruptedException { int retrievalCount = 0; Result myEntry = null; @@ -554,6 +554,19 @@ protected int scannerInvariant(final Iterator iter) { // this creates a bottleneck on the resultQueue size, but guarantees no results will be lost boolean accepted = false; while (!accepted) { + // this thread exists in between the batch scanner and the other side of the queue, so check both side + // are still running, otherwise terminate + if (!isRunning() || state().equals(State.TERMINATED) || state().equals(State.FAILED)) { + log.info("aborting offer on scanner invariant due to thread no longer running"); + throw new InterruptedException("aborting offer on scanner invariant due to thread no longer running"); + } else if (uncaughtExceptionHandler.getThrowable() != null) { + log.warn("aborting offer on scanner invariant due to throwable", uncaughtExceptionHandler.getThrowable()); + throw new RuntimeException("aborting offer on scanner invariant due to throwable", 
uncaughtExceptionHandler.getThrowable()); + } else if (forceClose) { + log.info("cleaning up scanner due to external close"); + throw new InterruptedException("cleaning up scanner due to external close"); + } + try { accepted = resultQueue.offer(myEntry, 200, TimeUnit.MILLISECONDS); } catch (InterruptedException e) { diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/SessionOptions.java b/warehouse/query-core/src/main/java/datawave/query/tables/SessionOptions.java index 4303e13f5ff..9f869eeecd2 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/SessionOptions.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/SessionOptions.java @@ -1,6 +1,7 @@ package datawave.query.tables; import java.util.Collection; +import java.util.Map; import org.apache.accumulo.core.client.IteratorSetting; import org.apache.accumulo.core.clientImpl.ScannerOptions; @@ -46,4 +47,24 @@ public Collection getIterators() { } return settings; } + + public void applyExecutionHints(Map scanHints) { + setExecutionHints(scanHints); + } + + public void applyExecutionHints(String tableName, Map> tableScanHints) { + if (tableScanHints.containsKey(tableName)) { + setExecutionHints(tableScanHints.get(tableName)); + } + } + + public void applyConsistencyLevel(ConsistencyLevel consistencyLevel) { + setConsistencyLevel(consistencyLevel); + } + + public void applyConsistencyLevel(String tableName, Map consistencyLevels) { + if (consistencyLevels.containsKey(tableName)) { + setConsistencyLevel(consistencyLevels.get(tableName)); + } + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ShardIndexQueryTable.java b/warehouse/query-core/src/main/java/datawave/query/tables/ShardIndexQueryTable.java index aeb685151a3..8914a164c14 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ShardIndexQueryTable.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ShardIndexQueryTable.java @@ -99,7 +99,7 @@ public ShardIndexQueryTable() {} public ShardIndexQueryTable(ShardIndexQueryTable other) { super(other); this.config = ShardIndexQueryConfiguration.create(other); - this.previouslyExpandedFieldCache = new ExpandedFieldCache(); + this.previouslyExpandedFieldCache = other.previouslyExpandedFieldCache; } @Override @@ -191,6 +191,10 @@ public GenericQueryConfiguration initialize(AccumuloClient client, Query setting getConfig().setClient(client); this.scannerFactory = new ScannerFactory(getConfig()); + if (this.previouslyExpandedFieldCache == null) { + this.previouslyExpandedFieldCache = new ExpandedFieldCache(); + } + MetadataHelper metadataHelper = initializeMetadataHelper(client, config.getMetadataTableName(), auths); if (StringUtils.isEmpty(settings.getQuery())) { diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java index 9bed672298f..3a113c75a66 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/ShardQueryLogic.java @@ -785,7 +785,30 @@ protected void loadQueryParameters(ShardQueryConfiguration config, Query setting if (StringUtils.isNotBlank(typeList)) { HashSet typeFilter = new HashSet<>(); - typeFilter.addAll(Arrays.asList(StringUtils.split(typeList, Constants.PARAM_VALUE_SEP))); + HashSet excludeSet = new HashSet<>(); + + for (String dataType : Arrays.asList(StringUtils.split(typeList, Constants.PARAM_VALUE_SEP))) { + if 
(dataType.charAt(0) == '!') { + excludeSet.add(StringUtils.substring(dataType, 1)); + } else { + typeFilter.add(dataType); + } + } + + if (!excludeSet.isEmpty()) { + if (typeFilter.isEmpty()) { + MetadataHelper metadataHelper = prepareMetadataHelper(config.getClient(), this.getMetadataTableName(), config.getAuthorizations(), + config.isRawTypes()); + + try { + typeFilter.addAll(metadataHelper.getDatatypes(null)); + } catch (TableNotFoundException e) { + throw new RuntimeException(e); + } + } + + typeFilter.removeAll(excludeSet); + } if (log.isDebugEnabled()) { log.debug("Type Filter: " + typeFilter); @@ -1562,6 +1585,14 @@ public void setTfNextSeek(int tfNextSeek) { getConfig().setTfNextSeek(tfNextSeek); } + public boolean isSeekingEventAggregation() { + return getConfig().isSeekingEventAggregation(); + } + + public void setSeekingEventAggregation(boolean seekingEventAggregation) { + getConfig().setSeekingEventAggregation(seekingEventAggregation); + } + public String getdisallowlistedFieldsString() { return getConfig().getDisallowlistedFieldsAsString(); } @@ -2432,6 +2463,14 @@ public void setMaxIndexScanTimeMillis(long maxTime) { getConfig().setMaxIndexScanTimeMillis(maxTime); } + public long getMaxAnyFieldScanTimeMillis() { + return getConfig().getMaxAnyFieldScanTimeMillis(); + } + + public void setMaxAnyFieldScanTimeMillis(long maxAnyFieldScanTimeMillis) { + getConfig().setMaxAnyFieldScanTimeMillis(maxAnyFieldScanTimeMillis); + } + public Function getQueryMacroFunction() { return queryMacroFunction; } @@ -2893,52 +2932,52 @@ public void setPruneQueryOptions(boolean pruneQueryOptions) { getConfig().setPruneQueryOptions(pruneQueryOptions); } - public boolean getUseFieldCounts() { - return getConfig().getUseFieldCounts(); + public boolean isRebuildDatatypeFilter() { + return getConfig().isRebuildDatatypeFilter(); } - public void setUseFieldCounts(boolean useFieldCounts) { - getConfig().setUseFieldCounts(useFieldCounts); + public void setRebuildDatatypeFilter(boolean rebuildDatatypeFilter) { + getConfig().setRebuildDatatypeFilter(rebuildDatatypeFilter); } - public boolean getUseTermCounts() { - return getConfig().getUseTermCounts(); + public boolean isRebuildDatatypeFilterPerShard() { + return getConfig().isRebuildDatatypeFilterPerShard(); } - public void setUseTermCounts(boolean useTermCounts) { - getConfig().setUseTermCounts(useTermCounts); + public void setRebuildDatatypeFilterPerShard(boolean rebuildDatatypeFilterPerShard) { + getConfig().setRebuildDatatypeFilterPerShard(rebuildDatatypeFilterPerShard); } - public boolean getSortQueryBeforeGlobalIndex() { - return getConfig().isSortQueryBeforeGlobalIndex(); + public boolean isSortQueryPreIndexWithImpliedCounts() { + return getConfig().isSortQueryPreIndexWithImpliedCounts(); } - public void setSortQueryBeforeGlobalIndex(boolean sortQueryBeforeGlobalIndex) { - getConfig().setSortQueryBeforeGlobalIndex(sortQueryBeforeGlobalIndex); + public void setSortQueryPreIndexWithImpliedCounts(boolean sortQueryPreIndexWithImpliedCounts) { + getConfig().setSortQueryPreIndexWithImpliedCounts(sortQueryPreIndexWithImpliedCounts); } - public boolean getSortQueryByCounts() { - return getConfig().isSortQueryByCounts(); + public boolean isSortQueryPreIndexWithFieldCounts() { + return getConfig().isSortQueryPreIndexWithFieldCounts(); } - public void setSortQueryByCounts(boolean sortQueryByCounts) { - getConfig().setSortQueryByCounts(sortQueryByCounts); + public void setSortQueryPreIndexWithFieldCounts(boolean sortQueryPreIndexWithFieldCounts) { + 
getConfig().setSortQueryPreIndexWithFieldCounts(sortQueryPreIndexWithFieldCounts); } - public boolean isRebuildDatatypeFilter() { - return getConfig().isRebuildDatatypeFilter(); + public boolean isSortQueryPostIndexWithFieldCounts() { + return getConfig().isSortQueryPostIndexWithFieldCounts(); } - public void setRebuildDatatypeFilter(boolean rebuildDatatypeFilter) { - getConfig().setRebuildDatatypeFilter(rebuildDatatypeFilter); + public void setSortQueryPostIndexWithFieldCounts(boolean sortQueryPostIndexWithFieldCounts) { + getConfig().setSortQueryPostIndexWithFieldCounts(sortQueryPostIndexWithFieldCounts); } - public boolean isRebuildDatatypeFilterPerShard() { - return getConfig().isRebuildDatatypeFilterPerShard(); + public boolean isSortQueryPostIndexWithTermCounts() { + return getConfig().isSortQueryPostIndexWithTermCounts(); } - public void setRebuildDatatypeFilterPerShard(boolean rebuildDatatypeFilterPerShard) { - getConfig().setRebuildDatatypeFilterPerShard(rebuildDatatypeFilterPerShard); + public void setSortQueryPostIndexWithTermCounts(boolean sortQueryPostIndexWithTermCounts) { + getConfig().setSortQueryPostIndexWithTermCounts(sortQueryPostIndexWithTermCounts); } public boolean isUseQueryTreeScanHintRules() { diff --git a/warehouse/query-core/src/main/java/datawave/query/tables/async/Scan.java b/warehouse/query-core/src/main/java/datawave/query/tables/async/Scan.java index 2ff0b11e2b1..c3f991301ab 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tables/async/Scan.java +++ b/warehouse/query-core/src/main/java/datawave/query/tables/async/Scan.java @@ -199,7 +199,7 @@ public Scan call() throws Exception { Class initializer = delegatedResourceInitializer; boolean docSpecific = RangeDefinition.isDocSpecific(currentRange); - if (!docSpecific && !initializer.isInstance(RfileResource.class)) { + if (!docSpecific && !initializer.isAssignableFrom(RfileResource.class)) { // this catches the case where a scanner was created with a RunningResource and a shard range was generated // when bypassing accumulo with a RFileResource, do not override the initializer with a BatchResource initializer = BatchResource.class; diff --git a/warehouse/query-core/src/main/java/datawave/query/tld/TLDIndexBuildingVisitor.java b/warehouse/query-core/src/main/java/datawave/query/tld/TLDIndexBuildingVisitor.java index 42adf88f818..719bee71028 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tld/TLDIndexBuildingVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/tld/TLDIndexBuildingVisitor.java @@ -200,20 +200,19 @@ protected EventFieldAggregator getEventFieldAggregator(String field, ChainableEv * * @param identifier * the field to be aggregated - * @param filter + * @param filterChain * a {@link ChainableEventDataQueryFilter} * @param maxNextCount * the maximum number of next calls before a seek is issued * @return a {@link TermFrequencyAggregator} loaded with the provided filter */ @Override - protected TermFrequencyAggregator buildTermFrequencyAggregator(String identifier, ChainableEventDataQueryFilter filter, int maxNextCount) { - - filter.addFilter(new TLDTermFrequencyEventDataQueryFilter(indexOnlyFields, attrFilter)); + protected TermFrequencyAggregator buildTermFrequencyAggregator(String identifier, ChainableEventDataQueryFilter filterChain, int maxNextCount) { Set toAggregate = fieldsToAggregate.contains(identifier) ? 
Collections.singleton(identifier) : Collections.emptySet(); + filterChain.addFilter(new TLDTermFrequencyEventDataQueryFilter(indexOnlyFields, toAggregate)); - return new TLDTermFrequencyAggregator(toAggregate, filter, tfNextSeek); + return new TLDTermFrequencyAggregator(toAggregate, filterChain, tfNextSeek); } /** diff --git a/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java b/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java index e8602e72b31..a4f45e14d81 100644 --- a/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java +++ b/warehouse/query-core/src/main/java/datawave/query/tld/TLDQueryIterator.java @@ -9,6 +9,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Set; import org.apache.accumulo.core.data.ByteSequence; import org.apache.accumulo.core.data.Key; @@ -31,6 +32,7 @@ import datawave.query.function.TLDRangeProvider; import datawave.query.iterator.NestedIterator; import datawave.query.iterator.QueryIterator; +import datawave.query.iterator.QueryOptions; import datawave.query.iterator.SourcedOptions; import datawave.query.iterator.logic.IndexIterator; import datawave.query.jexl.functions.FieldIndexAggregator; @@ -101,38 +103,75 @@ public void init(SortedKeyValueIterator source, Map op @Override public FieldIndexAggregator getFiAggregator() { if (fiAggregator == null) { - fiAggregator = new TLDFieldIndexAggregator(getNonEventFields(), getFIEvaluationFilter(), getFiNextSeek()); + fiAggregator = new TLDFieldIndexAggregator(getNonEventFields(), getFiEvaluationFilter(), getFiNextSeek()); } return fiAggregator; } + @Override + public EventDataQueryFilter getEvaluationFilter() { + if (this.evaluationFilter == null && script != null) { + + AttributeFactory attributeFactory = new AttributeFactory(typeMetadata); + Map expressionFilters = getExpressionFilters(script, attributeFactory); + + // setup an evaluation filter to avoid loading every single child key into the event + this.evaluationFilter = new TLDEventDataFilter(script, getAllFields(), expressionFilters, useAllowListedFields ? allowListedFields : null, + useDisallowListedFields ? disallowListedFields : null, getEventFieldSeek(), getEventNextSeek(), + limitFieldsPreQueryEvaluation ? limitFieldsMap : Collections.emptyMap(), limitFieldsField, getNonEventFields()); + } + return this.evaluationFilter != null ? evaluationFilter.clone() : null; + } + /** * Distinct from getEvaluation filter as the FI filter is used to prevent FI hits on nonEventFields that are not indexOnly fields * * @return an {@link EventDataQueryFilter} */ - protected EventDataQueryFilter getFIEvaluationFilter() { - ChainableEventDataQueryFilter filterChain = new ChainableEventDataQueryFilter(); - // primary filter on the current filter - filterChain.addFilter(getEvaluationFilter()); - // prevent anything that is not an index only field from being kept at the tld level, otherwise allow all - filterChain.addFilter(new TLDFieldIndexQueryFilter(getIndexOnlyFields())); - return filterChain; + @Override + public EventDataQueryFilter getFiEvaluationFilter() { + if (fiEvaluationFilter == null && script != null) { + if (QueryIterator.isDocumentSpecificRange(range)) { + // this is to deal with a TF optimization where the TF is scanned instead of the FI in the + // document specific case. 
+ fiEvaluationFilter = getEventEvaluationFilter(); + } else { + fiEvaluationFilter = new TLDFieldIndexQueryFilter(getIndexOnlyFields()); + } + + return fiEvaluationFilter.clone(); + } + return fiEvaluationFilter != null ? fiEvaluationFilter.clone() : null; } @Override - public EventDataQueryFilter getEvaluationFilter() { - if (this.evaluationFilter == null && script != null) { + public EventDataQueryFilter getEventEvaluationFilter() { + if (this.eventEvaluationFilter == null && script != null) { AttributeFactory attributeFactory = new AttributeFactory(typeMetadata); Map expressionFilters = getExpressionFilters(script, attributeFactory); // setup an evaluation filter to avoid loading every single child key into the event - this.evaluationFilter = new TLDEventDataFilter(script, getAllFields(), expressionFilters, useAllowListedFields ? allowListedFields : null, + this.eventEvaluationFilter = new TLDEventDataFilter(script, getEventFields(), expressionFilters, useAllowListedFields ? allowListedFields : null, useDisallowListedFields ? disallowListedFields : null, getEventFieldSeek(), getEventNextSeek(), limitFieldsPreQueryEvaluation ? limitFieldsMap : Collections.emptyMap(), limitFieldsField, getNonEventFields()); } - return this.evaluationFilter != null ? evaluationFilter.clone() : null; + return this.eventEvaluationFilter != null ? eventEvaluationFilter.clone() : null; + } + + public Set getEventFields() { + Set fields = getAllFields(); + fields.removeAll(getIndexOnlyFields()); + return fields; + } + + /** + * In the TLD case replace the {@link QueryOptions#eventFilter} with an evaluation filter + * + * @return an evaluation filter + */ + public EventDataQueryFilter getEventFilter() { + return getEvaluationFilter(); } @Override diff --git a/warehouse/query-core/src/main/java/datawave/query/util/QueryStopwatch.java b/warehouse/query-core/src/main/java/datawave/query/util/QueryStopwatch.java index f30db8b073b..2a64d8c9086 100644 --- a/warehouse/query-core/src/main/java/datawave/query/util/QueryStopwatch.java +++ b/warehouse/query-core/src/main/java/datawave/query/util/QueryStopwatch.java @@ -8,6 +8,7 @@ import java.util.ArrayDeque; import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Map.Entry; import java.util.NoSuchElementException; import java.util.concurrent.TimeUnit; @@ -28,8 +29,9 @@ * */ public class QueryStopwatch { - public static final String NEWLINE = "\n", INDENT = " "; - protected ArrayDeque> watches = Queues.newArrayDeque(); + public static final String NEWLINE = "\n"; + public static final String INDENT = " "; + protected final ArrayDeque> watches = Queues.newArrayDeque(); /** * Creates a new Stopwatch for use but does not start it @@ -43,7 +45,9 @@ private TraceStopwatch newStopwatch(String header) { TraceStopwatch sw = new TraceStopwatch(header); - watches.add(Maps.immutableEntry(header, sw)); + synchronized (watches) { + watches.add(Maps.immutableEntry(header, sw)); + } return sw; } @@ -55,7 +59,7 @@ public TraceStopwatch newStartedStopwatch(String header) { return sw; } - public TraceStopwatch peek() { + public synchronized TraceStopwatch peek() { Entry entry = watches.peekLast(); if (null == entry) { NotFoundQueryException qe = new NotFoundQueryException(DatawaveErrorCode.STOPWATCH_MISSING); @@ -65,13 +69,31 @@ public TraceStopwatch peek() { return entry.getValue(); } + /** + * Get the stopwatch associated with the stage name, or null if no such stopwatch exists + * + * @param stageName + * the stage name + * @return the stopwatch, or 
null if no such stopwatch exists + */ + public TraceStopwatch get(String stageName) { + synchronized (watches) { + for (Map.Entry entry : watches) { + if (entry.getKey().equals(stageName)) { + return entry.getValue(); + } + } + } + return null; + } + public String summarize() { List logLines = summarizeAsList(); return Joiner.on('\n').join(logLines); } - public List summarizeAsList() { + public synchronized List summarizeAsList() { if (this.watches.isEmpty()) { return Collections.emptyList(); } @@ -109,7 +131,9 @@ public List summarizeAsList() { } public void appendTimers(QueryStopwatch queryStopwatch) { - this.watches.addAll(queryStopwatch.watches); + synchronized (watches) { + this.watches.addAll(queryStopwatch.watches); + } } protected String formatMillis(long elapsedMillis) { diff --git a/warehouse/query-core/src/main/resources/DATAWAVE_EDGE.xml b/warehouse/query-core/src/main/resources/DATAWAVE_EDGE.xml index 31d6cedfd7f..8fb59109828 100644 --- a/warehouse/query-core/src/main/resources/DATAWAVE_EDGE.xml +++ b/warehouse/query-core/src/main/resources/DATAWAVE_EDGE.xml @@ -9,26 +9,26 @@ name representation as needed for the deployment environment's default query syntax. --> - - + + - - + + - - + + - - + + - - + + - - + + - - + + diff --git a/warehouse/query-core/src/test/java/datawave/core/iterators/BoundedRangeExpansionIteratorTest.java b/warehouse/query-core/src/test/java/datawave/core/iterators/BoundedRangeExpansionIteratorTest.java new file mode 100644 index 00000000000..09cbd010988 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/core/iterators/BoundedRangeExpansionIteratorTest.java @@ -0,0 +1,332 @@ +package datawave.core.iterators; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iteratorsImpl.system.SortedMapIterator; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.google.common.base.Joiner; + +public class BoundedRangeExpansionIteratorTest { + + private static final Value emptyValue = new Value(); + + private String startDate; + private String endDate; + + private String lower; + private String upper; + + private Set datatypes = new HashSet<>(); + private Set expected = new HashSet<>(); + + @BeforeEach + public void beforeEach() { + startDate = null; + endDate = null; + lower = null; + upper = null; + datatypes.clear(); + expected.clear(); + } + + @Test + public void testSingleDay_SingleValue_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + withDateRange("20240501", "20240501"); + withExpected(Set.of("value-1")); + drive(); + } + + @Test + public void testSingleDay_SingleValue_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1")); + drive(); + } + + @Test + public void testSingleDay_SingleValue_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + 
withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + @Test + public void testSingleDay_MultiValue_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240501"); + withExpected(Set.of("value-1", "value-2")); + drive(); + } + + @Test + public void testSingleDay_MultiValue_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1", "value-2")); + drive(); + } + + @Test + public void testSingleDay_MultiValue_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + @Test + public void testSingleDay_AllValues_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240501"); + withExpected(Set.of("value-1", "value-2", "value-3")); + drive(); + } + + @Test + public void testSingleDay_AllValues_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1", "value-2")); + // value-3 does not contain datatype-a + drive(); + } + + @Test + public void testSingleDay_AllValues_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240501"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + // + + @Test + public void testAllDays_SingleValue_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + withDateRange("20240501", "20240505"); + withExpected(Set.of("value-1")); + drive(); + } + + @Test + public void testMultiDay_SingleValue_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1")); + drive(); + } + + @Test + public void testMultiDay_SingleValue_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-1"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + @Test + public void testMultiDay_MultiValue_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240505"); + withExpected(Set.of("value-1", "value-2")); + drive(); + } + + @Test + public void testMultiDay_MultiValue_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1", "value-2")); + drive(); + } + + @Test + public void testMultiDay_MultiValue_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-2"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + @Test + public void testMultiDay_AllValues_EmptyDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240505"); + withExpected(Set.of("value-1", "value-2", "value-3")); + drive(); + } + + @Test + public void testMultiDay_AllValues_CorrectDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", "20240505"); + withDatatypes(Set.of("datatype-a")); + withExpected(Set.of("value-1", "value-2")); + // value-3 does not contain datatype-a + drive(); + } + + @Test + public void testMultiDay_AllValues_IncorrectDatatypeFilter() { + withBoundedRange("value-1", "value-3"); + withDateRange("20240501", 
"20240505"); + withDatatypes(Set.of("datatype-z")); + drive(); + } + + private void drive() { + assertNotNull(lower, "lower bound must be specified"); + assertNotNull(upper, "upper bound must be specified"); + assertNotNull(startDate, "start date must be specified"); + assertNotNull(endDate, "end date must be specified"); + + Map options = new HashMap<>(); + options.put(BoundedRangeExpansionIterator.START_DATE, startDate); + options.put(BoundedRangeExpansionIterator.END_DATE, endDate); + if (!datatypes.isEmpty()) { + options.put(BoundedRangeExpansionIterator.DATATYPES_OPT, Joiner.on(',').join(datatypes)); + } + + SortedMapIterator data = createData(); + BoundedRangeExpansionIterator iter = new BoundedRangeExpansionIterator(); + + Range range = new Range(lower, true, upper, true); + + try { + iter.init(data, options, null); + iter.seek(range, Collections.emptySet(), true); + + Set results = new HashSet<>(); + while (iter.hasTop()) { + Key k = iter.getTopKey(); + boolean first = results.add(k.getRow().toString()); + assertTrue(first, "Iterator returned the same row twice"); + iter.next(); + } + + assertEquals(expected, results); + + } catch (Exception e) { + fail("Failed to execute test", e); + } + } + + @Test + public void testTeardownRebuild() { + withDateRange("20240501", "20240505"); + + Map options = new HashMap<>(); + options.put(BoundedRangeExpansionIterator.START_DATE, startDate); + options.put(BoundedRangeExpansionIterator.END_DATE, endDate); + if (!datatypes.isEmpty()) { + options.put(BoundedRangeExpansionIterator.DATATYPES_OPT, Joiner.on(',').join(datatypes)); + } + + SortedMapIterator data = createData(); + BoundedRangeExpansionIterator iter = new BoundedRangeExpansionIterator(); + + Range range = new Range("value-2", false, "value-3", true); + + try { + iter.init(data, options, null); + iter.seek(range, Collections.emptySet(), true); + + assertTrue(iter.hasTop()); + Key k = iter.getTopKey(); + + assertEquals("value-3", k.getRow().toString()); + } catch (Exception e) { + fail("Failed to execute test", e); + } + } + + private void withBoundedRange(String lower, String upper) { + assertNotNull(lower); + assertNotNull(upper); + this.lower = lower; + this.upper = upper; + } + + private void withDateRange(String startDate, String endDate) { + assertNotNull(startDate); + assertNotNull(endDate); + this.startDate = startDate; + this.endDate = endDate; + } + + private void withDatatypes(Set datatypes) { + assertFalse(datatypes.isEmpty()); + this.datatypes = datatypes; + } + + private void withExpected(Set expectedRows) { + assertFalse(expectedRows.isEmpty()); + this.expected = expectedRows; + } + + /** + * Simulate fetching the column family by only having one field + * + * @return the data + */ + private SortedMapIterator createData() { + SortedMap data = new TreeMap<>(); + data.put(new Key("value-1", "FIELD_A", "20240501_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240501_1\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240501_2\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240501_3\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240502_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240503_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240504_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-1", "FIELD_A", "20240505_0\u0000datatype-a"), emptyValue); + + data.put(new Key("value-2", "FIELD_A", 
"20240501_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240501_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240502_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240502_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240503_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240503_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240504_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240504_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240505_0\u0000datatype-a"), emptyValue); + data.put(new Key("value-2", "FIELD_A", "20240505_0\u0000datatype-b"), emptyValue); + + data.put(new Key("value-3", "FIELD_A", "20240501_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240501_1\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240502_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240502_1\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240503_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240503_1\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240504_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240504_1\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240505_0\u0000datatype-b"), emptyValue); + data.put(new Key("value-3", "FIELD_A", "20240505_1\u0000datatype-b"), emptyValue); + return new SortedMapIterator(data); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/AnyFieldQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/AnyFieldQueryTest.java index b6fea3bfa12..76c962e47d8 100644 --- a/warehouse/query-core/src/test/java/datawave/query/AnyFieldQueryTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/AnyFieldQueryTest.java @@ -9,10 +9,42 @@ import static datawave.query.testframework.RawDataManager.RN_OP; import static org.junit.Assert.fail; +import java.lang.reflect.InvocationTargetException; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; - +import java.util.Iterator; +import java.util.Map; +import java.util.Properties; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.AccumuloException; +import org.apache.accumulo.core.client.AccumuloSecurityException; +import org.apache.accumulo.core.client.BatchDeleter; +import org.apache.accumulo.core.client.BatchScanner; +import org.apache.accumulo.core.client.BatchWriter; +import org.apache.accumulo.core.client.BatchWriterConfig; +import org.apache.accumulo.core.client.ConditionalWriter; +import org.apache.accumulo.core.client.ConditionalWriterConfig; +import org.apache.accumulo.core.client.IteratorSetting; +import org.apache.accumulo.core.client.MultiTableBatchWriter; +import org.apache.accumulo.core.client.Scanner; +import org.apache.accumulo.core.client.ScannerBase; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.client.admin.InstanceOperations; +import org.apache.accumulo.core.client.admin.NamespaceOperations; +import org.apache.accumulo.core.client.admin.ReplicationOperations; +import org.apache.accumulo.core.client.admin.SecurityOperations; 
+import org.apache.accumulo.core.client.admin.TableOperations; +import org.apache.accumulo.core.client.sample.SamplerConfiguration; +import org.apache.accumulo.core.data.Key; import org.apache.accumulo.core.data.KeyValue; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.security.Authorizations; +import org.apache.hadoop.io.Text; import org.apache.log4j.Level; import org.apache.log4j.Logger; import org.junit.BeforeClass; @@ -21,14 +53,22 @@ import com.google.common.collect.Multimap; +import datawave.accumulo.inmemory.InMemoryAccumuloClient; +import datawave.accumulo.inmemory.InMemoryInstance; import datawave.data.ColumnFamilyConstants; import datawave.ingest.data.config.ingest.CompositeIngest; +import datawave.microservice.query.Query; import datawave.query.exceptions.DatawaveFatalQueryException; import datawave.query.exceptions.FullTableScansDisallowedException; import datawave.query.jexl.JexlASTHelper; import datawave.query.planner.DefaultQueryPlanner; import datawave.query.planner.FederatedQueryPlanner; import datawave.query.planner.rules.RegexPushdownTransformRule; +import datawave.query.tables.AnyFieldScanner; +import datawave.query.tables.ResourceQueue; +import datawave.query.tables.ScannerFactory; +import datawave.query.tables.ScannerSession; +import datawave.query.tables.SessionOptions; import datawave.query.testframework.AbstractFunctionalQuery; import datawave.query.testframework.AccumuloSetup; import datawave.query.testframework.CitiesDataType; @@ -92,6 +132,27 @@ public void testEqual() throws Exception { } } + @Test(expected = DatawaveFatalQueryException.class) + public void testEqualsTimeout() throws Exception { + log.info("------ testEqualsTimeout ------"); + + // set very fast timeouts + logic.getConfig().setMaxAnyFieldScanTimeMillis(1); + + for (final TestCities city : TestCities.values()) { + String cityPhrase = EQ_OP + "'" + city.name() + "'"; + String query = Constants.ANY_FIELD + cityPhrase; + + // Test the plan with all expansions + String anyCity = CityField.CITY.name() + cityPhrase; + if (city.name().equals("london")) { + anyCity = "(" + anyCity + JEXL_OR_OP + CityField.STATE.name() + cityPhrase + ")"; + } + String plan = getPlan(new DelayedClient(client, 6000), query, true, true); + assertPlanEquals(anyCity, plan); + } + } + @Test public void testEqualMissesRemovedIndexedField() throws Exception { log.info("------ testEqualMissesRemovedIndexedField ------"); @@ -154,7 +215,7 @@ public void testNotEqual() throws Exception { assertPlanEquals(anyCity, plan); // test running the query - anyCity = this.dataManager.convertAnyField(cityPhrase, RawDataManager.AND_OP); + anyCity = this.dataManager.convertAnyField(cityPhrase, AND_OP); try { runTest(query, anyCity); fail("expecting exception"); @@ -186,7 +247,7 @@ public void testNotEqual() throws Exception { plan = getPlan(query, false, true); assertPlanEquals(anyCity, plan); - anyCity = this.dataManager.convertAnyField(cityPhrase, RawDataManager.AND_OP); + anyCity = this.dataManager.convertAnyField(cityPhrase, AND_OP); runTest(query, anyCity); } finally { this.logic.setFullTableScanEnabled(false); @@ -1401,4 +1462,413 @@ protected void testInit() { this.auths = CitiesDataType.getTestAuths(); this.documentKey = CityField.EVENT_ID.name(); } + + private static class DelayedClient implements AccumuloClient { + private final AccumuloClient client; + private long delay; + + public DelayedClient(AccumuloClient client, long delay) { + this.client = client; + 
this.delay = delay; + } + + @Override + public BatchScanner createBatchScanner(String s, Authorizations authorizations, int i) throws TableNotFoundException { + if (s.equals("shardIndex")) { + return new DelayedScanner(client.createBatchScanner(s, authorizations, i), delay); + } + + return client.createBatchScanner(s, authorizations, i); + } + + @Override + public BatchScanner createBatchScanner(String s, Authorizations authorizations) throws TableNotFoundException { + if (s.equals("shardIndex")) { + return new DelayedScanner(client.createBatchScanner(s, authorizations), delay); + } + + return client.createBatchScanner(s, authorizations); + } + + @Override + public BatchScanner createBatchScanner(String s) throws TableNotFoundException, AccumuloSecurityException, AccumuloException { + if (s.equals("shardIndex")) { + return new DelayedScanner(client.createBatchScanner(s), delay); + } + + return client.createBatchScanner(s); + } + + @Override + public BatchDeleter createBatchDeleter(String s, Authorizations authorizations, int i, BatchWriterConfig batchWriterConfig) + throws TableNotFoundException { + return client.createBatchDeleter(s, authorizations, i, batchWriterConfig); + } + + @Override + public BatchDeleter createBatchDeleter(String s, Authorizations authorizations, int i) throws TableNotFoundException { + return client.createBatchDeleter(s, authorizations, i); + } + + @Override + public BatchWriter createBatchWriter(String s, BatchWriterConfig batchWriterConfig) throws TableNotFoundException { + return client.createBatchWriter(s, batchWriterConfig); + } + + @Override + public BatchWriter createBatchWriter(String s) throws TableNotFoundException { + return client.createBatchWriter(s); + } + + @Override + public MultiTableBatchWriter createMultiTableBatchWriter(BatchWriterConfig batchWriterConfig) { + return client.createMultiTableBatchWriter(batchWriterConfig); + } + + @Override + public MultiTableBatchWriter createMultiTableBatchWriter() { + return client.createMultiTableBatchWriter(); + } + + @Override + public Scanner createScanner(String s, Authorizations authorizations) throws TableNotFoundException { + if (s.equals("shardIndex")) { + return new DelayedScanner(client.createScanner(s, authorizations), delay); + } + + return client.createScanner(s, authorizations); + } + + @Override + public Scanner createScanner(String s) throws TableNotFoundException, AccumuloSecurityException, AccumuloException { + if (s.equals("shardIndex")) { + return new DelayedScanner(client.createScanner(s), delay); + } + + return client.createScanner(s); + } + + @Override + public ConditionalWriter createConditionalWriter(String s, ConditionalWriterConfig conditionalWriterConfig) throws TableNotFoundException { + return client.createConditionalWriter(s, conditionalWriterConfig); + } + + @Override + public ConditionalWriter createConditionalWriter(String s) throws TableNotFoundException { + return client.createConditionalWriter(s); + } + + @Override + public String whoami() { + return client.whoami(); + } + + @Override + public TableOperations tableOperations() { + return client.tableOperations(); + } + + @Override + public NamespaceOperations namespaceOperations() { + return client.namespaceOperations(); + } + + @Override + public SecurityOperations securityOperations() { + return client.securityOperations(); + } + + @Override + public InstanceOperations instanceOperations() { + return client.instanceOperations(); + } + + @Override + public ReplicationOperations replicationOperations() { + return 
client.replicationOperations(); + } + + @Override + public Properties properties() { + return client.properties(); + } + + @Override + public void close() { + client.close(); + } + } + + private static class DelayedIterator implements Iterator { + private Iterator delegate; + private long delay; + + private DelayedIterator(Iterator delegate, long delay) { + this.delegate = delegate; + this.delay = delay; + } + + @Override + public boolean hasNext() { + try { + Thread.sleep(delay); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + + return delegate.hasNext(); + } + + @Override + public T next() { + try { + Thread.sleep(delay); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + + return delegate.next(); + } + } + + private static class DelayedScanner implements Scanner, BatchScanner { + private Scanner delegateScanner; + private BatchScanner delegateBatchScanner; + private long delay = 0; + + private DelayedScanner(Scanner scanner, long delay) { + this.delegateScanner = scanner; + this.delay = delay; + } + + private DelayedScanner(BatchScanner batchScanner, long delay) { + this.delegateBatchScanner = batchScanner; + this.delay = delay; + } + + @Override + public void setRanges(Collection collection) { + this.delegateBatchScanner.setRanges(collection); + } + + @Override + public void setRange(Range range) { + this.delegateScanner.setRange(range); + } + + @Override + public Range getRange() { + return this.delegateScanner.getRange(); + } + + @Override + public void setBatchSize(int i) { + this.delegateScanner.setBatchSize(i); + } + + @Override + public int getBatchSize() { + return this.delegateScanner.getBatchSize(); + } + + @Override + public void enableIsolation() { + this.delegateScanner.enableIsolation(); + } + + @Override + public void disableIsolation() { + this.delegateScanner.disableIsolation(); + } + + @Override + public long getReadaheadThreshold() { + return this.delegateScanner.getReadaheadThreshold(); + } + + @Override + public void setReadaheadThreshold(long l) { + this.delegateScanner.setReadaheadThreshold(l); + } + + @Override + public void addScanIterator(IteratorSetting iteratorSetting) { + if (this.delegateScanner != null) { + this.delegateScanner.addScanIterator(iteratorSetting); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.addScanIterator(iteratorSetting); + } + } + + @Override + public void removeScanIterator(String s) { + if (this.delegateScanner != null) { + this.delegateScanner.removeScanIterator(s); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.removeScanIterator(s); + } + } + + @Override + public void updateScanIteratorOption(String s, String s1, String s2) { + if (this.delegateScanner != null) { + this.delegateScanner.updateScanIteratorOption(s, s1, s2); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.updateScanIteratorOption(s, s1, s2); + } + } + + @Override + public void fetchColumnFamily(Text text) { + if (this.delegateScanner != null) { + this.delegateScanner.fetchColumnFamily(text); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.fetchColumnFamily(text); + } + } + + @Override + public void fetchColumn(Text text, Text text1) { + if (this.delegateScanner != null) { + this.delegateScanner.fetchColumn(text, text1); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.fetchColumn(text, text1); + } + } + + @Override + public void fetchColumn(IteratorSetting.Column 
column) { + if (this.delegateScanner != null) { + this.delegateScanner.fetchColumn(column); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.fetchColumn(column); + } + } + + @Override + public void clearColumns() { + if (this.delegateScanner != null) { + this.delegateScanner.clearColumns(); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.clearColumns(); + } + } + + @Override + public void clearScanIterators() { + if (this.delegateScanner != null) { + this.delegateScanner.clearScanIterators(); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.clearScanIterators(); + } + } + + @Override + public Iterator> iterator() { + Iterator> iterator = null; + if (this.delegateScanner != null) { + iterator = this.delegateScanner.iterator(); + } else if (this.delegateBatchScanner != null) { + iterator = this.delegateBatchScanner.iterator(); + } + + iterator = new DelayedIterator<>(iterator, delay); + + return iterator; + } + + @Override + public void setTimeout(long l, TimeUnit timeUnit) { + if (this.delegateScanner != null) { + this.delegateScanner.setTimeout(l, timeUnit); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.setTimeout(l, timeUnit); + } + } + + @Override + public long getTimeout(TimeUnit timeUnit) { + if (this.delegateScanner != null) { + return this.delegateScanner.getTimeout(timeUnit); + } else if (this.delegateBatchScanner != null) { + return this.delegateBatchScanner.getTimeout(timeUnit); + } + + return -1; + } + + @Override + public void close() { + if (this.delegateScanner != null) { + this.delegateScanner.close(); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.close(); + } + } + + @Override + public Authorizations getAuthorizations() { + if (this.delegateScanner != null) { + return this.delegateScanner.getAuthorizations(); + } else if (this.delegateBatchScanner != null) { + return this.delegateBatchScanner.getAuthorizations(); + } + + return null; + } + + @Override + public void setSamplerConfiguration(SamplerConfiguration samplerConfiguration) { + if (this.delegateScanner != null) { + this.delegateScanner.setSamplerConfiguration(samplerConfiguration); + } else if (this.delegateBatchScanner != null) { + this.delegateBatchScanner.setSamplerConfiguration(samplerConfiguration); + } + } + + @Override + public SamplerConfiguration getSamplerConfiguration() { + return null; + } + + @Override + public void clearSamplerConfiguration() { + + } + + @Override + public void setBatchTimeout(long l, TimeUnit timeUnit) { + + } + + @Override + public long getBatchTimeout(TimeUnit timeUnit) { + return 0; + } + + @Override + public void setClassLoaderContext(String s) { + + } + + @Override + public void clearClassLoaderContext() { + + } + + @Override + public String getClassLoaderContext() { + return ""; + } + + @Override + public ConsistencyLevel getConsistencyLevel() { + return null; + } + + @Override + public void setConsistencyLevel(ConsistencyLevel consistencyLevel) { + + } + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/CompositeFunctionsTest.java b/warehouse/query-core/src/test/java/datawave/query/CompositeFunctionsTest.java index 46fb753936d..74778b573a8 100644 --- a/warehouse/query-core/src/test/java/datawave/query/CompositeFunctionsTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/CompositeFunctionsTest.java @@ -453,12 +453,12 @@ public void testNulls() throws Exception { Arrays.asList("CORLEONE", "CAPONE", 
"SOPRANO"), Collections.emptyList(), Arrays.asList("CORLEONE", "CAPONE", "SOPRANO"), - Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI"), - Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI"), + Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI", "TATTAGLIA"), + Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI", "TATTAGLIA"), Collections.emptyList(), Collections.emptyList(), - Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI"), - Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI"), + Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI", "TATTAGLIA"), + Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI", "TATTAGLIA"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList() @@ -503,14 +503,14 @@ public void testNotNulls() throws Exception { // @formatter:off @SuppressWarnings("unchecked") List[] expectedLists = new List[] { - Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI"), + Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI", "TATTAGLIA"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), - Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI"), - Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI"), - Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI"), + Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI", "TATTAGLIA"), + Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI", "TATTAGLIA"), + Arrays.asList("CORLEONE", "CAPONE", "SOPRANO", "ANDOLINI", "TATTAGLIA"), Arrays.asList("CORLEONE", "CAPONE", "SOPRANO"), Collections.emptyList(), Collections.emptyList(), diff --git a/warehouse/query-core/src/test/java/datawave/query/FunctionalSetTest.java b/warehouse/query-core/src/test/java/datawave/query/FunctionalSetTest.java index 825e2e5fafd..10ab4511af2 100644 --- a/warehouse/query-core/src/test/java/datawave/query/FunctionalSetTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/FunctionalSetTest.java @@ -257,12 +257,12 @@ public void testMinMax() throws Exception { }; @SuppressWarnings("unchecked") List[] expectedLists = new List[] { - Arrays.asList("ANDOLINI", "SOPRANO", "CORLEONE", "CAPONE"), - Arrays.asList("CORLEONE", "CAPONE"), + Arrays.asList("ANDOLINI", "SOPRANO", "CORLEONE", "CAPONE", "TATTAGLIA"), Arrays.asList("CORLEONE", "CAPONE"), + Arrays.asList("CORLEONE", "CAPONE", "TATTAGLIA"), Arrays.asList(), - Arrays.asList("CORLEONE", "CAPONE"), + Arrays.asList("CORLEONE", "CAPONE", "TATTAGLIA"), Arrays.asList("CORLEONE", "CAPONE"), Arrays.asList("CAPONE"), diff --git a/warehouse/query-core/src/test/java/datawave/query/LenientFieldsTest.java b/warehouse/query-core/src/test/java/datawave/query/LenientFieldsTest.java index 8fa1cfbe16d..9432f73c688 100644 --- a/warehouse/query-core/src/test/java/datawave/query/LenientFieldsTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/LenientFieldsTest.java @@ -269,7 +269,7 @@ public void testLenientFields() throws Exception { List[] expectedLists = new List[] { Arrays.asList(), Arrays.asList("CORLEONE", "CAPONE"), - Arrays.asList("CORLEONE", "SOPRANO", "CAPONE"), + Arrays.asList("CORLEONE", "SOPRANO", "CAPONE", "TATTAGLIA"), Arrays.asList() }; // @formatter:on diff --git a/warehouse/query-core/src/test/java/datawave/query/LongRunningQueryTest.java b/warehouse/query-core/src/test/java/datawave/query/LongRunningQueryTest.java index cdae87d968e..f1636c12031 100644 --- a/warehouse/query-core/src/test/java/datawave/query/LongRunningQueryTest.java +++ 
b/warehouse/query-core/src/test/java/datawave/query/LongRunningQueryTest.java @@ -6,13 +6,11 @@ import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; -import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.TimeZone; import java.util.UUID; @@ -22,9 +20,6 @@ import org.junit.Before; import org.junit.Test; -import com.fasterxml.jackson.annotation.JsonCreator; -import com.fasterxml.jackson.annotation.JsonValue; -import com.google.common.collect.Multimap; import com.google.common.collect.Sets; import datawave.core.common.connection.AccumuloConnectionFactory; @@ -36,8 +31,6 @@ import datawave.microservice.query.QueryImpl; import datawave.microservice.query.config.QueryExpirationProperties; import datawave.microservice.querymetric.QueryMetricFactoryImpl; -import datawave.query.attributes.UniqueFields; -import datawave.query.attributes.UniqueGranularity; import datawave.query.config.ShardQueryConfiguration; import datawave.query.tables.ShardQueryLogic; import datawave.query.util.DateIndexHelperFactory; diff --git a/warehouse/query-core/src/test/java/datawave/query/QueryPlanTest.java b/warehouse/query-core/src/test/java/datawave/query/QueryPlanTest.java index 18d255f1808..0805be9d0b4 100644 --- a/warehouse/query-core/src/test/java/datawave/query/QueryPlanTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/QueryPlanTest.java @@ -196,7 +196,7 @@ public void planInMetricsAfterTableNotFoundExceptionDefaultQueryPlannerNE() thro runTestQuery(Collections.emptyList(), query, this.dataManager.getShardStartEndDate()[0], this.dataManager.getShardStartEndDate()[1], Collections.emptyMap()); fail("Expected DatawaveFatalQueryException."); - } catch (DatawaveFatalQueryException e) { + } catch (RuntimeException e) { assertEquals(expectedPlan, metric.getPlan()); } } @@ -212,7 +212,7 @@ public void planInMetricsAfterTableNotFoundExceptionDefaultQueryPlannerNotEq() t runTestQuery(Collections.emptyList(), query, this.dataManager.getShardStartEndDate()[0], this.dataManager.getShardStartEndDate()[1], Collections.emptyMap()); fail("Expected DatawaveFatalQueryException."); - } catch (DatawaveFatalQueryException e) { + } catch (RuntimeException e) { assertEquals(expectedPlan, metric.getPlan()); } } diff --git a/warehouse/query-core/src/test/java/datawave/query/RebuildingScannerTestHelper.java b/warehouse/query-core/src/test/java/datawave/query/RebuildingScannerTestHelper.java index ff5396df4d6..2e60d169f37 100644 --- a/warehouse/query-core/src/test/java/datawave/query/RebuildingScannerTestHelper.java +++ b/warehouse/query-core/src/test/java/datawave/query/RebuildingScannerTestHelper.java @@ -525,6 +525,11 @@ public void setConsistencyLevel(ConsistencyLevel consistencyLevel) { } + @Override + public synchronized void setExecutionHints(Map hints) { + // no-op + } + @Override public void setRange(Range range) { ((InMemoryScanner) delegate).setRange(range); @@ -596,6 +601,11 @@ public void setConsistencyLevel(ConsistencyLevel consistencyLevel) { } + @Override + public synchronized void setExecutionHints(Map hints) { + // no-op + } + @Override public void setRanges(Collection ranges) { ((InMemoryBatchScanner) delegate).setRanges(ranges); diff --git a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java index 8149117800d..a44295201ef 100644 --- 
a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java @@ -870,7 +870,26 @@ public void testPermutations() throws Exception { } @Test - public void testSortQueryBeforeGlobalIndex() throws Exception { + public void testSortQueryPreIndexWithImpliedCounts() throws Exception { + try { + // sorting via implied counts should push TYPE to the right of SHAPE + withQuery("TYPE == 'pentagon' || SHAPE == 'triangle'"); + withParameter(QueryParameters.DATATYPE_FILTER_SET, "triangle,pentagon"); + + Set expectedUids = new HashSet<>(triangleUids); + withExpected(expectedUids); + + disableAllSortOptions(); + logic.setSortQueryPreIndexWithImpliedCounts(true); + planAndExecuteQuery(); + assertPlannedQuery("SHAPE == 'triangle' || TYPE == 'pentagon'"); + } finally { + disableAllSortOptions(); + } + } + + @Test + public void testSortQueryPreIndexWithFieldCounts() throws Exception { try { // SHAPE cardinality for triangle and pentagon types is 23 // TYPE cardinality for triangle and pentagon types is 21 @@ -880,12 +899,20 @@ public void testSortQueryBeforeGlobalIndex() throws Exception { Set expectedUids = new HashSet<>(triangleUids); withExpected(expectedUids); - logic.setSortQueryBeforeGlobalIndex(true); + disableAllSortOptions(); + logic.setSortQueryPreIndexWithFieldCounts(true); planAndExecuteQuery(); assertPlannedQuery("TYPE == 'pentagon' || SHAPE == 'triangle'"); } finally { - logic.setSortQueryBeforeGlobalIndex(false); + disableAllSortOptions(); } } + private void disableAllSortOptions() { + logic.setSortQueryPreIndexWithImpliedCounts(false); + logic.setSortQueryPreIndexWithFieldCounts(false); + logic.setSortQueryPostIndexWithFieldCounts(false); + logic.setSortQueryPostIndexWithTermCounts(false); + } + } diff --git a/warehouse/query-core/src/test/java/datawave/query/TestLimitReturnedGroupsToHitTermGroups.java b/warehouse/query-core/src/test/java/datawave/query/TestLimitReturnedGroupsToHitTermGroups.java index 57871ef4516..3f105da01fd 100644 --- a/warehouse/query-core/src/test/java/datawave/query/TestLimitReturnedGroupsToHitTermGroups.java +++ b/warehouse/query-core/src/test/java/datawave/query/TestLimitReturnedGroupsToHitTermGroups.java @@ -349,9 +349,11 @@ public void testGroupWithExpandedRegexAlphabeticalOrderAndMatchesInGroupPartTwo( goodResults.addAll(Sets.newHashSet("BIRD.PET.0:parakeet", "CANINE.PET.0:beagle")); // disable just for this test to prove group 0 can be returned - logic.setSortQueryBeforeGlobalIndex(false); + logic.setSortQueryPreIndexWithFieldCounts(false); + logic.setSortQueryPreIndexWithImpliedCounts(false); runTestQuery(queryString, format.parse("20091231"), format.parse("20150101"), extraParameters, goodResults); - logic.setSortQueryBeforeGlobalIndex(true); + logic.setSortQueryPreIndexWithFieldCounts(true); + logic.setSortQueryPreIndexWithImpliedCounts(true); } @Test diff --git a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java index 22bc3f3ef5c..2043c612f1f 100644 --- a/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/config/ShardQueryConfigurationTest.java @@ -154,6 +154,8 @@ public void setUp() throws Exception { updatedValues.put("allTermsIndexOnly", true); defaultValues.put("maxIndexScanTimeMillis", Long.MAX_VALUE); updatedValues.put("maxIndexScanTimeMillis", 100000L); 
+ defaultValues.put("maxAnyFieldScanTimeMillis", Long.MAX_VALUE); + updatedValues.put("maxAnyFieldScanTimeMillis", 100000L); defaultValues.put("parseTldUids", false); updatedValues.put("parseTldUids", true); defaultValues.put("ignoreNonExistentFields", false); @@ -481,6 +483,8 @@ public void setUp() throws Exception { updatedValues.put("tfFieldSeek", 14); defaultValues.put("tfNextSeek", -1); updatedValues.put("tfNextSeek", 15); + defaultValues.put("seekingEventAggregation", false); + updatedValues.put("seekingEventAggregation", true); defaultValues.put("visitorFunctionMaxWeight", 5000000L); updatedValues.put("visitorFunctionMaxWeight", 1000000L); defaultValues.put("lazySetMechanismEnabled", false); @@ -579,14 +583,14 @@ public void setUp() throws Exception { defaultValues.put("groupFields", new GroupFields()); updatedValues.put("groupFields", GroupFields.from("GROUP(FIELD_G,FIELD_H)")); - defaultValues.put("useFieldCounts", false); - updatedValues.put("useFieldCounts", true); - defaultValues.put("useTermCounts", false); - updatedValues.put("useTermCounts", true); - defaultValues.put("sortQueryBeforeGlobalIndex", false); - updatedValues.put("sortQueryBeforeGlobalIndex", true); - defaultValues.put("sortQueryByCounts", false); - updatedValues.put("sortQueryByCounts", true); + defaultValues.put("sortQueryPreIndexWithImpliedCounts", false); + updatedValues.put("sortQueryPreIndexWithImpliedCounts", true); + defaultValues.put("sortQueryPreIndexWithFieldCounts", false); + updatedValues.put("sortQueryPreIndexWithFieldCounts", true); + defaultValues.put("sortQueryPostIndexWithTermCounts", false); + updatedValues.put("sortQueryPostIndexWithTermCounts", true); + defaultValues.put("sortQueryPostIndexWithFieldCounts", false); + updatedValues.put("sortQueryPostIndexWithFieldCounts", true); defaultValues.put("tableConsistencyLevels", Collections.emptyMap()); updatedValues.put("tableConsistencyLevels", Collections.singletonMap(TableName.SHARD, ScannerBase.ConsistencyLevel.EVENTUAL)); defaultValues.put("tableHints", Collections.emptyMap()); diff --git a/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryIteratorTest.java b/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryIteratorTest.java index 3a47941f047..1ac3c24e829 100644 --- a/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryIteratorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryIteratorTest.java @@ -44,7 +44,6 @@ import datawave.query.iterator.SourceManagerTest; public class DiscoveryIteratorTest { - static final Logger log = Logger.getLogger(DiscoveryIteratorTest.class); @Test public void testHappyPath() throws Throwable { diff --git a/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryLogicTest.java b/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryLogicTest.java index b8d3c226d0a..f0512fc2ac6 100644 --- a/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryLogicTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/discovery/DiscoveryLogicTest.java @@ -1,7 +1,5 @@ package datawave.query.discovery; -import static org.junit.Assert.assertEquals; - import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collections; @@ -15,21 +13,25 @@ import java.util.concurrent.TimeUnit; import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.AccumuloException; +import org.apache.accumulo.core.client.AccumuloSecurityException; import 
org.apache.accumulo.core.client.BatchWriter; import org.apache.accumulo.core.client.BatchWriterConfig; +import org.apache.accumulo.core.client.TableExistsException; +import org.apache.accumulo.core.client.TableNotFoundException; import org.apache.accumulo.core.data.Mutation; import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.user.SummingCombiner; import org.apache.accumulo.core.security.Authorizations; import org.apache.accumulo.core.security.ColumnVisibility; +import org.apache.hadoop.io.MapWritable; import org.apache.log4j.Logger; -import org.javatuples.Pair; +import org.assertj.core.api.Assertions; +import org.junit.After; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Sets; - import datawave.core.query.configuration.GenericQueryConfiguration; import datawave.core.query.result.event.DefaultResponseObjectFactory; import datawave.data.type.LcNoDiacriticsType; @@ -42,245 +44,480 @@ import datawave.util.TableName; public class DiscoveryLogicTest { - private static Logger log = Logger.getLogger(DiscoveryLogicTest.class); - protected static Set> terms; - protected static Set> terms2; - protected static Value blank; + private static final Logger log = Logger.getLogger(DiscoveryLogicTest.class); + + private static final Value BLANK_VALUE = new Value(new byte[0]); + private static final Set AUTHS = Collections.singleton(new Authorizations("FOO", "BAR")); + private static final String QUERY_AUTHS = "FOO,BAR"; - protected static Set auths = Collections.singleton(new Authorizations("FOO", "BAR")); - protected static String queryAuths = "FOO,BAR"; - protected AccumuloClient client = null; - protected MockAccumuloRecordWriter recordWriter; - protected DiscoveryLogic logic; - protected SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyyMMdd"); + private final SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyyMMdd"); + private AccumuloClient client = null; + private DiscoveryLogic logic; + + private String query; + private String startDate; + private String endDate; + private Map parameters = new HashMap<>(); + + private final List expected = new ArrayList<>(); @BeforeClass public static void setUp() { - blank = new Value(new byte[0]); - terms = Sets.newHashSet(Pair.with("firetruck", "vehicle"), Pair.with("ruddy duck", "bird"), Pair.with("ruddy duck", "unidentified flying object"), - Pair.with("motorcycle", "vehicle"), Pair.with("motorboat", "vehicle"), Pair.with("strike", "actionable offense"), - Pair.with("car", "vehicle"), Pair.with("trophy", "prize"), Pair.with("police officer", "otherperson"), - Pair.with("skydiver", "occupation"), Pair.with("bbc", "network"), Pair.with("onyx", "pokemon"), Pair.with("onyx", "rock"), - Pair.with("onyx", "rooster"), Pair.with("rooster", "cockadoodledoo")); - - terms2 = Sets.newHashSet(Pair.with("skydiver", "job"), Pair.with("skydiver", "job"), Pair.with("skydiver", "job"), Pair.with("skydiver", "job"), - Pair.with("skydiver", "occupation"), Pair.with("skydiver", "occupation"), Pair.with("skydiver", "occupation"), - Pair.with("skydiver", "occupation"), Pair.with("skydiver", "occupation"), Pair.with("skydiver", "occupation"), - Pair.with("skydiver", "occupation"), Pair.with("xxx.skydiver", "occupation"), Pair.with("xxx.skydiver", "occupation"), - Pair.with("xxx.skydiver", "occupation"), Pair.with("xxx.skydiver", "occupation"), Pair.with("xxx.skydiver", "occupation"), - Pair.with("yyy.skydiver", 
"occupation"), Pair.with("yyy.skydiver", "occupation"), Pair.with("yyy.skydiver", "occupation"), - Pair.with("zskydiver", "occupation")); - System.setProperty(MetadataHelperFactory.ALL_AUTHS_PROPERTY, queryAuths); + System.setProperty(MetadataHelperFactory.ALL_AUTHS_PROPERTY, QUERY_AUTHS); } @Before public void setup() throws Throwable { + initClient(); + writeData(); + initLogic(); + } + + private void initClient() throws AccumuloException, TableNotFoundException, TableExistsException, AccumuloSecurityException { QueryTestTableHelper testTableHelper = new QueryTestTableHelper(DiscoveryLogicTest.class.getCanonicalName(), log); - recordWriter = new MockAccumuloRecordWriter(); + MockAccumuloRecordWriter recordWriter = new MockAccumuloRecordWriter(); testTableHelper.configureTables(recordWriter); - client = testTableHelper.client; - - for (Pair p : terms) { - insertIndex(p); - } - - insertForwardModel("animal", "rooster"); - insertForwardModel("animal", "bird"); - insertReverseModel("occupation", "job"); - - logic = new DiscoveryLogic(); - logic.setIndexTableName(TableName.SHARD_INDEX); - logic.setReverseIndexTableName(TableName.SHARD_RINDEX); - logic.setModelTableName(QueryTestTableHelper.METADATA_TABLE_NAME); - logic.setMetadataTableName(QueryTestTableHelper.METADATA_TABLE_NAME); - logic.setModelName("DATAWAVE"); - logic.setFullTableScanEnabled(false); - logic.setMaxResults(-1); - logic.setMaxWork(-1); - logic.setAllowLeadingWildcard(true); - logic.setResponseObjectFactory(new DefaultResponseObjectFactory()); - logic.setMarkingFunctions(new MarkingFunctions.Default()); - logic.setMetadataHelperFactory(new MetadataHelperFactory()); + this.client = testTableHelper.client; } - protected Uid.List makeUidList(int count) { - Uid.List.Builder builder = Uid.List.newBuilder(); - builder.setIGNORE(true); - builder.setCOUNT(count); - return builder.build(); + private void writeData() throws Throwable { + writeEntries("VEHICLE", "motorcycle", "csv", "FOO", "20130101", 10, 20, 2); + writeEntries("VEHICLE", "motorcycle", "csv", "FOO", "20130102", 10, 20, 2); + writeEntries("ROCK", "onyx", "csv", "FOO", "20130101", 1, 1, 1); + writeEntries("ROCK", "onyx", "csv", "FOO", "20130102", 1, 3, 4); + writeEntries("ROCK", "onyx", "csv", "FOO", "20130103", 1, 3, 3); + writeEntries("POKEMON", "onyx", "csv", "FOO", "20130101", 20, 5, 5); + writeEntries("POKEMON", "onyx", "csv", "FOO", "20130102", 10, 1, 1); + writeEntries("POKEMON", "onyx", "csv", "FOO", "20130103", 1, 1, 22); + writeEntries("ROOSTER", "onyx", "csv", "BAR", "20130101", 5, 24, 2); + writeEntries("ROOSTER", "onyx", "csv", "BAR", "20130102", 5, 24, 2); + writeEntries("ROOSTER", "onyx", "csv", "BAR", "20130103", 5, 24, 20); + writeEntries("NETWORK", "bbc", "csv", "FOO", "20130101", 10, 24, 20); + writeEntries("NETWORK", "bbc", "csv", "FOO", "20130102", 10, 24, 20); + writeEntries("NETWORK", "bbc", "csv", "FOO", "20130103", 10, 24, 20); + writeEntries("OCCUPATION", "skydiver", "text", "FOO", "20130101", 10, 10, 5); + writeEntries("OCCUPATION", "skydiver", "text", "FOO", "20130102", 10, 10, 5); + writeEntries("OCCUPATION", "skydiver", "text", "FOO", "20130103", 10, 10, 5); + writeEntries("OCCUPATION", "skydiver", "text", "FOO", "20130104", 10, 10, 5); + writeEntries("OCCUPATION", "xxx.skydiver", "text", "FOO", "20130101", 10, 10, 5); + writeEntries("OCCUPATION", "xxx.skydiver", "text", "FOO", "20130102", 10, 10, 5); + writeEntries("OCCUPATION", "xxx.skydiver", "text", "FOO", "20130103", 10, 10, 5); + writeEntries("OCCUPATION", "xxx.skydiver", "text", 
"FOO", "20130104", 10, 10, 5); + writeEntries("OCCUPATION", "yyy.skydiver", "text", "FOO", "20130101", 10, 10, 5); + writeEntries("OCCUPATION", "yyy.skydiver", "text", "FOO", "20130102", 10, 10, 5); + writeEntries("OCCUPATION", "yyy.skydiver", "text", "FOO", "20130103", 10, 10, 5); + writeEntries("OCCUPATION", "yyy.skydiver", "text", "FOO", "20130104", 10, 10, 5); + writeEntries("JOB", "skydiver", "text", "BAR", "20130101", 10, 10, 5); + writeEntries("JOB", "skydiver", "text", "BAR", "20130102", 10, 10, 5); + writeEntries("JOB", "skydiver", "text", "BAR", "20130103", 10, 10, 5); + writeEntries("JOB", "skydiver", "text", "BAR", "20130104", 10, 10, 5); + writeEntries("JOB", "police officer", "idem", "FOO", "20130101", 15, 15, 5); + writeEntries("JOB", "police officer", "idem", "FOO", "20130102", 15, 15, 5); + writeEntries("JOB", "police officer", "idem", "FOO", "20130103", 15, 15, 5); + writeEntries("PRIZE", "trophy", "idem", "FOO", "20130101", 1, 5, 5); + writeEntries("PRIZE", "trophy", "idem", "FOO", "20130102", 1, 5, 5); + writeEntries("PRIZE", "trophy", "idem", "FOO", "20130103", 1, 5, 5); + writeEntries("PRIZE", "trophy", "idem", "FOO", "20130104", 1, 5, 5); + writeEntries("FLOCK", "rooster", "stock", "BAR", "20130101", 2, 15, 5); + writeEntries("FLOCK", "rooster", "stock", "BAR", "20130102", 2, 15, 5); + writeEntries("FLOCK", "rooster", "stock", "BAR", "20130103", 2, 15, 5); + writeEntries("BIRD", "ruddy duck", "stock", "FOO", "20130101", 20, 15, 2); + writeEntries("BIRD", "ruddy duck", "stock", "FOO", "20130102", 20, 15, 2); + writeEntries("BIRD", "ruddy duck", "stock", "FOO", "20130103", 20, 15, 2); + writeEntries("VEHICLE", "ranger", "stock", "FOO", "20130101", 20, 15, 2); + writeEntries("VEHICLE", "ranger", "stock", "BAR", "20130101", 1, 1, 2); + writeEntries("VEHICLE", "ranger", "stock", "FOO", "20130102", 20, 15, 2); + writeEntries("VEHICLE", "ranger", "stock", "BAR", "20130102", 5, 5, 5); + writeEntries("VEHICLE", "ranger", "stock", "FOO", "20130103", 20, 15, 2); + writeEntries("VEHICLE", "ranger", "stock", "BAR", "20130103", 6, 1, 2); + writeEntries("NON_INDEXED_FIELD", "coffee", "csv", "FOO", "20130101", 1, 1, 1); + writeEntries("NON_INDEXED_FIELD", "espresso", "csv", "FOO", "20130102", 1, 5, 5); + + writeForwardModel("ANIMAL", "ROOSTER"); + writeForwardModel("ANIMAL", "BIRD"); + writeReverseModel("occupation", "job"); } - protected void insertIndex(Pair valueField) throws Throwable { + private void writeEntries(String field, String term, String datatype, String visibility, String dateStr, int numShards, int uidListCount, int uidListSize) + throws Exception { BatchWriterConfig config = new BatchWriterConfig().setMaxMemory(1024L).setMaxLatency(1, TimeUnit.SECONDS).setMaxWriteThreads(1); - ColumnVisibility viz = new ColumnVisibility("FOO"); - - List dates = new ArrayList<>(); - for (int i = 1; i <= 3; i++) { - dates.add(dateFormatter.parse("2013010" + i)); - } + ColumnVisibility columnVisibility = new ColumnVisibility(visibility); + Date date = dateFormatter.parse(dateStr); try (BatchWriter writer = client.createBatchWriter(QueryTestTableHelper.METADATA_TABLE_NAME, config)) { - Mutation m = new Mutation(valueField.getValue1().toUpperCase()); - m.put("t", "datatype\u0000" + LcNoDiacriticsType.class.getName(), viz, blank); - m.put("i", "datatype", viz, blank); - m.put("ri", "datatype", viz, blank); - writer.addMutation(m); + Mutation mutation = new Mutation(field); + mutation.put("t", datatype + "\u0000" + LcNoDiacriticsType.class.getName(), columnVisibility, BLANK_VALUE); + if 
(!field.equals("NON_INDEXED_FIELD")) { + mutation.put("i", datatype + "\u0000" + dateStr, columnVisibility, new Value(SummingCombiner.VAR_LEN_ENCODER.encode(1L))); + } + mutation.put("ri", datatype + "\u0000" + dateStr, columnVisibility, new Value(SummingCombiner.VAR_LEN_ENCODER.encode(1L))); + writer.addMutation(mutation); } try (BatchWriter writer = client.createBatchWriter(TableName.SHARD_INDEX, config)) { - Mutation m = new Mutation(valueField.getValue0().toLowerCase()); - int numShards = 10; + Mutation mutation = new Mutation(term); for (int i = 0; i < numShards; i++) { - for (Date date : dates) { - String shard = dateFormatter.format(date); - m.put(valueField.getValue1().toUpperCase(), shard + "_" + i + "\u0000datatype", viz, date.getTime(), - new Value(makeUidList(24).toByteArray())); - } + mutation.put(field, dateStr + "_" + i + "\u0000" + datatype, columnVisibility, date.getTime(), createUidListValue(uidListCount, uidListSize)); } - writer.addMutation(m); + writer.addMutation(mutation); } try (BatchWriter writer = client.createBatchWriter(TableName.SHARD_RINDEX, config)) { - Mutation m = new Mutation(new StringBuilder().append(valueField.getValue0().toLowerCase()).reverse().toString()); - int numShards = 10; + Mutation mutation = new Mutation(new StringBuilder(term).reverse().toString()); for (int i = 0; i < numShards; i++) { - for (Date date : dates) { - String shard = dateFormatter.format(date); - m.put(valueField.getValue1().toUpperCase(), shard + "_" + i + "\u0000datatype", viz, date.getTime(), - new Value(makeUidList(24).toByteArray())); - } + mutation.put(field, dateStr + "_" + i + "\u0000" + datatype, columnVisibility, date.getTime(), createUidListValue(uidListCount, uidListSize)); } - writer.addMutation(m); + writer.addMutation(mutation); + } + } + + private Value createUidListValue(int count, int listSize) { + Uid.List.Builder builder = Uid.List.newBuilder().setIGNORE(true).setCOUNT(count); + for (int i = 0; i < listSize; i++) { + builder.addUID(UUID.randomUUID().toString()); } + return new Value(builder.build().toByteArray()); } - protected void insertForwardModel(String from, String to) throws Throwable { + private void writeForwardModel(String from, String to) throws Throwable { BatchWriterConfig config = new BatchWriterConfig().setMaxMemory(1024L).setMaxLatency(1, TimeUnit.SECONDS).setMaxWriteThreads(1); ColumnVisibility viz = new ColumnVisibility("FOO"); try (BatchWriter writer = client.createBatchWriter(QueryTestTableHelper.METADATA_TABLE_NAME, config)) { - Mutation m = new Mutation(from.toUpperCase()); - m.put("DATAWAVE", to.toUpperCase() + "\u0000forward", viz, blank); + Mutation m = new Mutation(from); + m.put("DATAWAVE", to + "\u0000forward", viz, BLANK_VALUE); writer.addMutation(m); } } - protected void insertReverseModel(String from, String to) throws Throwable { + private void writeReverseModel(String from, String to) throws Throwable { BatchWriterConfig config = new BatchWriterConfig().setMaxMemory(1024L).setMaxLatency(1, TimeUnit.SECONDS).setMaxWriteThreads(1); ColumnVisibility viz = new ColumnVisibility("FOO"); try (BatchWriter writer = client.createBatchWriter(QueryTestTableHelper.METADATA_TABLE_NAME, config)) { - Mutation m = new Mutation(from.toUpperCase()); - m.put("DATAWAVE", to.toUpperCase() + "\u0000reverse", viz, blank); + Mutation m = new Mutation(from); + m.put("DATAWAVE", to + "\u0000reverse", viz, BLANK_VALUE); writer.addMutation(m); } } - protected Iterator runTestQuery(String querystr) throws Throwable { - return runTestQuery(querystr, 
dateFormatter.parse("20130101"), dateFormatter.parse("20130102")); + private void initLogic() { + logic = new DiscoveryLogic(); + logic.setIndexTableName(TableName.SHARD_INDEX); + logic.setReverseIndexTableName(TableName.SHARD_RINDEX); + logic.setModelTableName(QueryTestTableHelper.METADATA_TABLE_NAME); + logic.setMetadataTableName(QueryTestTableHelper.METADATA_TABLE_NAME); + logic.setModelName("DATAWAVE"); + logic.setFullTableScanEnabled(false); + logic.setMaxResults(-1); + logic.setMaxWork(-1); + logic.setAllowLeadingWildcard(true); + logic.setResponseObjectFactory(new DefaultResponseObjectFactory()); + logic.setMarkingFunctions(new MarkingFunctions.Default()); + logic.setMetadataHelperFactory(new MetadataHelperFactory()); } - protected Iterator runTestQuery(String querystr, Date startDate, Date endDate) throws Throwable { - return runTestQuery(querystr, new HashMap<>(), startDate, endDate); + @After + public void tearDown() throws Exception { + query = null; + startDate = null; + endDate = null; + parameters.clear(); + expected.clear(); } - protected Iterator runTestQuery(String querystr, Map params, Date startDate, Date endDate) throws Throwable { + private void assertQueryResults() throws Exception { QueryImpl settings = new QueryImpl(); - settings.setBeginDate(startDate); - settings.setEndDate(endDate); - + settings.setBeginDate(dateFormatter.parse(startDate)); + settings.setEndDate(dateFormatter.parse(endDate)); settings.setPagesize(Integer.MAX_VALUE); - settings.setQueryAuthorizations(queryAuths); - settings.setQuery(querystr); + settings.setQueryAuthorizations(QUERY_AUTHS); + settings.setQuery(query); settings.setId(UUID.randomUUID()); - settings.addParameters(params); + settings.addParameters(this.parameters); - GenericQueryConfiguration config = logic.initialize(client, settings, auths); + GenericQueryConfiguration config = logic.initialize(client, settings, AUTHS); logic.setupQuery(config); - return logic.iterator(); + Iterator iterator = logic.iterator(); + List actual = new ArrayList<>(); + while (iterator.hasNext()) { + actual.add(iterator.next()); + } + + Assertions.assertThat(actual).hasSize(expected.size()); + for (int i = 0; i < expected.size(); i++) { + DiscoveredThing actualThing = actual.get(i); + DiscoveredThing expectedThing = expected.get(i); + Assertions.assertThat(actualThing).isEqualTo(expectedThing); + Assertions.assertThat(actualThing.getCountsByColumnVisibility()).isEqualTo(expectedThing.getCountsByColumnVisibility()); + } + } + + private void givenQuery(String query) { + this.query = query; + } + + private void givenStartDate(String startDate) { + this.startDate = startDate; + } + + private void givenEndDate(String endDate) { + this.endDate = endDate; + } + + private void givenParameter(String parameter, String value) { + this.parameters.put(parameter, value); + } + + private void expect(DiscoveredThing discoveredThing) { + this.expected.add(discoveredThing); } @Test - public void testUnfieldedLiterals() throws Throwable { - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("bbc OR onyx"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } - assertEquals(ImmutableSet.of(Pair.with("bbc", "NETWORK"), Pair.with("onyx", "POKEMON"), Pair.with("onyx", "ROCK"), Pair.with("onyx", "ROOSTER")), - matches); + public void testLiterals() throws Exception { + givenQuery("bbc OR onyx"); + givenStartDate("20130101"); + givenEndDate("20130102"); + + expect(new DiscoveredThing("bbc", 
"NETWORK", "csv", "20130101", "FOO", 240L, new MapWritable())); + expect(new DiscoveredThing("bbc", "NETWORK", "csv", "20130102", "FOO", 240L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130102", "FOO", 10L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130101", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130102", "FOO", 3L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "20130101", "BAR", 120L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "20130102", "BAR", 120L, new MapWritable())); + + assertQueryResults(); } @Test - public void testUnfieldedPatterns() throws Throwable { - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("*er OR m*"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } + public void testPatterns() throws Exception { + givenQuery("*yx OR b*"); + givenStartDate("20130101"); + givenEndDate("20130102"); + + expect(new DiscoveredThing("bbc", "NETWORK", "csv", "20130101", "FOO", 240L, new MapWritable())); + expect(new DiscoveredThing("bbc", "NETWORK", "csv", "20130102", "FOO", 240L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130102", "FOO", 10L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130101", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130102", "FOO", 3L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "20130101", "BAR", 120L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "20130102", "BAR", 120L, new MapWritable())); + + assertQueryResults(); + } - assertEquals(ImmutableSet.of(Pair.with("motorcycle", "VEHICLE"), Pair.with("motorboat", "VEHICLE"), Pair.with("police officer", "OTHERPERSON"), - Pair.with("skydiver", "OCCUPATION"), Pair.with("rooster", "COCKADOODLEDOO")), matches); + @Test + public void testPatternAndLiteral() throws Exception { + givenQuery("*er OR trophy"); + givenStartDate("20130102"); + givenEndDate("20130104"); + + expect(new DiscoveredThing("trophy", "PRIZE", "idem", "20130102", "FOO", 5L, new MapWritable())); + expect(new DiscoveredThing("trophy", "PRIZE", "idem", "20130103", "FOO", 5L, new MapWritable())); + expect(new DiscoveredThing("trophy", "PRIZE", "idem", "20130104", "FOO", 5L, new MapWritable())); + expect(new DiscoveredThing("police officer", "JOB", "idem", "20130102", "FOO", 225L, new MapWritable())); + expect(new DiscoveredThing("police officer", "JOB", "idem", "20130103", "FOO", 225L, new MapWritable())); + expect(new DiscoveredThing("ranger", "VEHICLE", "stock", "20130102", "BAR&FOO", 325L, new MapWritable())); + expect(new DiscoveredThing("ranger", "VEHICLE", "stock", "20130103", "BAR&FOO", 306L, new MapWritable())); + expect(new DiscoveredThing("rooster", "FLOCK", "stock", "20130102", "BAR", 30L, new MapWritable())); + expect(new DiscoveredThing("rooster", "FLOCK", "stock", "20130103", "BAR", 30L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "JOB", "text", "20130102", "BAR", 100L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "JOB", "text", "20130103", "BAR", 100L, new MapWritable())); + 
expect(new DiscoveredThing("skydiver", "JOB", "text", "20130104", "BAR", 100L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "OCCUPATION", "text", "20130102", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "OCCUPATION", "text", "20130103", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "OCCUPATION", "text", "20130104", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130102", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130103", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130104", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130102", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130103", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130104", "FOO", 100L, new MapWritable())); + + assertQueryResults(); } @Test - public void testUnfielded() throws Throwable { - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("*er OR trophy"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } + public void testFieldedLiterals() throws Exception { + givenQuery("rock:onyx OR pokemon:onyx"); + givenStartDate("20130101"); + givenEndDate("20130104"); + + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130102", "FOO", 10L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130103", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130101", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130102", "FOO", 3L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130103", "FOO", 3L, new MapWritable())); + + assertQueryResults(); + } - assertEquals(ImmutableSet.of(Pair.with("trophy", "PRIZE"), Pair.with("police officer", "OTHERPERSON"), Pair.with("skydiver", "OCCUPATION"), - Pair.with("rooster", "COCKADOODLEDOO")), matches); + @Test + public void testFieldedPatterns() throws Exception { + givenQuery("rock:*n*x OR bird:*r*k"); + givenStartDate("20130101"); + givenEndDate("20130103"); + + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130101", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130102", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130103", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130101", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130102", "FOO", 3L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "20130103", "FOO", 3L, new MapWritable())); + + assertQueryResults(); } @Test - public void testFieldedLiteral() throws Throwable { - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("rock:onyx OR pokemon:onyx"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } - assertEquals(ImmutableSet.of(Pair.with("onyx", "POKEMON"), Pair.with("onyx", "ROCK")), matches); + public void 
testFieldLiteralAndPattern() throws Exception { + givenQuery("pokemon:onyx OR bird:*r*k"); + givenStartDate("20130101"); + givenEndDate("20130104"); + + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130102", "FOO", 10L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "20130103", "FOO", 1L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130101", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130102", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "20130103", "FOO", 300L, new MapWritable())); + + assertQueryResults(); } @Test - public void testFieldedPattern() throws Throwable { - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("vehicle:*r*k OR bird:*r*k"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } + public void testIgnoreNonIndexedField() throws Exception { + givenQuery("coffee OR espresso OR rooster"); + givenStartDate("20130101"); + givenEndDate("20130104"); + + expect(new DiscoveredThing("rooster", "FLOCK", "stock", "20130101", "BAR", 30L, new MapWritable())); + expect(new DiscoveredThing("rooster", "FLOCK", "stock", "20130102", "BAR", 30L, new MapWritable())); + expect(new DiscoveredThing("rooster", "FLOCK", "stock", "20130103", "BAR", 30L, new MapWritable())); - assertEquals(ImmutableSet.of(Pair.with("firetruck", "VEHICLE"), Pair.with("ruddy duck", "BIRD")), matches); + assertQueryResults(); } @Test - public void testFielded() throws Throwable { - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("pokemon:onyx OR bird:*r*k"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } + public void testReverse() throws Exception { + givenQuery("*.sky*er"); + givenStartDate("20130101"); + givenEndDate("20130104"); + + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130102", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130103", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "20130104", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130101", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130102", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130103", "FOO", 100L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "20130104", "FOO", 100L, new MapWritable())); + + assertQueryResults(); + } - assertEquals(ImmutableSet.of(Pair.with("onyx", "POKEMON"), Pair.with("ruddy duck", "BIRD")), matches); + @Test + public void testSumCountsForLiterals() throws Exception { + givenQuery("bbc OR onyx"); + givenStartDate("20130101"); + givenEndDate("20130102"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("bbc", "NETWORK", "csv", "", "FOO", 480L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "", "FOO", 110L, new MapWritable())); + expect(new 
DiscoveredThing("onyx", "ROCK", "csv", "", "FOO", 4L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "", "BAR", 240L, new MapWritable())); + + assertQueryResults(); } @Test - public void testReverse() throws Throwable { - for (Pair p : terms2) { - insertIndex(p); - } + public void testSumCountsForPatterns() throws Exception { + givenQuery("*yx OR b*"); + givenStartDate("20130101"); + givenEndDate("20130102"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("bbc", "NETWORK", "csv", "", "FOO", 480L, new MapWritable())); + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "", "FOO", 110L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "", "FOO", 4L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROOSTER", "csv", "", "BAR", 240L, new MapWritable())); + + assertQueryResults(); + } - Set> matches = Sets.newHashSet(); - for (Iterator it = runTestQuery("*.sky*er"); it.hasNext();) { - DiscoveredThing thing = it.next(); - matches.add(Pair.with(thing.getTerm(), thing.getField())); - } + @Test + public void testSumCountsForPatternAndLiteral() throws Exception { + givenQuery("*er OR trophy"); + givenStartDate("20130102"); + givenEndDate("20130104"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("trophy", "PRIZE", "idem", "", "FOO", 15L, new MapWritable())); + expect(new DiscoveredThing("police officer", "JOB", "idem", "", "FOO", 450L, new MapWritable())); + expect(new DiscoveredThing("ranger", "VEHICLE", "stock", "", "BAR&FOO", 631L, new MapWritable())); + expect(new DiscoveredThing("rooster", "FLOCK", "stock", "", "BAR", 60L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "JOB", "text", "", "BAR", 300L, new MapWritable())); + expect(new DiscoveredThing("skydiver", "OCCUPATION", "text", "", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "", "FOO", 300L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "", "FOO", 300L, new MapWritable())); + + assertQueryResults(); + } + + @Test + public void testSumCountsForFieldedLiterals() throws Exception { + givenQuery("rock:onyx OR pokemon:onyx"); + givenStartDate("20130101"); + givenEndDate("20130104"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "", "FOO", 111L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "", "FOO", 7L, new MapWritable())); + + assertQueryResults(); + } - assertEquals(ImmutableSet.of(Pair.with("xxx.skydiver", "OCCUPATION"), Pair.with("yyy.skydiver", "OCCUPATION")), matches); + @Test + public void testSumCountsForFieldedPatterns() throws Exception { + givenQuery("rock:*n*x OR bird:*r*k"); + givenStartDate("20130101"); + givenEndDate("20130103"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "", "FOO", 900L, new MapWritable())); + expect(new DiscoveredThing("onyx", "ROCK", "csv", "", "FOO", 7L, new MapWritable())); + + assertQueryResults(); + } + + @Test + public void testSumCountsForFieldLiteralAndPattern() throws Exception { + givenQuery("pokemon:onyx OR bird:*r*k"); + givenStartDate("20130101"); + givenEndDate("20130104"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("onyx", "POKEMON", "csv", "", "FOO", 111L, new MapWritable())); + expect(new DiscoveredThing("ruddy duck", "BIRD", "stock", "", 
"FOO", 900L, new MapWritable())); + + assertQueryResults(); } + @Test + public void testSumCountsForReverse() throws Exception { + givenQuery("*.sky*er"); + givenStartDate("20130101"); + givenEndDate("20130104"); + givenParameter(DiscoveryLogic.SUM_COUNTS, "true"); + + expect(new DiscoveredThing("xxx.skydiver", "OCCUPATION", "text", "", "FOO", 400L, new MapWritable())); + expect(new DiscoveredThing("yyy.skydiver", "OCCUPATION", "text", "", "FOO", 400L, new MapWritable())); + + assertQueryResults(); + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/function/KeyToDocumentDataTest.java b/warehouse/query-core/src/test/java/datawave/query/function/KeyToDocumentDataTest.java index d724f58253e..15721a878e1 100644 --- a/warehouse/query-core/src/test/java/datawave/query/function/KeyToDocumentDataTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/function/KeyToDocumentDataTest.java @@ -23,8 +23,6 @@ import datawave.query.iterator.aggregation.DocumentData; import datawave.query.predicate.EventDataQueryFieldFilter; import datawave.query.predicate.EventDataQueryFilter; -import datawave.query.predicate.KeyProjection; -import datawave.query.predicate.Projection; public class KeyToDocumentDataTest { @@ -49,15 +47,13 @@ public void testEventData_defaultEquality_noFilter() { @Test public void testEventData_defaultEquality_withFilter() { - KeyProjection projection = new KeyProjection(Set.of("FIELD_A", "FIELD_B"), Projection.ProjectionType.INCLUDES); - EventDataQueryFilter filter = new EventDataQueryFieldFilter(projection); + EventDataQueryFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_A", "FIELD_B")); KeyToDocumentData data = new KeyToDocumentData(getSource(), equality, filter, false, false).withRangeProvider(rangeProvider); drive(data, getEntry(), 2); assertFields(Set.of("FIELD_A", "FIELD_B")); // exclusive filter should result with nothing - projection = new KeyProjection(Set.of("FIELD_Z"), Projection.ProjectionType.INCLUDES); - filter = new EventDataQueryFieldFilter(projection); + filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_Z")); data = new KeyToDocumentData(getSource(), equality, filter, false, false).withRangeProvider(rangeProvider); drive(data, getEntry(), 0); assertFields(Set.of()); @@ -72,8 +68,7 @@ public void testEventData_TLDEquality_noFilter() { @Test public void testEventData_TLDEquality_withFilter() { - KeyProjection projection = new KeyProjection(Set.of("FIELD_A", "FIELD_B"), Projection.ProjectionType.INCLUDES); - EventDataQueryFilter filter = new EventDataQueryFieldFilter(projection); + EventDataQueryFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_A", "FIELD_B")); KeyToDocumentData data = new KeyToDocumentData(getSource(), tldEquality, filter, false, false).withRangeProvider(rangeProvider); drive(data, getEntry(), 2); assertFields(Set.of("FIELD_A", "FIELD_B")); @@ -89,8 +84,7 @@ public void testTLDData_defaultEquality_noFilter() { @Test public void testTLDData_defaultEquality_withFilter() { - KeyProjection projection = new KeyProjection(Set.of("FIELD_A", "FIELD_B"), Projection.ProjectionType.INCLUDES); - EventDataQueryFilter filter = new EventDataQueryFieldFilter(projection); + EventDataQueryFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_A", "FIELD_B")); KeyToDocumentData data = new KeyToDocumentData(getTLDSource(), equality, filter, false, false).withRangeProvider(tldRangeProvider); drive(data, getEntry(), 2); assertFields(Set.of("FIELD_A", "FIELD_B")); @@ 
-105,8 +99,7 @@ public void testTLDData_TLDEquality_noFilter() { @Test public void testTLDData_TLDEquality_withFilter() { - KeyProjection projection = new KeyProjection(Set.of("FIELD_A", "FIELD_B"), Projection.ProjectionType.INCLUDES); - EventDataQueryFilter filter = new EventDataQueryFieldFilter(projection); + EventDataQueryFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_A", "FIELD_B")); KeyToDocumentData data = new KeyToDocumentData(getTLDSource(), tldEquality, filter, false, false).withRangeProvider(tldRangeProvider); drive(data, getEntry(), 4); assertFields(Set.of("FIELD_A", "FIELD_B")); diff --git a/warehouse/query-core/src/test/java/datawave/query/index/lookup/IndexInfoTest.java b/warehouse/query-core/src/test/java/datawave/query/index/lookup/IndexInfoTest.java index 4f9ddec3d2a..feed357294f 100644 --- a/warehouse/query-core/src/test/java/datawave/query/index/lookup/IndexInfoTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/index/lookup/IndexInfoTest.java @@ -11,10 +11,8 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Set; import org.apache.commons.jexl3.parser.JexlNode; diff --git a/warehouse/query-core/src/test/java/datawave/query/index/lookup/RangeStreamTestX.java b/warehouse/query-core/src/test/java/datawave/query/index/lookup/RangeStreamTestX.java index ef440984a4d..9abd2c30a91 100644 --- a/warehouse/query-core/src/test/java/datawave/query/index/lookup/RangeStreamTestX.java +++ b/warehouse/query-core/src/test/java/datawave/query/index/lookup/RangeStreamTestX.java @@ -343,6 +343,26 @@ public static void setupAccumulo() throws Exception { m.put(new Text("F4"), new Text("20200101_13\0datatype1"), valueForShard); bw.addMutation(m); + // --------------- some entries for post-index sorting via field or term counts + + m = new Mutation("23"); + m.put(new Text("FIELD_A"), new Text("20200101_10\0sort-type"), createValue(23L)); + m.put(new Text("FIELD_B"), new Text("20200101_10\0sort-type"), createValue(23L)); + m.put(new Text("FIELD_C"), new Text("20200101_10\0sort-type"), createValue(23L)); + bw.addMutation(m); + + m = new Mutation("34"); + m.put(new Text("FIELD_A"), new Text("20200101_10\0sort-type"), createValue(34L)); + m.put(new Text("FIELD_B"), new Text("20200101_10\0sort-type"), createValue(34L)); + m.put(new Text("FIELD_C"), new Text("20200101_10\0sort-type"), createValue(34L)); + bw.addMutation(m); + + m = new Mutation("45"); + m.put(new Text("FIELD_A"), new Text("20200101_10\0sort-type"), createValue(45L)); + m.put(new Text("FIELD_B"), new Text("20200101_10\0sort-type"), createValue(45L)); + m.put(new Text("FIELD_C"), new Text("20200101_10\0sort-type"), createValue(45L)); + bw.addMutation(m); + // --------------- bw.flush(); @@ -358,6 +378,21 @@ private static Value buildValueForShard() { return new Value(list.toByteArray()); } + /** + * Create a value with a count + * + * @param count + * the count + * @return a value + */ + private static Value createValue(long count) { + Uid.List.Builder builder = Uid.List.newBuilder(); + builder.setIGNORE(true); + builder.setCOUNT(count); + Uid.List list = builder.build(); + return new Value(list.toByteArray()); + } + // A value that will roll into a day range. 
private static Value buildValueForDay() { Uid.List.Builder builder = Uid.List.newBuilder(); @@ -371,6 +406,10 @@ private static Value buildValueForDay() { public void setupTest() { config = new ShardQueryConfiguration(); config.setClient(client); + + // disable all post-index sort options by default + config.setSortQueryPostIndexWithFieldCounts(false); + config.setSortQueryPostIndexWithTermCounts(false); } // A && B @@ -3400,6 +3439,24 @@ public void testOrAndOrWithDeeplyNestedDelayedTerm() throws Exception { runTest(query, expectedRanges, expectedQueries); } + @Test + public void testSortingByFieldCardinality() { + String query = "FIELD_A == '45' || FIELD_B == '34' || FIELD_C == '23'"; + String expected = "(FIELD_C == '23' || FIELD_B == '34' || FIELD_A == '45')"; + + config.setSortQueryPostIndexWithFieldCounts(true); + drive(query, expected); + } + + @Test + public void testSortingByTermCardinality() { + String query = "FIELD_A == '45' || FIELD_B == '34' || FIELD_C == '23'"; + String expected = "(FIELD_C == '23' || FIELD_B == '34' || FIELD_A == '45')"; + + config.setSortQueryPostIndexWithTermCounts(true); + drive(query, expected); + } + private void runTest(String query, List expectedRanges, List expectedQueries) throws Exception { assertEquals("Expected ranges and queries do not match, ranges: " + expectedRanges.size() + " queries: " + expectedQueries.size(), @@ -3485,4 +3542,53 @@ private void runTest(RangeStream rangeStream, ASTJexlScript script, List if (queryIter.hasNext()) fail("Expected queries still exist after test: " + queryIter.next()); } + + /** + * Drives a query against a subset of the index data to verify post-index sorting options + * + * @param query + * the input query + * @param expected + * the expected query + */ + private void drive(String query, String expected) { + try { + ASTJexlScript script = JexlASTHelper.parseJexlQuery(query); + + SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd"); + config.setBeginDate(sdf.parse("20200101")); + config.setEndDate(sdf.parse("20200105")); + + config.setDatatypeFilter(Sets.newHashSet("sort-type")); + + Multimap> dataTypes = HashMultimap.create(); + dataTypes.putAll("FIELD_A", Sets.newHashSet(new LcNoDiacriticsType())); + dataTypes.putAll("FIELD_B", Sets.newHashSet(new LcNoDiacriticsType())); + dataTypes.putAll("FIELD_C", Sets.newHashSet(new LcNoDiacriticsType())); + + config.setQueryFieldsDatatypes(dataTypes); + config.setIndexedFields(dataTypes); + + MockMetadataHelper helper = new MockMetadataHelper(); + helper.setIndexedFields(dataTypes.keySet()); + + // Run a standard limited-scanner range stream. 
+ ScannerFactory scannerFactory = new ScannerFactory(config); + try (RangeStream rangeStream = new RangeStream(config, scannerFactory, helper)) { + rangeStream.setLimitScanners(true); + + Iterator<QueryPlan> plans = rangeStream.streamPlans(script).iterator(); + + assertTrue(plans.hasNext()); + QueryPlan plan = plans.next(); + + String plannedQuery = plan.getQueryString(); + assertEquals(expected, plannedQuery); + + assertFalse(plans.hasNext()); + } + } catch (Exception e) { + fail("test failed: " + e.getMessage()); + } + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/iterator/QueryOptionsTest.java b/warehouse/query-core/src/test/java/datawave/query/iterator/QueryOptionsTest.java index b83988aaa6d..3d5e0280460 100644 --- a/warehouse/query-core/src/test/java/datawave/query/iterator/QueryOptionsTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/iterator/QueryOptionsTest.java @@ -6,10 +6,13 @@ import static datawave.query.iterator.QueryOptions.FI_FIELD_SEEK; import static datawave.query.iterator.QueryOptions.FI_NEXT_SEEK; import static datawave.query.iterator.QueryOptions.QUERY; +import static datawave.query.iterator.QueryOptions.SEEKING_EVENT_AGGREGATION; import static datawave.query.iterator.QueryOptions.TERM_FREQUENCY_AGGREGATION_THRESHOLD_MS; import static datawave.query.iterator.QueryOptions.TF_FIELD_SEEK; import static datawave.query.iterator.QueryOptions.TF_NEXT_SEEK; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; import java.io.IOException; import java.nio.charset.Charset; @@ -127,6 +130,7 @@ public void testSeekingConfiguration() { optionsMap.put(EVENT_NEXT_SEEK, "13"); optionsMap.put(TF_FIELD_SEEK, "14"); optionsMap.put(TF_NEXT_SEEK, "15"); + optionsMap.put(SEEKING_EVENT_AGGREGATION, "true"); QueryOptions options = new QueryOptions(); @@ -137,6 +141,7 @@ public void testSeekingConfiguration() { assertEquals(-1, options.getEventNextSeek()); assertEquals(-1, options.getTfFieldSeek()); assertEquals(-1, options.getTfNextSeek()); + assertFalse(options.isSeekingEventAggregation()); options.validateOptions(optionsMap); @@ -147,6 +152,7 @@ public void testSeekingConfiguration() { assertEquals(13, options.getEventNextSeek()); assertEquals(14, options.getTfFieldSeek()); assertEquals(15, options.getTfNextSeek()); + assertTrue(options.isSeekingEventAggregation()); } @Test @@ -177,7 +183,7 @@ public void testGetEquality() { } @Test - public void testSimple() throws IOException { + public void testSimple() { Map<String,Set<String>> expected = new HashMap<>(); expected.put("FIELD1", new HashSet<>(Arrays.asList("norm1", "norm2"))); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsDescriptorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsDescriptorTest.java index c5af43d9a6d..a3774a319dd 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsDescriptorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/ContentFunctionsDescriptorTest.java @@ -88,29 +88,31 @@ private void assertHitTermValues(ContentJexlArgumentDescriptor jexlDescriptor, S @Test @SuppressWarnings("unchecked") void testFieldsAndTerms() { - assertFieldsAndTerms(getDescriptor(unfieldedPhrase), new Set[] {Set.of(), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(fieldedPhrase), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")}); -
assertFieldsAndTerms(getDescriptor(multiFieldedPhrase), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")}); + assertFieldsAndTerms(getDescriptor(unfieldedPhrase), Set.of(), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(fieldedPhrase), Set.of("FIELD"), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(multiFieldedPhrase), Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")); - assertFieldsAndTerms(getDescriptor(unfieldedScoredPhrase), new Set[] {Set.of(), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(fieldedScoredPhrase), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(multiFieldedScoredPhrase), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")}); + assertFieldsAndTerms(getDescriptor(unfieldedScoredPhrase), Set.of(), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(fieldedScoredPhrase), Set.of("FIELD"), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(multiFieldedScoredPhrase), Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")); - assertFieldsAndTerms(getDescriptor(unfieldedAdjacent), new Set[] {Set.of(), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(fieldedAdjacent), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(multiFieldedAdjacent), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")}); + assertFieldsAndTerms(getDescriptor(unfieldedAdjacent), Set.of(), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(fieldedAdjacent), Set.of("FIELD"), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(multiFieldedAdjacent), Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")); - assertFieldsAndTerms(getDescriptor(unfieldedWithin), new Set[] {Set.of(), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(fieldedWithin), new Set[] {Set.of("FIELD"), Set.of("foo", "bar")}); - assertFieldsAndTerms(getDescriptor(multiFieldedWithin), new Set[] {Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")}); + assertFieldsAndTerms(getDescriptor(unfieldedWithin), Set.of(), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(fieldedWithin), Set.of("FIELD"), Set.of("foo", "bar")); + assertFieldsAndTerms(getDescriptor(multiFieldedWithin), Set.of("FIELD_A", "FIELD_B"), Set.of("foo", "bar")); } - private void assertFieldsAndTerms(ContentJexlArgumentDescriptor jexlDescriptor, Set[] expected) { - Set[] fieldsAndTerms = jexlDescriptor.fieldsAndTerms(Set.of(), Set.of(), Set.of(), new MutableBoolean(true)); - assertArrayEquals(expected, fieldsAndTerms); + private void assertFieldsAndTerms(ContentJexlArgumentDescriptor jexlDescriptor, Set fields, Set terms) { + ContentFunctionsDescriptor.FieldTerms fieldsAndTerms = jexlDescriptor.fieldsAndTerms(Set.of(), Set.of(), Set.of(), new MutableBoolean(true)); + assertEquals(fields, fieldsAndTerms.getFields()); + assertEquals(terms, fieldsAndTerms.getTerms()); fieldsAndTerms = jexlDescriptor.fieldsAndTerms(Set.of(), Set.of(), Set.of(), new MutableBoolean(true), false); - assertArrayEquals(expected, fieldsAndTerms); + assertEquals(fields, fieldsAndTerms.getFields()); + assertEquals(terms, fieldsAndTerms.getTerms()); } @Test diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/TermFrequencyAggregatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/TermFrequencyAggregatorTest.java index 4ec909f7139..055f339f5c1 100644 --- 
a/warehouse/query-core/src/test/java/datawave/query/jexl/functions/TermFrequencyAggregatorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/functions/TermFrequencyAggregatorTest.java @@ -17,21 +17,17 @@ import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; import org.apache.accumulo.core.iteratorsImpl.system.SortedMapIterator; -import org.apache.commons.jexl3.parser.ParseException; import org.junit.Before; import org.junit.Test; import com.google.common.collect.Maps; -import com.google.common.collect.Sets; import datawave.query.Constants; import datawave.query.attributes.AttributeFactory; import datawave.query.attributes.Document; import datawave.query.data.parsers.DatawaveKey; -import datawave.query.jexl.JexlASTHelper; import datawave.query.predicate.EventDataQueryFieldFilter; import datawave.query.predicate.EventDataQueryFilter; -import datawave.query.predicate.Projection; import datawave.query.util.TypeMetadata; public class TermFrequencyAggregatorTest { @@ -57,7 +53,7 @@ public void apply_buildDocNotKeep() throws IOException { Set keepFields = new HashSet<>(); keepFields.add("FIELD2"); - EventDataQueryFilter filter = new EventDataQueryFieldFilter(Sets.newHashSet("FIELD1"), Projection.ProjectionType.EXCLUDES); + EventDataQueryFilter filter = new EventDataQueryFieldFilter().withFields(Collections.singleton("FIELD1")); aggregator = new TermFrequencyAggregator(keepFields, filter, -1); Key result = aggregator.apply(itr, doc, attributeFactory); @@ -74,7 +70,7 @@ public void apply_buildDocNotKeep() throws IOException { } @Test - public void apply_buildDocKeep() throws IOException, ParseException { + public void apply_buildDocKeep() throws IOException { Document doc = new Document(); AttributeFactory attributeFactory = new AttributeFactory(new TypeMetadata()); @@ -88,7 +84,7 @@ public void apply_buildDocKeep() throws IOException, ParseException { Set keepFields = new HashSet<>(); keepFields.add("FIELD1"); - EventDataQueryFilter filter = new EventDataQueryFieldFilter(JexlASTHelper.parseJexlQuery("FIELD1 == 'VALUE1'"), Collections.emptySet()); + EventDataQueryFilter filter = new EventDataQueryFieldFilter().withFields(Collections.singleton("FIELD1")); aggregator = new TermFrequencyAggregator(keepFields, filter, -1); Key result = aggregator.apply(itr, doc, attributeFactory); @@ -111,7 +107,7 @@ public void apply_buildDocKeep() throws IOException, ParseException { } @Test - public void apply_buildDocKeepFilteredOut() throws IOException, ParseException { + public void apply_buildDocKeepFilteredOut() throws IOException { Document doc = new Document(); AttributeFactory attributeFactory = new AttributeFactory(new TypeMetadata()); @@ -125,7 +121,7 @@ public void apply_buildDocKeepFilteredOut() throws IOException, ParseException { Set keepFields = new HashSet<>(); keepFields.add("FIELD2"); - EventDataQueryFilter filter = new EventDataQueryFieldFilter(JexlASTHelper.parseJexlQuery("FIELD2 == 'VALUE1'"), Collections.EMPTY_SET); + EventDataQueryFilter filter = new EventDataQueryFieldFilter().withFields(Collections.singleton("FIELD2")); aggregator = new TermFrequencyAggregator(keepFields, filter, -1); Key result = aggregator.apply(itr, doc, attributeFactory); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/BoundedRangeIndexLookupTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/BoundedRangeIndexLookupTest.java new file mode 100644 index 00000000000..2ec75fd13e3 --- /dev/null +++ 
b/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/BoundedRangeIndexLookupTest.java @@ -0,0 +1,314 @@ +package datawave.query.jexl.lookups; + +import static org.easymock.EasyMock.createMock; +import static org.easymock.EasyMock.eq; +import static org.easymock.EasyMock.expect; +import static org.easymock.EasyMock.isA; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.powermock.api.easymock.PowerMock.replayAll; +import static org.powermock.api.easymock.PowerMock.verifyAll; + +import java.text.SimpleDateFormat; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.BatchWriter; +import org.apache.accumulo.core.client.BatchWriterConfig; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.client.security.tokens.PasswordToken; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Mutation; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.minicluster.MiniAccumuloCluster; +import org.easymock.EasyMockSupport; +import org.junit.After; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import datawave.microservice.query.QueryImpl; +import datawave.query.config.ShardQueryConfiguration; +import datawave.query.iterator.SortedListKeyValueIterator; +import datawave.query.jexl.LiteralRange; +import datawave.query.scanner.LocalBatchScanner; +import datawave.query.tables.ScannerFactory; +import datawave.util.TableName; +import datawave.util.time.DateHelper; + +public class BoundedRangeIndexLookupTest extends EasyMockSupport { + + @ClassRule + public static TemporaryFolder temporaryFolder = new TemporaryFolder(); + + private static final String PASSWORD = "password"; + + private static final String shard = "2024070"; + private static final Set fields = Set.of("FIELD_A", "FIELD_B", "FIELD_C", "FIELD_D", "FIELD_E"); + private static final Set datatypes = Set.of("datatype-a", "datatype-b", "datatype-c", "datatype-d", "datatype-e"); + + private static MiniAccumuloCluster cluster; + private static AccumuloClient client; + private ExecutorService executorService; + + private ShardQueryConfiguration config; + private ScannerFactory scannerFactory; + + private final SortedSet expected = new TreeSet<>(); + + // variables for large row test + private BoundedRangeIndexLookup largeLookup; + private ShardQueryConfiguration largeConfig; + private ScannerFactory largeScannerFactory; + + @BeforeClass + public static void setupClass() throws Exception { + cluster = new MiniAccumuloCluster(temporaryFolder.newFolder(), PASSWORD); + cluster.start(); + + client = cluster.createAccumuloClient("root", new PasswordToken(PASSWORD)); + + writeData(); + } + + @Before + public void setup() { + scannerFactory = new ScannerFactory(client); + + config = new ShardQueryConfiguration(); + config.setClient(client); + + executorService = Executors.newFixedThreadPool(5); + + expected.clear(); + + // large lookup + largeConfig = new ShardQueryConfiguration(); + 
largeScannerFactory = createMock(ScannerFactory.class); + } + + @After + public void teardown() { + executorService.shutdownNow(); + } + + public static void writeData() throws Exception { + client.tableOperations().create(TableName.SHARD_INDEX); + + int numTerms = 25; + + try (BatchWriter bw = client.createBatchWriter(TableName.SHARD_INDEX, new BatchWriterConfig())) { + for (int i = 0; i < numTerms; i++) { + Mutation m = new Mutation("value-" + i); + for (String field : fields) { + for (int j = 0; j < 10; j++) { + for (String datatype : datatypes) { + for (int k = 0; k < 5; k++) { + m.put(field, shard + j + '_' + k + '\u0000' + datatype, new Value()); + } + } + } + } + bw.addMutation(m); + } + } + } + + @Test + public void testSingleDay_singleValue() { + withDateRange("20240701", "20240701"); + withDatatypeFilter(Set.of("datatype-b")); + withExpected(Set.of("value-1")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-1"); + test(lookup, "FIELD_A"); + } + + @Test + public void testSingleDay_multiValue() { + withDateRange("20240701", "20240701"); + withExpected(Set.of("value-10", "value-12", "value-11", "value-14", "value-13", "value-16", "value-15", "value-18", "value-17", "value-19", "value-1", + "value-2")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-2"); + test(lookup, "FIELD_A"); + } + + @Test + public void testSingleDay_allValues() { + withDateRange("20240701", "20240701"); + withExpected(createAllValues(1, 25)); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-9"); + test(lookup, "FIELD_A"); + } + + @Test + public void testMultiDay_singleValue() { + withDateRange("20240701", "20240703"); + withExpected(Set.of("value-1")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-1"); + test(lookup, "FIELD_A"); + } + + @Test + public void testMultiDay_multiValue() { + withDateRange("20240701", "20240703"); + withExpected(Set.of("value-3", "value-4", "value-5")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-3", "value-5"); + test(lookup, "FIELD_A"); + } + + @Test + public void testMultiDay_allValues() { + withDateRange("20240701", "20240703"); + withExpected(createAllValues(1, 25)); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-9"); + test(lookup, "FIELD_A"); + } + + @Test + public void testAllDays_singleValue() { + withDateRange("20240701", "20240709"); + withExpected(Set.of("value-1")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-1"); + test(lookup, "FIELD_A"); + } + + @Test + public void testAllDays_multiValue() { + withDateRange("20240701", "20240709"); + withExpected(Set.of("value-21", "value-3", "value-2", "value-20", "value-23", "value-22", "value-24")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-2", "value-3"); + test(lookup, "FIELD_A"); + } + + @Test + public void testAllDays_allValues() { + withDateRange("20240701", "20240709"); + withExpected(createAllValues(1, 25)); + + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-9"); + test(lookup, "FIELD_A"); + } + + @Test + public void testInvalidDateRange() { + withDateRange("20240808", "20240909"); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-1"); + test(lookup, "FIELD_A"); + } + + @Test + public void testInvalidBoundedRange() { + withDateRange("20240701", "20240709"); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "abc", "def"); + test(lookup, 
"FIELD_A"); + } + + @Test + public void testInvalidField() { + withDateRange("20240701", "20240709"); + BoundedRangeIndexLookup lookup = createLookup("FIELD_Z", "value-1", "value-1"); + test(lookup, "FIELD_Z"); + } + + @Test + public void testInvalidDataTypeFilter() { + withDateRange("20240701", "20240709"); + withDatatypeFilter(Set.of("datatype-z")); + BoundedRangeIndexLookup lookup = createLookup("FIELD_A", "value-1", "value-1"); + test(lookup, "FIELD_A"); + } + + private void test(BoundedRangeIndexLookup lookup, String field) { + lookup.submit(); + + IndexLookupMap lookupMap = lookup.lookup(); + + if (expected.isEmpty()) { + assertTrue(lookupMap.keySet().isEmpty()); + } else { + assertTrue(lookupMap.containsKey(field)); + Set values = new HashSet<>(lookupMap.get(field)); + assertEquals(expected, values); + } + } + + private BoundedRangeIndexLookup createLookup(String field, String lower, String upper) { + LiteralRange range = new LiteralRange<>(lower, true, upper, true, field, LiteralRange.NodeOperand.AND); + return createLookup(range); + } + + private BoundedRangeIndexLookup createLookup(LiteralRange range) { + return new BoundedRangeIndexLookup(config, scannerFactory, range, executorService); + } + + private void withDateRange(String start, String end) { + assertNotNull(config); + config.setBeginDate(DateHelper.parse(start)); + config.setEndDate(DateHelper.parse(end)); + } + + private void withDatatypeFilter(Set datatypes) { + assertNotNull(config); + config.setDatatypeFilter(datatypes); + } + + private void withExpected(Set expected) { + assertTrue("should only set expected values once per test", this.expected.isEmpty()); + this.expected.addAll(expected); + } + + private Set createAllValues(int start, int stop) { + Set values = new HashSet<>(); + for (int i = start; i < stop; i++) { + values.add("value-" + i); + } + return values; + } + + @Test + public void largeRowInBoundedRangeTest() throws TableNotFoundException { + ExecutorService s = Executors.newSingleThreadExecutor(); + Date begin = new Date(); + Date end = new Date(); + config.setBeginDate(begin); + config.setEndDate(end); + config.setNumQueryThreads(1); + // defaults to 5000 + config.setMaxValueExpansionThreshold(1); + SimpleDateFormat sdf = new SimpleDateFormat("YYYYMMdd"); + LiteralRange range = new LiteralRange("R", true, "S", false, "FOO", LiteralRange.NodeOperand.OR); + largeLookup = new BoundedRangeIndexLookup(config, largeScannerFactory, range, s); + // create index data to iterate over + List> src = new ArrayList<>(); + for (int i = 0; i < 10000; i++) { + src.add(new AbstractMap.SimpleImmutableEntry<>(new Key("R" + i, "FOO", sdf.format(begin) + "_1" + '\0' + "myDataType"), new Value())); + } + SortedListKeyValueIterator itr = new SortedListKeyValueIterator(src); + LocalBatchScanner scanner = new LocalBatchScanner(itr, true); + // add expects for the scanner factory + expect(largeScannerFactory.newScanner(eq("shardIndex"), isA(Set.class), eq(1), isA(QueryImpl.class), eq("shardIndex"))).andAnswer(() -> scanner); + expect(largeScannerFactory.close(scanner)).andReturn(true); + replayAll(); + largeLookup.submit(); + IndexLookupMap map = largeLookup.lookup(); + // verify we went over all the data even though the threshold was lower than this + assertEquals(10001, scanner.getSeekCount()); // with new iterator this is initial seek + one seek per unique row in the range + // this represents data collapsed and sent back to the client by the WholeRowIterator + assertEquals(0, scanner.getNextCount()); // no next cals with 
seeking filter + assertTrue(map.get("FOO").isThresholdExceeded()); + verifyAll(); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/FieldNameIndexLookupTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/FieldNameIndexLookupTest.java new file mode 100644 index 00000000000..5861c8af516 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/lookups/FieldNameIndexLookupTest.java @@ -0,0 +1,123 @@ +package datawave.query.jexl.lookups; + +import static org.easymock.EasyMock.anyObject; +import static org.easymock.EasyMock.expect; +import static org.easymock.EasyMock.expectLastCall; +import static org.easymock.EasyMock.isA; + +import java.lang.reflect.InvocationTargetException; +import java.util.Date; +import java.util.HashSet; +import java.util.Set; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import org.easymock.EasyMockSupport; +import org.junit.Before; +import org.junit.Test; + +import datawave.microservice.query.QueryImpl; +import datawave.query.config.ShardQueryConfiguration; +import datawave.query.tables.AnyFieldScanner; +import datawave.query.tables.ScannerFactory; +import datawave.query.tables.SessionOptions; + +public class FieldNameIndexLookupTest extends EasyMockSupport { + private FieldNameIndexLookup lookup; + private ShardQueryConfiguration config; + private ScannerFactory scannerFactory; + private ExecutorService executorService; + + private Set fields = new HashSet<>(); + private Set terms = new HashSet<>(); + + @Before + public void setup() { + config = new ShardQueryConfiguration(); + scannerFactory = createMock(ScannerFactory.class); + executorService = createMock(ExecutorService.class); + + fields = new HashSet<>(); + terms = new HashSet<>(); + } + + @Test + public void initTest() { + lookup = new FieldNameIndexLookup(config, scannerFactory, fields, terms, executorService); + } + + @Test(expected = RuntimeException.class) + @SuppressWarnings({"unchecked", "ConstantConditions"}) + public void submitErrorEnsureCloseTest() throws InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException { + AnyFieldScanner scannerSession = createMock(AnyFieldScanner.class); + + Date begin = new Date(); + Date end = new Date(); + config.setBeginDate(begin); + config.setEndDate(end); + + fields.add("f1"); + fields.add("f2"); + terms.add("lookMeUp"); + lookup = new FieldNameIndexLookup(config, scannerFactory, fields, terms, executorService); + + expect(scannerFactory.newLimitedScanner(isA(Class.class), isA(String.class), isA(Set.class), isA(QueryImpl.class), isA(String.class))) + .andReturn(scannerSession).anyTimes(); + expect(scannerSession.setRanges(anyObject())).andReturn(scannerSession); + expect(scannerSession.setOptions(anyObject())).andReturn(scannerSession); + expect(scannerSession.getOptions()).andAnswer(SessionOptions::new).anyTimes(); + // this is sort of contrived, but necessary to test that the cleanup of the batch scanner would actually happen + expect(executorService.submit(isA(Callable.class))).andThrow(new RuntimeException("testing")); + scannerSession.close(); + + replayAll(); + + lookup.submit(); + + verifyAll(); + } + + @Test + @SuppressWarnings({"unchecked", "ConstantConditions"}) + public void timeoutTest() throws InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException { + AnyFieldScanner scannerSession = createMock(AnyFieldScanner.class); + 
+ ExecutorService s = Executors.newSingleThreadExecutor(); + + Date begin = new Date(); + Date end = new Date(); + config.setBeginDate(begin); + config.setEndDate(end); + + config.setMaxAnyFieldScanTimeMillis(1); + config.setMaxIndexScanTimeMillis(1); + + fields.add("f1"); + fields.add("f2"); + terms.add("lookMeUp"); + lookup = new FieldNameIndexLookup(config, scannerFactory, fields, terms, s); + + expect(scannerFactory.newLimitedScanner(isA(Class.class), isA(String.class), isA(Set.class), isA(QueryImpl.class), isA(String.class))) + .andReturn(scannerSession); + expect(scannerSession.setRanges(anyObject())).andReturn(scannerSession); + expect(scannerSession.setOptions(anyObject())).andReturn(scannerSession); + expect(scannerSession.getOptions()).andAnswer(SessionOptions::new).anyTimes(); + + expect(scannerSession.hasNext()).andAnswer(() -> { + Thread.sleep(100000); + return true; + }); + + scannerFactory.close(scannerSession); + // once inside lookup() and another inside the runnable + expectLastCall().times(2); + + replayAll(); + + lookup.submit(); + lookup.lookup(); + + verifyAll(); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultJexlNodeComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultJexlNodeComparatorTest.java new file mode 100644 index 00000000000..9cf025557aa --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultJexlNodeComparatorTest.java @@ -0,0 +1,49 @@ +package datawave.query.jexl.nodes; + +import org.junit.Test; + +public class DefaultJexlNodeComparatorTest extends NodeComparatorTestUtil { + + private final JexlNodeComparator comparator = new DefaultJexlNodeComparator(); + + @Test + public void testSortSameFieldDifferentValues() { + String query = "FOO == 'baz' || FOO == 'bar'"; + String expected = "FOO == 'bar' || FOO == 'baz'"; + drive(query, expected, comparator); + } + + @Test + public void testDifferentFieldSameValues() { + String query = "FOO_B == 'baz' || FOO_A == 'baz'"; + String expected = "FOO_A == 'baz' || FOO_B == 'baz'"; + drive(query, expected, comparator); + } + + @Test + public void testSortOrderWithNodePairs() { + // EQ before NE + String query = "FOO != 'bar' || FOO == 'bar'"; + String expected = "FOO == 'bar' || FOO != 'bar'"; + drive(query, expected, comparator); + } + + @Test + public void testSortSingleNodesBeforeJunctions() { + String query = "(FOO == 'bar' && FOO == 'baz') || FOO == 'fizz'"; + String expected = "FOO == 'fizz' || (FOO == 'bar' && FOO == 'baz')"; + drive(query, expected, comparator); + + query = "(FOO == 'bar' || FOO == 'baz') && FOO == 'fizz'"; + expected = "FOO == 'fizz' && (FOO == 'bar' || FOO == 'baz')"; + drive(query, expected, comparator); + } + + @Test + public void testMarkersSortLast() { + String query = "B == '2' && ((_Value_ = true) && (A =~ 'ba.*')) && A == '1'"; + String expected = "A == '1' && B == '2' && ((_Value_ = true) && (A =~ 'ba.*'))"; + drive(query, expected, comparator); + } + +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultNodeCostComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultNodeCostComparatorTest.java deleted file mode 100644 index 279b8f6743f..00000000000 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/DefaultNodeCostComparatorTest.java +++ /dev/null @@ -1,80 +0,0 @@ -package datawave.query.jexl.nodes; - -import static org.junit.Assert.assertEquals; - -import java.util.Iterator; -import 
java.util.LinkedList; -import java.util.List; - -import org.apache.commons.jexl3.parser.JexlNode; -import org.junit.Test; - -import datawave.query.jexl.JexlNodeFactory; - -public class DefaultNodeCostComparatorTest { - - @Test - public void testCompareTwoEq() { - JexlNode left = JexlNodeFactory.buildEQNode("FOO", "bar"); - JexlNode right = JexlNodeFactory.buildEQNode("FOO", "baz"); - - List nodes = new LinkedList<>(); - nodes.add(left); - nodes.add(right); - - Iterator iter = nodes.iterator(); - assertEquals(left, iter.next()); - assertEquals(right, iter.next()); - - nodes.sort(new DefaultNodeCostComparator()); - - // Order should not have changed - iter = nodes.iterator(); - assertEquals(left, iter.next()); - assertEquals(right, iter.next()); - } - - @Test - public void testCompareEqAndRe() { - JexlNode left = JexlNodeFactory.buildEQNode("FOO", "bar"); - JexlNode right = JexlNodeFactory.buildERNode("FOO", "baz.*"); - - List nodes = new LinkedList<>(); - nodes.add(right); - nodes.add(left); - - // Assert insert order - Iterator iter = nodes.iterator(); - assertEquals(right, iter.next()); - assertEquals(left, iter.next()); - - nodes.sort(new DefaultNodeCostComparator()); - - // Assert proper sort order, EQ before ER - iter = nodes.iterator(); - assertEquals(left, iter.next()); - assertEquals(right, iter.next()); - } - - @Test - public void testCompareEqAndFunction() { - JexlNode left = JexlNodeFactory.buildEQNode("FOO", "bar"); - JexlNode right = JexlNodeFactory.buildFunctionNode("content", "phrase", "FOO", "baz"); - - List nodes = new LinkedList<>(); - nodes.add(right); - nodes.add(left); - - // Assert insert order - Iterator iter = nodes.iterator(); - assertEquals(right, iter.next()); - assertEquals(left, iter.next()); - - nodes.sort(new DefaultNodeCostComparator()); - - // Assert proper sort order, EQ before ER - iter = nodes.iterator(); - assertEquals(left, iter.next()); - assertEquals(right, iter.next()); - } -} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/FieldCostComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/FieldCostComparatorTest.java new file mode 100644 index 00000000000..ecc2002e07d --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/FieldCostComparatorTest.java @@ -0,0 +1,147 @@ +package datawave.query.jexl.nodes; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.junit.jupiter.api.Test; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; +import datawave.query.jexl.visitors.order.OrderByCostVisitor; +import datawave.query.util.count.CountMap; + +/** + * Tests for the {@link FieldCostComparator} + */ +public class FieldCostComparatorTest extends NodeComparatorTestUtil { + + private CountMap counts; + private JexlNodeComparator comparator; + + // sort when all fields present in map + @Test + public void testAllFieldsHaveCardinality() { + String query = "F23 == '23' || F12 == '12'"; + String expected = "F12 == '12' || F23 == '23'"; + + drive(query, expected, getComparator()); + } + + // sort when some fields present in map + @Test + public void testSomeFieldsHaveCardinality() { + // F11 is not found in the count map, should get sorted to the right + String query = "F11 == '11' || F12 == '12'"; + String expected = "F12 == '12' || F11 == '11'"; + + drive(query, expected, getComparator()); + } + + // 
sort when no fields are present in map (default ordering) + @Test + public void testNoFieldsHaveCardinality() { + String query = "F2 == '2' || F1 == '1' || F2 == '1'"; + String expected = "F1 == '1' || F2 == '1' || F2 == '2'"; + + drive(query, expected, getComparator()); + } + + // sort with leaves and unions + + @Test + public void testJunctionSortsLeftOfHighCostLeaf() { + String query = "F45 == '45' && (F12 == '12' || F23 == '23')"; + String expected = "(F12 == '12' || F23 == '23') && F45 == '45'"; + drive(query, expected, getComparator()); + } + + @Test + public void testIntersectionSortsRightWithUniformCosts() { + // because intersections take the lowest cost, if a leaf joins with a junction + // and the leaf shares the lowest cost node in the junction, you get a tie + String query = "(F12 == '12' && F23 == '23') || F12 == '12'"; + String expected = "F12 == '12' || (F12 == '12' && F23 == '23')"; + drive(query, expected, getComparator()); + } + + // sort with leaves or junctions + + // sort with unions of variable sizes + @Test + public void testSortUnionsOfVariableSizeAndCost() { + // lower cardinality unions should sort first even if it has more terms + String query = "(F45 == '45' || F45 == '45') && (F12 == '12' || F12 == '12' || F12 == '12')"; + String expected = "(F12 == '12' || F12 == '12' || F12 == '12') && (F45 == '45' || F45 == '45')"; + drive(query, expected, getComparator()); + } + + // sort with intersections of variable sizes + @Test + public void testSortIntersectionsOfVariableSizeAndCost() { + // lower cardinality intersections should sort first even if it has more terms + String query = "(F45 == '45' && F45 == '45') || (F12 == '12' && F12 == '12' && F12 == '12')"; + String expected = "(F12 == '12' && F12 == '12' && F12 == '12') || (F45 == '45' && F45 == '45')"; + drive(query, expected, getComparator()); + } + + // test integer overflow with multiple negation nodes + @Test + public void testNestedUnionOfNegatedTermsSortsLast() { + String query = "(!(F == '1') || !(F == '1')) && F12 == '12'"; + String expected = "F12 == '12' && (!(F == '1') || !(F == '1'))"; + drive(query, expected, getComparator()); + + query = "(F != '1' || F != '1') && F12 == '12'"; + expected = "F12 == '12' && (F != '1' || F != '1')"; + drive(query, expected, getComparator()); + } + + // test integer overflow with multiple marker nodes + @Test + public void testAvoidIntegerOverFlowWithMultipleMarkerNodes() { + String query = "((_Value_ = true) && (F =~ 'aa.*')) && ((_Value_ = true) && (F =~ 'bb.*')) && F == '2'"; + String expected = "F == '2' && ((_Value_ = true) && (F =~ 'aa.*')) && ((_Value_ = true) && (F =~ 'bb.*'))"; + drive(query, expected, getComparator()); + } + + /** + * Explicit override of test utility code so the {@link OrderByCostVisitor} can be run + * + * @param query + * the input query + * @param expected + * the expected query + * @param comparator + * the comparator + */ + @Override + public void drive(String query, String expected, JexlNodeComparator comparator) { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + script = OrderByCostVisitor.orderByFieldCount(script, getCounts().getCounts()); + String ordered = JexlStringBuildingVisitor.buildQuery(script); + assertEquals(expected, ordered); + } catch (Exception e) { + fail("Failed to run test", e); + } + } + + private JexlNodeComparator getComparator() { + if (comparator == null) { + comparator = new FieldCostComparator(getCounts()); + } + return comparator; + } + + private CountMap getCounts() { + if 
(counts == null) { + counts = new CountMap(); + counts.put("F12", 12L); + counts.put("F23", 23L); + counts.put("F34", 34L); + counts.put("F45", 45L); + } + return counts; + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/JunctionComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/JunctionComparatorTest.java new file mode 100644 index 00000000000..6bb1561d140 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/JunctionComparatorTest.java @@ -0,0 +1,190 @@ +package datawave.query.jexl.nodes; + +import org.junit.jupiter.api.Test; + +/** + * Tests for the {@link JunctionComparator} to verify that leaf nodes sort before junctions + */ +public class JunctionComparatorTest extends NodeComparatorTestUtil { + + private final JexlNodeComparator comparator = new JunctionComparator(); + + /** + * Test that asserts no changes to queries of the following types + *
<pre>
+ * A && B
+ * </pre>
+ */ + @Test + public void testIntersectionOfLeafNodes() { + // @formatter:off + String[] queries = new String[] { + "F == '1' && F == '2'", // eq + "F != '1' && F == '2'", // ne + "F < '1' && F == '2'", // lt + "F > '1' && F == '2'", // gt + "F <= '1' && F == '2'", // le + "F >= '1' && F == '2'", // ge + "F =~ '1' && F == '2'", // er + "F !~ '1' && F == '2'", // nr + "!(F == '1') && F == '2'", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + } + + // A || B + @Test + public void testUnionOfLeafNodes() { + // @formatter:off + String[] queries = new String[] { + "F == '1' || F == '2'", // eq + "F != '1' || F == '2'", // ne + "F < '1' || F == '2'", // lt + "F > '1' || F == '2'", // gt + "F <= '1' || F == '2'", // le + "F >= '1' || F == '2'", // ge + "F =~ '1' || F == '2'", // er + "F !~ '1' || F == '2'", // nr + "!(F == '1') || F == '2'", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + } + + // A && (B || C) + @Test + public void testIntersectionWithNestedUnion() { + // first, assert queries with no change + // @formatter:off + String[] queries = new String[] { + "F == '1' && (F == '2' || F == '3')", // eq + "F != '1' && (F == '2' || F == '3')", // ne + "F < '1' && (F == '2' || F == '3')", // lt + "F > '1' && (F == '2' || F == '3')", // gt + "F <= '1' && (F == '2' || F == '3')", // le + "F >= '1' && (F == '2' || F == '3')", // ge + "F =~ '1' && (F == '2' || F == '3')", // er + "F !~ '1' && (F == '2' || F == '3')", // nr + "!(F == '1') && (F == '2' || F == '3')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + + // next, assert queries with change to sort order + // @formatter:off + String[][] sortable = new String[][] { + {"(F == '2' || F == '3') && F == '1'", "F == '1' && (F == '2' || F == '3')"}, // eq + {"(F == '2' || F == '3') && F != '1'", "F != '1' && (F == '2' || F == '3')"}, // ne + {"(F == '2' || F == '3') && F < '1'", "F < '1' && (F == '2' || F == '3')"}, // lt + {"(F == '2' || F == '3') && F > '1'", "F > '1' && (F == '2' || F == '3')"}, // gt + {"(F == '2' || F == '3') && F <= '1'", "F <= '1' && (F == '2' || F == '3')"}, // le + {"(F == '2' || F == '3') && F >= '1'", "F >= '1' && (F == '2' || F == '3')"}, // ge + {"(F == '2' || F == '3') && F =~ '1'", "F =~ '1' && (F == '2' || F == '3')"}, // er + {"(F == '2' || F == '3') && F !~ '1'", "F !~ '1' && (F == '2' || F == '3')"}, // nr + {"(F == '2' || F == '3') && !(F == '1')", "!(F == '1') && (F == '2' || F == '3')"} // not + }; + // @formatter:off + + for (String[] query : sortable) { + drive(query[0], query[1], comparator); + } + } + + // A || (B && C) + @Test + public void testUnionWithNestedIntersection() { + // first, assert queries with no change + // @formatter:off + String[] queries = new String[] { + "F == '1' || (F == '2' && F == '3')", // eq + "F != '1' || (F == '2' && F == '3')", // ne + "F < '1' || (F == '2' && F == '3')", // lt + "F > '1' || (F == '2' && F == '3')", // gt + "F <= '1' || (F == '2' && F == '3')", // le + "F >= '1' || (F == '2' && F == '3')", // ge + "F =~ '1' || (F == '2' && F == '3')", // er + "F !~ '1' || (F == '2' && F == '3')", // nr + "!(F == '1') || (F == '2' && F == '3')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + + // next, assert queries with change to sort order + // @formatter:off + String[][] sortable = new String[][] { + {"(F == '2' && F == '3') || F == 
'1'", "F == '1' || (F == '2' && F == '3')"}, // eq + {"(F == '2' && F == '3') || F != '1'", "F != '1' || (F == '2' && F == '3')"}, // ne + {"(F == '2' && F == '3') || F < '1'", "F < '1' || (F == '2' && F == '3')"}, // lt + {"(F == '2' && F == '3') || F > '1'", "F > '1' || (F == '2' && F == '3')"}, // gt + {"(F == '2' && F == '3') || F <= '1'", "F <= '1' || (F == '2' && F == '3')"}, // le + {"(F == '2' && F == '3') || F >= '1'", "F >= '1' || (F == '2' && F == '3')"}, // ge + {"(F == '2' && F == '3') || F =~ '1'", "F =~ '1' || (F == '2' && F == '3')"}, // er + {"(F == '2' && F == '3') || F !~ '1'", "F !~ '1' || (F == '2' && F == '3')"}, // nr + {"(F == '2' && F == '3') || !(F == '1')", "!(F == '1') || (F == '2' && F == '3')"} // not + }; + // @formatter:off + + for (String[] query : sortable) { + drive(query[0], query[1], comparator); + } + } + + // (A || B) && (C || D) + @Test + public void testIntersectionOfNestedUnions() { + // assert no changes + // @formatter:off + String[] queries = new String[] { + "(F == '1' || F == '2') && (F == '3' || F == '4')", // eq + "(F == '1' || F != '2') && (F != '3' || F == '4')", // ne + "(F == '1' || F < '2') && (F < '3' || F == '4')", // lt + "(F == '1' || F > '2') && (F > '3' || F == '4')", // gt + "(F == '1' || F <= '2') && (F <= '3' || F == '4')", // le + "(F == '1' || F >= '2') && (F >= '3' || F == '4')", // ge + "(F == '1' || F =~ '2') && (F =~ '3' || F == '4')", // er + "(F == '1' || F !~ '2') && (F !~ '3' || F == '4')", // nr + "(F == '1' || !(F == '2')) && (!(F == '3') || F == '4')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + } + + // (A && B) || (C && D) + @Test + public void testUnionOfNestedIntersections() { + // assert no changes + // @formatter:off + String[] queries = new String[] { + "(F == '1' && F == '2') || (F == '3' && F == '4')", // eq + "(F == '1' && F != '2') || (F != '3' && F == '4')", // ne + "(F == '1' && F < '2') || (F < '3' && F == '4')", // lt + "(F == '1' && F > '2') || (F > '3' && F == '4')", // gt + "(F == '1' && F <= '2') || (F <= '3' && F == '4')", // le + "(F == '1' && F >= '2') || (F >= '3' && F == '4')", // ge + "(F == '1' && F =~ '2') || (F =~ '3' && F == '4')", // er + "(F == '1' && F !~ '2') || (F !~ '3' && F == '4')", // nr + "(F == '1' && !(F == '2')) || (!(F == '3') && F == '4')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/LexicographicalNodeComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/LexicographicalNodeComparatorTest.java new file mode 100644 index 00000000000..22b168aa4cd --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/LexicographicalNodeComparatorTest.java @@ -0,0 +1,247 @@ +package datawave.query.jexl.nodes; + +import org.junit.jupiter.api.Test; + +/** + * Tests for the {@link LexicographicalNodeComparator} to verify expected sorts with different fields and values + */ +public class LexicographicalNodeComparatorTest extends NodeComparatorTestUtil { + + private final JexlNodeComparator comparator = new LexicographicalNodeComparator(); + + // same node, same field, same value + @Test + public void testSameNodeType_sameField_sameValue() { + // assert no changes + // @formatter:off + String[] queries = new String[] { + // intersections + "F == '1' && F == '1'", // eq + "F != '1' && F != '1'", // ne + "F < '1' && F < '1'", // lt + "F > '1' 
&& F > '1'", // gt + "F <= '1' && F <= '1'", // le + "F >= '1' && F >= '1'", // ge + "F =~ '1' && F =~ '1'", // er + "F !~ '1' && F !~ '1'", // nr + "!(F == '1') && !(F == '1')", // not + // unions + "F == '1' || F == '1'", // eq + "F != '1' || F != '1'", // ne + "F < '1' || F < '1'", // lt + "F > '1' || F > '1'", // gt + "F <= '1' || F <= '1'", // le + "F >= '1' || F >= '1'", // ge + "F =~ '1' || F =~ '1'", // er + "F !~ '1' || F !~ '1'", // nr + "!(F == '1') || !(F == '1')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + } + + // same node, same field, different values + @Test + public void testSameNodeType_sameField_differentValue() { + // different values, correct order, no change + // @formatter:off + String[] queries = new String[] { + // intersections + "F == '1' && F == '2'", // eq + "F != '1' && F != '2'", // ne + "F < '1' && F < '2'", // lt + "F > '1' && F > '2'", // gt + "F <= '1' && F <= '2'", // le + "F >= '1' && F >= '2'", // ge + "F =~ '1' && F =~ '2'", // er + "F !~ '1' && F !~ '2'", // nr + "!(F == '1') && !(F == '2')", // not + // unions + "F == '1' || F == '2'", // eq + "F != '1' || F != '2'", // ne + "F < '1' || F < '2'", // lt + "F > '1' || F > '2'", // gt + "F <= '1' || F <= '2'", // le + "F >= '1' || F >= '2'", // ge + "F =~ '1' || F =~ '2'", // er + "F !~ '1' || F !~ '2'", // nr + "!(F == '1') || !(F == '2')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + + // different values, incorrect order, expect changes + + // @formatter:off + String[][] sortable = new String[][] { + // intersections + {"F == '2' && F == '1'", "F == '1' && F == '2'"}, // eq + {"F != '2' && F != '1'", "F != '1' && F != '2'"}, // ne + {"F < '2' && F < '1'", "F < '1' && F < '2'"}, // lt + {"F > '2' && F > '1'", "F > '1' && F > '2'"}, // gt + {"F <= '2' && F <= '1'", "F <= '1' && F <= '2'"}, // le + {"F >= '2' && F >= '1'", "F >= '1' && F >= '2'"}, // ge + {"F =~ '2' && F =~ '1'", "F =~ '1' && F =~ '2'"}, // er + {"F !~ '2' && F !~ '1'", "F !~ '1' && F !~ '2'"}, // nr + {"!(F == '2') && !(F == '1')", "!(F == '1') && !(F == '2')"}, // not + // unions + {"F == '2' || F == '1'", "F == '1' || F == '2'"}, // eq + {"F != '2' || F != '1'", "F != '1' || F != '2'"}, // ne + {"F < '2' || F < '1'", "F < '1' || F < '2'"}, // lt + {"F > '2' || F > '1'", "F > '1' || F > '2'"}, // gt + {"F <= '2' || F <= '1'", "F <= '1' || F <= '2'"}, // le + {"F >= '2' || F >= '1'", "F >= '1' || F >= '2'"}, // ge + {"F =~ '2' || F =~ '1'", "F =~ '1' || F =~ '2'"}, // er + {"F !~ '2' || F !~ '1'", "F !~ '1' || F !~ '2'"}, // nr + {"!(F == '2') || !(F == '1')", "!(F == '1') || !(F == '2')"}, // not + }; + // @formatter:on + + for (String[] query : sortable) { + drive(query[0], query[1], comparator); + } + } + + // same node, different field, same values + @Test + public void testSameNodeType_differentField_sameValue() { + // different fields, correct order, no change + // @formatter:off + String[] queries = new String[] { + // intersections + "F1 == '1' && F2 == '1'", // eq + "F1 != '1' && F2 != '1'", // ne + "F1 < '1' && F2 < '1'", // lt + "F1 > '1' && F2 > '1'", // gt + "F1 <= '1' && F2 <= '1'", // le + "F1 >= '1' && F2 >= '1'", // ge + "F1 =~ '1' && F2 =~ '1'", // er + "F1 !~ '1' && F2 !~ '1'", // nr + "!(F1 == '1') && !(F2 == '1')", // not + // unions + "F1 == '1' || F2 == '1'", // eq + "F1 != '1' || F2 != '1'", // ne + "F1 < '1' || F2 < '1'", // lt + "F1 > '1' || F2 > '1'", // gt + "F1 <= '1' || F2 <= 
'1'", // le + "F1 >= '1' || F2 >= '1'", // ge + "F1 =~ '1' || F2 =~ '1'", // er + "F1 !~ '1' || F2 !~ '1'", // nr + "!(F1 == '1') || !(F2 == '1')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + + // different fields, incorrect order, expect changes + + // @formatter:off + String[][] sortable = new String[][] { + // intersections + {"F2 == '1' && F1 == '1'", "F1 == '1' && F2 == '1'"}, // eq + {"F2 != '1' && F1 != '1'", "F1 != '1' && F2 != '1'"}, // ne + {"F2 < '1' && F1 < '1'", "F1 < '1' && F2 < '1'"}, // lt + {"F2 > '1' && F1 > '1'", "F1 > '1' && F2 > '1'"}, // gt + {"F2 <= '1' && F1 <= '1'", "F1 <= '1' && F2 <= '1'"}, // le + {"F2 >= '1' && F1 >= '1'", "F1 >= '1' && F2 >= '1'"}, // ge + {"F2 =~ '1' && F1 =~ '1'", "F1 =~ '1' && F2 =~ '1'"}, // er + {"F2 !~ '1' && F1 !~ '1'", "F1 !~ '1' && F2 !~ '1'"}, // nr + {"!(F2 == '1') && !(F1 == '1')", "!(F1 == '1') && !(F2 == '1')"}, // not + // unions + {"F2 == '1' || F1 == '1'", "F1 == '1' || F2 == '1'"}, // eq + {"F2 != '1' || F1 != '1'", "F1 != '1' || F2 != '1'"}, // ne + {"F2 < '1' || F1 < '1'", "F1 < '1' || F2 < '1'"}, // lt + {"F2 > '1' || F1 > '1'", "F1 > '1' || F2 > '1'"}, // gt + {"F2 <= '1' || F1 <= '1'", "F1 <= '1' || F2 <= '1'"}, // le + {"F2 >= '1' || F1 >= '1'", "F1 >= '1' || F2 >= '1'"}, // ge + {"F2 =~ '1' || F1 =~ '1'", "F1 =~ '1' || F2 =~ '1'"}, // er + {"F2 !~ '1' || F1 !~ '1'", "F1 !~ '1' || F2 !~ '1'"}, // nr + {"!(F2 == '1') || !(F1 == '1')", "!(F1 == '1') || !(F2 == '1')"}, // not + }; + // @formatter:on + + for (String[] query : sortable) { + drive(query[0], query[1], comparator); + } + } + + // same node, different field, different values + @Test + public void testSameNodeType_differentField_differentValue() { + // different fields and values, correct order, no change + // @formatter:off + String[] queries = new String[] { + // intersections + "F1 == '1' && F2 == '1' && F2 == '2'", // eq + "F1 != '1' && F2 != '1' && F2 != '2'", // ne + "F1 < '1' && F2 < '1' && F2 < '2'", // lt + "F1 > '1' && F2 > '1' && F2 > '2'", // gt + "F1 <= '1' && F2 <= '1' && F2 <= '2'", // le + "F1 >= '1' && F2 >= '1' && F2 >= '2'", // ge + "F1 =~ '1' && F2 =~ '1' && F2 =~ '2'", // er + "F1 !~ '1' && F2 !~ '1' && F2 !~ '2'", // nr + "!(F1 == '1') && !(F2 == '1') && !(F2 == '2')", // not + // unions + "F1 == '1' || F2 == '1' || F2 == '2'", // eq + "F1 != '1' || F2 != '1' || F2 != '2'", // ne + "F1 < '1' || F2 < '1' || F2 < '2'", // lt + "F1 > '1' || F2 > '1' || F2 > '2'", // gt + "F1 <= '1' || F2 <= '1' || F2 <= '2'", // le + "F1 >= '1' || F2 >= '1' || F2 >= '2'", // ge + "F1 =~ '1' || F2 =~ '1' || F2 =~ '2'", // er + "F1 !~ '1' || F2 !~ '1' || F2 !~ '2'", // nr + "!(F1 == '1') || !(F2 == '1') || !(F2 == '2')", // not + }; + // @formatter:on + + for (String query : queries) { + drive(query, query, comparator); + } + + // different fields and values, incorrect order, change expected + // @formatter:off + String[][] sortable = new String[][] { + // intersections + {"F2 == '2' && F2 == '1' && F1 == '1'", "F1 == '1' && F2 == '1' && F2 == '2'"}, // eq + {"F2 != '2' && F2 != '1' && F1 != '1'", "F1 != '1' && F2 != '1' && F2 != '2'"}, // ne + {"F2 < '2' && F2 < '1' && F1 < '1'", "F1 < '1' && F2 < '1' && F2 < '2'"}, // lt + {"F2 > '2' && F2 > '1' && F1 > '1'", "F1 > '1' && F2 > '1' && F2 > '2'"}, // gt + {"F2 <= '2' && F2 <= '1' && F1 <= '1'", "F1 <= '1' && F2 <= '1' && F2 <= '2'"}, // le + {"F2 >= '2' && F2 >= '1' && F1 >= '1'", "F1 >= '1' && F2 >= '1' && F2 >= '2'"}, // ge + {"F2 =~ '2' && F2 =~ 
'1' && F1 =~ '1'", "F1 =~ '1' && F2 =~ '1' && F2 =~ '2'"}, // er + {"F2 !~ '2' && F2 !~ '1' && F1 !~ '1'", "F1 !~ '1' && F2 !~ '1' && F2 !~ '2'"}, // nr + {"!(F2 == '2') && !(F2 == '1') && !(F1 == '1')", "!(F1 == '1') && !(F2 == '1') && !(F2 == '2')"}, // not + // unions + {"F2 == '2' || F2 == '1' || F1 == '1'", "F1 == '1' || F2 == '1' || F2 == '2'"}, // eq + {"F2 != '2' || F2 != '1' || F1 != '1'", "F1 != '1' || F2 != '1' || F2 != '2'"}, // ne + {"F2 < '2' || F2 < '1' || F1 < '1'", "F1 < '1' || F2 < '1' || F2 < '2'"}, // lt + {"F2 > '2' || F2 > '1' || F1 > '1'", "F1 > '1' || F2 > '1' || F2 > '2'"}, // gt + {"F2 <= '2' || F2 <= '1' || F1 <= '1'", "F1 <= '1' || F2 <= '1' || F2 <= '2'"}, // le + {"F2 >= '2' || F2 >= '1' || F1 >= '1'", "F1 >= '1' || F2 >= '1' || F2 >= '2'"}, // ge + {"F2 =~ '2' || F2 =~ '1' || F1 =~ '1'", "F1 =~ '1' || F2 =~ '1' || F2 =~ '2'"}, // er + {"F2 !~ '2' || F2 !~ '1' || F1 !~ '1'", "F1 !~ '1' || F2 !~ '1' || F2 !~ '2'"}, // nr + {"!(F2 == '2') || !(F2 == '1') || !(F1 == '1')", "!(F1 == '1') || !(F2 == '1') || !(F2 == '2')"}, // not + }; + // @formatter:on + + for (String[] query : sortable) { + drive(query[0], query[1], comparator); + } + } + + @Test + public void testDemonstrateJunctionSortOrder() { + // this test case demonstrates why this visitor should only be used to break ties between two otherwise equivalent nodes + String query = "F == '1' && (F == '2' || F == '3')"; + String expected = "(F == '2' || F == '3') && F == '1'"; + drive(query, expected, comparator); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/NodeComparatorTestUtil.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/NodeComparatorTestUtil.java new file mode 100644 index 00000000000..a5bc13cdeda --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/NodeComparatorTestUtil.java @@ -0,0 +1,57 @@ +package datawave.query.jexl.nodes; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.Arrays; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ASTOrNode; +import org.apache.commons.jexl3.parser.JexlNode; +import org.apache.commons.jexl3.parser.JexlNodes; +import org.apache.commons.jexl3.parser.ParseException; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; + +/** + * Common test code for node comparator tests + */ +public class NodeComparatorTestUtil { + + /** + * Assumes the provided queries are either a union or an intersection + * + * @param query + * the input query + * @param expected + * the expected query + */ + public void drive(String query, String expected, JexlNodeComparator comparator) { + JexlNode[] queryChildren = parse(query); + Arrays.sort(queryChildren, comparator); + + JexlNode[] expectedChildren = parse(expected); + + assertEquals(expectedChildren.length, queryChildren.length); + for (int i = 0; i < expectedChildren.length; i++) { + String expectedChild = JexlStringBuildingVisitor.buildQuery(expectedChildren[i]); + String queryChild = JexlStringBuildingVisitor.buildQuery(queryChildren[i]); + assertEquals(expectedChild, queryChild); + } + } + + private JexlNode[] parse(String query) { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + JexlNode node = script.jjtGetChild(0); + assertTrue(node instanceof ASTAndNode || node 
instanceof ASTOrNode); + return JexlNodes.getChildren(node); + } catch (ParseException e) { + fail("Failed test: " + query); + throw new RuntimeException("Failed test: " + query); + } + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/TermCostComparatorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/TermCostComparatorTest.java new file mode 100644 index 00000000000..19a451b97e2 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/nodes/TermCostComparatorTest.java @@ -0,0 +1,159 @@ +package datawave.query.jexl.nodes; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.junit.jupiter.api.Test; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.visitors.JexlStringBuildingVisitor; +import datawave.query.jexl.visitors.order.OrderByCostVisitor; +import datawave.query.util.count.CountMap; + +public class TermCostComparatorTest extends NodeComparatorTestUtil { + + private CountMap counts; + private JexlNodeComparator comparator; + + // sort all terms have cardinality + @Test + public void testAllTermsHaveCardinality() { + String[][] queries = {{"F == '23' || F == '12'", "F == '12' || F == '23'"}, {"F == '23' && F == '12'", "F == '12' && F == '23'"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + // sort some terms have cardinality + @Test + public void testSomeTermsHaveCardinality() { + String[][] queries = {{"F == '0' || F == '12'", "F == '12' || F == '0'"}, {"F == '0' && F == '12'", "F == '12' && F == '0'"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + // sort no terms have cardinality (fallback) + @Test + public void testNoTermsHaveCardinality() { + String[][] queries = {{"F == '2' || F == '1'", "F == '1' || F == '2'"}, {"F == '2' && F == '1'", "F == '1' && F == '2'"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + // sort junctions all terms have cardinality + variable size + @Test + public void testJunctionsSortLeftOfHighCostLeaf() { + String[][] queries = {{"(F == '12' || F == '23') && F == '45'", "(F == '12' || F == '23') && F == '45'"}, + {"(F == '12' && F == '23') || F == '45'", "(F == '12' && F == '23') || F == '45'"}, + // sort order applied to nested junctions + {"(F == '23' || F == '12') && F == '45'", "(F == '12' || F == '23') && F == '45'"}, + {"(F == '23' && F == '12') || F == '45'", "(F == '12' && F == '23') || F == '45'"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + @Test + public void testJunctionSort() { + String[][] queries = { + // assert no change ordered nested junctions and ordered top level junction + {"(F == '12' || F == '23') && (F == '34' || F == '45')", "(F == '12' || F == '23') && (F == '34' || F == '45')"}, + {"(F == '12' && F == '23') || (F == '34' && F == '45')", "(F == '12' && F == '23') || (F == '34' && F == '45')"}, + // assert unordered nested junctions and ordered top level junctions + {"(F == '23' || F == '12') && (F == '45' || F == '34')", "(F == '12' || F == '23') && (F == '34' || F == '45')"}, + {"(F == '23' && F == '12') || (F == '45' && F == '34')", "(F == '12' && F == '23') || (F == '34' && F == '45')"}, + // assert ordered nested junctions and unordered top level junctions + {"(F == '34' || F == '45') && (F == '12' || F 
== '23')", "(F == '12' || F == '23') && (F == '34' || F == '45')"}, + {"(F == '34' && F == '45') || (F == '12' && F == '23')", "(F == '12' && F == '23') || (F == '34' && F == '45')"}, + // assert unordered nested junctions and unordered top level junctions + {"(F == '45' || F == '34') && (F == '23' || F == '12')", "(F == '12' || F == '23') && (F == '34' || F == '45')"}, + {"(F == '45' && F == '34') || (F == '23' && F == '12')", "(F == '12' && F == '23') || (F == '34' && F == '45')"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + @Test + public void testJunctionsOfVariableSize() { + String[][] queries = { + // ordered junctions + {"(F == '12' || F == '12' || F == '12') && (F == '34' || F == '45')", "(F == '12' || F == '12' || F == '12') && (F == '34' || F == '45')"}, + {"(F == '12' && F == '12' && F == '12') || (F == '34' && F == '45')", "(F == '12' && F == '12' && F == '12') || (F == '34' && F == '45')"}, + // unordered junctions + {"(F == '34' || F == '45') && (F == '12' || F == '12' || F == '12')", "(F == '12' || F == '12' || F == '12') && (F == '34' || F == '45')"}, + {"(F == '34' && F == '45') || (F == '12' && F == '12' && F == '12')", "(F == '12' && F == '12' && F == '12') || (F == '34' && F == '45')"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + // sort junctions partial cardinality + @Test + public void testJunctionsWithPartialCardinality() { + String[][] queries = {{"F == '1' || F == '23'", "F == '23' || F == '1'"}, {"F == '1' && F == '23'", "F == '23' && F == '1'"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + // sort junctions one side has cardinality the other does not + @Test + public void testSomeJunctionsHaveCardinality() { + String[][] queries = {{"(F == '1' || F == '2') && (F == '12' || F == '23')", "(F == '12' || F == '23') && (F == '1' || F == '2')"}, + {"(F == '1' && F == '2') || (F == '12' && F == '23')", "(F == '12' && F == '23') || (F == '1' && F == '2')"},}; + + for (String[] query : queries) { + drive(query[0], query[1], getComparator()); + } + } + + /** + * Explicit override of test utility code so the {@link OrderByCostVisitor} can be run + * + * @param query + * the input query + * @param expected + * the expected query + * @param comparator + * the comparator + */ + @Override + public void drive(String query, String expected, JexlNodeComparator comparator) { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + script = OrderByCostVisitor.orderByTermCount(script, getCounts().getCounts()); + String ordered = JexlStringBuildingVisitor.buildQuery(script); + assertEquals(expected, ordered); + } catch (Exception e) { + fail("Failed to run test", e); + } + } + + private JexlNodeComparator getComparator() { + if (comparator == null) { + comparator = new TermCostComparator(getCounts()); + } + return comparator; + } + + private CountMap getCounts() { + if (counts == null) { + counts = new CountMap(); + counts.put("F == '12'", 12L); + counts.put("F == '23'", 23L); + counts.put("F == '34'", 34L); + counts.put("F == '45'", 45L); + } + return counts; + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IsNotNullPruningVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IsNotNullPruningVisitorTest.java index 4bda59efc5f..89058778946 100644 --- 
a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IsNotNullPruningVisitorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IsNotNullPruningVisitorTest.java @@ -493,13 +493,13 @@ public void testFutureCase_PartialPruneOfUnionViaUnion() { // union of same field should allow us to perform a partial prune String query = "(!(FOO == null) || !(FOO2 == null)) && (FOO == 'bar' || FOO == 'baz')"; - // String expected = "!(FOO2 == null) && (FOO == 'bar' || FOO == 'baz')"; - test(query, query); + String expected = "(FOO == 'bar' || FOO == 'baz')"; + test(query, expected); // should also work for filter:includeRegex query = "(!(FOO == null) || !(FOO2 == null)) && (filter:includeRegex(FOO, 'bar.*') || filter:includeRegex(FOO, 'baz.*'))"; - // expected = "!(FOO2 == null) && (filter:includeRegex(FOO, 'bar.*') || filter:includeRegex(FOO, 'baz.*'))"; - test(query, query); + expected = "(filter:includeRegex(FOO, 'bar.*') || filter:includeRegex(FOO, 'baz.*'))"; + test(query, expected); } // test cases where nothing should be done @@ -537,10 +537,12 @@ public void testNoOpCases() { // cannot prune half of a union query = "(!(FOO == null) || !(FOO2 == null)) && FOO == 'bar'"; - test(query, query); + String expected = "FOO == 'bar'"; + test(query, expected); query = "(!(FOO == null) || !(FOO2 == null)) && FOO =~ 'ba.*'"; - test(query, query); + expected = "FOO =~ 'ba.*'"; + test(query, expected); } @Test @@ -575,6 +577,22 @@ public void testNoOpQueryPropertyMarkers() { test(query, query); } + @Test + public void testPruningNestedUnionOfIsNotNullFunctions() { + // logically, these unions are equivalent and the 'is not null' side can be pruned + String query = "FOO == 'bar' && (!(FOO == null) || !(FOO2 == null) || !(FOO3 == null) || !(FOO4 == null))"; + String expected = "FOO == 'bar'"; + + test(query, expected); + } + + @Test + public void testPruningNestedUnionOfIsNotNullFunctions_Two() { + // in this case, since the FOO field is not in the union nothing will be pruned. 
+ String query = "FOO == 'bar' && (!(FOO2 == null) || !(FOO4 == null))"; + test(query, query); + } + private void test(String query, String expected) { try { ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IvaratorRequiredVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IvaratorRequiredVisitorTest.java new file mode 100644 index 00000000000..1c283ae2dca --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/IvaratorRequiredVisitorTest.java @@ -0,0 +1,113 @@ +package datawave.query.jexl.visitors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ParseException; +import org.junit.jupiter.api.Test; + +import datawave.query.jexl.JexlASTHelper; + +public class IvaratorRequiredVisitorTest { + + @Test + public void testSingleTerms() { + // equality and not operators + test("A == '11'", false); + test("B != '23'", false); + test("!(C == '34')", false); + + // range operators + test("A <= '11'", false); + test("B >= '23'", false); + test("C < '34'", false); + test("D > '77'", false); + } + + @Test + public void testMarkers() { + // ivarator required + test("((_Value_ = true) && (F =~ 'ba.*'))", true); + test("((_Term_ = true) && (_ANYFIELD_ =~ 'ba.*'))", true); + test("((_List_ = true) && (((id = 'some-bogus-id') && (field = 'QUOTE') && (params = '{\"values\":[\"a\",\"b\",\"c\"]}'))))", true); + + // ivarator not required + test("((_Bounded_ = true) && (A >= 1 && A <= 2))", false); + test("((_Delayed_ = true) && (F =~ 'pushed'))", false); + test("((_Drop_ = true) && (F =~ 'evaluation'))", false); + test("((_Eval_ = true) && (F =~ 'evaluation'))", false); + test("((_Lenient_ = true) && (F =~ 'evaluation'))", false); + } + + @Test + public void testDoubleWrappedMarkers() { + test("((_Value_ = true) && ((_Bounded_ = true) && (A >= 1 && A <= 2)) )", true); + } + + @Test + public void testOrIvarators() { + // ivarator required + test("A == '1' || ((_Value_ = true) && (F =~ 'ba.*'))", true); + test("A == '1' || ((_Term_ = true) && (_ANYFIELD_ =~ 'ba.*'))", true); + test("A == '1' || ((_List_ = true) && (((id = 'some-bogus-id') && (field = 'QUOTE') && (params = '{\"values\":[\"a\",\"b\",\"c\"]}'))))", true); + + // ivarator not required + test("A == '1' || ((_Bounded_ = true) && (A >= 1 && A <= 2))", false); + test("A == '1' || ((_Delayed_ = true) && (F =~ 'pushed'))", false); + test("A == '1' || ((_Drop_ = true) && (F =~ 'evaluation'))", false); + test("A == '1' || ((_Eval_ = true) && (F =~ 'evaluation'))", false); + test("A == '1' || ((_Lenient_ = true) && (F =~ 'evaluation'))", false); + } + + @Test + public void testAndIvarators() { + // ivarator required + test("A == '1' && ((_Value_ = true) && (F =~ 'ba.*'))", true); + test("A == '1' && ((_Term_ = true) && (_ANYFIELD_ =~ 'ba.*'))", true); + test("A == '1' && ((_List_ = true) && (((id = 'some-bogus-id') && (field = 'QUOTE') && (params = '{\"values\":[\"a\",\"b\",\"c\"]}'))))", true); + + // ivarator not required + test("A == '1' && ((_Bounded_ = true) && (A >= 1 && A <= 2))", false); + test("A == '1' && ((_Eval_ = true) && (F =~ 'evaluation'))", false); + test("A == '1' && ((_Delayed_ = true) && (F =~ 'pushed'))", false); + test("A == '1' && ((_Drop_ = true) && (F =~ 'evaluation'))", false); + test("A == '1' && ((_Lenient_ = 
true) && (F =~ 'evaluation'))", false); + } + + @Test + public void testDistributedNestedUnions() { + // (A or Ivarator) AND (B or Ivarator) + test("(A == '1' || ((_Value_ = true) && (F =~ 'ba.*'))) && (B == '2' || ((_Value_ = true) && (F =~ 'ba.*')))", true); + // order should not matter + test("(((_Value_ = true) && (F =~ 'ba.*')) || A == '1') && (((_Value_ = true) && (F =~ 'ba.*')) || B == '2')", true); + } + + @Test + public void testDistributedNestedIntersections() { + // (A and ivarator) or (B and ivarator) + test("(A == '1' && ((_Value_ = true) && (F =~ 'ba.*'))) || (B == '2' || ((_Value_ = true) && (F =~ 'ba.*')))", true); + // order should not matter + test("(((_Value_ = true) && (F =~ 'ba.*')) && A == '1') || (((_Value_ = true) && (F =~ 'ba.*')) && B == '2')", true); + } + + @Test + public void testDeeplyNestedIvarators() { + // A and (B or (C and ivarator)) + test("A == '1' && (B == '2' || (C == '3' && ((_Value_ = true) && (F =~ 'ba.*'))))", true); + } + + private void test(String query, boolean expected) { + ASTJexlScript script = parse(query); + assertEquals(expected, IvaratorRequiredVisitor.isIvaratorRequired(script)); + } + + private ASTJexlScript parse(String query) { + try { + return JexlASTHelper.parseAndFlattenJexlQuery(query); + } catch (ParseException e) { + fail("Failed to parse: " + query); + throw new RuntimeException(e); + } + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/order/OrderByCostVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/order/OrderByCostVisitorTest.java index cde4e603d7b..88355eca03c 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/order/OrderByCostVisitorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/order/OrderByCostVisitorTest.java @@ -332,4 +332,19 @@ private Map getTermCountMap() { counts.put("F == '6'", 5L); // same counts for E and F return counts; } + + @Test + public void testCase() throws Exception { + Map counts = new HashMap<>(); + counts.put("FIELD_A", 23L); + counts.put("FIELD_B", 34L); + counts.put("FIELD_C", 45L); + + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery("FIELD_C == 'v' || FIELD_B == 'v' || FIELD_A == 'v'"); + + OrderByCostVisitor.orderByFieldCount(script, counts); + + String ordered = JexlStringBuildingVisitor.buildQuery(script); + assertEquals("FIELD_A == 'v' || FIELD_B == 'v' || FIELD_C == 'v'", ordered); + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/language/functions/jexl/NoExpansionTest.java b/warehouse/query-core/src/test/java/datawave/query/language/functions/jexl/NoExpansionTest.java new file mode 100644 index 00000000000..06dd0ff76ee --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/language/functions/jexl/NoExpansionTest.java @@ -0,0 +1,63 @@ +package datawave.query.language.functions.jexl; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThrows; + +import java.util.List; + +import org.junit.Test; + +public class NoExpansionTest { + + /** + * Verify that {@link NoExpansion#validate()} throws an exception given an empty parameter list. + */ + @Test + public void testValidateWithEmptyParameters() { + NoExpansion noExpansion = new NoExpansion(); + noExpansion.setParameterList(List.of()); + Exception exception = assertThrows(IllegalArgumentException.class, noExpansion::validate); + assertEquals("datawave.webservice.query.exception.BadRequestQueryException: Invalid arguments to function. 
noExpansion requires at least one argument", + exception.getMessage()); + } + + /** + * Verify that {@link NoExpansion#validate()} does not throw an error for a single parameter. + */ + @Test + public void testValidateWithOneField() { + NoExpansion noExpansion = new NoExpansion(); + noExpansion.setParameterList(List.of("field1")); + noExpansion.validate(); + } + + /** + * Verify that {@link NoExpansion#validate()} does not throw an error for multiple parameters. + */ + @Test + public void testValidateWithMultipleFields() { + NoExpansion noExpansion = new NoExpansion(); + noExpansion.setParameterList(List.of("field1", "field2", "field3")); + noExpansion.validate(); + } + + @Test + public void testToStringWithNoParameters() { + NoExpansion noExpansion = new NoExpansion(); + assertEquals("f:noExpansion()", noExpansion.toString()); + } + + @Test + public void testToStringWithOneParameter() { + NoExpansion noExpansion = new NoExpansion(); + noExpansion.setParameterList(List.of("field1")); + assertEquals("f:noExpansion('field1')", noExpansion.toString()); + } + + @Test + public void testToStringWithMultipleParameter() { + NoExpansion noExpansion = new NoExpansion(); + noExpansion.setParameterList(List.of("field1", "field2", "field3")); + assertEquals("f:noExpansion('field1','field2','field3')", noExpansion.toString()); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlParser.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlParser.java index 4cce7c7bcdd..65d8c5111a9 100644 --- a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlParser.java +++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlParser.java @@ -57,7 +57,10 @@ public void test1() throws Exception { public void testParseFunction_NoExpansion() throws ParseException { LuceneToJexlQueryParser parser = getQueryParser(); QueryNode node = parser.parse("FIELD:SOMETHING AND #NOEXPANSION(FIELD)"); - Assert.assertEquals("FIELD == 'SOMETHING' && f:noExpansion(FIELD)", node.getOriginalQuery()); + Assert.assertEquals("FIELD == 'SOMETHING' && f:noExpansion('FIELD')", node.getOriginalQuery()); + + node = parser.parse("FIELD:SOMETHING AND #NOEXPANSION(FIELD1,FIELD2)"); + Assert.assertEquals("FIELD == 'SOMETHING' && f:noExpansion('FIELD1','FIELD2')", node.getOriginalQuery()); } @Test diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java index 49fe61074e7..4779d421239 100644 --- a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java +++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java @@ -642,12 +642,18 @@ public void testSynonymTokenization() throws ParseException { TokenSearch searchUtil = TokenSearch.Factory.newInstance(); Analyzer analyzer = new StandardAnalyzer(searchUtil); parser.setAnalyzer(analyzer); - // this isn't the most realistic test, but it does verify that we don't lose the rest of the token stream - // when the first token emitted is the same as the input token. 
- Assert.assertEquals( - "(TOKFIELD == '/home/datawave/README.md' || " - + "content:phrase(TOKFIELD, termOffsetMap, '/home/datawave/readme.md', 'home/datawave/readme.md', " - + "'home', 'datawave/readme.md', 'datawave', 'readme.md', 'readme', 'md'))", - parseQuery("TOKFIELD:\"/home/datawave/README.md\"")); + // @formatter:off + String expected = "(" + + "TOKFIELD == '/home/datawave/README.md' || " + + "TOKFIELD == 'datawave' || " + + "TOKFIELD == 'datawave/readme.md' || " + + "TOKFIELD == 'home' || " + + "TOKFIELD == 'home/datawave/readme.md' || " + + "TOKFIELD == 'md' || " + + "TOKFIELD == 'readme' || " + + "TOKFIELD == 'readme.md'" + + ")"; + // @formatter:on + Assert.assertEquals(expected, parseQuery("TOKFIELD:\"/home/datawave/README.md\"")); } } diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParserVariants.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParserVariants.java new file mode 100644 index 00000000000..53d416ec7b4 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParserVariants.java @@ -0,0 +1,136 @@ +package datawave.query.language.parser.jexl; + +import static org.junit.Assert.assertEquals; + +import org.apache.lucene.analysis.CharArraySet; +import org.junit.Before; +import org.junit.Test; + +import com.google.common.collect.Sets; + +import datawave.ingest.data.tokenize.DefaultTokenSearch; +import datawave.ingest.data.tokenize.StandardAnalyzer; +import datawave.ingest.data.tokenize.TokenSearch; +import datawave.query.language.parser.ParseException; +import datawave.query.language.tree.QueryNode; +import datawave.query.language.tree.ServerHeadNode; + +public class TestLuceneToJexlQueryParserVariants { + private LuceneToJexlQueryParser parser; + + @Before + public void setUp() { + CharArraySet stopwords = new CharArraySet(1, true); + stopwords.add("STOP"); + + // TokenSearch is used for ingesting variants, and generally should never be used at query time + // but is good for simulating the case where we want variants at query time. 
+ TokenSearch tokenSearch = TokenSearch.Factory.newInstance(DefaultTokenSearch.class.getName(), stopwords); + StandardAnalyzer analyzer = new StandardAnalyzer(tokenSearch); + parser = new LuceneToJexlQueryParser(); + parser.setSkipTokenizeUnfieldedFields(Sets.newHashSet("noToken")); + parser.setTokenizedFields(Sets.newHashSet("tokField")); + parser.setAnalyzer(analyzer); + } + + @Test + public void testVariantSingleTerm() throws ParseException { + assertEquals("(TOKFIELD == 'foo@bar.com' || TOKFIELD == '@bar.com' || TOKFIELD == 'foo')", parseQuery("TOKFIELD:foo@bar.com")); + } + + @Test + public void testVariantStopword() throws ParseException { + // @formatter:off + String expected = "(" + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'STOP', 'foo@bar.com', 'baz') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', '@bar.com', 'baz') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'foo', 'baz') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'foo@bar.com', 'baz')" + + ")"; + // @formatter:on + assertEquals(expected, parseQuery("TOKFIELD:\"email STOP foo@bar.com baz\"")); + } + + @Test + public void testVariantSlopStopword() throws ParseException { + // the split file `wi-fi` increases the slop + // @formatter:off + String expected = "(" + + "content:within(TOKFIELD, 6, termOffsetMap, 'email', 'STOP', 'foo@bar.com', 'wi-fi') || " + + "content:within(TOKFIELD, 7, termOffsetMap, 'email', '@bar.com', 'wi', 'fi') || " + + "content:within(TOKFIELD, 7, termOffsetMap, 'email', 'foo', 'wi', 'fi') || " + + "content:within(TOKFIELD, 7, termOffsetMap, 'email', 'foo@bar.com', 'wi', 'fi')" + + ")"; + // @formatter:off + + assertEquals(expected, parseQuery("TOKFIELD:\"email STOP foo@bar.com wi-fi\"~6")); + } + + @Test + public void testVariantsEnd() throws ParseException { + // @formatter:off + String expected = "(" + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'to', 'address', 'foo@bar.com') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'to', 'address', '@bar.com') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'to', 'address', 'foo')" + + ")"; + // @formatter:on + assertEquals(expected, parseQuery("TOKFIELD:\"email to address foo@bar.com\"")); + } + + @Test + public void testVariantsBegin() throws ParseException { + // @formatter:off + String expected = "(" + + "content:phrase(TOKFIELD, termOffsetMap, 'foo@bar.com', 'email', 'from', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, '@bar.com', 'email', 'from', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'foo', 'email', 'from', 'address')" + + ")"; + // @formatter:on + + assertEquals(expected, parseQuery("TOKFIELD:\"foo@bar.com email from address\"")); + } + + @Test + public void testVariantsMiddle() throws ParseException { + // @formatter:off + String expected = "(" + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'from', 'foo@bar.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'from', '@bar.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'from', 'foo', 'address')" + + ")"; + // @formatter:on + assertEquals(expected, parseQuery("TOKFIELD:\"email from foo@bar.com address\"")); + } + + @Test + public void testVariantsMultiple() throws ParseException { + // @formatter:off + String expected = "(" + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo@bar.com', 'to', 'bar@foo.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', '@bar.com', 'to', '@foo.com', 'address') || " + + 
"content:phrase(TOKFIELD, termOffsetMap, 'from', '@bar.com', 'to', 'bar', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', '@bar.com', 'to', 'bar@foo.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo', 'to', '@foo.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo', 'to', 'bar', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo', 'to', 'bar@foo.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo@bar.com', 'to', '@foo.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo@bar.com', 'to', 'bar', 'address')" + + ")"; + // @formatter:on + assertEquals(expected, parseQuery("TOKFIELD:\"from foo@bar.com to bar@foo.com address\"")); + } + + private String parseQuery(String query) throws ParseException { + String parsedQuery = null; + try { + QueryNode node = parser.parse(query); + if (node instanceof ServerHeadNode) { + parsedQuery = node.getOriginalQuery(); + } + } catch (RuntimeException e) { + throw new ParseException(e); + } + return parsedQuery; + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/predicate/EventDataQueryFieldFilterTest.java b/warehouse/query-core/src/test/java/datawave/query/predicate/EventDataQueryFieldFilterTest.java new file mode 100644 index 00000000000..eb382d63568 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/predicate/EventDataQueryFieldFilterTest.java @@ -0,0 +1,99 @@ +package datawave.query.predicate; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.AbstractMap; +import java.util.Map; +import java.util.Set; + +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.junit.jupiter.api.Test; + +public class EventDataQueryFieldFilterTest { + + @Test + public void testFieldAccept() { + EventDataQueryFieldFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_A")); + + assertTrue(filter.apply(createEntry("FIELD_A"))); + assertTrue(filter.apply(createEntry("FIELD_A"))); + assertTrue(filter.apply(createEntry("FIELD_A"))); + } + + @Test + public void testFieldRejection_sameFieldRepeated() { + EventDataQueryFieldFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_A")); + + Map.Entry entry = createEntry("FIELD_B"); + assertFalse(filter.apply(entry)); + assertFalse(filter.apply(entry)); + assertFalse(filter.apply(entry)); + assertNull(filter.getSeekRange(entry.getKey(), null, false)); + } + + @Test + public void testFieldRejection_differentFieldRepeated() { + EventDataQueryFieldFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_A")); + + Map.Entry entry = createEntry("FIELD_B"); + assertFalse(filter.apply(createEntry("FIELD_B"))); + assertFalse(filter.apply(createEntry("FIELD_C"))); + assertFalse(filter.apply(createEntry("FIELD_D"))); + assertNull(filter.getSeekRange(entry.getKey(), null, false)); + } + + // tests the case where no key is accepted and the filter seeks to the first field + @Test + public void testGetSeekRange_seekForwardToFirstField() { + EventDataQueryFieldFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_B")).withMaxNextCount(1); + + Map.Entry entry = createEntry("FIELD_A"); + assertFalse(filter.apply(entry)); + 
assertFalse(filter.apply(entry)); + assertFalse(filter.apply(entry)); + + Range range = filter.getSeekRange(entry.getKey(), null, false); + assertSeekRangeStartKey(range, new Key("row", "datatype\0uid", "FIELD_B\0")); + } + + // tests the case where some keys were accepted and the filter seeks to the second field + @Test + public void testGetSeekRange_seekToSecondField() { + EventDataQueryFieldFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_B", "FIELD_D")).withMaxNextCount(1); + + Map.Entry entry = createEntry("FIELD_C"); + assertFalse(filter.apply(entry)); + assertFalse(filter.apply(entry)); + assertFalse(filter.apply(entry)); + + Range range = filter.getSeekRange(entry.getKey(), null, false); + assertSeekRangeStartKey(range, new Key("row", "datatype\0uid", "FIELD_D\0")); + } + + @Test + public void testGetSeekRange_rolloverRange() { + EventDataQueryFieldFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD_A", "FIELD_B")).withMaxNextCount(1); + + Map.Entry entry = createEntry("FIELD_C"); + assertFalse(filter.apply(entry)); + assertFalse(filter.apply(entry)); + assertFalse(filter.apply(entry)); + + Range range = filter.getSeekRange(entry.getKey(), null, false); + assertSeekRangeStartKey(range, new Key("row", "datatype\0uid\0")); + } + + private Map.Entry createEntry(String field) { + Key key = new Key("row", "datatype\u0000uid", field + "\u0000value"); + return new AbstractMap.SimpleEntry<>(key, ""); + } + + private void assertSeekRangeStartKey(Range range, Key expected) { + assertEquals(expected, range.getStartKey()); + } + +} diff --git a/warehouse/query-core/src/test/java/datawave/query/predicate/TLDTermFrequencyEventDataQueryFilterTest.java b/warehouse/query-core/src/test/java/datawave/query/predicate/TLDTermFrequencyEventDataQueryFilterTest.java index 9b360c120b6..db33f063259 100644 --- a/warehouse/query-core/src/test/java/datawave/query/predicate/TLDTermFrequencyEventDataQueryFilterTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/predicate/TLDTermFrequencyEventDataQueryFilterTest.java @@ -7,14 +7,10 @@ import java.util.Set; import org.apache.accumulo.core.data.Key; -import org.apache.commons.jexl3.parser.ASTJexlScript; -import org.apache.commons.jexl3.parser.ParseException; import org.junit.Test; import com.google.common.collect.Sets; -import datawave.query.jexl.JexlASTHelper; - public class TLDTermFrequencyEventDataQueryFilterTest { private final Key tldField1 = new Key("row", "fi\0FIELD1", "value\0datatype\0d8zay2.-3pnndm.-anolok"); @@ -26,14 +22,9 @@ public class TLDTermFrequencyEventDataQueryFilterTest { private final Key childField3 = new Key("row", "fi\0FIELD3", "value\0datatype\0d8zay2.-3pnndm.-anolok.45"); @Test - public void testTLDTermFrequencyEventDataQueryFilter() throws ParseException { - - String query = "FIELD1 == 'value'"; - ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); - EventDataQueryFieldFilter fieldFilter = new EventDataQueryFieldFilter(script, Collections.emptySet()); - + public void testTLDTermFrequencyEventDataQueryFilter() { Set indexOnlyFields = Sets.newHashSet("FIELD1", "FIELD2"); - TLDTermFrequencyEventDataQueryFilter filter = new TLDTermFrequencyEventDataQueryFilter(indexOnlyFields, fieldFilter); + TLDTermFrequencyEventDataQueryFilter filter = new TLDTermFrequencyEventDataQueryFilter(indexOnlyFields, Set.of("FIELD1")); // retain query index-only fields in the tld assertTrue(filter.keep(tldField1)); diff --git 
a/warehouse/query-core/src/test/java/datawave/query/predicate/ValueToAttributesTest.java b/warehouse/query-core/src/test/java/datawave/query/predicate/ValueToAttributesTest.java index 0dd9877d65e..2076fd2d60d 100644 --- a/warehouse/query-core/src/test/java/datawave/query/predicate/ValueToAttributesTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/predicate/ValueToAttributesTest.java @@ -245,9 +245,9 @@ public void testComposites() { compositeMetadata.setCompositeFieldMappingByType(ingestType, "MAKE_COLOR", Arrays.asList("MAKE", "COLOR")); compositeMetadata.setCompositeFieldMappingByType(ingestType, "COLOR_WHEELS", Arrays.asList("MAKE", "COLOR")); } - TypeMetadata typeMetadata = new TypeMetadata( - "MAKE:[beep:datawave.data.type.LcNoDiacriticsType];MAKE_COLOR:[beep:datawave.data.type.NoOpType];START_DATE:[beep:datawave.data.type.DateType];TYPE_NOEVAL:[beep:datawave.data.type.LcNoDiacriticsType];IP_ADDR:[beep:datawave.data.type.IpAddressType];WHEELS:[beep:datawave.data.type.LcNoDiacriticsType,datawave.data.type.NumberType];COLOR:[beep:datawave.data.type.LcNoDiacriticsType];COLOR_WHEELS:[beep:datawave.data.type.NoOpType];TYPE:[beep:datawave.data.type.LcNoDiacriticsType]"); + "dts:[0:beep];types:[0:datawave.data.type.DateType,1:datawave.data.type.IpAddressType,2:datawave.data.type.LcNoDiacriticsType,3:datawave.data.type.NoOpType,4:datawave.data.type.NumberType];MAKE:[0:2];MAKE_COLOR:[0:3];START_DATE:[0:0];TYPE_NOEVAL:[0:2];IP_ADDR:[0:1];WHEELS:[0:2,0:4];COLOR:[0:2];COLOR_WHEELS:[0:3];TYPE:[0:2]"); + MarkingFunctions markingFunctions = new MarkingFunctions.Default(); ValueToAttributes valueToAttributes = new ValueToAttributes(compositeMetadata, typeMetadata, null, markingFunctions, true); } diff --git a/warehouse/query-core/src/test/java/datawave/query/scanner/LocalBatchScanner.java b/warehouse/query-core/src/test/java/datawave/query/scanner/LocalBatchScanner.java new file mode 100644 index 00000000000..88192ed8eec --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/scanner/LocalBatchScanner.java @@ -0,0 +1,153 @@ +package datawave.query.scanner; + +import java.io.IOException; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.accumulo.core.client.BatchScanner; +import org.apache.accumulo.core.data.ArrayByteSequence; +import org.apache.accumulo.core.data.ByteSequence; +import org.apache.accumulo.core.data.Column; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.IteratorEnvironment; +import org.apache.accumulo.core.iterators.IteratorUtil; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; +import org.apache.accumulo.core.iterators.WrappingIterator; +import org.apache.accumulo.core.iteratorsImpl.IteratorBuilder; +import org.apache.accumulo.core.iteratorsImpl.IteratorConfigUtil; +import org.apache.accumulo.core.security.Authorizations; + +import datawave.query.iterator.SortedListKeyValueIterator; +import datawave.query.tables.SessionOptions; + +public class LocalBatchScanner extends SessionOptions implements BatchScanner { + private SortedListKeyValueIterator itr; + private Collection ranges; + private boolean statsEnabled = false; + private StatsIterator statsIterator; + + public LocalBatchScanner(SortedListKeyValueIterator itr) { + this(itr, false); + } + + 
public LocalBatchScanner(SortedListKeyValueIterator itr, boolean statsEnabled) { + this.itr = itr; + this.statsEnabled = statsEnabled; + } + + public long getNextCount() { + return statsIterator == null ? -1 : statsIterator.getNextCount(); + } + + public long getSeekCount() { + return statsIterator == null ? -1 : statsIterator.getSeekCount(); + } + + @Override + public Iterator> iterator() { + Collections.sort(serverSideIteratorList, (o1, o2) -> { + if (o1.priority < o2.priority) { + return -1; + } else if (o1.priority > o2.priority) { + return 1; + } else { + return 0; + } + }); + + SortedKeyValueIterator base = this.itr; + IteratorEnvironment env = new LocalIteratorEnvironment(); + + if (statsEnabled) { + statsIterator = new StatsIterator(); + try { + statsIterator.init(base, Collections.emptyMap(), env); + base = statsIterator; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + IteratorBuilder iteratorBuilder = IteratorBuilder.builder(serverSideIteratorList).opts(serverSideIteratorOptions).env(env).build(); + + List> list = new ArrayList<>(); + try { + SortedKeyValueIterator created = IteratorConfigUtil.loadIterators(base, iteratorBuilder); + List columns = new ArrayList<>(); + for (Column c : fetchedColumns) { + columns.add(new ArrayByteSequence(c.columnFamily)); + } + + for (Range range : ranges) { + created.seek(range, columns, true); + while (created.hasTop()) { + list.add(new AbstractMap.SimpleImmutableEntry<>(created.getTopKey(), created.getTopValue())); + created.next(); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + + return list.iterator(); + } + + @Override + public void setRanges(Collection ranges) { + this.ranges = ranges; + } + + public static class LocalIteratorEnvironment implements IteratorEnvironment { + @Override + public IteratorUtil.IteratorScope getIteratorScope() { + return IteratorUtil.IteratorScope.scan; + } + + @Override + public boolean isUserCompaction() { + return false; + } + + @Override + public boolean isFullMajorCompaction() { + return false; + } + + @Override + public Authorizations getAuthorizations() { + return new Authorizations(); + } + } + + public static class StatsIterator extends WrappingIterator { + private long nextCount = 0; + private long seekCount = 0; + + @Override + public void next() throws IOException { + super.next(); + nextCount++; + } + + @Override + public void seek(Range range, Collection columnFamilies, boolean inclusive) throws IOException { + super.seek(range, columnFamilies, inclusive); + seekCount++; + } + + public long getNextCount() { + return nextCount; + } + + public long getSeekCount() { + return seekCount; + } + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/IndexQueryLogicTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/IndexQueryLogicTest.java index 16887ff25c1..98f74f942f2 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/IndexQueryLogicTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/IndexQueryLogicTest.java @@ -24,7 +24,6 @@ import datawave.marking.MarkingFunctions; import datawave.query.Constants; import datawave.query.QueryTestTableHelper; -import datawave.query.planner.DefaultQueryPlanner; import datawave.query.planner.FederatedQueryPlanner; import datawave.query.testframework.AbstractFunctionalQuery; import datawave.query.testframework.AccumuloSetup; diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/RemoteEdgeQueryLogicTest.java 
b/warehouse/query-core/src/test/java/datawave/query/tables/RemoteEdgeQueryLogicTest.java index bdbe0fd28d9..7014bf35756 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/RemoteEdgeQueryLogicTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/RemoteEdgeQueryLogicTest.java @@ -141,6 +141,11 @@ public GenericResponse createQuery(String queryLogicName, Map nextQueryResponseClass) { + // noop + } + @Override public BaseQueryResponse next(String id, ProxiedUserDetails callerObject) { return nextResponses.poll(); diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/RemoteEventQueryLogicTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/RemoteEventQueryLogicTest.java index f7000e34f5e..3a07901af58 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/RemoteEventQueryLogicTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/RemoteEventQueryLogicTest.java @@ -121,6 +121,11 @@ public GenericResponse createQuery(String queryLogicName, Map nextQueryResponseClass) { + // noop + } + @Override public BaseQueryResponse next(String id, ProxiedUserDetails callerObject) { return nextResponses.poll(); diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/ScannerFactoryTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/ScannerFactoryTest.java index 499773360b9..6f55d265b4c 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/ScannerFactoryTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/ScannerFactoryTest.java @@ -33,6 +33,8 @@ class ScannerFactoryTest { private static ScannerFactory scannerFactory; private static final ShardQueryConfiguration config = new ShardQueryConfiguration(); + private static final String ALT_INDEX = "altIndex"; + @BeforeAll public static void before() throws Exception { AccumuloClient client = new MyAccumuloClient("", instance); @@ -40,6 +42,8 @@ public static void before() throws Exception { scannerFactory = new ScannerFactory(config); client.tableOperations().create(TableName.SHARD); + client.tableOperations().create(TableName.SHARD_INDEX); + client.tableOperations().create(ALT_INDEX); client.instanceOperations().setProperty("accumulo.instance.name", "required-for-tests"); } @@ -171,6 +175,72 @@ void testRFileScanner() { assertEventualConsistency(scanner); } + @Test + public void testSingleScannerWithAbsentTableName() throws Exception { + Scanner scanner = scannerFactory.newSingleScanner(ALT_INDEX, getAuths(), getQuery()); + assertImmediateConsistency(scanner); + } + + @Test + public void testScannerWithAbsentTableName() throws Exception { + BatchScanner scanner = scannerFactory.newScanner(ALT_INDEX, getQuery()); + assertImmediateConsistency(scanner); + + scanner = scannerFactory.newScanner(ALT_INDEX, getAuths(), 1, getQuery(), "ALT_HINT"); + assertImmediateConsistency(scanner); + + scanner = scannerFactory.newScanner(ALT_INDEX, getAuths(), 1, getQuery(), null); + assertImmediateConsistency(scanner); + } + + @Test + public void testQueryScannerWithAbsentTableName() throws Exception { + BatchScannerSession scanner = scannerFactory.newQueryScanner(ALT_INDEX, getAuths(), getQuery()); + assertImmediateConsistency(scanner); + + scanner = scannerFactory.newQueryScanner(ALT_INDEX, getAuths(), getQuery(), "ALT_HINT"); + assertImmediateConsistency(scanner); + + scanner = scannerFactory.newQueryScanner(ALT_INDEX, getAuths(), getQuery(), null); + assertImmediateConsistency(scanner); + } + + @Test + public void 
testLimitedAnyFieldScannerWithAbsentTableName() throws Exception { + AnyFieldScanner scanner = scannerFactory.newLimitedScanner(AnyFieldScanner.class, ALT_INDEX, getAuths(), getQuery()); + assertImmediateConsistency(scanner); + + scanner = scannerFactory.newLimitedScanner(AnyFieldScanner.class, ALT_INDEX, getAuths(), getQuery(), "ALT_HINT"); + assertImmediateConsistency(scanner); + + scanner = scannerFactory.newLimitedScanner(AnyFieldScanner.class, ALT_INDEX, getAuths(), getQuery(), null); + assertImmediateConsistency(scanner); + } + + @Test + public void testLimitedRangeStreamScannerWithAbsentTableName() throws Exception { + RangeStreamScanner scanner = scannerFactory.newLimitedScanner(RangeStreamScanner.class, ALT_INDEX, getAuths(), getQuery()); + assertImmediateConsistency(scanner); + + scanner = scannerFactory.newLimitedScanner(RangeStreamScanner.class, ALT_INDEX, getAuths(), getQuery(), "ALT_HINT"); + assertImmediateConsistency(scanner); + + scanner = scannerFactory.newLimitedScanner(RangeStreamScanner.class, ALT_INDEX, getAuths(), getQuery(), null); + assertImmediateConsistency(scanner); + } + + @Test + public void testLimitedBatchScannerSessionWithAbsentTableName() throws Exception { + BatchScannerSession scanner = scannerFactory.newLimitedScanner(BatchScannerSession.class, ALT_INDEX, getAuths(), getQuery()); + assertImmediateConsistency(scanner); + + scanner = scannerFactory.newLimitedScanner(BatchScannerSession.class, ALT_INDEX, getAuths(), getQuery(), "ALT_HINT"); + assertImmediateConsistency(scanner); + + scanner = scannerFactory.newLimitedScanner(BatchScannerSession.class, ALT_INDEX, getAuths(), getQuery(), null); + assertImmediateConsistency(scanner); + } + private void setEventualConsistency() { Map consistencyLevels = new HashMap<>(); consistencyLevels.put(TableName.SHARD, ScannerBase.ConsistencyLevel.EVENTUAL); diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/ScannerSessionTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/ScannerSessionTest.java index 7601c042712..82f298898b8 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/ScannerSessionTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/ScannerSessionTest.java @@ -1,6 +1,12 @@ package datawave.query.tables; +import static java.lang.Thread.sleep; + +import static org.junit.Assert.fail; + import java.io.IOException; +import java.time.Duration; +import java.time.temporal.ChronoUnit; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; @@ -10,6 +16,10 @@ import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicBoolean; import org.apache.accumulo.core.client.AccumuloClient; import org.apache.accumulo.core.client.AccumuloException; @@ -36,6 +46,9 @@ import org.junit.Test; import org.junit.rules.TemporaryFolder; +import com.google.common.util.concurrent.MoreExecutors; +import com.google.common.util.concurrent.Service; + /** * This test spins up a mini accumulo to accurately test the effect of underlying Scanner/Batch scanners against the ScannerSession. 
InMemoryAccumulo makes some * simplifications that in the past have masked bugs @@ -79,7 +92,7 @@ private static void setupTable() client.tableOperations().addSplits("testTable", splits); // give the table a chance to be split - Thread.sleep(10000); + sleep(10000); // force writing all the data or fail try { @@ -161,6 +174,48 @@ public void testScannerSessionWithRuntimeExceptionResource() throws TableNotFoun validate(ss); } + @Test + public void testScannerSessionThreadCleanupWaitingOnClient() { + Set auths = new HashSet<>(); + auths.add(new Authorizations()); + // set maxResults to 1 so that the ScannerSession will block adding results to the queue not allowing the scanner to close + ScannerSession ss = new ScannerSession("testTable", auths, resourceQueue, 1, null); + + List ranges = Arrays.asList(new Range(new Text(String.valueOf(25)), true, new Text(String.valueOf(27)), false), + new Range(new Text(String.valueOf(1)), true, new Text(String.valueOf(2)), false), + new Range(new Text(String.valueOf(98)), true, new Text(String.valueOf(99)), false)); + + ss.setRanges(ranges); + + // this should kick off scanner in another thread and put one result on the resultQueue, forcing it to loop attempting + // to offer further results + ss.hasNext(); + + long startWait = System.currentTimeMillis(); + AtomicBoolean forceClose = new AtomicBoolean(false); + Executors.newScheduledThreadPool(1).schedule(() -> { + // this should cause a shutdown + ss.close(); + }, 5, TimeUnit.SECONDS); + + Executors.newScheduledThreadPool(1).schedule(() -> { + forceClose.set(true); + }, 10, TimeUnit.SECONDS); + + // this should block until the internal thread finishes + Duration d = Duration.of(12, ChronoUnit.SECONDS); + try { + ss.awaitTerminated(d); + } catch (TimeoutException e) { + // no-op + } + long endWait = System.currentTimeMillis(); + // didn't end before the close + Assert.assertTrue(endWait - startWait >= 5000); + // ended before the force kill + Assert.assertFalse(forceClose.get()); + } + private void validate(ScannerSession ss) throws TableNotFoundException { List ranges = Arrays.asList(new Range(new Text(String.valueOf(25)), true, new Text(String.valueOf(27)), false), new Range(new Text(String.valueOf(1)), true, new Text(String.valueOf(2)), false), diff --git a/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java b/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java index 8a0be8427ed..ee244d7388e 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tables/ShardQueryLogicTest.java @@ -193,23 +193,27 @@ protected void runTestQuery(Set> expected, String querystr, Date sta Assert.assertTrue(response instanceof DefaultEventQueryResponse); DefaultEventQueryResponse eventQueryResponse = (DefaultEventQueryResponse) response; - for (Iterator> it = expected.iterator(); it.hasNext();) { - Set expectedSet = it.next(); - boolean found = false; - - for (EventBase event : eventQueryResponse.getEvents()) { - - if (expectedSet.contains("UID:" + event.getMetadata().getInternalId())) { - expectedSet.remove("UID:" + event.getMetadata().getInternalId()); - ((List) event.getFields()).forEach((f) -> expectedSet.remove(f.getName() + ":" + f.getValueString())); - if (expectedSet.isEmpty()) { - found = true; - it.remove(); + if (expected.isEmpty()) { + Assert.assertTrue(eventQueryResponse.getEvents() == null || eventQueryResponse.getEvents().isEmpty()); + } else { + for 
(Iterator> it = expected.iterator(); it.hasNext();) { + Set expectedSet = it.next(); + boolean found = false; + + for (EventBase event : eventQueryResponse.getEvents()) { + + if (expectedSet.contains("UID:" + event.getMetadata().getInternalId())) { + expectedSet.remove("UID:" + event.getMetadata().getInternalId()); + ((List) event.getFields()).forEach((f) -> expectedSet.remove(f.getName() + ":" + f.getValueString())); + if (expectedSet.isEmpty()) { + found = true; + it.remove(); + } + break; } - break; } + Assert.assertTrue("field not found " + expectedSet, found); } - Assert.assertTrue("field not found " + expectedSet, found); } } @@ -312,8 +316,9 @@ public void testNegativeRegex() throws Exception { String queryString = "UUID=='CAPONE' AND QUOTE!~'.*ind'"; Set> expected = new HashSet<>(); - runTestQuery(expected, queryString, format.parse("20091231"), format.parse("20150101"), extraParameters); + expected.add(Sets.newHashSet("UID:" + WiseGuysIngest.caponeUID)); + runTestQuery(expected, queryString, format.parse("20091231"), format.parse("20150101"), extraParameters); } @Test @@ -323,8 +328,9 @@ public void testNegativeRegexV2() throws Exception { String queryString = "UUID=='CAPONE' AND !(QUOTE=~'.*ind')"; Set> expected = new HashSet<>(); - runTestQuery(expected, queryString, format.parse("20091231"), format.parse("20150101"), extraParameters); + expected.add(Sets.newHashSet("UID:" + WiseGuysIngest.caponeUID)); + runTestQuery(expected, queryString, format.parse("20091231"), format.parse("20150101"), extraParameters); } @Test @@ -360,4 +366,50 @@ public void testNegativeFilterRegexV2() throws Exception { runTestQuery(expected, queryString, format.parse("20091231"), format.parse("20150101"), extraParameters); } + + @Test + public void testExcludeDataTypesBangDataType() throws Exception { + Map extraParameters = new HashMap<>(); + extraParameters.put("datatype.filter.set", "!test2"); + + Date startDate = format.parse("20091231"); + Date endDate = format.parse("20150101"); + + String queryString = "UUID=='TATTAGLIA'"; + Set> expected = new HashSet<>(); + // No results expected + + runTestQuery(expected, queryString, startDate, endDate, extraParameters); + } + + @Test + public void testExcludeDataTypesNegateDataType() throws Exception { + Map extraParameters = new HashMap<>(); + extraParameters.put("datatype.filter.set", "test2,!test2"); + + Date startDate = format.parse("20091231"); + Date endDate = format.parse("20150101"); + + String queryString = "UUID=='TATTAGLIA'"; + Set> expected = new HashSet<>(); + // Expect one result, since the negated data type results in empty set, which is treated by Datawave as all data types + expected.add(Sets.newHashSet("UID:" + WiseGuysIngest.tattagliaUID)); + + runTestQuery(expected, queryString, startDate, endDate, extraParameters); + } + + @Test + public void testExcludeDataTypesIncludeOneTypeExcludeOneType() throws Exception { + Map extraParameters = new HashMap<>(); + extraParameters.put("datatype.filter.set", "test2,!test"); + + Date startDate = format.parse("20091231"); + Date endDate = format.parse("20150101"); + + String queryString = "UUID=='TATTAGLIA' || UUID=='CAPONE'"; + Set> expected = new HashSet<>(); + expected.add(Sets.newHashSet("UID:" + WiseGuysIngest.tattagliaUID)); + + runTestQuery(expected, queryString, startDate, endDate, extraParameters); + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/testframework/AbstractFunctionalQuery.java 
b/warehouse/query-core/src/test/java/datawave/query/testframework/AbstractFunctionalQuery.java index efc2f88917d..f7ce338487f 100644 --- a/warehouse/query-core/src/test/java/datawave/query/testframework/AbstractFunctionalQuery.java +++ b/warehouse/query-core/src/test/java/datawave/query/testframework/AbstractFunctionalQuery.java @@ -608,6 +608,10 @@ protected GenericQueryConfiguration setupConfig(final String queryStr) throws Ex * error condition from query initialization */ protected String getPlan(final String queryStr, boolean expandFields, boolean expandValues) throws Exception { + return getPlan(client, queryStr, expandFields, expandValues); + } + + protected String getPlan(AccumuloClient client, final String queryStr, boolean expandFields, boolean expandValues) throws Exception { Date[] startEndDate = this.dataManager.getShardStartEndDate(); if (log.isDebugEnabled()) { log.debug(" query[" + queryStr + "] start(" + YMD_DateFormat.format(startEndDate[0]) + ") end(" + YMD_DateFormat.format(startEndDate[1]) + ")"); diff --git a/warehouse/query-core/src/test/java/datawave/query/tld/TLDTermFrequencyAggregatorTest.java b/warehouse/query-core/src/test/java/datawave/query/tld/TLDTermFrequencyAggregatorTest.java index c1d7093b16e..d7f8267b5a2 100644 --- a/warehouse/query-core/src/test/java/datawave/query/tld/TLDTermFrequencyAggregatorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/tld/TLDTermFrequencyAggregatorTest.java @@ -22,7 +22,6 @@ import org.apache.accumulo.core.data.Value; import org.apache.accumulo.core.iterators.SortedKeyValueIterator; import org.apache.accumulo.core.iteratorsImpl.system.SortedMapIterator; -import org.apache.commons.jexl3.parser.ParseException; import org.junit.Before; import org.junit.Test; @@ -33,7 +32,6 @@ import datawave.query.attributes.Document; import datawave.query.attributes.TypeAttribute; import datawave.query.data.parsers.DatawaveKey; -import datawave.query.jexl.JexlASTHelper; import datawave.query.predicate.EventDataQueryFieldFilter; import datawave.query.predicate.EventDataQueryFilter; import datawave.query.util.TypeMetadata; @@ -47,7 +45,7 @@ public void setup() { } @Test - public void apply_buildDocNotKeep() throws IOException, ParseException { + public void apply_buildDocNotKeep() throws IOException { Document doc = new Document(); AttributeFactory attributeFactory = new AttributeFactory(new TypeMetadata()); @@ -72,7 +70,7 @@ public void apply_buildDocNotKeep() throws IOException, ParseException { Set keepFields = new HashSet<>(); keepFields.add("FIELD2"); - EventDataQueryFilter filter = new EventDataQueryFieldFilter(JexlASTHelper.parseJexlQuery("FIELD2 == 'abc'"), Collections.emptySet()); + EventDataQueryFilter filter = new EventDataQueryFieldFilter().withFields(Collections.singleton("FIELD2")); aggregator = new TLDTermFrequencyAggregator(keepFields, filter, -1); Key result = aggregator.apply(itr, doc, attributeFactory); @@ -88,7 +86,7 @@ public void apply_buildDocNotKeep() throws IOException, ParseException { } @Test - public void apply_buildDocKeep() throws IOException, ParseException { + public void apply_buildDocKeep() throws IOException { Document doc = new Document(); AttributeFactory attributeFactory = new AttributeFactory(new TypeMetadata()); @@ -114,8 +112,7 @@ public void apply_buildDocKeep() throws IOException, ParseException { keepFields.add("FIELD1"); keepFields.add("FIELD2"); - EventDataQueryFilter filter = new EventDataQueryFieldFilter(JexlASTHelper.parseJexlQuery("FIELD1 == 'VALUE1' && FIELD2 == 'VALUE2'"), - 
Collections.emptySet()); + EventDataQueryFilter filter = new EventDataQueryFieldFilter().withFields(Set.of("FIELD1", "FIELD2")); aggregator = new TLDTermFrequencyAggregator(keepFields, filter, -1); Key result = aggregator.apply(itr, doc, attributeFactory); @@ -159,7 +156,7 @@ public void apply_buildDocKeep() throws IOException, ParseException { } @Test - public void apply_buildDocOnlyKeepToKeep() throws IOException, ParseException { + public void apply_buildDocOnlyKeepToKeep() throws IOException { Document doc = new Document(); AttributeFactory attributeFactory = new AttributeFactory(new TypeMetadata()); @@ -174,7 +171,7 @@ public void apply_buildDocOnlyKeepToKeep() throws IOException, ParseException { Set keepFields = new HashSet<>(); keepFields.add("FIELD2"); - EventDataQueryFilter filter = new EventDataQueryFieldFilter(JexlASTHelper.parseJexlQuery("FIELD2 == 'VALUE1'"), Collections.emptySet()); + EventDataQueryFilter filter = new EventDataQueryFieldFilter().withFields(Collections.singleton("FIELD2")); aggregator = new TLDTermFrequencyAggregator(keepFields, filter, -1); Key result = aggregator.apply(itr, doc, attributeFactory); diff --git a/warehouse/query-core/src/test/java/datawave/query/transformer/NoExpansionTests.java b/warehouse/query-core/src/test/java/datawave/query/transformer/NoExpansionTests.java index 340379642d6..d36f1c70ee5 100644 --- a/warehouse/query-core/src/test/java/datawave/query/transformer/NoExpansionTests.java +++ b/warehouse/query-core/src/test/java/datawave/query/transformer/NoExpansionTests.java @@ -149,7 +149,7 @@ private void runTestQuery() throws Exception { // order of terms in planned script is arbitrary, fall back to comparing the jexl trees ASTJexlScript plannedScript = JexlASTHelper.parseJexlQuery(plan); ASTJexlScript expectedScript = JexlASTHelper.parseJexlQuery(this.expectedPlan); - JexlNodeAssert.assertThat(expectedScript).isEqualTo(plannedScript); + JexlNodeAssert.assertThat(plannedScript).isEqualTo(expectedScript); } private AccumuloClient createClient() throws Exception { @@ -179,8 +179,8 @@ private void givenExpectedPlan(String expectedPlan) { */ @Test public void testDefaultQueryModelExpansion() throws Exception { - givenQuery("COLOR == 'blue'"); - givenExpectedPlan("(COLOR == 'blue' || HUE == 'blue')"); + givenQuery("COLOR == 'blue' && FASTENER == 'bolt'"); + givenExpectedPlan("(COLOR == 'blue' || HUE == 'blue') && (FASTENER == 'bolt' || FIXTURE == 'bolt')"); runTestQuery(); } @@ -196,6 +196,17 @@ public void testNoExpansionViaFunction() throws Exception { runTestQuery(); } + /** + * Verify that when #NO_EXPANSION is specified in the query string itself with multiple fields, expansion does not occur. + */ + @Test + public void testNoExpansionViaFunctionWithMultipleFields() throws Exception { + givenQuery("COLOR == 'blue' && FASTENER == 'bolt' && f:noExpansion(COLOR,FASTENER)"); + givenExpectedPlan("COLOR == 'blue' && FASTENER == 'bolt'"); + + runTestQuery(); + } + /** * Verify that when #NO_EXPANSION is specified via the query parameters, expansion does not occur. */ @@ -208,6 +219,18 @@ public void testNoExpansionViaQueryParameters() throws Exception { runTestQuery(); } + /** + * Verify that when #NO_EXPANSION is specified via the query parameters, expansion does not occur. 
+ */ + @Test + public void testNoExpansionViaQueryParametersWithMultipleFields() throws Exception { + givenQuery("COLOR == 'blue' && FASTENER == 'bolt'"); + givenQueryParameter(QueryParameters.NO_EXPANSION_FIELDS, "COLOR,FASTENER"); + givenExpectedPlan("COLOR == 'blue' && FASTENER == 'bolt'"); + + runTestQuery(); + } + /** * Verify that when #NO_EXPANSION is specified in the query string itself and in query parameters, expansion does not occur. */ diff --git a/warehouse/query-core/src/test/java/datawave/query/util/VisibilityWiseGuysIngestWithModel.java b/warehouse/query-core/src/test/java/datawave/query/util/VisibilityWiseGuysIngestWithModel.java index d38f646e984..0e0170ac4e7 100644 --- a/warehouse/query-core/src/test/java/datawave/query/util/VisibilityWiseGuysIngestWithModel.java +++ b/warehouse/query-core/src/test/java/datawave/query/util/VisibilityWiseGuysIngestWithModel.java @@ -776,6 +776,24 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + lcNoDiacriticsType.getClass().getName()), emptyValue); bw.addMutation(mutation); + // for testing #NOEXPANSION function + mutation = new Mutation("FASTENER"); + mutation.put(ColumnFamilyConstants.COLF_E, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_F, new Text(datatype + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(10L))); + mutation.put(ColumnFamilyConstants.COLF_I, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_RI, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + lcNoDiacriticsType.getClass().getName()), emptyValue); + bw.addMutation(mutation); + + // for testing #NOEXPANSION function + mutation = new Mutation("FIXTURE"); + mutation.put(ColumnFamilyConstants.COLF_E, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_F, new Text(datatype + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(10L))); + mutation.put(ColumnFamilyConstants.COLF_I, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_RI, new Text(datatype), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + lcNoDiacriticsType.getClass().getName()), emptyValue); + bw.addMutation(mutation); + } finally { if (null != bw) { bw.close(); @@ -822,6 +840,12 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put("DATAWAVE", "HUE" + "\u0000" + "forward", columnVisibility, timeStamp, emptyValue); bw.addMutation(mutation); + // specifically for testing the #NOEXPANSION function + mutation = new Mutation("FASTENER"); + mutation.put("DATAWAVE", "FASTENER" + "\u0000" + "forward", columnVisibility, timeStamp, emptyValue); + mutation.put("DATAWAVE", "FIXTURE" + "\u0000" + "forward", columnVisibility, timeStamp, emptyValue); + bw.addMutation(mutation); + } finally { if (null != bw) { bw.close(); diff --git a/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java b/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java index 898b6481381..9ed431a66d9 100644 --- a/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java +++ b/warehouse/query-core/src/test/java/datawave/query/util/WiseGuysIngest.java @@ -46,6 +46,7 @@ public enum WhatKindaRange { private static final Type geoType = new GeometryType(); protected static final String datatype = "test"; + protected static 
final String secondDataType = "test2"; protected static final String date = "20130101"; protected static final String shard = date + "_0"; protected static final ColumnVisibility columnVisibility = new ColumnVisibility("ALL"); @@ -56,6 +57,7 @@ public enum WhatKindaRange { public static final String corleoneChildUID = UID.builder().newId("Corleone".getBytes(), (Date) null, "1").toString(); public static final String sopranoUID = UID.builder().newId("Soprano".toString().getBytes(), (Date) null).toString(); public static final String caponeUID = UID.builder().newId("Capone".toString().getBytes(), (Date) null).toString(); + public static final String tattagliaUID = UID.builder().newId("Tattaglia".toString().getBytes(), (Date) null).toString(); protected static String normalizeColVal(Map.Entry colVal) { switch (colVal.getKey()) { @@ -170,6 +172,12 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put(datatype + "\u0000" + caponeUID, "NUMBER" + "\u0000" + "25", columnVisibility, timeStamp, emptyValue); mutation.put(datatype + "\u0000" + caponeUID, "GEO" + "\u0000" + "POINT(30 30)", columnVisibility, timeStamp, emptyValue); + // second datatype shard data + mutation.put(secondDataType + "\u0000" + tattagliaUID, "NAME.0" + "\u0000" + "Philip", columnVisibility, timeStamp, emptyValue); + mutation.put(secondDataType + "\u0000" + tattagliaUID, "GENDER.0" + "\u0000" + "MALE", columnVisibility, timeStamp, emptyValue); + mutation.put(secondDataType + "\u0000" + tattagliaUID, "AGE.0" + "\u0000" + "70", columnVisibility, timeStamp, emptyValue); + mutation.put(secondDataType + "\u0000" + tattagliaUID, "UUID.0" + "\u0000" + "TATTAGLIA", columnVisibility, timeStamp, emptyValue); + bw.addMutation(mutation); } finally { @@ -366,6 +374,23 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw bw.addMutation(mutation); } + // second datatype shard index data + // uuid + mutation = new Mutation(lcNoDiacriticsType.normalize("TATTAGLIA")); + mutation.put("UUID".toUpperCase(), shard + "\u0000" + secondDataType, columnVisibility, timeStamp, + range == WhatKindaRange.SHARD ? getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(tattagliaUID)); + bw.addMutation(mutation); + // names + mutation = new Mutation(lcNoDiacriticsType.normalize("Philip")); + mutation.put("NAME".toUpperCase(), shard + "\u0000" + secondDataType, columnVisibility, timeStamp, + range == WhatKindaRange.SHARD ? getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(tattagliaUID)); + bw.addMutation(mutation); + // ages + mutation = new Mutation(numberType.normalize("70")); + mutation.put("AGE".toUpperCase(), shard + "\u0000" + secondDataType, columnVisibility, timeStamp, + range == WhatKindaRange.SHARD ? getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(tattagliaUID)); + bw.addMutation(mutation); + // add some index-only fields mutation = new Mutation("chicago"); mutation.put("LOCATION", shard + "\u0000" + datatype, columnVisibility, timeStamp, @@ -560,6 +585,10 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put("GENDER".toUpperCase(), shard + "\u0000" + datatype, columnVisibility, timeStamp, range == WhatKindaRange.SHARD ? 
getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(caponeUID)); bw.addMutation(mutation); + mutation = new Mutation(new StringBuilder(lcNoDiacriticsType.normalize("MALE")).reverse()); + mutation.put("GENDER".toUpperCase(), shard + "\u0000" + secondDataType, columnVisibility, timeStamp, + range == WhatKindaRange.SHARD ? getValueForNuthinAndYourHitsForFree() : getValueForBuilderFor(tattagliaUID)); + bw.addMutation(mutation); // ages mutation = new Mutation(new StringBuilder(numberType.normalize("30")).reverse()); mutation.put("AGE".toUpperCase(), shard + "\u0000" + datatype, columnVisibility, timeStamp, @@ -707,6 +736,20 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put("fi\u0000" + "GEO", normalized + "\u0000" + datatype + "\u0000" + corleoneUID, columnVisibility, timeStamp, emptyValue); } + // second datatype field index data + // uuid + mutation.put("fi\u0000" + "UUID", lcNoDiacriticsType.normalize("TATTAGLIA") + "\u0000" + secondDataType + "\u0000" + tattagliaUID, columnVisibility, + timeStamp, emptyValue); + // names + mutation.put("fi\u0000" + "NAME", lcNoDiacriticsType.normalize("PHILIP") + "\u0000" + secondDataType + "\u0000" + tattagliaUID, columnVisibility, + timeStamp, emptyValue); + // genders + mutation.put("fi\u0000" + "GENDER", lcNoDiacriticsType.normalize("MALE") + "\u0000" + secondDataType + "\u0000" + tattagliaUID, columnVisibility, + timeStamp, emptyValue); + // ages + mutation.put("fi\u0000" + "AGE", numberType.normalize("70") + "\u0000" + secondDataType + "\u0000" + tattagliaUID, columnVisibility, timeStamp, + emptyValue); + // add some index-only fields mutation.put("fi\u0000" + "LOCATION", "chicago" + "\u0000" + datatype + "\u0000" + caponeUID, columnVisibility, timeStamp, emptyValue); mutation.put("fi\u0000" + "POSIZIONE", "newyork" + "\u0000" + datatype + "\u0000" + corleoneUID, columnVisibility, timeStamp, emptyValue); @@ -736,6 +779,14 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + normalizerForColumn("NAME")), emptyValue); bw.addMutation(mutation); + mutation = new Mutation("NAME"); + mutation.put(ColumnFamilyConstants.COLF_E, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_F, new Text(secondDataType + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(10L))); + mutation.put(ColumnFamilyConstants.COLF_I, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_RI, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_T, new Text(secondDataType + "\u0000" + normalizerForColumn("NAME")), emptyValue); + bw.addMutation(mutation); + mutation = new Mutation("NOME"); mutation.put(ColumnFamilyConstants.COLF_E, new Text(datatype), emptyValue); mutation.put(ColumnFamilyConstants.COLF_F, new Text(datatype + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(19L))); @@ -752,6 +803,14 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + normalizerForColumn("GENDER")), emptyValue); bw.addMutation(mutation); + mutation = new Mutation("GENDER"); + mutation.put(ColumnFamilyConstants.COLF_E, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_F, new Text(secondDataType + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(11L))); + 
mutation.put(ColumnFamilyConstants.COLF_I, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_RI, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_T, new Text(secondDataType + "\u0000" + normalizerForColumn("GENDER")), emptyValue); + bw.addMutation(mutation); + mutation = new Mutation("GENERE"); mutation.put(ColumnFamilyConstants.COLF_E, new Text(datatype), emptyValue); mutation.put(ColumnFamilyConstants.COLF_F, new Text(datatype + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(21L))); @@ -768,6 +827,14 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + normalizerForColumn("AGE")), emptyValue); bw.addMutation(mutation); + mutation = new Mutation("AGE"); + mutation.put(ColumnFamilyConstants.COLF_E, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_F, new Text(secondDataType + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(12L))); + mutation.put(ColumnFamilyConstants.COLF_I, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_RI, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_T, new Text(secondDataType + "\u0000" + normalizerForColumn("AGE")), emptyValue); + bw.addMutation(mutation); + mutation = new Mutation("ETA"); mutation.put(ColumnFamilyConstants.COLF_E, new Text(datatype), emptyValue); mutation.put(ColumnFamilyConstants.COLF_F, new Text(datatype + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(22L))); @@ -810,6 +877,14 @@ public static void writeItAll(AccumuloClient client, WhatKindaRange range) throw mutation.put(ColumnFamilyConstants.COLF_T, new Text(datatype + "\u0000" + normalizerForColumn("UUID")), emptyValue); bw.addMutation(mutation); + mutation = new Mutation("UUID"); + mutation.put(ColumnFamilyConstants.COLF_E, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_F, new Text(secondDataType + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(3L))); + mutation.put(ColumnFamilyConstants.COLF_I, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_RI, new Text(secondDataType), emptyValue); + mutation.put(ColumnFamilyConstants.COLF_T, new Text(secondDataType + "\u0000" + normalizerForColumn("UUID")), emptyValue); + bw.addMutation(mutation); + mutation = new Mutation("BIRTH_DATE"); mutation.put(ColumnFamilyConstants.COLF_E, new Text(datatype), emptyValue); mutation.put(ColumnFamilyConstants.COLF_F, new Text(datatype + "\u0000" + date), new Value(SummingCombiner.VAR_LEN_ENCODER.encode(3L))); diff --git a/warehouse/query-core/src/test/resources/DATAWAVE_EDGE.xml b/warehouse/query-core/src/test/resources/DATAWAVE_EDGE.xml index 31d6cedfd7f..b9015c9da49 100644 --- a/warehouse/query-core/src/test/resources/DATAWAVE_EDGE.xml +++ b/warehouse/query-core/src/test/resources/DATAWAVE_EDGE.xml @@ -9,26 +9,26 @@ name representation as needed for the deployment environment's default query syntax. 
--> - + - - + + - - + + - - + + - - + + - - + + - - + + diff --git a/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml index 33c37ac318d..f652a4731b5 100644 --- a/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml +++ b/warehouse/query-core/src/test/resources/datawave/query/EventQueryLogicFactory.xml @@ -128,6 +128,12 @@ + + + + + +
diff --git a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml index c7c3d8026c6..7c043448b9e 100644 --- a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml +++ b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml @@ -231,6 +231,25 @@ + + + + + + + + + + + + + + + + + + + @@ -242,7 +261,7 @@ - + @@ -262,12 +281,14 @@ - - - - + + + + + + @@ -275,6 +296,16 @@ datawave.query.function.NormalizedVersionPredicate + + + + + + + + + + diff --git a/warehouse/regression-testing/pom.xml b/warehouse/regression-testing/pom.xml index 25f301bcc37..ed69c0dbe63 100644 --- a/warehouse/regression-testing/pom.xml +++ b/warehouse/regression-testing/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-regression-testing ${project.artifactId} diff --git a/warehouse/ssdeep-common/pom.xml b/warehouse/ssdeep-common/pom.xml index 6bbbe8e2639..d26a9c37724 100644 --- a/warehouse/ssdeep-common/pom.xml +++ b/warehouse/ssdeep-common/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-warehouse-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ssdeep-common diff --git a/web-services/accumulo/pom.xml b/web-services/accumulo/pom.xml index 5bf74d169fb..03802f0ec9a 100644 --- a/web-services/accumulo/pom.xml +++ b/web-services/accumulo/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-accumulo ejb diff --git a/web-services/atom/pom.xml b/web-services/atom/pom.xml index 6eff4950ac3..2507c685dc0 100644 --- a/web-services/atom/pom.xml +++ b/web-services/atom/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-atom ejb diff --git a/web-services/cached-results/pom.xml b/web-services/cached-results/pom.xml index ec179935f24..a5f641cf457 100644 --- a/web-services/cached-results/pom.xml +++ b/web-services/cached-results/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-cached-results ejb diff --git a/web-services/client/pom.xml b/web-services/client/pom.xml index b0c92344377..f41f319a63a 100644 --- a/web-services/client/pom.xml +++ b/web-services/client/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-client jar diff --git a/web-services/common-util/pom.xml b/web-services/common-util/pom.xml index 67eec918cf0..d4cd0da1cdf 100644 --- a/web-services/common-util/pom.xml +++ b/web-services/common-util/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-common-util jar diff --git a/web-services/common/pom.xml b/web-services/common/pom.xml index 86bc5fbe4d3..8dbcf946246 100644 --- a/web-services/common/pom.xml +++ b/web-services/common/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-common ejb diff --git a/web-services/deploy/application/pom.xml b/web-services/deploy/application/pom.xml index e1d77c2ab4f..8ab7e98f66a 100644 --- a/web-services/deploy/application/pom.xml +++ b/web-services/deploy/application/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-deploy-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-deploy-application ear diff --git a/web-services/deploy/configuration/pom.xml 
b/web-services/deploy/configuration/pom.xml index edbf42b5480..1ed563542fd 100644 --- a/web-services/deploy/configuration/pom.xml +++ b/web-services/deploy/configuration/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-deploy-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-deploy-configuration jar diff --git a/web-services/deploy/configuration/src/main/resources/datawave/mapreduce/MapReduceJobs.xml b/web-services/deploy/configuration/src/main/resources/datawave/mapreduce/MapReduceJobs.xml index b3a04ec7bb6..3817a39f851 100644 --- a/web-services/deploy/configuration/src/main/resources/datawave/mapreduce/MapReduceJobs.xml +++ b/web-services/deploy/configuration/src/main/resources/datawave/mapreduce/MapReduceJobs.xml @@ -15,7 +15,7 @@ - org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat + org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat datawave.mr.bulk.BulkInputFormat diff --git a/web-services/deploy/docs/pom.xml b/web-services/deploy/docs/pom.xml index ce30b0d9762..1a7e1179d66 100644 --- a/web-services/deploy/docs/pom.xml +++ b/web-services/deploy/docs/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-deploy-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-deploy-docs war diff --git a/web-services/deploy/pom.xml b/web-services/deploy/pom.xml index 9e459ca18d2..55212c43e2e 100644 --- a/web-services/deploy/pom.xml +++ b/web-services/deploy/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT gov.nsa.datawave.webservices datawave-ws-deploy-parent diff --git a/web-services/deploy/spring-framework-integration/pom.xml b/web-services/deploy/spring-framework-integration/pom.xml index a36af3ace72..9af6ee24e0a 100644 --- a/web-services/deploy/spring-framework-integration/pom.xml +++ b/web-services/deploy/spring-framework-integration/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-deploy-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT spring-framework-integration ${project.artifactId} diff --git a/web-services/dictionary/pom.xml b/web-services/dictionary/pom.xml index 7290e8899aa..c1ed7962d59 100644 --- a/web-services/dictionary/pom.xml +++ b/web-services/dictionary/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-dictionary ejb diff --git a/web-services/examples/client-login/pom.xml b/web-services/examples/client-login/pom.xml index 7a01e015ae6..4f1e485c578 100644 --- a/web-services/examples/client-login/pom.xml +++ b/web-services/examples/client-login/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-examples-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-examples-client-login ejb diff --git a/web-services/examples/http-client/pom.xml b/web-services/examples/http-client/pom.xml index da675d6bb41..d564122e6ed 100644 --- a/web-services/examples/http-client/pom.xml +++ b/web-services/examples/http-client/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-examples-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-examples-http-client jar diff --git a/web-services/examples/jms-client/pom.xml b/web-services/examples/jms-client/pom.xml index f8bb71fe039..7cf824d5209 100644 --- a/web-services/examples/jms-client/pom.xml +++ b/web-services/examples/jms-client/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-examples-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-examples-jms-client jar diff --git a/web-services/examples/pom.xml 
b/web-services/examples/pom.xml index 93d79955d49..86282126945 100644 --- a/web-services/examples/pom.xml +++ b/web-services/examples/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-examples-parent pom diff --git a/web-services/examples/query-war/pom.xml b/web-services/examples/query-war/pom.xml index 45b942d0dcc..aecdd4c8f9d 100644 --- a/web-services/examples/query-war/pom.xml +++ b/web-services/examples/query-war/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-examples-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-examples-query-war war diff --git a/web-services/map-reduce-embedded/pom.xml b/web-services/map-reduce-embedded/pom.xml index a7b1984331c..76341e64cd3 100644 --- a/web-services/map-reduce-embedded/pom.xml +++ b/web-services/map-reduce-embedded/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-map-reduce-embedded jar diff --git a/web-services/map-reduce-status/pom.xml b/web-services/map-reduce-status/pom.xml index 9cf22c6d2c0..305adbea0a0 100644 --- a/web-services/map-reduce-status/pom.xml +++ b/web-services/map-reduce-status/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-map-reduce-status ejb diff --git a/web-services/map-reduce/pom.xml b/web-services/map-reduce/pom.xml index 77f24fb3e39..61c7dd9e569 100644 --- a/web-services/map-reduce/pom.xml +++ b/web-services/map-reduce/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-map-reduce ejb diff --git a/web-services/map-reduce/src/main/java/datawave/webservice/mr/MapReduceBean.java b/web-services/map-reduce/src/main/java/datawave/webservice/mr/MapReduceBean.java index 4fa9026af48..d0943b3d0d4 100644 --- a/web-services/map-reduce/src/main/java/datawave/webservice/mr/MapReduceBean.java +++ b/web-services/map-reduce/src/main/java/datawave/webservice/mr/MapReduceBean.java @@ -43,6 +43,7 @@ import javax.ws.rs.core.MultivaluedMap; import javax.ws.rs.core.StreamingOutput; +import org.apache.accumulo.core.client.mapred.AccumuloInputFormat; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; import org.apache.commons.lang.StringUtils; diff --git a/web-services/metrics/pom.xml b/web-services/metrics/pom.xml index 7d3991321ef..a3cf54ab50c 100644 --- a/web-services/metrics/pom.xml +++ b/web-services/metrics/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-metrics ejb diff --git a/web-services/model/pom.xml b/web-services/model/pom.xml index 5d4d194dc2b..20578156126 100644 --- a/web-services/model/pom.xml +++ b/web-services/model/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-model ejb diff --git a/web-services/modification/pom.xml b/web-services/modification/pom.xml index 483e43267d8..ac6f499950e 100644 --- a/web-services/modification/pom.xml +++ b/web-services/modification/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-modification ejb diff --git a/web-services/pom.xml b/web-services/pom.xml index d849a55a3d3..1acd2fa7ed6 100644 --- a/web-services/pom.xml +++ b/web-services/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave datawave-parent - 7.7.0-SNAPSHOT 
+ 7.11.0-SNAPSHOT gov.nsa.datawave.webservices datawave-ws-parent diff --git a/web-services/query-websocket/pom.xml b/web-services/query-websocket/pom.xml index 5319f91823c..b564e60d122 100644 --- a/web-services/query-websocket/pom.xml +++ b/web-services/query-websocket/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-query-websocket war diff --git a/web-services/query/pom.xml b/web-services/query/pom.xml index 674784e811b..bf4c9eca70c 100644 --- a/web-services/query/pom.xml +++ b/web-services/query/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-query ejb diff --git a/web-services/query/src/main/java/datawave/webservice/query/runner/QueryExecutorBean.java b/web-services/query/src/main/java/datawave/webservice/query/runner/QueryExecutorBean.java index 4044facb91e..ddb81dc9a7d 100644 --- a/web-services/query/src/main/java/datawave/webservice/query/runner/QueryExecutorBean.java +++ b/web-services/query/src/main/java/datawave/webservice/query/runner/QueryExecutorBean.java @@ -2997,6 +2997,26 @@ private void updateQueryParams(Query q, String queryLogicName, String query, Dat } } + /** + * @param queryLogicName + * the logic name + * @param queryParameters + * the query parameters + * @return the generic response + */ + @POST + @Produces({"application/xml", "text/xml", "application/json", "text/yaml", "text/x-yaml", "application/x-yaml", "application/x-protobuf", + "application/x-protostuff"}) + @Path("/{logicName}/validate") + @Interceptors({RequiredInterceptor.class, ResponseInterceptor.class}) + @Timed(name = "dw.query.validateQuery", absolute = true) + public GenericResponse validateQuery(@Required("logicName") @PathParam("logicName") String queryLogicName, + MultivaluedMap queryParameters) { + GenericResponse response = new GenericResponse<>(); + response.setMessages(Collections.singletonList("Query validator coming soon.")); + throw new DatawaveWebApplicationException(new UnsupportedOperationException("Query validator not implemented"), response, 501); + } + /** * Administrator credentials required. Returns list of queries for some other user * diff --git a/web-services/rest-api/pom.xml b/web-services/rest-api/pom.xml index bb521e12f4d..c60baedfca9 100644 --- a/web-services/rest-api/pom.xml +++ b/web-services/rest-api/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-rest-api war diff --git a/web-services/security/pom.xml b/web-services/security/pom.xml index 3ad449774ab..8ae0a67c4cb 100644 --- a/web-services/security/pom.xml +++ b/web-services/security/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-security ejb diff --git a/web-services/web-root/pom.xml b/web-services/web-root/pom.xml index 8a8c5f68e28..85b45b975fd 100644 --- a/web-services/web-root/pom.xml +++ b/web-services/web-root/pom.xml @@ -4,7 +4,7 @@ gov.nsa.datawave.webservices datawave-ws-parent - 7.7.0-SNAPSHOT + 7.11.0-SNAPSHOT datawave-ws-web-root war