diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dda6180
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+.svn
+target
+dependency-reduced-pom.xml
+.idea
+.classpath
+.project
+.settings
+*.iml
+*.ipr
+*.iws
+*.bin
+nbactions.xml
+nb-configuration.xml
+*.DS_Store
+*.tmp-inception
+*.snap
+.*.swp
+tika-deployment/tika-snap-app/parts/
+tika-deployment/tika-snap-app/prime/
+tika-deployment/tika-snap-app/snap/
+tika-deployment/tika-snap-app/stage/
+tika-deployment/tika-snap-app/test/
+tika-deployment/tika-snap-server/parts/
+tika-deployment/tika-snap-server/prime/
+tika-deployment/tika-snap-server/snap/
+tika-deployment/tika-snap-server/stage/
+
diff --git a/.travis.ci.yml b/.travis.ci.yml
new file mode 100644
index 0000000..913a0bc
--- /dev/null
+++ b/.travis.ci.yml
@@ -0,0 +1,11 @@
+# NOTE(review): Travis CI reads its configuration from .travis.yml; confirm this file name is intentional.
+language: bash
+services: docker
+env:
+  matrix:
+    - VERSION=1.23
+    - VERSION=1.22
+script:
+  # The script is not on PATH, so it is invoked relative to the repository root.
+  - ./docker-tool.sh build "$VERSION"
+  - ./docker-tool.sh test "$VERSION"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..773b484
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# tika-docker
+
+Work in Progress
\ No newline at end of file
diff --git a/docker-tool.sh b/docker-tool.sh
new file mode 100755
index 0000000..39e02c3
--- /dev/null
+++ b/docker-tool.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Builds, tests and (eventually) publishes the apache/tika Docker images.
+#
+# Usage: docker-tool.sh [-h] <build|test|publish> <version>
+#
+# The Dockerfiles are read from the relative paths minimal/Dockerfile and
+# full/Dockerfile, so run this script from the directory that contains them.
+
+# Prints the command-line help to stdout.
+usage() {
+  echo "Usage:"
+  echo "    docker-tool.sh -h                      Display this help message."
+  echo "    docker-tool.sh build <version>         Builds image(s) for <version>."
+  echo "    docker-tool.sh test <version>          Tests image(s) for <version>."
+  echo "    docker-tool.sh publish <version>       Publishes image(s) for <version> to Docker Hub."
+}
+
+# Aborts with the usage text when the <version> argument is missing.
+require_version() {
+  if [ -z "$version" ]; then
+    echo "Missing <version> argument" 1>&2
+    usage 1>&2
+    exit 1
+  fi
+}
+
+# Smoke-tests one image: starts apache/tika:$1 in a container named $1, polls
+# the /version endpoint for up to 30 attempts and then removes the container.
+# Returns non-zero when the server never answers so that CI runs can fail.
+test_docker_image() {
+  local tag=$1
+  local status=1
+  local attempt
+  if ! docker run -d --name "$tag" -p 9998:9998 "apache/tika:$tag"; then
+    # A failed start can still leave a created container behind.
+    docker rm -f "$tag" > /dev/null 2>&1
+    return 1
+  fi
+  for attempt in {1..30}; do
+    # -f maps HTTP errors to a non-zero exit; -s hides connection-refused noise.
+    if curl -fs --max-time 5 http://localhost:9998/version; then
+      echo
+      status=0
+      break
+    fi
+    sleep 1
+  done
+  if [ "$status" -ne 0 ]; then
+    echo "apache/tika:$tag did not answer after $attempt attempts" 1>&2
+  fi
+  # Always clean up, even on failure, so the name and port can be reused.
+  docker kill "$tag"
+  docker rm "$tag"
+  return "$status"
+}
+
+while getopts ":h" opt; do
+  case ${opt} in
+    h )
+      usage
+      exit 0
+      ;;
+    \? )
+      echo "Invalid Option: -$OPTARG" 1>&2
+      exit 1
+      ;;
+  esac
+done
+
+shift $((OPTIND -1))
+# Default to empty strings so a missing argument is reported, not ignored.
+subcommand=${1:-}
+version=${2:-}
+
+case "$subcommand" in
+  build)
+    require_version
+    # Build slim version with minimal dependencies
+    docker build -t "apache/tika:${version}" --build-arg TIKA_VERSION="${version}" - < minimal/Dockerfile || exit 1
+    # Build full version with OCR, Fonts and GDAL
+    docker build -t "apache/tika:${version}-full" --build-arg TIKA_VERSION="${version}" - < full/Dockerfile || exit 1
+    ;;
+
+  test)
+    require_version
+    # Test minimal image
+    test_docker_image "${version}" || exit 1
+    # Test full image
+    test_docker_image "${version}-full" || exit 1
+    ;;
+
+  publish)
+    echo "Does nothing until we get Docker Hub access setup under Apache Organisation"
+    ;;
+
+  *)
+    echo "Unknown or missing subcommand: ${subcommand}" 1>&2
+    usage 1>&2
+    exit 1
+    ;;
+
+esac
diff --git a/full/Dockerfile b/full/Dockerfile
new file mode 100644
index 0000000..a9ad964
--- /dev/null
+++ b/full/Dockerfile
@@ -0,0 +1,87 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+
+# Full Tika server image: headless JRE plus Tesseract OCR, GDAL and fonts.
+# Build with: docker build --build-arg TIKA_VERSION=<version> - < full/Dockerfile
+FROM ubuntu:bionic AS base
+
+FROM base AS dependencies
+
+# update, install and cleanup share one layer so the package index is never
+# stale and never ends up in the image. The EULA answer has to be preseeded
+# before ttf-mscorefonts-installer is installed non-interactively.
+RUN apt-get update \
+    && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \
+    && DEBIAN_FRONTEND=noninteractive apt-get -y install \
+        cabextract \
+        curl \
+        fonts-freefont-ttf \
+        fonts-liberation \
+        gdal-bin \
+        openjdk-11-jre-headless \
+        tesseract-ocr \
+        tesseract-ocr-deu \
+        tesseract-ocr-eng \
+        tesseract-ocr-fra \
+        tesseract-ocr-ita \
+        tesseract-ocr-spa \
+        ttf-mscorefonts-installer \
+        wget \
+        xfonts-utils \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Throw-away stage: downloads the server jar and verifies its PGP signature.
+FROM dependencies AS fetch_tika
+ARG TIKA_VERSION
+
+ENV NEAREST_TIKA_SERVER_URL="https://www.apache.org/dyn/closer.cgi/tika/tika-server-${TIKA_VERSION}.jar?filename=tika/tika-server-${TIKA_VERSION}.jar&action=download" \
+    ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar" \
+    DEFAULT_TIKA_SERVER_ASC_URL="https://www.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar.asc" \
+    ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar.asc" \
+    TIKA_VERSION=$TIKA_VERSION
+
+# Each download is grouped with its archive fallback in { ...; } because && and
+# || have equal precedence in sh; without grouping an early failure would be
+# swallowed by a later "||". wget -O truncates its target, so a failed first
+# attempt needs no explicit rm before the fallback runs.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get -y install gnupg2 curl wget \
+    && curl -fsSL https://www.apache.org/dist/tika/KEYS -o /tmp/KEYS \
+    && gpg --import /tmp/KEYS \
+    && echo "Nearest mirror: $NEAREST_TIKA_SERVER_URL" \
+    && { wget "$NEAREST_TIKA_SERVER_URL" -O "/tika-server-${TIKA_VERSION}.jar" \
+        || wget "$ARCHIVE_TIKA_SERVER_URL" -O "/tika-server-${TIKA_VERSION}.jar"; } \
+    && { wget "$DEFAULT_TIKA_SERVER_ASC_URL" -O "/tika-server-${TIKA_VERSION}.jar.asc" \
+        || wget "$ARCHIVE_TIKA_SERVER_ASC_URL" -O "/tika-server-${TIKA_VERSION}.jar.asc"; } \
+    && gpg --verify "/tika-server-${TIKA_VERSION}.jar.asc" "/tika-server-${TIKA_VERSION}.jar"
+
+FROM dependencies AS runtime
+ARG TIKA_VERSION
+# Kept as ENV because the entrypoint resolves the jar name at container start.
+ENV TIKA_VERSION=$TIKA_VERSION
+
+# Dedicated unprivileged account with a fixed UID/GID for the server process.
+RUN groupadd --gid 35002 tika \
+    && useradd --uid 35002 --gid 35002 --no-create-home --shell /usr/sbin/nologin tika
+
+COPY --from=fetch_tika /tika-server-${TIKA_VERSION}.jar /tika-server-${TIKA_VERSION}.jar
+
+# Numeric form so orchestrators can verify the container does not run as root.
+USER 35002:35002
+EXPOSE 9998
+# Exec form plus "exec" makes java PID 1 so it receives SIGTERM from
+# "docker stop"; the shell is only needed to expand ${TIKA_VERSION}. Extra
+# "docker run" arguments are appended to the server command line.
+ENTRYPOINT ["/bin/sh", "-c", "exec java -jar /tika-server-${TIKA_VERSION}.jar -h 0.0.0.0 \"$@\"", "tika-server"]
+
+LABEL maintainer="Apache Tika Developers dev@tika.apache.org"
diff --git a/minimal/Dockerfile b/minimal/Dockerfile
new file mode 100644
index 0000000..917ef38
--- /dev/null
+++ b/minimal/Dockerfile
@@ -0,0 +1,70 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+
+# Minimal Tika server image: only the headless JRE and the server jar.
+# Build with: docker build --build-arg TIKA_VERSION=<version> - < minimal/Dockerfile
+FROM ubuntu:bionic AS base
+
+FROM base AS dependencies
+
+# update, install and cleanup share one layer so the package index is never
+# stale and never ends up in the image.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get -y install openjdk-11-jre-headless \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Throw-away stage: downloads the server jar and verifies its PGP signature.
+FROM dependencies AS fetch_tika
+ARG TIKA_VERSION
+
+ENV NEAREST_TIKA_SERVER_URL="https://www.apache.org/dyn/closer.cgi/tika/tika-server-${TIKA_VERSION}.jar?filename=tika/tika-server-${TIKA_VERSION}.jar&action=download" \
+    ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar" \
+    DEFAULT_TIKA_SERVER_ASC_URL="https://www.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar.asc" \
+    ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar.asc" \
+    TIKA_VERSION=$TIKA_VERSION
+
+# Each download is grouped with its archive fallback in { ...; } because && and
+# || have equal precedence in sh; without grouping an early failure would be
+# swallowed by a later "||". wget -O truncates its target, so a failed first
+# attempt needs no explicit rm before the fallback runs.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get -y install gnupg2 curl wget \
+    && curl -fsSL https://www.apache.org/dist/tika/KEYS -o /tmp/KEYS \
+    && gpg --import /tmp/KEYS \
+    && echo "Nearest mirror: $NEAREST_TIKA_SERVER_URL" \
+    && { wget "$NEAREST_TIKA_SERVER_URL" -O "/tika-server-${TIKA_VERSION}.jar" \
+        || wget "$ARCHIVE_TIKA_SERVER_URL" -O "/tika-server-${TIKA_VERSION}.jar"; } \
+    && { wget "$DEFAULT_TIKA_SERVER_ASC_URL" -O "/tika-server-${TIKA_VERSION}.jar.asc" \
+        || wget "$ARCHIVE_TIKA_SERVER_ASC_URL" -O "/tika-server-${TIKA_VERSION}.jar.asc"; } \
+    && gpg --verify "/tika-server-${TIKA_VERSION}.jar.asc" "/tika-server-${TIKA_VERSION}.jar"
+
+FROM dependencies AS runtime
+ARG TIKA_VERSION
+# Kept as ENV because the entrypoint resolves the jar name at container start.
+ENV TIKA_VERSION=$TIKA_VERSION
+
+# Dedicated unprivileged account with a fixed UID/GID for the server process.
+RUN groupadd --gid 35002 tika \
+    && useradd --uid 35002 --gid 35002 --no-create-home --shell /usr/sbin/nologin tika
+
+COPY --from=fetch_tika /tika-server-${TIKA_VERSION}.jar /tika-server-${TIKA_VERSION}.jar
+
+# Numeric form so orchestrators can verify the container does not run as root.
+USER 35002:35002
+EXPOSE 9998
+# Exec form plus "exec" makes java PID 1 so it receives SIGTERM from
+# "docker stop"; the shell is only needed to expand ${TIKA_VERSION}. Extra
+# "docker run" arguments are appended to the server command line.
+ENTRYPOINT ["/bin/sh", "-c", "exec java -jar /tika-server-${TIKA_VERSION}.jar -h 0.0.0.0 \"$@\"", "tika-server"]
+
+LABEL maintainer="Apache Tika Developers dev@tika.apache.org"