diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..951c281
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,11 @@
+*.swo
+*.swp
+.git
+.DS_Store
+node_modules
+bin
+build
+scripts
+coverage
+.nyc_output
+.
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 5ed0a20..673354f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ node_modules/
 .DS_Store
 yarn.lock
 *.tar.gz
+build
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5475b1e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,16 @@
+FROM lambci/lambda:build-nodejs8.10
+
+RUN yum install -y autoconf aclocal automake install libtool libjpeg-devel \
+    libpng-devel libtiff-devel zlib-devel wget gzip make cmakegcc freetype-devel \
+    gcc gcc-c++ git lcms2-devel libjpeg-turbo-devel autogen libpng-devel \
+    libtiff-devel libwebp-devel libzip-devel zlib-devel libgcc
+
+RUN yum groupinstall "Development Tools" -y
+
+RUN yum install -y cmake
+
+COPY . .
+
+RUN wget https://github.com/google/brotli/archive/v1.0.7.tar.gz
+RUN tar -zxvf v1.0.7.tar.gz
+RUN cd brotli-1.0.7 && mkdir out && cd out && ../configure-cmake && make && make test && make install
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..265e4aa
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,22 @@
+NAME=aws-lambda-tesseract
+PWD=$(shell pwd)
+
+# Full pipeline: build the Docker image, compile Tesseract, then brotli-compress the result.
+build: build-docker build-tesseract compress-tesseract
+
+# Same as `build` but skips rebuilding the Docker image.
+build-dist: build-tesseract compress-tesseract
+
+build-docker:
+	docker build -f Dockerfile -t $(NAME) .
+
+version:
+	@echo $(shell git rev-parse HEAD)
+
+# ./build is mounted at /build; compile-tesseract.sh moves tesseract.tar.gz there.
+build-tesseract:
+	docker run -it -v $(PWD)/scripts:/scripts -v $(PWD)/build:/build $(NAME) /scripts/compile-tesseract.sh
+
+# The script lives at scripts/compress-brotli.sh and runs brotli on /build/tt.tar.
+compress-tesseract:
+	docker run -it -v $(PWD)/scripts:/scripts -v $(PWD)/build:/build $(NAME) /scripts/compress-brotli.sh
diff --git a/compile-tesseract.sh b/compile-tesseract.sh
deleted file mode 100644
index e108033..0000000
--- a/compile-tesseract.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/usr/bin/env bash
-
-# install basic stuff required for compilation
-sudo yum-config-manager --enable epel
-
-sudo yum install -y aclocal autoconf automake cmakegcc freetype-devel gcc gcc-c++ \
-  git lcms2-devel libjpeg-devel libjpeg-turbo-devel autogen autoconf libtool \
-  libpng-devel libtiff-devel libtool libwebp-devel libzip-devel make zlib-devel
-sudo yum groupinstall "Development Tools" -y
-
-# autoconf
-cd ~
-wget http://babyname.tips/mirrors/gnu/autoconf-archive/autoconf-archive-2017.09.28.tar.xz
-tar -xvf autoconf-archive-2017.09.28.tar.xz
-cd autoconf-archive-2017.09.28
-./configure && make && sudo make install
-sudo cp m4/* /usr/share/aclocal/cd ~ wget http://babynam
-
-# leptonica
-cd ~
-git clone https://github.com/DanBloomberg/leptonica.git
-cd leptonica/
-./autogen.sh
-./configure
-make
-sudo make install
-
-# tesseract
-cd ~
-git clone https://github.com/tesseract-ocr/tesseract.git
-cd tesseract
-export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
-./autogen.sh
-./configure
-make
-sudo make install
-
-cd ~
-mkdir tesseract-standalone
-
-# trim unneeded ~ 15 MB
-strip ./tesseract-standalone/**/*
-
-# copy files
-cd tesseract-standalone
-cp /usr/local/bin/tesseract .
-mkdir lib
-cp /usr/local/lib/libtesseract.so.4 lib/
-cp /usr/local/lib/liblept.so.5 lib/
-cp /usr/lib64/libjpeg.so.62 lib/
-cp /usr/lib64/libwebp.so.4 lib/
-cp /usr/lib64/libstdc++.so.6 lib/
-
-# copy training data
-mkdir tessdata
-cd tessdata
-wget https://github.com/tesseract-ocr/tessdata_fast/raw/master/eng.traineddata
-
-# archive
-cd ~
-tar -zcvf tesseract.tar.gz tesseract-standalone
-
-# download from EC2 to local machine
-scp ec2-user@ec2-54-162-129-95.compute-1.amazonaws.com:/home/ec2-user/tesseract.tar.gz $(pwd)
-
-# run compress-with-brotli.sh on local machine now
diff --git a/compress-with-brotli.sh b/compress-with-brotli.sh
deleted file mode 100644
index 833a9a5..0000000
--- a/compress-with-brotli.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env bash
-
-tar -cf tt.tar ./tesseract
-brotli --best --force ./tt.tar
diff --git a/package.json b/package.json
index b24227e..32474a5 100644
--- a/package.json
+++ b/package.json
@@ -29,6 +29,7 @@
   ],
   "dependencies": {
     "@shelf/aws-lambda-brotli-unpacker": "0.0.2",
+    "csv-parse": "^4.3.0",
     "is-image": "2.0.0"
   },
   "devDependencies": {
diff --git a/readme.md b/readme.md
index 544b614..6d237d0 100644
--- a/readme.md
+++ b/readme.md
@@ -18,18 +18,49 @@ When a Lambda starts, it unpacks an archive with a binary to the `/tmp` folder a
 
 ## Usage
 
+#### Using a path
+
 ```js
 const {getTextFromImage, isSupportedFile} = require('@shelf/aws-lambda-tesseract');
 
 module.exports.handler = async event => {
   // assuming there is a photo.jpg inside /tmp dir
-  // original file will be deleted afterwards
-
   if (!isSupportedFile('/tmp/photo.jpg')) {
     return false;
   }
 
-  return getTextFromImage('/tmp/photo.jpg');
+  getTextFromImage('/tmp/photo.jpg').then(result => console.log(result));
+};
+```
+
+#### Using a stream
+
+This is useful when you want to stream the file data from a remote source like a URL.
+
+```js
+const https = require('https');
+const {getTextFromImage, isSupportedFile} = require('@shelf/aws-lambda-tesseract');
+
+module.exports.handler = async event => {
+  // assuming that the url exists and is readable.
+  const url = 'https://cdn-std.dprcdn.net/files/acc_55602/9X4IIL';
+  const fileStream = await new Promise(resolve => https.get(url, resolve));
+  getTextFromImage(fileStream).then(result => console.log(result));
+};
+```
+
+#### Extracting words and their coordinates
+
+The `getWordsAndBounds` function resolves with an array of objects, one per row of Tesseract's TSV output, describing the extracted words and their coordinates on the page.
+
+```js
+const {getWordsAndBounds} = require('@shelf/aws-lambda-tesseract');
+
+module.exports.handler = async event => {
+  // assuming that photo.jpg exists and is readable.
+  const file = require('fs').createReadStream(__dirname + '/photo.jpg');
+
+  getWordsAndBounds(file).then(result => console.log(result));
 };
 ```
 
@@ -38,7 +69,15 @@ unsupported by Tesseract file extensions.
 
 ## Compile It Yourself
 
-See [compile-tesseract.sh](compile-tesseract.sh) & [compress-with-brotli.sh](compress-with-brotli.sh) files
+Compile Tesseract for deployment on Lambda. Requires [Docker](https://www.docker.com/) & [Make](https://www.gnu.org/software/make/manual/html_node/Introduction.html) to be installed.
+
+`$ make build`: Builds Docker image, compiles Tesseract 4.0.0, and compresses result into the `tt.tar.br` archive.
+
+`$ make build-tesseract`: Compiles Tesseract 4.0.0 and creates `tesseract.tar.gz` file as output.
+
+`$ make compress-tesseract`: Runs brotli compression on built Tesseract and compresses `tesseract.tar.gz` into `tt.tar.br`.
+
+**Note:** After compiling and compressing you need to copy the latest `tt.tar.br` into the `./bin` directory. `$ cp ./build/tt.tar.br ./bin`
 
 ## See Also
 
diff --git a/scripts/compile-tesseract.sh b/scripts/compile-tesseract.sh
new file mode 100755
index 0000000..3229bb3
--- /dev/null
+++ b/scripts/compile-tesseract.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+echo "Building"
+# Build leptonica
+wget http://www.leptonica.com/source/leptonica-1.77.0.tar.gz
+tar -zxvf leptonica-1.77.0.tar.gz
+ls -la ./
+cd leptonica-1.77.0
+ls -la
+./configure
+make
+make install
+
+# Build tesseract 4.0
+cd ..
+wget https://github.com/tesseract-ocr/tesseract/archive/4.0.0.tar.gz
+tar -zxvf 4.0.0.tar.gz
+cd tesseract-4.0.0/
+export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
+./autogen.sh
+./configure
+make
+make install
+ldconfig
+
+cd ~
+mkdir tesseract-standalone
+
+# copy files
+cd tesseract-standalone
+cp /usr/local/bin/tesseract .
+mkdir lib
+cp /usr/local/lib/libtesseract.so.4 lib/
+cp /usr/local/lib/liblept.so.5 lib/
+# cp /usr/lib64/* lib/
+cp /usr/lib64/libjpeg.so.62 lib/
+cp /usr/lib64/libwebp.so.4 lib/
+cp /usr/lib64/libstdc++.so.6 lib/
+cp /usr/lib64/libpng15.so.15 lib/
+cp /usr/lib64/libtiff.so.5 lib/
+cp /usr/lib64/libgomp.so.1 lib/
+cp /usr/lib64/libjbig.so.2.0 lib/
+
+# trim unneeded ~ 15 MB (has to run after the copy; the directory is empty right after mkdir)
+strip tesseract lib/*
+
+# copy training data
+mkdir tessdata
+cd tessdata
+wget https://github.com/tesseract-ocr/tessdata_fast/raw/master/eng.traineddata
+
+# Create configs
+mkdir configs
+echo "tessedit_create_tsv 1" > configs/tsv
+
+# archive
+cd ~
+tar -zcvf tesseract.tar.gz tesseract-standalone
+mv tesseract.tar.gz /build/
+
+echo "Done!"
diff --git a/scripts/compress-brotli.sh b/scripts/compress-brotli.sh
new file mode 100755
index 0000000..ee9f446
--- /dev/null
+++ b/scripts/compress-brotli.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+tar -C /build -zxvf /build/tesseract.tar.gz
+mv /build/tesseract-standalone /build/tesseract
+cd /build
+tar -cf tt.tar tesseract
+rm -rf tesseract
+echo "Running brotli (this can take a few minutes)"
+brotli --best --force --verbose /build/tt.tar
+echo "Done"
diff --git a/src/index.js b/src/index.js
index a6ff507..bf87580 100644
--- a/src/index.js
+++ b/src/index.js
@@ -1,30 +1,96 @@
-const {unpack} = require('@shelf/aws-lambda-brotli-unpacker');
-const {execFileSync, execSync} = require('child_process');
+const {execFile} = require('child_process');
 const path = require('path');
+const fs = require('fs');
+const parseCSV = require('csv-parse');
 const isImage = require('is-image');
+const {unpack} = require('@shelf/aws-lambda-brotli-unpacker');
 
 const unsupportedExtensions = new Set(['ai', 'emf', 'eps', 'gif', 'ico', 'psd', 'svg']);
 const inputPath = path.join(__dirname, '..', 'bin', 'tt.tar.br');
 const outputPath = '/tmp/tesseract/tesseract';
 
+/**
+ * Runs the Tesseract binary and resolves with everything it wrote to stdout.
+ *
+ * @param {string|Object|false} file - Path to an existing image, a readable stream that is piped
+ *   to Tesseract's stdin, or a falsy value to run without an input (e.g. `--version`).
+ * @param {string[]} opts - Arguments passed after the input argument.
+ * @returns {Promise<string>} Captured stdout of the Tesseract process.
+ */
+async function runTesseract(file, opts) {
+  const ttBinary = process.env.TESSERACT_BINARY_PATH || (await unpack({inputPath, outputPath}));
+  let processFile = 'stdin';
+  if (!file) {
+    processFile = false;
+  } else if (typeof file === 'string') {
+    // Fail before spawning: a missing path used to be treated as a stream, which threw on
+    // `file.pipe` and left the child process waiting on stdin.
+    if (!fs.existsSync(file)) throw new Error(`Input file does not exist: ${file}`);
+    processFile = file;
+  } else if (typeof file.pipe !== 'function') {
+    throw new TypeError('Expected a file path or a readable stream');
+  }
+
+  // NOTE(review): with TESSERACT_BINARY_PATH set the child gets an empty environment - confirm.
+  const options = {
+    env: {}
+  };
+  if (!process.env.TESSERACT_BINARY_PATH) {
+    // Append to LD_LIBRARY_PATH only when it is set; interpolating it unconditionally produced
+    // the literal "undefined:/tmp/tesseract/lib".
+    options.env.LD_LIBRARY_PATH = [process.env.LD_LIBRARY_PATH, '/tmp/tesseract/lib']
+      .filter(Boolean)
+      .join(':');
+    options.env.TESSDATA_PREFIX = process.env.TESSDATA_PREFIX || `/tmp/tesseract/tessdata`;
+    options.cwd = '/tmp/tesseract';
+  }
+  return new Promise((resolve, reject) => {
+    const finalOpts = processFile ? [processFile, ...opts] : opts;
+    const child = execFile(ttBinary, finalOpts, options, (error, stdout) => {
+      if (error) return reject(error);
+      return resolve(stdout);
+    });
+    if (processFile === 'stdin') file.pipe(child.stdin);
+  });
+}
+
+function isUnsupportedFileExtension(filePath) {
+  const ext = path
+    .extname(filePath)
+    .slice(1)
+    .toLowerCase();
+
+  return unsupportedExtensions.has(ext);
+}
+
 module.exports.getExecutablePath = async function() {
   return unpack({inputPath, outputPath});
 };
 
-module.exports.getTextFromImage = async function(filePath) {
-  const ttBinary = await unpack({inputPath, outputPath});
-
-  const stdout = execFileSync(ttBinary, [filePath, 'stdout', '-l', 'eng'], {
-    cwd: '/tmp/tesseract',
-    env: {
-      LD_LIBRARY_PATH: './lib',
-      TESSDATA_PREFIX: './tessdata'
-    }
-  });
+/** Resolves with the plain text Tesseract recognizes in `file` (a path or a readable stream). */
+module.exports.getTextFromImage = async function(file) {
+  const result = await runTesseract(file, ['stdout', '-l', 'eng']);
+  return result.toString();
+};
 
-  execSync(`rm ${filePath}`);
+/**
+ * Resolves with one object per row of Tesseract's TSV output for `file`, keyed by the TSV header.
+ */
+module.exports.getWordsAndBounds = async function(file) {
+  const result = await runTesseract(file, ['stdout', '-l', 'eng', 'tsv']);
+  const object = await new Promise((resolve, reject) =>
+    parseCSV(result.toString(), {delimiter: '\t', columns: true}, (err, rows) => {
+      if (err) return reject(err);
+      return resolve(rows);
+    })
+  );
+  return object;
+};
 
-  return stdout.toString();
+/** Resolves with the output of `tesseract --version`. */
+module.exports.version = async function() {
+  const result = await runTesseract(false, ['--version']);
+  return result.toString();
 };
 
 module.exports.isSupportedFile = function(filePath) {
@@ -35,12 +101,3 @@ module.exports.isSupportedFile = function(filePath) {
 
   return !isUnsupportedFileExtension(filePath);
 };
-
-function isUnsupportedFileExtension(filePath) {
-  const ext = path
-    .extname(filePath)
-    .slice(1)
-    .toLowerCase();
-
-  return unsupportedExtensions.has(ext);
-}