seeddms-docker/sources/ocrmypdf.sh
Niels Lippke 571fe17129 Update to version 5.1.23
- support for https
- preventing directories from being ocr'd
2021-09-02 13:08:56 +02:00

37 lines
747 B
Bash
Executable File

#!/bin/bash
#
# Print the text content of a PDF on stdout, running OCR on it first when
# pdftotext finds no usable text in it.
#
# Usage: ocrmypdf.sh <file.pdf>
#
# Behavior:
#   * Directories are skipped silently (exit 0, no output).
#   * Only one instance runs at a time; others wait on a lock file below
#     $tmpdir. NOTE: a lock left behind by a process that died from SIGKILL
#     is not detected and must be removed by hand.
#   * If the PDF yields no text, it is OCR'd with ocrmypdf and the ORIGINAL
#     FILE IS REPLACED by the OCR'd PDF/A version.
#   * The extracted text is printed as a single whitespace-normalized line.
#
# Exit status: 0 on success, 2 on usage error, otherwise the status of the
# command that failed.
set -e

if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s <file.pdf>\n' "${0##*/}" >&2
  exit 2
fi

inputpdf=$1
tmpdir=/tmp/seed
lockfile=$tmpdir/${0##*/}
cores=2   # value passed to ocrmypdf --jobs
tmpfile=  # OCR output path; set only once OCR is actually started

# skip directories
if [[ -d "$inputpdf" ]]; then
  exit 0
fi

#######################################
# Extract the text of a PDF and drop tokens that carry no information.
# The sed expressions replace any single letter/digit/dot surrounded by
# spaces with one space, then delete all digits and dots.
# Arguments: $1 - path to the PDF
# Outputs:   filtered text on stdout
#######################################
extract_text() {
  pdftotext -nopgbrk "$1" - \
    | sed -e 's/ [a-zA-Z0-9.]\{1\} / /g' -e 's/[0-9.]//g'
}

#######################################
# EXIT handler: release the lock, remove a leftover OCR temp file and
# re-raise the status the script was exiting with.
#######################################
cleanup() {
  local status=$?   # must be captured before rm overwrites $?
  rm -f -- "$lockfile" ${tmpfile:+"$tmpfile"}
  exit "$status"
}

mkdir -p -- "$tmpdir"
if [[ ! -w "$tmpdir" ]]; then
  # Without this check the lock loop below would spin forever.
  printf '%s: %s is not writable\n' "${0##*/}" "$tmpdir" >&2
  exit 1
fi

# Acquire the lock. With noclobber the redirection fails if the file already
# exists, so testing for the lock and creating it is one atomic step; keep
# retrying until this process is the one that created it.
until ( set -o noclobber; echo "locked" > "$lockfile" ) 2> /dev/null; do
  sleep 5
done

# Install the handlers only after the lock is ours, so an early exit can
# never delete a lock held by another instance. INT/TERM are turned into a
# regular exit so that the EXIT handler does the cleanup exactly once.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

pdf_contents=$(extract_text "$inputpdf")
if [[ -z "$pdf_contents" ]]; then
  echo "ocrmypdf $inputpdf"
  tmpfile=$tmpdir/$(date +%s%N)
  # ocrmypdf diagnostics are discarded on purpose; a failure still aborts
  # the script through set -e, before the original file is touched.
  ocrmypdf -l deu --rotate-pages --jobs "$cores" --output-type pdfa \
    "$inputpdf" "$tmpfile" 2> /dev/null
  pdf_contents=$(extract_text "$tmpfile")
  mv -- "$tmpfile" "$inputpdf"
fi

# Emit the text as one line with runs of whitespace collapsed to single
# spaces. Pathname expansion is disabled while splitting so that characters
# such as '*' or '?' in the document text are printed literally.
set -f
# shellcheck disable=SC2206  # word splitting is intended here
words=($pdf_contents)
set +f
printf '%s\n' "${words[*]}"