mirror of
https://git.code.sf.net/p/seeddms/code
synced 2025-11-27 18:10:42 +00:00
64 lines
2.3 KiB
Markdown
64 lines
2.3 KiB
Markdown
OCR
|
|
====
|
|
|
|
SeedDMS itself has no support for optical character recognition (OCR)
|
|
because it does not care about the content of a file. However, external
|
|
OCR software can be used to convert an image into text and index it
|
|
by the full text search engine. From SeedDMS' point of view, it would
|
|
be sufficient to have a conversion service which converts an image
|
|
into text. This can be implemented in any possible way, but most
|
|
likely as a SeedDMS extension.
|
|
|
|
The following script can be used to convert a pdf with scanned images
|
|
into text. The script converts each page into an image, runs it through
|
|
tesseract, which creates a pdf again containing a text layer. All those
|
|
pdf documents will be united into a single pdf and run through `pdftotext` again.
|
|
It was published in the SeedDMS forum
|
|
https://sourceforge.net/p/seeddms/discussion/general/thread/4ec5973d/
|
|
|
|
```
|
|
#!/bin/bash
#
# Print the text of a PDF on stdout, running OCR first when the PDF has no
# text layer.
#
# Usage: <script> <file.pdf>
#
# If `pdftotext` finds no printable text, every page is rendered to a JPEG
# (`convert`), passed through `tesseract` (language "deu") to get a one-page
# PDF with a text layer, and the pages are merged again (`pdfunite`). The
# input file is then REPLACED by the merged, searchable PDF. The input is
# only replaced when every page was processed successfully; otherwise the
# script exits with status 1 and leaves the input untouched.
#
# Only one instance runs at a time; further instances wait for the lock.

inputpdf=$1
call_args=$*                     # kept for log messages written from functions
lockfile=/tmp/seed               # directory holding one lock file per script name
protokolldatei=./tesser_syslog   # log file, relative to the working directory
cores=2                          # number of parallel tesseract jobs

script_name=${0##*/}
lock_path="$lockfile/$script_name"
temp_folder=                     # set once the temporary directory exists
have_lock=0                      # 1 while this instance owns $lock_path

# Append a timestamped message to the log file.
log() {
  printf '%s %s\n' "$(date)" "$*" >> "$protokolldatei"
}

# Log a fatal error, report it on stderr and terminate with status 1.
die() {
  log "$*"
  printf '%s: %s\n' "$script_name" "$*" >&2
  exit 1
}

# EXIT handler: remove the temporary directory and the lock (only if this
# instance owns it) and preserve the exit status the script ended with.
cleanup() {
  local status=$?
  trap - EXIT INT TERM
  if [[ -n "$temp_folder" ]]; then
    rm -rf -- "$temp_folder"
  fi
  if (( have_lock )); then
    rm -f -- "$lock_path"
    log "Lock file will be deleted: $lock_path Aufrufparameter: $call_args"
  fi
  exit "$status"
}

if [[ -z "$inputpdf" ]]; then
  printf 'Usage: %s file.pdf\n' "$script_name" >&2
  exit 2
fi

# SIGKILL cannot be trapped, so only INT and TERM are routed through the
# EXIT handler; this also guarantees the cleanup runs exactly once.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

mkdir -p -- "$lockfile"

# Acquire the lock atomically: with noclobber the redirection fails if the
# file already exists, so testing and creating the lock is a single step.
# While another instance holds the lock, wait. If the creation keeps failing
# although no lock file exists, the lock directory is unusable: give up.
failed_without_lock=0
until ( set -o noclobber; echo "locked" > "$lock_path" ) 2>/dev/null; do
  if [[ -e "$lock_path" ]]; then
    failed_without_lock=0
    sleep 5
  elif (( ++failed_without_lock >= 3 )); then
    log "Script will exit, because lock file could not be created: $lock_path Aufrufparameter: $call_args"
    exit 1
  fi
done
have_lock=1
log "Lock file created: $lock_path"

# Private, unpredictable working directory below /tmp/seedinput.
mkdir -p /tmp/seedinput
temp_folder=$(mktemp -d "/tmp/seedinput/$(date +"%Y_%m_%d_%H%M%S")_XXXXXX") \
  || die "Could not create temporary directory below /tmp/seedinput"

text_file="$temp_folder/tmp.txt"
merged_pdf="$temp_folder/tmp.pdf"

# A failure here leaves an empty text file, which triggers the OCR path.
pdftotext -raw "$inputpdf" - > "$text_file"
pdf_contents=$(tr -dc '[:print:]' < "$text_file")

if [[ -z "$pdf_contents" ]]; then
  # No text layer: render each page to an image and OCR it.
  convert -density 300 -quality 95 "$inputpdf" +adjoin "$temp_folder/image%03d.jpg" \
    || die "convert failed for $inputpdf"

  find "$temp_folder" -name '*.jpg' \
    | parallel --gnu -j "$cores" tesseract -l deu --psm 6 {} {} pdf \
    || die "tesseract failed for $inputpdf"

  # tesseract writes imageNNN.jpg.pdf next to each image; the zero-padded
  # page number makes the sorted glob order equal to the page order.
  shopt -s nullglob
  images=("$temp_folder"/*.jpg)
  pages=("$temp_folder"/*.pdf)
  shopt -u nullglob

  # Never replace the input with a PDF that is missing pages.
  if (( ${#pages[@]} == 0 || ${#pages[@]} != ${#images[@]} )); then
    die "OCR produced ${#pages[@]} of ${#images[@]} pages for $inputpdf"
  fi

  if (( ${#pages[@]} > 1 )); then
    pdfunite "${pages[@]}" "$merged_pdf" || die "pdfunite failed for $inputpdf"
  else
    mv -- "${pages[0]}" "$merged_pdf" || die "Could not move OCR result for $inputpdf"
  fi

  pdftotext "$merged_pdf" "$text_file" || die "pdftotext failed on OCR result of $inputpdf"
  mv -- "$merged_pdf" "$inputpdf" || die "Could not replace $inputpdf with OCR result"
fi

cat -- "$text_file"
|
|
```
|