mirror of
https://git.code.sf.net/p/seeddms/code
synced 2024-11-26 23:42:11 +00:00
60 lines
2.0 KiB
Plaintext
60 lines
2.0 KiB
Plaintext
OCR
|
|
====
|
|
|
|
SeedDMS itself has no support for optical character recognition (OCR)
|
|
because it does not care about the content of a file. However, external
|
|
OCR software can be used to convert an image into text and index it
|
|
by the full text search engine.
|
|
|
|
The following script can be used to convert a scanned image into a PDF
|
|
with a text layer added. The script actually takes this file to
|
|
run it through pdftotext. It was published in the SeedDMS forum:
|
|
https://sourceforge.net/p/seeddms/discussion/general/thread/4ec5973d/
|
|
|
|
|
|
#!/bin/bash
#
# OCR helper for SeedDMS: configuration and single-instance locking for the
# OCR steps that follow.
#
# Usage: <this script> <file.pdf>
#
# Only one instance may run at a time. The lock is a file named after this
# script inside $lockfile. It is created atomically (noclobber) and removed
# again by the exit handler, together with the temporary working directory.
#
# NOTE(review): a process killed with SIGKILL cannot run the exit handler and
# leaves a stale lock file behind that has to be removed by hand.

if [ "$#" -lt 1 ]; then
  printf 'Usage: %s <file.pdf>\n' "$(basename -- "$0")" >&2
  exit 2
fi

inputpdf=$1
lockfile=/tmp/seed               # directory holding the per-script lock file
protokolldatei=./tesser_syslog   # log file, relative to the working directory
cores=2                          # number of parallel tesseract jobs
temp_folder=                     # assigned once the lock is held; ends in "/"

script_args=$*                   # inside functions "$*" would be the function's args
lock_path=$lockfile/$(basename -- "$0")

# Append one timestamped line to the log file.
write_log() {
  printf '%s %s\n' "$(date)" "$*" >> "$protokolldatei"
}

# Exit handler: release the lock, log it, remove the working directory and
# terminate with the status the script was already exiting with.
cleanup() {
  local status=$?
  # Disarm the traps first so the handler cannot run a second time.
  trap - EXIT INT TERM
  rm -f -- "$lock_path"
  write_log " Lockdatei wird geloescht:  $lock_path Aufrufparameter: $script_args"
  if [ -n "$temp_folder" ] && [ -d "$temp_folder" ]; then
    rm -r -- "$temp_folder"
  fi
  exit "$status"
}

# Without a writable lock directory the lock can never be taken; give up
# right away instead of waiting forever.
if ! mkdir -p -- "$lockfile" || [ ! -w "$lockfile" ]; then
  #Script beenden falls Lockdatei nicht erstellt werden konnte
  write_log " Programm wird beendet, Lockdatei konnte nicht erstellt werden: $lock_path Aufrufparameter: $script_args "
  exit 1
fi

# Retry the atomic create itself. Testing for the file first and creating it
# afterwards lets two waiting instances race for the lock.
until ( set -o noclobber; echo "locked" > "$lock_path" ) 2> /dev/null; do
  sleep 5
done

# SIGKILL cannot be trapped, so only EXIT, INT and TERM are handled. The
# signal traps merely exit, which in turn fires the EXIT handler once.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

#das Datum mit dem Scriptnamen in die Protokolldatei schreiben
write_log " Lockdatei erstellt:  $lock_path"

# Private, unpredictable working directory below /tmp/seedinput.
mkdir -p -- /tmp/seedinput
work_dir=$(mktemp -d "/tmp/seedinput/$(date +"%Y_%m_%d_%H%M%S")_XXXXXX") || {
  printf '%s: cannot create temporary directory\n' "$(basename -- "$0")" >&2
  exit 1
}
temp_folder=$work_dir/
|
|
|
|
# OCR step: print the text of the PDF in $inputpdf on stdout.
#
# If the PDF already contains printable text, that text is printed as is.
# Otherwise every page is rendered to a JPEG, run through tesseract (German
# language data) to get one searchable PDF per page, the pages are merged, and
# the merged PDF replaces the input file before its text is printed.
#
# Reads:  inputpdf, temp_folder (must end in "/"), cores
# Exits:  1 if any OCR step fails; the input file is left untouched then.

: "${temp_folder:?temp_folder must be set}"

# Report a failure on stderr and stop the script.
ocr_fail() {
  printf '%s: %s\n' "$(basename -- "$0")" "$*" >&2
  exit 1
}

mkdir -p -- "$temp_folder" || ocr_fail "cannot create $temp_folder"

text_file=${temp_folder}tmp.txt
merged_pdf=${temp_folder}tmp.pdf

# Best effort: a failing pdftotext just leaves the text file empty, which
# sends the document down the OCR path below.
pdftotext -raw "$inputpdf" - > "$text_file"
pdf_contents=$(tr -dc '[:print:]' < "$text_file")

if [ -z "$pdf_contents" ]; then
  convert -density 300 -quality 95 "$inputpdf" +adjoin "${temp_folder}image%03d.jpg" \
    || ocr_fail "rendering $inputpdf to images failed"

  # File names are generated by this script, so newline-delimited is safe.
  find "$temp_folder" -name '*.jpg' \
    | parallel --gnu -j "$cores" tesseract -l deu --psm 6 {} {} pdf \
    || ocr_fail "tesseract failed on at least one page of $inputpdf"

  # Collect the per-page PDFs in glob (page) order; nullglob makes an
  # unmatched pattern yield an empty list instead of the literal pattern.
  shopt -s nullglob
  page_pdfs=("$temp_folder"*.pdf)
  shopt -u nullglob

  case ${#page_pdfs[@]} in
    0)
      ocr_fail "OCR produced no pages for $inputpdf"
      ;;
    1)
      mv -- "${page_pdfs[0]}" "$merged_pdf" \
        || ocr_fail "cannot move OCR result to $merged_pdf"
      ;;
    *)
      pdfunite "${page_pdfs[@]}" "$merged_pdf" \
        || ocr_fail "merging OCR pages failed"
      ;;
  esac

  # Never overwrite the original with a missing or empty result.
  [ -s "$merged_pdf" ] || ocr_fail "merged PDF is missing or empty"
  pdftotext "$merged_pdf" "$text_file" \
    || ocr_fail "extracting text from OCR result failed"
  mv -- "$merged_pdf" "$inputpdf" \
    || ocr_fail "cannot replace $inputpdf with OCR result"
fi

cat -- "$text_file"
|