mirror of
https://git.code.sf.net/p/seeddms/code
synced 2024-11-26 15:32:13 +00:00
add some info on how ocr software can be integrated
This commit is contained in:
parent
49167c8149
commit
8521b19c5d
59
doc/README.ocr
Normal file
59
doc/README.ocr
Normal file
|
@ -0,0 +1,59 @@
|
|||
OCR
|
||||
====
|
||||
|
||||
SeedDMS itself has no support for optical character recognition (OCR)
|
||||
because it does not care about the content of file. Though, external
|
||||
OCR software can be used to convert an image into text and index it
|
||||
by the full text search engine.
|
||||
|
||||
The following script can be use to convert a scanned image into pdf
|
||||
with a text layer added. The script actually takes this file to
|
||||
ran it through pdftotext. It was published in the seeddms forum
|
||||
https://sourceforge.net/p/seeddms/discussion/general/thread/4ec5973d/
|
||||
|
||||
|
||||
#!/bin/bash
|
||||
inputpdf=$1
|
||||
temp_folder=/tmp/seedinput/$(date +"%Y_%m_%d_%H%M%S")/
|
||||
lockfile=/tmp/seed
|
||||
protokolldatei=./tesser_syslog
|
||||
cores=2
|
||||
|
||||
mkdir -p $lockfile
|
||||
|
||||
while [ -e "$lockfile"/"`basename $0`" ];
|
||||
do
|
||||
sleep 5
|
||||
done
|
||||
|
||||
if ( set -o noclobber; echo "locked" > "$lockfile"/"`basename $0`"); then
|
||||
|
||||
trap 'rm -f "$lockfile"/"`basename $0`"; echo $(date) " Lockdatei wird geloescht: " $lockfile"/"`basename $0` Aufrufparameter: $* >> $protokolldatei ;rm -r $temp_folder; exit $?' INT TERM KILL EXIT
|
||||
#das Datum mit dem Scriptnamen in die Protokolldatei schreiben
|
||||
echo $(date) " Lockdatei erstellt: " $lockfile"/"`basename $0` >> $protokolldatei
|
||||
|
||||
else
|
||||
#Script beenden falls Lockdatei nicht erstellt werden konnte
|
||||
echo $(date) " Programm wird beendet, Lockdatei konnte nicht erstellt werden: $lockfile"/"`basename $0` Aufrufparameter: $* " >> $protokolldatei
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p $temp_folder
|
||||
|
||||
$(pdftotext -raw $1 - 1> $temp_folder''tmp.txt )
|
||||
pdf_contents=`cat $temp_folder''tmp.txt`
|
||||
pdf_contents=`echo "$pdf_contents" | tr -dc '[:print:]'`
|
||||
if [ -z "$pdf_contents" ]; then
|
||||
convert -density 300 -quality 95 $inputpdf +adjoin $temp_folder''image%03d.jpg
|
||||
find $temp_folder -name '*.jpg'| parallel --gnu -j $cores tesseract -l deu --psm 6 {} {} pdf
|
||||
|
||||
num=`find $temp_folder -name '*.pdf'| wc -l`
|
||||
if [ "$num" -gt "1" ]; then
|
||||
pdfunite $temp_folder*.pdf $temp_folder''tmp.pdf
|
||||
else
|
||||
mv $temp_folder*.pdf $temp_folder''tmp.pdf
|
||||
fi
|
||||
pdftotext $temp_folder''tmp.pdf $temp_folder''tmp.txt
|
||||
mv $temp_folder''tmp.pdf $1
|
||||
fi
|
||||
cat $temp_folder''tmp.txt
|
Loading…
Reference in New Issue
Block a user