diff --git a/doc/README.ocr b/doc/README.ocr index aaf6a9196..fcc8054ae 100644 --- a/doc/README.ocr +++ b/doc/README.ocr @@ -4,14 +4,19 @@ OCR SeedDMS itself has no support for optical character recognition (OCR) because it does not care about the content of file. Though, external OCR software can be used to convert an image into text and index it -by the full text search engine. +by the full text search engine. From SeedDMS' point of view, it would +be sufficient to have a conversion service which converts an image +into text. This can be implemented in any possible way, but most +likely as a SeedDMS extension. -The following script can be use to convert a scanned image into pdf -with a text layer added. The script actually takes this file to -ran it through pdftotext. It was published in the seeddms forum +The following script can be used to convert a pdf with scanned images +into text. The script converts each page into an image, runs it through +tesseract, which creates a pdf again containing a text layer. All those +pdf documents will be united into a single pdf and run through `pdftotext` again. +It was published in the SeedDMS forum https://sourceforge.net/p/seeddms/discussion/general/thread/4ec5973d/ - +``` #!/bin/bash inputpdf=$1 temp_folder=/tmp/seedinput/$(date +"%Y_%m_%d_%H%M%S")/ @@ -27,15 +32,13 @@ do done if ( set -o noclobber; echo "locked" > "$lockfile"/"`basename $0`"); then - -trap 'rm -f "$lockfile"/"`basename $0`"; echo $(date) " Lockdatei wird geloescht: " $lockfile"/"`basename $0` Aufrufparameter: $* >> $protokolldatei ;rm -r $temp_folder; exit $?' INT TERM KILL EXIT - #das Datum mit dem Scriptnamen in die Protokolldatei schreiben - echo $(date) " Lockdatei erstellt: " $lockfile"/"`basename $0` >> $protokolldatei - + trap 'rm -f "$lockfile"/"`basename $0`"; echo $(date) " Lock file will be deleted: " $lockfile"/"`basename $0` Aufrufparameter: $* >> $protokolldatei ;rm -r $temp_folder; exit $?' 
INT TERM KILL EXIT + # write date and script name into log file + echo $(date) " Lock file created: " $lockfile"/"`basename $0` >> $protokolldatei else - #Script beenden falls Lockdatei nicht erstellt werden konnte - echo $(date) " Programm wird beendet, Lockdatei konnte nicht erstellt werden: $lockfile"/"`basename $0` Aufrufparameter: $* " >> $protokolldatei - exit 1 + # Exit script if lock file could not be created + echo $(date) " Script will exit, because lock file could not be created: $lockfile"/"`basename $0` Aufrufparameter: $* " >> $protokolldatei + exit 1 fi mkdir -p $temp_folder @@ -44,16 +47,17 @@ $(pdftotext -raw $1 - 1> $temp_folder''tmp.txt ) pdf_contents=`cat $temp_folder''tmp.txt` pdf_contents=`echo "$pdf_contents" | tr -dc '[:print:]'` if [ -z "$pdf_contents" ]; then - convert -density 300 -quality 95 $inputpdf +adjoin $temp_folder''image%03d.jpg - find $temp_folder -name '*.jpg'| parallel --gnu -j $cores tesseract -l deu --psm 6 {} {} pdf + convert -density 300 -quality 95 $inputpdf +adjoin $temp_folder''image%03d.jpg + find $temp_folder -name '*.jpg'| parallel --gnu -j $cores tesseract -l deu --psm 6 {} {} pdf -num=`find $temp_folder -name '*.pdf'| wc -l` -if [ "$num" -gt "1" ]; then + num=`find $temp_folder -name '*.pdf'| wc -l` + if [ "$num" -gt "1" ]; then pdfunite $temp_folder*.pdf $temp_folder''tmp.pdf -else + else mv $temp_folder*.pdf $temp_folder''tmp.pdf -fi - pdftotext $temp_folder''tmp.pdf $temp_folder''tmp.txt - mv $temp_folder''tmp.pdf $1 + fi + pdftotext $temp_folder''tmp.pdf $temp_folder''tmp.txt + mv $temp_folder''tmp.pdf $1 fi cat $temp_folder''tmp.txt +```