From 8521b19c5d5cba30af388593502a4d15da0c0afd Mon Sep 17 00:00:00 2001
From: Uwe Steinmann
Date: Fri, 9 Sep 2022 10:14:52 +0200
Subject: [PATCH] add some info on how ocr software can be integrated

---
 doc/README.ocr | 134 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 doc/README.ocr

diff --git a/doc/README.ocr b/doc/README.ocr
new file mode 100644
index 000000000..aaf6a9196
--- /dev/null
+++ b/doc/README.ocr
@@ -0,0 +1,134 @@
+OCR
+====
+
+SeedDMS itself has no support for optical character recognition (OCR)
+because it does not care about the content of a file. However, external
+OCR software can be used to convert an image into text, which is then
+indexed by the full text search engine.
+
+The following script can be used to convert a scanned image into a pdf
+with a text layer added. The script then runs this file through
+pdftotext. It is based on a script published in the seeddms forum
+https://sourceforge.net/p/seeddms/discussion/general/thread/4ec5973d/
+
+
+#!/bin/bash
+#
+# Usage: <script> FILE
+#
+# Prints the text of FILE on stdout. If pdftotext finds no printable text
+# in FILE, each page is rendered to a jpg and run through tesseract, and
+# FILE is replaced by the resulting pdf with a text layer.
+#
+# Requires pdftotext, pdfunite, convert (ImageMagick), tesseract and
+# GNU parallel.
+#
+# Environment:
+#   OCR_CORES  number of parallel tesseract jobs (default: 2)
+#   OCR_LANG   tesseract language (default: deu)
+
+set -u
+
+if (( $# < 1 )); then
+  printf 'Usage: %s FILE\n' "${0##*/}" >&2
+  exit 2
+fi
+
+readonly inputpdf="$1"
+readonly call_args="$*"
+readonly lockfile="/tmp/seed/${0##*/}"
+readonly logfile=./tesser_syslog
+readonly cores="${OCR_CORES:-2}"
+readonly ocr_lang="${OCR_LANG:-deu}"
+temp_folder=
+
+# Append a timestamped message to the log file.
+log() {
+  printf '%s %s\n' "$(date)" "$*" >> "$logfile"
+}
+
+# Log a fatal error, repeat it on stderr and stop.
+die() {
+  log " $*"
+  printf '%s\n' "$*" >&2
+  exit 1
+}
+
+# Create the lock file atomically (noclobber makes '>' fail if the file
+# already exists) and wait while another instance holds it. Returns 1 if
+# the lock cannot be created although nobody holds it.
+acquire_lock() {
+  local failures=0
+  until ( set -o noclobber; echo "locked" > "$lockfile" ) 2> /dev/null; do
+    if [[ -e "$lockfile" ]]; then
+      failures=0
+      sleep 5
+    elif (( ++failures >= 3 )); then
+      return 1
+    fi
+  done
+}
+
+# Release the lock and remove the temporary files, keeping the exit
+# status the script was about to return.
+cleanup() {
+  local status=$?
+  rm -f -- "$lockfile"
+  log " Lockdatei wird geloescht:  ${lockfile} Aufrufparameter: ${call_args}"
+  if [[ -n "$temp_folder" ]]; then
+    rm -rf -- "$temp_folder"
+  fi
+  exit "$status"
+}
+
+mkdir -p -- "${lockfile%/*}"
+
+if ! acquire_lock; then
+  # Give up if the lock file could not be created.
+  log " Programm wird beendet, Lockdatei konnte nicht erstellt werden: ${lockfile} Aufrufparameter: ${call_args} "
+  exit 1
+fi
+
+# SIGKILL cannot be trapped; INT and TERM are routed through EXIT so that
+# cleanup runs exactly once.
+trap cleanup EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+# Write the date and the lock file name to the log file.
+log " Lockdatei erstellt:  ${lockfile}"
+
+mkdir -p /tmp/seedinput
+temp_folder=$(mktemp -d "/tmp/seedinput/$(date +%Y_%m_%d_%H%M%S).XXXXXX") \
+  || die "cannot create a temporary folder below /tmp/seedinput"
+readonly text_file="${temp_folder}/tmp.txt"
+readonly ocr_pdf="${temp_folder}/tmp.pdf"
+
+# pdftotext may fail (e.g. if FILE is an image); the redirect still
+# creates an empty text file, which sends FILE down the OCR path.
+pdftotext -raw "$inputpdf" - > "$text_file"
+
+if [[ -z "$(tr -dc '[:print:]' < "$text_file")" ]]; then
+  convert -density 300 -quality 95 "$inputpdf" +adjoin \
+    "${temp_folder}/image%03d.jpg" \
+    || die "convert failed for ${inputpdf}"
+  find "$temp_folder" -name '*.jpg' -print0 \
+    | parallel --gnu -0 -j "$cores" \
+      tesseract -l "$ocr_lang" --psm 6 {} {} pdf
+
+  # tesseract writes one <image>.jpg.pdf per page.
+  shopt -s nullglob
+  pages=("$temp_folder"/*.pdf)
+  shopt -u nullglob
+  case "${#pages[@]}" in
+    0) die "OCR produced no pages for ${inputpdf}" ;;
+    1) mv -- "${pages[0]}" "$ocr_pdf" ;;
+    *) pdfunite "${pages[@]}" "$ocr_pdf" ;;
+  esac
+
+  # Only replace the input once a non-empty OCR result exists.
+  [[ -s "$ocr_pdf" ]] || die "OCR result is missing for ${inputpdf}"
+  pdftotext "$ocr_pdf" "$text_file"
+  mv -- "$ocr_pdf" "$inputpdf"
+fi
+
+cat -- "$text_file"