mirror of
				https://git.code.sf.net/p/seeddms/code
				synced 2025-10-30 12:41:20 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			60 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			60 lines
		
	
	
		
			2.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| OCR
 | |
| ====
 | |
| 
 | |
| SeedDMS itself has no support for optical character recognition (OCR)
 | |
| because it does not care about the content of a file. However, external
 | |
| OCR software can be used to convert an image into text and index it
 | |
| by the full text search engine.
 | |
| 
 | |
| The following script can be used to convert a scanned image into a PDF
 | |
| with a text layer added. The script then takes this file and
 | |
| runs it through pdftotext. It was published in the seeddms forum
 | |
| https://sourceforge.net/p/seeddms/discussion/general/thread/4ec5973d/
 | |
| 
 | |
| 
 | |
#!/bin/bash
#
# Print the text content of a document on stdout, running OCR first when the
# document contains no extractable text.
#
# Usage: <this script> FILE
#
# Steps:
#   1. Serialise concurrent runs with a lock file below /tmp/seed.
#   2. Try to extract text from FILE with pdftotext.
#   3. If that yields no printable characters, rasterise FILE with ImageMagick
#      `convert`, OCR every page with tesseract (German, PDF output), merge the
#      per-page PDFs with pdfunite, REPLACE FILE with the merged, searchable
#      PDF and extract the text from it.
#   4. Write the extracted text to stdout.
#
# External tools: pdftotext, pdfunite (poppler), convert (ImageMagick),
# tesseract, GNU parallel.
#
# Exit status: 0 on success, 1 on lock/temp/OCR failure, 2 on usage error,
# 130/143 when interrupted by INT/TERM.

inputpdf=${1:-}
lockfile=/tmp/seed               # directory that holds the per-script lock file
protokolldatei=./tesser_syslog   # log file, relative to the caller's cwd
cores=2                          # number of parallel tesseract jobs
script_name=${0##*/}
lock_path="$lockfile/$script_name"
call_args="$*"                   # captured here; "$*" differs inside functions
temp_folder=

# Append one timestamped line to the log file.
log() {
  printf '%s %s\n' "$(date)" "$*" >> "$protokolldatei"
}

# Release the lock and remove the scratch directory, preserving the exit
# status the script was terminating with.
cleanup() {
  local status=$?
  trap - EXIT INT TERM
  rm -f -- "$lock_path"
  log " Lockdatei wird geloescht:  $lock_path Aufrufparameter: $call_args"
  if [[ -n "$temp_folder" ]]; then
    rm -rf -- "$temp_folder"
  fi
  exit "$status"
}

if [[ -z "$inputpdf" ]]; then
  printf 'Usage: %s FILE\n' "$script_name" >&2
  exit 2
fi

# Give up if the lock directory cannot be used at all.
if ! mkdir -p -- "$lockfile" || [[ ! -w "$lockfile" ]]; then
  log " Programm wird beendet, Lockdatei konnte nicht erstellt werden:  $lock_path Aufrufparameter: $call_args "
  exit 1
fi

# Acquire the lock atomically: with noclobber the redirection fails while the
# lock file exists, so waiting and taking the lock are a single step and two
# concurrent callers cannot both proceed (or spuriously abort).
until ( set -o noclobber; echo "locked" > "$lock_path" ) 2>/dev/null; do
  sleep 5
done

# Cleanup runs exactly once, from the EXIT trap. INT/TERM are turned into a
# regular exit so they reach it too (SIGKILL cannot be trapped).
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
log " Lockdatei erstellt:  $lock_path"

# Private scratch directory with an unpredictable name.
mkdir -p /tmp/seedinput || exit 1
temp_folder=$(mktemp -d "/tmp/seedinput/$(date +"%Y_%m_%d_%H%M%S")_XXXXXX") || exit 1
text_file="$temp_folder/tmp.txt"

# First attempt: the document may already carry text. A pdftotext failure is
# not fatal here; it simply leaves the text file empty and triggers the OCR
# path below.
pdftotext -raw "$inputpdf" - > "$text_file"
pdf_contents=$(tr -dc '[:print:]' < "$text_file")

if [[ -z "$pdf_contents" ]]; then
  # One JPEG per page, then one searchable PDF per JPEG
  # (tesseract writes <image>.pdf next to each image).
  convert -density 300 -quality 95 "$inputpdf" +adjoin "$temp_folder/image%03d.jpg"
  find "$temp_folder" -name '*.jpg' -print0 \
    | parallel --gnu -0 -j "$cores" tesseract -l deu --psm 6 {} {} pdf

  # Glob expansion is sorted, which keeps the zero-padded pages in order.
  shopt -s nullglob
  ocr_pages=("$temp_folder"/*.pdf)
  shopt -u nullglob

  if (( ${#ocr_pages[@]} == 0 )); then
    # Nothing was produced: leave the input file untouched and report failure.
    log " OCR failed, no PDF pages were produced for: $inputpdf"
    printf '%s: OCR produced no output for %s\n' "$script_name" "$inputpdf" >&2
    exit 1
  elif (( ${#ocr_pages[@]} > 1 )); then
    pdfunite "${ocr_pages[@]}" "$temp_folder/tmp.pdf" || exit 1
  else
    mv -- "${ocr_pages[0]}" "$temp_folder/tmp.pdf" || exit 1
  fi

  pdftotext "$temp_folder/tmp.pdf" "$text_file"
  # Replace the original document with the version that has a text layer.
  mv -- "$temp_folder/tmp.pdf" "$inputpdf"
fi

cat -- "$text_file"
 | 
