mirror of
https://git.code.sf.net/p/seeddms/code
synced 2025-11-27 10:00:41 +00:00
explain and improve script
This commit is contained in:
parent
c188f65112
commit
a4aa705fac
|
|
@ -4,14 +4,19 @@ OCR
|
|||
SeedDMS itself has no support for optical character recognition (OCR)
|
||||
because it does not care about the content of a file. However, external
|
||||
OCR software can be used to convert an image into text and index it
|
||||
by the full text search engine.
|
||||
by the full text search engine. From SeedDMS's point of view, it would
|
||||
be sufficient to have a conversion service which converts an image
|
||||
into text. This can be implemented in any possible way, but most
|
||||
likely as a SeedDMS extension.
|
||||
|
||||
The following script can be use to convert a scanned image into pdf
|
||||
with a text layer added. The script actually takes this file to
|
||||
ran it through pdftotext. It was published in the seeddms forum
|
||||
The following script can be used to convert a pdf with scanned images
|
||||
into text. The script converts each page into an image, runs it through
|
||||
tesseract, which creates a pdf again containing a text layer. All those
|
||||
pdf documents will be united into a single pdf and run through `pdftotext` again.
|
||||
It was published in the SeedDMS forum
|
||||
https://sourceforge.net/p/seeddms/discussion/general/thread/4ec5973d/
|
||||
|
||||
|
||||
```
|
||||
#!/bin/bash
|
||||
inputpdf=$1
|
||||
temp_folder=/tmp/seedinput/$(date +"%Y_%m_%d_%H%M%S")/
|
||||
|
|
@ -27,14 +32,12 @@ do
|
|||
done
|
||||
|
||||
if ( set -o noclobber; echo "locked" > "$lockfile"/"`basename $0`"); then
|
||||
|
||||
trap 'rm -f "$lockfile"/"`basename $0`"; echo $(date) " Lockdatei wird geloescht: " $lockfile"/"`basename $0` Aufrufparameter: $* >> $protokolldatei ;rm -r $temp_folder; exit $?' INT TERM KILL EXIT
|
||||
#das Datum mit dem Scriptnamen in die Protokolldatei schreiben
|
||||
echo $(date) " Lockdatei erstellt: " $lockfile"/"`basename $0` >> $protokolldatei
|
||||
|
||||
trap 'rm -f "$lockfile"/"`basename $0`"; echo $(date) " Lock file will be deleted: " $lockfile"/"`basename $0` Aufrufparameter: $* >> $protokolldatei ;rm -r $temp_folder; exit $?' INT TERM KILL EXIT
|
||||
# write date and script name into log file
|
||||
echo $(date) " Lock file created: " $lockfile"/"`basename $0` >> $protokolldatei
|
||||
else
|
||||
#Script beenden falls Lockdatei nicht erstellt werden konnte
|
||||
echo $(date) " Programm wird beendet, Lockdatei konnte nicht erstellt werden: $lockfile"/"`basename $0` Aufrufparameter: $* " >> $protokolldatei
|
||||
# Exit script if lock file could not be created
|
||||
echo $(date) " Script will exit, because lock file could not be created: $lockfile"/"`basename $0` Aufrufparameter: $* " >> $protokolldatei
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
|
@ -47,13 +50,14 @@ if [ -z "$pdf_contents" ]; then
|
|||
convert -density 300 -quality 95 $inputpdf +adjoin $temp_folder''image%03d.jpg
|
||||
find $temp_folder -name '*.jpg'| parallel --gnu -j $cores tesseract -l deu --psm 6 {} {} pdf
|
||||
|
||||
num=`find $temp_folder -name '*.pdf'| wc -l`
|
||||
if [ "$num" -gt "1" ]; then
|
||||
num=`find $temp_folder -name '*.pdf'| wc -l`
|
||||
if [ "$num" -gt "1" ]; then
|
||||
pdfunite $temp_folder*.pdf $temp_folder''tmp.pdf
|
||||
else
|
||||
else
|
||||
mv $temp_folder*.pdf $temp_folder''tmp.pdf
|
||||
fi
|
||||
fi
|
||||
pdftotext $temp_folder''tmp.pdf $temp_folder''tmp.txt
|
||||
mv $temp_folder''tmp.pdf $1
|
||||
fi
|
||||
cat $temp_folder''tmp.txt
|
||||
```
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user