mirror of
https://git.code.sf.net/p/seeddms/code
synced 2025-11-27 10:00:41 +00:00
explain and improve script
This commit is contained in:
parent
c188f65112
commit
a4aa705fac
|
|
@ -4,14 +4,19 @@ OCR
|
||||||
SeedDMS itself has no support for optical character recognition (OCR)
|
SeedDMS itself has no support for optical character recognition (OCR)
|
||||||
because it does not care about the content of a file. However, external
|
because it does not care about the content of a file. However, external
|
||||||
OCR software can be used to convert an image into text and index it
|
OCR software can be used to convert an image into text and index it
|
||||||
by the full text search engine.
|
by the full text search engine. From SeedDMS's point of view, it would
|
||||||
|
be sufficient to have a conversion service which converts an image
|
||||||
|
into text. This can be implemented in any possible way, but most
|
||||||
|
likely as a SeedDMS extension.
|
||||||
|
|
||||||
The following script can be used to convert a scanned image into a pdf
|
The following script can be used to convert a pdf with scanned images
|
||||||
with a text layer added. The script actually takes this file to
|
into text. The script converts each page into an image, runs it through
|
||||||
run it through pdftotext. It was published in the seeddms forum
|
tesseract, which creates a pdf again containing a text layer. All those
|
||||||
|
pdf documents will be united into a single pdf, which is then run through `pdftotext` again.
|
||||||
|
It was published in the SeedDMS forum
|
||||||
https://sourceforge.net/p/seeddms/discussion/general/thread/4ec5973d/
|
https://sourceforge.net/p/seeddms/discussion/general/thread/4ec5973d/
|
||||||
|
|
||||||
|
```
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
inputpdf=$1
|
inputpdf=$1
|
||||||
temp_folder=/tmp/seedinput/$(date +"%Y_%m_%d_%H%M%S")/
|
temp_folder=/tmp/seedinput/$(date +"%Y_%m_%d_%H%M%S")/
|
||||||
|
|
@ -27,15 +32,13 @@ do
|
||||||
done
|
done
|
||||||
|
|
||||||
if ( set -o noclobber; echo "locked" > "$lockfile"/"`basename $0`"); then
|
if ( set -o noclobber; echo "locked" > "$lockfile"/"`basename $0`"); then
|
||||||
|
trap 'rm -f "$lockfile"/"`basename $0`"; echo $(date) " Lock file will be deleted: " $lockfile"/"`basename $0` Aufrufparameter: $* >> $protokolldatei ;rm -r $temp_folder; exit $?' INT TERM KILL EXIT
|
||||||
trap 'rm -f "$lockfile"/"`basename $0`"; echo $(date) " Lockdatei wird geloescht: " $lockfile"/"`basename $0` Aufrufparameter: $* >> $protokolldatei ;rm -r $temp_folder; exit $?' INT TERM KILL EXIT
|
# write date and script name into log file
|
||||||
#das Datum mit dem Scriptnamen in die Protokolldatei schreiben
|
echo $(date) " Lock file created: " $lockfile"/"`basename $0` >> $protokolldatei
|
||||||
echo $(date) " Lockdatei erstellt: " $lockfile"/"`basename $0` >> $protokolldatei
|
|
||||||
|
|
||||||
else
|
else
|
||||||
#Script beenden falls Lockdatei nicht erstellt werden konnte
|
# Exit script if lock file could not be created
|
||||||
echo $(date) " Programm wird beendet, Lockdatei konnte nicht erstellt werden: $lockfile"/"`basename $0` Aufrufparameter: $* " >> $protokolldatei
|
echo $(date) " Script will exit, because lock file could not be created: $lockfile"/"`basename $0` Aufrufparameter: $* " >> $protokolldatei
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
mkdir -p $temp_folder
|
mkdir -p $temp_folder
|
||||||
|
|
@ -44,16 +47,17 @@ $(pdftotext -raw $1 - 1> $temp_folder''tmp.txt )
|
||||||
pdf_contents=`cat $temp_folder''tmp.txt`
|
pdf_contents=`cat $temp_folder''tmp.txt`
|
||||||
pdf_contents=`echo "$pdf_contents" | tr -dc '[:print:]'`
|
pdf_contents=`echo "$pdf_contents" | tr -dc '[:print:]'`
|
||||||
if [ -z "$pdf_contents" ]; then
|
if [ -z "$pdf_contents" ]; then
|
||||||
convert -density 300 -quality 95 $inputpdf +adjoin $temp_folder''image%03d.jpg
|
convert -density 300 -quality 95 $inputpdf +adjoin $temp_folder''image%03d.jpg
|
||||||
find $temp_folder -name '*.jpg'| parallel --gnu -j $cores tesseract -l deu --psm 6 {} {} pdf
|
find $temp_folder -name '*.jpg'| parallel --gnu -j $cores tesseract -l deu --psm 6 {} {} pdf
|
||||||
|
|
||||||
num=`find $temp_folder -name '*.pdf'| wc -l`
|
num=`find $temp_folder -name '*.pdf'| wc -l`
|
||||||
if [ "$num" -gt "1" ]; then
|
if [ "$num" -gt "1" ]; then
|
||||||
pdfunite $temp_folder*.pdf $temp_folder''tmp.pdf
|
pdfunite $temp_folder*.pdf $temp_folder''tmp.pdf
|
||||||
else
|
else
|
||||||
mv $temp_folder*.pdf $temp_folder''tmp.pdf
|
mv $temp_folder*.pdf $temp_folder''tmp.pdf
|
||||||
fi
|
fi
|
||||||
pdftotext $temp_folder''tmp.pdf $temp_folder''tmp.txt
|
pdftotext $temp_folder''tmp.pdf $temp_folder''tmp.txt
|
||||||
mv $temp_folder''tmp.pdf $1
|
mv $temp_folder''tmp.pdf $1
|
||||||
fi
|
fi
|
||||||
cat $temp_folder''tmp.txt
|
cat $temp_folder''tmp.txt
|
||||||
|
```
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user