Merge branch 'seeddms-5.1.x' into seeddms-6.0.x

This commit is contained in:
Uwe Steinmann 2023-01-05 08:39:45 +01:00
commit 12ffb0174e
18 changed files with 243 additions and 12 deletions

View File

@ -51,7 +51,7 @@ class SeedDMS_Lucene_Indexer {
* Do some initialization
*
*/
static function init($stopWordsFile='') { /* {{{ */
public function init($stopWordsFile='') { /* {{{ */
$analyzer = new Zend_Search_Lucene_Analysis_Analyzer_Common_Utf8_CaseInsensitive();
if($stopWordsFile && file_exists($stopWordsFile)) {
$stopWordsFilter = new Zend_Search_Lucene_Analysis_TokenFilter_StopWords();

View File

@ -31,7 +31,8 @@ class SeedDMS_Preview_PdfPreviewer extends SeedDMS_Preview_Base {
} /* }}} */
/**
* Return the physical filename of the preview image on disk
* Return the physical filename of the preview image on disc
* including the path
*
* @param object $object document content or document file
* @return string file name of preview image
@ -47,7 +48,7 @@ class SeedDMS_Preview_PdfPreviewer extends SeedDMS_Preview_Base {
case $dms->getClassname('documentcontent'):
$target = $dir.'p'.$object->getVersion();
break;
case "SeedDMS_Core_DocumentFile":
case $dms->getClassname('documentfile'):
$target = $dir.'f'.$object->getID();
break;
default:

View File

@ -29,6 +29,16 @@ class SeedDMS_Preview_Previewer extends SeedDMS_Preview_Base {
*/
protected $width;
/**
* Create instance of image previewer
*
* @param string $previewDir path of base directory where all images are
* stored. This directory will have a subdirectory derived from the object id.
* @param integer $width default width of an image
* @param integer $timeout timeout for shell commands to create a preview image
* @param boolean $xsendfile if set to true the apache module xsendfile will
* be used.
*/
function __construct($previewDir, $width=40, $timeout=5, $xsendfile=true) { /* {{{ */
parent::__construct($previewDir, $timeout, $xsendfile);
$this->converters = array(
@ -46,7 +56,8 @@ class SeedDMS_Preview_Previewer extends SeedDMS_Preview_Base {
} /* }}} */
/**
* Return the physical filename of the preview image on disk
* Return the physical filename of the preview image on disc
* including the path
*
* @param object $object document content or document file
* @param integer $width width of preview image
@ -103,6 +114,7 @@ class SeedDMS_Preview_Previewer extends SeedDMS_Preview_Base {
* @param string $mimetype MimeType of input file
* @param integer $width width of generated preview image
* @param string $target optional name of preview image (without extension)
* @param boolean $new will be set to true if the preview image was created
* @return boolean true on success, false on failure
*/
public function createRawPreview($infile, $dir, $mimetype, $width=0, $target='', &$new=false) { /* {{{ */
@ -171,6 +183,7 @@ class SeedDMS_Preview_Previewer extends SeedDMS_Preview_Base {
* @param object $object instance of SeedDMS_Core_DocumentContent
* or SeedDMS_Core_DocumentFile
* @param integer $width desired width of preview image
* @param boolean $new will be set to true if the preview image was created
* @return boolean true on success, false on failure
*/
public function createPreview($object, $width=0, &$new=false) { /* {{{ */

View File

@ -25,18 +25,48 @@
class SeedDMS_SQLiteFTS_Indexer {
/**
* @var string $ftstype
* @var string $_ftstype
* @access protected
*/
protected $_ftstype;
/**
* @var object $index sqlite index
* @var object $_conn sqlite index
* @access protected
*/
protected $_conn;
/**
* @var array $_stop_words array of stop words
* @access protected
*/
protected $_stop_words;
const ftstype = 'fts5';
/**
 * Remove stop words from a string
 *
 * The string is split into words. If it consists of at least two words,
 * all words with two or fewer characters and all words found in the list
 * of stop words are dropped. The remaining words are joined by a single
 * space. If nothing would remain (e.g. for "the the"), or if the string
 * cannot be split at all, the original string is returned unchanged.
 *
 * @param string $str text to be stripped
 * @return string text without stop words
 */
protected function strip_stopwords($str = "") { /* {{{ */
	// 1.) break string into words
	// [^-\w\'] matches every character which is not a word character,
	// a hyphen or an apostrophe. The u flag is needed for utf-8 input.
	$words = preg_split('/[^-\w\']+/u', $str, -1, PREG_SPLIT_NO_EMPTY);
	// preg_split() returns false on failure (e.g. invalid utf-8 in
	// combination with the u flag). Leave such input untouched.
	if($words === false)
		return $str;
	// 2.) if we have at least 2 words, remove stopwords
	if(count($words) > 1) {
		$stopwords = $this->_stop_words;
		$words = array_filter($words, function ($w) use ($stopwords) {
			// The encoding name must be exactly 'UTF-8'. An unknown name
			// makes mb_strtolower() fail instead of lower-casing the word.
			return (mb_strlen($w, 'UTF-8') > 2) && !isset($stopwords[mb_strtolower($w, 'UTF-8')]);
		});
	}
	// check if not too much was removed such as "the the" would return empty
	if(!empty($words))
		return implode(" ", $words);
	return $str;
} /* }}} */
/**
* Constructor
*
@ -48,6 +78,7 @@ class SeedDMS_SQLiteFTS_Indexer {
$this->_rawid = 'rowid';
else
$this->_rawid = 'docid';
$this->_stop_words = [];
} /* }}} */
/**
@ -109,7 +140,9 @@ class SeedDMS_SQLiteFTS_Indexer {
* Do some initialization
*
*/
static function init($stopWordsFile='') { /* {{{ */
public function init($stopWordsFile='') { /* {{{ */
	// Without a usable stop word file the current list is left untouched.
	// Checking first avoids the warning file_get_contents() raises for a
	// missing or unreadable file.
	if(!$stopWordsFile || !is_file($stopWordsFile) || !is_readable($stopWordsFile))
		return;
	$data = file_get_contents($stopWordsFile);
	if($data === false)
		return;
	// Stop words are separated by white space or commas. They are stored
	// in lower case as array keys, because words are lower-cased before
	// they are looked up. PREG_SPLIT_NO_EMPTY prevents an empty key
	// caused by leading or trailing separators.
	$words = preg_split("/[\s,]+/", mb_strtolower($data, 'UTF-8'), -1, PREG_SPLIT_NO_EMPTY);
	$this->_stop_words = $words ? array_flip($words) : [];
} /* }}} */
/**
@ -135,6 +168,9 @@ class SeedDMS_SQLiteFTS_Indexer {
if($res === false) {
return false;
}
if($this->_stop_words)
$content = $this->strip_stopwords($content);
$sql = "INSERT INTO docs (documentid, record_type, title, comment, keywords, category, owner, content, mimetype, origfilename, created, indexed, users, status, path) VALUES (".$this->_conn->quote($doc->getFieldValue('document_id')).", ".$this->_conn->quote($doc->getFieldValue('record_type')).", ".$this->_conn->quote($doc->getFieldValue('title')).", ".$this->_conn->quote($comment).", ".$this->_conn->quote($keywords).", ".$this->_conn->quote($category).", ".$this->_conn->quote($doc->getFieldValue('owner')).", ".$this->_conn->quote($content).", ".$this->_conn->quote($mimetype).", ".$this->_conn->quote($origfilename).", ".(int)$created.", ".(int)$indexed.", ".$this->_conn->quote($doc->getFieldValue('users')).", ".$this->_conn->quote($status).", ".$this->_conn->quote($doc->getFieldValue('path'))/*time()*/.")";
$res = $this->_conn->exec($sql);
if($res === false) {

View File

@ -26,6 +26,7 @@
- add optional parameter $order to SeedDMS_SQLiteFTS_Indexer::find()
- add optional parameters $query and $col to SeedDMS_SQLiteFTS_Indexer::terms()
- IndexedDocument() accepts a callable for conversion to text
- remove stop words from content
</notes>
<contents>
<dir baseinstalldir="SeedDMS" name="/">

View File

@ -90,6 +90,9 @@ image/jpeg
image/png
convert -resize %wx '%f' 'png:%o'
text/plain
convert -density 100 -resize %wx 'text:%f[0]' 'png:%o'
application/pdf
gs -dBATCH -dNOPAUSE -sDEVICE=png16m -dPDFFitPage -r72x72 -sOutputFile=- -dFirstPage=1 -dLastPage=1 -q '%f' | convert -resize %wx png:- '%o'

View File

@ -16,6 +16,7 @@ require_once("inc/inc.ClassConversionServiceImageToImage.php");
require_once("inc/inc.ClassConversionServiceImageToText.php");
require_once("inc/inc.ClassConversionServicePdfToImage.php");
require_once("inc/inc.ClassConversionServiceTextToText.php");
require_once("inc/inc.ClassConversionServiceTextToImage.php");
/**
* Implementation of conversion manager

View File

@ -36,9 +36,15 @@ abstract class SeedDMS_ConversionServiceBase {
*/
protected $logger;
/**
* @var boolean $success set to false if conversion failed
*/
protected $success;
/**
 * Set up the default state of a conversion service
 *
 * Source and target are not set yet and the service is considered
 * successful until a conversion reports otherwise.
 */
public function __construct() {
	$this->success = true;
	$this->from = $this->to = null;
}
public function setLogger($logger) {
@ -53,6 +59,10 @@ abstract class SeedDMS_ConversionServiceBase {
return [];
} /* }}} */
/**
 * Return the success flag of this service
 *
 * The flag is initialised to true by the constructor and is meant to be
 * set to false by a service when a conversion fails.
 *
 * @return boolean current value of the success flag
 */
public function wasSuccessful() { /* {{{ */
return $this->success;
} /* }}} */
/**
* This method does the conversion
*

View File

@ -92,6 +92,7 @@ class SeedDMS_ConversionServiceExec extends SeedDMS_ConversionServiceBase {
} /* }}} */
public function __construct($from, $to, $cmd, $timeout=5) {
parent::__construct();
$this->from = $from;
$this->to = $to;
$this->cmd = $cmd;

View File

@ -29,6 +29,7 @@ class SeedDMS_ConversionServiceImageToImage extends SeedDMS_ConversionServiceBas
public $timeout;
public function __construct($from, $to) { /* {{{ */
parent::__construct();
$this->from = $from;
$this->to = $to;
$this->timeout = 5;

View File

@ -29,6 +29,7 @@ class SeedDMS_ConversionServiceImageToText extends SeedDMS_ConversionServiceBase
public $timeout;
public function __construct($from, $to) { /* {{{ */
	// The parent constructor resets the common state, hence it must run
	// before source and target are stored.
	parent::__construct();
	$this->to = $to;
	$this->from = $from;
} /* }}} */

View File

@ -29,6 +29,7 @@ class SeedDMS_ConversionServicePdfToImage extends SeedDMS_ConversionServiceBase
public $timeout;
public function __construct($from, $to) {
parent::__construct();
$this->from = $from;
$this->to = $to;
$this->timeout = 5;
@ -75,6 +76,7 @@ class SeedDMS_ConversionServicePdfToImage extends SeedDMS_ConversionServiceBase
}
}
} catch (ImagickException $e) {
$this->success = false;
return false;
}
return false;

View File

@ -0,0 +1,144 @@
<?php
/**
* Implementation of conversion service class
*
* @category DMS
* @package SeedDMS
* @license GPL 2
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2023 Uwe Steinmann
* @version Release: @package_version@
*/
require_once("inc/inc.ClassConversionServiceBase.php");
/**
* Implementation of conversion service from text to image
*
* @category DMS
* @package SeedDMS
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2023 Uwe Steinmann
* @version Release: @package_version@
*/
class SeedDMS_ConversionServiceTextToImage extends SeedDMS_ConversionServiceBase {
	/**
	 * Create the conversion service
	 *
	 * @param string $from source format handled by this service
	 * @param string $to target format created by this service
	 */
	public function __construct($from, $to) {
		parent::__construct();
		$this->from = $from;
		$this->to = $to;
	}

	/**
	 * Return a short description of how the conversion is done
	 *
	 * @return string description
	 */
	public function getInfo() {
		return "Convert with imagick php functions";
	}

	/**
	 * Return the list of parameters evaluated by convert()
	 *
	 * @return array list of parameter descriptions
	 */
	public function getAdditionalParams() { /* {{{ */
		return [
			['name'=>'width', 'type'=>'number', 'description'=>'Width of converted image'],
			['name'=>'page', 'type'=>'number', 'description'=>'Page of text document'],
		];
	} /* }}} */

	/**
	 * Break a text into lines not exceeding a given width
	 *
	 * Words are added to a line one by one until the rendered width
	 * exceeds $maxWidth or no more words are left.
	 *
	 * @param Imagick $image image used for querying the font metrics
	 * @param ImagickDraw $draw drawing settings (font, font size)
	 * @param string $text text to be wrapped
	 * @param integer $maxWidth maximum width of a line
	 * @return array list of lines and the maximum line height
	 */
	private function wordWrapAnnotation($image, $draw, $text, $maxWidth) { /* {{{ */
		$words = preg_split('%\s%', trim($text), -1, PREG_SPLIT_NO_EMPTY);
		$lines = array();
		$i = 0;
		$lineHeight = 0;

		while (count($words) > 0) {
			$metrics = $image->queryFontMetrics($draw, implode(' ', array_slice($words, 0, ++$i)));
			$lineHeight = max($metrics['textHeight'], $lineHeight);

			// check if we have found the word that exceeds the line width
			if ($metrics['textWidth'] > $maxWidth or count($words) < $i) {
				// A single word longer than the allowed line width is put
				// on a line of its own. Incrementing $i here compensates
				// for the decrement below, so one word is consumed.
				if ($i == 1)
					$i++;

				$lines[] = implode(' ', array_slice($words, 0, --$i));
				$words = array_slice($words, $i);
				$i = 0;
			}
		}

		return array($lines, $lineHeight);
	} /* }}} */

	/**
	 * Render one page of a text file into a png image
	 *
	 * The text is written onto a white page of 596x842 pixels. Only the
	 * page passed in $params['page'] (starting at 1, defaults to the first
	 * page) is rendered. If $params['width'] is set, the image is scaled
	 * down to that width, but never scaled up.
	 *
	 * @param string $infile name of text file
	 * @param string $target name of image file, if not set the image data
	 *        is returned
	 * @param array $params optional parameters 'page' and 'width'
	 * @return boolean|string result of writing the file if $target is set,
	 *         otherwise the image data, false in case of an error
	 */
	public function convert($infile, $target = null, $params = array()) { /* {{{ */
		$boxWidth = 596;
		$boxHeight = 842;
		$boxTop = 30;
		$boxBottom = 30;
		$boxLeft = 30;
		$boxRight = 30;
		$parSep = 10;
		$fontSize = 10;

		$start = microtime(true);

		$page = 0;
		if(!empty($params['page']) && intval($params['page']) > 0)
			$page = intval($params['page'])-1;

		// An unreadable input file is a failed conversion and must not
		// result in an empty page
		$content = is_readable($infile) ? file_get_contents($infile) : false;
		if($content === false) {
			$this->success = false;
			return false;
		}

		try {
			$imagick = new Imagick();
			/* Setting a smaller resolution will speed up the conversion
			 * A resolution of 72,72 will create a 596x842 image
			 * Setting it to 36,36 will create a 298x421 image which should
			 * be sufficient in most cases, but keep in mind that images are
			 * not scaled up. Hence, a width of 400px still results in a 298px
			 * wide image
			 */
			$imagick->setResolution(72,72);

			if(!$imagick->newImage($boxWidth, $boxHeight, "white")) {
				$this->success = false;
				return false;
			}

			$draw = new ImagickDraw();
			$draw->setStrokeColor("none");
			$draw->setFont("Courier");
			$draw->setFontSize($fontSize);
			$draw->setTextAlignment(Imagick::ALIGN_LEFT);

			$lines = preg_split('~\R~',$content);
			$maxWidth = $boxWidth-$boxLeft-$boxRight;
			$pageEnd = $boxHeight-$boxBottom;
			$boxY = $boxTop;
			$pagecount = 0;
			foreach($lines as $line) {
				// Compare with the empty string, because a line containing
				// just "0" is not an empty line
				if($line !== '') {
					list($rlines, $lineHeight) = $this->wordWrapAnnotation($imagick, $draw, $line, $maxWidth);
					foreach($rlines as $rline) {
						// Start a new page within a wrapped paragraph too,
						// otherwise the lines beyond the end of the page
						// would be missing on the next page
						if($boxY >= $pageEnd) {
							$pagecount++;
							$boxY = $boxTop;
							if($pagecount > $page)
								break 2;
						}
						if($pagecount == $page)
							$imagick->annotateImage($draw, $boxLeft, $boxY, 0, $rline);
						$boxY += $lineHeight;
					}
				} else {
					$boxY += $parSep;
				}
				if($boxY >= $pageEnd) {
					$pagecount++;
					$boxY = $boxTop;
					if($pagecount > $page)
						break;
				}
			}

			// Never scale up, just like the other image services
			if(!empty($params['width']))
				$imagick->scaleImage(min((int) $params['width'], $imagick->getImageWidth()), 0);
			$imagick->setImageFormat('png');
			$end = microtime(true);
			if($this->logger) {
				$this->logger->log('Conversion from '.$this->from.' to '.$this->to.' with text service took '.($end-$start).' sec.', PEAR_LOG_INFO);
			}
			if($target) {
				return $imagick->writeImage($target);
			}
			return $imagick->getImageBlob();
		} catch (ImagickException $e) {
			// Report the failure like the pdf to image service does
			$this->success = false;
			return false;
		}
	} /* }}} */
}

View File

@ -24,6 +24,7 @@ require_once("inc/inc.ClassConversionServiceBase.php");
*/
class SeedDMS_ConversionServiceTextToText extends SeedDMS_ConversionServiceBase {
public function __construct($from, $to) {
	// The parent constructor resets the common state, hence it must run
	// before source and target are stored.
	parent::__construct();
	$this->to = $to;
	$this->from = $from;
}

View File

@ -7,17 +7,27 @@
* @license GPL 2
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2016 Uwe Steinmann
* @copyright Copyright (C) 2021-2023 Uwe Steinmann
* @version Release: @package_version@
*/
/**
* Implementation of fulltext service
*
* The fulltext service is a wrapper around single services for a full text
* search. Such a service can be based on Solr, SQLite, etc. It implements
* three major methods:
* IndexedDocument() for creating an instance of an indexed document
* Indexer() for creating an instance of the index
* Search() for creating an instance of a search frontend
*
* Though this class can manage more than one service, it will only
* use the first one.
*
* @category DMS
* @package SeedDMS
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2016 Uwe Steinmann
* @copyright Copyright (C) 2021-2023 Uwe Steinmann
* @version Release: @package_version@
*/
class SeedDMS_FulltextService {
@ -136,7 +146,7 @@ class SeedDMS_FulltextService {
/**
* Returns callback function to convert a document into plain text
*
* This variant just uses the text previewer which
* This variant uses the text previewer which
* caches the converted document
*/
public function getConversionWithPreviewCallback() { /* {{{ */
@ -162,7 +172,7 @@ class SeedDMS_FulltextService {
} /* }}} */
/**
* Return an indexable document from the given document or folder
* Return an indexable document based on the given document or folder
*
* @param SeedDMS_Core_Document|SeedDMS_Core_Folder $object document or folder
* to be indexed
@ -183,7 +193,7 @@ class SeedDMS_FulltextService {
/**
* Returns an instance of the indexer
*
* The indexer provides access to fulltext index. It allows to add and
* The indexer provides access to the fulltext index. It allows adding and
* retrieving documents.
*
* @return object instance of class specified in 'Indexer'

View File

@ -34,6 +34,10 @@ if(extension_loaded('gd') || extension_loaded('imagick')) {
$conversionmgr->addService(new SeedDMS_ConversionServiceImageToImage('image/gif', 'image/png'))->setLogger($logger);
}
/* The text to image service is implemented with the Imagick classes,
 * hence it is only registered if the imagick extension is loaded.
 */
if(extension_loaded('imagick')) {
$conversionmgr->addService(new SeedDMS_ConversionServiceTextToImage('text/plain', 'image/png'))->setLogger($logger);
}
$conversionmgr->addService(new SeedDMS_ConversionServiceImageToText('image/jpeg', 'text/plain'))->setLogger($logger);
$conversionmgr->addService(new SeedDMS_ConversionServiceImageToText('image/jpg', 'text/plain'))->setLogger($logger);

View File

@ -42,6 +42,7 @@ if($settings->_enableFullSearch) {
$fulltextservice->addService($settings->_fullSearchEngine, $indexconf);
}
}
/* setConverters() is deprecated */
$fulltextservice->setConverters(isset($settings->_converters['fulltext']) ? $settings->_converters['fulltext'] : null);
$fulltextservice->setConversionMgr($conversionmgr);
$fulltextservice->setMaxSize($settings->_maxSizeForFullText);

View File

@ -1064,6 +1064,7 @@ switch($command) {
}
if($object) {
if($index = $fulltextservice->Indexer()) {
$index->init($settings->_stopWordsFile);
$idoc = $fulltextservice->IndexedDocument($object, true);
$error = $idoc->getErrorMsg();
if(!$error) {