add support for indexing folders, remove old predefined indexing commands

This commit is contained in:
Uwe Steinmann 2020-12-12 16:27:53 +01:00
parent f2c376cadc
commit 82e5bf4687
8 changed files with 227 additions and 181 deletions

View File

@ -91,7 +91,7 @@ class SeedDMS_Lucene_IndexedDocument extends Zend_Search_Lucene_Document {
* Constructor. Creates our indexable document and adds all
* necessary fields to it using the passed in document
* @param SeedDMS_Core_DMS $dms
* @param SeedDMS_Core_Document $document
* @param SeedDMS_Core_Document|SeedDMS_Core_Folder $document
* @param null $convcmd
* @param bool $nocontent
* @param int $timeout
@ -100,44 +100,8 @@ class SeedDMS_Lucene_IndexedDocument extends Zend_Search_Lucene_Document {
$this->errormsg = '';
$this->cmd = '';
$this->mimetype = '';
$_convcmd = array(
'application/pdf' => 'pdftotext -enc UTF-8 -nopgbrk %s - |sed -e \'s/ [a-zA-Z0-9.]\{1\} / /g\' -e \'s/[0-9.]//g\'',
'application/postscript' => 'ps2pdf14 %s - | pdftotext -enc UTF-8 -nopgbrk - - | sed -e \'s/ [a-zA-Z0-9.]\{1\} / /g\' -e \'s/[0-9.]//g\'',
'application/msword' => 'catdoc %s',
'application/vnd.ms-excel' => 'ssconvert -T Gnumeric_stf:stf_csv -S %s fd://1',
'audio/mp3' => "id3 -l -R %s | egrep '(Title|Artist|Album)' | sed 's/^[^:]*: //g'",
'audio/mpeg' => "id3 -l -R %s | egrep '(Title|Artist|Album)' | sed 's/^[^:]*: //g'",
'text/plain' => 'cat %s',
);
if($convcmd) {
$_convcmd = $convcmd;
}
$version = $document->getLatestContent();
$this->addField(Zend_Search_Lucene_Field::Keyword('document_id', $document->getID()));
if($version) {
$this->addField(Zend_Search_Lucene_Field::Keyword('mimetype', $version->getMimeType()));
$this->addField(Zend_Search_Lucene_Field::Keyword('origfilename', $version->getOriginalFileName(), 'utf-8'));
if(!$nocontent)
$this->addField(Zend_Search_Lucene_Field::UnIndexed('created', $version->getDate()));
if($attributes = $version->getAttributes()) {
foreach($attributes as $attribute) {
$attrdef = $attribute->getAttributeDefinition();
if($attrdef->getValueSet() != '')
$this->addField(Zend_Search_Lucene_Field::Keyword('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue(), 'utf-8'));
else
$this->addField(Zend_Search_Lucene_Field::Text('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue(), 'utf-8'));
}
}
}
$this->addField(Zend_Search_Lucene_Field::Text('title', $document->getName(), 'utf-8'));
if($categories = $document->getCategories()) {
$names = array();
foreach($categories as $cat) {
$names[] = $cat->getName();
}
$this->addField(Zend_Search_Lucene_Field::Text('category', implode(' ', $names), 'utf-8'));
}
if($acllist = $document->getReadAccessList(1, 1, 1)) {
$allu = [];
foreach($acllist['users'] as $u)
@ -159,49 +123,79 @@ class SeedDMS_Lucene_IndexedDocument extends Zend_Search_Lucene_Document {
$this->addField(Zend_Search_Lucene_Field::Text('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue(), 'utf-8'));
}
}
$owner = $document->getOwner();
$this->addField(Zend_Search_Lucene_Field::Text('owner', $owner->getLogin(), 'utf-8'));
if($keywords = $document->getKeywords()) {
$this->addField(Zend_Search_Lucene_Field::Text('keywords', $keywords, 'utf-8'));
}
if($comment = $document->getComment()) {
$this->addField(Zend_Search_Lucene_Field::Text('comment', $comment, 'utf-8'));
}
if($version) {
$status = $version->getStatus();
$this->addField(Zend_Search_Lucene_Field::Keyword('status', $status['status'], 'utf-8'));
}
if($version && !$nocontent) {
$path = $dms->contentDir . $version->getPath();
if(file_exists($path)) {
$content = '';
$mimetype = $version->getMimeType();
$this->mimetype = $mimetype;
$cmd = '';
$mimeparts = explode('/', $mimetype, 2);
if(isset($_convcmd[$mimetype])) {
$cmd = sprintf($_convcmd[$mimetype], $path);
} elseif(isset($_convcmd[$mimeparts[0].'/*'])) {
$cmd = sprintf($_convcmd[$mimetype], $path);
} elseif(isset($_convcmd['*'])) {
$cmd = sprintf($_convcmd[$mimetype], $path);
}
if($cmd) {
$this->cmd = $cmd;
try {
$content = self::execWithTimeout($cmd, $timeout);
if($content['stdout']) {
$this->addField(Zend_Search_Lucene_Field::UnStored('content', $content['stdout'], 'utf-8'));
}
if($content['stderr']) {
$this->errormsg = $content['stderr'];
}
} catch (Exception $e) {
if($document->isType('document')) {
$this->addField(Zend_Search_Lucene_Field::Keyword('document_id', 'D'.$document->getID()));
$version = $document->getLatestContent();
if($version) {
$this->addField(Zend_Search_Lucene_Field::Keyword('mimetype', $version->getMimeType()));
$this->addField(Zend_Search_Lucene_Field::Keyword('origfilename', $version->getOriginalFileName(), 'utf-8'));
if(!$nocontent)
$this->addField(Zend_Search_Lucene_Field::UnIndexed('created', $version->getDate()));
if($attributes = $version->getAttributes()) {
foreach($attributes as $attribute) {
$attrdef = $attribute->getAttributeDefinition();
if($attrdef->getValueSet() != '')
$this->addField(Zend_Search_Lucene_Field::Keyword('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue(), 'utf-8'));
else
$this->addField(Zend_Search_Lucene_Field::Text('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue(), 'utf-8'));
}
}
}
}
if($categories = $document->getCategories()) {
$names = array();
foreach($categories as $cat) {
$names[] = $cat->getName();
}
$this->addField(Zend_Search_Lucene_Field::Text('category', implode(' ', $names), 'utf-8'));
}
if($keywords = $document->getKeywords()) {
$this->addField(Zend_Search_Lucene_Field::Text('keywords', $keywords, 'utf-8'));
}
if($version) {
$status = $version->getStatus();
$this->addField(Zend_Search_Lucene_Field::Keyword('status', $status['status'], 'utf-8'));
}
if($version && !$nocontent) {
$path = $dms->contentDir . $version->getPath();
if(file_exists($path)) {
$content = '';
$mimetype = $version->getMimeType();
$this->mimetype = $mimetype;
$cmd = '';
$mimeparts = explode('/', $mimetype, 2);
if(isset($convcmd[$mimetype])) {
$cmd = sprintf($convcmd[$mimetype], $path);
} elseif(isset($convcmd[$mimeparts[0].'/*'])) {
$cmd = sprintf($convcmd[$mimetype], $path);
} elseif(isset($convcmd['*'])) {
$cmd = sprintf($convcmd[$mimetype], $path);
}
if($cmd) {
$this->cmd = $cmd;
try {
$content = self::execWithTimeout($cmd, $timeout);
if($content['stdout']) {
$this->addField(Zend_Search_Lucene_Field::UnStored('content', $content['stdout'], 'utf-8'));
}
if($content['stderr']) {
$this->errormsg = $content['stderr'];
}
} catch (Exception $e) {
}
}
}
}
} elseif($document->isType('folder')) {
$this->addField(Zend_Search_Lucene_Field::Keyword('document_id', 'F'.$document->getID()));
$this->addField(Zend_Search_Lucene_Field::UnIndexed('created', $document->getDate()));
}
} /* }}} */
public function getErrorMsg() { /* {{{ */

View File

@ -49,7 +49,18 @@ class SeedDMS_Lucene_Search {
* @return object instance of SeedDMS_Lucene_Document or false
*/
function getDocument($id) { /* {{{ */
$hits = $this->index->find('document_id:'.$id);
$hits = $this->index->find('document_id:D'.$id);
return $hits ? $hits[0] : false;
} /* }}} */
/**
* Get folder from index
*
* @param int $id id of the folder (without the 'F' prefix used in the index)
* @return object|false first hit found for the folder, or false if there is none
*/
function getFolder($id) { /* {{{ */
	// The lookup term is the folder id with an 'F' prefix in the
	// 'document_id' field.
	$matches = $this->index->find('document_id:F'.$id);
	if(!$matches) {
		return false;
	}
	// Only the first hit is handed back to the caller.
	return $matches[0];
} /* }}} */

View File

@ -44,7 +44,8 @@ class SeedDMS_SQLiteFTS_Document {
} /* }}} */
public function addField($key, $value) { /* {{{ */
if($key == 'document_id') {
//if($key == 'document_id') {
if($key == 'docid') {
$this->id = $this->fields[$key] = (int) $value;
} else {
if(isset($this->fields[$key]))

View File

@ -94,62 +94,8 @@ class SeedDMS_SQLiteFTS_IndexedDocument extends SeedDMS_SQLiteFTS_Document {
$this->errormsg = '';
$this->cmd = '';
$this->mimetype = '';
$_convcmd = array(
'application/pdf' => 'pdftotext -enc UTF-8 -nopgbrk %s - |sed -e \'s/ [a-zA-Z0-9.]\{1\} / /g\' -e \'s/[0-9.]//g\'',
'application/postscript' => 'ps2pdf14 %s - | pdftotext -enc UTF-8 -nopgbrk - - | sed -e \'s/ [a-zA-Z0-9.]\{1\} / /g\' -e \'s/[0-9.]//g\'',
'application/msword' => 'catdoc %s',
'application/vnd.ms-excel' => 'ssconvert -T Gnumeric_stf:stf_csv -S %s fd://1',
'audio/mp3' => "id3 -l -R %s | egrep '(Title|Artist|Album)' | sed 's/^[^:]*: //g'",
'audio/mpeg' => "id3 -l -R %s | egrep '(Title|Artist|Album)' | sed 's/^[^:]*: //g'",
'text/plain' => 'cat %s',
);
if($convcmd) {
$_convcmd = $convcmd;
}
$version = $document->getLatestContent();
$this->addField('document_id', $document->getID());
if($version) {
$this->addField('mimetype', $version->getMimeType());
$this->addField('origfilename', $version->getOriginalFileName());
if(!$nocontent)
$this->addField('created', $version->getDate(), 'unindexed');
if($attributes = $version->getAttributes()) {
foreach($attributes as $attribute) {
$attrdef = $attribute->getAttributeDefinition();
if($attrdef->getValueSet() != '')
$this->addField('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue());
else
$this->addField('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue());
}
}
}
$this->addField('title', $document->getName());
if($categories = $document->getCategories()) {
$names = array();
foreach($categories as $cat) {
$names[] = $cat->getName();
}
$this->addField('category', implode(' ', $names));
}
if($attributes = $document->getAttributes()) {
foreach($attributes as $attribute) {
$attrdef = $attribute->getAttributeDefinition();
if($attrdef->getValueSet() != '')
$this->addField('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue());
else
$this->addField('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue());
}
}
$owner = $document->getOwner();
$this->addField('owner', $owner->getLogin());
if($keywords = $document->getKeywords()) {
$this->addField('keywords', $keywords);
}
if($comment = $document->getComment()) {
$this->addField('comment', $comment);
}
if($acllist = $document->getReadAccessList(1, 1, 1)) {
$allu = [];
foreach($acllist['users'] as $u)
@ -162,39 +108,85 @@ class SeedDMS_SQLiteFTS_IndexedDocument extends SeedDMS_SQLiteFTS_Document {
$this->addField('groups', implode(' ', $allg));
*/
}
if($version) {
$status = $version->getStatus();
$this->addField('status', $status['status']+10);
if($attributes = $document->getAttributes()) {
foreach($attributes as $attribute) {
$attrdef = $attribute->getAttributeDefinition();
if($attrdef->getValueSet() != '')
$this->addField('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue());
else
$this->addField('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue());
}
}
if($version && !$nocontent) {
$path = $dms->contentDir . $version->getPath();
if(file_exists($path)) {
$content = '';
$mimetype = $version->getMimeType();
$this->mimetype = $mimetype;
$cmd = '';
$mimeparts = explode('/', $mimetype, 2);
if(isset($_convcmd[$mimetype])) {
$cmd = sprintf($_convcmd[$mimetype], $path);
} elseif(isset($_convcmd[$mimeparts[0].'/*'])) {
$cmd = sprintf($_convcmd[$mimetype], $path);
} elseif(isset($_convcmd['*'])) {
$cmd = sprintf($_convcmd[$mimetype], $path);
}
if($cmd) {
$this->cmd = $cmd;
try {
$content = self::execWithTimeout($cmd, $timeout);
if($content['stdout']) {
$this->addField('content', $content['stdout'], 'unstored');
}
if($content['stderr']) {
$this->errormsg = $content['stderr'];
}
} catch (Exception $e) {
$owner = $document->getOwner();
$this->addField('owner', $owner->getLogin());
if($comment = $document->getComment()) {
$this->addField('comment', $comment);
}
if($document->isType('document')) {
$this->addField('document_id', 'D'.$document->getID());
$version = $document->getLatestContent();
if($version) {
$this->addField('mimetype', $version->getMimeType());
$this->addField('origfilename', $version->getOriginalFileName());
if(!$nocontent)
$this->addField('created', $version->getDate(), 'unindexed');
if($attributes = $version->getAttributes()) {
foreach($attributes as $attribute) {
$attrdef = $attribute->getAttributeDefinition();
if($attrdef->getValueSet() != '')
$this->addField('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue());
else
$this->addField('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue());
}
}
}
if($categories = $document->getCategories()) {
$names = array();
foreach($categories as $cat) {
$names[] = $cat->getName();
}
$this->addField('category', implode(' ', $names));
}
if($keywords = $document->getKeywords()) {
$this->addField('keywords', $keywords);
}
if($version) {
$status = $version->getStatus();
$this->addField('status', $status['status']+10);
}
if($version && !$nocontent) {
$path = $dms->contentDir . $version->getPath();
if(file_exists($path)) {
$content = '';
$mimetype = $version->getMimeType();
$this->mimetype = $mimetype;
$cmd = '';
$mimeparts = explode('/', $mimetype, 2);
if(isset($convcmd[$mimetype])) {
$cmd = sprintf($convcmd[$mimetype], $path);
} elseif(isset($convcmd[$mimeparts[0].'/*'])) {
$cmd = sprintf($convcmd[$mimetype], $path);
} elseif(isset($convcmd['*'])) {
$cmd = sprintf($convcmd[$mimetype], $path);
}
if($cmd) {
$this->cmd = $cmd;
try {
$content = self::execWithTimeout($cmd, $timeout);
if($content['stdout']) {
$this->addField('content', $content['stdout'], 'unstored');
}
if($content['stderr']) {
$this->errormsg = $content['stderr'];
}
} catch (Exception $e) {
}
}
}
}
} elseif($document->isType('folder')) {
$this->addField('document_id', 'F'.$document->getID());
}
} /* }}} */

View File

@ -63,9 +63,9 @@ class SeedDMS_SQLiteFTS_Indexer {
*/
$version = SQLite3::version();
if($version['versionNumber'] >= 3008000)
$sql = 'CREATE VIRTUAL TABLE docs USING fts4(title, comment, keywords, category, mimetype, origfilename, owner, content, created, users, status, notindexed=created, matchinfo=fts3)';
$sql = 'CREATE VIRTUAL TABLE docs USING fts4(documentid, title, comment, keywords, category, mimetype, origfilename, owner, content, created, users, status, notindexed=created, matchinfo=fts3)';
else
$sql = 'CREATE VIRTUAL TABLE docs USING fts4(title, comment, keywords, category, mimetype, origfilename, owner, content, created, users, status, matchinfo=fts3)';
$sql = 'CREATE VIRTUAL TABLE docs USING fts4(documentid, title, comment, keywords, category, mimetype, origfilename, owner, content, created, users, status, matchinfo=fts3)';
$res = $index->_conn->exec($sql);
if($res === false) {
return null;
@ -96,7 +96,7 @@ class SeedDMS_SQLiteFTS_Indexer {
if(!$this->_conn)
return false;
$sql = "INSERT INTO docs (docid, title, comment, keywords, category, owner, content, mimetype, origfilename, created, users, status) VALUES(".$doc->getFieldValue('document_id').", ".$this->_conn->quote($doc->getFieldValue('title')).", ".$this->_conn->quote($doc->getFieldValue('comment')).", ".$this->_conn->quote($doc->getFieldValue('keywords')).", ".$this->_conn->quote($doc->getFieldValue('category')).", ".$this->_conn->quote($doc->getFieldValue('owner')).", ".$this->_conn->quote($doc->getFieldValue('content')).", ".$this->_conn->quote($doc->getFieldValue('mimetype')).", ".$this->_conn->quote($doc->getFieldValue('origfilename')).", ".(int)$doc->getFieldValue('created').", ".$this->_conn->quote($doc->getFieldValue('users')).", ".$this->_conn->quote($doc->getFieldValue('status'))/*time()*/.")";
$sql = "INSERT INTO docs (documentid, title, comment, keywords, category, owner, content, mimetype, origfilename, created, users, status) VALUES (".$this->_conn->quote($doc->getFieldValue('document_id')).", ".$this->_conn->quote($doc->getFieldValue('title')).", ".$this->_conn->quote($doc->getFieldValue('comment')).", ".$this->_conn->quote($doc->getFieldValue('keywords')).", ".$this->_conn->quote($doc->getFieldValue('category')).", ".$this->_conn->quote($doc->getFieldValue('owner')).", ".$this->_conn->quote($doc->getFieldValue('content')).", ".$this->_conn->quote($doc->getFieldValue('mimetype')).", ".$this->_conn->quote($doc->getFieldValue('origfilename')).", ".(int)$doc->getFieldValue('created').", ".$this->_conn->quote($doc->getFieldValue('users')).", ".$this->_conn->quote($doc->getFieldValue('status'))/*time()*/.")";
$res = $this->_conn->exec($sql);
if($res === false) {
return false;
@ -150,7 +150,7 @@ class SeedDMS_SQLiteFTS_Indexer {
$res = $this->_conn->query($sql);
$row = $res->fetch();
$sql = "SELECT docid FROM docs";
$sql = "SELECT docid, documentid FROM docs";
if($query)
$sql .= " WHERE docs MATCH ".$this->_conn->quote($query);
$res = $this->_conn->query($sql);
@ -164,6 +164,7 @@ class SeedDMS_SQLiteFTS_Indexer {
foreach($res as $rec) {
$hit = new SeedDMS_SQLiteFTS_QueryHit($this);
$hit->id = $rec['docid'];
$hit->documentid = $rec['documentid'];
$hits[] = $hit;
}
}
@ -196,19 +197,21 @@ class SeedDMS_SQLiteFTS_Indexer {
/**
* Get a single document from index
*
* @param integer $id id of document
* @param integer $id id of index record
* @return SeedDMS_SQLiteFTS_Document|false document built from the index record, false in case of an error
*/
public function getDocument($id) { /* {{{ */
if(!$this->_conn)
return false;
$sql = "SELECT title, comment, owner, keywords, category, mimetype, origfilename, created, users, status FROM docs WHERE docid=".(int) $id;
$sql = "SELECT docid, documentid, title, comment, owner, keywords, category, mimetype, origfilename, created, users, status FROM docs WHERE docid=".$id;
$res = $this->_conn->query($sql);
$doc = false;
if($res) {
$rec = $res->fetch(PDO::FETCH_ASSOC);
$doc = new SeedDMS_SQLiteFTS_Document();
$doc->addField('docid', $rec['docid']);
$doc->addField('document_id', $rec['documentid']);
$doc->addField('title', $rec['title']);
$doc->addField('comment', $rec['comment']);
$doc->addField('keywords', $rec['keywords']);
@ -223,6 +226,33 @@ class SeedDMS_SQLiteFTS_Indexer {
return $doc;
} /* }}} */
/**
* Get a single folder from index
*
* @param integer $id id of folder
* @return SeedDMS_SQLiteFTS_Document|false folder record as a document object, or false on failure
*/
public function getFolder($id) { /* {{{ */
	// Without a database connection nothing can be looked up.
	if(!$this->_conn)
		return false;

	// The lookup value is the folder id with an 'F' prefix, compared against
	// the documentid column. It is passed through quote() so that $id can
	// never break out of the SQL string literal.
	$sql = "SELECT docid, documentid, title, comment, owner, keywords, category, mimetype, origfilename, created, users, status FROM docs WHERE documentid=".$this->_conn->quote('F'.$id);
	$res = $this->_conn->query($sql);
	if(!$res)
		return false;

	// fetch() yields false when no row matched. Without this guard an
	// empty document object would be returned for a folder that is not
	// in the index.
	$rec = $res->fetch(PDO::FETCH_ASSOC);
	if(!$rec)
		return false;

	// Copy the subset of columns that is set for folders into the
	// document object; 'docid' is the key of the index record itself,
	// 'document_id' the prefixed id of the folder.
	$doc = new SeedDMS_SQLiteFTS_Document();
	$doc->addField('docid', $rec['docid']);
	$doc->addField('document_id', $rec['documentid']);
	$doc->addField('title', $rec['title']);
	$doc->addField('comment', $rec['comment']);
	$doc->addField('owner', $rec['owner']);
	$doc->addField('created', $rec['created']);
	$doc->addField('users', $rec['users']);
	return $doc;
} /* }}} */
/**
* Return list of terms in index
*

View File

@ -37,11 +37,17 @@ class SeedDMS_SQLiteFTS_QueryHit {
protected $_document;
/**
* @var integer $id id of document
* @var integer $id id of index document
* @access public
*/
public $id;
/**
* @var string $documentid id of real document or folder, prefixed with 'D' or 'F'
* @access public
*/
public $documentid;
/**
*
*/

View File

@ -43,14 +43,25 @@ class SeedDMS_SQliteFTS_Search {
} /* }}} */
/**
* Get hit from index
* Get document from index
*
* @param object $index lucene index
* @return object instance of SeedDMS_Lucene_Document of false
* @param int $id real document id
* @return object instance of SeedDMS_SQliteFTS_QueryHit or false
*/
function getDocument($id) { /* {{{ */
$hits = $this->index->findById((int) $id);
return $hits ? $hits[0] : false;
$hits = $this->index->find('D'.$id);
return $hits['hits'] ? $hits['hits'][0] : false;
} /* }}} */
/**
* Get folder from index
*
* @param int $id real folder id
* @return object instance of SeedDMS_SQliteFTS_QueryHit or false
*/
function getFolder($id) { /* {{{ */
	// Restrict the full text match to the 'documentid' column. A bare
	// term like 'F12' would be matched against every indexed column, so
	// any record containing the token 'F12' in its title or content
	// could be returned as folder 12.
	// The cast keeps query operators out of the match expression.
	// NOTE(review): relies on find() passing the query string unmodified
	// into the MATCH clause -- confirm against the indexer.
	$result = $this->index->find('documentid:F'.(int) $id);
	return $result['hits'] ? $result['hits'][0] : false;
} /* }}} */
/**
@ -102,7 +113,7 @@ class SeedDMS_SQliteFTS_Search {
$result = $this->index->find($querystr, $limit);
$recs = array();
foreach($result["hits"] as $hit) {
$recs[] = array('id'=>$hit->id, 'document_id'=>$hit->id);
$recs[] = array('id'=>$hit->id, 'document_id'=>$hit->documentid);
}
return array('count'=>$result['count'], 'hits'=>$recs, 'facets'=>array());
} catch (Exception $e) {

View File

@ -48,17 +48,18 @@ class SeedDMS_SQLiteFTS_Term {
public function __construct($term, $col, $occurrence) { /* {{{ */
$this->text = $term;
$fields = array(
0 => 'title',
1 => 'comment',
2 => 'keywords',
3 => 'category',
4 => 'mimetype',
5 => 'origfilename',
6 => 'owner',
7 => 'content',
8 => 'created',
9 => 'user',
10 => 'status'
0 => 'documentid',
1 => 'title',
2 => 'comment',
3 => 'keywords',
4 => 'category',
5 => 'mimetype',
6 => 'origfilename',
7 => 'owner',
8 => 'content',
9 => 'created',
10 => 'user',
11 => 'status'
);
$this->field = $fields[$col];
$this->_occurrence = $occurrence;