add classes for creating fulltext index using sqlitefts

This commit is contained in:
Uwe Steinmann 2015-08-10 21:39:05 +02:00
parent 8ca8e17047
commit 65da6a4e7b
9 changed files with 783 additions and 0 deletions

View File

@ -0,0 +1,44 @@
<?php
// SeedDMS. Document Management System
// Copyright (C) 2011-2015 Uwe Steinmann
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/**
* @uses SeedDMS_SQLiteFTS_Indexer
*/
require_once('SQLiteFTS/Indexer.php');
/**
* @uses SeedDMS_SQLiteFTS_Search
*/
require_once('SQLiteFTS/Search.php');
/**
* @uses SeedDMS_SQLiteFTS_Term
*/
require_once('SQLiteFTS/Term.php');
/**
* @uses SeedDMS_SQLiteFTS_QueryHit
*/
require_once('SQLiteFTS/QueryHit.php');
/**
* @uses SeedDMS_SQLiteFTS_IndexedDocument
*/
require_once('SQLiteFTS/IndexedDocument.php');
?>

View File

@ -0,0 +1,58 @@
<?php
/**
* Implementation of a document
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @license GPL 2
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2010, Uwe Steinmann
* @version Release: @package_version@
*/
/**
* Class for managing a document.
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2011, Uwe Steinmann
* @version Release: @package_version@
*/
class SeedDMS_SQLiteFTS_Document {

	/**
	 * @var integer $id id of document, set when the field 'document_id' is added
	 * @access public
	 */
	public $id;

	/**
	 * @var array $fields field values indexed by field name
	 * @access protected
	 */
	protected $fields = array();

	/**
	 * Add a field to the document
	 *
	 * The field 'document_id' is cast to an integer, stored and also copied
	 * into the property $id. Adding any other field a second time appends
	 * the new value to the existing one, separated by a single blank.
	 *
	 * @param string $key name of field
	 * @param mixed $value value of field
	 * @param string $type optional field type (e.g. 'unindexed', 'unstored').
	 *        Accepted because SeedDMS_SQLiteFTS_IndexedDocument passes it,
	 *        but it is not evaluated by this class.
	 */
	public function addField($key, $value, $type=null) { /* {{{ */
		// strict comparison: a non-string key must never be mistaken for
		// the document id
		if($key === 'document_id') {
			$this->id = $this->fields[$key] = (int) $value;
			return;
		}

		if(isset($this->fields[$key]))
			$this->fields[$key] .= ' '.$value;
		else
			$this->fields[$key] = $value;
	} /* }}} */

	/**
	 * Get the value of a field
	 *
	 * @param string $key name of field
	 * @return mixed value of field or false if the field is not set
	 */
	public function getFieldValue($key) { /* {{{ */
		if(isset($this->fields[$key]))
			return $this->fields[$key];

		return false;
	} /* }}} */
}
?>

View File

@ -0,0 +1,140 @@
<?php
/**
* Implementation of an indexed document
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @license GPL 2
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2010, Uwe Steinmann
* @version Release: @package_version@
*/
/**
* @uses SeedDMS_SQLiteFTS_Document
*/
require_once('Document.php');
/**
* Class for managing an indexed document.
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2011, Uwe Steinmann
* @version Release: @package_version@
*/
class SeedDMS_SQLiteFTS_IndexedDocument extends SeedDMS_SQLiteFTS_Document {

	/**
	 * Run a shell command and return what it writes to stdout
	 *
	 * The timeout is measured in whole seconds (based on time()).
	 *
	 * @param string $cmd command line passed to proc_open()
	 * @param integer $timeout number of seconds the command may run
	 * @return string output of the command
	 * @throws Exception if the command cannot be started or the time is
	 *         used up
	 */
	static function execWithTimeout($cmd, $timeout=2) { /* {{{ */
		$descriptorspec = array(
			0 => array("pipe", "r"),
			1 => array("pipe", "w"),
			2 => array("pipe", "w")
		);
		$pipes = array();
		$deadline = time() + $timeout;

		$process = proc_open($cmd, $descriptorspec, $pipes);
		if (!is_resource($process)) {
			throw new Exception("proc_open failed on: " . $cmd);
		}

		$output = '';
		do {
			$timeleft = $deadline - time();
			$read = array($pipes[1]);
			// stream_select() takes these arrays by reference, hence they
			// must be real variables and not expressions
			$write = null;
			$except = null;
			// never hand a negative number of seconds to stream_select()
			stream_select($read, $write, $except, max(0, $timeleft));
			if (!empty($read)) {
				$output .= fread($pipes[1], 8192);
			}
		} while (!feof($pipes[1]) && $timeleft > 0);

		$timedout = ($timeleft <= 0);
		if ($timedout) {
			proc_terminate($process);
		}

		// release the pipes and the process handle explicitly
		foreach ($pipes as $pipe) {
			if (is_resource($pipe))
				fclose($pipe);
		}
		proc_close($process);

		if ($timedout) {
			throw new Exception("command timeout on: " . $cmd);
		}
		return $output;
	} /* }}} */

	/**
	 * Add one field 'attr_<name>' for each attribute in the passed list
	 *
	 * Blanks in the attribute name are replaced by underscores.
	 *
	 * @param array $attributes list of attributes, may be empty or false
	 */
	private function addAttributeFields($attributes) { /* {{{ */
		if(!$attributes)
			return;

		foreach($attributes as $attribute) {
			$attrdef = $attribute->getAttributeDefinition();
			$this->addField('attr_'.str_replace(' ', '_', $attrdef->getName()), $attribute->getValue());
		}
	} /* }}} */

	/**
	 * Constructor. Creates our indexable document and adds all
	 * necessary fields to it using the passed in document
	 *
	 * @param object $dms dms object, its property contentDir is used to
	 *        locate the file of the latest version
	 * @param object $document document to be indexed
	 * @param array $convcmd list of commands indexed by mimetype which
	 *        convert a file into plain text. '%s' is replaced by the path
	 *        of the file. If set, it replaces the built-in list.
	 * @param boolean $nocontent set to true if the content of the file
	 *        shall not be indexed
	 * @param integer $timeout number of seconds the conversion command may
	 *        run
	 * @throws Exception if the conversion command fails to start or runs
	 *         into the timeout
	 */
	public function __construct($dms, $document, $convcmd=null, $nocontent=false, $timeout=5) {
		$_convcmd = array(
			'application/pdf' => 'pdftotext -enc UTF-8 -nopgbrk %s - |sed -e \'s/ [a-zA-Z0-9.]\{1\} / /g\' -e \'s/[0-9.]//g\'',
			'application/msword' => 'catdoc %s',
			'application/vnd.ms-excel' => 'ssconvert -T Gnumeric_stf:stf_csv -S %s fd://1',
			'audio/mp3' => "id3 -l -R %s | egrep '(Title|Artist|Album)' | sed 's/^[^:]*: //g'",
			'audio/mpeg' => "id3 -l -R %s | egrep '(Title|Artist|Album)' | sed 's/^[^:]*: //g'",
			'text/plain' => 'cat %s',
		);
		if($convcmd) {
			$_convcmd = $convcmd;
		}

		$version = $document->getLatestContent();
		$this->addField('document_id', $document->getID());
		if($version) {
			$this->addField('mimetype', $version->getMimeType());
			$this->addField('origfilename', $version->getOriginalFileName());
			if(!$nocontent)
				$this->addField('created', $version->getDate(), 'unindexed');
			$this->addAttributeFields($version->getAttributes());
		}

		$this->addField('title', $document->getName());
		if($categories = $document->getCategories()) {
			$names = array();
			foreach($categories as $cat) {
				$names[] = $cat->getName();
			}
			$this->addField('category', implode(' ', $names));
		}
		$this->addAttributeFields($document->getAttributes());

		$owner = $document->getOwner();
		$this->addField('owner', $owner->getLogin());
		if($keywords = $document->getKeywords()) {
			$this->addField('keywords', $keywords);
		}
		if($comment = $document->getComment()) {
			$this->addField('comment', $comment);
		}

		if($version && !$nocontent) {
			$path = $dms->contentDir . $version->getPath();
			$mimetype = $version->getMimeType();
			if(isset($_convcmd[$mimetype])) {
				// NOTE(review): $path is inserted into the command line
				// without shell quoting. Quoting it here would break
				// command templates which already quote '%s' themselves,
				// so this is left as is -- confirm that contentDir never
				// contains blanks or shell meta characters.
				$cmd = sprintf($_convcmd[$mimetype], $path);
				$content = self::execWithTimeout($cmd, $timeout);
				if($content) {
					$this->addField('content', $content, 'unstored');
				}
			}
		}
	}
}
?>

View File

@ -0,0 +1,251 @@
<?php
/**
* Implementation of SQLiteFTS index
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @license GPL 2
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2010, Uwe Steinmann
* @version Release: @package_version@
*/
/**
* Class for managing a SQLiteFTS index.
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2011, Uwe Steinmann
* @version Release: @package_version@
*/
class SeedDMS_SQLiteFTS_Indexer {

	/**
	 * @var object $_conn PDO connection to the sqlite database holding the index
	 * @access protected
	 */
	protected $_conn;

	/**
	 * Constructor
	 *
	 * Connects to the sqlite database 'index.db' in the passed directory.
	 *
	 * @param string $indexerDir directory on disk containing the index
	 */
	function __construct($indexerDir) { /* {{{ */
		$this->_conn = new PDO('sqlite:'.$indexerDir.'/index.db');
	} /* }}} */

	/**
	 * Open an existing index
	 *
	 * If there is no index in the directory yet, a new one is created.
	 *
	 * @param string $indexerDir directory on disk containing the index
	 * @return object instance of SeedDMS_SQLiteFTS_Indexer or null if a
	 *         new index could not be created
	 */
	static function open($indexerDir) { /* {{{ */
		if(file_exists($indexerDir.'/index.db')) {
			return new SeedDMS_SQLiteFTS_Indexer($indexerDir);
		} else
			return self::create($indexerDir);
	} /* }}} */

	/**
	 * Create a new index
	 *
	 * An already existing index is removed first.
	 *
	 * @param string $indexerDir directory on disk containing the index
	 * @return object instance of SeedDMS_SQLiteFTS_Indexer or null in case
	 *         of an error
	 */
	static function create($indexerDir) { /* {{{ */
		$dbfile = $indexerDir.'/index.db';
		// Only a failure to remove an existing file is an error. A missing
		// file is the regular case when the index is created the first time.
		if(file_exists($dbfile) && !@unlink($dbfile))
			return null;

		$index = new SeedDMS_SQLiteFTS_Indexer($indexerDir);
		$sql = 'CREATE VIRTUAL TABLE docs USING fts4(title, comment, keywords, category, owner, content, created, notindexed=created, matchinfo=fts3)';
		$res = $index->_conn->exec($sql);
		if($res === false) {
			return null;
		}
		$sql = 'CREATE VIRTUAL TABLE docs_terms USING fts4aux(docs);';
		$res = $index->_conn->exec($sql);
		if($res === false) {
			return null;
		}
		return($index);
	} /* }}} */

	/**
	 * Do some initialization
	 *
	 * This function does nothing!
	 *
	 * @param string $stopWordsFile not used
	 */
	static function init($stopWordsFile='') { /* {{{ */
	} /* }}} */

	/**
	 * Add document to index
	 *
	 * The column 'created' is set to the current time.
	 *
	 * @param object $doc indexed document of class
	 * SeedDMS_SQLiteFTS_IndexedDocument
	 * @return mixed false in case of an error, otherwise the number of
	 *         rows reported by the database
	 */
	function addDocument($doc) { /* {{{ */
		if(!$this->_conn)
			return false;

		$docid = $doc->getFieldValue('document_id');
		if($docid === false)
			return false;

		$sql = "INSERT INTO docs (docid, title, comment, keywords, category, owner, content, created) VALUES (:docid, :title, :comment, :keywords, :category, :owner, :content, :created)";
		$stmt = $this->_conn->prepare($sql);
		if(!$stmt)
			return false;

		$stmt->bindValue(':docid', (int) $docid, PDO::PARAM_INT);
		foreach(array('title', 'comment', 'keywords', 'category', 'owner', 'content') as $name) {
			// a missing field (false) is stored as an empty string
			$stmt->bindValue(':'.$name, (string) $doc->getFieldValue($name), PDO::PARAM_STR);
		}
		$stmt->bindValue(':created', time(), PDO::PARAM_INT);

		if(!$stmt->execute())
			return false;
		return $stmt->rowCount();
	} /* }}} */

	/**
	 * Remove document from index
	 *
	 * @param integer $id id of document
	 * @return mixed false in case of an error, otherwise the number of
	 *         rows reported by the database
	 */
	public function delete($id) { /* {{{ */
		if(!$this->_conn)
			return false;

		$sql = "DELETE FROM docs WHERE docid=".(int) $id;
		$res = $this->_conn->exec($sql);
		return $res;
	} /* }}} */

	/**
	 * Check if document was deleted
	 *
	 * Just for compatibility with lucene.
	 *
	 * @param integer $id id of document
	 * @return boolean always false
	 */
	public function isDeleted($id) { /* {{{ */
		return false;
	} /* }}} */

	/**
	 * Find documents in index
	 *
	 * @param string $query query string passed to the MATCH operator
	 * @return mixed false if there is no connection, otherwise a list of
	 *         SeedDMS_SQLiteFTS_QueryHit (empty if the query fails or does
	 *         not match)
	 */
	public function find($query) { /* {{{ */
		if(!$this->_conn)
			return false;

		$hits = array();
		$stmt = $this->_conn->prepare("SELECT docid FROM docs WHERE docs MATCH :query");
		if($stmt && $stmt->execute(array(':query' => $query))) {
			while($rec = $stmt->fetch(PDO::FETCH_ASSOC)) {
				$hit = new SeedDMS_SQLiteFTS_QueryHit($this);
				$hit->id = $rec['docid'];
				$hits[] = $hit;
			}
		}
		return $hits;
	} /* }}} */

	/**
	 * Find a single document in the index by its id
	 *
	 * @param integer $id id of document
	 * @return mixed false if there is no connection, otherwise a list of
	 *         SeedDMS_SQLiteFTS_QueryHit
	 */
	public function findById($id) { /* {{{ */
		if(!$this->_conn)
			return false;

		$sql = "SELECT docid FROM docs WHERE docid=".(int) $id;
		$res = $this->_conn->query($sql);
		$hits = array();
		if($res) {
			while($rec = $res->fetch(PDO::FETCH_ASSOC)) {
				$hit = new SeedDMS_SQLiteFTS_QueryHit($this);
				$hit->id = $rec['docid'];
				$hits[] = $hit;
			}
		}
		return $hits;
	} /* }}} */

	/**
	 * Get a single document from index
	 *
	 * @param integer $id id of document
	 * @return mixed instance of SeedDMS_SQLiteFTS_Document or false if
	 *         the document is not in the index or an error occurred
	 */
	public function getDocument($id) { /* {{{ */
		if(!$this->_conn)
			return false;

		$sql = "SELECT title, comment, owner, keywords, category, created FROM docs WHERE docid=".(int) $id;
		$res = $this->_conn->query($sql);
		if(!$res)
			return false;

		$rec = $res->fetch(PDO::FETCH_ASSOC);
		// no row: do not hand out a document made of empty fields
		if(!$rec)
			return false;

		$doc = new SeedDMS_SQLiteFTS_Document();
		// sets the property $id of the document as well
		$doc->addField('document_id', (int) $id);
		$doc->addField('title', $rec['title']);
		$doc->addField('comment', $rec['comment']);
		$doc->addField('keywords', $rec['keywords']);
		$doc->addField('category', $rec['category']);
		$doc->addField('owner', $rec['owner']);
		$doc->addField('created', $rec['created']);
		return $doc;
	} /* }}} */

	/**
	 * Return list of terms in index
	 *
	 * The terms are read from the fts4aux table 'docs_terms', the
	 * summary rows (col = '*') are left out.
	 *
	 * @return mixed false if there is no connection, otherwise a list of
	 *         SeedDMS_SQLiteFTS_Term
	 */
	public function terms() { /* {{{ */
		if(!$this->_conn)
			return false;

		$sql = "SELECT term, col, occurrences FROM docs_terms WHERE col!='*' ORDER BY col";
		$res = $this->_conn->query($sql);
		$terms = array();
		if($res) {
			while($rec = $res->fetch(PDO::FETCH_ASSOC)) {
				$term = new SeedDMS_SQLiteFTS_Term($rec['term'], $rec['col'], $rec['occurrences']);
				$terms[] = $term;
			}
		}
		return $terms;
	} /* }}} */

	/**
	 * Return number of documents in index
	 *
	 * @return integer number of documents, 0 in case of an error
	 */
	public function count() { /* {{{ */
		if(!$this->_conn)
			return 0;

		$sql = "SELECT count(*) c FROM docs";
		$res = $this->_conn->query($sql);
		if($res) {
			$rec = $res->fetch(PDO::FETCH_ASSOC);
			if($rec)
				return (int) $rec['c'];
		}
		return 0;
	} /* }}} */

	/**
	 * Commit changes
	 *
	 * This function does nothing!
	 */
	function commit() { /* {{{ */
	} /* }}} */

	/**
	 * Optimize index
	 *
	 * This function does nothing!
	 */
	function optimize() { /* {{{ */
	} /* }}} */
}
?>

View File

@ -0,0 +1,65 @@
<?php
/**
* Implementation of a query hit
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @license GPL 2
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2010, Uwe Steinmann
* @version Release: @package_version@
*/
/**
* Class for managing a query hit.
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2011, Uwe Steinmann
* @version Release: @package_version@
*/
class SeedDMS_SQLiteFTS_QueryHit {

	/**
	 * @var SeedDMS_SQliteFTS_Indexer $_index index this hit was found in
	 * @access protected
	 */
	protected $_index;

	/**
	 * @var SeedDMS_SQliteFTS_Document $_document document of this hit,
	 *      fetched from the index on first access
	 * @access protected
	 */
	protected $_document;

	/**
	 * @var integer $id id of document
	 * @access public
	 */
	public $id;

	/**
	 * Create a hit belonging to the passed index
	 *
	 * @param SeedDMS_SQLiteFTS_Indexer $index index used for fetching the
	 *        document later on
	 */
	public function __construct(SeedDMS_SQLiteFTS_Indexer $index) { /* {{{ */
		$this->_index = $index;
	} /* }}} */

	/**
	 * Return the document associated with this hit
	 *
	 * The index is queried as long as no document object has been
	 * retrieved; once there is one, it is reused.
	 *
	 * @return SeedDMS_SQLiteFTS_Document
	 */
	public function getDocument() { /* {{{ */
		if ($this->_document instanceof SeedDMS_SQLiteFTS_Document) {
			return $this->_document;
		}

		$this->_document = $this->_index->getDocument($this->id);
		return $this->_document;
	} /* }}} */
}
?>

View File

@ -0,0 +1,94 @@
<?php
/**
* Implementation of search in SQlite FTS index
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @license GPL 2
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2010, Uwe Steinmann
* @version Release: @package_version@
*/
/**
* Class for searching in a SQlite FTS index.
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2011, Uwe Steinmann
* @version Release: @package_version@
*/
class SeedDMS_SQliteFTS_Search {

	/**
	 * @var object $index SQlite FTS index
	 * @access protected
	 */
	protected $index;

	/**
	 * @var string $version version of this package, '3.0.0' as long as
	 *      the placeholder was not replaced by the installer
	 * @access public
	 */
	public $version;

	/**
	 * Create a new instance of the search
	 *
	 * @param object $index SQlite FTS index
	 * @return object instance of SeedDMS_SQliteFTS_Search
	 */
	function __construct($index) { /* {{{ */
		$this->index = $index;
		$this->version = '@package_version@';
		if($this->version[0] == '@')
			$this->version = '3.0.0';
	} /* }}} */

	/**
	 * Get hit from index
	 *
	 * @param integer $id id of document
	 * @return mixed first hit returned by the index for this id or false
	 *         if there is none
	 */
	function getDocument($id) { /* {{{ */
		$hits = $this->index->findById((int) $id);
		return $hits ? $hits[0] : false;
	} /* }}} */

	/**
	 * Search in index
	 *
	 * The parts of the query (term, owner, categories) are combined with
	 * AND, the categories among each other with OR.
	 *
	 * @param string $term search term, only used if $fields is empty
	 * @param string $owner login of owner, empty for any owner
	 * @param string $status not used
	 * @param array $categories list of category names
	 * @param array $fields list of fields
	 * @return mixed list of records, each an array with the keys 'id' and
	 *         'document_id', or false if the index throws an exception
	 */
	function search($term, $owner, $status='', $categories=array(), $fields=array()) { /* {{{ */
		$parts = array();

		// NOTE(review): if $fields is set, the term is not added to the
		// query at all. This is kept as it was -- confirm whether a
		// field restricted search was intended here.
		if(!$fields && $term)
			$parts[] = trim($term);

		if($owner)
			$parts[] = 'owner:'.$owner;

		if($categories) {
			$catquery = 'category:'.implode(' OR category:', $categories);
			// AND binds tighter than OR in the query syntax of FTS, so
			// several alternatives must be grouped, otherwise the last
			// categories would match regardless of term and owner.
			if(count($categories) > 1 && $parts)
				$catquery = '('.$catquery.')';
			$parts[] = $catquery;
		}

		$querystr = implode(' AND ', $parts);

		try {
			$hits = $this->index->find($querystr);
			$recs = array();
			foreach($hits as $hit) {
				$recs[] = array('id'=>$hit->id, 'document_id'=>$hit->id);
			}
			return $recs;
		} catch (Exception $e) {
			return false;
		}
	} /* }}} */
}
?>

View File

@ -0,0 +1,64 @@
<?php
/**
* Implementation of a term
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @license GPL 2
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2010, Uwe Steinmann
* @version Release: @package_version@
*/
/**
* Class for managing a term.
*
* @category DMS
* @package SeedDMS_SQLiteFTS
* @version @version@
* @author Uwe Steinmann <uwe@steinmann.cx>
* @copyright Copyright (C) 2011, Uwe Steinmann
* @version Release: @package_version@
*/
class SeedDMS_SQLiteFTS_Term {

	/**
	 * @var string $text the term itself
	 * @access public
	 */
	public $text;

	/**
	 * @var string $field name of the field the term occurs in
	 * @access public
	 */
	public $field;

	/**
	 * @var integer $_occurrence number of occurrences as passed to the constructor
	 * @access public
	 */
	public $_occurrence;

	/**
	 * Create a term
	 *
	 * @param string $term text of term
	 * @param integer $col number of column, translated into the name of
	 *        the field (0 = title ... 6 = created)
	 * @param integer $occurrence number of occurrences
	 */
	public function __construct($term, $col, $occurrence) { /* {{{ */
		// position in this list equals the column number
		$columnNames = array(
			'title',
			'comment',
			'keywords',
			'category',
			'owner',
			'content',
			'created'
		);

		$this->text = $term;
		$this->_occurrence = $occurrence;
		$this->field = $columnNames[$col];
	} /* }}} */
}
?>

View File

@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8"?>
<package packagerversion="1.8.1" version="2.0" xmlns="http://pear.php.net/dtd/package-2.0" xmlns:tasks="http://pear.php.net/dtd/tasks-1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://pear.php.net/dtd/tasks-1.0 http://pear.php.net/dtd/tasks-1.0.xsd http://pear.php.net/dtd/package-2.0 http://pear.php.net/dtd/package-2.0.xsd">
<name>SeedDMS_SQLiteFTS</name>
<channel>pear.php.net</channel>
<summary>Fulltext search based on sqlite for SeedDMS</summary>
<description>SeedDMS is a web based document management system (DMS). This is
the fulltext search engine for it, based on SQLite FTS.</description>
<lead>
<name>Uwe Steinmann</name>
<user>steinm</user>
<email>uwe@steinmann.cx</email>
<active>yes</active>
</lead>
<date>2015-08-10</date>
<time>21:13:13</time>
<version>
<release>1.0.0</release>
<api>1.0.0</api>
</version>
<stability>
<release>stable</release>
<api>stable</api>
</stability>
<license uri="http://opensource.org/licenses/gpl-license">GPL License</license>
<notes>
initial release
</notes>
<contents>
<dir baseinstalldir="SeedDMS" name="/">
<dir name="SQLiteFTS">
<file name="Indexer.php" role="php">
<tasks:replace from="@package_version@" to="version" type="package-info" />
</file>
<file name="IndexedDocument.php" role="php">
<tasks:replace from="@package_version@" to="version" type="package-info" />
</file>
<file name="Document.php" role="php">
<tasks:replace from="@package_version@" to="version" type="package-info" />
</file>
<file name="QueryHit.php" role="php">
<tasks:replace from="@package_version@" to="version" type="package-info" />
</file>
<file name="Search.php" role="php">
<tasks:replace from="@package_version@" to="version" type="package-info" />
</file>
<file name="Term.php" role="php">
<tasks:replace from="@package_version@" to="version" type="package-info" />
</file>
</dir> <!-- /SQLiteFTS -->
<dir name="tests">
</dir> <!-- /tests -->
</dir> <!-- / -->
</contents>
<dependencies>
<required>
<php>
<min>4.3.0</min>
</php>
<pearinstaller>
<min>1.5.4</min>
</pearinstaller>
</required>
</dependencies>
<phprelease />
<changelog>
</changelog>
</package>

View File