mirror of
				https://git.code.sf.net/p/seeddms/code
				synced 2025-10-25 18:21:19 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			464 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			464 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | |
| /**
 | |
|  * Implementation of SQLiteFTS index
 | |
|  *
 | |
|  * @category   DMS
 | |
|  * @package    SeedDMS_Lucene
 | |
|  * @license    GPL 2
 | |
|  * @version    @version@
 | |
|  * @author     Uwe Steinmann <uwe@steinmann.cx>
 | |
|  * @copyright  Copyright (C) 2010, Uwe Steinmann
 | |
|  * @version    Release: @package_version@
 | |
|  */
 | |
| 
 | |
| 
 | |
| /**
 | |
|  * Class for managing a SQLiteFTS index.
 | |
|  *
 | |
|  * @category   DMS
 | |
|  * @package    SeedDMS_Lucene
 | |
|  * @version    @version@
 | |
|  * @author     Uwe Steinmann <uwe@steinmann.cx>
 | |
|  * @copyright  Copyright (C) 2011, Uwe Steinmann
 | |
|  * @version    Release: @package_version@
 | |
|  */
 | |
| class SeedDMS_SQLiteFTS_Indexer {
 | |
| 
 | |
| 	/**
 | |
| 	 * @var string $_ftstype
 | |
| 	 * @access protected
 | |
| 	 */
 | |
| 	protected $_ftstype;
 | |
| 
 | |
| 	/**
 | |
| 	 * @var object $_conn sqlite index
 | |
| 	 * @access protected
 | |
| 	 */
 | |
| 	protected $_conn;
 | |
| 
 | |
| 	/**
 | |
| 	 * @var array $_stop_words array of stop words
 | |
| 	 * @access protected
 | |
| 	 */
 | |
| 	protected $_stop_words;
 | |
| 
 | |
| 	const ftstype = 'fts5';
 | |
| 
 | |
| 	/**
 | |
| 	 * Remove stopwords from string
 | |
| 	 */
 | |
|   protected function strip_stopwords($str = "") { /* {{{ */
 | |
|     // 1.) break string into words
 | |
|     // [^-\w\'] matches characters, that are not [0-9a-zA-Z_-']
 | |
|     // if input is unicode/utf-8, the u flag is needed: /pattern/u
 | |
|     $words = preg_split('/[^-\w\']+/u', $str, -1, PREG_SPLIT_NO_EMPTY);
 | |
| 
 | |
|     // 2.) if we have at least 2 words, remove stopwords
 | |
|     if(!empty($words)) {
 | |
|       $stopwords = $this->_stop_words;
 | |
|       $words = array_filter($words, function ($w) use (&$stopwords) {
 | |
|         return ((mb_strlen($w, 'utf-8') > 2) && !isset($stopwords[mb_strtolower($w, "utf-8")]));
 | |
|       });
 | |
|     }
 | |
| 
 | |
|     // check if not too much was removed such as "the the" would return empty
 | |
|     if(!empty($words))
 | |
|       return implode(" ", $words);
 | |
|     return $str;
 | |
|   } /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Constructor
 | |
| 	 *
 | |
| 	 */
 | |
| 	function __construct($indexerDir) { /* {{{ */
 | |
| 		$this->_conn = new PDO('sqlite:'.$indexerDir.'/index.db');
 | |
| 		$this->_ftstype = self::ftstype;
 | |
| 		if($this->_ftstype == 'fts5')
 | |
| 			$this->_rawid = 'rowid';
 | |
| 		else
 | |
| 			$this->_rawid = 'docid';
 | |
| 		$this->_stop_words = [];
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Open an existing index
 | |
| 	 *
 | |
| 	 * @param string $indexerDir directory on disk containing the index
 | |
| 	 */
 | |
| 	static function open($conf) { /* {{{ */
 | |
| 		if(file_exists($conf['indexdir'].'/index.db')) {
 | |
| 			return new SeedDMS_SQLiteFTS_Indexer($conf['indexdir']);
 | |
| 		} else
 | |
| 			return static::create($conf);
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Create a new index
 | |
| 	 *
 | |
| 	 * @param array $conf $conf['indexdir'] is the directory on disk containing the index
 | |
| 	 */
 | |
| 	static function create($conf) { /* {{{ */
 | |
| 		if(file_exists($conf['indexdir'].'/index.db'))
 | |
| 			unlink($conf['indexdir'].'/index.db');
 | |
| 		$index =  new SeedDMS_SQLiteFTS_Indexer($conf['indexdir']);
 | |
| 		/* Make sure the sequence of fields is identical to the field list
 | |
| 		 * in SeedDMS_SQLiteFTS_Term
 | |
| 		 */
 | |
| 		$version = SQLite3::version();
 | |
| 		if(self::ftstype == 'fts4') {
 | |
| 			if($version['versionNumber'] >= 3008000)
 | |
| 				$sql = 'CREATE VIRTUAL TABLE docs USING fts4(documentid, record_type, title, comment, keywords, category, mimetype, origfilename, owner, content, created, indexed, users, status, path, notindexed=created, notindexed=indexed, matchinfo=fts3)';
 | |
| 			else
 | |
| 				$sql = 'CREATE VIRTUAL TABLE docs USING fts4(documentid, record_type, title, comment, keywords, category, mimetype, origfilename, owner, content, created, indexed, users, status, path, matchinfo=fts3)';
 | |
| 			$res = $index->_conn->exec($sql);
 | |
| 			if($res === false) {
 | |
| 				return null;
 | |
| 			}
 | |
| 			$sql = 'CREATE VIRTUAL TABLE docs_terms USING fts4aux(docs);';
 | |
| 			$res = $index->_conn->exec($sql);
 | |
| 			if($res === false) {
 | |
| 				return null;
 | |
| 			}
 | |
| 		} elseif(self::ftstype == 'fts5') {
 | |
| 			$sql = 'CREATE VIRTUAL TABLE docs USING fts5(documentid, record_type, title, comment, keywords, category, mimetype, origfilename, owner, content, created unindexed, indexed unindexed, users, status, path)';
 | |
| 			$res = $index->_conn->exec($sql);
 | |
| 			if($res === false) {
 | |
| 				return null;
 | |
| 			}
 | |
| 			$sql = 'CREATE VIRTUAL TABLE docs_terms USING fts5vocab(docs, \'col\');';
 | |
| 			$res = $index->_conn->exec($sql);
 | |
| 			if($res === false) {
 | |
| 				return null;
 | |
| 			}
 | |
| 		} else
 | |
| 			return null;
 | |
| 		return($index);
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Do some initialization
 | |
| 	 *
 | |
| 	 */
 | |
| 	public function init($stopWordsFile='') { /* {{{ */
 | |
| 		if($stopWordsFile)
 | |
| 			$this->_stop_words = array_flip(preg_split("/[\s,]+/", file_get_contents($stopWordsFile)));
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Add document to index
 | |
| 	 *
 | |
| 	 * @param object $doc indexed document of class 
 | |
| 	 * SeedDMS_SQLiteFTS_IndexedDocument
 | |
| 	 * @return boolean false in case of an error, otherwise true
 | |
| 	 */
 | |
| 	function addDocument($doc) { /* {{{ */
 | |
| 		if(!$this->_conn)
 | |
| 			return false;
 | |
| 
 | |
| 		foreach(array('comment', 'keywords', 'category', 'content', 'mimetype', 'origfilename', 'status', 'created', 'indexed') as $kk) {
 | |
| 			try {
 | |
| 				${$kk} = $doc->getFieldValue($kk);
 | |
| 			} catch (Exception $e) {
 | |
| 				${$kk} = '';
 | |
| 			}
 | |
| 		}
 | |
| 		$sql = "DELETE FROM docs WHERE documentid=".$this->_conn->quote($doc->getFieldValue('document_id'));
 | |
| 		$res = $this->_conn->exec($sql);
 | |
| 		if($res === false) {
 | |
| 			return false;
 | |
| 		}
 | |
| 		if($this->_stop_words)
 | |
| 			$content = $this->strip_stopwords($content);
 | |
| 
 | |
| 		$sql = "INSERT INTO docs (documentid, record_type, title, comment, keywords, category, owner, content, mimetype, origfilename, created, indexed, users, status, path) VALUES (".$this->_conn->quote($doc->getFieldValue('document_id')).", ".$this->_conn->quote($doc->getFieldValue('record_type')).", ".$this->_conn->quote($doc->getFieldValue('title')).", ".$this->_conn->quote($comment).", ".$this->_conn->quote($keywords).", ".$this->_conn->quote($category).", ".$this->_conn->quote($doc->getFieldValue('owner')).", ".$this->_conn->quote($content).", ".$this->_conn->quote($mimetype).", ".$this->_conn->quote($origfilename).", ".(int)$created.", ".(int)$indexed.", ".$this->_conn->quote($doc->getFieldValue('users')).", ".$this->_conn->quote($status).", ".$this->_conn->quote($doc->getFieldValue('path'))/*time()*/.")";
 | |
| 		$res = $this->_conn->exec($sql);
 | |
| 		if($res === false) {
 | |
| 			return false;
 | |
| 			var_dump($this->_conn->errorInfo());
 | |
| 		}
 | |
| 		return $res;
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Remove document from index
 | |
| 	 *
 | |
| 	 * @param object $id internal id of document
 | |
| 	 * @return boolean false in case of an error, otherwise true
 | |
| 	 */
 | |
| 	public function delete($id) { /* {{{ */
 | |
| 		if(!$this->_conn)
 | |
| 			return false;
 | |
| 
 | |
| 		$sql = "DELETE FROM docs WHERE ".$this->_rawid."=".(int) $id;
 | |
| 		$res = $this->_conn->exec($sql);
 | |
| 		return $res;
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Check if document was deleted
 | |
| 	 *
 | |
| 	 * Just for compatibility with lucene.
 | |
| 	 *
 | |
| 	 * @return boolean always false
 | |
| 	 */
 | |
| 	public function isDeleted($id) { /* {{{ */
 | |
| 		return false;
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Find documents in index
 | |
| 	 *
 | |
| 	 * @param string $query 
 | |
| 	 * @param array $limit array with elements 'limit' and 'offset'
 | |
| 	 * @return boolean false in case of an error, otherwise array with elements
 | |
| 	 * 'count', 'hits', 'facets'. 'hits' is an array of SeedDMS_SQLiteFTS_QueryHit
 | |
| 	 */
 | |
| 	public function find($query, $filter='', $limit=array(), $order=array()) { /* {{{ */
 | |
| 		if(!$this->_conn)
 | |
| 			return false;
 | |
| 
 | |
| 		/* First count some records for facets */
 | |
| 		foreach(array('owner', 'mimetype', 'category', 'status') as $facetname) {
 | |
| 			$sql = "SELECT `".$facetname."`, count(*) AS `c` FROM `docs`";
 | |
| 			if($query) {
 | |
| 				$sql .= " WHERE docs MATCH ".$this->_conn->quote($query);
 | |
| 			}
 | |
| 			if($filter) {
 | |
| 				if($query)
 | |
| 					$sql .= " AND ".$filter;
 | |
| 				else
 | |
| 					$sql .= " WHERE ".$filter;
 | |
| 			}
 | |
| 			$res = $this->_conn->query($sql." GROUP BY `".$facetname."`");
 | |
| 			if(!$res)
 | |
| 				throw new SeedDMS_SQLiteFTS_Exception("Counting records in facet \"$facetname\" failed.");
 | |
| //				return false;
 | |
| 			$facets[$facetname] = array();
 | |
| 			foreach($res as $row) {
 | |
| 				if($row[$facetname] && $row['c']) {
 | |
| 					if($facetname == 'category') {
 | |
| 						$tmp = explode('#', $row[$facetname]);
 | |
| 						if(count($tmp) > 1) {
 | |
| 							foreach($tmp as $t) {
 | |
| 								if(!isset($facets[$facetname][$t]))
 | |
| 									$facets[$facetname][$t] = $row['c'];
 | |
| 								else
 | |
| 									$facets[$facetname][$t] += $row['c'];
 | |
| 							}
 | |
| 						} else {
 | |
| 							if(!isset($facets[$facetname][$row[$facetname]]))
 | |
| 								$facets[$facetname][$row[$facetname]] = $row['c'];
 | |
| 							else
 | |
| 								$facets[$facetname][$row[$facetname]] += $row['c'];
 | |
| 						}
 | |
| 					} elseif($facetname == 'status') {
 | |
| 						$facets[$facetname][($row[$facetname]-10).''] = $row['c'];
 | |
| 					} else
 | |
| 						$facets[$facetname][$row[$facetname]] = $row['c'];
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		$sql = "SELECT `record_type`, count(*) AS `c` FROM `docs`";
 | |
| 		if($query)
 | |
| 			$sql .= " WHERE docs MATCH ".$this->_conn->quote($query);
 | |
| 		if($filter) {
 | |
| 			if($query)
 | |
| 				$sql .= " AND ".$filter;
 | |
| 			else
 | |
| 				$sql .= " WHERE ".$filter;
 | |
| 		}
 | |
| 		$res = $this->_conn->query($sql." GROUP BY `record_type`");
 | |
| 		if(!$res)
 | |
| 			throw new SeedDMS_SQLiteFTS_Exception("Counting records in facet \"record_type\" failed.");
 | |
| //			return false;
 | |
| 		$facets['record_type'] = array('document'=>0, 'folder'=>0);
 | |
| 		foreach($res as $row) {
 | |
| 			$facets['record_type'][$row['record_type']] = $row['c'];
 | |
| 		}
 | |
| 		$total = $facets['record_type']['document'] + $facets['record_type']['folder'];
 | |
| 
 | |
| 		$sql = "SELECT ".$this->_rawid.", documentid FROM docs";
 | |
| 		if($query)
 | |
| 			$sql .= " WHERE docs MATCH ".$this->_conn->quote($query);
 | |
| 		if($filter) {
 | |
| 			if($query)
 | |
| 				$sql .= " AND ".$filter;
 | |
| 			else
 | |
| 				$sql .= " WHERE ".$filter;
 | |
| 		}
 | |
| 		if($this->_ftstype == 'fts5') {
 | |
| 			//$sql .= " ORDER BY rank";
 | |
| 			// boost documentid, record_type, title, comment, keywords, category, mimetype, origfilename, owner, content, created unindexed, users, status, path
 | |
| 			if(!empty($order['by'])) {
 | |
| 				switch($order['by']) {
 | |
| 				case "title":
 | |
| 					$sql .= " ORDER BY title";
 | |
| 					break;
 | |
| 				case "created":
 | |
| 					$sql .= " ORDER BY created";
 | |
| 					break;
 | |
| 				default:
 | |
| 					$sql .= " ORDER BY bm25(docs, 10.0, 0.0, 10.0, 5.0, 5.0, 10.0)";
 | |
| 				}
 | |
| 				if(!empty($order['dir'])) {
 | |
| 					if($order['dir'] == 'desc')
 | |
| 						$sql .= " DESC";
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 		if(!empty($limit['limit']))
 | |
| 			$sql .= " LIMIT ".(int) $limit['limit'];
 | |
| 		if(!empty($limit['offset']))
 | |
| 			$sql .= " OFFSET ".(int) $limit['offset'];
 | |
| 		$res = $this->_conn->query($sql);
 | |
| 		if(!$res)
 | |
| 			throw new SeedDMS_SQLiteFTS_Exception("Searching for documents failed.");
 | |
| 		$hits = array();
 | |
| 		if($res) {
 | |
| 			foreach($res as $rec) {
 | |
| 				$hit = new SeedDMS_SQLiteFTS_QueryHit($this);
 | |
| 				$hit->id = $rec[$this->_rawid];
 | |
| 				$hit->documentid = $rec['documentid'];
 | |
| 				$hits[] = $hit;
 | |
| 			}
 | |
| 		}
 | |
| 		return array('count'=>$total, 'hits'=>$hits, 'facets'=>$facets);
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Get a single document from index
 | |
| 	 *
 | |
| 	 * @param string $id id of document
 | |
| 	 * @return boolean false in case of an error, otherwise true
 | |
| 	 */
 | |
| 	public function findById($id) { /* {{{ */
 | |
| 		if(!$this->_conn)
 | |
| 			return false;
 | |
| 
 | |
| 		$sql = "SELECT ".$this->_rawid.", documentid FROM docs WHERE documentid=".$this->_conn->quote($id);
 | |
| 		$res = $this->_conn->query($sql);
 | |
| 		$hits = array();
 | |
| 		if($res) {
 | |
| 			while($rec = $res->fetch(PDO::FETCH_ASSOC)) {
 | |
| 				$hit = new SeedDMS_SQLiteFTS_QueryHit($this);
 | |
| 				$hit->id = $rec[$this->_rawid];
 | |
| 				$hit->documentid = $rec['documentid'];
 | |
| 				$hits[] = $hit;
 | |
| 			}
 | |
| 		}
 | |
| 		return $hits;
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Get a single document from index
 | |
| 	 *
 | |
| 	 * @param integer $id id of index record
 | |
| 	 * @return boolean false in case of an error, otherwise true
 | |
| 	 */
 | |
| 	public function getDocument($id, $content=true) { /* {{{ */
 | |
| 		if(!$this->_conn)
 | |
| 			return false;
 | |
| 
 | |
| 		$sql = "SELECT ".$this->_rawid.", documentid, title, comment, owner, keywords, category, mimetype, origfilename, created, indexed, users, status, path".($content ? ", content" : "")." FROM docs WHERE ".$this->_rawid."='".$id."'";
 | |
| 		$res = $this->_conn->query($sql);
 | |
| 		$doc = false;
 | |
| 		if($res) {
 | |
| 			if(!($rec = $res->fetch(PDO::FETCH_ASSOC)))
 | |
| 				return false;
 | |
| 			$doc = new SeedDMS_SQLiteFTS_Document();
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Keyword('docid', $rec[$this->_rawid]));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Keyword('document_id', $rec['documentid']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Text('title', $rec['title']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Text('comment', $rec['comment']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Text('keywords', $rec['keywords']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Text('category', $rec['category']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Keyword('mimetype', $rec['mimetype']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Keyword('origfilename', $rec['origfilename']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Text('owner', $rec['owner']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Keyword('created', $rec['created']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Keyword('indexed', $rec['indexed']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Text('users', $rec['users']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Keyword('status', $rec['status']));
 | |
| 			$doc->addField(SeedDMS_SQLiteFTS_Field::Keyword('path', explode('x', substr($rec['path'], 1, -1))));
 | |
| 			if($content)
 | |
| 				$doc->addField(SeedDMS_SQLiteFTS_Field::UnStored('content', $rec['content']));
 | |
| 		}
 | |
| 		return $doc;
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Return list of terms in index
 | |
| 	 *
 | |
| 	 * @return array list of SeedDMS_SQLiteFTS_Term
 | |
| 	 */
 | |
| 	public function terms($prefix='', $col='') { /* {{{ */
 | |
| 		if(!$this->_conn)
 | |
| 			return false;
 | |
| 
 | |
| 		if($this->_ftstype == 'fts5') {
 | |
| 			$sql = "SELECT term, col, doc as occurrences FROM docs_terms";
 | |
| 			if($prefix || $col) {
 | |
| 				$sql .= " WHERE";
 | |
| 				if($prefix) {
 | |
| 					$sql .= " term like '".$prefix."%'";
 | |
| 					if($col)
 | |
| 						$sql .= " AND";
 | |
| 				}
 | |
| 				if($col)
 | |
| 					$sql .= " col = '".$col."'";
 | |
| 			}
 | |
| 			$sql .= " ORDER BY col, occurrences desc";
 | |
| 		} else {
 | |
| 			$sql = "SELECT term, col, occurrences FROM docs_terms WHERE col!='*'";
 | |
| 			if($prefix)
 | |
| 				$sql .= " AND term like '".$prefix."%'";
 | |
| 			if($col)
 | |
| 				$sql .= " AND col = '".$col."'";
 | |
| 			$sql .=	" ORDER BY col, occurrences desc";
 | |
| 		}
 | |
| 		$res = $this->_conn->query($sql);
 | |
| 		$terms = array();
 | |
| 		if($res) {
 | |
| 			while($rec = $res->fetch(PDO::FETCH_ASSOC)) {
 | |
| 				$term = new SeedDMS_SQLiteFTS_Term($rec['term'], $rec['col'], $rec['occurrences']);
 | |
| 				$terms[] = $term;
 | |
| 			}
 | |
| 		}
 | |
| 		return $terms;
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Return number of documents in index
 | |
| 	 *
 | |
| 	 * @return interger number of documents
 | |
| 	 */
 | |
| 	public function count() { /* {{{ */
 | |
| 		$sql = "SELECT count(*) c FROM docs";
 | |
| 		$res = $this->_conn->query($sql);
 | |
| 		if($res) {
 | |
| 			$rec = $res->fetch(PDO::FETCH_ASSOC);
 | |
| 			return $rec['c'];
 | |
| 		}
 | |
| 		return 0;
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Commit changes
 | |
| 	 *
 | |
| 	 * This function does nothing!
 | |
| 	 */
 | |
| 	function commit() { /* {{{ */
 | |
| 	} /* }}} */
 | |
| 
 | |
| 	/**
 | |
| 	 * Optimize index
 | |
| 	 *
 | |
| 	 * This function does nothing!
 | |
| 	 */
 | |
| 	function optimize() { /* {{{ */
 | |
| 	} /* }}} */
 | |
| }
 | |
| ?>
 | 
