diff --git a/SeedDMS_SQLiteFTS/SQLiteFTS/Indexer.php b/SeedDMS_SQLiteFTS/SQLiteFTS/Indexer.php
index bb9585c73..697843923 100644
--- a/SeedDMS_SQLiteFTS/SQLiteFTS/Indexer.php
+++ b/SeedDMS_SQLiteFTS/SQLiteFTS/Indexer.php
@@ -25,18 +25,61 @@
 class SeedDMS_SQLiteFTS_Indexer {
 
 	/**
-	 * @var string $ftstype
+	 * @var string $_ftstype
 	 * @access protected
 	 */
 	protected $_ftstype;
 
 	/**
-	 * @var object $index sqlite index
+	 * @var object $_conn sqlite index
 	 * @access protected
 	 */
 	protected $_conn;
 
+	/**
+	 * @var array $_stop_words array of stop words
+	 * @access protected
+	 */
+	protected $_stop_words;
+
 	const ftstype = 'fts5';
+
+	/**
+	 * Remove stopwords from string
+	 *
+	 * If the string holds more than one word, all words shorter than
+	 * three characters and all words found as a key in $_stop_words
+	 * are dropped.
+	 *
+	 * @param string $str text to be filtered
+	 * @return string remaining words joined by a blank, or $str itself
+	 *         if it cannot be split or no word is left
+	 */
+	protected function strip_stopwords($str = "") { /* {{{ */
+		// 1.) break string into words
+		// [^-\w\'] matches characters, that are not [0-9a-zA-Z_-']
+		// if input is unicode/utf-8, the u flag is needed: /pattern/u
+		$words = preg_split('/[^-\w\']+/u', $str, -1, PREG_SPLIT_NO_EMPTY);
+
+		// preg_split() returns false on failure, e.g. if $str is not valid utf-8
+		if($words === false)
+			return $str;
+
+		// 2.) if we have at least 2 words, remove stopwords
+		if(count($words) > 1) {
+			$stopwords = $this->_stop_words;
+			$words = array_filter($words, function ($w) use (&$stopwords) {
+				// the encoding must be a name known to mbstring ('utf-8')
+				return ((mb_strlen($w, 'utf-8') > 2) && !isset($stopwords[mb_strtolower($w, 'utf-8')]));
+			});
+		}
+
+		// check if not too much was removed such as "the the" would return empty
+		if(!empty($words))
+			return implode(" ", $words);
+		return $str;
+	} /* }}} */
+
 	/**
 	 * Constructor
 	 *
@@ -48,6 +91,7 @@ class SeedDMS_SQLiteFTS_Indexer {
 			$this->_rawid = 'rowid';
 		else
 			$this->_rawid = 'docid';
+		$this->_stop_words = [];
 	} /* }}} */
 
 	/**
@@ -109,7 +153,9 @@ class SeedDMS_SQLiteFTS_Indexer {
 	 * Do some initialization
 	 *
 	 */
-	static function init($stopWordsFile='') { /* {{{ */
+	public function init($stopWordsFile='') { /* {{{ */
+		if($stopWordsFile)
+			$this->_stop_words = array_flip(preg_split("/[\s,]+/", file_get_contents($stopWordsFile)));
 	} /* }}} */
 
 	/**
@@ -135,6 +181,9 @@ class SeedDMS_SQLiteFTS_Indexer {
 		if($res === false) {
 			return false;
 		}
+		if($this->_stop_words)
+			$content = $this->strip_stopwords($content);
+
 		$sql = "INSERT INTO docs (documentid, record_type, title, comment, keywords, category, owner, content, mimetype, origfilename, created, indexed, users, status, path) VALUES (".$this->_conn->quote($doc->getFieldValue('document_id')).", ".$this->_conn->quote($doc->getFieldValue('record_type')).", ".$this->_conn->quote($doc->getFieldValue('title')).", ".$this->_conn->quote($comment).", ".$this->_conn->quote($keywords).", ".$this->_conn->quote($category).", ".$this->_conn->quote($doc->getFieldValue('owner')).", ".$this->_conn->quote($content).", ".$this->_conn->quote($mimetype).", ".$this->_conn->quote($origfilename).", ".(int)$created.", ".(int)$indexed.", ".$this->_conn->quote($doc->getFieldValue('users')).", ".$this->_conn->quote($status).", ".$this->_conn->quote($doc->getFieldValue('path'))/*time()*/.")";
 		$res = $this->_conn->exec($sql);
 		if($res === false) {