use stop words

This commit is contained in:
Uwe Steinmann 2023-01-05 08:02:14 +01:00
parent e28911711b
commit b9ac1860cf

View File

@ -25,18 +25,48 @@
class SeedDMS_SQLiteFTS_Indexer {
/**
* @var string $ftstype
* @var string $_ftstype
* @access protected
*/
protected $_ftstype;
/**
* @var object $index sqlite index
* @var object $_conn sqlite index
* @access protected
*/
protected $_conn;
/**
* @var array $_stop_words array of stop words
* @access protected
*/
protected $_stop_words;
const ftstype = 'fts5';
/**
* Remove stopwords from string
*/
protected function strip_stopwords($str = "") { /* {{{ */
// 1.) break string into words
// [^-\w\'] matches characters, that are not [0-9a-zA-Z_-']
// if input is unicode/utf-8, the u flag is needed: /pattern/u
$words = preg_split('/[^-\w\']+/u', $str, -1, PREG_SPLIT_NO_EMPTY);
// 2.) if we have at least 2 words, remove stopwords
if(count($words) > 1) {
$stopwords = $this->_stop_words;
$words = array_filter($words, function ($w) use (&$stopwords) {
return ((mb_strlen($w, 'utf-8') > 2) && !isset($stopwords[mb_strtolower($w, "utf- 8")]));
});
}
// check if not too much was removed such as "the the" would return empty
if(!empty($words))
return implode(" ", $words);
return $str;
} /* }}} */
/**
* Constructor
*
@ -48,6 +78,7 @@ class SeedDMS_SQLiteFTS_Indexer {
$this->_rawid = 'rowid';
else
$this->_rawid = 'docid';
$this->_stop_words = [];
} /* }}} */
/**
@ -109,7 +140,9 @@ class SeedDMS_SQLiteFTS_Indexer {
* Do some initialization
*
*/
static function init($stopWordsFile='') { /* {{{ */
public function init($stopWordsFile='') { /* {{{ */
if($stopWordsFile)
$this->_stop_words = array_flip(preg_split("/[\s,]+/", file_get_contents($stopWordsFile)));
} /* }}} */
/**
@ -135,6 +168,9 @@ class SeedDMS_SQLiteFTS_Indexer {
if($res === false) {
return false;
}
if($this->_stop_words)
$content = $this->strip_stopwords($content);
$sql = "INSERT INTO docs (documentid, record_type, title, comment, keywords, category, owner, content, mimetype, origfilename, created, indexed, users, status, path) VALUES (".$this->_conn->quote($doc->getFieldValue('document_id')).", ".$this->_conn->quote($doc->getFieldValue('record_type')).", ".$this->_conn->quote($doc->getFieldValue('title')).", ".$this->_conn->quote($comment).", ".$this->_conn->quote($keywords).", ".$this->_conn->quote($category).", ".$this->_conn->quote($doc->getFieldValue('owner')).", ".$this->_conn->quote($content).", ".$this->_conn->quote($mimetype).", ".$this->_conn->quote($origfilename).", ".(int)$created.", ".(int)$indexed.", ".$this->_conn->quote($doc->getFieldValue('users')).", ".$this->_conn->quote($status).", ".$this->_conn->quote($doc->getFieldValue('path'))/*time()*/.")";
$res = $this->_conn->exec($sql);
if($res === false) {