mirror of
https://git.code.sf.net/p/seeddms/code
synced 2025-05-09 13:06:14 +00:00
use stop words
This commit is contained in:
parent
e28911711b
commit
b9ac1860cf
|
@ -25,18 +25,48 @@
|
||||||
class SeedDMS_SQLiteFTS_Indexer {
|
class SeedDMS_SQLiteFTS_Indexer {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @var string $ftstype
|
* @var string $_ftstype
|
||||||
* @access protected
|
* @access protected
|
||||||
*/
|
*/
|
||||||
protected $_ftstype;
|
protected $_ftstype;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @var object $index sqlite index
|
* @var object $_conn sqlite index
|
||||||
* @access protected
|
* @access protected
|
||||||
*/
|
*/
|
||||||
protected $_conn;
|
protected $_conn;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @var array $_stop_words array of stop words
|
||||||
|
* @access protected
|
||||||
|
*/
|
||||||
|
protected $_stop_words;
|
||||||
|
|
||||||
const ftstype = 'fts5';
|
const ftstype = 'fts5';
|
||||||
|
|
||||||
|
/**
 * Remove stop words from a string
 *
 * The string is broken into words. If at least two words are found,
 * every word with fewer than three characters and every word that is
 * a key of $this->_stop_words (compared in lower case) is dropped.
 *
 * @param string $str text to be filtered
 * @return string the remaining words joined by a single space, or the
 *         unmodified input if no word is left or the text could not be
 *         split into words
 */
protected function strip_stopwords($str = "") { /* {{{ */
    // 1.) break string into words
    // [^-\w\']+ matches runs of characters which are neither word
    // characters (\w) nor '-' nor "'". The u flag makes the pattern and
    // the subject being treated as utf-8.
    $words = preg_split('/[^-\w\']+/u', $str, -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on failure, e.g. if $str is not valid
    // utf-8. In that case the text is left untouched.
    if($words === false)
        return $str;

    // 2.) if we have at least 2 words, remove stopwords
    if(count($words) > 1) {
        $stopwords = $this->_stop_words;
        $words = array_filter($words, function ($w) use ($stopwords) {
            // stop words are looked up by their lower case form
            return ((mb_strlen($w, 'utf-8') > 2) && !isset($stopwords[mb_strtolower($w, 'utf-8')]));
        });
    }

    // check if not too much was removed such as "the the" would return empty
    if(!empty($words))
        return implode(" ", $words);
    return $str;
} /* }}} */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Constructor
|
* Constructor
|
||||||
*
|
*
|
||||||
|
@ -48,6 +78,7 @@ class SeedDMS_SQLiteFTS_Indexer {
|
||||||
$this->_rawid = 'rowid';
|
$this->_rawid = 'rowid';
|
||||||
else
|
else
|
||||||
$this->_rawid = 'docid';
|
$this->_rawid = 'docid';
|
||||||
|
$this->_stop_words = [];
|
||||||
} /* }}} */
|
} /* }}} */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -109,7 +140,9 @@ class SeedDMS_SQLiteFTS_Indexer {
|
||||||
* Do some initialization
|
* Do some initialization
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
public function init($stopWordsFile='') { /* {{{ */
    // without a stop words file the current list is left as it is
    if(!$stopWordsFile)
        return;

    // an unreadable file must not end up as a list containing just an
    // empty word, because a non-empty list turns on stop word removal
    $content = is_readable($stopWordsFile) ? file_get_contents($stopWordsFile) : false;
    if($content === false)
        return;

    // Words are separated by white space or commas. They are stored in
    // lower case, because strip_stopwords() looks them up in lower case.
    // Empty tokens from leading/trailing separators are skipped.
    $words = preg_split("/[\s,]+/", mb_strtolower($content, 'utf-8'), -1, PREG_SPLIT_NO_EMPTY);

    // the words become the keys of the array for a fast isset() lookup
    $this->_stop_words = $words ? array_flip($words) : [];
} /* }}} */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -135,6 +168,9 @@ class SeedDMS_SQLiteFTS_Indexer {
|
||||||
if($res === false) {
|
if($res === false) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if($this->_stop_words)
|
||||||
|
$content = $this->strip_stopwords($content);
|
||||||
|
|
||||||
$sql = "INSERT INTO docs (documentid, record_type, title, comment, keywords, category, owner, content, mimetype, origfilename, created, indexed, users, status, path) VALUES (".$this->_conn->quote($doc->getFieldValue('document_id')).", ".$this->_conn->quote($doc->getFieldValue('record_type')).", ".$this->_conn->quote($doc->getFieldValue('title')).", ".$this->_conn->quote($comment).", ".$this->_conn->quote($keywords).", ".$this->_conn->quote($category).", ".$this->_conn->quote($doc->getFieldValue('owner')).", ".$this->_conn->quote($content).", ".$this->_conn->quote($mimetype).", ".$this->_conn->quote($origfilename).", ".(int)$created.", ".(int)$indexed.", ".$this->_conn->quote($doc->getFieldValue('users')).", ".$this->_conn->quote($status).", ".$this->_conn->quote($doc->getFieldValue('path'))/*time()*/.")";
|
$sql = "INSERT INTO docs (documentid, record_type, title, comment, keywords, category, owner, content, mimetype, origfilename, created, indexed, users, status, path) VALUES (".$this->_conn->quote($doc->getFieldValue('document_id')).", ".$this->_conn->quote($doc->getFieldValue('record_type')).", ".$this->_conn->quote($doc->getFieldValue('title')).", ".$this->_conn->quote($comment).", ".$this->_conn->quote($keywords).", ".$this->_conn->quote($category).", ".$this->_conn->quote($doc->getFieldValue('owner')).", ".$this->_conn->quote($content).", ".$this->_conn->quote($mimetype).", ".$this->_conn->quote($origfilename).", ".(int)$created.", ".(int)$indexed.", ".$this->_conn->quote($doc->getFieldValue('users')).", ".$this->_conn->quote($status).", ".$this->_conn->quote($doc->getFieldValue('path'))/*time()*/.")";
|
||||||
$res = $this->_conn->exec($sql);
|
$res = $this->_conn->exec($sql);
|
||||||
if($res === false) {
|
if($res === false) {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user