mirror of
https://git.code.sf.net/p/seeddms/code
synced 2025-05-09 04:56:06 +00:00
use stop words
This commit is contained in:
parent
e28911711b
commit
b9ac1860cf
|
@ -25,18 +25,48 @@
|
|||
class SeedDMS_SQLiteFTS_Indexer {
|
||||
|
||||
/**
|
||||
* @var string $ftstype
|
||||
* @var string $_ftstype
|
||||
* @access protected
|
||||
*/
|
||||
protected $_ftstype;
|
||||
|
||||
/**
|
||||
* @var object $index sqlite index
|
||||
* @var object $_conn sqlite index
|
||||
* @access protected
|
||||
*/
|
||||
protected $_conn;
|
||||
|
||||
/**
|
||||
* @var array $_stop_words array of stop words
|
||||
* @access protected
|
||||
*/
|
||||
protected $_stop_words;
|
||||
|
||||
const ftstype = 'fts5';
|
||||
|
||||
/**
|
||||
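* Remove stopwords from string
*
* Splits the string into words and, if there are at least two words,
* drops all words shorter than three characters and all words listed
* in $_stop_words. The remaining words are joined by a single space.
* If no word is left, the original string is returned unchanged.
*
* @param string $str text to be filtered
* @return string filtered text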
|
||||
*/
|
||||
protected function strip_stopwords($str = "") { /* {{{ */
	// Purpose: return $str without stop words and without words shorter
	// than three characters; words are re-joined by a single space.

	// 1.) break string into words
	// [^-\w\'] matches every character that is neither a word character,
	// a hyphen nor an apostrophe; the u flag treats pattern and input as utf-8
	$words = preg_split('/[^-\w\']+/u', $str, -1, PREG_SPLIT_NO_EMPTY);

	// preg_split() returns false if the input cannot be processed (e.g. it
	// is not valid utf-8). Leave such input untouched instead of handing
	// false to count().
	if($words === false)
		return $str;

	// 2.) if we have at least 2 words, remove stopwords
	if(count($words) > 1) {
		$stopwords = $this->_stop_words;
		$words = array_filter($words, function ($w) use ($stopwords) {
			// The stop words are the keys of the array, which makes the
			// lookup a simple isset(). The encoding name must be spelled
			// exactly 'utf-8', any other spelling is rejected by mbstring.
			return ((mb_strlen($w, 'utf-8') > 2) && !isset($stopwords[mb_strtolower($w, 'utf-8')]));
		});
	}

	// check if not too much was removed such as "the the" would return empty
	if(!empty($words))
		return implode(" ", $words);
	return $str;
} /* }}} */
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
|
@ -48,6 +78,7 @@ class SeedDMS_SQLiteFTS_Indexer {
|
|||
$this->_rawid = 'rowid';
|
||||
else
|
||||
$this->_rawid = 'docid';
|
||||
$this->_stop_words = [];
|
||||
} /* }}} */
|
||||
|
||||
/**
|
||||
|
@ -109,7 +140,9 @@ class SeedDMS_SQLiteFTS_Indexer {
|
|||
* Do some initialization
*
* Reads the list of stop words from the given file, if a file name is passed.
|
||||
*
|
||||
*/
|
||||
static function init($stopWordsFile='') { /* {{{ */
|
||||
public function init($stopWordsFile='') { /* {{{ */
	// Purpose: load the stop words from $stopWordsFile into $_stop_words.
	// The words in the file are separated by whitespace and/or commas.
	// Without a file name the current list is left as it is.
	if(!$stopWordsFile)
		return;

	// A missing or unreadable file must not result in a list containing
	// just the empty string, because any non-empty list turns on filtering.
	if(!is_readable($stopWordsFile))
		return;
	$data = file_get_contents($stopWordsFile);
	if($data === false)
		return;

	// Words are lowercased because the lookup is done with the lowercased
	// word. PREG_SPLIT_NO_EMPTY drops the empty tokens produced by leading
	// or trailing separators.
	$words = preg_split("/[\s,]+/", mb_strtolower($data, 'utf-8'), -1, PREG_SPLIT_NO_EMPTY);
	if($words === false)
		return;

	// flip the list to make the words the keys of the array
	$this->_stop_words = array_flip($words);
} /* }}} */
|
||||
|
||||
/**
|
||||
|
@ -135,6 +168,9 @@ class SeedDMS_SQLiteFTS_Indexer {
|
|||
if($res === false) {
|
||||
return false;
|
||||
}
|
||||
if($this->_stop_words)
|
||||
$content = $this->strip_stopwords($content);
|
||||
|
||||
$sql = "INSERT INTO docs (documentid, record_type, title, comment, keywords, category, owner, content, mimetype, origfilename, created, indexed, users, status, path) VALUES (".$this->_conn->quote($doc->getFieldValue('document_id')).", ".$this->_conn->quote($doc->getFieldValue('record_type')).", ".$this->_conn->quote($doc->getFieldValue('title')).", ".$this->_conn->quote($comment).", ".$this->_conn->quote($keywords).", ".$this->_conn->quote($category).", ".$this->_conn->quote($doc->getFieldValue('owner')).", ".$this->_conn->quote($content).", ".$this->_conn->quote($mimetype).", ".$this->_conn->quote($origfilename).", ".(int)$created.", ".(int)$indexed.", ".$this->_conn->quote($doc->getFieldValue('users')).", ".$this->_conn->quote($status).", ".$this->_conn->quote($doc->getFieldValue('path'))/*time()*/.")";
|
||||
$res = $this->_conn->exec($sql);
|
||||
if($res === false) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user