search for similar docs

This commit is contained in:
Uwe Steinmann 2023-03-15 14:23:03 +01:00
parent 28d2e4dc4b
commit 6a84abb6ce

View File

@ -55,6 +55,14 @@ use Psr\Container\ContainerInterface;
class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
protected $container;
static public function mb_word_count($string, $mode = MB_CASE_TITLE, $characters = null) { /* {{{ */
$string = mb_convert_case($string, $mode, "UTF-8");
$addChars = $characters ? preg_quote($characters, '~') : "";
// $regEx = "~[^\p{L}".$addChars."]+~u";
$regEx = "~[^\p{L}".$addChars."]+~u";
return array_count_values(preg_split($regEx,$string, -1, PREG_SPLIT_NO_EMPTY));
} /* }}} */
protected function __getDocumentData($document, $truncate_content=false) { /* {{{ */
$fulltextservice = $this->container->fulltextservice;
$settings = $this->container->config;
@ -496,6 +504,11 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
$limit = isset($params['page_size']) ? (int) $params['page_size'] : 25;
$page = (isset($params['page']) && $params['page'] > 0) ? (int) $params['page'] : 1;
$offset = ($page-1)*$limit;
/* Truncate content if requested
* See https://github.com/paperless-ngx/paperless-ngx/blob/main/src/documents/serialisers.py
*/
$truncate_content = isset($params['truncate_content']) && ($params['truncate_content'] == 'true');
$order = [];
if (isset($params["ordering"]) && is_string($params["ordering"])) {
@ -553,12 +566,61 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
/* more_like_id is set to find similar documents */
if(isset($params['more_like_id'])) {
$index = $fulltextservice->Indexer();
$lucenesearch = $fulltextservice->Search();
if($searchhit = $lucenesearch->getDocument((int) $params['more_like_id'])) {
$idoc = $searchhit->getDocument();
if($idoc) {
try {
$fullcontent = $idoc->getFieldValue('content');
} catch (Exception $e) {
$fullcontent = '';
}
$wcl = 2000;
$shortcontent = mb_strimwidth($fullcontent, 0, $wcl);
/* Create a list of words an its occurences to be passed
* to the classification.
* The '.' is added as valid character in a word, because solr's
* standard tokenizer treats it as a valid char as well.
*/
$wordcount = self::mb_word_count($shortcontent, MB_CASE_LOWER, '.');
arsort($wordcount);
$newquery = [];
foreach($wordcount as $word=>$n) {
if(mb_strlen($word) > 4 && ($n > 2 || count($newquery) < 5))
$newquery[] = $word;
}
// echo implode(' ', $newquery);
$logger->log("Query for '".implode(' ', $newquery)."'", PEAR_LOG_DEBUG);
$searchresult = $lucenesearch->search(implode(' ', $newquery), array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder), array('limit'=>$limit, 'offset'=>$offset), $order);
if($searchresult) {
$recs = array();
if($searchresult['hits']) {
$allids = '';
foreach($searchresult['hits'] as $hit) {
if($hit['document_id'][0] == 'D') {
if($tmp = $dms->getDocument((int) substr($hit['document_id'], 1))) {
$allids .= $hit['document_id'].' ';
$recs[] = $this->__getDocumentData($tmp, $truncate_content);
}
}
}
$logger->log('Result is '.$allids, PEAR_LOG_DEBUG);
return $response->withJson(array('count'=>$searchresult['count'], 'next'=>null, 'previous'=>null, 'offset'=>$offset, 'limit'=>$limit, 'results'=>$recs), 200);
}
}
}
}
return $response->withJson(array('count'=>0, 'next'=>null, 'previous'=>null, 'offset'=>0, 'limit'=>$limit, 'results'=>[]), 200);
/* Get all documents in the same folder and subfolders */
/* Get all documents in the same folder and subfolders
$likeid = (int) $params['more_like_id'];
if($likeid && $likedoc = $dms->getDocument($likeid)) {
$startfolder = $likedoc->getFolder();
}
*/
}
$cattrs = [];
@ -605,13 +667,8 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
$aend = (int) makeTsFromDate($params['created__date__lt']);
}
/* Truncate content if requested
* See https://github.com/paperless-ngx/paperless-ngx/blob/main/src/documents/serialisers.py
*/
$truncate_content = isset($params['truncate_content']) && ($params['truncate_content'] == 'true');
$index = $fulltextservice->Indexer();
if($index) {
$offset = ($page-1)*$limit;
$logger->log('Query is '.$query, PEAR_LOG_DEBUG);
$lucenesearch = $fulltextservice->Search();
$searchresult = $lucenesearch->search($query, array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'category'=>$categorynames, 'created_start'=>$astart, 'created_end'=>$aend, 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder, 'attributes'=>$cattrs), array('limit'=>$limit, 'offset'=>$offset), $order);