mirror of
https://codeberg.org/SeedDMS/paperless
synced 2024-11-26 15:32:08 +00:00
search for similar docs
This commit is contained in:
parent
28d2e4dc4b
commit
6a84abb6ce
|
@ -55,6 +55,14 @@ use Psr\Container\ContainerInterface;
|
|||
class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
|
||||
protected $container;
|
||||
|
||||
static public function mb_word_count($string, $mode = MB_CASE_TITLE, $characters = null) { /* {{{ */
|
||||
$string = mb_convert_case($string, $mode, "UTF-8");
|
||||
$addChars = $characters ? preg_quote($characters, '~') : "";
|
||||
// $regEx = "~[^\p{L}".$addChars."]+~u";
|
||||
$regEx = "~[^\p{L}".$addChars."]+~u";
|
||||
return array_count_values(preg_split($regEx,$string, -1, PREG_SPLIT_NO_EMPTY));
|
||||
} /* }}} */
|
||||
|
||||
protected function __getDocumentData($document, $truncate_content=false) { /* {{{ */
|
||||
$fulltextservice = $this->container->fulltextservice;
|
||||
$settings = $this->container->config;
|
||||
|
@ -496,6 +504,11 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
|
|||
|
||||
$limit = isset($params['page_size']) ? (int) $params['page_size'] : 25;
|
||||
$page = (isset($params['page']) && $params['page'] > 0) ? (int) $params['page'] : 1;
|
||||
$offset = ($page-1)*$limit;
|
||||
/* Truncate content if requested
|
||||
* See https://github.com/paperless-ngx/paperless-ngx/blob/main/src/documents/serialisers.py
|
||||
*/
|
||||
$truncate_content = isset($params['truncate_content']) && ($params['truncate_content'] == 'true');
|
||||
|
||||
$order = [];
|
||||
if (isset($params["ordering"]) && is_string($params["ordering"])) {
|
||||
|
@ -553,12 +566,61 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
|
|||
|
||||
/* more_like_id is set to find similar documents */
|
||||
if(isset($params['more_like_id'])) {
|
||||
|
||||
$index = $fulltextservice->Indexer();
|
||||
$lucenesearch = $fulltextservice->Search();
|
||||
if($searchhit = $lucenesearch->getDocument((int) $params['more_like_id'])) {
|
||||
$idoc = $searchhit->getDocument();
|
||||
if($idoc) {
|
||||
try {
|
||||
$fullcontent = $idoc->getFieldValue('content');
|
||||
} catch (Exception $e) {
|
||||
$fullcontent = '';
|
||||
}
|
||||
$wcl = 2000;
|
||||
$shortcontent = mb_strimwidth($fullcontent, 0, $wcl);
|
||||
|
||||
/* Create a list of words an its occurences to be passed
|
||||
* to the classification.
|
||||
* The '.' is added as valid character in a word, because solr's
|
||||
* standard tokenizer treats it as a valid char as well.
|
||||
*/
|
||||
$wordcount = self::mb_word_count($shortcontent, MB_CASE_LOWER, '.');
|
||||
arsort($wordcount);
|
||||
$newquery = [];
|
||||
foreach($wordcount as $word=>$n) {
|
||||
if(mb_strlen($word) > 4 && ($n > 2 || count($newquery) < 5))
|
||||
$newquery[] = $word;
|
||||
}
|
||||
// echo implode(' ', $newquery);
|
||||
$logger->log("Query for '".implode(' ', $newquery)."'", PEAR_LOG_DEBUG);
|
||||
$searchresult = $lucenesearch->search(implode(' ', $newquery), array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder), array('limit'=>$limit, 'offset'=>$offset), $order);
|
||||
if($searchresult) {
|
||||
$recs = array();
|
||||
if($searchresult['hits']) {
|
||||
$allids = '';
|
||||
foreach($searchresult['hits'] as $hit) {
|
||||
if($hit['document_id'][0] == 'D') {
|
||||
if($tmp = $dms->getDocument((int) substr($hit['document_id'], 1))) {
|
||||
$allids .= $hit['document_id'].' ';
|
||||
$recs[] = $this->__getDocumentData($tmp, $truncate_content);
|
||||
}
|
||||
}
|
||||
}
|
||||
$logger->log('Result is '.$allids, PEAR_LOG_DEBUG);
|
||||
return $response->withJson(array('count'=>$searchresult['count'], 'next'=>null, 'previous'=>null, 'offset'=>$offset, 'limit'=>$limit, 'results'=>$recs), 200);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $response->withJson(array('count'=>0, 'next'=>null, 'previous'=>null, 'offset'=>0, 'limit'=>$limit, 'results'=>[]), 200);
|
||||
/* Get all documents in the same folder and subfolders */
|
||||
/* Get all documents in the same folder and subfolders
|
||||
$likeid = (int) $params['more_like_id'];
|
||||
if($likeid && $likedoc = $dms->getDocument($likeid)) {
|
||||
$startfolder = $likedoc->getFolder();
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
$cattrs = [];
|
||||
|
@ -605,13 +667,8 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
|
|||
$aend = (int) makeTsFromDate($params['created__date__lt']);
|
||||
}
|
||||
|
||||
/* Truncate content if requested
|
||||
* See https://github.com/paperless-ngx/paperless-ngx/blob/main/src/documents/serialisers.py
|
||||
*/
|
||||
$truncate_content = isset($params['truncate_content']) && ($params['truncate_content'] == 'true');
|
||||
$index = $fulltextservice->Indexer();
|
||||
if($index) {
|
||||
$offset = ($page-1)*$limit;
|
||||
$logger->log('Query is '.$query, PEAR_LOG_DEBUG);
|
||||
$lucenesearch = $fulltextservice->Search();
|
||||
$searchresult = $lucenesearch->search($query, array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'category'=>$categorynames, 'created_start'=>$astart, 'created_end'=>$aend, 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder, 'attributes'=>$cattrs), array('limit'=>$limit, 'offset'=>$offset), $order);
|
||||
|
|
Loading…
Reference in New Issue
Block a user