mirror of
https://codeberg.org/SeedDMS/paperless
synced 2025-05-15 06:01:18 +00:00
search for similar docs
This commit is contained in:
parent
28d2e4dc4b
commit
6a84abb6ce
|
@ -55,6 +55,14 @@ use Psr\Container\ContainerInterface;
|
||||||
class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
|
class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
|
||||||
protected $container;
|
protected $container;
|
||||||
|
|
||||||
|
static public function mb_word_count($string, $mode = MB_CASE_TITLE, $characters = null) { /* {{{ */
|
||||||
|
$string = mb_convert_case($string, $mode, "UTF-8");
|
||||||
|
$addChars = $characters ? preg_quote($characters, '~') : "";
|
||||||
|
// $regEx = "~[^\p{L}".$addChars."]+~u";
|
||||||
|
$regEx = "~[^\p{L}".$addChars."]+~u";
|
||||||
|
return array_count_values(preg_split($regEx,$string, -1, PREG_SPLIT_NO_EMPTY));
|
||||||
|
} /* }}} */
|
||||||
|
|
||||||
protected function __getDocumentData($document, $truncate_content=false) { /* {{{ */
|
protected function __getDocumentData($document, $truncate_content=false) { /* {{{ */
|
||||||
$fulltextservice = $this->container->fulltextservice;
|
$fulltextservice = $this->container->fulltextservice;
|
||||||
$settings = $this->container->config;
|
$settings = $this->container->config;
|
||||||
|
@ -496,6 +504,11 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
|
||||||
|
|
||||||
$limit = isset($params['page_size']) ? (int) $params['page_size'] : 25;
|
$limit = isset($params['page_size']) ? (int) $params['page_size'] : 25;
|
||||||
$page = (isset($params['page']) && $params['page'] > 0) ? (int) $params['page'] : 1;
|
$page = (isset($params['page']) && $params['page'] > 0) ? (int) $params['page'] : 1;
|
||||||
|
$offset = ($page-1)*$limit;
|
||||||
|
/* Truncate content if requested
|
||||||
|
* See https://github.com/paperless-ngx/paperless-ngx/blob/main/src/documents/serialisers.py
|
||||||
|
*/
|
||||||
|
$truncate_content = isset($params['truncate_content']) && ($params['truncate_content'] == 'true');
|
||||||
|
|
||||||
$order = [];
|
$order = [];
|
||||||
if (isset($params["ordering"]) && is_string($params["ordering"])) {
|
if (isset($params["ordering"]) && is_string($params["ordering"])) {
|
||||||
|
@ -553,12 +566,61 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
|
||||||
|
|
||||||
/* more_like_id is set to find similar documents */
|
/* more_like_id is set to find similar documents */
|
||||||
if(isset($params['more_like_id'])) {
|
if(isset($params['more_like_id'])) {
|
||||||
|
|
||||||
|
$index = $fulltextservice->Indexer();
|
||||||
|
$lucenesearch = $fulltextservice->Search();
|
||||||
|
if($searchhit = $lucenesearch->getDocument((int) $params['more_like_id'])) {
|
||||||
|
$idoc = $searchhit->getDocument();
|
||||||
|
if($idoc) {
|
||||||
|
try {
|
||||||
|
$fullcontent = $idoc->getFieldValue('content');
|
||||||
|
} catch (Exception $e) {
|
||||||
|
$fullcontent = '';
|
||||||
|
}
|
||||||
|
$wcl = 2000;
|
||||||
|
$shortcontent = mb_strimwidth($fullcontent, 0, $wcl);
|
||||||
|
|
||||||
|
/* Create a list of words an its occurences to be passed
|
||||||
|
* to the classification.
|
||||||
|
* The '.' is added as valid character in a word, because solr's
|
||||||
|
* standard tokenizer treats it as a valid char as well.
|
||||||
|
*/
|
||||||
|
$wordcount = self::mb_word_count($shortcontent, MB_CASE_LOWER, '.');
|
||||||
|
arsort($wordcount);
|
||||||
|
$newquery = [];
|
||||||
|
foreach($wordcount as $word=>$n) {
|
||||||
|
if(mb_strlen($word) > 4 && ($n > 2 || count($newquery) < 5))
|
||||||
|
$newquery[] = $word;
|
||||||
|
}
|
||||||
|
// echo implode(' ', $newquery);
|
||||||
|
$logger->log("Query for '".implode(' ', $newquery)."'", PEAR_LOG_DEBUG);
|
||||||
|
$searchresult = $lucenesearch->search(implode(' ', $newquery), array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder), array('limit'=>$limit, 'offset'=>$offset), $order);
|
||||||
|
if($searchresult) {
|
||||||
|
$recs = array();
|
||||||
|
if($searchresult['hits']) {
|
||||||
|
$allids = '';
|
||||||
|
foreach($searchresult['hits'] as $hit) {
|
||||||
|
if($hit['document_id'][0] == 'D') {
|
||||||
|
if($tmp = $dms->getDocument((int) substr($hit['document_id'], 1))) {
|
||||||
|
$allids .= $hit['document_id'].' ';
|
||||||
|
$recs[] = $this->__getDocumentData($tmp, $truncate_content);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
$logger->log('Result is '.$allids, PEAR_LOG_DEBUG);
|
||||||
|
return $response->withJson(array('count'=>$searchresult['count'], 'next'=>null, 'previous'=>null, 'offset'=>$offset, 'limit'=>$limit, 'results'=>$recs), 200);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return $response->withJson(array('count'=>0, 'next'=>null, 'previous'=>null, 'offset'=>0, 'limit'=>$limit, 'results'=>[]), 200);
|
return $response->withJson(array('count'=>0, 'next'=>null, 'previous'=>null, 'offset'=>0, 'limit'=>$limit, 'results'=>[]), 200);
|
||||||
/* Get all documents in the same folder and subfolders */
|
/* Get all documents in the same folder and subfolders
|
||||||
$likeid = (int) $params['more_like_id'];
|
$likeid = (int) $params['more_like_id'];
|
||||||
if($likeid && $likedoc = $dms->getDocument($likeid)) {
|
if($likeid && $likedoc = $dms->getDocument($likeid)) {
|
||||||
$startfolder = $likedoc->getFolder();
|
$startfolder = $likedoc->getFolder();
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
$cattrs = [];
|
$cattrs = [];
|
||||||
|
@ -605,13 +667,8 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
|
||||||
$aend = (int) makeTsFromDate($params['created__date__lt']);
|
$aend = (int) makeTsFromDate($params['created__date__lt']);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Truncate content if requested
|
|
||||||
* See https://github.com/paperless-ngx/paperless-ngx/blob/main/src/documents/serialisers.py
|
|
||||||
*/
|
|
||||||
$truncate_content = isset($params['truncate_content']) && ($params['truncate_content'] == 'true');
|
|
||||||
$index = $fulltextservice->Indexer();
|
$index = $fulltextservice->Indexer();
|
||||||
if($index) {
|
if($index) {
|
||||||
$offset = ($page-1)*$limit;
|
|
||||||
$logger->log('Query is '.$query, PEAR_LOG_DEBUG);
|
$logger->log('Query is '.$query, PEAR_LOG_DEBUG);
|
||||||
$lucenesearch = $fulltextservice->Search();
|
$lucenesearch = $fulltextservice->Search();
|
||||||
$searchresult = $lucenesearch->search($query, array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'category'=>$categorynames, 'created_start'=>$astart, 'created_end'=>$aend, 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder, 'attributes'=>$cattrs), array('limit'=>$limit, 'offset'=>$offset), $order);
|
$searchresult = $lucenesearch->search($query, array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'category'=>$categorynames, 'created_start'=>$astart, 'created_end'=>$aend, 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder, 'attributes'=>$cattrs), array('limit'=>$limit, 'offset'=>$offset), $order);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user