From 6a84abb6cef82544003f27e283789b1278c33e03 Mon Sep 17 00:00:00 2001
From: Uwe Steinmann
Date: Wed, 15 Mar 2023 14:23:03 +0100
Subject: [PATCH] search for similar docs

---
 class.paperless.php | 80 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 74 insertions(+), 6 deletions(-)

diff --git a/class.paperless.php b/class.paperless.php
index 006ebc9..2039741 100644
--- a/class.paperless.php
+++ b/class.paperless.php
@@ -55,6 +55,25 @@ use Psr\Container\ContainerInterface;
 class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
 	protected $container;
 
+	/**
+	 * Count the occurrences of each word in a string
+	 *
+	 * @param string $string text which is case converted and split into words
+	 * @param integer $mode case mode passed to mb_convert_case()
+	 * @param string $characters extra characters treated as part of a word
+	 * @return array number of occurrences indexed by word
+	 */
+	public static function mb_word_count($string, $mode = MB_CASE_TITLE, $characters = null) { /* {{{ */
+		$string = mb_convert_case($string, $mode, "UTF-8");
+		$addChars = $characters ? preg_quote($characters, '~') : "";
+		$regEx = "~[^\p{L}".$addChars."]+~u";
+		$words = preg_split($regEx, $string, -1, PREG_SPLIT_NO_EMPTY);
+		/* preg_split() returns false if the pattern could not be applied */
+		if($words === false)
+			return [];
+		return array_count_values($words);
+	} /* }}} */
+
 	protected function __getDocumentData($document, $truncate_content=false) { /* {{{ */
 		$fulltextservice = $this->container->fulltextservice;
 		$settings = $this->container->config;
@@ -496,6 +515,11 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
 
 		$limit = isset($params['page_size']) ? (int) $params['page_size'] : 25;
 		$page = (isset($params['page']) && $params['page'] > 0) ? (int) $params['page'] : 1;
+		$offset = ($page-1)*$limit;
+		/* Truncate content if requested
+		 * See https://github.com/paperless-ngx/paperless-ngx/blob/main/src/documents/serialisers.py
+		 */
+		$truncate_content = isset($params['truncate_content']) && ($params['truncate_content'] == 'true');
 
 		$order = [];
 		if (isset($params["ordering"]) && is_string($params["ordering"])) {
@@ -553,12 +577,61 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
 
 		/* more_like_id is set to find similar documents */
 		if(isset($params['more_like_id'])) {
+
+			/* Search only if an index is available, like the regular search below */
+			$index = $fulltextservice->Indexer();
+			$lucenesearch = $index ? $fulltextservice->Search() : null;
+			if($lucenesearch && ($searchhit = $lucenesearch->getDocument((int) $params['more_like_id']))) {
+				$idoc = $searchhit->getDocument();
+				if($idoc) {
+					try {
+						$fullcontent = $idoc->getFieldValue('content');
+					} catch (Exception $e) {
+						$fullcontent = '';
+					}
+					$wcl = 2000;
+					$shortcontent = mb_strimwidth($fullcontent, 0, $wcl);
+
+					/* Create a list of words and their occurrences to be passed
+					 * to the classification.
+					 * The '.' is added as valid character in a word, because solr's
+					 * standard tokenizer treats it as a valid char as well.
+					 */
+					$wordcount = self::mb_word_count($shortcontent, MB_CASE_LOWER, '.');
+					arsort($wordcount);
+					$newquery = [];
+					foreach($wordcount as $word=>$n) {
+						if(mb_strlen($word) > 4 && ($n > 2 || count($newquery) < 5))
+							$newquery[] = $word;
+					}
+					$logger->log("Query for '".implode(' ', $newquery)."'", PEAR_LOG_DEBUG);
+					$searchresult = $lucenesearch->search(implode(' ', $newquery), array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder), array('limit'=>$limit, 'offset'=>$offset), $order);
+					if($searchresult) {
+						$recs = array();
+						if($searchresult['hits']) {
+							$allids = '';
+							foreach($searchresult['hits'] as $hit) {
+								if($hit['document_id'][0] == 'D') {
+									if($tmp = $dms->getDocument((int) substr($hit['document_id'], 1))) {
+										$allids .= $hit['document_id'].' ';
+										$recs[] = $this->__getDocumentData($tmp, $truncate_content);
+									}
+								}
+							}
+							$logger->log('Result is '.$allids, PEAR_LOG_DEBUG);
+							return $response->withJson(array('count'=>$searchresult['count'], 'next'=>null, 'previous'=>null, 'offset'=>$offset, 'limit'=>$limit, 'results'=>$recs), 200);
+						}
+					}
+				}
+			}
+			return $response->withJson(array('count'=>0, 'next'=>null, 'previous'=>null, 'offset'=>0, 'limit'=>$limit, 'results'=>[]), 200);
 
-			/* Get all documents in the same folder and subfolders */
+			/* Get all documents in the same folder and subfolders
 			$likeid = (int) $params['more_like_id'];
 			if($likeid && $likedoc = $dms->getDocument($likeid)) {
 				$startfolder = $likedoc->getFolder();
 			}
+			*/
 		}
 
 		$cattrs = [];
@@ -605,13 +678,8 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
 			$aend = (int) makeTsFromDate($params['created__date__lt']);
 		}
 
-		/* Truncate content if requested
-		 * See https://github.com/paperless-ngx/paperless-ngx/blob/main/src/documents/serialisers.py
-		 */
-		$truncate_content = isset($params['truncate_content']) && ($params['truncate_content'] == 'true');
 		$index = $fulltextservice->Indexer();
 		if($index) {
-			$offset = ($page-1)*$limit;
 			$logger->log('Query is '.$query, PEAR_LOG_DEBUG);
 			$lucenesearch = $fulltextservice->Search();
 			$searchresult = $lucenesearch->search($query, array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'category'=>$categorynames, 'created_start'=>$astart, 'created_end'=>$aend, 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder, 'attributes'=>$cattrs), array('limit'=>$limit, 'offset'=>$offset), $order);