search for similar docs

2025-06-06 13:19:53 +00:00 · 2023-03-15 14:23:03 +01:00 · 2023-03-15 14:23:03 +01:00 · 6a84abb6ce
commit 6a84abb6ce
parent 28d2e4dc4b
1 changed files with 63 additions and 6 deletions
--- a/class.paperless.php
+++ b/class.paperless.php
@ -55,6 +55,14 @@ use Psr\Container\ContainerInterface;
 class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
 	protected $container;

+	static public function mb_word_count($string, $mode = MB_CASE_TITLE, $characters = null) { /* {{{ */
+		$string = mb_convert_case($string, $mode, "UTF-8");
+		$addChars = $characters ? preg_quote($characters, '~') : "";
+//		$regEx = "~[^\p{L}".$addChars."]+~u";
+		$regEx = "~[^\p{L}".$addChars."]+~u";
+		return array_count_values(preg_split($regEx,$string, -1, PREG_SPLIT_NO_EMPTY));
+	} /* }}} */
+
 	protected function __getDocumentData($document, $truncate_content=false) { /* {{{ */
 		$fulltextservice = $this->container->fulltextservice;
 		$settings = $this->container->config;
@ -496,6 +504,11 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */

 			$limit = isset($params['page_size']) ? (int) $params['page_size'] : 25;
 			$page = (isset($params['page']) && $params['page'] > 0) ? (int) $params['page'] : 1;
+			$offset = ($page-1)*$limit;
+			/* Truncate content if requested
+			 * See https://github.com/paperless-ngx/paperless-ngx/blob/main/src/documents/serialisers.py
+			 */
+			$truncate_content = isset($params['truncate_content']) && ($params['truncate_content'] == 'true');

 			$order = [];
 			if (isset($params["ordering"]) && is_string($params["ordering"])) {
@ -553,12 +566,61 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */

 			/* more_like_id is set to find similar documents */
 			if(isset($params['more_like_id'])) {
+
+				$index = $fulltextservice->Indexer();
+				$lucenesearch = $fulltextservice->Search();
+				if($searchhit = $lucenesearch->getDocument((int) $params['more_like_id'])) {
+					$idoc = $searchhit->getDocument();
+					if($idoc) {
+						try {
+							$fullcontent = $idoc->getFieldValue('content');
+						} catch (Exception $e) {
+							$fullcontent = '';
+						}
+						$wcl = 2000;
+						$shortcontent = mb_strimwidth($fullcontent, 0, $wcl);
+
+						/* Create a list of words an its occurences to be passed
+						 * to the classification.
+						 * The '.' is added as valid character in a word, because solr's
+						 * standard tokenizer treats it as a valid char as well.
+						 */
+						$wordcount = self::mb_word_count($shortcontent, MB_CASE_LOWER, '.');
+						arsort($wordcount);
+						$newquery = [];
+						foreach($wordcount as $word=>$n) {
+							if(mb_strlen($word) > 4 && ($n > 2 || count($newquery) < 5))
+								$newquery[] = $word;
+						}
+//						echo implode(' ', $newquery);
+						$logger->log("Query for '".implode(' ', $newquery)."'", PEAR_LOG_DEBUG);
+						$searchresult = $lucenesearch->search(implode(' ', $newquery), array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder), array('limit'=>$limit, 'offset'=>$offset), $order);
+						if($searchresult) {
+							$recs = array();
+							if($searchresult['hits']) {
+								$allids = '';
+								foreach($searchresult['hits'] as $hit) {
+									if($hit['document_id'][0] == 'D') {
+										if($tmp = $dms->getDocument((int) substr($hit['document_id'], 1))) {
+											$allids .= $hit['document_id'].' ';
+											$recs[] = $this->__getDocumentData($tmp, $truncate_content);
+										}
+									}
+								}
+								$logger->log('Result is '.$allids, PEAR_LOG_DEBUG);
+								return $response->withJson(array('count'=>$searchresult['count'], 'next'=>null, 'previous'=>null, 'offset'=>$offset, 'limit'=>$limit, 'results'=>$recs), 200);
+							}
+						}
+					}
+				}
+
 				return $response->withJson(array('count'=>0, 'next'=>null, 'previous'=>null, 'offset'=>0, 'limit'=>$limit, 'results'=>[]), 200);
-				/* Get all documents in the same folder and subfolders */
+				/* Get all documents in the same folder and subfolders
 				$likeid = (int) $params['more_like_id'];
 				if($likeid && $likedoc = $dms->getDocument($likeid)) {
 					$startfolder = $likedoc->getFolder();
 				}
+				 */
 			}

 			$cattrs = [];
@ -605,13 +667,8 @@ class SeedDMS_ExtPaperless_RestAPI_Controller { /* {{{ */
 				$aend = (int) makeTsFromDate($params['created__date__lt']);
 			}

-			/* Truncate content if requested
-			 * See https://github.com/paperless-ngx/paperless-ngx/blob/main/src/documents/serialisers.py
-			 */
-			$truncate_content = isset($params['truncate_content']) && ($params['truncate_content'] == 'true');
 			$index = $fulltextservice->Indexer();
 			if($index) {
-				$offset = ($page-1)*$limit;
 				$logger->log('Query is '.$query, PEAR_LOG_DEBUG);
 				$lucenesearch = $fulltextservice->Search();
 				$searchresult = $lucenesearch->search($query, array('record_type'=>['document'], 'status'=>[2], 'user'=>[$userobj->getLogin()], 'category'=>$categorynames, 'created_start'=>$astart, 'created_end'=>$aend, 'startFolder'=>$startfolder, 'rootFolder'=>$rootfolder, 'attributes'=>$cattrs), array('limit'=>$limit, 'offset'=>$offset), $order);