fulltext.php 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959
  1. <?php
  2. /**
  3. * DokuWiki fulltextsearch functions using the index
  4. *
  5. * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
  6. * @author Andreas Gohr <andi@splitbrain.org>
  7. */
  8. use dokuwiki\Extension\Event;
  9. use dokuwiki\Utf8\Clean;
  10. use dokuwiki\Utf8\PhpString;
  11. use dokuwiki\Utf8\Sort;
  /**
   * Create snippets for the first few results only.
   *
   * The value is only set here when nothing defined the constant earlier, so a
   * previously defined value takes precedence. The constant is not referenced
   * anywhere else in this file.
   */
  if (!defined('FT_SNIPPET_NUMBER')) {
      define('FT_SNIPPET_NUMBER', 15);
  }
  /**
   * The fulltext search
   *
   * Returns a list of matching documents for the given query.
   *
   * The arguments are bundled into an event data array and handed to
   * Event::createAndTrigger() together with '_ft_pageSearch' as the third
   * argument, which performs the actual search.
   *
   * @triggers SEARCH_QUERY_FULLPAGE
   *
   * @param string     $query     the search query
   * @param array      $highlight bound by reference into the event data as 'highlight';
   *                              _ft_pageSearch() overwrites it with the terms to highlight
   * @param string     $sort      sort mode, null is replaced by 'hits'
   * @param int|string $after     only show results with mtime after this date,
   *                              accepts timestamp or strtotime arguments
   * @param int|string $before    only show results with mtime before this date,
   *                              accepts timestamp or strtotime arguments
   *
   * @return array
   */
  function ft_pageSearch($query, &$highlight, $sort = null, $after = null, $before = null){
      $data = [
          'query' => $query,
          'sort' => ($sort === null) ? 'hits' : $sort,
          'after' => $after,
          'before' => $before,
          // reference, so changes made while the event is processed reach the caller
          'highlight' => &$highlight,
      ];

      return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, '_ft_pageSearch');
  }
  /**
   * Returns a list of matching documents for the given query
   *
   * The query is parsed by ft_queryParser() into a postfix (RPN) token list which
   * is evaluated here on a stack of result sets (page id => hit count). Every
   * operand token (word, phrase, namespace) pushes exactly one result set, every
   * operator pops its operands and pushes the combined set.
   *
   * Side effect: $data['highlight'] is overwritten with the terms to highlight.
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @author Kazutaka Miyasaka <kazmiya@gmail.com>
   *
   * @param array $data event data; reads 'query', 'sort', 'after' and 'before'
   * @return array matching documents (page id => hit count)
   */
  function _ft_pageSearch(&$data) {
      $Indexer = idx_get_indexer();

      // parse the given query
      $q = ft_queryParser($Indexer, $data['query']);
      $data['highlight'] = $q['highlight'];

      if (empty($q['parsed_ary'])) return array();

      // lookup all words found in the query
      $lookup = $Indexer->lookup($q['words']);

      // get all pages in this dokuwiki site (!: includes nonexistent pages)
      $pages_all = array();
      foreach ($Indexer->getPages() as $id) {
          $pages_all[$id] = 0; // base: 0 hit
      }

      // process the query
      $stack = array();
      foreach ($q['parsed_ary'] as $token) {
          switch (substr($token, 0, 3)) {
              case 'W+:':
              case 'W-:':
              case 'W_:': // word
                  $word = substr($token, 3);
                  // A word without an entry in the lookup result still has to push
                  // a (then empty) result set: skipping the push would leave the
                  // following AND/OR/NOT/phrase steps without their operand.
                  $stack[] = isset($lookup[$word]) ? (array)$lookup[$word] : array();
                  break;
              case 'P+:':
              case 'P-:': // phrase
                  $phrase = substr($token, 3);
                  // since phrases are always parsed as ((W1)(W2)...(P)),
                  // the end($stack) always points the pages that contain
                  // all words in this phrase
                  $pages = end($stack);
                  if (!is_array($pages)) {
                      // defensive: nothing on the stack means no candidate pages
                      $pages = array();
                  }
                  $pages_matched = array();
                  foreach (array_keys($pages) as $id) {
                      $evdata = array(
                          'id' => $id,
                          'phrase' => $phrase,
                          'text' => rawWiki($id)
                      );
                      $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
                      // default matching: case-insensitive substring search,
                      // skipped when a handler already decided on a match
                      if ($evt->advise_before() && $evt->result !== true) {
                          $text = PhpString::strtolower($evdata['text']);
                          if (strpos($text, $phrase) !== false) {
                              $evt->result = true;
                          }
                      }
                      $evt->advise_after();
                      if ($evt->result === true) {
                          $pages_matched[$id] = 0; // phrase: always 0 hit
                      }
                  }
                  $stack[] = $pages_matched;
                  break;
              case 'N+:':
              case 'N-:': // namespace
                  $ns = cleanID(substr($token, 3)) . ':';
                  $pages_matched = array();
                  foreach (array_keys($pages_all) as $id) {
                      if (strpos($id, $ns) === 0) {
                          $pages_matched[$id] = 0; // namespace: always 0 hit
                      }
                  }
                  $stack[] = $pages_matched;
                  break;
              case 'AND': // and operation
                  list($pages1, $pages2) = array_splice($stack, -2);
                  $stack[] = ft_resultCombine(array($pages1, $pages2));
                  break;
              case 'OR': // or operation
                  list($pages1, $pages2) = array_splice($stack, -2);
                  $stack[] = ft_resultUnite(array($pages1, $pages2));
                  break;
              case 'NOT': // not operation (unary)
                  $pages = array_pop($stack);
                  $stack[] = ft_resultComplement(array($pages_all, $pages));
                  break;
          }
      }
      $docs = array_pop($stack);

      if (empty($docs)) return array();

      // check: settings, acls, existence
      foreach (array_keys($docs) as $id) {
          if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ || !page_exists($id, '', false)) {
              unset($docs[$id]);
          }
      }

      $docs = _ft_filterResultsByTime($docs, $data['after'], $data['before']);

      if ($data['sort'] === 'mtime') {
          uksort($docs, 'ft_pagemtimesorter');
      } else {
          // sort docs by count
          uksort($docs, 'ft_pagesorter');
          arsort($docs);
      }

      return $docs;
  }
  /**
   * Returns the backlinks for a given page
   *
   * Looks up the 'relation_references' key in the metadata index and drops
   * entries the caller must not see as well as entries for nonexistent pages.
   *
   * @param string $id           The id for which links shall be returned
   * @param bool   $ignore_perms Ignore the fact that pages are hidden or read-protected;
   *                             only honoured when it is exactly boolean true
   * @return array The pages that contain links to the given page
   */
  function ft_backlinks($id, $ignore_perms = false){
      $result = idx_get_indexer()->lookupKey('relation_references', $id);
      if (count($result) === 0) {
          return $result;
      }

      // check ACL permissions and existence
      foreach ($result as $key => $page) {
          $forbidden = $ignore_perms !== true
              && (isHiddenPage($page) || auth_quickaclcheck($page) < AUTH_READ);
          if ($forbidden || !page_exists($page, '', false)) {
              unset($result[$key]);
          }
      }

      Sort::sort($result);
      return $result;
  }
  /**
   * Returns the pages that use a given media file
   *
   * Uses the relation media metadata property and the metadata index.
   *
   * Note that before 2013-07-31 the second parameter was the maximum number of results and
   * permissions were ignored. That's why the parameter is now checked to be explicitly set
   * to true (with type bool) in order to be compatible with older uses of the function.
   *
   * @param string $id           The media id to look for
   * @param bool   $ignore_perms Ignore hidden pages and acls (optional, default: false)
   * @return array A list of pages that use the given media file
   */
  function ft_mediause($id, $ignore_perms = false){
      $result = idx_get_indexer()->lookupKey('relation_media', $id);
      if (count($result) === 0) {
          return $result;
      }

      // check ACL permissions and existence
      foreach ($result as $key => $page) {
          $forbidden = $ignore_perms !== true
              && (isHiddenPage($page) || auth_quickaclcheck($page) < AUTH_READ);
          if ($forbidden || !page_exists($page, '', false)) {
              unset($result[$key]);
          }
      }

      Sort::sort($result);
      return $result;
  }
  /**
   * Quicksearch for pagenames
   *
   * By default it only matches the pagename and ignores the
   * namespace. This can be changed with the second parameter.
   * The third parameter allows to search in titles as well.
   *
   * The function always returns titles as well
   *
   * @triggers SEARCH_QUERY_PAGELOOKUP
   * @author   Andreas Gohr <andi@splitbrain.org>
   * @author   Adrian Lang <lang@cosmocode.de>
   *
   * @param string     $id       page id
   * @param bool       $in_ns    match against namespace as well?
   * @param bool       $in_title search in title?
   * @param int|string $after    only show results with mtime after this date,
   *                             accepts timestamp or strtotime arguments
   * @param int|string $before   only show results with mtime before this date,
   *                             accepts timestamp or strtotime arguments
   *
   * @return string[]
   */
  function ft_pageLookup($id, $in_ns=false, $in_title=false, $after = null, $before = null){
      $data = [
          'id' => $id,
          'in_ns' => $in_ns,
          'in_title' => $in_title,
          'after' => $after,
          'before' => $before,
          'has_titles' => true, // for plugin backward compatibility check
      ];

      return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, '_ft_pageLookup');
  }
  /**
   * Returns list of pages as array(pageid => First Heading)
   *
   * The id is run through ft_queryParser() first. When the query names an included
   * and/or excluded namespace, only the first one of each kind is honoured and the
   * remaining highlight terms (joined by spaces) become the lookup string.
   *
   * @param array &$data event data; reads 'id', 'in_ns', 'in_title', 'after' and 'before'
   * @return string[]
   */
  function _ft_pageLookup(&$data){
      // split out original parameters
      $id = $data['id'];
      $in_ns = $data['in_ns'];
      $in_title = $data['in_title'];

      $Indexer = idx_get_indexer();
      $parsedQuery = ft_queryParser($Indexer, $id);
      if (count($parsedQuery['ns']) > 0) {
          $ns = cleanID($parsedQuery['ns'][0]) . ':';
          $id = implode(' ', $parsedQuery['highlight']);
      }
      if (count($parsedQuery['notns']) > 0) {
          $notns = cleanID($parsedQuery['notns'][0]) . ':';
          $id = implode(' ', $parsedQuery['highlight']);
      }

      $cleaned = cleanID($id);
      $page_idx = $Indexer->getPages();

      $pages = array();
      if ($id !== '' && $cleaned !== '') {
          // match against the page id (with or without its namespace part)
          foreach ($page_idx as $p_id) {
              if (strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false) {
                  // array_key_exists(): a page whose heading is null counts as
                  // already collected and is not resolved a second time
                  if (!array_key_exists($p_id, $pages)) {
                      $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
                  }
              }
          }
          // additionally match against the indexed titles
          if ($in_title) {
              foreach ($Indexer->lookupKey('title', $id, '_ft_pageLookupTitleCompare') as $p_id) {
                  if (!array_key_exists($p_id, $pages)) {
                      $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
                  }
              }
          }
      }

      // keep only pages inside the requested namespace
      if (isset($ns)) {
          foreach (array_keys($pages) as $p_id) {
              if (strpos($p_id, $ns) !== 0) {
                  unset($pages[$p_id]);
              }
          }
      }
      // drop pages inside the excluded namespace
      if (isset($notns)) {
          foreach (array_keys($pages) as $p_id) {
              if (strpos($p_id, $notns) === 0) {
                  unset($pages[$p_id]);
              }
          }
      }

      // discard hidden pages
      // discard nonexistent pages
      // check ACL permissions
      foreach (array_keys($pages) as $idx) {
          if (!isVisiblePage($idx) || !page_exists($idx) ||
              auth_quickaclcheck($idx) < AUTH_READ) {
              unset($pages[$idx]);
          }
      }

      $pages = _ft_filterResultsByTime($pages, $data['after'], $data['before']);

      uksort($pages, 'ft_pagesorter');
      return $pages;
  }
  /**
   * Removes results whose page file was modified outside the given time window
   *
   * A bound that is empty (null, '', 0, ...) is not applied. When both bounds are
   * empty the results are returned untouched and no file is inspected.
   *
   * @param array      $results search results in the form pageid => value
   * @param int|string $after   only returns results with mtime after this date,
   *                            accepts timestamp or strtotime arguments
   * @param int|string $before  only returns results with mtime before this date,
   *                            accepts timestamp or strtotime arguments
   *
   * @return array the remaining results, keys and values unchanged
   */
  function _ft_filterResultsByTime(array $results, $after, $before) {
      if (!$after && !$before) {
          return $results;
      }

      // Integers are taken as timestamps, anything else goes through strtotime().
      // A null bound is left alone: it is skipped below anyway and passing null
      // to strtotime() is deprecated as of PHP 8.1.
      if ($after !== null && !is_int($after)) {
          $after = strtotime($after);
      }
      if ($before !== null && !is_int($before)) {
          $before = strtotime($before);
      }

      foreach ($results as $id => $value) {
          $mTime = filemtime(wikiFN($id));
          if ($after && $after > $mTime) {
              unset($results[$id]);
              continue;
          }
          if ($before && $before < $mTime) {
              unset($results[$id]);
          }
      }

      return $results;
  }
  /**
   * Tiny helper function for comparing the searched title with the title
   * from the search index.
   *
   * ASCII-only search strings are located with stripos(); any other search string
   * is compared after lowercasing both sides with PhpString::strtolower().
   *
   * @param string $search searched title
   * @param string $title  title from index
   * @return bool true when $search occurs anywhere in $title
   */
  function _ft_pageLookupTitleCompare($search, $title) {
      if (!Clean::isASCII($search)) {
          $haystack = PhpString::strtolower($title);
          $needle = PhpString::strtolower($search);
          return PhpString::strpos($haystack, $needle) !== false;
      }

      return stripos($title, $search) !== false;
  }
  /**
   * Sort pages based on their namespace level first, then on their string
   * values. This makes higher hierarchy pages rank higher than lower hierarchy
   * pages.
   *
   * The namespace level is derived from the number of colons in the id.
   *
   * @param string $a
   * @param string $b
   * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b, and 0 if they are equal.
   */
  function ft_pagesorter($a, $b){
      $depthA = substr_count($a, ':');
      $depthB = substr_count($b, ':');

      if ($depthA !== $depthB) {
          return ($depthA < $depthB) ? -1 : 1;
      }

      // same depth: fall back to the string comparison
      return Sort::strcmp($a, $b);
  }
  /**
   * Sort pages by their mtime, from newest to oldest
   *
   * The modification times are read from the files wikiFN() resolves the ids to.
   *
   * @param string $a
   * @param string $b
   *
   * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a and 0 if they are of the same age
   */
  function ft_pagemtimesorter($a, $b) {
      $modifiedA = filemtime(wikiFN($a));

      return filemtime(wikiFN($b)) - $modifiedA;
  }
  /**
   * Creates a snippet extract
   *
   * The raw page text is searched for the highlight terms (at most four rounds).
   * Around every match some context is cut out; overlapping contexts are merged.
   * The collected pieces are joined with '... ', HTML-escaped with hsc() and the
   * matched terms wrapped in <strong class="search_hit">.
   *
   * The default processing only runs when the FULLTEXT_SNIPPET_CREATE event's
   * advise_before() returns true. The event data holds 'text' and 'highlight' by
   * reference and 'snippet', which is what this function finally returns.
   *
   * @author   Andreas Gohr <andi@splitbrain.org>
   * @triggers FULLTEXT_SNIPPET_CREATE
   *
   * @param string $id        page id
   * @param array  $highlight terms to highlight
   * @return mixed the value of the event data's 'snippet' entry
   */
  function ft_snippet($id, $highlight){
      $text = str_replace("\xC2\xAD", '', rawWiki($id)); // remove soft-hyphens

      $evdata = [
          'id' => $id,
          'text' => &$text,
          'highlight' => &$highlight,
          'snippet' => '',
      ];

      $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
      if ($evt->advise_before()) {
          $match = [];
          $snippets = [];
          $utf8_offset = 0; // character position after the previous match
          $offset = 0;      // byte position at which the next search starts
          $end = 0;         // character position where the previous context ended
          $len = PhpString::strlen($text);

          // build a regexp from the phrases to highlight
          $quoted = array_map('preg_quote_cb', array_filter((array) $highlight));
          $alternatives = array_map('ft_snippet_re_preprocess', $quoted);
          $re1 = '(' . implode('|', $alternatives) . ')';
          // two resp. three terms close to each other; the lookaheads keep a later
          // group from starting with the text an earlier group captured
          $re2 = $re1 . '.{0,75}(?!\\1)' . $re1;
          $re3 = $re1 . '.{0,45}(?!\\1)' . $re1 . '.{0,45}(?!\\1)(?!\\2)' . $re1;

          for ($round = 0; $round < 4; $round++) {
              // try the richest pattern first, fall back to the simpler ones
              $found = false;
              foreach ([$re3, $re2, $re1] as $pattern) {
                  if (preg_match('/' . $pattern . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
                      $found = true;
                      break;
                  }
              }
              if (!$found) {
                  break;
              }

              list($str, $idx) = $match[0];

              // convert $idx (a byte offset) into a utf8 character offset
              $utf8_idx = PhpString::strlen(substr($text, 0, $idx));
              $utf8_len = PhpString::strlen($str);

              // establish context, 100 characters surrounding the match string
              // first look to see if we can go 100 either side,
              // then drop to 50 adding any excess if the other side can't go to 50,
              $pre = min($utf8_idx - $utf8_offset, 100);
              $post = min($len - $utf8_idx - $utf8_len, 100);

              if ($pre > 50 && $post > 50) {
                  $pre = $post = 50;
              } elseif ($pre > 50) {
                  $pre = min($pre, 100 - $post);
              } elseif ($post > 50) {
                  $post = min($post, 100 - $pre);
              } elseif ($offset == 0) {
                  // both are less than 50, means the context is the whole string
                  // make it so and break out of this loop - there is no need for the
                  // complex snippet calculations
                  $snippets = [$text];
                  break;
              }

              // establish context start and end points, try to append to previous
              // context if possible
              $start = $utf8_idx - $pre;
              $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
              $end = $utf8_idx + $utf8_len + $post;     // now set it to the end of this context

              if ($append) {
                  $snippets[count($snippets) - 1] .= PhpString::substr($text, $append, $end - $append);
              } else {
                  $snippets[] = PhpString::substr($text, $start, $end - $start);
              }

              // set $offset for next match attempt
              // continue matching after the current match
              // if the current match is not the longest possible match starting at the current offset
              // this prevents further matching of this snippet but for possible matches of length
              // smaller than match length + context (at least 50 characters) this match is part of the context
              $utf8_offset = $utf8_idx + $utf8_len;
              $offset = $idx + strlen(PhpString::substr($text, $utf8_idx, $utf8_len));
              $offset = Clean::correctIdx($text, $offset);
          }

          // surround the hits with a control character (0x01) before escaping and
          // turn the marked ranges into <strong> elements afterwards
          $m = "\1";
          $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
          $escaped = hsc(implode('... ', $snippets));
          $snippet = preg_replace(
              '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
              '<strong class="search_hit">$1</strong>',
              $escaped
          );

          $evdata['snippet'] = $snippet;
      }
      $evt->advise_after();
      unset($evt);

      return $evdata['snippet'];
  }
  /**
   * Wraps a search term in regex boundary checks.
   *
   * A (quoted) leading or trailing '*' wildcard is removed and suppresses the
   * boundary check on that side. Terms recognised by Asian::isAsianWords() are
   * returned unchanged. A term that consists of nothing but boundary checks is
   * reduced to an empty string.
   *
   * @param string $term the term, expected to be regex-quoted already (a wildcard appears as '\*')
   * @return string
   */
  function ft_snippet_re_preprocess($term) {
      // do not process asian terms where word boundaries are not explicit
      if (\dokuwiki\Utf8\Asian::isAsianWords($term)) {
          return $term;
      }

      // unicode word boundaries where supported
      // see http://stackoverflow.com/a/2449017/172068
      // otherwise \b: not as correct, but at least won't break
      $BL = UTF8_PROPERTYSUPPORT ? '(?<!\pL)' : '\b';
      $BR = UTF8_PROPERTYSUPPORT ? '(?!\pL)' : '\b';

      // left side: strip the wildcard or require a boundary
      $term = (substr($term, 0, 2) === '\\*')
          ? substr($term, 2)
          : $BL . $term;

      // right side: same treatment
      $term = (substr($term, -2, 2) === '\\*')
          ? substr($term, 0, -2)
          : $term . $BR;

      // nothing but boundary checks left: no usable term
      if (in_array($term, [$BL, $BR, $BL . $BR], true)) {
          return '';
      }

      return $term;
  }
  /**
   * Combine found documents and sum up their scores
   *
   * This function is used to combine searched words with a logical
   * AND. Only documents available in all arrays are returned, in the
   * order in which they appear in the first array.
   *
   * based upon PEAR's PHP_Compat function for array_intersect_key()
   *
   * @param array $args An array of page arrays (page id => score)
   * @return array
   */
  function ft_resultCombine($args){
      $total = count($args);
      if ($total == 1) {
          return $args[0];
      }

      $combined = [];
      if ($total < 2) {
          return $combined;
      }

      foreach ($args[0] as $docId => $score) {
          $inAll = true;
          for ($n = 1; $n < $total; $n++) {
              if (!isset($args[$n][$docId])) {
                  $inAll = false;
                  break;
              }
              $score += $args[$n][$docId];
          }
          if ($inAll) {
              $combined[$docId] = $score;
          }
      }

      return $combined;
  }
  /**
   * Unites found documents and sum up their scores
   *
   * This is the logical OR: every document found in any of the arrays is
   * returned. Scores of documents found in several arrays are added up.
   *
   * based upon ft_resultCombine() function
   *
   * @param array $args An array of page arrays (page id => score)
   * @return array an empty array when $args is empty
   *
   * @author Kazutaka Miyasaka <kazmiya@gmail.com>
   */
  function ft_resultUnite($args) {
      $array_count = count($args);
      if ($array_count === 0) {
          return array();
      }
      if ($array_count === 1) {
          return $args[0];
      }

      $result = $args[0];
      for ($i = 1; $i < $array_count; $i++) {
          foreach ($args[$i] as $id => $hits) {
              // ids not seen so far start from the current score instead of
              // adding to a missing entry (which raises an undefined key warning)
              $result[$id] = isset($result[$id]) ? $result[$id] + $hits : $hits;
          }
      }
      return $result;
  }
  /**
   * Computes the difference of documents using page id for comparison
   *
   * Returns the first array without every id that is set (isset) in one of the
   * following arrays. Values of the first array are kept as they are.
   *
   * nearly identical to PHP5's array_diff_key()
   *
   * @param array $args An array of page arrays
   * @return array
   *
   * @author Kazutaka Miyasaka <kazmiya@gmail.com>
   */
  function ft_resultComplement($args) {
      $total = count($args);
      if ($total === 1) {
          return $args[0];
      }

      $remaining = $args[0];
      $candidates = array_keys($remaining);

      for ($n = 1; $n < $total; $n++) {
          foreach ($candidates as $docId) {
              if (isset($args[$n][$docId])) {
                  unset($remaining[$docId]);
              }
          }
      }

      return $remaining;
  }
  /**
   * Parses a search query and builds an array of search formulas
   *
   * The returned array has these keys:
   *  - 'query'      the query as given
   *  - 'parsed_str' the intermediate (infix) representation
   *  - 'parsed_ary' the same formula in postfix (Reverse Polish) notation
   *  - 'words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'
   *                 de-duplicated lists collected from the postfix tokens
   *
   * @author Andreas Gohr <andi@splitbrain.org>
   * @author Kazutaka Miyasaka <kazmiya@gmail.com>
   *
   * @param dokuwiki\Search\Indexer $Indexer
   * @param string                  $query search query
   * @return array of search formulas
   */
  function ft_queryParser($Indexer, $query){
      /**
       * parse a search query and transform it into intermediate representation
       *
       * in a search query, you can use the following expressions:
       *
       *   words:
       *     include
       *     -exclude
       *   phrases:
       *     "phrase to be included"
       *     -"phrase you want to exclude"
       *   namespaces:
       *     @include:namespace (or ns:include:namespace)
       *     ^exclude:namespace (or -ns:exclude:namespace)
       *   groups:
       *     ()
       *     -()
       *   operators:
       *     and ('and' is the default operator: you can always omit this)
       *     or  (or pipe symbol '|', lower precedence than 'and')
       *
       * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
       *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
       *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
       *      as long as you don't mind hit counts.
       *
       * intermediate representation consists of the following parts:
       *
       *   ( )           - group
       *   AND           - logical and
       *   OR            - logical or
       *   NOT           - logical not
       *   W+:, W-:, W_: - word      (underscore: no need to highlight)
       *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
       *   N+:, N-:      - namespace
       */
      $parsed_query = '';
      $parens_level = 0;
      // split into quoted phrases (optionally negated) and everything in between
      $terms = preg_split('/(-?".*?")/u', PhpString::strtolower($query),
                          -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);

      foreach ($terms as $term) {
          $parsed = '';
          if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
              // phrase-include and phrase-exclude
              $not = $matches[1] ? 'NOT' : '';
              $parsed = $not.ft_termParser($Indexer, $matches[2], false, true);
          } else {
              // fix incomplete phrase
              $term = str_replace('"', ' ', $term);

              // fix parentheses
              $term = str_replace(')' , ' ) ', $term);
              $term = str_replace('(' , ' ( ', $term);
              $term = str_replace('- (', ' -(', $term);

              // treat pipe symbols as 'OR' operators
              $term = str_replace('|', ' or ', $term);

              // treat ideographic spaces (U+3000) as search term separators
              // FIXME: some more separators?
              $term = preg_replace('/[ \x{3000}]+/u', ' ', $term);
              $term = trim($term);
              if ($term === '') continue;

              $tokens = explode(' ', $term);
              foreach ($tokens as $token) {
                  if ($token === '(') {
                      // parenthesis-include-open
                      $parsed .= '(';
                      ++$parens_level;
                  } elseif ($token === '-(') {
                      // parenthesis-exclude-open
                      $parsed .= 'NOT(';
                      ++$parens_level;
                  } elseif ($token === ')') {
                      // parenthesis-any-close (unmatched ones are dropped)
                      if ($parens_level === 0) continue;
                      $parsed .= ')';
                      $parens_level--;
                  } elseif ($token === 'and') {
                      // logical-and (do nothing)
                  } elseif ($token === 'or') {
                      // logical-or
                      $parsed .= 'OR';
                  } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) {
                      // namespace-exclude
                      $parsed .= 'NOT(N+:'.$matches[1].')';
                  } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) {
                      // namespace-include
                      $parsed .= '(N+:'.$matches[1].')';
                  } elseif (preg_match('/^-(.+)$/', $token, $matches)) {
                      // word-exclude
                      $parsed .= 'NOT('.ft_termParser($Indexer, $matches[1]).')';
                  } else {
                      // word-include
                      $parsed .= ft_termParser($Indexer, $token);
                  }
              }
          }
          $parsed_query .= $parsed;
      }

      // cleanup (very sensitive)
      // close groups left open, then remove empty groups and dangling operators
      $parsed_query .= str_repeat(')', $parens_level);
      do {
          $parsed_query_old = $parsed_query;
          $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query);
      } while ($parsed_query !== $parsed_query_old);
      $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')' , $parsed_query);
      $parsed_query = preg_replace('/(OR)+/u' , 'OR' , $parsed_query);
      $parsed_query = preg_replace('/\(OR/u' , '(' , $parsed_query);
      $parsed_query = preg_replace('/^OR|OR$/u' , '' , $parsed_query);
      // adjacent groups are implicitly AND-ed
      $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query);

      // adjustment: make highlightings right
      $parens_level = 0;
      $notgrp_levels = array();
      $parsed_query_new = '';
      $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
      foreach ($tokens as $token) {
          if ($token === 'NOT(') {
              $notgrp_levels[] = ++$parens_level;
          } elseif ($token === '(') {
              ++$parens_level;
          } elseif ($token === ')') {
              if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels);
          } elseif (count($notgrp_levels) % 2 === 1) {
              // turn highlight-flag off if terms are logically in "NOT" group
              $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token);
          }
          $parsed_query_new .= $token;
      }
      $parsed_query = $parsed_query_new;

      /**
       * convert infix notation string into postfix (Reverse Polish notation) array
       * by Shunting-yard algorithm
       *
       * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation
       * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm
       */
      $parsed_ary = array();
      $ope_stack = array();
      $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5);
      $ope_regex = '/([()]|OR|AND|NOT)/u';

      $tokens = preg_split($ope_regex, $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
      foreach ($tokens as $token) {
          if (preg_match($ope_regex, $token)) {
              // operator
              $last_ope = end($ope_stack);
              while ($last_ope !== false && $ope_precedence[$token] <= $ope_precedence[$last_ope] && $last_ope != '(') {
                  $parsed_ary[] = array_pop($ope_stack);
                  $last_ope = end($ope_stack);
              }
              if ($token == ')') {
                  array_pop($ope_stack); // this array_pop always deletes '('
              } else {
                  $ope_stack[] = $token;
              }
          } else {
              // operand: restore parentheses that ft_termParser() encoded as OP/CP
              $token_decoded = str_replace(array('OP', 'CP'), array('(', ')'), $token);
              $parsed_ary[] = $token_decoded;
          }
      }
      $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack)));

      // cleanup: each double "NOT" in RPN array actually does nothing
      // Adjacent NOTs cancel in pairs. Collapsing them while copying avoids
      // looking at an already removed neighbour when three or more NOTs follow
      // each other (e.g. for a query like -(-(-word))).
      $collapsed = array();
      foreach ($parsed_ary as $token) {
          if ($token === 'NOT' && end($collapsed) === 'NOT') {
              array_pop($collapsed);
          } else {
              $collapsed[] = $token;
          }
      }
      $parsed_ary = array_values($collapsed);

      // build return value
      $q = array();
      $q['query'] = $query;
      $q['parsed_str'] = $parsed_query;
      $q['parsed_ary'] = $parsed_ary;

      foreach ($q['parsed_ary'] as $token) {
          // only operand tokens carry a three character prefix ending in ':'
          if (strlen($token) < 3 || $token[2] !== ':') continue;
          $body = substr($token, 3);

          switch (substr($token, 0, 3)) {
              case 'N+:':
                  $q['ns'][] = $body; // for backward compatibility
                  break;
              case 'N-:':
                  $q['notns'][] = $body; // for backward compatibility
                  break;
              case 'W_:':
                  $q['words'][] = $body;
                  break;
              case 'W-:':
                  $q['words'][] = $body;
                  $q['not'][] = $body; // for backward compatibility
                  break;
              case 'W+:':
                  $q['words'][] = $body;
                  $q['highlight'][] = $body;
                  $q['and'][] = $body; // for backward compatibility
                  break;
              case 'P-:':
                  $q['phrases'][] = $body;
                  break;
              case 'P+:':
                  $q['phrases'][] = $body;
                  $q['highlight'][] = $body;
                  break;
          }
      }
      // make sure every list exists and holds each entry only once
      foreach (array('words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not') as $key) {
          $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key]));
      }

      return $q;
  }
  /**
   * Transforms given search term into intermediate representation
   *
   * This function is used in ft_queryParser() and not for general purpose use.
   *
   * With $consider_asian the term is split by Asian::splitAsianWords() and every
   * piece is parsed on its own; once a piece is recognised as Asian words, phrase
   * mode stays switched on for the remaining pieces as well.
   *
   * Otherwise the term is tokenized by the indexer and turned into one of:
   *  - '()'                          no token at all
   *  - '(W+:word)'                   the term is a single token
   *  - '((W_:a)(W_:b)(P+:term))'     phrase mode, parentheses in the term encoded as OP/CP
   *  - '((W+:a)(W+:b))'              several tokens, no phrase mode
   *
   * @author Kazutaka Miyasaka <kazmiya@gmail.com>
   *
   * @param dokuwiki\Search\Indexer $Indexer
   * @param string                  $term
   * @param bool                    $consider_asian
   * @param bool                    $phrase_mode
   * @return string
   */
  function ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) {
      if ($consider_asian) {
          // successive asian characters need to be searched as a phrase
          $result = '';
          foreach (\dokuwiki\Utf8\Asian::splitAsianWords($term) as $piece) {
              if (!$phrase_mode) {
                  $phrase_mode = \dokuwiki\Utf8\Asian::isAsianWords($piece);
              }
              $result .= ft_termParser($Indexer, $piece, false, $phrase_mode);
          }
          return $result;
      }

      // parentheses are not part of any word
      $tokens = $Indexer->tokenizer(str_replace(['(', ')'], ' ', $term), true);

      if (empty($tokens)) {
          return '()'; // important: do not remove
      }
      if ($tokens[0] === $term) {
          return '(W+:' . $tokens[0] . ')';
      }
      if ($phrase_mode) {
          // W_: no need to highlight, the phrase itself is highlighted
          $encoded = str_replace(['(', ')'], ['OP', 'CP'], $term);
          return '((W_:' . implode(')(W_:', $tokens) . ')(P+:' . $encoded . '))';
      }

      return '((W+:' . implode(')(W+:', $tokens) . '))';
  }
  /**
   * Recreate a search query string based on parsed parts, doesn't support negated phrases and `OR` searches
   *
   * The parts are emitted in a fixed order: included words, excluded words
   * ('-' prefix), phrases (in double quotes), included namespaces ('@' prefix)
   * and excluded namespaces ('^' prefix). Every part after the included words is
   * preceded by a single space, so the result starts with a space when $and is
   * empty but other parts are given.
   *
   * @param array $and
   * @param array $not
   * @param array $phrases
   * @param array $ns
   * @param array $notns
   *
   * @return string
   */
  function ft_queryUnparser_simple(array $and, array $not, array $phrases, array $ns, array $notns) {
      $segments = [implode(' ', $and)];

      foreach ($not as $word) {
          $segments[] = ' -' . $word;
      }
      foreach ($phrases as $phrase) {
          $segments[] = ' "' . $phrase . '"';
      }
      foreach ($ns as $namespace) {
          $segments[] = ' @' . $namespace;
      }
      foreach ($notns as $namespace) {
          $segments[] = ' ^' . $namespace;
      }

      return implode('', $segments);
  }
  869. //Setup VIM: ex: et ts=4 :