<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author  Andreas Gohr <andi@splitbrain.org>
 * @author  Tom N Harris <tnharris@whoopdedo.org>
 */
- use dokuwiki\Extension\Event;
- use dokuwiki\Search\Indexer;
// Bump this tag to force a rebuild of the index after an upgrade
define('INDEXER_VERSION', 8);

// Minimum token length stored in the index (numeric tokens are exempt).
// The value is only set here when it has not been defined beforehand.
if (!defined('IDX_MINWORDLENGTH')) {
    define('IDX_MINWORDLENGTH', 2);
}
/**
 * Build the version tag of the indexer, including plugin contributions.
 *
 * Index data is only compatible with data written by the same version.
 * The tag is computed on the first call and kept in a static variable.
 *
 * @triggers INDEXER_VERSION_GET
 *           Plugins that modify what gets indexed should hook this event and
 *           add their version info to the event data like so:
 *               $data[$plugin_name] = $plugin_version;
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 *
 * @return int|string INDEXER_VERSION, followed by "+name=version" for each
 *                    plugin entry (sorted by plugin name)
 */
function idx_get_version()
{
    static $cached = null;
    if ($cached != null) {
        return $cached;
    }

    $tag = INDEXER_VERSION;

    // the core version is handed to the plugins for their convenience only
    $data = array('dokuwiki' => $tag);
    Event::createAndTrigger('INDEXER_VERSION_GET', $data, null, false);

    // the core version always leads the tag, so drop it before sorting
    unset($data['dokuwiki']);
    ksort($data);

    foreach ($data as $plugin => $pluginVersion) {
        $tag .= '+' . $plugin . '=' . $pluginVersion;
    }

    $cached = $tag;
    return $cached;
}
/**
 * Compute the index "length" of a word.
 *
 * Starts from the byte length and adds a bonus for every byte in the range
 * 0xE2-0xEF (lead bytes of three-byte UTF-8 sequences). Without this faked
 * length all chinese "words" would end up in w3.idx.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $w the word to measure
 * @return int byte length plus (lead byte - 0xE1) for each matching byte
 */
function wordlen($w)
{
    $length = strlen($w);

    // no lead bytes in the word: the plain byte length is the answer
    if (!preg_match_all('/[\xE2-\xEF]/', $w, $matches)) {
        return $length;
    }

    foreach ($matches[0] as $leadByte) {
        $length += ord($leadByte) - 0xE1;
    }
    return $length;
}
/**
 * Get the shared indexer instance.
 *
 * The Indexer object is created on the first call and reused afterwards.
 *
 * @return Indexer an Indexer
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_indexer()
{
    static $instance = null;
    if ($instance === null) {
        $instance = new Indexer();
    }
    return $instance;
}
/**
 * Returns words that will be ignored.
 *
 * The list is read once from inc/lang/<lang>/stopwords.txt below DOKU_INC
 * (one word per line) and kept in a static variable. A missing or unreadable
 * file results in an empty list, so the return value is always an array.
 *
 * @return array list of stop words (returned by reference)
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords()
{
    static $stopwords = null;
    if ($stopwords !== null) {
        return $stopwords;
    }

    global $conf;
    $stopwords = array();
    $swfile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';
    if (is_file($swfile)) {
        $lines = file($swfile, FILE_IGNORE_NEW_LINES);
        // file() returns false when reading fails; never cache that as the list
        if ($lines !== false) {
            $stopwords = $lines;
        }
    }
    return $stopwords;
}
/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
 *
 * Steps:
 *  - a page that no longer exists is removed from the index if its
 *    '.indexed' tag file is still present
 *  - unless $force is set, nothing happens when the tag file carries the
 *    current indexer version and is newer than the page file
 *  - a page whose 'internal index' metadata is false is removed from the index
 *  - otherwise title, references and media metadata are collected, the
 *    INDEXER_PAGE_ADD event is triggered and words and metadata keys are
 *    handed to the indexer; on success the tag file is (re)written
 *
 * @triggers INDEXER_PAGE_ADD
 *           Event data holds the keys 'page', 'body', 'metadata' and 'pid'.
 *           The default action appends the raw wiki text to 'body'.
 *
 * @param string  $page    name of the page to index
 * @param boolean $verbose print status messages
 * @param boolean $force   force reindexing even when the index is up to date
 * @return string|boolean false when nothing was done, the index was locked or
 *                        a step failed; otherwise the result reported by the
 *                        indexer. In verbose mode the function returns true
 *                        whenever it reaches its end, regardless of that result.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page, $verbose=false, $force=false)
{
    $idxtag = metaFN($page, '.indexed');

    // check if page was deleted but is still in the index
    if (!page_exists($page)) {
        if (!file_exists($idxtag)) {
            if ($verbose) print("Indexer: $page does not exist, ignoring".DOKU_LF);
            return false;
        }
        $Indexer = idx_get_indexer();
        $result = $Indexer->deletePage($page);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
        @unlink($idxtag);
        return $result;
    }

    // check if indexing needed: same indexer version and tag newer than the page
    if (!$force && file_exists($idxtag)) {
        if (trim(io_readFile($idxtag)) == idx_get_version()) {
            $last = @filemtime($idxtag);
            if ($last > @filemtime(wikiFN($page))) {
                if ($verbose) print("Indexer: index for $page up to date".DOKU_LF);
                return false;
            }
        }
    }

    // indexing was switched off for this page: make sure it is not in the index
    $indexenabled = p_get_metadata($page, 'internal index', METADATA_RENDER_UNLIMITED);
    if ($indexenabled === false) {
        $result = false;
        if (file_exists($idxtag)) {
            $Indexer = idx_get_indexer();
            $result = $Indexer->deletePage($page);
            if ($result === "locked") {
                if ($verbose) print("Indexer: locked".DOKU_LF);
                return false;
            }
            @unlink($idxtag);
        }
        if ($verbose) print("Indexer: index disabled for $page".DOKU_LF);
        return $result;
    }

    $Indexer = idx_get_indexer();
    $pid = $Indexer->getPID($page);
    if ($pid === false) {
        if ($verbose) print("Indexer: getting the PID failed for $page".DOKU_LF);
        return false;
    }

    $body = '';
    $metadata = array();
    $metadata['title'] = p_get_metadata($page, 'title', METADATA_RENDER_UNLIMITED);

    // only array metadata can provide keys; anything else counts as "none"
    // (array_keys() on a non-array value is a TypeError on PHP 8)
    $references = p_get_metadata($page, 'relation references', METADATA_RENDER_UNLIMITED);
    $metadata['relation_references'] = is_array($references) ? array_keys($references) : array();
    $media = p_get_metadata($page, 'relation media', METADATA_RENDER_UNLIMITED);
    $metadata['relation_media'] = is_array($media) ? array_keys($media) : array();

    $data = array(
        'page'     => $page,
        'body'     => $body,
        'metadata' => $metadata,
        'pid'      => $pid,
    );
    $evt = new Event('INDEXER_PAGE_ADD', $data);
    if ($evt->advise_before()) $data['body'] = $data['body'] . " " . rawWiki($page);
    $evt->advise_after();
    unset($evt);

    // take over what the event handlers changed, but only the values that are
    // used below, so that handlers cannot overwrite unrelated local variables
    if (array_key_exists('page', $data)) $page = $data['page'];
    if (array_key_exists('body', $data)) $body = $data['body'];
    if (array_key_exists('metadata', $data)) $metadata = $data['metadata'];

    $result = $Indexer->addPageWords($page, $body);
    if ($result === "locked") {
        if ($verbose) print("Indexer: locked".DOKU_LF);
        return false;
    }

    if ($result) {
        $result = $Indexer->addMetaKeys($page, $metadata);
        if ($result === "locked") {
            if ($verbose) print("Indexer: locked".DOKU_LF);
            return false;
        }
    }

    // the tag file marks the page as indexed with the current indexer version
    if ($result) {
        io_saveFile(metaFN($page, '.indexed'), idx_get_version());
    }

    if ($verbose) {
        print("Indexer: finished".DOKU_LF);
        // NOTE(review): verbose mode reports true even when $result is false;
        // kept as is because callers may rely on it - confirm before changing
        return true;
    }
    return $result;
}
/**
 * Find tokens in the fulltext index
 *
 * Hands the given words to the indexer's lookup() and returns its result:
 * a list of matching pages for each word.
 *
 * Important: No ACL checking is done here! All results are
 * returned, regardless of permissions
 *
 * @param array $words list of words to search for (passed by reference)
 * @return array list of pages found, associated with the search terms
 */
function idx_lookup(&$words)
{
    return idx_get_indexer()->lookup($words);
}
/**
 * Split a string into tokens
 *
 * Thin wrapper around the tokenizer() method of the shared indexer.
 *
 * @param string $string the text to split
 * @param bool   $wc     passed through unchanged to Indexer::tokenizer()
 *
 * @return array
 */
function idx_tokenizer($string, $wc=false)
{
    return idx_get_indexer()->tokenizer($string, $wc);
}
- /* For compatibility */
/**
 * Read the list of words in an index (if it exists).
 *
 * The index file is <indexdir>/<idx><suffix>.idx. Lines are returned as read,
 * including their line endings. A missing or unreadable file yields an empty
 * array, so the return value is always an array.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $idx    base name of the index
 * @param string $suffix appended to the base name in front of '.idx'
 * @return array lines of the index file
 */
function idx_getIndex($idx, $suffix)
{
    global $conf;
    $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
    if (!is_file($fn)) {
        return array();
    }
    $lines = file($fn);
    // file() returns false when reading fails; callers expect an array
    return ($lines === false) ? array() : $lines;
}
/**
 * Get the list of lengths indexed in the wiki.
 *
 * Returns a sorted array of the numbers N for which a file iN.idx exists in
 * the index directory. When $conf['readdircache'] is non-zero the result is
 * cached in lengths.idx inside the index directory and reused as long as that
 * file is younger than $conf['readdircache'] seconds.
 *
 * Writing the cache is best effort: when the cache file cannot be opened the
 * freshly scanned list is still returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array sorted list of integer word lengths; empty when the index
 *               directory cannot be opened
 */
function idx_listIndexLengths()
{
    global $conf;

    $cachefile = $conf['indexdir'] . '/lengths.idx';
    $docache = ($conf['readdircache'] != 0);

    // serve from the cache file while it is still fresh
    if ($docache) {
        clearstatcache();
        if (is_file($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false) {
                $idx = array();
                foreach ($lengths as $length) {
                    $idx[] = (int)$length;
                }
                return $idx;
            }
        }
    }

    // scan the index directory for files named i<number>.idx
    $dir = @opendir($conf['indexdir']);
    if ($dir === false) {
        return array();
    }
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i)) {
                $idx[] = (int)$i;
            }
        }
    }
    closedir($dir);
    sort($idx);

    // save this in a file; a handle of false must not reach fwrite()/fclose(),
    // which throw a TypeError on PHP 8 that "@" cannot suppress
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        if ($handle !== false) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }

    return $idx;
}
/**
 * Get the word lengths that have been indexed.
 *
 * With an array filter, the array KEYS are taken as lengths and only those
 * with an existing i<length>.idx file in the index directory are returned.
 * With any other filter, all indexed lengths (see idx_listIndexLengths())
 * that are greater than or equal to the filter value are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter lengths as array keys, or a minimum length
 * @return array list of matching lengths
 */
function idx_indexLengths($filter)
{
    global $conf;

    $found = array();

    // scalar filter: keep all the lengths that are equal or superior
    if (!is_array($filter)) {
        foreach (idx_listIndexLengths() as $length) {
            if ((int)$length < (int)$filter) {
                continue;
            }
            $found[] = $length;
        }
        return $found;
    }

    // array filter: only test whether the index files exist
    $prefix = $conf['indexdir'] . "/i";
    foreach (array_keys($filter) as $length) {
        if (file_exists($prefix . $length . '.idx')) {
            $found[] = $length;
        }
    }
    return $found;
}
/**
 * Clean a name of a key for use as a file name.
 *
 * The trimmed name is romanized, runs of separator characters are collapsed
 * into a single underscore, everything that is not a letter, number, or
 * underscore is stripped and the result is lowercased.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $name
 * @return string
 */
function idx_cleanName($name)
{
    $romanized = \dokuwiki\Utf8\Clean::romanize(trim((string)$name));

    // the two replacements are applied one after the other, in this order
    $cleaned = preg_replace(
        array('#[ \./\\:-]+#', '/[^A-Za-z0-9_]/'),
        array('_', ''),
        $romanized
    );

    return strtolower($cleaned);
}
//Setup VIM: ex: et ts=4 :