Documentation TYPO3 par Ameos |
00001 <?php 00002 /*************************************************************** 00003 * Copyright notice 00004 * 00005 * (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com) 00006 * All rights reserved 00007 * 00008 * This script is part of the TYPO3 project. The TYPO3 project is 00009 * free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 2 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * The GNU General Public License can be found at 00015 * http://www.gnu.org/copyleft/gpl.html. 00016 * A copy is found in the textfile GPL.txt and important notices to the license 00017 * from the author is found in LICENSE.txt distributed with these scripts. 00018 * 00019 * 00020 * This script is distributed in the hope that it will be useful, 00021 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00022 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00023 * GNU General Public License for more details. 00024 * 00025 * This copyright notice MUST APPEAR in all copies of the script! 00026 ***************************************************************/ 00131 require_once(PATH_t3lib.'class.t3lib_parsehtml.php'); 00132 00133 00141 class tx_indexedsearch_indexer { 00142 00143 // Messages: 00144 var $reasons = array( 00145 -1 => 'mtime matched the document, so no changes detected and no content updated', 00146 -2 => 'The minimum age was not exceeded', 00147 1 => "The configured max-age was exceeded for the document and thus it's indexed.", 00148 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.', 00149 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.', 00150 4 => 'Page has never been indexed (is not represented in the index_phash table).' 
00151 ); 00152 00153 // HTML code blocks to exclude from indexing: 00154 var $excludeSections = 'script,style'; 00155 00156 // Supported Extensions for external files: 00157 var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods. 00158 00159 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!) 00160 var $defaultGrList = '0,-1'; 00161 00162 // Min/Max times: 00163 var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded. 00164 var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime. 00165 var $maxExternalFiles = 0; // Max number of external files to index. 00166 00167 var $forceIndexing = FALSE; // If true, indexing is forced despite of hashes etc. 
var $crawlerActive = FALSE;		// Set when crawler is detected (internal)

// INTERNALS:
var $defaultContentArray = array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => '',
);
var $wordcount = 0;
var $externalFileCounter = 0;

var $conf = array();				// Configuration set internally (see init functions for required keys and their meaning)
var $indexerConfig = array();		// Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
var $hash = array();				// Hash array, contains phash and phash_grouping
var $file_phash_arr = array();		// Hash array for files
var $contentParts = array();		// Content of TYPO3 page
var $content_md5h = '';
var $internal_log = array();		// Internal log
var $indexExternalUrl_content = '';

var $cHashParams = array();			// cHashparams array

var $freqRange = 32000;
var $freqMax = 0.1;

var $flagBitMask;					// Flag bit mask (0-255), read from the extension configuration in init()

// Objects:
var $csObj;			// Charset class object , t3lib_cs
var $metaphoneObj;	// Metaphone object, if any
var $lexerObj;		// Lexer object for word splitting
var $htmlParser;	// t3lib_parseHtml object, created on first use in extractHyperLinks()



/**
 * Frontend hook: decides whether the page currently being rendered should be indexed and,
 * if so, copies the relevant page data into $this->conf, initializes the indexer and
 * indexes the page content.
 *
 * When the "crawler" extension is loaded, a crawler session is running and it requested the
 * "tx_indexedsearch_reindex" processing instruction, indexing is forced.
 *
 * @param	object		Parent object (presumably the TSFE frontend object - confirm against the hook registration), passed by reference
 * @return	void
 */
function hook_indexContent(&$pObj) {

		// Indexer configuration from Extension Manager interface:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($indexerConfig)) {
			// unserialize() yields FALSE when the extension has not been configured yet.
		$indexerConfig = array();
	}

		// Crawler activation:
		// Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction:
	if (t3lib_extMgm::isLoaded('crawler') && $pObj->applicationData['tx_crawler']['running']) {
		$procInstructions = $pObj->applicationData['tx_crawler']['parameters']['procInstructions'];

			// in_array() must only be called on an array; a crawler run without instructions leaves this unset.
		if (is_array($procInstructions) && in_array('tx_indexedsearch_reindex', $procInstructions)) {

				// Setting simple log message:
			$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

				// Setting variables:
			$this->crawlerActive = TRUE;	// Crawler active flag
			$this->forceIndexing = TRUE;	// Force indexing despite timestamps etc.
		}
	}

		// Nothing to do (and nothing to log) unless indexing is enabled for the page:
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

		// The conditions are checked in the same order as before; the first one that fails is logged.
	if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {

			// Setting up internal configuration from config array:
		$this->conf = array();

			// Information about page for which the indexing takes place
		$this->conf['id'] = $pObj->id;								// Page id
		$this->conf['type'] = $pObj->type;							// Page type
		$this->conf['sys_language_uid'] = $pObj->sys_language_uid;	// sys_language UID of the language of the indexing.
		$this->conf['MP'] = $pObj->MP;								// MP variable, if any (Mount Points)
		$this->conf['gr_list'] = $pObj->gr_list;					// Group list

		$this->conf['cHash'] = $pObj->cHash;						// cHash string for additional parameters
		$this->conf['cHash_array'] = $pObj->cHash_array;			// Array of the additional parameters

		$this->conf['crdate'] = $pObj->page['crdate'];				// The creation date of the TYPO3 page
		$this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;	// reg1 of the caching table. Not known what practical use this has.

			// Root line uids
		$this->conf['rootline_uids'] = array();
		if (is_array($pObj->config['rootLine'])) {
			foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
				$this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
			}
		}

			// Content of page:
		$this->conf['content'] = $pObj->content;											// Content string (HTML of TYPO3 page)
		$this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);	// Alternative title for indexing
		$this->conf['metaCharset'] = $pObj->metaCharset;									// Character set of content (will be converted to utf-8 during indexing)
		$this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];							// Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

			// Configuration of behavior:
		$this->conf['index_externals'] = $pObj->config['config']['index_externals'];	// Whether to index external documents like PDF, DOC etc. (if possible)
		$this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];		// Length of description text (max 250, default 200)

			// Set to zero:
		$this->conf['recordUid'] = 0;
		$this->conf['freeIndexUid'] = 0;
		$this->conf['freeIndexSetId'] = 0;

			// Init and start indexing:
		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}








/****************************
 *
 * Backend API
 *
 ****************************/

/**
 * Initializes the indexer for use outside frontend rendering: fills $this->conf from the
 * arguments (instead of from a frontend object) and calls init().
 *
 * @param	integer		Page id
 * @param	integer		Page type
 * @param	integer		sys_language uid of the language of the indexing
 * @param	string		MP variable, if any (Mount Points)
 * @param	array		Root line uids, stored as-is in $this->conf['rootline_uids']
 * @param	array		Additional parameters the cHash is based on
 * @param	boolean		If set, the cHash string is calculated from $cHash_array with makeCHash(); otherwise it is left blank
 * @return	void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

		// Setting up internal configuration from config array:
	$this->conf = array();

		// Information about page for which the indexing takes place
	$this->conf['id'] = $id;								// Page id (integer)
	$this->conf['type'] = $type;							// Page type (integer)
	$this->conf['sys_language_uid'] = $sys_language_uid;	// sys_language UID of the language of the indexing (integer)
	$this->conf['MP'] = $MP;								// MP variable, if any (Mount Points) (string)
	$this->conf['gr_list'] = '0,-1';						// Group list (hardcoded for now...)

		// cHash values:
	$this->conf['cHash'] = $createCHash ? $this->makeCHash($cHash_array) : '';	// cHash string for additional parameters
	$this->conf['cHash_array'] = $cHash_array;									// Array of the additional parameters

		// Set to defaults
	$this->conf['freeIndexUid'] = 0;
	$this->conf['freeIndexSetId'] = 0;
	$this->conf['page_cache_reg1'] = '';

		// Root line uids
	$this->conf['rootline_uids'] = $uidRL;

		// Configuration of behavior:
	$this->conf['index_externals'] = 1;		// Whether to index external documents like PDF, DOC etc. (if possible)
	$this->conf['index_descrLgd'] = 200;	// Length of description text (max 250, default 200)

		// Init and start indexing:
	$this->init();
}

/**
 * Sets the "free index" uid and set id in the internal configuration.
 * Only touches $this->conf; it does not re-run init().
 *
 * @param	integer		Free index uid
 * @param	integer		Free index set id (defaults to 0)
 * @return	void
 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	$this->conf['freeIndexUid'] = $freeIndexUid;
	$this->conf['freeIndexSetId'] = $freeIndexSetId;
}

/**
 * Indexes arbitrary content as if it was a TYPO3 page: wraps the given values in a minimal
 * HTML document (all values are passed through htmlspecialchars()) and hands it to
 * indexTypo3PageContent(). $this->conf must have been set up before, e.g. with
 * backend_initIndexer().
 *
 * @param	string		Title, placed in <title>
 * @param	string		Keywords, placed in the "keywords" meta tag
 * @param	string		Description, placed in the "description" meta tag
 * @param	string		Content, placed in <body>
 * @param	string		Character set of the values (will be converted to utf-8 during indexing)
 * @param	integer		Most recent modification time (seconds) of the content
 * @param	integer		The creation date of the content
 * @param	integer		UID of the record, if applicable
 * @return	void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

		// Content of page:
	$this->conf['mtime'] = $mtime;			// Most recent modification time (seconds) of the content
	$this->conf['crdate'] = $crdate;		// The creation date of the TYPO3 content
	$this->conf['recordUid'] = $recordUid;	// UID of the record, if applicable

		// Construct fake HTML for parsing:
	$this->conf['content'] = '
	<html>
		<head>
			<title>'.htmlspecialchars($title).'</title>
			<meta name="keywords" content="'.htmlspecialchars($keywords).'" />
			<meta name="description" content="'.htmlspecialchars($description).'" />
		</head>
		<body>
			'.htmlspecialchars($content).'
		</body>
	</html>';	// Content string (HTML of TYPO3 page)

		// Initializing charset:
	$this->conf['metaCharset'] = $charset;	// Character set of content (will be converted to utf-8 during indexing)
	$this->conf['indexedDocTitle'] = '';	// Alternative title for indexing

		// Index content as if it was a TYPO3 page:
	$this->indexTypo3PageContent();
}













/********************************
 *
 * Initialization
 *
 *******************************/

/**
 * Initializes the indexer from $this->conf: prepares the cHash parameters, calculates the
 * page hashes, reads the extension configuration and creates the lexer, the optional
 * metaphone object, the external parsers (if enabled) and the charset object.
 *
 * @return	void
 */
function init() {
	global $TYPO3_CONF_VARS;

		// Initializing:
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];	// Add this so that URL's come out right...
		}
		unset($this->cHashParams['encryptionKey']);		// encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
	}

		// Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
	$this->setT3Hashes();

		// Indexer configuration from Extension Manager interface:
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	if (!is_array($this->indexerConfig)) {
			// Keep the documented array type even if the extension was never configured (unserialize() returned FALSE).
		$this->indexerConfig = array();
	}
	$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

		// Initialize external document parsers:
		// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

		// Initialize lexer (class that deconstructs the text into words):
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] ?
					$TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] :
					'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
	if (is_object($this->lexerObj)) {
		$this->lexerObj->debug = $this->indexerConfig['debugMode'];
	}

		// Initialize metaphone hook:
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
		if (is_object($this->metaphoneObj)) {
			$this->metaphoneObj->pObj = &$this;
		}
	}

		// Init charset class:
	$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}

/**
 * Creates the external document parsers configured in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] (file extension => object
 * reference) and stores them in $this->external_parsers. A parser is dropped again if it
 * cannot be instantiated or if its initParser() method returns false.
 *
 * @return	void
 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	if (!is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		return;
	}

	foreach ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
		$this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);

			// A broken object reference must not stop indexing with a fatal error; just skip that parser.
		if (!is_object($this->external_parsers[$extension])) {
			unset($this->external_parsers[$extension]);
			continue;
		}

		$this->external_parsers[$extension]->pObj = &$this;

			// Init parser and if it returns false, unset its entry again:
		if (!$this->external_parsers[$extension]->initParser($extension)) {
			unset($this->external_parsers[$extension]);
		}
	}
}















/********************************
 *
 * Indexing; TYPO3 pages (HTML content)
 *
 *******************************/

/**
 * Indexes the page content found in $this->conf['content'], if needed.
 * Indexing takes place when the timestamp check asks for it, when no gr_list entry exists
 * for the page hash, or when indexing is forced. If the content hash turns out to be
 * unchanged, only timestamp, set id, gr_list and rootline are updated.
 *
 * @return	void
 */
function indexTypo3PageContent() {

	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

	if (!($check > 0 || !$is_grlist || $this->forceIndexing)) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		return;
	}

		// Setting message:
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
	}

		// Divide into title,keywords,description and body:
	$this->log_push('Split content','');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

		// Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
		// (Separator first: the reversed argument order of implode() is no longer accepted by PHP 8.)
	$this->content_md5h = $this->md5inthash(implode('',$this->contentParts));

		// This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
		// If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
		// This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
	$checkCHash = $this->checkContentHash();
	if (!is_array($checkCHash) || $check===1) {
		$Pstart = t3lib_div::milliseconds();

		$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
		$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
		$this->log_pull();

			// Splitting words
		$this->log_push('Extract words from content','');
		$splitInWords = $this->processWordsInArrays($this->contentParts);
		$this->log_pull();

			// Analyse the indexed words.
		$this->log_push('Analyse the extracted words','');
		$indexArr = $this->indexAnalyze($splitInWords);
		$this->log_pull();

			// Submitting page (phash) record
		$this->log_push('Submitting page','');
		$this->submitPage();
		$this->log_pull();

			// Check words and submit to word list if not there
		$this->log_push('Check word list and submit words','');
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr,$this->hash['phash']);
		$this->log_pull();

			// Set parsetime
		$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

			// Checking external files if configured for.
		$this->log_push('Checking external files','');
		if ($this->conf['index_externals']) {
			$this->extractLinks($this->conf['content']);
		}
		$this->log_pull();
	} else {
		$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);		// Update the timestamp
		$this->updateSetId($this->hash['phash']);
		$this->update_grlist($checkCHash['phash'],$this->hash['phash']);	// $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
	}
}

/**
 * Splits HTML content into title, keywords, description and body.
 * The body is reduced to plain text: TYPO3SEARCH markers are processed, the sections listed
 * in $this->excludeSections are removed and all tags are stripped.
 *
 * @param	string		HTML content to index. To some degree expected to be made by TYPO3 (ei. splitting the header by ":")
 * @return	array		Array with the keys of $this->defaultContentArray ("title", "description", "keywords", "body")
 */
function splitHTMLContent($content) {

		// divide head from body ( u-ouh :) )
	$contentArr = $this->defaultContentArray;
	$bodyPart = stristr($content,'<body');
	if ($bodyPart !== FALSE) {
		$contentArr['body'] = $bodyPart;
		$headPart = substr($content,0,strlen($content)-strlen($bodyPart));
	} else {
			// No <body> tag: the former substr($content,0,-0) returned an empty string and thereby dropped
			// title and meta tags. The whole document is now searched for them; the body stays empty as before.
		$contentArr['body'] = '';
		$headPart = $content;
	}

		// get title
	$dummy = '';
	$dummy2 = '';
	$this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
	$titleParts = explode(':',$contentArr['title'],2);
	$contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

		// get keywords and description metatags
		// Each successful call stores the attribute string of one meta tag in $meta[$i] and cuts $headPart down to what follows that tag.
	$meta = array();
	for ($i=0; $this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]); $i++) { /*nothing*/ }
	for ($i=0; isset($meta[$i]); $i++) {
		$meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
		$metaName = isset($meta[$i]['name']) ? $meta[$i]['name'] : '';
		$metaContent = isset($meta[$i]['content']) ? $meta[$i]['content'] : '';
		if (stristr($metaName,'keywords')) {
			$contentArr['keywords'].= ','.$metaContent;
		}
		if (stristr($metaName,'description')) {
			$contentArr['description'].= ','.$metaContent;
		}
	}

		// Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
	$this->typoSearchTags($contentArr['body']);

		// Get rid of unwanted sections (ie. scripting and style stuff) in body
	$tagList = explode(',',$this->excludeSections);
	foreach ($tagList as $tag) {
		while ($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
	}

		// remove tags, but first make sure we don't concatenate words by doing it
	$contentArr['body'] = str_replace('<',' <',$contentArr['body']);
	$contentArr['body'] = trim(strip_tags($contentArr['body']));

	$contentArr['keywords'] = trim($contentArr['keywords']);
	$contentArr['description'] = trim($contentArr['description']);

		// Return array
	return $contentArr;
}

/**
 * Extracts the charset from the "Content-Type" http-equiv meta tag of an HTML document.
 *
 * @param	string		HTML content
 * @return	string		The charset value, e.g. "iso-8859-1"; NULL if no such meta tag or no charset was found
 */
function getHTMLcharset($content) {
		// PCRE replaces the former eregi() calls (the ereg extension no longer exists in PHP 7+);
		// the POSIX character classes are understood by PCRE as well.
	if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg)) {
		if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2)) {
			return $reg2[1];
		}
	}

	return NULL;
}

/**
 * Converts an HTML document to utf-8, including its entities.
 *
 * @param	string		HTML content, any charset
 * @param	string		Charset of the content; if blank, it is looked up in the document with getHTMLcharset()
 * @return	string		Converted HTML
 */
function convertHTMLToUtf8($content,$charset='') {

		// Find charset:
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

		// Convert charset:
	if ($charset && $charset!=='utf-8') {
		$content = $this->csObj->utf8_encode($content, $charset);
	}
		// Convert entities, assuming document is now UTF-8:
	$content = $this->csObj->entities_to_utf8($content, TRUE);

	return $content;
}

/**
 * Finds the first occurrence of a tag in a string (case-insensitive) and returns its
 * content, its attribute string and the input string with the tag removed.
 * If the tag has no end tag, the tag content is blank and $stringAfter is everything that
 * follows the start tag.
 *
 * @param	string		String to search in
 * @param	string		Tag name, eg. "script"
 * @param	string		Passed by reference: Content inside found tag
 * @param	string		Passed by reference: Content after found tag
 * @param	string		Passed by reference: Attributes of the found tag.
 * @return	boolean		Returns false if tag was not found, otherwise true.
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	$endTag = '</'.$tagName.'>';
	$startTag = '<'.$tagName;

	$fromTag = stristr($string,$startTag);		// stristr used because we want a case-insensitive search for the tag.
	if (!$fromTag) {
		return false;	// if the tag was not found, return false
	}

		// Split "attributes>rest"; a start tag that is never closed by ">" leaves the rest empty.
	$tagParts = explode('>',substr($fromTag,strlen($startTag)),2);
	$paramList = $tagParts[0];
	$isTagInText = isset($tagParts[1]) ? $tagParts[1] : '';

	$afterTagInText = stristr($isTagInText,$endTag);
	if ($afterTagInText) {
			// stristr() returned the part of $string starting at the tag, so everything in front of it is the "before" part.
		$stringBefore = substr($string, 0, strlen($string)-strlen($fromTag));
		$tagContent = substr($isTagInText,0,strlen($isTagInText)-strlen($afterTagInText));
		$stringAfter = $stringBefore.substr($afterTagInText,strlen($endTag));
	} else {	// If there was no ending tag, the tagContent is blank and anything after the tag it self is returned.
		$tagContent = '';
		$stringAfter = $isTagInText;
	}

	return true;
}

/**
 * Reduces the body to the parts enclosed by <!--TYPO3SEARCH_begin--> and
 * <!--TYPO3SEARCH_end--> markers, if such markers are present.
 *
 * @param	string		HTML content, passed by reference and changed in place
 * @return	boolean		Returns true if a TYPO3SEARCH_ marker was found, otherwise false.
 */
function typoSearchTags(&$body) {
	$expBody = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);

	if (count($expBody) <= 1) {
		return false;
	}

	$body = '';
	$prev = '';		// Content in front of the next "end" marker that was not opened by a "begin" marker

	foreach ($expBody as $val) {
		$part = explode('-->',$val,2);
		$marker = trim($part[0]);
		if ($marker=='begin') {
			$body.= isset($part[1]) ? $part[1] : '';
			$prev = '';
		} elseif ($marker=='end') {
			$body.= $prev;
		} else {
			$prev = $val;
		}
	}

	return true;
}

/**
 * Extracts the links of the content and indexes what they point to: external URLs (if
 * enabled in the extension configuration) and local files. If the crawler is configured for
 * external files, local files are added to the crawler queue instead of being indexed directly.
 *
 * @param	string		HTML content
 * @return	void
 */
function extractLinks($content) {

		// Get links:
	$list = $this->extractHyperLinks($content);

	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

		// Traverse links:
	foreach ($list as $linkInfo) {

			// Decode entities:
			// localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
		$hasLocalPath = $linkInfo['localPath'] ? TRUE : FALSE;
		$linkSource = t3lib_div::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

			// Parse URL:
		$qParts = parse_url($linkSource);

			// Check for jumpurl (TYPO3 specific thing...)
		if (is_array($qParts) && !empty($qParts['query']) && strstr($qParts['query'],'jumpurl=')) {
			parse_str($qParts['query'],$getP);
			$linkSource = isset($getP['jumpurl']) ? $getP['jumpurl'] : '';
			$qParts = parse_url($linkSource);	// parse again due to new linkSource!
		}

			// parse_url() returns FALSE for seriously malformed URLs; treat that like a URL without any parts.
		if (!is_array($qParts)) {
			$qParts = array();
		}

		if (!empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
					// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}
		if (!empty($qParts['query'])) {
			continue;
		}

			// Neither scheme nor query string: this may be a local file.
		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

			// Index local file:
		if (is_object($crawler)) {
			$params = array('document' => $linkSource);
			if ($hasLocalPath) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($hasLocalPath) {
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}

/**
 * Extracts all <a> tags with a href attribute from the HTML content.
 *
 * @param	string		HTML content
 * @return	array		List of links; each entry has the keys "tag" (the tag itself), "href" and "localPath" (local path registered for the href in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'], blank if none, false if nothing is registered at all)
 */
function extractHyperLinks($string) {
	if (!is_object($this->htmlParser)) {
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$parts = $this->htmlParser->splitTags('a',$string);
	$list = array();
	foreach ($parts as $k => $v) {
			// Only the odd elements of the split result are processed as tags.
		if (!($k%2)) {
			continue;
		}

		$params = $this->htmlParser->get_tag_attributes($v,1);
		$firstTagName = $this->htmlParser->getFirstTagName($v);	// The 'name' of the first tag
		if (strtolower($firstTagName) != 'a') {
			continue;
		}

		$src = isset($params[0]['href']) ? $params[0]['href'] : '';
		if (!$src) {
			continue;
		}

			// Check if a local path to that file has been set - useful if you are using a download script.
		$md5 = t3lib_div::shortMD5($src);
		if (is_array($indexLocalFiles=$GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'])) {
			$localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
		} else {
			$localPath = false;
		}

		$list[] = array(
			'tag' => $v,
			'href' => $src,
			'localPath' => $localPath
		);
	}

	return $list;
}











/******************************************
 *
 * Indexing; external URL
 *
 ******************************************/

/**
 * Indexes an external URL if its headers say that it is an HTML document.
 * The document is fetched, written to a temporary file and indexed from there with
 * indexRegularDocument(). The fetched content is also kept in $this->indexExternalUrl_content.
 *
 * @param	string		URL, eg. "http://typo3.org/"
 * @return	void
 */
function indexExternalUrl($externalUrl) {

		// Get headers:
		// Header names are case-insensitive, so the lookup must not depend on the exact spelling "Content-Type".
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	$urlHeaders = is_array($urlHeaders) ? array_change_key_case($urlHeaders, CASE_LOWER) : array();
	$contentType = isset($urlHeaders['content-type']) ? $urlHeaders['content-type'] : '';

	if (!stristr($contentType,'text/html')) {
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

		// Create temporary file:
		// NOTE(review): t3lib_div::tempnam() presumably creates the base file (as PHP's tempnam() does) - confirm.
		// The content goes into a sibling file with ".html" suffix, so both files have to be removed afterwards.
	$tmpBase = t3lib_div::tempnam('EXTERNAL_URL');
	$tmpFile = $tmpBase.'.html';
	t3lib_div::writeFile($tmpFile, $content);

		// Index that file:
	$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');		// Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)

	unlink($tmpFile);
	if ($tmpBase && @is_file($tmpBase)) {
		unlink($tmpBase);
	}
}

/**
 * Fetches the headers of a URL and returns them as an array.
 *
 * @param	string		URL
 * @return	array		Header name => header value (everything after the first ":", untrimmed). Lines without ":" (e.g. the status line) become keys with a NULL value. Nothing (NULL) is returned if no headers could be fetched.
 */
function getUrlHeaders($url) {
	$content = t3lib_div::getURL($url,2);	// Try to get the headers only

	if (!strlen($content)) {
		return;
	}

		// Compile headers:
	$headers = t3lib_div::trimExplode(chr(10),$content,1);
	$retVal = array();
	foreach ($headers as $line) {
		if (!strlen(trim($line))) {
			break;	// Stop at the first empty line (= end of header)
		}

		$lineParts = explode(':', $line, 2);
		$retVal[$lineParts[0]] = isset($lineParts[1]) ? $lineParts[1] : NULL;
	}

	return $retVal;
}













/******************************************
 *
 * Indexing; external files (PDF, DOC, etc)
 *
 ******************************************/

/**
 * Indexes a regular document (a file handled by one of the external parsers).
 * The parser for the file extension may divide the file into several parts; each part is
 * checked and indexed separately. A section record is set for every part, also if the part
 * itself is not re-indexed.
 *
 * @param	string		Relative filename (relative to PATH_site) or absolute path of the document; used for the hashes and stored as document reference
 * @param	boolean		If set, indexing is forced (despite content hashes, mtime etc).
 * @param	string		Temporary file with the content to read from instead of $file (used when $file is a URL)
 * @param	string		File extension to use instead of the one found in $file
 * @return	void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

		// Init
	$fI = pathinfo($file);
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

		// Create abs-path:
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) {		// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {	// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

		// Indexing the document:
	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);
	if (!is_array($cParts)) {
			// A parser that returns no list of parts leaves nothing to iterate over.
		$cParts = array();
	}

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);

		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

				// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

					// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
						// Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
						// (Separator first: the reversed argument order of implode() is no longer accepted by PHP 8.)
					$content_md5h = $this->md5inthash(implode('',$contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

							// Increment counter:
						$this->externalFileCounter++;

							// Splitting words
						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

							// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

							// Submitting page (phash) record
						$this->log_push('Submitting page','');
						$size = filesize($absFile);
						$ctime = filemtime($absFile);	// Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

							// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

							// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

			// Checking and setting sections:
#		$this->submitFile_grlist($phash_arr['phash']);	// Setting a gr_list record if there is none already (set for default fe_group)
		$this->submitFile_section($phash_arr['phash']);		// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->log_pull();
	}
}

/**
 * Reads the content of an external file by asking the external parser registered for the
 * file extension.
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
 * @return	mixed		Whatever the parser's readFileContent() returns (an array is expected by indexRegularDocument()); NULL if no parser object exists for the extension
 */
function readFileContent($ext,$absFile,$cPKey) {
	$contentArr = NULL;

		// Consult relevant external document parser:
	if (is_object($this->external_parsers[$ext])) {
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}

/**
 * Asks the external parser registered for the file extension into which parts the file
 * should be divided for indexing.
 *
 * @param	string		File extension
 * @param	string		Absolute filename (must exist and be validated OK before calling function)
 * @return	array		Array of pointers to sections that the document should be divided into; array(0) if no parser object exists for the extension
 */
function fileContentParts($ext,$absFile) {
	$cParts = array(0);

		// Consult relevant external document parser:
	if (is_object($this->external_parsers[$ext])) {
		$cParts = $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
	}

	return $cParts;
}

/**
 * Wraps plain content in a content array: the content becomes the "body", all other parts
 * keep the blank defaults of $this->defaultContentArray.
 *
 * @param	string		Content of the file
 * @return	array		Content array with the keys "title", "description", "keywords" and "body"
 */
function splitRegularContent($content) {
	$contentArr = $this->defaultContentArray;
	$contentArr['body'] = $content;

	return $contentArr;
}














/**********************************
 *
 * Analysing content, Extracting words
 *
 **********************************/

/**
 * Converts every non-empty part of the content array to utf-8 and decodes its entities.
 *
 * @param	array		Content array, passed by reference and changed in place
 * @param	string		Charset of the input content; the charset conversion is skipped if it is "utf-8"
 * @return	void
 */
function charsetEntity2utf8(&$contentArr, $charset) {

		// Convert charset if necessary
		// (foreach over the keys replaces the former each() loop; each() no longer exists in PHP 8.)
	foreach (array_keys($contentArr) as $key) {
		if (!strlen($contentArr[$key])) {
			continue;
		}

		if ($charset!=='utf-8') {
			$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
		}

			// decode all numeric / html-entities in the string to real characters:
		$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
	}
}

function processWordsInArrays($contentArr) {

		// split all parts to words
	reset($contentArr);
	while(list($key,)=each($contentArr)) {
01165 $contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]); 01166 } 01167 01168 // For title, keywords, and description we don't want duplicates: 01169 $contentArr['title'] = array_unique($contentArr['title']); 01170 $contentArr['keywords'] = array_unique($contentArr['keywords']); 01171 $contentArr['description'] = array_unique($contentArr['description']); 01172 01173 // Return modified array: 01174 return $contentArr; 01175 } 01176 01185 function procesWordsInArrays($contentArr) { 01186 return $this->processWordsInArrays($contentArr); 01187 } 01188 01195 function bodyDescription($contentArr) { 01196 01197 // Setting description 01198 $maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200); 01199 if ($maxL) { 01200 // Takes the quadruple lenght first, because whitespace and entities may be removed and thus shorten the string more yet. 01201 # $bodyDescription = implode(' ',split('[[:space:],]+',substr(trim($contentArr['body']),0,$maxL*4))); 01202 $bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']); 01203 01204 // Shorten the string: 01205 $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL); 01206 } 01207 01208 return $bodyDescription; 01209 } 01210 01217 function indexAnalyze($content) { 01218 $indexArr = Array(); 01219 $counter = 0; 01220 01221 $this->analyzeHeaderinfo($indexArr,$content,'title',7); 01222 $this->analyzeHeaderinfo($indexArr,$content,'keywords',6); 01223 $this->analyzeHeaderinfo($indexArr,$content,'description',5); 01224 $this->analyzeBody($indexArr,$content); 01225 01226 return ($indexArr); 01227 } 01228 01238 function analyzeHeaderinfo(&$retArr,$content,$key,$offset) { 01239 reset($content[$key]); 01240 while(list(,$val)=each($content[$key])) { 01241 $val = substr($val,0,60); // Max 60 - because the baseword varchar IS 60. This MUST be the same. 
01242 $retArr[$val]['cmp'] = $retArr[$val]['cmp']|pow(2,$offset); 01243 $retArr[$val]['count'] = $retArr[$val]['count']+1; 01244 $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7)); 01245 $retArr[$val]['metaphone'] = $this->metaphone($val); 01246 $this->wordcount++; 01247 } 01248 } 01249 01257 function analyzeBody(&$retArr,$content) { 01258 foreach($content['body'] as $key => $val) { 01259 $val = substr($val,0,60); // Max 60 - because the baseword varchar IS 60. This MUST be the same. 01260 if(!isset($retArr[$val])) { 01261 $retArr[$val]['first'] = $key; 01262 $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7)); 01263 $retArr[$val]['metaphone'] = $this->metaphone($val); 01264 } 01265 $retArr[$val]['count'] = $retArr[$val]['count']+1; 01266 $this->wordcount++; 01267 } 01268 } 01269 01277 function metaphone($word,$retRaw=FALSE) { 01278 01279 if (is_object($this->metaphoneObj)) { 01280 $tmp = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']); 01281 } else { 01282 $tmp = metaphone($word); 01283 } 01284 01285 // Return raw value? 
01286 if ($retRaw) return $tmp; 01287 01288 // Otherwise create hash and return integer 01289 if($tmp=='') $ret=0; else $ret=hexdec(substr(md5($tmp),0,7)); 01290 return $ret; 01291 } 01292 01293 01294 01295 01296 01297 01298 01299 01300 01301 01302 01303 01304 01305 01306 01307 01308 /******************************** 01309 * 01310 * SQL; TYPO3 Pages 01311 * 01312 *******************************/ 01313 01319 function submitPage() { 01320 01321 // Remove any current data for this phash: 01322 $this->removeOldIndexedPages($this->hash['phash']); 01323 01324 // setting new phash_row 01325 $fields = array( 01326 'phash' => $this->hash['phash'], 01327 'phash_grouping' => $this->hash['phash_grouping'], 01328 'cHashParams' => serialize($this->cHashParams), 01329 'contentHash' => $this->content_md5h, 01330 'data_page_id' => $this->conf['id'], 01331 'data_page_reg1' => $this->conf['page_cache_reg1'], 01332 'data_page_type' => $this->conf['type'], 01333 'data_page_mp' => $this->conf['MP'], 01334 'gr_list' => $this->conf['gr_list'], 01335 'item_type' => 0, // TYPO3 page 01336 'item_title' => $this->contentParts['title'], 01337 'item_description' => $this->bodyDescription($this->contentParts), 01338 'item_mtime' => $this->conf['mtime'], 01339 'item_size' => strlen($this->conf['content']), 01340 'tstamp' => time(), 01341 'crdate' => time(), 01342 'item_crdate' => $this->conf['crdate'], // Creation date of page 01343 'sys_language_uid' => $this->conf['sys_language_uid'], // Sys language uid of the page. Should reflect which language it DOES actually display! 
01344 'externalUrl' => 0, 01345 'recordUid' => intval($this->conf['recordUid']), 01346 'freeIndexUid' => intval($this->conf['freeIndexUid']), 01347 'freeIndexSetId' => intval($this->conf['freeIndexSetId']), 01348 ); 01349 01350 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields); 01351 01352 // PROCESSING index_section 01353 $this->submit_section($this->hash['phash'],$this->hash['phash']); 01354 01355 // PROCESSING index_grlist 01356 $this->submit_grlist($this->hash['phash'],$this->hash['phash']); 01357 01358 // PROCESSING index_fulltext 01359 $fields = array( 01360 'phash' => $this->hash['phash'], 01361 'fulltextdata' => implode(' ', $this->contentParts) 01362 ); 01363 if ($this->indexerConfig['fullTextDataLength']>0) { 01364 $fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']); 01365 } 01366 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields); 01367 01368 // PROCESSING index_debug 01369 if ($this->indexerConfig['debugMode']) { 01370 $fields = array( 01371 'phash' => $this->hash['phash'], 01372 'debuginfo' => serialize(array( 01373 'cHashParams' => $this->cHashParams, 01374 'external_parsers initialized' => array_keys($this->external_parsers), 01375 'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))), 01376 'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))), 01377 'logs' => $this->internal_log, 01378 'lexer' => $this->lexerObj->debugString, 01379 )) 01380 ); 01381 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields); 01382 } 01383 } 01384 01393 function submit_grlist($hash,$phash_x) { 01394 01395 // Setting the gr_list record 01396 $fields = array( 01397 'phash' => $hash, 01398 'phash_x' => $phash_x, 01399 'hash_gr_list' => $this->md5inthash($this->conf['gr_list']), 01400 'gr_list' => $this->conf['gr_list'] 01401 ); 01402 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', 
$fields); 01403 } 01404 01413 function submit_section($hash,$hash_t3) { 01414 $fields = array( 01415 'phash' => $hash, 01416 'phash_t3' => $hash_t3, 01417 'page_id' => intval($this->conf['id']) 01418 ); 01419 01420 $this->getRootLineFields($fields); 01421 01422 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $fields); 01423 } 01424 01431 function removeOldIndexedPages($phash) { 01432 // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here. 01433 $tableArr = explode(',','index_phash,index_section,index_grlist,index_fulltext,index_debug'); 01434 foreach($tableArr as $table) { 01435 $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($phash)); 01436 } 01437 // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file). 01438 $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.intval($phash)); 01439 } 01440 01441 01442 01443 01444 01445 01446 01447 01448 01449 01450 01451 01452 01453 /******************************** 01454 * 01455 * SQL; External media 01456 * 01457 *******************************/ 01458 01459 01474 function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) { 01475 01476 // Find item Type: 01477 $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext]; 01478 $storeItemType = $storeItemType ? 
$storeItemType : $ext; 01479 01480 // Remove any current data for this phash: 01481 $this->removeOldIndexedFiles($hash['phash']); 01482 01483 // Split filename: 01484 $fileParts = parse_url($file); 01485 01486 // Setting new 01487 $fields = array( 01488 'phash' => $hash['phash'], 01489 'phash_grouping' => $hash['phash_grouping'], 01490 'cHashParams' => serialize($subinfo), 01491 'contentHash' => $content_md5h, 01492 'data_filename' => $file, 01493 'item_type' => $storeItemType, 01494 'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file), 01495 'item_description' => $this->bodyDescription($contentParts), 01496 'item_mtime' => $mtime, 01497 'item_size' => $size, 01498 'item_crdate' => $ctime, 01499 'tstamp' => time(), 01500 'crdate' => time(), 01501 'gr_list' => $this->conf['gr_list'], 01502 'externalUrl' => $fileParts['scheme'] ? 1 : 0, 01503 'recordUid' => intval($this->conf['recordUid']), 01504 'freeIndexUid' => intval($this->conf['freeIndexUid']), 01505 'freeIndexSetId' => intval($this->conf['freeIndexSetId']), 01506 ); 01507 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields); 01508 01509 // PROCESSING index_fulltext 01510 $fields = array( 01511 'phash' => $hash['phash'], 01512 'fulltextdata' => implode(' ', $contentParts) 01513 ); 01514 if ($this->indexerConfig['fullTextDataLength']>0) { 01515 $fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']); 01516 } 01517 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields); 01518 01519 // PROCESSING index_debug 01520 if ($this->indexerConfig['debugMode']) { 01521 $fields = array( 01522 'phash' => $hash['phash'], 01523 'debuginfo' => serialize(array( 01524 'cHashParams' => $subinfo, 01525 'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))), 01526 'logs' => $this->internal_log, 01527 'lexer' => $this->lexerObj->debugString, 01528 )) 01529 ); 01530 
$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields); 01531 } 01532 } 01533 01540 function submitFile_grlist($hash) { 01541 // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one. 01542 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->conf['gr_list']).')'); 01543 if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res)) { 01544 $this->submit_grlist($hash,$hash); 01545 } 01546 } 01547 01554 function submitFile_section($hash) { 01555 // Testing if there is a section 01556 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', 'phash='.intval($hash).' AND page_id='.intval($this->conf['id'])); 01557 if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res)) { 01558 $this->submit_section($hash,$this->hash['phash']); 01559 } 01560 } 01561 01568 function removeOldIndexedFiles($phash) { 01569 01570 // Removing old registrations for tables. 
01571 $tableArr = explode(',','index_phash,index_grlist,index_fulltext,index_debug'); 01572 foreach($tableArr as $table) { 01573 $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($phash)); 01574 } 01575 } 01576 01577 01578 01579 01580 01581 01582 01583 01584 01585 01586 01587 01588 01589 01590 /******************************** 01591 * 01592 * SQL Helper functions 01593 * 01594 *******************************/ 01595 01604 function checkMtimeTstamp($mtime,$phash) { 01605 01606 // Select indexed page: 01607 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash)); 01608 $out = 0; 01609 01610 // If there was an indexing of the page...: 01611 if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) { 01612 if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time()) { // If max age is exceeded, index the page 01613 $out = 1; // The configured max-age was exceeded for the document and thus it's indexed. 01614 } else { 01615 if (!$this->tstamp_minAge || ($row['tstamp']+$this->tstamp_minAge)<time()) { // if minAge is not set or if minAge is exceeded, consider at mtime 01616 if ($mtime) { // It mtime is set, then it's tested. If not, the page must clearly be indexed. 01617 if ($row['item_mtime'] != $mtime) { // And if mtime is different from the index_phash mtime, it's about time to re-index. 01618 $out = 2; // The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed. 01619 } else { 01620 $out = -1; // mtime matched the document, so no changes detected and no content updated 01621 if ($this->tstamp_maxAge) { 01622 $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' 
seconds to expire time).',1); 01623 } else { 01624 $this->updateTstamp($phash); // Update the timestatmp 01625 $this->log_setTSlogMessage('mtime matched, timestamp updated.',1); 01626 } 01627 } 01628 } else {$out = 3; } // The minimum age was exceed, but mtime was not set, so the page was indexed. 01629 } else {$out = -2;} // The minimum age was not exceeded 01630 } 01631 } else {$out = 4;} // Page has never been indexed (is not represented in the index_phash table). 01632 return $out; 01633 } 01634 01640 function checkContentHash() { 01641 // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page. 01642 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h)); 01643 if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) { 01644 return $row; 01645 } 01646 return 1; 01647 } 01648 01657 function checkExternalDocContentHash($hashGr,$content_md5h) { 01658 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h)); 01659 if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) { 01660 return 0; 01661 } 01662 return 1; 01663 } 01664 01671 function is_grlist_set($phash_x) { 01672 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', 'phash_x='.intval($phash_x)); 01673 return $GLOBALS['TYPO3_DB']->sql_num_rows($res); 01674 } 01675 01684 function update_grlist($phash,$phash_x) { 01685 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($phash).' 
AND hash_gr_list='.$this->md5inthash($this->conf['gr_list'])); 01686 if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res)) { 01687 $this->submit_grlist($phash,$phash_x); 01688 $this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1); 01689 } 01690 } 01691 01699 function updateTstamp($phash,$mtime=0) { 01700 $updateFields = array( 01701 'tstamp' => time() 01702 ); 01703 if ($mtime) { $updateFields['item_mtime'] = intval($mtime); } 01704 01705 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields); 01706 } 01707 01714 function updateSetId($phash) { 01715 $updateFields = array( 01716 'freeIndexSetId' => intval($this->conf['freeIndexSetId']) 01717 ); 01718 01719 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields); 01720 } 01721 01729 function updateParsetime($phash,$parsetime) { 01730 $updateFields = array( 01731 'parsetime' => intval($parsetime) 01732 ); 01733 01734 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields); 01735 } 01736 01742 function updateRootline() { 01743 01744 $updateFields = array(); 01745 $this->getRootLineFields($updateFields); 01746 01747 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id='.intval($this->conf['id']), $updateFields); 01748 } 01749 01757 function getRootLineFields(&$fieldArr) { 01758 01759 $fieldArr['rl0'] = intval($this->conf['rootline_uids'][0]); 01760 $fieldArr['rl1'] = intval($this->conf['rootline_uids'][1]); 01761 $fieldArr['rl2'] = intval($this->conf['rootline_uids'][2]); 01762 01763 if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) { 01764 foreach($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) { 01765 $fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]); 01766 } 01767 } 01768 } 01769 01776 function 
removeLoginpagesWithContentHash() { 01777 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', ' 01778 A.phash=B.phash 01779 AND A.phash_grouping='.intval($this->hash['phash_grouping']).' 01780 AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).' 01781 AND A.contentHash='.intval($this->content_md5h)); 01782 while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) { 01783 $this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$row['phash']."' are now removed.",1); 01784 $this->removeOldIndexedPages($row['phash']); 01785 } 01786 } 01787 01793 function includeCrawlerClass() { 01794 global $TYPO3_CONF_VARS; 01795 01796 require_once(t3lib_extMgm::extPath('crawler').'class.tx_crawler_lib.php'); 01797 } 01798 01799 01800 01801 01802 01803 01804 01805 01806 01807 01808 /******************************** 01809 * 01810 * SQL; Submitting words 01811 * 01812 *******************************/ 01813 01820 function checkWordList($wl) { 01821 reset($wl); 01822 $phashArr = array(); 01823 while(list($key,) = each($wl)) { 01824 $phashArr[] = $wl[$key]['hash']; 01825 } 01826 if (count($phashArr)) { 01827 $cwl = implode(',',$phashArr); 01828 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.$cwl.')'); 01829 01830 if($GLOBALS['TYPO3_DB']->sql_num_rows($res)!=count($wl)) { 01831 $this->log_setTSlogMessage('Inserting words: '.(count($wl)-$GLOBALS['TYPO3_DB']->sql_num_rows($res)),1); 01832 while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) { 01833 unset($wl[$row['baseword']]); 01834 } 01835 01836 reset($wl); 01837 while(list($key,$val)=each($wl)) { 01838 $insertFields = array( 01839 'wid' => $val['hash'], 01840 'baseword' => $key, 01841 'metaphone' => $val['metaphone'] 01842 ); 01843 // A duplicate-key error will occur here if a word 
is NOT unset in the unset() line. However as long as the words in $wl are NOT longer as 60 chars (the baseword varchar is 60 characters...) this is not a problem. 01844 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields); 01845 } 01846 } 01847 } 01848 } 01849 01857 function submitWords($wl,$phash) { 01858 $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash)); 01859 01860 foreach($wl as $val) { 01861 $insertFields = array( 01862 'phash' => $phash, 01863 'wid' => $val['hash'], 01864 'count' => $val['count'], 01865 'first' => $val['first'], 01866 'freq' => $this->freqMap(($val['count']/$this->wordcount)), 01867 'flags' => ($val['cmp'] & $this->flagBitMask) 01868 ); 01869 01870 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $insertFields); 01871 } 01872 } 01873 01881 function freqMap($freq) { 01882 $mapFactor = $this->freqMax*100*$this->freqRange; 01883 if($freq<1) { 01884 $newFreq = $freq*$mapFactor; 01885 $newFreq = $newFreq>$this->freqRange?$this->freqRange:$newFreq; 01886 } else { 01887 $newFreq = $freq/$mapFactor; 01888 } 01889 return $newFreq; 01890 01891 } 01892 01893 01894 01895 01896 01897 01898 01899 01900 01901 01902 01903 /******************************** 01904 * 01905 * Hashing 01906 * 01907 *******************************/ 01908 01914 function setT3Hashes() { 01915 01916 // Set main array: 01917 $hArray = array( 01918 'id' => (integer)$this->conf['id'], 01919 'type' => (integer)$this->conf['type'], 01920 'sys_lang' => (integer)$this->conf['sys_language_uid'], 01921 'MP' => (string)$this->conf['MP'], 01922 'cHash' => $this->cHashParams 01923 ); 01924 01925 // Set grouping hash (Identifies a "page" combined of id, type, language, mountpoint and cHash parameters): 01926 $this->hash['phash_grouping'] = $this->md5inthash(serialize($hArray)); 01927 01928 // Add gr_list and set plain phash (Subdivision where special page composition based on login is taken into account as well. 
It is expected that such pages are normally similar regardless of the login.) 01929 $hArray['gr_list'] = (string)$this->conf['gr_list']; 01930 $this->hash['phash'] = $this->md5inthash(serialize($hArray)); 01931 } 01932 01940 function setExtHashes($file,$subinfo=array()) { 01941 // Set main array: 01942 $hash = array(); 01943 $hArray = array( 01944 'file' => $file, 01945 ); 01946 01947 // Set grouping hash: 01948 $hash['phash_grouping'] = $this->md5inthash(serialize($hArray)); 01949 01950 // Add subinfo 01951 $hArray['subinfo'] = $subinfo; 01952 $hash['phash'] = $this->md5inthash(serialize($hArray)); 01953 01954 return $hash; 01955 } 01956 01964 function md5inthash($str) { 01965 return hexdec(substr(md5($str),0,7)); 01966 } 01967 01974 function makeCHash($paramArray) { 01975 $addQueryParams = t3lib_div::implodeArrayForUrl('', $paramArray); 01976 01977 $pA = t3lib_div::cHashParams($addQueryParams); 01978 01979 return t3lib_div::shortMD5(serialize($pA)); 01980 } 01981 01982 01983 01984 01985 01986 01987 01988 01989 01990 01991 01992 01993 /********************************* 01994 * 01995 * Internal logging functions 01996 * 01997 *********************************/ 01998 02006 function log_push($msg,$key) { 02007 if (is_object($GLOBALS['TT'])) $GLOBALS['TT']->push($msg,$key); 02008 } 02009 02015 function log_pull() { 02016 if (is_object($GLOBALS['TT'])) $GLOBALS['TT']->pull(); 02017 } 02018 02026 function log_setTSlogMessage($msg, $errorNum=0) { 02027 if (is_object($GLOBALS['TT'])) $GLOBALS['TT']->setTSlogMessage($msg,$errorNum); 02028 $this->internal_log[] = $msg; 02029 } 02030 02031 02032 02033 02034 02035 02036 02037 02038 /************************** 02039 * 02040 * tslib_fe hooks: 02041 * 02042 **************************/ 02043 02051 function fe_headerNoCache(&$params, $ref) { 02052 02053 // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction: 02054 if (t3lib_extMgm::isLoaded('crawler') 02055 
&& $params['pObj']->applicationData['tx_crawler']['running'] 02056 && in_array('tx_indexedsearch_reindex', $params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions'])) { 02057 02058 // Setting simple log entry: 02059 $params['pObj']->applicationData['tx_crawler']['log'][] = 'RE_CACHE (indexed), old status: '.$params['disableAcquireCacheData']; 02060 02061 // Disables a look-up for cached page data - thus resulting in re-generation of the page even if cached. 02062 $params['disableAcquireCacheData'] = TRUE; 02063 } 02064 } 02065 } 02066 02067 02068 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) { 02069 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']); 02070 } 02071 ?>