Documentation TYPO3 par Ameos |
00001 <?php 00002 /*************************************************************** 00003 * Copyright notice 00004 * 00005 * (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com) 00006 * All rights reserved 00007 * 00008 * This script is part of the TYPO3 project. The TYPO3 project is 00009 * free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 2 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * The GNU General Public License can be found at 00015 * http://www.gnu.org/copyleft/gpl.html. 00016 * A copy is found in the textfile GPL.txt and important notices to the license 00017 * from the author is found in LICENSE.txt distributed with these scripts. 00018 * 00019 * 00020 * This script is distributed in the hope that it will be useful, 00021 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00022 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00023 * GNU General Public License for more details. 00024 * 00025 * This copyright notice MUST APPEAR in all copies of the script! 00026 ***************************************************************/ 00128 require_once(PATH_t3lib.'class.t3lib_parsehtml.php'); 00129 00130 00138 class tx_indexedsearch_indexer { 00139 00140 // Messages: 00141 var $reasons = array( 00142 -1 => 'mtime matched the document, so no changes detected and no content updated', 00143 -2 => 'The minimum age was not exceeded', 00144 1 => "The configured max-age was exceeded for the document and thus it's indexed.", 00145 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.', 00146 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.', 00147 4 => 'Page has never been indexed (is not represented in the index_phash table).' 
00148 ); 00149 00150 // HTML code blocks to exclude from indexing: 00151 var $excludeSections = 'script,style'; 00152 00153 // Supported Extensions for external files: 00154 var $external_parsers = array(); // External parser objects, keys are file extension names. Values are objects with certain methods. 00155 00156 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!) 00157 var $defaultGrList = '0,-1'; 00158 00159 // Min/Max times: 00160 var $tstamp_maxAge = 0; // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded. 00161 var $tstamp_minAge = 0; // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime. 00162 var $maxExternalFiles = 0; // Max number of external files to index. 00163 00164 var $forceIndexing = FALSE; // If true, indexing is forced despite of hashes etc. 
var $crawlerActive = FALSE;		// Set when crawler is detected (internal)

	// INTERNALS:
var $defaultContentArray = array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => '',
);
var $wordcount = 0;
var $externalFileCounter = 0;

var $conf = array();				// Configuration set internally (see init functions for required keys and their meaning)
var $indexerConfig = array();		// Indexer configuration
var $hash = array();				// Hash array, contains phash and phash_grouping
var $file_phash_arr = array();		// Hash array for files
var $contentParts = array();		// Content of TYPO3 page
var $content_md5h = '';
var $internal_log = array();		// Internal log
var $indexExternalUrl_content = '';

var $cHashParams = array();			// cHashparams array

var $freqRange = 32000;
var $freqMax = 0.1;

var $flagBitMask;					// Flag bit mask (0-255); populated by init() from the extension configuration

	// Objects:
var $csObj;							// Charset class object , t3lib_cs
var $metaphoneObj;					// Metaphone object, if any
var $lexerObj;						// Lexer object for word splitting


/**
 * Frontend hook: decides whether the page being rendered is to be indexed and, if so,
 * copies the page information from the parent object into $this->conf and runs the indexer.
 *
 * Indexing only happens when all of the following hold:
 * - "config.index_enable" is set on the parent object,
 * - frontend indexing is not disabled in the extension configuration (or the crawler forces it),
 * - the page does not carry the "no_search" flag,
 * - the page was not rendered with "no_cache".
 *
 * @param	object		Parent object passed by reference; must expose config, page, id, type, content etc. (presumably the TSFE object - confirm against the hook caller)
 * @return	void
 */
function hook_indexContent(&$pObj) {

		// Indexer configuration from Extension Manager interface:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Crawler activation:
		// Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction.
		// The is_array() check keeps in_array() from being called on a missing/non-array instruction list.
	if (t3lib_extMgm::isLoaded('crawler')
			&& $pObj->applicationData['tx_crawler']['running']
			&& is_array($pObj->applicationData['tx_crawler']['parameters']['procInstructions'])
			&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {

			// Setting simple log message:
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

			// Setting variables:
		$this->crawlerActive = TRUE;	// Crawler active flag
		$this->forceIndexing = TRUE;	// Force indexing despite timestamps etc.
	}

		// Determine if page should be indexed, and if so, configure and initialize indexer
	if ($pObj->config['config']['index_enable']) {
		$this->log_push('Index page','');

		if (!$indexerConfig['disableFrontendIndexing'] || $this->crawlerActive) {
			if (!$pObj->page['no_search']) {
				if (!$pObj->no_cache) {

						// Setting up internal configuration from config array:
					$this->conf = array();

						// Information about page for which the indexing takes place
					$this->conf['id'] = $pObj->id;								// Page id
					$this->conf['type'] = $pObj->type;							// Page type
					$this->conf['sys_language_uid'] = $pObj->sys_language_uid;	// sys_language UID of the language of the indexing.
					$this->conf['MP'] = $pObj->MP;								// MP variable, if any (Mount Points)
					$this->conf['gr_list'] = $pObj->gr_list;					// Group list

					$this->conf['cHash'] = $pObj->cHash;						// cHash string for additional parameters
					$this->conf['cHash_array'] = $pObj->cHash_array;			// Array of the additional parameters

					$this->conf['crdate'] = $pObj->page['crdate'];				// The creation date of the TYPO3 page
					$this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;	// reg1 of the caching table. Not known what practical use this has.

						// Root line uids
					$this->conf['rootline_uids'] = array();
					if (is_array($pObj->config['rootLine'])) {
						foreach($pObj->config['rootLine'] as $rlkey => $rldat) {
							$this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
						}
					}

						// Content of page:
					$this->conf['content'] = $pObj->content;									// Content string (HTML of TYPO3 page)
					$this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);	// Alternative title for indexing
					$this->conf['metaCharset'] = $pObj->metaCharset;							// Character set of content (will be converted to utf-8 during indexing)
					$this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];					// Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

						// Configuration of behavior:
					$this->conf['index_externals'] = $pObj->config['config']['index_externals'];	// Whether to index external documents like PDF, DOC etc. (if possible)
					$this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];		// Length of description text (max 250, default 200)

						// Set to zero:
					$this->conf['recordUid'] = 0;
					$this->conf['freeIndexUid'] = 0;

						// Init and start indexing:
					$this->init();
					$this->indexTypo3PageContent();

				} else $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
			} else $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page header!');
		} else $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');

			// Close the log section opened by log_push() above. This must stay inside the
			// "index_enable" branch so that push and pull are always paired.
		$this->log_pull();
	}
}








/****************************
 *
 * Backend API
 *
 ****************************/

/**
 * Initializes the indexer for use from the backend (no frontend page object available).
 * Fills $this->conf with the supplied page identification and fixed defaults, then calls init().
 * No content is indexed by this call.
 *
 * @param	integer		Page id
 * @param	integer		Page type
 * @param	integer		sys_language UID of the language of the indexing
 * @param	string		MP variable, if any (Mount Points)
 * @param	array		Root line uids; stored unchanged as $this->conf['rootline_uids']
 * @param	array		Array of the additional (cHash) parameters
 * @param	boolean		If set, a cHash string is calculated from $cHash_array via makeCHash(); otherwise the cHash is left blank
 * @return	void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

		// Setting up internal configuration from config array:
	$this->conf = array();

		// Information about page for which the indexing takes place
	$this->conf['id'] = $id;								// Page id	(integer)
	$this->conf['type'] = $type;							// Page type (integer)
	$this->conf['sys_language_uid'] = $sys_language_uid;	// sys_language UID of the language of the indexing (integer)
	$this->conf['MP'] = $MP;								// MP variable, if any (Mount Points) (string)
	$this->conf['gr_list'] = '0,-1';						// Group list (hardcoded for now...)

		// cHash values:
	$this->conf['cHash'] = $createCHash ? $this->makeCHash($cHash_array) : '';	// cHash string for additional parameters
	$this->conf['cHash_array'] = $cHash_array;									// Array of the additional parameters

		// Set to defaults
	$this->conf['freeIndexUid'] = 0;
	$this->conf['page_cache_reg1'] = '';

		// Root line uids
	$this->conf['rootline_uids'] = $uidRL;

		// Configuration of behavior:
	$this->conf['index_externals'] = 1;		// Whether to index external documents like PDF, DOC etc. (if possible)
	$this->conf['index_descrLgd'] = 200;	// Length of description text (max 250, default 200)

		// Initialize the indexer with this configuration:
	$this->init();
}

/**
 * Sets the "freeIndexUid" configuration value.
 *
 * @param	integer		Value to store in $this->conf['freeIndexUid']
 * @return	void
 */
function backend_setFreeIndexUid($freeIndexUid) {
	$this->conf['freeIndexUid'] = $freeIndexUid;
}

/**
 * Indexes arbitrary content as if it was a TYPO3 page.
 * The arguments are wrapped into a minimal HTML document (all values passed through
 * htmlspecialchars()) which is then handed to indexTypo3PageContent().
 * $this->conf must have been prepared beforehand (see backend_initIndexer()).
 *
 * @param	string		Title; becomes the <title> of the generated document
 * @param	string		Keywords; becomes the "keywords" meta tag
 * @param	string		Description; becomes the "description" meta tag
 * @param	string		Content; becomes the <body> text
 * @param	string		Character set of the content (will be converted to utf-8 during indexing)
 * @param	integer		Most recent modification time (seconds) of the content
 * @param	integer		The creation date of the content
 * @param	integer		UID of the record, if applicable
 * @return	void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

		// Content of page:
	$this->conf['mtime'] = $mtime;			// Most recent modification time (seconds) of the content
	$this->conf['crdate'] = $crdate;		// The creation date of the TYPO3 content
	$this->conf['recordUid'] = $recordUid;	// UID of the record, if applicable

		// Construct fake HTML for parsing:
	$this->conf['content'] = '
	<html>
		<head>
			<title>'.htmlspecialchars($title).'</title>
			<meta name="keywords" content="'.htmlspecialchars($keywords).'" />
			<meta name="description" content="'.htmlspecialchars($description).'" />
		</head>
		<body>
			'.htmlspecialchars($content).'
		</body>
	</html>';					// Content string (HTML of TYPO3 page)

		// Initializing charset:
	$this->conf['metaCharset'] = $charset;		// Character set of content (will be converted to utf-8 during indexing)
	$this->conf['indexedDocTitle'] = '';		// Alternative title for indexing

		// Index content as if it was a TYPO3 page:
	$this->indexTypo3PageContent();
}













/********************************
 *
 * Initialization
 *
 *******************************/

/**
 * Initializes the indexer from $this->conf and the extension configuration:
 * prepares cHash parameters, calculates the page hashes, reads min/max age and file limits,
 * and instantiates the external parsers (if enabled), lexer, optional metaphone object and
 * charset object.
 *
 * @return	void
 */
function init() {
	global $TYPO3_CONF_VARS;

		// Initializing:
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
		if ($this->conf['cHash']) $this->cHashParams['cHash'] = $this->conf['cHash'];	// Add this so that URL's come out right...
		unset($this->cHashParams['encryptionKey']);		// encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
	}

		// Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
	$this->setT3Hashes();

		// Indexer configuration from Extension Manager interface:
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

		// Initialize external document parsers:
		// Example configuration, see ext_localconf.php of this file!
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

		// Initialize lexer (class that deconstructs the text into words):
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] ?
					$TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] :
					'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	$this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
	if (is_object($this->lexerObj)) {	// Do not write a property onto a non-object if the lexer could not be loaded
		$this->lexerObj->debug = $this->indexerConfig['debugMode'];
	}

		// Initialize metaphone hook:
		// Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
	if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
		$this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
			// Only attach the back-reference to a real object; otherwise $this->metaphoneObj stays a
			// non-object and code that tests is_object($this->metaphoneObj) can fall back cleanly.
		if (is_object($this->metaphoneObj)) {
			$this->metaphoneObj->pObj = &$this;
		}
	}

		// Init charset class:
	$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}

/**
 * Instantiates the external document parsers registered in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] (file extension => object reference)
 * and stores them in $this->external_parsers keyed by file extension.
 * A parser is dropped again if it could not be instantiated or if its initParser() returns false.
 *
 * @return	void
 */
function initializeExternalParsers() {
	global $TYPO3_CONF_VARS;

	if (is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'])) {
		foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef) {
			$this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);

				// Parser object could not be created; remove the entry and go on with the next one:
			if (!is_object($this->external_parsers[$extension])) {
				unset($this->external_parsers[$extension]);
				continue;
			}

			$this->external_parsers[$extension]->pObj = &$this;

				// Init parser and if it returns false, unset its entry again:
			if (!$this->external_parsers[$extension]->initParser($extension)) {
				unset($this->external_parsers[$extension]);
			}
		}
	}
}














00489 00490 /******************************** 00491 * 00492 * Indexing; TYPO3 pages (HTML content) 00493 * 00494 *******************************/ 00495 00501 function indexTypo3PageContent() { 00502 00503 $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']); 00504 $is_grlist = $this->is_grlist_set($this->hash['phash']); 00505 00506 if ($check > 0 || !$is_grlist || $this->forceIndexing) { 00507 00508 // Setting message: 00509 if ($this->forceIndexing) { 00510 $this->log_setTSlogMessage('Indexing needed, reason: Forced',1); 00511 } elseif ($check > 0) { 00512 $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1); 00513 } else { 00514 $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1); 00515 } 00516 00517 // Divide into title,keywords,description and body: 00518 $this->log_push('Split content',''); 00519 $this->contentParts = $this->splitHTMLContent($this->conf['content']); 00520 if ($this->conf['indexedDocTitle']) { 00521 $this->contentParts['title'] = $this->conf['indexedDocTitle']; 00522 } 00523 $this->log_pull(); 00524 00525 // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!) 00526 $this->content_md5h = $this->md5inthash(implode($this->contentParts,'')); 00527 00528 // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash. 00529 // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more. 00530 // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. 
fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem. 00531 $checkCHash = $this->checkContentHash(); 00532 if (!is_array($checkCHash) || $check===1) { 00533 $Pstart=t3lib_div::milliseconds(); 00534 00535 $this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8',''); 00536 $this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']); 00537 $this->log_pull(); 00538 00539 // Splitting words 00540 $this->log_push('Extract words from content',''); 00541 $splitInWords = $this->procesWordsInArrays($this->contentParts); 00542 $this->log_pull(); 00543 00544 // Analyse the indexed words. 00545 $this->log_push('Analyse the extracted words',''); 00546 $indexArr = $this->indexAnalyze($splitInWords); 00547 $this->log_pull(); 00548 00549 // Submitting page (phash) record 00550 $this->log_push('Submitting page',''); 00551 $this->submitPage(); 00552 $this->log_pull(); 00553 00554 // Check words and submit to word list if not there 00555 $this->log_push('Check word list and submit words',''); 00556 $this->checkWordList($indexArr); 00557 $this->submitWords($indexArr,$this->hash['phash']); 00558 $this->log_pull(); 00559 00560 // Set parsetime 00561 $this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart); 00562 00563 // Checking external files if configured for. 
00564 $this->log_push('Checking external files',''); 00565 if ($this->conf['index_externals']) { 00566 $this->extractLinks($this->conf['content']); 00567 } 00568 $this->log_pull(); 00569 } else { 00570 $this->updateTstamp($this->hash['phash'],$this->conf['mtime']); // Update the timestatmp 00571 $this->update_grlist($checkCHash['phash'],$this->hash['phash']); // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash. 00572 $this->updateRootline(); 00573 $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.'); 00574 } 00575 } else { 00576 $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]); 00577 } 00578 } 00579 00587 function splitHTMLContent($content) { 00588 00589 // divide head from body ( u-ouh :) ) 00590 $contentArr = $this->defaultContentArray; 00591 $contentArr['body'] = stristr($content,'<body'); 00592 $headPart = substr($content,0,-strlen($contentArr['body'])); 00593 00594 // get title 00595 $this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy); 00596 $titleParts = explode(':',$contentArr['title'],2); 00597 $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]); 00598 00599 // get keywords and description metatags 00600 for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ } 00601 for($i=0;isset($meta[$i]);$i++) { 00602 $meta[$i] = t3lib_div::get_tag_attributes($meta[$i]); 00603 if(stristr($meta[$i]['name'],'keywords')) $contentArr['keywords'].=','.$meta[$i]['content']; 00604 if(stristr($meta[$i]['name'],'description')) $contentArr['description'].=','.$meta[$i]['content']; 00605 } 00606 00607 // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags: 00608 $this->typoSearchTags($contentArr['body']); 00609 00610 // Get rid of unwanted sections (ie. 
scripting and style stuff) in body 00611 $tagList = explode(',',$this->excludeSections); 00612 foreach($tagList as $tag) { 00613 while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2)); 00614 } 00615 00616 // remove tags, but first make sure we don't concatenate words by doing it 00617 $contentArr['body'] = str_replace('<',' <',$contentArr['body']); 00618 $contentArr['body'] = trim(strip_tags($contentArr['body'])); 00619 00620 $contentArr['keywords'] = trim($contentArr['keywords']); 00621 $contentArr['description'] = trim($contentArr['description']); 00622 00623 // Return array 00624 return $contentArr; 00625 } 00626 00633 function getHTMLcharset($content) { 00634 if (eregi('<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>',$content,$reg)) { 00635 if (eregi('charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)',$reg[0],$reg2)) { 00636 return $reg2[1]; 00637 } 00638 } 00639 } 00640 00648 function convertHTMLToUtf8($content,$charset='') { 00649 00650 // Find charset: 00651 $charset = $charset ? $charset : $this->getHTMLcharset($content); 00652 $charset = $this->csObj->parse_charset($charset); 00653 00654 // Convert charset: 00655 if ($charset && $charset!=='utf-8') { 00656 $content = $this->csObj->utf8_encode($content, $charset); 00657 } 00658 // Convert entities, assuming document is now UTF-8: 00659 $content = $this->csObj->entities_to_utf8($content, TRUE); 00660 00661 return $content; 00662 } 00663 00676 function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) { 00677 $endTag = '</'.$tagName.'>'; 00678 $startTag = '<'.$tagName; 00679 00680 $isTagInText = stristr($string,$startTag); // stristr used because we want a case-insensitive search for the tag. 
00681 if(!$isTagInText) return false; // if the tag was not found, return false 00682 00683 list($paramList,$isTagInText) = explode('>',substr($isTagInText,strlen($startTag)),2); 00684 $afterTagInText = stristr($isTagInText,$endTag); 00685 if ($afterTagInText) { 00686 $stringBefore = substr($string, 0, strpos(strtolower($string), strtolower($startTag))); 00687 $tagContent = substr($isTagInText,0,strlen($isTagInText)-strlen($afterTagInText)); 00688 $stringAfter = $stringBefore.substr($afterTagInText,strlen($endTag)); 00689 } else { // If there was no ending tag, the tagContent is blank and anything after the tag it self is returned. 00690 $tagContent=''; 00691 $stringAfter = $isTagInText; 00692 } 00693 00694 return true; 00695 } 00696 00703 function typoSearchTags(&$body) { 00704 $expBody = explode('<!--TYPO3SEARCH_',$body); 00705 00706 if(count($expBody)>1) { 00707 $body = ''; 00708 00709 foreach($expBody as $val) { 00710 $part = explode('-->',$val,2); 00711 if(trim($part[0])=='begin') { 00712 $body.= $part[1]; 00713 $prev = ''; 00714 } elseif(trim($part[0])=='end') { 00715 $body.= $prev; 00716 } else { 00717 $prev = $val; 00718 } 00719 } 00720 return true; 00721 } else { 00722 return false; 00723 } 00724 } 00725 00732 function extractLinks($content) { 00733 00734 // Get links: 00735 $list = $this->extractHyperLinks($content); 00736 00737 // Traverse links: 00738 foreach($list as $linkInfo) { 00739 00740 // Decode entities: 00741 $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']); 00742 00743 // Parse URL: 00744 $qParts = parse_url($linkSource); 00745 00746 // Check for jumpurl (TYPO3 specific thing...) 00747 if ($qParts['query'] && strstr($qParts['query'],'jumpurl=')) { 00748 parse_str($qParts['query'],$getP); 00749 $linkSource = $getP['jumpurl']; 00750 $qParts = parse_url($linkSource); // parse again due to new linkSource! 
00751 } 00752 00753 if ($qParts['scheme']) { 00754 if ($this->indexerConfig['indexExternalURLs']) { 00755 // Index external URL (http or otherwise) 00756 $this->indexExternalUrl($linkSource); 00757 } 00758 } elseif (!$qParts['query']) { 00759 $localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource); 00760 if ($localFile && @is_file($localFile)) { 00761 // Index local file: 00762 $this->indexRegularDocument($linkSource); 00763 } 00764 } 00765 } 00766 } 00767 00775 function extractHyperLinks($string) { 00776 if (!is_object($this->htmlParser)) { 00777 $this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml'); 00778 } 00779 00780 $parts = $this->htmlParser->splitTags('a',$string); 00781 $list = array(); 00782 foreach($parts as $k => $v) { 00783 if ($k%2) { 00784 $params = $this->htmlParser->get_tag_attributes($v,1); 00785 $firstTagName = $this->htmlParser->getFirstTagName($v); // The 'name' of the first tag 00786 00787 switch(strtolower($firstTagName)) { 00788 case 'a': 00789 $src = $params[0]['href']; 00790 if ($src) { 00791 $list[] = array( 00792 'tag' => $v, 00793 'href' => $params[0]['href'] 00794 ); 00795 } 00796 break; 00797 } 00798 } 00799 } 00800 00801 return $list; 00802 } 00803 00804 00805 00806 00807 00808 00809 00810 00811 00812 00813 00814 /****************************************** 00815 * 00816 * Indexing; external URL 00817 * 00818 ******************************************/ 00819 00827 function indexExternalUrl($externalUrl) { 00828 00829 // Parse External URL: 00830 $qParts = parse_url($externalUrl); 00831 $fI = pathinfo($qParts['path']); 00832 $ext = strtolower($fI['extension']); 00833 00834 // Get headers: 00835 $urlHeaders = $this->getUrlHeaders($externalUrl); 00836 if (stristr($urlHeaders['Content-Type'],'text/html')) { 00837 $content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl); 00838 if (strlen($content)) { 00839 00840 // Create temporary file: 00841 $tmpFile = t3lib_div::tempnam('EXTERNAL_URL').'.html'; 
00842 t3lib_div::writeFile($tmpFile, $content); 00843 00844 // Index that file: 00845 $this->indexRegularDocument($externalUrl, FALSE, $tmpFile, 'html'); 00846 unlink($tmpFile); 00847 } 00848 } 00849 } 00850 00858 function getUrlHeaders($url, $timeout = 2) { 00859 $url = parse_url($url); 00860 00861 if(!in_array($url['scheme'],array('','http'))) return FALSE; 00862 00863 $fp = fsockopen ($url['host'], ($url['port'] > 0 ? $url['port'] : 80), $errno, $errstr, $timeout); 00864 if (!$fp) { 00865 return FALSE; 00866 } else { 00867 $msg = "GET ".$url['path'].($url['query'] ? '?'.$url['query'] : '')." HTTP/1.0\r\nHost: ".$url['host']."\r\n\r\n"; 00868 fputs ($fp, $msg); 00869 $d = ''; 00870 while (!feof($fp)) { 00871 $line = fgets ($fp,2048); 00872 00873 $d.=$line; 00874 if (!strlen(trim($line))) { 00875 break; 00876 } 00877 } 00878 fclose ($fp); 00879 00880 // Compile headers: 00881 $headers = t3lib_div::trimExplode(chr(10),$d,1); 00882 $retVal = array(); 00883 foreach($headers as $line) { 00884 list($headKey, $headValue) = explode(':', $line, 2); 00885 $retVal[$headKey] = $headValue; 00886 } 00887 return $retVal; 00888 } 00889 } 00890 00891 00892 00893 00894 00895 00896 00897 00898 00899 00900 00901 00902 00903 /****************************************** 00904 * 00905 * Indexing; external files (PDF, DOC, etc) 00906 * 00907 ******************************************/ 00908 00918 function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') { 00919 00920 // Init 00921 $fI = pathinfo($file); 00922 $ext = $altExtension ? $altExtension : strtolower($fI['extension']); 00923 00924 // Create abs-path: 00925 if (!$contentTmpFile) { 00926 if (!t3lib_div::isAbsPath($file)) { // Relative, prepend PATH_site: 00927 $absFile = t3lib_div::getFileAbsFileName(PATH_site.$file); 00928 } else { // Absolute, pass-through: 00929 $absFile = $file; 00930 } 00931 $absFile = t3lib_div::isAllowedAbsPath($absFile) ? 
$absFile : ''; 00932 } else { 00933 $absFile = $contentTmpFile; 00934 } 00935 00936 // Indexing the document: 00937 if ($absFile && @is_file($absFile)) { 00938 if ($this->external_parsers[$ext]) { 00939 $mtime = filemtime($absFile); 00940 $cParts = $this->fileContentParts($ext,$absFile); 00941 00942 foreach($cParts as $cPKey) { 00943 $this->internal_log = array(); 00944 $this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),''); 00945 $Pstart = t3lib_div::milliseconds(); 00946 $subinfo = array('key' => $cPKey); // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3" 00947 $phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo); 00948 $check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']); 00949 if ($check > 0 || $force) { 00950 if ($check > 0) { 00951 $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1); 00952 } else { 00953 $this->log_setTSlogMessage('Indexing forced by flag',1); 00954 } 00955 00956 // Check external file counter: 00957 if ($this->externalFileCounter < $this->maxExternalFiles || $force) { 00958 00959 // Divide into title,keywords,description and body: 00960 $this->log_push('Split content',''); 00961 $contentParts = $this->readFileContent($ext,$absFile,$cPKey); 00962 $this->log_pull(); 00963 00964 if (is_array($contentParts)) { 00965 // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent()) 00966 $content_md5h = $this->md5inthash(implode($contentParts,'')); 00967 00968 if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) { 00969 00970 // Increment counter: 00971 $this->externalFileCounter++; 00972 00973 // Splitting words 00974 $this->log_push('Extract words from content',''); 00975 $splitInWords = $this->procesWordsInArrays($contentParts); 00976 $this->log_pull(); 00977 00978 // Analyse the indexed words. 
00979 $this->log_push('Analyse the extracted words',''); 00980 $indexArr = $this->indexAnalyze($splitInWords); 00981 $this->log_pull(); 00982 00983 // Submitting page (phash) record 00984 $this->log_push('Submitting page',''); 00985 $size = filesize($absFile); 00986 $ctime = filemtime($absFile); // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time... 00987 $this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts); 00988 $this->log_pull(); 00989 00990 // Check words and submit to word list if not there 00991 $this->log_push('Check word list and submit words',''); 00992 $this->checkWordList($indexArr); 00993 $this->submitWords($indexArr,$phash_arr['phash']); 00994 $this->log_pull(); 00995 00996 // Set parsetime 00997 $this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart); 00998 } else { 00999 $this->updateTstamp($phash_arr['phash'],$mtime); // Update the timestamp 01000 $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.'); 01001 } 01002 } else $this->log_setTSlogMessage('Could not index file! Unsupported extension.'); 01003 } else $this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.'); 01004 } else $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]); 01005 01006 // Checking and setting sections: 01007 # $this->submitFile_grlist($phash_arr['phash']); // Setting a gr_list record if there is none already (set for default fe_group) 01008 $this->submitFile_section($phash_arr['phash']); // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed. 
// NOTE: the statements down to the first method-closing brace are the tail of a method that starts above this section; they are reproduced unchanged.
					$this->log_pull();
				}
			} else $this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		} else $this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
	}

	/**
	 * Reads the content of an external file by delegating to the parser object
	 * registered for the file extension in $this->external_parsers.
	 *
	 * @param	string		File extension; used as key into $this->external_parsers
	 * @param	string		Path of the file; passed on to the parser
	 * @param	string		Passed on unchanged to the parser's readFileContent() (presumably identifies the content part to read - TODO confirm)
	 * @return	mixed		Whatever the parser returns; NULL if no parser object is registered for the extension
	 */
	function readFileContent($ext,$absFile,$cPKey) {
			// Explicit default so that a missing parser does not return an undefined variable:
		$contentArr = NULL;

			// Consult relevant external document parser:
		if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
			$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
		}

		return $contentArr;
	}

	/**
	 * Asks the parser object registered for the file extension for the list of
	 * content parts of a file.
	 *
	 * @param	string		File extension; used as key into $this->external_parsers
	 * @param	string		Path of the file; passed on to the parser
	 * @return	array		The parser's result; array(0) if no parser object is registered for the extension
	 */
	function fileContentParts($ext,$absFile) {
		$cParts = array(0);

			// Consult relevant external document parser:
		if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
			$cParts = $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
		}

		return $cParts;
	}

	/**
	 * Wraps a plain content string in a content array: a copy of
	 * $this->defaultContentArray with the 'body' key set to the content.
	 *
	 * @param	string		Content string
	 * @return	array		Content array with 'body' set to $content
	 */
	function splitRegularContent($content) {
		$contentArr = $this->defaultContentArray;
		$contentArr['body'] = $content;

		return $contentArr;
	}













	/**********************************
	 *
	 * Analysing content, Extracting words
	 *
	 **********************************/

	/**
	 * Converts every non-empty element of the content array to utf-8 (unless
	 * the charset already is 'utf-8') and decodes numeric / HTML entities in it.
	 * The array is modified in place.
	 *
	 * @param	array		Content array, passed by reference
	 * @param	string		Charset of the input content; the exact string 'utf-8' skips the charset conversion
	 * @return	void
	 */
	function charsetEntity2utf8(&$contentArr, $charset) {

			// foreach over the key list replaces the former each() loop (each() was removed in PHP 8.0):
		foreach (array_keys($contentArr) as $key) {
			if (strlen($contentArr[$key])) {

					// Convert charset if necessary
				if ($charset!=='utf-8') {
					$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
				}

					// decode all numeric / html-entities in the string to real characters:
				$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
			}
		}
	}

	/**
	 * Splits every element of the content array into words using the lexer
	 * object and removes duplicate words from title, keywords and description.
	 *
	 * @param	array		Content array with (at least) the keys 'title', 'keywords' and 'description'
	 * @return	array		Content array where each element is the lexer's split2Words() result for that element
	 */
	function procesWordsInArrays($contentArr) {

			// split all parts to words
		foreach (array_keys($contentArr) as $key) {
			$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
		}

			// For title, keywords, and description we don't want duplicates:
		$contentArr['title'] = array_unique($contentArr['title']);
		$contentArr['keywords'] = array_unique($contentArr['keywords']);
		$contentArr['description'] = array_unique($contentArr['description']);

			// Return modified array:
		return $contentArr;
	}

	/**
	 * Builds a short description from the body content: tab, CR and LF are
	 * replaced by spaces and the result is truncated to the configured length.
	 *
	 * @param	array		Content array; only the 'body' key is read
	 * @return	string		Description string; empty if the configured length ($this->conf['index_descrLgd']) evaluates to 0
	 */
	function bodyDescription($contentArr) {
		$bodyDescription = '';

			// Setting description
		$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
		if ($maxL) {
				// Turn whitespace characters into plain spaces:
			$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

				// Shorten the string:
			$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
		}

		return $bodyDescription;
	}

	/**
	 * Builds the word index array for a content array that has been split into
	 * words. Title, keywords and description words are registered with the
	 * bit offsets 7, 6 and 5 respectively, then the body words are added.
	 *
	 * @param	array		Content array with word lists in 'title', 'keywords', 'description' and 'body'
	 * @return	array		Array keyed by word; see analyzeHeaderinfo() and analyzeBody() for the entry fields
	 */
	function indexAnalyze($content) {
		$indexArr = array();

		$this->analyzeHeaderinfo($indexArr,$content,'title',7);
		$this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
		$this->analyzeHeaderinfo($indexArr,$content,'description',5);
		$this->analyzeBody($indexArr,$content);

		return $indexArr;
	}

	/**
	 * Registers the words of one header field (title, keywords, description)
	 * in the word index array. For each word the bit 2^$offset is OR'ed into
	 * 'cmp', 'count' is incremented and 'hash' / 'metaphone' are set.
	 * $this->wordcount is incremented per word.
	 *
	 * @param	array		Word index array, passed by reference
	 * @param	array		Content array with word lists
	 * @param	string		Key in $content holding the word list to process
	 * @param	integer		Bit position which is set in the 'cmp' value of each word
	 * @return	void
	 */
	function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
		if (!isset($content[$key]) || !is_array($content[$key])) {
			return;
		}

		foreach ($content[$key] as $val) {
			$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

				// Read previous values defensively; the word may not have been registered yet:
			$prevCmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
			$prevCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

			$retArr[$val]['cmp'] = $prevCmp|pow(2,$offset);
			$retArr[$val]['count'] = $prevCount+1;
			$retArr[$val]['hash'] = $this->md5inthash($val);
			$retArr[$val]['metaphone'] = $this->metaphone($val);
			$this->wordcount++;
		}
	}

	/**
	 * Registers the body words in the word index array. A word not seen before
	 * gets 'first' (its key in the body word list), 'hash' and 'metaphone';
	 * 'count' is incremented for every occurrence. $this->wordcount is
	 * incremented per word.
	 *
	 * @param	array		Word index array, passed by reference
	 * @param	array		Content array; the word list in 'body' is processed
	 * @return	void
	 */
	function analyzeBody(&$retArr,$content) {
		foreach ($content['body'] as $key => $val) {
			$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.
			if (!isset($retArr[$val])) {
				$retArr[$val]['first'] = $key;
				$retArr[$val]['hash'] = $this->md5inthash($val);
				$retArr[$val]['metaphone'] = $this->metaphone($val);
			}
			$prevCount = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
			$retArr[$val]['count'] = $prevCount+1;
			$this->wordcount++;
		}
	}

	/**
	 * Creates the metaphone value of a word, using $this->metaphoneObj if it is
	 * an object and PHP's metaphone() otherwise.
	 *
	 * @param	string		Word to convert
	 * @param	boolean		If set, the raw metaphone value is returned instead of the integer hash
	 * @return	mixed		Raw metaphone value, or an integer hash of it (0 if the metaphone value is empty)
	 */
	function metaphone($word,$retRaw=FALSE) {

		if (is_object($this->metaphoneObj)) {
			$tmp = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
		} else {
			$tmp = metaphone($word);
		}

			// Return raw value?
		if ($retRaw)	return $tmp;

			// Otherwise create hash and return integer
		return $tmp=='' ? 0 : $this->md5inthash($tmp);
	}
















	/********************************
	 *
	 * SQL; TYPO3 Pages
	 *
	 *******************************/

	/**
	 * Writes the index records for the current TYPO3 page: removes existing
	 * records for the phash, then inserts into index_phash, index_section,
	 * index_grlist, index_fulltext and, in debug mode, index_debug.
	 *
	 * @return	void
	 */
	function submitPage() {

			// Remove any current data for this phash:
		$this->removeOldIndexedPages($this->hash['phash']);

			// One timestamp for both tstamp and crdate so they cannot differ:
		$now = time();

			// setting new phash_row
		$fields = array(
			'phash' => $this->hash['phash'],
			'phash_grouping' => $this->hash['phash_grouping'],
			'cHashParams' => serialize($this->cHashParams),
			'contentHash' => $this->content_md5h,
			'data_page_id' => $this->conf['id'],
			'data_page_reg1' => $this->conf['page_cache_reg1'],
			'data_page_type' => $this->conf['type'],
			'data_page_mp' => $this->conf['MP'],
			'gr_list' => $this->conf['gr_list'],
			'item_type' => 0,	// TYPO3 page
			'item_title' => $this->contentParts['title'],
			'item_description' => $this->bodyDescription($this->contentParts),
			'item_mtime' => $this->conf['mtime'],
			'item_size' => strlen($this->conf['content']),
			'tstamp' => $now,
			'crdate' => $now,
			'item_crdate' => $this->conf['crdate'],	// Creation date of page
			'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
			'externalUrl' => 0,
			'recordUid' => intval($this->conf['recordUid']),
			'freeIndexUid' => intval($this->conf['freeIndexUid']),
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

			// PROCESSING index_section
		$this->submit_section($this->hash['phash'],$this->hash['phash']);

			// PROCESSING index_grlist
		$this->submit_grlist($this->hash['phash'],$this->hash['phash']);

			// PROCESSING index_fulltext
		$fields = array(
			'phash' => $this->hash['phash'],
			'fulltextdata' => implode(' ', $this->contentParts)
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

			// PROCESSING index_debug
		if ($this->indexerConfig['debugMode']) {
			$fields = array(
				'phash' => $this->hash['phash'],
				'debuginfo' => serialize(array(
						'cHashParams' => $this->cHashParams,
						'external_parsers initialized' => array_keys($this->external_parsers),
						'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
						'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
						'logs' => $this->internal_log,
						'lexer' => $this->lexerObj->debugString,
					))
			);
			$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
		}
	}

	/**
	 * Inserts an index_grlist record for the current gr_list ($this->conf['gr_list']).
	 *
	 * @param	integer		Value for the 'phash' field
	 * @param	integer		Value for the 'phash_x' field
	 * @return	void
	 */
	function submit_grlist($hash,$phash_x) {

			// Setting the gr_list record
		$fields = array(
			'phash' => $hash,
			'phash_x' => $phash_x,
			'hash_gr_list' => $this->md5inthash($this->conf['gr_list']),
			'gr_list' => $this->conf['gr_list']
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $fields);
	}

	/**
	 * Inserts an index_section record for the current page id, including the
	 * rootline fields added by getRootLineFields().
	 *
	 * @param	integer		Value for the 'phash' field
	 * @param	integer		Value for the 'phash_t3' field
	 * @return	void
	 */
	function submit_section($hash,$hash_t3) {
		$fields = array(
			'phash' => $hash,
			'phash_t3' => $hash_t3,
			'page_id' => intval($this->conf['id'])
		);

		$this->getRootLineFields($fields);

		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $fields);
	}

	/**
	 * Deletes all index records of a TYPO3 page identified by its phash.
	 *
	 * @param	integer		phash value of the records to remove
	 * @return	void
	 */
	function removeOldIndexedPages($phash) {
			// Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
		$tableArr = explode(',','index_phash,index_section,index_grlist,index_fulltext,index_debug');
		foreach ($tableArr as $table) {
			$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($phash));
		}
			// Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
		$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.intval($phash));
	}













	/********************************
	 *
	 * SQL; External media
	 *
	 *******************************/


	/**
	 * Writes the index records for an external file: removes existing records
	 * for the phash, then inserts into index_phash, index_fulltext and, in
	 * debug mode, index_debug.
	 *
	 * @param	array		Hash array with the keys 'phash' and 'phash_grouping'
	 * @param	string		File name / URL; stored in 'data_filename'. 'externalUrl' is set to 1 if parse_url() finds a scheme in it
	 * @param	array		Sub-information; stored serialized in 'cHashParams'
	 * @param	string		File extension; the item type is looked up in the parser's ext2itemtype_map, falling back to the extension itself
	 * @param	integer		Stored in 'item_mtime'
	 * @param	integer		Stored in 'item_crdate'
	 * @param	integer		Stored in 'item_size'
	 * @param	integer		Stored in 'contentHash'
	 * @param	array		Content array; 'title' and 'body' are read, all elements are joined for the fulltext record
	 * @return	void
	 */
	function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {

			// Find item Type (falls back to the extension if no parser / mapping is available):
		$storeItemType = $ext;
		if (isset($this->external_parsers[$ext])
				&& is_object($this->external_parsers[$ext])
				&& !empty($this->external_parsers[$ext]->ext2itemtype_map[$ext])) {
			$storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
		}

			// Remove any current data for this phash:
		$this->removeOldIndexedFiles($hash['phash']);

			// Split filename (parse_url() may return FALSE or an array without 'scheme'):
		$fileParts = parse_url($file);
		$hasScheme = is_array($fileParts) && !empty($fileParts['scheme']);

			// One timestamp for both tstamp and crdate so they cannot differ:
		$now = time();

			// setting new
		$fields = array(
			'phash' => $hash['phash'],
			'phash_grouping' => $hash['phash_grouping'],
			'cHashParams' => serialize($subinfo),
			'contentHash' => $content_md5h,
			'data_filename' => $file,
			'item_type' => $storeItemType,
			'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
			'item_description' => $this->bodyDescription($contentParts),
			'item_mtime' => $mtime,
			'item_size' => $size,
			'item_crdate' => $ctime,
			'tstamp' => $now,
			'crdate' => $now,
			'gr_list' => $this->conf['gr_list'],
			'externalUrl' => $hasScheme ? 1 : 0,
			'recordUid' => intval($this->conf['recordUid']),
			'freeIndexUid' => intval($this->conf['freeIndexUid']),
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

			// PROCESSING index_fulltext
		$fields = array(
			'phash' => $hash['phash'],
			'fulltextdata' => implode(' ', $contentParts)
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

			// PROCESSING index_debug
		if ($this->indexerConfig['debugMode']) {
			$fields = array(
				'phash' => $hash['phash'],
				'debuginfo' => serialize(array(
						'cHashParams' => $subinfo,
						'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
						'logs' => $this->internal_log,
						'lexer' => $this->lexerObj->debugString,
					))
			);
			$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
		}
	}

	/**
	 * Inserts an index_grlist record for a file unless one already exists for
	 * either the default gr_list or the current gr_list.
	 *
	 * @param	integer		phash value of the file
	 * @return	void
	 */
	function submitFile_grlist($hash) {
			// Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
		$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->conf['gr_list']).')');
		if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
			$this->submit_grlist($hash,$hash);
		}
	}

	/**
	 * Inserts an index_section record for a file unless one already exists for
	 * the file's phash and the current page id.
	 *
	 * @param	integer		phash value of the file
	 * @return	void
	 */
	function submitFile_section($hash) {
			// Testing if there is a section
		$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']));
		if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
			$this->submit_section($hash,$this->hash['phash']);
		}
	}

	/**
	 * Deletes the index records of an external file identified by its phash.
	 * index_section is not touched here.
	 *
	 * @param	integer		phash value of the records to remove
	 * @return	void
	 */
	function removeOldIndexedFiles($phash) {

			// Removing old registrations for tables.
		$tableArr = explode(',','index_phash,index_grlist,index_fulltext,index_debug');
		foreach ($tableArr as $table) {
			$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($phash));
		}
	}














	/********************************
	 *
	 * SQL Helper functions
	 *
	 *******************************/

	/**
	 * Decides whether a document must be (re-)indexed, based on the stored
	 * index_phash record, the min/max age settings and the modification time.
	 * When mtime matches and no max age is configured, the stored timestamp is
	 * refreshed as a side effect.
	 *
	 * @param	integer		Modification time of the document; a false value means "not set"
	 * @param	integer		phash value of the document
	 * @return	integer		Reason code (see $this->reasons); positive values mean "index", negative values mean "do not index"
	 */
	function checkMtimeTstamp($mtime,$phash) {

			// Select indexed page:
		$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
		$out = 0;

			// If there was an indexing of the page...:
		if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
			if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time()) {	// If max age is exceeded, index the page
				$out = 1;	// The configured max-age was exceeded for the document and thus it's indexed.
			} else {
				if (!$this->tstamp_minAge || ($row['tstamp']+$this->tstamp_minAge)<time()) {	// If minAge is not set or if minAge is exceeded, look at mtime
					if ($mtime) {	// If mtime is set, then it's tested. If not, the page must clearly be indexed.
						if ($row['item_mtime'] != $mtime) {	// And if mtime is different from the index_phash mtime, it's about time to re-index.
							$out = 2;	// The minimum age was exceeded and mtime was set and the mtime was different, so the page was indexed.
						} else {
							$out = -1;	// mtime matched the document, so no changes detected and no content updated
							if ($this->tstamp_maxAge) {
								$this->log_setTSlogMessage('Mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
							} else {
								$this->updateTstamp($phash);	// Update the timestamp
								$this->log_setTSlogMessage('Mtime matched, timestamp updated.',1);
							}
						}
					} else {$out = 3; }	// The minimum age was exceeded, but mtime was not set, so the page was indexed.
				} else {$out = -2;}	// The minimum age was not exceeded
			}
		} else {$out = 4;}	// Page has never been indexed (is not represented in the index_phash table).
		return $out;
	}

	/**
	 * Looks for an index_phash record with the current phash_grouping and the
	 * current content hash.
	 *
	 * @return	mixed		The matching row (array with 'phash') if one exists, otherwise integer 1
	 */
	function checkContentHash() {
			// With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
		$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h));
		$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

		return $row ? $row : 1;
	}

	/**
	 * Checks whether an index_phash record with the given grouping hash and
	 * content hash exists.
	 *
	 * @param	integer		phash_grouping value
	 * @param	integer		Content hash value
	 * @return	integer		0 if a matching record exists, otherwise 1
	 */
	function checkExternalDocContentHash($hashGr,$content_md5h) {
		$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h));

		return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res) ? 0 : 1;
	}

	/**
	 * Counts the index_grlist records with the given phash_x value.
	 *
	 * @param	integer		phash_x value to look up
	 * @return	integer		Number of matching rows
	 */
	function is_grlist_set($phash_x) {
		$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', 'phash_x='.intval($phash_x));
		return $GLOBALS['TYPO3_DB']->sql_num_rows($res);
	}

	/**
	 * Inserts an index_grlist record for the phash and the current gr_list
	 * unless such a record already exists; an insert is logged.
	 *
	 * @param	integer		phash value
	 * @param	integer		phash_x value used if a record is inserted
	 * @return	void
	 */
	function update_grlist($phash,$phash_x) {
		$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']));
		if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
			$this->submit_grlist($phash,$phash_x);
			$this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
		}
	}

	/**
	 * Sets 'tstamp' of an index_phash record to the current time and, if
	 * given, 'item_mtime' to $mtime.
	 *
	 * @param	integer		phash value of the record
	 * @param	integer		New item_mtime; only written if it evaluates to true
	 * @return	void
	 */
	function updateTstamp($phash,$mtime=0) {
		$updateFields = array(
			'tstamp' => time()
		);
		if ($mtime) {
			$updateFields['item_mtime'] = intval($mtime);
		}

		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields);
	}

	/**
	 * Writes the 'parsetime' field of an index_phash record.
	 *
	 * @param	integer		phash value of the record
	 * @param	integer		Parse time value to store (cast to integer)
	 * @return	void
	 */
	function updateParsetime($phash,$parsetime) {
		$updateFields = array(
			'parsetime' => intval($parsetime)
		);

		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields);
	}

	/**
	 * Updates the rootline fields of all index_section records of the current page id.
	 *
	 * @return	void
	 */
	function updateRootline() {

		$updateFields = array();
		$this->getRootLineFields($updateFields);

		$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id='.intval($this->conf['id']), $updateFields);
	}

	/**
	 * Adds the rootline fields rl0, rl1 and rl2 (from $this->conf['rootline_uids'])
	 * to a field array, plus any additional fields configured in
	 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields']
	 * (field name => rootline level).
	 *
	 * @param	array		Field array, passed by reference
	 * @return	void
	 */
	function getRootLineFields(&$fieldArr) {

		$fieldArr['rl0'] = intval($this->conf['rootline_uids'][0]);
		$fieldArr['rl1'] = intval($this->conf['rootline_uids'][1]);
		$fieldArr['rl2'] = intval($this->conf['rootline_uids'][2]);

		if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) {
			foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel) {
				$fieldArr[$fieldName] = intval($this->conf['rootline_uids'][$rootLineLevel]);
			}
		}
	}

	/**
	 * Removes indexed pages which share phash_grouping and content hash with
	 * the current page but were indexed under a gr_list different from the
	 * default one. Each removal is logged.
	 *
	 * @return	void
	 */
	function removeLoginpagesWithContentHash() {
		$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', '
				A.phash=B.phash
				AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
				AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
				AND A.contentHash='.intval($this->content_md5h));
		while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
			$this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$row['phash']."' are now removed.",1);
			$this->removeOldIndexedPages($row['phash']);
		}
	}












	/********************************
	 *
	 * SQL; Submitting words
	 *
	 *******************************/

	/**
	 * Makes sure every word of the word list exists in index_words: words
	 * whose hash is not found there yet are inserted.
	 *
	 * @param	array		Word list keyed by word; each entry holds 'hash' and 'metaphone'
	 * @return	void
	 */
	function checkWordList($wl) {
		$phashArr = array();
		foreach ($wl as $wordInfo) {
				// Cast to integer since the value goes directly into the SQL statement:
			$phashArr[] = intval($wordInfo['hash']);
		}

		if (count($phashArr)) {
			$cwl = implode(',',$phashArr);
			$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.$cwl.')');
			$numFound = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

			if ($numFound!=count($wl)) {
				$this->log_setTSlogMessage('Inserting words: '.(count($wl)-$numFound),1);

					// Drop the words which are already stored:
				while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
					unset($wl[$row['baseword']]);
				}

				foreach ($wl as $key => $val) {
					$insertFields = array(
						'wid' => $val['hash'],
						'baseword' => $key,
						'metaphone' => $val['metaphone']
					);
						// A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer as 60 chars (the baseword varchar is 60 characters...) this is not a problem.
					$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
				}
			}
		}
	}

	/**
	 * Replaces the index_rel records of a phash with one record per word of
	 * the word list.
	 *
	 * @param	array		Word list; each entry holds 'hash', 'count' and optionally 'first' and 'cmp'
	 * @param	integer		phash value the words belong to
	 * @return	void
	 */
	function submitWords($wl,$phash) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));

		foreach ($wl as $val) {
			$insertFields = array(
				'phash' => $phash,
				'wid' => $val['hash'],
				'count' => $val['count'],
				'first' => $val['first'],
				'freq' => $this->freqMap(($val['count']/$this->wordcount)),
				'flags' => ($val['cmp'] & $this->flagBitMask)
			);

			$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $insertFields);
		}
	}

	/**
	 * Maps a word frequency to a stored frequency value using the factor
	 * $this->freqMax*100*$this->freqRange: values below 1 are multiplied by
	 * the factor and capped at $this->freqRange, other values are divided by it.
	 *
	 * @param	float		Frequency (word count divided by total word count)
	 * @return	mixed		Mapped frequency value
	 */
	function freqMap($freq) {
		$mapFactor = $this->freqMax*100*$this->freqRange;
		if ($freq<1) {
			$newFreq = $freq*$mapFactor;
			$newFreq = $newFreq>$this->freqRange?$this->freqRange:$newFreq;
		} else {
			$newFreq = $freq/$mapFactor;
		}
		return $newFreq;
	}











	/********************************
	 *
	 * Hashing
	 *
	 *******************************/

	/**
	 * Sets $this->hash['phash_grouping'] and $this->hash['phash'] for the
	 * current TYPO3 page. The grouping hash is built from id, type, language,
	 * MP and cHash parameters; the phash additionally includes the gr_list.
	 *
	 * @return	void
	 */
	function setT3Hashes() {

			//  Set main array:
		$hArray = array(
			'id' => (int)$this->conf['id'],
			'type' => (int)$this->conf['type'],
			'sys_lang' => (int)$this->conf['sys_language_uid'],
			'MP' => (string)$this->conf['MP'],
			'cHash' => $this->cHashParams
		);

			// Set grouping hash (Identifies a "page" combined of id, type, language, mountpoint and cHash parameters):
		$this->hash['phash_grouping'] = $this->md5inthash(serialize($hArray));

			// Add gr_list and set plain phash (Subdivision where special page composition based on login is taken into account as well. It is expected that such pages are normally similar regardless of the login.)
		$hArray['gr_list'] = (string)$this->conf['gr_list'];
		$this->hash['phash'] = $this->md5inthash(serialize($hArray));
	}

	/**
	 * Creates the hash array for an external file. The grouping hash is built
	 * from the file name alone; the phash additionally includes the sub-information.
	 *
	 * @param	string		File name
	 * @param	array		Sub-information included in the phash
	 * @return	array		Array with the keys 'phash_grouping' and 'phash'
	 */
	function setExtHashes($file,$subinfo=array()) {
			//  Set main array:
		$hash = array();
		$hArray = array(
			'file' => $file,
		);

			// Set grouping hash:
		$hash['phash_grouping'] = $this->md5inthash(serialize($hArray));

			// Add subinfo
		$hArray['subinfo'] = $subinfo;
		$hash['phash'] = $this->md5inthash(serialize($hArray));

		return $hash;
	}

	/**
	 * Creates an integer hash from a string: the first 7 hex digits of the
	 * md5 hash, converted to a decimal number.
	 *
	 * @param	string		String to hash
	 * @return	integer		Integer value of the first 7 hex digits of md5($str)
	 */
	function md5inthash($str) {
		return hexdec(substr(md5($str),0,7));
	}

	/**
	 * Calculates a cHash value for an array of parameters, using
	 * t3lib_div::implodeArrayForUrl(), t3lib_div::cHashParams() and
	 * t3lib_div::shortMD5().
	 *
	 * @param	array		Parameter array
	 * @return	string		Result of t3lib_div::shortMD5() on the serialized cHash parameters
	 */
	function makeCHash($paramArray) {
		$addQueryParams = t3lib_div::implodeArrayForUrl('', $paramArray);

		$pA = t3lib_div::cHashParams($addQueryParams);

		return t3lib_div::shortMD5(serialize($pA));
	}












	/*********************************
	 *
	 * Internal logging functions
	 *
	 *********************************/

	/**
	 * Forwards a push() call to $GLOBALS['TT'] if that is an object.
	 *
	 * @param	string		Passed as first argument to push()
	 * @param	string		Passed as second argument to push()
	 * @return	void
	 */
	function log_push($msg,$key) {
		if (is_object($GLOBALS['TT']))	$GLOBALS['TT']->push($msg,$key);
	}

	/**
	 * Forwards a pull() call to $GLOBALS['TT'] if that is an object.
	 *
	 * @return	void
	 */
	function log_pull() {
		if (is_object($GLOBALS['TT']))	$GLOBALS['TT']->pull();
	}

	/**
	 * Logs a message: forwards it to $GLOBALS['TT']->setTSlogMessage() if
	 * $GLOBALS['TT'] is an object and always appends it to $this->internal_log.
	 *
	 * @param	string		Message to log
	 * @param	integer		Passed as second argument to setTSlogMessage()
	 * @return	void
	 */
	function log_setTSlogMessage($msg, $errorNum=0) {
		if (is_object($GLOBALS['TT']))	$GLOBALS['TT']->setTSlogMessage($msg,$errorNum);
		$this->internal_log[] = $msg;
	}








	/**************************
	 *
	 * tslib_fe hooks:
	 *
	 **************************/

	/**
	 * Hook function: sets $params['disableAcquireCacheData'] to TRUE when the
	 * "crawler" extension is loaded, a crawler session is running and
	 * 'tx_indexedsearch_reindex' is among its processing instructions. A log
	 * entry is added to the crawler application data in that case.
	 *
	 * @param	array		Parameter array, passed by reference; 'pObj' and 'disableAcquireCacheData' are used
	 * @param	object		Not used by this function
	 * @return	void
	 */
	function fe_headerNoCache(&$params, $ref) {

			// Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction:
		if (t3lib_extMgm::isLoaded('crawler')
				&& $params['pObj']->applicationData['tx_crawler']['running']
				&& is_array($params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions'])	// in_array() requires an array
				&& in_array('tx_indexedsearch_reindex', $params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions'])) {

				// Setting simple log entry:
			$params['pObj']->applicationData['tx_crawler']['log'][] = 'RE_CACHE (indexed), old status: '.$params['disableAcquireCacheData'];

				// Disables a look-up for cached page data - thus resulting in re-generation of the page even if cached.
			$params['disableAcquireCacheData'] = TRUE;
		}
	}
}


if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
}
?>