Documentation TYPO3 par Ameos

class.indexer.php

00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
00128 require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
00129 
00130 
00138 class tx_indexedsearch_indexer {
00139 
00140                 // Messages:
00141         var $reasons = array(
00142                 -1 => 'mtime matched the document, so no changes detected and no content updated',
00143                 -2 => 'The minimum age was not exceeded',
00144                 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
00145                 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
00146                 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
00147                 4 => 'Page has never been indexed (is not represented in the index_phash table).'
00148         );
00149 
00150                 // HTML code blocks to exclude from indexing:
00151         var $excludeSections = 'script,style';
00152 
00153                 // Supported Extensions for external files:
00154         var $external_parsers = array();                // External parser objects, keys are file extension names. Values are objects with certain methods.
00155 
00156                 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
00157         var $defaultGrList = '0,-1';
00158 
00159                 // Min/Max times:
00160         var $tstamp_maxAge = 0;         // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
00161         var $tstamp_minAge = 0;         // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
00162         var $maxExternalFiles = 0;      // Max number of external files to index.
00163 
00164         var $forceIndexing = FALSE;             // If true, indexing is forced despite of hashes etc.
00165         var $crawlerActive = FALSE;             // Set when crawler is detected (internal)
00166 
00167                 // INTERNALS:
00168         var $defaultContentArray=array(
00169                 'title' => '',
00170                 'description' => '',
00171                 'keywords' => '',
00172                 'body' => '',
00173         );
00174         var $wordcount = 0;
00175         var $externalFileCounter = 0;
00176 
00177         var $conf = array();            // Configuration set internally (see init functions for required keys and their meaning)
00178         var $indexerConfig = array();   // Indexer configuration
00179         var $hash = array();            // Hash array, contains phash and phash_grouping
00180         var $file_phash_arr = array();  // Hash array for files
00181         var $contentParts = array();    // Content of TYPO3 page
00182         var $content_md5h = '';
00183         var $internal_log = array();    // Internal log
00184         var $indexExternalUrl_content = '';
00185 
00186         var $cHashParams = array();     // cHashparams array
00187 
00188         var $freqRange = 32000;
00189         var $freqMax = 0.1;
00190 
00191                 // Objects:
00192         var $csObj;                             // Charset class object , t3lib_cs
00193         var $metaphoneObj;              // Metaphone object, if any
00194         var $lexerObj;                  // Lexer object for word splitting
00195 
00196 
00197 
/**
 * Frontend hook: indexes the page that has just been rendered, if indexing is enabled for it.
 *
 * The rendering state is read from $pObj, copied into $this->conf and then handed to
 * init() and indexTypo3PageContent(). Indexing is skipped (with a log message) when
 * ordinary frontend indexing is disabled and no crawler run forces it, when the page has
 * the "no_search" flag set, or when the page was rendered with "no_cache".
 *
 * @param	object		Parent object, passed by reference. The code reads ->config, ->page, ->id, ->type, ->content etc. from it and appends to ->applicationData['tx_crawler']['log']. NOTE(review): presumably the TSFE frontend object - confirm against the hook registration.
 * @return	void
 */
function hook_indexContent(&$pObj) {

    // Indexer configuration from the Extension Manager interface:
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

    // Crawler activation:
    // Requirements are that the crawler is loaded, a crawler session is running and
    // re-indexing was requested as processing instruction.
    if (t3lib_extMgm::isLoaded('crawler') && $pObj->applicationData['tx_crawler']['running']) {
        $procInstructions = $pObj->applicationData['tx_crawler']['parameters']['procInstructions'];

        // in_array() must only be called with an array; a crawler run without processing instructions is simply ignored.
        if (is_array($procInstructions) && in_array('tx_indexedsearch_reindex', $procInstructions)) {

            // Setting simple log message:
            $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

            // Setting variables:
            $this->crawlerActive = TRUE;    // Crawler active flag
            $this->forceIndexing = TRUE;    // Force indexing despite timestamps etc.
        }
    }

    // Determine if page should be indexed, and if so, configure and initialize indexer
    if ($pObj->config['config']['index_enable']) {
        $this->log_push('Index page','');

        // empty() also covers a configuration that could not be unserialized or lacks the key.
        if (!empty($indexerConfig['disableFrontendIndexing']) && !$this->crawlerActive) {
            $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
        } elseif ($pObj->page['no_search']) {
            $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page header!');
        } elseif ($pObj->no_cache) {
            $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
        } else {

            // Setting up internal configuration from config array:
            $this->conf = array();

            // Information about page for which the indexing takes place
            $this->conf['id'] = $pObj->id;                              // Page id
            $this->conf['type'] = $pObj->type;                          // Page type
            $this->conf['sys_language_uid'] = $pObj->sys_language_uid;  // sys_language UID of the language of the indexing.
            $this->conf['MP'] = $pObj->MP;                              // MP variable, if any (Mount Points)
            $this->conf['gr_list'] = $pObj->gr_list;                    // Group list

            $this->conf['cHash'] = $pObj->cHash;                        // cHash string for additional parameters
            $this->conf['cHash_array'] = $pObj->cHash_array;            // Array of the additional parameters

            $this->conf['crdate'] = $pObj->page['crdate'];              // The creation date of the TYPO3 page
            $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;    // reg1 of the caching table. Not known what practical use this has.

            // Root line uids
            $this->conf['rootline_uids'] = array();
            foreach ($pObj->config['rootLine'] as $rlkey => $rldat) {
                $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
            }

            // Content of page:
            $this->conf['content'] = $pObj->content;                    // Content string (HTML of TYPO3 page)
            $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);  // Alternative title for indexing
            $this->conf['metaCharset'] = $pObj->metaCharset;            // Character set of content (will be converted to utf-8 during indexing)
            $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];  // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.

            // Configuration of behavior:
            $this->conf['index_externals'] = $pObj->config['config']['index_externals'];    // Whether to index external documents like PDF, DOC etc. (if possible)
            $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];      // Length of description text (max 250, default 200)

            // Set to zero:
            $this->conf['recordUid'] = 0;
            $this->conf['freeIndexUid'] = 0;

            // Init and start indexing:
            $this->init();
            $this->indexTypo3PageContent();
        }

        // Close the 'Index page' log level opened above. This is only done when it was
        // actually opened, so that every log_push() has exactly one matching log_pull().
        $this->log_pull();
    }
}
00277 
00278 
00279 
00280 
00281 
00282 
00283 
00284 
00285         /****************************
00286          *
00287          * Backend API
00288          *
00289          ****************************/
00290 
/**
 * Backend API: prepares the indexer for indexing content from outside the frontend rendering.
 *
 * Fills $this->conf with the identifying values of the "page" to index and then calls init().
 * The group list is fixed to '0,-1', external documents are enabled and the description
 * length is set to 200.
 *
 * @param	integer		Page id
 * @param	integer		Page type
 * @param	integer		sys_language uid of the language of the indexing
 * @param	string		MP variable (Mount Points), if any
 * @param	array		Rootline uids, stored as-is in conf['rootline_uids']
 * @param	array		Array of additional (cHash) parameters
 * @param	boolean		If set, conf['cHash'] is calculated from $cHash_array with makeCHash(); otherwise it is an empty string
 * @return	void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

    // Start from a fresh configuration holding the information about the page being indexed:
    $this->conf = array(
        'id' => $id,                                // Page id (integer)
        'type' => $type,                            // Page type (integer)
        'sys_language_uid' => $sys_language_uid,    // sys_language UID of the language of the indexing (integer)
        'MP' => $MP,                                // MP variable, if any (Mount Points) (string)
        'gr_list' => '0,-1',                        // Group list (hardcoded for now...)
    );

    // cHash values; the hash string is only calculated on request:
    if ($createCHash) {
        $this->conf['cHash'] = $this->makeCHash($cHash_array);
    } else {
        $this->conf['cHash'] = '';
    }
    $this->conf['cHash_array'] = $cHash_array;

    // Defaults:
    $this->conf['freeIndexUid'] = 0;
    $this->conf['page_cache_reg1'] = '';

    // Root line uids:
    $this->conf['rootline_uids'] = $uidRL;

    // Configuration of behavior:
    $this->conf['index_externals'] = 1;     // Whether to index external documents like PDF, DOC etc. (if possible)
    $this->conf['index_descrLgd'] = 200;    // Length of description text (max 250, default 200)

    // Initialize the indexer with this configuration:
    $this->init();
}
00333 
/**
 * Backend API: stores the given value as conf['freeIndexUid'].
 *
 * backend_initIndexer() resets this value to 0, so this setter has to be called after it.
 *
 * @param	integer		Value to store as free index uid
 * @return	void
 */
function backend_setFreeIndexUid($freeIndexUid) {
    $this->conf['freeIndexUid'] = $freeIndexUid;
}
00343 
/**
 * Backend API: indexes arbitrary content as if it was a TYPO3 page.
 *
 * The values are wrapped (HTML-escaped) into a minimal fake HTML document which is stored in
 * conf['content'] and then processed by indexTypo3PageContent().
 *
 * @param	string		Title, becomes the <title> of the fake document
 * @param	string		Keywords, becomes the "keywords" meta tag
 * @param	string		Description, becomes the "description" meta tag
 * @param	string		Content, becomes the <body> of the fake document
 * @param	string		Character set of the content (will be converted to utf-8 during indexing)
 * @param	integer		Most recent modification time (seconds) of the content
 * @param	integer		The creation date of the content
 * @param	integer		UID of the record, if applicable
 * @return	void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

    // Time stamps and record reference:
    $this->conf['mtime'] = $mtime;
    $this->conf['crdate'] = $crdate;
    $this->conf['recordUid'] = $recordUid;

    // Escape all values before they are placed into the fake HTML:
    $escapedTitle = htmlspecialchars($title);
    $escapedKeywords = htmlspecialchars($keywords);
    $escapedDescription = htmlspecialchars($description);
    $escapedContent = htmlspecialchars($content);

    // Construct fake HTML for parsing (content string, HTML of a TYPO3 page):
    $this->conf['content'] = '
                <html>
                        <head>
                                <title>'.$escapedTitle.'</title>
                                <meta name="keywords" content="'.$escapedKeywords.'" />
                                <meta name="description" content="'.$escapedDescription.'" />
                        </head>
                        <body>
                                '.$escapedContent.'
                        </body>
                </html>';

    // Charset of the content; no alternative title for indexing:
    $this->conf['metaCharset'] = $charset;
    $this->conf['indexedDocTitle'] = '';

    // Index content as if it was a TYPO3 page:
    $this->indexTypo3PageContent();
}
00384 
00385 
00386 
00387 
00388 
00389 
00390 
00391 
00392 
00393 
00394 
00395 
00396 
00397         /********************************
00398          *
00399          * Initialization
00400          *
00401          *******************************/
00402 
/**
 * Initializes the indexer from $this->conf, which must have been set up before calling.
 *
 * Sets cHashParams, the page hashes (via setT3Hashes()), the values taken from the
 * Extension Manager configuration, the external parsers (only if conf['index_externals']
 * is set), the lexer, the optional metaphone object and the charset object.
 *
 * NOTE(review): $this->flagBitMask is assigned here but is not among the properties
 * declared at the top of the class as far as visible - confirm whether a declaration is missing.
 *
 * @return	void
 */
function init() {
    global $TYPO3_CONF_VARS;

    // cHash parameters:
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && count($this->cHashParams)) {
        if ($this->conf['cHash']) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        // encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
        unset($this->cHashParams['encryptionKey']);
    }

    // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
    $this->setT3Hashes();

    // Indexer configuration from Extension Manager interface (minAge/maxAge are multiplied by 3600 before being stored):
    $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    $extManagerConf = $this->indexerConfig;
    $this->tstamp_minAge = t3lib_div::intInRange($extManagerConf['minAge']*3600,0);
    $this->tstamp_maxAge = t3lib_div::intInRange($extManagerConf['maxAge']*3600,0);
    $this->maxExternalFiles = t3lib_div::intInRange($extManagerConf['maxExternalFiles'],0,1000,5);
    $this->flagBitMask = t3lib_div::intInRange($extManagerConf['flagBitMask'],0,255);

    // Initialize external document parsers:
    // Example configuration, see ext_localconf.php of this file!
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }

    // Initialize lexer (class that deconstructs the text into words):
    // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
    $lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
    if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer']) {
        $lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'];
    }
    $this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'];

    // Initialize metaphone hook:
    // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
    $metaphoneObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'];
    if ($metaphoneObjRef) {
        $this->metaphoneObj = &t3lib_div::getUserObj($metaphoneObjRef);
        $this->metaphoneObj->pObj = &$this;
    }

    // Init charset class:
    $this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}
00452 
/**
 * Creates the parser objects for external files.
 *
 * For each entry of $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers']
 * (file extension => object reference) an object is stored in $this->external_parsers,
 * gets a back reference to this indexer in ->pObj and is initialized with initParser().
 * Parsers whose initParser() returns false are removed again.
 *
 * @return	void
 */
function initializeExternalParsers() {
    global $TYPO3_CONF_VARS;

    $parserReferences = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'];
    if (!is_array($parserReferences)) {
        // Nothing configured, nothing to initialize.
        return;
    }

    foreach ($parserReferences as $extension => $_objRef) {
        // The object is registered before it is initialized, so it is already available through ->pObj during initParser():
        $this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);
        $this->external_parsers[$extension]->pObj = &$this;

        // Init parser and if it returns false, unset its entry again:
        $isUsable = $this->external_parsers[$extension]->initParser($extension);
        if (!$isUsable) {
            unset($this->external_parsers[$extension]);
        }
    }
}
00475 
00476 
00477 
00478 
00479 
00480 
00481 
00482 
00483 
00484 
00485 
00486 
00487 
00488 
00489 
00490         /********************************
00491          *
00492          * Indexing; TYPO3 pages (HTML content)
00493          *
00494          *******************************/
00495 
/**
 * Indexes the HTML in conf['content'] as a TYPO3 page, if that is found to be necessary.
 *
 * Indexing is considered when checkMtimeTstamp() returns a positive reason code (see
 * $this->reasons), when the group list is not yet registered for the page hash, or when
 * $this->forceIndexing is set. If a record with the same content hash already exists (and
 * the reason is not "max-age exceeded", code 1), only timestamp, group list and rootline
 * are updated; otherwise the content is split into words and submitted to the index, and
 * linked external files are processed if conf['index_externals'] is set.
 *
 * @return	void
 */
function indexTypo3PageContent() {

    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);

    if ($check > 0 || !$is_grlist || $this->forceIndexing) {

        // Setting message:
        if ($this->forceIndexing) {
            $this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
        } elseif ($check > 0) {
            $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
        } else {
            $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
        }

        // Divide into title,keywords,description and body:
        $this->log_push('Split content','');
            $this->contentParts = $this->splitHTMLContent($this->conf['content']);
            if ($this->conf['indexedDocTitle']) {
                $this->contentParts['title'] = $this->conf['indexedDocTitle'];
            }
        $this->log_pull();

        // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
        // The separator is passed first: the legacy "array first" argument order of implode() is not supported by PHP 8 anymore.
        $this->content_md5h = $this->md5inthash(implode('',$this->contentParts));

        // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
        // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
        // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
        $checkCHash = $this->checkContentHash();
        if (!is_array($checkCHash) || $check===1) {
            $Pstart=t3lib_div::milliseconds();

            $this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
                $this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
            $this->log_pull();

            // Splitting words
            $this->log_push('Extract words from content','');
                $splitInWords = $this->procesWordsInArrays($this->contentParts);
            $this->log_pull();

            // Analyse the indexed words.
            $this->log_push('Analyse the extracted words','');
                $indexArr = $this->indexAnalyze($splitInWords);
            $this->log_pull();

            // Submitting page (phash) record
            $this->log_push('Submitting page','');
                $this->submitPage();
            $this->log_pull();

            // Check words and submit to word list if not there
            $this->log_push('Check word list and submit words','');
                $this->checkWordList($indexArr);
                $this->submitWords($indexArr,$this->hash['phash']);
            $this->log_pull();

            // Set parsetime
            $this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

            // Checking external files if configured for.
            $this->log_push('Checking external files','');
            if ($this->conf['index_externals']) {
                $this->extractLinks($this->conf['content']);
            }
            $this->log_pull();
        } else {
            $this->updateTstamp($this->hash['phash'],$this->conf['mtime']);     // Update the timestamp
            $this->update_grlist($checkCHash['phash'],$this->hash['phash']);    // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
            $this->updateRootline();
            $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
        }
    } else {
        $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
    }
}
00579 
/**
 * Splits HTML content into title, keywords, description and body.
 *
 * Everything before the first "<body" is treated as head; if there is no "<body" at all,
 * the whole content is treated as head and the body is empty. From the head the <title>
 * (only the part after the first ":" if there is one) and the content of all meta tags
 * whose name contains "keywords" / "description" are taken. The body is reduced to the
 * parts selected by TYPO3SEARCH markers (if any), the sections listed in
 * $this->excludeSections are cut out and all tags are stripped.
 *
 * @param	string		HTML content to split
 * @return	array		Array with the keys of $this->defaultContentArray ('title', 'description', 'keywords', 'body'). Every value added to keywords/description is prefixed with a comma.
 */
function splitHTMLContent($content) {
    $contentArr = $this->defaultContentArray;
    $dummy = '';
    $dummy2 = '';

    // Divide head from body.
    // A missing body tag is handled explicitly: substr($content,0,-0) would return an empty head and thereby lose title and meta tags.
    $bodyPart = stristr($content,'<body');
    if ($bodyPart === FALSE) {
        $headPart = $content;
        $contentArr['body'] = '';
    } else {
        $headPart = substr($content,0,strlen($content)-strlen($bodyPart));
        $contentArr['body'] = $bodyPart;
    }

    // Get title; if it contains a colon only the part after the first colon is used:
    $this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
    $titleParts = explode(':',$contentArr['title'],2);
    $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

    // Collect the attribute strings of all meta tags; each call continues in what is left of the head after the previous tag:
    $metaTags = array();
    $metaParams = '';
    while ($this->embracingTags($headPart,'meta',$dummy,$headPart,$metaParams)) {
        $metaTags[] = $metaParams;
    }

    // Get keywords and description metatags. Meta tags without name/content attribute (eg. http-equiv) are skipped silently:
    foreach ($metaTags as $metaParams) {
        $attributes = t3lib_div::get_tag_attributes($metaParams);
        $metaName = isset($attributes['name']) ? $attributes['name'] : '';
        $metaContent = isset($attributes['content']) ? $attributes['content'] : '';

        if (stristr($metaName,'keywords')) {
            $contentArr['keywords'].= ','.$metaContent;
        }
        if (stristr($metaName,'description')) {
            $contentArr['description'].= ','.$metaContent;
        }
    }

    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (ie. scripting and style stuff) in body
    $tagList = explode(',',$this->excludeSections);
    foreach ($tagList as $tag) {
        while ($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2)) {
            // Nothing to do; each call removes one section from the body.
        }
    }

    // Remove tags, but first make sure we don't concatenate words by doing it
    $contentArr['body'] = str_replace('<',' <',$contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));

    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
00626 
/**
 * Extracts the charset from the "Content-Type" http-equiv meta tag of an HTML document.
 *
 * Matching is case-insensitive. Uses preg_match() since the POSIX function eregi() is not
 * available in PHP 7 and later.
 *
 * @param	string		HTML content
 * @return	string		The charset value as written in the meta tag. Nothing (NULL) is returned if no such meta tag or no charset declaration is found.
 */
function getHTMLcharset($content) {
    $metaTagPattern = '/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i';
    $charsetPattern = '/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i';

    if (preg_match($metaTagPattern,$content,$reg)) {
        // Only look for the charset inside the meta tag that was found:
        if (preg_match($charsetPattern,$reg[0],$reg2)) {
            return $reg2[1];
        }
    }
}
00640 
/**
 * Converts an HTML document to utf-8, including its entities.
 *
 * @param	string		HTML content to convert
 * @param	string		Charset of the content. If empty, it is looked up in the content with getHTMLcharset()
 * @return	string		The converted content
 */
function convertHTMLToUtf8($content,$charset='') {

    // Fall back to the charset declared in the document itself, then let the charset object parse the name:
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $charset = $this->csObj->parse_charset($charset);

    // Conversion is only done for a known charset that is not utf-8 already:
    if ($charset && $charset!=='utf-8') {
        $content = $this->csObj->utf8_encode($content, $charset);
    }

    // Convert entities, assuming document is now UTF-8:
    return $this->csObj->entities_to_utf8($content, TRUE);
}
00663 
/**
 * Finds the first occurrence of a tag in a string (case-insensitive) and cuts it out.
 *
 * Note that the start tag is matched as a plain prefix ("<" + tag name), so "<meta" would
 * also match a tag like "<metadata".
 *
 * @param	string		String to search in
 * @param	string		Tag name, eg. "script"
 * @param	string		Passed by reference: content between start and end tag; empty if there is no end tag
 * @param	string		Passed by reference: if an end tag exists, the input string with the whole tag (start tag, content, end tag) removed. If not, ONLY what follows the start tag (the part before the tag is dropped).
 * @param	string		Passed by reference: everything between the tag name and the ">" closing the start tag (the attributes)
 * @return	boolean		Returns false if the tag was not found (reference parameters are left untouched then), otherwise true
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
    $endTag = '</'.$tagName.'>';
    $startTag = '<'.$tagName;

    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string,$startTag);
    if (!$fromStartTag) {
        return false;
    }

    // Split into the attributes of the start tag and what follows it.
    // If the start tag is never closed by ">" there is no second part; use an empty string instead of reading a missing array element.
    $parts = explode('>',(string)substr($fromStartTag,strlen($startTag)),2);
    $paramList = $parts[0];
    $afterStartTag = isset($parts[1]) ? $parts[1] : '';

    $fromEndTag = stristr($afterStartTag,$endTag);
    if ($fromEndTag) {
        // The tag starts where the match of stristr() begins:
        $stringBefore = substr($string,0,strlen($string)-strlen($fromStartTag));
        $tagContent = substr($afterStartTag,0,strlen($afterStartTag)-strlen($fromEndTag));
        $stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));
    } else {
        // If there was no ending tag, the tagContent is blank and anything after the tag itself is returned.
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }

    return true;
}
00696 
/**
 * Reduces the body to the parts selected by <!--TYPO3SEARCH_begin--> / <!--TYPO3SEARCH_end--> markers.
 *
 * The content following a "begin" marker is kept. For an "end" marker the text chunk
 * remembered before it is kept, which is only non-empty if no "begin" marker came in
 * between (ie. the content started outside of a marked section). Everything else is dropped.
 *
 * @param	string		HTML body, passed by reference. It is only changed if at least one marker is found.
 * @return	boolean		Returns true if a marker was found and the body was processed, otherwise false.
 */
function typoSearchTags(&$body) {
    $expBody = explode('<!--TYPO3SEARCH_',$body);

    if (count($expBody) <= 1) {
        // No markers in the body at all.
        return false;
    }

    $body = '';
    $prev = '';     // Initialized, so that an "end" marker never reads an unset variable.

    foreach ($expBody as $val) {
        $part = explode('-->',$val,2);
        $marker = trim($part[0]);

        if ($marker === 'begin') {
            // A marker which is never closed by "-->" has no content part.
            $body.= isset($part[1]) ? $part[1] : '';
            $prev = '';
        } elseif ($marker === 'end') {
            $body.= $prev;
        } else {
            $prev = $val;
        }
    }

    return true;
}
00725 
/**
 * Extracts all hyperlinks from the content and indexes the documents they point to.
 *
 * Links with a URL scheme are passed to indexExternalUrl() if
 * $this->indexerConfig['indexExternalURLs'] is set. Links without a scheme and
 * without a query string are treated as local files relative to PATH_site and
 * passed to indexRegularDocument() if the file exists. Links carrying a
 * "jumpurl" parameter are resolved to the jumpurl value first.
 *
 * @param	string		HTML content to search for links
 * @return	void
 */
function extractLinks($content) {

		// Traverse links:
	foreach($this->extractHyperLinks($content) as $linkInfo)	{

			// Decode entities:
		$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

			// Parse URL; parse_url() returns FALSE on seriously malformed URLs, which is treated like "no components":
		$qParts = parse_url($linkSource);
		if (!is_array($qParts))	{
			$qParts = array();
		}

			// Check for jumpurl (TYPO3 specific thing...)
		if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl='))	{
			parse_str($qParts['query'],$getP);
				// "jumpurl=" may have matched inside another parameter name, or be an array parameter; then there is no usable target.
			$linkSource = (isset($getP['jumpurl']) && is_string($getP['jumpurl'])) ? $getP['jumpurl'] : '';
			$qParts = parse_url($linkSource);	// parse again due to new linkSource!
			if (!is_array($qParts))	{
				$qParts = array();
			}
		}

		if (!empty($qParts['scheme']))	{
			if ($this->indexerConfig['indexExternalURLs'])	{
					// Index external URL (http or otherwise)
				$this->indexExternalUrl($linkSource);
			}
		} elseif (empty($qParts['query']))	{
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
			if ($localFile && @is_file($localFile))	{
					// Index local file:
				$this->indexRegularDocument($linkSource);
			}
		}
	}
}
00767 
/**
 * Collects all <a> tags that have a non-empty href attribute.
 *
 * The HTML parser object is created on first use and kept in $this->htmlParser.
 * Only odd-indexed segments of the splitTags() result are inspected
 * (presumably those are the tags and the even ones the text in between - verify against t3lib_parsehtml).
 *
 * @param	string		HTML content
 * @return	array		List of arrays with the keys "tag" (the segment as returned by the parser) and "href"
 */
function extractHyperLinks($string)	{
	if (!is_object($this->htmlParser))	{
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$links = array();
	$segments = $this->htmlParser->splitTags('a',$string);

	foreach($segments as $index => $segment)	{
		if (!($index%2))	{
			continue;
		}

		$attributes = $this->htmlParser->get_tag_attributes($segment,1);
		$tagName = $this->htmlParser->getFirstTagName($segment);	// The 'name' of the first tag

		if (strtolower($tagName) == 'a' && $attributes[0]['href'])	{
			$links[] = array(
				'tag' => $segment,
				'href' => $attributes[0]['href']
			);
		}
	}

	return $links;
}
00803 
00804 
00805 
00806 
00807 
00808 
00809 
00810 
00811 
00812 
00813 
00814         /******************************************
00815          *
00816          * Indexing; external URL
00817          *
00818          ******************************************/
00819 
/**
 * Indexes an external URL if its response headers announce HTML content.
 *
 * The headers are fetched with getUrlHeaders(). If the Content-Type contains
 * "text/html" the document is downloaded, written to a temporary file and
 * handed to indexRegularDocument() with the extension forced to "html".
 * The downloaded content is also stored in $this->indexExternalUrl_content.
 *
 * @param	string		URL, eg. "http://typo3.org/"
 * @return	void
 */
function indexExternalUrl($externalUrl)	{

		// Get headers; getUrlHeaders() returns FALSE when the URL cannot be queried:
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders))	{
		return;
	}

		// HTTP header names are case-insensitive, so "content-type" must match as well:
	$contentType = '';
	foreach($urlHeaders as $headKey => $headValue)	{
		if (strtolower(trim((string)$headKey)) === 'content-type')	{
			$contentType = (string)$headValue;
			break;
		}
	}
	if (!stristr($contentType,'text/html'))	{
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen((string)$content))	{
		return;
	}

		// Create temporary file:
	$tmpBase = t3lib_div::tempnam('EXTERNAL_URL');
	$tmpFile = $tmpBase.'.html';
	t3lib_div::writeFile($tmpFile, $content);

		// Index that file:
	$this->indexRegularDocument($externalUrl, FALSE, $tmpFile, 'html');

		// Clean up the written file...
	if (@is_file($tmpFile))	{
		unlink($tmpFile);
	}
		// ...and the name reserved by tempnam().
		// NOTE(review): assumes t3lib_div::tempnam() creates an empty placeholder file like PHP's tempnam() does - confirm. If it does not, this check is simply a no-op.
	if (@is_file($tmpBase))	{
		unlink($tmpBase);
	}
}
00850 
/**
 * Fetches the HTTP response headers of a URL with a plain HTTP/1.0 GET request.
 *
 * Only "http" URLs (or URLs without a scheme) are supported. Reading stops at
 * the first blank line, ie. the response body is not downloaded.
 *
 * @param	string		The URL to query
 * @param	integer		Timeout in seconds, applied to both connecting and reading
 * @return	mixed		FALSE if the URL is unsupported or the connection failed. Otherwise an array of header name => header value (text after the first colon, not trimmed). The status line has no colon-separated value and is stored as a key with a NULL value.
 */
function getUrlHeaders($url, $timeout = 2)	{
	$url = parse_url($url);
	if (!is_array($url))	{
		return FALSE;	// Seriously malformed URL
	}

	$scheme = isset($url['scheme']) ? strtolower($url['scheme']) : '';
	if ($scheme !== '' && $scheme !== 'http')	{
		return FALSE;
	}
	if (empty($url['host']))	{
		return FALSE;	// Nothing to connect to
	}

	$port = (isset($url['port']) && $url['port'] > 0) ? $url['port'] : 80;
	$fp = fsockopen ($url['host'], $port, $errno, $errstr, $timeout);
	if (!$fp)	{
		return FALSE;
	}

		// The timeout given to fsockopen() only covers connecting; this limits each read as well so a silent server cannot stall indexing.
	stream_set_timeout($fp, max(1, intval($timeout)));

		// A URL like "http://example.org" has no path component; the request line still needs one.
	$path = (isset($url['path']) && strlen($url['path'])) ? $url['path'] : '/';
	$query = (isset($url['query']) && strlen($url['query'])) ? '?'.$url['query'] : '';
		// The Host header carries the port when it is not the default one.
	$hostHeader = $url['host'].($port != 80 ? ':'.$port : '');

	$msg = "GET ".$path.$query." HTTP/1.0\r\nHost: ".$hostHeader."\r\n\r\n";
	fputs ($fp, $msg);

		// Read until the blank line that ends the header section (fgets() returning FALSE on timeout ends the loop, too):
	$d = '';
	while (!feof($fp))	{
		$line = fgets ($fp,2048);

		$d.=$line;
		if (!strlen(trim($line)))	{
			break;
		}
	}
	fclose ($fp);

		// Compile headers:
	$headers = t3lib_div::trimExplode(chr(10),$d,1);
	$retVal = array();
	foreach($headers as $line)	{
		$headParts = explode(':', $line, 2);
		$retVal[$headParts[0]] = isset($headParts[1]) ? $headParts[1] : NULL;
	}
	return $retVal;
}
00890 
00891 
00892 
00893 
00894 
00895 
00896 
00897 
00898 
00899 
00900 
00901 
00902 
00903         /******************************************
00904          *
00905          * Indexing; external files (PDF, DOC, etc)
00906          *
00907          ******************************************/
00908 
/**
 * Indexes a document (PDF, DOC, HTML etc.) through the external parser
 * registered for its file extension.
 *
 * The document may consist of several content parts (as reported by
 * fileContentParts()); each part gets its own phash and is indexed separately.
 * A part is (re-)indexed if checkMtimeTstamp() returns a positive reason code
 * or if $force is set, and only while the external file counter is below
 * $this->maxExternalFiles (unless forced). If the content hash is unchanged
 * only the timestamp is updated. A section record is submitted for every part,
 * indexed or not.
 *
 * @param	string		File name; relative names are resolved against PATH_site
 * @param	boolean		If set, indexing is done regardless of mtime, file counter and content hash
 * @param	string		Optional path of a file to read the content from instead of $file (used for downloaded external URLs)
 * @param	string		Optional extension overriding the one found in $file
 * @return	void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')	{

		// Init; a file name without extension yields an empty extension:
	$fI = pathinfo($file);
	$ext = $altExtension ? $altExtension : (isset($fI['extension']) ? strtolower($fI['extension']) : '');

		// Create abs-path:
	if (!$contentTmpFile)	{
		if (!t3lib_div::isAbsPath($file))	{	// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {	// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

		// Bail out early when there is nothing to index or no parser for it:
	if (!$absFile || !@is_file($absFile))	{
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext]))	{
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

		// Indexing the document, one content part at a time:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach($cParts as $cPKey)	{
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
		if ($check > 0 || $force)	{
			if ($check > 0)	{
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

				// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force)	{

					// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
					$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts))	{
						// Calculating a hash over what is to be the actual content.
						// implode() is called as (glue, pieces); the reversed legacy order is no longer accepted by PHP 8.
					$content_md5h = $this->md5inthash(implode('',$contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force)	{

							// Increment counter:
						$this->externalFileCounter++;

							// Splitting words
						$this->log_push('Extract words from content','');
							$splitInWords = $this->procesWordsInArrays($contentParts);
						$this->log_pull();

							// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
							$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

							// Submitting page (phash) record
						$this->log_push('Submitting page','');
							$size = filesize($absFile);
							$ctime = filemtime($absFile);	// The original creation time of a file cannot be determined, so the modification time is used instead.
							$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

							// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
							$this->checkWordList($indexArr);
							$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

							// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

			// Checking and setting sections:
			// (Submitting a gr_list record via submitFile_grlist() is intentionally disabled here.)
		$this->submitFile_section($phash_arr['phash']);		// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->log_pull();
	}
}
01014 
/**
 * Reads the content of an external file by delegating to the external parser
 * object registered for the extension.
 *
 * @param	string		File extension, eg. "pdf", "doc" etc.
 * @param	string		Absolute file name of the file to read
 * @param	string		Content part key as delivered by fileContentParts(), passed on to the parser unchanged
 * @return	mixed		Whatever the parser's readFileContent() returns, or NULL if no parser object is registered for the extension
 */
function readFileContent($ext,$absFile,$cPKey)	{
		// Defined up front so "no parser" returns NULL without reading an undefined variable:
	$contentArr = NULL;

		// Consult relevant external document parser:
	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext]))	{
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}
01033 
/**
 * Asks the external parser for the list of content parts a file is divided into.
 *
 * @param	string		File extension
 * @param	string		Absolute file name of the file
 * @return	array		The parser's list of content part keys; array(0) if no parser object is registered for the extension
 */
function fileContentParts($ext,$absFile)	{
		// Delegate to the external document parser when there is one:
	if (is_object($this->external_parsers[$ext]))	{
		return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
	}

		// Otherwise the file counts as one undivided part:
	return array(0);
}
01051 
/**
 * Wraps plain content in a content array: a copy of $this->defaultContentArray
 * with the "body" key set to the given content.
 *
 * @param	string		Content to place in the "body" key
 * @return	array		Content array
 */
function splitRegularContent($content) {
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;

	return $parts;
}
01065 
01066 
01067 
01068 
01069 
01070 
01071 
01072 
01073 
01074 
01075 
01076 
01077 
01078 
01079         /**********************************
01080          *
01081          * Analysing content, Extracting words
01082          *
01083          **********************************/
01084 
/**
 * Converts every non-empty element of the content array to UTF-8 and decodes
 * its HTML entities to real characters.
 *
 * @param	array		Content array, modified in place
 * @param	string		Charset of the input content; conversion is skipped if it is exactly 'utf-8'
 * @return	void
 */
function charsetEntity2utf8(&$contentArr, $charset)	{

		// Iterating over the keys (instead of each(), which no longer exists in PHP 8) lets the elements be overwritten in place:
	foreach(array_keys($contentArr) as $key)	{
		if (strlen($contentArr[$key]))	{

				// Convert charset if necessary
			if ($charset!=='utf-8')	{
				$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
			}

				// decode all numeric / html-entities in the string to real characters:
			$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
		}
	}
}
01108 
/**
 * Splits every element of the content array into words using the lexer object
 * and removes duplicate words from title, keywords and description.
 *
 * @param	array		Content array with string values
 * @return	array		Content array where every value is what the lexer's split2Words() returned for it
 */
function procesWordsInArrays($contentArr)	{

		// split all parts to words (foreach over the keys replaces each(), which no longer exists in PHP 8)
	foreach(array_keys($contentArr) as $key)	{
		$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
	}

		// For title, keywords, and description we don't want duplicates:
	$contentArr['title'] = array_unique($contentArr['title']);
	$contentArr['keywords'] = array_unique($contentArr['keywords']);
	$contentArr['description'] = array_unique($contentArr['description']);

		// Return modified array:
	return $contentArr;
}
01131 
/**
 * Builds the description text for an indexed item from the body content.
 *
 * The maximum length is taken from $this->conf['index_descrLgd'], passed
 * through t3lib_div::intInRange() with the arguments 0, 255, 200. Tabs and
 * line breaks are replaced by single spaces before the text is truncated.
 *
 * @param	array		Content array; the "body" key is used
 * @return	string		Truncated body text, or an empty string if the resulting maximum length is 0
 */
function bodyDescription($contentArr)	{
		// Defined up front so a maximum length of 0 returns a blank description instead of an undefined variable:
	$bodyDescription = '';

		// Setting description
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL)	{
			// Turn whitespace characters into plain spaces:
		$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

			// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
01153 
/**
 * Builds the word index array for a content array that has been split into words.
 *
 * Title, keywords and description words are registered through
 * analyzeHeaderinfo() with the offsets 7, 6 and 5; body words are then added
 * by analyzeBody().
 *
 * @param	array		Content array with word lists (see procesWordsInArrays())
 * @return	array		Index array keyed by word
 */
function indexAnalyze($content) {
	$indexArr = array();

		// Header fields and their offsets, processed in this order:
	$headerOffsets = array(
		'title' => 7,
		'keywords' => 6,
		'description' => 5
	);
	foreach($headerOffsets as $field => $offset)	{
		$this->analyzeHeaderinfo($indexArr,$content,$field,$offset);
	}

	$this->analyzeBody($indexArr,$content);

	return $indexArr;
}
01171 
/**
 * Registers the words of one header field (title, keywords, description) in the index array.
 *
 * For every word the bit 2^$offset is set in "cmp", "count" is incremented
 * and "hash" and "metaphone" are (re-)calculated. $this->wordcount is
 * incremented per word.
 *
 * @param	array		Index array, keyed by word; modified in place
 * @param	array		Content array with word lists
 * @param	string		Key in the content array naming the field to process
 * @param	integer		Bit offset identifying the field in the "cmp" bit mask
 * @return	void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
		// Nothing to do if the field is missing or not a word list:
	if (!isset($content[$key]) || !is_array($content[$key]))	{
		return;
	}

		// foreach replaces each(), which no longer exists in PHP 8
	foreach($content[$key] as $val)	{
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

			// The first time a word is seen there is no bit mask or counter yet:
		$cmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

		$retArr[$val]['cmp'] = $cmp|pow(2,$offset);
		$retArr[$val]['count'] = $count+1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
01192 
/**
 * Registers the body words in the index array.
 *
 * For a word not yet present in the index array, "first" (the key of its first
 * occurrence in the body word list), "hash" and "metaphone" are set. "count" is
 * incremented for every occurrence, and so is $this->wordcount.
 *
 * @param	array		Index array, keyed by word; modified in place
 * @param	array		Content array; the word list in the "body" key is processed
 * @return	void
 */
function analyzeBody(&$retArr,$content) {
		// Nothing to do if there is no body word list:
	if (!isset($content['body']) || !is_array($content['body']))	{
		return;
	}

	foreach($content['body'] as $key => $val)	{
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

		if (!isset($retArr[$val]))	{
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

			// A word seen for the first time has no counter yet:
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $count+1;
		$this->wordcount++;
	}
}
01212 
/**
 * Calculates the metaphone value of a word.
 *
 * If $this->metaphoneObj is an object its metaphone() method is used (with the
 * configured sys_language_uid), otherwise PHP's native metaphone() function.
 *
 * @param	string		Word to process
 * @param	boolean		If set, the raw metaphone value is returned instead of the integer hash
 * @return	mixed		Raw metaphone value if $retRaw is set; otherwise an integer made from the first 7 hex digits of its md5 hash, or 0 if the metaphone value is blank
 */
function metaphone($word,$retRaw=FALSE) {

	$phonetic = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

		// Return raw value?
	if ($retRaw)	{
		return $phonetic;
	}

		// Otherwise create hash and return integer
	return $phonetic=='' ? 0 : hexdec(substr(md5($phonetic),0,7));
}
01235 
01236 
01237 
01238 
01239 
01240 
01241 
01242 
01243 
01244 
01245 
01246 
01247 
01248 
01249 
01250 
01251         /********************************
01252          *
01253          * SQL; TYPO3 Pages
01254          *
01255          *******************************/
01256 
/**
 * Stores the index records for the current TYPO3 page.
 *
 * Existing records for the page's phash are removed first. Then one record is
 * inserted into index_phash, the section and gr_list records are submitted,
 * and the full text goes into index_fulltext. If
 * $this->indexerConfig['debugMode'] is set, a serialized debug record is
 * written to index_debug as well.
 *
 * @return	void
 */
function submitPage()	{

		// Clear out whatever is currently stored for this phash:
	$this->removeOldIndexedPages($this->hash['phash']);

		// index_phash: the main record of the page
	$phashRow = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => time(),
		'crdate' => time(),
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

		// index_section
	$this->submit_section($this->hash['phash'],$this->hash['phash']);

		// index_grlist
	$this->submit_grlist($this->hash['phash'],$this->hash['phash']);

		// index_fulltext: all content parts joined by spaces
	$fulltextRow = array(
		'phash' => $this->hash['phash'],
		'fulltextdata' => implode(' ', $this->contentParts)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);

		// index_debug is only written in debug mode
	if (!$this->indexerConfig['debugMode'])	{
		return;
	}

		// Content and body are cut to 1000 bytes to keep the debug record small:
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
		'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$debugRow = array(
		'phash' => $this->hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
}
01322 
/**
 * Inserts a record into index_grlist for the configured gr_list.
 *
 * @param	integer		Value for the "phash" field
 * @param	integer		Value for the "phash_x" field
 * @return	void
 */
function submit_grlist($hash,$phash_x)	{
	$grListRow = array();
	$grListRow['phash'] = $hash;
	$grListRow['phash_x'] = $phash_x;
	$grListRow['hash_gr_list'] = $this->md5inthash($this->conf['gr_list']);
	$grListRow['gr_list'] = $this->conf['gr_list'];

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $grListRow);
}
01342 
/**
 * Inserts a record into index_section for the configured page id.
 *
 * @param	integer		Value for the "phash" field
 * @param	integer		Value for the "phash_t3" field
 * @return	void
 */
function submit_section($hash,$hash_t3)	{
	$sectionRow = array();
	$sectionRow['phash'] = $hash;
	$sectionRow['phash_t3'] = $hash_t3;
	$sectionRow['page_id'] = intval($this->conf['id']);

		// NOTE(review): presumably adds the rootline fields to the row by reference - confirm against getRootLineFields()
	$this->getRootLineFields($sectionRow);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
01362 
/**
 * Removes all index records of a TYPO3 page.
 *
 * @param	integer		phash value of the page; cast to integer before it is used in the queries
 * @return	void
 */
function removeOldIndexedPages($phash)	{
	$phash = intval($phash);

		// Because the pages are TYPO3 pages there can be nothing else than 1-1 relations in these tables.
	$tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext', 'index_debug');
	foreach($tables as $table)	{
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.$phash);
	}

		// Also remove all index_section records with phash_t3 set to this hash (this includes such records set for external media on the page). Those are re-inserted by indexRegularDocument().
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.$phash);
}
01378 
01379 
01380 
01381 
01382 
01383 
01384 
01385 
01386 
01387 
01388 
01389 
01390 
01391         /********************************
01392          *
01393          * SQL; External media
01394          *
01395          *******************************/
01396 
01397 
/**
 * Stores the index records for an external file (or one content part of it).
 *
 * Existing records for the phash are removed first. Then a record is inserted
 * into index_phash and the full text into index_fulltext. If
 * $this->indexerConfig['debugMode'] is set, a serialized debug record is
 * written to index_debug as well.
 *
 * @param	array		Hash array with the keys "phash" and "phash_grouping"
 * @param	string		File name (or URL) as it should be stored in data_filename
 * @param	array		Sub-information about the content part; stored serialized in cHashParams
 * @param	string		File extension; used to look up the item type in the parser's ext2itemtype_map (falls back to the extension itself)
 * @param	integer		Modification time of the file
 * @param	integer		Creation time of the file
 * @param	integer		Size of the file
 * @param	integer		Content hash
 * @param	array		Content array with the keys "title", "body" etc.
 * @return	void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)	{

		// Find item type; not every parser maps every extension, in which case the extension itself is stored:
	$storeItemType = !empty($this->external_parsers[$ext]->ext2itemtype_map[$ext])
		? $this->external_parsers[$ext]->ext2itemtype_map[$ext]
		: $ext;

		// Remove any current data for this phash:
	$this->removeOldIndexedFiles($hash['phash']);

		// A file name with a URL scheme is flagged as external URL.
		// parse_url() returns FALSE on malformed input and has no "scheme" key for plain local paths.
	$fileParts = parse_url($file);
	$isExternalUrl = is_array($fileParts) && !empty($fileParts['scheme']);

		// The file's base name serves as title when the document provides none:
	$title = (isset($contentParts['title']) && trim($contentParts['title'])) ? $contentParts['title'] : basename($file);

		// setting new
	$fields = array(
		'phash' => $hash['phash'],
		'phash_grouping' => $hash['phash_grouping'],
		'cHashParams' => serialize($subinfo),
		'contentHash' => $content_md5h,
		'data_filename' => $file,
		'item_type' => $storeItemType,
		'item_title' => $title,
		'item_description' => $this->bodyDescription($contentParts),
		'item_mtime' => $mtime,
		'item_size' => $size,
		'item_crdate' => $ctime,
		'tstamp' => time(),
		'crdate' => time(),
		'gr_list' => $this->conf['gr_list'],
		'externalUrl' => $isExternalUrl ? 1 : 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

		// PROCESSING index_fulltext
	$fields = array(
		'phash' => $hash['phash'],
		'fulltextdata' => implode(' ', $contentParts)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

		// PROCESSING index_debug
	if ($this->indexerConfig['debugMode'])	{
		$fields = array(
			'phash' => $hash['phash'],
			'debuginfo' => serialize(array(
					'cHashParams' => $subinfo,
					'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
					'logs' => $this->internal_log,
					'lexer' => $this->lexerObj->debugString,
				))
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
	}
}
01467 
/**
 * Makes sure a gr_list registration exists for an indexed file.
 * index_grlist is searched for the phash combined with either the hash of the default gr_list
 * or the hash of the currently configured gr_list. Only when neither is found a new record is submitted.
 *
 * @param	integer		phash value of the file (also passed on as the phash_x value)
 * @return	void
 */
function submitFile_grlist($hash)	{
	$defaultGrListHash = $this->md5inthash($this->defaultGrList);
	$currentGrListHash = $this->md5inthash($this->conf['gr_list']);
	$where = 'phash='.intval($hash).' AND (hash_gr_list='.$defaultGrListHash.' OR hash_gr_list='.$currentGrListHash.')';

	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);
	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))	{
			// A matching record is already there, so there is no need to place another one.
		return;
	}

	$this->submit_grlist($hash,$hash);
}
01481 
/**
 * Makes sure a section registration exists for an indexed file on the current page.
 * index_section is searched for the phash combined with the configured page id ($this->conf['id']);
 * if nothing is found, submit_section() is called with the given hash and $this->hash['phash'].
 *
 * @param	integer		phash value of the file
 * @return	void
 */
function submitFile_section($hash)	{
	$where = 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);

	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))	{
			// The section record exists already.
		return;
	}

	$this->submit_section($hash,$this->hash['phash']);
}
01495 
/**
 * Deletes all registrations of a file from the index tables
 * index_phash, index_grlist, index_fulltext and index_debug.
 *
 * @param	integer		phash value whose records are deleted
 * @return	void
 */
function removeOldIndexedFiles($phash)	{
		// The same WHERE clause applies to every table:
	$where = 'phash='.intval($phash);

	$tables = explode(',','index_phash,index_grlist,index_fulltext,index_debug');
	foreach($tables as $tableName)	{
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
	}
}
01510 
01511 
01512 
01513 
01514 
01515 
01516 
01517 
01518 
01519 
01520 
01521 
01522 
01523 
01524         /********************************
01525          *
01526          * SQL Helper functions
01527          *
01528          *******************************/
01529 
/**
 * Decides, based on the index_phash record for $phash, whether the document has to be (re-)indexed.
 *
 * Return values:
 *   4: No index_phash record exists for the phash (never indexed).
 *   1: The configured max-age was exceeded.
 *   2: The minimum age was exceeded, mtime was set and differs from the stored item_mtime.
 *   3: The minimum age was exceeded, but mtime was not set.
 *  -1: mtime matched the stored item_mtime (no re-indexing; the tstamp is refreshed unless a max-age is configured).
 *  -2: The minimum age was not exceeded.
 *
 * @param	integer		mtime value to compare with the stored item_mtime
 * @param	integer		phash value of the document
 * @return	integer		One of the codes listed above. Positive values mean "index", negative mean "skip".
 */
function checkMtimeTstamp($mtime,$phash)	{

		// Select indexed page:
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
	$row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

		// Not represented in the index_phash table at all:
	if (!$row)	{
		return 4;
	}

		// Max age is set and exceeded:
	if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time())	{
		return 1;
	}

		// Min age is set and NOT yet exceeded:
	if ($this->tstamp_minAge && !(($row['tstamp']+$this->tstamp_minAge)<time()))	{
		return -2;
	}

		// From here mtime decides. Without an mtime the page must be indexed:
	if (!$mtime)	{
		return 3;
	}

		// mtime differs from what is stored, so it is time to re-index:
	if ($row['item_mtime'] != $mtime)	{
		return 2;
	}

		// mtime matched: nothing changed. The timestamp is only refreshed when no max-age is configured.
	if ($this->tstamp_maxAge)	{
		$this->log_setTSlogMessage('Mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
	} else {
		$this->updateTstamp($phash);
		$this->log_setTSlogMessage('Mtime matched, timestamp updated.',1);
	}
	return -1;
}
01568 
/**
 * Looks for an index_phash record with the same phash_grouping and the same content hash
 * as the current page ($this->hash['phash_grouping'] / $this->content_md5h).
 *
 * @return	mixed		The first matching row (array with the key "phash") if one exists, otherwise the integer 1.
 */
function checkContentHash()	{
	$where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);

	$match = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
	return $match ? $match : 1;
}
01582 
/**
 * Tests whether an index_phash record exists with the given phash_grouping and content hash.
 *
 * @param	integer		phash_grouping value to look for
 * @param	integer		Content hash to look for
 * @return	integer		0 if a matching record was found, otherwise 1.
 */
function checkExternalDocContentHash($hashGr,$content_md5h)	{
	$where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);

	return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res) ? 0 : 1;
}
01598 
/**
 * Counts the index_grlist records registered for a phash_x value.
 *
 * @param	integer		phash_x value to look up
 * @return	integer		Number of rows found (as reported by sql_num_rows()); zero means nothing is set.
 */
function is_grlist_set($phash_x)	{
	$where = 'phash_x='.intval($phash_x);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', $where);

	return $GLOBALS['TYPO3_DB']->sql_num_rows($res);
}
01609 
/**
 * Adds an index_grlist record for the phash and the currently configured gr_list
 * unless such a record exists already. An insertion is written to the log.
 *
 * @param	integer		phash value
 * @param	integer		phash_x value, passed on to submit_grlist()
 * @return	void
 */
function update_grlist($phash,$phash_x)	{
	$where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

	if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))	{
			// Already registered for this gr_list.
		return;
	}

	$this->submit_grlist($phash,$phash_x);
	$this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
}
01625 
/**
 * Sets the tstamp field of an index_phash record to the current time.
 * If a non-zero mtime is passed, the item_mtime field is updated along with it.
 *
 * @param	integer		phash value of the record to update
 * @param	integer		Optional mtime; when set (non-zero) it is written to item_mtime
 * @return	void
 */
function updateTstamp($phash,$mtime=0)	{
	$fields = array();
	$fields['tstamp'] = time();

	if ($mtime)	{
		$fields['item_mtime'] = intval($mtime);
	}

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $fields);
}
01641 
/**
 * Writes the parsetime value into the index_phash record of a phash.
 *
 * @param	integer		phash value of the record to update
 * @param	integer		Parsetime value (stored as integer)
 * @return	void
 */
function updateParsetime($phash,$parsetime)	{
	$where = 'phash='.intval($phash);
	$fields = array();
	$fields['parsetime'] = intval($parsetime);

	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $fields);
}
01656 
/**
 * Rewrites the rootline fields of all index_section records of the current page ($this->conf['id']).
 * The field values are collected by getRootLineFields().
 *
 * @return	void
 */
function updateRootline()	{
		// Collect the rootline field values (filled by reference):
	$rootLineFields = array();
	$this->getRootLineFields($rootLineFields);

	$where = 'page_id='.intval($this->conf['id']);
	$GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineFields);
}
01669 
/**
 * Adds the rootline fields to a field array.
 * "rl0", "rl1" and "rl2" are always set from the rootline uids of level 0, 1 and 2.
 * Additional fields are set if $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields']
 * is an array of "field name => rootline level" pairs.
 *
 * @param	array		Field array, passed by reference; the rootline fields are written into it
 * @return	void
 */
function getRootLineFields(&$fieldArr)	{
	$rootLineUids = $this->conf['rootline_uids'];

		// The three fixed levels:
	for ($level = 0; $level < 3; $level++)	{
		$fieldArr['rl'.$level] = intval($rootLineUids[$level]);
	}

		// Additional, configured levels:
	$extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
	if (!is_array($extraFields))	{
		return;
	}
	foreach($extraFields as $fieldName => $rootLineLevel)	{
		$fieldArr[$fieldName] = intval($rootLineUids[$rootLineLevel]);
	}
}
01689 
/**
 * Removes indexed pages which share phash_grouping and content hash with the current page
 * but are registered under a gr_list different from the default gr_list.
 * Every removal is logged and carried out by removeOldIndexedPages().
 *
 * @return	void
 */
function removeLoginpagesWithContentHash()	{
	$where = '
                                        A.phash=B.phash
                                        AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
                                        AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
                                        AND A.contentHash='.intval($this->content_md5h);
	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);

	while($duplicate = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))	{
		$this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$duplicate['phash']."' are now removed.",1);
		$this->removeOldIndexedPages($duplicate['phash']);
	}
}
01707 
01708 
01709 
01710 
01711 
01712 
01713 
01714 
01715 
01716 
01717 
01718 
01719         /********************************
01720          *
01721          * SQL; Submitting words
01722          *
01723          *******************************/
01724 
/**
 * Makes sure every word of the word list is present in the index_words table.
 * The words already stored are looked up by their wid; if the number of rows found differs
 * from the number of words, the stored ones are removed from the list and the rest is inserted.
 *
 * Iteration uses foreach: each() was deprecated in PHP 7.2 and removed in PHP 8.0.
 *
 * @param	array		Word list: keys are the base words, values are arrays with at least the keys "hash" and "metaphone"
 * @return	void
 */
function checkWordList($wl) {
		// Collect the word ids of all words:
	$widList = array();
	foreach($wl as $wordData)	{
		$widList[] = $wordData['hash'];
	}
	if (!count($widList))	{
		return;
	}

	$res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.implode(',',$widList).')');
	$knownCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

		// All words are stored already:
	if ($knownCount==count($wl))	{
		return;
	}

	$this->log_setTSlogMessage('Inserting words: '.(count($wl)-$knownCount),1);

		// Drop the words which are stored already:
	while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
		unset($wl[$row['baseword']]);
	}

	foreach($wl as $baseword => $wordData)	{
		$insertFields = array(
			'wid' => $wordData['hash'],
			'baseword' => $baseword,
			'metaphone' => $wordData['metaphone']
		);
			// A duplicate-key error will occur here if a word was NOT removed by unset() above. However, as long as the words in $wl are not longer than 60 chars (the baseword varchar is 60 characters) this is not a problem.
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
	}
}
01760 
/**
 * Replaces the word relations of a phash in index_rel:
 * all existing relations of the phash are deleted, then one record is inserted per word of the list.
 *
 * @param	array		Word list: each value is an array with the keys "hash", "count", "first" and "cmp"
 * @param	integer		phash value the words belong to
 * @return	void
 */
function submitWords($wl,$phash) {
		// Clear out the old relations first:
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));

	foreach($wl as $wordData)	{
			// Frequency of the word relative to the total word count, mapped by freqMap():
		$mappedFrequency = $this->freqMap(($wordData['count']/$this->wordcount));

		$relation = array(
			'phash' => $phash,
			'wid' => $wordData['hash'],
			'count' => $wordData['count'],
			'first' => $wordData['first'],
			'freq' => $mappedFrequency,
			'flags' => ($wordData['cmp'] & $this->flagBitMask)
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
	}
}
01784 
/**
 * Maps a frequency value using the factor $this->freqMax * 100 * $this->freqRange.
 * Values below 1 are multiplied by the factor and capped at $this->freqRange;
 * all other values are divided by the factor.
 *
 * @param	double		Frequency value
 * @return	mixed		Mapped frequency
 */
function freqMap($freq) {
	$mapFactor = $this->freqMax*100*$this->freqRange;

	if($freq<1) {
		$scaled = $freq*$mapFactor;
		if ($scaled>$this->freqRange)	{
				// Never exceed the range:
			return $this->freqRange;
		}
		return $scaled;
	}

	return $freq/$mapFactor;
}
01803 
01804 
01805 
01806 
01807 
01808 
01809 
01810 
01811 
01812 
01813 
01814         /********************************
01815          *
01816          * Hashing
01817          *
01818          *******************************/
01819 
/**
 * Calculates the two hashes of the current TYPO3 page and stores them in $this->hash:
 * "phash_grouping" is built from id, type, sys_language_uid, MP and the cHash parameters;
 * "phash" is built from the same values plus the gr_list.
 *
 * @return	void
 */
function setT3Hashes()	{

		// Values identifying a "page" (id, type, language, mountpoint and cHash parameters).
		// Key order and value types are significant, since the array is serialized before hashing.
	$hashInput = array(
		'id' => (int)$this->conf['id'],
		'type' => (int)$this->conf['type'],
		'sys_lang' => (int)$this->conf['sys_language_uid'],
		'MP' => (string)$this->conf['MP'],
		'cHash' => $this->cHashParams
	);
	$groupingSource = serialize($hashInput);
	$this->hash['phash_grouping'] = $this->md5inthash($groupingSource);

		// With the gr_list added, the hash also takes the login-based page composition into account:
	$hashInput['gr_list'] = (string)$this->conf['gr_list'];
	$phashSource = serialize($hashInput);
	$this->hash['phash'] = $this->md5inthash($phashSource);
}
01843 
/**
 * Calculates the two hashes of an external file:
 * "phash_grouping" is built from the file reference alone, "phash" from the file reference plus the subinfo array.
 *
 * @param	string		File reference
 * @param	array		Additional information which goes into the "phash" value only
 * @return	array		Array with the keys "phash_grouping" and "phash"
 */
function setExtHashes($file,$subinfo=array())	{
		// The grouping hash depends on the file only:
	$hashInput = array(
		'file' => $file,
	);
	$groupingHash = $this->md5inthash(serialize($hashInput));

		// The plain phash takes the subinfo into account as well:
	$hashInput['subinfo'] = $subinfo;
	$plainHash = $this->md5inthash(serialize($hashInput));

	return array(
		'phash_grouping' => $groupingHash,
		'phash' => $plainHash
	);
}
01867 
/**
 * Converts a string into a number: the first 7 hexadecimal characters
 * of the md5 hash of the string are converted to their decimal value.
 *
 * @param	string		String to hash
 * @return	integer		Decimal value of the first 7 hex characters of the md5 hash
 */
function md5inthash($str)	{
	$digest = md5($str);
	$leadingHex = substr($digest,0,7);

	return hexdec($leadingHex);
}
01878 
/**
 * Builds a hash from an array of parameters:
 * the array is imploded into a query string by t3lib_div::implodeArrayForUrl(), run through
 * t3lib_div::cHashParams() and the serialized result is hashed with t3lib_div::shortMD5().
 *
 * @param	array		Parameter array
 * @return	string		Result of t3lib_div::shortMD5() on the serialized cHash parameters
 */
function makeCHash($paramArray)	{
	$queryString = t3lib_div::implodeArrayForUrl('', $paramArray);
	$cHashParams = t3lib_div::cHashParams($queryString);
	$serialized = serialize($cHashParams);

	return t3lib_div::shortMD5($serialized);
}
01892 
01893 
01894 
01895 
01896 
01897 
01898 
01899 
01900 
01901 
01902 
01903 
01904         /*********************************
01905          *
01906          * Internal logging functions
01907          *
01908          *********************************/
01909 
/**
 * Forwards a push() call to the global "TT" object, if that is an object. Otherwise nothing happens.
 *
 * @param	string		Message, first argument of push()
 * @param	string		Key, second argument of push()
 * @return	void
 */
function log_push($msg,$key)	{
	$timeTracker = $GLOBALS['TT'];
	if (!is_object($timeTracker))	{
		return;
	}

	$timeTracker->push($msg,$key);
}
01920 
/**
 * Forwards a pull() call to the global "TT" object, if that is an object. Otherwise nothing happens.
 *
 * @return	void
 */
function log_pull()	{
	$timeTracker = $GLOBALS['TT'];
	if (!is_object($timeTracker))	{
		return;
	}

	$timeTracker->pull();
}
01929 
/**
 * Records a log message: it is passed to setTSlogMessage() of the global "TT" object (if that is an object)
 * and is always appended to $this->internal_log.
 *
 * @param	string		Message
 * @param	integer		Error number, second argument of setTSlogMessage()
 * @return	void
 */
function log_setTSlogMessage($msg, $errorNum=0)	{
	$timeTracker = $GLOBALS['TT'];
	if (is_object($timeTracker))	{
		$timeTracker->setTSlogMessage($msg,$errorNum);
	}

		// The internal log is kept regardless of the "TT" object:
	$this->internal_log[] = $msg;
}
01941 
01942 
01943 
01944 
01945 
01946 
01947 
01948 
01949         /**************************
01950          *
01951          * tslib_fe hooks:
01952          *
01953          **************************/
01954 
/**
 * Hook function: disables the look-up of cached page data when the "crawler" extension is loaded,
 * a crawler session is running and "tx_indexedsearch_reindex" is among its processing instructions.
 * In that case a log entry is added to the crawler application data and
 * $params['disableAcquireCacheData'] is set to TRUE.
 *
 * @param	array		Parameters, passed by reference; the keys "pObj" and "disableAcquireCacheData" are used
 * @param	object		Reference object (not used in this function)
 * @return	void
 */
function fe_headerNoCache(&$params, $ref)	{

		// Requirement 1: the crawler extension is loaded.
	if (!t3lib_extMgm::isLoaded('crawler'))	{
		return;
	}
		// Requirement 2: a crawler session is running.
	if (!$params['pObj']->applicationData['tx_crawler']['running'])	{
		return;
	}
		// Requirement 3: re-indexing was requested as processing instruction.
	if (!in_array('tx_indexedsearch_reindex', $params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions']))	{
		return;
	}

		// Setting simple log entry:
	$params['pObj']->applicationData['tx_crawler']['log'][] = 'RE_CACHE (indexed), old status: '.$params['disableAcquireCacheData'];

		// No look-up for cached page data, so the page is re-generated even if it is cached.
	$params['disableAcquireCacheData'] = TRUE;
}
01976 }
01977 
01978 
01979 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'])    {
01980         include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
01981 }
01982 ?>


Généré par Le spécialiste TYPO3 avec  doxygen 1.4.6