Documentation TYPO3 par Ameos

class.indexer.php

00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
00131 require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
00132 
00133 
00141 class tx_indexedsearch_indexer {
00142 
00143                 // Messages:
00144         var $reasons = array(
00145                 -1 => 'mtime matched the document, so no changes detected and no content updated',
00146                 -2 => 'The minimum age was not exceeded',
00147                 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
00148                 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
00149                 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
00150                 4 => 'Page has never been indexed (is not represented in the index_phash table).'
00151         );
00152 
00153                 // HTML code blocks to exclude from indexing:
00154         var $excludeSections = 'script,style';
00155 
00156                 // Supported Extensions for external files:
00157         var $external_parsers = array();                // External parser objects, keys are file extension names. Values are objects with certain methods.
00158 
00159                 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
00160         var $defaultGrList = '0,-1';
00161 
00162                 // Min/Max times:
00163         var $tstamp_maxAge = 0;         // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
00164         var $tstamp_minAge = 0;         // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
00165         var $maxExternalFiles = 0;      // Max number of external files to index.
00166 
00167         var $forceIndexing = FALSE;             // If true, indexing is forced despite of hashes etc.
00168         var $crawlerActive = FALSE;             // Set when crawler is detected (internal)
00169 
00170                 // INTERNALS:
00171         var $defaultContentArray=array(
00172                 'title' => '',
00173                 'description' => '',
00174                 'keywords' => '',
00175                 'body' => '',
00176         );
00177         var $wordcount = 0;
00178         var $externalFileCounter = 0;
00179 
00180         var $conf = array();            // Configuration set internally (see init functions for required keys and their meaning)
00181         var $indexerConfig = array();   // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
00182         var $hash = array();            // Hash array, contains phash and phash_grouping
00183         var $file_phash_arr = array();  // Hash array for files
00184         var $contentParts = array();    // Content of TYPO3 page
00185         var $content_md5h = '';
00186         var $internal_log = array();    // Internal log
00187         var $indexExternalUrl_content = '';
00188 
00189         var $cHashParams = array();     // cHashparams array
00190 
00191         var $freqRange = 32000;
00192         var $freqMax = 0.1;
00193 
00194                 // Objects:
00195         var $csObj;                             // Charset class object , t3lib_cs
00196         var $metaphoneObj;              // Metaphone object, if any
00197         var $lexerObj;                  // Lexer object for word splitting
00198 
00199 
00200 
/**
 * Frontend hook: decides whether the page currently held by $pObj is to be indexed and,
 * if so, copies the page state into $this->conf and runs the indexer.
 *
 * Nothing is indexed unless $pObj->config['config']['index_enable'] is set. When it is set,
 * indexing is still skipped (with a log message) if frontend indexing is disabled and no
 * crawler is active, if the page has the "no_search" flag, if $pObj->no_cache is set, or if
 * sys_language_uid differs from sys_language_content.
 *
 * @param   object      Parent object, passed by reference. Presumably the TSFE frontend object - confirm against the caller.
 * @return  void
 */
function hook_indexContent(&$pObj)
{
    // Indexer configuration from Extension Manager interface:
    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

    // Crawler activation requires: crawler extension loaded, a crawler session running
    // and re-indexing requested as processing instruction.
    $reindexRequestedByCrawler = t3lib_extMgm::isLoaded('crawler')
        && $pObj->applicationData['tx_crawler']['running']
        && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']);

    if ($reindexRequestedByCrawler) {
        // Simple log message for the crawler:
        $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

        $this->crawlerActive = TRUE;    // Crawler active flag
        $this->forceIndexing = TRUE;    // Force indexing despite timestamps etc.
    }

    // Indexing must be enabled for the page at all:
    if (!$pObj->config['config']['index_enable']) {
        return;
    }

    $this->log_push('Index page','');

    if ($indexerConfig['disableFrontendIndexing'] && !$this->crawlerActive) {
        $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
    } elseif ($pObj->page['no_search']) {
        $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
    } elseif ($pObj->no_cache) {
        $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
        $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
    } else {
        // Root line uids, keyed like the root line itself:
        $rootlineUids = array();
        foreach ($pObj->config['rootLine'] as $level => $rootlinePage) {
            $rootlineUids[$level] = $rootlinePage['uid'];
        }

        // Internal configuration, taken over from the parent object:
        $this->conf = array(
            // Information about page for which the indexing takes place
            'id' => $pObj->id,                                  // Page id
            'type' => $pObj->type,                              // Page type
            'sys_language_uid' => $pObj->sys_language_uid,      // sys_language UID of the language of the indexing.
            'MP' => $pObj->MP,                                  // MP variable, if any (Mount Points)
            'gr_list' => $pObj->gr_list,                        // Group list
            'cHash' => $pObj->cHash,                            // cHash string for additional parameters
            'cHash_array' => $pObj->cHash_array,                // Array of the additional parameters
            'crdate' => $pObj->page['crdate'],                  // The creation date of the TYPO3 page
            'page_cache_reg1' => $pObj->page_cache_reg1,        // reg1 of the caching table.
            'rootline_uids' => $rootlineUids,

            // Content of page:
            'content' => $pObj->content,                        // Content string (HTML of TYPO3 page)
            'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),  // Alternative title for indexing
            'metaCharset' => $pObj->metaCharset,                // Character set of content
            'mtime' => $pObj->register['SYS_LASTCHANGED'],      // Most recent modification time (seconds) of the content on the page.

            // Configuration of behavior:
            'index_externals' => $pObj->config['config']['index_externals'],    // Whether to index external documents
            'index_descrLgd' => $pObj->config['config']['index_descrLgd'],      // Length of description text

            // Set to zero:
            'recordUid' => 0,
            'freeIndexUid' => 0,
            'freeIndexSetId' => 0,
        );

        // Init and start indexing:
        $this->init();
        $this->indexTypo3PageContent();
    }

    $this->log_pull();
}
00282 
00283 
00284 
00285 
00286 
00287 
00288 
00289 
00290         /****************************
00291          *
00292          * Backend API
00293          *
00294          ****************************/
00295 
/**
 * Backend API: fills $this->conf for indexing outside of frontend rendering and calls init().
 *
 * The group list is fixed to '0,-1', external documents are enabled and the description
 * length is set to 200.
 *
 * @param   integer     Page id
 * @param   integer     Page type
 * @param   integer     sys_language uid of the language of the indexing
 * @param   string      MP variable, if any (Mount Points)
 * @param   array       Root line uids; stored unchanged as 'rootline_uids'
 * @param   array       Array of the additional parameters (cHash array)
 * @param   boolean     If set, the cHash string is calculated from $cHash_array with makeCHash(); otherwise it is left blank
 * @return  void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)
{
    // Information about page for which the indexing takes place:
    $this->conf = array(
        'id' => $id,
        'type' => $type,
        'sys_language_uid' => $sys_language_uid,
        'MP' => $MP,
        'gr_list' => '0,-1',    // Group list (hardcoded for now...)
    );

    // cHash values:
    if ($createCHash) {
        $this->conf['cHash'] = $this->makeCHash($cHash_array);
    } else {
        $this->conf['cHash'] = '';
    }
    $this->conf['cHash_array'] = $cHash_array;

    // Defaults:
    $this->conf['freeIndexUid'] = 0;
    $this->conf['freeIndexSetId'] = 0;
    $this->conf['page_cache_reg1'] = '';

    // Root line uids:
    $this->conf['rootline_uids'] = $uidRL;

    // Configuration of behavior:
    $this->conf['index_externals'] = 1;     // Index external documents like PDF, DOC etc. (if possible)
    $this->conf['index_descrLgd'] = 200;    // Length of description text

    // Initialize the indexer with this configuration:
    $this->init();
}
00339 
/**
 * Backend API: stores the "free index" uid and set id in $this->conf.
 *
 * @param   integer     Value stored as $this->conf['freeIndexUid']
 * @param   integer     Value stored as $this->conf['freeIndexSetId'] (default 0)
 * @return  void
 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)
{
    $this->conf['freeIndexUid']   = $freeIndexUid;
    $this->conf['freeIndexSetId'] = $freeIndexSetId;
}
00351 
/**
 * Backend API: indexes arbitrary content as if it was a TYPO3 page.
 *
 * The values are wrapped (HTML-escaped) in a minimal HTML document which is then passed
 * through indexTypo3PageContent(). $this->conf is expected to be prepared already; this
 * method only sets the content related keys.
 *
 * @param   string      Title, placed in the <title> element
 * @param   string      Keywords, placed in the "keywords" meta tag
 * @param   string      Description, placed in the "description" meta tag
 * @param   string      Content, placed in the <body> element
 * @param   string      Character set of the content; stored as 'metaCharset'
 * @param   integer     Most recent modification time (seconds) of the content
 * @param   integer     Creation date of the content
 * @param   integer     UID of the record, if applicable
 * @return  void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0)
{
    // Timestamps and record reference:
    $this->conf['mtime'] = $mtime;
    $this->conf['crdate'] = $crdate;
    $this->conf['recordUid'] = $recordUid;

    // Fake HTML document for the page parser:
    $fakeHtml = '
                <html>
                        <head>
                                <title>'.htmlspecialchars($title).'</title>
                                <meta name="keywords" content="'.htmlspecialchars($keywords).'" />
                                <meta name="description" content="'.htmlspecialchars($description).'" />
                        </head>
                        <body>
                                '.htmlspecialchars($content).'
                        </body>
                </html>';
    $this->conf['content'] = $fakeHtml;

    // Charset of the content and (blank) alternative title:
    $this->conf['metaCharset'] = $charset;
    $this->conf['indexedDocTitle'] = '';

    // Index the fake document like a regular TYPO3 page:
    $this->indexTypo3PageContent();
}
00392 
00393 
00394 
00395 
00396 
00397 
00398 
00399 
00400 
00401 
00402 
00403 
00404 
00405         /********************************
00406          *
00407          * Initialization
00408          *
00409          *******************************/
00410 
/**
 * Initializes the indexer from $this->conf (which must be set beforehand): cHash
 * parameters, page hashes, Extension Manager settings, external parsers, lexer,
 * optional metaphone object and the charset object.
 *
 * NOTE(review): $this->flagBitMask is assigned here but is not among the properties
 * declared at the top of the class as far as visible - confirm whether it is declared elsewhere.
 *
 * @return  void
 */
function init()
{
    // cHash parameters:
    $this->cHashParams = $this->conf['cHash_array'];
    if (is_array($this->cHashParams) && count($this->cHashParams)) {
        if ($this->conf['cHash']) {
            // Add this so that URL's come out right...
            $this->cHashParams['cHash'] = $this->conf['cHash'];
        }
        // encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT
        // be a part of this array!!! If it is it will be exposed in links!!!
        unset($this->cHashParams['encryptionKey']);
    }

    // Setting phash / phash_grouping which identifies the indexed page:
    $this->setT3Hashes();

    // Indexer configuration from Extension Manager interface (minAge/maxAge are multiplied by 3600):
    $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
    $this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
    $this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
    $this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
    $this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

    // External document parsers are only needed when external files are indexed:
    if ($this->conf['index_externals']) {
        $this->initializeExternalParsers();
    }

    // Lexer (class that deconstructs the text into words); falls back to the default lexer.
    // Example configuration (localconf.php): $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
    if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer']) {
        $lexerReference = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['lexer'];
    } else {
        $lexerReference = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
    }
    $this->lexerObj = &t3lib_div::getUserObj($lexerReference);
    $this->lexerObj->debug = $this->indexerConfig['debugMode'];

    // Metaphone hook, only if configured.
    // Example configuration (localconf.php): $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
    if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']) {
        $this->metaphoneObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['metaphone']);
        $this->metaphoneObj->pObj = &$this;
    }

    // Charset class:
    $this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}
00460 
/**
 * Creates the external parser objects configured in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] and registers them in
 * $this->external_parsers, keyed by file extension. A parser whose initParser() returns
 * false is removed again.
 *
 * @return  void
 */
function initializeExternalParsers()
{
    $configuredParsers = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
    if (!is_array($configuredParsers)) {
        return;
    }

    foreach ($configuredParsers as $fileExtension => $parserReference) {
        // Register the parser and give it a back-reference to this indexer:
        $this->external_parsers[$fileExtension] = &t3lib_div::getUserObj($parserReference);
        $this->external_parsers[$fileExtension]->pObj = &$this;

        // Drop the entry again if the parser cannot initialize for this extension:
        $initialized = $this->external_parsers[$fileExtension]->initParser($fileExtension);
        if (!$initialized) {
            unset($this->external_parsers[$fileExtension]);
        }
    }
}
00483 
00484 
00485 
00486 
00487 
00488 
00489 
00490 
00491 
00492 
00493 
00494 
00495 
00496 
00497 
00498         /********************************
00499          *
00500          * Indexing; TYPO3 pages (HTML content)
00501          *
00502          *******************************/
00503 
/**
 * Indexes the page content found in $this->conf['content'], if needed.
 *
 * Indexing is attempted when checkMtimeTstamp() returns a positive reason code, when the
 * gr_list is not yet set for the page hash, or when indexing is forced. If a record with the
 * same content hash already exists (and the reason is not "max-age exceeded", code 1), only
 * timestamp, set id, gr_list and rootline are updated.
 *
 * @return  void
 */
function indexTypo3PageContent()
{
    $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
    $is_grlist = $this->is_grlist_set($this->hash['phash']);

    // Nothing to do: no reason to index, gr_list already registered and not forced.
    if (!($check > 0) && $is_grlist && !$this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
        return;
    }

    // Setting message:
    if ($this->forceIndexing) {
        $this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
    } elseif ($check > 0) {
        $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
    } else {
        $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
    }

    // Divide into title,keywords,description and body:
    $this->log_push('Split content','');
    $this->contentParts = $this->splitHTMLContent($this->conf['content']);
    if ($this->conf['indexedDocTitle']) {
        $this->contentParts['title'] = $this->conf['indexedDocTitle'];
    }
    $this->log_pull();

    // Calculating a hash over what is to be the actual page content (title, description and
    // keywords included, so that a changed page title also counts as a change).
    // The separator is passed first: the legacy implode(array, separator) order is rejected by PHP 8.
    $this->content_md5h = $this->md5inthash(implode('', $this->contentParts));

    // Checks if there is already a page (with gr_list = 0,-1) indexed which has the very same contentHash.
    // If so, the page is already indexed and regardless of mtime and origContent nothing more needs to be done.
    // This also prevents re-indexing for logged in fe_users when the page content is not changed anyway.
    $checkCHash = $this->checkContentHash();
    if (is_array($checkCHash) && $check!==1) {
        $this->updateTstamp($this->hash['phash'],$this->conf['mtime']); // Update the timestamp
        $this->updateSetId($this->hash['phash']);
        $this->update_grlist($checkCHash['phash'],$this->hash['phash']);    // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
        $this->updateRootline();
        $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
        return;
    }

    $Pstart = t3lib_div::milliseconds();

    $this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
    $this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
    $this->log_pull();

    // Splitting words
    $this->log_push('Extract words from content','');
    $splitInWords = $this->processWordsInArrays($this->contentParts);
    $this->log_pull();

    // Analyse the indexed words.
    $this->log_push('Analyse the extracted words','');
    $indexArr = $this->indexAnalyze($splitInWords);
    $this->log_pull();

    // Submitting page (phash) record
    $this->log_push('Submitting page','');
    $this->submitPage();
    $this->log_pull();

    // Check words and submit to word list if not there
    $this->log_push('Check word list and submit words','');
    $this->checkWordList($indexArr);
    $this->submitWords($indexArr,$this->hash['phash']);
    $this->log_pull();

    // Set parsetime
    $this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

    // Checking external files if configured for.
    $this->log_push('Checking external files','');
    if ($this->conf['index_externals']) {
        $this->extractLinks($this->conf['content']);
    }
    $this->log_pull();
}
00588 
/**
 * Splits HTML content into title, keywords, description and body.
 *
 * The head is everything before the first "<body" (case-insensitive). If the document has
 * no "<body" at all, the whole content is treated as head and the body stays empty (earlier
 * the head became empty in that case, so title and meta tags were lost).
 *
 * @param   string      HTML content to index
 * @return  array       Array with the keys of $this->defaultContentArray: title, description, keywords, body
 */
function splitHTMLContent($content)
{
    $contentArr = $this->defaultContentArray;

    // Divide head from body. stristr() returns FALSE when there is no body tag; a negative
    // length of zero would then cut the head down to nothing, so work with explicit lengths.
    $bodyPart = stristr($content,'<body');
    if ($bodyPart === FALSE) {
        $bodyPart = '';
    }
    $headPart = substr($content, 0, strlen($content) - strlen($bodyPart));
    $contentArr['body'] = $bodyPart;

    // Get title; if it contains a colon, only the part after the first colon is used:
    $this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
    $titleParts = explode(':',$contentArr['title'],2);
    $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

    // Collect the attribute strings of all meta tags; each call consumes one tag from $headPart:
    $metaTags = array();
    while ($this->embracingTags($headPart,'meta',$dummy,$headPart,$metaParams)) {
        $metaTags[] = $metaParams;
    }

    // Get keywords and description metatags (each value is appended with a leading comma):
    foreach ($metaTags as $metaParams) {
        $attributes = t3lib_div::get_tag_attributes($metaParams);
        $metaName = isset($attributes['name']) ? $attributes['name'] : '';
        $metaContent = isset($attributes['content']) ? $attributes['content'] : '';

        if (stristr($metaName,'keywords')) {
            $contentArr['keywords'].= ','.$metaContent;
        }
        if (stristr($metaName,'description')) {
            $contentArr['description'].= ','.$metaContent;
        }
    }

    // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (ie. scripting and style stuff) in body:
    $tagList = explode(',',$this->excludeSections);
    foreach ($tagList as $tag) {
        while ($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2)) {
            // Each pass removes one section; nothing else to do.
        }
    }

    // Remove tags, but first make sure we don't concatenate words by doing it:
    $contentArr['body'] = str_replace('<',' <',$contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));

    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
00635 
/**
 * Extracts the charset from the Content-Type meta tag of an HTML document.
 *
 * Uses PCRE with the case-insensitive modifier; the former eregi() calls are no longer
 * available since PHP 7.
 *
 * @param   string      HTML content
 * @return  string      The charset value as written in the document, or NULL if no Content-Type meta tag with a charset was found
 */
function getHTMLcharset($content)
{
    // Find the <meta http-equiv="Content-Type" ...> tag first, then look for charset=... inside it:
    if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg)) {
        if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2)) {
            return $reg2[1];
        }
    }

    return NULL;
}
00649 
/**
 * Converts an HTML document to utf-8, including its entities.
 *
 * @param   string      HTML content
 * @param   string      Charset of the content; if blank it is detected with getHTMLcharset()
 * @return  string      Content converted with $this->csObj (charset conversion is skipped when the parsed charset is empty or 'utf-8')
 */
function convertHTMLToUtf8($content,$charset='')
{
    // Find charset, falling back to the one declared in the document:
    if (!$charset) {
        $charset = $this->getHTMLcharset($content);
    }
    $charset = $this->csObj->parse_charset($charset);

    // Convert charset:
    $needsConversion = $charset && $charset!=='utf-8';
    if ($needsConversion) {
        $content = $this->csObj->utf8_encode($content, $charset);
    }

    // Convert entities, assuming document is now UTF-8:
    return $this->csObj->entities_to_utf8($content, TRUE);
}
00672 
/**
 * Finds the first occurrence of a tag in a string (case-insensitive) and cuts it out.
 *
 * The start tag is matched as the prefix '<'.$tagName, so longer tag names starting with
 * $tagName match as well.
 *
 * @param   string      String to search in
 * @param   string      Tag name, eg. "script"
 * @param   string      Passed by reference: content between start and end tag; blank if there is no end tag
 * @param   string      Passed by reference: with an end tag, the input string with the whole element removed; without an end tag, only what follows the start tag (text before the tag is dropped)
 * @param   string      Passed by reference: the attribute part of the start tag (everything between the tag name and the first ">")
 * @return  boolean     Returns false if the tag was not found, otherwise true
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)
{
    $startTag = '<'.$tagName;
    $endTag = '</'.$tagName.'>';

    // stristr used because we want a case-insensitive search for the tag.
    $fromStartTag = stristr($string,$startTag);
    if (!$fromStartTag) {
        return false;
    }

    // Split the attribute list from what follows the start tag. A truncated tag without
    // a closing ">" has no remainder; use a blank string instead of reading a missing index.
    $tagParts = explode('>',substr($fromStartTag,strlen($startTag)),2);
    $paramList = $tagParts[0];
    $afterStartTag = isset($tagParts[1]) ? $tagParts[1] : '';

    $fromEndTag = stristr($afterStartTag,$endTag);
    if ($fromEndTag) {
        // Everything in front of the start tag: $fromStartTag is the tail of $string,
        // so its length gives the tag position without lowercasing the whole string.
        $stringBefore = substr($string, 0, strlen($string) - strlen($fromStartTag));
        $tagContent = substr($afterStartTag, 0, strlen($afterStartTag) - strlen($fromEndTag));
        $stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));
    } else {
        // If there was no ending tag, the tagContent is blank and anything after the tag itself is returned.
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }

    return true;
}
00705 
00712         function typoSearchTags(&$body) {
00713                 $expBody = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);
00714 
00715                 if(count($expBody)>1) {
00716                         $body = '';
00717 
00718                         foreach($expBody as $val)       {
00719                                 $part = explode('-->',$val,2);
00720                                 if(trim($part[0])=='begin') {
00721                                         $body.= $part[1];
00722                                         $prev = '';
00723                                 } elseif(trim($part[0])=='end') {
00724                                         $body.= $prev;
00725                                 } else {
00726                                         $prev = $val;
00727                                 }
00728                         }
00729                         return true;
00730                 } else {
00731                         return false;
00732                 }
00733         }
00734 
00741         function extractLinks($content) {
00742 
00743                         // Get links:
00744                 $list = $this->extractHyperLinks($content);
00745 
00746                 if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler'))    {
00747                         $this->includeCrawlerClass();
00748                         $crawler = t3lib_div::makeInstance('tx_crawler_lib');
00749                 }
00750 
00751                         // Traverse links:
00752                 foreach($list as $linkInfo)     {
00753 
00754                                 // Decode entities:
00755                         if ($linkInfo['localPath'])     {       // localPath means: This file is sent by a download script. While the indexed URL has to point to $linkInfo['href'], the absolute path to the file is specified here!
00756                                 $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['localPath']);
00757                         } else {
00758                                 $linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
00759                         }
00760 
00761                                 // Parse URL:
00762                         $qParts = parse_url($linkSource);
00763 
00764                                 // Check for jumpurl (TYPO3 specific thing...)
00765                         if ($qParts['query'] && strstr($qParts['query'],'jumpurl='))    {
00766                                 parse_str($qParts['query'],$getP);
00767                                 $linkSource = $getP['jumpurl'];
00768                                 $qParts = parse_url($linkSource);       // parse again due to new linkSource!
00769                         }
00770 
00771                         if ($qParts['scheme'])  {
00772                                 if ($this->indexerConfig['indexExternalURLs'])  {
00773                                                 // Index external URL (http or otherwise)
00774                                         $this->indexExternalUrl($linkSource);
00775                                 }
00776                         } elseif (!$qParts['query']) {
00777                                 if (t3lib_div::isAllowedAbsPath($linkSource))   {
00778                                         $localFile = $linkSource;
00779                                 } else {
00780                                         $localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
00781                                 }
00782                                 if ($localFile && @is_file($localFile)) {
00783 
00784                                                 // Index local file:
00785                                         if ($linkInfo['localPath'])     {
00786 
00787                                                 $fI = pathinfo($linkSource);
00788                                                 $ext = strtolower($fI['extension']);
00789                                                 if (is_object($crawler))        {
00790                                                         $params = array(
00791                                                                 'document' => $linkSource,
00792                                                                 'alturl' => $linkInfo['href'],
00793                                                                 'conf' => $this->conf
00794                                                         );
00795                                                         unset($params['conf']['content']);
00796 
00797                                                         $crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
00798                                                         $this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
00799                                                 } else {
00800                                                         $this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
00801                                                 }
00802                                         } else {
00803                                                 if (is_object($crawler))        {
00804                                                         $params = array(
00805                                                                 'document' => $linkSource,
00806                                                                 'conf' => $this->conf
00807                                                         );
00808                                                         unset($params['conf']['content']);
00809                                                         $crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
00810                                                         $this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
00811                                                 } else {
00812                                                         $this->indexRegularDocument($linkSource);
00813                                                 }
00814                                         }
00815                                 }
00816                         }
00817                 }
00818         }
00819 
00827         function extractHyperLinks($string)     {
00828                 if (!is_object($this->htmlParser))      {
00829                         $this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
00830                 }
00831 
00832                 $parts = $this->htmlParser->splitTags('a',$string);
00833                 $list = array();
00834                 foreach ($parts as $k => $v)    {
00835                         if ($k%2)       {
00836                                 $params = $this->htmlParser->get_tag_attributes($v,1);
00837                                 $firstTagName = $this->htmlParser->getFirstTagName($v); // The 'name' of the first tag
00838 
00839                                 switch (strtolower($firstTagName))      {
00840                                         case 'a':
00841                                                 $src = $params[0]['href'];
00842                                                 if ($src)       {
00843                                                                 // Check if a local path to that file has been set - useful if you are using a download script.
00844                                                         $md5 = t3lib_div::shortMD5($src);
00845                                                         if (is_array($indexLocalFiles=$GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']))  {
00846                                                                 $localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
00847                                                         } else $localPath=false;
00848 
00849                                                         $list[] = array(
00850                                                                 'tag' => $v,
00851                                                                 'href' => $params[0]['href'],
00852                                                                 'localPath' => $localPath
00853                                                         );
00854                                                 }
00855                                         break;
00856                                 }
00857                         }
00858                 }
00859 
00860                 return $list;
00861         }
00862 
00863 
00864 
00865 
00866 
00867 
00868 
00869 
00870 
00871 
00872 
00873         /******************************************
00874          *
00875          * Indexing; external URL
00876          *
00877          ******************************************/
00878 
00886         function indexExternalUrl($externalUrl) {
00887 
00888                         // Parse External URL:
00889                 $qParts = parse_url($externalUrl);
00890                 $fI = pathinfo($qParts['path']);
00891                 $ext = strtolower($fI['extension']);
00892 
00893                         // Get headers:
00894                 $urlHeaders = $this->getUrlHeaders($externalUrl);
00895                 if (stristr($urlHeaders['Content-Type'],'text/html'))   {
00896                         $content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
00897                         if (strlen($content))   {
00898 
00899                                         // Create temporary file:
00900                                 $tmpFile = t3lib_div::tempnam('EXTERNAL_URL').'.html';
00901                                 t3lib_div::writeFile($tmpFile, $content);
00902 
00903                                         // Index that file:
00904                                 $this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');      // Using "TRUE" for second parameter to force indexing of external URLs (mtime doesn't make sense, does it?)
00905                                 unlink($tmpFile);
00906                         }
00907                 }
00908         }
00909 
00917         function getUrlHeaders($url)    {
00918                 $content = t3lib_div::getURL($url,2);   // Try to get the headers only
00919 
00920                 if (strlen($content))   {
00921                                 // Compile headers:
00922                         $headers = t3lib_div::trimExplode(chr(10),$content,1);
00923                         $retVal = array();
00924                         foreach($headers as $line)      {
00925                                 if (!strlen(trim($line)))       {
00926                                         break;  // Stop at the first empty line (= end of header)
00927                                 }
00928 
00929                                 list($headKey, $headValue) = explode(':', $line, 2);
00930                                 $retVal[$headKey] = $headValue;
00931                         }
00932                         return $retVal;
00933                 }
00934         }
00935 
00936 
00937 
00938 
00939 
00940 
00941 
00942 
00943 
00944 
00945 
00946 
00947 
00948         /******************************************
00949          *
00950          * Indexing; external files (PDF, DOC, etc)
00951          *
00952          ******************************************/
00953 
00963         function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')        {
00964 
00965                         // Init
00966                 $fI = pathinfo($file);
00967                 $ext = $altExtension ? $altExtension : strtolower($fI['extension']);
00968 
00969                         // Create abs-path:
00970                 if (!$contentTmpFile)   {
00971                         if (!t3lib_div::isAbsPath($file))       {       // Relative, prepend PATH_site:
00972                                 $absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
00973                         } else {        // Absolute, pass-through:
00974                                 $absFile = $file;
00975                         }
00976                         $absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
00977                 } else {
00978                         $absFile = $contentTmpFile;
00979                 }
00980 
00981                         // Indexing the document:
00982                 if ($absFile && @is_file($absFile))     {
00983                         if ($this->external_parsers[$ext])      {
00984                                 $mtime = filemtime($absFile);
00985                                 $cParts = $this->fileContentParts($ext,$absFile);
00986 
00987                                 foreach($cParts as $cPKey)      {
00988                                         $this->internal_log = array();
00989                                         $this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
00990                                         $Pstart = t3lib_div::milliseconds();
00991                                         $subinfo = array('key' => $cPKey);      // Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
00992                                         $phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
00993                                         $check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
00994                                         if ($check > 0 || $force)       {
00995                                                 if ($check > 0) {
00996                                                         $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
00997                                                 } else {
00998                                                         $this->log_setTSlogMessage('Indexing forced by flag',1);
00999                                                 }
01000 
01001                                                         // Check external file counter:
01002                                                 if ($this->externalFileCounter < $this->maxExternalFiles || $force)     {
01003 
01004                                                                         // Divide into title,keywords,description and body:
01005                                                         $this->log_push('Split content','');
01006                                                                 $contentParts = $this->readFileContent($ext,$absFile,$cPKey);
01007                                                         $this->log_pull();
01008 
01009                                                         if (is_array($contentParts))    {
01010                                                                         // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
01011                                                                 $content_md5h = $this->md5inthash(implode($contentParts,''));
01012 
01013                                                                 if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force)  {
01014 
01015                                                                                 // Increment counter:
01016                                                                         $this->externalFileCounter++;
01017 
01018                                                                                 // Splitting words
01019                                                                         $this->log_push('Extract words from content','');
01020                                                                                 $splitInWords = $this->processWordsInArrays($contentParts);
01021                                                                         $this->log_pull();
01022 
01023                                                                                 // Analyse the indexed words.
01024                                                                         $this->log_push('Analyse the extracted words','');
01025                                                                                 $indexArr = $this->indexAnalyze($splitInWords);
01026                                                                         $this->log_pull();
01027 
01028                                                                                 // Submitting page (phash) record
01029                                                                         $this->log_push('Submitting page','');
01030                                                                                 $size = filesize($absFile);
01031                                                                                 $ctime = filemtime($absFile);   // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
01032                                                                                 $this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
01033                                                                         $this->log_pull();
01034 
01035                                                                                 // Check words and submit to word list if not there
01036                                                                         $this->log_push('Check word list and submit words','');
01037                                                                                 $this->checkWordList($indexArr);
01038                                                                                 $this->submitWords($indexArr,$phash_arr['phash']);
01039                                                                         $this->log_pull();
01040 
01041                                                                                 // Set parsetime
01042                                                                         $this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
01043                                                                 } else {
01044                                                                         $this->updateTstamp($phash_arr['phash'],$mtime);        // Update the timestamp
01045                                                                         $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
01046                                                                 }
01047                                                         } else $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
01048                                                 } else $this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
01049                                         } else $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
01050 
01051                                                 // Checking and setting sections:
01052                 #                       $this->submitFile_grlist($phash_arr['phash']);  // Setting a gr_list record if there is none already (set for default fe_group)
01053                                         $this->submitFile_section($phash_arr['phash']);         // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
01054                                         $this->log_pull();
01055                                 }
01056                         } else $this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
01057                 } else $this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
01058         }
01059 
01069         function readFileContent($ext,$absFile,$cPKey)  {
01070 
01071                         // Consult relevant external document parser:
01072                 if (is_object($this->external_parsers[$ext]))   {
01073                         $contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
01074                 }
01075 
01076                 return $contentArr;
01077         }
01078 
01086         function fileContentParts($ext,$absFile)        {
01087                 $cParts = array(0);
01088 
01089                         // Consult relevant external document parser:
01090                 if (is_object($this->external_parsers[$ext]))   {
01091                         $cParts = $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
01092                 }
01093 
01094                 return $cParts;
01095         }
01096 
01104         function splitRegularContent($content) {
01105                 $contentArr = $this->defaultContentArray;
01106                 $contentArr['body'] = $content;
01107 
01108                 return $contentArr;
01109         }
01110 
01111 
01112 
01113 
01114 
01115 
01116 
01117 
01118 
01119 
01120 
01121 
01122 
01123 
01124         /**********************************
01125          *
01126          * Analysing content, Extracting words
01127          *
01128          **********************************/
01129 
01137         function charsetEntity2utf8(&$contentArr, $charset)     {
01138 
01139                         // Convert charset if necessary
01140                 reset($contentArr);
01141                 while(list($key,)=each($contentArr)) {
01142                         if (strlen($contentArr[$key]))  {
01143 
01144                                 if ($charset!=='utf-8') {
01145                                         $contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
01146                                 }
01147 
01148                                         // decode all numeric / html-entities in the string to real characters:
01149                                 $contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
01150                         }
01151                 }
01152         }
01153 
01160         function processWordsInArrays($contentArr)      {
01161 
01162                         // split all parts to words
01163                 reset($contentArr);
01164                 while(list($key,)=each($contentArr)) {
01165                         $contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
01166                 }
01167 
01168                         // For title, keywords, and description we don't want duplicates:
01169                 $contentArr['title'] = array_unique($contentArr['title']);
01170                 $contentArr['keywords'] = array_unique($contentArr['keywords']);
01171                 $contentArr['description'] = array_unique($contentArr['description']);
01172 
01173                         // Return modified array:
01174                 return $contentArr;
01175         }
01176 
01185         function procesWordsInArrays($contentArr)       {
01186                 return $this->processWordsInArrays($contentArr);
01187         }
01188 
01195         function bodyDescription($contentArr)   {
01196 
01197                         // Setting description
01198                 $maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
01199                 if ($maxL)      {
01200                                 // Takes the quadruple lenght first, because whitespace and entities may be removed and thus shorten the string more yet.
01201         #               $bodyDescription = implode(' ',split('[[:space:],]+',substr(trim($contentArr['body']),0,$maxL*4)));
01202                         $bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);
01203 
01204                                 // Shorten the string:
01205                         $bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
01206                 }
01207 
01208                 return $bodyDescription;
01209         }
01210 
01217         function indexAnalyze($content) {
01218                 $indexArr = Array();
01219                 $counter = 0;
01220 
01221                 $this->analyzeHeaderinfo($indexArr,$content,'title',7);
01222                 $this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
01223                 $this->analyzeHeaderinfo($indexArr,$content,'description',5);
01224                 $this->analyzeBody($indexArr,$content);
01225 
01226                 return ($indexArr);
01227         }
01228 
01238         function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
01239                 reset($content[$key]);
01240                 while(list(,$val)=each($content[$key]))  {
01241                         $val = substr($val,0,60);       // Max 60 - because the baseword varchar IS 60. This MUST be the same.
01242                         $retArr[$val]['cmp'] = $retArr[$val]['cmp']|pow(2,$offset);
01243                         $retArr[$val]['count'] = $retArr[$val]['count']+1;
01244                         $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
01245                         $retArr[$val]['metaphone'] = $this->metaphone($val);
01246                         $this->wordcount++;
01247                 }
01248         }
01249 
01257         function analyzeBody(&$retArr,$content) {
01258                 foreach($content['body'] as $key => $val)       {
01259                         $val = substr($val,0,60);       // Max 60 - because the baseword varchar IS 60. This MUST be the same.
01260                         if(!isset($retArr[$val])) {
01261                                 $retArr[$val]['first'] = $key;
01262                                 $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
01263                                 $retArr[$val]['metaphone'] = $this->metaphone($val);
01264                         }
01265                         $retArr[$val]['count'] = $retArr[$val]['count']+1;
01266                         $this->wordcount++;
01267                 }
01268         }
01269 
01277         function metaphone($word,$retRaw=FALSE) {
01278 
01279                 if (is_object($this->metaphoneObj))     {
01280                         $tmp = $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid']);
01281                 } else {
01282                         $tmp = metaphone($word);
01283                 }
01284 
01285                         // Return raw value?
01286                 if ($retRaw)    return $tmp;
01287 
01288                         // Otherwise create hash and return integer
01289                 if($tmp=='') $ret=0; else $ret=hexdec(substr(md5($tmp),0,7));
01290                 return $ret;
01291         }
01292 
01293 
01294 
01295 
01296 
01297 
01298 
01299 
01300 
01301 
01302 
01303 
01304 
01305 
01306 
01307 
01308         /********************************
01309          *
01310          * SQL; TYPO3 Pages
01311          *
01312          *******************************/
01313 
01319         function submitPage()   {
01320 
01321                         // Remove any current data for this phash:
01322                 $this->removeOldIndexedPages($this->hash['phash']);
01323 
01324                         // setting new phash_row
01325                 $fields = array(
01326                         'phash' => $this->hash['phash'],
01327                         'phash_grouping' => $this->hash['phash_grouping'],
01328                         'cHashParams' => serialize($this->cHashParams),
01329                         'contentHash' => $this->content_md5h,
01330                         'data_page_id' => $this->conf['id'],
01331                         'data_page_reg1' => $this->conf['page_cache_reg1'],
01332                         'data_page_type' => $this->conf['type'],
01333                         'data_page_mp' => $this->conf['MP'],
01334                         'gr_list' => $this->conf['gr_list'],
01335                         'item_type' => 0,       // TYPO3 page
01336                         'item_title' => $this->contentParts['title'],
01337                         'item_description' => $this->bodyDescription($this->contentParts),
01338                         'item_mtime' => $this->conf['mtime'],
01339                         'item_size' => strlen($this->conf['content']),
01340                         'tstamp' => time(),
01341                         'crdate' => time(),
01342                         'item_crdate' => $this->conf['crdate'], // Creation date of page
01343                         'sys_language_uid' => $this->conf['sys_language_uid'],  // Sys language uid of the page. Should reflect which language it DOES actually display!
01344                         'externalUrl' => 0,
01345                         'recordUid' => intval($this->conf['recordUid']),
01346                         'freeIndexUid' => intval($this->conf['freeIndexUid']),
01347                         'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
01348                 );
01349 
01350                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
01351 
01352                         // PROCESSING index_section
01353                 $this->submit_section($this->hash['phash'],$this->hash['phash']);
01354 
01355                         // PROCESSING index_grlist
01356                 $this->submit_grlist($this->hash['phash'],$this->hash['phash']);
01357 
01358                         // PROCESSING index_fulltext
01359                 $fields = array(
01360                         'phash' => $this->hash['phash'],
01361                         'fulltextdata' => implode(' ', $this->contentParts)
01362                 );
01363                 if ($this->indexerConfig['fullTextDataLength']>0)       {
01364                         $fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
01365                 }
01366                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
01367 
01368                         // PROCESSING index_debug
01369                 if ($this->indexerConfig['debugMode'])  {
01370                         $fields = array(
01371                                 'phash' => $this->hash['phash'],
01372                                 'debuginfo' => serialize(array(
01373                                                 'cHashParams' => $this->cHashParams,
01374                                                 'external_parsers initialized' => array_keys($this->external_parsers),
01375                                                 'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
01376                                                 'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
01377                                                 'logs' => $this->internal_log,
01378                                                 'lexer' => $this->lexerObj->debugString,
01379                                         ))
01380                         );
01381                         $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
01382                 }
01383         }
01384 
01386         /**
01387          * Inserts one row into index_grlist for the current gr_list ($this->conf['gr_list']).
01388          *
01389          * @param       integer         Value stored in the "phash" column
01390          * @param       integer         Value stored in the "phash_x" column
01391          * @return      void
01392          */
01393         function submit_grlist($hash,$phash_x)  {
01394                 $grList = $this->conf['gr_list'];
01395
01396                         // The gr_list is stored both as-is and as integer hash (md5inthash):
01397                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', array(
01398                         'phash' => $hash,
01399                         'phash_x' => $phash_x,
01400                         'hash_gr_list' => $this->md5inthash($grList),
01401                         'gr_list' => $grList
01402                 ));
01403         }
01404 
01406         /**
01407          * Inserts one row into index_section for the current page ($this->conf['id']).
01408          *
01409          * @param       integer         Value stored in the "phash" column
01410          * @param       integer         Value stored in the "phash_t3" column
01411          * @return      void
01412          */
01413         function submit_section($hash,$hash_t3) {
01414                 $sectionRow = array();
01415                 $sectionRow['phash'] = $hash;
01416                 $sectionRow['phash_t3'] = $hash_t3;
01417                 $sectionRow['page_id'] = intval($this->conf['id']);
01418
01419                         // Root line columns are added to the row by reference:
01420                 $this->getRootLineFields($sectionRow);
01421
01422                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
01423         }
01424 
01425         /**
01426          * Deletes the index records stored for a TYPO3 page.
01427          *
01428          * @param       integer         phash of the page
01429          * @return      void
01430          */
01431         function removeOldIndexedPages($phash)  {
01432                 $phash = intval($phash);
01433                         // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
01434                 foreach(array('index_phash','index_section','index_grlist','index_fulltext','index_debug') as $table)      {
01435                         $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.$phash);
01436                 }
01437                         // Removing all index_section records with phash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records is done in indexRegularDocument($file).
01438                 $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.$phash);
01439         }
01440 
01441 
01442 
01443 
01444 
01445 
01446 
01447 
01448 
01449 
01450 
01451 
01452 
01453         /********************************
01454          *
01455          * SQL; External media
01456          *
01457          *******************************/
01458 
01459 
01461         /**
01462          * Stores an indexed external file in index_phash and index_fulltext (plus index_debug in debug mode), replacing rows already stored for the same phash.
01463          * @param       array           Hash array; keys "phash" and "phash_grouping" are read
01464          * @param       string          File reference; stored as data_filename and parsed with parse_url()
01465          * @param       mixed           Sub-information; stored serialized as cHashParams
01466          * @param       string          File extension; used as key into $this->external_parsers
01467          * @param       mixed           Stored as item_mtime
01468          * @param       mixed           Stored as item_crdate
01469          * @param       mixed           Stored as item_size
01470          * @param       mixed           Stored as contentHash
01471          * @param       array           Content parts; "title" and "body" are read, all parts are joined into the fulltext
01472          * @return      void
01473          */
01474         function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)      {
01475                         // Find item type. The parser object and its map entry are checked first so a missing parser falls back to the extension without reading a property of a non-object:
01476                 $storeItemType = '';
01477                 if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext]) && isset($this->external_parsers[$ext]->ext2itemtype_map[$ext]))     {
01478                         $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
01479                 }
01480                 $storeItemType = $storeItemType ? $storeItemType : $ext;
01481
01482                         // Remove any current data for this phash:
01483                 $this->removeOldIndexedFiles($hash['phash']);
01484
01485                         // Split filename. parse_url() can return FALSE; only a non-empty scheme marks the file as external:
01486                 $fileParts = parse_url($file);
01487                 $isExternalUrl = (is_array($fileParts) && !empty($fileParts['scheme'])) ? 1 : 0;
01488
01489                         // Setting new
01490                 $fields = array(
01491                         'phash' => $hash['phash'],
01492                         'phash_grouping' => $hash['phash_grouping'],
01493                         'cHashParams' => serialize($subinfo),
01494                         'contentHash' => $content_md5h,
01495                         'data_filename' => $file,
01496                         'item_type' => $storeItemType,
01497                         'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
01498                         'item_description' => $this->bodyDescription($contentParts),
01499                         'item_mtime' => $mtime,
01500                         'item_size' => $size,
01501                         'item_crdate' => $ctime,
01502                         'tstamp' => time(),
01503                         'crdate' => time(),
01504                         'gr_list' => $this->conf['gr_list'],
01505                         'externalUrl' => $isExternalUrl,
01506                         'recordUid' => intval($this->conf['recordUid']),
01507                         'freeIndexUid' => intval($this->conf['freeIndexUid']),
01508                         'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
01509                 );
01510                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
01511
01512                         // PROCESSING index_fulltext (cut to fullTextDataLength bytes when that setting is positive)
01513                 $fields = array('phash' => $hash['phash'], 'fulltextdata' => implode(' ', $contentParts));
01514                 if ($this->indexerConfig['fullTextDataLength']>0)       {
01515                         $fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
01516                 }
01517                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
01518
01519                         // PROCESSING index_debug (body is cut to 1000 bytes for the debug record)
01520                 if ($this->indexerConfig['debugMode'])  {
01521                         $fields = array(
01522                                 'phash' => $hash['phash'],
01523                                 'debuginfo' => serialize(array(
01524                                                 'cHashParams' => $subinfo,
01525                                                 'contentParts' => array_merge($contentParts,array('body' => substr($contentParts['body'],0,1000))),
01526                                                 'logs' => $this->internal_log,
01527                                                 'lexer' => $this->lexerObj->debugString,
01528                                         ))
01529                         );
01530                         $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
01531                 }
01532         }
01533 
01534         /**
01535          * Adds an index_grlist row for a file unless a row for the default gr_list or the current gr_list already exists.
01536          *
01537          * @param       integer         phash of the file
01538          * @return      void
01539          */
01540         function submitFile_grlist($hash)       {
01541                         // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
01542                 $where = 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->conf['gr_list']).')';
01543                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);
01544                 if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))   { return; }
01545                 $this->submit_grlist($hash,$hash);
01546         }
01547 
01548         /**
01549          * Adds an index_section row linking a file to the current page unless such a row already exists.
01550          *
01551          * @param       integer         phash of the file
01552          * @return      void
01553          */
01554         function submitFile_section($hash)      {
01555                         // Testing if there is a section
01556                 $where = 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']);
01557                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);
01558                 if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))   { return; }
01559                 $this->submit_section($hash,$this->hash['phash']);
01560         }
01561 
01562         /**
01563          * Deletes the index records stored for an external file (index_section is not touched here).
01564          *
01565          * @param       integer         phash of the file
01566          * @return      void
01567          */
01568         function removeOldIndexedFiles($phash)  {
01569                 $where = 'phash='.intval($phash);
01570
01571                         // Removing old registrations for tables.
01572                 foreach(array('index_phash','index_grlist','index_fulltext','index_debug') as $table)      {
01573                         $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, $where);
01574                 }
01575         }
01576 
01577 
01578 
01579 
01580 
01581 
01582 
01583 
01584 
01585 
01586 
01587 
01588 
01589 
01590         /********************************
01591          *
01592          * SQL Helper functions
01593          *
01594          *******************************/
01595 
01597         /**
01598          * Decides from the index_phash record whether a document must be (re-)indexed. When mtime matches and no max-age is configured, the record's tstamp is refreshed.
01599          *
01600          * @param       integer         mtime of the document; a false value means "not set"
01601          * @param       integer         phash of the document
01602          * @return      integer         Positive = index it (1: max-age exceeded, 2: mtime differs, 3: no mtime given, 4: never indexed). Negative = skip (-1: mtime matched, -2: min-age not exceeded)
01603          */
01604         function checkMtimeTstamp($mtime,$phash)        {
01605                         // Select indexed page:
01606                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
01607                 $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
01608
01609                 if (!$row)      {
01610                         return 4;       // Page has never been indexed (is not represented in the index_phash table).
01611                 }
01612                 if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time())     {
01613                         return 1;       // The configured max-age was exceeded for the document and thus it's indexed.
01614                 }
01615                 if ($this->tstamp_minAge && !(($row['tstamp']+$this->tstamp_minAge) < time()))  {
01616                         return -2;      // The minimum age was not exceeded
01617                 }
01618                 if (!$mtime)    {
01619                         return 3;       // The minimum age was exceeded, but mtime was not set, so the page was indexed.
01620                 }
01621                 if ($row['item_mtime'] != $mtime)       {
01622                         return 2;       // The minimum age was exceeded and mtime was set and the mtime was different, so the page was indexed.
01623                 }
01624
01625                         // mtime matched the document, so no changes detected and no content updated. The timestamp is only refreshed when no max-age is configured.
01626                 if ($this->tstamp_maxAge)       {
01627                         $this->log_setTSlogMessage('Mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
01628                 } else {
01629                         $this->updateTstamp($phash);
01630                         $this->log_setTSlogMessage('Mtime matched, timestamp updated.',1);
01631                 }
01632                 return -1;
01633         }
01634 
01636         /**
01637          * Looks up an index_phash row that has the phash_grouping and the contentHash of the current document.
01638          *
01639          * @return      mixed           The matching row (array with key "phash") if one exists, otherwise integer 1
01640          */
01641         function checkContentHash()     {
01642                         // With this query the page will only be indexed if its content is different from the same "phash_grouping" -page.
01643                 $where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
01644                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);
01645                 $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);
01646                 return $row ? $row : 1;
01647         }
01648 
01652         /**
01653          * Tells whether index_phash holds no row yet for the given grouping hash and content hash.
01654          *
01655          * @param       integer         phash_grouping value
01656          * @param       integer         Content hash value
01657          * @return      integer         0 if a row with that grouping and content hash exists, otherwise 1
01658          */
01659         function checkExternalDocContentHash($hashGr,$content_md5h)     {
01660                 $where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);
01661                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);
01662                 return $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res) ? 0 : 1;
01663         }
01664 
01665         /**
01666          * Counts the index_grlist rows stored for a phash_x value.
01667          *
01668          * @param       integer         phash_x value
01669          * @return      integer         Row count as reported by sql_num_rows(); 0 means no gr_list is set
01670          */
01671         function is_grlist_set($phash_x)        {
01672                 $where = 'phash_x='.intval($phash_x);
01673                 return $GLOBALS['TYPO3_DB']->sql_num_rows($GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', $where));
01674         }
01675 
01677         /**
01678          * Adds an index_grlist row for the current gr_list unless the phash already has one for it.
01679          *
01680          * @param       integer         phash
01681          * @param       integer         phash_x value passed on to submit_grlist()
01682          * @return      void
01683          */
01684         function update_grlist($phash,$phash_x) {
01685                 $where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);
01686                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);
01687                 if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))   { return; }
01688                 $this->submit_grlist($phash,$phash_x);
01689                 $this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
01690         }
01691 
01692         /**
01693          * Sets tstamp of an index_phash row to the current time and optionally stores a new item_mtime.
01694          *
01695          * @param       integer         phash of the row
01696          * @param       integer         If non-zero, stored as item_mtime
01697          * @return      void
01698          */
01699         function updateTstamp($phash,$mtime=0)  {
01700                 $where = 'phash='.intval($phash);
01701                 $updateFields = array('tstamp' => time());
01702                 if ($mtime)     {
01703                         $updateFields['item_mtime'] = intval($mtime);
01704                 }
01705                 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $updateFields);
01706         }
01707 
01708         /**
01709          * Stores the configured freeIndexSetId ($this->conf) in an index_phash row.
01710          *
01711          * @param       integer         phash of the row
01712          * @return      void
01713          */
01714         function updateSetId($phash)    {
01715                 $where = 'phash='.intval($phash);
01716                 $updateFields = array('freeIndexSetId' => intval($this->conf['freeIndexSetId']));
01717
01718                 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $updateFields);
01719         }
01721 
01722         /**
01723          * Stores the parse time in an index_phash row.
01724          *
01725          * @param       integer         phash of the row
01726          * @param       integer         Parse time; stored after intval()
01727          * @return      void
01728          */
01729         function updateParsetime($phash,$parsetime)     {
01730                 $where = 'phash='.intval($phash);
01731                 $updateFields = array('parsetime' => intval($parsetime));
01732
01733                 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $updateFields);
01734         }
01736 
01737         /**
01738          * Rewrites the root line columns of all index_section rows of the current page ($this->conf['id']).
01739          *
01740          * @return      void
01741          */
01742         function updateRootline()       {
01743                 $rootLineFields = array();
01744                 $this->getRootLineFields($rootLineFields);
01745
01746                 $where = 'page_id='.intval($this->conf['id']);
01747                 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineFields);
01748         }
01749 
01750         /**
01751          * Adds the root line columns to a field array: rl0, rl1 and rl2, plus any fields configured in
01752          * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'] (field name => root line level).
01753          *
01754          * @param       array           Field array, passed by reference and extended in place
01755          * @return      void
01756          */
01757         function getRootLineFields(&$fieldArr)  {
01758                 $rootLineUids = $this->conf['rootline_uids'];
01759                 for ($level = 0; $level < 3; $level++)  {
01760                         $fieldArr['rl'.$level] = intval($rootLineUids[$level]);
01761                 }
01762
01763                 $extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
01764                 if (!is_array($extraFields))    { return; }
01765                 foreach($extraFields as $fieldName => $rootLineLevel)   {
01766                         $fieldArr[$fieldName] = intval($rootLineUids[$rootLineLevel]);
01767                 }
01768         }
01769 
01770         /**
01771          * Removes index entries that hold the same content as the current page (same phash_grouping and contentHash) but were indexed under a gr_list other than the default one.
01772          *
01773          * @return      void
01774          */
01775         function removeLoginpagesWithContentHash()      {
01776                 $where = '
01777                                         A.phash=B.phash
01778                                         AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
01779                                         AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
01780                                         AND A.contentHash='.intval($this->content_md5h);
01781                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);
01782                 while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))       {
01783                         $this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$row['phash']."' are now removed.",1);
01784                         $this->removeOldIndexedPages($row['phash']);
01785                 }
01786         }
01787 
01793         function includeCrawlerClass()  {
01794                 global $TYPO3_CONF_VARS;
01795                         // Loads the crawler library class file from the "crawler" extension. A required file is evaluated in the scope of this function, so the global statement above is what makes $TYPO3_CONF_VARS visible to it (presumably needed by that file - confirm before removing).
01796                 require_once(t3lib_extMgm::extPath('crawler').'class.tx_crawler_lib.php');
01797         }
01798 
01799 
01800 
01801 
01802 
01803 
01804 
01805 
01806 
01807 
01808         /********************************
01809          *
01810          * SQL; Submitting words
01811          *
01812          *******************************/
01813 
01815         /**
01816          * Makes sure every word of a word list exists in index_words; words that are missing are inserted.
01817          *
01818          * @param       array           Word list: baseword => array with keys "hash" and "metaphone"
01819          * @return      void
01820          */
01821         function checkWordList($wl) {
01822                         // Collect the word ids. intval() keeps the IN() list purely numeric, like the other WHERE clauses of this class (the ids are presumably md5inthash() integers).
01823                 $widList = array();
01824                 foreach($wl as $wordData)       {
01825                         $widList[] = intval($wordData['hash']);
01826                 }
01827                 if (!count($widList))   {
01828                         return;
01829                 }
01830
01831                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.implode(',',$widList).')');
01832                 $foundCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);
01833                 if ($foundCount == count($wl))  {
01834                         return;         // Every word is stored already
01835                 }
01836                 $this->log_setTSlogMessage('Inserting words: '.(count($wl)-$foundCount),1);
01837
01838                         // Words that are stored already need no insert:
01839                 while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
01840                         unset($wl[$row['baseword']]);
01841                 }
01842
01843                         // foreach is used instead of each(), which is deprecated since PHP 7.2 and removed in PHP 8.0.
01844                 foreach($wl as $baseword => $wordData)  {
01845                                 // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer than 60 chars (the baseword varchar is 60 characters...) this is not a problem.
01846                         $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', array('wid' => $wordData['hash'], 'baseword' => $baseword, 'metaphone' => $wordData['metaphone']));
01847                 }
01848         }
01849 
01850         /**
01851          * Replaces the word relations (index_rel) of a document with the relations of the given word list.
01852          *
01853          * @param       array           Word list; of each entry the keys "hash", "count", "first" and "cmp" are read
01854          * @param       integer         phash of the document
01855          * @return      void
01856          */
01857         function submitWords($wl,$phash) {
01858                         // Existing relations of the phash are dropped first:
01859                 $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));
01860
01861                 foreach($wl as $wordData)       {
01862                         $relation = array();
01863                         $relation['phash'] = $phash;
01864                         $relation['wid'] = $wordData['hash'];
01865                         $relation['count'] = $wordData['count'];
01866                         $relation['first'] = $wordData['first'];
01867                         $relation['freq'] = $this->freqMap(($wordData['count']/$this->wordcount));
01868                         $relation['flags'] = ($wordData['cmp'] & $this->flagBitMask);
01869                         $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
01870                 }
01871         }
01873 
01874         /**
01875          * Maps a word frequency (submitWords() passes word count divided by total word count) to the value stored in index_rel.freq.
01876          * Values below 1 are multiplied by freqMax*100*freqRange and capped at freqRange; all other values are divided by that factor.
01877          *
01878          * @param       float           Frequency
01879          * @return      mixed           Mapped frequency (number)
01880          */
01881         function freqMap($freq) {
01882                 $mapFactor = $this->freqMax*100*$this->freqRange;
01883                 if (!($freq<1)) {
01884                         return $freq/$mapFactor;
01885                 }
01886
01887                 $scaled = $freq*$mapFactor;
01888                 return ($scaled>$this->freqRange) ? $this->freqRange : $scaled;
01889         }
01892 
01893 
01894 
01895 
01896 
01897 
01898 
01899 
01900 
01901 
01902 
01903         /********************************
01904          *
01905          * Hashing
01906          *
01907          *******************************/
01908 
01909         /**
01910          * Sets $this->hash['phash_grouping'] and $this->hash['phash'] for the current TYPO3 page.
01911          *
01912          * @return      void
01913          */
01914         function setT3Hashes()  {
01915                         // Identity of a "page": id, type, language, mountpoint and cHash parameters. Key order and value types matter because the array is serialized before hashing.
01916                 $hArray = array();
01917                 $hArray['id'] = intval($this->conf['id']);
01918                 $hArray['type'] = intval($this->conf['type']);
01919                 $hArray['sys_lang'] = intval($this->conf['sys_language_uid']);
01920                 $hArray['MP'] = strval($this->conf['MP']);
01921                 $hArray['cHash'] = $this->cHashParams;
01922
01923                         // Set grouping hash:
01924                 $this->hash['phash_grouping'] = $this->md5inthash(serialize($hArray));
01925
01926                         // Add gr_list and set plain phash (Subdivision where special page composition based on login is taken into account as well. It is expected that such pages are normally similar regardless of the login.)
01927                 $hArray['gr_list'] = strval($this->conf['gr_list']);
01928                 $this->hash['phash'] = $this->md5inthash(serialize($hArray));
01929         }
01932 
01933         /**
01934          * Calculates the hashes of an external file.
01935          *
01936          * @param       string          File reference
01937          * @param       array           Sub-information; goes into "phash" only
01938          * @return      array           Keys "phash_grouping" (hash of the file alone) and "phash" (hash of the file plus sub-information)
01939          */
01940         function setExtHashes($file,$subinfo=array())   {
01941                 $groupingInput = array('file' => $file);
01942
01943                         // The plain phash is made from the same array extended with the sub-information:
01944                 $phashInput = $groupingInput;
01945                 $phashInput['subinfo'] = $subinfo;
01946
01947                 return array(
01948                         'phash_grouping' => $this->md5inthash(serialize($groupingInput)),
01949                         'phash' => $this->md5inthash(serialize($phashInput))
01950                 );
01951         }
01956 
01957         /**
01958          * Converts a string to an integer hash: the first 7 hex digits (28 bits) of its md5 sum.
01959          *
01960          * @param       string          Input string
01961          * @return      integer         Hash value in the range 0..268435455
01962          */
01963         function md5inthash($str)       {
01964                 $hexDigits = substr(md5($str),0,7);
01965                 return hexdec($hexDigits);
01966         }
01967 
01968         /**
01969          * Calculates a cHash value for an array of parameters.
01970          *
01971          * @param       array           Parameter array
01972          * @return      mixed           Whatever t3lib_div::shortMD5() returns (presumably a short hash string)
01973          */
01974         function makeCHash($paramArray) {
01975                         // The parameters are imploded to a query string, passed through t3lib_div::cHashParams(), then serialized and hashed.
01976                 $cHashInput = t3lib_div::cHashParams(t3lib_div::implodeArrayForUrl('', $paramArray));
01977                 return t3lib_div::shortMD5(serialize($cHashInput));
01978         }
01981 
01982 
01983 
01984 
01985 
01986 
01987 
01988 
01989 
01990 
01991 
01992 
01993         /*********************************
01994          *
01995          * Internal logging functions
01996          *
01997          *********************************/
01998 
01999         /**
02000          * Pushes a message to the $GLOBALS['TT'] object; does nothing when that is not an object.
02001          * @param       string          Message, first argument of TT->push()
02002          * @param       string          Key, second argument of TT->push()
02003          * @return      void
02004          */
02005         function log_push($msg,$key)    {
02006                 if (!is_object($GLOBALS['TT']))         { return; }
02007                 $GLOBALS['TT']->push($msg,$key);
02008         }
02009 
02010         /**
02011          * Calls pull() on the $GLOBALS['TT'] object; does nothing when that is not an object.
02012          * @return      void
02013          */
02014         function log_pull()     {
02015                 if (!is_object($GLOBALS['TT']))         { return; }
02016                 $GLOBALS['TT']->pull();
02017         }
02018 
02019         /**
02020          * Sends a log message to the $GLOBALS['TT'] object (if it is an object) and always appends it to $this->internal_log.
02021          * @param       string          Message
02022          * @param       integer         Error number passed to TT->setTSlogMessage()
02023          * @return      void
02024          */
02025         function log_setTSlogMessage($msg, $errorNum=0) {
02026                 if (is_object($GLOBALS['TT']))          { $GLOBALS['TT']->setTSlogMessage($msg,$errorNum); }
02027                 $this->internal_log[] = $msg;
02028         }
02030 
02031 
02032 
02033 
02034 
02035 
02036 
02037 
02038         /**************************
02039          *
02040          * tslib_fe hooks:
02041          *
02042          **************************/
02043 
02044         /**
02045          * Hook: disables the look-up of cached page data when a running crawler session requests re-indexing, so the page is re-generated.
02046          * @param       array           Hook parameters, passed by reference; "pObj" is read and "disableAcquireCacheData" is set
02047          * @param       mixed           Not used in this method
02048          * @return      void
02049          */
02050         function fe_headerNoCache(&$params, $ref)       {
02051                         // Requirements are that the crawler is loaded and a crawler session is running...
02052                 if (!t3lib_extMgm::isLoaded('crawler') || !$params['pObj']->applicationData['tx_crawler']['running'])   {
02053                         return;
02054                 }
02055                         // ...and re-indexing is requested as processing instruction. in_array() needs an array haystack (anything else is a warning, and a TypeError in PHP 8), so the type is checked first:
02056                 $procInstructions = $params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions'];
02057                 if (!is_array($procInstructions) || !in_array('tx_indexedsearch_reindex', $procInstructions))   {
02058                         return;
02059                 }
02060                         // Setting simple log entry:
02061                 $params['pObj']->applicationData['tx_crawler']['log'][] = 'RE_CACHE (indexed), old status: '.$params['disableAcquireCacheData'];
02062                         // Disables a look-up for cached page data - thus resulting in re-generation of the page even if cached.
02063                 $params['disableAcquireCacheData'] = TRUE;
02064         }
02065 }
02066 // XCLASS inclusion: when TYPO3_MODE is defined and a file is registered for this script under
02067 // $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], that file is included once after the class definition.
02068 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'])    {
02069         include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
02070 }
02071 ?>


Généré par TYPO3 Ameos avec  doxygen 1.4.6