Documentation TYPO3 par Ameos

class.indexer.php

00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
00131 require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
00132 
00133 
00141 class tx_indexedsearch_indexer {
00142 
00143                 // Messages:
00144         var $reasons = array(
00145                 -1 => 'mtime matched the document, so no changes detected and no content updated',
00146                 -2 => 'The minimum age was not exceeded',
00147                 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
00148                 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
00149                 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
00150                 4 => 'Page has never been indexed (is not represented in the index_phash table).'
00151         );
00152 
00153                 // HTML code blocks to exclude from indexing:
00154         var $excludeSections = 'script,style';
00155 
00156                 // Supported Extensions for external files:
00157         var $external_parsers = array();                // External parser objects, keys are file extension names. Values are objects with certain methods.
00158 
00159                 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
00160         var $defaultGrList = '0,-1';
00161 
00162                 // Min/Max times:
00163         var $tstamp_maxAge = 0;         // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
00164         var $tstamp_minAge = 0;         // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
00165         var $maxExternalFiles = 0;      // Max number of external files to index.
00166 
00167         var $forceIndexing = FALSE;             // If true, indexing is forced despite of hashes etc.
00168         var $crawlerActive = FALSE;             // Set when crawler is detected (internal)
00169 
00170                 // INTERNALS:
00171         var $defaultContentArray=array(
00172                 'title' => '',
00173                 'description' => '',
00174                 'keywords' => '',
00175                 'body' => '',
00176         );
00177         var $wordcount = 0;
00178         var $externalFileCounter = 0;
00179 
00180         var $conf = array();            // Configuration set internally (see init functions for required keys and their meaning)
00181         var $indexerConfig = array();   // Indexer configuration, coming from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']
00182         var $hash = array();            // Hash array, contains phash and phash_grouping
00183         var $file_phash_arr = array();  // Hash array for files
00184         var $contentParts = array();    // Content of TYPO3 page
00185         var $content_md5h = '';
00186         var $internal_log = array();    // Internal log
00187         var $indexExternalUrl_content = '';
00188 
00189         var $cHashParams = array();     // cHashparams array
00190 
00191         var $freqRange = 32000;
00192         var $freqMax = 0.1;
00193 
00194                 // Objects:
00195         var $csObj;                             // Charset class object , t3lib_cs
00196         var $metaphoneObj;              // Metaphone object, if any
00197         var $lexerObj;                  // Lexer object for word splitting
00198 
00199 
00200 
00207         function hook_indexContent(&$pObj)      {
00208 
00209                         // Indexer configuration from Extension Manager interface:
00210                 $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
00211 
00212                         // Crawler activation:
00213                         // Requirements are that the crawler is loaded, a crawler session is running and re-indexing requested as processing instruction:
00214                 if (t3lib_extMgm::isLoaded('crawler')
00215                                 && $pObj->applicationData['tx_crawler']['running']
00216                                 && in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions']))        {
00217 
00218                                 // Setting simple log message:
00219                         $pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';
00220 
00221                                 // Setting variables:
00222                         $this->crawlerActive = TRUE;    // Crawler active flag
00223                         $this->forceIndexing = TRUE;    // Force indexing despite timestamps etc.
00224                 }
00225 
00226                         // Determine if page should be indexed, and if so, configure and initialize indexer
00227                 if ($pObj->config['config']['index_enable'])    {
00228                         $this->log_push('Index page','');
00229 
00230                         if (!$indexerConfig['disableFrontendIndexing'] || $this->crawlerActive) {
00231                                 if (!$pObj->page['no_search'])  {
00232                                         if (!$pObj->no_cache)   {
00233                                                 if (!strcmp($pObj->sys_language_uid,$pObj->sys_language_content))       {
00234 
00235                                                                 // Setting up internal configuration from config array:
00236                                                         $this->conf = array();
00237 
00238                                                                 // Information about page for which the indexing takes place
00239                                                         $this->conf['id'] = $pObj->id;                          // Page id
00240                                                         $this->conf['type'] = $pObj->type;                      // Page type
00241                                                         $this->conf['sys_language_uid'] = $pObj->sys_language_uid;      // sys_language UID of the language of the indexing.
00242                                                         $this->conf['MP'] = $pObj->MP;                          // MP variable, if any (Mount Points)
00243                                                         $this->conf['gr_list'] = $pObj->gr_list;        // Group list
00244 
00245                                                         $this->conf['cHash'] = $pObj->cHash;                                    // cHash string for additional parameters
00246                                                         $this->conf['cHash_array'] = $pObj->cHash_array;                // Array of the additional parameters
00247 
00248                                                         $this->conf['crdate'] = $pObj->page['crdate'];                  // The creation date of the TYPO3 page
00249                                                         $this->conf['page_cache_reg1'] = $pObj->page_cache_reg1;        // reg1 of the caching table. Not known what practical use this has.
00250 
00251                                                                 // Root line uids
00252                                                         $this->conf['rootline_uids'] = array();
00253                                                         foreach($pObj->config['rootLine'] as $rlkey => $rldat)  {
00254                                                                 $this->conf['rootline_uids'][$rlkey] = $rldat['uid'];
00255                                                         }
00256 
00257                                                                 // Content of page:
00258                                                         $this->conf['content'] = $pObj->content;                                        // Content string (HTML of TYPO3 page)
00259                                                         $this->conf['indexedDocTitle'] = $pObj->convOutputCharset($pObj->indexedDocTitle);      // Alternative title for indexing
00260                                                         $this->conf['metaCharset'] = $pObj->metaCharset;                        // Character set of content (will be converted to utf-8 during indexing)
00261                                                         $this->conf['mtime'] = $pObj->register['SYS_LASTCHANGED'];      // Most recent modification time (seconds) of the content on the page. Used to evaluate whether it should be re-indexed.
00262 
00263                                                                 // Configuration of behavior:
00264                                                         $this->conf['index_externals'] = $pObj->config['config']['index_externals'];    // Whether to index external documents like PDF, DOC etc. (if possible)
00265                                                         $this->conf['index_descrLgd'] = $pObj->config['config']['index_descrLgd'];              // Length of description text (max 250, default 200)
00266 
00267                                                                 // Set to zero:
00268                                                         $this->conf['recordUid'] = 0;
00269                                                         $this->conf['freeIndexUid'] = 0;
00270                                                         $this->conf['freeIndexSetId'] = 0;
00271 
00272                                                                 // Init and start indexing:
00273                                                         $this->init();
00274                                                         $this->indexTypo3PageContent();
00275                                                 } else $this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
00276                                         } else $this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
00277                                 } else $this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
00278                         } else $this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
00279                         $this->log_pull();
00280                 }
00281         }
00282 
00283 
00284 
00285 
00286 
00287 
00288 
00289 
00290         /****************************
00291          *
00292          * Backend API
00293          *
00294          ****************************/
00295 
00308         function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE)      {
00309 
00310                         // Setting up internal configuration from config array:
00311                 $this->conf = array();
00312 
00313                         // Information about page for which the indexing takes place
00314                 $this->conf['id'] = $id;                                // Page id      (integer)
00315                 $this->conf['type'] = $type;                    // Page type (integer)
00316                 $this->conf['sys_language_uid'] = $sys_language_uid;    // sys_language UID of the language of the indexing (integer)
00317                 $this->conf['MP'] = $MP;                                // MP variable, if any (Mount Points) (string)
00318                 $this->conf['gr_list'] = '0,-1';        // Group list (hardcoded for now...)
00319 
00320                         // cHash values:
00321                 $this->conf['cHash'] = $createCHash ? $this->makeCHash($cHash_array) : '';      // cHash string for additional parameters
00322                 $this->conf['cHash_array'] = $cHash_array;              // Array of the additional parameters
00323 
00324                         // Set to defaults
00325                 $this->conf['freeIndexUid'] = 0;
00326                 $this->conf['freeIndexSetId'] = 0;
00327                 $this->conf['page_cache_reg1'] = '';
00328 
00329                         // Root line uids
00330                 $this->conf['rootline_uids'] = $uidRL;
00331 
00332                         // Configuration of behavior:
00333                 $this->conf['index_externals'] = 1;     // Whether to index external documents like PDF, DOC etc. (if possible)
00334                 $this->conf['index_descrLgd'] = 200;            // Length of description text (max 250, default 200)
00335 
00336                         // Init and start indexing:
00337                 $this->init();
00338         }
00339 
00347         function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0)      {
00348                 $this->conf['freeIndexUid'] = $freeIndexUid;
00349                 $this->conf['freeIndexSetId'] = $freeIndexSetId;
00350         }
00351 
00365         function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {
00366 
00367                         // Content of page:
00368                 $this->conf['mtime'] = $mtime;                  // Most recent modification time (seconds) of the content
00369                 $this->conf['crdate'] = $crdate;                // The creation date of the TYPO3 content
00370                 $this->conf['recordUid'] = $recordUid;  // UID of the record, if applicable
00371 
00372                         // Construct fake HTML for parsing:
00373                 $this->conf['content'] = '
00374                 <html>
00375                         <head>
00376                                 <title>'.htmlspecialchars($title).'</title>
00377                                 <meta name="keywords" content="'.htmlspecialchars($keywords).'" />
00378                                 <meta name="description" content="'.htmlspecialchars($description).'" />
00379                         </head>
00380                         <body>
00381                                 '.htmlspecialchars($content).'
00382                         </body>
00383                 </html>';                                       // Content string (HTML of TYPO3 page)
00384 
00385                         // Initializing charset:
00386                 $this->conf['metaCharset'] = $charset;                  // Character set of content (will be converted to utf-8 during indexing)
00387                 $this->conf['indexedDocTitle'] = '';    // Alternative title for indexing
00388 
00389                         // Index content as if it was a TYPO3 page:
00390                 $this->indexTypo3PageContent();
00391         }
00392 
00393 
00394 
00395 
00396 
00397 
00398 
00399 
00400 
00401 
00402 
00403 
00404 
00405         /********************************
00406          *
00407          * Initialization
00408          *
00409          *******************************/
00410 
00416         function init() {
00417                 global $TYPO3_CONF_VARS;
00418 
00419                         // Initializing:
00420                 $this->cHashParams = $this->conf['cHash_array'];
00421                 if (is_array($this->cHashParams) && count($this->cHashParams))  {
00422                         if ($this->conf['cHash'])       $this->cHashParams['cHash'] = $this->conf['cHash'];     // Add this so that URL's come out right...
00423                         unset($this->cHashParams['encryptionKey']);             // encryptionKey is added inside TSFE in order to calculate the cHash value and it should NOT be a part of this array!!! If it is it will be exposed in links!!!
00424                 }
00425 
00426                         // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
00427                 $this->setT3Hashes();
00428 
00429                         // Indexer configuration from Extension Manager interface:
00430                 $this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
00431                 $this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
00432                 $this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
00433                 $this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
00434                 $this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);
00435 
00436                         // Initialize external document parsers:
00437                         // Example configuration, see ext_localconf.php of this file!
00438                 if ($this->conf['index_externals'])     {
00439                         $this->initializeExternalParsers();
00440                 }
00441 
00442                         // Initialize lexer (class that deconstructs the text into words):
00443                         // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
00444                 $lexerObjRef = $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] ?
00445                                                 $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['lexer'] :
00446                                                 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
00447                 $this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
00448                 $this->lexerObj->debug = $this->indexerConfig['debugMode'];
00449 
00450                         // Initialize metaphone hook:
00451                         // Example configuration (localconf.php) for this hook: $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone'] = 'EXT:indexed_search/class.doublemetaphone.php:&user_DoubleMetaPhone';
00452                 if ($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']) {
00453                         $this->metaphoneObj = &t3lib_div::getUserObj($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['metaphone']);
00454                         $this->metaphoneObj->pObj = &$this;
00455                 }
00456 
00457                         // Init charset class:
00458                 $this->csObj = &t3lib_div::makeInstance('t3lib_cs');
00459         }
00460 
00468         function initializeExternalParsers()    {
00469                 global $TYPO3_CONF_VARS;
00470 
00471                 if (is_array($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers']))        {
00472                         foreach($TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] as $extension => $_objRef)    {
00473                                 $this->external_parsers[$extension] = &t3lib_div::getUserObj($_objRef);
00474                                 $this->external_parsers[$extension]->pObj = &$this;
00475 
00476                                         // Init parser and if it returns false, unset its entry again:
00477                                 if (!$this->external_parsers[$extension]->initParser($extension))       {
00478                                         unset($this->external_parsers[$extension]);
00479                                 }
00480                         }
00481                 }
00482         }
00483 
00484 
00485 
00486 
00487 
00488 
00489 
00490 
00491 
00492 
00493 
00494 
00495 
00496 
00497 
00498         /********************************
00499          *
00500          * Indexing; TYPO3 pages (HTML content)
00501          *
00502          *******************************/
00503 
00509         function indexTypo3PageContent()        {
00510 
00511                 $check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
00512                 $is_grlist = $this->is_grlist_set($this->hash['phash']);
00513 
00514                 if ($check > 0 || !$is_grlist || $this->forceIndexing)  {
00515 
00516                                 // Setting message:
00517                         if ($this->forceIndexing)       {
00518                                 $this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
00519                         } elseif ($check > 0)   {
00520                                 $this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
00521                         } else {
00522                                 $this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
00523                         }
00524 
00525                                         // Divide into title,keywords,description and body:
00526                         $this->log_push('Split content','');
00527                                 $this->contentParts = $this->splitHTMLContent($this->conf['content']);
00528                                 if ($this->conf['indexedDocTitle'])     {
00529                                         $this->contentParts['title'] = $this->conf['indexedDocTitle'];
00530                                 }
00531                         $this->log_pull();
00532 
00533                                 // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
00534                         $this->content_md5h = $this->md5inthash(implode($this->contentParts,''));
00535 
00536                                 // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
00537                                 // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
00538                                 // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
00539                         $checkCHash = $this->checkContentHash();
00540                         if (!is_array($checkCHash) || $check===1)       {
00541                                 $Pstart=t3lib_div::milliseconds();
00542 
00543                                 $this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
00544                                         $this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
00545                                 $this->log_pull();
00546 
00547                                                 // Splitting words
00548                                 $this->log_push('Extract words from content','');
00549                                         $splitInWords = $this->processWordsInArrays($this->contentParts);
00550                                 $this->log_pull();
00551 
00552                                                 // Analyse the indexed words.
00553                                 $this->log_push('Analyse the extracted words','');
00554                                         $indexArr = $this->indexAnalyze($splitInWords);
00555                                 $this->log_pull();
00556 
00557                                                 // Submitting page (phash) record
00558                                 $this->log_push('Submitting page','');
00559                                         $this->submitPage();
00560                                 $this->log_pull();
00561 
00562                                                 // Check words and submit to word list if not there
00563                                 $this->log_push('Check word list and submit words','');
00564                                         $this->checkWordList($indexArr);
00565                                         $this->submitWords($indexArr,$this->hash['phash']);
00566                                 $this->log_pull();
00567 
00568                                                 // Set parsetime
00569                                 $this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);
00570 
00571                                                 // Checking external files if configured for.
00572                                 $this->log_push('Checking external files','');
00573                                 if ($this->conf['index_externals'])     {
00574                                         $this->extractLinks($this->conf['content']);
00575                                 }
00576                                 $this->log_pull();
00577                         } else {
00578                                 $this->updateTstamp($this->hash['phash'],$this->conf['mtime']); // Update the timestatmp
00579                                 $this->updateSetId($this->hash['phash']);
00580                                 $this->update_grlist($checkCHash['phash'],$this->hash['phash']);        // $checkCHash['phash'] is the phash of the result row that is similar to the current phash regarding the content hash.
00581                                 $this->updateRootline();
00582                                 $this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
00583                         }
00584                 } else {
00585                         $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
00586                 }
00587         }
00588 
00596         function splitHTMLContent($content) {
00597 
00598                         // divide head from body ( u-ouh :) )
00599                 $contentArr = $this->defaultContentArray;
00600                 $contentArr['body'] = stristr($content,'<body');
00601                 $headPart = substr($content,0,-strlen($contentArr['body']));
00602 
00603                         // get title
00604                 $this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
00605                 $titleParts = explode(':',$contentArr['title'],2);
00606                 $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
00607 
00608                         // get keywords and description metatags
00609                 for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ }
00610                 for($i=0;isset($meta[$i]);$i++) {
00611                         $meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
00612                         if(stristr($meta[$i]['name'],'keywords')) $contentArr['keywords'].=','.$meta[$i]['content'];
00613                         if(stristr($meta[$i]['name'],'description')) $contentArr['description'].=','.$meta[$i]['content'];
00614                 }
00615 
00616                         // Process <!--TYPO3SEARCH_begin--> or <!--TYPO3SEARCH_end--> tags:
00617                 $this->typoSearchTags($contentArr['body']);
00618 
00619                         // Get rid of unwanted sections (ie. scripting and style stuff) in body
00620                 $tagList = explode(',',$this->excludeSections);
00621                 foreach($tagList as $tag)       {
00622                         while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
00623                 }
00624 
00625                         // remove tags, but first make sure we don't concatenate words by doing it
00626                 $contentArr['body'] = str_replace('<',' <',$contentArr['body']);
00627                 $contentArr['body'] = trim(strip_tags($contentArr['body']));
00628 
00629                 $contentArr['keywords'] = trim($contentArr['keywords']);
00630                 $contentArr['description'] = trim($contentArr['description']);
00631 
00632                         // Return array
00633                 return $contentArr;
00634         }
00635 
00642         function getHTMLcharset($content)       {
00643                 if (eregi('<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>',$content,$reg))       {
00644                         if (eregi('charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)',$reg[0],$reg2))     {
00645                                 return $reg2[1];
00646                         }
00647                 }
00648         }
00649 
00657         function convertHTMLToUtf8($content,$charset='')        {
00658 
00659                         // Find charset:
00660                 $charset = $charset ? $charset : $this->getHTMLcharset($content);
00661                 $charset = $this->csObj->parse_charset($charset);
00662 
00663                         // Convert charset:
00664                 if ($charset && $charset!=='utf-8')     {
00665                         $content = $this->csObj->utf8_encode($content, $charset);
00666                 }
00667                         // Convert entities, assuming document is now UTF-8:
00668                 $content = $this->csObj->entities_to_utf8($content, TRUE);
00669 
00670                 return $content;
00671         }
00672 
/**
 * Finds the first occurrence of a tag (case-insensitive) in a string and
 * splits the string around it.
 *
 * When the tag is found, the three reference arguments are filled:
 *  - with a matching end tag: $tagContent is the text between start and end
 *    tag and $stringAfter is the input with the whole element cut out;
 *  - without an end tag: $tagContent is blank and $stringAfter is everything
 *    following the start tag.
 * When the tag is not found the reference arguments are left untouched.
 *
 * @param string $string String to search in
 * @param string $tagName Tag name, eg. "title"
 * @param string $tagContent Receives the content between start and end tag (by reference)
 * @param string $stringAfter Receives the remaining string (by reference)
 * @param string $paramList Receives the raw attribute string of the start tag (by reference)
 * @return boolean TRUE if the start tag was found, otherwise FALSE
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList)	{
	$endTag = '</'.$tagName.'>';
	$startTag = '<'.$tagName;

		// stristr() is used because the tag must be matched case-insensitively.
	$fromStartTag = stristr($string,$startTag);
	if (!$fromStartTag)	{
		return false;
	}

		// Split "<attributes>" from the rest. array_pad() guarantees two elements even
		// when the start tag is never closed with ">", so no undefined offset is read
		// and $stringAfter becomes an empty string instead of NULL in that case.
	$pieces = array_pad(explode('>',substr($fromStartTag,strlen($startTag)),2), 2, '');
	$paramList = $pieces[0];
	$afterStartTag = $pieces[1];

	$fromEndTag = stristr($afterStartTag,$endTag);
	if ($fromEndTag)	{
			// Everything in front of the first start tag; its length is the difference
			// between the input and the part stristr() returned.
		$stringBefore = substr($string, 0, strlen($string)-strlen($fromStartTag));
		$tagContent = substr($afterStartTag,0,strlen($afterStartTag)-strlen($fromEndTag));
		$stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));
	} else {
			// No ending tag: the content is blank and anything after the start tag is returned.
		$tagContent = '';
		$stringAfter = $afterStartTag;
	}

	return true;
}
00705 
/**
 * Reduces $body to the parts selected by the TYPO3SEARCH_begin / TYPO3SEARCH_end
 * HTML comment markers.
 *
 * The body is split at every "<!--TYPO3SEARCH_" marker. Text following a "begin"
 * marker is kept; at an "end" marker the most recent chunk that was not itself
 * introduced by a marker keyword is kept (this covers content that precedes an
 * "end" marker without a "begin" in front of it).
 *
 * @param string $body HTML body; replaced by the filtered content when markers exist (by reference)
 * @return boolean TRUE if at least one marker was found (and $body rewritten), otherwise FALSE
 */
function typoSearchTags(&$body)	{
	$chunks = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);

		// A single chunk means the body contains no marker at all:
	if (count($chunks)<=1)	{
		return false;
	}

	$body = '';
	foreach($chunks as $chunk)	{
			// Separate the marker keyword from the content that follows the comment:
		$pieces = explode('-->',$chunk,2);
		$keyword = trim($pieces[0]);

		if ($keyword=='begin')	{
			$body.= $pieces[1];
			$pending = '';
		} elseif ($keyword=='end')	{
			$body.= $pending;
		} else {
				// Not a marker keyword; remember the chunk in case an "end" marker follows.
			$pending = $chunk;
		}
	}

	return true;
}
00734 
/**
 * Extracts the hyperlinks from HTML content and indexes their targets:
 * external URLs (when "indexExternalURLs" is configured) and local files.
 * Local files are either indexed directly or, when "useCrawlerForExternalFiles"
 * is configured and the "crawler" extension is loaded, added to the crawler queue.
 *
 * @param string $content HTML content to scan for <a> tags
 * @return void
 */
function extractLinks($content)	{

		// Get links:
	$list = $this->extractHyperLinks($content);

		// Stays NULL (= index files directly) unless the crawler is configured and available:
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler'))	{
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

		// Traverse links:
	foreach($list as $linkInfo)	{

			// "localPath" means: the file is sent by a download script. The indexed URL has to
			// point to $linkInfo['href'] while the file itself is read from the local path.
		$hasLocalPath = $linkInfo['localPath'] ? TRUE : FALSE;

			// Decode entities:
		$linkSource = t3lib_div::htmlspecialchars_decode($hasLocalPath ? $linkInfo['localPath'] : $linkInfo['href']);

			// Parse URL; parse_url() may return FALSE for seriously malformed URLs:
		$qParts = parse_url($linkSource);
		if (!is_array($qParts))	{
			$qParts = array();
		}

			// Check for jumpurl (TYPO3 specific thing...). The target is only taken over when a
			// real "jumpurl" parameter exists - the substring test alone would also match
			// parameters like "myjumpurl=" and then replace the link with nothing.
		if (isset($qParts['query']) && $qParts['query'] && strstr($qParts['query'],'jumpurl='))	{
			parse_str($qParts['query'],$getP);
			if (isset($getP['jumpurl']) && is_string($getP['jumpurl']))	{
				$linkSource = $getP['jumpurl'];
				$qParts = parse_url($linkSource);	// parse again due to new linkSource!
				if (!is_array($qParts))	{
					$qParts = array();
				}
			}
		}

			// A scheme means an external URL (http or otherwise):
		if (isset($qParts['scheme']) && $qParts['scheme'])	{
			if ($this->indexerConfig['indexExternalURLs'])	{
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

			// Links without scheme but with a query string are not indexed:
		if (isset($qParts['query']) && $qParts['query'])	{
			continue;
		}

			// Resolve the link to a local file:
		if (t3lib_div::isAllowedAbsPath($linkSource))	{
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile))	{
			continue;
		}

			// Index local file:
		if (is_object($crawler))	{
				// Key order (document, alturl, conf) is the same as before.
			$params = array('document' => $linkSource);
			if ($hasLocalPath)	{
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($hasLocalPath)	{
				// Index under the public href, but read the content from the local path:
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
00819 
/**
 * Collects all <a> tags with a non-empty href attribute from an HTML string.
 *
 * Each returned entry is an array with the keys:
 *  - "tag": the tag as returned by the HTML parser
 *  - "href": value of the href attribute
 *  - "localPath": entry from $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']
 *    keyed by the short MD5 of the href; an empty string if that registry is an array
 *    without a matching entry, FALSE if the registry is not an array.
 *
 * @param string $string HTML content
 * @return array List of link information arrays
 */
function extractHyperLinks($string)	{

		// Create the HTML parser on first use:
	if (!is_object($this->htmlParser))	{
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$links = array();
	foreach ($this->htmlParser->splitTags('a',$string) as $index => $tag)	{

			// Only the odd-indexed parts are processed as tags:
		if (!($index%2))	{
			continue;
		}

		$attributes = $this->htmlParser->get_tag_attributes($tag,1);
		$tagName = $this->htmlParser->getFirstTagName($tag);	// The 'name' of the first tag
		if (strtolower($tagName)!='a')	{
			continue;
		}

		$src = $attributes[0]['href'];
		if (!$src)	{
			continue;
		}

			// Check if a local path to that file has been set - useful if you are using a download script.
		$md5 = t3lib_div::shortMD5($src);
		$indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
		if (is_array($indexLocalFiles))	{
			$localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
		} else {
			$localPath = false;
		}

		$links[] = array(
			'tag' => $tag,
			'href' => $attributes[0]['href'],
			'localPath' => $localPath
		);
	}

	return $links;
}
00862 
00863 
00864 
00865 
00866 
00867 
00868 
00869 
00870 
00871 
00872 
00873         /******************************************
00874          *
00875          * Indexing; external URL
00876          *
00877          ******************************************/
00878 
/**
 * Indexes an external URL if its headers report a "text/html" content type.
 * The content is fetched, written to a temporary file and passed to
 * indexRegularDocument() with forced indexing. The fetched content is also
 * kept in $this->indexExternalUrl_content.
 *
 * @param string $externalUrl URL, eg. "http://typo3.org/"
 * @return void
 */
function indexExternalUrl($externalUrl)	{

		// Get headers; nothing to do when they are unavailable or do not announce HTML:
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (!is_array($urlHeaders) || !isset($urlHeaders['Content-Type']) || !stristr($urlHeaders['Content-Type'],'text/html'))	{
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content))	{
		return;
	}

		// Create temporary file. NOTE(review): t3lib_div::tempnam() presumably wraps PHP's
		// tempnam(), which already creates an (empty) file with the returned name. The
		// content goes into a ".html" sibling, so BOTH files have to be removed afterwards;
		// previously only the ".html" file was.
	$tmpBase = t3lib_div::tempnam('EXTERNAL_URL');
	$tmpFile = $tmpBase.'.html';
	t3lib_div::writeFile($tmpFile, $content);

		// Index that file. Using "TRUE" for second parameter to force indexing of external
		// URLs (mtime of the temporary file carries no information).
	$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');

		// Clean up:
	if (is_file($tmpFile))	{
		unlink($tmpFile);
	}
	if ($tmpBase && is_file($tmpBase))	{
		unlink($tmpBase);
	}
}
00909 
/**
 * Fetches the headers of a URL and returns them as "name => value" pairs.
 *
 * Lines without a colon (such as the HTTP status line) are skipped, and both
 * header names and values are trimmed, so "Content-Type: text/html" yields
 * array('Content-Type' => 'text/html').
 *
 * @param string $url URL to request
 * @return mixed Array of headers, or NULL if nothing was returned for the URL
 */
function getUrlHeaders($url)	{
	$content = t3lib_div::getURL($url,2);	// Try to get the headers only

	if (!strlen($content))	{
		return NULL;
	}

		// Compile headers:
	$retVal = array();
	foreach(t3lib_div::trimExplode(chr(10),$content,1) as $line)	{
		if (!strlen(trim($line)))	{
			break;	// Stop at the first empty line (= end of header)
		}

			// Without a colon there is no name/value pair to store:
		$pair = explode(':', $line, 2);
		if (count($pair)<2)	{
			continue;
		}

		$retVal[trim($pair[0])] = trim($pair[1]);
	}

	return $retVal;
}
00935 
00936 
00937 
00938 
00939 
00940 
00941 
00942 
00943 
00944 
00945 
00946 
00947 
00948         /******************************************
00949          *
00950          * Indexing; external files (PDF, DOC, etc)
00951          *
00952          ******************************************/
00953 
/**
 * Indexes a regular document (a file handled by one of the external parsers).
 *
 * The file is split into content parts by the parser responsible for its
 * extension, and each part is indexed on its own when the mtime check (or
 * $force) calls for it. A section record is submitted for every part, also
 * when the part itself was not re-indexed.
 *
 * @param string $file Path of the document (relative to PATH_site or absolute); also the identity the hashes are calculated from
 * @param boolean $force If set, indexing happens regardless of the mtime check, the external file counter limit and the content hash
 * @param string $contentTmpFile Optional file to read the content from instead of $file (used as-is, without path checks)
 * @param string $altExtension Optional extension to select the parser by, overriding the extension of $file
 * @return void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='')	{

		// Init; a file name without extension yields an empty $ext:
	$fI = pathinfo($file);
	if ($altExtension)	{
		$ext = $altExtension;
	} else {
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

		// Create abs-path:
	if (!$contentTmpFile)	{
		if (!t3lib_div::isAbsPath($file))	{	// Relative, prepend PATH_site:
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {	// Absolute, pass-through:
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

		// Bail out when there is no readable file or no parser for the extension:
	if (!$absFile || !@is_file($absFile))	{
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext]))	{
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

		// Indexing the document:
	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach($cParts as $cPKey)	{
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);	// Setting page range. This is "0" (zero) when no division is made, otherwise a range like "1-3"
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
		if ($check > 0 || $force)	{
			if ($check > 0)	{
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

				// Check external file counter:
			if ($this->externalFileCounter < $this->maxExternalFiles || $force)	{

					// Divide into title,keywords,description and body:
				$this->log_push('Split content','');
					$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts))	{
						// Calculating a hash over what is to be the actual content.
						// implode() is called with the separator first; the reversed
						// argument order is not accepted by PHP 8 anymore.
					$content_md5h = $this->md5inthash(implode('',$contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force)	{

							// Increment counter:
						$this->externalFileCounter++;

							// Splitting words
						$this->log_push('Extract words from content','');
							$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

							// Analyse the indexed words.
						$this->log_push('Analyse the extracted words','');
							$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

							// Submitting page (phash) record
						$this->log_push('Submitting page','');
							$size = filesize($absFile);
							$ctime = filemtime($absFile);	// The original creation time of a file cannot be determined, so the modification time is used instead.
							$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

							// Check words and submit to word list if not there
						$this->log_push('Check word list and submit words','');
							$this->checkWordList($indexArr);
							$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

							// Set parsetime
						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'],$mtime);	// Update the timestamp
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else $this->log_setTSlogMessage('Could not index file! Unsupported extension.');
			} else $this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
		} else $this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);

			// Checking and setting sections:
#		$this->submitFile_grlist($phash_arr['phash']);	// Setting a gr_list record if there is none already (set for default fe_group)
		$this->submitFile_section($phash_arr['phash']);		// Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
		$this->log_pull();
	}
}
01059 
/**
 * Reads the content of one part of a file through the external parser object
 * registered for the extension.
 *
 * @param string $ext File extension, used to select the parser
 * @param string $absFile Absolute path of the file to read
 * @param mixed $cPKey Key of the content part to read (one of the values from fileContentParts())
 * @return mixed Whatever the parser's readFileContent() returns; NULL if no parser object is registered for $ext
 */
function readFileContent($ext,$absFile,$cPKey)	{

		// Without a parser object there is nothing to read. NULL is returned explicitly
		// instead of an undefined variable.
	if (!isset($this->external_parsers[$ext]) || !is_object($this->external_parsers[$ext]))	{
		return NULL;
	}

		// Consult relevant external document parser:
	return $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
}
01078 
/**
 * Returns the list of content part keys a file is divided into, as reported
 * by the external parser object registered for the extension.
 *
 * @param string $ext File extension, used to select the parser
 * @param string $absFile Absolute path of the file
 * @return array The parser's result, or array(0) (a single, undivided part) when no parser object is registered
 */
function fileContentParts($ext,$absFile)	{

		// No parser object: the file counts as one undivided part.
	if (!is_object($this->external_parsers[$ext]))	{
		return array(0);
	}

		// Consult relevant external document parser:
	return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
}
01096 
/**
 * Wraps plain content in a content array: a copy of $this->defaultContentArray
 * with the content stored under the "body" key.
 *
 * @param string $content Content to use as body
 * @return array Content array with "body" set to $content
 */
function splitRegularContent($content)	{
		// Start from the default structure and put the content into its body slot:
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;

	return $parts;
}
01110 
01111 
01112 
01113 
01114 
01115 
01116 
01117 
01118 
01119 
01120 
01121 
01122 
01123 
01124         /**********************************
01125          *
01126          * Analysing content, Extracting words
01127          *
01128          **********************************/
01129 
/**
 * Converts every non-empty element of the content array to UTF-8 and decodes
 * its numeric / HTML entities into real characters. The array is modified in place.
 *
 * @param array $contentArr Content array, each element a string (by reference)
 * @param string $charset Charset of the content; conversion is skipped when it is "utf-8"
 * @return void
 */
function charsetEntity2utf8(&$contentArr, $charset)	{

		// Iterate over the keys with foreach; each() is not available in PHP 8 anymore.
	foreach (array_keys($contentArr) as $key)	{
		if (!strlen($contentArr[$key]))	{
			continue;
		}

			// Convert charset if necessary:
		if ($charset!=='utf-8')	{
			$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
		}

			// Decode all numeric / html-entities in the string to real characters:
		$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
	}
}
01153 
/**
 * Splits every element of the content array into words with the lexer object
 * and removes duplicate words from the "title", "keywords" and "description" elements.
 *
 * @param array $contentArr Content array, each element a string
 * @return array The content array with each element replaced by the lexer's split2Words() result
 */
function processWordsInArrays($contentArr)	{

		// Split all parts to words (foreach instead of each(), which PHP 8 no longer provides):
	foreach (array_keys($contentArr) as $key)	{
		$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
	}

		// For title, keywords, and description we don't want duplicates:
	$contentArr['title'] = array_unique($contentArr['title']);
	$contentArr['keywords'] = array_unique($contentArr['keywords']);
	$contentArr['description'] = array_unique($contentArr['description']);

		// Return modified array:
	return $contentArr;
}
01176 
/**
 * Alias of processWordsInArrays() under a differently spelled name
 * ("proces" with a single "s"); simply forwards the call.
 *
 * @param array $contentArr Content array, each element a string
 * @return array Result of processWordsInArrays()
 */
function procesWordsInArrays($contentArr)	{
	$wordArrays = $this->processWordsInArrays($contentArr);

	return $wordArrays;
}
01188 
/**
 * Builds the description of a document from its body: tabs, carriage returns
 * and line feeds are replaced by spaces and the result is truncated to the
 * configured length ("index_descrLgd", limited to the range 0-255 with 200 as
 * the value passed for the default).
 *
 * @param array $contentArr Content array; the "body" element is used
 * @return string The description; an empty string when the resulting maximum length is zero
 */
function bodyDescription($contentArr)	{

		// Defined up front so a zero maximum length returns '' rather than an undefined variable:
	$bodyDescription = '';

		// Setting description
	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL)	{
			// Flatten whitespace characters to plain spaces:
		$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

			// Shorten the string:
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
01210 
/**
 * Builds the word index array for a document from its word lists.
 * Title, keywords and description words are registered with the bit offsets
 * 7, 6 and 5 respectively; body words are added afterwards.
 *
 * @param array $content Content array with the elements "title", "keywords", "description" and "body", each a list of words
 * @return array Index array, keyed by word
 */
function indexAnalyze($content)	{
	$indexArr = array();

		// Header fields first, so body analysis finds their entries already present:
	$this->analyzeHeaderinfo($indexArr,$content,'title',7);
	$this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
	$this->analyzeHeaderinfo($indexArr,$content,'description',5);
	$this->analyzeBody($indexArr,$content);

	return $indexArr;
}
01228 
/**
 * Registers the words of one header field (title, keywords or description)
 * in the index array. For every word the bit 2^$offset is set in "cmp",
 * "count" is incremented and "hash" and "metaphone" are stored.
 * $this->wordcount is incremented per word.
 *
 * @param array $retArr Index array, keyed by word (by reference)
 * @param array $content Content array holding the word lists
 * @param string $key Element of $content to process, eg. "title"
 * @param integer $offset Bit position that marks this field in the "cmp" value
 * @return void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset)	{

		// The flag is the same for every word of the field:
	$flag = (int)pow(2,$offset);

		// foreach instead of each(), which PHP 8 no longer provides:
	foreach ($content[$key] as $val)	{
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

			// Start new words at zero instead of reading undefined array keys:
		if (!isset($retArr[$val]['cmp']))	{
			$retArr[$val]['cmp'] = 0;
		}
		if (!isset($retArr[$val]['count']))	{
			$retArr[$val]['count'] = 0;
		}

		$retArr[$val]['cmp'] = $retArr[$val]['cmp']|$flag;
		$retArr[$val]['count'] = $retArr[$val]['count']+1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
01249 
/**
 * Registers the body words in the index array. A word not seen before gets
 * "first" (its key in the body word list), "hash" and "metaphone"; for every
 * occurrence "count" is incremented. $this->wordcount is incremented per word.
 *
 * @param array $retArr Index array, keyed by word (by reference)
 * @param array $content Content array; the "body" element is the list of words
 * @return void
 */
function analyzeBody(&$retArr,$content)	{
	foreach($content['body'] as $key => $val)	{
		$val = substr($val,0,60);	// Max 60 - because the baseword varchar IS 60. This MUST be the same.

			// Words already registered (eg. from a header field) keep their existing entry:
		if (!isset($retArr[$val]))	{
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}

			// Start the counter at zero instead of reading an undefined array key:
		if (!isset($retArr[$val]['count']))	{
			$retArr[$val]['count'] = 0;
		}
		$retArr[$val]['count'] = $retArr[$val]['count']+1;
		$this->wordcount++;
	}
}
01269 
/**
 * Calculates the metaphone value of a word, either with the configured
 * metaphone object (which also receives the "sys_language_uid" setting) or
 * with PHP's metaphone() function.
 *
 * @param string $word Word to process
 * @param boolean $retRaw If set, the raw metaphone value is returned instead of its hash
 * @return mixed Raw metaphone value if $retRaw is set; otherwise an integer: 0 for an empty metaphone value, else the first 7 hex digits of its MD5 as a number
 */
function metaphone($word,$retRaw=FALSE)	{

	$raw = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

		// Return raw value?
	if ($retRaw)	{
		return $raw;
	}

		// Otherwise create hash and return integer
	if ($raw=='')	{
		return 0;
	}

	return hexdec(substr(md5($raw),0,7));
}
01292 
01293 
01294 
01295 
01296 
01297 
01298 
01299 
01300 
01301 
01302 
01303 
01304 
01305 
01306 
01307 
01308         /********************************
01309          *
01310          * SQL; TYPO3 Pages
01311          *
01312          *******************************/
01313 
/**
 * Writes the index records for the current TYPO3 page: removes old data for
 * the page hash, then inserts the index_phash row, the section and gr_list
 * records, the index_fulltext row and - in debug mode - an index_debug row.
 *
 * @return void
 */
function submitPage()	{

		// Remove any current data for this phash:
	$this->removeOldIndexedPages($this->hash['phash']);

		// Setting new phash_row
	$phashRow = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,	// TYPO3 page
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => time(),
		'crdate' => time(),
		'item_crdate' => $this->conf['crdate'],	// Creation date of page
		'sys_language_uid' => $this->conf['sys_language_uid'],	// Sys language uid of the page. Should reflect which language it DOES actually display!
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

		// PROCESSING index_section
	$this->submit_section($this->hash['phash'],$this->hash['phash']);

		// PROCESSING index_grlist
	$this->submit_grlist($this->hash['phash'],$this->hash['phash']);

		// PROCESSING index_fulltext; the text is cut to the configured length, if any:
	$fullText = implode(' ', $this->contentParts);
	if ($this->indexerConfig['fullTextDataLength']>0)	{
		$fullText = substr($fullText,0,$this->indexerConfig['fullTextDataLength']);
	}
	$fulltextRow = array(
		'phash' => $this->hash['phash'],
		'fulltextdata' => $fullText
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fulltextRow);

		// PROCESSING index_debug
	if (!$this->indexerConfig['debugMode'])	{
		return;
	}

		// Page content and body are limited to their first 1000 bytes in the debug record:
	$debugInfo = array(
		'cHashParams' => $this->cHashParams,
		'external_parsers initialized' => array_keys($this->external_parsers),
		'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
		'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
		'logs' => $this->internal_log,
		'lexer' => $this->lexerObj->debugString,
	);
	$debugRow = array(
		'phash' => $this->hash['phash'],
		'debuginfo' => serialize($debugInfo)
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $debugRow);
}
01384 
01393         function