Documentation TYPO3 par Ameos

class.indexer.php

00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
00108 require_once(PATH_t3lib.'class.t3lib_htmlmail.php');
00109 
00110 
00118 class tx_indexedsearch_indexer {
00119 
00120                 // Messages: log texts looked up as $this->reasons[$check]. Positive codes are logged as "Indexing needed", the others as "Indexing not needed".
00121         var $reasons = array(
00122                 -1 => 'mtime matched the document, so no changes detected and no content updated',
00123                 -2 => 'The minimum age was not exceeded',
00124                 1 => "The configured max-age was exceeded for the document and thus it's indexed.",
00125                 2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
00126                 3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
00127                 4 => 'Page has never been indexed (is not represented in the index_phash table).'
00128         );
                // NOTE(review): the second string is not the exact lower-case counterpart of the first (e.g. "Ä" is paired with "â", not "ä").
                // Presumably used for case folding - confirm the intended mapping before changing it, since already indexed words may depend on it.
00129         var $convChars=array(
00130                 'ÁÉÚÍÄËÜÖÏÆØÅ',
00131                 'áéúíâêûôîæøå'
00132         );
00133 
00134                 // HTML code blocks to exclude from indexing (comma list of tag names, removed from the body in splitHTMLContent()):
00135         var $excludeSections = 'script,style';
00136 
00137                 // Supported Extensions for external files (extension => 1 if enabled; 'pdf' and 'doc' are reset in initExternalReaders()):
00138         var $supportedExtensions = array(
00139                         'pdf' => 1,
00140                         'doc' => 1,
00141                         'txt' => 1,
00142                         'html' => 1,
00143                         'htm' => 1
00144                 );
00145 
00146                 // This value is also overridden from config (see initExternalReaders(), which limits it to -100..100).
00147         var $pdf_mode = -20;    // Zero: the whole PDF file is indexed in one. Positive value: number of pages at a time, eg. "5" would mean 1-5,6-10,.... Negative integer: (abs value) number of groups, eg. "-3" for a 10 page document would give 1-4,5-8,9-10
00148 
00149                 // This array is reset and configured in initialization (initExternalReaders()); the paths here are only class defaults:
00150         var $app = array(
00151                 'pdftotext' => '/usr/local/bin/pdftotext',
00152                 'pdfinfo' => '/usr/local/bin/pdfinfo',
00153                 'catdoc' => '/usr/local/bin/catdoc'
00154         );
00155 
00156                 // Fe-group list (pages might be indexed separately for each usergroup combination to support search in access limited pages!)
00157         var $defaultGrList='0,-1';
00158 
00159                 // Min/Max times:
00160         var $tstamp_maxAge = 0;         // If set, this tells a number of seconds that is the maximum age of an indexed document. Regardless of mtime the document will be re-indexed if this limit is exceeded.
00161         var $tstamp_minAge = 0;         // If set, this tells a minimum limit before a document can be indexed again. This is regardless of mtime.
00162 
00163                 // INTERNALS:
                // Template for the content parts array returned by splitHTMLContent() / splitRegularContent():
00164         var $defaultContentArray=array(
00165                 'title' => '',
00166                 'description' => '',
00167                 'keywords' => '',
00168                 'body' => '',
00169         );
00170         var $wordcount = 0;
                // File extension => numeric type code. NOTE(review): not referenced in the visible part of the class; meaning of the codes to be confirmed.
00171         var $Itypes = array(
00172                 'html' => 1,
00173                 'htm' => 1,
00174                 'pdf' => 2,
00175                 'doc' => 3,
00176                 'txt' => 4
00177         );
00178         var $conf = array();    // Configuration set internally (filled in init())
00179         var $hash = array();    // Hash array, contains phash and phash_grouping
00180         var $contentParts = array();    // Content parts of the current page, set in indexTypo3PageContent()
00181         var $pObj = '';                         // Parent object, reference to global TSFE
00182         var $content_md5h = '';                 // Hash over the content parts of the current page, set in indexTypo3PageContent()
00183 
00184         var $cHashParams = array();     // cHashparams array
00185         var $mtime = 0;                         // If set, then the mtime of the document must be different in order to be indexed.
00186         var $rootLine = array();        // Root line from TSFE
00187 
00188         var $freqRange = 65000;
00189         var $freqMax = 0.1;
00190 
00191 
00192 
00193 
00200         function hook_indexContent(&$pObj)      {
00201 
00202                 if ($pObj->config['config']['index_enable'])    {
00203                         if (!$pObj->no_cache)   {
00204                                 $GLOBALS['TT']->push('Index page','');
00205 
00206                                                 // Setting parent object:
00207                                         $this->pObj = &$pObj;
00208 
00209                                                 // Init and start indexing:
00210                                         $this->init();
00211                                         $this->indexTypo3PageContent();
00212                                 $GLOBALS['TT']->pull();
00213                         } else {
00214                                 $GLOBALS['TT']->push('Index page','');
00215                                 $GLOBALS['TT']->setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
00216                                 $GLOBALS['TT']->pull();
00217                         }
00218                 }
00219         }
00220 
00221 
00222 
00223 
00224 
00225 
00226 
00227 
00228 
00229 
00230 
00231         /********************************
00232          *
00233          * Initialization
00234          *
00235          *******************************/
00236 
00242         function init() {
00243 
00244                         // Initializing:
00245                 $this->cHashParams = $this->pObj->cHash_array;
00246                 if (is_array($this->cHashParams) && count($this->cHashParams))  {
00247                         $this->cHashParams['cHash'] = $this->pObj->cHash;       // Add this so that URL's come out right...
00248                 }
00249 
00250                         // Modification time of page and root line transferred:
00251                 $this->mtime = $this->pObj->register['SYS_LASTCHANGED'];
00252                 $this->rootLine = $this->pObj->config['rootLine'];
00253 
00254                         // Setting up internal configuration from config array:
00255                 $this->conf = array();
00256                 $this->conf['index_externals'] = $this->pObj->config['config']['index_externals'];
00257                 $this->conf['index_descrLgd'] = $this->pObj->config['config']['index_descrLgd'];
00258 
00259                         // Setting phash / phash_grouping which identifies the indexed page based on some of these variables:
00260                 $this->setT3Hashes();
00261 
00262                         // Initialize tools for reading PDF and Word documents:
00263                 $this->initExternalReaders();
00264         }
00265 
00271         function initExternalReaders()  {
00272                         // PDF + WORD tools:
00273                         // First reset the class default settings (disabling)
00274                 $this->app = array();
00275                 $this->supportedExtensions['pdf'] = 0;
00276                 $this->supportedExtensions['doc'] = 0;
00277 
00278                         // Then read indexer-config and set if appropriate:
00279                 $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
00280 
00281                         // PDF
00282                 if ($indexerConfig['pdftools']) {
00283                         $pdfPath = ereg_replace("\/$",'',$indexerConfig['pdftools']).'/';
00284                         if ((ini_get('safe_mode') && $pdfPath) || (@is_file($pdfPath.'pdftotext') && @is_file($pdfPath.'pdfinfo')))     {
00285                                 $this->app['pdfinfo'] = $pdfPath.'pdfinfo';
00286                                 $this->app['pdftotext'] = $pdfPath.'pdftotext';
00287                                 $this->supportedExtensions['pdf'] = 1;
00288                         } else $GLOBALS['TT']->setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
00289                 } else $GLOBALS['TT']->setTSlogMessage('PDF tools disabled',1);
00290 
00291                         // Catdoc
00292                 if ($indexerConfig['catdoc'])   {
00293                         $catdocPath = ereg_replace("\/$",'',$indexerConfig['catdoc']).'/';
00294                         if (is_file($catdocPath.'catdoc'))      {
00295                                 $this->app['catdoc'] = $catdocPath.'catdoc';
00296                                 $this->supportedExtensions['doc'] = 1;
00297                         } else $GLOBALS['TT']->setTSlogMessage("'catdoc' tool for reading Word-files was not found in paths '".$catdocPath."catdoc'",3);
00298                 } else $GLOBALS['TT']->setTSlogMessage('catdoc tools (Word-files) disabled',1);
00299 
00300                         // PDF mode:
00301                 $this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
00302         }
00303 
00304 
00305 
00306 
00307 
00308 
00309 
00310 
00311 
00312 
00313 
00314         /********************************
00315          *
00316          * Indexing
00317          *
00318          *******************************/
00319 
00325         function indexTypo3PageContent()        {
00326 
00327                 $check = $this->checkMtimeTstamp($this->mtime, $this->tstamp_maxAge, $this->tstamp_minAge, $this->hash['phash']);
00328 # WHAT IS THIS? Test that it works...           $is_grlist = $this->is_grlist_set($phash_x);    // Use $this->hash['phash']?
00329 
00330                 if ($check > 0 || !$is_grlist)  {
00331 
00332                                 // Setting message:
00333                         if ($check > 0) {
00334                                 $GLOBALS['TT']->setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
00335                         } else {
00336                                 $GLOBALS['TT']->setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
00337                         }
00338 
00339                                         // Divide into title,keywords,description and body:
00340                         $GLOBALS['TT']->push('Split content','');
00341                                 $this->contentParts = $this->splitHTMLContent($this->pObj->content);
00342                                 if ($this->pObj->indexedDocTitle)       $this->contentParts['title'] = $this->pObj->indexedDocTitle;
00343                         $GLOBALS['TT']->pull();
00344 
00345                                 // Calculating a hash over what is to be the actual page content. Maybe this hash should not include title,description and keywords? The bodytext is the primary concern. (on the other hand a changed page-title would make no difference then, so dont!)
00346                         $this->content_md5h = $this->md5inthash(implode($this->contentParts,''));
00347                                 // This function checks if there is already a page (with gr_list = 0,-1) indexed and if that page has the very same contentHash.
00348                                 // If the contentHash is the same, then we can rest assured that this page is already indexed and regardless of mtime and origContent we don't need to do anything more.
00349                                 // This will also prevent pages from being indexed if a fe_users has logged in and it turns out that the page content is not changed anyway. fe_users logged in should always search with hash_gr_list = "0,-1" OR "[their_group_list]". This situation will be prevented only if the page has been indexed with no user login on before hand. Else the page will be indexed by users until that event. However that does not present a serious problem.
00350                         $checkCHash = $this->checkContentHash();
00351                         if (!is_array($checkCHash))     {
00352                                 $Pstart=t3lib_div::milliseconds();
00353                                                 // Splitting words
00354                                 $GLOBALS['TT']->push('Extract words from content','');
00355                                         $splitInWords = $this->procesWordsInArrays($this->contentParts);
00356                                 $GLOBALS['TT']->pull();
00357 
00358                                                 // Analyse the indexed words.
00359                                 $GLOBALS['TT']->push('Analyse the extracted words','');
00360                                         $indexArr = $this->indexAnalyze($splitInWords);
00361                                 $GLOBALS['TT']->pull();
00362 
00363                                                 // Submitting page (phash) record
00364                                 $GLOBALS['TT']->push('Submitting page','');
00365                                         $this->submitPage();
00366                                 $GLOBALS['TT']->pull();
00367 
00368                                                 // Check words and submit to word list if not there
00369                                 $GLOBALS['TT']->push('Check word list and submit words','');
00370                                         $this->checkWordList($indexArr);
00371                                         $this->submitWords($indexArr,$this->hash['phash']);
00372                                 $GLOBALS['TT']->pull();
00373 
00374                                                 // Set parsetime
00375                                 $this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);
00376 
00377                                                 // Checking external files if configured for.
00378                                 $GLOBALS['TT']->push('Checking external files','');
00379                                 if ($this->conf['index_externals'])     {
00380                                         $this->extractLinks($this->pObj->content);
00381                                 }
00382                                 $GLOBALS['TT']->pull();
00383                         } else {
00384                                 $this->updateTstamp($this->hash['phash'],$this->mtime); // Update the timestatmp
00385                                 $this->update_grlist($checkCHash['phash'],$this->hash['phash']);
00386                                 $this->updateRootline();
00387                                 $GLOBALS['TT']->setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
00388                         }
00389                 } else {
00390                         $GLOBALS['TT']->setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
00391                 }
00392         }
00393 
00400         function splitHTMLContent($content) {
00401 
00402                 # divide head from body ( u-ouh :) )
00403 
00404                 $contentArr=$this->defaultContentArray;
00405                 $contentArr['body'] = stristr($content,'<body');
00406                 $headPart = substr($content,0,-strlen($contentArr['body']));
00407 
00408                 # get title
00409                 $this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
00410                 $titleParts = explode(':',$contentArr['title'],2);
00411                 $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);
00412 
00413                 # get keywords and description metatags
00414                 for($i=0;$this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]);$i++) { /*nothing*/ }
00415                 for($i=0;isset($meta[$i]);$i++) {
00416                         $meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
00417                         if(stristr($meta[$i]['name'],'keywords')) $contentArr['keywords'].=','.$meta[$i]['content'];
00418                         if(stristr($meta[$i]['name'],'description')) $contentArr['description'].=','.$meta[$i]['content'];
00419                 }
00420 
00421                 $this->typoSearchTags($contentArr['body']);
00422 
00423                 # get rid of unwanted sections (ie. scripting and style stuff) in body
00424                 $tagList = explode(',',$this->excludeSections);
00425                 reset($tagList);
00426                 while(list(,$tag)=each($tagList)) {
00427                         while($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
00428                 }
00429 
00430                 # remove tags, but first make sure we don't concatenate words by doing it
00431                 $contentArr['body'] = str_replace('<',' <',$contentArr['body']);
00432                 $contentArr['body'] = trim(strip_tags($contentArr['body']));
00433 
00434                 $contentArr['keywords'] = trim($contentArr['keywords']);
00435                 $contentArr['description'] = trim($contentArr['description']);
00436                 # ta-dah!
00437                 return $contentArr;
00438         }
00439 
00446         function splitRegularContent($content) {
00447                 $contentArr = $this->defaultContentArray;
00448                 $contentArr['body'] = $content;
00449 
00450                 return $contentArr;
00451         }
00452 
00459         function procesWordsInArrays($contentArr)       {
00460 
00461                 # split all parts to words
00462                 reset($contentArr);
00463                 while(list($key,)=each($contentArr)) {
00464                         if (function_exists('html_entity_decode'))              $contentArr[$key] = html_entity_decode($contentArr[$key]);
00465                         $contentArr[$key] = $this->strtolower_all($contentArr[$key]);
00466                         $this->split2words($contentArr[$key]);
00467                 }
00468 
00469                 # for title, keywords, and description we don't want duplicates
00470                 $contentArr['title'] = array_unique($contentArr['title']);
00471                 $contentArr['keywords'] = array_unique($contentArr['keywords']);
00472                 $contentArr['description'] = array_unique($contentArr['description']);
00473                 return $contentArr;
00474         }
00475 
00482         function bodyDescription($contentArr)   {
00483                 # Setting description
00484                 $maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
00485                 if ($maxL)      {
00486                         if (function_exists('html_entity_decode'))              $bodyDescription = html_entity_decode(trim($contentArr['body']));
00487                         $bodyDescription = implode(' ',split('[[:space:],]+',substr($bodyDescription,0,$maxL*2)));      // Takes the double lenght first, because whitespace may be removed and thus shorten the string more yet.
00488                         $bodyDescription=substr($bodyDescription,0,$maxL);
00489                 }
00490                 return $bodyDescription;
00491         }
00492 
00499         function extractLinks($content) {
00500                 $extract = t3lib_div::makeInstance('t3lib_htmlmail');
00501                 $extract->extractHtmlInit($content,'');
00502                 $extract->extractHyperLinks();
00503 #debug($extract->theParts['html']['hrefs']);
00504                 if (is_array($extract->theParts['html']['hrefs']))      {
00505                         reset($extract->theParts['html']['hrefs']);
00506                         while(list(,$linkInfo)=each($extract->theParts['html']['hrefs']))       {
00507                                 $linkInfo['ref'] = t3lib_div::htmlspecialchars_decode($linkInfo['ref']);
00508 #debug($linkInfo['ref'],1);
00509                                 if (strstr($linkInfo['ref'],'?') && strstr($linkInfo['ref'],'jumpurl='))        {
00510                                         $qParts = parse_url($linkInfo['ref']);
00511 #debug($qParts);
00512                                         $theJumpurlFile = $this->getJumpurl($qParts['query']);
00513 //                                      debug($theJumpurlFile);
00514                                         if ($theJumpurlFile && @is_file($theJumpurlFile))       {
00515         //                                      debug($theJumpurlFile);
00516                                                 $this->indexRegularDocument($theJumpurlFile);
00517                                         }
00518                                 } elseif (@is_file($linkInfo['ref']))   {
00519                                         $this->indexRegularDocument($linkInfo['ref']);
00520                                 }
00521                         }
00522                 }
00523         }
00524 
00531         function getJumpurl($query)     {
00532                 $res = parse_str($query);
00533 #               debug(array($res),'getJumpurl');
00534 
00535                 return $jumpurl;
00536         }
00537 
00544         function splitPdfInfo($pdfInfoArray)    {
00545                 $res = array();
00546                 if (is_array($pdfInfoArray))    {
00547                         reset($pdfInfoArray);
00548                         while(list(,$line)=each($pdfInfoArray)) {
00549                                 $parts = explode(':',$line,2);
00550                                 if (count($parts)>1 && trim($parts[0])) {
00551                                         $res[strtolower(trim($parts[0]))] = trim($parts[1]);
00552                                 }
00553                         }
00554                 }
00555                 return $res;
00556         }
00557 
00564         function indexRegularDocument($file)    {
00565                         // init
00566                 $fI=pathinfo($file);
00567                 $ext = strtolower($fI['extension']);
00568                 $absFile = PATH_site.$file;
00569 #debug($file);
00570                         //
00571                 if (@is_file($absFile) && $this->supportedExtensions[$ext])     {
00572                         $mtime = filemtime($absFile);
00573                         $cParts = $this->fileContentParts($ext,$absFile);
00574 //                      debug($cParts);
00575                         reset($cParts);
00576                         while(list(,$cPKey)=each($cParts))      {
00577                                 $GLOBALS['TT']->push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
00578                                 $Pstart = t3lib_div::milliseconds();
00579                                 $subinfo=array('key'=>$cPKey);
00580                                 $phash_arr = $this->setExtHashes($file,$subinfo);
00581 //                              debug($phash_arr);
00582 
00583                                 $check = $this->checkMtimeTstamp($mtime, $this->tstamp_maxAge, $this->tstamp_minAge, $phash_arr['phash']);
00584                                 if ($check > 0) {
00585                                         $GLOBALS['TT']->setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
00586                                                         // Divide into title,keywords,description and body:
00587                                         $GLOBALS['TT']->push('Split content','');
00588                                                 $contentParts = $this->readFileContent($ext,$absFile,$cPKey);
00589 #debug($contentParts);
00590                                         $GLOBALS['TT']->pull();
00591                                         if (is_array($contentParts))    {
00592                                                         // Calculating a hash over what is to be the actual content. (see indexTypo3PageContent())
00593                                                 $content_md5h = $this->md5inthash(implode($contentParts,''));
00594 
00595                                                 if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h))    {
00596                                                                         // Splitting words
00597                                                         $GLOBALS['TT']->push('Extract words from content','');
00598                                                                 $splitInWords = $this->procesWordsInArrays($contentParts);
00599                                                         $GLOBALS['TT']->pull();
00600 
00601                                                                         // Analyse the indexed words.
00602                                                         $GLOBALS['TT']->push('Analyse the extracted words','');
00603                                                                 $indexArr = $this->indexAnalyze($splitInWords);
00604                                                         $GLOBALS['TT']->pull();
00605 
00606                                                                         // Submitting page (phash) record
00607                                                         $GLOBALS['TT']->push('Submitting page','');
00608                                                                 $size=filesize($absFile);
00609                                                                 $ctime=filemtime($absFile);     // Unfortunately I cannot determine WHEN a file is originally made - so I must return the modification time...
00610                                                                 $this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
00611                                                         $GLOBALS['TT']->pull();
00612 
00613                                                                         // Check words and submit to word list if not there
00614                                                         $GLOBALS['TT']->push('Check word list and submit words','');
00615                                                                 $this->checkWordList($indexArr);
00616                                                                 $this->submitWords($indexArr,$phash_arr['phash']);
00617                                                         $GLOBALS['TT']->pull();
00618 
00619                                                                 // Set parsetime
00620                                                         $this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
00621                                                 } else {
00622                                                         $this->updateTstamp($phash_arr['phash'],$mtime);        // Update the timestamp
00623                                                         $GLOBALS['TT']->setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
00624                                                 }
00625                                         } else {
00626                                                 $GLOBALS['TT']->setTSlogMessage('Could not index file! Unsupported extension.');
00627                                         }
00628                                 } else {
00629                                         $GLOBALS['TT']->setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
00630                                 }
00631                                         // Checking and setting sections:
00632         #                       $this->submitFile_grlist($phash_arr['phash']);  // Setting a gr_list record if there is none already (set for default fe_group)
00633                                 $this->submitFile_section($phash_arr['phash']);         // Setting a section-record for the file. This is done also if the file is not indexed. Notice that section records are deleted when the page is indexed.
00634                                 $GLOBALS['TT']->pull();
00635                         }
00636                 }
00637         }
00638 
00647         function readFileContent($ext,$absFile,$cPKey)  {
00648                 switch ($ext)   {
00649                         case 'pdf':
00650                                 if ($this->app['pdfinfo'])      {
00651 #debug($this->app);
00652                                                 // Getting pdf-info:
00653                                         $cmd = $this->app['pdfinfo'].' '.$absFile;
00654                                         exec($cmd,$res);
00655                                         $pdfInfo=$this->splitPdfInfo($res);
00656 
00657                                         if (intval($pdfInfo['pages']))  {
00658                                                 list($low,$high) = explode('-',$cPKey);
00659 
00660                                                         // Get pdf content:
00661                                                 $tempFileName = t3lib_div::tempnam('Typo3_indexer');            // Create temporary name
00662                                                 @unlink ($tempFileName);        // Delete if exists, just to be safe.
00663                                                 $cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -q '.$absFile.' '.$tempFileName;
00664         //                                      debug($cmd,1);
00665                                                 exec($cmd,$res);
00666                                                 if (@is_file($tempFileName))    {
00667                                                         $content = t3lib_div::getUrl($tempFileName);
00668                                                         unlink($tempFileName);
00669                                                 } else {
00670                                                         $GLOBALS['TT']->setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
00671                                                 }
00672                                                 $contentArr = $this->splitRegularContent($content);
00673                                         }
00674                                 }
00675                         break;
00676                         case 'doc':
00677                                 if ($this->app['catdoc'])       {
00678                                         $cmd = $this->app['catdoc'].' '.$absFile;
00679                                         exec($cmd,$res);
00680                                         $content = implode(chr(10),$res);
00681                                         $contentArr = $this->splitRegularContent($content);
00682                                 }
00683                         break;
00684                         case 'txt':
00685                                 $content = t3lib_div::getUrl($absFile);
00686                                 $contentArr = $this->splitRegularContent($content);
00687                         break;
00688                         case 'html':
00689                         case 'htm':
00690                                 $fileContent = t3lib_div::getUrl($absFile);
00691                                 $contentArr = $this->splitHTMLContent($fileContent);
00692                         break;
00693                         default:
00694                                 return false;
00695                         break;
00696                 }
00697                         // If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
00698                 if (!$contentArr['title'])      {
00699                         $contentArr['title']=str_replace('_',' ',basename($absFile));   // Substituting "_" for " " because many filenames may have this instead of a space char.
00700                 }
00701                 return $contentArr;
00702         }
00703 
00711         function fileContentParts($ext,$absFile)        {
00712                 $cParts=array(0);
00713                 switch ($ext)   {
00714                         case 'pdf':
00715                                         // Getting pdf-info:
00716                                 $cmd = $this->app['pdfinfo'].' '.$absFile;
00717                                 exec($cmd,$res);
00718                                 $pdfInfo=$this->splitPdfInfo($res);
00719                         //      debug($pdfInfo);
00720 
00721                                 if (intval($pdfInfo['pages']))  {
00722                                         $cParts=array();
00723                                                 // Calculate mode
00724                                                 // Calculate mode
00725                                         if ($this->pdf_mode>0)  {
00726                                                 $iter=ceil($pdfInfo['pages']/$this->pdf_mode);
00727                                         } else {
00728                                                 $iter=t3lib_div::intInRange(abs($this->pdf_mode),1,$pdfInfo['pages']);
00729                                         }
00730                                         for ($a=0;$a<$iter;$a++)        {
00731                                                 $low=floor($a*($pdfInfo['pages']/$iter))+1;
00732                                                 $high=floor(($a+1)*($pdfInfo['pages']/$iter));
00733                                                 $cParts[]=$low.'-'.$high;
00734                                         }
00735                                 }
00736                         break;
00737                 }
00738                 return $cParts;
00739         }
00740 
00741 
00754         function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
00755                 $endTag = '</'.$tagName.'>';
00756                 $startTag = '<'.$tagName;
00757                 $isTagInText = stristr($string,$startTag);              // stristr used because we want a case-insensitive search for the tag.
00758                 if(!$isTagInText) return false; // if the tag was not found, return false
00759 
00760                 list($paramList,$isTagInText) = explode('>',substr($isTagInText,strlen($startTag)),2);
00761                 $afterTagInText = stristr($isTagInText,$endTag);
00762                 if ($afterTagInText)    {
00763                         $tagContent = substr($isTagInText,0,-strlen($afterTagInText));
00764                         $stringAfter = substr($afterTagInText,strlen($endTag));
00765                 } else {        // If there was no ending tag, the tagContent is blank and anything after the tag it self is returned.
00766                         $tagContent='';
00767                         $stringAfter = $isTagInText;
00768                 }
00769 //              debug(array($tagContent,$stringAfter));
00770                 return true;
00771         }
00772 
00780         function indexAnalyze($content) {
00781                 $indexArr = Array();
00782                 $counter = 0;
00783 
00784                 $this->analyzeHeaderinfo($indexArr,$content,'title',7);
00785                 $this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
00786                 $this->analyzeHeaderinfo($indexArr,$content,'description',5);
00787                 $this->analyzeBody($indexArr,$content);
00788 
00789                 return ($indexArr);
00790         }
00791 
00801         function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
00802                 reset($content[$key]);
00803                 while(list(,$val)=each($content[$key]))  {
00804                         $val = substr($val,0,30);       // Max 30 - because the baseword varchar IS 30. This MUST be the same.
00805                         $retArr[$val]['cmp'] = $retArr[$val]['cmp']|pow(2,$offset);
00806                         $retArr[$val]['count'] = $retArr[$val]['count']+1;
00807                         $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
00808                         $retArr[$val]['metaphone'] = $this->metaphone($val);
00809                         $this->wordcount++;
00810                 }
00811         }
00812 
00820         function analyzeBody(&$retArr,$content) {
00821                 reset($content['body']);
00822                 while(list($key,$val)=each($content['body']))  {
00823                         $val = substr($val,0,30);       // Max 30 - because the baseword varchar IS 30. This MUST be the same.
00824                         if(!isset($retArr[$val])) {
00825                                 $retArr[$val]['first']=$key;
00826                                 $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
00827                                 $retArr[$val]['metaphone'] = $this->metaphone($val);
00828                         }
00829                         $retArr[$val]['count'] = $retArr[$val]['count']+1;
00830                         $this->wordcount++;
00831                 }
00832         }
00833 
00840         function typoSearchTags(&$body) {
00841                 $expBody = explode('<!--TYPO3SEARCH_',$body);
00842 #debug($expBody);
00843                 if(count($expBody)>1) {
00844                         $body = '';
00845                         reset($expBody);
00846                         while(list(,$val)=each($expBody)) {
00847                                 $part = explode('-->',$val,2);
00848                                 if(trim($part[0])=='begin') {
00849                                         $body .= $part[1];
00850                                         $prev = '';
00851                                 } elseif(trim($part[0])=='end') {
00852                                         $body .= $prev;
00853                                 } else {
00854                                         $prev = $val;
00855                                 }
00856 #debug($part);
00857                         }
00858 #debug(array($body));
00859                         return true;
00860                 } else {
00861                         return false;
00862                 }
00863         }
00864 
00865 
00866 
00867 
00868 
00869 
00870 
00871 
00872 
00873 
00874 
00875 
00876 
00877 
00878         /**********************************
00879          *
00880          * Words
00881          *
00882          **********************************/
00883 
00891         function split2words(&$string) {
00892                 $words = split('[[:space:],]+',$string);
00893                 $reg='['.quotemeta('().,_?!:-').']*';
00894                 $reg='[^[:alnum:]'.$this->convChars[0].$this->convChars[1].']*';
00895 
00896 #debug($words);
00897 #debug(array($string));
00898                 reset($words);
00899                 $matches=array();
00900                 while(list(,$w)=each($words))   {
00901                         $w=trim($w);
00902                         $w=ereg_replace('^'.$reg,'',$w);
00903                         $w=ereg_replace($reg.'$','',$w);
00904                         if ($this->wordOK($w))  {$matches[]=$w;}
00905                 }
00906 #               debug($matches);
00907                 $string =$matches;
00908 
00909 
00910                 /*
00911                 preg_match_all("/\b(\w[\w']*\w+|\w+)\b/", $string ,$matches);
00912                 $string = $matches[0];
00913                 */
00914         }
00915 
00924         function wordOK($w)     {
00925                 if ($w && strlen($w)>1 && strlen($w)<50)        {
00926                         if (rawurlencode($w)!=$w)       {
00927                                 $fChars = count(explode('%',rawurlencode($w)))-1;
00928                                 $rel = round($fChars/strlen($w)*100);
00929                                 return $rel<30 ? 1 : 0;         // Max 30% strange chars!
00930                         } else {
00931                                 return 1;
00932                         }
00933                 }
00934         }
00935 
00942         function metaphone($word) {
00943                 $tmp = metaphone($word);
00944                 if($tmp=='') $ret=0; else $ret=hexdec(substr(md5($tmp),0,7));
00945                 return $ret;
00946         }
00947 
00954         function strtolower_all($str)   {
00955                 return strtolower(strtr($str, $this->convChars[0], $this->convChars[1]));
00956         }
00957 
00958 
00959 
00960 
00961 
00962 
00963 
00964 
00965 
00966 
00967 
00968 
00969 
00970 
00971 
00972         /********************************
00973          *
00974          * SQL Helper functions
00975          *
00976          *******************************/
00977 
00985         function freqMap($freq) {
00986                 $mapFactor = $this->freqMax*100*$this->freqRange;
00987                 if($freq<1) {
00988                         $newFreq = $freq*$mapFactor;
00989                         $newFreq = $newFreq>$this->freqRange?$this->freqRange:$newFreq;
00990                 } else {
00991                         $newFreq = $freq/$mapFactor;
00992                 }
00993                 return $newFreq;
00994 
00995         }
00996 
01003         function getRootLineFields(&$fieldArr)  {
01004                 $rl = $this->rootLine;
01005 
01006                 $fieldArr['rl0'] = intval($rl[0]['uid']);
01007                 $fieldArr['rl1'] = intval($rl[1]['uid']);
01008                 $fieldArr['rl2'] = intval($rl[2]['uid']);
01009 
01010                 if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields']))    {
01011                         foreach($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $fieldName => $rootLineLevel)  {
01012                                 $fieldArr[$fieldName] = intval($rl[$rootLineLevel]['uid']);
01013                         }
01014                 }
01015         }
01016 
01017 
01018 
01019 
01020 
01021 
01022 
01023 
01024 
01025 
01026 
01027 
01028 
01029 
01030         /********************************
01031          *
01032          * SQL Helper functions
01033          *
01034          *******************************/
01035 
01043         function removeIndexedPhashRow($phashList,$clearPageCache=1)    {
01044                 $phashRows=t3lib_div::trimExplode(',',$phashList,1);
01045                 while(list(,$phash)=each($phashRows))   {
01046                         $phash = intval($phash);
01047                         if ($phash>0)   {
01048 
01049                                 if ($clearPageCache)    {
01050                                                 // Clearing page cache:
01051                                         $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('page_id', 'index_section', 'phash='.intval($phash));
01052                                         if ($GLOBALS['TYPO3_DB']->sql_num_rows($res))   {
01053                                                 $idList = array();
01054                                                 while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))       {
01055                                                         $idList[] = $row['page_id'];
01056                                                 }
01057                                                 $GLOBALS['TYPO3_DB']->exec_DELETEquery('cache_pages', 'page_id IN ('.implode(',',$GLOBALS['TYPO3_DB']->cleanIntArray($idList)).')');
01058                                         }
01059                                 }
01060 
01061                                         // Removing old registrations for all tables.
01062                                 $tableArr = explode(',','index_phash,index_rel,index_section,index_fulltext,index_grlist');
01063                                 foreach($tableArr as $table)    {
01064                                         $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, $table).'"');
01065                                 }
01066 
01067                                         // Did not remove any index_section records for external files where phash_t3 points to this hash!
01068 #debug('DELETE: '.$phash,1);
01069                         }
01070                 }
01071         }
01072 
01083         function checkMtimeTstamp($mtime,$maxAge,$minAge,$phash)        {
01084 
01085                         // Select indexed page:
01086                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
01087                 $out = 0;
01088 
01089                         // If there was an indexing of the page...:
01090                 if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
01091                         if ($maxAge && ($row['tstamp']+$maxAge)<time()) {               // If min age is exceeded, index the page
01092                                 $out = 1;
01093                         } else {
01094                                 if (!$minAge || ($row['tstamp']+$minAge)<time())        {       // if minAge is not set or if minAge is exceeded, consider at mtime
01095                                         if ($mtime)     {               // It mtime is set, then it's tested. If not, the page must clearly be indexed.
01096                                                 if ($row['item_mtime'] != $mtime)       {       // And if mtime is different from the index_phash mtime, it's about time to re-index.
01097                                                         $out = 2;
01098                                                 } else {
01099                                                         $out = -1;
01100                                                         $this->updateTstamp($phash);    // Update the timestatmp
01101                                                         $GLOBALS['TT']->setTSlogMessage('Mtime matched, timestamp updated.',1);
01102                                                 }
01103                                         } else {$out = 3;       }
01104                                 } else {$out = -2;}
01105                         }
01106                 } else {$out = 4;}      // No indexing found.
01107                 return $out;
01108         }
01109 
01117         function update_grlist($phash,$phash_x) {
01118                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->pObj->gr_list));
01119                 if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res))  {
01120                         $this->submit_grlist($phash,$phash_x);
01121                         $GLOBALS['TT']->setTSlogMessage("Inserted gr_list '".$this->pObj->gr_list."' for phash '".$phash."'",1);
01122                 }
01123         }
01124 
01129         function is_grlist_set($phash_x)        {
01130                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', 'phash_x='.intval($phash_x));
01131                 return $GLOBALS['TYPO3_DB']->sql_num_rows($res);
01132         }
01133 
01140         function checkContentHash()     {
01141                         // With this query the page will only be indexed if it's content is different from the same "phash_grouping" -page.
01142                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash AS A', 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h));
01143                 if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
01144                         return $row;
01145                 }
01146                 return 1;
01147         }
01148 
01154         function removeLoginpagesWithContentHash()      {
01155                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash AS A,index_grlist AS B', '
01156                                         A.phash=B.phash
01157                                         AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
01158                                         AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
01159                                         AND A.contentHash='.intval($this->content_md5h));
01160                 while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res))       {
01161                         $GLOBALS['TT']->setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$row['phash']."' are now removed.",1);
01162                         $this->removeOldIndexedPages($row['phash']);
01163                 }
01164         }
01165 
01172         function removeOldIndexedPages($phash)  {
01173                         // Removing old registrations for all tables. Because the pages are TYPO3 pages there can be nothing else than 1-1 relations here.
01174                 $tableArr = explode(',','index_phash,index_section,index_grlist,index_fulltext');
01175                 foreach($tableArr as $table)    {
01176                         $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, $table).'"');
01177                 }
01178                         // Removing all index_section records with hash_t3 set to this hash (this includes such records set for external media on the page as well!). The re-insert of these records are done in indexRegularDocument($file).
01179                 $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, 'index_section').'"');
01180         }
01181 
01190         function checkExternalDocContentHash($hashGr,$content_md5h)     {
01191                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash AS A', 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h));
01192                 if ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
01193                         return 0;
01194                 }
01195                 return 1;
01196         }
01197 
01205         function updateTstamp($phash,$mtime=0)  {
01206                 $updateFields = array(
01207                         'tstamp' => time()
01208                 );
01209                 if ($mtime)     { $updateFields['item_mtime'] = intval($mtime); }
01210 
01211                 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields);
01212         }
01213 
01221         function updateParsetime($phash,$parsetime)     {
01222                 $updateFields = array(
01223                         'parsetime' => intval($parsetime)
01224                 );
01225 
01226                 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $updateFields);
01227         }
01228 
01234         function updateRootline()       {
01235 
01236                 $updateFields = array();
01237                 $this->getRootLineFields($updateFields);
01238 
01239                 $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', 'page_id='.intval($this->pObj->id), $updateFields);
01240         }
01241 
01242 
01243 
01244 
01245 
01246 
01247 
01248 
01249 
01250 
01251 
01252 
01253         /********************************
01254          *
01255          * SQL; Inserting in database
01256          *
01257          *******************************/
01258 
01264         function submitPage()   {
01265                 $this->removeOldIndexedPages($this->hash['phash']);
01266 
01267                         // setting new
01268                 $fields = array(
01269                         'phash' => $this->hash['phash'],
01270                         'phash_grouping' => $this->hash['phash_grouping'],
01271                         'cHashParams' => serialize($this->cHashParams),
01272                         'contentHash' => $this->content_md5h,
01273                         'data_page_id' => $this->pObj->id,
01274                         'data_page_reg1' => $this->pObj->page_cache_reg1,
01275                         'data_page_type' => $this->pObj->type,
01276                         'data_page_mp' => $this->pObj->MP,
01277                         'gr_list' => $this->pObj->gr_list,
01278                         'item_type' => 0,       // TYPO3 page
01279                         'item_title' => $this->contentParts['title'],
01280                         'item_description' => $this->bodyDescription($this->contentParts),
01281                         'item_mtime' => $this->mtime,
01282                         'item_size' => strlen($this->pObj->content),
01283                         'tstamp' => time(),
01284                         'crdate' => time(),
01285                         'item_crdate' => $this->pObj->page['crdate'],   // Creation date of page
01286                         'sys_language_uid' => $this->pObj->sys_language_uid     // Sys language uid of the page. Should reflect which language it DOES actually display!
01287                 );
01288                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
01289 
01290                 // ************************
01291                 // PROCESSING index_section
01292                 // ************************
01293                 $this->submit_section($this->hash['phash'],$this->hash['phash']);
01294 
01295                 // ************************
01296                 // PROCESSING index_grlist
01297                 // ************************
01298                 $this->submit_grlist($this->hash['phash'],$this->hash['phash']);
01299 
01300                 // ************************
01301                 // PROCESSING index_fulltext
01302                 // ************************
01303                 $fields = array(
01304                         'phash' => $this->hash['phash'],
01305                         'fulltextdata' => implode($this->contentParts,' ')
01306                 );
01307                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
01308         }
01309 
01317         function submit_grlist($hash,$phash_x)  {
01318                         // Setting the gr_list record
01319                 $fields = array(
01320                         'phash' => $hash,
01321                         'phash_x' => $phash_x,
01322                         'hash_gr_list' => $this->md5inthash($this->pObj->gr_list),
01323                         'gr_list' => $this->pObj->gr_list
01324                 );
01325                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $fields);
01326         }
01327 
01335         function submit_section($hash,$hash_t3) {
01336                 $fields = array(
01337                         'phash' => $hash,
01338                         'phash_t3' => $hash_t3,
01339                         'page_id' => intval($this->pObj->id)
01340                 );
01341 
01342                 $this->getRootLineFields($fields);
01343 
01344                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $fields);
01345         }
01346 
01361         function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts)      {
01362                         // Removing old registrations for tables.
01363                 $tableArr = explode(',','index_phash,index_fulltext,index_grlist');
01364                 foreach($tableArr as $table)    {
01365                         $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($hash['phash'], $table).'"');
01366                 }
01367                         // setting new
01368                 $fields = array(
01369                         'phash' => $hash['phash'],
01370                         'phash_grouping' => $hash['phash_grouping'],
01371                         'cHashParams' => serialize($subinfo),
01372                         'contentHash' => $content_md5h,
01373                         'data_filename' => $file,
01374                         'item_type' => intval($this->Itypes[$ext]) ? intval($this->Itypes[$ext]) : -1,
01375                         'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
01376                         'item_description' => $this->bodyDescription($contentParts),
01377                         'item_mtime' => $mtime,
01378                         'item_size' => $size,
01379                         'item_crdate' => $ctime,
01380                         'tstamp' => time(),
01381                         'crdate' => time(),
01382                         'gr_list' => $this->pObj->gr_list
01383                 );
01384                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);
01385 
01386                 // ************************
01387                 // PROCESSING index_fulltext
01388                 // ************************
01389                 $fields = array(
01390                         'phash' => $hash['phash'],
01391                         'fulltextdata' => implode($contentParts,' ')
01392                 );
01393                 $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
01394         }
01395 
01402         function submitFile_grlist($hash)       {
01403                 // ************************
01404                 // PROCESSING index_grlist
01405                 // ************************
01406                         // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
01407                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->pObj->gr_list).')');
01408                 if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res))  {
01409                         $this->submit_grlist($hash,$hash);
01410                 }
01411         }
01412 
01419         function submitFile_section($hash)      {
01420                 // ************************
01421                 // PROCESSING index_grlist
01422                 // ************************
01423                         // Testing if there is a gr_list record for a non-logged in user and if so, there is no need to place another one.
01424                 $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', 'phash='.intval($hash).' AND page_id='.intval($this->pObj->id));
01425                 if (!$GLOBALS['TYPO3_DB']->sql_num_rows($res))  {
01426                         $this->submit_section($hash,$this->hash['phash']);
01427                 }
01428         }
01429 
01436         function checkWordList($wl) {
01437                 reset($wl);
01438                 $phashArr=array();
01439                 while(list($key,)=each($wl)) {
01440                         $phashArr[] = $wl[$key]['hash'];
01441                 }
01442                 if (count($phashArr))   {
01443                         $cwl = implode(',',$phashArr);
01444                         $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.$cwl.')');
01445 
01446                         if($GLOBALS['TYPO3_DB']->sql_num_rows($res)!=count($wl)) {
01447                                 $GLOBALS['TT']->setTSlogMessage('Inserting words: '.(count($wl)-$GLOBALS['TYPO3_DB']->sql_num_rows($res)),1);
01448                                 while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
01449                                         unset($wl[$row['baseword']]);
01450                                 }
01451 
01452                                 reset($wl);
01453                                 while(list($key,$val)=each($wl)) {
01454                                         $insertFields = array(
01455                                                 'wid' => $val['hash'],
01456                                                 'baseword' => $key,
01457                                                 'metaphone' => $val['metaphone']
01458                                         );
01459                                                 // A duplicate-key error will occur here if a word is NOT unset in the unset() line. However as long as the words in $wl are NOT longer as 30 chars (the baseword varchar is 30 characters...) this is not a problem.
01460                                         $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
01461                                 }
01462                         }
01463                 }
01464         }
01465 
01473         function submitWords($wl,$phash) {
01474                 $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, 'index_rel').'"');
01475 
01476                 foreach($wl as $val)    {
01477                         $insertFields = array(
01478                                 'phash' => $phash,
01479                                 'wid' => $val['hash'],
01480                                 'count' => $val['count'],
01481                                 'first' => $val['first'],
01482                                 'freq' => $this->freqMap(($val['count']/$this->wordcount)),
01483                                 'flags' => $val['cmp']
01484                         );
01485 
01486                         $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $insertFields);
01487                 }
01488         }
01489 
01490 
01491 
01492 
01493 
01494 
01495 
01496 
01497 
01498 
01499 
01500 
01501 
01502 
01503 
01504 
01505 
01506         /********************************
01507          *
01508          * Hashing
01509          *
01510          *******************************/
01511 
01517         function setT3Hashes()  {
01518                         //  Set main array:
01519                 $hArray = array(
01520                         'id' => $this->pObj->id,
01521                         'type' => $this->pObj->type,
01522                         'sys_lang' => $this->pObj->sys_language_uid,
01523                         'MP' => $this->pObj->MP,
01524                         'cHash' => $this->cHashParams
01525                 );
01526                         // Set grouping hash:
01527                 $this->hash['phash_grouping'] = $this->md5inthash(serialize($hArray));
01528                         // Add gr_list and set plain phash
01529                 $hArray['gr_list']=$this->pObj->gr_list;
01530                 $this->hash['phash'] = $this->md5inthash(serialize($hArray));
01531         }
01532 
01540         function setExtHashes($file,$subinfo=array())   {
01541                         //  Set main array:
01542                 $hash = array();
01543                 $hArray = array(
01544                         'file' => $file,
01545                 );
01546 
01547                         // Set grouping hash:
01548                 $hash['phash_grouping'] = $this->md5inthash(serialize($hArray));
01549 
01550                         // Add subinfo
01551                 $hArray['subinfo'] = $subinfo;
01552                 $hash['phash'] = $this->md5inthash(serialize($hArray));
01553 
01554                 return $hash;
01555         }
01556 
01563         function md5inthash($str)       {
01564                         // Using 7 instead of 8 just because that makes the integers lower than 32 bit (28 bit) and so they does not interfere with UNSIGNED integers or PHP-versions which has varying output from the hexdec function.
01565                         // NOTICE: This must be changed a number of other places as well!
01566                 return hexdec(substr(md5($str),0,7));
01567         }
01568 }
01569 
01570 
01571 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'])    {
01572         include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
01573 }
01574 ?>


Généré par Les spécialistes TYPO3 avec  doxygen 1.4.6