00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00108 require_once(PATH_t3lib.'class.t3lib_htmlmail.php');
00109
00110
00118 class tx_indexedsearch_indexer {
00119
00120
// Log messages keyed by the status code returned from checkMtimeTstamp().
// Positive codes mean "index the document", negative codes mean "skip it".
var $reasons = array(
-1 => 'mtime matched the document, so no changes detected and no content updated',
-2 => 'The minimum age was not exceeded',
1 => "The configured max-age was exceeded for the document and thus it's indexed.",
2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
4 => 'Page has never been indexed (is not represented in the index_phash table).'
);
// Character translation table: strtolower_all() maps the characters of [0] onto those of [1] with strtr();
// split2words() additionally treats both sets as word characters.
// NOTE(review): some pairs do not look like plain case pairs (e.g. the 5th-9th characters) - confirm this is intended.
var $convChars=array(
'ÁÉÚÍÄËÜÖÏÆØÅ',
'áéúíâêûôîæøå'
);

// Comma list of tag names; splitHTMLContent() removes these sections from the body.
var $excludeSections = 'script,style';

// File extension => flag (1 = may be indexed). 'pdf' and 'doc' are reset by initExternalReaders()
// and only re-enabled when the required external tools are found.
var $supportedExtensions = array(
'pdf' => 1,
'doc' => 1,
'txt' => 1,
'html' => 1,
'htm' => 1
);

// Controls how fileContentParts() splits a PDF: a positive value is the number of pages per part,
// zero or a negative value gives abs(value) parts (clamped to 1..page count).
// Overwritten by initExternalReaders() with the configured value (range -100..100).
var $pdf_mode = -20;

// Paths of the external reader tools. Emptied and rebuilt by initExternalReaders().
var $app = array(
'pdftotext' => '/usr/local/bin/pdftotext',
'pdfinfo' => '/usr/local/bin/pdfinfo',
'catdoc' => '/usr/local/bin/catdoc'
);

// gr_list value whose hash is compared against index_grlist.hash_gr_list
// (see removeLoginpagesWithContentHash() and submitFile_grlist()).
var $defaultGrList='0,-1';

// Age limits handed to checkMtimeTstamp(); they are added to a stored tstamp and compared with time(),
// i.e. they are in seconds. 0 disables the respective check.
var $tstamp_maxAge = 0;
var $tstamp_minAge = 0;

// Template for the content array produced by splitHTMLContent() / splitRegularContent().
var $defaultContentArray=array(
'title' => '',
'description' => '',
'keywords' => '',
'body' => '',
);
// Number of words counted by analyzeHeaderinfo() / analyzeBody(); divisor for the frequency in submitWords().
var $wordcount = 0;
// File extension => value stored in index_phash.item_type by submitFilePage().
var $Itypes = array(
'html' => 1,
'htm' => 1,
'pdf' => 2,
'doc' => 3,
'txt' => 4
);
// Configuration copied from the page object in init() ('index_externals', 'index_descrLgd').
var $conf = array();
// Hashes of the current page ('phash', 'phash_grouping'), set by setT3Hashes().
var $hash = array();
// Content array of the current page (result of splitHTMLContent()).
var $contentParts = array();
// Reference to the page object given to hook_indexContent().
var $pObj = '';
// Integer hash (md5inthash()) of the concatenated content parts of the current page.
var $content_md5h = '';

// cHash parameters of the current page (plus 'cHash' itself when parameters exist), set in init().
var $cHashParams = array();
// Modification time of the current page, taken from register['SYS_LASTCHANGED'] in init().
var $mtime = 0;
// Rootline of the current page, taken from config['rootLine'] in init().
var $rootLine = array();

// Parameters of freqMap(): upper bound of the mapped frequency value and the scaling factor component.
var $freqRange = 65000;
var $freqMax = 0.1;
00190
00191
00192
00193
/**
 * Hook entry point: indexes the content of the page object passed in.
 *
 * Does nothing unless $pObj->config['config']['index_enable'] is set. A page flagged
 * "no_cache" is never indexed; only a TS log message is written for it.
 *
 * @param	object		Page object, passed by reference. ->config and ->no_cache are read here; the object is then stored in $this->pObj for the other methods.
 * @return	void
 */
function hook_indexContent(&$pObj) {
    // Indexing must be switched on in the page configuration.
    if (!$pObj->config['config']['index_enable']) {
        return;
    }

    $GLOBALS['TT']->push('Index page','');

    if ($pObj->no_cache) {
        $GLOBALS['TT']->setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
    } else {
        // Keep a reference to the page object, prepare the indexer and index the page.
        $this->pObj = &$pObj;
        $this->init();
        $this->indexTypo3PageContent();
    }

    $GLOBALS['TT']->pull();
}
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
/**
 * Prepares the indexer for the page object in $this->pObj.
 *
 * Copies cHash parameters, modification time, rootline and the two indexing settings
 * from the page object, then calculates the page hashes and detects the external readers.
 *
 * @return	void
 */
function init() {
    // cHash parameters; the cHash value itself is only added when parameters exist.
    $this->cHashParams = $this->pObj->cHash_array;
    $hasCHashParams = is_array($this->cHashParams) && count($this->cHashParams);
    if ($hasCHashParams) {
        $this->cHashParams['cHash'] = $this->pObj->cHash;
    }

    // Modification time and rootline of the page.
    $this->mtime = $this->pObj->register['SYS_LASTCHANGED'];
    $this->rootLine = $this->pObj->config['rootLine'];

    // Indexer settings found in the page configuration.
    $pageConfig = $this->pObj->config['config'];
    $this->conf = array(
        'index_externals' => $pageConfig['index_externals'],
        'index_descrLgd' => $pageConfig['index_descrLgd'],
    );

    // Hashes identifying the page in the index tables.
    $this->setT3Hashes();

    // External tools for PDF / Word files.
    $this->initExternalReaders();
}
00265
/**
 * Detects the external tools needed for PDF and Word documents.
 *
 * Resets $this->app and disables 'pdf' and 'doc' in $this->supportedExtensions; each type is
 * enabled again only when its tool path is configured in the extension configuration
 * ($TYPO3_CONF_VARS['EXT']['extConf']['indexed_search']) and the binaries are found.
 * Also sets $this->pdf_mode from the configuration (limited to -100..100).
 *
 * @return	void
 */
function initExternalReaders() {
    // Start with no tools at all.
    $this->app = array();
    $this->supportedExtensions['pdf'] = 0;
    $this->supportedExtensions['doc'] = 0;

    $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

    // PDF tools (pdftotext + pdfinfo):
    if ($indexerConfig['pdftools']) {
        // Normalize to exactly one trailing slash (preg_replace() replaces the removed ereg_replace()).
        $pdfPath = preg_replace('/\/$/', '', $indexerConfig['pdftools']).'/';
        // With safe_mode the binaries cannot be checked, so a configured path is trusted.
        if ((ini_get('safe_mode') && $pdfPath) || (@is_file($pdfPath.'pdftotext') && @is_file($pdfPath.'pdfinfo'))) {
            $this->app['pdfinfo'] = $pdfPath.'pdfinfo';
            $this->app['pdftotext'] = $pdfPath.'pdftotext';
            $this->supportedExtensions['pdf'] = 1;
        } else {
            $GLOBALS['TT']->setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
        }
    } else {
        $GLOBALS['TT']->setTSlogMessage('PDF tools disabled',1);
    }

    // catdoc (Word files):
    if ($indexerConfig['catdoc']) {
        $catdocPath = preg_replace('/\/$/', '', $indexerConfig['catdoc']).'/';
        if (is_file($catdocPath.'catdoc')) {
            $this->app['catdoc'] = $catdocPath.'catdoc';
            $this->supportedExtensions['doc'] = 1;
        } else {
            $GLOBALS['TT']->setTSlogMessage("'catdoc' tool for reading Word-files was not found in paths '".$catdocPath."catdoc'",3);
        }
    } else {
        $GLOBALS['TT']->setTSlogMessage('catdoc tools (Word-files) disabled',1);
    }

    // How PDF files are divided into parts, see fileContentParts().
    $this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
}
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
/**
 * Indexes the content of the current page ($this->pObj->content) if that is necessary.
 *
 * Indexing is attempted when checkMtimeTstamp() returns a positive code or when no index_grlist
 * record exists yet for the page hash. If a document with the same content hash is already
 * indexed, only timestamp, gr_list and rootline are refreshed.
 *
 * @return	void
 */
function indexTypo3PageContent() {
    $check = $this->checkMtimeTstamp($this->mtime, $this->tstamp_maxAge, $this->tstamp_minAge, $this->hash['phash']);

    // The gr_list lookup used to be commented out, which left $is_grlist undefined: the condition was
    // always true and the "Indexing not needed" branch at the bottom could never be reached.
    // The lookup is only executed when $check alone does not already demand indexing.
    if ($check > 0 || !$this->is_grlist_set($this->hash['phash'])) {

        if ($check > 0) {
            $GLOBALS['TT']->setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
        } else {
            $GLOBALS['TT']->setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
        }

        // Divide the page into title, keywords, description and body.
        $GLOBALS['TT']->push('Split content','');
        $this->contentParts = $this->splitHTMLContent($this->pObj->content);
        if ($this->pObj->indexedDocTitle) {
            $this->contentParts['title'] = $this->pObj->indexedDocTitle;
        }
        $GLOBALS['TT']->pull();

        // Hash of the complete content (glue first: the reversed implode() order is gone in PHP 8).
        $this->content_md5h = $this->md5inthash(implode('',$this->contentParts));

        // checkContentHash() returns a row (array) when the same content is already indexed.
        $checkCHash = $this->checkContentHash();
        if (!is_array($checkCHash)) {
            $Pstart = t3lib_div::milliseconds();

            $GLOBALS['TT']->push('Extract words from content','');
            $splitInWords = $this->procesWordsInArrays($this->contentParts);
            $GLOBALS['TT']->pull();

            $GLOBALS['TT']->push('Analyse the extracted words','');
            $indexArr = $this->indexAnalyze($splitInWords);
            $GLOBALS['TT']->pull();

            $GLOBALS['TT']->push('Submitting page','');
            $this->submitPage();
            $GLOBALS['TT']->pull();

            $GLOBALS['TT']->push('Check word list and submit words','');
            $this->checkWordList($indexArr);
            $this->submitWords($indexArr,$this->hash['phash']);
            $GLOBALS['TT']->pull();

            // Store the time spent on parsing.
            $this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

            // Files linked from the page.
            $GLOBALS['TT']->push('Checking external files','');
            if ($this->conf['index_externals']) {
                $this->extractLinks($this->pObj->content);
            }
            $GLOBALS['TT']->pull();
        } else {
            // Same content already indexed: only refresh the administrative data.
            $this->updateTstamp($this->hash['phash'],$this->mtime);
            $this->update_grlist($checkCHash['phash'],$this->hash['phash']);
            $this->updateRootline();
            $GLOBALS['TT']->setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
        }
    } else {
        $GLOBALS['TT']->setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
    }
}
00393
/**
 * Splits an HTML document into title, keywords, description and plain-text body.
 *
 * Everything from the first "<body" on is the body, what precedes it is the head. The title is the
 * content of the TITLE tag (only the part after the first colon, if there is one); keywords and
 * description are collected from the meta tags. In the body the TYPO3SEARCH markers are evaluated,
 * the sections listed in $this->excludeSections are removed and all tags are stripped.
 *
 * @param	string		HTML content
 * @return	array		Array based on $this->defaultContentArray with keys title, description, keywords, body
 */
function splitHTMLContent($content) {
    $contentArr = $this->defaultContentArray;

    // Divide head from body. Without a body tag the whole content is treated as head; the previous
    // substr($content, 0, -0) returned an empty string in that case and lost title and meta tags.
    $bodyPart = stristr($content,'<body');
    if ($bodyPart === false) {
        $contentArr['body'] = '';
        $headPart = $content;
    } else {
        $contentArr['body'] = $bodyPart;
        $headPart = substr($content,0,-strlen($bodyPart));
    }

    // Title: keep what follows the first colon, otherwise the whole title.
    $this->embracingTags($headPart,'TITLE',$contentArr['title'],$dummy2,$dummy);
    $titleParts = explode(':',$contentArr['title'],2);
    $contentArr['title'] = trim(isset($titleParts[1]) ? $titleParts[1] : $titleParts[0]);

    // Collect the parameter strings of all meta tags; each call continues in the rest of the head.
    $meta = array();
    for ($i=0; $this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i]); $i++) { }

    // Keywords and description meta tags.
    for ($i=0; isset($meta[$i]); $i++) {
        $meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
        $metaName = isset($meta[$i]['name']) ? (string)$meta[$i]['name'] : '';
        $metaContent = isset($meta[$i]['content']) ? $meta[$i]['content'] : '';
        if (stristr($metaName,'keywords')) {
            $contentArr['keywords'] .= ','.$metaContent;
        }
        if (stristr($metaName,'description')) {
            $contentArr['description'] .= ','.$metaContent;
        }
    }

    // Limit the body to the sections marked for indexing, if markers are present.
    $this->typoSearchTags($contentArr['body']);

    // Get rid of unwanted sections (scripting and style stuff) in the body.
    foreach (explode(',',$this->excludeSections) as $tag) {
        while ($this->embracingTags($contentArr['body'],$tag,$dummy,$contentArr['body'],$dummy2));
    }

    // Remove tags, but first make sure words are not concatenated by doing it.
    $contentArr['body'] = str_replace('<',' <',$contentArr['body']);
    $contentArr['body'] = trim(strip_tags($contentArr['body']));

    $contentArr['keywords'] = trim($contentArr['keywords']);
    $contentArr['description'] = trim($contentArr['description']);

    return $contentArr;
}
00439
/**
 * Wraps plain (non-HTML) content in a content array.
 *
 * @param	string		Content of the document
 * @return	array		Copy of $this->defaultContentArray with the given content in 'body'
 */
function splitRegularContent($content) {
    $parts = $this->defaultContentArray;

    // Plain documents have neither title nor meta information; everything is body.
    $parts['body'] = $content;

    return $parts;
}
00452
/**
 * Converts every part of a content array into a list of words.
 *
 * Each part is entity-decoded (when html_entity_decode() exists), lowercased with
 * strtolower_all() and split with split2words(). Duplicates are removed from title,
 * keywords and description.
 *
 * @param	array		Content array (title, description, keywords, body) with string values
 * @return	array		Same keys, each value now an array of words
 */
function procesWordsInArrays($contentArr) {
    // Split all parts to words. foreach replaces each(), which no longer exists in PHP 8.
    foreach (array_keys($contentArr) as $key) {
        if (function_exists('html_entity_decode')) {
            $contentArr[$key] = html_entity_decode($contentArr[$key]);
        }
        $contentArr[$key] = $this->strtolower_all($contentArr[$key]);
        $this->split2words($contentArr[$key]);
    }

    // For title, keywords and description we don't want duplicates.
    $contentArr['title'] = array_unique($contentArr['title']);
    $contentArr['keywords'] = array_unique($contentArr['keywords']);
    $contentArr['description'] = array_unique($contentArr['description']);

    return $contentArr;
}
00475
/**
 * Builds the short description stored with an indexed document from its body text.
 *
 * The maximum length comes from $this->conf['index_descrLgd'] (0-255, default 200); with a
 * length of 0 an empty string is returned. Runs of whitespace and commas are collapsed into
 * single spaces before the text is cut.
 *
 * @param	array		Content array; only 'body' is read
 * @return	string		Description, at most the configured number of bytes
 */
function bodyDescription($contentArr) {
    // Always defined, also when the length is 0 or html_entity_decode() is missing.
    $bodyDescription = '';

    $maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
    if ($maxL) {
        $bodyDescription = isset($contentArr['body']) ? trim($contentArr['body']) : '';
        if (function_exists('html_entity_decode')) {
            $bodyDescription = html_entity_decode($bodyDescription);
        }

        // Work on twice the length so that collapsing whitespace still leaves enough text.
        // preg_split() replaces split(), which was removed in PHP 7.
        $pieces = preg_split('/[[:space:],]+/',substr($bodyDescription,0,$maxL*2));
        $bodyDescription = is_array($pieces) ? implode(' ',$pieces) : '';
        $bodyDescription = substr($bodyDescription,0,$maxL);
    }

    return $bodyDescription;
}
00492
/**
 * Finds the hyperlinks in HTML content and indexes the local files they point to.
 *
 * Links carrying a "jumpurl=" query parameter are resolved through getJumpurl(); all other
 * links are used as they are. Only references for which is_file() succeeds are handed to
 * indexRegularDocument().
 *
 * @param	string		HTML content
 * @return	void
 */
function extractLinks($content) {
    // The link extraction of t3lib_htmlmail is reused here.
    $extract = t3lib_div::makeInstance('t3lib_htmlmail');
    $extract->extractHtmlInit($content,'');
    $extract->extractHyperLinks();

    if (!is_array($extract->theParts['html']['hrefs'])) {
        return;
    }

    // foreach replaces each(), which no longer exists in PHP 8.
    foreach ($extract->theParts['html']['hrefs'] as $linkInfo) {
        $ref = t3lib_div::htmlspecialchars_decode($linkInfo['ref']);

        if (strstr($ref,'?') && strstr($ref,'jumpurl=')) {
            // The real target is in the jumpurl parameter.
            $qParts = parse_url($ref);
            $theJumpurlFile = $this->getJumpurl($qParts['query']);

            if ($theJumpurlFile && @is_file($theJumpurlFile)) {
                $this->indexRegularDocument($theJumpurlFile);
            }
        } elseif (@is_file($ref)) {
            $this->indexRegularDocument($ref);
        }
    }
}
00524
/**
 * Extracts the value of the "jumpurl" parameter from a query string.
 *
 * @param	string		Query string, e.g. "id=1&jumpurl=fileadmin%2Fdoc.pdf"
 * @return	mixed		URL-decoded value of "jumpurl", or NULL when the parameter is not present
 */
function getJumpurl($query) {
    // Parse into an array. The one-argument form of parse_str() created local variables from
    // the (link supplied) query string and was removed in PHP 8.
    $parameters = array();
    parse_str((string)$query, $parameters);

    return isset($parameters['jumpurl']) ? $parameters['jumpurl'] : null;
}
00537
/**
 * Converts "Key: value" lines (the output of pdfinfo) into an associative array.
 *
 * Each line is split at the first colon. Keys are trimmed and lowercased, values trimmed.
 * Lines without a colon or with an empty key are ignored.
 *
 * @param	array		Output lines
 * @return	array		Lowercased key => value; empty array if the input is not an array
 */
function splitPdfInfo($pdfInfoArray) {
    $res = array();

    if (is_array($pdfInfoArray)) {
        // foreach replaces each(), which no longer exists in PHP 8.
        foreach ($pdfInfoArray as $line) {
            $parts = explode(':',$line,2);
            if (count($parts)>1 && trim($parts[0])) {
                $res[strtolower(trim($parts[0]))] = trim($parts[1]);
            }
        }
    }

    return $res;
}
00557
/**
 * Indexes a regular file (one of the types enabled in $this->supportedExtensions).
 *
 * The file is processed in the parts reported by fileContentParts(); every part gets its own
 * hashes and is indexed when checkMtimeTstamp() demands it and no document with the same
 * content hash exists. For every part a relation to the current page is stored in
 * index_section (submitFile_section()).
 *
 * @param	string		File path relative to PATH_site
 * @return	void
 */
function indexRegularDocument($file) {
    $fI = pathinfo($file);
    $ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
    $absFile = PATH_site.$file;

    if (@is_file($absFile) && !empty($this->supportedExtensions[$ext])) {
        $mtime = filemtime($absFile);
        $cParts = $this->fileContentParts($ext,$absFile);

        // foreach replaces each(), which no longer exists in PHP 8.
        foreach ($cParts as $cPKey) {
            $GLOBALS['TT']->push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
            $Pstart = t3lib_div::milliseconds();
            $subinfo = array('key'=>$cPKey);
            $phash_arr = $this->setExtHashes($file,$subinfo);

            $check = $this->checkMtimeTstamp($mtime, $this->tstamp_maxAge, $this->tstamp_minAge, $phash_arr['phash']);
            if ($check > 0) {
                $GLOBALS['TT']->setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);

                $GLOBALS['TT']->push('Split content','');
                $contentParts = $this->readFileContent($ext,$absFile,$cPKey);
                $GLOBALS['TT']->pull();

                if (is_array($contentParts)) {
                    // Glue first: the reversed implode() argument order is gone in PHP 8.
                    $content_md5h = $this->md5inthash(implode('',$contentParts));

                    if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h)) {

                        $GLOBALS['TT']->push('Extract words from content','');
                        $splitInWords = $this->procesWordsInArrays($contentParts);
                        $GLOBALS['TT']->pull();

                        $GLOBALS['TT']->push('Analyse the extracted words','');
                        $indexArr = $this->indexAnalyze($splitInWords);
                        $GLOBALS['TT']->pull();

                        $GLOBALS['TT']->push('Submitting page','');
                        $size = filesize($absFile);
                        // NOTE(review): stored as item_crdate although it is the modification time - confirm.
                        $ctime = filemtime($absFile);
                        $this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
                        $GLOBALS['TT']->pull();

                        $GLOBALS['TT']->push('Check word list and submit words','');
                        $this->checkWordList($indexArr);
                        $this->submitWords($indexArr,$phash_arr['phash']);
                        $GLOBALS['TT']->pull();

                        // Store the time spent on parsing.
                        $this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
                    } else {
                        $this->updateTstamp($phash_arr['phash'],$mtime);
                        $GLOBALS['TT']->setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
                    }
                } else {
                    $GLOBALS['TT']->setTSlogMessage('Could not index file! Unsupported extension.');
                }
            } else {
                $GLOBALS['TT']->setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
            }

            // Relate the document part to the current page, whether it was (re)indexed or not.
            $this->submitFile_section($phash_arr['phash']);
            $GLOBALS['TT']->pull();
        }
    }
}
00638
/**
 * Reads the content of a file (or one part of it) and returns it as a content array.
 *
 * pdf: text of the page range given in $cPKey, extracted with pdftotext
 * doc: output of catdoc
 * txt: file content as it is
 * html/htm: file content split with splitHTMLContent()
 *
 * If no title was found, the file name (underscores replaced by spaces) is used as title.
 *
 * @param	string		Lowercase file extension
 * @param	string		Absolute path of the file
 * @param	string		Part identifier from fileContentParts(); for PDF files a page range "low-high"
 * @return	mixed		Content array, or FALSE if the extension is not handled
 */
function readFileContent($ext,$absFile,$cPKey) {
    // Stays empty when the external reader is missing or fails; only the title fallback applies then.
    $contentArr = array();

    switch ($ext) {
        case 'pdf':
            if (!empty($this->app['pdfinfo'])) {
                // The file name ends up on a shell command line, so it has to be escaped.
                $pdfInfoLines = array();
                exec($this->app['pdfinfo'].' '.escapeshellarg($absFile),$pdfInfoLines);
                $pdfInfo = $this->splitPdfInfo($pdfInfoLines);

                if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
                    // Page range of this part, forced to integers before it reaches the shell.
                    $range = explode('-',$cPKey,2);
                    $low = intval($range[0]);
                    $high = isset($range[1]) ? intval($range[1]) : $low;

                    // pdftotext writes into a temporary file which is read and removed afterwards.
                    $tempFileName = t3lib_div::tempnam('Typo3_indexer');
                    @unlink($tempFileName);
                    $cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -q '.escapeshellarg($absFile).' '.escapeshellarg($tempFileName);
                    exec($cmd);

                    $content = '';
                    if (@is_file($tempFileName)) {
                        $content = t3lib_div::getUrl($tempFileName);
                        unlink($tempFileName);
                    } else {
                        $GLOBALS['TT']->setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
                    }
                    $contentArr = $this->splitRegularContent($content);
                }
            }
        break;
        case 'doc':
            if (!empty($this->app['catdoc'])) {
                $outputLines = array();
                exec($this->app['catdoc'].' '.escapeshellarg($absFile),$outputLines);
                $contentArr = $this->splitRegularContent(implode(chr(10),$outputLines));
            }
        break;
        case 'txt':
            $contentArr = $this->splitRegularContent(t3lib_div::getUrl($absFile));
        break;
        case 'html':
        case 'htm':
            $contentArr = $this->splitHTMLContent(t3lib_div::getUrl($absFile));
        break;
        default:
            return false;
    }

    // Fall back to the file name as title.
    if (empty($contentArr['title'])) {
        $contentArr['title'] = str_replace('_',' ',basename($absFile));
    }

    return $contentArr;
}
00703
/**
 * Determines the parts in which a file is indexed.
 *
 * Only PDF files are divided: the page count reported by pdfinfo is cut into page ranges
 * according to $this->pdf_mode (positive: pages per part; zero or negative: abs(value) parts,
 * limited to 1..page count). All other files, and PDF files whose page count cannot be
 * determined, consist of the single part 0.
 *
 * @param	string		Lowercase file extension
 * @param	string		Absolute path of the file
 * @return	array		List of part identifiers: array(0) or page ranges like "1-20"
 */
function fileContentParts($ext,$absFile) {
    $cParts = array(0);

    switch ($ext) {
        case 'pdf':
            // Without a pdfinfo path the command line would consist of the file name only.
            if (empty($this->app['pdfinfo'])) {
                break;
            }

            // The file name ends up on a shell command line, so it has to be escaped.
            $pdfInfoLines = array();
            exec($this->app['pdfinfo'].' '.escapeshellarg($absFile),$pdfInfoLines);
            $pdfInfo = $this->splitPdfInfo($pdfInfoLines);
            $pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

            if ($pages) {
                $cParts = array();

                // Number of parts.
                if ($this->pdf_mode>0) {
                    $iter = ceil($pages/$this->pdf_mode);
                } else {
                    $iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pages);
                }

                // Page range of each part.
                for ($a=0; $a<$iter; $a++) {
                    $low = floor($a*($pages/$iter))+1;
                    $high = floor(($a+1)*($pages/$iter));
                    $cParts[] = $low.'-'.$high;
                }
            }
        break;
    }

    return $cParts;
}
00740
00741
/**
 * Finds the first occurrence of a tag (case-insensitive) and returns its pieces by reference.
 *
 * With a closing tag present, $tagContent receives what is between start and end tag and
 * $stringAfter receives the input with the whole tag section cut out (text before the tag plus
 * text after the closing tag). Previously only the text after the closing tag was returned, so
 * removing script/style sections from a body discarded everything in front of them.
 * Without a closing tag, $tagContent is empty and $stringAfter is what follows the start tag.
 *
 * @param	string		String to search in
 * @param	string		Tag name, e.g. "title"
 * @param	string		Receives the content between start and end tag
 * @param	string		Receives the remaining string, see above
 * @param	string		Receives the parameter string of the start tag (between tag name and ">")
 * @return	boolean		TRUE if the tag was found; FALSE otherwise (the reference arguments are untouched then)
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
    $endTag = '</'.$tagName.'>';
    $startTag = '<'.$tagName;

    // stristr() because the tag may be written in any case.
    $fromStartTag = stristr($string,$startTag);
    if (!$fromStartTag) {
        return false;
    }

    // Text in front of the tag; stristr() returned the tail of $string beginning at the tag.
    $stringBefore = substr($string,0,strlen($string)-strlen($fromStartTag));

    // Parameters of the start tag and everything after its closing ">".
    $parts = explode('>',substr($fromStartTag,strlen($startTag)),2);
    $paramList = $parts[0];
    $afterStartTag = isset($parts[1]) ? $parts[1] : '';

    $fromEndTag = stristr($afterStartTag,$endTag);
    if ($fromEndTag !== false) {
        $tagContent = substr($afterStartTag,0,strlen($afterStartTag)-strlen($fromEndTag));
        $stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));
    } else {
        // No end tag: the content is blank and what follows the start tag is returned.
        $tagContent = '';
        $stringAfter = $afterStartTag;
    }

    return true;
}
00772
/**
 * Builds the word index array for one document.
 *
 * Words from title, keywords and description are registered with the flag bits 7, 6 and 5,
 * then the body words are added. $this->wordcount holds the number of processed words
 * afterwards and is used by submitWords() to calculate frequencies.
 *
 * @param	array		Content array with word lists, as returned by procesWordsInArrays()
 * @return	array		Word => array with 'hash', 'metaphone', 'count' and, depending on origin, 'cmp' and/or 'first'
 */
function indexAnalyze($content) {
    $indexArr = array();

    // Count the words of this document only. Without the reset the counter kept growing
    // across the page and every linked file handled by the same object, which distorted
    // the frequencies calculated for all documents after the first one.
    $this->wordcount = 0;

    $this->analyzeHeaderinfo($indexArr,$content,'title',7);
    $this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
    $this->analyzeHeaderinfo($indexArr,$content,'description',5);
    $this->analyzeBody($indexArr,$content);

    return $indexArr;
}
00791
/**
 * Registers the words of one header part (title, keywords or description) in the index array.
 *
 * Words are cut to 30 bytes. For every word the flag bit 2^$offset is set in 'cmp', 'count' is
 * increased and 'hash' and 'metaphone' are stored. $this->wordcount is increased per word.
 *
 * @param	array		Index array, modified by reference
 * @param	array		Content array with word lists
 * @param	string		Key of the part to process
 * @param	integer		Bit number of the flag for this part
 * @return	void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
    // foreach replaces each(), which no longer exists in PHP 8.
    foreach ($content[$key] as $val) {
        $val = substr($val,0,30);

        // A word seen for the first time has neither flags nor a count yet.
        $flags = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
        $count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

        $retArr[$val]['cmp'] = $flags|pow(2,$offset);
        $retArr[$val]['count'] = $count+1;
        $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
        $retArr[$val]['metaphone'] = $this->metaphone($val);
        $this->wordcount++;
    }
}
00812
/**
 * Registers the body words in the index array.
 *
 * Words are cut to 30 bytes. A word not yet in the index array gets 'first' (its position in
 * the body word list), 'hash' and 'metaphone'; 'count' is increased for every occurrence.
 * $this->wordcount is increased per word.
 *
 * @param	array		Index array, modified by reference
 * @param	array		Content array with word lists; only 'body' is read
 * @return	void
 */
function analyzeBody(&$retArr,$content) {
    // foreach replaces each(), which no longer exists in PHP 8.
    foreach ($content['body'] as $key => $val) {
        $val = substr($val,0,30);

        if (!isset($retArr[$val])) {
            $retArr[$val]['first'] = $key;
            $retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
            $retArr[$val]['metaphone'] = $this->metaphone($val);
        }

        $count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
        $retArr[$val]['count'] = $count+1;
        $this->wordcount++;
    }
}
00833
/**
 * Reduces the body to the sections selected with TYPO3SEARCH markers.
 *
 * The body is cut at every "<!--TYPO3SEARCH_" marker. The text following a "begin" marker is
 * kept. At an "end" marker the most recent piece that started with neither "begin" nor "end" is
 * appended, unless a "begin" marker came after it - in practice this keeps the text in front of
 * an "end" marker that has no preceding "begin". Everything else is dropped.
 *
 * @param	string		Body text, modified by reference when at least one marker is present
 * @return	boolean		TRUE if markers were found (body rewritten), FALSE otherwise (body untouched)
 */
function typoSearchTags(&$body) {
    $expBody = explode('<!--TYPO3SEARCH_',(string)$body);
    if (count($expBody)<=1) {
        return false;
    }

    $body = '';
    // Defined up front: an "end" marker may be the very first thing that is evaluated.
    $prev = '';

    // foreach replaces each(), which no longer exists in PHP 8.
    foreach ($expBody as $val) {
        $part = explode('-->',$val,2);
        $marker = trim($part[0]);

        if ($marker==='begin') {
            $body .= isset($part[1]) ? $part[1] : '';
            $prev = '';
        } elseif ($marker==='end') {
            $body .= $prev;
        } else {
            $prev = $val;
        }
    }

    return true;
}
00864
00865
00866
00867
00868
00869
00870
00871
00872
00873
00874
00875
00876
00877
00878
00879
00880
00881
00882
00883
/**
 * Splits a string into words; the string variable is replaced by the list of words.
 *
 * The string is divided at whitespace and commas. From both ends of every piece all characters
 * are stripped that are neither alphanumeric nor listed in $this->convChars. Only pieces
 * accepted by wordOK() are kept.
 *
 * @param	string		String to split; receives the array of words (by reference)
 * @return	void
 */
function split2words(&$string) {
    // PCRE replaces split()/ereg_replace(), which were removed in PHP 7.
    $words = preg_split('/[[:space:],]+/',(string)$string);
    if (!is_array($words)) {
        $words = array();
    }

    // Characters to strip from the ends of a word. The pattern works on bytes (no /u modifier),
    // as the POSIX functions did. NOTE(review): confirm the encoding of $this->convChars matches the content.
    $strip = '[^[:alnum:]'.preg_quote($this->convChars[0].$this->convChars[1],'/').']*';

    $matches = array();
    foreach ($words as $w) {
        $w = trim($w);
        $w = preg_replace('/^'.$strip.'/','',$w);
        $w = preg_replace('/'.$strip.'$/','',$w);
        if ($this->wordOK($w)) {
            $matches[] = $w;
        }
    }

    $string = $matches;
}
00915
/**
 * Decides whether a word is acceptable for the index.
 *
 * A word must be non-empty (in the PHP sense, so "0" is rejected) and 2 to 49 bytes long.
 * If rawurlencode() changes the word, the number of "%" characters in the encoded form is
 * related to the byte length of the word; the word is accepted when that ratio, rounded to a
 * whole percentage, is below 30.
 *
 * @param	string		Word to check
 * @return	mixed		1 = accepted, 0 = rejected because of too many encoded characters, NULL = rejected because of its length
 */
function wordOK($w) {
    if (!$w) {
        return;
    }

    $byteLength = strlen($w);
    if ($byteLength<=1 || $byteLength>=50) {
        return;
    }

    $encoded = rawurlencode($w);
    if ($encoded==$w) {
        // Nothing had to be encoded.
        return 1;
    }

    // One "%" per encoded byte.
    $encodedBytes = substr_count($encoded,'%');
    $percentage = round($encodedBytes/$byteLength*100);
    if ($percentage<30) {
        return 1;
    }
    return 0;
}
00935
/**
 * Returns an integer hash of the metaphone key of a word.
 *
 * @param	string		Word
 * @return	integer		0 if PHP's metaphone() yields an empty key, otherwise the integer value of the first 7 hex digits of the key's md5 hash
 */
function metaphone($word) {
    $phoneticKey = metaphone($word);

    if ($phoneticKey=='') {
        return 0;
    }

    return hexdec(substr(md5($phoneticKey),0,7));
}
00947
/**
 * Lowercases a string, including the characters listed in $this->convChars.
 *
 * @param	string		Input string
 * @return	string		The string after translating $this->convChars[0] to $this->convChars[1] with strtr() and applying strtolower()
 */
function strtolower_all($str) {
    // First the characters from the conversion table ...
    $translated = strtr($str, $this->convChars[0], $this->convChars[1]);

    // ... then whatever strtolower() handles.
    return strtolower($translated);
}
00957
00958
00959
00960
00961
00962
00963
00964
00965
00966
00967
00968
00969
00970
00971
00972
00973
00974
00975
00976
00977
/**
 * Maps a word frequency to the value stored in the index.
 *
 * The factor is $this->freqMax * 100 * $this->freqRange. A frequency below 1 is multiplied
 * by the factor and limited to $this->freqRange; a frequency of 1 or more is divided by it.
 *
 * @param	double		Frequency (word count divided by total word count)
 * @return	mixed		Mapped frequency
 */
function freqMap($freq) {
    $mapFactor = $this->freqMax*100*$this->freqRange;

    if ($freq>=1) {
        return $freq/$mapFactor;
    }

    $scaled = $freq*$mapFactor;
    if ($scaled>$this->freqRange) {
        return $this->freqRange;
    }
    return $scaled;
}
00996
/**
 * Adds the rootline fields to an array of database fields.
 *
 * 'rl0', 'rl1' and 'rl2' receive the uid of rootline levels 0, 1 and 2. Additional fields are
 * taken from $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['addRootLineFields'], which maps
 * field names to rootline levels.
 *
 * @param	array		Field array, modified by reference
 * @return	void
 */
function getRootLineFields(&$fieldArr) {
    $rootLine = $this->rootLine;

    // The three fixed levels.
    for ($level=0; $level<3; $level++) {
        $fieldArr['rl'.$level] = intval($rootLine[$level]['uid']);
    }

    // Levels configured in addition.
    $extraFields = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'];
    if (is_array($extraFields)) {
        foreach ($extraFields as $fieldName => $rootLineLevel) {
            $fieldArr[$fieldName] = intval($rootLine[$rootLineLevel]['uid']);
        }
    }
}
01016
01017
01018
01019
01020
01021
01022
01023
01024
01025
01026
01027
01028
01029
01030
01031
01032
01033
01034
01035
/**
 * Removes the index records of one or more documents.
 *
 * For every positive phash in the list the rows in index_phash, index_rel, index_section,
 * index_fulltext and index_grlist are deleted. Optionally the page cache of all pages
 * related to the document (index_section.page_id) is deleted first.
 *
 * @param	string		Comma separated list of phash integers
 * @param	boolean		If set, the cache_pages entries of the related pages are deleted as well
 * @return	void
 */
function removeIndexedPhashRow($phashList,$clearPageCache=1) {
    $phashRows = t3lib_div::trimExplode(',',$phashList,1);

    // foreach replaces each(), which no longer exists in PHP 8.
    foreach ($phashRows as $phash) {
        $phash = intval($phash);
        if ($phash<=0) {
            continue;
        }

        if ($clearPageCache) {
            // Pages on which the document is found.
            $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('page_id', 'index_section', 'phash='.intval($phash));
            if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
                $idList = array();
                while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
                    $idList[] = $row['page_id'];
                }
                $GLOBALS['TYPO3_DB']->exec_DELETEquery('cache_pages', 'page_id IN ('.implode(',',$GLOBALS['TYPO3_DB']->cleanIntArray($idList)).')');
            }
        }

        // Remove the document from all index tables.
        $tableArr = explode(',','index_phash,index_rel,index_section,index_fulltext,index_grlist');
        foreach($tableArr as $table) {
            $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, $table).'"');
        }
    }
}
01072
/**
 * Decides from the stored timestamps whether a document has to be indexed.
 *
 * Return codes (they are also the keys of $this->reasons):
 *  4: no index_phash row exists for the phash
 *  1: $maxAge is set and the stored tstamp is older than that
 * -2: $minAge is set and the stored tstamp is not yet older than that
 *  3: no $mtime was given
 *  2: $mtime differs from the stored item_mtime
 * -1: $mtime equals the stored item_mtime; the tstamp of the row is refreshed
 *
 * @param	integer		Modification time of the document
 * @param	integer		Maximum age in seconds, 0 = not checked
 * @param	integer		Minimum age in seconds, 0 = not checked
 * @param	integer		phash of the document
 * @return	integer		Positive: index the document. Negative: do not index.
 */
function checkMtimeTstamp($mtime,$maxAge,$minAge,$phash) {
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
    $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res);

    // Never indexed before.
    if (!$row) {
        return 4;
    }

    // Index entry too old.
    if ($maxAge && ($row['tstamp']+$maxAge)<time()) {
        return 1;
    }

    // Index entry too young to be looked at again.
    if ($minAge && !(($row['tstamp']+$minAge)<time())) {
        return -2;
    }

    // Without a modification time there is nothing to compare.
    if (!$mtime) {
        return 3;
    }

    if ($row['item_mtime'] != $mtime) {
        return 2;
    }

    // Unchanged: just note that the document was checked now.
    $this->updateTstamp($phash);
    $GLOBALS['TT']->setTSlogMessage('Mtime matched, timestamp updated.',1);
    return -1;
}
01109
/**
 * Makes sure an index_grlist record exists for a document and the gr_list of the current page object.
 *
 * @param	integer		phash of the indexed document
 * @param	integer		phash stored in the phash_x field when a record is inserted
 * @return	void
 */
function update_grlist($phash,$phash_x) {
    $where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->pObj->gr_list);
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

    // Record is there already.
    if ($GLOBALS['TYPO3_DB']->sql_num_rows($res)) {
        return;
    }

    $this->submit_grlist($phash,$phash_x);
    $GLOBALS['TT']->setTSlogMessage("Inserted gr_list '".$this->pObj->gr_list."' for phash '".$phash."'",1);
}
01124
/**
 * Counts the index_grlist records that carry the given phash_x value.
 *
 * @param	integer		Value to look for in the phash_x field
 * @return	integer		Number of matching records (0 = none)
 */
function is_grlist_set($phash_x) {
    $where = 'phash_x='.intval($phash_x);
    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', $where);

    return $GLOBALS['TYPO3_DB']->sql_num_rows($result);
}
01133
/**
 * Looks for an indexed document of the same phash_grouping with the content hash of the current page.
 *
 * @return	mixed		The found row (array with key 'phash'), or integer 1 if there is none
 */
function checkContentHash() {
    $where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);
    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash AS A', $where);

    $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

    // Callers test the result with is_array().
    return $row ? $row : 1;
}
01148
/**
 * Removes indexed documents that have the content hash and phash_grouping of the current page
 * but were indexed with a gr_list other than $this->defaultGrList.
 *
 * @return	void
 */
function removeLoginpagesWithContentHash() {
    // The query text is kept exactly as it was, including its line breaks.
    $where = '
A.phash=B.phash
AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
AND A.contentHash='.intval($this->content_md5h);

    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash AS A,index_grlist AS B', $where);

    while ($duplicate = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result)) {
        $GLOBALS['TT']->setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$duplicate['phash']."' are now removed.",1);
        $this->removeOldIndexedPages($duplicate['phash']);
    }
}
01165
/**
 * Deletes the records of an indexed page.
 *
 * Removes the rows with the given phash from index_phash, index_section, index_grlist and
 * index_fulltext, and the index_section rows that refer to it in phash_t3.
 *
 * @param	integer		phash of the page
 * @return	void
 */
function removeOldIndexedPages($phash) {
    $tables = array('index_phash', 'index_section', 'index_grlist', 'index_fulltext');
    foreach ($tables as $tableName) {
        $where = 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, $tableName).'"';
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
    }

    // Section records of other documents that point to this page.
    $where = 'phash_t3="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, 'index_section').'"';
    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', $where);
}
01181
/**
 * Tells whether an external document still has to be indexed, judged by its content hash.
 *
 * @param	integer		phash_grouping of the document
 * @param	integer		Content hash of the document
 * @return	integer		0 if a document with this grouping and content hash is already indexed, otherwise 1
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
    $where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);
    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash AS A', $where);

    $alreadyIndexed = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

    return $alreadyIndexed ? 0 : 1;
}
01197
/**
 * Sets the tstamp of an index_phash record to the current time.
 *
 * @param	integer		phash of the record
 * @param	integer		If not 0, item_mtime is set to this value as well
 * @return	void
 */
function updateTstamp($phash,$mtime=0) {
    $fields = array();
    $fields['tstamp'] = time();

    if ($mtime) {
        $fields['item_mtime'] = intval($mtime);
    }

    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $fields);
}
01213
/**
 * Stores the parse time in an index_phash record.
 *
 * @param	integer		phash of the record
 * @param	integer		Parse time; callers pass a difference of t3lib_div::milliseconds() values
 * @return	void
 */
function updateParsetime($phash,$parsetime) {
    $where = 'phash='.intval($phash);
    $fields = array('parsetime' => intval($parsetime));

    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $fields);
}
01228
/**
 * Writes the current rootline fields into all index_section records of the current page.
 *
 * @return	void
 */
function updateRootline() {
    $rootLineFields = array();
    $this->getRootLineFields($rootLineFields);

    $where = 'page_id='.intval($this->pObj->id);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootLineFields);
}
01241
01242
01243
01244
01245
01246
01247
01248
01249
01250
01251
01252
01253
01254
01255
01256
01257
01258
/**
 * Stores the current page in the index.
 *
 * Removes existing records of the page hash, then inserts the index_phash record, the
 * index_section and index_grlist records and the full text (all content parts joined by
 * a space) into index_fulltext.
 *
 * @return	void
 */
function submitPage() {
    // Out with the old records of this page.
    $this->removeOldIndexedPages($this->hash['phash']);

    // The page record itself.
    $fields = array(
        'phash' => $this->hash['phash'],
        'phash_grouping' => $this->hash['phash_grouping'],
        'cHashParams' => serialize($this->cHashParams),
        'contentHash' => $this->content_md5h,
        'data_page_id' => $this->pObj->id,
        'data_page_reg1' => $this->pObj->page_cache_reg1,
        'data_page_type' => $this->pObj->type,
        'data_page_mp' => $this->pObj->MP,
        'gr_list' => $this->pObj->gr_list,
        'item_type' => 0,
        'item_title' => $this->contentParts['title'],
        'item_description' => $this->bodyDescription($this->contentParts),
        'item_mtime' => $this->mtime,
        'item_size' => strlen($this->pObj->content),
        'tstamp' => time(),
        'crdate' => time(),
        'item_crdate' => $this->pObj->page['crdate'],
        'sys_language_uid' => $this->pObj->sys_language_uid
    );
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

    // Section (rootline) record.
    $this->submit_section($this->hash['phash'],$this->hash['phash']);

    // gr_list record.
    $this->submit_grlist($this->hash['phash'],$this->hash['phash']);

    // Full text. Glue first: the reversed implode() argument order is gone in PHP 8.
    $fields = array(
        'phash' => $this->hash['phash'],
        'fulltextdata' => implode(' ',$this->contentParts)
    );
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
}
01309
/**
 * Inserts an index_grlist record for the gr_list of the current page object.
 *
 * @param	integer		Value for the phash field
 * @param	integer		Value for the phash_x field
 * @return	void
 */
function submit_grlist($hash,$phash_x) {
    $grList = $this->pObj->gr_list;

    $record = array();
    $record['phash'] = $hash;
    $record['phash_x'] = $phash_x;
    $record['hash_gr_list'] = $this->md5inthash($grList);
    $record['gr_list'] = $grList;

    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', $record);
}
01327
/**
 * Inserts an index_section record that relates a document to the current page and its rootline.
 *
 * @param	integer		phash of the document
 * @param	integer		Value for the phash_t3 field (phash of the page the document belongs to)
 * @return	void
 */
function submit_section($hash,$hash_t3) {
    $record = array();
    $record['phash'] = $hash;
    $record['phash_t3'] = $hash_t3;
    $record['page_id'] = intval($this->pObj->id);

    // rl0, rl1, rl2 and any additionally configured rootline fields.
    $this->getRootLineFields($record);

    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $record);
}
01346
/**
 * Stores an external file (or one part of it) in the index.
 *
 * Removes existing records of the phash from index_phash, index_fulltext and index_grlist,
 * then inserts the index_phash record and the full text (all content parts joined by a space).
 *
 * @param	array		Hashes of the document: 'phash' and 'phash_grouping'
 * @param	string		File name, stored in data_filename
 * @param	array		Sub information of the part, stored serialized in cHashParams
 * @param	string		Lowercase file extension, selects item_type via $this->Itypes (-1 if unknown)
 * @param	integer		Modification time of the file
 * @param	integer		Value stored as item_crdate
 * @param	integer		File size
 * @param	integer		Content hash
 * @param	array		Content array of the document
 * @return	void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {
    // Out with the old records of this document.
    $tableArr = explode(',','index_phash,index_fulltext,index_grlist');
    foreach($tableArr as $table) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($hash['phash'], $table).'"');
    }

    // The document record itself.
    $fields = array(
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => intval($this->Itypes[$ext]) ? intval($this->Itypes[$ext]) : -1,
        'item_title' => trim($contentParts['title']) ? $contentParts['title'] : basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => time(),
        'crdate' => time(),
        'gr_list' => $this->pObj->gr_list
    );
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

    // Full text. Glue first: the reversed implode() argument order is gone in PHP 8.
    $fields = array(
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ',$contentParts)
    );
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);
}
01395
/**
 * Inserts an index_grlist record for a file unless one exists already for either
 * $this->defaultGrList or the gr_list of the current page object.
 *
 * @param	integer		phash of the file
 * @return	void
 */
function submitFile_grlist($hash) {
    $where = 'phash='.intval($hash).' AND (hash_gr_list='.$this->md5inthash($this->defaultGrList).' OR hash_gr_list='.$this->md5inthash($this->pObj->gr_list).')';
    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

    if ($GLOBALS['TYPO3_DB']->sql_num_rows($result)) {
        return;
    }

    $this->submit_grlist($hash,$hash);
}
01412
/**
 * Relates a file to the current page: inserts an index_section record unless one
 * exists already for this file and page.
 *
 * @param	integer		phash of the file
 * @return	void
 */
function submitFile_section($hash) {
    $where = 'phash='.intval($hash).' AND page_id='.intval($this->pObj->id);
    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);

    if ($GLOBALS['TYPO3_DB']->sql_num_rows($result)) {
        return;
    }

    // phash_t3 is the phash of the current page.
    $this->submit_section($hash,$this->hash['phash']);
}
01429
/**
 * Inserts the words of a word list that are not yet in index_words.
 *
 * The words already stored are looked up by their hash (wid). If fewer rows are found than
 * the list has words, the found base words are removed from the list and the remaining
 * words are inserted.
 *
 * @param	array		Word list: word => array with at least 'hash' and 'metaphone'
 * @return	void
 */
function checkWordList($wl) {
    // Collect the word hashes. foreach replaces each(), which no longer exists in PHP 8.
    $phashArr = array();
    foreach ($wl as $wordInfo) {
        $phashArr[] = $wordInfo['hash'];
    }

    if (!count($phashArr)) {
        return;
    }

    $cwl = implode(',',$phashArr);
    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.$cwl.')');
    $knownWords = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

    if ($knownWords!=count($wl)) {
        $GLOBALS['TT']->setTSlogMessage('Inserting words: '.(count($wl)-$knownWords),1);

        // Drop what is stored already ...
        while($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
            unset($wl[$row['baseword']]);
        }

        // ... and insert the rest.
        foreach ($wl as $key => $val) {
            $insertFields = array(
                'wid' => $val['hash'],
                'baseword' => $key,
                'metaphone' => $val['metaphone']
            );

            $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
        }
    }
}
01465
/**
 * Stores the word relations of a document in index_rel, replacing the existing ones.
 *
 * The frequency of a word is its count divided by $this->wordcount, mapped with freqMap().
 *
 * @param	array		Word list: word => array with 'hash', 'count', 'first' and 'cmp'
 * @param	integer		phash of the document
 * @return	void
 */
function submitWords($wl,$phash) {
    // Remove the relations stored so far.
    $where = 'phash="'.$GLOBALS['TYPO3_DB']->quoteStr($phash, 'index_rel').'"';
    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', $where);

    foreach ($wl as $wordInfo) {
        $relation = array();
        $relation['phash'] = $phash;
        $relation['wid'] = $wordInfo['hash'];
        $relation['count'] = $wordInfo['count'];
        $relation['first'] = $wordInfo['first'];
        $relation['freq'] = $this->freqMap(($wordInfo['count']/$this->wordcount));
        $relation['flags'] = $wordInfo['cmp'];

        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
    }
}
01489
01490
01491
01492
01493
01494
01495
01496
01497
01498
01499
01500
01501
01502
01503
01504
01505
01506
01507
01508
01509
01510
01511
/**
 * Calculates the hashes of the current page and stores them in $this->hash.
 *
 * 'phash_grouping' is built from id, type, language, MP and cHash parameters;
 * 'phash' additionally includes the gr_list.
 *
 * @return	void
 */
function setT3Hashes() {
    // What identifies the page regardless of the user groups.
    $identity = array(
        'id' => $this->pObj->id,
        'type' => $this->pObj->type,
        'sys_lang' => $this->pObj->sys_language_uid,
        'MP' => $this->pObj->MP,
        'cHash' => $this->cHashParams
    );
    $groupingHash = $this->md5inthash(serialize($identity));

    // The same plus the user groups.
    $identity['gr_list'] = $this->pObj->gr_list;
    $pageHash = $this->md5inthash(serialize($identity));

    $this->hash['phash_grouping'] = $groupingHash;
    $this->hash['phash'] = $pageHash;
}
01532
/**
 * Calculates the hashes of an external file.
 *
 * 'phash_grouping' is built from the file name alone; 'phash' additionally
 * includes the sub information (e.g. the part of the file).
 *
 * @param	string		File name
 * @param	array		Sub information identifying a part of the file
 * @return	array		Array with the keys 'phash_grouping' and 'phash'
 */
function setExtHashes($file,$subinfo=array()) {
    $identity = array(
        'file' => $file,
    );
    $groupingHash = $this->md5inthash(serialize($identity));

    $identity['subinfo'] = $subinfo;
    $documentHash = $this->md5inthash(serialize($identity));

    return array(
        'phash_grouping' => $groupingHash,
        'phash' => $documentHash
    );
}
01556
/**
 * Turns a string into an integer hash: the first 7 hex digits of its md5 hash as a number.
 *
 * @param	string		String to hash
 * @return	integer		Hash value (7 hex digits, so below 2^28)
 */
function md5inthash($str) {
    $digest = md5($str);
    $leadingDigits = substr($digest,0,7);

    return hexdec($leadingDigits);
}
01568 }
01569
01570
// Extension point: if an XCLASS file is registered for this class under the current TYPO3_MODE
// in $TYPO3_CONF_VARS, include it. Skipped when TYPO3_MODE is not defined.
if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']);
}
01574 ?>