00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00131 require_once(PATH_t3lib.'class.t3lib_parsehtml.php');
00132
00133
00141 class tx_indexedsearch_indexer {
00142
00143
/**
 * Explanations for the decision codes the indexing methods keep in $check.
 * Codes above zero trigger indexing ("$check > 0"); the other codes are only
 * used for the "Indexing not needed" log message.
 */
var $reasons = array(
	-1 => 'mtime matched the document, so no changes detected and no content updated',
	-2 => 'The minimum age was not exceeded',
	1 => "The configured max-age was exceeded for the document and thus it's indexed.",
	2 => 'The minimum age was exceed and mtime was set and the mtime was different, so the page was indexed.',
	3 => 'The minimum age was exceed, but mtime was not set, so the page was indexed.',
	4 => 'Page has never been indexed (is not represented in the index_phash table).'
);

// Comma list of HTML elements that splitHTMLContent() strips from the body, including their content.
var $excludeSections = 'script,style';

// Parser objects for external files, keyed by file extension; filled by initializeExternalParsers().
var $external_parsers = array();

// Not referenced in the visible part of this class; same value as the gr_list set by backend_initIndexer().
var $defaultGrList = '0,-1';

// Set in init() from the extension settings "maxAge" / "minAge", each multiplied by 3600.
var $tstamp_maxAge = 0;
var $tstamp_minAge = 0;
// Set in init() from the extension setting "maxExternalFiles"; upper limit for $externalFileCounter.
var $maxExternalFiles = 0;

// If TRUE, indexTypo3PageContent() indexes regardless of the mtime/timestamp check.
var $forceIndexing = FALSE;
// Set to TRUE by hook_indexContent() when a running crawler requested re-indexing.
var $crawlerActive = FALSE;

// Template for the content parts array (title, description, keywords, body).
var $defaultContentArray=array(
	'title' => '',
	'description' => '',
	'keywords' => '',
	'body' => '',
);
// Incremented once per word by analyzeHeaderinfo() and analyzeBody().
var $wordcount = 0;
// Number of external files indexed so far; incremented in indexRegularDocument().
var $externalFileCounter = 0;

// Parameters of the current indexing run (id, type, content, mtime, ...).
var $conf = array();
// Unserialized extension configuration of "indexed_search"; loaded in init().
var $indexerConfig = array();
// Hashes of the current page; read as $this->hash['phash'] and $this->hash['phash_grouping'].
var $hash = array();
// Hashes of the external file part currently being indexed; set in indexRegularDocument().
var $file_phash_arr = array();
// Result of splitHTMLContent() for the current page.
var $contentParts = array();
// Hash over the concatenated content parts of the current page.
var $content_md5h = '';
// Log entries; reset per file part in indexRegularDocument() and stored with the debug info.
var $internal_log = array();
// Raw content last fetched by indexExternalUrl().
var $indexExternalUrl_content = '';

// cHash parameters of the current page; prepared in init().
var $cHashParams = array();

// NOTE(review): not referenced in the visible part of this class - presumably used for word frequency storage; confirm.
var $freqRange = 32000;
var $freqMax = 0.1;

// Set in init() from the extension setting "flagBitMask" (limited to 0-255).
var $flagBitMask;

// HTML parser instance (t3lib_parseHtml); created on first use in extractHyperLinks().
var $htmlParser;

// Helper objects created in init(): charset conversion (t3lib_cs), optional metaphone object, lexer.
var $csObj;
var $metaphoneObj;
var $lexerObj;
00198
00199
00200
/**
 * Front-end hook: decides whether the page rendered by $pObj is to be indexed and,
 * if so, copies the required values from $pObj into $this->conf and starts indexing.
 * The reason for skipping a page is written to the log.
 *
 * @param	object		Object that rendered the page (NOTE(review): presumably the TSFE object - confirm); passed by reference
 * @return	void
 */
function hook_indexContent(&$pObj) {
	$extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

	// A running crawler carrying the "tx_indexedsearch_reindex" instruction forces re-indexing
	if (t3lib_extMgm::isLoaded('crawler')
			&& $pObj->applicationData['tx_crawler']['running']
			&& in_array('tx_indexedsearch_reindex', $pObj->applicationData['tx_crawler']['parameters']['procInstructions'])) {
		$pObj->applicationData['tx_crawler']['log'][] = 'Forced Re-indexing enabled';

		$this->crawlerActive = TRUE;
		$this->forceIndexing = TRUE;
	}

	// Nothing happens at all (not even logging) unless indexing is enabled for the page
	if (!$pObj->config['config']['index_enable']) {
		return;
	}

	$this->log_push('Index page','');

	if ($extConf['disableFrontendIndexing'] && !$this->crawlerActive) {
		$this->log_setTSlogMessage('Index page? No, Ordinary Frontend indexing during rendering is disabled.');
	} elseif ($pObj->page['no_search']) {
		$this->log_setTSlogMessage('Index page? No, The "No Search" flag has been set in the page properties!');
	} elseif ($pObj->no_cache) {
		$this->log_setTSlogMessage('Index page? No, page was set to "no_cache" and so cannot be indexed.');
	} elseif (strcmp($pObj->sys_language_uid,$pObj->sys_language_content)) {
		$this->log_setTSlogMessage('Index page? No, ->sys_language_uid was different from sys_language_content which indicates that the page contains fall-back content and that would be falsely indexed as localized content.');
	} else {
		// Uids of the root line, keyed like the root line itself
		$rootlineUids = array();
		foreach ($pObj->config['rootLine'] as $level => $pageRecord) {
			$rootlineUids[$level] = $pageRecord['uid'];
		}

		$this->conf = array(
				// Page identification
			'id' => $pObj->id,
			'type' => $pObj->type,
			'sys_language_uid' => $pObj->sys_language_uid,
			'MP' => $pObj->MP,
			'gr_list' => $pObj->gr_list,

				// cHash and its parameters
			'cHash' => $pObj->cHash,
			'cHash_array' => $pObj->cHash_array,

			'crdate' => $pObj->page['crdate'],
			'page_cache_reg1' => $pObj->page_cache_reg1,

			'rootline_uids' => $rootlineUids,

				// Content and its meta data
			'content' => $pObj->content,
			'indexedDocTitle' => $pObj->convOutputCharset($pObj->indexedDocTitle),
			'metaCharset' => $pObj->metaCharset,
			'mtime' => $pObj->register['SYS_LASTCHANGED'],

				// Settings from the page configuration
			'index_externals' => $pObj->config['config']['index_externals'],
			'index_descrLgd' => $pObj->config['config']['index_descrLgd'],

				// Not used for regular front-end pages
			'recordUid' => 0,
			'freeIndexUid' => 0,
			'freeIndexSetId' => 0,
		);

		$this->init();
		$this->indexTypo3PageContent();
	}

	$this->log_pull();
}
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291
00292
00293
00294
00295
/**
 * Sets up $this->conf for indexing outside of front-end rendering and calls init().
 * The group list is fixed to "0,-1", external files are indexed and the description length is 200.
 *
 * @param	mixed		Page id, stored as conf "id"
 * @param	mixed		Page type, stored as conf "type"
 * @param	mixed		Language uid, stored as conf "sys_language_uid"
 * @param	mixed		MP value, stored as conf "MP"
 * @param	array		Root line uids, stored as conf "rootline_uids"
 * @param	array		cHash parameters, stored as conf "cHash_array"
 * @param	boolean		If set, the cHash is calculated from $cHash_array with makeCHash(); otherwise it is left empty
 * @return	void
 */
function backend_initIndexer($id, $type, $sys_language_uid, $MP, $uidRL, $cHash_array=array(), $createCHash=FALSE) {

		// Page identification
	$this->conf = array(
		'id' => $id,
		'type' => $type,
		'sys_language_uid' => $sys_language_uid,
		'MP' => $MP,
		'gr_list' => '0,-1',
	);

		// cHash: calculated on request only
	if ($createCHash) {
		$this->conf['cHash'] = $this->makeCHash($cHash_array);
	} else {
		$this->conf['cHash'] = '';
	}
	$this->conf['cHash_array'] = $cHash_array;

		// Defaults which backend_setFreeIndexUid() may overwrite later
	$this->conf['freeIndexUid'] = 0;
	$this->conf['freeIndexSetId'] = 0;
	$this->conf['page_cache_reg1'] = '';

	$this->conf['rootline_uids'] = $uidRL;

		// Fixed settings for this mode
	$this->conf['index_externals'] = 1;
	$this->conf['index_descrLgd'] = 200;

	$this->init();
}
00339
/**
 * Stores the "free index" uid and set id in $this->conf.
 * Both values are written to the index_phash record by the submit methods.
 *
 * @param	integer		Value for conf "freeIndexUid"
 * @param	integer		Value for conf "freeIndexSetId" (default 0)
 * @return	void
 */
function backend_setFreeIndexUid($freeIndexUid, $freeIndexSetId=0) {
	foreach (array('freeIndexUid' => $freeIndexUid, 'freeIndexSetId' => $freeIndexSetId) as $confKey => $confValue) {
		$this->conf[$confKey] = $confValue;
	}
}
00351
/**
 * Indexes arbitrary content as if it was a TYPO3 page: the values are wrapped in a
 * minimal HTML document (all of them passed through htmlspecialchars()) which is then
 * handed to indexTypo3PageContent(). $this->conf must have been prepared before
 * (see backend_initIndexer()).
 *
 * @param	string		Title, becomes the <title> of the document
 * @param	string		Keywords, become the "keywords" meta tag
 * @param	string		Description, becomes the "description" meta tag
 * @param	string		Content, becomes the body
 * @param	string		Charset of the values, stored as conf "metaCharset"
 * @param	integer		Modification time, stored as conf "mtime"
 * @param	integer		Creation time, stored as conf "crdate"
 * @param	integer		Record uid, stored as conf "recordUid"
 * @return	void
 */
function backend_indexAsTYPO3Page($title, $keywords, $description, $content, $charset, $mtime, $crdate=0, $recordUid=0) {

		// The line breaks inside this literal are part of the stored content, so it stays flush left
	$htmlDocument = '
<html>
<head>
<title>'.htmlspecialchars($title).'</title>
<meta name="keywords" content="'.htmlspecialchars($keywords).'" />
<meta name="description" content="'.htmlspecialchars($description).'" />
</head>
<body>
'.htmlspecialchars($content).'
</body>
</html>';

	$this->conf['mtime'] = $mtime;
	$this->conf['crdate'] = $crdate;
	$this->conf['recordUid'] = $recordUid;
	$this->conf['content'] = $htmlDocument;
	$this->conf['metaCharset'] = $charset;
		// Empty, so the title is taken from the <title> tag above
	$this->conf['indexedDocTitle'] = '';

	$this->indexTypo3PageContent();
}
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
/**
 * Initializes the indexer from $this->conf: prepares the cHash parameters, calculates
 * the page hashes, loads the extension configuration and creates the helper objects
 * (external parsers, lexer, optional metaphone object, charset converter).
 *
 * @return	void
 */
function init() {
	$extSetup = &$GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search'];

		// cHash parameters: add the cHash itself, drop the encryption key
	$this->cHashParams = $this->conf['cHash_array'];
	if (is_array($this->cHashParams) && count($this->cHashParams)) {
		if ($this->conf['cHash']) {
			$this->cHashParams['cHash'] = $this->conf['cHash'];
		}
		unset($this->cHashParams['encryptionKey']);
	}

		// Page hashes
	$this->setT3Hashes();

		// Extension configuration; the age settings are multiplied by 3600
	$this->indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
	$this->tstamp_minAge = t3lib_div::intInRange($this->indexerConfig['minAge']*3600,0);
	$this->tstamp_maxAge = t3lib_div::intInRange($this->indexerConfig['maxAge']*3600,0);
	$this->maxExternalFiles = t3lib_div::intInRange($this->indexerConfig['maxExternalFiles'],0,1000,5);
	$this->flagBitMask = t3lib_div::intInRange($this->indexerConfig['flagBitMask'],0,255);

		// External parsers are only needed when external files are to be indexed
	if ($this->conf['index_externals']) {
		$this->initializeExternalParsers();
	}

		// Lexer: a configured one, else the default shipped with the extension
	if ($extSetup['lexer']) {
		$lexerObjRef = $extSetup['lexer'];
	} else {
		$lexerObjRef = 'EXT:indexed_search/class.lexer.php:&tx_indexedsearch_lexer';
	}
	$this->lexerObj = &t3lib_div::getUserObj($lexerObjRef);
	$this->lexerObj->debug = $this->indexerConfig['debugMode'];

		// Metaphone object is optional
	if ($extSetup['metaphone']) {
		$this->metaphoneObj = &t3lib_div::getUserObj($extSetup['metaphone']);
		$this->metaphoneObj->pObj = &$this;
	}

		// Charset conversion
	$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
}
00460
/**
 * Creates the parser objects configured in
 * $TYPO3_CONF_VARS['EXTCONF']['indexed_search']['external_parsers'] (extension => object reference)
 * and keeps those whose initParser() call succeeds in $this->external_parsers.
 *
 * @return	void
 */
function initializeExternalParsers() {
	$parserRefs = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['external_parsers'];
	if (!is_array($parserRefs)) {
		return;
	}

	foreach ($parserRefs as $extension => $objRef) {
		$this->external_parsers[$extension] = &t3lib_div::getUserObj($objRef);
		$this->external_parsers[$extension]->pObj = &$this;

			// A parser that reports failure for this extension is discarded again
		$initialized = $this->external_parsers[$extension]->initParser($extension);
		if (!$initialized) {
			unset($this->external_parsers[$extension]);
		}
	}
}
00483
00484
00485
00486
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
/**
 * Indexes the page content found in $this->conf['content'].
 *
 * Indexing is started when the mtime/timestamp check returns a positive code, when no
 * gr_list entry exists for the page hash, or when indexing is forced. If the content hash
 * is already known (and the check code is not 1), only timestamp, set id, gr_list and
 * root line are updated.
 *
 * @return	void
 */
function indexTypo3PageContent() {

	$check = $this->checkMtimeTstamp($this->conf['mtime'], $this->hash['phash']);
	$is_grlist = $this->is_grlist_set($this->hash['phash']);

	if (!($check > 0 || !$is_grlist || $this->forceIndexing)) {
		$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		return;
	}

		// Log why indexing is done
	if ($this->forceIndexing) {
		$this->log_setTSlogMessage('Indexing needed, reason: Forced',1);
	} elseif ($check > 0) {
		$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
	} else {
		$this->log_setTSlogMessage('Indexing needed, reason: Updates gr_list!',1);
	}

		// Split the HTML into title, keywords, description and body; an explicitly set title wins
	$this->log_push('Split content','');
	$this->contentParts = $this->splitHTMLContent($this->conf['content']);
	if ($this->conf['indexedDocTitle']) {
		$this->contentParts['title'] = $this->conf['indexedDocTitle'];
	}
	$this->log_pull();

		// Content hash. Separator first: implode($array, $glue) was removed in PHP 8
	$this->content_md5h = $this->md5inthash(implode('', $this->contentParts));

		// checkContentHash() returns an array when the same content is already indexed.
		// Check code 1 (max-age exceeded) re-indexes regardless.
	$checkCHash = $this->checkContentHash();
	if (!is_array($checkCHash) || $check===1) {
		$Pstart=t3lib_div::milliseconds();

		$this->log_push('Converting charset of content ('.$this->conf['metaCharset'].') to utf-8','');
		$this->charsetEntity2utf8($this->contentParts,$this->conf['metaCharset']);
		$this->log_pull();

		$this->log_push('Extract words from content','');
		$splitInWords = $this->processWordsInArrays($this->contentParts);
		$this->log_pull();

		$this->log_push('Analyse the extracted words','');
		$indexArr = $this->indexAnalyze($splitInWords);
		$this->log_pull();

		$this->log_push('Submitting page','');
		$this->submitPage();
		$this->log_pull();

		$this->log_push('Check word list and submit words','');
		$this->checkWordList($indexArr);
		$this->submitWords($indexArr,$this->hash['phash']);
		$this->log_pull();

			// Parse time covers everything from charset conversion to word submission
		$this->updateParsetime($this->hash['phash'],t3lib_div::milliseconds()-$Pstart);

			// Linked files / URLs are handled after the page itself
		$this->log_push('Checking external files','');
		if ($this->conf['index_externals']) {
			$this->extractLinks($this->conf['content']);
		}
		$this->log_pull();
	} else {
			// Same content already indexed: refresh the administrative data only
		$this->updateTstamp($this->hash['phash'],$this->conf['mtime']);
		$this->updateSetId($this->hash['phash']);
		$this->update_grlist($checkCHash['phash'],$this->hash['phash']);
		$this->updateRootline();
		$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$this->content_md5h.', has not changed. Timestamp, grlist and rootline updated if necessary.');
	}
}
00588
/**
 * Splits an HTML document into the parts "title", "keywords", "description" and "body".
 *
 * Everything from the first "<body" on is the body; the part before it is searched for the
 * title and meta tags. If no "<body" is found, stristr() returns FALSE and the head part
 * becomes empty as well. The body is reduced by the TYPO3SEARCH markers and the
 * sections in $this->excludeSections, and finally stripped of tags.
 *
 * @param	string		HTML document
 * @return	array		Content parts, based on $this->defaultContentArray
 */
function splitHTMLContent($content) {
	$parts = $this->defaultContentArray;

		// Divide head from body
	$parts['body'] = stristr($content,'<body');
	$headPart = substr($content,0,-strlen($parts['body']));

		// Title: if it contains a colon, only the text after the first colon is used
	$this->embracingTags($headPart,'TITLE',$parts['title'],$dummy2,$dummy);
	$titleParts = explode(':',$parts['title'],2);
	if (isset($titleParts[1])) {
		$parts['title'] = trim($titleParts[1]);
	} else {
		$parts['title'] = trim($titleParts[0]);
	}

		// Collect the attribute strings of all meta tags; each hit is cut out of $headPart
	$i = 0;
	while ($this->embracingTags($headPart,'meta',$dummy,$headPart,$meta[$i])) {
		$i++;
	}
		// Every matching meta tag is appended with a leading comma
	for ($i = 0; isset($meta[$i]); $i++) {
		$meta[$i] = t3lib_div::get_tag_attributes($meta[$i]);
		if (stristr($meta[$i]['name'],'keywords')) {
			$parts['keywords'] .= ','.$meta[$i]['content'];
		}
		if (stristr($meta[$i]['name'],'description')) {
			$parts['description'] .= ','.$meta[$i]['content'];
		}
	}

		// Restrict the body to the parts selected by the TYPO3SEARCH markers (if any)
	$this->typoSearchTags($parts['body']);

		// Remove the excluded sections, one occurrence per call
	foreach (explode(',',$this->excludeSections) as $tag) {
		do {
			$removed = $this->embracingTags($parts['body'],$tag,$dummy,$parts['body'],$dummy2);
		} while ($removed);
	}

		// A space before every tag keeps words apart once the tags are stripped
	$parts['body'] = trim(strip_tags(str_replace('<',' <',$parts['body'])));

	$parts['keywords'] = trim($parts['keywords']);
	$parts['description'] = trim($parts['description']);

	return $parts;
}
00635
/**
 * Extracts the charset from the http-equiv "Content-Type" meta tag of an HTML document.
 * Matching is case-insensitive. Uses PCRE, since the POSIX function eregi() used
 * previously does not exist in PHP 7 and later.
 *
 * @param	string		HTML document
 * @return	string		The charset as written in the document; NULL if no such meta tag or no charset was found
 */
function getHTMLcharset($content) {
	if (preg_match('/<meta[[:space:]]+[^>]*http-equiv[[:space:]]*=[[:space:]]*["\']CONTENT-TYPE["\'][^>]*>/i',$content,$reg)) {
		if (preg_match('/charset[[:space:]]*=[[:space:]]*([[:alnum:]-]+)/i',$reg[0],$reg2)) {
			return $reg2[1];
		}
	}
}
00649
/**
 * Converts an HTML document to utf-8, including its entities.
 *
 * @param	string		HTML document
 * @param	string		Charset of the document; if empty, it is looked up in the document with getHTMLcharset()
 * @return	string		Converted document
 */
function convertHTMLToUtf8($content,$charset='') {

		// Find and normalize the charset
	if (!$charset) {
		$charset = $this->getHTMLcharset($content);
	}
	$charset = $this->csObj->parse_charset($charset);

		// Convert only when a charset other than utf-8 is known
	$needsConversion = $charset && $charset!=='utf-8';
	if ($needsConversion) {
		$content = $this->csObj->utf8_encode($content, $charset);
	}

		// Entities are converted in any case
	return $this->csObj->entities_to_utf8($content, TRUE);
}
00672
/**
 * Finds the first occurrence of a tag (case-insensitive) and returns its content,
 * its attribute string and the input with that tag cut out.
 *
 * If no end tag follows, the tag content is empty and $stringAfter holds only the text
 * after the start tag (the text before it is dropped). If the tag is not found at all,
 * none of the reference parameters are touched.
 *
 * @param	string		String to search in
 * @param	string		Tag name, e.g. "title"
 * @param	string		Returns the text between start and end tag (passed by reference)
 * @param	string		Returns the remaining string (passed by reference)
 * @param	string		Returns the text between the tag name and the closing ">" of the start tag (passed by reference)
 * @return	boolean		TRUE if the start tag was found, otherwise FALSE
 */
function embracingTags($string,$tagName,&$tagContent,&$stringAfter,&$paramList) {
	$startTag = '<'.$tagName;
	$endTag = '</'.$tagName.'>';

	$fromStartTag = stristr($string,$startTag);
	if (!$fromStartTag) {
		return false;
	}

		// Split at the first ">" after the tag name: attributes | rest
	$pieces = explode('>',substr($fromStartTag,strlen($startTag)),2);
	$paramList = $pieces[0];
	$rest = $pieces[1];

	$fromEndTag = stristr($rest,$endTag);
	if (!$fromEndTag) {
		$tagContent = '';
		$stringAfter = $rest;
		return true;
	}

	$startPos = strpos(strtolower($string), strtolower($startTag));
	$stringBefore = substr($string, 0, $startPos);
	$tagContent = substr($rest,0,strlen($rest)-strlen($fromEndTag));
	$stringAfter = $stringBefore.substr($fromEndTag,strlen($endTag));

	return true;
}
00705
/**
 * Reduces $body to the parts selected by the "<!--TYPO3SEARCH_begin-->" and
 * "<!--TYPO3SEARCH_end-->" comments. Text after a "begin" marker is kept. An "end"
 * marker adds the text remembered from the last chunk without a marker, which is
 * the text before the first marker (cleared by each "begin").
 *
 * @param	string		HTML body, modified if markers are found (passed by reference)
 * @return	boolean		TRUE if at least one marker was found, otherwise FALSE ($body untouched)
 */
function typoSearchTags(&$body) {
	$chunks = preg_split('/\<\!\-\-[\s]?TYPO3SEARCH_/',$body);

	if (count($chunks)<=1) {
		return false;
	}

	$body = '';
	foreach ($chunks as $chunk) {
			// $part[0] is the marker type (or plain text), $part[1] the text after the comment
		$part = explode('-->',$chunk,2);
		switch (trim($part[0])) {
			case 'begin':
				$body .= $part[1];
				$prev = '';
			break;
			case 'end':
				$body .= $prev;
			break;
			default:
				$prev = $chunk;
			break;
		}
	}

	return true;
}
00734
/**
 * Finds the links in the content and indexes the documents they point to:
 * URLs with a scheme are passed to indexExternalUrl() (if "indexExternalURLs" is enabled),
 * links without scheme and query string are treated as local files. Local files are either
 * indexed right away or, with "useCrawlerForExternalFiles" and the crawler extension loaded,
 * added to the crawler queue.
 *
 * NOTE(review): any URL with a scheme is fetched, whatever the scheme is - confirm that the
 * indexed content is trusted, or restrict the schemes.
 *
 * @param	string		HTML content to search for links
 * @return	void
 */
function extractLinks($content) {

	$list = $this->extractHyperLinks($content);

		// Stays NULL unless the crawler is to be used; initialized so the is_object() test below is defined
	$crawler = NULL;
	if ($this->indexerConfig['useCrawlerForExternalFiles'] && t3lib_extMgm::isLoaded('crawler')) {
		$this->includeCrawlerClass();
		$crawler = t3lib_div::makeInstance('tx_crawler_lib');
	}

	foreach ($list as $linkInfo) {

			// A registered local path takes precedence over the href
		if ($linkInfo['localPath']) {
			$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['localPath']);
		} else {
			$linkSource = t3lib_div::htmlspecialchars_decode($linkInfo['href']);
		}

		$qParts = parse_url($linkSource);

			// jumpurl links: the real target is in the "jumpurl" parameter
		if (!empty($qParts['query']) && strstr($qParts['query'],'jumpurl=')) {
			parse_str($qParts['query'],$getP);
			$linkSource = $getP['jumpurl'];
			$qParts = parse_url($linkSource);
		}

		if (!empty($qParts['scheme'])) {
			if ($this->indexerConfig['indexExternalURLs']) {
				$this->indexExternalUrl($linkSource);
			}
			continue;
		}

			// Links with a query string are not files
		if (!empty($qParts['query'])) {
			continue;
		}

		if (t3lib_div::isAllowedAbsPath($linkSource)) {
			$localFile = $linkSource;
		} else {
			$localFile = t3lib_div::getFileAbsFileName(PATH_site.$linkSource);
		}
		if (!$localFile || !@is_file($localFile)) {
			continue;
		}

		if (is_object($crawler)) {
				// Queue the file; "alturl" is only set for files registered with a local path
			$params = array('document' => $linkSource);
			if ($linkInfo['localPath']) {
				$params['alturl'] = $linkInfo['href'];
			}
			$params['conf'] = $this->conf;
			unset($params['conf']['content']);

			$crawler->addQueueEntry_callBack(0,$params,'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_files',$this->conf['id']);
			$this->log_setTSlogMessage('media "'.$params['document'].'" added to "crawler" queue.',1);
		} elseif ($linkInfo['localPath']) {
				// Indexed under its href, read from the local path; the extension comes from the local path
			$fI = pathinfo($linkSource);
			$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
			$this->indexRegularDocument($linkInfo['href'], false, $linkSource, $ext);
		} else {
			$this->indexRegularDocument($linkSource);
		}
	}
}
00819
/**
 * Collects all <a> tags with a non-empty href from a string.
 *
 * @param	string		HTML content
 * @return	array		One entry per link with the keys "tag" (the tag itself), "href" and "localPath".
 *				"localPath" is the path registered for the href in $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles']
 *				(empty string if none is registered, FALSE if that registry is not an array)
 */
function extractHyperLinks($string) {
		// The parser is created once and reused
	if (!is_object($this->htmlParser)) {
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$list = array();
	foreach ($this->htmlParser->splitTags('a',$string) as $k => $v) {
			// Only the odd-indexed parts of the split result are inspected
		if (!($k%2)) {
			continue;
		}

		$params = $this->htmlParser->get_tag_attributes($v,1);
		$firstTagName = $this->htmlParser->getFirstTagName($v);
		if (strtolower($firstTagName) != 'a') {
			continue;
		}

		$src = $params[0]['href'];
		if (!$src) {
			continue;
		}

			// Local paths are registered under the short md5 of the href
		$md5 = t3lib_div::shortMD5($src);
		$indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
		if (is_array($indexLocalFiles)) {
			$localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
		} else {
			$localPath = false;
		}

		$list[] = array(
			'tag' => $v,
			'href' => $params[0]['href'],
			'localPath' => $localPath
		);
	}

	return $list;
}
00862
00863
00864
00865
00866
00867
00868
00869
00870
00871
00872
00873
00874
00875
00876
00877
00878
/**
 * Indexes an external URL if its headers announce "text/html": the document is
 * fetched, written to a temporary ".html" file and passed to indexRegularDocument().
 * The fetched content is also kept in $this->indexExternalUrl_content.
 *
 * @param	string		URL to index
 * @return	void
 */
function indexExternalUrl($externalUrl) {

		// HTTP header names are case-insensitive, so "content-type" must match as well
	$contentType = '';
	$urlHeaders = $this->getUrlHeaders($externalUrl);
	if (is_array($urlHeaders)) {
		foreach ($urlHeaders as $headerName => $headerValue) {
			if (strtolower(trim($headerName)) === 'content-type') {
				$contentType = (string)$headerValue;
				break;
			}
		}
	}
	if (!stristr($contentType,'text/html')) {
		return;
	}

	$content = $this->indexExternalUrl_content = t3lib_div::getUrl($externalUrl);
	if (!strlen($content)) {
		return;
	}

		// The content is written to "<temp name>.html"
	$tmpBase = t3lib_div::tempnam('EXTERNAL_URL');
	$tmpFile = $tmpBase.'.html';
	t3lib_div::writeFile($tmpFile, $content);

	$this->indexRegularDocument($externalUrl, TRUE, $tmpFile, 'html');
	unlink($tmpFile);

		// PHP's tempnam() creates the file it names; if t3lib_div::tempnam() did so too, that file
		// was left behind before. The is_file() test makes this a no-op otherwise.
	if ($tmpBase && @is_file($tmpBase)) {
		unlink($tmpBase);
	}
}
00909
/**
 * Fetches the headers of a URL (t3lib_div::getURL() with second argument 2) and
 * returns them as an array.
 *
 * @param	string		URL
 * @return	array		Header name => value. The value is everything after the first colon, untrimmed;
 *				lines without a colon get the value NULL. Nothing (NULL) is returned if no headers could be read.
 */
function getUrlHeaders($url) {
	$content = t3lib_div::getURL($url,2);

	if (strlen($content)) {
		$headers = t3lib_div::trimExplode(chr(10),$content,1);
		$retVal = array();
		foreach ($headers as $line) {
				// An empty line ends the headers
			if (!strlen(trim($line))) {
				break;
			}

				// Padded, because lines without a colon would otherwise raise an undefined offset notice
			list($headKey, $headValue) = array_pad(explode(':', $line, 2), 2, NULL);
			$retVal[$headKey] = $headValue;
		}
		return $retVal;
	}
}
00935
00936
00937
00938
00939
00940
00941
00942
00943
00944
00945
00946
00947
00948
00949
00950
00951
00952
00953
/**
 * Indexes a document (external file) with the parser registered for its extension.
 * A file may consist of several parts (see fileContentParts()); each part is checked
 * and indexed on its own.
 *
 * @param	string		File name / URL the document is indexed under; also the file that is read unless $contentTmpFile is given
 * @param	boolean		If set, the part is indexed regardless of timestamp check, file limit and content hash
 * @param	string		File to read the content from instead of $file (used as is, without path checks)
 * @param	string		Extension to use instead of the one found in $file
 * @return	void
 */
function indexRegularDocument($file, $force=FALSE, $contentTmpFile='', $altExtension='') {

		// Extension: an explicitly given one wins; files without extension get an empty one
	$fI = pathinfo($file);
	if ($altExtension) {
		$ext = $altExtension;
	} else {
		$ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
	}

		// Find the file to read; paths derived from $file must be allowed absolute paths
	if (!$contentTmpFile) {
		if (!t3lib_div::isAbsPath($file)) {
			$absFile = t3lib_div::getFileAbsFileName(PATH_site.$file);
		} else {
			$absFile = $file;
		}
		$absFile = t3lib_div::isAllowedAbsPath($absFile) ? $absFile : '';
	} else {
		$absFile = $contentTmpFile;
	}

	if (!$absFile || !@is_file($absFile)) {
		$this->log_setTSlogMessage('Indexing not possible; File "'.$absFile.'" not found or valid.');
		return;
	}
	if (empty($this->external_parsers[$ext])) {
		$this->log_setTSlogMessage('Indexing not possible; The extension "'.$ext.'" was not supported.');
		return;
	}

	$mtime = filemtime($absFile);
	$cParts = $this->fileContentParts($ext,$absFile);

	foreach ($cParts as $cPKey) {
		$this->internal_log = array();
		$this->log_push('Index: '.str_replace('.','_',basename($file)).($cPKey?'#'.$cPKey:''),'');
		$Pstart = t3lib_div::milliseconds();
		$subinfo = array('key' => $cPKey);
		$phash_arr = $this->file_phash_arr = $this->setExtHashes($file,$subinfo);
		$check = $this->checkMtimeTstamp($mtime, $phash_arr['phash']);
		if ($check > 0 || $force) {
			if ($check > 0) {
				$this->log_setTSlogMessage('Indexing needed, reason: '.$this->reasons[$check],1);
			} else {
				$this->log_setTSlogMessage('Indexing forced by flag',1);
			}

				// The number of files indexed per run is limited, unless forced
			if ($this->externalFileCounter < $this->maxExternalFiles || $force) {

				$this->log_push('Split content','');
				$contentParts = $this->readFileContent($ext,$absFile,$cPKey);
				$this->log_pull();

				if (is_array($contentParts)) {
						// Separator first: implode($array, $glue) was removed in PHP 8
					$content_md5h = $this->md5inthash(implode('', $contentParts));

					if ($this->checkExternalDocContentHash($phash_arr['phash_grouping'], $content_md5h) || $force) {

						$this->externalFileCounter++;

						$this->log_push('Extract words from content','');
						$splitInWords = $this->processWordsInArrays($contentParts);
						$this->log_pull();

						$this->log_push('Analyse the extracted words','');
						$indexArr = $this->indexAnalyze($splitInWords);
						$this->log_pull();

						$this->log_push('Submitting page','');
						$size = filesize($absFile);
							// NOTE(review): the modification time is submitted as creation time as well - confirm this is intended
						$ctime = filemtime($absFile);
						$this->submitFilePage($phash_arr,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts);
						$this->log_pull();

						$this->log_push('Check word list and submit words','');
						$this->checkWordList($indexArr);
						$this->submitWords($indexArr,$phash_arr['phash']);
						$this->log_pull();

						$this->updateParsetime($phash_arr['phash'],t3lib_div::milliseconds()-$Pstart);
					} else {
						$this->updateTstamp($phash_arr['phash'],$mtime);
						$this->log_setTSlogMessage('Indexing not needed, the contentHash, '.$content_md5h.', has not changed. Timestamp updated.');
					}
				} else {
					$this->log_setTSlogMessage('Could not index file! Unsupported extension.');
				}
			} else {
				$this->log_setTSlogMessage('The limit of '.$this->maxExternalFiles.' has already been exceeded, so no indexing will take place this time.');
			}
		} else {
			$this->log_setTSlogMessage('Indexing not needed, reason: '.$this->reasons[$check]);
		}

			// The section record is submitted whether or not the part was indexed
			// (a call to submitFile_grlist() is disabled at this point in the original code)
		$this->submitFile_section($phash_arr['phash']);
		$this->log_pull();
	}
}
01059
/**
 * Reads the content of a file (part) through the parser registered for the extension.
 *
 * @param	string		File extension; selects the parser
 * @param	string		Absolute file name
 * @param	mixed		Key of the file part to read, as returned by fileContentParts()
 * @return	mixed		Whatever the parser's readFileContent() returns (callers expect an array of content parts); NULL if no parser object exists for the extension
 */
function readFileContent($ext,$absFile,$cPKey) {
		// Defined up front, so the return value is not an undefined variable when there is no parser
	$contentArr = NULL;

	if (isset($this->external_parsers[$ext]) && is_object($this->external_parsers[$ext])) {
		$contentArr = $this->external_parsers[$ext]->readFileContent($ext,$absFile,$cPKey);
	}

	return $contentArr;
}
01078
/**
 * Returns the keys of the parts a file is to be indexed in, as reported by the parser
 * registered for the extension.
 *
 * @param	string		File extension; selects the parser
 * @param	string		Absolute file name
 * @return	array		Part keys from the parser's fileContentParts(); array(0) if no parser object exists for the extension
 */
function fileContentParts($ext,$absFile) {
	$parser = $this->external_parsers[$ext];

	if (!is_object($parser)) {
			// Without a parser the file counts as one single part
		return array(0);
	}

	return $this->external_parsers[$ext]->fileContentParts($ext,$absFile);
}
01096
/**
 * Wraps plain content in a content parts array: the content becomes the "body",
 * all other parts keep their values from $this->defaultContentArray.
 *
 * @param	string		Content
 * @return	array		Content parts
 */
function splitRegularContent($content) {
	$parts = $this->defaultContentArray;
	$parts['body'] = $content;
	return $parts;
}
01110
01111
01112
01113
01114
01115
01116
01117
01118
01119
01120
01121
01122
01123
01124
01125
01126
01127
01128
01129
/**
 * Converts all non-empty entries of a content array to utf-8 and resolves their entities.
 *
 * @param	array		Content parts, converted in place (passed by reference)
 * @param	string		Charset of the content; entries are only re-encoded if it is not exactly "utf-8"
 * @return	void
 */
function charsetEntity2utf8(&$contentArr, $charset) {
		// foreach instead of each(), which no longer exists in PHP 8.
		// Iterating the keys keeps the in-place update independent of the loop.
	foreach (array_keys($contentArr) as $key) {
		if (strlen($contentArr[$key])) {

			if ($charset!=='utf-8') {
				$contentArr[$key] = $this->csObj->utf8_encode($contentArr[$key], $charset);
			}

				// Entities are resolved regardless of the charset
			$contentArr[$key] = $this->csObj->entities_to_utf8($contentArr[$key],TRUE);
		}
	}
}
01153
/**
 * Splits every entry of a content array into words with the lexer and removes
 * duplicate words from title, keywords and description (not from the body).
 *
 * @param	array		Content parts (strings); must contain the keys "title", "keywords" and "description"
 * @return	array		Content parts, each entry now being what the lexer's split2Words() returned
 */
function processWordsInArrays($contentArr) {

		// foreach instead of each(), which no longer exists in PHP 8
	foreach (array_keys($contentArr) as $key) {
		$contentArr[$key] = $this->lexerObj->split2Words($contentArr[$key]);
	}

		// The body keeps its duplicates
	$contentArr['title'] = array_unique($contentArr['title']);
	$contentArr['keywords'] = array_unique($contentArr['keywords']);
	$contentArr['description'] = array_unique($contentArr['description']);

	return $contentArr;
}
01176
/**
 * Alias of processWordsInArrays() under a misspelled name
 * (NOTE(review): presumably kept for backwards compatibility - confirm).
 *
 * @param	array		Content parts
 * @return	array		Result of processWordsInArrays()
 */
function procesWordsInArrays($contentArr) {
	$wordArrays = $this->processWordsInArrays($contentArr);
	return $wordArrays;
}
01188
/**
 * Creates the description of an indexed document from the beginning of its body.
 * The maximum length comes from conf "index_descrLgd" (0-255, default 200).
 *
 * @param	array		Content parts; only "body" is used
 * @return	string		Body with tabs and line breaks replaced by spaces, truncated to the maximum length; empty string if the maximum length is 0
 */
function bodyDescription($contentArr) {
		// Defined up front: with a maximum length of 0 an undefined variable was returned before
	$bodyDescription = '';

	$maxL = t3lib_div::intInRange($this->conf['index_descrLgd'],0,255,200);
	if ($maxL) {
			// Turn white space into plain spaces
		$bodyDescription = str_replace(array(' ',"\t","\r","\n"),' ',$contentArr['body']);

			// Truncate as utf-8 string
		$bodyDescription = $this->csObj->strtrunc('utf-8', $bodyDescription, $maxL);
	}

	return $bodyDescription;
}
01210
/**
 * Builds the word index of a document from its word lists. Title, keywords and
 * description words are flagged with bit 7, 6 and 5 respectively; body words are added last.
 *
 * @param	array		Word lists with the keys "title", "keywords", "description" and "body" (see processWordsInArrays())
 * @return	array		Index array, keyed by word (see analyzeHeaderinfo() and analyzeBody())
 */
function indexAnalyze($content) {
	$indexArr = Array();

	$this->analyzeHeaderinfo($indexArr,$content,'title',7);
	$this->analyzeHeaderinfo($indexArr,$content,'keywords',6);
	$this->analyzeHeaderinfo($indexArr,$content,'description',5);
	$this->analyzeBody($indexArr,$content);

	return ($indexArr);
}
01228
/**
 * Adds the words of one header part (title, keywords or description) to the index array.
 * Words are cut to 60 bytes. For each word the flag bit 2^$offset is set in "cmp",
 * "count" is incremented and "hash" / "metaphone" are (re)calculated.
 *
 * @param	array		Index array, keyed by word (passed by reference)
 * @param	array		Word lists of the document
 * @param	string		Key of the word list in $content to process
 * @param	integer		Bit number to set in "cmp" for the words of this part
 * @return	void
 */
function analyzeHeaderinfo(&$retArr,$content,$key,$offset) {
		// foreach instead of each(), which no longer exists in PHP 8
	foreach ($content[$key] as $val) {
		$val = substr($val,0,60);

			// "cmp" and "count" do not exist yet for a word seen for the first time
		$cmp = isset($retArr[$val]['cmp']) ? $retArr[$val]['cmp'] : 0;
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;

		$retArr[$val]['cmp'] = $cmp|pow(2,$offset);
		$retArr[$val]['count'] = $count+1;
		$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
		$retArr[$val]['metaphone'] = $this->metaphone($val);
		$this->wordcount++;
	}
}
01249
/**
 * Adds the body words to the index array. Words are cut to 60 bytes. A word not yet in
 * the index gets "first" (its key in the body word list), "hash" and "metaphone";
 * "count" is incremented for every occurrence.
 *
 * @param	array		Index array, keyed by word (passed by reference)
 * @param	array		Word lists of the document; only "body" is used
 * @return	void
 */
function analyzeBody(&$retArr,$content) {
	foreach($content['body'] as $key => $val) {
		$val = substr($val,0,60);
		if(!isset($retArr[$val])) {
			$retArr[$val]['first'] = $key;
			$retArr[$val]['hash'] = hexdec(substr(md5($val),0,7));
			$retArr[$val]['metaphone'] = $this->metaphone($val);
		}
			// "count" does not exist yet for a word seen for the first time
		$count = isset($retArr[$val]['count']) ? $retArr[$val]['count'] : 0;
		$retArr[$val]['count'] = $count+1;
		$this->wordcount++;
	}
}
01269
/**
 * Calculates the metaphone value of a word, using the configured metaphone object
 * if there is one and PHP's metaphone() otherwise.
 *
 * @param	string		Word
 * @param	boolean		If set, the raw metaphone value is returned instead of its hash
 * @return	mixed		Raw metaphone value, or an integer made from the first 7 hex digits of its md5 (0 if the metaphone value is empty)
 */
function metaphone($word,$retRaw=FALSE) {

	$phonetic = is_object($this->metaphoneObj)
		? $this->metaphoneObj->metaphone($word, $this->conf['sys_language_uid'])
		: metaphone($word);

	if ($retRaw) {
		return $phonetic;
	}

	return $phonetic=='' ? 0 : hexdec(substr(md5($phonetic),0,7));
}
01292
01293
01294
01295
01296
01297
01298
01299
01300
01301
01302
01303
01304
01305
01306
01307
01308
01309
01310
01311
01312
01313
/**
 * Writes the current page to the index: removes the old records of the page hash, then
 * inserts the index_phash record, section and gr_list records, the full text and -
 * in debug mode - the debug information.
 *
 * @return	void
 */
function submitPage() {
	$phash = $this->hash['phash'];

		// Remove what is stored for this page hash already
	$this->removeOldIndexedPages($phash);

		// Main record
	$phashRow = array(
		'phash' => $this->hash['phash'],
		'phash_grouping' => $this->hash['phash_grouping'],
		'cHashParams' => serialize($this->cHashParams),
		'contentHash' => $this->content_md5h,
		'data_page_id' => $this->conf['id'],
		'data_page_reg1' => $this->conf['page_cache_reg1'],
		'data_page_type' => $this->conf['type'],
		'data_page_mp' => $this->conf['MP'],
		'gr_list' => $this->conf['gr_list'],
		'item_type' => 0,
		'item_title' => $this->contentParts['title'],
		'item_description' => $this->bodyDescription($this->contentParts),
		'item_mtime' => $this->conf['mtime'],
		'item_size' => strlen($this->conf['content']),
		'tstamp' => time(),
		'crdate' => time(),
		'item_crdate' => $this->conf['crdate'],
		'sys_language_uid' => $this->conf['sys_language_uid'],
		'externalUrl' => 0,
		'recordUid' => intval($this->conf['recordUid']),
		'freeIndexUid' => intval($this->conf['freeIndexUid']),
		'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
	);
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $phashRow);

		// Section and group list records; the page hash is passed for both hash arguments
	$this->submit_section($phash,$phash);
	$this->submit_grlist($phash,$phash);

		// Full text, optionally cut to the configured length
	$fullText = implode(' ', $this->contentParts);
	if ($this->indexerConfig['fullTextDataLength']>0) {
		$fullText = substr($fullText,0,$this->indexerConfig['fullTextDataLength']);
	}
	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', array(
		'phash' => $phash,
		'fulltextdata' => $fullText
	));

		// Debug information; content and body are cut to 1000 bytes
	if ($this->indexerConfig['debugMode']) {
		$debugInfo = array(
			'cHashParams' => $this->cHashParams,
			'external_parsers initialized' => array_keys($this->external_parsers),
			'conf' => array_merge($this->conf,array('content'=>substr($this->conf['content'],0,1000))),
			'contentParts' => array_merge($this->contentParts,array('body' => substr($this->contentParts['body'],0,1000))),
			'logs' => $this->internal_log,
			'lexer' => $this->lexerObj->debugString,
		);
		$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', array(
			'phash' => $phash,
			'debuginfo' => serialize($debugInfo)
		));
	}
}
01384
/**
 * Inserts an index_grlist record for the group list in conf "gr_list".
 *
 * @param	integer		Value for the field "phash"
 * @param	integer		Value for the field "phash_x"
 * @return	void
 */
function submit_grlist($hash,$phash_x) {
	$grList = $this->conf['gr_list'];

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_grlist', array(
		'phash' => $hash,
		'phash_x' => $phash_x,
		'hash_gr_list' => $this->md5inthash($grList),
		'gr_list' => $grList
	));
}
01404
/**
 * Inserts an index_section record for the current page (conf "id").
 *
 * @param	integer		Value for the field "phash"
 * @param	integer		Value for the field "phash_t3"
 * @return	void
 */
function submit_section($hash,$hash_t3) {
	$sectionRow = array(
		'phash' => $hash,
		'phash_t3' => $hash_t3,
		'page_id' => intval($this->conf['id'])
	);

		// NOTE(review): presumably adds the root line fields to the row by reference - confirm in getRootLineFields()
	$this->getRootLineFields($sectionRow);

	$GLOBALS['TYPO3_DB']->exec_INSERTquery('index_section', $sectionRow);
}
01424
/**
 * Deletes everything stored for a page hash: the records with that phash in index_phash,
 * index_section, index_grlist, index_fulltext and index_debug, plus the index_section
 * records referring to it in "phash_t3".
 *
 * @param	integer		Page hash
 * @return	void
 */
function removeOldIndexedPages($phash) {
	$tables = array('index_phash','index_section','index_grlist','index_fulltext','index_debug');

	foreach ($tables as $table) {
		$GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($phash));
	}

		// Section records that point to this page from other hashes
	$GLOBALS['TYPO3_DB']->exec_DELETEquery('index_section', 'phash_t3='.intval($phash));
}
01440
01441
01442
01443
01444
01445
01446
01447
01448
01449
01450
01451
01452
01453
01454
01455
01456
01457
01458
01459
/**
 * Stores the index records of an external file in index_phash, index_fulltext
 * and (in debug mode) index_debug, after removing older records for the same phash.
 *
 * @param array   $hash          Array with the keys "phash" and "phash_grouping".
 * @param string  $file          File reference (path or URL); stored as data_filename.
 * @param array   $subinfo       Sub-information; serialized into the cHashParams column.
 * @param string  $ext           File extension; used to look up the item type of the external parser.
 * @param integer $mtime         Stored as item_mtime.
 * @param integer $ctime         Stored as item_crdate.
 * @param integer $size          Stored as item_size.
 * @param integer $content_md5h  Stored as contentHash.
 * @param array   $contentParts  Content parts ("title", "body", ...) of the file.
 * @return void
 */
function submitFilePage($hash,$file,$subinfo,$ext,$mtime,$ctime,$size,$content_md5h,$contentParts) {

    // Item type: taken from the parser registered for this extension, falling back to the extension itself.
    // The lookup is guarded so a missing parser object does not raise a property-access warning.
    $storeItemType = '';
    if (isset($this->external_parsers[$ext])
        && is_object($this->external_parsers[$ext])
        && isset($this->external_parsers[$ext]->ext2itemtype_map[$ext])) {
        $storeItemType = $this->external_parsers[$ext]->ext2itemtype_map[$ext];
    }
    $storeItemType = $storeItemType ? $storeItemType : $ext;

    // Remove any existing records for this phash before inserting new ones.
    $this->removeOldIndexedFiles($hash['phash']);

    // parse_url() returns no "scheme" key for plain local paths and may return FALSE
    // for malformed input, so the key is tested before it is read.
    $fileParts = parse_url($file);
    $isExternalUrl = (is_array($fileParts) && !empty($fileParts['scheme'])) ? 1 : 0;

    // Title falls back to the file's basename when no non-empty title is available.
    $hasTitle = isset($contentParts['title']) && trim($contentParts['title']);

    // One clock read, so tstamp and crdate of a new record always agree.
    $now = time();

    // Main record:
    $fields = array(
        'phash' => $hash['phash'],
        'phash_grouping' => $hash['phash_grouping'],
        'cHashParams' => serialize($subinfo),
        'contentHash' => $content_md5h,
        'data_filename' => $file,
        'item_type' => $storeItemType,
        'item_title' => $hasTitle ? $contentParts['title'] : basename($file),
        'item_description' => $this->bodyDescription($contentParts),
        'item_mtime' => $mtime,
        'item_size' => $size,
        'item_crdate' => $ctime,
        'tstamp' => $now,
        'crdate' => $now,
        'gr_list' => $this->conf['gr_list'],
        'externalUrl' => $isExternalUrl,
        'recordUid' => intval($this->conf['recordUid']),
        'freeIndexUid' => intval($this->conf['freeIndexUid']),
        'freeIndexSetId' => intval($this->conf['freeIndexSetId']),
    );
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_phash', $fields);

    // Full text record: all content parts joined by a space, optionally truncated.
    // NOTE(review): substr() truncates by bytes and may split a multi-byte character - confirm whether that matters here.
    $fields = array(
        'phash' => $hash['phash'],
        'fulltextdata' => implode(' ', $contentParts)
    );
    if (isset($this->indexerConfig['fullTextDataLength']) && $this->indexerConfig['fullTextDataLength']>0) {
        $fields['fulltextdata'] = substr($fields['fulltextdata'],0,$this->indexerConfig['fullTextDataLength']);
    }
    $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_fulltext', $fields);

    // Debug record (only when debugMode is enabled); the body is limited to its first 1000 bytes.
    if (!empty($this->indexerConfig['debugMode'])) {
        $bodyText = isset($contentParts['body']) ? $contentParts['body'] : '';
        $fields = array(
            'phash' => $hash['phash'],
            'debuginfo' => serialize(array(
                'cHashParams' => $subinfo,
                'contentParts' => array_merge($contentParts,array('body' => substr($bodyText,0,1000))),
                'logs' => $this->internal_log,
                'lexer' => $this->lexerObj->debugString,
            ))
        );
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_debug', $fields);
    }
}
01533
/**
 * Ensures an index_grlist row exists for a file hash.
 *
 * A row is inserted (with phash_x equal to phash) unless one already exists
 * for either the default gr_list or the currently configured gr_list.
 *
 * @param integer $hash  File phash.
 * @return void
 */
function submitFile_grlist($hash) {
    $defaultGrHash = $this->md5inthash($this->defaultGrList);
    $currentGrHash = $this->md5inthash($this->conf['gr_list']);
    $where = 'phash='.intval($hash).' AND (hash_gr_list='.$defaultGrHash.' OR hash_gr_list='.$currentGrHash.')';

    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);
    $found = $GLOBALS['TYPO3_DB']->sql_num_rows($result);
    if ($found) {
        return;
    }

    $this->submit_grlist($hash,$hash);
}
01547
/**
 * Ensures an index_section row exists that ties a file hash to the configured page id.
 *
 * When no such row exists, one is inserted with the current page phash as "phash_t3".
 *
 * @param integer $hash  File phash.
 * @return void
 */
function submitFile_section($hash) {
    $where = 'phash='.intval($hash).' AND page_id='.intval($this->conf['id']);

    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_section', $where);
    $found = $GLOBALS['TYPO3_DB']->sql_num_rows($result);
    if ($found) {
        return;
    }

    $this->submit_section($hash,$this->hash['phash']);
}
01561
/**
 * Deletes the index records of a file hash from index_phash, index_grlist,
 * index_fulltext and index_debug. index_section is not touched.
 *
 * @param integer $phash  File hash; cast to integer before it is used in SQL.
 * @return void
 */
function removeOldIndexedFiles($phash) {
    $where = 'phash='.intval($phash);

    $tables = array('index_phash', 'index_grlist', 'index_fulltext', 'index_debug');
    foreach ($tables as $tableName) {
        $GLOBALS['TYPO3_DB']->exec_DELETEquery($tableName, $where);
    }
}
01576
01577
01578
01579
01580
01581
01582
01583
01584
01585
01586
01587
01588
01589
01590
01591
01592
01593
01594
01595
/**
 * Decides whether a document needs (re-)indexing, based on its index_phash record.
 *
 * Return codes (see also $this->reasons):
 *    4  no index_phash record exists for the phash
 *    1  the configured max age was exceeded
 *   -2  the minimum age was not exceeded
 *    3  minimum age passed (or none set) and no mtime given
 *    2  minimum age passed (or none set) and the stored mtime differs
 *   -1  the stored mtime equals $mtime; the tstamp is refreshed unless a max age is set
 *
 * @param integer $mtime  Modification time of the document; a false-like value means "not set".
 * @param integer $phash  Hash identifying the index_phash record.
 * @return integer  One of the codes listed above.
 */
function checkMtimeTstamp($mtime,$phash) {
    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('item_mtime,tstamp', 'index_phash', 'phash='.intval($phash));
    $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

    // Never indexed before.
    if (!$row) {
        return 4;
    }

    // Max age configured and exceeded.
    if ($this->tstamp_maxAge && ($row['tstamp']+$this->tstamp_maxAge) < time()) {
        return 1;
    }

    // Min age configured and not yet exceeded.
    if ($this->tstamp_minAge && ($row['tstamp']+$this->tstamp_minAge) >= time()) {
        return -2;
    }

    // Without an mtime there is nothing to compare against.
    if (!$mtime) {
        return 3;
    }

    if ($row['item_mtime'] != $mtime) {
        return 2;
    }

    // mtime is unchanged: refresh the timestamp only when no max age is configured.
    if ($this->tstamp_maxAge) {
        $this->log_setTSlogMessage('mtime matched, timestamp NOT updated because a maxAge is set ('.($row['tstamp'] + $this->tstamp_maxAge - time()).' seconds to expire time).',1);
    } else {
        $this->updateTstamp($phash);
        $this->log_setTSlogMessage('mtime matched, timestamp updated.',1);
    }
    return -1;
}
01634
/**
 * Looks for an index_phash record with the current phash_grouping and content hash.
 *
 * @return mixed  The first matching row (array with key "phash"), or integer 1 when none was found.
 */
function checkContentHash() {
    $where = 'A.phash_grouping='.intval($this->hash['phash_grouping']).' AND A.contentHash='.intval($this->content_md5h);

    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_phash A', $where);
    $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

    return $row ? $row : 1;
}
01648
/**
 * Tells whether an index_phash record with the given grouping hash and content hash exists.
 *
 * @param integer $hashGr        phash_grouping value to match.
 * @param integer $content_md5h  Content hash to match.
 * @return integer  0 when a matching record exists, otherwise 1.
 */
function checkExternalDocContentHash($hashGr,$content_md5h) {
    $where = 'A.phash_grouping='.intval($hashGr).' AND A.contentHash='.intval($content_md5h);

    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A', $where);
    $row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result);

    return $row ? 0 : 1;
}
01664
/**
 * Counts the index_grlist rows whose "phash_x" equals the given hash.
 *
 * @param integer $phash_x  Hash to look up; cast to integer before it is used in SQL.
 * @return integer  Number of matching rows as reported by the database layer.
 */
function is_grlist_set($phash_x) {
    $where = 'phash_x='.intval($phash_x);
    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash_x', 'index_grlist', $where);

    $rowCount = $GLOBALS['TYPO3_DB']->sql_num_rows($result);
    return $rowCount;
}
01675
/**
 * Inserts an index_grlist row for the configured gr_list unless one already exists for the phash.
 *
 * @param integer $phash    Hash matched against the "phash" column.
 * @param integer $phash_x  Value stored as "phash_x" when a row is inserted.
 * @return void
 */
function update_grlist($phash,$phash_x) {
    $where = 'phash='.intval($phash).' AND hash_gr_list='.$this->md5inthash($this->conf['gr_list']);
    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('phash', 'index_grlist', $where);

    // Nothing to do when the combination is already registered.
    if ($GLOBALS['TYPO3_DB']->sql_num_rows($result)) {
        return;
    }

    $this->submit_grlist($phash,$phash_x);
    $this->log_setTSlogMessage("Inserted gr_list '".$this->conf['gr_list']."' for phash '".$phash."'",1);
}
01691
/**
 * Sets "tstamp" of an index_phash record to the current time and,
 * when $mtime is given, "item_mtime" as well.
 *
 * @param integer $phash  Hash of the record to update.
 * @param integer $mtime  Optional new item_mtime; ignored when false-like.
 * @return void
 */
function updateTstamp($phash,$mtime=0) {
    $changes = array();
    $changes['tstamp'] = time();

    if ($mtime) {
        $changes['item_mtime'] = intval($mtime);
    }

    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', 'phash='.intval($phash), $changes);
}
01707
/**
 * Writes the configured freeIndexSetId into the index_phash record of the given hash.
 *
 * @param integer $phash  Hash of the record to update.
 * @return void
 */
function updateSetId($phash) {
    $where = 'phash='.intval($phash);
    $changes = array();
    $changes['freeIndexSetId'] = intval($this->conf['freeIndexSetId']);

    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $changes);
}
01721
/**
 * Stores the parse time in the index_phash record of the given hash.
 *
 * @param integer $phash      Hash of the record to update.
 * @param integer $parsetime  Parse time; cast to integer before it is stored.
 * @return void
 */
function updateParsetime($phash,$parsetime) {
    $where = 'phash='.intval($phash);
    $changes = array();
    $changes['parsetime'] = intval($parsetime);

    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_phash', $where, $changes);
}
01736
/**
 * Rewrites the rootline columns of all index_section rows of the configured page id.
 *
 * @return void
 */
function updateRootline() {
    // Filled by reference with rl0, rl1, rl2 and any additionally configured rootline fields.
    $rootlineValues = array();
    $this->getRootLineFields($rootlineValues);

    $where = 'page_id='.intval($this->conf['id']);
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_section', $where, $rootlineValues);
}
01749
/**
 * Adds the rootline columns to a field array.
 *
 * Sets rl0, rl1 and rl2 from the first three entries of conf['rootline_uids'], plus
 * one field per entry of TYPO3_CONF_VARS[EXTCONF][indexed_search][addRootLineFields]
 * (field name => rootline level) when that setting is an array.
 *
 * @param array $fieldArr  Field array, modified by reference.
 * @return void
 */
function getRootLineFields(&$fieldArr) {
    // Fixed levels 0-2.
    for ($level = 0; $level < 3; $level++) {
        $fieldArr['rl'.$level] = intval($this->conf['rootline_uids'][$level]);
    }

    // Optional extra levels from the extension configuration.
    if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'])) {
        return;
    }
    foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['addRootLineFields'] as $column => $level) {
        $fieldArr[$column] = intval($this->conf['rootline_uids'][$level]);
    }
}
01769
/**
 * Removes previously indexed pages that share the current phash_grouping and content hash
 * but were indexed under a gr_list other than the default one.
 *
 * Each removal is logged and carried out through removeOldIndexedPages().
 *
 * @return void
 */
function removeLoginpagesWithContentHash() {
    $where = '
A.phash=B.phash
AND A.phash_grouping='.intval($this->hash['phash_grouping']).'
AND B.hash_gr_list!='.$this->md5inthash($this->defaultGrList).'
AND A.contentHash='.intval($this->content_md5h);

    $result = $GLOBALS['TYPO3_DB']->exec_SELECTquery('*', 'index_phash A,index_grlist B', $where);

    while ($duplicate = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($result)) {
        $this->log_setTSlogMessage("The currently indexed page was indexed under no user-login and apparently this page has been indexed under login conditions earlier, but with the SAME content. Therefore the old similar page with phash='".$duplicate['phash']."' are now removed.",1);
        $this->removeOldIndexedPages($duplicate['phash']);
    }
}
01787
/**
 * Loads the library class file of the "crawler" extension (once).
 *
 * @return void
 */
function includeCrawlerClass() {
    // Brought into local scope before the file is included, which executes in this function's scope.
    global $TYPO3_CONF_VARS;

    require_once t3lib_extMgm::extPath('crawler').'class.tx_crawler_lib.php';
}
01798
01799
01800
01801
01802
01803
01804
01805
01806
01807
01808
01809
01810
01811
01812
01813
/**
 * Makes sure every word of a word list has a record in the "index_words" table.
 *
 * The table is queried for the word hashes; when the number of rows found differs
 * from the number of words, the words already present are dropped from the list
 * and the remaining ones are inserted.
 *
 * Iteration uses foreach instead of each(), which was deprecated in PHP 7.2 and
 * removed in PHP 8.0.
 *
 * @param array $wl  Word list: baseword => array with at least the keys "hash" and "metaphone".
 * @return void
 */
function checkWordList($wl) {
    // Collect the word ids; they are cast to integer because they are concatenated into SQL.
    $widList = array();
    foreach ($wl as $wordInfo) {
        $widList[] = intval($wordInfo['hash']);
    }
    if (!count($widList)) {
        return;
    }

    $res = $GLOBALS['TYPO3_DB']->exec_SELECTquery('baseword', 'index_words', 'wid IN ('.implode(',',$widList).')');
    $knownCount = $GLOBALS['TYPO3_DB']->sql_num_rows($res);

    // All words are already registered.
    if ($knownCount == count($wl)) {
        return;
    }

    $this->log_setTSlogMessage('Inserting words: '.(count($wl)-$knownCount),1);

    // Drop the words that already exist ...
    while ($row = $GLOBALS['TYPO3_DB']->sql_fetch_assoc($res)) {
        unset($wl[$row['baseword']]);
    }

    // ... and insert what is left.
    foreach ($wl as $baseword => $wordInfo) {
        $insertFields = array(
            'wid' => $wordInfo['hash'],
            'baseword' => $baseword,
            'metaphone' => $wordInfo['metaphone']
        );
        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_words', $insertFields);
    }
}
01849
/**
 * Replaces the word relations of a page in the "index_rel" table.
 *
 * All existing rows for the phash are deleted, then one row per word is inserted.
 *
 * @param array   $wl     Word list; each entry provides "hash", "count", "first" and "cmp".
 * @param integer $phash  Page hash the relations belong to.
 * @return void
 */
function submitWords($wl,$phash) {
    $GLOBALS['TYPO3_DB']->exec_DELETEquery('index_rel', 'phash='.intval($phash));

    foreach ($wl as $wordInfo) {
        // Share of this word in the total word count, mapped by freqMap().
        $ratio = $wordInfo['count']/$this->wordcount;

        $relation = array();
        $relation['phash'] = $phash;
        $relation['wid'] = $wordInfo['hash'];
        $relation['count'] = $wordInfo['count'];
        $relation['first'] = $wordInfo['first'];
        $relation['freq'] = $this->freqMap($ratio);
        $relation['flags'] = ($wordInfo['cmp'] & $this->flagBitMask);

        $GLOBALS['TYPO3_DB']->exec_INSERTquery('index_rel', $relation);
    }
}
01873
/**
 * Maps a word frequency ratio onto the range 0..$this->freqRange.
 *
 * Ratios up to and including 1 are multiplied by freqMax*100*freqRange and capped at
 * freqRange. A ratio of exactly 1 (every word of the document is this word) is
 * included in that branch; it was previously handled by the division branch and
 * produced a value close to zero instead of the maximum.
 *
 * @param float $freq  Frequency ratio (word count divided by total word count).
 * @return mixed  Mapped frequency: $this->freqRange when the cap applies, otherwise a float.
 */
function freqMap($freq) {
    $mapFactor = $this->freqMax*100*$this->freqRange;
    if ($freq<=1) {
        $newFreq = $freq*$mapFactor;
        // Cap at the upper end of the range.
        $newFreq = $newFreq>$this->freqRange ? $this->freqRange : $newFreq;
    } else {
        // Values above 1 keep their original treatment.
        $newFreq = $freq/$mapFactor;
    }
    return $newFreq;
}
01892
01893
01894
01895
01896
01897
01898
01899
01900
01901
01902
01903
01904
01905
01906
01907
01908
/**
 * Calculates and stores the two hashes identifying the current TYPO3 page.
 *
 * hash['phash_grouping'] is built from id, type, sys_language_uid, MP and the cHash
 * parameters; hash['phash'] additionally includes the gr_list.
 *
 * @return void
 */
function setT3Hashes() {
    // Key order matters: the array is serialized before hashing.
    $hashSource = array();
    $hashSource['id'] = intval($this->conf['id']);
    $hashSource['type'] = intval($this->conf['type']);
    $hashSource['sys_lang'] = intval($this->conf['sys_language_uid']);
    $hashSource['MP'] = strval($this->conf['MP']);
    $hashSource['cHash'] = $this->cHashParams;

    // Grouping hash: everything except the user groups.
    $this->hash['phash_grouping'] = $this->md5inthash(serialize($hashSource));

    // Page hash: the same data plus the gr_list.
    $hashSource['gr_list'] = strval($this->conf['gr_list']);
    $this->hash['phash'] = $this->md5inthash(serialize($hashSource));
}
01932
/**
 * Calculates the hashes identifying an external file.
 *
 * "phash_grouping" is built from the file reference alone; "phash" additionally
 * includes the sub-information.
 *
 * @param string $file     File reference.
 * @param array  $subinfo  Additional information included in "phash" only.
 * @return array  Array with the keys "phash_grouping" and "phash".
 */
function setExtHashes($file,$subinfo=array()) {
    $hashSource = array('file' => $file);
    $groupingHash = $this->md5inthash(serialize($hashSource));

    $hashSource['subinfo'] = $subinfo;
    $fileHash = $this->md5inthash(serialize($hashSource));

    return array(
        'phash_grouping' => $groupingHash,
        'phash' => $fileHash
    );
}
01956
/**
 * Turns a string into an integer hash: the first 7 hex digits of its md5 digest, read as a number.
 *
 * @param string $str  String to hash.
 * @return integer  Hash value (7 hex digits, i.e. at most 0xFFFFFFF).
 */
function md5inthash($str) {
    $digest = md5($str);
    $leadingHex = substr($digest, 0, 7);

    return hexdec($leadingHex);
}
01967
/**
 * Calculates a short md5 hash for an array of parameters.
 *
 * The array is imploded to a query string, passed through t3lib_div::cHashParams()
 * and the serialized result is hashed with t3lib_div::shortMD5().
 *
 * @param array $paramArray  Parameters to hash.
 * @return string  Value returned by t3lib_div::shortMD5().
 */
function makeCHash($paramArray) {
    $queryString = t3lib_div::implodeArrayForUrl('', $paramArray);
    $relevantParams = t3lib_div::cHashParams($queryString);
    $serialized = serialize($relevantParams);

    return t3lib_div::shortMD5($serialized);
}
01981
01982
01983
01984
01985
01986
01987
01988
01989
01990
01991
01992
01993
01994
01995
01996
01997
01998
/**
 * Forwards a push() call to the global TT object, if there is one.
 *
 * @param string $msg  Passed as first argument to push().
 * @param string $key  Passed as second argument to push().
 * @return void
 */
function log_push($msg,$key) {
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->push($msg,$key);
}
02009
/**
 * Forwards a pull() call to the global TT object, if there is one.
 *
 * @return void
 */
function log_pull() {
    if (!is_object($GLOBALS['TT'])) {
        return;
    }
    $GLOBALS['TT']->pull();
}
02018
/**
 * Records a log message: passes it to the global TT object (if there is one)
 * and always appends it to $this->internal_log.
 *
 * @param string  $msg       Message text.
 * @param integer $errorNum  Passed as second argument to TT's setTSlogMessage().
 * @return void
 */
function log_setTSlogMessage($msg, $errorNum=0) {
    $hasTimeTracker = is_object($GLOBALS['TT']);
    if ($hasTimeTracker) {
        $GLOBALS['TT']->setTSlogMessage($msg,$errorNum);
    }

    // Kept regardless of whether a TT object exists.
    $this->internal_log[] = $msg;
}
02030
02031
02032
02033
02034
02035
02036
02037
02038
02039
02040
02041
02042
02043
/**
 * Disables acquisition of cached data when the crawler extension is loaded, is marked
 * as running in pObj->applicationData, and its processing instructions contain
 * "tx_indexedsearch_reindex".
 *
 * In that case a log entry with the previous value is appended to the crawler log in
 * pObj->applicationData and $params['disableAcquireCacheData'] is set to TRUE.
 *
 * @param array  $params  Parameter array containing "pObj" and "disableAcquireCacheData"; modified by reference.
 * @param object $ref     Not used by this method.
 * @return void
 */
function fe_headerNoCache(&$params, $ref) {
    if (!t3lib_extMgm::isLoaded('crawler')) {
        return;
    }
    if (!$params['pObj']->applicationData['tx_crawler']['running']) {
        return;
    }
    if (!in_array('tx_indexedsearch_reindex', $params['pObj']->applicationData['tx_crawler']['parameters']['procInstructions'])) {
        return;
    }

    // Note the previous setting in the crawler log before overriding it.
    $params['pObj']->applicationData['tx_crawler']['log'][] = 'RE_CACHE (indexed), old status: '.$params['disableAcquireCacheData'];

    $params['disableAcquireCacheData'] = TRUE;
}
02065 }
02066
02067
// Include the file registered as XCLASS for this script, if one is configured for the current TYPO3_MODE.
if (defined('TYPO3_MODE')) {
    if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php']) {
        include_once $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.indexer.php'];
    }
}
02071 ?>