Documentation TYPO3 par Ameos

class.crawler.php

00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2006 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
# The backend charset is read from $GLOBALS['LANG'] later on, so make sure
# a language object exists before the class below is used:
require_once PATH_typo3.'sysext/lang/lang.php';
if (is_object($GLOBALS['LANG']) === FALSE) {
    // No language object yet: create one and initialise it with the
    // language stored in the backend user's configuration.
    $GLOBALS['LANG'] = t3lib_div::makeInstance('language');
    $GLOBALS['LANG']->init($GLOBALS['BE_USER']->uc['lang']);
}
00078 
00079 
00087 class tx_indexedsearch_crawler {
00088 
00089                 // Static:
00090         var $secondsPerExternalUrl = 3;         // Number of seconds to use as interval between queued indexing operations of URLs / files (types 2 & 3)
00091 
00092                 // Internal, dynamic:
00093         var $instanceCounter = 0;               // Counts up for each added URL (type 3)
00094 
00095                 // Internal, static:
00096         var $callBack = 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler';               // The object reference to this class.
00097 
/**
 * Activates every indexing configuration that is due and queues its first
 * crawler entry.
 *
 * A configuration is due when it is not hidden, not deleted, its start time
 * has passed, its next-indexing timer has expired and it has no running set
 * (set_id=0). For each one a new set id is generated, the record is updated
 * (set_id, timer_next_indexing, cleared session_data) and, depending on the
 * configuration type, one queue entry is added through $pObj. Finally,
 * finished configurations are cleaned up.
 *
 * @param object $pObj Parent object; must provide addQueueEntry_callBack() (presumably the crawler library object - confirm against caller)
 * @return void
 */
function crawler_init(&$pObj)
{
    // Select all indexing configurations which are waiting to be activated:
    $indexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
        '*',
        'index_config',
        'hidden=0
            AND (starttime=0 OR starttime<='.time().')
            AND timer_next_indexing<'.time().'
            AND set_id=0
            '.t3lib_BEfunc::deleteClause('index_config')
    );

    // A failed query must not break the cleanup step below, so only iterate real result sets:
    if (is_array($indexingConfigurations)) {
        foreach ($indexingConfigurations as $cfgRec) {

            // Generate a unique set-ID:
            $setId = t3lib_div::md5int(microtime());

            // Get next time:
            $nextTime = $this->generateNextIndexingTime($cfgRec);

            // Start process by updating index-config record:
            $field_array = array(
                'set_id' => $setId,
                'timer_next_indexing' => $nextTime,
                'session_data' => '',
            );
            $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid='.intval($cfgRec['uid']), $field_array);

            // Build the parameters of the initial queue entry based on configuration type.
            // $params stays NULL when nothing has to be queued.
            $params = NULL;
            switch ($cfgRec['type']) {
                case 1: // RECORDS:
                    $params = array(
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
                        'url' => 'Records (start)',     // Just for show.
                    );
                break;
                case 2: // FILES:
                    $params = array(
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
                        'url' => $cfgRec['filepath'],
                        'depth' => 0
                    );
                break;
                case 3: // External URL:
                    $params = array(
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
                        'url' => $cfgRec['externalUrl'],
                        'depth' => 0
                    );
                break;
                case 4: // Page tree (the "url" is the uid of the root page):
                    $params = array(
                        'indexConfigUid' => $cfgRec['uid'],
                        'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
                        'url' => intval($cfgRec['alternative_source_pid']),
                        'depth' => 0
                    );
                break;
                case 5: // Meta configuration, nothing to do:
                    # NOOP
                break;
                default:
                    // Custom types are handled by a hook object registered in TYPO3_CONF_VARS.
                    // empty() avoids a notice when no hook is registered for this type.
                    if (!empty($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']])) {
                        $hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);

                        if (is_object($hookObj)) {
                            // This method never prepares a message for the hook; pass an
                            // explicit NULL instead of an undefined variable.
                            $message = NULL;
                            $params = array(
                                'indexConfigUid' => $cfgRec['uid'],
                                'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].'/CUSTOM]'),
                                'url' => $hookObj->initMessage($message),
                            );
                        }
                    }
                break;
            }

            // Queue the initial entry, if the type produced one:
            if (is_array($params)) {
                $pObj->addQueueEntry_callBack($setId, $params, $this->callBack, $cfgRec['pid']);
            }
        }
    }

    // Finally, look up all old index configurations which are finished and need to be reset and done.
    $this->cleanUpOldRunningConfigurations();
}
00211 
/**
 * Processes one queue entry that was created by this class.
 *
 * Loads the index_config record named by $params['indexConfigUid'], unpacks
 * its session data, hands over to the handler for the record's type (or to a
 * registered hook object for custom types) and stores the possibly modified
 * session data back in the record.
 *
 * @param array $params Queue entry parameters; 'indexConfigUid' is required for anything to happen
 * @param object $pObj Parent object, passed on to the type handlers
 * @return array Always array('log' => $params)
 */
function crawler_execute($params,&$pObj)
{
    $result = array('log' => $params);

    // Indexer configuration ID must exist:
    if (!$params['indexConfigUid']) {
        return $result;
    }

    // Load the indexing configuration record:
    list($cfgRec) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
        '*',
        'index_config',
        'uid='.intval($params['indexConfigUid'])
    );
    if (!is_array($cfgRec)) {
        return $result;
    }

    // Unpack session data:
    $session_data = unserialize($cfgRec['session_data']);

    // Dispatch on the configuration type (loose comparison, the value comes from the database):
    $type = $cfgRec['type'];
    if ($type == 1) {
        // Records
        $this->crawler_execute_type1($cfgRec, $session_data, $params, $pObj);
    } elseif ($type == 2) {
        // Files
        $this->crawler_execute_type2($cfgRec, $session_data, $params, $pObj);
    } elseif ($type == 3) {
        // External URL
        $this->crawler_execute_type3($cfgRec, $session_data, $params, $pObj);
    } elseif ($type == 4) {
        // Page tree
        $this->crawler_execute_type4($cfgRec, $session_data, $params, $pObj);
    } elseif ($type == 5) {
        // Meta configuration: nothing to do (should never enter here!)
    } else {
        // Custom type: delegate to the hook object registered for it, if any.
        if ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]) {
            $hookObj = &t3lib_div::getUserObj($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['indexed_search']['crawler'][$cfgRec['type']]);

            if (is_object($hookObj)) {
                // Keep the parent object available on $this; the hook receives $this, not $pObj.
                // NOTE(review): presumably consumed by addQueueEntryForHook() - not visible here, confirm.
                $this->pObj = &$pObj;
                $hookObj->indexOperation($cfgRec, $session_data, $params, $this);
            }
        }
    }

    // Save process data which might have been modified by the handler:
    $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
        'index_config',
        'uid='.intval($cfgRec['uid']),
        array('session_data' => serialize($session_data))
    );

    return $result;
}
00275 
/**
 * Indexes one batch of records from the table of a "records" configuration.
 *
 * Records are read in uid order from the source page, starting after the
 * uid remembered in $session_data['uid']. If the batch contained records, a
 * new queue entry is added so the next batch gets processed later.
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data of the running set; 'uid' holds the last processed record uid (passed by reference)
 * @param array $params Parameters of the current queue entry (not read here)
 * @param object $pObj Parent object; must provide addQueueEntry_callBack()
 * @return void
 */
function crawler_execute_type1($cfgRec,&$session_data,$params,&$pObj)
{
    // Only tables which are configured and known to TCA can be indexed:
    $table = $cfgRec['table2index'];
    if (!($table && isset($GLOBALS['TCA'][$table]))) {
        return;
    }

    // Init session data array if not already:
    if (!is_array($session_data)) {
        $session_data = array('uid' => 0);
    }

    // Source page (the alternative pid wins when set) and batch size (default 100):
    $alternativePid = intval($cfgRec['alternative_source_pid']);
    $pid = $alternativePid ? $alternativePid : $cfgRec['pid'];
    $numberOfRecords = $cfgRec['recordsbatch'] ? t3lib_div::intInRange($cfgRec['recordsbatch'],1) : 100;

    // Get root line:
    $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

    // Select the next batch, ordered by uid:
    $whereClause = 'pid = '.intval($pid).'
        AND uid > '.intval($session_data['uid']).
        t3lib_BEfunc::deleteClause($table);
    $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', $table, $whereClause, '', 'uid', $numberOfRecords);

    // No more records: the set simply ends without a follow-up entry.
    if (!count($recs)) {
        return;
    }

    foreach ($recs as $r) {
        // Index single record:
        $this->indexSingleRecord($r, $cfgRec, $rl);

        // Update the UID we last processed:
        $session_data['uid'] = $r['uid'];
    }

    // Finally, set entry for next indexing of batch of records ($r is the last record of this batch):
    $nparams = array(
        'indexConfigUid' => $cfgRec['uid'],
        'url' => 'Records from UID#'.($r['uid']+1).'-?',
        'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']')
    );
    $pObj->addQueueEntry_callBack($cfgRec['set_id'], $nparams, $this->callBack, $cfgRec['pid']);
}
00335 
/**
 * Handles one queue entry of a "files" configuration.
 *
 * $params['url'] is a file or directory path. A file is indexed directly. For
 * a directory, its files (filtered by the configured extensions) and - while
 * the configured depth is not reached - its sub directories are added to the
 * queue as new entries, spaced $this->secondsPerExternalUrl seconds apart.
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data of the running set (not used here; passed by reference)
 * @param array $params Parameters of the current queue entry: 'url' (path) and 'depth'
 * @param object $pObj Parent object; must provide addQueueEntry_callBack()
 * @return void
 */
function crawler_execute_type2($cfgRec,&$session_data,$params,&$pObj)
{
    // Prepare path, making it absolute and checking:
    $readpath = $params['url'];
    if (!t3lib_div::isAbsPath($readpath)) {
        $readpath = t3lib_div::getFileAbsFileName($readpath);
    }
    if (!t3lib_div::isAllowedAbsPath($readpath)) {
        return;
    }

    // If file, index it!
    if (@is_file($readpath)) {

        // Get root line (need to provide this when indexing external files)
        $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);

        // Load indexer if not yet.
        $this->loadIndexerClass();

        // (Re)-Indexing file on page.
        $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
        $indexerObj->backend_initIndexer($cfgRec['pid'], 0, 0, '', $rl);
        $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
        $indexerObj->hash['phash'] = -1;    // EXPERIMENT - but to avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

        // Index document, addressed by its path with the PATH_site prefix cut off:
        $relativePath = substr($readpath, strlen(PATH_site));
        $indexerObj->indexRegularDocument($relativePath, TRUE);
        return;
    }

    // Neither file nor directory: nothing to do.
    if (!@is_dir($readpath)) {
        return;
    }

    // Directory: select files in path ...
    $extList = implode(',', t3lib_div::trimExplode(',', $cfgRec['extensions'], 1));
    $fileArr = array();
    $files = t3lib_div::getAllFilesAndFoldersInPath($fileArr, $readpath, $extList, 0, 0);

    // ... and add sub directories as long as the configured depth allows it:
    $directoryList = t3lib_div::get_dirs($readpath);
    if (is_array($directoryList) && $params['depth'] < $cfgRec['depth']) {
        foreach ($directoryList as $subdir) {
            if ((string)$subdir != '') {
                $files[] = $readpath.$subdir.'/';
            }
        }
    }
    $files = t3lib_div::removePrefixPathFromList($files, PATH_site);

    // Traverse the items and create log entries:
    foreach ($files as $path) {
        // The counter advances for every item, also for the one that is skipped below.
        $this->instanceCounter++;
        if ($path === $params['url']) {
            continue;
        }

        $nparams = array(
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $path,
            'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
            'depth' => $params['depth']+1
        );
        $pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            $nparams,
            $this->callBack,
            $cfgRec['pid'],
            time()+$this->instanceCounter*$this->secondsPerExternalUrl
        );
    }
}
00404 
/**
 * Handles one queue entry of an "external URL" configuration.
 *
 * The URL in $params['url'] is indexed. While the configured depth is not
 * reached, every link found on it that passes checkUrl() and is not denied
 * by checkDeniedSuburls() is logged in $session_data['urlLog'] and queued as
 * a new entry, spaced $this->secondsPerExternalUrl seconds apart.
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data of the running set; 'urlLog' lists the URLs already queued (passed by reference)
 * @param array $params Parameters of the current queue entry: 'url' and 'depth'
 * @param object $pObj Parent object; must provide addQueueEntry_callBack()
 * @return void
 */
function crawler_execute_type3($cfgRec,&$session_data,$params,&$pObj)
{
    // Init session data array if not already:
    if (!is_array($session_data)) {
        $session_data = array(
            'urlLog' => array($params['url'])
        );
    }

    // Index the URL:
    $rl = $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $subUrls = $this->indexExtUrl($params['url'], $cfgRec['pid'], $rl, $cfgRec['uid'], $cfgRec['set_id']);

    // Depth limit reached: do not follow the links of this URL.
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    // Add more elements to log now:
    foreach ($subUrls as $candidate) {
        // Cleaned URL, or an empty value when the link must not be followed:
        $url = $this->checkUrl($candidate, $session_data['urlLog'], $cfgRec['externalUrl']);
        if (!$url) {
            continue;
        }
        if ($this->checkDeniedSuburls($url, $cfgRec['url_deny'])) {
            continue;
        }

        $this->instanceCounter++;
        $session_data['urlLog'][] = $url;

        $nparams = array(
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $url,
            'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
            'depth' => $params['depth']+1
        );
        $pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            $nparams,
            $this->callBack,
            $cfgRec['pid'],
            time()+$this->instanceCounter*$this->secondsPerExternalUrl
        );
    }
}
00448 
/**
 * Handles one queue entry of a "page tree" configuration.
 *
 * $params['url'] holds the uid of the page to process. The URLs of that page
 * are submitted through $pObj with the 'tx_indexedsearch_reindex' processing
 * instruction. While the configured depth is not reached, each subpage is
 * logged in $session_data['urlLog'] and queued as a new entry, spaced
 * $this->secondsPerExternalUrl seconds apart.
 *
 * @param array $cfgRec Indexing configuration record
 * @param array $session_data Session data of the running set; subpages are appended to 'urlLog' (passed by reference)
 * @param array $params Parameters of the current queue entry: 'url' (page uid) and 'depth'
 * @param object $pObj Parent object; must provide getUrlsForPageRow(), urlListFromUrlArray() and addQueueEntry_callBack()
 * @return void
 */
function crawler_execute_type4($cfgRec,&$session_data,$params,&$pObj)
{
    // Base page uid:
    $pageUid = intval($params['url']);

    // Get array of URLs from page:
    $pageRow = t3lib_BEfunc::getRecord('pages', $pageUid);
    $res = $pObj->getUrlsForPageRow($pageRow);

    $duplicateTrack = array();  // Registry for duplicates
    $downloadUrls = array();    // Dummy.

    // Submit URLs (the returned list is not needed here):
    if (count($res)) {
        foreach ($res as $vv) {
            $pObj->urlListFromUrlArray($vv, $pageRow, time(), 30, 1, 0, $duplicateTrack, $downloadUrls, array('tx_indexedsearch_reindex'));
        }
    }

    // Depth limit reached: do not descend into subpages.
    if (!($params['depth'] < $cfgRec['depth'])) {
        return;
    }

    // Subpages selected
    $recs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
        'uid,title',
        'pages',
        'pid = '.intval($pageUid).
            t3lib_BEfunc::deleteClause('pages')
    );
    if (!count($recs)) {
        return;
    }

    // Traverse subpages and add to queue:
    foreach ($recs as $r) {
        $this->instanceCounter++;
        $session_data['urlLog'][] = 'pages:'.$r['uid'].': '.$r['title'];

        $nparams = array(
            'indexConfigUid' => $cfgRec['uid'],
            'url' => $r['uid'],
            'procInstructions' => array('[Index Cfg UID#'.$cfgRec['uid'].']'),
            'depth' => $params['depth']+1
        );
        $pObj->addQueueEntry_callBack(
            $cfgRec['set_id'],
            $nparams,
            $this->callBack,
            $cfgRec['pid'],
            time()+$this->instanceCounter*$this->secondsPerExternalUrl
        );
    }
}
00507 
/**
 * Ends running indexing configurations whose queue has been worked off.
 *
 * A configuration is running while its set_id is non-zero. When no queue
 * entry of that set is left unexecuted (exec_time=0), all index entries of
 * the configuration which were not written by the current set are deleted
 * and the configuration is reset (set_id=0, session_data cleared).
 *
 * @return void
 */
function cleanUpOldRunningConfigurations()
{
    // Tables holding one registration per phash (list copied from class.tx_indexedsearch_modfunc1.php).
    // Built once, it does not change per row.
    $tableArr = explode(',', 'index_phash,index_rel,index_section,index_grlist,index_fulltext,index_debug');

    // Lookup running index configurations:
    $runningIndexingConfigurations = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
        'uid,set_id',
        'index_config',
        'set_id!=0'.t3lib_BEfunc::deleteClause('index_config')
    );
    if (!is_array($runningIndexingConfigurations)) {
        return;
    }

    // For each running configuration, look up how many log entries there are which are scheduled
    // for execution and if none, clear the "set_id" (means: processing was DONE)
    foreach ($runningIndexingConfigurations as $cfgRec) {
        $cfgUid = intval($cfgRec['uid']);
        $setId = intval($cfgRec['set_id']);

        // Look for ended processes:
        list($queued_items) = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            'count(*) AS count',
            'tx_crawler_queue',
            'set_id='.$setId.' AND exec_time=0'
        );
        if ($queued_items['count']) {
            // Still entries waiting: this set is not finished yet.
            continue;
        }

        // Lookup old phash rows, i.e. those of this configuration written by another set.
        // The set id is cast to integer like every other value put into a query here.
        $oldPhashRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows(
            'phash',
            'index_phash',
            'freeIndexUid='.$cfgUid.' AND freeIndexSetId!='.$setId
        );

        // Removing old registrations for all tables:
        if (is_array($oldPhashRows)) {
            foreach ($oldPhashRows as $pHashRow) {
                foreach ($tableArr as $table) {
                    $GLOBALS['TYPO3_DB']->exec_DELETEquery($table, 'phash='.intval($pHashRow['phash']));
                }
            }
        }

        // End process by updating index-config record:
        $field_array = array(
            'set_id' => 0,
            'session_data' => '',
        );
        $GLOBALS['TYPO3_DB']->exec_UPDATEquery('index_config', 'uid='.$cfgUid, $field_array);
    }
}
00558 
00559 
00560 
00561 
00562 
00563 
00564 
00565         /*****************************************
00566          *
00567          * Helper functions
00568          *
00569          *****************************************/
00570 
/**
 * Checks whether a link found on an indexed page may be queued for indexing.
 *
 * The URL is cleaned first: a double slash at the very end is reduced to a
 * single one and a fragment ("#...") is cut off. The cleaned URL is accepted
 * when it contains no "../", starts with $baseUrl and is not yet listed in
 * $urlLog.
 *
 * @param string $url URL to check
 * @param array $urlLog URLs which were already queued
 * @param string $baseUrl Base URL the accepted URLs must start with
 * @return string|NULL The cleaned URL if accepted, otherwise NULL
 */
function checkUrl($url,$urlLog,$baseUrl)
{
    // preg_replace() instead of ereg_replace(), which no longer exists as of PHP 7.
    // The D modifier makes "$" match at the very end only, as the POSIX pattern did.
    $url = preg_replace('#//$#D', '/', $url);
    list($url) = explode('#', $url);

    // Never follow links which walk up the path:
    if (strstr($url, '../')) {
        return NULL;
    }

    // Stay below the base URL:
    if (!t3lib_div::isFirstPartOfStr($url, $baseUrl)) {
        return NULL;
    }

    // Skip what was queued before (strict comparison, so strings are never compared numerically):
    if (in_array($url, $urlLog, TRUE)) {
        return NULL;
    }

    return $url;
}
00591 
/**
 * Indexes an external URL and returns the links found in its content.
 *
 * Links without a scheme are made absolute using scheme, host and port of
 * $url; links with a scheme are returned as they are (after entity
 * decoding). No filtering is done here.
 *
 * @param string $url URL to index
 * @param integer $pageId Page id the indexer is initialised with
 * @param array $rl Root line passed to the indexer
 * @param integer $cfgUid Uid of the indexing configuration
 * @param integer $setId Set id of the current indexing run
 * @return array List of URLs found on the indexed page
 */
function indexExtUrl($url, $pageId, $rl, $cfgUid, $setId)
{
    // Load indexer if not yet.
    $this->loadIndexerClass();

    // Index external URL:
    $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
    $indexerObj->backend_initIndexer($pageId, 0, 0, '', $rl);
    $indexerObj->backend_setFreeIndexUid($cfgUid, $setId);
    $indexerObj->hash['phash'] = -1;    // To avoid phash_t3 being written to file sections (otherwise they are removed when page is reindexed!!!)

    $indexerObj->indexExternalUrl($url);

    // Prepare "scheme://host[:port]" of the indexed URL for resolving relative links.
    // parse_url() returns FALSE for seriously malformed URLs, hence the isset() checks.
    $url_qParts = parse_url($url);
    $baseScheme = isset($url_qParts['scheme']) ? $url_qParts['scheme'] : '';
    $baseHost = isset($url_qParts['host']) ? $url_qParts['host'] : '';
    if (isset($url_qParts['port'])) {
        // Without the port, links on e.g. http://host:8080/ would point to the wrong server.
        $baseHost .= ':'.$url_qParts['port'];
    }

    // Get URLs on this page:
    $subUrls = array();
    $list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);

    // Traverse links:
    if (is_array($list)) {
        foreach ($list as $linkInfo) {

            // Decode entities:
            $subUrl = t3lib_div::htmlspecialchars_decode($linkInfo['href']);

            // empty() also covers a FALSE result of parse_url() and a missing 'scheme' key:
            $qParts = parse_url($subUrl);
            if (empty($qParts['scheme'])) {
                if (!empty($qParts['host'])) {
                    // Protocol-relative link ("//host/path"): only the scheme is missing.
                    $subUrl = $baseScheme.':'.$subUrl;
                } else {
                    // Relative link: leading slashes are stripped so a root-relative link
                    // does not end up as "host//path".
                    $subUrl = $baseScheme.'://'.$baseHost.'/'.ltrim(t3lib_div::resolveBackPath($subUrl), '/');
                }
            }

            $subUrls[] = $subUrl;
        }
    }

    return $subUrls;
}
00636 
/**
 * Indexes a single record as if it were a TYPO3 page.
 *
 * The first field of the configured field list is used as title, the values
 * of all further fields are concatenated (space separated) as content. Tags
 * are stripped from both. The GET parameters of the configuration, with
 * ###UID### replaced by the record uid, are passed to the indexer.
 *
 * @param array $r Record to index
 * @param array $cfgRec Indexing configuration record
 * @param array $rl Root line; looked up from $cfgRec['pid'] when not an array
 * @return void
 */
function indexSingleRecord($r,$cfgRec,$rl=NULL)
{
    // Load indexer if not yet.
    $this->loadIndexerClass();

    // Init:
    $rl = is_array($rl) ? $rl : $this->getUidRootLineForClosestTemplate($cfgRec['pid']);
    $fieldList = t3lib_div::trimExplode(',', $cfgRec['fieldlist'], 1);

    // TCA control section of the table; its keys are optional, so they are only read when present.
    $ctrl = isset($GLOBALS['TCA'][$cfgRec['table2index']]['ctrl']) ? $GLOBALS['TCA'][$cfgRec['table2index']]['ctrl'] : array();
    $languageField = isset($ctrl['languageField']) ? $ctrl['languageField'] : '';
    $sys_language_uid = $languageField ? $r[$languageField] : 0;

    // Modification and creation time of the record; NULL when the table has no such field.
    $mtime = (isset($ctrl['tstamp']) && isset($r[$ctrl['tstamp']])) ? $r[$ctrl['tstamp']] : NULL;
    $crdate = (isset($ctrl['crdate']) && isset($r[$ctrl['crdate']])) ? $r[$ctrl['crdate']] : NULL;

    // (Re)-Indexing a row from a table:
    $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
    parse_str(str_replace('###UID###', $r['uid'], $cfgRec['get_params']), $GETparams);
    $indexerObj->backend_initIndexer($cfgRec['pid'], 0, $sys_language_uid, '', $rl, $GETparams, $cfgRec['chashcalc'] ? TRUE : FALSE);
    $indexerObj->backend_setFreeIndexUid($cfgRec['uid'], $cfgRec['set_id']);
    $indexerObj->forceIndexing = TRUE;

    // Title is the first configured field, content all the others.
    // Both start out empty so an empty field list does not leave the title undefined.
    $theTitle = '';
    $theContent = '';
    foreach ($fieldList as $k => $v) {
        if (!$k) {
            $theTitle = $r[$v];
        } else {
            $theContent .= $r[$v].' ';
        }
    }

    // Indexing the record as a page (but with parameters set, see ->backend_setFreeIndexUid())
    $indexerObj->backend_indexAsTYPO3Page(
        strip_tags($theTitle),
        '',
        '',
        strip_tags($theContent),
        $GLOBALS['LANG']->charSet,  // Requires that $GLOBALS['LANG'] has been initialised
        $mtime,
        $crdate,
        $r['uid']
    );
}
00688 
/**
 * Includes the indexer class file of the "indexed_search" extension (at most once per request).
 *
 * @return      void
 */
function loadIndexerClass()
{
                // NOTE(review): not read inside this method; presumably declared so that the included file finds $TYPO3_CONF_VARS in this scope - confirm before removing.
        global $TYPO3_CONF_VARS;

        require_once t3lib_extMgm::extPath('indexed_search') . 'class.indexer.php';
}
00698 
/**
 * Builds the list of page uids of the rootline that the template parser ends up with for a page.
 *
 * The page's rootline is fetched with t3lib_pageSelect, run through a t3lib_tsparser_ext instance,
 * and the "uid" of every entry in that parser's ->rootLine is collected under the same key.
 *
 * @param       integer         Page id to fetch the rootline for
 * @return      array           Page uids, keyed like the template parser's rootline
 */
function getUidRootLineForClosestTemplate($id)
{
                // NOTE(review): not read inside this method; presumably needed by the files included below - confirm before removing.
        global $TYPO3_CONF_VARS;

        require_once(PATH_t3lib . 'class.t3lib_page.php');
        require_once(PATH_t3lib . 'class.t3lib_tstemplate.php');
        require_once(PATH_t3lib . 'class.t3lib_tsparser_ext.php');

                // Template parser, with time tracking switched off:
        $templateParser = t3lib_div::makeInstance('t3lib_tsparser_ext');
        $templateParser->tt_track = 0;
        $templateParser->init();

                // Fetch the rootline of the page and let the parser process the templates along it:
        $pageSelect = t3lib_div::makeInstance('t3lib_pageSelect');
        $pageRootLine = $pageSelect->getRootLine($id);
        $templateParser->runThroughTemplates($pageRootLine, 0);

                // Reduce the parser's rootline to plain uids, keeping the keys:
        $uidList = array();
        foreach ($templateParser->rootLine as $level => $pageRow) {
                $uidList[$level] = $pageRow['uid'];
        }

        return $uidList;
}
00732 
/**
 * Calculates the timestamp of the next indexing run for an indexing configuration.
 *
 * The result is "offset time + N * frequency", where N is the smallest whole number of frequency
 * blocks that brings the time up to (or past) the current time.
 *
 * @param       array           Indexing configuration record; reads "timer_frequency" (seconds), "timer_offset" (seconds after midnight) and "timer_next_indexing" (timestamp, may be empty).
 * @return      integer         Timestamp of the next indexing run
 */
function generateNextIndexingTime($cfgRec)      {
        $currentTime = time();

                // Now, find a midnight time to use for offset calculation. This has to differ depending on whether we have frequencies within a day or more than a day; Less than a day, we don't care which day to use for offset, more than a day we want to respect the currently entered day as offset regardless of when the script is run - thus the day-of-week used in case "Weekly" is selected will be respected
        if ($cfgRec['timer_frequency']<=24*3600)        {
                $aMidNight = mktime(0,0,0)-1*24*3600;
        } else {
                $lastTime = $cfgRec['timer_next_indexing'] ? $cfgRec['timer_next_indexing'] : $currentTime;
                        // Four-digit year ('Y'): a two-digit year would depend on mktime()'s legacy 0-69 / 70-100 year mapping.
                $aMidNight = mktime(0,0,0, (int)date('n',$lastTime), (int)date('j',$lastTime), (int)date('Y',$lastTime));
        }

                // Find last offset time plus frequency in seconds:
        $lastSureOffset = $aMidNight+t3lib_div::intInRange($cfgRec['timer_offset'],0,86400);
        $frequencySeconds = t3lib_div::intInRange($cfgRec['timer_frequency'],1);

                // Now, find out how many blocks of the length of frequency there is until the next time:
        $frequencyBlocksUntilNextTime = (int)ceil(($currentTime-$lastSureOffset)/$frequencySeconds);

                // Set next time to the offset + the frequencyblocks multiplied with the frequency length in seconds.
        $nextTime = $lastSureOffset + $frequencyBlocksUntilNextTime*$frequencySeconds;

        return $nextTime;
}
00762 
/**
 * Checks whether a URL is covered by a deny list.
 *
 * The deny list is split on line feeds with t3lib_div::trimExplode(); the URL counts as denied as soon as
 * t3lib_div::isFirstPartOfStr() reports one of the entries as the first part of the URL.
 * A denied URL is also written to the output.
 *
 * @param       string          URL to test
 * @param       string          Deny list, one entry per line
 * @return      boolean         TRUE if the URL matched an entry, otherwise FALSE
 */
function checkDeniedSuburls($url, $url_deny)    {
        if (trim($url_deny))    {
                $url_denyArray = t3lib_div::trimExplode(chr(10),$url_deny,1);
                foreach($url_denyArray as $testurl)     {
                        if (t3lib_div::isFirstPartOfStr($url,$testurl)) {
                                        // NOTE(review): this statement was cut off after the opening quote in the source listing (unterminated string literal). Reconstructed - confirm the exact text against the upstream file.
                                echo $url.' /// '.$url_deny.chr(10);
                                return TRUE;
                        }
                }
        }
        return FALSE;
}
00782 
/**
 * Adds a queue entry for an indexing configuration through the parent object's callback API.
 *
 * @param       array           Indexing configuration record; reads "uid", "set_id" and "pid".
 * @param       string          Value stored as the "url" parameter of the queue entry
 * @return      void
 */
function addQueueEntryForHook($cfgRec, $title)
{
        $configUid = $cfgRec['uid'];

        $queueParams = array();
                // This must ALWAYS be the cfgRec uid!
        $queueParams['indexConfigUid'] = $configUid;
        $queueParams['url'] = $title;
                // Informational only: shows that an indexing configuration added the entry.
        $queueParams['procInstructions'] = array('[Index Cfg UID#' . $configUid . ']');

        $this->pObj->addQueueEntry_callBack($cfgRec['set_id'], $queueParams, $this->callBack, $cfgRec['pid']);
}
00799 
/**
 * Removes all index rows that belong to a page.
 *
 * The phash values registered for the page in "index_section" are looked up first; rows with those
 * phash values are then deleted from each of the index tables listed below.
 *
 * @param       integer         Page id whose index entries are removed
 * @return      void
 */
function deleteFromIndex($id)
{
                // Which phash rows exist for this page?
        $sectionRows = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('phash', 'index_section', 'page_id=' . intval($id));
        if (!count($sectionRows)) {
                        // Nothing indexed for this page
                return;
        }

        $phashList = array();
        foreach ($sectionRows as $sectionRow) {
                $phashList[] = $sectionRow['phash'];
        }

                // One shared condition, applied to every index table:
        $phashCondition = 'phash IN (' . implode(',', $GLOBALS['TYPO3_DB']->cleanIntArray($phashList)) . ')';
        foreach (explode(',', 'index_debug,index_fulltext,index_grlist,index_phash,index_rel,index_section') as $indexTable) {
                $GLOBALS['TYPO3_DB']->exec_DELETEquery($indexTable, $phashCondition);
        }
}
00824 
00825 
00826 
00827 
00828 
00829 
00830 
00831         /*************************
00832          *
00833          * Hook functions for TCEmain (indexing of records)
00834          *
00835          *************************/
00836 
/**
 * TCEmain hook: removes the index entries of a page when a "delete" command is issued for it.
 *
 * @param       string          Command name; only 'delete' is acted upon
 * @param       string          Table name; only 'pages' is acted upon
 * @param       integer         Id of the record the command applies to
 * @param       mixed           Command value (not used here)
 * @param       object          Calling object (not used here)
 * @return      void
 */
function processCmdmap_preProcess($command, $table, $id, $value, &$pObj)
{
                // Anything but a page deletion is none of our business
        if ($command != 'delete' || $table != 'pages') {
                return;
        }

                // Clean up the index
        $this->deleteFromIndex($id);
}
00854 
/**
 * TCEmain hook: keeps the index in sync after a record has been written.
 *
 * - For an updated page whose submitted fields set "hidden" or "no_search" to 1, the page's index entries are deleted.
 * - If the written record can be loaded, it is (re)indexed once for every matching indexing configuration
 *   (see the query below for what "matching" means).
 *
 * @param       string          Status of the operation; 'new' and 'update' are evaluated
 * @param       string          Table name of the record
 * @param       mixed           Record id; for status 'new' it is translated through $pObj->substNEWwithIDs
 * @param       array           Fields that were written; nothing happens when it is empty
 * @param       object          Calling object; ->substNEWwithIDs is read
 * @return      void
 */
function processDatamap_afterDatabaseOperations($status, $table, $id, $fieldArray, &$pObj)
{
                // Nothing was actually updated
        if (!count($fieldArray)) {
                return;
        }

        if ($status == 'new') {
                        // Translate new ids.
                $id = $pObj->substNEWwithIDs[$id];
        } elseif ($table == 'pages' && $status == 'update') {
                $setToHidden = array_key_exists('hidden', $fieldArray) && $fieldArray['hidden'] == 1;
                $setToNoSearch = array_key_exists('no_search', $fieldArray) && $fieldArray['no_search'] == 1;

                        // If the page should be hidden or not indexed after update, delete index for this page
                if ($setToHidden || $setToNoSearch) {
                        $this->deleteFromIndex($id);
                }
        }

                // Get full record; without it there is nothing to index
        $record = t3lib_BEfunc::getRecord($table, $id);
        if (!is_array($record)) {
                return;
        }

                // Select all (not running) indexing configurations of type "record" (1) and which points to this table and is located on the same page as the record or pointing to the right source PID
        $configWhere = 'hidden=0
                                                AND (starttime=0 OR starttime<='.time().')
                                                AND set_id=0
                                                AND type=1
                                                AND table2index='.$GLOBALS['TYPO3_DB']->fullQuoteStr($table,'index_config').'
                                                AND (
                                                                (alternative_source_pid=0 AND pid='.intval($record['pid']).')
                                                                OR (alternative_source_pid='.intval($record['pid']).')
                                                        )
                                                AND records_indexonchange=1
                                                '.t3lib_BEfunc::deleteClause('index_config');
        $matchingConfigs = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('*', 'index_config', $configWhere);

        foreach ($matchingConfigs as $configRow) {
                $this->indexSingleRecord($record, $configRow);
        }
}
00907 }
00908 
00909 
/**
 * Indexes a single document (file) through the indexed_search indexer when called with crawler parameters.
 */
class tx_indexedsearch_files {

        /**
         * Indexes the document described by the parameters.
         *
         * When "alturl" is set, that value is what gets passed to the indexer as the file to read, while
         * "document" and its lowercased file extension are passed along as additional arguments.
         * Otherwise "document" itself is indexed.
         *
         * @param       array           Parameters; reads "conf" (indexer configuration), "document" and "alturl".
         * @param       object          Calling object (not used here)
         * @return      array           array('content' => array()) when "conf" is an array; nothing (NULL) otherwise
         */
        function crawler_execute($params,&$pObj)        {

                        // Load indexer if not yet.
                $this->loadIndexerClass();

                if (is_array($params['conf']))  {

                                // Initialize the indexer class:
                        $indexerObj = &t3lib_div::makeInstance('tx_indexedsearch_indexer');
                        $indexerObj->conf = $params['conf'];
                        $indexerObj->init();

                                // Index document:
                        if ($params['alturl'])  {
                                $fI = pathinfo($params['document']);
                                        // pathinfo() leaves out the "extension" key for names without an extension; use an empty string then.
                                $ext = isset($fI['extension']) ? strtolower($fI['extension']) : '';
                                $indexerObj->indexRegularDocument($params['alturl'], TRUE, $params['document'], $ext);
                        } else {
                                $indexerObj->indexRegularDocument($params['document'], TRUE);
                        }

                                // Return OK:
                        return array('content' => array());
                }
        }

        /**
         * Includes the indexer class file of the "indexed_search" extension (at most once per request).
         *
         * @return      void
         */
        function loadIndexerClass()     {
                        // NOTE(review): not read inside this method; presumably declared so that the included file finds $TYPO3_CONF_VARS in this scope - confirm before removing.
                global $TYPO3_CONF_VARS;
                require_once(t3lib_extMgm::extPath('indexed_search').'class.indexer.php');
        }
}
00964 
00965 
// XCLASS footer: when TYPO3_MODE is defined and a file is configured for this script under
// $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], that file is included (once).
// NOTE(review): TYPO3 tooling is understood to look for this footer in exactly this form - confirm before restructuring it.
00966 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php'])    {
00967         include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.crawler.php']);
00968 }
00969 ?>


Généré par Les experts TYPO3 avec  doxygen 1.4.6