Documentation TYPO3 par Ameos |
<?php
/***************************************************************
*  Copyright notice
*
*  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
*  All rights reserved
*
*  This script is part of the TYPO3 project. The TYPO3 project is
*  free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  The GNU General Public License can be found at
*  http://www.gnu.org/copyleft/gpl.html.
*  A copy is found in the textfile GPL.txt and important notices to the license
*  from the author is found in LICENSE.txt distributed with these scripts.
*
*
*  This script is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  This copyright notice MUST APPEAR in all copies of the script!
***************************************************************/

/**
 * External file parser for the "indexed_search" extension.
 *
 * Decides per file extension whether a file can be indexed (initParser()),
 * and extracts title/body content from such files (readFileContent()), either
 * with external command line tools (pdftotext, catdoc, ppthtml, xlhtml, unrtf,
 * ruby + ooo_extract.rb) or with plain PHP (text, HTML, XML, EXIF data).
 */
class tx_indexed_search_extparse {

	/**
	 * How PDF files are split into separately indexed page ranges.
	 * This default is overridden from the extension configuration in initParser().
	 *
	 *  - zero:     the whole PDF file is indexed as one part.
	 *  - positive: upper limit of pages per part; the pages are spread evenly over
	 *              ceil(pages / value) parts, eg. "5" on a 10 page file gives 1-5,6-10.
	 *  - negative: the absolute value is the number of parts (never more than there
	 *              are pages), eg. "-3" on a 10 page file gives 1-3,4-6,7-10.
	 */
	var $pdf_mode = -20;

		// These arrays are filled by initParser():
	var $app = array();					// Tool name => command path of the external tools that were found. 'nativeOOMethod' holds TRUE instead of a path.
	var $ext2itemtype_map = array();	// File extension => item type ("htm" maps to "html", "jpg" to "jpeg", others to themselves).
	var $supportedExtensions = array();	// File extension => TRUE for every extension initParser() accepted.

	var $pObj;		// Reference to parent object (indexer class)


	/**
	 * Initializes parsing of one file extension: checks the extension configuration
	 * and (for formats needing external tools) that the tools exist. On success the
	 * extension is registered in $this->supportedExtensions and $this->ext2itemtype_map.
	 *
	 * @param	string		File extension (compared as-is against lowercase names)
	 * @return	boolean		TRUE if the extension is supported and enabled, otherwise FALSE
	 */
	function initParser($extension) {

			// Read indexer-config:
		$indexerConfig = $this->getIndexerConfig();

			// If windows, apply extension to tool name:
		$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
		$extOK = FALSE;
		$mainExtension = '';

			// Ignore extensions
		$ignoreList = isset($indexerConfig['ignoreExtensions']) ? $indexerConfig['ignoreExtensions'] : '';
		$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($ignoreList),1);
		if (in_array($extension, $ignoreExtensions)) {
			$this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);

			return FALSE;
		}

			// Switch on file extension.
			// In all tool checks below the is_file() test is skipped when safe_mode is on.
		switch($extension) {
			case 'pdf':
					// PDF
				if (!empty($indexerConfig['pdftools'])) {
					$pdfPath = $this->getToolPath($indexerConfig['pdftools']);
					if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe))) {
						$this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
						$this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;
							// PDF mode:
						$this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('PDF tools disabled',1);
				}
			break;
			case 'doc':
					// Catdoc
				if (!empty($indexerConfig['catdoc'])) {
					$catdocPath = $this->getToolPath($indexerConfig['catdoc']);
					if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe)) {
						$this->app['catdoc'] = $catdocPath.'catdoc'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in paths '".$catdocPath."catdoc'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
				}
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint
					// ppthtml
				if (!empty($indexerConfig['ppthtml'])) {
					$ppthtmlPath = $this->getToolPath($indexerConfig['ppthtml']);
					if (ini_get('safe_mode') || @is_file($ppthtmlPath.'ppthtml'.$exe)) {
						$this->app['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'ppthtml' tool for reading Powerpoint-files was not found in paths '".$ppthtmlPath."ppthtml'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
				}
			break;
			case 'xls':		// MS Excel
					// Xlhtml
				if (!empty($indexerConfig['xlhtml'])) {
					$xlhtmlPath = $this->getToolPath($indexerConfig['xlhtml']);
					if (ini_get('safe_mode') || @is_file($xlhtmlPath.'xlhtml'.$exe)) {
						$this->app['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'xlhtml' tool for reading Excel-files was not found in paths '".$xlhtmlPath."xlhtml'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('xlhtml tools (Excel-files) disabled',1);
				}
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
					// ooo_extract.rb can be found at: http://www.math.umd.edu/~dcarrera/openoffice/misc/tools/ooo_extract.html
					// I had to run this on debian before I could run the ooo_extract.rb script:
					// apt-get install libzlib-ruby1.8
					// apt-get install librexml-ruby1.8
				if (!empty($indexerConfig['nativeOOMethod'])) {
						// Native method: unzip the document with the "libunzipped" extension
					if (t3lib_extMgm::isLoaded('libunzipped')) {
						$this->app['nativeOOMethod'] = TRUE;
						$extOK = TRUE;
						$this->pObj->log_setTSlogMessage('Using "libunzipped" for extraction of Open Office files, "'.$extension.'".',1);
					} else {
						$this->pObj->log_setTSlogMessage('The extension "libunzipped" was not loaded (for extraction of Open Office files, "'.$extension.'")',2);
					}
				} else {
						// ruby + ooo_extract
					if (!empty($indexerConfig['OOoExtract'])) {
							// The ruby path is optional; without it the bare command name "ruby" is used and its existence is not checked.
						$rubyPath = !empty($indexerConfig['ruby']) ? $this->getToolPath($indexerConfig['ruby']) : '';
						$oooExPath = $this->getToolPath($indexerConfig['OOoExtract']);

						$rubyFound = $rubyPath ? @is_file($rubyPath.'ruby'.$exe) : TRUE;
						if (ini_get('safe_mode') || ($rubyFound && @is_file($oooExPath.'ooo_extract.rb'))) {
							$this->app['ruby'] = $rubyPath.'ruby'.$exe;
							$this->app['OOo'] = $oooExPath.'ooo_extract.rb';
							$extOK = TRUE;
						} else {
							$this->pObj->log_setTSlogMessage("'Ruby and OOo_extract' tools for reading OpenOffice.org documents were not found in paths '".$rubyPath."ruby".$exe."' OR '".$oooExPath."ooo_extract.rb'",3);
						}
					} else {
						$this->pObj->log_setTSlogMessage('Ruby & OOo_extract tools (OpenOffice-files) disabled',1);
					}
				}
			break;
			case 'rtf':
					// unrtf
				if (!empty($indexerConfig['unrtf'])) {
					$unrtfPath = $this->getToolPath($indexerConfig['unrtf']);
					if (ini_get('safe_mode') || @is_file($unrtfPath.'unrtf'.$exe)) {
						$this->app['unrtf'] = $unrtfPath.'unrtf'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'unrtf' tool for reading RTF-files was not found in paths '".$unrtfPath."unrtf'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
				}
			break;
			case 'txt':		// Raw text
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'tif':		// PHP EXIF
				$extOK = TRUE;
			break;
			case 'html':	// PHP strip-tags()
			case 'htm':		// PHP strip-tags()
				$extOK = TRUE;
				$mainExtension = 'html';	// making "html" the common "item_type"
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
				$extOK = TRUE;
				$mainExtension = 'jpeg';	// making "jpeg" the common item_type
			break;
		}

			// If extension was OK:
		if ($extOK) {
			$this->supportedExtensions[$extension] = TRUE;
			$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
			return TRUE;
		}

		return FALSE;
	}

	/**
	 * Tells whether the file extension is one of those this class knows how to handle.
	 * Unlike initParser() this neither reads the configuration nor looks for external tools.
	 *
	 * @param	string		File extension
	 * @return	boolean		TRUE if the extension is among the known ones, otherwise FALSE
	 */
	function softInit($extension) {
		switch($extension) {
			case 'pdf':		// PDF
			case 'doc':		// MS Word files
			case 'pps':		// MS PowerPoint
			case 'ppt':		// MS PowerPoint
			case 'xls':		// MS Excel
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'rtf':		// RTF documents
			case 'txt':		// ASCII Text documents
			case 'html':	// HTML
			case 'htm':		// HTML
			case 'csv':		// Comma Separated Values
			case 'xml':		// Generic XML
			case 'jpg':		// Jpeg images (EXIF comment)
			case 'jpeg':	// Jpeg images (EXIF comment)
			case 'tif':		// TIF images (EXIF comment)
				return TRUE;
		}

		return FALSE;
	}

	/**
	 * Returns the media type title for a file extension, provided the extension is not
	 * ignored and the configuration option its parser depends on is set.
	 *
	 * @param	string		File extension
	 * @return	mixed		Title string (eg. "PDF", "MS Word"), or FALSE if there is no entry for the extension
	 */
	function searchTypeMediaTitle($extension) {

			// Read indexer-config
		$indexerConfig = $this->getIndexerConfig();

			// Ignore extensions
		$ignoreList = isset($indexerConfig['ignoreExtensions']) ? $indexerConfig['ignoreExtensions'] : '';
		$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($ignoreList),1);
		if (in_array($extension, $ignoreExtensions)) {
			return FALSE;
		}

			// Switch on file extension:
		switch($extension) {
			case 'pdf':
					// PDF
				if (!empty($indexerConfig['pdftools'])) {
					return 'PDF';
				}
			break;
			case 'doc':
					// Catdoc
				if (!empty($indexerConfig['catdoc'])) {
					return 'MS Word';
				}
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint
					// ppthtml
				if (!empty($indexerConfig['ppthtml'])) {
					return 'MS Powerpoint';
				}
			break;
			case 'xls':		// MS Excel
					// Xlhtml
				if (!empty($indexerConfig['xlhtml'])) {
					return 'MS Excel';
				}
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
					// NOTE(review): initParser() enables the non-native method on the "OOoExtract" option
					// (the "ruby" option is optional there), while this check looks at "ruby" - confirm which is intended.
				if (!empty($indexerConfig['nativeOOMethod']) || !empty($indexerConfig['ruby'])) {
					return 'Open Office';
				}
			break;
			case 'rtf':
					// unrtf
				if (!empty($indexerConfig['unrtf'])) {
					return 'RTF';
				}
			break;
			case 'html':	// PHP strip-tags()
			case 'jpeg':	// PHP EXIF
			case 'txt':		// Raw text
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'tif':		// PHP EXIF
				return strtoupper($extension);
				// NO entry (duplicates or blank):
			case 'htm':		// PHP strip-tags()
			case 'jpg':		// PHP EXIF
			default:
			break;
		}

		return FALSE;
	}

	/**
	 * Tells whether files with this extension are split into several page ranges
	 * when indexed (currently only PDF files).
	 *
	 * @param	string		File extension
	 * @return	boolean		TRUE for "pdf", otherwise FALSE
	 */
	function isMultiplePageExtension($extension) {
		return (string)$extension === 'pdf';
	}

	/**
	 * Reads and unserializes the "indexed_search" extension configuration.
	 *
	 * @return	array		Configuration array; empty if the configuration is not set or is not a serialized array
	 */
	function getIndexerConfig() {
		$indexerConfig = FALSE;
		if (isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])) {
			$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
		}

		return is_array($indexerConfig) ? $indexerConfig : array();
	}

	/**
	 * Normalizes a configured tool directory so that it ends with exactly one slash
	 * more than the value had after removal of a single trailing slash.
	 *
	 * @param	string		Directory path from the configuration, with or without trailing slash
	 * @return	string		The path with one trailing slash removed (if present) and "/" appended
	 */
	function getToolPath($configuredPath) {
			// "D" makes "$" match at the very end only, not before a final newline.
		return preg_replace('#/$#D', '', (string)$configuredPath).'/';
	}









	/************************
	 *
	 * Reading documents (for parsing)
	 *
	 ************************/

	/**
	 * Extracts the content of a file of a type that initParser() has accepted.
	 * File names are passed to the external tools through escapeshellarg().
	 *
	 * @param	string		File extension, eg. "pdf", "doc" etc.
	 * @param	string		Absolute filename of file
	 * @param	string		Pointer to section of the file; for PDF files a page range "[low]-[high]" as produced by fileContentParts()
	 * @return	mixed		Content array as returned by the parent object's split functions (with 'title' always filled in),
	 *						FALSE if the extension was not initialized/supported, NULL if the required tool was not available
	 */
	function readFileContent($ext,$absFile,$cPKey) {
		$contentArr = NULL;

			// Return immediately if initialization didn't set support up:
		if (empty($this->supportedExtensions[$ext]))	return FALSE;

			// The file name as a safely quoted shell argument for all exec() calls below:
		$fileArg = escapeshellarg($absFile);

			// Switch by file extension
		switch ($ext) {
			case 'pdf':
				if (!empty($this->app['pdfinfo'])) {
						// Getting pdf-info:
					$infoLines = array();
					exec($this->app['pdfinfo'].' '.$fileArg, $infoLines);
					$pdfInfo = $this->splitPdfInfo($infoLines);
					if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
							// Cast the page bounds to integers so nothing but numbers reaches the shell:
						$range = explode('-',$cPKey);
						$low = intval($range[0]);
						$high = isset($range[1]) ? intval($range[1]) : 0;

							// Get pdf content:
						$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
						@unlink ($tempFileName);	// Delete if exists, just to be safe.
						$toolOutput = array();
						exec($this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.$fileArg.' '.escapeshellarg($tempFileName), $toolOutput);

						$content = '';
						if (@is_file($tempFileName)) {
							$content = t3lib_div::getUrl($tempFileName);
							unlink($tempFileName);
						} else {
							$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
						}
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			break;
			case 'doc':
				if (!empty($this->app['catdoc'])) {
					$toolOutput = array();
					exec($this->app['catdoc'].' -d utf-8 '.$fileArg, $toolOutput);
					$content = implode(chr(10),$toolOutput);
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
			break;
			case 'pps':
			case 'ppt':
				if (!empty($this->app['ppthtml'])) {
					$toolOutput = array();
					exec($this->app['ppthtml'].' '.$fileArg, $toolOutput);
					$content = implode(chr(10),$toolOutput);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'xls':
				if (!empty($this->app['xlhtml'])) {
					$toolOutput = array();
					exec($this->app['xlhtml'].' -nc -te '.$fileArg, $toolOutput);
					$content = implode(chr(10),$toolOutput);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
				if (!empty($this->app['nativeOOMethod'])) {
					if (t3lib_extMgm::isLoaded('libunzipped')) {

							// Kept from the original code: makes $TYPO3_CONF_VARS visible in this scope for the included
							// class file (presumably for an XCLASS check like the one at the end of this file).
						global $TYPO3_CONF_VARS;
						require_once(t3lib_extMgm::extPath('libunzipped').'class.tx_libunzipped.php');

							// Initialize Unzip object:
						$unzip = t3lib_div::makeInstance('tx_libunzipped');
						$ooFiles = $unzip->init($absFile);
						if (is_array($ooFiles)) {
								// Read content.xml:
							$content_xml = $unzip->getFileFromArchive('content.xml');
							$meta_xml = $unzip->getFileFromArchive('meta.xml');
							$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml['content'])));
							$contentArr = $this->pObj->splitRegularContent($utf8_content);
							$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

								// Meta information (the parse result is only used if it has the expected array structure):
							$metaTree = t3lib_div::xml2tree($meta_xml['content']);
							$metaContent = NULL;
							if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
								$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
							}
							if (is_array($metaContent)) {
								if (!empty($metaContent['dc:title'][0]['values'][0])) {
									$contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
								}
								$subject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
								$description = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
								$contentArr['description'] = $subject.' '.$description;

									// Keywords collected:
								if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
									if (!isset($contentArr['keywords'])) {
										$contentArr['keywords'] = '';
									}
									foreach($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
										$contentArr['keywords'].= $kwDat['values'][0].' ';
									}
								}
							}
						}
					}
				} else {
					if (!empty($this->app['ruby'])) {
							// Extracting document headers:
						$headings = array();
						exec($this->app['ruby'].' '.$this->app['OOo'].' --heading '.$fileArg, $headings);

							// Extracting document text:
						$texts = array();
						exec($this->app['ruby'].' '.$this->app['OOo'].' '.$fileArg, $texts);

						$content = implode(chr(10),$headings).' '.implode(chr(10),$texts);
						$contentArr = $this->pObj->splitRegularContent($content);
						$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
					}
				}
			break;
			case 'rtf':
				if (!empty($this->app['unrtf'])) {
					$toolOutput = array();
					exec($this->app['unrtf'].' '.$fileArg, $toolOutput);
					$fileContent = implode(chr(10),$toolOutput);
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
				}
			break;
			case 'txt':
			case 'csv':		// Raw text
				$content = t3lib_div::getUrl($absFile);
					// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
				$contentArr = $this->pObj->splitRegularContent($content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'html':
			case 'htm':
				$fileContent = t3lib_div::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
			case 'xml':		// PHP strip-tags()
				$fileContent = t3lib_div::getUrl($absFile);

					// Finding charset from the encoding attribute of the XML declaration (searched in the first 200 bytes); utf-8 if none:
				$charset = 'utf-8';
				$reg = array();
				if (preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent,0,200), $reg) && $reg[1]) {
					$charset = $this->pObj->csObj->parse_charset($reg[1]);
				}

					// Converting content (a space is put before each tag so that words don't merge when tags are stripped):
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
			case 'tif':		// PHP EXIF
					// Without the PHP exif extension the file is indexed with an empty comment instead of failing fatally.
				$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
				$comment = '';
				if ($exif) {
					$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
					$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
					$comment = trim($exifComment.' '.$exifDescription);	// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
				}
				$contentArr = $this->pObj->splitRegularContent($comment);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			default:
				return FALSE;
		}

			// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
		if (is_array($contentArr) && empty($contentArr['title'])) {
			$contentArr['title'] = str_replace('_',' ',basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
		}

		return $contentArr;
	}

	/**
	 * Creates the list of content parts ("cPKey" values for readFileContent()) of a file.
	 * For PDF files these are page ranges according to $this->pdf_mode; everything else is one part.
	 *
	 * @param	string		File extension
	 * @param	string		Absolute filename (must exist and be validated OK before calling function)
	 * @return	array		Array of pointers to sections: array(0) by default, "[low]-[high]" strings for PDF files with a known page count
	 */
	function fileContentParts($ext,$absFile) {
		$cParts = array(0);
		switch ($ext) {
			case 'pdf':
					// Without a pdfinfo tool the command line would consist of the file name alone, so don't execute anything:
				if (empty($this->app['pdfinfo']))	break;

					// Getting pdf-info:
				$infoLines = array();
				exec($this->app['pdfinfo'].' '.escapeshellarg($absFile), $infoLines);
				$pdfInfo = $this->splitPdfInfo($infoLines);
				$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

				if ($pages > 0) {
					$cParts = array();

						// Calculate number of parts from the mode:
					if ($this->pdf_mode>0) {
						$iter = intval(ceil($pages/$this->pdf_mode));
					} else {
						$iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pages);
					}

						// Traverse and create intervals. Multiplying before dividing keeps the
						// boundaries exact, so the last interval always ends on the last page.
					for ($a=0;$a<$iter;$a++) {
						$low = intval(floor(($a*$pages)/$iter))+1;
						$high = intval(floor((($a+1)*$pages)/$iter));
						$cParts[] = $low.'-'.$high;
					}
				}
			break;
		}
		return $cParts;
	}

	/**
	 * Turns "key: value" output lines (as captured from the pdfinfo tool) into an array.
	 * Lines without a colon or with an empty key are skipped; only the first colon splits.
	 *
	 * @param	array		Array of output lines
	 * @return	array		Lowercased, trimmed key => trimmed value
	 */
	function splitPdfInfo($pdfInfoArray) {
		$res = array();
		if (is_array($pdfInfoArray)) {
			foreach($pdfInfoArray as $line) {
				$parts = explode(':',$line,2);
				if (count($parts)>1 && trim($parts[0])) {
					$res[strtolower(trim($parts[0]))] = trim($parts[1]);
				}
			}
		}
		return $res;
	}

	/**
	 * Removes trailing line feed (ASCII 10) and form feed (ASCII 12) characters
	 * from the string and trims the result.
	 *
	 * @param	string		String to clean up
	 * @return	string		Cleaned string
	 */
	function removeEndJunk($string) {
		return trim(rtrim((string)$string, chr(10).chr(12)));
	}












	/************************
	 *
	 * Backend analyzer
	 *
	 ************************/

	/**
	 * Returns the icon reference for a file extension. "htm" shares the icon
	 * of "html" and "jpeg" the icon of "jpg".
	 *
	 * @param	string		File extension, lowercase.
	 * @return	string		"EXT:" path to a gif file in indexed_search/pi/res/
	 */
	function getIcon($extension) {
		if ($extension=='htm')	$extension = 'html';
		if ($extension=='jpeg')	$extension = 'jpg';
		return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
	}
}

if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
}