"TYPO3 4.0.1: typo3_src-4.0.1/typo3/sysext/indexed_search/class.external_parser.php Source File", "datetime" => "Sat Dec 2 19:22:32 2006", "date" => "2 Dec 2006", "doxygenversion" => "1.4.6", "projectname" => "TYPO3 4.0.1", "projectnumber" => "4.0.1" ); get_header($doxygen_vars); ?>
<?php
/***************************************************************
*  Copyright notice
*
*  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
*  All rights reserved
*
*  This script is part of the TYPO3 project. The TYPO3 project is
*  free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  The GNU General Public License can be found at
*  http://www.gnu.org/copyleft/gpl.html.
*  A copy is found in the textfile GPL.txt and important notices to the license
*  from the author is found in LICENSE.txt distributed with these scripts.
*
*
*  This script is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  This copyright notice MUST APPEAR in all copies of the script!
***************************************************************/


/**
 * File-type handlers for the indexer: decides per file extension whether a
 * file can be indexed (optionally with the help of an external command line
 * tool) and extracts its text content.
 *
 * The parent indexer object must be assigned to $this->pObj before
 * initParser() or readFileContent() is called; both use it for logging and
 * for splitting/converting the extracted content.
 */
class tx_indexed_search_extparse {

		/**
		 * How PDF files are divided into separately indexed parts. The default is
		 * overridden by the "pdf_mode" indexer setting in initParser().
		 *   0:        the whole PDF is indexed as one part.
		 *   positive: approximate number of pages per part, eg. "5" with 10 pages gives 1-5,6-10.
		 *   negative: (abs value) number of parts, capped at the page count,
		 *             eg. "-3" with 10 pages gives 1-3,4-6,7-10.
		 */
	var $pdf_mode = -20;

		// These arrays are filled by initParser():
	var $app = array();						// Tool name => full path of the executable
	var $ext2itemtype_map = array();		// File extension => item type ("htm" => "html", "jpg" => "jpeg", otherwise the extension itself)
	var $supportedExtensions = array();		// File extension => TRUE for every successfully initialized extension

	var $pObj;								// Reference to parent object (indexer class)


	/**
	 * Initializes the parser for one file extension: checks the ignore list,
	 * locates the external tool (if the extension needs one) and registers the
	 * extension in $this->supportedExtensions / $this->ext2itemtype_map.
	 *
	 * When PHP runs in safe_mode the existence of the tools is not checked;
	 * the configured path is trusted as-is.
	 *
	 * @param	string		File extension (lowercase, without dot)
	 * @return	boolean		TRUE if the extension is supported. FALSE if it is on the ignore list. Nothing (NULL) if it is unknown or its tool is disabled/missing.
	 */
	function initParser($extension)	{

			// Read indexer-config:
		$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
		if (!is_array($indexerConfig))	{
			$indexerConfig = array();
		}

			// If windows, apply extension to tool name:
		$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
		$extOK = FALSE;
		$mainExtension = '';

			// Ignore extensions
		$ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
		$ignoreExtensions = t3lib_div::trimExplode(',', $ignoreList, 1);
		if (in_array($extension, $ignoreExtensions))	{
			$this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);
			return FALSE;
		}

			// Switch on file extension:
		switch($extension)	{
			case 'pdf':
					// PDF: needs both pdftotext and pdfinfo
				if (!empty($indexerConfig['pdftools']))	{
					$pdfPath = preg_replace('/\/$/','',$indexerConfig['pdftools']).'/';
					if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe)))	{
						$this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
						$this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;
							// PDF mode:
						$this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
				} else $this->pObj->log_setTSlogMessage('PDF tools disabled',1);
			break;
			case 'doc':
					// Catdoc
				if (!empty($indexerConfig['catdoc']))	{
					$catdocPath = preg_replace('/\/$/','',$indexerConfig['catdoc']).'/';
					if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe))	{
						$this->app['catdoc'] = $catdocPath.'catdoc'.$exe;
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in path '".$catdocPath."catdoc'",3);
				} else $this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint
					// ppthtml
				if (!empty($indexerConfig['ppthtml']))	{
					$ppthtmlPath = preg_replace('/\/$/','',$indexerConfig['ppthtml']).'/';
					if (ini_get('safe_mode') || @is_file($ppthtmlPath.'ppthtml'.$exe))	{
						$this->app['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("'ppthtml' tool for reading Powerpoint-files was not found in path '".$ppthtmlPath."ppthtml'",3);
				} else $this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
			break;
			case 'xls':		// MS Excel
					// Xlhtml
				if (!empty($indexerConfig['xlhtml']))	{
					$xlhtmlPath = preg_replace('/\/$/','',$indexerConfig['xlhtml']).'/';
					if (ini_get('safe_mode') || @is_file($xlhtmlPath.'xlhtml'.$exe))	{
						$this->app['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("'xlhtml' tool for reading Excel-files was not found in path '".$xlhtmlPath."xlhtml'",3);
				} else $this->pObj->log_setTSlogMessage('xlhtml tools (Excel-files) disabled',1);
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'ods':		// Oasis OpenDocument Spreadsheet
			case 'odp':		// Oasis OpenDocument Presentation
			case 'odt':		// Oasis OpenDocument Text
					// Unzip
				if (!empty($indexerConfig['unzip']))	{
					$unzipPath = preg_replace('/\/$/','',$indexerConfig['unzip']).'/';
					if (ini_get('safe_mode') || @is_file($unzipPath.'unzip'.$exe))	{
						$this->app['unzip'] = $unzipPath.'unzip'.$exe;
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("'unzip' tool for reading OpenOffice.org-files was not found in path '".$unzipPath."unzip'",3);
				} else $this->pObj->log_setTSlogMessage('unzip tool (OpenOffice.org-files) disabled',1);
			break;
			case 'rtf':
					// Unrtf
				if (!empty($indexerConfig['unrtf']))	{
					$unrtfPath = preg_replace('/\/$/','',$indexerConfig['unrtf']).'/';
					if (ini_get('safe_mode') || @is_file($unrtfPath.'unrtf'.$exe))	{
						$this->app['unrtf'] = $unrtfPath.'unrtf'.$exe;
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("'unrtf' tool for reading RTF-files was not found in path '".$unrtfPath."unrtf'",3);
				} else $this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
			break;
			case 'txt':		// Raw text
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'tif':		// PHP EXIF
				$extOK = TRUE;
			break;
			case 'html':	// PHP strip-tags()
			case 'htm':		// PHP strip-tags()
				$extOK = TRUE;
				$mainExtension = 'html';	// making "html" the common "item_type"
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
				$extOK = TRUE;
				$mainExtension = 'jpeg';	// making "jpeg" the common item_type
			break;
		}

			// If extension was OK:
		if ($extOK)	{
			$this->supportedExtensions[$extension] = TRUE;
			$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
			return TRUE;
		}
	}

	/**
	 * Tells whether an extension is one of those this class knows how to handle,
	 * without reading configuration or checking for external tools.
	 *
	 * @param	string		File extension
	 * @return	boolean		TRUE if the extension is known; otherwise nothing (NULL) is returned.
	 */
	function softInit($extension)	{
		switch($extension)	{
			case 'pdf':		// PDF
			case 'doc':		// MS Word files
			case 'pps':		// MS PowerPoint
			case 'ppt':		// MS PowerPoint
			case 'xls':		// MS Excel
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'ods':		// Oasis OpenDocument Spreadsheet
			case 'odp':		// Oasis OpenDocument Presentation
			case 'odt':		// Oasis OpenDocument Text
			case 'rtf':		// RTF documents
			case 'txt':		// ASCII Text documents
			case 'html':	// HTML
			case 'htm':		// HTML
			case 'csv':		// Comma Separated Values
			case 'xml':		// Generic XML
			case 'jpg':		// Jpeg images (EXIF comment)
			case 'jpeg':	// Jpeg images (EXIF comment)
			case 'tif':		// TIF images (EXIF comment)
				return TRUE;
			break;
		}
	}

	/**
	 * Returns a human readable media title for an extension, provided the
	 * extension is not ignored and the tool it depends on is enabled in the
	 * indexer configuration.
	 *
	 * @param	string		File extension
	 * @return	mixed		Title string. FALSE if the extension is on the ignore list. Nothing (NULL) for "htm"/"jpg" (covered by "html"/"jpeg"), unknown extensions and disabled tools.
	 */
	function searchTypeMediaTitle($extension)	{

			// Read indexer-config
		$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
		if (!is_array($indexerConfig))	{
			$indexerConfig = array();
		}

			// Ignore extensions
		$ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
		$ignoreExtensions = t3lib_div::trimExplode(',', $ignoreList, 1);
		if (in_array($extension, $ignoreExtensions))	{
			return FALSE;
		}

			// Switch on file extension:
		switch($extension)	{
			case 'pdf':
					// PDF
				if (!empty($indexerConfig['pdftools']))	{
					return 'PDF';
				}
			break;
			case 'doc':
					// Catdoc
				if (!empty($indexerConfig['catdoc']))	{
					return 'MS Word';
				}
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint
					// ppthtml
				if (!empty($indexerConfig['ppthtml']))	{
					return 'MS Powerpoint';
				}
			break;
			case 'xls':		// MS Excel
					// Xlhtml
				if (!empty($indexerConfig['xlhtml']))	{
					return 'MS Excel';
				}
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'ods':		// Oasis OpenDocument Spreadsheet
			case 'odp':		// Oasis OpenDocument Presentation
			case 'odt':		// Oasis OpenDocument Text
				if (!empty($indexerConfig['unzip']))	{
					return 'Open Office';
				}
			break;
			case 'rtf':
					// Unrtf
				if (!empty($indexerConfig['unrtf']))	{
					return 'RTF';
				}
			break;
			case 'html':	// PHP strip-tags()
			case 'jpeg':	// PHP EXIF
			case 'txt':		// Raw text
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'tif':		// PHP EXIF
				return strtoupper($extension);
			break;
				// NO entry (duplicates or blank):
			case 'htm':		// PHP strip-tags()
			case 'jpg':		// PHP EXIF
			default:
			break;
		}
	}

	/**
	 * Tells whether files with this extension are split into several parts
	 * (see fileContentParts()). Currently that is the case for PDF only.
	 *
	 * @param	string		File extension
	 * @return	boolean		TRUE for "pdf"; otherwise nothing (NULL) is returned.
	 */
	function isMultiplePageExtension($extension)	{
			// Switch on file extension:
		switch((string)$extension)	{
			case 'pdf':
				return TRUE;
			break;
		}
	}









	/************************
	 *
	 * Reading documents (for parsing)
	 *
	 ************************/

	/**
	 * Reads the content of a file and returns it as a content array produced by
	 * the parent object's splitRegularContent()/splitHTMLContent().
	 *
	 * The file name is passed to external tools through escapeshellarg() so
	 * that characters special to the shell cannot alter the command.
	 *
	 * @param	string		File extension, eg. "pdf", "doc" etc.
	 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
	 * @param	string		Pointer to section, as returned by fileContentParts(). For PDF a page range "[low]-[high]".
	 * @return	mixed		Content array. FALSE if the extension was not initialized by initParser() or is unknown. Nothing (NULL) if no content could be extracted.
	 */
	function readFileContent($ext,$absFile,$cPKey)	{
		$contentArr = NULL;

			// Return immediately if initialization didn't set support up:
		if (empty($this->supportedExtensions[$ext]))	return FALSE;

			// Shell-safe version of the file name for all external commands:
		$fileArg = escapeshellarg($absFile);

			// Switch by file extension
		switch ($ext)	{
			case 'pdf':
				if (!empty($this->app['pdfinfo']))	{
						// Getting pdf-info:
					$res = array();
					exec($this->app['pdfinfo'].' '.$fileArg, $res);
					$pdfInfo = $this->splitPdfInfo($res);
					if (isset($pdfInfo['pages']) && intval($pdfInfo['pages']))	{
							// The page range is forced to integers before it goes into the command line:
						$range = explode('-',$cPKey);
						$low = intval($range[0]);
						$high = isset($range[1]) ? intval($range[1]) : 0;

							// Get pdf content:
						$content = '';
						$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
						@unlink ($tempFileName);	// Delete if exists, just to be safe.
						$res = array();
						exec($this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.$fileArg.' '.escapeshellarg($tempFileName), $res);
						if (@is_file($tempFileName))	{
							$content = t3lib_div::getUrl($tempFileName);
							unlink($tempFileName);
						} else {
							$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
						}
						if (strlen($content))	{
							$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
						}
					}
				}
			break;
			case 'doc':
				if (!empty($this->app['catdoc']))	{
					$res = array();
					exec($this->app['catdoc'].' -d utf-8 '.$fileArg, $res);
					$content = implode(chr(10),$res);
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
			break;
			case 'pps':
			case 'ppt':
				if (!empty($this->app['ppthtml']))	{
					$res = array();
					exec($this->app['ppthtml'].' '.$fileArg, $res);
					$content = implode(chr(10),$res);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'xls':
				if (!empty($this->app['xlhtml']))	{
					$res = array();
					exec($this->app['xlhtml'].' -nc -te '.$fileArg, $res);
					$content = implode(chr(10),$res);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
			case 'ods':
			case 'odp':
			case 'odt':
				if (!empty($this->app['unzip']))	{
						// Read content.xml:
					$out = array();
					exec($this->app['unzip'].' -p '.$fileArg.' content.xml', $out);
					$content_xml = implode(chr(10),$out);

						// Read meta.xml. exec() APPENDS to an existing output array, so it must be
						// reset here; otherwise meta.xml would be prefixed with content.xml.
					$out = array();
					exec($this->app['unzip'].' -p '.$fileArg.' meta.xml', $out);
					$meta_xml = implode(chr(10),$out);

					$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml)));
					$contentArr = $this->pObj->splitRegularContent($utf8_content);
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

						// Meta information (only if the XML could be turned into an array tree)
					$metaTree = t3lib_div::xml2tree($meta_xml);
					$metaContent = NULL;
					if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch']))	{
						$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
					}
					if (is_array($metaContent))	{
						$contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
						$contentArr['description'] = $metaContent['dc:subject'][0]['values'][0].' '.$metaContent['dc:description'][0]['values'][0];

							// Keywords collected:
						if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword']))	{
							foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat)	{
								if (!isset($contentArr['keywords']))	{
									$contentArr['keywords'] = '';
								}
								$contentArr['keywords'].= $kwDat['values'][0].' ';
							}
						}
					}
				}
			break;
			case 'rtf':
				if (!empty($this->app['unrtf']))	{
					$res = array();
					exec($this->app['unrtf'].' '.$fileArg, $res);
					$fileContent = implode(chr(10),$res);
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
				}
			break;
			case 'txt':
			case 'csv':		// Raw text
				$content = t3lib_div::getUrl($absFile);
					// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
				$contentArr = $this->pObj->splitRegularContent($content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'html':
			case 'htm':
				$fileContent = t3lib_div::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
			case 'xml':		// PHP strip-tags()
				$fileContent = t3lib_div::getUrl($absFile);

					// Finding charset from the XML declaration (looked up in the first 200 bytes only):
				$reg = array();
				preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i',substr($fileContent,0,200),$reg);
				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

					// Converting content:
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
			case 'tif':		// PHP EXIF
					// The exif extension is optional; without it the image is indexed by file name only.
				$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
				if ($exif)	{
					$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
					$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
					$comment = trim($exifComment.' '.$exifDescription);	// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
				} else {
					$comment = '';
				}
				$contentArr = $this->pObj->splitRegularContent($comment);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			default:
				return FALSE;
			break;
		}

			// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
		if (is_array($contentArr) && empty($contentArr['title']))	{
			$contentArr['title'] = str_replace('_',' ',basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
		}

		return $contentArr;
	}

	/**
	 * Creates the list of section pointers ("parts") a file is indexed in.
	 * Every file has the single part 0, except PDF files for which pdfinfo
	 * reports a page count: those get page ranges according to $this->pdf_mode.
	 *
	 * @param	string		File extension
	 * @param	string		Absolute filename (must exist and be validated OK before calling function)
	 * @return	array		Array of pointers to sections that the document should be divided into: array(0) or a list of "[low]-[high]" strings
	 */
	function fileContentParts($ext,$absFile)	{
		$cParts = array(0);
		switch ($ext)	{
			case 'pdf':
					// Without a configured pdfinfo there is nothing to execute; the file is treated as one part.
				if (empty($this->app['pdfinfo']))	{
					break;
				}

					// Getting pdf-info:
				$res = array();
				exec($this->app['pdfinfo'].' '.escapeshellarg($absFile), $res);
				$pdfInfo = $this->splitPdfInfo($res);
				$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

				if ($pages)	{
					$cParts = array();

						// Calculate mode
					if ($this->pdf_mode>0)	{
						$iter = ceil($pages/$this->pdf_mode);
					} else {
						$iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pages);
					}

						// Traverse and create intervals. Multiplying before dividing keeps exact
						// multiples exact, so no boundary is shifted by float rounding.
					for ($a=0;$a<$iter;$a++)	{
						$low = floor(($a*$pages)/$iter)+1;
						$high = floor((($a+1)*$pages)/$iter);
						$cParts[] = $low.'-'.$high;
					}
				}
			break;
		}
		return $cParts;
	}

	/**
	 * Turns lines of the form "Key: value" (as collected from the pdfinfo
	 * command) into an associative array with lowercased keys. Lines without a
	 * colon or with an empty key are skipped.
	 *
	 * @param	array		Array of lines
	 * @return	array		Associative array: lowercased key => trimmed value
	 */
	function splitPdfInfo($pdfInfoArray)	{
		$res = array();
		if (is_array($pdfInfoArray))	{
			foreach($pdfInfoArray as $line)	{
					// Split on the first colon only; values may contain colons themselves.
				$parts = explode(':',$line,2);
				if (count($parts)>1 && trim($parts[0]))	{
					$res[strtolower(trim($parts[0]))] = trim($parts[1]);
				}
			}
		}
		return $res;
	}

	/**
	 * Removes a trailing run of line feed (10) / form feed (12) characters and
	 * then trims whitespace from both ends of the string.
	 *
	 * @param	string		String to clean up
	 * @return	string		String
	 */
	function removeEndJunk($string)	{
			// "D" makes "$" match at the very end of the string only.
		return trim(preg_replace('/[\x0A\x0C]*$/D','',$string));
	}












	/************************
	 *
	 * Backend analyzer
	 *
	 ************************/

	/**
	 * Returns the "EXT:" path of the icon for a file extension. "htm" uses the
	 * "html" icon and "jpeg" uses the "jpg" icon.
	 *
	 * @param	string		File extension, lowercase.
	 * @return	string		Path of the form EXT:indexed_search/pi/res/[extension].gif
	 */
	function getIcon($extension)	{
		if ($extension=='htm')	$extension = 'html';
		if ($extension=='jpeg')	$extension = 'jpg';
		return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
	}
}

if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])	{
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
}