00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
/**
 * External parsers for the indexed_search extension.
 *
 * Decides per file extension whether a file type can be indexed (checking the
 * configured external tools such as pdftotext, catdoc, ppthtml, xlhtml, unrtf
 * or the OpenOffice extractors) and extracts title/body text from such files.
 * All extracted text is handed to the parent indexer object ($this->pObj) for
 * splitting and charset conversion.
 */
class tx_indexed_search_extparse {

	/**
	 * Controls how fileContentParts() splits a PDF into page ranges:
	 * a positive value is the number of pages per part; zero or a negative value
	 * is (as absolute value) the maximum number of parts.
	 * initParser('pdf') overwrites it with the "pdf_mode" extension setting,
	 * passed through t3lib_div::intInRange(..., -100, 100).
	 */
	var $pdf_mode = -20;

	/** Paths of the external tools (plus the "nativeOOMethod" flag) filled in by initParser(), keyed by tool name. */
	var $app = array();

	/** Maps every initialized extension to its main item type (e.g. "htm" => "html", "jpg" => "jpeg"). */
	var $ext2itemtype_map = array();

	/** Extensions accepted by initParser(), stored as array keys with value TRUE. */
	var $supportedExtensions = array();

	/**
	 * Parent indexer object. Not assigned inside this class; the methods below call
	 * log_setTSlogMessage(), splitRegularContent(), splitHTMLContent(),
	 * convertHTMLToUtf8() and csObj->parse_charset() on it.
	 */
	var $pObj;

	/**
	 * Initializes the parser for one file extension: checks the extension
	 * configuration and, where an external tool is required, that the tool
	 * exists. On success the extension is registered in $this->supportedExtensions
	 * and $this->ext2itemtype_map.
	 *
	 * @param	string		File extension, e.g. "pdf" or "doc"
	 * @return	boolean		TRUE if the extension is supported and enabled, otherwise FALSE
	 */
	function initParser($extension) {

			// Extension configuration of indexed_search (serialized array):
		$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

			// Windows executables carry an ".exe" suffix:
		$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
		$extOK = FALSE;
		$mainExtension = '';

			// Extensions explicitly excluded by configuration:
		$ignoreList = isset($indexerConfig['ignoreExtensions']) ? $indexerConfig['ignoreExtensions'] : '';
		$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($ignoreList), 1);
		if (in_array($extension, $ignoreExtensions)) {
			$this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);

			return FALSE;
		}

			// Note: when PHP reports safe_mode, the is_file() checks are skipped and the
			// configured tool paths are accepted as they are.
		switch($extension) {
			case 'pdf':
					// PDF needs both pdftotext and pdfinfo from the same directory
				if (!empty($indexerConfig['pdftools'])) {
					$pdfPath = $this->dirWithTrailingSlash($indexerConfig['pdftools']);
					if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe))) {
						$this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
						$this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;

						$this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
				} else $this->pObj->log_setTSlogMessage('PDF tools disabled',1);
			break;
			case 'doc':
					// MS Word via catdoc
				if (!empty($indexerConfig['catdoc'])) {
					$catdocPath = $this->dirWithTrailingSlash($indexerConfig['catdoc']);
					if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe)) {
						$this->app['catdoc'] = $catdocPath.'catdoc'.$exe;
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in paths '".$catdocPath."catdoc'",3);
				} else $this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
			break;
			case 'pps':
			case 'ppt':
					// MS PowerPoint via ppthtml
				if (!empty($indexerConfig['ppthtml'])) {
					$ppthtmlPath = $this->dirWithTrailingSlash($indexerConfig['ppthtml']);
					if (ini_get('safe_mode') || @is_file($ppthtmlPath.'ppthtml'.$exe)) {
						$this->app['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("'ppthtml' tool for reading Powerpoint-files was not found in paths '".$ppthtmlPath."ppthtml'",3);
				} else $this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
			break;
			case 'xls':
					// MS Excel via xlhtml
				if (!empty($indexerConfig['xlhtml'])) {
					$xlhtmlPath = $this->dirWithTrailingSlash($indexerConfig['xlhtml']);
					if (ini_get('safe_mode') || @is_file($xlhtmlPath.'xlhtml'.$exe)) {
						$this->app['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("'xlhtml' tool for reading Excel-files was not found in paths '".$xlhtmlPath."xlhtml'",3);
				} else $this->pObj->log_setTSlogMessage('xlhtml tools (Excel-files) disabled',1);
			break;
			case 'sxc':
			case 'sxi':
			case 'sxw':
					// OpenOffice.org: either the "libunzipped" extension (native method)
					// or the ruby script ooo_extract.rb
				if (!empty($indexerConfig['nativeOOMethod'])) {
					if (t3lib_extMgm::isLoaded('libunzipped')) {
						$this->app['nativeOOMethod'] = TRUE;
						$extOK = TRUE;
						$this->pObj->log_setTSlogMessage('Using "libunzipped" for extraction of Open Office files, "'.$extension.'".',1);
					} else $this->pObj->log_setTSlogMessage('The extension "libunzipped" was not loaded (for extraction of Open Office files, "'.$extension.'")',2);
				} else {
					if (!empty($indexerConfig['OOoExtract'])) {
							// The ruby directory is optional; without it "ruby" is called without a path.
						$rubyPath = '';
						if (!empty($indexerConfig['ruby'])) {
							$rubyPath = $this->dirWithTrailingSlash($indexerConfig['ruby']);
						}

						$oooExPath = $this->dirWithTrailingSlash($indexerConfig['OOoExtract']);
						if (ini_get('safe_mode') || (($rubyPath ? @is_file($rubyPath.'ruby'.$exe) : TRUE) && @is_file($oooExPath.'ooo_extract.rb'))) {
							$this->app['ruby'] = $rubyPath.'ruby'.$exe;
							$this->app['OOo'] = $oooExPath.'ooo_extract.rb';
							$extOK = TRUE;
						} else $this->pObj->log_setTSlogMessage("'Ruby and OOo_extract' tools for reading OpenOffice.org documents were not found in paths '".$rubyPath."ruby".$exe."' OR '".$oooExPath."ooo_extract.rb'",3);
					} else $this->pObj->log_setTSlogMessage('Ruby & OOo_extract tools (OpenOffice-files) disabled',1);
				}
			break;
			case 'rtf':
					// RTF via unrtf
				if (!empty($indexerConfig['unrtf'])) {
					$unrtfPath = $this->dirWithTrailingSlash($indexerConfig['unrtf']);
					if (ini_get('safe_mode') || @is_file($unrtfPath.'unrtf'.$exe)) {
						$this->app['unrtf'] = $unrtfPath.'unrtf'.$exe;
						$extOK = TRUE;
					} else $this->pObj->log_setTSlogMessage("'unrtf' tool for reading RTF-files was not found in paths '".$unrtfPath."unrtf'",3);
				} else $this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
			break;
			case 'txt':
			case 'csv':
			case 'xml':
			case 'tif':
					// Handled by PHP itself, no external tool needed
				$extOK = TRUE;
			break;
			case 'html':
			case 'htm':
				$extOK = TRUE;
				$mainExtension = 'html';	// "htm" is registered under item type "html"
			break;
			case 'jpg':
			case 'jpeg':
				$extOK = TRUE;
				$mainExtension = 'jpeg';	// "jpg" is registered under item type "jpeg"
			break;
		}

			// Register the extension if everything was fine:
		if ($extOK) {
			$this->supportedExtensions[$extension] = TRUE;
			$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
			return TRUE;
		}

		return FALSE;
	}

	/**
	 * Returns a directory path that ends with a slash. A path that already ends
	 * with a slash is returned unchanged (same result as stripping one trailing
	 * slash and appending one again).
	 *
	 * @param	string		Directory path from the extension configuration
	 * @return	string		Path ending with "/"
	 */
	function dirWithTrailingSlash($dir) {
		$dir = (string)$dir;
		return substr($dir, -1) === '/' ? $dir : $dir.'/';
	}

	/**
	 * Tells whether an extension is one of the file types this class knows about,
	 * without checking configuration or external tools.
	 *
	 * @param	string		File extension
	 * @return	boolean		TRUE if the extension is known, otherwise FALSE
	 */
	function softInit($extension) {
		$knownExtensions = array(
			'pdf', 'doc', 'pps', 'ppt', 'xls', 'sxc', 'sxi', 'sxw', 'rtf',
			'txt', 'html', 'htm', 'csv', 'xml', 'jpg', 'jpeg', 'tif'
		);

			// Loose comparison on purpose, matching the switch() this replaces
		return in_array($extension, $knownExtensions) ? TRUE : FALSE;
	}

	/**
	 * Returns the media-type title for an extension, depending on which tools are
	 * enabled in the extension configuration (tool existence is not checked here).
	 *
	 * @param	string		File extension
	 * @return	mixed		Title string; FALSE if the extension is on the ignore list; NULL if
	 *				the extension has no title of its own ("htm", "jpg", unknown
	 *				extensions) or its tool setting is empty
	 */
	function searchTypeMediaTitle($extension) {

			// Extension configuration of indexed_search (serialized array):
		$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

			// Ignored extensions never get a title:
		$ignoreList = isset($indexerConfig['ignoreExtensions']) ? $indexerConfig['ignoreExtensions'] : '';
		$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($ignoreList), 1);
		if (in_array($extension, $ignoreExtensions)) {
			return FALSE;
		}

		switch($extension) {
			case 'pdf':
				if (!empty($indexerConfig['pdftools'])) {
					return 'PDF';
				}
			break;
			case 'doc':
				if (!empty($indexerConfig['catdoc'])) {
					return 'MS Word';
				}
			break;
			case 'pps':
			case 'ppt':
				if (!empty($indexerConfig['ppthtml'])) {
					return 'MS Powerpoint';
				}
			break;
			case 'xls':
				if (!empty($indexerConfig['xlhtml'])) {
					return 'MS Excel';
				}
			break;
			case 'sxc':
			case 'sxi':
			case 'sxw':
					// NOTE(review): initParser() enables these types on "OOoExtract" (with
					// "ruby" being optional), while this check looks at "ruby" - confirm
					// which setting is intended before changing it.
				if (!empty($indexerConfig['nativeOOMethod']) || !empty($indexerConfig['ruby'])) {
					return 'Open Office';
				}
			break;
			case 'rtf':
				if (!empty($indexerConfig['unrtf'])) {
					return 'RTF';
				}
			break;
			case 'html':
			case 'jpeg':
			case 'txt':
			case 'csv':
			case 'xml':
			case 'tif':
					// No tool required; the title is the upper-cased extension
				return strtoupper($extension);
			break;
				// "htm" and "jpg" get no title of their own (initParser() maps them to the
				// item types "html" and "jpeg"); unknown extensions get none either.
			case 'htm':
			case 'jpg':
			default:
			break;
		}
	}

	/**
	 * Tells whether files of this type are indexed in several parts (page ranges).
	 * Only PDF files are.
	 *
	 * @param	string		File extension
	 * @return	boolean		TRUE for "pdf", otherwise FALSE
	 */
	function isMultiplePageExtension($extension) {
		return ((string)$extension === 'pdf') ? TRUE : FALSE;
	}

	/**
	 * Reads the content of a file and returns it split into the content array
	 * produced by the parent object's splitRegularContent()/splitHTMLContent().
	 * The extension must have been registered with initParser() before.
	 *
	 * @param	string		File extension, e.g. "pdf", "doc"
	 * @param	string		Absolute path of the file to read
	 * @param	string		For PDF: page range "low-high" as produced by fileContentParts(); unused for other types
	 * @return	mixed		Content array; FALSE if the extension is not supported; NULL if no content could be produced
	 */
	function readFileContent($ext,$absFile,$cPKey) {
		$contentArr = NULL;

			// Only extensions accepted by initParser() are processed:
		if (empty($this->supportedExtensions[$ext])) return FALSE;

			// The file name is passed to external programs, so it must be quoted for the shell:
		$fileArg = escapeshellarg($absFile);

		switch ($ext) {
			case 'pdf':
				if (!empty($this->app['pdfinfo'])) {
						// Read PDF meta data; text is only extracted if a page count is reported
					$infoLines = array();
					exec($this->app['pdfinfo'].' '.$fileArg, $infoLines);
					$pdfInfo = $this->splitPdfInfo($infoLines);
					if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
							// Page range as integers, so nothing but numbers reaches the command line
						$range = explode('-', $cPKey);
						$low = intval($range[0]);
						$high = isset($range[1]) ? intval($range[1]) : 0;

							// pdftotext writes into a temporary file which is read back and removed
						$tempFileName = t3lib_div::tempnam('Typo3_indexer');
						@unlink($tempFileName);	// removed first, so its existence afterwards proves pdftotext wrote it
						$cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.$fileArg.' '.escapeshellarg($tempFileName);
						$output = array();
						exec($cmd, $output);

						$content = '';
						if (@is_file($tempFileName)) {
							$content = t3lib_div::getUrl($tempFileName);
							unlink($tempFileName);
						} else {
							$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
						}
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			break;
			case 'doc':
				if (!empty($this->app['catdoc'])) {
					$output = array();
					exec($this->app['catdoc'].' -d utf-8 '.$fileArg, $output);
					$content = implode(chr(10), $output);
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
			break;
			case 'pps':
			case 'ppt':
				if (!empty($this->app['ppthtml'])) {
					$output = array();
					exec($this->app['ppthtml'].' '.$fileArg, $output);
					$content = implode(chr(10), $output);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// the file name is used as title
				}
			break;
			case 'xls':
				if (!empty($this->app['xlhtml'])) {
					$output = array();
					exec($this->app['xlhtml'].' -nc -te '.$fileArg, $output);
					$content = implode(chr(10), $output);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// the file name is used as title
				}
			break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
				if (!empty($this->app['nativeOOMethod'])) {
					if (t3lib_extMgm::isLoaded('libunzipped')) {
							// The global is declared before the include, presumably because the included
							// file refers to $TYPO3_CONF_VARS - verify before removing.
						global $TYPO3_CONF_VARS;
						require_once(t3lib_extMgm::extPath('libunzipped').'class.tx_libunzipped.php');

							// Unzip the document:
						$unzip = t3lib_div::makeInstance('tx_libunzipped');
						$ooFiles = $unzip->init($absFile);
						if (is_array($ooFiles)) {

								// Body text: content.xml with all tags stripped
							$content_xml = $unzip->getFileFromArchive('content.xml');
							$meta_xml = $unzip->getFileFromArchive('meta.xml');
							$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml['content'])));
							$contentArr = $this->pObj->splitRegularContent($utf8_content);
							$contentArr['title'] = basename($absFile);	// default title, replaced by dc:title below if available

								// Meta information from meta.xml:
							$metaContent = t3lib_div::xml2tree($meta_xml['content']);
							$metaContent = $metaContent['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
							if (is_array($metaContent)) {
								$contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
								$contentArr['description'] = $metaContent['dc:subject'][0]['values'][0].' '.$metaContent['dc:description'][0]['values'][0];

									// Keywords are appended, separated by spaces:
								if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
									foreach($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
										$keywordsSoFar = isset($contentArr['keywords']) ? $contentArr['keywords'] : '';
										$contentArr['keywords'] = $keywordsSoFar.$kwDat['values'][0].' ';
									}
								}
							}
						}
					}
				} else {
					if (!empty($this->app['ruby'])) {
							// First run: headings only
						$headings = array();
						exec($this->app['ruby'].' '.$this->app['OOo'].' --heading '.$fileArg, $headings);

							// Second run: body text
						$texts = array();
						exec($this->app['ruby'].' '.$this->app['OOo'].' '.$fileArg, $texts);

						$content = implode(chr(10),$headings).' '.implode(chr(10),$texts);
						$contentArr = $this->pObj->splitRegularContent($content);
						$contentArr['title'] = basename($absFile);	// the file name is used as title
					}
				}
			break;
			case 'rtf':
				if (!empty($this->app['unrtf'])) {
					$output = array();
					exec($this->app['unrtf'].' '.$fileArg, $output);
					$fileContent = implode(chr(10), $output);
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
				}
			break;
			case 'txt':
			case 'csv':
				$content = t3lib_div::getUrl($absFile);
					// Plain text files are converted from iso-8859-1:
				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
				$contentArr = $this->pObj->splitRegularContent($content);
				$contentArr['title'] = basename($absFile);	// the file name is used as title
			break;
			case 'html':
			case 'htm':
				$fileContent = t3lib_div::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
			case 'xml':
				$fileContent = t3lib_div::getUrl($absFile);

					// Take the charset from the XML declaration (searched in the first 200 bytes), default is utf-8
				$reg = array();
				preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent,0,200), $reg);
				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

					// Strip all tags; a space is inserted before each tag so adjacent words stay separated
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				$contentArr['title'] = basename($absFile);	// the file name is used as title
			break;
			case 'jpg':
			case 'jpeg':
			case 'tif':
					// Images: only the EXIF comment and image description are indexed.
					// Without the exif extension the comment stays empty.
				$comment = '';
				if (function_exists('exif_read_data')) {
					$exif = exif_read_data($absFile, 'IFD0');
					if ($exif) {
						$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
						$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
						$comment = trim($exifComment.' '.$exifDescription);
					}
				}
				$contentArr = $this->pObj->splitRegularContent($comment);
				$contentArr['title'] = basename($absFile);	// the file name is used as title
			break;
			default:
				return FALSE;
			break;
		}

			// Fall back to the file name (underscores shown as spaces) if no title was found:
		if (is_array($contentArr) && empty($contentArr['title'])) {
			$contentArr['title'] = str_replace('_',' ',basename($absFile));
		}

		return $contentArr;
	}

	/**
	 * Returns the list of parts a file is indexed in. For PDF files these are
	 * page ranges ("low-high") derived from the page count reported by pdfinfo
	 * and from $this->pdf_mode; every other file has the single part 0.
	 *
	 * @param	string		File extension
	 * @param	string		Absolute path of the file
	 * @return	array		array(0), or for PDF files with a known page count a list of "low-high" strings
	 */
	function fileContentParts($ext,$absFile) {
		$cParts = array(0);

			// Only PDFs are split; without a configured pdfinfo there is nothing to
			// execute (the command would otherwise consist of the file name alone).
		if ($ext != 'pdf' || empty($this->app['pdfinfo'])) {
			return $cParts;
		}

			// Ask pdfinfo for the page count:
		$infoLines = array();
		exec($this->app['pdfinfo'].' '.escapeshellarg($absFile), $infoLines);
		$pdfInfo = $this->splitPdfInfo($infoLines);
		$pageCount = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

		if ($pageCount) {
			$cParts = array();

				// Number of parts: fixed pages per part (pdf_mode > 0) or a
				// maximum number of parts (abs(pdf_mode)), at most one part per page
			if ($this->pdf_mode>0) {
				$iter = ceil($pageCount/$this->pdf_mode);
			} else {
				$iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pageCount);
			}

				// Distribute the pages evenly. Multiplying before dividing keeps the
				// numerator an exact integer, so floor() cannot lose a page to rounding.
			for ($a=0;$a<$iter;$a++) {
				$low = floor($a*$pageCount/$iter)+1;
				$high = floor(($a+1)*$pageCount/$iter);
				$cParts[] = $low.'-'.$high;
			}
		}

		return $cParts;
	}

	/**
	 * Converts "Key: value" lines (as printed by pdfinfo) into an associative
	 * array. Keys are lower-cased and trimmed, values are trimmed. Lines without
	 * a colon or with an empty key are skipped; only the first colon splits.
	 *
	 * @param	array		Output lines of pdfinfo
	 * @return	array		Key/value pairs; empty array if the input is not an array
	 */
	function splitPdfInfo($pdfInfoArray) {
		$info = array();
		if (!is_array($pdfInfoArray)) {
			return $info;
		}

		foreach ($pdfInfoArray as $line) {
			$pair = explode(':', $line, 2);
			if (count($pair) < 2) {
				continue;
			}
			$key = trim($pair[0]);
			if ($key) {
				$info[strtolower($key)] = trim($pair[1]);
			}
		}

		return $info;
	}

	/**
	 * Removes trailing line feeds and form feeds from a string and trims the
	 * remaining whitespace at both ends.
	 *
	 * @param	string		String to clean up
	 * @return	string		Cleaned string
	 */
	function removeEndJunk($string) {
			// chr(10) = line feed, chr(12) = form feed
		return trim(rtrim((string)$string, chr(10).chr(12)));
	}

	/**
	 * Returns the icon reference for a file extension. "htm" uses the "html"
	 * icon and "jpeg" uses the "jpg" icon.
	 *
	 * @param	string		File extension
	 * @return	string		"EXT:" reference to the GIF icon of the file type
	 */
	function getIcon($extension) {
		if ($extension=='htm') {
			$extension = 'html';
		} elseif ($extension=='jpeg') {
			$extension = 'jpg';
		}

		return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
	}
}
00633
// XCLASS hook: if an extending class file is registered for this file in
// $TYPO3_CONF_VARS (for the current TYPO3_MODE), include it once.
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
	}
}
00637 ?>