00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00075 class tx_indexed_search_extparse {
00076
00077
00078 var $pdf_mode = -20;
00079
00080
00081 var $app = array();
00082 var $ext2itemtype_map = array();
00083 var $supportedExtensions = array();
00084
00085 var $pObj;
00086
00087
/**
 * Prepares the parser for one file extension.
 *
 * Reads the indexed_search extension configuration, honours the configured
 * ignore list and, for formats that need an external program, verifies that
 * the program exists in the configured directory. On success the tool path(s)
 * are stored in $this->app and the extension is registered in
 * $this->supportedExtensions and $this->ext2itemtype_map.
 *
 * @param	string		File extension; it is compared against lowercase names such as "pdf" or "doc"
 * @return	boolean		TRUE if the extension is supported and usable, FALSE if it is on the ignore list. Nothing (NULL) is returned when the extension is unknown, disabled or its tool was not found.
 */
function initParser($extension) {

		// Extension configuration is stored as a serialized array:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Executables carry an ".exe" suffix on Windows:
	$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
	$extOK = FALSE;
	$mainExtension = '';

		// Extensions on the (lowercased) ignore list are rejected right away:
	$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), 1);
	if (in_array($extension, $ignoreExtensions)) {
		$this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.', 1);
		return FALSE;
	}

		// Formats served by exactly one external binary only pick their parameters in the
		// switch below; the actual check is done once afterwards. $tool is both the
		// configuration key and the name of the binary.
	$tool = '';
	$fileKind = '';
	$disabledMessage = '';

	switch ($extension) {
		case 'pdf':
				// PDF needs two binaries (pdftotext and pdfinfo) from the same directory
			if ($indexerConfig['pdftools']) {
					// Strip one trailing slash (if any), then add exactly one
				$pdfPath = preg_replace('/\/$/', '', $indexerConfig['pdftools']).'/';
					// Under safe_mode the existence check is skipped and the tools are trusted to be there
				if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe))) {
					$this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
					$this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;
						// PDF mode is clamped to the range -100..100
					$this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'], -100, 100);
					$extOK = TRUE;
				} else {
					$this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'", 3);
				}
			} else {
				$this->pObj->log_setTSlogMessage('PDF tools disabled', 1);
			}
		break;
		case 'doc':
			$tool = 'catdoc';
			$fileKind = 'Word-files';
			$disabledMessage = 'catdoc tools (Word-files) disabled';
		break;
		case 'pps':
		case 'ppt':
			$tool = 'ppthtml';
			$fileKind = 'Powerpoint-files';
			$disabledMessage = 'ppthtml tools (Powerpoint-files) disabled';
		break;
		case 'xls':
			$tool = 'xlhtml';
			$fileKind = 'Excel-files';
			$disabledMessage = 'xlhtml tools (Excel-files) disabled';
		break;
		case 'sxc':
		case 'sxi':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			$tool = 'unzip';
			$fileKind = 'OpenOffice.org-files';
			$disabledMessage = 'unzip tool (OpenOffice.org-files) disabled';
		break;
		case 'rtf':
			$tool = 'unrtf';
			$fileKind = 'RTF-files';
			$disabledMessage = 'unrtf tool (RTF-files) disabled';
		break;
		case 'txt':
		case 'csv':
		case 'xml':
		case 'tif':
				// Read directly by PHP, no external tool required
			$extOK = TRUE;
		break;
		case 'html':
		case 'htm':
			$extOK = TRUE;
			$mainExtension = 'html';
		break;
		case 'jpg':
		case 'jpeg':
			$extOK = TRUE;
			$mainExtension = 'jpeg';
		break;
	}

		// Common check for the single-binary formats selected above:
	if ($tool !== '') {
		if ($indexerConfig[$tool]) {
				// Strip one trailing slash (if any), then add exactly one
			$toolPath = preg_replace('/\/$/', '', $indexerConfig[$tool]).'/';
				// Under safe_mode the existence check is skipped and the tool is trusted to be there
			if (ini_get('safe_mode') || @is_file($toolPath.$tool.$exe)) {
				$this->app[$tool] = $toolPath.$tool.$exe;
				$extOK = TRUE;
			} else {
				$this->pObj->log_setTSlogMessage("'".$tool."' tool for reading ".$fileKind." was not found in path '".$toolPath.$tool."'", 3);
			}
		} else {
			$this->pObj->log_setTSlogMessage($disabledMessage, 1);
		}
	}

		// Register the extension; aliases (htm, jpg) map to their main item type:
	if ($extOK) {
		$this->supportedExtensions[$extension] = TRUE;
		$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
		return TRUE;
	}
}
00206
/**
 * Lightweight check whether an extension belongs to the set of file types
 * this class knows how to handle. No configuration is read and no external
 * tool is looked up here.
 *
 * @param	string		File extension
 * @return	boolean		TRUE if the extension is in the known list; nothing (NULL) otherwise
 */
function softInit($extension) {
	$knownExtensions = array(
		'pdf', 'doc', 'pps', 'ppt', 'xls',
		'sxc', 'sxi', 'sxw', 'ods', 'odp', 'odt',
		'rtf', 'txt', 'html', 'htm', 'csv', 'xml',
		'jpg', 'jpeg', 'tif'
	);

		// Non-strict lookup on purpose
	if (in_array($extension, $knownExtensions)) {
		return TRUE;
	}
}
00240
/**
 * Returns the media-type title to use for a file extension.
 *
 * Formats that rely on an external tool only get a title when the matching
 * configuration option is set. A few formats read directly are labelled with
 * their uppercased extension. The aliases "htm" and "jpg" deliberately yield
 * no title, as does any unknown extension.
 *
 * @param	string		File extension
 * @return	mixed		Title string; FALSE if the extension is on the ignore list; nothing (NULL) if no title applies
 */
function searchTypeMediaTitle($extension) {

		// Extension configuration is stored as a serialized array:
	$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

		// Ignored extensions never get a title:
	$ignored = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']), 1);
	if (in_array($extension, $ignored)) {
		return FALSE;
	}

		// Tool-dependent formats: list of extensions, configuration key that enables them, title
	$toolFormats = array(
		array(array('pdf'), 'pdftools', 'PDF'),
		array(array('doc'), 'catdoc', 'MS Word'),
		array(array('pps', 'ppt'), 'ppthtml', 'MS Powerpoint'),
		array(array('xls'), 'xlhtml', 'MS Excel'),
		array(array('sxc', 'sxi', 'sxw', 'ods', 'odp', 'odt'), 'unzip', 'Open Office'),
		array(array('rtf'), 'unrtf', 'RTF')
	);
	foreach ($toolFormats as $format) {
		list($extensions, $configKey, $title) = $format;
		if (in_array($extension, $extensions)) {
			if ($indexerConfig[$configKey]) {
				return $title;
			}
				// Tool not configured: no title for this format
			return NULL;
		}
	}

		// Formats that need no tool are labelled by their uppercased extension:
	if (in_array($extension, array('html', 'jpeg', 'txt', 'csv', 'xml', 'tif'))) {
		return strtoupper($extension);
	}

		// "htm", "jpg" and everything else: no title (NULL)
}
00316
/**
 * Tells whether files of the given extension are treated as multi-page
 * documents. Only "pdf" qualifies.
 *
 * @param	string		File extension (cast to string before comparison)
 * @return	boolean		TRUE for "pdf"; nothing (NULL) for any other extension
 */
function isMultiplePageExtension($extension) {
	if ((string)$extension === 'pdf') {
		return TRUE;
	}
}
00331
00332
00333
00334
00335
00336
00337
00338
00339
00340
00341
00342
00343
00344
00345
/**
 * Reads the content of a file and returns it as a content array for indexing.
 *
 * Depending on the extension, the text is obtained from an external tool
 * (paths in $this->app, set up by initParser()) or read directly, converted
 * to UTF-8 where needed and split by the parent object's
 * splitRegularContent()/splitHTMLContent().
 *
 * File names are always passed to the shell through escapeshellarg(), and
 * every exec() call gets its own output array because exec() appends to an
 * array that already holds elements.
 *
 * @param	string		File extension, e.g. "pdf" or "doc"
 * @param	string		Absolute path of the file to read
 * @param	string		Content part key; for PDF a page range "low-high" as produced by fileContentParts()
 * @return	mixed		Content array from the parent object's split method; FALSE if the extension is unsupported; NULL if nothing could be extracted
 */
function readFileContent($ext,$absFile,$cPKey) {
	$contentArr = NULL;

		// Only extensions that were accepted by initParser() are processed:
	if (empty($this->supportedExtensions[$ext])) {
		return FALSE;
	}

		// The file name is handed to external programs as a single, shell-safe argument:
	$fileArg = escapeshellarg($absFile);

	switch ($ext) {
		case 'pdf':
			if (!empty($this->app['pdfinfo'])) {
					// Ask pdfinfo for the document properties; text is only extracted if it reports pages
				$infoLines = array();
				exec($this->app['pdfinfo'].' '.$fileArg, $infoLines);
				$pdfInfo = $this->splitPdfInfo($infoLines);
				if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
						// Page bounds are forced to integers before they reach the command line
					$range = explode('-', $cPKey, 2);
					$low = intval($range[0]);
					$high = isset($range[1]) ? intval($range[1]) : 0;

						// pdftotext writes into a temporary file; the placeholder created by
						// tempnam() is removed first so that is_file() below reflects whether
						// pdftotext actually produced output
					$tempFileName = t3lib_div::tempnam('Typo3_indexer');
					@unlink($tempFileName);
					$cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.$fileArg.' '.escapeshellarg($tempFileName);
					exec($cmd);

					$content = '';
					if (@is_file($tempFileName)) {
						$content = t3lib_div::getUrl($tempFileName);
						unlink($tempFileName);
					} else {
						$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
					}
					if (strlen($content)) {
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			}
		break;
		case 'doc':
			if (!empty($this->app['catdoc'])) {
				$output = array();
				exec($this->app['catdoc'].' -d utf-8 '.$fileArg, $output);
				$content = implode(chr(10), $output);
				$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
			}
		break;
		case 'pps':
		case 'ppt':
			if (!empty($this->app['ppthtml'])) {
				$output = array();
				exec($this->app['ppthtml'].' '.$fileArg, $output);
				$content = implode(chr(10), $output);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					// The file name is used as title for presentations
				$contentArr['title'] = basename($absFile);
			}
		break;
		case 'xls':
			if (!empty($this->app['xlhtml'])) {
				$output = array();
				exec($this->app['xlhtml'].' -nc -te '.$fileArg, $output);
				$content = implode(chr(10), $output);
				$content = $this->pObj->convertHTMLToUtf8($content);
				$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					// The file name is used as title for spreadsheets
				$contentArr['title'] = basename($absFile);
			}
		break;
		case 'sxi':
		case 'sxc':
		case 'sxw':
		case 'ods':
		case 'odp':
		case 'odt':
			if (!empty($this->app['unzip'])) {
					// Read content.xml from the archive:
				$contentLines = array();
				exec($this->app['unzip'].' -p '.$fileArg.' content.xml', $contentLines);
				$content_xml = implode(chr(10), $contentLines);

					// Read meta.xml into a fresh array; reusing the array above would make
					// exec() append and mix both documents together
				$metaLines = array();
				exec($this->app['unzip'].' -p '.$fileArg.' meta.xml', $metaLines);
				$meta_xml = implode(chr(10), $metaLines);

					// A space is put in front of every tag so that words of adjacent
					// elements do not run together once the tags are stripped
				$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml)));
				$contentArr = $this->pObj->splitRegularContent($utf8_content);
				$contentArr['title'] = basename($absFile);	// Default title, may be replaced from the meta data below

					// Meta information: title, description and keywords
					// NOTE(review): xml2tree() presumably returns a non-array value on parse errors - confirm
				$metaTree = t3lib_div::xml2tree($meta_xml);
				$metaContent = is_array($metaTree) ? $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'] : NULL;
				if (is_array($metaContent)) {
					$contentArr['title'] = $metaContent['dc:title'][0]['values'][0] ? $metaContent['dc:title'][0]['values'][0] : $contentArr['title'];
					$contentArr['description'] = $metaContent['dc:subject'][0]['values'][0].' '.$metaContent['dc:description'][0]['values'][0];

						// Keywords are collected into one space-separated string:
					if (is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
						foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
							$contentArr['keywords'].= $kwDat['values'][0].' ';
						}
					}
				}
			}
		break;
		case 'rtf':
			if (!empty($this->app['unrtf'])) {
				$output = array();
				exec($this->app['unrtf'].' '.$fileArg, $output);
				$fileContent = implode(chr(10), $output);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			}
		break;
		case 'txt':
		case 'csv':
				// Plain text is read directly and converted with 'iso-8859-1' passed as charset
			$content = t3lib_div::getUrl($absFile);
			$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
			$contentArr = $this->pObj->splitRegularContent($content);
			$contentArr['title'] = basename($absFile);
		break;
		case 'html':
		case 'htm':
			$fileContent = t3lib_div::getUrl($absFile);
			$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
			$contentArr = $this->pObj->splitHTMLContent($fileContent);
		break;
		case 'xml':
			$fileContent = t3lib_div::getUrl($absFile);

				// Look for an encoding="..." attribute in the XML declaration within the
				// first 200 bytes; fall back to utf-8 when none is found
			$reg = array();
			preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent,0,200), $reg);
			$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

				// Strip the markup (keeping words apart) and convert to UTF-8:
			$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
			$contentArr = $this->pObj->splitRegularContent($fileContent);
			$contentArr['title'] = basename($absFile);
		break;
		case 'jpg':
		case 'jpeg':
		case 'tif':
				// Images are indexed by their EXIF comment and image description only.
				// Without the exif functions the image is still indexed, by file name.
			$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
			$comment = '';
			if ($exif) {
				$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
				$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
				$comment = trim($exifComment.' '.$exifDescription);
			}
			$contentArr = $this->pObj->splitRegularContent($comment);
			$contentArr['title'] = basename($absFile);
		break;
		default:
			return false;
		break;
	}

		// If no title was found, derive one from the file name (underscores become spaces):
	if (is_array($contentArr) && !$contentArr['title']) {
		$contentArr['title'] = str_replace('_',' ',basename($absFile));
	}

	return $contentArr;
}
00512
/**
 * Returns the list of content parts a file is split into for indexing.
 *
 * For every extension except "pdf" the result is array(0). For PDF files the
 * page count is taken from pdfinfo and the document is divided into page
 * ranges "low-high" according to $this->pdf_mode: a positive value is the
 * number of pages per part, zero or a negative value gives the number of
 * parts (at least one, at most one per page).
 *
 * @param	string		File extension
 * @param	string		Absolute path of the file
 * @return	array		array(0), or for PDF files with a known page count a list of "low-high" page range strings
 */
function fileContentParts($ext,$absFile) {
	$cParts = array(0);
	switch ($ext) {
		case 'pdf':
				// Without a configured pdfinfo binary nothing may be executed: the command
				// line would otherwise start with the file name itself
			if (empty($this->app['pdfinfo'])) {
				break;
			}

				// Read the document properties; the file name is passed shell-escaped
			$infoLines = array();
			exec($this->app['pdfinfo'].' '.escapeshellarg($absFile), $infoLines);
			$pdfInfo = $this->splitPdfInfo($infoLines);
			$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

			if ($pages) {
				$cParts = array();

					// Number of parts to split the document into:
				if ($this->pdf_mode>0) {
					$iter = ceil($pages/$this->pdf_mode);
				} else {
					$iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pages);
				}

					// Distribute the pages evenly over the parts. Multiplying before dividing
					// keeps the bounds independent of how pages/iter is rounded, so the last
					// part always ends on the last page.
				for ($a=0;$a<$iter;$a++) {
					$low = intval(floor(($a*$pages)/$iter))+1;
					$high = intval(floor((($a+1)*$pages)/$iter));
					$cParts[] = $low.'-'.$high;
				}
			}
		break;
	}
	return $cParts;
}
00551
/**
 * Turns "Label: value" lines (as passed in from pdfinfo output) into an
 * associative array. Each line is split at its first colon; the label is
 * trimmed and lowercased to form the key, the trimmed rest is the value.
 * Lines without a colon or with an empty label are skipped.
 *
 * @param	array		Lines of "Label: value" text
 * @return	array		Lowercased label => value; empty array if the input is not an array
 */
function splitPdfInfo($pdfInfoArray) {
	$info = array();
	if (!is_array($pdfInfoArray)) {
		return $info;
	}

	foreach ($pdfInfoArray as $row) {
			// Split at the first colon only, values may contain colons themselves
		$pair = explode(':', $row, 2);
		if (count($pair) < 2) {
			continue;
		}
		$label = trim($pair[0]);
		if (!$label) {
			continue;
		}
		$info[strtolower($label)] = trim($pair[1]);
	}

	return $info;
}
00572
/**
 * Removes the run of line feeds and form feeds at the very end of a string
 * and then trims surrounding whitespace.
 *
 * Uses PCRE instead of the POSIX ereg functions, which no longer exist in
 * PHP 7 and later.
 *
 * @param	string		String to clean up
 * @return	string		String without trailing line feed/form feed characters and without surrounding whitespace
 */
function removeEndJunk($string) {
		// \n = chr(10), \f = chr(12); \z anchors at the absolute end of the string
	return trim(preg_replace('/[\n\f]*\z/', '', $string));
}
00582
00583
00584
00585
00586
00587
00588
00589
00590
00591
00592
00593
00594
00595
00596
00597
00598
00599
/**
 * Builds the "EXT:" resource path of the icon for a file extension.
 * The aliases "htm" and "jpeg" share the icons of "html" and "jpg".
 *
 * @param	string		File extension
 * @return	string		Path of the form EXT:indexed_search/pi/res/[extension].gif
 */
function getIcon($extension) {
	switch ($extension) {
		case 'htm':
			$iconName = 'html';
		break;
		case 'jpeg':
			$iconName = 'jpg';
		break;
		default:
			$iconName = $extension;
		break;
	}
	return 'EXT:indexed_search/pi/res/'.$iconName.'.gif';
}
00611 }
00612
// If a file is registered under the XCLASS key for this script in the
// configuration of the current TYPO3_MODE, include it once.
// NOTE(review): presumably this lets an installation extend the class above - confirm.
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
	}
}
00616 ?>