00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
/**
 * External parsers for the indexed_search extension.
 *
 * Extracts indexable text from non-TYPO3 files (PDF, MS Office, OpenOffice.org,
 * RTF, plain text, HTML, XML and EXIF data of images), partly by calling
 * external command line tools whose directories come from the extension
 * configuration in $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'].
 */
class tx_indexed_search_extparse {

    /**
     * Controls how fileContentParts() cuts a PDF into page ranges:
     * a value > 0 is the number of pages per range, any other value is
     * (as absolute value) the number of ranges. Overwritten by initParser('pdf')
     * from the "pdf_mode" setting, limited to -100..100.
     */
    var $pdf_mode = -20;

    /**
     * Full paths of the external tools found by initParser(), keyed by tool name
     * ('pdfinfo', 'pdftotext', 'catdoc', 'ppthtml', 'xlhtml', 'unzip', 'unrtf').
     */
    var $app = array();

    /**
     * Maps an initialised extension to its main item type ('htm' => 'html', 'jpg' => 'jpeg',
     * everything else maps to itself).
     */
    var $ext2itemtype_map = array();

    /**
     * Extensions for which initParser() succeeded (extension => TRUE).
     */
    var $supportedExtensions = array();

    /**
     * Parent indexer object. It is never assigned inside this class, so the code creating
     * the parser is expected to set it. The methods below call log_setTSlogMessage(),
     * splitRegularContent(), splitHTMLContent(), convertHTMLToUtf8() and
     * csObj->parse_charset() on it.
     */
    var $pObj;

    /**
     * Initializes the parser for one file extension: checks that the extension is not
     * ignored and that the external tool it needs (if any) is configured and present.
     *
     * On success the extension is registered in $this->supportedExtensions and
     * $this->ext2itemtype_map, and tool paths are stored in $this->app.
     *
     * @param string File extension (lowercase, without dot)
     * @return boolean TRUE if the extension can be parsed, otherwise FALSE
     */
    function initParser($extension) {

            // Read the extension configuration; fall back to an empty array if it is missing or broken
        $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
        if (!is_array($indexerConfig)) {
            $indexerConfig = array();
        }

            // External tools carry an ".exe" suffix on Windows
        $exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
        $extOK = FALSE;
        $mainExtension = '';

            // Extensions explicitly excluded by configuration are never parsed
        $ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
        $ignoreExtensions = t3lib_div::trimExplode(',', $ignoreList, 1);
        if (in_array($extension, $ignoreExtensions)) {
            $this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);
            return FALSE;
        }

            // For every tool: strip one trailing slash from the configured directory, re-append it,
            // and accept the tool if the binary exists (the file check is skipped under safe_mode).
        switch($extension) {
            case 'pdf':
                    // PDF needs both "pdftotext" and "pdfinfo"
                if (!empty($indexerConfig['pdftools'])) {
                    $pdfPath = preg_replace('/\/$/', '', $indexerConfig['pdftools']).'/';
                    if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe))) {
                        $this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
                        $this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;

                            // A missing "pdf_mode" setting is passed on as 0
                        $pdfMode = isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0;
                        $this->pdf_mode = t3lib_div::intInRange($pdfMode,-100,100);
                        $extOK = TRUE;
                    } else {
                        $this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
                    }
                } else {
                    $this->pObj->log_setTSlogMessage('PDF tools disabled',1);
                }
            break;
            case 'doc':
                    // MS Word via "catdoc"
                if (!empty($indexerConfig['catdoc'])) {
                    $catdocPath = preg_replace('/\/$/', '', $indexerConfig['catdoc']).'/';
                    if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe)) {
                        $this->app['catdoc'] = $catdocPath.'catdoc'.$exe;
                        $extOK = TRUE;
                    } else {
                        $this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in path '".$catdocPath."catdoc'",3);
                    }
                } else {
                    $this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
                }
            break;
            case 'pps':
            case 'ppt':
                    // MS Powerpoint via "ppthtml"
                if (!empty($indexerConfig['ppthtml'])) {
                    $ppthtmlPath = preg_replace('/\/$/', '', $indexerConfig['ppthtml']).'/';
                    if (ini_get('safe_mode') || @is_file($ppthtmlPath.'ppthtml'.$exe)) {
                        $this->app['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
                        $extOK = TRUE;
                    } else {
                        $this->pObj->log_setTSlogMessage("'ppthtml' tool for reading Powerpoint-files was not found in path '".$ppthtmlPath."ppthtml'",3);
                    }
                } else {
                    $this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
                }
            break;
            case 'xls':
                    // MS Excel via "xlhtml"
                if (!empty($indexerConfig['xlhtml'])) {
                    $xlhtmlPath = preg_replace('/\/$/', '', $indexerConfig['xlhtml']).'/';
                    if (ini_get('safe_mode') || @is_file($xlhtmlPath.'xlhtml'.$exe)) {
                        $this->app['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
                        $extOK = TRUE;
                    } else {
                        $this->pObj->log_setTSlogMessage("'xlhtml' tool for reading Excel-files was not found in path '".$xlhtmlPath."xlhtml'",3);
                    }
                } else {
                    $this->pObj->log_setTSlogMessage('xlhtml tools (Excel-files) disabled',1);
                }
            break;
            case 'sxc':
            case 'sxi':
            case 'sxw':
            case 'ods':
            case 'odp':
            case 'odt':
                    // OpenOffice.org / OpenDocument files are read with "unzip"
                if (!empty($indexerConfig['unzip'])) {
                    $unzipPath = preg_replace('/\/$/', '', $indexerConfig['unzip']).'/';
                    if (ini_get('safe_mode') || @is_file($unzipPath.'unzip'.$exe)) {
                        $this->app['unzip'] = $unzipPath.'unzip'.$exe;
                        $extOK = TRUE;
                    } else {
                        $this->pObj->log_setTSlogMessage("'unzip' tool for reading OpenOffice.org-files was not found in path '".$unzipPath."unzip'",3);
                    }
                } else {
                    $this->pObj->log_setTSlogMessage('unzip tool (OpenOffice.org-files) disabled',1);
                }
            break;
            case 'rtf':
                    // RTF via "unrtf"
                if (!empty($indexerConfig['unrtf'])) {
                    $unrtfPath = preg_replace('/\/$/', '', $indexerConfig['unrtf']).'/';
                    if (ini_get('safe_mode') || @is_file($unrtfPath.'unrtf'.$exe)) {
                        $this->app['unrtf'] = $unrtfPath.'unrtf'.$exe;
                        $extOK = TRUE;
                    } else {
                        $this->pObj->log_setTSlogMessage("'unrtf' tool for reading RTF-files was not found in path '".$unrtfPath."unrtf'",3);
                    }
                } else {
                    $this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
                }
            break;
            case 'txt':
            case 'csv':
            case 'xml':
            case 'tif':
                    // Handled without external tools
                $extOK = TRUE;
            break;
            case 'html':
            case 'htm':
                    // Both extensions are registered under the item type "html"
                $extOK = TRUE;
                $mainExtension = 'html';
            break;
            case 'jpg':
            case 'jpeg':
                    // Both extensions are registered under the item type "jpeg"
                $extOK = TRUE;
                $mainExtension = 'jpeg';
            break;
        }

            // Register the extension if it turned out to be usable
        if ($extOK) {
            $this->supportedExtensions[$extension] = TRUE;
            $this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
            return TRUE;
        }

        return FALSE;
    }

    /**
     * Lightweight check whether this class knows the extension at all.
     * Does not read configuration and does not look for external tools.
     *
     * @param string File extension
     * @return boolean TRUE if the extension is one this class can handle in principle, otherwise FALSE
     */
    function softInit($extension) {
        switch($extension) {
            case 'pdf':
            case 'doc':
            case 'pps':
            case 'ppt':
            case 'xls':
            case 'sxc':
            case 'sxi':
            case 'sxw':
            case 'ods':
            case 'odp':
            case 'odt':
            case 'rtf':
            case 'txt':
            case 'html':
            case 'htm':
            case 'csv':
            case 'xml':
            case 'jpg':
            case 'jpeg':
            case 'tif':
                return TRUE;
            break;
        }

        return FALSE;
    }

    /**
     * Returns a human-readable media title for an extension.
     *
     * A title is only returned if the extension is not ignored and the tool it needs is
     * configured. The alias extensions 'htm' and 'jpg' deliberately yield no title
     * (presumably so each media type shows up once under its main extension - confirm
     * against the caller).
     *
     * @param string File extension
     * @return mixed Title string, or FALSE if there is no title for this extension
     */
    function searchTypeMediaTitle($extension) {

            // Read the extension configuration; fall back to an empty array if it is missing or broken
        $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
        if (!is_array($indexerConfig)) {
            $indexerConfig = array();
        }

            // Ignored extensions have no title
        $ignoreList = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
        $ignoreExtensions = t3lib_div::trimExplode(',', $ignoreList, 1);
        if (in_array($extension, $ignoreExtensions)) {
            return FALSE;
        }

        switch($extension) {
            case 'pdf':
                if (!empty($indexerConfig['pdftools'])) {
                    return 'PDF';
                }
            break;
            case 'doc':
                if (!empty($indexerConfig['catdoc'])) {
                    return 'MS Word';
                }
            break;
            case 'pps':
            case 'ppt':
                if (!empty($indexerConfig['ppthtml'])) {
                    return 'MS Powerpoint';
                }
            break;
            case 'xls':
                if (!empty($indexerConfig['xlhtml'])) {
                    return 'MS Excel';
                }
            break;
            case 'sxc':
            case 'sxi':
            case 'sxw':
            case 'ods':
            case 'odp':
            case 'odt':
                if (!empty($indexerConfig['unzip'])) {
                    return 'Open Office';
                }
            break;
            case 'rtf':
                if (!empty($indexerConfig['unrtf'])) {
                    return 'RTF';
                }
            break;
            case 'html':
            case 'jpeg':
            case 'txt':
            case 'csv':
            case 'xml':
            case 'tif':
                    // No tool needed: the title is simply the upper-cased extension
                return strtoupper($extension);
            break;
            case 'htm':
            case 'jpg':
            default:
                    // Aliases and unknown extensions: no title
            break;
        }

        return FALSE;
    }

    /**
     * Tells whether files of this extension are indexed in several parts (page ranges).
     * Currently this is the case for PDF only.
     *
     * @param string File extension
     * @return boolean TRUE for multi-part extensions, otherwise FALSE
     */
    function isMultiplePageExtension($extension) {
        switch((string)$extension) {
            case 'pdf':
                return TRUE;
            break;
        }

        return FALSE;
    }

    /**
     * Reads the content of an external file and splits it into the content array
     * produced by the parent indexer's split methods.
     *
     * The extension must have been initialised with initParser() before.
     * File names are passed to the external tools through escapeshellarg().
     * NOTE(review): escapeshellarg() depends on the current locale for multi-byte
     * file names - confirm the locale setup where non-ASCII file names are indexed.
     *
     * @param string File extension, e.g. "pdf", "doc" etc.
     * @param string Absolute filename of the file to read
     * @param string Part key; for PDF a page range "low-high" as built by fileContentParts(), unused otherwise
     * @return mixed Content array (with a 'title' entry ensured), FALSE if the extension is not supported, NULL if nothing could be extracted
     */
    function readFileContent($ext,$absFile,$cPKey) {
        $contentArr = NULL;

            // Only extensions that passed initParser() are read
        if (empty($this->supportedExtensions[$ext])) {
            return FALSE;
        }

        switch ($ext) {
            case 'pdf':
                if (!empty($this->app['pdfinfo'])) {
                        // Ask pdfinfo for the page count first; without pages nothing is extracted
                    $res = array();
                    exec($this->app['pdfinfo'].' '.escapeshellarg($absFile), $res);
                    $pdfInfo = $this->splitPdfInfo($res);
                    if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
                            // Page bounds are forced to integers before they reach the command line
                        $range = explode('-', $cPKey);
                        $low = intval($range[0]);
                        $high = isset($range[1]) ? intval($range[1]) : 0;

                            // Get a unique temp file name and remove the placeholder, so that the
                            // existence of the file afterwards shows that pdftotext wrote it
                        $tempFileName = t3lib_div::tempnam('Typo3_indexer');
                        @unlink($tempFileName);
                        $cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.escapeshellarg($absFile).' '.escapeshellarg($tempFileName);
                        exec($cmd);

                        $content = '';
                        if (@is_file($tempFileName)) {
                            $content = t3lib_div::getUrl($tempFileName);
                            unlink($tempFileName);
                        } else {
                            $this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
                        }
                        if (strlen($content)) {
                            $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                        }
                    }
                }
            break;
            case 'doc':
                if (!empty($this->app['catdoc'])) {
                    $res = array();
                    exec($this->app['catdoc'].' -d utf-8 '.escapeshellarg($absFile), $res);
                    $content = implode(chr(10), $res);
                    $contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
                }
            break;
            case 'pps':
            case 'ppt':
                if (!empty($this->app['ppthtml'])) {
                    $res = array();
                    exec($this->app['ppthtml'].' '.escapeshellarg($absFile), $res);
                    $content = implode(chr(10), $res);
                    $content = $this->pObj->convertHTMLToUtf8($content);
                    $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                        // The file name always replaces whatever title the HTML contained
                    $contentArr['title'] = basename($absFile);
                }
            break;
            case 'xls':
                if (!empty($this->app['xlhtml'])) {
                    $res = array();
                    exec($this->app['xlhtml'].' -nc -te '.escapeshellarg($absFile), $res);
                    $content = implode(chr(10), $res);
                    $content = $this->pObj->convertHTMLToUtf8($content);
                    $contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
                        // The file name always replaces whatever title the HTML contained
                    $contentArr['title'] = basename($absFile);
                }
            break;
            case 'sxi':
            case 'sxc':
            case 'sxw':
            case 'ods':
            case 'odp':
            case 'odt':
                if (!empty($this->app['unzip'])) {
                        // Pipe content.xml out of the archive
                    $res = array();
                    exec($this->app['unzip'].' -p '.escapeshellarg($absFile).' content.xml', $res);
                    $content_xml = implode(chr(10), $res);

                        // Pipe meta.xml out of the archive
                    $res = array();
                    exec($this->app['unzip'].' -p '.escapeshellarg($absFile).' meta.xml', $res);
                    $meta_xml = implode(chr(10), $res);

                        // A space is put in front of every tag so that words of adjacent
                        // elements do not run together once the tags are stripped
                    $utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml)));
                    $contentArr = $this->pObj->splitRegularContent($utf8_content);
                    $contentArr['title'] = basename($absFile);

                        // Meta information; skipped entirely if meta.xml could not be parsed into the expected tree
                    $metaTree = t3lib_div::xml2tree($meta_xml);
                    $metaContent = NULL;
                    if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
                        $metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
                    }
                    if (is_array($metaContent)) {
                        $metaTitle = isset($metaContent['dc:title'][0]['values'][0]) ? $metaContent['dc:title'][0]['values'][0] : '';
                        $metaSubject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
                        $metaDescription = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';

                            // The document title wins over the file name when it is set
                        if ($metaTitle) {
                            $contentArr['title'] = $metaTitle;
                        }
                        $contentArr['description'] = $metaSubject.' '.$metaDescription;

                            // Keywords are appended, each followed by a space
                        if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
                            if (!isset($contentArr['keywords'])) {
                                $contentArr['keywords'] = '';
                            }
                            foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
                                $contentArr['keywords'].= (isset($kwDat['values'][0]) ? $kwDat['values'][0] : '').' ';
                            }
                        }
                    }
                }
            break;
            case 'rtf':
                if (!empty($this->app['unrtf'])) {
                    $res = array();
                    exec($this->app['unrtf'].' '.escapeshellarg($absFile), $res);
                    $fileContent = implode(chr(10), $res);
                    $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                    $contentArr = $this->pObj->splitHTMLContent($fileContent);
                }
            break;
            case 'txt':
            case 'csv':
                $content = t3lib_div::getUrl($absFile);
                    // Plain text files are converted with iso-8859-1 given as charset
                $content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
                $contentArr = $this->pObj->splitRegularContent($content);
                $contentArr['title'] = basename($absFile);
            break;
            case 'html':
            case 'htm':
                $fileContent = t3lib_div::getUrl($absFile);
                $fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
                $contentArr = $this->pObj->splitHTMLContent($fileContent);
            break;
            case 'xml':
                $fileContent = t3lib_div::getUrl($absFile);

                    // Look for an encoding="..." attribute in the XML prologue (first 200 bytes); default is utf-8
                $reg = array();
                preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent,0,200), $reg);
                $charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

                    // Strip the tags (with a space in front of each) and convert to UTF-8
                $fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
                $contentArr = $this->pObj->splitRegularContent($fileContent);
                $contentArr['title'] = basename($absFile);
            break;
            case 'jpg':
            case 'jpeg':
            case 'tif':
                    // Images are indexed by their EXIF comment/description; without the exif
                    // extension they are indexed with an empty text and the file name as title
                $exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
                $comment = '';
                if ($exif) {
                    $exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
                    $exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
                    $comment = trim($exifComment.' '.$exifDescription);
                }
                $contentArr = $this->pObj->splitRegularContent($comment);
                $contentArr['title'] = basename($absFile);
            break;
            default:
                return false;
            break;
        }

            // Fall back to the file name (underscores turned into spaces) if no title was found
        if (is_array($contentArr) && empty($contentArr['title'])) {
            $contentArr['title'] = str_replace('_',' ',basename($absFile));
        }

        return $contentArr;
    }

    /**
     * Builds the list of part keys for a file. Each key is later passed to
     * readFileContent() as $cPKey. For PDF the keys are page ranges "low-high"
     * derived from the page count reported by pdfinfo and from $this->pdf_mode;
     * every other file (and a PDF without a page count) gets the single part 0.
     *
     * @param string File extension, e.g. "pdf", "doc" etc.
     * @param string Absolute filename of the file
     * @return array Part keys
     */
    function fileContentParts($ext,$absFile) {
        $cParts = array(0);
        switch ($ext) {
            case 'pdf':
                    // Without a configured pdfinfo there is nothing to execute
                if (empty($this->app['pdfinfo'])) {
                    break;
                }

                $res = array();
                exec($this->app['pdfinfo'].' '.escapeshellarg($absFile), $res);
                $pdfInfo = $this->splitPdfInfo($res);
                $pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

                if ($pages) {
                    $cParts = array();

                        // Number of ranges: either derived from "pages per range" (pdf_mode > 0)
                        // or given directly as abs(pdf_mode), limited to 1..page count
                    if ($this->pdf_mode>0) {
                        $iter = ceil($pages/$this->pdf_mode);
                    } else {
                        $iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pages);
                    }

                        // Spread the pages evenly over the ranges
                    for ($a=0;$a<$iter;$a++) {
                        $low = floor($a*($pages/$iter))+1;
                        $high = floor(($a+1)*($pages/$iter));
                        $cParts[] = $low.'-'.$high;
                    }
                }
            break;
        }
        return $cParts;
    }

    /**
     * Turns the output lines of pdfinfo ("Key: value") into an associative array.
     * Keys are trimmed and lower-cased, values are trimmed; only the first colon
     * of a line separates key and value. Lines without a colon or with an empty
     * key are skipped.
     *
     * @param array Output lines of pdfinfo
     * @return array Key/value pairs; empty if the input is not an array
     */
    function splitPdfInfo($pdfInfoArray) {
        $res = array();
        if (is_array($pdfInfoArray)) {
            foreach($pdfInfoArray as $line) {
                $parts = explode(':',$line,2);
                if (count($parts)>1 && trim($parts[0])) {
                    $res[strtolower(trim($parts[0]))] = trim($parts[1]);
                }
            }
        }
        return $res;
    }

    /**
     * Removes trailing line feeds and form feeds (chr(10), chr(12)) from a string
     * and trims the result.
     *
     * @param string String to clean up
     * @return string Cleaned up string
     */
    function removeEndJunk($string) {
        return trim(preg_replace('/[\n\f]*$/','',$string));
    }

    /**
     * Returns the icon reference for a file extension. The aliases 'htm' and 'jpeg'
     * share the icon of 'html' and 'jpg'.
     *
     * @param string File extension, e.g. "pdf", "gif" etc.
     * @return string "EXT:" reference to the icon file
     */
    function getIcon($extension) {
        if ($extension=='htm') $extension = 'html';
        if ($extension=='jpeg') $extension = 'jpg';
        return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
    }
}
00620
// XCLASS hook: when running inside TYPO3 and an extending class file is
// registered for this file, include it once.
if (defined('TYPO3_MODE')) {
    if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
        include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
    }
}
00624 ?>