Documentation TYPO3 par Ameos

class.external_parser.php

00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
/**
 * External file parsers for the indexed_search indexer.
 *
 * Decides per file extension whether a file type can be indexed (which may
 * require a command line tool configured in the extension configuration),
 * and extracts the textual content of such files. Content splitting, charset
 * conversion and logging are delegated to the parent indexer object ($pObj).
 */
class tx_indexed_search_extparse {

	/**
	 * How PDF files are split into separately indexed parts. The value is
	 * overridden from the extension configuration ("pdf_mode") when PDF
	 * support is initialized.
	 *
	 * - positive: upper limit for the number of pages per part; the pages are
	 *   spread evenly over ceil(pages / value) parts (value 5 on a 12 page
	 *   file gives 1-4, 5-8, 9-12).
	 * - zero or negative: the absolute value is the number of parts, at least
	 *   one and at most one per page (value -3 on a 10 page file gives
	 *   1-3, 4-6, 7-10; value 0 gives a single part).
	 */
	var $pdf_mode = -20;

		// These arrays are filled by initParser():
	var $app = array();					// tool name => path of the executable
	var $ext2itemtype_map = array();	// file extension => item type
	var $supportedExtensions = array();	// file extension => TRUE

	var $pObj;		// Reference to parent object (indexer class)


	/**
	 * Initializes the parser for one file extension: checks that the type is
	 * not ignored by configuration and, for types needing an external tool,
	 * that the tool is configured and present. On success the extension is
	 * registered in $this->supportedExtensions / $this->ext2itemtype_map and
	 * tool paths are stored in $this->app.
	 *
	 * @param	string		File extension (lowercase, without dot)
	 * @return	boolean		TRUE if the extension is supported, otherwise FALSE
	 */
	function initParser($extension) {
		$indexerConfig = $this->readIndexerConfig();

			// If windows, apply extension to tool name:
		$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
		$extOK = FALSE;
		$mainExtension = '';

			// Ignore extensions
		if ($this->isIgnoredExtension($extension, $indexerConfig)) {
			$this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);
			return FALSE;
		}

			// Under safe_mode the is_file() checks are skipped and the configured path is taken as-is.
		$safeMode = ini_get('safe_mode');

			// Switch on file extension:
		switch ($extension) {
			case 'pdf':
					// PDF (needs both pdftotext and pdfinfo)
				if (!empty($indexerConfig['pdftools'])) {
					$pdfPath = $this->toolDirectory($indexerConfig['pdftools']);
					if ($safeMode || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe))) {
						$this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
						$this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;
							// PDF mode:
						$this->pdf_mode = t3lib_div::intInRange(isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0, -100, 100);
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('PDF tools disabled',1);
				}
			break;
			case 'doc':
					// Catdoc
				if (!empty($indexerConfig['catdoc'])) {
					$catdocPath = $this->toolDirectory($indexerConfig['catdoc']);
					if ($safeMode || @is_file($catdocPath.'catdoc'.$exe)) {
						$this->app['catdoc'] = $catdocPath.'catdoc'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in path '".$catdocPath."catdoc'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
				}
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint
					// ppthtml
				if (!empty($indexerConfig['ppthtml'])) {
					$ppthtmlPath = $this->toolDirectory($indexerConfig['ppthtml']);
					if ($safeMode || @is_file($ppthtmlPath.'ppthtml'.$exe)) {
						$this->app['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'ppthtml' tool for reading Powerpoint-files was not found in path '".$ppthtmlPath."ppthtml'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
				}
			break;
			case 'xls':		// MS Excel
					// Xlhtml
				if (!empty($indexerConfig['xlhtml'])) {
					$xlhtmlPath = $this->toolDirectory($indexerConfig['xlhtml']);
					if ($safeMode || @is_file($xlhtmlPath.'xlhtml'.$exe)) {
						$this->app['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'xlhtml' tool for reading Excel-files was not found in path '".$xlhtmlPath."xlhtml'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('xlhtml tools (Excel-files) disabled',1);
				}
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'ods':		// Oasis OpenDocument Spreadsheet
			case 'odp':		// Oasis OpenDocument Presentation
			case 'odt':		// Oasis OpenDocument Text
					// unzip
				if (!empty($indexerConfig['unzip'])) {
					$unzipPath = $this->toolDirectory($indexerConfig['unzip']);
					if ($safeMode || @is_file($unzipPath.'unzip'.$exe)) {
						$this->app['unzip'] = $unzipPath.'unzip'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'unzip' tool for reading OpenOffice.org-files was not found in path '".$unzipPath."unzip'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('unzip tool (OpenOffice.org-files) disabled',1);
				}
			break;
			case 'rtf':
					// unrtf
				if (!empty($indexerConfig['unrtf'])) {
					$unrtfPath = $this->toolDirectory($indexerConfig['unrtf']);
					if ($safeMode || @is_file($unrtfPath.'unrtf'.$exe)) {
						$this->app['unrtf'] = $unrtfPath.'unrtf'.$exe;
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'unrtf' tool for reading RTF-files was not found in path '".$unrtfPath."unrtf'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
				}
			break;
			case 'txt':		// Raw text
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'tif':		// PHP EXIF
				$extOK = TRUE;
			break;
			case 'html':	// PHP strip-tags()
			case 'htm':		// PHP strip-tags()
				$extOK = TRUE;
				$mainExtension = 'html';	// making "html" the common "item_type"
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
				$extOK = TRUE;
				$mainExtension = 'jpeg';	// making "jpeg" the common item_type
			break;
		}

			// If extension was OK:
		if ($extOK) {
			$this->supportedExtensions[$extension] = TRUE;
			$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
			return TRUE;
		}

		return FALSE;
	}

	/**
	 * Lightweight check whether this class knows the file extension at all.
	 * No configuration is read and no tool is looked up.
	 *
	 * @param	string		File extension
	 * @return	boolean		TRUE if the extension is one handled by this class, otherwise FALSE
	 */
	function softInit($extension) {
		switch ($extension) {
			case 'pdf':		// PDF
			case 'doc':		// MS Word files
			case 'pps':		// MS PowerPoint
			case 'ppt':		// MS PowerPoint
			case 'xls':		// MS Excel
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'ods':		// Oasis OpenDocument Spreadsheet
			case 'odp':		// Oasis OpenDocument Presentation
			case 'odt':		// Oasis OpenDocument Text
			case 'rtf':		// RTF documents
			case 'txt':		// ASCII Text documents
			case 'html':	// HTML
			case 'htm':		// HTML
			case 'csv':		// Comma Separated Values
			case 'xml':		// Generic XML
			case 'jpg':		// Jpeg images (EXIF comment)
			case 'jpeg':	// Jpeg images (EXIF comment)
			case 'tif':		// TIF images (EXIF comment)
				return TRUE;
		}

		return FALSE;
	}

	/**
	 * Returns a human readable media type title for a file extension, or
	 * FALSE when the extension is ignored, its tool is not configured, or it
	 * is a duplicate of another extension ("htm", "jpg") and should get no
	 * entry of its own.
	 *
	 * @param	string		File extension
	 * @return	mixed		Title string, or FALSE if there is no title for this extension
	 */
	function searchTypeMediaTitle($extension) {
		$indexerConfig = $this->readIndexerConfig();

			// Ignore extensions
		if ($this->isIgnoredExtension($extension, $indexerConfig)) {
			return FALSE;
		}

			// Switch on file extension:
		switch ($extension) {
			case 'pdf':
				if (!empty($indexerConfig['pdftools'])) {
					return 'PDF';
				}
			break;
			case 'doc':
				if (!empty($indexerConfig['catdoc'])) {
					return 'MS Word';
				}
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint
				if (!empty($indexerConfig['ppthtml'])) {
					return 'MS Powerpoint';
				}
			break;
			case 'xls':		// MS Excel
				if (!empty($indexerConfig['xlhtml'])) {
					return 'MS Excel';
				}
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'ods':		// Oasis OpenDocument Spreadsheet
			case 'odp':		// Oasis OpenDocument Presentation
			case 'odt':		// Oasis OpenDocument Text
				if (!empty($indexerConfig['unzip'])) {
					return 'Open Office';
				}
			break;
			case 'rtf':
				if (!empty($indexerConfig['unrtf'])) {
					return 'RTF';
				}
			break;
			case 'html':	// PHP strip-tags()
			case 'jpeg':	// PHP EXIF
			case 'txt':		// Raw text
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'tif':		// PHP EXIF
				return strtoupper($extension);
				// NO entry (duplicates or blank):
			case 'htm':		// PHP strip-tags()
			case 'jpg':		// PHP EXIF
			default:
			break;
		}

		return FALSE;
	}

	/**
	 * Tells whether files of this extension are indexed in several parts
	 * (page intervals). Only PDF is.
	 *
	 * @param	string		File extension
	 * @return	boolean		TRUE for "pdf", otherwise FALSE
	 */
	function isMultiplePageExtension($extension) {
		return ((string)$extension === 'pdf');
	}


	/************************
	 *
	 * Reading documents (for parsing)
	 *
	 ************************/

	/**
	 * Reads the content of an external file and returns it split into the
	 * content array produced by the parent indexer's split functions.
	 *
	 * All file names handed to external tools are passed through
	 * escapeshellarg().
	 *
	 * @param	string		File extension, must have been accepted by initParser()
	 * @param	string		Absolute path of the file
	 * @param	string		Content part key; for PDF a page interval "low-high" as produced by fileContentParts()
	 * @return	mixed		Content array; FALSE if the extension is not supported; NULL if the tool was missing or no content could be extracted
	 */
	function readFileContent($ext,$absFile,$cPKey) {
		$contentArr = NULL;

			// Return immediately if initialization didn't set support up:
		if (empty($this->supportedExtensions[$ext])) {
			return FALSE;
		}

		$quotedFile = escapeshellarg($absFile);

			// Switch by file extension
		switch ($ext) {
			case 'pdf':
				if (!empty($this->app['pdfinfo'])) {
						// Getting pdf-info:
					$res = array();
					exec($this->app['pdfinfo'].' '.$quotedFile, $res);
					$pdfInfo = $this->splitPdfInfo($res);

					if (isset($pdfInfo['pages']) && intval($pdfInfo['pages'])) {
							// Page interval; only whole positive numbers are passed on to the tool.
						list($low,$high) = array_pad(explode('-', (string)$cPKey, 2), 2, 0);
						$pageRange = '';
						if (intval($low) > 0) {
							$pageRange.= ' -f '.intval($low);
						}
						if (intval($high) > 0) {
							$pageRange.= ' -l '.intval($high);
						}

							// Get pdf content:
						$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
						@unlink($tempFileName);	// Delete if exists, just to be safe.
						$res = array();
						exec($this->app['pdftotext'].$pageRange.' -enc UTF-8 -q '.$quotedFile.' '.escapeshellarg($tempFileName), $res);

						$content = '';
						if (@is_file($tempFileName)) {
							$content = (string)t3lib_div::getUrl($tempFileName);
							unlink($tempFileName);
						} else {
							$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
						}
						if (strlen($content)) {
							$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
						}
					}
				}
			break;
			case 'doc':
				if (!empty($this->app['catdoc'])) {
					$res = array();
					exec($this->app['catdoc'].' -d utf-8 '.$quotedFile, $res);
					$content = implode(chr(10),$res);
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
			break;
			case 'pps':
			case 'ppt':
				if (!empty($this->app['ppthtml'])) {
					$res = array();
					exec($this->app['ppthtml'].' '.$quotedFile, $res);
					$content = implode(chr(10),$res);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'xls':
				if (!empty($this->app['xlhtml'])) {
					$res = array();
					exec($this->app['xlhtml'].' -nc -te '.$quotedFile, $res);
					$content = implode(chr(10),$res);
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
			case 'ods':
			case 'odp':
			case 'odt':
				if (!empty($this->app['unzip'])) {
						// Read content.xml:
					$out = array();
					exec($this->app['unzip'].' -p '.$quotedFile.' content.xml', $out);
					$content_xml = implode(chr(10),$out);

						// Read meta.xml. exec() appends to the output array, so it must be
						// reset or meta.xml would be prefixed with the whole of content.xml.
					$out = array();
					exec($this->app['unzip'].' -p '.$quotedFile.' meta.xml', $out);
					$meta_xml = implode(chr(10),$out);

					$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml)));
					$contentArr = $this->pObj->splitRegularContent($utf8_content);
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

						// Meta information (xml2tree result is only used when it is an array with the expected branch)
					$metaTree = t3lib_div::xml2tree($meta_xml);
					$metaContent = NULL;
					if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'])) {
						$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
					}
					if (is_array($metaContent)) {
						if (!empty($metaContent['dc:title'][0]['values'][0])) {
							$contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
						}
						$subject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
						$description = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
						$contentArr['description'] = $subject.' '.$description;

							// Keywords collected:
						if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword'])) {
							if (!isset($contentArr['keywords'])) {
								$contentArr['keywords'] = '';
							}
							foreach ($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat) {
								if (isset($kwDat['values'][0])) {
									$contentArr['keywords'].= $kwDat['values'][0].' ';
								}
							}
						}
					}
				}
			break;
			case 'rtf':
				if (!empty($this->app['unrtf'])) {
					$res = array();
					exec($this->app['unrtf'].' '.$quotedFile, $res);
					$fileContent = implode(chr(10),$res);
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
				}
			break;
			case 'txt':
			case 'csv':		// Raw text
				$content = t3lib_div::getUrl($absFile);
					// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
				$contentArr = $this->pObj->splitRegularContent($content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'html':
			case 'htm':
				$fileContent = t3lib_div::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
			case 'xml':		// PHP strip-tags()
				$fileContent = (string)t3lib_div::getUrl($absFile);

					// Finding charset from the encoding attribute of the XML declaration; utf-8 if there is none:
				$reg = array();
				preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent,0,200), $reg);
				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

					// Converting content:
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
			case 'tif':		// PHP EXIF
					// Without the exif extension the file is still indexed, by its name only.
				$exif = function_exists('exif_read_data') ? @exif_read_data($absFile, 'IFD0') : FALSE;
				$comment = '';
				if ($exif) {
						// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
					$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
					$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
					$comment = trim($exifComment.' '.$exifDescription);
				}
				$contentArr = $this->pObj->splitRegularContent($comment);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			default:
				return FALSE;
		}

			// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
		if (is_array($contentArr) && empty($contentArr['title'])) {
			$contentArr['title'] = str_replace('_',' ',basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
		}

		return $contentArr;
	}

	/**
	 * Returns the content part keys a file is indexed in. For every type but
	 * PDF, and for PDF files whose page count cannot be read, that is the
	 * single key 0. For PDF files it is a list of page intervals "low-high"
	 * computed from the page count and $this->pdf_mode.
	 *
	 * @param	string		File extension
	 * @param	string		Absolute path of the file
	 * @return	array		Content part keys
	 */
	function fileContentParts($ext,$absFile) {
		$cParts = array(0);

		switch ($ext) {
			case 'pdf':
					// No pdfinfo tool registered (initParser() not run or failed): nothing to ask.
				if (empty($this->app['pdfinfo'])) {
					break;
				}

					// Getting pdf-info:
				$res = array();
				exec($this->app['pdfinfo'].' '.escapeshellarg($absFile), $res);
				$pdfInfo = $this->splitPdfInfo($res);
				$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

				if ($pages) {
					$cParts = array();

						// Calculate number of parts from mode
					if ($this->pdf_mode > 0) {
						$iter = ceil($pages / $this->pdf_mode);
					} else {
						$iter = t3lib_div::intInRange(abs($this->pdf_mode), 1, $pages);
					}

						// Traverse and create intervals.
					for ($a = 0; $a < $iter; $a++) {
						$low = (int)floor($a * ($pages / $iter)) + 1;
						$high = (int)floor(($a + 1) * ($pages / $iter));
						$cParts[] = $low.'-'.$high;
					}
				}
			break;
		}

		return $cParts;
	}

	/**
	 * Converts "Key: value" output lines (as read from pdfinfo) into an
	 * associative array. Keys are lowercased and trimmed, values trimmed;
	 * lines without a colon or with an empty key are skipped.
	 *
	 * @param	array		Output lines
	 * @return	array		Key/value pairs
	 */
	function splitPdfInfo($pdfInfoArray) {
		$res = array();
		if (is_array($pdfInfoArray)) {
			foreach ($pdfInfoArray as $line) {
				$parts = explode(':',$line,2);
				if (count($parts) > 1 && trim($parts[0])) {
					$res[strtolower(trim($parts[0]))] = trim($parts[1]);
				}
			}
		}
		return $res;
	}

	/**
	 * Removes trailing line feeds and form feeds (which trim() alone would
	 * leave in place) and then trims the string.
	 *
	 * @param	string		Input string
	 * @return	string		Cleaned string
	 */
	function removeEndJunk($string) {
		return trim(preg_replace('/[\n\f]+$/', '', (string)$string));
	}

	/**
	 * Reads the unserialized indexed_search extension configuration.
	 *
	 * @return	array		Configuration array; empty if it is missing or cannot be unserialized
	 */
	function readIndexerConfig() {
		$indexerConfig = FALSE;
		if (isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search'])) {
			$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
		}
		return is_array($indexerConfig) ? $indexerConfig : array();
	}

	/**
	 * Checks the extension against the comma list "ignoreExtensions" of the
	 * indexer configuration (compared in lowercase).
	 *
	 * @param	string		File extension
	 * @param	array		Indexer configuration
	 * @return	boolean		TRUE if the extension is configured to be ignored
	 */
	function isIgnoredExtension($extension, $indexerConfig) {
		$list = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';
		$ignoreExtensions = t3lib_div::trimExplode(',', $list, 1);
		return in_array($extension, $ignoreExtensions);
	}

	/**
	 * Normalizes a configured tool directory so that it ends with exactly
	 * the one slash appended here (a single trailing slash is removed first).
	 *
	 * @param	string		Configured directory
	 * @return	string		Directory with trailing slash
	 */
	function toolDirectory($configuredPath) {
		return preg_replace('/\/$/', '', $configuredPath).'/';
	}


	/************************
	 *
	 * Backend analyzer
	 *
	 ************************/

	/**
	 * Returns the icon reference for a file extension. "htm" shares the icon
	 * of "html" and "jpeg" that of "jpg".
	 *
	 * @param	string		File extension
	 * @return	string		"EXT:" reference to the gif icon
	 */
	function getIcon($extension) {
		if ($extension == 'htm') {
			$extension = 'html';
		}
		if ($extension == 'jpeg') {
			$extension = 'jpg';
		}
		return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
	}
}
00612 
// XCLASS hook: when running inside TYPO3 and an extension class file is
// registered for this script, load it (once).
if (defined('TYPO3_MODE')) {
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
	}
}
00616 ?>


Généré par TYPO3 Ameos avec doxygen 1.4.6