"TYPO3 4.0.1: typo3_src-4.0.1/typo3/sysext/indexed_search/class.lexer.php Source File", "datetime" => "Sat Dec 2 19:22:32 2006", "date" => "2 Dec 2006", "doxygenversion" => "1.4.6", "projectname" => "TYPO3 4.0.1", "projectnumber" => "4.0.1" ); get_header($doxygen_vars); ?>

class.lexer.php

00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the TYPO3 project. The TYPO3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *  A copy is found in the textfile GPL.txt and important notices to the license
00017 *  from the author is found in LICENSE.txt distributed with these scripts.
00018 *
00019 *
00020 *  This script is distributed in the hope that it will be useful,
00021 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00022 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023 *  GNU General Public License for more details.
00024 *
00025 *  This copyright notice MUST APPEAR in all copies of the script!
00026 ***************************************************************/
/**
 * Lexer for the indexed_search extension.
 *
 * Splits a UTF-8 string into "words": runs of numeric/alphabetic characters
 * (optionally glued together by configurable "print-join" characters such as
 * "." or "-"), and runs of CJK characters, which are emitted as overlapping
 * two-character pairs.
 *
 * The class deliberately stays untyped and uses "var" so existing XCLASS
 * extensions that override its methods keep compatible signatures.
 */
class tx_indexedsearch_lexer {

		// Debugging options:
	var $debug = FALSE;		// If set, split2Words() fills $debugString with HTML highlighting search / non-search words (for backend display)
	var $debugString = '';

	var $csObj;		// Charset class object, t3lib_cs (created in the constructor)

		// Configuration of the lexer:
	var $lexerConf = array(
		'printjoins' => array(	// Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
			0x2e,	// "."
			0x2d,	// "-"
			0x5f,	// "_"
			0x3a,	// ":"
			0x2f,	// "/"
			0x27,	// "'"
			// 0x615,	// ARABIC SMALL HIGH TAH
		),
		'casesensitive' => FALSE,	// Set, if case sensitive indexing is wanted.
		'removeChars' => array(		// Unicode numbers of chars that are removed from (non-CJK) words before they are returned (eg. "-")
			0x2d	// "-"
		)
	);


	/**
	 * Constructor: initializes the charset object.
	 *
	 * @return	void
	 */
	function __construct()	{
		$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * PHP4-style constructor, kept so that PHP 4 and existing calls to
	 * parent::tx_indexedsearch_lexer() keep working. On PHP 5+ __construct()
	 * is the constructor that "new" invokes.
	 *
	 * @return	void
	 */
	function tx_indexedsearch_lexer()	{
		$this->__construct();
	}

	/**
	 * Splits the input string into words.
	 *
	 * Unless $lexerConf['casesensitive'] is set, the string is first passed
	 * through $this->csObj->conv_case('utf-8', ..., 'toLower').
	 * If $this->debug is set, $this->debugString receives the input as HTML
	 * with all non-word parts wrapped in red <span> tags.
	 *
	 * @param	string		String with UTF-8 content to process.
	 * @return	array		Array of words (UTF-8 strings)
	 */
	function split2Words($wordString)	{

			// Reset debug string:
		$this->debugString = '';

			// Convert the string to lowercase unless case sensitive indexing is wanted:
		if (!$this->lexerConf['casesensitive'])	{
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}

			// Now, splitting words:
		$words = array();
		$pos = 0;

		while (TRUE)	{
				// get_word() returns FALSE at end-of-string, so check before unpacking:
			$found = $this->get_word($wordString, $pos);
			if (!is_array($found) || !$found[1])	{
				break;
			}
			list($start, $len) = $found;

			$this->addWords($words, $wordString, $start, $len);

			if ($this->debug)	{
				$this->debugString.= '<span style="color:red">'.htmlspecialchars(substr($wordString,$pos,$start-$pos)).'</span>'.
										htmlspecialchars(substr($wordString,$start,$len));
			}

			$pos = $start+$len;
		}

			// Non-word characters following the last word would otherwise be missing from the debug output:
		if ($this->debug && $pos < strlen($wordString))	{
			$this->debugString.= '<span style="color:red">'.htmlspecialchars(substr($wordString,$pos)).'</span>';
		}

		return $words;
	}


	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/


	/**
	 * Adds the word found at byte offset $start / byte length $len of
	 * $wordString to the $words array.
	 *
	 * The type of the first character decides how the word is handled:
	 *
	 * - CJK: Continuous letters and numbers make up words in western text,
	 *   but CJK does not use spaces or separators between words. Finding real
	 *   words would require a dictionary and advanced heuristics. Instead,
	 *   pairs of consecutive characters are formed, e.g.
	 *
	 *       ABCDE => AB BC CD DE
	 *
	 *   This works okay since both the index and the search query are split
	 *   in the same manner, and since the set of characters is huge so the
	 *   extra matches are not significant. (Hint taken from ZOPEs chinese
	 *   user group.) [Kasper: As far as I can see this will only work well
	 *   with or-searches!]
	 *   A sequence of exactly one character is added as it is.
	 *
	 * - Everything else: the characters listed in $lexerConf['removeChars']
	 *   are stripped and the remaining string is added as one word.
	 *
	 * @param	array		Array of accumulated words (passed by reference, appended to)
	 * @param	string		Complete input string (passed by reference, not modified)
	 * @param	integer		Byte offset of the word in the string
	 * @param	integer		Byte length of the word
	 * @return	void
	 */
	function addWords(&$words, &$wordString, $start, $len)	{

			// Get word out of string:
		$theWord = substr($wordString,$start,$len);

			// Get the unicode number of the first char and find its type:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		$typeInfo = $this->charType($cp);
		$cType = is_array($typeInfo) ? $typeInfo[0] : FALSE;

		if ($cType === 'cjk')	{
				// Find total string length (in characters):
			$strlen = $this->csObj->utf8_strlen($theWord);

				// A single char yields one entry; n chars yield n-1 overlapping pairs:
			$pairCount = ($strlen == 1) ? 1 : $strlen-1;
			for ($a=0; $a<$pairCount; $a++)	{
				$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
			}
		} else {	// Normal "single-byte" chars:
				// Remove chars:
			foreach($this->lexerConf['removeChars'] as $skipJoin)	{
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin),'',$theWord);
			}
				// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Finds the next word in $str, searching from byte offset $pos.
	 *
	 * @param	string		Input string (passed by reference, not modified)
	 * @param	integer		Byte offset at which to start looking
	 * @return	mixed		array(byte offset, byte length) of the next word, or FALSE if only non-word chars (or nothing) remain
	 */
	function get_word(&$str, $pos=0)	{

		$len=0;

			// If return is true, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos))	{
			return array($pos,$len);
		}

			// Otherwise $len is the byte length of a run of non-word chars (or 0 for a blank string): skip it and look again.
		$pos += $len;
		if (!isset($str[$pos]))	return FALSE;	// End of string reached, no more words.

		$this->utf8_is_letter($str, $len, $pos);
		return array($pos,$len);
	}

	/**
	 * Examines the sequence of characters starting at byte offset $pos.
	 *
	 * If the first character is a letter (num/alpha/cjk as per charType())
	 * the sequence is a word: it extends until the first character that is
	 * neither a letter nor a print-join char, or until the script changes
	 * between CJK and num/alpha. Print-join chars at the END of a word are
	 * not counted, also when the word ends the string.
	 * If the first character is not a letter, the sequence is the run of
	 * non-letter characters up to the next letter.
	 *
	 * @param	string		Input string (passed by reference, not modified)
	 * @param	integer		Byte length of the examined sequence (passed by reference, set by this function; 0 at end-of-string)
	 * @param	integer		Byte offset at which to start
	 * @return	boolean		TRUE if the sequence is a word, FALSE if it is a run of non-word chars or $pos is at the end of the string
	 */
	function utf8_is_letter(&$str, &$len, $pos=0)	{
		$len = 0;
		$bc = 0;		// Byte length of the most recently read char
		$cp = 0;		// Unicode number of the most recently read char
		$printJoinLgd = 0;	// Length of the word before the current run of trailing print-join chars (0 = no such run)
		$cType = $cType_prev = FALSE;	// Letter type of the current / previous char
		$letter = TRUE;	// Looking for a letter?

		if (!isset($str[$pos]))	return FALSE;	// Return false on end-of-string at this stage

		while (TRUE)	{

				// $len is still 0 while only the first char has been read; from then on the most recently read char is evaluated here:
			if ($len)	{
				if ($letter)	{	// We are in a sequence of word chars
						// Previous and current char are from single-byte sets vs. asian CJK sets?
					$scriptChange = ($cType_prev === 'cjk' && ($cType === 'num' || $cType === 'alpha'))
								|| ($cType === 'cjk' && ($cType_prev === 'num' || $cType_prev === 'alpha'));

					if (!$cType || $scriptChange)	{
							// A char that is NOT a print-join char signifies the end of the word.
							// (Loose comparison on purpose: configured values may not be integers.)
						if (!in_array($cp,$this->lexerConf['printjoins']))	{
								// If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd)	{
								$len = $printJoinLgd;
							}
							return TRUE;
						} elseif (!$printJoinLgd)	{	// First print-join char of a run: remember the word length up to here
							$printJoinLgd = $len;
						}
					} else {	// When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType)	{	// End of non-word sequence reached
					return FALSE;
				}
			}
			$len += $bc;	// Add byte-length of last found character

			if (!isset($str[$pos]))	{	// End of string; return status of string till now
					// Strip trailing print-join chars, exactly as it is done when a word ends inside the string:
				if ($letter && $printJoinLgd)	{
					$len = $printJoinLgd;
				}
				return $letter;
			}

				// Get next chars unicode number:
			$cp = $this->utf8_ord($str,$bc,$pos);
			$pos += $bc;

				// Determine the type:
			$cType_prev = $cType;
			$typeInfo = $this->charType($cp);
			$cType = is_array($typeInfo) ? $typeInfo[0] : FALSE;
			if ($cType)	{
				continue;
			}

				// Setting letter to false if the first char was not a letter!
			if (!$len)	$letter = FALSE;
		}

		return FALSE;
	}

	/**
	 * Determines the type of a character from its Unicode number.
	 *
	 * @param	integer		Unicode number of the character
	 * @return	array		Array with one string value: 'num', 'alpha' or 'cjk'. NULL is returned for all other characters.
	 */
	function charType($cp)	{

			// Numeric? Only the digits 0-9 are recognized.
			// Not enabled: Arabic-Indic (0x660-0x669), Arabic-Indic for Iran, Pakistan and India (0x6F0-0x6F9), Hangzhou (0x3021-0x3029)
		if ($cp >= 0x30 && $cp <= 0x39)	{
			return array('num');
		}

			// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
				($cp >= 0x41 && $cp <= 0x5A) ||		// Basic Latin: capital letters
				($cp >= 0x61 && $cp <= 0x7A) ||		// Basic Latin: small letters
				($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||	// Latin-1 Supplement letters (0xC0-0xFF) excluding multiplication and division sign
				($cp >= 0x100 && $cp < 0x280) ||	// Latin Extended-A and -B
				($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) ||	// Greek and Coptic excluding non-letters
				(($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||	// Cyrillic and Cyrillic Supplement excluding historic miscellaneous
				(($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||	// Hebrew: only accents and letters
				(($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E && $cp <= 0x6D3)) ||	// Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
				($cp >= 0x1E00 && $cp < 0x2000)		// Latin Extended Additional and Greek Extended
			)	{
			return array('alpha');
		}

			// Looking for CJK (Chinese / Japanese / Korean)
			// Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
			// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
			// (CJK and Kangxi radicals or Bopomofo letters are not included.)
		if (
				($cp >= 0x3040 && $cp <= 0x30FF) ||		// HIRAGANA and KATAKANA letters
				($cp >= 0x3130 && $cp <= 0x318F) ||		// Hangul Compatibility Jamo
				($cp >= 0x3400 && $cp <= 0x4DBF) ||		// CJK Unified Ideographs Extension A
				($cp >= 0x4E00 && $cp <= 0x9FAF) ||		// CJK Unified Ideographs
				($cp >= 0xAC00 && $cp <= 0xD7AF) ||		// Hangul Syllables
				($cp >= 0x20000 && $cp <= 0x2FA1F)		// CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
			)	{
			return array('cjk');
		}

			// Not a letter of any known type:
		return NULL;
	}

	/**
	 * Converts the UTF-8 byte sequence starting at byte offset $pos into its
	 * Unicode number.
	 *
	 * The byte sequence is not validated. Lead bytes that announce more data
	 * bytes than the string holds are decoded from the bytes that exist, and
	 * $len never reaches beyond the end of the string.
	 *
	 * @param	string		UTF-8 string (passed by reference, not modified)
	 * @param	integer		Number of bytes consumed (passed by reference, set by this function; always at least 1)
	 * @param	integer		Byte offset of the character
	 * @param	boolean		If set, the number is returned as a hex string prefixed with "x" instead of as an integer
	 * @return	mixed		Unicode number as integer, or as string like "x20ac" if $hex is set
	 */
	function utf8_ord(&$str, &$len, $pos=0, $hex=false)	{
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;

		if ($ord > 0x80)	{
			for ($bc=-1, $mbs=$ord; $mbs & 0x80; $mbs = $mbs << 1)	$bc++;	// Calculate number of extra bytes

				// Mask utf-8 lead-in bytes. The invalid lead byte 0xFF gives $bc = 7,
				// and a shift by a negative number is an error in PHP 7+, hence the guard.
			$dataBits = 6-$bc;
			$ord = ($dataBits > 0) ? ($ord & ((1 << $dataBits) - 1)) : 0;

				// "Bring in" data bytes, but never read beyond the end of the string:
			for ($i=$pos+1; $bc > 0 && isset($str[$i]); $bc--, $i++)	{
				$ord = ($ord << 6) | (ord($str[$i]) & 0x3F);
				$len++;
			}
		}

		return $hex ? 'x'.dechex($ord) : $ord;
	}
}
00399 
00400 
00401 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])    {
00402     include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
00403 }
00404 ?>