Documentation TYPO3 par Ameos |
<?php
/***************************************************************
*  Copyright notice
*
*  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
*  All rights reserved
*
*  This script is part of the TYPO3 project. The TYPO3 project is
*  free software; you can redistribute it and/or modify
*  it under the terms of the GNU General Public License as published by
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  The GNU General Public License can be found at
*  http://www.gnu.org/copyleft/gpl.html.
*  A copy is found in the textfile GPL.txt and important notices to the license
*  from the author is found in LICENSE.txt distributed with these scripts.
*
*
*  This script is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  This copyright notice MUST APPEAR in all copies of the script!
***************************************************************/

/**
 * Lexer for indexed_search: splits a UTF-8 string into the words that are
 * indexed / searched for.
 *
 * Words are runs of "letter" chars as classified by charType() (numbers,
 * alphabetic chars, CJK chars). A configurable set of "print-join" chars
 * (like "." or "-") is accepted INSIDE a word but never at its end.
 * All positions and lengths handled by this class are BYTE offsets.
 */
class tx_indexedsearch_lexer {

		// Debugging options:
	var $debug = FALSE;		// If set, the debugString is filled with HTML output highlighting search / non-search words (for backend display)
	var $debugString = '';

	var $csObj;				// Charset class object, t3lib_cs

		// Configuration of the lexer:
	var $lexerConf = array(
		'printjoins' => array(	// Unicode numbers of chars that are allowed INSIDE a sequence of letter chars (alphanum + CJK)
			0x2e,	// "."
			0x2d,	// "-"
			0x5f,	// "_"
			0x3a,	// ":"
			0x2f,	// "/"
			0x27,	// "'"
			// 0x615,	// ARABIC SMALL HIGH TAH
		),
		'casesensitive' => FALSE,	// Set, if case sensitive indexing is wanted.
		'removeChars' => array(		// Unicode numbers of chars that will be removed before words are returned (eg. "-")
			0x2d	// "-"
		)
	);


	/**
	 * Constructor: Creates the charset object used for case conversion and
	 * UTF-8 aware string operations.
	 *
	 * Declared before the class-named method below so that PHP 5+ picks this
	 * one as the constructor; PHP 8 ignores class-named constructors entirely.
	 *
	 * @return	void
	 */
	function __construct() {
		$this->csObj = &t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Old-style (class-named) constructor, kept for backwards compatibility
	 * with code calling it explicitly and with PHP 4.
	 *
	 * @return	void
	 */
	function tx_indexedsearch_lexer() {
		$this->__construct();
	}

	/**
	 * Splits the input string into words.
	 * If $this->debug is set, $this->debugString is filled with an HTML
	 * rendering in which the skipped (non-word) parts are marked red.
	 *
	 * @param	string		The string (UTF-8) to split into words. Lowercased first unless lexerConf[casesensitive] is set.
	 * @return	array		Array of words (for CJK sequences: overlapping pairs of chars, see addWords())
	 */
	function split2Words($wordString) {

			// Reset debug string:
		$this->debugString = '';

			// Then convert the string to lowercase:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}

			// Now, splitting words:
		$pos = 0;
		$words = array();

		while (TRUE) {
				// get_word() returns FALSE when the end of the string is reached:
			$found = $this->get_word($wordString, $pos);
			if (!is_array($found) || !$found[1]) {
				break;
			}
			list($start, $len) = $found;

			$this->addWords($words, $wordString, $start, $len);

			if ($this->debug) {
				$this->debugString.= '<span style="color:red">'.htmlspecialchars(substr($wordString,$pos,$start-$pos)).'</span>'.
							htmlspecialchars(substr($wordString,$start,$len));
			}

				// Continue right after the word just found:
			$pos = $start+$len;
		}

		return $words;
	}


	/**********************************
	 *
	 * Helper functions
	 *
	 ********************************/

	/**
	 * Extracts the word at the given byte range from the string and appends
	 * it to $words. The type of the FIRST char of the word decides how it is
	 * added: a CJK sequence is added as overlapping pairs of chars, anything
	 * else is added as one word with the lexerConf[removeChars] chars removed.
	 *
	 * @param	array		Array of accumulated words (passed by reference, appended to)
	 * @param	string		Complete input string (passed by reference for performance, not modified)
	 * @param	integer		Start position (bytes) of the word in the string
	 * @param	integer		Length (bytes) of the word
	 * @return	void
	 */
	function addWords(&$words, &$wordString, $start, $len) {

			// Get word out of string:
		$theWord = substr($wordString,$start,$len);

			// Get unicode number of the first char and find its type:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		list($cType) = $this->charType($cp);

			// If string is a CJK sequence we follow this algorithm:
			/*
				DESCRIPTION OF (CJK) ALGORITHM

				Continuous letters and numbers make up words.  Spaces and symbols
				separate letters and numbers into words.  This is sufficient for
				all western text.

				CJK doesn't use spaces or separators to separate words, so the only
				way to really find out what constitutes a word would be to have a
				dictionary and advanced heuristics.  Instead, we form pairs from
				consecutive characters, in such a way that searches will find only
				characters that appear more-or-less the right sequence.  For example:

					ABCDE => AB BC CD DE

				This works okay since both the index and the search query is split
				in the same manner, and since the set of characters is huge so the
				extra matches are not significant.

				(Hint taken from ZOPEs chinese user group)

				[Kasper: As far as I can see this will only work well with or-searches!]
			*/
		if ($cType == 'cjk') {
				// Find total string length (in chars):
			$strlen = $this->csObj->utf8_strlen($theWord);

				// Traverse string and add words as pairs of two chars.
				// A single-char sequence is added as it is; otherwise the last char only occurs as second half of the last pair.
			for ($a=0; $a<$strlen; $a++) {
				if ($strlen==1 || $a<$strlen-1) {
					$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
				}
			}
		} else {	// Normal "single-byte" chars:
				// Remove chars:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin),'',$theWord);
			}
				// Add word:
			$words[] = $theWord;
		}
	}

	/**
	 * Finds the next word in the string, searching from byte position $pos.
	 *
	 * @param	string		Input string (passed by reference for performance, not modified)
	 * @param	integer		Byte position to start searching from
	 * @return	mixed		Array with start position and length (both in bytes) of the word found, or FALSE if the end of the string was reached without finding a word
	 */
	function get_word(&$str, $pos=0) {

		$len = 0;

			// If return is true, a word was found starting at this position, so returning position and length:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos,$len);
		}

			// If the return value was false it means a sequence of non-word chars was found (or blank string) - so we skip it and look for the word after it:
		$pos += $len;
		if (!isset($str[$pos])) {	// check end of string before looking for word of course.
			return FALSE;
		}

		$this->utf8_is_letter($str, $len, $pos);
		return array($pos,$len);
	}

	/**
	 * Examines the sequence of chars starting at byte position $pos and
	 * measures how long the run of equal kind (word chars / non-word chars) is.
	 *
	 * A run of word chars ends at the first non-letter char which is not a
	 * print-join char, or where CJK chars meet num/alpha chars. Print-join
	 * chars trailing a word are not counted in $len - neither in front of a
	 * separator nor at the end of the string.
	 *
	 * @param	string		Input string (passed by reference for performance, not modified)
	 * @param	integer		Byte length of the sequence found (passed by reference, output)
	 * @param	integer		Byte position to start at
	 * @return	boolean		TRUE if the sequence is a word, FALSE if it is a sequence of non-word chars (or if $pos is at the end of the string, in which case $len is 0)
	 */
	function utf8_is_letter(&$str, &$len, $pos=0) {

		$len = 0;
		$bc = 0;			// Byte length of the char read last (not yet added to $len)
		$cp = 0;			// Unicode number of the char read last
		$printJoinLgd = 0;	// Length of the word before the current run of print-join chars started (0 = not inside such a run)
		$cType = $cType_prev = FALSE;	// Letter type
		$letter = TRUE;		// looking for a letter?

		if (!isset($str[$pos])) {	// Return false on end-of-string at this stage
			return FALSE;
		}

		while (TRUE) {

				// If characters have been obtained we will know whether the string starts as a sequence of letters or not.
				// Notice: $cp / $cType describe the char read last, which is NOT included in $len yet.
			if ($len) {
				if ($letter) {	// We are in a sequence of word chars
					$isNumAlpha = ($cType === 'num' || $cType === 'alpha');
					$prevIsNumAlpha = ($cType_prev === 'num' || $cType_prev === 'alpha');

					if (!$cType		// The char was NOT a letter
							|| ($cType_prev === 'cjk' && $isNumAlpha)	// ... or the previous and current char are from single-byte sets vs. asian CJK sets
							|| ($cType === 'cjk' && $prevIsNumAlpha)
						) {
							// Check if the non-letter char is NOT a print-join char because then it signifies the end of the word.
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
								// If a printjoin start length has been recorded, set that back now so the length is right (filtering out multiple end chars)
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						}
							// A print-join char was found, record the length if it has not been recorded already:
						if (!$printJoinLgd) {
							$printJoinLgd = $len;
						}
					} else {	// When a true letter is found, reset printJoinLgd counter:
						$printJoinLgd = 0;
					}
				} elseif ($cType) {	// end of non-word sequence reached
					return FALSE;
				}
			}
			$len += $bc;	// add byte-length of last found character

			if (!isset($str[$pos])) {	// end of string; return status of string till now
					// Print-join chars ending the string do not belong to the word either:
				if ($letter && $printJoinLgd) {
					$len = $printJoinLgd;
				}
				return $letter;
			}

				// Get next chars unicode number:
			$cp = $this->utf8_ord($str,$bc,$pos);
			$pos += $bc;

				// Determine the type:
			$cType_prev = $cType;
			list($cType) = $this->charType($cp);

				// Setting letter to false if the first char was not a letter!
			if (!$cType && !$len) {
				$letter = FALSE;
			}
		}
	}

	/**
	 * Determines the type of a char from its unicode number.
	 *
	 * @param	integer		Unicode number of the char
	 * @return	mixed		Array with the type in key 0 ('num', 'alpha' or 'cjk') if the char is a word char; NULL for any other char
	 */
	function charType($cp) {

			// Numeric?
			// Not included (could be added):
			//   0x660-0x669 Arabic-Indic, 0x6F0-0x6F9 Arabic-Indic (Iran, Pakistan, and India), 0x3021-0x3029 Hangzhou
		if ($cp >= 0x30 && $cp <= 0x39) {	// Arabic
			return array('num');
		}

			// LOOKING for Alpha chars (Latin, Cyrillic, Greek, Hebrew and Arabic):
		if (
				($cp >= 0x41 && $cp <= 0x5A) ||		// Basic Latin: capital letters
				($cp >= 0x61 && $cp <= 0x7A) ||		// Basic Latin: small letters
				($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) || 			// Latin-1 Supplement (0x80-0xFF) excluding multiplication and division sign
				($cp >= 0x100 && $cp < 0x280) ||	// Latin Extended-A and -B
				($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) || // Greek and Coptic excluding non-letters
				(($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||		// Cyrillic and Cyrillic Supplement excluding historic miscellaneous
				(($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) || 	// Hebrew: only accents and letters
				(($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E &&  $cp <= 0x6D3)) || 	// Arabic: only letters (there are more letters in the range, we can add them if there is a demand)
				($cp >= 0x1E00 && $cp < 0x2000)		// Latin Extended Additional and Greek Extended
			) {
			return array('alpha');
		}

			// Looking for CJK (Chinese / Japanese / Korean)
			// Ranges are not certain - deducted from the translation tables in t3lib/csconvtbl/
			// Verified with http://www.unicode.org/charts/ (16/2) - may still not be complete.
		if (
				($cp >= 0x3040 && $cp <= 0x30FF) ||		// HIRAGANA and KATAKANA letters
				($cp >= 0x3130 && $cp <= 0x318F) ||		// Hangul Compatibility Jamo
				($cp >= 0x3400 && $cp <= 0x4DBF) ||		// CJK Unified Ideographs Extension A
				($cp >= 0x4E00 && $cp <= 0x9FAF) ||		// CJK Unified Ideographs
				($cp >= 0xAC00 && $cp <= 0xD7AF) ||		// Hangul Syllables
				($cp >= 0x20000 && $cp <= 0x2FA1F)		// CJK Unified Ideographs Extension B and CJK Compatibility Ideographs Supplement
				// also include CJK and Kangxi radicals or Bopomofo letter?
			) {
			return array('cjk');
		}

			// Anything else is not a word char:
		return NULL;
	}

	/**
	 * Converts the UTF-8 byte sequence at byte position $pos to its unicode number.
	 *
	 * Bytes which cannot start a UTF-8 sequence (stray continuation bytes
	 * 0x80-0xBF and 0xFE/0xFF) are consumed one at a time and reported as
	 * U+FFFD (REPLACEMENT CHARACTER), which charType() does not classify as
	 * a word char. Data bytes missing because the string ends inside a
	 * sequence are read as zero; $len still reports the full sequence length.
	 *
	 * @param	string		UTF-8 multibyte string (passed by reference for performance, not modified)
	 * @param	integer		The length (in bytes) of the char (passed by reference, output)
	 * @param	integer		Byte position of the char in the string
	 * @param	boolean		If set, the unicode number is returned as a hex string prefixed with "x"
	 * @return	mixed		The unicode number (integer), or a string like "xe9" if $hex is set
	 */
	function utf8_ord(&$str, &$len, $pos=0, $hex=FALSE) {
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;

		if ($ord >= 0x80) {
				// Calculate number of extra bytes from the number of leading 1-bits of the first byte:
			for ($bc=-1, $mbs=$ord; $mbs & 0x80; $mbs = $mbs << 1) {
				$bc++;
			}

			if ($bc < 1 || $bc > 5) {
					// Not a valid lead byte. (For 0xFF the mask calculation below would shift by a negative number.)
				$ord = 0xFFFD;
			} else {
				$len += $bc;

				$ord = $ord & ((1 << (6-$bc)) - 1);	// mask utf-8 lead-in bytes
				for ($i=$pos+1; $bc; $bc--, $i++) {	// "bring in" data bytes
					$dataByte = isset($str[$i]) ? ord($str[$i]) : 0;
					$ord = ($ord << 6) | ($dataByte & 0x3F);
				}
			}
		}

		return $hex ? 'x'.dechex($ord) : $ord;
	}
}


if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php'])	{
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
}
?>