00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
/**
 * Lexer for the indexed_search extension: splits a UTF-8 string into the
 * list of words that should be indexed or searched for.
 *
 * A "word" is a run of characters for which charType() reports a type
 * ("num", "alpha" or "cjk"). Characters listed in $lexerConf['printjoins']
 * may occur inside a word without ending it. CJK runs are emitted as
 * overlapping two-character tokens.
 */
class tx_indexedsearch_lexer {

	/**
	 * If TRUE, split2Words() records an HTML trace of the tokenisation in
	 * $debugString.
	 */
	var $debug = FALSE;

	/**
	 * HTML trace written by split2Words() when $debug is set: skipped text is
	 * wrapped in a red <span>, followed by the word that was found.
	 */
	var $debugString = '';

	/**
	 * Charset helper created through t3lib_div::makeInstance('t3lib_cs').
	 * Used for conv_case(), utf8_strlen(), utf8_substr() and UnumberToChar().
	 */
	var $csObj;

	/**
	 * Lexer configuration:
	 * - printjoins:    code points that do not terminate a word when they
	 *                  appear between typed characters
	 * - casesensitive: if FALSE, input is lowercased before splitting
	 * - removeChars:   code points stripped from non-CJK words before they
	 *                  are added to the result
	 */
	var $lexerConf = array(
		'printjoins' => array(
			0x2e,	// "."
			0x2d,	// "-"
			0x5f,	// "_"
			0x3a,	// ":"
			0x2f,	// "/"
			0x27,	// "'"
		),
		'casesensitive' => FALSE,
		'removeChars' => array(
			0x2d	// "-"
		)
	);

	/**
	 * Constructor: creates the charset helper.
	 *
	 * Declared before the legacy same-name method so that PHP 5 does not
	 * report a redefined constructor, and so that PHP 8 (which no longer
	 * treats same-name methods as constructors) still initialises $csObj.
	 *
	 * @return	void
	 */
	function __construct() {
		// No "=&" here: assigning a function result by reference is not allowed.
		$this->csObj = t3lib_div::makeInstance('t3lib_cs');
	}

	/**
	 * Legacy (PHP 4 style) constructor, kept so that PHP 4 and subclasses
	 * calling parent::tx_indexedsearch_lexer() keep working.
	 *
	 * @return	void
	 */
	function tx_indexedsearch_lexer() {
		$this->__construct();
	}

	/**
	 * Splits the input string into words.
	 *
	 * @param	string		UTF-8 string to split
	 * @return	array		List of words found in the string
	 */
	function split2Words($wordString) {

			// Reset the debug trace:
		$this->debugString = '';

			// Convert to lowercase unless case sensitive matching is configured:
		if (!$this->lexerConf['casesensitive']) {
			$wordString = $this->csObj->conv_case('utf-8', $wordString, 'toLower');
		}

		$pos = 0;
		$words = array();

		while (TRUE) {
			$match = $this->get_word($wordString, $pos);

				// get_word() returns FALSE at end of string; a zero length also means "no more words".
			if (!is_array($match) || !$match[1]) {
				break;
			}
			$start = $match[0];
			$len = $match[1];

			$this->addWords($words, $wordString, $start, $len);

			if ($this->debug) {
				$this->debugString.= '<span style="color:red">'.htmlspecialchars(substr($wordString,$pos,$start-$pos)).'</span>'.
										htmlspecialchars(substr($wordString,$start,$len));
			}

				// Continue right after the word just found:
			$pos = $start + $len;
		}

		return $words;
	}

	/**
	 * Adds the word found at byte offset $start (byte length $len) of
	 * $wordString to $words.
	 *
	 * If the first character of the word is of type "cjk", the word is added
	 * as a series of overlapping two-character tokens (a single-character
	 * word is added as is). Otherwise the characters configured in
	 * $lexerConf['removeChars'] are stripped and the word is added once.
	 *
	 * @param	array		Array of accumulated words (passed by reference, appended to)
	 * @param	string		Complete input string (passed by reference, not modified)
	 * @param	integer		Byte offset of the word in the input string
	 * @param	integer		Byte length of the word
	 * @return	void
	 */
	function addWords(&$words, &$wordString, $start, $len) {

			// Extract the word:
		$theWord = substr($wordString, $start, $len);

			// Determine the type of the first character:
		$bc = 0;
		$cp = $this->utf8_ord($theWord, $bc);
		$typeInfo = $this->charType($cp);
		$cType = is_array($typeInfo) ? $typeInfo[0] : FALSE;

		if ($cType == 'cjk') {

				// Number of characters (not bytes) in the word:
			$strlen = $this->csObj->utf8_strlen($theWord);

				// Emit every pair of adjacent characters; a one-character word is emitted as is:
			for ($a = 0; $a < $strlen; $a++) {
				if ($strlen == 1 || $a < $strlen - 1) {
					$words[] = $this->csObj->utf8_substr($theWord, $a, 2);
				}
			}
		} else {

				// Strip the configured characters from the word:
			foreach ($this->lexerConf['removeChars'] as $skipJoin) {
				$theWord = str_replace($this->csObj->UnumberToChar($skipJoin), '', $theWord);
			}

			$words[] = $theWord;
		}
	}

	/**
	 * Finds the next word in $str, starting the search at byte offset $pos.
	 *
	 * @param	string		Input string (passed by reference, not modified)
	 * @param	integer		Byte offset to start searching from
	 * @return	mixed		array(start offset, byte length) of the next word, or FALSE if the end of the string was reached without finding one
	 */
	function get_word(&$str, $pos=0) {

		$len = 0;

			// If a word starts right at $pos, return it:
		if ($this->utf8_is_letter($str, $len, $pos)) {
			return array($pos, $len);
		}

			// Otherwise $len is the byte length of the non-word run; skip it:
		$pos += $len;
		if (!isset($str[$pos])) {
			return FALSE;
		}

			// A word starts here; measure it:
		$this->utf8_is_letter($str, $len, $pos);
		return array($pos, $len);
	}

	/**
	 * Measures the run of characters starting at byte offset $pos.
	 *
	 * If the run starts with a typed character (see charType()), the run is a
	 * word: it extends until a character without type that is not a
	 * print-join character, or until the type switches between "cjk" and
	 * "num"/"alpha". Print-join characters directly before the end of the
	 * word - including at the end of the string - are not counted.
	 * If the run starts with an untyped character, it extends until the first
	 * typed character.
	 *
	 * @param	string		Input string (passed by reference, not modified)
	 * @param	integer		Byte length of the run (passed by reference, set by this function)
	 * @param	integer		Byte offset to start at
	 * @return	boolean		TRUE if the run is a word, FALSE if it is a non-word run or $pos is at/after the end of the string
	 */
	function utf8_is_letter(&$str, &$len, $pos=0) {
		$len = 0;
		$bc = 0;			// Byte length of the character read last
		$cp = 0;			// Code point of the character read last
		$cType = $cType_prev = FALSE;
		$letter = TRUE;		// Becomes FALSE if the first character has no type
		$printJoinLgd = 0;	// Value of $len where the current trailing sequence of print-join characters began (0 = none)

			// Nothing to read:
		if (!isset($str[$pos])) {
			return FALSE;
		}

		while (TRUE) {

				// From the third pass on ($len > 0), evaluate the character read in the previous pass.
				// At this point $len does not yet include that character.
			if ($len) {
				if ($letter) {
						// Inside a word: an untyped character or a switch between cjk and num/alpha may end it.
					if (!$cType
							|| ($cType_prev=='cjk' && t3lib_div::inList('num,alpha',$cType))
							|| ($cType=='cjk' && t3lib_div::inList('num,alpha',$cType_prev))
						) {
						if (!in_array($cp, $this->lexerConf['printjoins'])) {
								// End of word. Drop print-join characters directly before it:
							if ($printJoinLgd) {
								$len = $printJoinLgd;
							}
							return TRUE;
						} elseif (!$printJoinLgd) {
								// First print-join character of a sequence: remember the word length so far.
							$printJoinLgd = $len;
						}
					} else {
							// A typed character follows, so earlier print-join characters are inside the word:
						$printJoinLgd = 0;
					}
				} elseif ($cType) {
						// Inside a non-word run and a typed character was found: the run ends here.
					return FALSE;
				}
			}

				// Count the character read in the previous pass:
			$len += $bc;

				// End of string:
			if (!isset($str[$pos])) {
					// A word must not end in print-join characters here either,
					// just like when a separator follows.
				if ($letter && $printJoinLgd) {
					$len = $printJoinLgd;
				}
				return $letter;
			}

				// Read the next character:
			$cp = $this->utf8_ord($str, $bc, $pos);
			$pos += $bc;

				// Determine its type:
			$cType_prev = $cType;
			$typeInfo = $this->charType($cp);
			$cType = is_array($typeInfo) ? $typeInfo[0] : FALSE;
			if ($cType) {
				continue;
			}

				// Untyped character as the very first one: this is a non-word run.
			if (!$len) {
				$letter = FALSE;
			}
		}
	}

	/**
	 * Determines the type of a character from its Unicode code point.
	 *
	 * @param	integer		Unicode code point
	 * @return	mixed		array('num'), array('alpha') or array('cjk'); NULL if the code point belongs to none of these groups
	 */
	function charType($cp) {

			// Digits 0-9:
		if ($cp >= 0x30 && $cp <= 0x39) {
			return array('num');
		}

			// Letters:
		if (
				($cp >= 0x41 && $cp <= 0x5A) ||		// A-Z
				($cp >= 0x61 && $cp <= 0x7A) ||		// a-z
				($cp >= 0xC0 && $cp <= 0xFF && $cp != 0xD7 && $cp != 0xF7) ||	// Latin-1 letters (not the multiplication and division signs)
				($cp >= 0x100 && $cp < 0x280) ||	// Extended Latin
				($cp == 0x386 || ($cp >= 0x388 && $cp < 0x400)) ||	// Greek
				(($cp >= 0x400 && $cp < 0x482) || ($cp >= 0x48A && $cp < 0x530)) ||	// Cyrillic
				(($cp >= 0x590 && $cp < 0x5B0) || ($cp >= 0x5D0 && $cp < 0x5F3)) ||	// Hebrew
				(($cp >= 0x621 && $cp <= 0x658) || ($cp >= 0x66E && $cp <= 0x6D3)) ||	// Arabic
				($cp >= 0x1E00 && $cp < 0x2000)		// Latin Extended Additional, Greek Extended
			) {
			return array('alpha');
		}

			// Chinese, Japanese and Korean:
		if (
				($cp >= 0x3040 && $cp <= 0x30FF) ||		// Hiragana, Katakana
				($cp >= 0x3130 && $cp <= 0x318F) ||		// Hangul Compatibility Jamo
				($cp >= 0x3400 && $cp <= 0x4DBF) ||		// CJK Unified Ideographs Extension A
				($cp >= 0x4E00 && $cp <= 0x9FAF) ||		// CJK Unified Ideographs
				($cp >= 0xAC00 && $cp <= 0xD7AF) ||		// Hangul Syllables
				($cp >= 0x20000 && $cp <= 0x2FA1F)		// Supplementary ideographs
			) {
			return array('cjk');
		}

			// No known type. Callers treat this as "not a word character".
		return NULL;
	}

	/**
	 * Decodes the UTF-8 character at byte offset $pos of $str.
	 *
	 * The bytes 0xFE and 0xFF, which cannot start a UTF-8 sequence, are
	 * reported as U+FFFD with a length of one byte. Continuation bytes missing
	 * at the end of the string are read as zero.
	 *
	 * @param	string		UTF-8 string (passed by reference, not modified)
	 * @param	integer		Byte length of the character (passed by reference, set by this function)
	 * @param	integer		Byte offset of the character
	 * @param	boolean		If TRUE, the code point is returned as a string: "x" followed by its hexadecimal value
	 * @return	mixed		Code point as integer, or as string if $hex is set
	 */
	function utf8_ord(&$str, &$len, $pos=0, $hex=false) {
		$ord = isset($str[$pos]) ? ord($str[$pos]) : 0;
		$len = 1;

		if ($ord > 0x80) {
				// Count the leading 1-bits of the first byte; afterwards $bc is the number of continuation bytes.
			for ($bc = -1, $mbs = $ord; $mbs & 0x80; $mbs = $mbs << 1) {
				$bc++;
			}

			if ($bc > 5) {
					// 0xFE or 0xFF: for 0xFF the mask below would need a negative shift count.
				$ord = 0xFFFD;
			} else {
				$len += $bc;

					// Keep the payload bits of the first byte, then append 6 bits per continuation byte:
				$ord = $ord & ((1 << (6 - $bc)) - 1);
				for ($i = $pos + 1; $bc; $bc--, $i++) {
					$ord = ($ord << 6) | ((isset($str[$i]) ? ord($str[$i]) : 0) & 0x3F);
				}
			}
		}

		return $hex ? 'x'.dechex($ord) : $ord;
	}
}
00399
00400
00401 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']) {
00402 include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.lexer.php']);
00403 }
00404 ?>