00001 <?php
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00136 class t3lib_cs {
// Byte value emitted (via chr()) wherever a character cannot be mapped; 63 is ASCII '?'.
var $noCharByteVal=63;

// Conversion tables per charset, filled by initCharset():
// ['local'] maps local code => UTF-8 string, ['utf8'] maps UTF-8 string => local code.
var $parsedCharsets=array();

// Case folding tables per charset ('toUpper', 'toLower', 'toTitle'),
// filled by initUnicodeData() for 'utf-8' and by initCaseFolding() for other charsets.
var $caseFolding=array();

// Per-charset tables mapping a character to its plain ASCII replacement,
// filled by initUnicodeData() for 'utf-8' and by initToASCII() for other charsets.
var $toASCII=array();

// Charsets treated as having exactly two bytes per character (see strlen()/substr()).
var $twoByteSets=array(
'ucs-2'=>1,
);

// Charsets treated as having exactly four bytes per character (see strlen()/substr()).
var $fourByteSets=array(
'ucs-4'=>1,
'utf-32'=>1,
);

// Charsets handled by the euc_* helpers: bytes above 127 start a two-byte sequence
// (utf8_encode() makes an exception for the shift_jis range 0xA0-0xDF).
var $eucBasedSets=array(
'gb2312'=>1,
'big5'=>1,
'euc-kr'=>1,
'shift_jis'=>1,
);
00166
00167
00168
/**
 * Alias => canonical charset name, consulted by parse_charset().
 * Keys must be lowercase because parse_charset() lowercases its input
 * before the lookup. Each alias appears exactly once.
 */
var $synonyms=array(
    'us' => 'ascii',
    'us-ascii'=> 'ascii',
    'cp819' => 'iso-8859-1',
    'ibm819' => 'iso-8859-1',
    'iso-ir-100' => 'iso-8859-1',
    'iso-ir-109' => 'iso-8859-2',
    'iso-ir-148' => 'iso-8859-9',
    'iso-ir-199' => 'iso-8859-14',
    'iso-ir-203' => 'iso-8859-15',
    'csisolatin1' => 'iso-8859-1',
    'csisolatin2' => 'iso-8859-2',
    'csisolatin3' => 'iso-8859-3',
    'csisolatin5' => 'iso-8859-9',
    'csisolatin8' => 'iso-8859-14',
    'csisolatin9' => 'iso-8859-15',
    'csisolatingreek' => 'iso-8859-7',
    'iso-celtic' => 'iso-8859-14',
    'latin1' => 'iso-8859-1',
    'latin2' => 'iso-8859-2',
    'latin3' => 'iso-8859-3',
    'latin5' => 'iso-8859-9',
    'latin6' => 'iso-8859-10',
    'latin8' => 'iso-8859-14',
    'latin9' => 'iso-8859-15',
    'l1' => 'iso-8859-1',
    'l2' => 'iso-8859-2',
    'l3' => 'iso-8859-3',
    'l5' => 'iso-8859-9',
    'l6' => 'iso-8859-10',
    'l8' => 'iso-8859-14',
    'l9' => 'iso-8859-15',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'tis-620' => 'iso-8859-11',
    'win874' => 'windows-874',
    'win1250' => 'windows-1250',
    'win1251' => 'windows-1251',
    'win1252' => 'windows-1252',
    'win1253' => 'windows-1253',
    'win1254' => 'windows-1254',
    'win1255' => 'windows-1255',
    'win1256' => 'windows-1256',
    'win1257' => 'windows-1257',
    'win1258' => 'windows-1258',
    'cp1250' => 'windows-1250',
    'cp1251' => 'windows-1251',
    'cp1252' => 'windows-1252',
    'ms-ee' => 'windows-1250',
    'ms-ansi' => 'windows-1252',
    'ms-greek' => 'windows-1253',
    'ms-turk' => 'windows-1254',
    'winbaltrim' => 'windows-1257',
    'koi-8ru' => 'koi-8r',
    'koi8r' => 'koi-8r',
    'cp878' => 'koi-8r',
    'mac' => 'macroman',
    'macintosh' => 'macroman',
    'euc-cn' => 'gb2312',
    'x-euc-cn' => 'gb2312',
    'euccn' => 'gb2312',
    'cp936' => 'gb2312',
    'big-5' => 'big5',
    'cp950' => 'big5',
    'eucjp' => 'euc-jp',
    'sjis' => 'shift_jis',
    'shift-jis' => 'shift_jis',
    'cp932' => 'shift_jis',
    'cp949' => 'euc-kr',
    'utf7' => 'utf-7',
    'utf8' => 'utf-8',
    'utf16' => 'utf-16',
    'utf32' => 'utf-32',
    'ucs2' => 'ucs-2',
    'ucs4' => 'ucs-4',
);
00246
00247
/**
 * Language identifier => script family, used by get_locale_charset() to pick
 * a charset from $script_to_charset_unix / $script_to_charset_windows.
 * Keys must be lowercase (the locale is lowercased before lookup).
 *
 * Fixes relative to the historic table:
 *  - 'isl' pointed to the non-existent script 'west_euorpean';
 *  - 'swedish' and 'thai' were only reachable through the misspelled keys
 *    'svedish' and 'that' (both kept for backward compatibility);
 *  - 'bulgarian' disagreed with 'bg'/'bgr', which map to 'cyrillic';
 *  - the ISO 639 codes 'el' (Greek) and 'tr' (Turkish) were missing.
 */
var $lang_to_script=array(
    // Two-letter language codes
    'ar' => 'arabic',
    'bg' => 'cyrillic',
    'bs' => 'east_european',
    'cs' => 'east_european',
    'da' => 'west_european',
    'de' => 'west_european',
    'el' => 'greek',
    'es' => 'west_european',
    'et' => 'estonian',
    'eo' => 'unicode',
    'eu' => 'west_european',
    'fa' => 'arabic',
    'fi' => 'west_european',
    'fo' => 'west_european',
    'fr' => 'west_european',
    'gr' => 'greek',
    'he' => 'hebrew',
    'hi' => 'unicode',
    'hr' => 'east_european',
    'hu' => 'east_european',
    'iw' => 'hebrew',
    'is' => 'west_european',
    'it' => 'west_european',
    'ja' => 'japanese',
    'kl' => 'west_european',
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european',
    'nl' => 'west_european',
    'no' => 'west_european',
    'pl' => 'east_european',
    'pt' => 'west_european',
    'ro' => 'east_european',
    'ru' => 'cyrillic',
    'sk' => 'east_european',
    'sl' => 'east_european',
    'sr' => 'cyrillic',
    'sv' => 'west_european',
    'th' => 'thai',
    'tr' => 'turkish',
    'uk' => 'cyrillic',
    'vi' => 'vietnamese',
    'zh' => 'chinese',

    // Three-letter language identifiers (as used by Windows locales)
    'ara' => 'arabic',
    'bgr' => 'cyrillic',
    'cat' => 'west_european',
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european',
    'dan' => 'west_european',
    'deu' => 'west_european',
    'dea' => 'west_european',
    'des' => 'west_european',
    'ena' => 'west_european',
    'enc' => 'west_european',
    'eng' => 'west_european',
    'enz' => 'west_european',
    'enu' => 'west_european',
    'euq' => 'west_european',
    'fos' => 'west_european',
    'far' => 'arabic',
    'fin' => 'west_european',
    'fra' => 'west_european',
    'frb' => 'west_european',
    'frc' => 'west_european',
    'frs' => 'west_european',
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode',
    'hun' => 'east_european',
    'isl' => 'west_european',
    'ita' => 'west_european',
    'its' => 'west_european',
    'jpn' => 'japanese',
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european',
    'msl' => 'west_european',
    'nlb' => 'west_european',
    'nld' => 'west_european',
    'nor' => 'west_european',
    'non' => 'west_european',
    'plk' => 'east_european',
    'ptg' => 'west_european',
    'ptb' => 'west_european',
    'rom' => 'east_european',
    'rus' => 'cyrillic',
    'slv' => 'east_european',
    'sky' => 'east_european',
    'srl' => 'east_european',
    'srb' => 'cyrillic',
    'esp' => 'west_european',
    'esm' => 'west_european',
    'esn' => 'west_european',
    'sve' => 'west_european',
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic',

    // English language names
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'svedish' => 'west_european',   // legacy misspelling of 'swedish', kept for compatibility
    'thai' => 'thai',
    'that' => 'thai',               // legacy misspelling of 'thai', kept for compatibility
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic',
);
00396
00397
// Script family => default charset used by get_locale_charset() when TYPO3_OS is not 'WIN'.
// An empty value ('vietnamese') or a missing script makes get_locale_charset()
// fall back to 'iso-8859-1'.
var $script_to_charset_unix=array(
'west_european' => 'iso-8859-1',
'estonian' => 'iso-8859-1',
'east_european' => 'iso-8859-2',
'baltic' => 'iso-8859-4',
'cyrillic' => 'iso-8859-5',
'arabic' => 'iso-8859-6',
'greek' => 'iso-8859-7',
'hebrew' => 'iso-8859-8',
'turkish' => 'iso-8859-9',
'thai' => 'iso-8859-11',
'lithuanian' => 'iso-8859-13',
'chinese' => 'gb2312',
'japanese' => 'euc-jp',
'korean' => 'euc-kr',
'simpl_chinese' => 'gb2312',
'trad_chinese' => 'big5',
'vietnamese' => '',
'unicode' => 'utf-8',
);
00418
00419
// Script family => default charset used by get_locale_charset() when TYPO3_OS is 'WIN'.
// Scripts missing here (e.g. 'unicode') make get_locale_charset() use its Windows fallback.
// NOTE(review): 'cp874' is not a key of $synonyms (only 'win874' is) and 'cp949' is an
// alias there for 'euc-kr'; the values are returned as-is, without parse_charset() — confirm
// that callers can handle these names.
var $script_to_charset_windows=array(
'east_european' => 'windows-1250',
'cyrillic' => 'windows-1251',
'west_european' => 'windows-1252',
'greek' => 'windows-1253',
'turkish' => 'windows-1254',
'hebrew' => 'windows-1255',
'arabic' => 'windows-1256',
'baltic' => 'windows-1257',
'estonian' => 'windows-1257',
'lithuanian' => 'windows-1257',
'vietnamese' => 'windows-1258',
'thai' => 'cp874',
'korean' => 'cp949',
'chinese' => 'gb2312',
'japanese' => 'shift_jis',
'simpl_chinese' => 'gb2312',
'trad_chinese' => 'big5',
);
00439
00440
/**
 * Complete locale string => charset, checked by get_locale_charset() before
 * any parsing of the locale. get_locale_charset() lowercases the locale
 * before the lookup, so only lowercase keys can match.
 */
var $locale_to_charset=array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    'sr@latn' => 'iso-8859-2',
    'sr@Latn' => 'iso-8859-2',   // historic mixed-case key, kept for code reading the table directly
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5',
);
00450
00451
00452
// Language key => charset name.
// NOTE(review): the keys look like TYPO3 backend language keys and an empty string
// presumably means "use the default charset" — nothing in this part of the file reads
// the table, so confirm against the callers.
var $charSetArray = array(
'dk' => '',
'de' => '',
'no' => '',
'it' => '',
'fr' => '',
'es' => '',
'nl' => '',
'cz' => 'windows-1250',
'pl' => 'iso-8859-2',
'si' => 'windows-1250',
'fi' => '',
'tr' => 'iso-8859-9',
'se' => '',
'pt' => '',
'ru' => 'windows-1251',
'ro' => 'iso-8859-2',
'ch' => 'gb2312',
'sk' => 'windows-1250',
'lt' => 'windows-1257',
'is' => 'utf-8',
'hr' => 'windows-1250',
'hu' => 'iso-8859-2',
'gl' => '',
'th' => 'iso-8859-11',
'gr' => 'iso-8859-7',
'hk' => 'big5',
'eu' => '',
'bg' => 'windows-1251',
'br' => '',
'et' => 'iso-8859-4',
'ar' => 'iso-8859-6',
'he' => 'utf-8',
'ua' => 'windows-1251',
'jp' => 'shift_jis',
'lv' => 'utf-8',
'vn' => 'utf-8',
'ca' => 'iso-8859-15',
'ba' => 'iso-8859-2',
'kr' => 'euc-kr',
'eo' => 'utf-8',
'my' => '',
'hi' => 'utf-8',
'fo' => 'utf-8',
'fa' => 'utf-8',
'sr' => 'utf-8'
);
00500
00501
00502
// Language key => language/locale code, listing only keys whose code differs from the key.
// NOTE(review): the keys match those of $charSetArray and the values look like
// ISO 639 codes (some with a region suffix) — the table is not read in this part
// of the file, so confirm the intended use against the callers.
var $isoArray = array(
'ba' => 'bs',
'br' => 'pt_BR',
'ch' => 'zh_CN',
'cz' => 'cs',
'dk' => 'da',
'si' => 'sl',
'se' => 'sv',
'gl' => 'kl',
'gr' => 'el',
'hk' => 'zh_HK',
'kr' => 'ko',
'ua' => 'uk',
'jp' => 'ja',
'vn' => 'vi',
);
00519
/**
 * Normalizes a charset name: lowercases it and resolves known aliases
 * through $this->synonyms.
 *
 * @param string $charset Charset name as supplied by the caller
 * @return string Lowercased name, replaced by its canonical form if it is a known synonym
 */
function parse_charset($charset) {
    $normalized = strtolower($charset);

    return isset($this->synonyms[$normalized])
        ? $this->synonyms[$normalized]
        : $normalized;
}
00533
/**
 * Determines the charset belonging to a locale string of the form
 * language[_country][.charset][@modifier].
 *
 * Resolution order: exact entry in $locale_to_charset, explicit ".charset"
 * part, the "@euro" modifier, then the script family of the language mapped
 * to the platform default (Windows or Unix table).
 *
 * @param string $locale Locale string, e.g. "de_DE.ISO-8859-15@euro"
 * @return string Charset name; falls back to 'windows-1252' (Windows) or 'iso-8859-1'
 */
function get_locale_charset($locale) {
    $locale = strtolower($locale);

    // Exact match. Keys are compared case-insensitively so that mixed-case
    // table entries (e.g. "sr@Latn") are reachable after the lowercasing above.
    $knownLocales = array_change_key_case($this->locale_to_charset, CASE_LOWER);
    if (isset($knownLocales[$locale])) return $knownLocales[$locale];

    // Split off the "@modifier" part (only the first modifier is considered).
    $parts = explode('@', $locale);
    $locale = $parts[0];
    $modifier = isset($parts[1]) ? $parts[1] : '';

    // An explicit ".charset" part wins over everything that follows.
    $parts = explode('.', $locale);
    $locale = $parts[0];
    $charset = isset($parts[1]) ? $parts[1] : '';
    if ($charset) return $this->parse_charset($charset);

    // Locales with the euro modifier imply Latin-9.
    if ($modifier == 'euro') return 'iso-8859-15';

    // Derive the script family from the language part ("de" in "de_DE").
    $parts = explode('_', $locale);
    $language = $parts[0];
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

    if (TYPO3_OS == 'WIN') {
        // Unknown script or empty table entry: Windows Western default.
        $cs = !empty($this->script_to_charset_windows[$script]) ? $this->script_to_charset_windows[$script] : 'windows-1252';
    } else {
        $cs = !empty($this->script_to_charset_unix[$script]) ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
    }

    return $cs;
}
00574
00575
00576
00577
00578
00579
00580
00581
00582
00583
00584
00585
00586
00587
00588
/**
 * Converts a string from one charset to another.
 *
 * When no entity fallback is requested (or the target is UTF-8), the
 * extension configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_convMethod']
 * ('mbstring', 'iconv' or 'recode') is tried first. If none is configured or
 * the conversion fails, the string is converted through UTF-8 using the
 * internal conversion tables.
 *
 * @param string $str String to convert
 * @param string $fromCS Source charset (lowercase, as returned by parse_charset())
 * @param string $toCS Target charset (lowercase, as returned by parse_charset())
 * @param boolean $useEntityForNoChar If set, characters missing in the target charset become numeric entities instead of the "no char" byte
 * @return string Converted string
 */
function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
    if ($fromCS==$toCS) return $str;

    // The PHP extensions cannot produce entities for unmappable characters,
    // so they are only used when that is not required.
    if ($toCS=='utf-8' || !$useEntityForNoChar) {
        $method = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
            ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
            : '';

        switch ($method) {
            case 'mbstring':
                $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
                if (false !== $conv_str) return $conv_str;
            break;

            case 'iconv':
                // //TRANSLIT lets iconv approximate characters missing in the target charset.
                $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
                if (false !== $conv_str) return $conv_str;
            break;

            case 'recode':
                $conv_str = recode_string($fromCS.'..'.$toCS,$str);
                if (false !== $conv_str) return $conv_str;
            break;
        }
        // Fall through to the table-based conversion if the extension failed.
    }

    if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
    if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
    return $str;
}
00627
/**
 * Converts every scalar value of an array (recursively) from one charset to
 * another by calling conv() on it. The array is modified in place.
 *
 * @param array $array Array to convert, passed by reference
 * @param string $fromCS Source charset
 * @param string $toCS Target charset
 * @param boolean $useEntityForNoChar Passed on to conv()
 * @return void
 */
function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
    foreach ($array as $idx => $unused) {
        if (!is_array($array[$idx])) {
            $array[$idx] = $this->conv($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
            continue;
        }

        // Nested array: descend and convert it in place.
        $this->convArray($array[$idx],$fromCS,$toCS,$useEntityForNoChar);
    }
}
00648
/**
 * Converts a string from the given charset to UTF-8 using the conversion
 * table loaded by initCharset().
 *
 * Characters without a table entry are replaced by chr($this->noCharByteVal).
 * Bytes below 128 of single-byte and EUC-based charsets are copied unchanged.
 *
 * @param string $str String in the local charset
 * @param string $charset Charset of $str (lowercase); a conversion table must exist for it
 * @return string|null UTF-8 string; null if the charset could not be initialised
 */
function utf8_encode($str,$charset) {
    if ($charset === 'utf-8') return $str;

    if ($this->initCharset($charset)) {
        $strLen = strlen($str);
        $outStr = '';
        $noChar = chr($this->noCharByteVal);
        $isTwoByte = isset($this->twoByteSets[$charset]);
        $isEucBased = isset($this->eucBasedSets[$charset]);

        for ($a=0; $a<$strLen; $a++) {
            $chr = $str[$a];
            $ord = ord($chr);

            if ($isTwoByte) {
                // Combine both bytes (big endian) into one code. A dangling
                // last byte is paired with 0 instead of reading past the end.
                $ord2 = ($a+1 < $strLen) ? ord($str[$a+1]) : 0;
                $ord = ($ord << 8) | $ord2;

                if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
                    $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
                } else {
                    $outStr .= $noChar;
                }
                $a++;
            } elseif ($ord > 127) {
                if ($isEucBased) {
                    // In shift_jis the bytes 0xA0-0xDF are single-byte characters;
                    // every other high byte starts a two-byte sequence.
                    if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) {
                        $a++;
                        $ord2 = ($a < $strLen) ? ord($str[$a]) : 0;
                        $ord = $ord*256 + $ord2;
                    }
                }

                if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
                    $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
                } else {
                    $outStr .= $noChar;
                }
            } else {
                $outStr .= $chr;
            }
        }
        return $outStr;
    }
}
00693
/**
 * Converts a UTF-8 string to the given local charset using the table loaded
 * by initCharset().
 *
 * @param string $str UTF-8 string
 * @param string $charset Target charset (lowercase); a conversion table must exist for it
 * @param boolean $useEntityForNoChar If set, characters missing in the target charset are written as hexadecimal numeric entities instead of the "no char" byte
 * @return string|null Converted string; null if the charset could not be initialised
 */
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
    if (!$this->initCharset($charset)) {
        return;
    }

    $fallback = chr($this->noCharByteVal);
    $total = strlen($str);
    $result = '';

    for ($pos = 0; $pos < $total; $pos++) {
        $first = substr($str,$pos,1);
        $code = ord($first);

        // Plain ASCII is passed through.
        if ($code <= 127) {
            $result .= $first;
            continue;
        }

        // A high byte without bit 6 is a stray continuation byte.
        if (!($code & 64)) {
            $result .= $fallback;
            continue;
        }

        // Each further leading 1-bit of the first byte announces one continuation byte.
        $sequence = $first;
        $shifted = $code;
        for ($step = 0; $step < 8; $step++) {
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str,$pos,1);
        }

        if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
            $local = $this->parsedCharsets[$charset]['utf8'][$sequence];
            // Codes above 255 belong to two-byte charsets: emit high byte, then low byte.
            $result .= ($local > 255)
                ? chr(($local >> 8) & 255).chr($local & 255)
                : chr($local);
        } elseif ($useEntityForNoChar) {
            $result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
        } else {
            $result .= $fallback;
        }
    }

    return $result;
}
00738
/**
 * Replaces every multi-byte UTF-8 character of a string by a hexadecimal
 * numeric entity ("&#x...;"). ASCII bytes are kept; stray continuation bytes
 * become chr($this->noCharByteVal).
 *
 * @param string $str UTF-8 string
 * @return string String containing only ASCII bytes and entities
 */
function utf8_to_entities($str) {
    $total = strlen($str);
    $result = '';

    for ($pos = 0; $pos < $total; $pos++) {
        $first = substr($str,$pos,1);
        $code = ord($first);

        if ($code <= 127) {
            $result .= $first;
            continue;
        }

        if (!($code & 64)) {
            // Continuation byte without a preceding start byte.
            $result .= chr($this->noCharByteVal);
            continue;
        }

        // Collect the continuation bytes announced by the leading 1-bits.
        $sequence = $first;
        $shifted = $code;
        for ($step = 0; $step < 8; $step++) {
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str,$pos,1);
        }

        $result .= '&#'.$this->utf8CharToUnumber($sequence,1).';';
    }

    return $result;
}
00770
/**
 * Converts numeric entities (decimal "&#1234;" and hexadecimal "&#x1b;") of a
 * string to UTF-8 characters; optionally named HTML entities as well.
 * Entities that cannot be resolved are left untouched.
 *
 * @param string $str Input string (assumed to be UTF-8 apart from the entities)
 * @param boolean $alsoStdHtmlEnt If set, named entities such as "&amp;" or "&auml;" are converted too
 * @return string String with the entities replaced by UTF-8 byte sequences
 */
function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
    $trans_tbl = array();
    if ($alsoStdHtmlEnt) {
        // Flags and encoding are pinned: the characters of the table are fed to
        // utf8_encode(..., 'iso-8859-1') below, and PHP's defaults changed over time.
        $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
    }

    // Splitting with a captured group yields: text, entity name, text, entity name, ...
    // so every odd index holds what stood between "&" and ";".
    $parts = preg_split('/&([#[:alnum:]]*);/', (string)$str, -1, PREG_SPLIT_DELIM_CAPTURE);
    if ($parts === false) {
        return $str;
    }

    foreach ($parts as $k => $v) {
        if ($k % 2) {
            if (substr($v,0,1) == '#') {
                if (strtolower(substr($v,1,1)) == 'x') {
                    // Hexadecimal notation, "x" or "X".
                    $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
                } else {
                    $parts[$k] = $this->UnumberToChar(intval(substr($v,1)));
                }
            } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&'.$v.';'])) {
                $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
            } else {
                // Unknown entity: restore it unchanged.
                $parts[$k] = '&'.$v.';';
            }
        }
    }

    return implode('',$parts);
}
00803
/**
 * Splits a UTF-8 string into an array with one element per character:
 * either the Unicode number of the character or the character itself.
 *
 * @param string $str UTF-8 string
 * @param boolean $convEntities If set, entities (numeric and named) are converted to UTF-8 characters first
 * @param boolean $retChar If set, the elements are the characters (byte sequences) instead of their numbers
 * @return array One element per character of the input
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
    if ($convEntities) {
        $str = $this->entities_to_utf8($str,1);
    }

    $total = strlen($str);
    $result = array();

    for ($pos = 0; $pos < $total; $pos++) {
        $first = substr($str,$pos,1);
        $code = ord($first);

        if ($code <= 127) {
            // ASCII character.
            $result[] = $retChar ? chr($code) : $code;
            continue;
        }

        if (!($code & 64)) {
            // Stray continuation byte: use the replacement character.
            $result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
            continue;
        }

        // Start byte: collect the announced continuation bytes.
        $sequence = $first;
        $shifted = $code;
        for ($step = 0; $step < 8; $step++) {
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            $pos++;
            $sequence .= substr($str,$pos,1);
        }

        $result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
    }

    return $result;
}
00842
/**
 * Encodes a Unicode number as a UTF-8 byte sequence of one to six bytes.
 *
 * @param integer $cbyte Unicode number
 * @return string UTF-8 byte sequence; chr($this->noCharByteVal) for numbers of 0x80000000 and above
 */
function UnumberToChar($cbyte) {
    // Seven bits fit into a single byte.
    if ($cbyte < 0x80) {
        return chr($cbyte);
    }

    // Sequence length and start-byte marker by value range.
    if ($cbyte < 0x800) {
        $length = 2;
        $marker = 0xC0;
    } elseif ($cbyte < 0x10000) {
        $length = 3;
        $marker = 0xE0;
    } elseif ($cbyte < 0x200000) {
        $length = 4;
        $marker = 0xF0;
    } elseif ($cbyte < 0x4000000) {
        $length = 5;
        $marker = 0xF8;
    } elseif ($cbyte < 0x80000000) {
        $length = 6;
        $marker = 0xFC;
    } else {
        return chr($this->noCharByteVal);
    }

    // Start byte carries the topmost bits, each continuation byte the next six.
    $shift = 6 * ($length - 1);
    $bytes = chr($marker | ($cbyte >> $shift));
    while ($shift > 0) {
        $shift -= 6;
        $bytes .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
    }

    return $bytes;
}
00897
/**
 * Converts a single UTF-8 character (byte sequence) to its Unicode number.
 *
 * @param string $str Byte sequence of one UTF-8 character
 * @param boolean $hex If set, the number is returned as a hexadecimal string prefixed with "x"
 * @return integer|string Unicode number, or "x" followed by hex digits if $hex is set
 */
function utf8CharToUnumber($str,$hex=0) {
    $lead = ord(substr($str,0,1));

    if (($lead & 192) != 192) {
        // Not a multi-byte start byte: the byte value is the number.
        $number = $lead;
    } else {
        // Gather six payload bits from every continuation byte announced
        // by the leading 1-bits of the start byte.
        $payload = '';
        $shifted = $lead;
        for ($b = 0; $b < 8; $b++) {
            $shifted = $shifted << 1;
            if (!($shifted & 128)) {
                break;
            }
            $payload .= sprintf('%06b', ord(substr($str,$b+1,1)) & 0x3F);
        }

        // The start byte contributes its lowest (6 - $b) bits.
        $leadBits = substr('00000000'.decbin($lead), -(6-$b));
        $number = bindec($leadBits.$payload);
    }

    return $hex ? 'x'.dechex($number) : $number;
}
00925
00926
00927
00928
00929
00930
00931
00932
00933
00934
00935
00936
00937
00938
00939
/**
 * Loads the conversion table of a charset into $this->parsedCharsets.
 *
 * The table is read from PATH_t3lib.'csconvtbl/<charset>.tbl'. Two line
 * formats are recognised: "0xNN<space>0xNNNN<space>..." ("whitespaced") and
 * "0xNN=U+NNNN:..." ("ms-token"); the format is detected from the first data
 * line. Only local codes above 127 are stored. The parsed result is cached
 * as a serialized array in typo3temp/cs/.
 *
 * @param string $charset Charset name (lowercase), used to build the table file name
 * @return integer|false 1 if already loaded, 2 if loaded from cache or table file, false if no table file exists
 */
function initCharset($charset) {
    // Already parsed during this request.
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }

    $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
    if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
        return false;
    }

    // Try the cached, already parsed table first.
    // NOTE(review): unserialize() is applied to a file below typo3temp/; this is
    // only safe as long as that directory cannot be written by untrusted parties.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cached = unserialize(t3lib_div::getUrl($cacheFile));
        if (is_array($cached)) {
            $this->parsedCharsets[$charset] = $cached;
            return 2;
        }
        // Unreadable or corrupt cache: rebuild it from the table file below.
    }

    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);

    $this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

    $detectedType = '';
    foreach ($lines as $value) {
        // Skip blank lines and comment lines.
        if (!trim($value) || substr($value,0,1)=='#') {
            continue;
        }

        // The first data line decides the format for the whole file.
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
        }

        if ($detectedType=='ms-token') {
            $fields = preg_split('/=|:/',$value,3);
            $hexbyte = $fields[0];
            $utf8 = isset($fields[1]) ? $fields[1] : '';
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern,$value,$regA)) {
                continue;   // malformed line, nothing to store
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+'.$regA[2];
        }

        $decval = hexdec(trim($hexbyte));
        if ($decval>127) {
            // Strip the "U+" prefix to get the Unicode number.
            $utf8decval = hexdec(substr(trim($utf8),2));
            $utf8Char = $this->UnumberToChar($utf8decval);
            $this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
            $this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
    }

    return 2;
}
01002
/**
 * Builds the UTF-8 case folding table ($this->caseFolding['utf-8']) and the
 * UTF-8 to ASCII transliteration table ($this->toASCII['utf-8']).
 *
 * Both tables are derived from PATH_t3lib.'unidata/UnicodeData.txt' and,
 * if present, SpecialCasing.txt and Translit.txt in the same directory.
 * Only code points up to U+FFFF are processed. Both results are cached as
 * serialized arrays in typo3temp/cs/.
 *
 * @param string $mode 'case' or 'ascii': the table the caller needs; used only to short-cut through memory or cache
 * @return integer|false 1 if the requested table is already loaded, 2 if it was read from cache, 3 if both tables were built from the data files, false if UnicodeData.txt is not available
 */
function initUnicodeData($mode=null) {
    $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
    $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

    // Only the requested table is looked up in memory / cache.
    switch ($mode) {
        case 'case':
            if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) return 1;

            if ($cacheFileCase && @is_file($cacheFileCase)) {
                $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
                return 2;
            }
        break;

        case 'ascii':
            if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) return 1;

            if ($cacheFileASCII && @is_file($cacheFileASCII)) {
                $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
                return 2;
            }
        break;
    }

    // Process the main Unicode data file.
    $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
    if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;

    $fh = fopen($unicodeDataFile,'rb');
    if (!$fh) return false;

    // Case folding table, keyed by UTF-8 character.
    $this->caseFolding['utf-8'] = array();
    $utf8CaseFolding =& $this->caseFolding['utf-8'];
    $utf8CaseFolding['toUpper'] = array();
    $utf8CaseFolding['toLower'] = array();
    $utf8CaseFolding['toTitle'] = array();

    $decomposition = array();   // "U+XXXX" => list of hex code values it decomposes to
    $mark = array();            // "U+XXXX" => 1 for characters of a mark category (M*)
    $number = array();          // "U+XXXX" => numeric value of non-ASCII number characters
    $omit = array();            // "U+XXXX" => 1 for characters to be dropped entirely

    while (!feof($fh)) {
        $line = rtrim((string)fgets($fh,4096));
        if ($line === '') continue;   // blank line or failed read at end of file

        // Fields: 0 code, 1 name, 2 category, 5 decomposition, 8 numeric value,
        // 12 uppercase, 13 lowercase, 14 titlecase mapping.
        $fields = array_pad(explode(';',$line),15,'');
        $char = $fields[0];
        $name = $fields[1];
        $cat = $fields[2];
        $decomp = $fields[5];
        $num = $fields[8];
        $upper = $fields[12];
        $lower = $fields[13];
        $title = $fields[14];

        $ord = hexdec($char);
        if ($ord > 0xFFFF) break;   // only the lower 16 bits are processed

        $utf8_char = $this->UnumberToChar($ord);

        if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
        if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
        // Title case is stored only where it differs from upper case.
        if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

        switch (substr($cat,0,1)) {
            case 'M':   // mark (accent, umlaut, ...)
                $mark["U+$char"] = 1;
            break;

            case 'N':   // numeric value
                if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
        }

        // Accented Latin letters without a decomposition map to their base letter.
        $match = array();
        if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
            $c = ord($match[2]);
            if ($match[1] == 'SMALL') $c += 32;

            $decomposition["U+$char"] = array(dechex($c));
            continue;
        }

        $match = array();
        if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
            switch ($match[1]) {
                case '<circle>':    // wrap in parentheses
                    $match[2] = '0028 '.$match[2].' 0029';
                break;

                case '<square>':    // wrap in square brackets
                    $match[2] = '005B '.$match[2].' 005D';
                break;

                case '<compat>':    // ignore compatibility forms starting with a space
                    if (strncmp($match[2],'0020 ',5) === 0) continue 2;
                break;

                // Presentation forms are ignored.
                case '<initial>':
                case '<medial>':
                case '<final>':
                case '<isolated>':
                case '<vertical>':
                continue 2;
            }
            $decomposition["U+$char"] = explode(' ',$match[2]);
        }
    }
    fclose($fh);

    // Additional, unconditional special casing rules.
    $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
    if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
        $fh = fopen($specialCasingFile,'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = (string)fgets($fh,4096);
                if (trim($line) === '' || $line[0] == '#') continue;

                $fields = array_pad(t3lib_div::trimExplode(';',$line),5,'');
                $char = $fields[0];
                $lower = $fields[1];
                $title = $fields[2];
                $upper = $fields[3];
                $cond = $fields[4];

                // Rules with a condition list are skipped; a field starting with
                // "#" is the trailing comment, not a condition.
                if ($cond == '' || $cond[0] == '#') {
                    $utf8_char = $this->UnumberToChar(hexdec($char));
                    if ($char != $lower) {
                        $arr = explode(' ',$lower);
                        for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                        $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
                    }
                    if ($char != $title && $title != $upper) {
                        $arr = explode(' ',$title);
                        for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                        $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
                    }
                    if ($char != $upper) {
                        $arr = explode(' ',$upper);
                        for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
                        $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
                    }
                }
            }
            fclose($fh);
        }
    }

    // Custom transliterations override what was derived from UnicodeData.txt.
    $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
    if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
        $fh = fopen($customTranslitFile,'rb');
        if ($fh) {
            while (!feof($fh)) {
                $line = (string)fgets($fh,4096);
                if (trim($line) === '' || $line[0] == '#') continue;

                $fields = array_pad(t3lib_div::trimExplode(';',$line),2,'');
                $char = $fields[0];
                $translit = $fields[1];
                // An empty transliteration means: drop the character.
                if (!$translit) $omit["U+$char"] = 1;
                $decomposition["U+$char"] = explode(' ',$translit);
            }
            fclose($fh);
        }
    }

    // Resolve decompositions recursively and strip mark characters.
    foreach ($decomposition as $from => $to) {
        $code_decomp = array();

        while ($code_value = array_shift($to)) {
            if (isset($decomposition["U+$code_value"])) {
                // Replace the value by its own decomposition, keeping the order.
                foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
                    array_unshift($to,$cv);
                }
            } elseif (!isset($mark["U+$code_value"])) {
                array_push($code_decomp,$code_value);
            }
        }
        if (count($code_decomp) || isset($omit[$from])) {
            $decomposition[$from] = $code_decomp;
        } else {
            unset($decomposition[$from]);
        }
    }

    // Build the ASCII table from decompositions consisting of ASCII characters only.
    $this->toASCII['utf-8'] = array();
    $ascii =& $this->toASCII['utf-8'];

    foreach ($decomposition as $from => $to) {
        $code_decomp = array();
        while ($code_value = array_shift($to)) {
            $ord = hexdec($code_value);
            if ($ord > 127) {
                continue 2;   // not representable in ASCII, skip this character
            } else {
                array_push($code_decomp,chr($ord));
            }
        }
        $ascii[$this->UnumberToChar(hexdec($from))] = implode('',$code_decomp);
    }

    // Number characters without a decomposition get their numeric value.
    foreach ($number as $from => $to) {
        $utf8_char = $this->UnumberToChar(hexdec($from));
        if (!isset($ascii[$utf8_char])) {
            $ascii[$utf8_char] = $to;
        }
    }

    if ($cacheFileCase) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
    }

    if ($cacheFileASCII) {
        t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
    }

    return 3;
}
01228
/**
 * Builds the case folding table of a (non UTF-8) charset in
 * $this->caseFolding[$charset] by translating the UTF-8 case folding table
 * through the conversion table of the charset. The result is cached as a
 * serialized array in typo3temp/cs/.
 *
 * @param string $charset Charset name (lowercase); a conversion table must exist for it
 * @return integer|false 1 if already loaded, 2 if read from cache, 3 if built, false if the charset or the Unicode data is unavailable
 */
function initCaseFolding($charset) {
    // Already available in memory.
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

    // Use the cached version if possible.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // Both the charset table and the UTF-8 case folding table are required.
    if (!$this->initCharset($charset)) {
        return false;
    }
    if (!$this->initUnicodeData('case')) {
        return false;
    }

    $nochar = chr($this->noCharByteVal);
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // The character in the local charset.
        $c = $this->utf8_decode($utf8,$charset);

        foreach (array('toUpper','toLower','toTitle') as $direction) {
            // Characters without a mapping in this direction are skipped.
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }

            // Keep the mapping only if the target exists in the local charset.
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8],$charset);
            if ($cc != '' && $cc != $nochar) $this->caseFolding[$charset][$direction][$c] = $cc;
        }
    }

    // The ASCII range is not part of the conversion tables, add it explicitly.
    for ($i=ord('a'); $i<=ord('z'); $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
    }
    for ($i=ord('A'); $i<=ord('Z'); $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
    }

    return 3;
}
01290
/**
 * Builds the "to ASCII" transliteration table of a (non UTF-8) charset in
 * $this->toASCII[$charset] by translating the UTF-8 table through the
 * conversion table of the charset. The result is cached as a serialized
 * array in typo3temp/cs/.
 *
 * @param string $charset Charset name (lowercase); a conversion table must exist for it
 * @return integer|false 1 if already loaded, 2 if read from cache, 3 if built, false if the charset or the Unicode data is unavailable
 */
function initToASCII($charset) {
    // Already available in memory.
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

    // Use the cached version if possible.
    $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
        return 2;
    }

    // Both the charset table and the UTF-8 "to ASCII" table are required.
    if (!$this->initCharset($charset)) {
        return false;
    }
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }

    // Start from an empty table so that a charset without any mappable
    // character is still recorded (and cached) as an array.
    $this->toASCII[$charset] = array();

    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // The character in the local charset.
        $c = $this->utf8_decode($utf8,$charset);

        if (isset($this->toASCII['utf-8'][$utf8])) {
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }

    if ($cacheFile) {
        t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
    }

    return 3;
}
01336
01337
01338
01339
01340
01341
01342
01343
01344
01345
01346
01347
01348
01349
01350
01351
01352
01353
01354
01355
01356
01357
/**
 * Charset-aware equivalent of PHP's substr(): positions and lengths count
 * characters, not bytes.
 *
 * Uses mbstring or iconv if $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] says
 * so; otherwise the internal UTF-8 / EUC helpers or plain byte arithmetic
 * for fixed-width and single-byte charsets.
 *
 * @param string $charset Charset of $string (lowercase)
 * @param string $string String to cut
 * @param integer $start Start position in characters
 * @param integer $len Length in characters; null means "to the end of the string"
 * @return string The substring (false may be returned by iconv for invalid input)
 */
function substr($charset,$string,$start,$len=null) {
    if ($len===0) return '';

    $utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
        : '';
    $toEnd = ($len==null);   // no length given: return everything from $start on

    if ($utils == 'mbstring') {
        // Passing the total length instead of switching the global internal
        // encoding keeps the call free of side effects.
        if ($toEnd) {
            return mb_substr($string,$start,mb_strlen($string,$charset),$charset);
        }
        return mb_substr($string,$start,$len,$charset);
    } elseif ($utils == 'iconv') {
        if ($toEnd) {
            $total = iconv_strlen($string,$charset);
            if ($total === false) return false;   // input is not valid in $charset
            return iconv_substr($string,$start,$total,$charset);
        }
        return iconv_substr($string,$start,$len,$charset);
    } elseif ($charset == 'utf-8') {
        return $this->utf8_substr($string,$start,$len);
    } elseif (isset($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string,$start,$charset,$len);
    } elseif (isset($this->twoByteSets[$charset])) {
        // Fixed width: scale positions to bytes. Without a length the rest of
        // the string is returned (scaling null would yield an empty string).
        return $toEnd ? substr($string,$start*2) : substr($string,$start*2,$len*2);
    } elseif (isset($this->fourByteSets[$charset])) {
        return $toEnd ? substr($string,$start*4) : substr($string,$start*4,$len*4);
    }

    // Single-byte charset: characters are bytes.
    return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
}
01412
/**
 * Charset-aware equivalent of PHP's strlen(): counts characters, not bytes.
 *
 * Uses mbstring or iconv if $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] says
 * so; otherwise the internal UTF-8 / EUC helpers or plain byte arithmetic
 * for fixed-width and single-byte charsets.
 *
 * @param string $charset Charset of $string (lowercase)
 * @param string $string String to measure
 * @return integer Number of characters (for fixed-width charsets the byte length divided by the character width)
 */
function strlen($charset,$string) {
    $utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
        : '';

    if ($utils == 'mbstring') {
        return mb_strlen($string,$charset);
    } elseif ($utils == 'iconv') {
        return iconv_strlen($string,$charset);
    } elseif ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    } elseif (isset($this->eucBasedSets[$charset])) {
        return $this->euc_strlen($string,$charset);
    } elseif (isset($this->twoByteSets[$charset])) {
        return strlen($string)/2;
    } elseif (isset($this->fourByteSets[$charset])) {
        return strlen($string)/4;
    }

    // Single-byte charset: characters are bytes.
    return strlen($string);
}
01440
/**
 * Truncates a string to a number of characters and adds a marker where it
 * was cut. The string is returned unchanged if it is not longer than the
 * requested length.
 *
 * @param string $charset Charset of $string (lowercase)
 * @param string $string String to crop
 * @param integer $len Positive: keep the first $len characters; negative: keep the last -$len characters; 0: no cropping
 * @param string $crop Marker appended (positive $len) or prepended (negative $len) when the string was cut
 * @return string The cropped string
 */
function crop($charset,$string,$len,$crop='') {
    if (intval($len) == 0) return $string;

    // Translate the character position to a byte position.
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string,$len);
    } elseif (isset($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string,$len,$charset);
    } else {
        if ($len > 0) {
            $i = $len;
        } else {
            $i = strlen($string)+$len;
            if ($i<=0) $i = false;
        }
    }

    // The position could not be determined: nothing to crop.
    if ($i === false) {
        return $string;
    }

    $i = intval($i);
    if ($len > 0) {
        // Cut only if there is at least one byte at the cut position.
        if (isset($string[$i])) {
            return substr($string,0,$i).$crop;
        }
    } else {
        // Cut only if there is at least one byte before the cut position.
        if ($i > 0 && isset($string[$i-1])) {
            return $crop.substr($string,$i);
        }
    }

    return $string;
}
01495
/**
 * Cuts a string to at most $len BYTES without splitting a multi-byte
 * character.
 *
 * @param string $charset Charset of $string (lowercase)
 * @param string $string String to truncate
 * @param integer $len Maximum length in bytes
 * @return string The truncated string; empty if $len is not positive
 */
function strtrunc($charset,$string,$len) {
    if ($len <= 0) return '';

    $utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
        : '';

    if ($utils == 'mbstring') {
        return mb_strcut($string,0,$len,$charset);
    } elseif ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string,$len);
    } elseif (isset($this->eucBasedSets[$charset])) {
        // The byte limit has to be handed on, otherwise the helper cannot truncate.
        // NOTE(review): assumes the signature euc_strtrunc($str,$len,$charset),
        // defined outside this part of the file — confirm.
        return $this->euc_strtrunc($string,$len,$charset);
    } elseif (isset($this->twoByteSets[$charset])) {
        // Round down to a whole number of two-byte characters.
        if ($len % 2) $len--;
    } elseif (isset($this->fourByteSets[$charset])) {
        // Round down to a whole number of four-byte characters.
        $len -= $len % 4;
    }

    return substr($string,0,$len);
}
01524
/**
 * Converts the case of a string in the given charset.
 *
 * mbstring is used if configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']
 * and PHP is at least 4.3; otherwise the internal mapping functions for
 * UTF-8, EUC-based and single-byte charsets are used.
 *
 * @param string $charset Charset of $string (lowercase)
 * @param string $string String to convert
 * @param string $case 'toLower' or 'toUpper' (with mbstring, anything but 'toLower' converts to upper case)
 * @return string The converted string
 */
function conv_case($charset,$string,$case) {
    $viaMbstring = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3;

    if ($viaMbstring) {
        return ($case == 'toLower')
            ? mb_strtolower($string,$charset)
            : mb_strtoupper($string,$charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string,'case',$case);
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string,$charset,'case',$case);
    }

    // Everything else is treated as a single-byte charset.
    return $this->sb_char_mapping($string,$charset,'case',$case);
}
01558
01566 function specCharsToASCII($charset,$string) {
01567 if ($charset == 'utf-8') {
01568 $string = $this->utf8_char_mapping($string,'ascii');
01569 } elseif (isset($this->eucBasedSets[$charset])) {