Documentation TYPO3 par Ameos |
00001 <?php 00002 /*************************************************************** 00003 * Copyright notice 00004 * 00005 * (c) 2003-2005 Kasper Skaarhoj (kasperYYYY@typo3.com) 00006 * All rights reserved 00007 * 00008 * This script is part of the Typo3 project. The Typo3 project is 00009 * free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 2 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * The GNU General Public License can be found at 00015 * http://www.gnu.org/copyleft/gpl.html. 00016 * 00017 * This script is distributed in the hope that it will be useful, 00018 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00019 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00020 * GNU General Public License for more details. 00021 * 00022 * This copyright notice MUST APPEAR in all copies of the script! 00023 ***************************************************************/ 00136 class t3lib_cs { 00137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent. 
00138 00139 // This is the array where parsed conversion tables are stored (cached) 00140 var $parsedCharsets=array(); 00141 00142 // An array where case folding data will be stored (cached) 00143 var $caseFolding=array(); 00144 00145 // An array where charset-to-ASCII mappings are stored (cached) 00146 var $toASCII=array(); 00147 00148 // This tells the converter which charsets has two bytes per char: 00149 var $twoByteSets=array( 00150 'ucs-2'=>1, // 2-byte Unicode 00151 ); 00152 00153 // This tells the converter which charsets has four bytes per char: 00154 var $fourByteSets=array( 00155 'ucs-4'=>1, // 4-byte Unicode 00156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16) 00157 ); 00158 00159 // This tells the converter which charsets use a scheme like the Extended Unix Code: 00160 var $eucBasedSets=array( 00161 'gb2312'=>1, // Chinese, simplified. 00162 'big5'=>1, // Chinese, traditional. 00163 'euc-kr'=>1, // Korean 00164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80! 
00165 ); 00166 00167 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html 00168 // http://czyborra.com/charsets/iso8859.html 00169 var $synonyms=array( 00170 'us' => 'ascii', 00171 'us-ascii'=> 'ascii', 00172 'cp819' => 'iso-8859-1', 00173 'ibm819' => 'iso-8859-1', 00174 'iso-ir-100' => 'iso-8859-1', 00175 'iso-ir-109' => 'iso-8859-2', 00176 'iso-ir-148' => 'iso-8859-9', 00177 'iso-ir-199' => 'iso-8859-14', 00178 'iso-ir-203' => 'iso-8859-15', 00179 'csisolatin1' => 'iso-8859-1', 00180 'csisolatin2' => 'iso-8859-2', 00181 'csisolatin3' => 'iso-8859-3', 00182 'csisolatin5' => 'iso-8859-9', 00183 'csisolatin8' => 'iso-8859-14', 00184 'csisolatin9' => 'iso-8859-15', 00185 'csisolatingreek' => 'iso-8859-7', 00186 'iso-celtic' => 'iso-8859-14', 00187 'latin1' => 'iso-8859-1', 00188 'latin2' => 'iso-8859-2', 00189 'latin3' => 'iso-8859-3', 00190 'latin5' => 'iso-8859-9', 00191 'latin6' => 'iso-8859-10', 00192 'latin8' => 'iso-8859-14', 00193 'latin9' => 'iso-8859-15', 00194 'l1' => 'iso-8859-1', 00195 'l2' => 'iso-8859-2', 00196 'l3' => 'iso-8859-3', 00197 'l5' => 'iso-8859-9', 00198 'l6' => 'iso-8859-10', 00199 'l8' => 'iso-8859-14', 00200 'l9' => 'iso-8859-15', 00201 'cyrillic' => 'iso-8859-5', 00202 'arabic' => 'iso-8859-6', 00203 'tis-620' => 'iso-8859-11', 00204 'win874' => 'windows-874', 00205 'win1250' => 'windows-1250', 00206 'win1251' => 'windows-1251', 00207 'win1252' => 'windows-1252', 00208 'win1253' => 'windows-1253', 00209 'win1254' => 'windows-1254', 00210 'win1255' => 'windows-1255', 00211 'win1256' => 'windows-1256', 00212 'win1257' => 'windows-1257', 00213 'win1258' => 'windows-1258', 00214 'cp1250' => 'windows-1250', 00215 'cp1251' => 'windows-1251', 00216 'cp1252' => 'windows-1252', 00217 'ms-ee' => 'windows-1250', 00218 'ms-ansi' => 'windows-1252', 00219 'ms-greek' => 'windows-1253', 00220 'ms-turk' => 'windows-1254', 00221 'winbaltrim' => 'windows-1257', 00222 'koi-8ru' => 'koi-8r', 
00223 'koi8r' => 'koi-8r', 00224 'cp878' => 'koi-8r', 00225 'mac' => 'macroman', 00226 'macintosh' => 'macroman', 00227 'euc-cn' => 'gb2312', 00228 'x-euc-cn' => 'gb2312', 00229 'euccn' => 'gb2312', 00230 'cp936' => 'gb2312', 00231 'big-5' => 'big5', 00232 'cp950' => 'big5', 00233 'eucjp' => 'euc-jp', 00234 'sjis' => 'shift_jis', 00235 'shift-jis' => 'shift_jis', 00236 'cp932' => 'shift_jis', 00237 'cp949' => 'euc-kr', 00238 'utf7' => 'utf-7', 00239 'utf8' => 'utf-8', 00240 'utf16' => 'utf-16', 00241 'utf32' => 'utf-32', 00242 'utf8' => 'utf-8', 00243 'ucs2' => 'ucs-2', 00244 'ucs4' => 'ucs-4', 00245 ); 00246 00247 // mapping of iso-639:2 language codes to language (family) names 00248 var $lang_to_langfamily=array( 00249 // iso-639:2 language codes, see: 00250 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm 00251 // http://www.unicode.org/onlinedat/languages.html 00252 'ar' => 'arabic', 00253 'bg' => 'cyrillic', 00254 'cs' => 'east_european', 00255 'da' => 'west_european', 00256 'de' => 'west_european', 00257 'es' => 'west_european', 00258 'et' => 'estonian', 00259 'eu' => 'west_european', 00260 'fi' => 'west_european', 00261 'fr' => 'west_european', 00262 'gr' => 'greek', 00263 'hr' => 'east_european', 00264 'hu' => 'east_european', 00265 'iw' => 'hebrew', 00266 'is' => 'west_european', 00267 'it' => 'west_european', 00268 'ja' => 'japanese', 00269 'kl' => 'west_european', 00270 'ko' => 'korean', 00271 'lt' => 'lithuanian', 00272 'lv' => 'west_european', // Latvian/Lettish 00273 'nl' => 'west_european', 00274 'no' => 'west_european', 00275 'pl' => 'east_european', 00276 'pt' => 'west_european', 00277 'ro' => 'east_european', 00278 'ru' => 'cyrillic', 00279 'sk' => 'east_european', 00280 'sl' => 'east_european', 00281 'sv' => 'west_european', 00282 'th' => 'thai', 00283 'uk' => 'cyrillic', 00284 'vi' => 'vietnamese', 00285 'zh' => 'chinese', 00286 // MS language codes, see 
http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp 00287 'chs' => 'simpl_chinese', 00288 'cht' => 'trad_chinese', 00289 'csy' => 'east_european', 00290 'dan' => 'west_european', 00291 'deu' => 'west_european', 00292 'dea' => 'west_european', 00293 'des' => 'west_european', 00294 'ena' => 'west_european', 00295 'enc' => 'west_european', 00296 'eng' => 'west_european', 00297 'enz' => 'west_european', 00298 'enu' => 'west_european', 00299 'nld' => 'west_european', 00300 'nlb' => 'west_european', 00301 'fin' => 'west_european', 00302 'fra' => 'west_european', 00303 'frb' => 'west_european', 00304 'frc' => 'west_european', 00305 'frs' => 'west_european', 00306 'ell' => 'greek', 00307 'hun' => 'east_european', 00308 'isl' => 'west_european', // family name must match a key of the lang_to_charset_* arrays 00309 'ita' => 'west_european', 00310 'its' => 'west_european', 00311 'jpn' => 'japanese', 00312 'kor' => 'korean', 00313 'nor' => 'west_european', 00314 'non' => 'west_european', 00315 'plk' => 'east_european', 00316 'ptg' => 'west_european', 00317 'ptb' => 'west_european', 00318 'rus' => 'cyrillic', // same family as 'ru' and 'russian' 00319 'sky' => 'east_european', 00320 'esp' => 'west_european', 00321 'esm' => 'west_european', 00322 'esn' => 'west_european', 00323 'sve' => 'west_european', 00324 'trk' => 'turkish', 00325 // English language names 00326 'bulgarian' => 'cyrillic', // same family as 'bg' 00327 'catalan' => 'west_european', 00328 'croatian' => 'east_european', 00329 'czech' => 'east_european', 00330 'danish' => 'west_european', 00331 'dutch' => 'west_european', 00332 'english' => 'west_european', 00333 'finnish' => 'west_european', 00334 'french' => 'west_european', 00335 'galician' => 'west_european', 00336 'german' => 'west_european', 00337 'hungarian' => 'east_european', 00338 'icelandic' => 'west_european', 00339 'italian' => 'west_european', 00340 'latvian' => 'west_european', 00341 'lettish' => 'west_european', 00342 'norwegian' => 'west_european', 00343 'polish' => 'east_european', 00344 'portuguese' => 
'west_european', 00345 'russian' => 'cyrillic', 00346 'romanian' => 'east_european', 00347 'slovak' => 'east_european', 00348 'slovenian' => 'east_european', 00349 'spanish' => 'west_european', 00350 'svedish' => 'west_european', // misspelled key, kept for backward compatibility 'swedish' => 'west_european', 00351 'turkish' => 'turkish', // same family as 'trk'; resolves to iso-8859-9 / windows-1254 00352 'ukrainian' => 'cyrillic', 00353 ); 00354 00355 // mapping of language (family) names to charsets on Unix 00356 var $lang_to_charset_unix=array( 00357 'west_european' => 'iso-8859-1', 00358 'estonian' => 'iso-8859-1', 00359 'east_european' => 'iso-8859-2', 00360 'baltic' => 'iso-8859-4', 00361 'cyrillic' => 'iso-8859-5', 00362 'arabic' => 'iso-8859-6', 00363 'greek' => 'iso-8859-7', 00364 'hebrew' => 'iso-8859-8', 00365 'turkish' => 'iso-8859-9', 00366 'thai' => 'iso-8859-11', // = TIS-620 00367 'lithuanian' => 'iso-8859-13', 00368 'chinese' => 'gb2312', // = euc-cn 00369 'japanese' => 'euc-jp', 00370 'korean' => 'euc-kr', 00371 'simpl_chinese' => 'gb2312', 00372 'trad_chinese' => 'big5', 00373 'vietnamese' => '', 00374 ); 00375 00376 // mapping of language (family) names to charsets on Windows 00377 var $lang_to_charset_windows=array( 00378 'east_european' => 'windows-1250', 00379 'cyrillic' => 'windows-1251', 00380 'west_european' => 'windows-1252', 00381 'greek' => 'windows-1253', 00382 'turkish' => 'windows-1254', 00383 'hebrew' => 'windows-1255', 00384 'arabic' => 'windows-1256', 00385 'baltic' => 'windows-1257', 00386 'estonian' => 'windows-1257', 00387 'lithuanian' => 'windows-1257', 00388 'vietnamese' => 'windows-1258', 00389 'thai' => 'cp874', 00390 'korean' => 'cp949', 00391 'chinese' => 'gb2312', 00392 'japanese' => 'shift_jis', 00393 'simpl_chinese' => 'gb2312', 00394 'trad_chinese' => 'big5', 00395 ); 00396 00397 // mapping of locale names to charsets 00398 var $locale_to_charset=array( 00399 'japanese.euc' => 'euc-jp', 00400 'ja_jp.ujis' => 'euc-jp', 00401 'korean.euc' => 'euc-kr', 00402 'zh_cn' => 'gb2312', 00403 'zh_hk' => 'big5', 00404 'zh_tw' => 'big5', 00405 ); 00406 00407 // 
TYPO3 specific: Array with the system charsets used for each system language in TYPO3: 00408 // Empty values means "iso-8859-1" 00409 var $charSetArray = array( 00410 'dk' => '', 00411 'de' => '', 00412 'no' => '', 00413 'it' => '', 00414 'fr' => '', 00415 'es' => '', 00416 'nl' => '', 00417 'cz' => 'windows-1250', 00418 'pl' => 'iso-8859-2', 00419 'si' => 'windows-1250', 00420 'fi' => '', 00421 'tr' => 'iso-8859-9', 00422 'se' => '', 00423 'pt' => '', 00424 'ru' => 'windows-1251', 00425 'ro' => 'iso-8859-2', 00426 'ch' => 'gb2312', 00427 'sk' => 'windows-1250', 00428 'lt' => 'windows-1257', 00429 'is' => 'utf-8', 00430 'hr' => 'windows-1250', 00431 'hu' => 'iso-8859-2', 00432 'gl' => '', 00433 'th' => 'iso-8859-11', 00434 'gr' => 'iso-8859-7', 00435 'hk' => 'big5', 00436 'eu' => '', 00437 'bg' => 'windows-1251', 00438 'br' => '', 00439 'et' => 'iso-8859-4', 00440 'ar' => 'iso-8859-6', 00441 'he' => 'utf-8', 00442 'ua' => 'windows-1251', 00443 'jp' => 'shift_jis', 00444 'lv' => 'utf-8', 00445 'vn' => 'utf-8', 00446 'ca' => 'iso-8859-15', 00447 'ba' => 'iso-8859-2', 00448 'kr' => 'euc-kr', 00449 'eo' => 'utf-8', 00450 'my' => '', 00451 'hi' => 'utf-8', 00452 ); 00453 00454 // TYPO3 specific: Array with the iso names used for each system language in TYPO3: 00455 // Empty values means sames as Typo3 00456 var $isoArray = array( 00457 'dk' => 'da', 00458 'de' => '', 00459 'no' => '', 00460 'it' => '', 00461 'fr' => '', 00462 'es' => '', 00463 'nl' => '', 00464 'cz' => 'cs', 00465 'pl' => '', 00466 'si' => 'sl', 00467 'fi' => '', 00468 'tr' => '', 00469 'se' => 'sv', 00470 'pt' => '', 00471 'ru' => '', 00472 'ro' => '', 00473 'ch' => 'zh_CN', 00474 'sk' => '', 00475 'lt' => '', 00476 'is' => '', 00477 'hr' => '', 00478 'hu' => '', 00479 'gl' => '', // Greenlandic 00480 'th' => '', 00481 'gr' => 'el', 00482 'hk' => 'zh_HK', 00483 'eu' => '', 00484 'bg' => '', 00485 'br' => 'pt_BR', 00486 'et' => '', 00487 'ar' => '', 00488 'he' => 'iw', 00489 'ua' => 'uk', 00490 'jp' => 
'ja', 00491 'lv' => '', 00492 'vn' => 'vi', 00493 'ca' => '', 00494 'ba' => '', // Bosnian 00495 'kr' => '', 00496 ); 00497 00505 function parse_charset($charset) { 00506 $charset = strtolower($charset); 00507 if (isset($this->synonyms[$charset])) $charset = $this->synonyms[$charset]; 00508 00509 return $charset; 00510 } 00511 00524 function get_locale_charset($locale) { 00525 $locale = strtolower($locale); 00526 00527 // exact locale specific charset? 00528 if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale]; 00529 00530 // get modifier 00531 list($locale,$modifier) = explode('@',$locale); 00532 00533 // locale contains charset: use it 00534 list($locale,$charset) = explode('.',$locale); 00535 if ($charset) return $this->parse_charset($charset); 00536 00537 // modifier is 'euro' (after charset check, because of xx.utf-8@euro) 00538 if ($modifier == 'euro') return 'iso-8859-15'; 00539 00540 // get language 00541 list($language,$country) = explode('_',$locale); 00542 if (isset($this->lang_to_langfamily[$language])) $language = $this->lang_to_langfamily[$language]; 00543 00544 if (TYPO3_OS == 'WIN') { 00545 $cs = $this->lang_to_charset_windows[$language]; 00546 } else { 00547 $cs = $this->lang_to_charset_unix[$language]; 00548 } 00549 00550 return $cs ? 
$cs : 'iso-8859-1'; 00551 } 00552 00553 00554 00555 00556 00557 00558 00559 00560 00561 /******************************************** 00562 * 00563 * Charset Conversion functions 00564 * 00565 ********************************************/ 00566 00577 function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) { 00578 if ($fromCS==$toCS) return $str; 00579 00580 // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything 00581 if ($toCS=='utf-8' || !$useEntityForNoChar) { 00582 switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) { 00583 case 'mbstring': 00584 $conv_str = mb_convert_encoding($str,$toCS,$fromCS); 00585 if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets 00586 break; 00587 00588 case 'iconv': 00589 $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str); 00590 if (false !== $conv_str) return $conv_str; 00591 break; 00592 00593 case 'recode': 00594 $conv_str = recode_string($fromCS.'..'.$toCS,$str); 00595 if (false !== $conv_str) return $conv_str; 00596 break; 00597 } 00598 // fallback to TYPO3 conversion 00599 } 00600 00601 if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS); 00602 if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar); 00603 return $str; 00604 } 00605 00617 function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) { 00618 foreach($array as $key => $value) { 00619 if (is_array($array[$key])) { 00620 $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar); 00621 } else { 00622 $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar); 00623 } 00624 } 00625 } 00626 00634 function utf8_encode($str,$charset) { 00635 00636 if ($charset === 'utf-8') return $str; 00637 00638 // Charset is case-insensitive. 00639 if ($this->initCharset($charset)) { // Parse conv. table if not already... 00640 $strLen = strlen($str); 00641 $outStr=''; 00642 00643 for ($a=0;$a<$strLen;$a++) { // Traverse each char in string. 
00644 $chr=substr($str,$a,1); 00645 $ord=ord($chr); 00646 if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char 00647 $ord2 = ord(substr($str,$a+1,1)); // second byte of the pair (0 if the string ends early) 00648 $ord = ($ord<<8) | $ord2; // assume big endian: high byte shifted up, OR'ed with the low byte ("&" would always give 0) 00649 00650 if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise the replacement byte $this->noCharByteVal 00651 $outStr.=$this->parsedCharsets[$charset]['local'][$ord]; 00652 } else $outStr.=chr($this->noCharByteVal); // No char exists 00653 $a++; 00654 } elseif ($ord>127) { // If char has value over 127 it's a multibyte char in UTF-8 00655 if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int. 00656 if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte 00657 $a++; 00658 $ord2=ord(substr($str,$a,1)); 00659 $ord = $ord*256+$ord2; 00660 } 00661 } 00662 00663 if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise the replacement byte $this->noCharByteVal 00664 $outStr.= $this->parsedCharsets[$charset]['local'][$ord]; 00665 } else $outStr.= chr($this->noCharByteVal); // No char exists 00666 } else $outStr.= $chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent 00667 } 00668 return $outStr; 00669 } 00670 } 00671 00680 function utf8_decode($str,$charset,$useEntityForNoChar=0) { 00681 00682 // Charset is case-insensitive. 00683 if ($this->initCharset($charset)) { // Parse conv. table if not already... 00684 $strLen = strlen($str); 00685 $outStr=''; 00686 $buf=''; 00687 for ($a=0,$i=0;$a<$strLen;$a++,$i++) { // Traverse each char in UTF-8 string. 00688 $chr=substr($str,$a,1); 00689 $ord=ord($chr); 00690 if ($ord>127) { // This means multibyte! (first byte!) 00691 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. 
Otherwise we might be in the middle of a byte sequence. 00692 00693 $buf=$chr; // Add first byte 00694 for ($b=0;$b<8;$b++) { // for each byte in multibyte string... 00695 $ord = $ord << 1; // Shift it left and ... 00696 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 00697 $a++; // Increase pointer... 00698 $buf.=substr($str,$a,1); // ... and add the next char. 00699 } else break; 00700 } 00701 00702 if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then... 00703 $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number 00704 if ($mByte>255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars. 00705 $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255); 00706 } else $outStr.= chr($mByte); 00707 } elseif ($useEntityForNoChar) { // Create num entity: 00708 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';'; 00709 } else $outStr.=chr($this->noCharByteVal); // No char exists 00710 } else $outStr.=chr($this->noCharByteVal); // No char exists (MIDDLE of MB sequence!) 00711 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent 00712 } 00713 return $outStr; 00714 } 00715 } 00716 00723 function utf8_to_entities($str) { 00724 $strLen = strlen($str); 00725 $outStr=''; 00726 $buf=''; 00727 for ($a=0;$a<$strLen;$a++) { // Traverse each char in UTF-8 string. 00728 $chr=substr($str,$a,1); 00729 $ord=ord($chr); 00730 if ($ord>127) { // This means multibyte! (first byte!) 00731 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence. 00732 $buf=$chr; // Add first byte 00733 for ($b=0;$b<8;$b++) { // for each byte in multibyte string... 00734 $ord = $ord << 1; // Shift it left and ... 00735 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 
00736 $a++; // Increase pointer... 00737 $buf.=substr($str,$a,1); // ... and add the next char. 00738 } else break; 00739 } 00740 00741 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';'; 00742 } else $outStr.=chr($this->noCharByteVal); // No char exists (MIDDLE of MB sequence!) 00743 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent 00744 } 00745 00746 return $outStr; 00747 } 00748 00756 function entities_to_utf8($str,$alsoStdHtmlEnt=0) { 00757 if ($alsoStdHtmlEnt) { 00758 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES)); // Getting them in iso-8859-1 - but thats ok since this is observed below. 00759 } 00760 00761 $token = md5(microtime()); 00762 $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str)); 00763 foreach($parts as $k => $v) { 00764 if ($k%2) { 00765 if (substr($v,0,1)=='#') { // Dec or hex entities: 00766 if (substr($v,1,1)=='x') { 00767 $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2))); 00768 } else { 00769 $parts[$k] = $this->UnumberToChar(substr($v,1)); 00770 } 00771 } elseif ($alsoStdHtmlEnt && $trans_tbl['&'.$v.';']) { // Other entities: 00772 $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1'); 00773 } else { // No conversion: 00774 $parts[$k] ='&'.$v.';'; 00775 } 00776 } 00777 } 00778 00779 return implode('',$parts); 00780 } 00781 00790 function utf8_to_numberarray($str,$convEntities=0,$retChar=0) { 00791 // If entities must be registered as well...: 00792 if ($convEntities) { 00793 $str = $this->entities_to_utf8($str,1); 00794 } 00795 // Do conversion: 00796 $strLen = strlen($str); 00797 $outArr=array(); 00798 $buf=''; 00799 for ($a=0;$a<$strLen;$a++) { // Traverse each char in UTF-8 string. 00800 $chr=substr($str,$a,1); 00801 $ord=ord($chr); 00802 if ($ord>127) { // This means multibyte! (first byte!) 00803 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. 
Otherwise we might be in the middle of a byte sequence. 00804 $buf=$chr; // Add first byte 00805 for ($b=0;$b<8;$b++) { // for each byte in multibyte string... 00806 $ord = $ord << 1; // Shift it left and ... 00807 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 00808 $a++; // Increase pointer... 00809 $buf.=substr($str,$a,1); // ... and add the next char. 00810 } else break; 00811 } 00812 00813 $outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf); 00814 } else $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal; // No char exists (MIDDLE of MB sequence!) 00815 } else $outArr[]=$retChar?chr($ord):$ord; // ... otherwise it's just ASCII 0-127 and one byte. Transparent 00816 } 00817 00818 return $outArr; 00819 } 00820 00840 function UnumberToChar($cbyte) { 00841 $str=''; 00842 00843 if ($cbyte < 0x80) { 00844 $str.=chr($cbyte); 00845 } else if ($cbyte < 0x800) { 00846 $str.=chr(0xC0 | ($cbyte >> 6)); 00847 $str.=chr(0x80 | ($cbyte & 0x3F)); 00848 } else if ($cbyte < 0x10000) { 00849 $str.=chr(0xE0 | ($cbyte >> 12)); 00850 $str.=chr(0x80 | (($cbyte >> 6) & 0x3F)); 00851 $str.=chr(0x80 | ($cbyte & 0x3F)); 00852 } else if ($cbyte < 0x200000) { 00853 $str.=chr(0xF0 | ($cbyte >> 18)); 00854 $str.=chr(0x80 | (($cbyte >> 12) & 0x3F)); 00855 $str.=chr(0x80 | (($cbyte >> 6) & 0x3F)); 00856 $str.=chr(0x80 | ($cbyte & 0x3F)); 00857 } else if ($cbyte < 0x4000000) { 00858 $str.=chr(0xF8 | ($cbyte >> 24)); 00859 $str.=chr(0x80 | (($cbyte >> 18) & 0x3F)); 00860 $str.=chr(0x80 | (($cbyte >> 12) & 0x3F)); 00861 $str.=chr(0x80 | (($cbyte >> 6) & 0x3F)); 00862 $str.=chr(0x80 | ($cbyte & 0x3F)); 00863 } else if ($cbyte < 0x80000000) { 00864 $str.=chr(0xFC | ($cbyte >> 30)); 00865 $str.=chr(0x80 | (($cbyte >> 24) & 0x3F)); 00866 $str.=chr(0x80 | (($cbyte >> 18) & 0x3F)); 00867 $str.=chr(0x80 | (($cbyte >> 12) & 0x3F)); 00868 $str.=chr(0x80 | (($cbyte >> 6) & 0x3F)); 00869 $str.=chr(0x80 | ($cbyte & 0x3F)); 00870 } else { 
// Cannot express a 32-bit character in UTF-8 00871 $str .= chr($this->noCharByteVal); 00872 } 00873 return $str; 00874 } 00875 00885 function utf8CharToUnumber($str,$hex=0) { 00886 $ord=ord(substr($str,0,1)); // First char 00887 00888 if (($ord & 192) == 192) { // This verifyes that it IS a multi byte string 00889 $binBuf=''; 00890 for ($b=0;$b<8;$b++) { // for each byte in multibyte string... 00891 $ord = $ord << 1; // Shift it left and ... 00892 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 00893 $binBuf.=substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6); 00894 } else break; 00895 } 00896 $binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf; 00897 00898 $int = bindec($binBuf); 00899 } else $int = $ord; 00900 00901 return $hex ? 'x'.dechex($int) : $int; 00902 } 00903 00904 00905 00906 00907 00908 00909 00910 00911 00912 /******************************************** 00913 * 00914 * Init functions 00915 * 00916 ********************************************/ 00917 00928 function initCharset($charset) { 00929 // Only process if the charset is not yet loaded: 00930 if (!is_array($this->parsedCharsets[$charset])) { 00931 00932 // Conversion table filename: 00933 $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl'; 00934 00935 // If the conversion table is found: 00936 if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) { 00937 // Cache file for charsets: 00938 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero. 
00939 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl'); 00940 if ($cacheFile && @is_file($cacheFile)) { 00941 $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile)); 00942 } else { 00943 // Parse conversion table into lines: 00944 $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1); 00945 // Initialize the internal variable holding the conv. table: 00946 $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array()); 00947 // traverse the lines: 00948 $detectedType=''; 00949 foreach($lines as $value) { 00950 if (trim($value) && substr($value,0,1)!='#') { // Comment line or blanks are ignored. 00951 00952 // Detect type if not done yet: (Done on first real line) 00953 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE" 00954 if (!$detectedType) $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 
'whitespaced' : 'ms-token'; 00955 00956 if ($detectedType=='ms-token') { 00957 list($hexbyte,$utf8) = split('=|:',$value,3); 00958 } elseif ($detectedType=='whitespaced') { 00959 $regA=array(); 00960 ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA); 00961 $hexbyte = $regA[1]; 00962 $utf8 = 'U+'.$regA[2]; 00963 } 00964 $decval = hexdec(trim($hexbyte)); 00965 if ($decval>127) { 00966 $utf8decval = hexdec(substr(trim($utf8),2)); 00967 $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval); 00968 $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval; 00969 } 00970 } 00971 } 00972 if ($cacheFile) { 00973 t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset])); 00974 } 00975 } 00976 return 2; 00977 } else return false; 00978 } else return 1; 00979 } 00980 00990 function initUnicodeData($mode=null) { 00991 // cache files 00992 $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl'); 00993 $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl'); 00994 00995 // Only process if the tables are not yet loaded 00996 switch($mode) { 00997 case 'case': 00998 if (is_array($this->caseFolding['utf-8'])) return 1; 00999 01000 // Use cached version if possible 01001 if ($cacheFileCase && @is_file($cacheFileCase)) { 01002 $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase)); 01003 return 2; 01004 } 01005 break; 01006 01007 case 'ascii': 01008 if (is_array($this->toASCII['utf-8'])) return 1; 01009 01010 // Use cached version if possible 01011 if ($cacheFileASCII && @is_file($cacheFileASCII)) { 01012 $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII)); 01013 return 2; 01014 } 01015 break; 01016 } 01017 01018 // process main Unicode data file 01019 $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt'; 01020 if 
(!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false; 01021 01022 $fh = fopen($unicodeDataFile,'rb'); 01023 if (!$fh) return false; 01024 01025 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence) 01026 // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper) 01027 $this->caseFolding['utf-8'] = array(); 01028 $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand 01029 $utf8CaseFolding['toUpper'] = array(); 01030 $utf8CaseFolding['toLower'] = array(); 01031 $utf8CaseFolding['toTitle'] = array(); 01032 01033 $decomposition = array(); // array of temp. decompositions 01034 $mark = array(); // array of chars that are marks (eg. composing accents) 01035 $number = array(); // array of chars that are numbers (eg. digits) 01036 $omit = array(); // array of chars to be omitted (eg. Russian hard sign) 01037 01038 while (!feof($fh)) { 01039 $line = fgets($fh,4096); 01040 // has a lot of info 01041 list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line)); 01042 01043 $ord = hexdec($char); 01044 if ($ord > 0xFFFF) break; // only process the BMP 01045 01046 $utf8_char = $this->UnumberToChar($ord); 01047 01048 if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper)); 01049 if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower)); 01050 // store "title" only when different from "upper" (only a few) 01051 if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title)); 01052 01053 switch ($cat{0}) { 01054 case 'M': // mark (accent, umlaut, ...) 
01055 $mark["U+$char"] = 1; 01056 break; 01057 01058 case 'N': // numeric value 01059 if ($ord > 0x80 && $num != '') $number["U+$char"] = $num; 01060 } 01061 01062 // accented Latin letters without "official" decomposition 01063 $match = array(); 01064 if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp) { 01065 $c = ord($match[2]); 01066 if ($match[1] == 'SMALL') $c += 32; 01067 01068 $decomposition["U+$char"] = array(dechex($c)); 01069 continue; 01070 } 01071 01072 $match = array(); 01073 if (ereg('(<.*>)? *(.+)',$decomp,$match)) { 01074 switch($match[1]) { 01075 case '<circle>': // add parenthesis as circle replacement, eg (1) 01076 $match[2] = '0028 '.$match[2].' 0029'; 01077 break; 01078 01079 case '<square>': // add square brackets as square replacement, eg [1] 01080 $match[2] = '005B '.$match[2].' 005D'; 01081 break; 01082 01083 case '<compat>': // ignore multi char decompositions that start with a space 01084 if (ereg('^0020 ',$match[2])) continue 2; 01085 break; 01086 01087 // ignore Arabic and vertical layout presentation decomposition 01088 case '<initial>': 01089 case '<medial>': 01090 case '<final>': 01091 case '<isolated>': 01092 case '<vertical>': 01093 continue 2; 01094 } 01095 $decomposition["U+$char"] = split(' ',$match[2]); 01096 } 01097 } 01098 fclose($fh); 01099 01100 // process additional Unicode data for casing (allow folded characters to expand into a sequence) 01101 $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt'; 01102 if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) { 01103 $fh = fopen($specialCasingFile,'rb'); 01104 if ($fh) { 01105 while (!feof($fh)) { 01106 $line = fgets($fh,4096); 01107 if ($line{0} != '#' && trim($line) != '') { 01108 01109 list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line); 01110 if ($cond == '' || $cond{0} == '#') { 01111 $utf8_char = $this->UnumberToChar(hexdec($char)); 01112 if ($char != $lower) { 01113 $arr = split(' 
',$lower);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
						}
						if ($char != $title && $title != $upper) {
							$arr = split(' ',$title);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
						}
						if ($char != $upper) {
							$arr = split(' ',$upper);
							for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
							$utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
						}
					}
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile,'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh,4096);
				if ($line{0} != '#' && trim($line) != '') {
					list($char,$translit) = t3lib_div::trimExplode(';', $line);
					if (!$translit) $omit["U+$char"] = 1;
					$decomposition["U+$char"] = split(' ', $translit);

				}
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127)
				continue 2; // skip decompositions containing non-ASCII chars
			else
				array_push($code_decomp,chr($ord));
		}
		$ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
	}

	// add numeric decompositions
	foreach($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec($from));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}

/**
 * Loads the case folding table of a charset into $this->caseFolding[$charset].
 * The table is taken from the cache file if one exists; otherwise it is derived from the
 * UTF-8 case folding data by mapping every character of the charset's conversion table
 * through the UTF-8 'toUpper', 'toLower' and 'toTitle' tables and converting the result back.
 *
 * @param	string		Charset name; used as key in $this->caseFolding and $this->parsedCharsets
 * @return	integer		1 if the table was already loaded, 2 if it was read from the cache file, 3 if it was built; false if the charset or the Unicode data could not be initialized
 */
function initCaseFolding($charset) {
	// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) return 1;

	// Use cached version if possible; an unreadable/corrupt cache file is ignored and the table is rebuilt
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case')) {
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$modes = array('toUpper','toLower','toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($modes as $mode) {
			// characters without a folding in this mode are simply left out of the table
			if (!isset($this->caseFolding['utf-8'][$mode][$utf8])) continue;

			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$mode][$utf8], $charset);
			if ($cc != '' && $cc != $nochar) $this->caseFolding[$charset][$mode][$c] = $cc;
		}
	}

	// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++) {
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++) {
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}

/**
 * Loads the to-ASCII transliteration table of a charset into $this->toASCII[$charset].
 * The table is taken from the cache file if one exists; otherwise every character of the
 * charset's conversion table that has an entry in the UTF-8 to-ASCII table is copied over.
 *
 * @param	string		Charset name; used as key in $this->toASCII and $this->parsedCharsets
 * @return	integer		1 if the table was already loaded, 2 if it was read from the cache file, 3 if it was built; false if the charset or the Unicode data could not be initialized
 */
function initToASCII($charset) {
	// Only process if the transliteration table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) return 1;

	// Use cached version if possible; an unreadable/corrupt cache file is ignored and the table is rebuilt
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->toASCII[$charset] = $cached;
			return 2;
		}
	}

	// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset)) {
		return false;
	}

	// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return false;
	}

	// start with an empty table so that a charset without any mapping is still marked as loaded
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		if (isset($this->toASCII['utf-8'][$utf8])) {
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}


/********************************************
 *
 * String operation functions
 *
 ********************************************/

/**
 * Returns a part of a string, counted in characters of the given charset.
 * Dispatches to mbstring (if configured in TYPO3_CONF_VARS[SYS][t3lib_cs_utils]), the UTF-8
 * or EUC implementation of this class, or plain byte arithmetic for fixed-width charsets.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); null means "until the end of the string"
 * @return	string		The substring
 */
function substr($charset,$string,$start,$len=null) {
	if ($len===0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// older mb_substr() cannot omit $len when a charset is given, so switch the internal encoding instead
		if ($len==null) {
			$enc = mb_internal_encoding(); // save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc); // restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_substr($string,$start,$len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string,$start,$charset,$len);
	}

	// fixed-width charsets: scale character positions to byte positions (everything else is single-byte)
	$width = 1;
	if (!empty($this->twoByteSets[$charset])) {
		$width = 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		$width = 4;
	}

	// an omitted length must not be multiplied (null*2 would be 0 and yield an empty string)
	return $len === null ? substr($string,$start*$width) : substr($string,$start*$width,$len*$width);
}

/**
 * Counts the number of characters of a string in the given charset.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	integer		The number of characters
 */
function strlen($charset,$string) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strlen($string,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strlen($string,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		return strlen($string)/2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		return strlen($string)/4;
	}
	// treat everything else as single-byte encoding
	return strlen($string);
}

/**
 * Crops a string to a number of characters and marks the cut with $crop.
 * A positive $len keeps the first $len characters and appends $crop; a negative $len keeps the
 * last abs($len) characters and prepends $crop. Strings that are not longer than abs($len)
 * characters are returned unchanged.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in characters); 0 returns the string unchanged
 * @param	string		Crop signifier
 * @return	string		The cropped string
 */
function crop($charset,$string,$len,$crop='') {
	if (intval($len) == 0) return $string;

	// find the byte position of the cut
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string)+$len;
		if ($i<=0) $i = false;
	}

	if ($i === false) { // $len outside actual string length
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0) {
		// only crop if there is at least one byte behind the cut
		if ($i < $byteLength) {
			return substr($string,0,$i).$crop;
		}
	} elseif ($i >= 1 && $i <= $byteLength) {
		// only crop if there is at least one byte in front of the cut
		return $crop.substr($string,$i);
	}

	return $string;
}

/**
 * Truncates a string to at most $len bytes without splitting a multi-byte character.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	integer		Length (in bytes)
 * @return	string		The shortened string
 */
function strtrunc($charset,$string,$len) {
	if ($len <= 0) return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset])) {
		if ($len % 2) $len--; // don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset])) {
		$len -= $len % 4; // realign to position dividable by four
	}
	// treat everything else as single-byte encoding
	return substr($string,0,$len);
}

/**
 * Converts the case of a string.
 * With mbstring enabled, every $case other than 'toLower' is converted to upper case.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @param	string		Case keyword: 'toLower' or 'toUpper'
 * @return	string		The converted string
 */
function conv_case($charset,$string,$case) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && version_compare(phpversion(),'4.3.0','>=')) {
		if ($case == 'toLower') {
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'case',$case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}

/**
 * Replaces characters by their ASCII transliteration where one is known.
 *
 * @param	string		The character set
 * @param	string		Character string
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string) {
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string,'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}


/********************************************
 *
 * Internal string operation functions
 *
 ********************************************/

/**
 * Maps every byte of a single-byte encoded string through a conversion table.
 * Bytes without an entry in the table are copied unchanged.
 *
 * @param	string		Character string
 * @param	string		The character set
 * @param	string		Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		For mode 'case': the table to use ('toLower', 'toUpper', 'toTitle')
 * @return	string		The converted string; the input string if the mode is unknown or the table could not be loaded
 */
function sb_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str; // do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str; // do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$out = '';
	$length = strlen($str);
	for ($i=0; $i<$length; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}


/********************************************
 *
 * Internal UTF-8 string operation functions
 *
 ********************************************/

/**
 * Returns a part of a UTF-8 encoded string, counted in characters.
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); null means "until the end of the string"
 * @return	string		The substring; false if a positive $start lies outside the string
 */
function utf8_substr($str,$start,$len=null) {
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false) {
		if ($start > 0) {
			return false; // $start outside string length
		}
		$byte_start = 0; // negative $start reaching before the string: begin at the first byte
	}

	$str = (string)substr($str,$byte_start);

	if ($len == null) return $str;

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false) { // $len outside actual string length
		return $len<0 ? '' : $str; // When length is less than zero and exceeds, then we return blank string.
	}
	return substr($str,0,$byte_end);
}

/**
 * Counts the characters of a UTF-8 encoded string.
 * Every byte that is not a continuation byte (10xxxxxx) starts a character.
 *
 * @param	string		UTF-8 string
 * @return	integer		The number of characters
 */
function utf8_strlen($str) {
	$n = 0;
	$length = strlen($str);
	for ($i=0; $i<$length; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++; // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
	}
	return $n;
}

/**
 * Truncates a UTF-8 encoded string to at most $len bytes without splitting a character.
 *
 * @param	string		UTF-8 string
 * @param	integer		Length (in bytes)
 * @return	string		The shortened string
 */
function utf8_strtrunc($str,$len) {
	if ($len <= 0) return '';
	if ($len >= strlen($str)) return $str; // nothing to cut

	$i = $len-1;
	if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) $i--; // find the first byte

		$lead = ord($str[$i]);
		if (($lead & 0xC0) == 0xC0) { // otherwise the input is not valid UTF-8: cut byte-wise
			for ($bc=0; $lead & 0x80; $lead = $lead << 1) $bc++; // calculate number of bytes
			if ($i+$bc > $len) return substr($str,0,$i); // char does not fit: cut in front of it
			// fallthru: multibyte char fits into length
		}
	}
	return substr($str,0,$len);
}

/**
 * Finds the position of the first occurrence of a string in a UTF-8 encoded string.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Position (in characters) to start the search at
 * @return	integer		The character position; false if the needle was not found or the offset lies outside the string
 */
function utf8_strpos($haystack,$needle,$offset=0) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false) return false; // offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false) return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}

/**
 * Finds the position of the last occurrence of a string in a UTF-8 encoded string.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @return	integer		The character position; false if the needle was not found
 */
function utf8_strrpos($haystack,$needle) {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// the third argument of mb_strrpos() is the offset, the encoding comes fourth
		return mb_strrpos($haystack,$needle,0,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false) return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}

/**
 * Translates a character position into a byte position in a UTF-8 encoded string.
 * A negative position is counted in characters from the end of the string.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values start from the end)
 * @return	integer		Byte position; false if the scan runs off either end of the string
 */
function utf8_char2byte_pos($str,$pos) {
	$length = strlen($str);
	$n = 0; // number of characters found
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $length-1;
		$d = -1;
	}

	// explicit bounds: a negative string offset would otherwise wrap around to the end of the string
	for ( ; $i >= 0 && $i < $length && $n < $p; $i += $d) {
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++; // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
	}
	if ($i < 0 || $i >= $length) return false; // offset beyond string length

	if ($pos >= 0) {
		// skip trailing multi-byte data bytes
		while ($i < $length && (ord($str[$i]) & 0xC0) == 0x80) { $i++; }
	} else {
		// correct offset
		$i++;
	}

	return $i;
}

/**
 * Translates a byte position into a character position in a UTF-8 encoded string.
 * The byte position is expected to point at the first byte of a character; the position
 * directly behind the last byte yields the number of characters in the string.
 *
 * @param	string		UTF-8 string
 * @param	integer		Byte position
 * @return	integer		Character position; false if the string is empty or the byte position lies outside the string
 */
function utf8_byte2char_pos($str,$pos) {
	$length = strlen($str);
	if ($length == 0 || $pos < 0 || $pos > $length) return false; // offset beyond string length

	$n = 0; // number of characters
	if ($pos == $length) {
		// position behind the last byte: the last character is complete and counts as well
		$n = 1;
		$pos--;
	}

	// count the character starts between byte 1 and $pos; byte 0 is not counted, which makes the result zero-based
	for ($i=$pos; $i>0; $i--) {
		if ((ord($str[$i]) & 0xC0) != 0x80) $n++; // single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
	}

	return $n;
}

/**
 * Maps every character of a UTF-8 encoded string through a conversion table.
 * Characters without an entry in the table are copied unchanged.
 *
 * @param	string		UTF-8 string
 * @param	string		Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		For mode 'case': the table to use ('toLower', 'toUpper', 'toTitle')
 * @return	string		The converted string; the input string if the mode is unknown or the Unicode data could not be loaded
 */
function utf8_char_mapping($str,$mode,$opt='') {
	if (!$this->initUnicodeData($mode)) return $str; // do nothing

	switch($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$length = strlen($str);
	for ($i=0; $i<$length; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; } // calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
			// single-byte (0xxxxxxx); a stray continuation byte in malformed input is passed through as well
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}

/********************************************
 *
 * Internal EUC string operation functions
 *
 * Extended Unix Code:
 *  ASCII compatible 7bit single bytes chars
 *  8bit two byte chars
 *
 * Shift-JIS is treated as a special case.
 *
 ********************************************/

/**
 * Truncates an EUC (or Shift-JIS) encoded string to at most $len bytes without splitting
 * a double-byte character.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		Length (in bytes)
 * @param	string		The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return	string		The shortened string
 */
function euc_strtrunc($str,$len,$charset) {
	if ($len <= 0) return '';

	$length = strlen($str);
	if ($length <= $len) return $str; // string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
	// $length > $len here, so every $i < $len is a valid offset
	for ($i=0; $i<$len; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++; // advance a double-byte char
		} else {
			if ($c >= 0x80) $i++; // advance a double-byte char
		}
	}

	if ($i > $len) {
		return substr($str,0,$len-1); // we ended on a first byte
	}
	return substr($str,0,$len);
}

/**
 * Returns a part of an EUC (or Shift-JIS) encoded string, counted in characters.
 *
 * @param	string		EUC multibyte character string
 * @param	integer		Start position (character position)
 * @param	string		The charset
 * @param	integer		Length (in characters); null means "until the end of the string"
 * @return	string		The substring; false if a positive $start lies outside the string
 */
function euc_substr($str,$start,$charset,$len=null) {
	if ($len !== null && !strcmp($len,'0')) return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false) {
		if ($start > 0) {
			return false; // $start outside string length
		}
		$byte_start = 0; // negative $start reaching before the string: begin at the first byte
	}

	$str = (string)substr($str,$byte_start);

	if ($len == null) return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false) { // $len outside actual string length
		return $len<0 ? '' : $str; // a negative length that exceeds the string leaves nothing
	}
	return substr($str,0,$byte_end);
}

/**
 * Counts the characters of an EUC (or Shift-JIS) encoded string.
 *
 * @param	string		EUC multibyte character string
 * @param	string		The charset
 * @return	integer		The number of characters
 */
function euc_strlen($str,$charset) {
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	$length = strlen($str);
	for ($i=0; $i<$length; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++; // advance a double-byte char
		} else {
			if ($c >= 0x80) $i++; // advance a double-byte char
		}

		$n++;
	}

	return $n;
}

/**
 * Translates a character position into a byte position in an EUC (or Shift-JIS) encoded string.
 * A negative position is counted in characters from the end of the string. The string is
 * always scanned from the front, because a trail byte cannot be told apart from a single-byte
 * character when reading backwards (Shift-JIS trail bytes may be below 0x80).
 *
 * @param	string		EUC multibyte character string
 * @param	integer		Character position (negative values start from the end)
 * @param	string		The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return	integer		Byte position; false if the position lies outside the string
 */
function euc_char2byte_pos($str,$pos,$charset) {
	$sjis = ($charset == 'shift_jis');
	$length = strlen($str);
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$n = 0; // number of characters seen
		for ($i=0; $i<$length && $n<$p; $i++) {
			$c = ord($str[$i]);
			if ($sjis) {
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++; // advance a double-byte char
			} else {
				if ($c >= 0x80) $i++; // advance a double-byte char
			}

			$n++;
		}
		if ($i >= $length) return false; // offset beyond string length

		return $i;
	}

	// negative position: collect the start offset of every character, then count back from the end
	$starts = array();
	for ($i=0; $i<$length; $i++) {
		$starts[] = $i;
		$c = ord($str[$i]);
		if ($sjis) {
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) $i++; // advance a double-byte char
		} else {
			if ($c >= 0x80) $i++; // advance a double-byte char
		}
	}

	$index = count($starts)-$p;
	if ($index <= 0) return false; // offset reaches the start of the string or beyond

	return $starts[$index];
}

/**
 * Maps every character of an EUC (or Shift-JIS) encoded string through a conversion table.
 * Characters without an entry in the table are copied unchanged.
 *
 * @param	string		EUC multibyte character string
 * @param	string		The charset
 * @param	string		Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		For mode 'case': the table to use ('toLower', 'toUpper', 'toTitle')
 * @return	string		The converted string; the input string if the mode is unknown or the table could not be loaded
 */
function euc_char_mapping($str,$charset,$mode,$opt='') {
	switch($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) return $str; // do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset)) return $str; // do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$length = strlen($str);
	for ($i=0; $i<$length; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis) {
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) { // a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}

}

if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
?>