Documentation TYPO3 par Ameos |
00001 <?php 00002 /*************************************************************** 00003 * Copyright notice 00004 * 00005 * (c) 2003-2004 Kasper Skaarhoj (kasperYYYY@typo3.com) 00006 * All rights reserved 00007 * 00008 * This script is part of the Typo3 project. The Typo3 project is 00009 * free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 2 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * The GNU General Public License can be found at 00015 * http://www.gnu.org/copyleft/gpl.html. 00016 * 00017 * This script is distributed in the hope that it will be useful, 00018 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00019 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00020 * GNU General Public License for more details. 00021 * 00022 * This copyright notice MUST APPEAR in all copies of the script! 00023 ***************************************************************/ 00136 class t3lib_cs { 00137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent. 
00138 00139 // This is the array where parsed conversion tables are stored (cached) 00140 var $parsedCharsets=array(); 00141 00142 // An array where case folding data will be stored (cached) 00143 var $caseFolding=array(); 00144 00145 // An array where charset-to-ASCII mappings are stored (cached) 00146 var $toASCII=array(); 00147 00148 // This tells the converter which charsets has two bytes per char: 00149 var $twoByteSets=array( 00150 'ucs-2'=>1, // 2-byte Unicode 00151 ); 00152 00153 // This tells the converter which charsets has four bytes per char: 00154 var $fourByteSets=array( 00155 'ucs-4'=>1, // 4-byte Unicode 00156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16) 00157 ); 00158 00159 // This tells the converter which charsets use a scheme like the Extended Unix Code: 00160 var $eucBasedSets=array( 00161 'gb2312'=>1, // Chinese, simplified. 00162 'big5'=>1, // Chinese, traditional. 00163 'euc-kr'=>1, // Korean 00164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80! 
00165 ); 00166 00167 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html 00168 // http://czyborra.com/charsets/iso8859.html 00169 var $synonyms=array( 00170 'us' => 'ascii', 00171 'us-ascii'=> 'ascii', 00172 'cp819' => 'iso-8859-1', 00173 'ibm819' => 'iso-8859-1', 00174 'iso-ir-100' => 'iso-8859-1', 00175 'iso-ir-109' => 'iso-8859-2', 00176 'iso-ir-148' => 'iso-8859-9', 00177 'iso-ir-199' => 'iso-8859-14', 00178 'iso-ir-203' => 'iso-8859-15', 00179 'csisolatin1' => 'iso-8859-1', 00180 'csisolatin2' => 'iso-8859-2', 00181 'csisolatin3' => 'iso-8859-3', 00182 'csisolatin5' => 'iso-8859-9', 00183 'csisolatin8' => 'iso-8859-14', 00184 'csisolatin9' => 'iso-8859-15', 00185 'csisolatingreek' => 'iso-8859-7', 00186 'iso-celtic' => 'iso-8859-14', 00187 'latin1' => 'iso-8859-1', 00188 'latin2' => 'iso-8859-2', 00189 'latin3' => 'iso-8859-3', 00190 'latin5' => 'iso-8859-9', 00191 'latin6' => 'iso-8859-10', 00192 'latin8' => 'iso-8859-14', 00193 'latin9' => 'iso-8859-15', 00194 'l1' => 'iso-8859-1', 00195 'l2' => 'iso-8859-2', 00196 'l3' => 'iso-8859-3', 00197 'l5' => 'iso-8859-9', 00198 'l6' => 'iso-8859-10', 00199 'l8' => 'iso-8859-14', 00200 'l9' => 'iso-8859-15', 00201 'cyrillic' => 'iso-8859-5', 00202 'arabic' => 'iso-8859-6', 00203 'tis-620' => 'iso-8859-11', 00204 'win874' => 'windows-874', 00205 'win1250' => 'windows-1250', 00206 'win1251' => 'windows-1251', 00207 'win1252' => 'windows-1252', 00208 'win1253' => 'windows-1253', 00209 'win1254' => 'windows-1254', 00210 'win1255' => 'windows-1255', 00211 'win1256' => 'windows-1256', 00212 'win1257' => 'windows-1257', 00213 'win1258' => 'windows-1258', 00214 'cp1250' => 'windows-1250', 00215 'cp1251' => 'windows-1251', 00216 'cp1252' => 'windows-1252', 00217 'ms-ee' => 'windows-1250', 00218 'ms-ansi' => 'windows-1252', 00219 'ms-greek' => 'windows-1253', 00220 'ms-turk' => 'windows-1254', 00221 'winbaltrim' => 'windows-1257', 00222 'koi-8ru' => 'koi-8r', 
00223 'koi8r' => 'koi-8r', 00224 'cp878' => 'koi-8r', 00225 'mac' => 'macroman', 00226 'macintosh' => 'macroman', 00227 'euc-cn' => 'gb2312', 00228 'x-euc-cn' => 'gb2312', 00229 'euccn' => 'gb2312', 00230 'cp936' => 'gb2312', 00231 'big-5' => 'big5', 00232 'cp950' => 'big5', 00233 'eucjp' => 'euc-jp', 00234 'sjis' => 'shift_jis', 00235 'shift-jis' => 'shift_jis', 00236 'cp932' => 'shift_jis', 00237 'cp949' => 'euc-kr', 00238 'utf7' => 'utf-7', 00239 'utf8' => 'utf-8', 00240 'utf16' => 'utf-16', 00241 'utf32' => 'utf-32', 00242 'utf8' => 'utf-8', 00243 'ucs2' => 'ucs-2', 00244 'ucs4' => 'ucs-4', 00245 ); 00246 00247 // mapping of iso-639:2 language codes to language (family) names 00248 var $lang_to_langfamily=array( 00249 // iso-639:2 language codes, see: 00250 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm 00251 // http://www.unicode.org/onlinedat/languages.html 00252 'ar' => 'arabic', 00253 'bg' => 'cyrillic', 00254 'cs' => 'east_european', 00255 'da' => 'west_european', 00256 'de' => 'west_european', 00257 'es' => 'west_european', 00258 'et' => 'estonian', 00259 'eu' => 'west_european', 00260 'fi' => 'west_european', 00261 'fr' => 'west_european', 00262 'gr' => 'greek', 00263 'hr' => 'east_european', 00264 'hu' => 'east_european', 00265 'iw' => 'hebrew', 00266 'is' => 'west_european', 00267 'it' => 'west_european', 00268 'ja' => 'japanese', 00269 'kl' => 'west_european', 00270 'ko' => 'korean', 00271 'lt' => 'lithuanian', 00272 'lv' => 'west_european', // Latvian/Lettish 00273 'nl' => 'west_european', 00274 'no' => 'west_european', 00275 'pl' => 'east_european', 00276 'pt' => 'west_european', 00277 'ro' => 'east_european', 00278 'ru' => 'cyrillic', 00279 'sk' => 'east_european', 00280 'sl' => 'east_european', 00281 'sv' => 'west_european', 00282 'th' => 'thai', 00283 'uk' => 'cyrillic', 00284 'vi' => 'vietnamese', 00285 'zh' => 'chinese', 00286 // MS language codes, see 
http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp 00287 'chs' => 'simpl_chinese', 00288 'cht' => 'trad_chinese', 00289 'csy' => 'east_european', 00290 'dan' => 'west_european', 00291 'deu' => 'west_european', 00292 'dea' => 'west_european', 00293 'des' => 'west_european', 00294 'ena' => 'west_european', 00295 'enc' => 'west_european', 00296 'eng' => 'west_european', 00297 'enz' => 'west_european', 00298 'enu' => 'west_european', 00299 'nld' => 'west_european', 00300 'nlb' => 'west_european', 00301 'fin' => 'west_european', 00302 'fra' => 'west_european', 00303 'frb' => 'west_european', 00304 'frc' => 'west_european', 00305 'frs' => 'west_european', 00306 'ell' => 'greek', 00307 'hun' => 'east_european', 00308 'isl' => 'west_euorpean', 00309 'ita' => 'west_european', 00310 'its' => 'west_european', 00311 'jpn' => 'japanese', 00312 'kor' => 'korean', 00313 'nor' => 'west_european', 00314 'non' => 'west_european', 00315 'plk' => 'east_european', 00316 'ptg' => 'west_european', 00317 'ptb' => 'west_european', 00318 'rus' => 'east_european', 00319 'sky' => 'east_european', 00320 'esp' => 'west_european', 00321 'esm' => 'west_european', 00322 'esn' => 'west_european', 00323 'sve' => 'west_european', 00324 'trk' => 'turkish', 00325 // English language names 00326 'bulgarian' => 'east_european', 00327 'catalan' => 'west_european', 00328 'croatian' => 'east_european', 00329 'czech' => 'east_european', 00330 'danish' => 'west_european', 00331 'dutch' => 'west_european', 00332 'english' => 'west_european', 00333 'finnish' => 'west_european', 00334 'french' => 'west_european', 00335 'galician' => 'west_european', 00336 'german' => 'west_european', 00337 'hungarian' => 'east_european', 00338 'icelandic' => 'west_european', 00339 'italian' => 'west_european', 00340 'latvian' => 'west_european', 00341 'lettish' => 'west_european', 00342 'norwegian' => 'west_european', 00343 'polish' => 'east_european', 00344 'portuguese' => 
'west_european', 00345 'russian' => 'cyrillic', 00346 'romanian' => 'east_european', 00347 'slovak' => 'east_european', 00348 'slovenian' => 'east_european', 00349 'spanish' => 'west_european', 00350 'svedish' => 'west_european', 00351 'turkish' => 'east_european', 00352 'ukrainian' => 'cyrillic', 00353 ); 00354 00355 // mapping of language (family) names to charsets on Unix 00356 var $lang_to_charset_unix=array( 00357 'west_european' => 'iso-8859-1', 00358 'estonian' => 'iso-8859-1', 00359 'east_european' => 'iso-8859-2', 00360 'baltic' => 'iso-8859-4', 00361 'cyrillic' => 'iso-8859-5', 00362 'arabic' => 'iso-8859-6', 00363 'greek' => 'iso-8859-7', 00364 'hebrew' => 'iso-8859-8', 00365 'turkish' => 'iso-8859-9', 00366 'thai' => 'iso-8859-11', // = TIS-620 00367 'lithuanian' => 'iso-8859-13', 00368 'chinese' => 'gb2312', // = euc-cn 00369 'japanese' => 'euc-jp', 00370 'korean' => 'euc-kr', 00371 'simpl_chinese' => 'gb2312', 00372 'trad_chinese' => 'big5', 00373 'vietnamese' => '', 00374 ); 00375 00376 // mapping of language (family) names to charsets on Windows 00377 var $lang_to_charset_windows=array( 00378 'east_european' => 'windows-1250', 00379 'cyrillic' => 'windows-1251', 00380 'west_european' => 'windows-1252', 00381 'greek' => 'windows-1253', 00382 'turkish' => 'windows-1254', 00383 'hebrew' => 'windows-1255', 00384 'arabic' => 'windows-1256', 00385 'baltic' => 'windows-1257', 00386 'estonian' => 'windows-1257', 00387 'lithuanian' => 'windows-1257', 00388 'vietnamese' => 'windows-1258', 00389 'thai' => 'cp874', 00390 'korean' => 'cp949', 00391 'chinese' => 'gb2312', 00392 'japanese' => 'shift_jis', 00393 'simpl_chinese' => 'gb2312', 00394 'trad_chinese' => 'big5', 00395 ); 00396 00397 // mapping of locale names to charsets 00398 var $locale_to_charset=array( 00399 'japanese.euc' => 'euc-jp', 00400 'ja_jp.ujis' => 'euc-jp', 00401 'korean.euc' => 'euc-kr', 00402 'zh_cn' => 'gb2312', 00403 'zh_hk' => 'big5', 00404 'zh_tw' => 'big5', 00405 ); 00406 00407 // 
TYPO3 specific: Array with the system charsets used for each system language in TYPO3: 00408 // Empty values means "iso-8859-1" 00409 var $charSetArray = array( 00410 'dk' => '', 00411 'de' => '', 00412 'no' => '', 00413 'it' => '', 00414 'fr' => '', 00415 'es' => '', 00416 'nl' => '', 00417 'cz' => 'windows-1250', 00418 'pl' => 'iso-8859-2', 00419 'si' => 'windows-1250', 00420 'fi' => '', 00421 'tr' => 'iso-8859-9', 00422 'se' => '', 00423 'pt' => '', 00424 'ru' => 'windows-1251', 00425 'ro' => 'iso-8859-2', 00426 'ch' => 'gb2312', 00427 'sk' => 'windows-1250', 00428 'lt' => 'windows-1257', 00429 'is' => 'utf-8', 00430 'hr' => 'windows-1250', 00431 'hu' => 'iso-8859-2', 00432 'gl' => '', 00433 'th' => 'iso-8859-11', 00434 'gr' => 'iso-8859-7', 00435 'hk' => 'big5', 00436 'eu' => '', 00437 'bg' => 'windows-1251', 00438 'br' => '', 00439 'et' => 'iso-8859-4', 00440 'ar' => 'iso-8859-6', 00441 'he' => 'utf-8', 00442 'ua' => 'windows-1251', 00443 'jp' => 'shift_jis', 00444 'lv' => 'utf-8', 00445 'vn' => 'utf-8', 00446 'ca' => 'iso-8859-15', 00447 'ba' => 'iso-8859-2', 00448 'kr' => 'euc-kr', 00449 ); 00450 00451 // TYPO3 specific: Array with the iso names used for each system language in TYPO3: 00452 // Empty values means sames as Typo3 00453 var $isoArray = array( 00454 'dk' => 'da', 00455 'de' => '', 00456 'no' => '', 00457 'it' => '', 00458 'fr' => '', 00459 'es' => '', 00460 'nl' => '', 00461 'cz' => 'cs', 00462 'pl' => '', 00463 'si' => 'sl', 00464 'fi' => '', 00465 'tr' => '', 00466 'se' => 'sv', 00467 'pt' => '', 00468 'ru' => '', 00469 'ro' => '', 00470 'ch' => 'zh_CN', 00471 'sk' => '', 00472 'lt' => '', 00473 'is' => '', 00474 'hr' => '', 00475 'hu' => '', 00476 'gl' => '', // Greenlandic 00477 'th' => '', 00478 'gr' => 'el', 00479 'hk' => 'zh_HK', 00480 'eu' => '', 00481 'bg' => '', 00482 'br' => 'pt_BR', 00483 'et' => '', 00484 'ar' => '', 00485 'he' => 'iw', 00486 'ua' => 'uk', 00487 'jp' => 'ja', 00488 'lv' => '', 00489 'vn' => 'vi', 00490 'ca' => '', 
'ba' => '', // Bosnian
		'kr' => '',
	);

	/**
	 * Normalizes a charset name.
	 *
	 * The name is lowercased and, when it is a key of $this->synonyms, replaced by
	 * the canonical name stored there. Any other name is returned lowercased.
	 *
	 * @param	string		Charset name or alias, any case
	 * @return	string		Lowercased and de-aliased charset name
	 */
	function parse_charset($charset) {
		$charset = strtolower($charset);

		return isset($this->synonyms[$charset]) ? $this->synonyms[$charset] : $charset;
	}

	/**
	 * Derives a charset from a locale string of the form
	 * "language[_country][.charset][@modifier]".
	 *
	 * Resolution order (first hit wins):
	 *  1. the whole lowercased locale is a key of $this->locale_to_charset
	 *  2. the locale carries an explicit ".charset" part (normalized by parse_charset())
	 *  3. the modifier is "euro": iso-8859-15
	 *  4. the language, mapped through $this->lang_to_langfamily when listed there, is
	 *     looked up in the Windows table if TYPO3_OS is 'WIN', else in the Unix table
	 *  5. fallback: iso-8859-1
	 *
	 * Locales without "@", "." or "_" parts and languages missing from the tables are
	 * resolved without touching undefined array offsets.
	 *
	 * @param	string		Locale string
	 * @return	string		Charset name
	 */
	function get_locale_charset($locale) {
		$locale = strtolower($locale);

			// exact locale specific charset?
		if (isset($this->locale_to_charset[$locale])) {
			return $this->locale_to_charset[$locale];
		}

			// get modifier (padded so a locale without '@' yields an empty modifier)
		list($locale,$modifier) = array_pad(explode('@',$locale),2,'');

			// locale contains charset: use it
		list($locale,$charset) = array_pad(explode('.',$locale),2,'');
		if ($charset) {
			return $this->parse_charset($charset);
		}

			// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
		if ($modifier == 'euro') {
			return 'iso-8859-15';
		}

			// get language; the country part is not used for the lookup
		$localeParts = explode('_',$locale);
		$language = $localeParts[0];
		if (isset($this->lang_to_langfamily[$language])) {
			$language = $this->lang_to_langfamily[$language];
		}

		if (TYPO3_OS == 'WIN') {
			$cs = isset($this->lang_to_charset_windows[$language]) ? $this->lang_to_charset_windows[$language] : '';
		} else {
			$cs = isset($this->lang_to_charset_unix[$language]) ? $this->lang_to_charset_unix[$language] : '';
		}

			// unknown language or empty table entry: default charset
		return $cs ? $cs : 'iso-8859-1';
	}









	/********************************************
	 *
	 * Charset Conversion functions
	 *
	 ********************************************/

	/**
	 * Converts a string from one charset to another.
	 *
	 * When the target is 'utf-8' or no entity fallback is requested, the conversion
	 * method named in $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
	 * ('mbstring', 'iconv' or 'recode') is tried first. If no method is configured,
	 * or the library call returns false, the conversion is done by utf8_encode()
	 * followed by utf8_decode().
	 *
	 * @param	string		Input string
	 * @param	string		Source charset; expected to be normalized already, the names are compared as given
	 * @param	string		Target charset, same expectation
	 * @param	boolean		If set, characters missing in the target charset become numeric entities (handled by utf8_decode())
	 * @return	string		Converted string
	 */
	function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
		if ($fromCS==$toCS) return $str;

			// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
		if ($toCS=='utf-8' || !$useEntityForNoChar) {
				// the setting is optional; an unset value simply selects no library
			$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] : '';

			switch($convMethod) {
				case 'mbstring':
					$conv_str = mb_convert_encoding($str,$toCS,$fromCS);
					if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets
					break;

				case 'iconv':
					$conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
					if (false !== $conv_str) return $conv_str;
					break;

				case 'recode':
					$conv_str = recode_string($fromCS.'..'.$toCS,$str);
					if (false !== $conv_str) return $conv_str;
					break;
			}
				// fallback to TYPO3 conversion
		}

		if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS);
		if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
		return $str;
	}

	/**
	 * Converts all non-array values of a (nested) array in place with conv().
	 *
	 * @param	array		Array to convert, passed by reference; nested arrays are processed recursively
	 * @param	string		Source charset, see conv()
	 * @param	string		Target charset, see conv()
	 * @param	boolean		Passed on to conv()
	 * @return	void
	 */
	function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
		foreach($array as $key => $value) {
			if (is_array($array[$key])) {
				$this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
			} else {
				$array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
			}
		}
	}

	/**
	 * Converts a string from the given charset to UTF-8 using the conversion table
	 * loaded by initCharset().
	 *
	 * Bytes 0-127 are copied unchanged for charsets that are not listed in
	 * $this->twoByteSets. Characters without an entry in the conversion table are
	 * replaced by chr($this->noCharByteVal).
	 *
	 * @param	string		String in the source charset
	 * @param	string		Source charset name, used as given as key of the charset tables
	 * @return	string		UTF-8 string; nothing (NULL) is returned when initCharset() fails
	 */
	function utf8_encode($str,$charset) {

			// Parse conv. table if not already...
		if ($this->initCharset($charset)) {
			$strLen = strlen($str);
			$outStr = '';

			for ($a=0;$a<$strLen;$a++) { // Traverse each char in string.
				$chr = substr($str,$a,1);
				$ord = ord($chr);

				if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
					$ord2 = ord(substr($str,$a+1,1));
						// Combine high and low byte (big endian assumed). This has to be a
						// bitwise OR: ANDing the shifted high byte with the low byte is always 0.
					$ord = ($ord << 8) | $ord2;

					if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" byte
						$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
					} else $outStr.=chr($this->noCharByteVal); // No char exists
					$a++;
				} elseif ($ord>127) { // Bytes over 127 need the conversion table (and possibly a second byte)
					if ($charset == 'shift_jis') {
							// Shift-JIS is like EUC, but chars between 160 and 223 are single byte.
							// Tested before the EUC list because 'shift_jis' is listed there as well.
						$isTwoByteChar = ($ord < 160 || $ord > 223);
					} else {
							// EUC uses two bytes above 127
						$isTwoByteChar = isset($this->eucBasedSets[$charset]);
					}

					if ($isTwoByteChar) { // get the second byte, advance pointer and make $ord a 16bit int
						$a++;
						$ord2 = ord(substr($str,$a,1));
						$ord = $ord*256+$ord2;
					}

					if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise the "no char" byte
						$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
					} else $outStr.=chr($this->noCharByteVal); // No char exists
				} else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
			return $outStr;
		}
	}

	function utf8_decode($str,$charset,$useEntityForNoChar=0) {

			// Charset is case-insensitive.
		if ($this->initCharset($charset)) { // Parse conv. table if not already...
			$strLen = strlen($str);
			$outStr='';
			$buf='';
			for ($a=0,$i=0;$a<$strLen;$a++,$i++) { // Traverse each char in UTF-8 string.
				$chr=substr($str,$a,1);
				$ord=ord($chr);
				if ($ord>127) { // This means multibyte! (first byte!)
if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.

						$buf=$chr; // Add first byte
						for ($b=0;$b<8;$b++) { // for each byte in multibyte string...
							$ord = $ord << 1; // Shift it left and ...
							if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
								$a++; // Increase pointer...
								$buf.=substr($str,$a,1); // ... and add the next char.
							} else break;
						}

						if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
							$mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
							if ($mByte>255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
								$outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
							} else $outStr.= chr($mByte);
						} elseif ($useEntityForNoChar) { // Create num entity:
							$outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
						} else $outStr.=chr($this->noCharByteVal); // No char exists
					} else $outStr.=chr($this->noCharByteVal); // No char exists (MIDDLE of MB sequence!)
				} else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
			}
			return $outStr;
		}
	}

	/**
	 * Replaces every multibyte UTF-8 character of a string by a hexadecimal numeric
	 * entity ("&#x...;"). Bytes 0-127 are copied unchanged.
	 *
	 * A byte over 127 that cannot start a sequence (bit 64 not set) is replaced by
	 * chr($this->noCharByteVal).
	 *
	 * @param	string		UTF-8 string
	 * @return	string		String with multibyte characters replaced by entities
	 */
	function utf8_to_entities($str) {
		$strLen = strlen($str);
		$outStr = '';
		$buf = '';

		for ($a=0;$a<$strLen;$a++) { // Traverse each char in UTF-8 string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($ord <= 127) { // plain ASCII 0-127, one byte. Transparent
				$outStr.=$chr;
			} elseif ($ord & 64) { // The first byte of a sequence must have the 7th bit set
				$buf = $chr; // Add first byte
				for ($b=0;$b<8;$b++) { // for each byte in multibyte string...
					$ord = $ord << 1; // Shift it left and ...
					if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
						$a++; // Increase pointer...
						$buf.=substr($str,$a,1); // ... and add the next char.
					} else break;
				}

				$outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
			} else { // No char exists (MIDDLE of MB sequence!)
				$outStr.=chr($this->noCharByteVal);
			}
		}

		return $outStr;
	}

	/**
	 * Converts numeric entities ("&#123;", "&#x7b;") - and optionally named HTML
	 * entities - in a string to UTF-8 characters.
	 *
	 * The string is cut at every "&name;" match; entity names that are neither
	 * numeric nor (when requested) found in PHP's HTML entity table are put back
	 * unchanged.
	 *
	 * @param	string		Input string; text outside entities is passed through as is
	 * @param	boolean		If set, named entities from get_html_translation_table(HTML_ENTITIES) are converted as well
	 * @return	string		String with the entities replaced
	 */
	function entities_to_utf8($str,$alsoStdHtmlEnt=0) {
		$trans_tbl = array();
		if ($alsoStdHtmlEnt) {
				// Getting them in iso-8859-1 - but thats ok since this is observed below.
				// NOTE(review): the utf8_encode(...,'iso-8859-1') call below relies on the table
				// being ISO-8859-1; newer PHP versions are documented to default to UTF-8 here - confirm.
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
		}

			// Odd-numbered parts are entity names. preg_replace() replaces ereg_replace(),
			// which is no longer available since PHP 7.0. '${2}' must keep its braces
			// because the token directly following it consists of hex digits.
		$token = md5(microtime());
		$parts = explode($token,preg_replace('/(&([#[:alnum:]]*);)/',$token.'${2}'.$token,$str));

		foreach($parts as $k => $v) {
			if ($k%2) {
				if (substr($v,0,1)=='#') { // Dec or hex entities:
					if (substr($v,1,1)=='x') {
						$parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
					} else {
						$parts[$k] = $this->UnumberToChar(substr($v,1));
					}
				} elseif ($alsoStdHtmlEnt && !empty($trans_tbl['&'.$v.';'])) { // Other entities (unknown names fall through):
					$parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
				} else { // No conversion:
					$parts[$k] ='&'.$v.';';
				}
			}
		}

		return implode('',$parts);
	}

	/**
	 * Splits a UTF-8 string into an array with one element per character.
	 *
	 * @param	string		UTF-8 string
	 * @param	boolean		If set, entities are converted to characters first (entities_to_utf8() with named entities enabled)
	 * @param	boolean		If set, the elements are the characters themselves (byte sequences) instead of their Unicode numbers
	 * @return	array		Unicode numbers, or characters when $retChar is set; a byte over 127 that cannot start a sequence yields $this->noCharByteVal
	 */
	function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
			// If entities must be registered as well...:
		if ($convEntities) {
			$str = $this->entities_to_utf8($str,1);
		}

			// Do conversion:
		$strLen = strlen($str);
		$outArr = array();
		$buf = '';

		for ($a=0;$a<$strLen;$a++) { // Traverse each char in UTF-8 string.
			$chr = substr($str,$a,1);
			$ord = ord($chr);

			if ($ord <= 127) { // plain ASCII 0-127, one byte. Transparent
				$outArr[] = $retChar ? chr($ord) : $ord;
			} elseif ($ord & 64) { // The first byte of a sequence must have the 7th bit set
				$buf = $chr; // Add first byte
				for ($b=0;$b<8;$b++) { // for each byte in multibyte string...
					$ord = $ord << 1; // Shift it left and ...
					if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
						$a++; // Increase pointer...
						$buf.=substr($str,$a,1); // ... and add the next char.
					} else break;
				}

				$outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
			} else { // No char exists (MIDDLE of MB sequence!)
				$outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			}
		}

		return $outArr;
	}

	function UnumberToChar($cbyte) {
		$str='';

		if ($cbyte < 0x80) {
			$str.=chr($cbyte);
		} else if ($cbyte < 0x800) {
			$str.=chr(0xC0 | ($cbyte >> 6));
			$str.=chr(0x80 | ($cbyte & 0x3F));
		} else if ($cbyte < 0x10000) {
			$str.=chr(0xE0 | ($cbyte >> 12));
			$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.=chr(0x80 | ($cbyte & 0x3F));
		} else if ($cbyte < 0x200000) {
			$str.=chr(0xF0 | ($cbyte >> 18));
			$str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.=chr(0x80 | ($cbyte & 0x3F));
		} else if ($cbyte < 0x4000000) {
			$str.=chr(0xF8 | ($cbyte >> 24));
			$str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
			$str.=chr(0x80 | ($cbyte & 0x3F));
		} else if ($cbyte < 0x80000000) {
			$str.=chr(0xFC | ($cbyte >> 30));
			$str.=chr(0x80 | (($cbyte >> 24) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
			$str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
$str.=chr(0x80 | (($cbyte >> 6) & 0x3F)); 00867 $str.=chr(0x80 | ($cbyte & 0x3F)); 00868 } else { // Cannot express a 32-bit character in UTF-8 00869 $str .= chr($this->noCharByteVal); 00870 } 00871 return $str; 00872 } 00873 00883 function utf8CharToUnumber($str,$hex=0) { 00884 $ord=ord(substr($str,0,1)); // First char 00885 00886 if (($ord & 192) == 192) { // This verifyes that it IS a multi byte string 00887 $binBuf=''; 00888 for ($b=0;$b<8;$b++) { // for each byte in multibyte string... 00889 $ord = $ord << 1; // Shift it left and ... 00890 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 00891 $binBuf.=substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6); 00892 } else break; 00893 } 00894 $binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf; 00895 00896 $int = bindec($binBuf); 00897 } else $int = $ord; 00898 00899 return $hex ? 'x'.dechex($int) : $int; 00900 } 00901 00902 00903 00904 00905 00906 00907 00908 00909 00910 /******************************************** 00911 * 00912 * Init functions 00913 * 00914 ********************************************/ 00915 00926 function initCharset($charset) { 00927 // Only process if the charset is not yet loaded: 00928 if (!is_array($this->parsedCharsets[$charset])) { 00929 00930 // Conversion table filename: 00931 $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl'; 00932 00933 // If the conversion table is found: 00934 if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) { 00935 // Cache file for charsets: 00936 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero. 
00937 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl'); 00938 if ($cacheFile && @is_file($cacheFile)) { 00939 $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile)); 00940 } else { 00941 // Parse conversion table into lines: 00942 $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1); 00943 // Initialize the internal variable holding the conv. table: 00944 $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array()); 00945 // traverse the lines: 00946 $detectedType=''; 00947 foreach($lines as $value) { 00948 if (trim($value) && substr($value,0,1)!='#') { // Comment line or blanks are ignored. 00949 00950 // Detect type if not done yet: (Done on first real line) 00951 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE" 00952 if (!$detectedType) $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 
'whitespaced' : 'ms-token'; 00953 00954 if ($detectedType=='ms-token') { 00955 list($hexbyte,$utf8) = split('=|:',$value,3); 00956 } elseif ($detectedType=='whitespaced') { 00957 $regA=array(); 00958 ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA); 00959 $hexbyte = $regA[1]; 00960 $utf8 = 'U+'.$regA[2]; 00961 } 00962 $decval = hexdec(trim($hexbyte)); 00963 if ($decval>127) { 00964 $utf8decval = hexdec(substr(trim($utf8),2)); 00965 $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval); 00966 $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval; 00967 } 00968 } 00969 } 00970 if ($cacheFile) { 00971 t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset])); 00972 } 00973 } 00974 return 2; 00975 } else return false; 00976 } else return 1; 00977 } 00978 00988 function initUnicodeData($mode=null) { 00989 // cache files 00990 $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl'); 00991 $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl'); 00992 00993 // Only process if the tables are not yet loaded 00994 switch($mode) { 00995 case 'case': 00996 if (is_array($this->caseFolding['utf-8'])) return 1; 00997 00998 // Use cached version if possible 00999 if ($cacheFileCase && @is_file($cacheFileCase)) { 01000 $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase)); 01001 return 2; 01002 } 01003 break; 01004 01005 case 'ascii': 01006 if (is_array($this->toASCII['utf-8'])) return 1; 01007 01008 // Use cached version if possible 01009 if ($cacheFileASCII && @is_file($cacheFileASCII)) { 01010 $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII)); 01011 return 2; 01012 } 01013 break; 01014 } 01015 01016 // process main Unicode data file 01017 $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt'; 01018 if 
(!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false; 01019 01020 $fh = fopen($unicodeDataFile,'rb'); 01021 if (!$fh) return false; 01022 01023 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence) 01024 // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper) 01025 $this->caseFolding['utf-8'] = array(); 01026 $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand 01027 $utf8CaseFolding['toUpper'] = array(); 01028 $utf8CaseFolding['toLower'] = array(); 01029 $utf8CaseFolding['toTitle'] = array(); 01030 01031 $decomposition = array(); // array of temp. decompositions 01032 $mark = array(); // array of chars that are marks (eg. composing accents) 01033 $number = array(); // array of chars that are numbers (eg. digits) 01034 $omit = array(); // array of chars to be omitted (eg. Russian hard sign) 01035 01036 while (!feof($fh)) { 01037 $line = fgets($fh,4096); 01038 // has a lot of info 01039 list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line)); 01040 01041 $ord = hexdec($char); 01042 if ($ord > 0xFFFF) break; // only process the BMP 01043 01044 $utf8_char = $this->UnumberToChar($ord); 01045 01046 if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper)); 01047 if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower)); 01048 // store "title" only when different from "upper" (only a few) 01049 if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title)); 01050 01051 switch ($cat{0}) { 01052 case 'M': // mark (accent, umlaut, ...) 
01053 $mark["U+$char"] = 1; 01054 break; 01055 01056 case 'N': // numeric value 01057 if ($ord > 0x80 && $num != '') $number["U+$char"] = $num; 01058 } 01059 01060 // accented Latin letters without "official" decomposition 01061 $match = array(); 01062 if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp) { 01063 $c = ord($match[2]); 01064 if ($match[1] == 'SMALL') $c += 32; 01065 01066 $decomposition["U+$char"] = array(dechex($c)); 01067 continue; 01068 } 01069 01070 $match = array(); 01071 if (ereg('(<.*>)? *(.+)',$decomp,$match)) { 01072 switch($match[1]) { 01073 case '<circle>': // add parenthesis as circle replacement, eg (1) 01074 $match[2] = '0028 '.$match[2].' 0029'; 01075 break; 01076 01077 case '<square>': // add square brackets as square replacement, eg [1] 01078 $match[2] = '005B '.$match[2].' 005D'; 01079 break; 01080 01081 case '<compat>': // ignore multi char decompositions that start with a space 01082 if (ereg('^0020 ',$match[2])) continue 2; 01083 break; 01084 01085 // ignore Arabic and vertical layout presentation decomposition 01086 case '<initial>': 01087 case '<medial>': 01088 case '<final>': 01089 case '<isolated>': 01090 case '<vertical>': 01091 continue 2; 01092 } 01093 $decomposition["U+$char"] = split(' ',$match[2]); 01094 } 01095 } 01096 fclose($fh); 01097 01098 // process additional Unicode data for casing (allow folded characters to expand into a sequence) 01099 $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt'; 01100 if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) { 01101 $fh = fopen($specialCasingFile,'rb'); 01102 if ($fh) { 01103 while (!feof($fh)) { 01104 $line = fgets($fh,4096); 01105 if ($line{0} != '#' && trim($line) != '') { 01106 01107 list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line); 01108 if ($cond == '' || $cond{0} == '#') { 01109 $utf8_char = $this->UnumberToChar(hexdec($char)); 01110 if ($char != $lower) { 01111 $arr = split(' 
',$lower); 01112 for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i])); 01113 $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr); 01114 } 01115 if ($char != $title && $title != $upper) { 01116 $arr = split(' ',$title); 01117 for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i])); 01118 $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr); 01119 } 01120 if ($char != $upper) { 01121 $arr = split(' ',$upper); 01122 for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i])); 01123 $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr); 01124 } 01125 } 01126 } 01127 } 01128 fclose($fh); 01129 } 01130 } 01131 01132 // process custom decompositions 01133 $customTranslitFile = PATH_t3lib.'unidata/Translit.txt'; 01134 if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) { 01135 $fh = fopen($customTranslitFile,'rb'); 01136 if ($fh) { 01137 while (!feof($fh)) { 01138 $line = fgets($fh,4096); 01139 if ($line{0} != '#' && trim($line) != '') { 01140 list($char,$translit) = t3lib_div::trimExplode(';', $line); 01141 if (!$translit) $omit["U+$char"] = 1; 01142 $decomposition["U+$char"] = split(' ', $translit); 01143 01144 } 01145 } 01146 fclose($fh); 01147 } 01148 } 01149 01150 // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>) 01151 foreach($decomposition as $from => $to) { 01152 $code_decomp = array(); 01153 01154 while ($code_value = array_shift($to)) { 01155 if (isset($decomposition["U+$code_value"])) { // do recursive decomposition 01156 foreach(array_reverse($decomposition["U+$code_value"]) as $cv) { 01157 array_unshift($to, $cv); 01158 } 01159 } elseif (!isset($mark["U+$code_value"])) { // remove mark 01160 array_push($code_decomp, $code_value); 01161 } 01162 } 01163 if (count($code_decomp) || isset($omit[$from])) { 01164 $decomposition[$from] = $code_decomp; 01165 } else { 01166 unset($decomposition[$from]); 01167 } 
// Remainder of initUnicodeData(): the function and the loop closed on the next line both begin above this span.
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to)	{
		$code_decomp = array();
		while ($code_value = array_shift($to))	{
			$ord = hexdec($code_value);
			if ($ord > 127)
				continue 2;	// skip decompositions containing non-ASCII chars
			else
				array_push($code_decomp,chr($ord));
		}
		$ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to)	{
		$utf8_char = $this->UnumberToChar(hexdec($from));
		if (!isset($ascii[$utf8_char]))	{
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}

/**
 * Builds the case folding table of a charset from the UTF-8 case folding table.
 * The result is stored in $this->caseFolding[$charset] and written to a cache file in typo3temp/cs/.
 *
 * @param	string		Charset name
 * @return	mixed		1 if the table was already loaded, 2 if it was read from the cache file, 3 if it was built now; false if initCharset() or initUnicodeData() failed
 */
function initCaseFolding($charset)	{
		// Only process if the case table is not yet loaded (isset() avoids a notice on first use):
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case'))	{
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$foldings = array('toUpper','toLower','toTitle');
	foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($foldings as $folding)	{
				// characters without a UTF-8 mapping for this folding are left out of the table
			if (!isset($this->caseFolding['utf-8'][$folding][$utf8]))	continue;

			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$folding][$utf8], $charset);
				// only keep mappings whose target exists in the charset
			if ($cc != '' && $cc != $nochar)	$this->caseFolding[$charset][$folding][$c] = $cc;
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++)	{
		$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++)	{
		$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}

/**
 * Builds the charset-to-ASCII transliteration table of a charset from the UTF-8 table.
 * The result is stored in $this->toASCII[$charset] and written to a cache file in typo3temp/cs/.
 *
 * @param	string		Charset name
 * @return	mixed		1 if the table was already loaded, 2 if it was read from the cache file, 3 if it was built now; false if initCharset() or initUnicodeData() failed
 */
function initToASCII($charset)	{
		// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset]))	return 1;

		// Use cached version if possible
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii'))	{
		return false;
	}

		// Start from an empty table so that a charset without any mapping is still
		// recognised as "loaded" and an array (not null) ends up in the cache file.
	$this->toASCII[$charset] = array();
	foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)	{
		if (isset($this->toASCII['utf-8'][$utf8]))	{
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}


/********************************************
 *
 * String operation functions
 *
 ********************************************/

/**
 * Returns a part of a string, counted in characters of the given charset.
 *
 * @param	string		The character set
 * @param	string		The string to cut
 * @param	integer		Start position (character position)
 * @param	integer		Length (in characters); null means "up to the end of the string"
 * @return	string		The substring
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// cannot omit $len, when specifying charset
		if ($len==null)	{
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	}
	if (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// fixed-width charsets; everything that is not listed is treated as single-byte
	$width = 1;
	if (!empty($this->twoByteSets[$charset]))	{
		$width = 2;
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$width = 4;
	}

		// a null length must not be multiplied (null*2 is 0, which would return an empty string)
	if ($len === null)	return substr($string,$start*$width);
	return substr($string,$start*$width,$len*$width);
}

/**
 * Counts the characters of a string in the given charset.
 *
 * @param	string		The character set
 * @param	string		The string to measure
 * @return	integer		Number of characters
 */
function strlen($charset,$string)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	}
	if (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_strlen($string,$charset);
	}
	if (!empty($this->twoByteSets[$charset]))	{
		return strlen($string)/2;
	}
	if (!empty($this->fourByteSets[$charset]))	{
		return strlen($string)/4;
	}
		// treat everything else as single-byte encoding
	return strlen($string);
}

/**
 * Crops a string to a number of characters and marks the cut with $crop.
 * A positive $len keeps the first $len characters and appends $crop; a negative $len keeps the
 * last abs($len) characters and prepends $crop. The string is returned unchanged when it is not
 * longer than the limit.
 *
 * @param	string		The character set
 * @param	string		The string to crop
 * @param	integer		Number of characters to keep (see above); 0 returns the string unchanged
 * @param	string		Marker added at the cut position
 * @return	string		The cropped string
 */
function crop($charset,$string,$len,$crop='')	{
	if (intval($len) == 0)	return $string;

		// $i becomes the byte position of the cut (or false if the string is too short)
	if ($charset == 'utf-8')	{
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0)	{
		$i = $len;
	} else {
		$i = strlen($string)+$len;
		if ($i<=0)	$i = false;
	}

	if ($i === false)	{	// $len outside actual string length
		return $string;
	}

	$byteLen = strlen($string);
	if ($len > 0)	{
			// only crop if there is at least one byte behind the cut position
		if ($i < $byteLen)	{
			return substr($string,0,$i).$crop;
		}
	} elseif ($i >= 1 && $i <= $byteLen)	{
			// only crop if there is at least one byte in front of the cut position
		return $crop.substr($string,$i);
	}

	return $string;
}

/**
 * Truncates a string to at most $len bytes without cutting a multi-byte character in two.
 *
 * @param	string		The character set
 * @param	string		The string to truncate
 * @param	integer		Maximum length in bytes
 * @return	string		The truncated string; empty if $len is not positive
 */
function strtrunc($charset,$string,$len)	{
	if ($len <= 0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
		if ($len % 2)	$len--;		// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len -= $len % 4;	// realign to position dividable by four
	}
		// treat everything else as single-byte encoding
	return substr($string,0,$len);
}

/**
 * Converts the case of a string in the given charset.
 *
 * @param	string		The character set
 * @param	string		The string to convert
 * @param	string		Case table to apply, e.g. 'toLower' or 'toUpper'. With mbstring every value other than 'toLower' converts to upper case.
 * @return	string		The converted string
 */
function conv_case($charset,$string,$case)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && version_compare(phpversion(),'4.3.0','>='))	{
		if ($case == 'toLower')	{
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'case',$case);
	}
	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}
		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}

/**
 * Replaces characters of a string by their entry in the charset's to-ASCII table.
 *
 * @param	string		The character set
 * @param	string		The string to convert
 * @return	string		The converted string
 */
function specCharsToASCII($charset,$string)	{
	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'ascii');
	}
	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'ascii');
	}
		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'ascii');
}


/********************************************
 *
 * Internal string operation functions
 *
 ********************************************/

/**
 * Maps every byte of a single-byte encoded string through a conversion table.
 * Bytes without an entry in the table are copied unchanged.
 *
 * @param	string		The string to convert
 * @param	string		The character set
 * @param	string		Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		Name of the case table ('toLower', 'toUpper', ...); only used for mode 'case'
 * @return	string		The converted string; the input string if the table could not be initialised or the mode is unknown
 */
function sb_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
				// read without a reference so that an unknown $opt does not create a table entry
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
		break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
		break;

		default:
			return $str;
	}
	if (!is_array($map))	$map = array();

	$out = '';
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++)	{
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}

	return $out;
}


/********************************************
 *
 * Internal UTF-8 string operation functions
 *
 ********************************************/

/**
 * Returns a part of a UTF-8 string, counted in characters.
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position (character position); negative values count from the end
 * @param	integer		Length (in characters); negative values leave out characters at the end; null means "up to the end"
 * @return	mixed		The substring, or false if a positive $start lies outside the string
 */
function utf8_substr($str,$start,$len=null)	{
	if ((string)$len === '0')	return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false)	{
		if ($start > 0)	{
			return false;	// $start outside string length
		}
			// a negative $start reaching up to (or before) the first character: start at the beginning
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len == null)	return $str;

	$byte_end = $this->utf8_char2byte_pos($str,$len);
	if ($byte_end === false)	{	// $len outside actual string length
		return $len<0 ? '' : $str;	// When length is less than zero and exceeds, then we return blank string.
	}
	return substr($str,0,$byte_end);
}

/**
 * Counts the characters of a UTF-8 string.
 * Every byte that is not a continuation byte (10xxxxxx) starts a character.
 *
 * @param	string		UTF-8 string
 * @return	integer		Number of characters
 */
function utf8_strlen($str)	{
	$n = 0;
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}
	return $n;
}

/**
 * Truncates a UTF-8 string to at most $len bytes without cutting a multi-byte character in two.
 *
 * @param	string		UTF-8 string
 * @param	integer		Maximum length in bytes
 * @return	string		The truncated string
 */
function utf8_strtrunc($str,$len)	{
	$len = intval($len);
	if ($len <= 0)	return '';
	if ($len >= strlen($str))	return $str;	// nothing to cut

	$i = $len-1;
	if (!(ord($str[$i]) & 0x80))	{
		return substr($str,0,$len);	// last kept byte is ASCII: clean cut
	}

		// last kept byte is part of a multi-byte sequence: find the first byte of that sequence
	while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)	$i--;

	$first = ord($str[$i]);
	if (($first & 0xC0) != 0xC0)	{
			// malformed input: continuation bytes without a starting byte are dropped
		return ($first & 0x80) ? '' : substr($str,0,$i+1);
	}

	for ($bc=0, $mbs=$first; $mbs & 0x80; $mbs = $mbs << 1)	$bc++;	// calculate number of bytes
	if ($bc+$i > $len)	return substr($str,0,$i);	// character does not fit: cut in front of it

		// multibyte char fits into length
	return substr($str,0,$len);
}

/**
 * Finds the first occurrence of a string in a UTF-8 string.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Character position to start the search at
 * @return	mixed		Character position of the match, or false if there is none
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false)	return false; // offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false)	return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}

/**
 * Finds the last occurrence of a string in a UTF-8 string.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @return	mixed		Character position of the match, or false if there is none
 */
function utf8_strrpos($haystack,$needle)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// The meaning of the third argument of mb_strrpos() differs between PHP versions
			// (encoding vs. offset), so the encoding is selected via the internal encoding instead.
		$enc = mb_internal_encoding();	// save internal encoding
		mb_internal_encoding('utf-8');
		$pos = mb_strrpos($haystack,$needle);
		mb_internal_encoding($enc);	// restore internal encoding

		return $pos;
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false)	return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}

/**
 * Translates a character position into a byte position of a UTF-8 string.
 * For a negative position the characters are counted from the end of the string.
 * Note: if the requested position is exactly the end of a string that ends with a multi-byte
 * character, the byte length is returned instead of false; crop() compensates for this.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative values count from the end)
 * @return	mixed		Byte position, or false if the position lies outside the string
 */
function utf8_char2byte_pos($str,$pos)	{
	$byteLen = strlen($str);
	$n = 0;				// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLen-1;
		$d = -1;
	}

		// explicit bounds: reading an out-of-range (or negative) string offset is not a reliable end marker
	for ( ; $i >= 0 && $i < $byteLen && $n<$p; $i+=$d)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;	// single-byte char or multi-byte starting byte
	}
	if ($i < 0 || $i >= $byteLen)	return false; // offset beyond string length

	if ($pos >= 0)	{
			// skip trailing multi-byte data bytes
		while ($i < $byteLen && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }
	} else {
			// correct offset
		$i++;
	}

	return $i;
}

/**
 * Translates a byte position into a character position of a UTF-8 string.
 *
 * @param	string		UTF-8 string
 * @param	integer		Byte position
 * @return	mixed		Character position, or false if the byte position lies outside the string
 */
function utf8_byte2char_pos($str,$pos)	{
	if ($pos < 0 || $pos >= strlen($str))	return false; // offset beyond string length

	$n = 0;	// number of characters
		// every character start between byte 1 and $pos lies behind one more character
	for ($i=$pos; $i>0; $i--)	{
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;	// single-byte char or multi-byte starting byte
	}

	return $n;
}

/**
 * Maps every character of a UTF-8 string through a conversion table.
 * Characters without an entry in the table are copied unchanged.
 *
 * @param	string		UTF-8 string
 * @param	string		Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		Name of the case table ('toLower', 'toUpper', ...); only used for mode 'case'
 * @return	string		The converted string; the input string if the Unicode data could not be initialised or the mode is unknown
 */
function utf8_char_mapping($str,$mode,$opt='')	{
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

	switch($mode)	{
		case 'case':
				// read without a reference so that an unknown $opt does not create a table entry
			$map = isset($this->caseFolding['utf-8'][$opt]) ? $this->caseFolding['utf-8'][$opt] : array();
		break;

		case 'ascii':
			$map = isset($this->toASCII['utf-8']) ? $this->toASCII['utf-8'] : array();
		break;

		default:
			return $str;
	}
	if (!is_array($map))	$map = array();

	$out = '';
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++)	{
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte char, or a stray continuation byte which is passed through as it is
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}

/********************************************
 *
 * Internal EUC string operation functions
 *
 * Extended Unix Code:
 *  ASCII compatible 7bit single bytes chars
 *  8bit two byte chars
 *
 * Shift-JIS is treated as a special case.
 *
 ********************************************/

/**
 * Truncates an EUC (or Shift-JIS) string to at most $len bytes without cutting a
 * double-byte character in two.
 *
 * @param	string		EUC multibyte string
 * @param	integer		Maximum length in bytes
 * @param	string		The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return	string		The truncated string
 */
function euc_strtrunc($str,$len,$charset)	{
	$len = intval($len);
	if ($len <= 0)	return '';
	if (strlen($str) <= $len)	return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// walk over whole characters until the byte limit is reached or passed
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}
	}

	if ($i > $len)	{
		return substr($str,0,$len-1);	// the byte at $len-1 is a first byte: drop it
	}
	return substr($str,0,$len);
}

/**
 * Returns a part of an EUC (or Shift-JIS) string, counted in characters.
 *
 * @param	string		EUC multibyte string
 * @param	integer		Start position (character position); negative values count from the end
 * @param	string		The charset
 * @param	integer		Length (in characters); negative values leave out characters at the end; null means "up to the end"
 * @return	mixed		The substring, or false if a positive $start lies outside the string
 */
function euc_substr($str,$start,$charset,$len=null)	{
	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	{
		if ($start > 0)	{
			return false;	// $start outside string length
		}
			// a negative $start reaching up to (or before) the first character: start at the beginning
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len == null)	return $str;

	$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
	if ($byte_end === false)	{	// $len outside actual string length
		return $len<0 ? '' : $str;	// a negative length that exceeds the string leaves nothing
	}
	return substr($str,0,$byte_end);
}

/**
 * Counts the characters of an EUC (or Shift-JIS) string.
 *
 * @param	string		EUC multibyte string
 * @param	string		The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return	integer		Number of characters
 */
function euc_strlen($str,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$n = 0;
	for ($i=0; $i<$byteLen; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}

/**
 * Translates a character position into a byte position of an EUC (or Shift-JIS) string.
 * For a negative position the characters are counted from the end of the string.
 *
 * @param	string		EUC multibyte string
 * @param	integer		Character position (negative values count from the end)
 * @param	string		The charset; 'shift_jis' selects the Shift-JIS lead byte ranges
 * @return	mixed		Byte position, or false if the position lies outside the string
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$n = 0;	// number of characters seen
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLen-1;
		$d = -1;
	}

		// explicit bounds: reading an out-of-range (or negative) string offset is not a reliable end marker
	for ( ; $i >= 0 && $i < $byteLen && $n<$p; $i+=$d)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i+=$d;	// advance a double-byte char
		} else {
			if ($c >= 0x80)	$i+=$d;	// advance a double-byte char
		}

		$n++;
	}
	if ($i < 0 || $i >= $byteLen)	return false; // offset beyond string length

	if ($pos < 0)	$i++;	// correct offset

	return $i;
}

/**
 * Maps every character of an EUC (or Shift-JIS) string through a conversion table.
 * Characters without an entry in the table are copied unchanged.
 *
 * @param	string		EUC multibyte string
 * @param	string		The charset
 * @param	string		Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param	string		Name of the case table ('toLower', 'toUpper', ...); only used for mode 'case'
 * @return	string		The converted string; the input string if the table could not be initialised or the mode is unknown
 */
function euc_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
				// read without a reference so that an unknown $opt does not create a table entry
			$map = isset($this->caseFolding[$charset][$opt]) ? $this->caseFolding[$charset][$opt] : array();
		break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map = isset($this->toASCII[$charset]) ? $this->toASCII[$charset] : array();
		break;

		default:
			return $str;
	}
	if (!is_array($map))	$map = array();

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

		if ($sjis)	{
			$doubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$doubleByte = ($c >= 0x80);
		}
		if ($doubleByte)	{	// a double-byte char
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}

}	// end of class t3lib_cs

if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])	{
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
?>