Documentation TYPO3 par Ameos |
00001 <?php 00002 /*************************************************************** 00003 * Copyright notice 00004 * 00005 * (c) 2003-2007 Kasper Skaarhoj (kasperYYYY@typo3.com) 00006 * All rights reserved 00007 * 00008 * This script is part of the Typo3 project. The Typo3 project is 00009 * free software; you can redistribute it and/or modify 00010 * it under the terms of the GNU General Public License as published by 00011 * the Free Software Foundation; either version 2 of the License, or 00012 * (at your option) any later version. 00013 * 00014 * The GNU General Public License can be found at 00015 * http://www.gnu.org/copyleft/gpl.html. 00016 * 00017 * This script is distributed in the hope that it will be useful, 00018 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00019 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00020 * GNU General Public License for more details. 00021 * 00022 * This copyright notice MUST APPEAR in all copies of the script! 00023 ***************************************************************/ 00136 class t3lib_cs { 00137 var $noCharByteVal=63; // ASCII Value for chars with no equivalent. 
00138 00139 // This is the array where parsed conversion tables are stored (cached) 00140 var $parsedCharsets=array(); 00141 00142 // An array where case folding data will be stored (cached) 00143 var $caseFolding=array(); 00144 00145 // An array where charset-to-ASCII mappings are stored (cached) 00146 var $toASCII=array(); 00147 00148 // This tells the converter which charsets has two bytes per char: 00149 var $twoByteSets=array( 00150 'ucs-2'=>1, // 2-byte Unicode 00151 ); 00152 00153 // This tells the converter which charsets has four bytes per char: 00154 var $fourByteSets=array( 00155 'ucs-4'=>1, // 4-byte Unicode 00156 'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16) 00157 ); 00158 00159 // This tells the converter which charsets use a scheme like the Extended Unix Code: 00160 var $eucBasedSets=array( 00161 'gb2312'=>1, // Chinese, simplified. 00162 'big5'=>1, // Chinese, traditional. 00163 'euc-kr'=>1, // Korean 00164 'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80! 
00165 ); 00166 00167 // see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html 00168 // http://czyborra.com/charsets/iso8859.html 00169 var $synonyms=array( 00170 'us' => 'ascii', 00171 'us-ascii'=> 'ascii', 00172 'cp819' => 'iso-8859-1', 00173 'ibm819' => 'iso-8859-1', 00174 'iso-ir-100' => 'iso-8859-1', 00175 'iso-ir-109' => 'iso-8859-2', 00176 'iso-ir-148' => 'iso-8859-9', 00177 'iso-ir-199' => 'iso-8859-14', 00178 'iso-ir-203' => 'iso-8859-15', 00179 'csisolatin1' => 'iso-8859-1', 00180 'csisolatin2' => 'iso-8859-2', 00181 'csisolatin3' => 'iso-8859-3', 00182 'csisolatin5' => 'iso-8859-9', 00183 'csisolatin8' => 'iso-8859-14', 00184 'csisolatin9' => 'iso-8859-15', 00185 'csisolatingreek' => 'iso-8859-7', 00186 'iso-celtic' => 'iso-8859-14', 00187 'latin1' => 'iso-8859-1', 00188 'latin2' => 'iso-8859-2', 00189 'latin3' => 'iso-8859-3', 00190 'latin5' => 'iso-8859-9', 00191 'latin6' => 'iso-8859-10', 00192 'latin8' => 'iso-8859-14', 00193 'latin9' => 'iso-8859-15', 00194 'l1' => 'iso-8859-1', 00195 'l2' => 'iso-8859-2', 00196 'l3' => 'iso-8859-3', 00197 'l5' => 'iso-8859-9', 00198 'l6' => 'iso-8859-10', 00199 'l8' => 'iso-8859-14', 00200 'l9' => 'iso-8859-15', 00201 'cyrillic' => 'iso-8859-5', 00202 'arabic' => 'iso-8859-6', 00203 'tis-620' => 'iso-8859-11', 00204 'win874' => 'windows-874', 00205 'win1250' => 'windows-1250', 00206 'win1251' => 'windows-1251', 00207 'win1252' => 'windows-1252', 00208 'win1253' => 'windows-1253', 00209 'win1254' => 'windows-1254', 00210 'win1255' => 'windows-1255', 00211 'win1256' => 'windows-1256', 00212 'win1257' => 'windows-1257', 00213 'win1258' => 'windows-1258', 00214 'cp1250' => 'windows-1250', 00215 'cp1251' => 'windows-1251', 00216 'cp1252' => 'windows-1252', 00217 'ms-ee' => 'windows-1250', 00218 'ms-ansi' => 'windows-1252', 00219 'ms-greek' => 'windows-1253', 00220 'ms-turk' => 'windows-1254', 00221 'winbaltrim' => 'windows-1257', 00222 'koi-8ru' => 'koi-8r', 
00223 'koi8r' => 'koi-8r', 00224 'cp878' => 'koi-8r', 00225 'mac' => 'macroman', 00226 'macintosh' => 'macroman', 00227 'euc-cn' => 'gb2312', 00228 'x-euc-cn' => 'gb2312', 00229 'euccn' => 'gb2312', 00230 'cp936' => 'gb2312', 00231 'big-5' => 'big5', 00232 'cp950' => 'big5', 00233 'eucjp' => 'euc-jp', 00234 'sjis' => 'shift_jis', 00235 'shift-jis' => 'shift_jis', 00236 'cp932' => 'shift_jis', 00237 'cp949' => 'euc-kr', 00238 'utf7' => 'utf-7', 00239 'utf8' => 'utf-8', 00240 'utf16' => 'utf-16', 00241 'utf32' => 'utf-32', 00242 'utf8' => 'utf-8', 00243 'ucs2' => 'ucs-2', 00244 'ucs4' => 'ucs-4', 00245 ); 00246 00247 // mapping of iso-639:2 language codes to script names 00248 var $lang_to_script=array( 00249 // iso-639:2 language codes, see: 00250 // http://www.w3.org/WAI/ER/IG/ert/iso639.htm 00251 // http://www.loc.gov/standards/iso639-2/langcodes.html 00252 // http://www.unicode.org/onlinedat/languages.html 00253 'ar' => 'arabic', 00254 'bg' => 'cyrillic', // Bulgarian 00255 'bs' => 'east_european', // Bosnian 00256 'cs' => 'east_european', // Czech 00257 'da' => 'west_european', // Danish 00258 'de' => 'west_european', // German 00259 'es' => 'west_european', // Spanish 00260 'et' => 'estonian', 00261 'eo' => 'unicode', // Esperanto 00262 'eu' => 'west_european', // Basque 00263 'fa' => 'arabic', // Persian 00264 'fi' => 'west_european', // Finish 00265 'fo' => 'west_european', // Faroese 00266 'fr' => 'west_european', // French 00267 'gr' => 'greek', 00268 'he' => 'hebrew', // Hebrew (since 1998) 00269 'hi' => 'unicode', // Hindi 00270 'hr' => 'east_european', // Croatian 00271 'hu' => 'east_european', // Hungarian 00272 'iw' => 'hebrew', // Hebrew (til 1998) 00273 'is' => 'west_european', // Icelandic 00274 'it' => 'west_european', // Italian 00275 'ja' => 'japanese', 00276 'kl' => 'west_european', // Greenlandic 00277 'ko' => 'korean', 00278 'lt' => 'lithuanian', 00279 'lv' => 'west_european', // Latvian/Lettish 00280 'nl' => 'west_european', // Dutch 00281 
'no' => 'west_european', // Norwegian 00282 'pl' => 'east_european', // Polish 00283 'pt' => 'west_european', // Portuguese 00284 'ro' => 'east_european', // Romanian 00285 'ru' => 'cyrillic', // Russian 00286 'sk' => 'east_european', // Slovak 00287 'sl' => 'east_european', // Slovenian 00288 'sr' => 'cyrillic', // Serbian 00289 'sv' => 'west_european', // Swedish 00290 'th' => 'thai', 00291 'uk' => 'cyrillic', // Ukranian 00292 'vi' => 'vietnamese', 00293 'zh' => 'chinese', 00294 // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp 00295 // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp 00296 'ara' => 'arabic', 00297 'bgr' => 'cyrillic', // Bulgarian 00298 'cat' => 'west_european', // Catalan 00299 'chs' => 'simpl_chinese', 00300 'cht' => 'trad_chinese', 00301 'csy' => 'east_european', // Czech 00302 'dan' => 'west_european', // Danisch 00303 'deu' => 'west_european', // German 00304 'dea' => 'west_european', // German (Austrian) 00305 'des' => 'west_european', // German (Swiss) 00306 'ena' => 'west_european', // English (Australian) 00307 'enc' => 'west_european', // English (Canadian) 00308 'eng' => 'west_european', // English 00309 'enz' => 'west_european', // English (New Zealand) 00310 'enu' => 'west_european', // English (United States) 00311 'euq' => 'west_european', // Basque 00312 'fos' => 'west_european', // Faroese 00313 'far' => 'arabic', // Persian 00314 'fin' => 'west_european', // Finish 00315 'fra' => 'west_european', // French 00316 'frb' => 'west_european', // French (Belgian) 00317 'frc' => 'west_european', // French (Canadian) 00318 'frs' => 'west_european', // French (Swiss) 00319 'ell' => 'greek', 00320 'heb' => 'hebrew', 00321 'hin' => 'unicode', // Hindi 00322 'hun' => 'east_european', // Hungarian 00323 'isl' => 'west_euorpean', // Icelandic 00324 'ita' => 'west_european', // Italian 
00325 'its' => 'west_european', // Italian (Swiss) 00326 'jpn' => 'japanese', 00327 'kor' => 'korean', 00328 'lth' => 'lithuanian', 00329 'lvi' => 'west_european', // Latvian/Lettish 00330 'msl' => 'west_european', // Malay 00331 'nlb' => 'west_european', // Dutch (Belgian) 00332 'nld' => 'west_european', // Dutch 00333 'nor' => 'west_european', // Norwegian (bokmal) 00334 'non' => 'west_european', // Norwegian (nynorsk) 00335 'plk' => 'east_european', // Polish 00336 'ptg' => 'west_european', // Portuguese 00337 'ptb' => 'west_european', // Portuguese (Brazil) 00338 'rom' => 'east_european', // Romanian 00339 'rus' => 'cyrillic', // Russian 00340 'slv' => 'east_european', // Slovenian 00341 'sky' => 'east_european', // Slovak 00342 'srl' => 'east_european', // Serbian (Latin) 00343 'srb' => 'cyrillic', // Serbian (Cyrillic) 00344 'esp' => 'west_european', // Spanish (trad. sort) 00345 'esm' => 'west_european', // Spanish (Mexican) 00346 'esn' => 'west_european', // Spanish (internat. sort) 00347 'sve' => 'west_european', // Swedish 00348 'tha' => 'thai', 00349 'trk' => 'turkish', 00350 'ukr' => 'cyrillic', // Ukrainian 00351 // English language names 00352 'arabic' => 'arabic', 00353 'basque' => 'west_european', 00354 'bosnian' => 'east_european', 00355 'bulgarian' => 'east_european', 00356 'catalan' => 'west_european', 00357 'croatian' => 'east_european', 00358 'czech' => 'east_european', 00359 'danish' => 'west_european', 00360 'dutch' => 'west_european', 00361 'english' => 'west_european', 00362 'esperanto' => 'unicode', 00363 'estonian' => 'estonian', 00364 'faroese' => 'west_european', 00365 'farsi' => 'arabic', 00366 'finnish' => 'west_european', 00367 'french' => 'west_european', 00368 'galician' => 'west_european', 00369 'german' => 'west_european', 00370 'greek' => 'greek', 00371 'greenlandic' => 'west_european', 00372 'hebrew' => 'hebrew', 00373 'hindi' => 'unicode', 00374 'hungarian' => 'east_european', 00375 'icelandic' => 'west_european', 00376 
'italian' => 'west_european', 00377 'latvian' => 'west_european', 00378 'lettish' => 'west_european', 00379 'lithuanian' => 'lithuanian', 00380 'malay' => 'west_european', 00381 'norwegian' => 'west_european', 00382 'persian' => 'arabic', 00383 'polish' => 'east_european', 00384 'portuguese' => 'west_european', 00385 'russian' => 'cyrillic', 00386 'romanian' => 'east_european', 00387 'serbian' => 'cyrillic', 00388 'slovak' => 'east_european', 00389 'slovenian' => 'east_european', 00390 'spanish' => 'west_european', 00391 'svedish' => 'west_european', 00392 'that' => 'thai', 00393 'turkish' => 'turkish', 00394 'ukrainian' => 'cyrillic', 00395 ); 00396 00397 // mapping of language (family) names to charsets on Unix 00398 var $script_to_charset_unix=array( 00399 'west_european' => 'iso-8859-1', 00400 'estonian' => 'iso-8859-1', 00401 'east_european' => 'iso-8859-2', 00402 'baltic' => 'iso-8859-4', 00403 'cyrillic' => 'iso-8859-5', 00404 'arabic' => 'iso-8859-6', 00405 'greek' => 'iso-8859-7', 00406 'hebrew' => 'iso-8859-8', 00407 'turkish' => 'iso-8859-9', 00408 'thai' => 'iso-8859-11', // = TIS-620 00409 'lithuanian' => 'iso-8859-13', 00410 'chinese' => 'gb2312', // = euc-cn 00411 'japanese' => 'euc-jp', 00412 'korean' => 'euc-kr', 00413 'simpl_chinese' => 'gb2312', 00414 'trad_chinese' => 'big5', 00415 'vietnamese' => '', 00416 'unicode' => 'utf-8', 00417 ); 00418 00419 // mapping of language (family) names to charsets on Windows 00420 var $script_to_charset_windows=array( 00421 'east_european' => 'windows-1250', 00422 'cyrillic' => 'windows-1251', 00423 'west_european' => 'windows-1252', 00424 'greek' => 'windows-1253', 00425 'turkish' => 'windows-1254', 00426 'hebrew' => 'windows-1255', 00427 'arabic' => 'windows-1256', 00428 'baltic' => 'windows-1257', 00429 'estonian' => 'windows-1257', 00430 'lithuanian' => 'windows-1257', 00431 'vietnamese' => 'windows-1258', 00432 'thai' => 'cp874', 00433 'korean' => 'cp949', 00434 'chinese' => 'gb2312', 00435 'japanese' => 
'shift_jis', 00436 'simpl_chinese' => 'gb2312', 00437 'trad_chinese' => 'big5', 00438 ); 00439 00440 // mapping of locale names to charsets 00441 var $locale_to_charset=array( 00442 'japanese.euc' => 'euc-jp', 00443 'ja_jp.ujis' => 'euc-jp', 00444 'korean.euc' => 'euc-kr', 00445 'sr@Latn' => 'iso-8859-2', 00446 'zh_cn' => 'gb2312', 00447 'zh_hk' => 'big5', 00448 'zh_tw' => 'big5', 00449 ); 00450 00451 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3: 00452 // Empty values means "iso-8859-1" 00453 var $charSetArray = array( 00454 'dk' => '', 00455 'de' => '', 00456 'no' => '', 00457 'it' => '', 00458 'fr' => '', 00459 'es' => '', 00460 'nl' => '', 00461 'cz' => 'windows-1250', 00462 'pl' => 'iso-8859-2', 00463 'si' => 'windows-1250', 00464 'fi' => '', 00465 'tr' => 'iso-8859-9', 00466 'se' => '', 00467 'pt' => '', 00468 'ru' => 'windows-1251', 00469 'ro' => 'iso-8859-2', 00470 'ch' => 'gb2312', 00471 'sk' => 'windows-1250', 00472 'lt' => 'windows-1257', 00473 'is' => 'utf-8', 00474 'hr' => 'windows-1250', 00475 'hu' => 'iso-8859-2', 00476 'gl' => '', 00477 'th' => 'iso-8859-11', 00478 'gr' => 'iso-8859-7', 00479 'hk' => 'big5', 00480 'eu' => '', 00481 'bg' => 'windows-1251', 00482 'br' => '', 00483 'et' => 'iso-8859-4', 00484 'ar' => 'iso-8859-6', 00485 'he' => 'utf-8', 00486 'ua' => 'windows-1251', 00487 'jp' => 'shift_jis', 00488 'lv' => 'utf-8', 00489 'vn' => 'utf-8', 00490 'ca' => 'iso-8859-15', 00491 'ba' => 'iso-8859-2', 00492 'kr' => 'euc-kr', 00493 'eo' => 'utf-8', 00494 'my' => '', 00495 'hi' => 'utf-8', 00496 'fo' => 'utf-8', 00497 'fa' => 'utf-8', 00498 'sr' => 'utf-8' 00499 ); 00500 00501 // TYPO3 specific: Array with the iso names used for each system language in TYPO3: 00502 // Missing keys means: same as Typo3 00503 var $isoArray = array( 00504 'ba' => 'bs', 00505 'br' => 'pt_BR', 00506 'ch' => 'zh_CN', 00507 'cz' => 'cs', 00508 'dk' => 'da', 00509 'si' => 'sl', 00510 'se' => 'sv', 00511 'gl' => 'kl', 00512 'gr' 
=> 'el', 00513 'hk' => 'zh_HK', 00514 'kr' => 'ko', 00515 'ua' => 'uk', 00516 'jp' => 'ja', 00517 'vn' => 'vi', 00518 ); 00519 00527 function parse_charset($charset) { 00528 $charset = trim(strtolower($charset)); 00529 if (isset($this->synonyms[$charset])) $charset = $this->synonyms[$charset]; 00530 00531 return $charset; 00532 } 00533 00546 function get_locale_charset($locale) { 00547 $locale = strtolower($locale); 00548 00549 // exact locale specific charset? 00550 if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale]; 00551 00552 // get modifier 00553 list($locale,$modifier) = explode('@',$locale); 00554 00555 // locale contains charset: use it 00556 list($locale,$charset) = explode('.',$locale); 00557 if ($charset) return $this->parse_charset($charset); 00558 00559 // modifier is 'euro' (after charset check, because of xx.utf-8@euro) 00560 if ($modifier == 'euro') return 'iso-8859-15'; 00561 00562 // get language 00563 list($language,$country) = explode('_',$locale); 00564 if (isset($this->lang_to_script[$language])) $script = $this->lang_to_script[$language]; 00565 00566 if (TYPO3_OS == 'WIN') { 00567 $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'window-1252'; 00568 } else { 00569 $cs = $this->script_to_charset_unix[$script] ? 
$this->script_to_charset_unix[$script] : 'iso-8859-1'; 00570 } 00571 00572 return $cs; 00573 } 00574 00575 00576 00577 00578 00579 00580 00581 00582 00583 /******************************************** 00584 * 00585 * Charset Conversion functions 00586 * 00587 ********************************************/ 00588 00599 function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) { 00600 if ($fromCS==$toCS) return $str; 00601 00602 // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything 00603 if ($toCS=='utf-8' || !$useEntityForNoChar) { 00604 switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) { 00605 case 'mbstring': 00606 $conv_str = mb_convert_encoding($str,$toCS,$fromCS); 00607 if (false !== $conv_str) return $conv_str; // returns false for unsupported charsets 00608 break; 00609 00610 case 'iconv': 00611 $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str); 00612 if (false !== $conv_str) return $conv_str; 00613 break; 00614 00615 case 'recode': 00616 $conv_str = recode_string($fromCS.'..'.$toCS,$str); 00617 if (false !== $conv_str) return $conv_str; 00618 break; 00619 } 00620 // fallback to TYPO3 conversion 00621 } 00622 00623 if ($fromCS!='utf-8') $str=$this->utf8_encode($str,$fromCS); 00624 if ($toCS!='utf-8') $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar); 00625 return $str; 00626 } 00627 00639 function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) { 00640 foreach($array as $key => $value) { 00641 if (is_array($array[$key])) { 00642 $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar); 00643 } else { 00644 $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar); 00645 } 00646 } 00647 } 00648 00656 function utf8_encode($str,$charset) { 00657 00658 if ($charset === 'utf-8') return $str; 00659 00660 // Charset is case-insensitive. 00661 if ($this->initCharset($charset)) { // Parse conv. table if not already... 
00662 $strLen = strlen($str); 00663 $outStr=''; 00664 00665 for ($a=0;$a<$strLen;$a++) { // Traverse each char in string. 00666 $chr=substr($str,$a,1); 00667 $ord=ord($chr); 00668 if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char 00669 $ord2 = ord($str{$a+1}); 00670 $ord = $ord<<8 | $ord2; // assume big endian 00671 00672 if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?) 00673 $outStr.=$this->parsedCharsets[$charset]['local'][$ord]; 00674 } else $outStr.=chr($this->noCharByteVal); // No char exists 00675 $a++; 00676 } elseif ($ord>127) { // If char has value over 127 it's a multibyte char in UTF-8 00677 if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int. 00678 if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte 00679 $a++; 00680 $ord2=ord(substr($str,$a,1)); 00681 $ord = $ord*256+$ord2; 00682 } 00683 } 00684 00685 if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?) 00686 $outStr.= $this->parsedCharsets[$charset]['local'][$ord]; 00687 } else $outStr.= chr($this->noCharByteVal); // No char exists 00688 } else $outStr.= $chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent 00689 } 00690 return $outStr; 00691 } 00692 } 00693 00702 function utf8_decode($str,$charset,$useEntityForNoChar=0) { 00703 00704 // Charset is case-insensitive. 00705 if ($this->initCharset($charset)) { // Parse conv. table if not already... 00706 $strLen = strlen($str); 00707 $outStr=''; 00708 $buf=''; 00709 for ($a=0,$i=0;$a<$strLen;$a++,$i++) { // Traverse each char in UTF-8 string. 
00710 $chr=substr($str,$a,1); 00711 $ord=ord($chr); 00712 if ($ord>127) { // This means multibyte! (first byte!) 00713 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence. 00714 00715 $buf=$chr; // Add first byte 00716 for ($b=0;$b<8;$b++) { // for each byte in multibyte string... 00717 $ord = $ord << 1; // Shift it left and ... 00718 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 00719 $a++; // Increase pointer... 00720 $buf.=substr($str,$a,1); // ... and add the next char. 00721 } else break; 00722 } 00723 00724 if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then... 00725 $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number 00726 if ($mByte>255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars. 00727 $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255); 00728 } else $outStr.= chr($mByte); 00729 } elseif ($useEntityForNoChar) { // Create num entity: 00730 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';'; 00731 } else $outStr.=chr($this->noCharByteVal); // No char exists 00732 } else $outStr.=chr($this->noCharByteVal); // No char exists (MIDDLE of MB sequence!) 00733 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent 00734 } 00735 return $outStr; 00736 } 00737 } 00738 00745 function utf8_to_entities($str) { 00746 $strLen = strlen($str); 00747 $outStr=''; 00748 $buf=''; 00749 for ($a=0;$a<$strLen;$a++) { // Traverse each char in UTF-8 string. 00750 $chr=substr($str,$a,1); 00751 $ord=ord($chr); 00752 if ($ord>127) { // This means multibyte! (first byte!) 00753 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence. 
00754 $buf=$chr; // Add first byte 00755 for ($b=0;$b<8;$b++) { // for each byte in multibyte string... 00756 $ord = $ord << 1; // Shift it left and ... 00757 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 00758 $a++; // Increase pointer... 00759 $buf.=substr($str,$a,1); // ... and add the next char. 00760 } else break; 00761 } 00762 00763 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';'; 00764 } else $outStr.=chr($this->noCharByteVal); // No char exists (MIDDLE of MB sequence!) 00765 } else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent 00766 } 00767 00768 return $outStr; 00769 } 00770 00778 function entities_to_utf8($str,$alsoStdHtmlEnt=0) { 00779 if ($alsoStdHtmlEnt) { 00780 $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES)); // Getting them in iso-8859-1 - but thats ok since this is observed below. 00781 } 00782 00783 $token = md5(microtime()); 00784 $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str)); 00785 foreach($parts as $k => $v) { 00786 if ($k%2) { 00787 if (substr($v,0,1)=='#') { // Dec or hex entities: 00788 if (substr($v,1,1)=='x') { 00789 $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2))); 00790 } else { 00791 $parts[$k] = $this->UnumberToChar(substr($v,1)); 00792 } 00793 } elseif ($alsoStdHtmlEnt && $trans_tbl['&'.$v.';']) { // Other entities: 00794 $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1'); 00795 } else { // No conversion: 00796 $parts[$k] ='&'.$v.';'; 00797 } 00798 } 00799 } 00800 00801 return implode('',$parts); 00802 } 00803 00812 function utf8_to_numberarray($str,$convEntities=0,$retChar=0) { 00813 // If entities must be registered as well...: 00814 if ($convEntities) { 00815 $str = $this->entities_to_utf8($str,1); 00816 } 00817 // Do conversion: 00818 $strLen = strlen($str); 00819 $outArr=array(); 00820 $buf=''; 00821 for ($a=0;$a<$strLen;$a++) { // Traverse each char in UTF-8 
string. 00822 $chr=substr($str,$a,1); 00823 $ord=ord($chr); 00824 if ($ord>127) { // This means multibyte! (first byte!) 00825 if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence. 00826 $buf=$chr; // Add first byte 00827 for ($b=0;$b<8;$b++) { // for each byte in multibyte string... 00828 $ord = $ord << 1; // Shift it left and ... 00829 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 00830 $a++; // Increase pointer... 00831 $buf.=substr($str,$a,1); // ... and add the next char. 00832 } else break; 00833 } 00834 00835 $outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf); 00836 } else $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal; // No char exists (MIDDLE of MB sequence!) 00837 } else $outArr[]=$retChar?chr($ord):$ord; // ... otherwise it's just ASCII 0-127 and one byte. Transparent 00838 } 00839 00840 return $outArr; 00841 } 00842 00862 function UnumberToChar($cbyte) { 00863 $str=''; 00864 00865 if ($cbyte < 0x80) { 00866 $str.=chr($cbyte); 00867 } else if ($cbyte < 0x800) { 00868 $str.=chr(0xC0 | ($cbyte >> 6)); 00869 $str.=chr(0x80 | ($cbyte & 0x3F)); 00870 } else if ($cbyte < 0x10000) { 00871 $str.=chr(0xE0 | ($cbyte >> 12)); 00872 $str.=chr(0x80 | (($cbyte >> 6) & 0x3F)); 00873 $str.=chr(0x80 | ($cbyte & 0x3F)); 00874 } else if ($cbyte < 0x200000) { 00875 $str.=chr(0xF0 | ($cbyte >> 18)); 00876 $str.=chr(0x80 | (($cbyte >> 12) & 0x3F)); 00877 $str.=chr(0x80 | (($cbyte >> 6) & 0x3F)); 00878 $str.=chr(0x80 | ($cbyte & 0x3F)); 00879 } else if ($cbyte < 0x4000000) { 00880 $str.=chr(0xF8 | ($cbyte >> 24)); 00881 $str.=chr(0x80 | (($cbyte >> 18) & 0x3F)); 00882 $str.=chr(0x80 | (($cbyte >> 12) & 0x3F)); 00883 $str.=chr(0x80 | (($cbyte >> 6) & 0x3F)); 00884 $str.=chr(0x80 | ($cbyte & 0x3F)); 00885 } else if ($cbyte < 0x80000000) { 00886 $str.=chr(0xFC | ($cbyte >> 30)); 00887 $str.=chr(0x80 | (($cbyte >> 24) & 
0x3F)); 00888 $str.=chr(0x80 | (($cbyte >> 18) & 0x3F)); 00889 $str.=chr(0x80 | (($cbyte >> 12) & 0x3F)); 00890 $str.=chr(0x80 | (($cbyte >> 6) & 0x3F)); 00891 $str.=chr(0x80 | ($cbyte & 0x3F)); 00892 } else { // Cannot express a 32-bit character in UTF-8 00893 $str .= chr($this->noCharByteVal); 00894 } 00895 return $str; 00896 } 00897 00907 function utf8CharToUnumber($str,$hex=0) { 00908 $ord=ord(substr($str,0,1)); // First char 00909 00910 if (($ord & 192) == 192) { // This verifyes that it IS a multi byte string 00911 $binBuf=''; 00912 for ($b=0;$b<8;$b++) { // for each byte in multibyte string... 00913 $ord = $ord << 1; // Shift it left and ... 00914 if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence. 00915 $binBuf.=substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6); 00916 } else break; 00917 } 00918 $binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf; 00919 00920 $int = bindec($binBuf); 00921 } else $int = $ord; 00922 00923 return $hex ? 'x'.dechex($int) : $int; 00924 } 00925 00926 00927 00928 00929 00930 00931 00932 00933 00934 /******************************************** 00935 * 00936 * Init functions 00937 * 00938 ********************************************/ 00939 00950 function initCharset($charset) { 00951 // Only process if the charset is not yet loaded: 00952 if (!is_array($this->parsedCharsets[$charset])) { 00953 00954 // Conversion table filename: 00955 $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl'; 00956 00957 // If the conversion table is found: 00958 if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) { 00959 // Cache file for charsets: 00960 // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero. 
00961 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl'); 00962 if ($cacheFile && @is_file($cacheFile)) { 00963 $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile)); 00964 } else { 00965 // Parse conversion table into lines: 00966 $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1); 00967 // Initialize the internal variable holding the conv. table: 00968 $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array()); 00969 // traverse the lines: 00970 $detectedType=''; 00971 foreach($lines as $value) { 00972 if (trim($value) && substr($value,0,1)!='#') { // Comment line or blanks are ignored. 00973 00974 // Detect type if not done yet: (Done on first real line) 00975 // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE" 00976 if (!$detectedType) $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 
'whitespaced' : 'ms-token'; 00977 00978 if ($detectedType=='ms-token') { 00979 list($hexbyte,$utf8) = split('=|:',$value,3); 00980 } elseif ($detectedType=='whitespaced') { 00981 $regA=array(); 00982 ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA); 00983 $hexbyte = $regA[1]; 00984 $utf8 = 'U+'.$regA[2]; 00985 } 00986 $decval = hexdec(trim($hexbyte)); 00987 if ($decval>127) { 00988 $utf8decval = hexdec(substr(trim($utf8),2)); 00989 $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval); 00990 $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval; 00991 } 00992 } 00993 } 00994 if ($cacheFile) { 00995 t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset])); 00996 } 00997 } 00998 return 2; 00999 } else return false; 01000 } else return 1; 01001 } 01002 01012 function initUnicodeData($mode=null) { 01013 // cache files 01014 $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl'); 01015 $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl'); 01016 01017 // Only process if the tables are not yet loaded 01018 switch($mode) { 01019 case 'case': 01020 if (is_array($this->caseFolding['utf-8'])) return 1; 01021 01022 // Use cached version if possible 01023 if ($cacheFileCase && @is_file($cacheFileCase)) { 01024 $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase)); 01025 return 2; 01026 } 01027 break; 01028 01029 case 'ascii': 01030 if (is_array($this->toASCII['utf-8'])) return 1; 01031 01032 // Use cached version if possible 01033 if ($cacheFileASCII && @is_file($cacheFileASCII)) { 01034 $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII)); 01035 return 2; 01036 } 01037 break; 01038 } 01039 01040 // process main Unicode data file 01041 $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt'; 01042 if 
(!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false; 01043 01044 $fh = fopen($unicodeDataFile,'rb'); 01045 if (!$fh) return false; 01046 01047 // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence) 01048 // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper) 01049 $this->caseFolding['utf-8'] = array(); 01050 $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand 01051 $utf8CaseFolding['toUpper'] = array(); 01052 $utf8CaseFolding['toLower'] = array(); 01053 $utf8CaseFolding['toTitle'] = array(); 01054 01055 $decomposition = array(); // array of temp. decompositions 01056 $mark = array(); // array of chars that are marks (eg. composing accents) 01057 $number = array(); // array of chars that are numbers (eg. digits) 01058 $omit = array(); // array of chars to be omitted (eg. Russian hard sign) 01059 01060 while (!feof($fh)) { 01061 $line = fgets($fh,4096); 01062 // has a lot of info 01063 list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line)); 01064 01065 $ord = hexdec($char); 01066 if ($ord > 0xFFFF) break; // only process the BMP 01067 01068 $utf8_char = $this->UnumberToChar($ord); 01069 01070 if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper)); 01071 if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower)); 01072 // store "title" only when different from "upper" (only a few) 01073 if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title)); 01074 01075 switch ($cat{0}) { 01076 case 'M': // mark (accent, umlaut, ...) 
01077 $mark["U+$char"] = 1; 01078 break; 01079 01080 case 'N': // numeric value 01081 if ($ord > 0x80 && $num != '') $number["U+$char"] = $num; 01082 } 01083 01084 // accented Latin letters without "official" decomposition 01085 $match = array(); 01086 if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp) { 01087 $c = ord($match[2]); 01088 if ($match[1] == 'SMALL') $c += 32; 01089 01090 $decomposition["U+$char"] = array(dechex($c)); 01091 continue; 01092 } 01093 01094 $match = array(); 01095 if (ereg('(<.*>)? *(.+)',$decomp,$match)) { 01096 switch($match[1]) { 01097 case '<circle>': // add parenthesis as circle replacement, eg (1) 01098 $match[2] = '0028 '.$match[2].' 0029'; 01099 break; 01100 01101 case '<square>': // add square brackets as square replacement, eg [1] 01102 $match[2] = '005B '.$match[2].' 005D'; 01103 break; 01104 01105 case '<compat>': // ignore multi char decompositions that start with a space 01106 if (ereg('^0020 ',$match[2])) continue 2; 01107 break; 01108 01109 // ignore Arabic and vertical layout presentation decomposition 01110 case '<initial>': 01111 case '<medial>': 01112 case '<final>': 01113 case '<isolated>': 01114 case '<vertical>': 01115 continue 2; 01116 } 01117 $decomposition["U+$char"] = split(' ',$match[2]); 01118 } 01119 } 01120 fclose($fh); 01121 01122 // process additional Unicode data for casing (allow folded characters to expand into a sequence) 01123 $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt'; 01124 if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) { 01125 $fh = fopen($specialCasingFile,'rb'); 01126 if ($fh) { 01127 while (!feof($fh)) { 01128 $line = fgets($fh,4096); 01129 if ($line{0} != '#' && trim($line) != '') { 01130 01131 list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line); 01132 if ($cond == '' || $cond{0} == '#') { 01133 $utf8_char = $this->UnumberToChar(hexdec($char)); 01134 if ($char != $lower) { 01135 $arr = split(' 
',$lower); 01136 for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i])); 01137 $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr); 01138 } 01139 if ($char != $title && $title != $upper) { 01140 $arr = split(' ',$title); 01141 for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i])); 01142 $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr); 01143 } 01144 if ($char != $upper) { 01145 $arr = split(' ',$upper); 01146 for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i])); 01147 $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr); 01148 } 01149 } 01150 } 01151 } 01152 fclose($fh); 01153 } 01154 } 01155 01156 // process custom decompositions 01157 $customTranslitFile = PATH_t3lib.'unidata/Translit.txt'; 01158 if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) { 01159 $fh = fopen($customTranslitFile,'rb'); 01160 if ($fh) { 01161 while (!feof($fh)) { 01162 $line = fgets($fh,4096); 01163 if ($line{0} != '#' && trim($line) != '') { 01164 list($char,$translit) = t3lib_div::trimExplode(';', $line); 01165 if (!$translit) $omit["U+$char"] = 1; 01166 $decomposition["U+$char"] = split(' ', $translit); 01167 01168 } 01169 } 01170 fclose($fh); 01171 } 01172 } 01173 01174 // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>) 01175 foreach($decomposition as $from => $to) { 01176 $code_decomp = array(); 01177 01178 while ($code_value = array_shift($to)) { 01179 if (isset($decomposition["U+$code_value"])) { // do recursive decomposition 01180 foreach(array_reverse($decomposition["U+$code_value"]) as $cv) { 01181 array_unshift($to, $cv); 01182 } 01183 } elseif (!isset($mark["U+$code_value"])) { // remove mark 01184 array_push($code_decomp, $code_value); 01185 } 01186 } 01187 if (count($code_decomp) || isset($omit[$from])) { 01188 $decomposition[$from] = $code_decomp; 01189 } else { 01190 unset($decomposition[$from]); 01191 } 
01192 } 01193 01194 // create ascii only mapping 01195 $this->toASCII['utf-8'] = array(); 01196 $ascii =& $this->toASCII['utf-8']; 01197 01198 foreach($decomposition as $from => $to) { 01199 $code_decomp = array(); 01200 while ($code_value = array_shift($to)) { 01201 $ord = hexdec($code_value); 01202 if ($ord > 127) 01203 continue 2; // skip decompositions containing non-ASCII chars 01204 else 01205 array_push($code_decomp,chr($ord)); 01206 } 01207 $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp); 01208 } 01209 01210 // add numeric decompositions 01211 foreach($number as $from => $to) { 01212 $utf8_char = $this->UnumberToChar(hexdec($from)); 01213 if (!isset($ascii[$utf8_char])) { 01214 $ascii[$utf8_char] = $to; 01215 } 01216 } 01217 01218 if ($cacheFileCase) { 01219 t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding)); 01220 } 01221 01222 if ($cacheFileASCII) { 01223 t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii)); 01224 } 01225 01226 return 3; 01227 }

	/**
	 * Builds the case folding table of a charset from the UTF-8 case folding data.
	 * The result is stored in $this->caseFolding[$charset] and written to a cache file
	 * below typo3temp/cs/.
	 *
	 * @param	string		Charset name; must be a key initCharset() can load into $this->parsedCharsets
	 * @return	mixed		False on error. Otherwise 1 (table was already in memory), 2 (table was read from the cache file) or 3 (table was built and cached).
	 */
	function initCaseFolding($charset)	{
			// Only process if the case table is not yet loaded:
		if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))	return 1;

			// Use cached version if possible. A cache file that does not unserialize
			// to an array is ignored, so the table gets rebuilt below.
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
		if ($cacheFile && @is_file($cacheFile))	{
			$cached = unserialize(t3lib_div::getUrl($cacheFile));
			if (is_array($cached))	{
				$this->caseFolding[$charset] = $cached;
				return 2;
			}
		}

			// init UTF-8 conversion for this charset
		if (!$this->initCharset($charset))	{
			return false;
		}

			// UTF-8 case folding is used as the base conversion table
		if (!$this->initUnicodeData('case'))	{
			return false;
		}

		$nochar = chr($this->noCharByteVal);
		$modes = array('toUpper','toLower','toTitle');
		$this->caseFolding[$charset] = array();

		foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);

			foreach ($modes as $mode)	{
				if (!isset($this->caseFolding['utf-8'][$mode][$utf8]))	continue;

				$cc = $this->utf8_decode($this->caseFolding['utf-8'][$mode][$utf8], $charset);
					// skip targets that do not exist in the charset
				if ($cc != '' && $cc != $nochar)	{
					$this->caseFolding[$charset][$mode][$c] = $cc;
				}
			}
		}

			// add the ASCII case table
		for ($i=ord('a'); $i<=ord('z'); $i++)	{
			$this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
		}
		for ($i=ord('A'); $i<=ord('Z'); $i++)	{
			$this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
		}

		if ($cacheFile)	{
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
		}

		return 3;
	}

	/**
	 * Builds the to-ASCII transliteration table of a charset from the UTF-8 table.
	 * The result is stored in $this->toASCII[$charset] and written to a cache file
	 * below typo3temp/cs/.
	 *
	 * @param	string		Charset name; must be a key initCharset() can load into $this->parsedCharsets
	 * @return	mixed		False on error. Otherwise 1 (table was already in memory), 2 (table was read from the cache file) or 3 (table was built and cached).
	 */
	function initToASCII($charset)	{
			// Only process if the table is not yet loaded:
		if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset]))	return 1;

			// Use cached version if possible (ignored unless it unserializes to an array)
		$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
		if ($cacheFile && @is_file($cacheFile))	{
			$cached = unserialize(t3lib_div::getUrl($cacheFile));
			if (is_array($cached))	{
				$this->toASCII[$charset] = $cached;
				return 2;
			}
		}

			// init UTF-8 conversion for this charset
		if (!$this->initCharset($charset))	{
			return false;
		}

			// UTF-8/ASCII transliteration is used as the base conversion table
		if (!$this->initUnicodeData('ascii'))	{
			return false;
		}

			// Always create the table, even if no character maps, so that later
			// calls see an array and do not rebuild.
		$this->toASCII[$charset] = array();
		foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			if (!isset($this->toASCII['utf-8'][$utf8]))	continue;

				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8, $charset);
			$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
		}

		if ($cacheFile)	{
			t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
		}

		return 3;
	}


	/********************************************
	 *
	 * String operation functions
	 *
	 ********************************************/

	/**
	 * Returns a part of a string, counted in characters of the given charset.
	 * Uses mbstring or iconv if configured in $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'],
	 * otherwise the internal UTF-8 / EUC / fixed-width / single-byte implementations.
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @param	integer		Start position (character position)
	 * @param	integer		Length (in characters); null means "to the end of the string"
	 * @return	string		The substring
	 */
	function substr($charset,$string,$start,$len=null)	{
		if ($len===0)	return '';

		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
				// cannot omit $len, when specifying charset
			if ($len==null)	{
				$enc = mb_internal_encoding();	// save internal encoding
				mb_internal_encoding($charset);
				$str = mb_substr($string,$start);
				mb_internal_encoding($enc);	// restore internal encoding

				return $str;
			}
			return mb_substr($string,$start,$len,$charset);
		}

		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')	{
				// cannot omit $len, when specifying charset
			if ($len==null)	{
				$enc = iconv_get_encoding('internal_encoding');	// save internal encoding
				iconv_set_encoding('internal_encoding',$charset);
				$str = iconv_substr($string,$start);
				iconv_set_encoding('internal_encoding',$enc);	// restore internal encoding

				return $str;
			}
			return iconv_substr($string,$start,$len,$charset);
		}

		if ($charset == 'utf-8')	{
			return $this->utf8_substr($string,$start,$len);
		}
		if (!empty($this->eucBasedSets[$charset]))	{
			return $this->euc_substr($string,$start,$charset,$len);
		}

			// fixed-width encodings: scale character positions to byte positions.
			// An omitted $len must stay omitted; null*2 would be 0 and yield ''.
		$width = 1;
		if (!empty($this->twoByteSets[$charset]))	{
			$width = 2;
		} elseif (!empty($this->fourByteSets[$charset]))	{
			$width = 4;
		}

			// $width stays 1 for everything else, which is treated as single-byte encoding
		return $len === null ? substr($string,$start*$width) : substr($string,$start*$width,$len*$width);
	}

	/**
	 * Counts the number of characters of a string in the given charset.
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @return	integer		The number of characters
	 */
	function strlen($charset,$string)	{
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			return mb_strlen($string,$charset);
		} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')	{
			return iconv_strlen($string,$charset);
		} elseif ($charset == 'utf-8')	{
			return $this->utf8_strlen($string);
		} elseif (!empty($this->eucBasedSets[$charset]))	{
			return $this->euc_strlen($string,$charset);
		} elseif (!empty($this->twoByteSets[$charset]))	{
				// intval(): a dangling odd byte is not a character
			return intval(strlen($string)/2);
		} elseif (!empty($this->fourByteSets[$charset]))	{
			return intval(strlen($string)/4);
		}
			// treat everything else as single-byte encoding
		return strlen($string);
	}

	/**
	 * Crops a string to a number of characters and adds a marker where it was cut.
	 * A positive $len keeps the first $len characters and appends $crop; a negative
	 * $len keeps the last abs($len) characters and prepends $crop. If the string is
	 * not longer than abs($len) characters (or $len is 0) it is returned unchanged.
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @param	integer		Length (in characters)
	 * @param	string		Crop signifier
	 * @return	string		The shortened string
	 */
	function crop($charset,$string,$len,$crop='')	{
		$len = intval($len);
		if ($len == 0)	return $string;

		$byteLen = strlen($string);

		if ($charset == 'utf-8')	{
			$i = $this->utf8_char2byte_pos($string,$len);
		} elseif (!empty($this->eucBasedSets[$charset]))	{
			$i = $this->euc_char2byte_pos($string,$len,$charset);
		} else {
				// fixed-width and single-byte encodings
			$width = 1;
			if (!empty($this->twoByteSets[$charset]))	{
				$width = 2;
			} elseif (!empty($this->fourByteSets[$charset]))	{
				$width = 4;
			}

			if ($len > 0)	{
				$i = $len*$width;
			} else {
				$i = $byteLen+$len*$width;
				if ($i<=0)	$i = false;
			}
		}

			// $len outside actual string length: nothing to crop
		if ($i === false || $i >= $byteLen)	return $string;

		return $len > 0 ? substr($string,0,$i).$crop : $crop.substr($string,$i);
	}

	/**
	 * Cuts a string short at a given byte length without splitting a character.
	 *
	 * @param	string		The character set
	 * @param	string		Character string
	 * @param	integer		The byte length
	 * @return	string		The shortened string
	 */
	function strtrunc($charset,$string,$len)	{
		if ($len <= 0)	return '';

		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			return mb_strcut($string,0,$len,$charset);
		} elseif ($charset == 'utf-8')	{
			return $this->utf8_strtrunc($string,$len);
		} elseif (!empty($this->eucBasedSets[$charset]))	{
			return $this->euc_strtrunc($string,$len,$charset);
		} elseif (!empty($this->twoByteSets[$charset]))	{
			$len -= $len % 2;	// don't cut at odd positions
		} elseif (!empty($this->fourByteSets[$charset]))	{
			$len -= $len % 4;	// realign to position dividable by four
		}
			// treat everything else as single-byte encoding
		return substr($string,0,$len);
	}

	/**
	 * Translates all characters of a string into their respective case values.
	 * With mbstring configured, 'toLower' lower-cases and any other value upper-cases;
	 * otherwise $case selects the case folding table that is applied.
	 *
	 * @param	string		Character set of string
	 * @param	string		Input string to convert case for
	 * @param	string		Case keyword: "toLower" or "toUpper" (table name in $this->caseFolding)
	 * @return	string		The converted string
	 */
	function conv_case($charset,$string,$case)	{
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			return $case == 'toLower' ? mb_strtolower($string,$charset) : mb_strtoupper($string,$charset);
		}
		if ($charset == 'utf-8')	{
			return $this->utf8_char_mapping($string,'case',$case);
		}
		if (isset($this->eucBasedSets[$charset]))	{
			return $this->euc_char_mapping($string,$charset,'case',$case);
		}
			// treat everything else as single-byte encoding
		return $this->sb_char_mapping($string,$charset,'case',$case);
	}

	/**
	 * Converts special chars (like umlauts) to ASCII equivalents using the toASCII tables.
	 *
	 * @param	string		Character set of string
	 * @param	string		Input string to convert
	 * @return	string		The converted string
	 */
	function specCharsToASCII($charset,$string)	{
		if ($charset == 'utf-8')	{
			return $this->utf8_char_mapping($string,'ascii');
		}
		if (isset($this->eucBasedSets[$charset]))	{
			return $this->euc_char_mapping($string,$charset,'ascii');
		}
			// treat everything else as single-byte encoding
		return $this->sb_char_mapping($string,$charset,'ascii');
	}


	/********************************************
	 *
	 * Internal string operation functions
	 *
	 ********************************************/

	/**
	 * Maps all characters of a string in a single byte charset.
	 * Bytes without an entry in the selected table are copied unchanged.
	 *
	 * @param	string		the string
	 * @param	string		the charset
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': table name in $this->caseFolding, e.g. 'toLower' or 'toUpper'
	 * @return	string		the converted string; the input string if the mode is unknown or the table cannot be initialized
	 */
	function sb_char_mapping($str,$charset,$mode,$opt='')	{
		switch($mode)	{
			case 'case':
				if (!$this->initCaseFolding($charset))	return $str;	// do nothing
				$map =& $this->caseFolding[$charset][$opt];
			break;

			case 'ascii':
				if (!$this->initToASCII($charset))	return $str;	// do nothing
				$map =& $this->toASCII[$charset];
			break;

			default:
				return $str;
		}

		$out = '';
		$byteLen = strlen($str);
		for ($i=0; $i<$byteLen; $i++)	{
			$c = $str[$i];
			$out .= isset($map[$c]) ? $map[$c] : $c;
		}

		return $out;
	}


	/********************************************
	 *
	 * Internal UTF-8 string operation functions
	 *
	 ********************************************/

	/**
	 * Returns a part of a UTF-8 string, counted in characters.
	 * A negative $start that reaches before the beginning of the string is treated
	 * as position 0; a positive $start beyond the end returns false.
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Start position (character position)
	 * @param	integer		Length (in characters); null means "to the end of the string"
	 * @return	string		The substring, or false if $start lies beyond the string
	 */
	function utf8_substr($str,$start,$len=null)	{
		if ($len !== null && !strcmp($len,'0'))	return '';

		$byte_start = $this->utf8_char2byte_pos($str,$start);
		if ($byte_start === false)	{
			if ($start > 0)	{
				return false;	// $start outside string length
			}
			$byte_start = 0;
		}

		$str = substr($str,$byte_start);

			// loose comparison on purpose: zero was handled above, so this only
			// matches an omitted / empty length
		if ($len == null)	return $str;

		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false)	{
				// $len outside actual string length. A negative length exceeding the
				// string leaves nothing, a positive one leaves everything.
			return $len<0 ? '' : $str;
		}
		return substr($str,0,$byte_end);
	}

	/**
	 * Counts the characters of a UTF-8 string.
	 * Every byte that is not a continuation byte (10xxxxxx) counts as one character.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @return	integer		The number of characters
	 */
	function utf8_strlen($str)	{
		$n = 0;
		$byteLen = strlen($str);
		for ($i=0; $i<$byteLen; $i++)	{
			if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
		}
		return $n;
	}

	/**
	 * Truncates a UTF-8 string to at most $len bytes without cutting a multi-byte
	 * character in half.
	 *
	 * @param	string		UTF-8 multibyte character string
	 * @param	integer		The byte length
	 * @return	string		The shortened string
	 */
	function utf8_strtrunc($str,$len)	{
		if ($len <= 0)	return '';
		if ($len >= strlen($str))	return $str;

		$i = $len-1;
		if (ord($str[$i]) & 0x80)	{	// part of a multibyte sequence
				// walk back over continuation bytes to the first byte of the sequence
			while ($i>0 && (ord($str[$i]) & 0xC0) == 0x80)	$i--;

			$lead = ord($str[$i]);
			if (($lead & 0xC0) != 0xC0)	{
					// malformed data: continuation bytes without a start byte
				return $i>0 ? substr($str,0,$len) : '';
			}

				// the number of leading 1-bits of the start byte is the sequence length
			for ($bc=0; $lead & 0x80; $lead = $lead << 1)	$bc++;
			if ($bc+$i > $len)	return substr($str,0,$i);
				// fallthru: multibyte char fits into length
		}
		return substr($str,0,$len);
	}

	/**
	 * Finds the character position of the first occurrence of a string in a UTF-8 string.
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 string to search for
	 * @param	integer		Character position to start the search from
	 * @return	integer		The character position, or false if not found / offset beyond string
	 */
	function utf8_strpos($haystack,$needle,$offset=0)	{
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			return mb_strpos($haystack,$needle,$offset,'utf-8');
		} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')	{
			return iconv_strpos($haystack,$needle,$offset,'utf-8');
		}

		$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
		if ($byte_offset === false)	return false;	// offset beyond string length

		$byte_pos = strpos($haystack,$needle,$byte_offset);
		if ($byte_pos === false)	return false;	// needle not found

		return $this->utf8_byte2char_pos($haystack,$byte_pos);
	}

	/**
	 * Finds the character position of the last occurrence of a string in a UTF-8 string.
	 *
	 * @param	string		UTF-8 string to search in
	 * @param	string		UTF-8 string to search for
	 * @return	integer		The character position, or false if not found
	 */
	function utf8_strrpos($haystack,$needle)	{
		if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
				// The third argument of mb_strrpos() is the offset in current PHP versions,
				// so the encoding is set through the internal encoding instead.
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding('utf-8');
			$pos = mb_strrpos($haystack,$needle);
			mb_internal_encoding($enc);	// restore internal encoding

			return $pos;
		} elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')	{
			return iconv_strrpos($haystack,$needle,'utf-8');
		}

		$byte_pos = strrpos($haystack,$needle);
		if ($byte_pos === false)	return false;	// needle not found

		return $this->utf8_byte2char_pos($haystack,$byte_pos);
	}

	/**
	 * Translates a character position into a byte position of a UTF-8 string.
	 * A negative position counts characters from the end of the string.
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Character position (negative values start from the end)
	 * @return	integer		Byte position, or false if the position lies outside the string
	 */
	function utf8_char2byte_pos($str,$pos)	{
		$byteLen = strlen($str);
		$n = 0;				// number of characters found
		$p = abs($pos);		// number of characters wanted

		if ($pos >= 0)	{
			$i = 0;
			$d = 1;
		} else {
			$i = $byteLen-1;
			$d = -1;
		}

			// explicit bounds: a negative string offset must end the scan
		for ( ; $i>=0 && $i<$byteLen && $n<$p; $i+=$d)	{
				// count single-byte chars and multi-byte starting bytes, skip continuation bytes
			if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
		}
		if ($i<0 || $i>=$byteLen)	return false;	// offset beyond string length

		if ($pos >= 0)	{
				// skip trailing multi-byte data bytes
			while ($i<$byteLen && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }
		} else {
				// correct offset
			$i++;
		}

		return $i;
	}

	/**
	 * Translates a byte position into a character position of a UTF-8 string.
	 *
	 * @param	string		UTF-8 string
	 * @param	integer		Byte position
	 * @return	integer		Character position, or false if the string is empty or the position lies outside the string
	 */
	function utf8_byte2char_pos($str,$pos)	{
		$byteLen = strlen($str);
		if ($byteLen == 0 || $pos < 0 || $pos > $byteLen)	return false;	// offset beyond string length

			// A position right behind the last byte equals the total number of characters:
			// the character starting at byte 0 plus every further start byte.
		$n = ($pos == $byteLen) ? 1 : 0;
		for ($i=min($pos,$byteLen-1); $i>0; $i--)	{
			if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
		}

		return $n;
	}

	/**
	 * Maps all characters of a UTF-8 string.
	 * Characters without an entry in the selected table are copied unchanged.
	 *
	 * @param	string		UTF-8 string
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': table name in $this->caseFolding['utf-8'], e.g. 'toLower' or 'toUpper'
	 * @return	string		the converted string; the input string if the mode is unknown or the Unicode data cannot be initialized
	 */
	function utf8_char_mapping($str,$mode,$opt='')	{
		if (!$this->initUnicodeData($mode))	return $str;	// do nothing

		switch($mode)	{
			case 'case':
				$map =& $this->caseFolding['utf-8'][$opt];
			break;

			case 'ascii':
				$map =& $this->toASCII['utf-8'];
			break;

			default:
				return $str;
		}

		$out = '';
		$byteLen = strlen($str);
		for ($i=0; $i<$byteLen; $i++)	{
			$c = ord($str[$i]);
			if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
				for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// calculate number of bytes
				$mbc = substr($str,$i,$bc);
				$i += $bc-1;
			} else {
					// single-byte (0xxxxxxx), or an orphan continuation byte which is
					// passed through as is instead of repeating the previous character
				$mbc = $str[$i];
			}

			$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
		}

		return $out;
	}


	/********************************************
	 *
	 * Internal EUC string operation functions
	 *
	 * Extended Unix Code:
	 *  ASCII compatible 7bit single bytes chars
	 *  8bit two byte chars
	 *
	 * Shift-JIS is treated as a special case.
	 *
	 ********************************************/

	/**
	 * Cuts a string in an EUC based charset short at a given byte length
	 * without splitting a double-byte character.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	integer		the byte length
	 * @param	string		the charset
	 * @return	string		the shortened string
	 */
	function euc_strtrunc($str,$len,$charset)	{
		$sjis = ($charset == 'shift_jis');
		$byteLen = strlen($str);

		for ($i=0; $i<$byteLen && $i<$len; $i++)	{
			$c = ord($str[$i]);
			if ($sjis)	{
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
			} else {
				if ($c >= 0x80)	$i++;	// advance a double-byte char
			}
		}

			// Test the overshoot first: a double-byte char starting at $len-1 must be
			// dropped even if it is the last character of the string.
		if ($i > $len)	return substr($str,0,$len-1);	// we ended on a first byte
		if ($i >= $byteLen)	return $str;	// string shorter than supplied length

		return substr($str,0,$len);
	}

	/**
	 * Returns a part of a string in an EUC based charset, counted in characters.
	 * Out-of-range values are handled like in utf8_substr().
	 *
	 * @param	string		EUC multibyte character string
	 * @param	integer		start position (character position)
	 * @param	string		the charset
	 * @param	integer		length (in characters); null means "to the end of the string"
	 * @return	string		the substring, or false if $start lies beyond the string
	 */
	function euc_substr($str,$start,$charset,$len=null)	{
		if ($len !== null && !strcmp($len,'0'))	return '';

		$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
		if ($byte_start === false)	{
			if ($start > 0)	{
				return false;	// $start outside string length
			}
			$byte_start = 0;
		}

		$str = substr($str,$byte_start);

			// loose comparison on purpose: zero was handled above
		if ($len == null)	return $str;

		$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
		if ($byte_end === false)	{	// $len outside actual string length
			return $len<0 ? '' : $str;
		}
		return substr($str,0,$byte_end);
	}

	/**
	 * Counts the characters of a string in an EUC based charset.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	string		the charset
	 * @return	integer		the number of characters
	 */
	function euc_strlen($str,$charset)	{
		$sjis = ($charset == 'shift_jis');
		$byteLen = strlen($str);
		$n = 0;

		for ($i=0; $i<$byteLen; $i++)	{
			$c = ord($str[$i]);
			if ($sjis)	{
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
			} else {
				if ($c >= 0x80)	$i++;	// advance a double-byte char
			}

			$n++;
		}

		return $n;
	}

	/**
	 * Translates a character position into a byte position for an EUC based charset.
	 * A negative position counts characters from the end of the string; it is
	 * converted into a position from the start first, because a backward scan cannot
	 * tell a trail byte from a lead byte (Shift-JIS trail bytes may be below 0x80).
	 *
	 * @param	string		EUC multibyte character string
	 * @param	integer		character position (negative values start from the end)
	 * @param	string		the charset
	 * @return	integer		byte position, or false if the position lies outside the string
	 */
	function euc_char2byte_pos($str,$pos,$charset)	{
		if ($pos < 0)	{
			$total = $this->euc_strlen($str,$charset);
			if (-$pos >= $total)	return false;	// offset beyond string length
			$pos = $total+$pos;
		}

		$sjis = ($charset == 'shift_jis');
		$byteLen = strlen($str);
		$n = 0;	// number of characters seen

		for ($i=0; $i<$byteLen && $n<$pos; $i++)	{
			$c = ord($str[$i]);
			if ($sjis)	{
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
			} else {
				if ($c >= 0x80)	$i++;	// advance a double-byte char
			}

			$n++;
		}
		if ($i >= $byteLen)	return false;	// offset beyond string length

		return $i;
	}

	/**
	 * Maps all characters of a string in an EUC based charset.
	 * Characters without an entry in the selected table are copied unchanged.
	 *
	 * @param	string		EUC multibyte character string
	 * @param	string		the charset
	 * @param	string		mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
	 * @param	string		'case': table name in $this->caseFolding, e.g. 'toLower' or 'toUpper'
	 * @return	string		the converted string; the input string if the mode is unknown or the table cannot be initialized
	 */
	function euc_char_mapping($str,$charset,$mode,$opt='')	{
		switch($mode)	{
			case 'case':
				if (!$this->initCaseFolding($charset))	return $str;	// do nothing
				$map =& $this->caseFolding[$charset][$opt];
			break;

			case 'ascii':
				if (!$this->initToASCII($charset))	return $str;	// do nothing
				$map =& $this->toASCII[$charset];
			break;

			default:
				return $str;
		}

		$sjis = ($charset == 'shift_jis');
		$byteLen = strlen($str);
		$out = '';

		for ($i=0; $i<$byteLen; $i++)	{
			$mbc = $str[$i];
			$c = ord($mbc);

			if ($sjis)	{
				$isDouble = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
			} else {
				$isDouble = ($c >= 0x80);
			}
			if ($isDouble)	{	// a double-byte char
				$mbc = substr($str,$i,2);
				$i++;
			}

			$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
		}

		return $out;
	}

}

if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])	{
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
?>