00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2003-2006 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the Typo3 project. The Typo3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *
00017 *  This script is distributed in the hope that it will be useful,
00018 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020 *  GNU General Public License for more details.
00021 *
00022 *  This copyright notice MUST APPEAR in all copies of the script!
00023 ***************************************************************/
00136 class t3lib_cs {
00137         var $noCharByteVal=63;          // ASCII Value for chars with no equivalent.
00138 
00139                 // This is the array where parsed conversion tables are stored (cached)
00140         var $parsedCharsets=array();
00141 
00142                 // An array where case folding data will be stored (cached)
00143         var $caseFolding=array();
00144 
00145                 // An array where charset-to-ASCII mappings are stored (cached)
00146         var $toASCII=array();
00147 
00148                 // This tells the converter which charsets has two bytes per char:
00149         var $twoByteSets=array(
00150                 'ucs-2'=>1,     // 2-byte Unicode
00151         );
00152 
00153                 // This tells the converter which charsets has four bytes per char:
00154         var $fourByteSets=array(
00155                 'ucs-4'=>1,     // 4-byte Unicode
00156                 'utf-32'=>1,    // 4-byte Unicode (limited to the 21-bits of UTF-16)
00157         );
00158 
00159                 // This tells the converter which charsets use a scheme like the Extended Unix Code:
00160         var $eucBasedSets=array(
00161                 'gb2312'=>1,            // Chinese, simplified.
00162                 'big5'=>1,              // Chinese, traditional.
00163                 'euc-kr'=>1,            // Korean
00164                 'shift_jis'=>1,         // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
00165         );
00166 
00167                 // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
00168                 // http://czyborra.com/charsets/iso8859.html
00169         var $synonyms=array(
00170                 'us' => 'ascii',
00171                 'us-ascii'=> 'ascii',
00172                 'cp819' => 'iso-8859-1',
00173                 'ibm819' => 'iso-8859-1',
00174                 'iso-ir-100' => 'iso-8859-1',
00175                 'iso-ir-109' => 'iso-8859-2',
00176                 'iso-ir-148' => 'iso-8859-9',
00177                 'iso-ir-199' => 'iso-8859-14',
00178                 'iso-ir-203' => 'iso-8859-15',
00179                 'csisolatin1' => 'iso-8859-1',
00180                 'csisolatin2' => 'iso-8859-2',
00181                 'csisolatin3' => 'iso-8859-3',
00182                 'csisolatin5' => 'iso-8859-9',
00183                 'csisolatin8' => 'iso-8859-14',
00184                 'csisolatin9' => 'iso-8859-15',
00185                 'csisolatingreek' => 'iso-8859-7',
00186                 'iso-celtic' => 'iso-8859-14',
00187                 'latin1' => 'iso-8859-1',
00188                 'latin2' => 'iso-8859-2',
00189                 'latin3' => 'iso-8859-3',
00190                 'latin5' => 'iso-8859-9',
00191                 'latin6' => 'iso-8859-10',
00192                 'latin8' => 'iso-8859-14',
00193                 'latin9' => 'iso-8859-15',
00194                 'l1' => 'iso-8859-1',
00195                 'l2' => 'iso-8859-2',
00196                 'l3' => 'iso-8859-3',
00197                 'l5' => 'iso-8859-9',
00198                 'l6' => 'iso-8859-10',
00199                 'l8' => 'iso-8859-14',
00200                 'l9' => 'iso-8859-15',
00201                 'cyrillic' => 'iso-8859-5',
00202                 'arabic' => 'iso-8859-6',
00203                 'tis-620' => 'iso-8859-11',
00204                 'win874' => 'windows-874',
00205                 'win1250' => 'windows-1250',
00206                 'win1251' => 'windows-1251',
00207                 'win1252' => 'windows-1252',
00208                 'win1253' => 'windows-1253',
00209                 'win1254' => 'windows-1254',
00210                 'win1255' => 'windows-1255',
00211                 'win1256' => 'windows-1256',
00212                 'win1257' => 'windows-1257',
00213                 'win1258' => 'windows-1258',
00214                 'cp1250' => 'windows-1250',
00215                 'cp1251' => 'windows-1251',
00216                 'cp1252' => 'windows-1252',
00217                 'ms-ee' => 'windows-1250',
00218                 'ms-ansi' => 'windows-1252',
00219                 'ms-greek' => 'windows-1253',
00220                 'ms-turk' => 'windows-1254',
00221                 'winbaltrim' => 'windows-1257',
00222                 'koi-8ru' => 'koi-8r',
00223                 'koi8r' => 'koi-8r',
00224                 'cp878' => 'koi-8r',
00225                 'mac' => 'macroman',
00226                 'macintosh' => 'macroman',
00227                 'euc-cn' => 'gb2312',
00228                 'x-euc-cn' => 'gb2312',
00229                 'euccn' => 'gb2312',
00230                 'cp936' => 'gb2312',
00231                 'big-5' => 'big5',
00232                 'cp950' => 'big5',
00233                 'eucjp' => 'euc-jp',
00234                 'sjis' => 'shift_jis',
00235                 'shift-jis' => 'shift_jis',
00236                 'cp932' => 'shift_jis',
00237                 'cp949' => 'euc-kr',
00238                 'utf7' => 'utf-7',
00239                 'utf8' => 'utf-8',
00240                 'utf16' => 'utf-16',
00241                 'utf32' => 'utf-32',
00242                 'utf8' => 'utf-8',
00243                 'ucs2' => 'ucs-2',
00244                 'ucs4' => 'ucs-4',
00245         );
00246 
00247                 // mapping of iso-639:2 language codes to script names
00248         var $lang_to_script=array(
00249                         // iso-639:2 language codes, see:
00250                         //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
00251                         //  http://www.loc.gov/standards/iso639-2/langcodes.html
00252                         //  http://www.unicode.org/onlinedat/languages.html
00253                 'ar' => 'arabic',
00254                 'bg' => 'cyrillic',             // Bulgarian
00255                 'bs' => 'east_european',        // Bosnian
00256                 'cs' => 'east_european',        // Czech
00257                 'da' => 'west_european',        // Danish
00258                 'de' => 'west_european',        // German
00259                 'es' => 'west_european',        // Spanish
00260                 'et' => 'estonian',
00261                 'eo' => 'unicode',              // Esperanto
00262                 'eu' => 'west_european',        // Basque
00263                 'fa' => 'arabic',       // Persian
00264                 'fi' => 'west_european',        // Finish
00265                 'fo' => 'west_european',        // Faroese
00266                 'fr' => 'west_european',        // French
00267                 'gr' => 'greek',
00268                 'he' => 'hebrew',               // Hebrew (since 1998)
00269                 'hi' => 'unicode',              // Hindi
00270                 'hr' => 'east_european',        // Croatian
00271                 'hu' => 'east_european',        // Hungarian
00272                 'iw' => 'hebrew',               // Hebrew (til 1998)
00273                 'is' => 'west_european',        // Icelandic
00274                 'it' => 'west_european',        // Italian
00275                 'ja' => 'japanese',
00276                 'kl' => 'west_european',        // Greenlandic
00277                 'ko' => 'korean',
00278                 'lt' => 'lithuanian',
00279                 'lv' => 'west_european',        // Latvian/Lettish
00280                 'nl' => 'west_european',        // Dutch
00281                 'no' => 'west_european',        // Norwegian
00282                 'pl' => 'east_european',        // Polish
00283                 'pt' => 'west_european',        // Portuguese
00284                 'ro' => 'east_european',        // Romanian
00285                 'ru' => 'cyrillic',             // Russian
00286                 'sk' => 'east_european',        // Slovak
00287                 'sl' => 'east_european',        // Slovenian
00288                 'sr' => 'cyrillic',             // Serbian
00289                 'sv' => 'west_european',        // Swedish
00290                 'th' => 'thai',
00291                 'uk' => 'cyrillic',             // Ukranian
00292                 'vi' => 'vietnamese',
00293                 'zh' => 'chinese',
00294                         // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
00295                         // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
00296                 'ara' => 'arabic',
00297                 'bgr' => 'cyrillic',            // Bulgarian
00298                 'cat' => 'west_european',       // Catalan
00299                 'chs' => 'simpl_chinese',
00300                 'cht' => 'trad_chinese',
00301                 'csy' => 'east_european',       // Czech
00302                 'dan' => 'west_european',       // Danisch
00303                 'deu' => 'west_european',       // German
00304                 'dea' => 'west_european',       // German (Austrian)
00305                 'des' => 'west_european',       // German (Swiss)
00306                 'ena' => 'west_european',       // English (Australian)
00307                 'enc' => 'west_european',       // English (Canadian)
00308                 'eng' => 'west_european',       // English
00309                 'enz' => 'west_european',       // English (New Zealand)
00310                 'enu' => 'west_european',       // English (United States)
00311                 'euq' => 'west_european',       // Basque
00312                 'fos' => 'west_european',       // Faroese
00313                 'far' => 'arabic',      // Persian
00314                 'fin' => 'west_european',       // Finish
00315                 'fra' => 'west_european',       // French
00316                 'frb' => 'west_european',       // French (Belgian)
00317                 'frc' => 'west_european',       // French (Canadian)
00318                 'frs' => 'west_european',       // French (Swiss)
00319                 'ell' => 'greek',
00320                 'heb' => 'hebrew',
00321                 'hin' => 'unicode',     // Hindi
00322                 'hun' => 'east_european',       // Hungarian
00323                 'isl' => 'west_euorpean',       // Icelandic
00324                 'ita' => 'west_european',       // Italian
00325                 'its' => 'west_european',       // Italian (Swiss)
00326                 'jpn' => 'japanese',
00327                 'kor' => 'korean',
00328                 'lth' => 'lithuanian',
00329                 'lvi' => 'west_european',       // Latvian/Lettish
00330                 'msl' => 'west_european',       // Malay
00331                 'nlb' => 'west_european',       // Dutch (Belgian)
00332                 'nld' => 'west_european',       // Dutch
00333                 'nor' => 'west_european',       // Norwegian (bokmal)
00334                 'non' => 'west_european',       // Norwegian (nynorsk)
00335                 'plk' => 'east_european',       // Polish
00336                 'ptg' => 'west_european',       // Portuguese
00337                 'ptb' => 'west_european',       // Portuguese (Brazil)
00338                 'rom' => 'east_european',       // Romanian
00339                 'rus' => 'cyrillic',            // Russian
00340                 'slv' => 'east_european',       // Slovenian
00341                 'sky' => 'east_european',       // Slovak
00342                 'srl' => 'east_european',       // Serbian (Latin)
00343                 'srb' => 'cyrillic',            // Serbian (Cyrillic)
00344                 'esp' => 'west_european',       // Spanish (trad. sort)
00345                 'esm' => 'west_european',       // Spanish (Mexican)
00346                 'esn' => 'west_european',       // Spanish (internat. sort)
00347                 'sve' => 'west_european',       // Swedish
00348                 'tha' => 'thai',
00349                 'trk' => 'turkish',
00350                 'ukr' => 'cyrillic',    // Ukrainian
00351                         // English language names
00352                 'arabic' => 'arabic',
00353                 'basque' => 'west_european',
00354                 'bosnian' => 'east_european',
00355                 'bulgarian' => 'east_european',
00356                 'catalan' => 'west_european',
00357                 'croatian' => 'east_european',
00358                 'czech' => 'east_european',
00359                 'danish' => 'west_european',
00360                 'dutch' => 'west_european',
00361                 'english' => 'west_european',
00362                 'esperanto' => 'unicode',
00363                 'estonian' => 'estonian',
00364                 'faroese' => 'west_european',
00365                 'farsi' => 'arabic',
00366                 'finnish' => 'west_european',
00367                 'french' => 'west_european',
00368                 'galician' => 'west_european',
00369                 'german' => 'west_european',
00370                 'greek' => 'greek',
00371                 'greenlandic' => 'west_european',
00372                 'hebrew' => 'hebrew',
00373                 'hindi' => 'unicode',
00374                 'hungarian' => 'east_european',
00375                 'icelandic' => 'west_european',
00376                 'italian' => 'west_european',
00377                 'latvian' => 'west_european',
00378                 'lettish' => 'west_european',
00379                 'lithuanian' => 'lithuanian',
00380                 'malay' => 'west_european',
00381                 'norwegian' => 'west_european',
00382                 'persian' => 'arabic',
00383                 'polish' => 'east_european',
00384                 'portuguese' => 'west_european',
00385                 'russian' => 'cyrillic',
00386                 'romanian' => 'east_european',
00387                 'serbian' => 'cyrillic',
00388                 'slovak' => 'east_european',
00389                 'slovenian' => 'east_european',
00390                 'spanish' => 'west_european',
00391                 'svedish' => 'west_european',
00392                 'that' => 'thai',
00393                 'turkish' => 'turkish',
00394                 'ukrainian' => 'cyrillic',
00395         );
00396 
00397                 // mapping of language (family) names to charsets on Unix
00398         var $script_to_charset_unix=array(
00399                 'west_european' => 'iso-8859-1',
00400                 'estonian' => 'iso-8859-1',
00401                 'east_european' => 'iso-8859-2',
00402                 'baltic' => 'iso-8859-4',
00403                 'cyrillic' => 'iso-8859-5',
00404                 'arabic' => 'iso-8859-6',
00405                 'greek' => 'iso-8859-7',
00406                 'hebrew' => 'iso-8859-8',
00407                 'turkish' => 'iso-8859-9',
00408                 'thai' => 'iso-8859-11', // = TIS-620
00409                 'lithuanian' => 'iso-8859-13',
00410                 'chinese' => 'gb2312', // = euc-cn
00411                 'japanese' => 'euc-jp',
00412                 'korean' => 'euc-kr',
00413                 'simpl_chinese' => 'gb2312',
00414                 'trad_chinese' => 'big5',
00415                 'vietnamese' => '',
00416                 'unicode' => 'utf-8',
00417         );
00418 
00419                 // mapping of language (family) names to charsets on Windows
00420         var $script_to_charset_windows=array(
00421                 'east_european' => 'windows-1250',
00422                 'cyrillic' => 'windows-1251',
00423                 'west_european' => 'windows-1252',
00424                 'greek' => 'windows-1253',
00425                 'turkish' => 'windows-1254',
00426                 'hebrew' => 'windows-1255',
00427                 'arabic' => 'windows-1256',
00428                 'baltic' => 'windows-1257',
00429                 'estonian' => 'windows-1257',
00430                 'lithuanian' => 'windows-1257',
00431                 'vietnamese' => 'windows-1258',
00432                 'thai' => 'cp874',
00433                 'korean' => 'cp949',
00434                 'chinese' => 'gb2312',
00435                 'japanese' => 'shift_jis',
00436                 'simpl_chinese' => 'gb2312',
00437                 'trad_chinese' => 'big5',
00438         );
00439 
00440                 // mapping of locale names to charsets
00441         var $locale_to_charset=array(
00442                 'japanese.euc' => 'euc-jp',
00443                 'ja_jp.ujis' => 'euc-jp',
00444                 'korean.euc' => 'euc-kr',
00445                 'sr@Latn' => 'iso-8859-2',
00446                 'zh_cn' => 'gb2312',
00447                 'zh_hk' => 'big5',
00448                 'zh_tw' => 'big5',
00449         );
00450 
00451                 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
00452                 // Empty values means "iso-8859-1"
00453         var $charSetArray = array(
00454                 'dk' => '',
00455                 'de' => '',
00456                 'no' => '',
00457                 'it' => '',
00458                 'fr' => '',
00459                 'es' => '',
00460                 'nl' => '',
00461                 'cz' => 'windows-1250',
00462                 'pl' => 'iso-8859-2',
00463                 'si' => 'windows-1250',
00464                 'fi' => '',
00465                 'tr' => 'iso-8859-9',
00466                 'se' => '',
00467                 'pt' => '',
00468                 'ru' => 'windows-1251',
00469                 'ro' => 'iso-8859-2',
00470                 'ch' => 'gb2312',
00471                 'sk' => 'windows-1250',
00472                 'lt' => 'windows-1257',
00473                 'is' => 'utf-8',
00474                 'hr' => 'windows-1250',
00475                 'hu' => 'iso-8859-2',
00476                 'gl' => '',
00477                 'th' => 'iso-8859-11',
00478                 'gr' => 'iso-8859-7',
00479                 'hk' => 'big5',
00480                 'eu' => '',
00481                 'bg' => 'windows-1251',
00482                 'br' => '',
00483                 'et' => 'iso-8859-4',
00484                 'ar' => 'iso-8859-6',
00485                 'he' => 'utf-8',
00486                 'ua' => 'windows-1251',
00487                 'jp' => 'shift_jis',
00488                 'lv' => 'utf-8',
00489                 'vn' => 'utf-8',
00490                 'ca' => 'iso-8859-15',
00491                 'ba' => 'iso-8859-2',
00492                 'kr' => 'euc-kr',
00493                 'eo' => 'utf-8',
00494                 'my' => '',
00495                 'hi' => 'utf-8',
00496                 'fo' => 'utf-8',
00497                 'fa' => 'utf-8',
00498                 'sr' => 'utf-8'
00499         );
00500 
00501                 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
00502                 // Missing keys means: same as Typo3
00503         var $isoArray = array(
00504                 'ba' => 'bs',
00505                 'br' => 'pt_BR',
00506                 'ch' => 'zh_CN',
00507                 'cz' => 'cs',
00508                 'dk' => 'da',
00509                 'si' => 'sl',
00510                 'se' => 'sv',
00511                 'gl' => 'kl',
00512                 'gr' => 'el',
00513                 'hk' => 'zh_HK',
00514                 'kr' => 'ko',
00515                 'ua' => 'uk',
00516                 'jp' => 'ja',
00517                 'vn' => 'vi',
00518         );
00519 
00527         function parse_charset($charset)        {
00528                 $charset = trim(strtolower($charset));
00529                 if (isset($this->synonyms[$charset]))   $charset = $this->synonyms[$charset];
00530 
00531                 return $charset;
00532         }
00533 
00546         function get_locale_charset($locale)    {
00547                 $locale = strtolower($locale);
00548 
00549                         // exact locale specific charset?
00550                 if (isset($this->locale_to_charset[$locale]))   return $this->locale_to_charset[$locale];
00551 
00552                         // get modifier
00553                 list($locale,$modifier) = explode('@',$locale);
00554 
00555                         // locale contains charset: use it
00556                 list($locale,$charset) = explode('.',$locale);
00557                 if ($charset)   return $this->parse_charset($charset);
00558 
00559                         // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
00560                 if ($modifier == 'euro')        return 'iso-8859-15';
00561 
00562                         // get language
00563                 list($language,$country) = explode('_',$locale);
00564                 if (isset($this->lang_to_script[$language]))    $script = $this->lang_to_script[$language];
00565 
00566                 if (TYPO3_OS == 'WIN')  {
00567                         $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'window-1252';
00568                 } else {
00569                         $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
00570                 }
00571 
00572                 return $cs;
00573         }
00574 
00575 
00576 
00577 
00578 
00579 
00580 
00581 
00582 
00583         /********************************************
00584          *
00585          * Charset Conversion functions
00586          *
00587          ********************************************/
00588 
00599         function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
00600                 if ($fromCS==$toCS)     return $str;
00601 
00602                         // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
00603                 if ($toCS=='utf-8' || !$useEntityForNoChar)     {
00604                         switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])       {
00605                         case 'mbstring':
00606                                 $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
00607                                 if (false !== $conv_str)        return $conv_str; // returns false for unsupported charsets
00608                                 break;
00609 
00610                         case 'iconv':
00611                                 $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
00612                                 if (false !== $conv_str)        return $conv_str;
00613                                 break;
00614 
00615                         case 'recode':
00616                                 $conv_str = recode_string($fromCS.'..'.$toCS,$str);
00617                                 if (false !== $conv_str)        return $conv_str;
00618                                 break;
00619                         }
00620                         // fallback to TYPO3 conversion
00621                 }
00622 
00623                 if ($fromCS!='utf-8')   $str=$this->utf8_encode($str,$fromCS);
00624                 if ($toCS!='utf-8')     $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
00625                 return $str;
00626         }
00627 
00639         function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
00640                 foreach($array as $key => $value)       {
00641                         if (is_array($array[$key]))     {
00642                                 $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
00643                         } else {
00644                                 $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
00645                         }
00646                 }
00647         }
00648 
00656         function utf8_encode($str,$charset)     {
00657 
00658                 if ($charset === 'utf-8')       return $str;
00659 
00660                         // Charset is case-insensitive.
00661                 if ($this->initCharset($charset))       {       // Parse conv. table if not already...
00662                         $strLen = strlen($str);
00663                         $outStr='';
00664 
00665                         for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in string.
00666                                 $chr=substr($str,$a,1);
00667                                 $ord=ord($chr);
00668                                 if (isset($this->twoByteSets[$charset]))        {       // If the charset has two bytes per char
00669                                         $ord2 = ord($str{$a+1});
00670                                         $ord = $ord<<8 | $ord2; // assume big endian
00671 
00672                                         if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
00673                                                 $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
00674                                         } else $outStr.=chr($this->noCharByteVal);      // No char exists
00675                                         $a++;
00676                                 } elseif ($ord>127)     {       // If char has value over 127 it's a multibyte char in UTF-8
00677                                         if (isset($this->eucBasedSets[$charset]))       {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
00678                                                 if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))    {       // Shift-JIS: chars between 160 and 223 are single byte
00679                                                         $a++;
00680                                                         $ord2=ord(substr($str,$a,1));
00681                                                         $ord = $ord*256+$ord2;
00682                                                 }
00683                                         }
00684 
00685                                         if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
00686                                                 $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
00687                                         } else $outStr.= chr($this->noCharByteVal);     // No char exists
00688                                 } else $outStr.= $chr;  // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00689                         }
00690                         return $outStr;
00691                 }
00692         }
00693 
00702         function utf8_decode($str,$charset,$useEntityForNoChar=0)       {
00703 
00704                         // Charset is case-insensitive.
00705                 if ($this->initCharset($charset))       {       // Parse conv. table if not already...
00706                         $strLen = strlen($str);
00707                         $outStr='';
00708                         $buf='';
00709                         for ($a=0,$i=0;$a<$strLen;$a++,$i++)    {       // Traverse each char in UTF-8 string.
00710                                 $chr=substr($str,$a,1);
00711                                 $ord=ord($chr);
00712                                 if ($ord>127)   {       // This means multibyte! (first byte!)
00713                                         if ($ord & 64)  {       // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00714 
00715                                                 $buf=$chr;      // Add first byte
00716                                                 for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
00717                                                         $ord = $ord << 1;       // Shift it left and ...
00718                                                         if ($ord & 128) {       // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00719                                                                 $a++;   // Increase pointer...
00720                                                                 $buf.=substr($str,$a,1);        // ... and add the next char.
00721                                                         } else break;
00722                                                 }
00723 
00724                                                 if (isset($this->parsedCharsets[$charset]['utf8'][$buf]))       {       // If the UTF-8 char-sequence is found then...
00725                                                         $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
00726                                                         if ($mByte>255) {       // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
00727                                                                 $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
00728                                                         } else $outStr.= chr($mByte);
00729                                                 } elseif ($useEntityForNoChar) {        // Create num entity:
00730                                                         $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
00731                                                 } else $outStr.=chr($this->noCharByteVal);      // No char exists
00732                                         } else $outStr.=chr($this->noCharByteVal);      // No char exists (MIDDLE of MB sequence!)
00733                                 } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00734                         }
00735                         return $outStr;
00736                 }
00737         }
00738 
00745         function utf8_to_entities($str) {
00746                 $strLen = strlen($str);
00747                 $outStr='';
00748                 $buf='';
00749                 for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in UTF-8 string.
00750                         $chr=substr($str,$a,1);
00751                         $ord=ord($chr);
00752                         if ($ord>127)   {       // This means multibyte! (first byte!)
00753                                 if ($ord & 64)  {       // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00754                                         $buf=$chr;      // Add first byte
00755                                         for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
00756                                                 $ord = $ord << 1;       // Shift it left and ...
00757                                                 if ($ord & 128) {       // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00758                                                         $a++;   // Increase pointer...
00759                                                         $buf.=substr($str,$a,1);        // ... and add the next char.
00760                                                 } else break;
00761                                         }
00762 
00763                                         $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
00764                                 } else $outStr.=chr($this->noCharByteVal);      // No char exists (MIDDLE of MB sequence!)
00765                         } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00766                 }
00767 
00768                 return $outStr;
00769         }
00770 
00778         function entities_to_utf8($str,$alsoStdHtmlEnt=0)       {
00779                 if ($alsoStdHtmlEnt)    {
00780                         $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));             // Getting them in iso-8859-1 - but thats ok since this is observed below.
00781                 }
00782 
00783                 $token = md5(microtime());
00784                 $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str));
00785                 foreach($parts as $k => $v)     {
00786                         if ($k%2)       {
00787                                 if (substr($v,0,1)=='#')        {       // Dec or hex entities:
00788                                         if (substr($v,1,1)=='x')        {
00789                                                 $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
00790                                         } else {
00791                                                 $parts[$k] = $this->UnumberToChar(substr($v,1));
00792                                         }
00793                                 } elseif ($alsoStdHtmlEnt && $trans_tbl['&'.$v.';']) {  // Other entities:
00794                                         $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
00795                                 } else {        // No conversion:
00796                                         $parts[$k] ='&'.$v.';';
00797                                 }
00798                         }
00799                 }
00800 
00801                 return implode('',$parts);
00802         }
00803 
00812         function utf8_to_numberarray($str,$convEntities=0,$retChar=0)   {
00813                         // If entities must be registered as well...:
00814                 if ($convEntities)      {
00815                         $str = $this->entities_to_utf8($str,1);
00816                 }
00817                         // Do conversion:
00818                 $strLen = strlen($str);
00819                 $outArr=array();
00820                 $buf='';
00821                 for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in UTF-8 string.
00822                         $chr=substr($str,$a,1);
00823                         $ord=ord($chr);
00824                         if ($ord>127)   {       // This means multibyte! (first byte!)
00825                                 if ($ord & 64)  {       // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00826                                         $buf=$chr;      // Add first byte
00827                                         for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
00828                                                 $ord = $ord << 1;       // Shift it left and ...
00829                                                 if ($ord & 128) {       // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00830                                                         $a++;   // Increase pointer...
00831                                                         $buf.=substr($str,$a,1);        // ... and add the next char.
00832                                                 } else break;
00833                                         }
00834 
00835                                         $outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf);
00836                                 } else $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal;       // No char exists (MIDDLE of MB sequence!)
00837                         } else $outArr[]=$retChar?chr($ord):$ord;       // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00838                 }
00839 
00840                 return $outArr;
00841         }
00842 
00862         function UnumberToChar($cbyte)  {
00863                 $str='';
00864 
00865                 if ($cbyte < 0x80) {
00866                         $str.=chr($cbyte);
00867                 } else if ($cbyte < 0x800) {
00868                         $str.=chr(0xC0 | ($cbyte >> 6));
00869                         $str.=chr(0x80 | ($cbyte & 0x3F));
00870                 } else if ($cbyte < 0x10000) {
00871                         $str.=chr(0xE0 | ($cbyte >> 12));
00872                         $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
00873                         $str.=chr(0x80 | ($cbyte & 0x3F));
00874                 } else if ($cbyte < 0x200000) {
00875                         $str.=chr(0xF0 | ($cbyte >> 18));
00876                         $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
00877                         $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
00878                         $str.=chr(0x80 | ($cbyte & 0x3F));
00879                 } else if ($cbyte < 0x4000000) {
00880                         $str.=chr(0xF8 | ($cbyte >> 24));
00881                         $str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
00882                         $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
00883                         $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
00884                         $str.=chr(0x80 | ($cbyte & 0x3F));
00885                 } else if ($cbyte < 0x80000000) {
00886                         $str.=chr(0xFC | ($cbyte >> 30));
00887                         $str.=chr(0x80 | (($cbyte >> 24) & 0x3F));
00888                         $str.=chr(0x80 | (($cbyte >> 18) & 0x3F));
00889                         $str.=chr(0x80 | (($cbyte >> 12) & 0x3F));
00890                         $str.=chr(0x80 | (($cbyte >> 6) & 0x3F));
00891                         $str.=chr(0x80 | ($cbyte & 0x3F));
00892                 } else { // Cannot express a 32-bit character in UTF-8
00893                         $str .= chr($this->noCharByteVal);
00894                 }
00895                 return $str;
00896         }
00897 
00907         function utf8CharToUnumber($str,$hex=0) {
00908                 $ord=ord(substr($str,0,1));     // First char
00909 
00910                 if (($ord & 192) == 192)        {       // This verifyes that it IS a multi byte string
00911                         $binBuf='';
00912                         for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
00913                                 $ord = $ord << 1;       // Shift it left and ...
00914                                 if ($ord & 128) {       // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00915                                         $binBuf.=substr('00000000'.decbin(ord(substr($str,$b+1,1))),-6);
00916                                 } else break;
00917                         }
00918                         $binBuf=substr('00000000'.decbin(ord(substr($str,0,1))),-(6-$b)).$binBuf;
00919 
00920                         $int = bindec($binBuf);
00921                 } else $int = $ord;
00922 
00923                 return $hex ? 'x'.dechex($int) : $int;
00924         }
00925 
00926 
00927 
00928 
00929 
00930 
00931 
00932 
00933 
00934         /********************************************
00935          *
00936          * Init functions
00937          *
00938          ********************************************/
00939 
00950         function initCharset($charset)  {
00951                         // Only process if the charset is not yet loaded:
00952                 if (!is_array($this->parsedCharsets[$charset])) {
00953 
00954                                 // Conversion table filename:
00955                         $charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
00956 
00957                                 // If the conversion table is found:
00958                         if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))      {
00959                                         // Cache file for charsets:
00960                                         // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
00961                                 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
00962                                 if ($cacheFile && @is_file($cacheFile)) {
00963                                         $this->parsedCharsets[$charset]=unserialize(t3lib_div::getUrl($cacheFile));
00964                                 } else {
00965                                                 // Parse conversion table into lines:
00966                                         $lines=t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
00967                                                 // Initialize the internal variable holding the conv. table:
00968                                         $this->parsedCharsets[$charset]=array('local'=>array(),'utf8'=>array());
00969                                                 // traverse the lines:
00970                                         $detectedType='';
00971                                         foreach($lines as $value)       {
00972                                                 if (trim($value) && substr($value,0,1)!='#')    {       // Comment line or blanks are ignored.
00973 
00974                                                                 // Detect type if not done yet: (Done on first real line)
00975                                                                 // The "whitespaced" type is on the syntax      "0x0A   0x000A  #LINE FEED"     while   "ms-token" is like              "B9 = U+00B9 : SUPERSCRIPT ONE"
00976                                                         if (!$detectedType)             $detectedType = ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value) ? 'whitespaced' : 'ms-token';
00977 
00978                                                         if ($detectedType=='ms-token')  {
00979                                                                 list($hexbyte,$utf8) = split('=|:',$value,3);
00980                                                         } elseif ($detectedType=='whitespaced') {
00981                                                                 $regA=array();
00982                                                                 ereg('[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+',$value,$regA);
00983                                                                 $hexbyte = $regA[1];
00984                                                                 $utf8 = 'U+'.$regA[2];
00985                                                         }
00986                                                         $decval = hexdec(trim($hexbyte));
00987                                                         if ($decval>127)        {
00988                                                                 $utf8decval = hexdec(substr(trim($utf8),2));
00989                                                                 $this->parsedCharsets[$charset]['local'][$decval]=$this->UnumberToChar($utf8decval);
00990                                                                 $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]]=$decval;
00991                                                         }
00992                                                 }
00993                                         }
00994                                         if ($cacheFile) {
00995                                                 t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
00996                                         }
00997                                 }
00998                                 return 2;
00999                         } else return false;
01000                 } else return 1;
01001         }
01002 
01012         function initUnicodeData($mode=null)    {
01013                         // cache files
01014                 $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
01015                 $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
01016 
01017                         // Only process if the tables are not yet loaded
01018                 switch($mode)   {
01019                         case 'case':
01020                                 if (is_array($this->caseFolding['utf-8']))      return 1;
01021 
01022                                         // Use cached version if possible
01023                                 if ($cacheFileCase && @is_file($cacheFileCase)) {
01024                                         $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
01025                                         return 2;
01026                                 }
01027                                 break;
01028 
01029                         case 'ascii':
01030                                 if (is_array($this->toASCII['utf-8']))  return 1;
01031 
01032                                         // Use cached version if possible
01033                                 if ($cacheFileASCII && @is_file($cacheFileASCII))       {
01034                                         $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
01035                                         return 2;
01036                                 }
01037                                 break;
01038                 }
01039 
01040                         // process main Unicode data file
01041                 $unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
01042                 if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;
01043 
01044                 $fh = fopen($unicodeDataFile,'rb');
01045                 if (!$fh)       return false;
01046 
01047                         // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
01048                         // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
01049                 $this->caseFolding['utf-8'] = array();
01050                 $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
01051                 $utf8CaseFolding['toUpper'] = array();
01052                 $utf8CaseFolding['toLower'] = array();
01053                 $utf8CaseFolding['toTitle'] = array();
01054 
01055                 $decomposition = array();       // array of temp. decompositions
01056                 $mark = array();                // array of chars that are marks (eg. composing accents)
01057                 $number = array();              // array of chars that are numbers (eg. digits)
01058                 $omit = array();                // array of chars to be omitted (eg. Russian hard sign)
01059 
01060                 while (!feof($fh))      {
01061                         $line = fgets($fh,4096);
01062                                 // has a lot of info
01063                         list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = split(';', rtrim($line));
01064 
01065                         $ord = hexdec($char);
01066                         if ($ord > 0xFFFF)      break;  // only process the BMP
01067 
01068                         $utf8_char = $this->UnumberToChar($ord);
01069 
01070                         if ($upper)     $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
01071                         if ($lower)     $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
01072                                 // store "title" only when different from "upper" (only a few)
01073                         if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
01074 
01075                         switch ($cat{0})        {
01076                                 case 'M':       // mark (accent, umlaut, ...)
01077                                         $mark["U+$char"] = 1;
01078                                         break;
01079 
01080                                 case 'N':       // numeric value
01081                                         if ($ord > 0x80 && $num != '')  $number["U+$char"] = $num;
01082                         }
01083 
01084                                 // accented Latin letters without "official" decomposition
01085                         $match = array();
01086                         if (ereg('^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH',$name,$match) && !$decomp)        {
01087                                 $c = ord($match[2]);
01088                                 if ($match[1] == 'SMALL')       $c += 32;
01089 
01090                                 $decomposition["U+$char"] = array(dechex($c));
01091                                 continue;
01092                         }
01093 
01094                         $match = array();
01095                         if (ereg('(<.*>)? *(.+)',$decomp,$match))       {
01096                                 switch($match[1])       {
01097                                         case '<circle>':        // add parenthesis as circle replacement, eg (1)
01098                                                 $match[2] = '0028 '.$match[2].' 0029';
01099                                                 break;
01100 
01101                                         case '<square>':        // add square brackets as square replacement, eg [1]
01102                                                 $match[2] = '005B '.$match[2].' 005D';
01103                                                 break;
01104 
01105                                         case '<compat>':        // ignore multi char decompositions that start with a space
01106                                                 if (ereg('^0020 ',$match[2]))   continue 2;
01107                                                 break;
01108 
01109                                                 // ignore Arabic and vertical layout presentation decomposition
01110                                         case '<initial>':
01111                                         case '<medial>':
01112                                         case '<final>':
01113                                         case '<isolated>':
01114                                         case '<vertical>':
01115                                                 continue 2;
01116                                 }
01117                                 $decomposition["U+$char"] = split(' ',$match[2]);
01118                         }
01119                 }
01120                 fclose($fh);
01121 
01122                         // process additional Unicode data for casing (allow folded characters to expand into a sequence)
01123                 $specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
01124                 if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))        {
01125                         $fh = fopen($specialCasingFile,'rb');
01126                         if ($fh)        {
01127                                 while (!feof($fh))      {
01128                                         $line = fgets($fh,4096);
01129                                         if ($line{0} != '#' && trim($line) != '')       {
01130 
01131                                                 list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
01132                                                 if ($cond == '' || $cond{0} == '#')     {
01133                                                         $utf8_char = $this->UnumberToChar(hexdec($char));
01134                                                         if ($char != $lower)    {
01135                                                                 $arr = split(' ',$lower);
01136                                                                 for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
01137                                                                 $utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
01138                                                         }
01139                                                         if ($char != $title && $title != $upper)        {
01140                                                                 $arr = split(' ',$title);
01141                                                                 for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
01142                                                                 $utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
01143                                                         }
01144                                                         if ($char != $upper)    {
01145                                                                         $arr = split(' ',$upper);
01146                                                                 for ($i=0; isset($arr[$i]); $i++)       $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
01147                                                                 $utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
01148                                                         }
01149                                                 }
01150                                         }
01151                                 }
01152                                 fclose($fh);
01153                         }
01154                 }
01155 
01156                         // process custom decompositions
01157                 $customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
01158                 if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))      {
01159                         $fh = fopen($customTranslitFile,'rb');
01160                         if ($fh)        {
01161                                 while (!feof($fh))      {
01162                                         $line = fgets($fh,4096);
01163                                         if ($line{0} != '#' && trim($line) != '')       {
01164                                                 list($char,$translit) = t3lib_div::trimExplode(';', $line);
01165                                                 if (!$translit) $omit["U+$char"] = 1;
01166                                                 $decomposition["U+$char"] = split(' ', $translit);
01167 
01168                                         }
01169                                 }
01170                                 fclose($fh);
01171                         }
01172                 }
01173 
01174                         // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
01175                 foreach($decomposition as $from => $to) {
01176                         $code_decomp = array();
01177 
01178                         while ($code_value = array_shift($to))  {
01179                                 if (isset($decomposition["U+$code_value"]))     {       // do recursive decomposition
01180                                         foreach(array_reverse($decomposition["U+$code_value"]) as $cv)  {
01181                                                 array_unshift($to, $cv);
01182                                         }
01183                                 } elseif (!isset($mark["U+$code_value"])) {     // remove mark
01184                                         array_push($code_decomp, $code_value);
01185                                 }
01186                         }
01187                         if (count($code_decomp) || isset($omit[$from])) {
01188                                 $decomposition[$from] = $code_decomp;
01189                         } else {
01190                                 unset($decomposition[$from]);
01191                         }
01192                 }
01193 
01194                         // create ascii only mapping
01195                 $this->toASCII['utf-8'] = array();
01196                 $ascii =& $this->toASCII['utf-8'];
01197 
01198                 foreach($decomposition as $from => $to) {
01199                         $code_decomp = array();
01200                         while ($code_value = array_shift($to))  {
01201                                 $ord = hexdec($code_value);
01202                                 if ($ord > 127)
01203                                         continue 2;     // skip decompositions containing non-ASCII chars
01204                                 else
01205                                         array_push($code_decomp,chr($ord));
01206                         }
01207                         $ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
01208                 }
01209 
01210                         // add numeric decompositions
01211                 foreach($number as $from => $to)        {
01212                         $utf8_char = $this->UnumberToChar(hexdec($from));
01213                         if (!isset($ascii[$utf8_char])) {
01214                                 $ascii[$utf8_char] = $to;
01215                         }
01216                 }
01217 
01218                 if ($cacheFileCase)     {
01219                                 t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
01220                 }
01221 
01222                 if ($cacheFileASCII)    {
01223                                 t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
01224                 }
01225 
01226                 return 3;
01227         }
01228 
01237         function initCaseFolding($charset)      {
01238                         // Only process if the case table is not yet loaded:
01239                 if (is_array($this->caseFolding[$charset]))     return 1;
01240 
01241                         // Use cached version if possible
01242                 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
01243                 if ($cacheFile && @is_file($cacheFile)) {
01244                         $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
01245                         return 2;
01246                 }
01247 
01248                         // init UTF-8 conversion for this charset
01249                 if (!$this->initCharset($charset))      {
01250                         return false;
01251                 }
01252 
01253                         // UTF-8 case folding is used as the base conversion table
01254                 if (!$this->initUnicodeData('case'))    {
01255                         return false;
01256                 }
01257 
01258                 $nochar = chr($this->noCharByteVal);
01259                 foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)      {
01260                                 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
01261                         $c = $this->utf8_decode($utf8, $charset);
01262 
01263                                 // $cc = $this->conv($this->caseFolding['utf-8']['toUpper'][$utf8], 'utf-8', $charset);
01264                         $cc = $this->utf8_decode($this->caseFolding['utf-8']['toUpper'][$utf8], $charset);
01265                         if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toUpper'][$c] = $cc;
01266 
01267                                 // $cc = $this->conv($this->caseFolding['utf-8']['toLower'][$utf8], 'utf-8', $charset);
01268                         $cc = $this->utf8_decode($this->caseFolding['utf-8']['toLower'][$utf8], $charset);
01269                         if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toLower'][$c] = $cc;
01270 
01271                                 // $cc = $this->conv($this->caseFolding['utf-8']['toTitle'][$utf8], 'utf-8', $charset);
01272                         $cc = $this->utf8_decode($this->caseFolding['utf-8']['toTitle'][$utf8], $charset);
01273                         if ($cc != '' && $cc != $nochar)        $this->caseFolding[$charset]['toTitle'][$c] = $cc;
01274                 }
01275 
01276                         // add the ASCII case table
01277                 for ($i=ord('a'); $i<=ord('z'); $i++)   {
01278                         $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i-32);
01279                 }
01280                 for ($i=ord('A'); $i<=ord('Z'); $i++)   {
01281                         $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i+32);
01282                 }
01283 
01284                 if ($cacheFile) {
01285                                 t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
01286                 }
01287 
01288                 return 3;
01289         }
01290 
01299         function initToASCII($charset)  {
01300                         // Only process if the case table is not yet loaded:
01301                 if (is_array($this->toASCII[$charset])) return 1;
01302 
01303                         // Use cached version if possible
01304                 $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
01305                 if ($cacheFile && @is_file($cacheFile)) {
01306                         $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
01307                         return 2;
01308                 }
01309 
01310                         // init UTF-8 conversion for this charset
01311                 if (!$this->initCharset($charset))      {
01312                         return false;
01313                 }
01314 
01315                         // UTF-8/ASCII transliteration is used as the base conversion table
01316                 if (!$this->initUnicodeData('ascii'))   {
01317                         return false;
01318                 }
01319 
01320                 $nochar = chr($this->noCharByteVal);
01321                 foreach ($this->parsedCharsets[$charset]['local'] as $ci => $utf8)      {
01322                                 // reconvert to charset (don't use chr() of numeric value, might be muli-byte)
01323                         $c = $this->utf8_decode($utf8, $charset);
01324 
01325                         if (isset($this->toASCII['utf-8'][$utf8]))      {
01326                                 $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
01327                         }
01328                 }
01329 
01330                 if ($cacheFile) {
01331                                 t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
01332                 }
01333 
01334                 return 3;
01335         }
01336 
01337 
01338 
01339 
01340 
01341 
01342 
01343 
01344 
01345 
01346 
01347 
01348 
01349 
01350 
01351 
01352         /********************************************
01353          *
01354          * String operation functions
01355          *
01356          ********************************************/
01357 
01370         function substr($charset,$string,$start,$len=null)      {
01371                 if ($len===0)   return '';
01372 
01373                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01374                                 // cannot omit $len, when specifying charset
01375                         if ($len==null) {
01376                                 $enc = mb_internal_encoding();  // save internal encoding
01377                                 mb_internal_encoding($charset);
01378                                 $str = mb_substr($string,$start);
01379                                 mb_internal_encoding($enc);     // restore internal encoding
01380 
01381                                 return $str;
01382                         }
01383                         else {
01384                                 return mb_substr($string,$start,$len,$charset);
01385                         }
01386                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
01387                                 // cannot omit $len, when specifying charset
01388                         if ($len==null) {
01389                                 $enc = iconv_get_encoding('internal_encoding'); // save internal encoding
01390                                 iconv_set_encoding('internal_encoding',$charset);
01391                                 $str = iconv_substr($string,$start);
01392                                 iconv_set_encoding('internal_encoding',$enc);   // restore internal encoding
01393 
01394                                 return $str;
01395                         }
01396                         else {
01397                                 return iconv_substr($string,$start,$len,$charset);
01398                         }
01399                 } elseif ($charset == 'utf-8')  {
01400                         return $this->utf8_substr($string,$start,$len);
01401                 } elseif ($this->eucBasedSets[$charset])        {
01402                         return $this->euc_substr($string,$start,$charset,$len);
01403                 } elseif ($this->twoByteSets[$charset]) {
01404                         return substr($string,$start*2,$len*2);
01405                 } elseif ($this->fourByteSets[$charset])        {
01406                         return substr($string,$start*4,$len*4);
01407                 }
01408 
01409                 // treat everything else as single-byte encoding
01410                 return $len === NULL ? substr($string,$start) : substr($string,$start,$len);
01411         }
01412 
01423         function strlen($charset,$string)       {
01424                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01425                         return mb_strlen($string,$charset);
01426                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
01427                         return iconv_strlen($string,$charset);
01428                 } elseif ($charset == 'utf-8')  {
01429                         return $this->utf8_strlen($string);
01430                 } elseif ($this->eucBasedSets[$charset])        {
01431                         return $this->euc_strlen($string,$charset);
01432                 } elseif ($this->twoByteSets[$charset]) {
01433                         return strlen($string)/2;
01434                 } elseif ($this->fourByteSets[$charset])        {
01435                         return strlen($string)/4;
01436                 }
01437                 // treat everything else as single-byte encoding
01438                 return strlen($string);
01439         }
01440 
01453         function crop($charset,$string,$len,$crop='')   {
01454                 if (intval($len) == 0)  return $string;
01455 
01456                 if ($charset == 'utf-8')        {
01457                         $i = $this->utf8_char2byte_pos($string,$len);
01458                 } elseif ($this->eucBasedSets[$charset])        {
01459                         $i = $this->euc_char2byte_pos($string,$len,$charset);
01460                 } else {
01461                         if ($len > 0)   {
01462                                 $i = $len;
01463                         } else {
01464                                 $i = strlen($string)+$len;
01465                                 if ($i<=0)      $i = false;
01466                         }
01467                 }
01468 
01469                 if ($i === false)       {       // $len outside actual string length
01470                         return $string;
01471                 } else  {
01472                         if ($len > 0)   {
01473                                 if (strlen($string{$i}))        {
01474                                         return substr($string,0,$i).$crop;
01475 
01476                                 }
01477                         } else {
01478                                 if (strlen($string{$i-1}))      {
01479                                         return $crop.substr($string,$i);
01480                                 }
01481                         }
01482 
01483 /*
01484                         if (abs($len)<$this->strlen($charset,$string))  {       // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
01485                                 if ($len > 0)   {
01486                                         return substr($string,0,$i).$crop;
01487                                 } else {
01488                                         return $crop.substr($string,$i);
01489                                 }
01490                         }
01491 */
01492                 }
01493                 return $string;
01494         }
01495 
01506         function strtrunc($charset,$string,$len)        {
01507                 if ($len <= 0)  return '';
01508 
01509                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01510                         return mb_strcut($string,0,$len,$charset);
01511                 } elseif ($charset == 'utf-8')  {
01512                         return $this->utf8_strtrunc($string,$len);
01513                 } elseif ($this->eucBasedSets[$charset])        {
01514                         return $this->euc_strtrunc($string,$charset);
01515                 } elseif ($this->twoByteSets[$charset]) {
01516                         if ($len % 2)   $len--;         // don't cut at odd positions
01517                 } elseif ($this->fourByteSets[$charset])        {
01518                         $x = $len % 4;
01519                         $len -= $x;     // realign to position dividable by four
01520                 }
01521                 // treat everything else as single-byte encoding
01522                 return substr($string,0,$len);
01523         }
01524 
01540         function conv_case($charset,$string,$case)      {
01541                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3)   {
01542                         if ($case == 'toLower') {
01543                                 $string = mb_strtolower($string,$charset);
01544                         } else {
01545                                 $string = mb_strtoupper($string,$charset);
01546                         }
01547                 } elseif ($charset == 'utf-8')  {
01548                         $string = $this->utf8_char_mapping($string,'case',$case);
01549                 } elseif (isset($this->eucBasedSets[$charset])) {
01550                         $string = $this->euc_char_mapping($string,$charset,'case',$case);
01551                 } else {
01552                                 // treat everything else as single-byte encoding
01553                         $string = $this->sb_char_mapping($string,$charset,'case',$case);
01554                 }
01555 
01556                 return $string;
01557         }
01558 
01566         function specCharsToASCII($charset,$string)     {
01567                 if ($charset == 'utf-8')        {
01568                         $string = $this->utf8_char_mapping($string,'ascii');
01569                 } elseif (isset($this->eucBasedSets[$charset])) {
01570                         $string = $this->euc_char_mapping($string,$charset,'ascii');
01571                 } else {
01572                                 // treat everything else as single-byte encoding
01573                         $string = $this->sb_char_mapping($string,$charset,'ascii');
01574                 }
01575 
01576                 return $string;
01577         }
01578 
01579 
01580 
01581 
01582 
01583 
01584 
01585 
01586 
01587 
01588 
01589 
01590         /********************************************
01591          *
01592          * Internal string operation functions
01593          *
01594          ********************************************/
01595 
01606         function sb_char_mapping($str,$charset,$mode,$opt='')   {
01607                 switch($mode)   {
01608                         case 'case':
01609                                 if (!$this->initCaseFolding($charset))  return $str;    // do nothing
01610                                 $map =& $this->caseFolding[$charset][$opt];
01611                                 break;
01612 
01613                         case 'ascii':
01614                                 if (!$this->initToASCII($charset))      return $str;    // do nothing
01615                                 $map =& $this->toASCII[$charset];
01616                                 break;
01617 
01618                         default:
01619                                 return $str;
01620                 }
01621 
01622                 $out = '';
01623                 for($i=0; strlen($str{$i}); $i++)       {
01624                         $c = $str{$i};
01625                         if (isset($map[$c]))    {
01626                                 $out .= $map[$c];
01627                         } else {
01628                                 $out .= $c;
01629                         }
01630                 }
01631 
01632                 return $out;
01633         }
01634 
01635 
01636 
01637 
01638 
01639 
01640 
01641 
01642 
01643 
01644         /********************************************
01645          *
01646          * Internal UTF-8 string operation functions
01647          *
01648          ********************************************/
01649 
01661         function utf8_substr($str,$start,$len=null)     {
01662                 if (!strcmp($len,'0'))  return '';
01663 
01664                 $byte_start = $this->utf8_char2byte_pos($str,$start);
01665                 if ($byte_start === false)      {
01666                         if ($start > 0) {
01667                                 return false;   // $start outside string length
01668                         } else {
01669                                 $start = 0;
01670                         }
01671                 }
01672 
01673                 $str = substr($str,$byte_start);
01674 
01675                 if ($len!=null) {
01676                         $byte_end = $this->utf8_char2byte_pos($str,$len);
01677                         if ($byte_end === false)        // $len outside actual string length
01678                                 return $len<0 ? '' : $str;      // When length is less than zero and exceeds, then we return blank string.
01679                         else
01680                                 return substr($str,0,$byte_end);
01681                 }
01682                 else    return $str;
01683         }
01684 
01694         function utf8_strlen($str)      {
01695                 $n=0;
01696                 for($i=0; strlen($str{$i}); $i++)       {
01697                         $c = ord($str{$i});
01698                         if (!($c & 0x80))       // single-byte (0xxxxxx)
01699                                 $n++;
01700                         elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
01701                                 $n++;
01702                 }
01703                 return $n;
01704         }
01705 
01715         function utf8_strtrunc($str,$len)       {
01716                 $i = $len-1;
01717                 if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
01718                         for (; $i>0 && !(ord($str{$i}) & 0x40); $i--)   ;       // find the first byte
01719                         if ($i <= 0)    return ''; // sanity check
01720                         for ($bc=0, $mbs=ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1)  $bc++;  // calculate number of bytes
01721                         if ($bc+$i > $len)      return substr($str,0,$i);
01722                         // fallthru: multibyte char fits into length
01723                 }
01724                 return substr($str,0,$len);
01725         }
01726 
01737         function utf8_strpos($haystack,$needle,$offset=0)       {
01738                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01739                         return mb_strpos($haystack,$needle,$offset,'utf-8');
01740                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
01741                         return iconv_strpos($haystack,$needle,$offset,'utf-8');
01742                 }
01743 
01744                 $byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
01745                 if ($byte_offset === false)     return false; // offset beyond string length
01746 
01747                 $byte_pos = strpos($haystack,$needle,$byte_offset);
01748                 if ($byte_pos === false)        return false; // needle not found
01749 
01750                 return $this->utf8_byte2char_pos($haystack,$byte_pos);
01751         }
01752 
01762         function utf8_strrpos($haystack,$needle)        {
01763                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01764                         return mb_strrpos($haystack,$needle,'utf-8');
01765                 } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv')      {
01766                         return iconv_strrpos($haystack,$needle,'utf-8');
01767                 }
01768 
01769                 $byte_pos = strrpos($haystack,$needle);
01770                 if ($byte_pos === false)        return false; // needle not found
01771 
01772                 return $this->utf8_byte2char_pos($haystack,$byte_pos);
01773         }
01774 
01784         function utf8_char2byte_pos($str,$pos)  {
01785                 $n = 0;                         // number of characters found
01786                 $p = abs($pos);         // number of characters wanted
01787 
01788                 if ($pos >= 0)  {
01789                         $i = 0;
01790                         $d = 1;
01791                 } else {
01792                         $i = strlen($str)-1;
01793                         $d = -1;
01794                 }
01795 
01796                 for( ; strlen($str{$i}) && $n<$p; $i+=$d)       {
01797                         $c = (int)ord($str{$i});
01798                         if (!($c & 0x80))       // single-byte (0xxxxxx)
01799                                 $n++;
01800                         elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
01801                                 $n++;
01802                 }
01803                 if (!strlen($str{$i}))  return false; // offset beyond string length
01804 
01805                 if ($pos >= 0)  {
01806                                 // skip trailing multi-byte data bytes
01807                         while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) { $i++; }
01808                 } else {
01809                                 // correct offset
01810                         $i++;
01811                 }
01812 
01813                 return $i;
01814         }
01815 
01825         function utf8_byte2char_pos($str,$pos)  {
01826                 $n = 0; // number of characters
01827                 for($i=$pos; $i>0; $i--)        {
01828                         $c = (int)ord($str{$i});
01829                         if (!($c & 0x80))       // single-byte (0xxxxxx)
01830                                 $n++;
01831                         elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
01832                                 $n++;
01833                 }
01834                 if (!strlen($str{$i}))  return false; // offset beyond string length
01835 
01836                 return $n;
01837         }
01838 
01848         function utf8_char_mapping($str,$mode,$opt='')  {
01849                 if (!$this->initUnicodeData($mode))     return $str;    // do nothing
01850 
01851                 $out = '';
01852                 switch($mode)   {
01853                         case 'case':
01854                                 $map =& $this->caseFolding['utf-8'][$opt];
01855                                 break;
01856 
01857                         case 'ascii':
01858                                 $map =& $this->toASCII['utf-8'];
01859                                 break;
01860 
01861                         default:
01862                                 return $str;
01863                 }
01864 
01865                 for($i=0; strlen($str{$i}); $i++)       {
01866                         $c = ord($str{$i});
01867                         if (!($c & 0x80))       // single-byte (0xxxxxx)
01868                                 $mbc = $str{$i};
01869                         elseif (($c & 0xC0) == 0xC0)    {       // multi-byte starting byte (11xxxxxx)
01870                                 for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; } // calculate number of bytes
01871                                 $mbc = substr($str,$i,$bc);
01872                                 $i += $bc-1;
01873                         }
01874 
01875                         if (isset($map[$mbc]))  {
01876                                 $out .= $map[$mbc];
01877                         } else {
01878                                 $out .= $mbc;
01879                         }
01880                 }
01881 
01882                 return $out;
01883         }
01884 
01885 
01886 
01887 
01888 
01889 
01890 
01891 
01892 
01893 
01894 
01895 
01896 
01897 
01898 
01899 
01900 
01901 
01902         /********************************************
01903          *
01904          * Internal EUC string operation functions
01905          *
01906          * Extended Unix Code:
01907          *  ASCII compatible 7bit single bytes chars
01908          *  8bit two byte chars
01909          *
01910          * Shift-JIS is treated as a special case.
01911          *
01912          ********************************************/
01913 
01924         function euc_strtrunc($str,$len,$charset)        {
01925                 $sjis = ($charset == 'shift_jis');
01926                 for ($i=0; strlen($str{$i}) && $i<$len; $i++) {
01927                         $c = ord($str{$i});
01928                         if ($sjis)      {
01929                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
01930                         }
01931                         else    {
01932                                 if ($c >= 0x80) $i++;   // advance a double-byte char
01933                         }
01934                 }
01935                 if (!strlen($str{$i}))  return $str;    // string shorter than supplied length
01936 
01937                 if ($i>$len)
01938                         return substr($str,0,$len-1);   // we ended on a first byte
01939                 else
01940                         return substr($str,0,$len);
01941         }
01942 
01953         function euc_substr($str,$start,$charset,$len=null)     {
01954                 $byte_start = $this->euc_char2byte_pos($str,$start,$charset);
01955                 if ($byte_start === false)      return false;   // $start outside string length
01956 
01957                 $str = substr($str,$byte_start);
01958 
01959                 if ($len!=null) {
01960                         $byte_end = $this->euc_char2byte_pos($str,$len,$charset);
01961                         if ($byte_end === false)        // $len outside actual string length
01962                                 return $str;
01963                         else
01964                                 return substr($str,0,$byte_end);
01965                 }
01966                 else    return $str;
01967         }
01968 
01978         function euc_strlen($str,$charset)       {
01979                 $sjis = ($charset == 'shift_jis');
01980                 $n=0;
01981                 for ($i=0; strlen($str{$i}); $i++) {
01982                         $c = ord($str{$i});
01983                         if ($sjis)      {
01984                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
01985                         }
01986                         else    {
01987                                 if ($c >= 0x80) $i++;   // advance a double-byte char
01988                         }
01989 
01990                         $n++;
01991                 }
01992 
01993                 return $n;
01994         }
01995 
02005         function euc_char2byte_pos($str,$pos,$charset)  {
02006                 $sjis = ($charset == 'shift_jis');
02007                 $n = 0; // number of characters seen
02008                 $p = abs($pos); // number of characters wanted
02009 
02010                 if ($pos >= 0)  {
02011                         $i = 0;
02012                         $d = 1;
02013                 } else {
02014                         $i = strlen($str)-1;
02015                         $d = -1;
02016                 }
02017 
02018                 for ( ; strlen($str{$i}) && $n<$p; $i+=$d) {
02019                         $c = ord($str{$i});
02020                         if ($sjis)      {
02021                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i+=$d; // advance a double-byte char
02022                         }
02023                         else    {
02024                                 if ($c >= 0x80) $i+=$d; // advance a double-byte char
02025                         }
02026 
02027                         $n++;
02028                 }
02029                 if (!strlen($str{$i}))  return false; // offset beyond string length
02030 
02031                 if ($pos < 0)   $i++;   // correct offset
02032 
02033                 return $i;
02034         }
02035 
02046         function euc_char_mapping($str,$charset,$mode,$opt='')  {
02047                 switch($mode)   {
02048                         case 'case':
02049                                 if (!$this->initCaseFolding($charset))  return $str;    // do nothing
02050                                 $map =& $this->caseFolding[$charset][$opt];
02051                                 break;
02052 
02053                         case 'ascii':
02054                                 if (!$this->initToASCII($charset))      return $str;    // do nothing
02055                                 $map =& $this->toASCII[$charset];
02056                                 break;
02057 
02058                         default:
02059                                 return $str;
02060                 }
02061 
02062                 $sjis = ($charset == 'shift_jis');
02063                 $out = '';
02064                 for($i=0; strlen($str{$i}); $i++)       {
02065                         $mbc = $str{$i};
02066                         $c = ord($mbc);
02067 
02068                         if ($sjis)      {
02069                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  {       // a double-byte char
02070                                         $mbc = substr($str,$i,2);
02071                                         $i++;
02072                                 }
02073                         }
02074                         else    {
02075                                 if ($c >= 0x80) {       // a double-byte char
02076                                         $mbc = substr($str,$i,2);
02077                                         $i++;
02078                                 }
02079                         }
02080 
02081                         if (isset($map[$mbc]))  {
02082                                 $out .= $map[$mbc];
02083                         } else {
02084                                 $out .= $mbc;
02085                         }
02086                 }
02087 
02088                 return $out;
02089         }
02090 
02091 }
02092 
// Include the extension class (XCLASS) registered for this file in $TYPO3_CONF_VARS for the current TYPO3_MODE, if any
if (defined('TYPO3_MODE'))      {
        if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']) {
                include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
        }
}
02096 ?>