Documentation TYPO3 par Ameos

class.t3lib_cs.php

00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2003-2004 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the Typo3 project. The Typo3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *
00017 *  This script is distributed in the hope that it will be useful,
00018 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020 *  GNU General Public License for more details.
00021 *
00022 *  This copyright notice MUST APPEAR in all copies of the script!
00023 ***************************************************************/
00136 class t3lib_cs {
00137         var $noCharByteVal=63;          // Byte value (63 is ASCII "?") written by the converters for chars with no equivalent, via chr($this->noCharByteVal).
00138 
00139                 // Cache of parsed conversion tables, read as $parsedCharsets[charset]['local'][local number] => UTF-8 string and $parsedCharsets[charset]['utf8'][UTF-8 string] => local number
00140         var $parsedCharsets=array();
00141 
00142                 // An array where case folding data will be stored (cached)
00143         var $caseFolding=array();
00144 
00145                 // An array where charset-to-ASCII mappings are stored (cached)
00146         var $toASCII=array();
00147 
00148                 // Charsets with two bytes per char; utf8_encode() reads such input in pairs, first byte as the high byte:
00149         var $twoByteSets=array(
00150                 'ucs-2'=>1,     // 2-byte Unicode
00151         );
00152 
00153                 // Charsets with four bytes per char (only the keys matter, the value 1 is a flag):
00154         var $fourByteSets=array(
00155                 'ucs-4'=>1,     // 4-byte Unicode
00156                 'utf-32'=>1,    // 4-byte Unicode (limited to the 21-bits of UTF-16)
00157         );
00158 
00159                 // Charsets using a scheme like the Extended Unix Code; utf8_encode() checks this list for every byte above 127:
00160         var $eucBasedSets=array(
00161                 'gb2312'=>1,            // Chinese, simplified.
00162                 'big5'=>1,              // Chinese, traditional.
00163                 'euc-kr'=>1,            // Korean
00164                 'shift_jis'=>1,         // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
00165         );
00166 
00167                 // Alias => canonical charset name, applied by parse_charset() after lowercasing its input.
00168                 // Keys must therefore be lowercase and unique (a repeated key silently overwrites the earlier entry). See
00168                 // http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
00168                 // http://czyborra.com/charsets/iso8859.html
00169         var $synonyms=array(
00170                 'us' => 'ascii',
00171                 'us-ascii'=> 'ascii',
00172                 'cp819' => 'iso-8859-1',
00173                 'ibm819' => 'iso-8859-1',
00174                 'iso-ir-100' => 'iso-8859-1',
00175                 'iso-ir-109' => 'iso-8859-2',
00176                 'iso-ir-148' => 'iso-8859-9',
00177                 'iso-ir-199' => 'iso-8859-14',
00178                 'iso-ir-203' => 'iso-8859-15',
00179                 'csisolatin1' => 'iso-8859-1',
00180                 'csisolatin2' => 'iso-8859-2',
00181                 'csisolatin3' => 'iso-8859-3',
00182                 'csisolatin5' => 'iso-8859-9',
00183                 'csisolatin8' => 'iso-8859-14',
00184                 'csisolatin9' => 'iso-8859-15',
00185                 'csisolatingreek' => 'iso-8859-7',
00186                 'iso-celtic' => 'iso-8859-14',
00187                 'latin1' => 'iso-8859-1',
00188                 'latin2' => 'iso-8859-2',
00189                 'latin3' => 'iso-8859-3',
00190                 'latin5' => 'iso-8859-9',
00191                 'latin6' => 'iso-8859-10',
00192                 'latin8' => 'iso-8859-14',
00193                 'latin9' => 'iso-8859-15',
00194                 'l1' => 'iso-8859-1',
00195                 'l2' => 'iso-8859-2',
00196                 'l3' => 'iso-8859-3',
00197                 'l5' => 'iso-8859-9',
00198                 'l6' => 'iso-8859-10',
00199                 'l8' => 'iso-8859-14',
00200                 'l9' => 'iso-8859-15',
00201                 'cyrillic' => 'iso-8859-5',
00202                 'arabic' => 'iso-8859-6',
00203                 'tis-620' => 'iso-8859-11',
00204                 'win874' => 'windows-874',
00205                 'win1250' => 'windows-1250',
00206                 'win1251' => 'windows-1251',
00207                 'win1252' => 'windows-1252',
00208                 'win1253' => 'windows-1253',
00209                 'win1254' => 'windows-1254',
00210                 'win1255' => 'windows-1255',
00211                 'win1256' => 'windows-1256',
00212                 'win1257' => 'windows-1257',
00213                 'win1258' => 'windows-1258',
00214                 'cp1250' => 'windows-1250',
00215                 'cp1251' => 'windows-1251',
00216                 'cp1252' => 'windows-1252',
00217                 'ms-ee' => 'windows-1250',
00218                 'ms-ansi' => 'windows-1252',
00219                 'ms-greek' => 'windows-1253',
00220                 'ms-turk' => 'windows-1254',
00221                 'winbaltrim' => 'windows-1257',
00222                 'koi-8ru' => 'koi-8r',
00223                 'koi8r' => 'koi-8r',
00224                 'cp878' => 'koi-8r',
00225                 'mac' => 'macroman',
00226                 'macintosh' => 'macroman',
00227                 'euc-cn' => 'gb2312',
00228                 'x-euc-cn' => 'gb2312',
00229                 'euccn' => 'gb2312',
00230                 'cp936' => 'gb2312',
00231                 'big-5' => 'big5',
00232                 'cp950' => 'big5',
00233                 'eucjp' => 'euc-jp',
00234                 'sjis' => 'shift_jis',
00235                 'shift-jis' => 'shift_jis',
00236                 'cp932' => 'shift_jis',
00237                 'cp949' => 'euc-kr',
00238                 'utf7' => 'utf-7',
00239                 'utf8' => 'utf-8',
00240                 'utf16' => 'utf-16',
00241                 'utf32' => 'utf-32',
00243                 'ucs2' => 'ucs-2',
00244                 'ucs4' => 'ucs-4',
00245         );
00246 
00247                 // Language code or name => language family. get_locale_charset() looks up the lowercased language part of a
00247                 // locale here and then uses the family as key into $lang_to_charset_unix / $lang_to_charset_windows, so every
00247                 // value must be spelled exactly like a key of those tables.
00248         var $lang_to_langfamily=array(
00249                         // two-letter language codes (mostly ISO 639-1), see:
00250                         //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
00251                         //  http://www.unicode.org/onlinedat/languages.html
00252                 'ar' => 'arabic',
00253                 'bg' => 'cyrillic',
00254                 'cs' => 'east_european',
00255                 'da' => 'west_european',
00256                 'de' => 'west_european',
00257                 'es' => 'west_european',
00258                 'et' => 'estonian',
00259                 'eu' => 'west_european',
00260                 'fi' => 'west_european',
00261                 'fr' => 'west_european',
00262                 'gr' => 'greek',
00263                 'hr' => 'east_european',
00264                 'hu' => 'east_european',
00265                 'iw' => 'hebrew',
00266                 'is' => 'west_european',
00267                 'it' => 'west_european',
00268                 'ja' => 'japanese',
00269                 'kl' => 'west_european',
00270                 'ko' => 'korean',
00271                 'lt' => 'lithuanian',
00272                 'lv' => 'west_european', // Latvian/Lettish
00273                 'nl' => 'west_european',
00274                 'no' => 'west_european',
00275                 'pl' => 'east_european',
00276                 'pt' => 'west_european',
00277                 'ro' => 'east_european',
00278                 'ru' => 'cyrillic',
00279                 'sk' => 'east_european',
00280                 'sl' => 'east_european',
00281                 'sv' => 'west_european',
00282                 'th' => 'thai',
00283                 'uk' => 'cyrillic',
00284                 'vi' => 'vietnamese',
00285                 'zh' => 'chinese',
00286                         // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
00287                 'chs' => 'simpl_chinese',
00288                 'cht' => 'trad_chinese',
00289                 'csy' => 'east_european',
00290                 'dan' => 'west_european',
00291                 'deu' => 'west_european',
00292                 'dea' => 'west_european',
00293                 'des' => 'west_european',
00294                 'ena' => 'west_european',
00295                 'enc' => 'west_european',
00296                 'eng' => 'west_european',
00297                 'enz' => 'west_european',
00298                 'enu' => 'west_european',
00299                 'nld' => 'west_european',
00300                 'nlb' => 'west_european',
00301                 'fin' => 'west_european',
00302                 'fra' => 'west_european',
00303                 'frb' => 'west_european',
00304                 'frc' => 'west_european',
00305                 'frs' => 'west_european',
00306                 'ell' => 'greek',
00307                 'hun' => 'east_european',
00308                 'isl' => 'west_european',       // was misspelled 'west_euorpean', which matched no charset table
00309                 'ita' => 'west_european',
00310                 'its' => 'west_european',
00311                 'jpn' => 'japanese',
00312                 'kor' => 'korean',
00313                 'nor' => 'west_european',
00314                 'non' => 'west_european',
00315                 'plk' => 'east_european',
00316                 'ptg' => 'west_european',
00317                 'ptb' => 'west_european',
00318                 'rus' => 'cyrillic',    // same family as 'ru' and 'russian'
00319                 'sky' => 'east_european',
00320                 'esp' => 'west_european',
00321                 'esm' => 'west_european',
00322                 'esn' => 'west_european',
00323                 'sve' => 'west_european',
00324                 'trk' => 'turkish',
00325                         // English language names
00326                 'bulgarian' => 'cyrillic',      // same family as 'bg'
00327                 'catalan' => 'west_european',
00328                 'croatian' => 'east_european',
00329                 'czech' => 'east_european',
00330                 'danish' => 'west_european',
00331                 'dutch' => 'west_european',
00332                 'english' => 'west_european',
00333                 'finnish' => 'west_european',
00334                 'french' => 'west_european',
00335                 'galician' => 'west_european',
00336                 'german' => 'west_european',
00337                 'hungarian' => 'east_european',
00338                 'icelandic' => 'west_european',
00339                 'italian' => 'west_european',
00340                 'latvian' => 'west_european',
00341                 'lettish' => 'west_european',
00342                 'norwegian' => 'west_european',
00343                 'polish' => 'east_european',
00344                 'portuguese' => 'west_european',
00345                 'russian' => 'cyrillic',
00346                 'romanian' => 'east_european',
00347                 'slovak' => 'east_european',
00348                 'slovenian' => 'east_european',
00349                 'spanish' => 'west_european',
00350                 'svedish' => 'west_european',   // historic misspelling, kept so existing lookups keep working
00350                 'swedish' => 'west_european',
00351                 'turkish' => 'turkish', // same family as 'trk'
00352                 'ukrainian' => 'cyrillic',
00353         );
00354 
00355                 // Language family => default charset; get_locale_charset() reads this table whenever TYPO3_OS is not 'WIN'
00356         var $lang_to_charset_unix=array(
00357                 'west_european' => 'iso-8859-1',
00358                 'estonian' => 'iso-8859-1',
00359                 'east_european' => 'iso-8859-2',
00360                 'baltic' => 'iso-8859-4',
00361                 'cyrillic' => 'iso-8859-5',
00362                 'arabic' => 'iso-8859-6',
00363                 'greek' => 'iso-8859-7',
00364                 'hebrew' => 'iso-8859-8',
00365                 'turkish' => 'iso-8859-9',
00366                 'thai' => 'iso-8859-11', // = TIS-620
00367                 'lithuanian' => 'iso-8859-13',
00368                 'chinese' => 'gb2312', // = euc-cn
00369                 'japanese' => 'euc-jp',
00370                 'korean' => 'euc-kr',
00371                 'simpl_chinese' => 'gb2312',
00372                 'trad_chinese' => 'big5',
00373                 'vietnamese' => '', // empty value: get_locale_charset() then returns its 'iso-8859-1' fallback
00374         );
00375 
00376                 // Language family => default charset; get_locale_charset() reads this table when TYPO3_OS == 'WIN'
00377         var $lang_to_charset_windows=array(
00378                 'east_european' => 'windows-1250',
00379                 'cyrillic' => 'windows-1251',
00380                 'west_european' => 'windows-1252',
00381                 'greek' => 'windows-1253',
00382                 'turkish' => 'windows-1254',
00383                 'hebrew' => 'windows-1255',
00384                 'arabic' => 'windows-1256',
00385                 'baltic' => 'windows-1257',
00386                 'estonian' => 'windows-1257',
00387                 'lithuanian' => 'windows-1257',
00388                 'vietnamese' => 'windows-1258',
00389                 'thai' => 'cp874', // NOTE(review): returned as-is by get_locale_charset(); $synonyms has 'win874' but no 'cp874' entry - confirm callers can handle this name
00390                 'korean' => 'cp949', // NOTE(review): an alias of 'euc-kr' in $synonyms, but get_locale_charset() returns it without calling parse_charset() - confirm
00391                 'chinese' => 'gb2312',
00392                 'japanese' => 'shift_jis',
00393                 'simpl_chinese' => 'gb2312',
00394                 'trad_chinese' => 'big5',
00395         );
00396 
00397                 // Full locale name => charset. get_locale_charset() checks this table first, using the lowercased locale string, so keys must be lowercase
00398         var $locale_to_charset=array(
00399                 'japanese.euc' => 'euc-jp',
00400                 'ja_jp.ujis' => 'euc-jp',
00401                 'korean.euc' => 'euc-kr',
00402                 'zh_cn' => 'gb2312',
00403                 'zh_hk' => 'big5',
00404                 'zh_tw' => 'big5',
00405         );
00406 
00407                 // TYPO3 specific: TYPO3 system language key => charset used for that system language.
00408                 // An empty value means "iso-8859-1".
00409         var $charSetArray = array(
00410                 'dk' => '',
00411                 'de' => '',
00412                 'no' => '',
00413                 'it' => '',
00414                 'fr' => '',
00415                 'es' => '',
00416                 'nl' => '',
00417                 'cz' => 'windows-1250',
00418                 'pl' => 'iso-8859-2',
00419                 'si' => 'windows-1250',
00420                 'fi' => '',
00421                 'tr' => 'iso-8859-9',
00422                 'se' => '',
00423                 'pt' => '',
00424                 'ru' => 'windows-1251',
00425                 'ro' => 'iso-8859-2',
00426                 'ch' => 'gb2312',
00427                 'sk' => 'windows-1250',
00428                 'lt' => 'windows-1257',
00429                 'is' => 'utf-8',
00430                 'hr' => 'windows-1250',
00431                 'hu' => 'iso-8859-2',
00432                 'gl' => '',
00433                 'th' => 'iso-8859-11',
00434                 'gr' => 'iso-8859-7',
00435                 'hk' => 'big5',
00436                 'eu' => '',
00437                 'bg' => 'windows-1251',
00438                 'br' => '',
00439                 'et' => 'iso-8859-4',
00440                 'ar' => 'iso-8859-6',
00441                 'he' => 'utf-8',
00442                 'ua' => 'windows-1251',
00443                 'jp' => 'shift_jis',
00444                 'lv' => 'utf-8',
00445                 'vn' => 'utf-8',
00446                 'ca' => 'iso-8859-15',
00447                 'ba' => 'iso-8859-2',
00448                 'kr' => 'euc-kr',
00449         );
00450 
00451                 // TYPO3 specific: TYPO3 system language key => ISO name for that language (same keys as $charSetArray).
00452                 // An empty value means the ISO name is the same as the TYPO3 key.
00453         var $isoArray = array(
00454                 'dk' => 'da',
00455                 'de' => '',
00456                 'no' => '',
00457                 'it' => '',
00458                 'fr' => '',
00459                 'es' => '',
00460                 'nl' => '',
00461                 'cz' => 'cs',
00462                 'pl' => '',
00463                 'si' => 'sl',
00464                 'fi' => '',
00465                 'tr' => '',
00466                 'se' => 'sv',
00467                 'pt' => '',
00468                 'ru' => '',
00469                 'ro' => '',
00470                 'ch' => 'zh_CN',
00471                 'sk' => '',
00472                 'lt' => '',
00473                 'is' => '',
00474                 'hr' => '',
00475                 'hu' => '',
00476                 'gl' => '', // Greenlandic
00477                 'th' => '',
00478                 'gr' => 'el',
00479                 'hk' => 'zh_HK',
00480                 'eu' => '',
00481                 'bg' => '',
00482                 'br' => 'pt_BR',
00483                 'et' => '',
00484                 'ar' => '',
00485                 'he' => 'iw',
00486                 'ua' => 'uk',
00487                 'jp' => 'ja',
00488                 'lv' => '',
00489                 'vn' => 'vi',
00490                 'ca' => '',
00491                 'ba' => '', // Bosnian
00492                 'kr' => '',
00493         );
00494 
00496         /**
00497          * Normalizes a charset name: lowercases it and replaces a known alias (key of $this->synonyms) by its canonical name.
00498          *
00499          * @param       string          Charset name or alias, in any case
00500          * @return      string          Lowercased name, or the canonical name if the lowercased input is a registered alias
00501          */
00502         function parse_charset($charset)        {
00503                 $name = strtolower($charset);
00504 
00505                         // Registered alias: hand back the canonical name instead
00506                 return isset($this->synonyms[$name]) ? $this->synonyms[$name] : $name;
00507         }
00508 
00511         /**
00512          * Determines a charset for a locale string such as "de_DE", "ja_JP.ujis" or "de_DE@euro".
00513          * Order of checks: exact match in $this->locale_to_charset, explicit ".charset" part (normalized with
00514          * parse_charset()), "@euro" modifier, then language => language family => charset table for the
00515          * current OS (TYPO3_OS). If nothing matches, or the table value is empty, 'iso-8859-1' is returned.
00516          *
00517          * @param       string          Locale string (case-insensitive)
00518          * @return      string          Charset name
00519          */
00521         function get_locale_charset($locale)    {
00522                 $locale = strtolower($locale);
00523 
00524                         // exact locale specific charset?
00525                 if (isset($this->locale_to_charset[$locale]))   return $this->locale_to_charset[$locale];
00526 
00527                         // split off "@modifier"; array_pad() supplies '' for an absent part, so list() never reads a missing offset
00528                 list($locale,$modifier) = array_pad(explode('@',$locale),2,'');
00529 
00530                         // locale contains charset: use it
00531                 list($locale,$charset) = array_pad(explode('.',$locale),2,'');
00532                 if ($charset)   return $this->parse_charset($charset);
00533 
00534                         // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
00535                 if ($modifier == 'euro')        return 'iso-8859-15';
00536 
00537                         // language is the part before "_"; the country part is not used
00538                 list($language) = explode('_',$locale);
00539                 if (isset($this->lang_to_langfamily[$language]))        $language = $this->lang_to_langfamily[$language];
00540 
00541                 if (TYPO3_OS == 'WIN')  {       // isset(): a language/family without table entry simply yields the fallback below
00542                         $cs = isset($this->lang_to_charset_windows[$language]) ? $this->lang_to_charset_windows[$language] : '';
00543                 } else {
00544                         $cs = isset($this->lang_to_charset_unix[$language]) ? $this->lang_to_charset_unix[$language] : '';
00545                 }
00546 
00547                 return $cs ? $cs : 'iso-8859-1';
00548         }
00549 
00550 
00551 
00552 
00553 
00554 
00555 
00556 
00557 
00558         /********************************************
00559          *
00560          * Charset Conversion functions
00561          *
00562          ********************************************/
00563 
00565         /**
00566          * Converts $str from charset $fromCS to charset $toCS.
00567          *
00568          * @param       string          String to convert
00569          * @param       string          Source charset; compared literally with $toCS and 'utf-8', so pass an already normalized name
00570          * @param       string          Target charset, same requirement
00571          * @param       boolean         Passed on to utf8_decode(): if set, chars missing in the target charset become numeric entities
00572          * @return      string          Converted string ($str itself when both charsets are equal)
00573          */
00574         function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
00575                 if ($fromCS==$toCS)     return $str;
00576 
00577                         // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything: so they are
00578                         // only tried if the target is UTF-8 or no entity fallback was requested.
00579                 if ($toCS=='utf-8' || !$useEntityForNoChar)     {
00580                         $method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
00581                         $converted = false;
00582 
00583                         if ($method == 'mbstring')      {
00584                                 $converted = mb_convert_encoding($str,$toCS,$fromCS);
00585                         } elseif ($method == 'iconv')   {
00586                                 $converted = iconv($fromCS,$toCS.'//TRANSLIT',$str);
00587                         } elseif ($method == 'recode')  {
00588                                 $converted = recode_string($fromCS.'..'.$toCS,$str);
00589                         }
00590 
00591                                 // Still false: no known method configured or the library call failed => TYPO3 conversion below
00592                         if (false !== $converted)       return $converted;
00593                 }
00594 
00595                         // TYPO3 conversion, going through UTF-8 as intermediate form
00596                 $utf8 = ($fromCS!='utf-8') ? $this->utf8_encode($str,$fromCS) : $str;
00597                 if ($toCS=='utf-8')     return $utf8;
00598 
00599                 return $this->utf8_decode($utf8,$toCS,$useEntityForNoChar);
00600         }
00602 
00605         /**
00606          * Converts every non-array value of $array with conv(), descending into nested arrays. Works in place on the referenced array.
00607          *
00608          * @param       array           Array to convert (passed by reference and modified)
00609          * @param       string          Source charset, handed to conv()
00610          * @param       string          Target charset, handed to conv()
00611          * @param       boolean         Handed to conv() unchanged
00612          * @return      void
00613          */
00614         function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
00615                 foreach($array as $key => $value)       {
00616                         if (!is_array($value))  {       // Plain value: store the converted form under the same key
00617                                 $array[$key] = $this->conv($value,$fromCS,$toCS,$useEntityForNoChar);
00618                                 continue;
00619                         }
00620                         $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);       // Nested array: recurse on the element itself, not on the copy in $value
00621                 }
00622         }
00623 
00624         /**
00625          * Converts $str from the given charset to UTF-8 using the table in $this->parsedCharsets[$charset]['local'].
00626          *
00627          * @param       string          String in the local charset
00628          * @param       string          Charset name as used for the keys of $twoByteSets / $eucBasedSets (lowercase)
00629          * @return      string          UTF-8 string; nothing (NULL) is returned if initCharset() reports failure
00630          */
00631         function utf8_encode($str,$charset)     {
00632 
00633                         // initCharset() is defined elsewhere; presumably it parses the conversion table if not done already - TODO confirm
00634                 if ($this->initCharset($charset))       {
00635                         $strLen = strlen($str);
00636                         $outStr='';
00637 
00638                         for ($a=0;$a<$strLen;$a++)      {       // Traverse each byte in string.
00639                                 $chr=substr($str,$a,1);
00640                                 $ord=ord($chr);
00641                                 if (isset($this->twoByteSets[$charset]))        {       // If the charset has two bytes per char
00642                                                 // High and low byte are combined with OR ("&" always gave 0 here); big endian is assumed.
00643                                                 // substr() replaces $str{...}: no offset notice on odd length input, a missing low byte counts as 0.
00644                                         $a++;
00645                                         $ord = $ord<<8 | ord(substr($str,$a,1));
00646                                 } elseif ($ord>127)     {       // Byte above 127 in any other charset
00647                                                 // EUC uses two bytes above 127: fetch the second one and make $ord a 16bit int. Shift-JIS
00648                                                 // works the same way, except that bytes 160-223 (half-width katakana) are complete chars.
00649                                         if (isset($this->eucBasedSets[$charset]) && ($charset != 'shift_jis' || $ord<160 || $ord>223)) {
00650                                                 $a++;
00651                                                 $ord = $ord*256+ord(substr($str,$a,1));
00652                                         }
00653                                 } else {        // ASCII 0-127 and one byte. Transparent
00654                                         $outStr.=$chr;
00655                                         continue;
00656                                 }
00657 
00658                                         // If the local char-number is found in the parsed table its UTF-8 string is used, otherwise the "no char" byte
00659                                 if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {
00660                                         $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
00661                                 } else $outStr.=chr($this->noCharByteVal);
00662                         }
00663                         return $outStr;
00664                 }
00665         }
00669 
00670         /**
00671          * Converts a UTF-8 string to the given charset using the table in $this->parsedCharsets[$charset]['utf8'].
00672          *
00673          * @param       string          UTF-8 string
00674          * @param       string          Target charset name
00675          * @param       boolean         If set, chars not found in the table become "&#...;" entities instead of the "no char" byte
00676          * @return      string          Converted string; nothing (NULL) is returned if initCharset() reports failure
00677          */
00678         function utf8_decode($str,$charset,$useEntityForNoChar=0)       {
00679                 if (!$this->initCharset($charset))      return;         // Bare return: NULL, as when no return statement is reached
00680 
00681                 $outStr='';
00682                 $strLen = strlen($str);
00683                 for ($a=0;$a<$strLen;$a++)      {       // Walk the UTF-8 string byte by byte
00684                         $chr=substr($str,$a,1);
00685                         $ord=ord($chr);
00686 
00687                         if ($ord<=127)  {       // ASCII 0-127 and one byte. Transparent
00688                                 $outStr.=$chr;
00689                                 continue;
00690                         }
00691                         if (!($ord & 64))       {       // 7th bit not set: not a first byte, we are in the MIDDLE of a sequence
00692                                 $outStr.=chr($this->noCharByteVal);
00693                                 continue;
00694                         }
00695 
00696                                 // First byte: every further set bit below the top one announces one more byte of the sequence
00697                         $buf=$chr;
00698                         for ($bits=$ord<<1; $bits & 128; $bits=$bits<<1)        {
00699                                 $a++;
00700                                 $buf.=substr($str,$a,1);
00701                         }
00702 
00703                         if (isset($this->parsedCharsets[$charset]['utf8'][$buf]))       {
00704                                 $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
00705                                         // A local number above 255 is split into two bytes (16bit word assumed), high byte first
00706                                 $outStr.= ($mByte>255) ? chr(($mByte >> 8) & 255).chr($mByte & 255) : chr($mByte);
00707                         } elseif ($useEntityForNoChar) {        // Not in the table: create num entity ...
00708                                 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
00709                         } else $outStr.=chr($this->noCharByteVal);      // ... or write the "no char" byte
00710                 }
00711 
00712                 return $outStr;
00713         }
00714 
00715         /**
00716          * Replaces every multibyte UTF-8 sequence in $str by "&#" + utf8CharToUnumber(sequence,1) + ";". ASCII bytes are kept.
00717          *
00718          * @param       string          UTF-8 string
00719          * @return      string          String with entities; a stray continuation byte becomes the "no char" byte
00720          */
00721         function utf8_to_entities($str) {
00722                 $outStr='';
00723                 $strLen = strlen($str);
00724                 for ($a=0;$a<$strLen;$a++)      {       // Walk the UTF-8 string byte by byte
00725                         $chr=substr($str,$a,1);
00726                         $ord=ord($chr);
00727 
00728                         if ($ord<=127)  {       // ASCII 0-127 and one byte. Transparent
00729                                 $outStr.=$chr;
00730                         } elseif ($ord & 64)    {       // First byte of a sequence: every further set bit announces one more byte
00731                                 $buf=$chr;
00732                                 for ($bits=$ord<<1; $bits & 128; $bits=$bits<<1)        {
00733                                         $a++;
00734                                         $buf.=substr($str,$a,1);
00735                                 }
00736                                 $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
00737                         } else {        // 7th bit not set: MIDDLE of a sequence, no char exists
00738                                 $outStr.=chr($this->noCharByteVal);
00739                         }
00740                 }
00741 
00742                 return $outStr;
00743         }
00747         /**
00748          * Replaces numeric entities (&#123; / &#x7b;) in $str by the result of UnumberToChar(); optionally named HTML entities too.
00749          *
00750          * @param       string          Input string
00751          * @param       boolean         If set, entities from get_html_translation_table(HTML_ENTITIES) are converted as well
00752          * @return      string          String with the recognized entities replaced; anything else is left as it was
00753          */
00754         function entities_to_utf8($str,$alsoStdHtmlEnt=0)       {
00755                 $trans_tbl = array();
00756                 if ($alsoStdHtmlEnt)    {       // entity => char. NOTE(review): utf8_encode(...,'iso-8859-1') below expects iso-8859-1 chars; PHP >= 5.4 builds this table in UTF-8 by default - confirm
00757                         $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
00758                 }
00759 
00760                         // preg_split() with captured delimiter replaces ereg_replace() (removed in PHP 7) and the random split token: odd indexes hold the entity names
00761                 $parts = preg_split('/&([#[:alnum:]]*);/',$str,-1,PREG_SPLIT_DELIM_CAPTURE);
00762                 if (!is_array($parts))  return $str;    // PCRE error: leave the input untouched
00763                 foreach($parts as $k => $v)     {
00764                         if (!($k%2))    continue;       // Even index: text between entities
00765                         if (substr($v,0,1)=='#')        {       // Dec or hex entities:
00766                                 if (strtolower(substr($v,1,1))=='x')    {       // "x" and "X" both mark a hex entity
00767                                         $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
00768                                 } else {
00769                                         $parts[$k] = $this->UnumberToChar(substr($v,1));
00770                                 }
00771                         } elseif (isset($trans_tbl['&'.$v.';']))        {       // Other entities (table is empty unless $alsoStdHtmlEnt):
00772                                 $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
00773                         } else {        // No conversion: put the entity back as it was
00774                                 $parts[$k] ='&'.$v.';';
00775                         }
00776                 }
00777                 return implode('',$parts);
00778         }
00779 
00780         /**
00781          * Splits a UTF-8 string into an array with one entry per character.
00782          *
00783          * @param       string          UTF-8 string
00784          * @param       boolean         If set, $str is first passed through entities_to_utf8($str,1)
00785          * @param       boolean         If set, entries are the character strings; otherwise byte values / utf8CharToUnumber() results
00786          * @return      array           One entry per character; a stray continuation byte yields the "no char" value
00787          */
00788         function utf8_to_numberarray($str,$convEntities=0,$retChar=0)   {
00789                 if ($convEntities)      $str = $this->entities_to_utf8($str,1); // Entities must be registered as well
00790 
00791                 $outArr=array();
00792                 $strLen = strlen($str);
00793                 for ($a=0;$a<$strLen;$a++)      {       // Walk the UTF-8 string byte by byte
00794                         $chr=substr($str,$a,1);
00795                         $ord=ord($chr);
00796 
00797                         if ($ord<=127)  {       // ASCII 0-127 and one byte. Transparent
00798                                 $outArr[]=$retChar?chr($ord):$ord;
00799                                 continue;
00800                         }
00801                         if (!($ord & 64))       {       // 7th bit not set: MIDDLE of a sequence, no char exists
00802                                 $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal;
00803                                 continue;
00804                         }
00805 
00806                         $buf=$chr;      // First byte: every further set bit announces one more byte of the sequence
00807                         for ($bits=$ord<<1; $bits & 128; $bits=$bits<<1)        {
00808                                 $a++;
00809                                 $buf.=substr($str,$a,1);
00810                         }
00811                         $outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf);
00812                 }
00813 
00814                 return $outArr;
00815         }
00818 
/**
 * Converts a Unicode number into its UTF-8 byte sequence.
 * Values up to 0x7FFFFFFF are encoded, which includes the historical 5 and 6 byte forms.
 * Input that is not numeric, is negative or is 0x80000000 and above cannot be expressed and
 * yields the "no char" byte ($this->noCharByteVal) instead.
 *
 * @param	integer		Unicode number; numeric strings and floats (eg. from hexdec()) are accepted
 * @return	string		UTF-8 byte sequence (1-6 bytes) or the "no char" byte
 */
function UnumberToChar($cbyte)	{
		// Validate BEFORE casting: hexdec() may deliver floats beyond the integer range, and callers
		// pass substrings of entities which would otherwise reach chr() / ">>" as arbitrary strings.
	if (!is_numeric($cbyte) || $cbyte < 0 || $cbyte >= 0x80000000)	{
		return chr($this->noCharByteVal);
	}
	$cbyte = (int)$cbyte;

	if ($cbyte < 0x80)	{	// ASCII: single byte, stored as is
		return chr($cbyte);
	}

		// Pick the number of continuation bytes and the matching lead byte marker
	if ($cbyte < 0x800)	{
		$trailing = 1;
		$leadMark = 0xC0;
	} elseif ($cbyte < 0x10000)	{
		$trailing = 2;
		$leadMark = 0xE0;
	} elseif ($cbyte < 0x200000)	{
		$trailing = 3;
		$leadMark = 0xF0;
	} elseif ($cbyte < 0x4000000)	{
		$trailing = 4;
		$leadMark = 0xF8;
	} else {
		$trailing = 5;
		$leadMark = 0xFC;
	}

		// Lead byte carries the topmost bits, each continuation byte the next 6 bits (10xxxxxx)
	$str = chr($leadMark | ($cbyte >> (6*$trailing)));
	for ($shift = 6*($trailing-1); $shift >= 0; $shift -= 6)	{
		$str.= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $str;
}
00873 
/**
 * Converts one UTF-8 character (byte sequence) into its Unicode number.
 * Only the first character of $str is looked at. If the first byte is not a usable lead byte
 * (plain ASCII, a continuation byte, or 0xFE/0xFF) the value of that byte is returned.
 * Missing continuation bytes count as zero bits; continuation bytes are not validated.
 *
 * @param	string		UTF-8 character (1-6 bytes)
 * @param	boolean		If set, the number is returned as a hexadecimal string prefixed with "x" (eg. "xe9")
 * @return	mixed		Unicode number as integer, or "x"-prefixed hex string if $hex is set
 */
function utf8CharToUnumber($str,$hex=0)	{
	$lead = ord(substr($str,0,1));
	$int = $lead;

		// 11xxxxxx marks the start of a multi byte sequence. 0xFE and 0xFF have no payload bits
		// and are no valid lead bytes, so they are passed through as raw byte values.
	if (($lead & 0xC0) == 0xC0 && $lead < 0xFE)	{
			// Every further 1-bit after the first two announces one more continuation byte
		$trailing = 1;
		while ($lead & (0x40 >> $trailing))	{
			$trailing++;
		}

			// Payload of the lead byte is what remains below the length marker ...
		$int = $lead & (0x3F >> $trailing);
			// ... followed by the lower 6 bits of each continuation byte
		for ($i = 1; $i <= $trailing; $i++)	{
			$int = ($int << 6) | (ord(substr($str,$i,1)) & 0x3F);
		}
	}

	return $hex ? 'x'.dechex($int) : $int;
}
00901 
00902 
00903 
00904 
00905 
00906 
00907 
00908 
00909 
00910         /********************************************
00911          *
00912          * Init functions
00913          *
00914          ********************************************/
00915 
/**
 * Loads the conversion table of a charset into $this->parsedCharsets[$charset].
 * The table is read from PATH_t3lib.'csconvtbl/[charset].tbl' and parsed into two maps:
 * 'local' (charset value => UTF-8 char) and 'utf8' (UTF-8 char => charset value). Only values above 127 are stored.
 * The parsed result is cached as a serialized file below typo3temp/cs/.
 *
 * Two table syntaxes are understood, detected on the first data line:
 *   "whitespaced":	0x0A   0x000A  #LINE FEED
 *   "ms-token":	B9 = U+00B9 : SUPERSCRIPT ONE
 *
 * @param	string		Charset name; used as part of the file names
 * @return	mixed		1 if the table was already loaded, 2 if it was loaded now (from cache or by parsing), false if no conversion table exists
 */
function initCharset($charset)	{
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset]))	{
		return 1;
	}

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)))	{
		return false;
	}

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
			// NOTE(review): unserialize() trusts the content of the cache file - the directory must not be writable by untrusted parties.
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
			// Unreadable / corrupt cache: fall through, parse the table again and rewrite the cache
	}

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
		// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local'=>array(),'utf8'=>array());

	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
	$detectedType = '';
	foreach($lines as $value)	{
			// Comment lines and blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#')	{
			continue;
		}

			// Detect type if not done yet (done on the first real line)
		if (!$detectedType)	{
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token')	{
			list($hexbyte,$utf8) = array_pad(preg_split('/[=:]/',$value,3),2,'');
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA))	{
				continue;	// Line does not follow the table syntax - nothing to register
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127)	{
			$utf8decval = hexdec(substr(trim($utf8),2));	// strip the "U+" prefix
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
00978 
/**
 * Builds the UTF-8 case folding table ($this->caseFolding['utf-8']) and the UTF-8 to ASCII
 * transliteration table ($this->toASCII['utf-8']) from the Unicode data files below PATH_t3lib.'unidata/':
 *   UnicodeData.txt	(required) case mappings, categories, decompositions, numeric values; only the BMP is processed
 *   SpecialCasing.txt	(optional) case mappings that expand to several characters
 *   Translit.txt		(optional) custom decompositions
 * Whenever the files are parsed BOTH tables are built and written to their cache files, independent of $mode.
 * $mode only selects which table is checked for "already loaded" and which cache file is tried first.
 *
 * @param	string		'case' or 'ascii' to try the in-memory table / cache file of that kind first; anything else forces parsing
 * @return	mixed		1 if the requested table was already loaded, 2 if it was read from cache, 3 if the data files were parsed, false on failure
 */
function initUnicodeData($mode=null)	{
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode)	{
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']))	return 1;

				// Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
			if ($cacheFileCase && @is_file($cacheFileCase))	{
				$cached = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cached))	{
					$this->caseFolding['utf-8'] = $cached;
					return 2;
				}
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8']))	return 1;

				// Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
			if ($cacheFileASCII && @is_file($cacheFileASCII))	{
				$cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cached))	{
					$this->toASCII['utf-8'] = $cached;
					return 2;
				}
			}
			break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))	return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh)	return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh))	{
		$line = fgets($fh,4096);
		if ($line === false)	break;		// nothing more to read
		$line = rtrim($line);
		if ($line === '')	continue;	// blank lines carry no data

			// A line has a lot of info; pad it so that short lines do not produce undefined offsets
		$fields = array_pad(explode(';',$line),15,'');
		$char = $fields[0];		// code point (hex)
		$name = $fields[1];		// character name
		$cat = $fields[2];		// general category
		$decomp = $fields[5];	// decomposition, optionally prefixed with a <tag>
		$num = $fields[8];		// numeric value
		$upper = $fields[12];	// simple uppercase mapping
		$lower = $fields[13];	// simple lowercase mapping
		$title = $fields[14];	// simple titlecase mapping

		$ord = hexdec($char);
		if ($ord > 0xFFFF)	break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper)	$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower)	$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title !== $upper)	$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

		switch (substr($cat,0,1))	{
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num !== '')	$number["U+$char"] = $num;
				break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp)	{
			$c = ord($match[2]);
			if ($match[1] == 'SMALL')	$c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match))	{
			switch($match[1])	{
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
					break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
					break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (strncmp($match[2],'0020 ',5) == 0)	continue 2;
					break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))	{
		$fh = fopen($specialCasingFile,'rb');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh,4096);
				if ($line === false)	break;
				if (substr($line,0,1) == '#' || trim($line) == '')	continue;	// comment or blank line

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';',$line),5,'');
					// Only unconditional mappings are used: the 5th field is either empty or already the trailing comment
				if ($cond != '' && substr($cond,0,1) != '#')	continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));

					// "title" is only stored when it differs from "upper", like above
				$mappings = array('toLower' => $lower, 'toUpper' => $upper);
				if ($title !== $upper)	$mappings['toTitle'] = $title;

				foreach($mappings as $case => $codes)	{
					if ($char !== $codes)	{	// identity mappings are not stored
						$sequence = '';
						foreach(explode(' ',$codes) as $code)	{
							$sequence.= $this->UnumberToChar(hexdec($code));
						}
						$utf8CaseFolding[$case][$utf8_char] = $sequence;
					}
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))	{
		$fh = fopen($customTranslitFile,'rb');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh,4096);
				if ($line === false)	break;
				if (substr($line,0,1) == '#' || trim($line) == '')	continue;	// comment or blank line

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';',$line),2,'');
				if (!$translit)	$omit["U+$char"] = 1;	// empty replacement: the char is to be dropped
				$decomposition["U+$char"] = explode(' ',$translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to)	{
		$code_decomp = array();

		while ($code_value = array_shift($to))	{
			if (isset($decomposition["U+$code_value"]))	{	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv)	{
					array_unshift($to,$cv);
				}
			} elseif (!isset($mark["U+$code_value"]))	{	// remove mark
				array_push($code_decomp,$code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from]))	{
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to)	{
		$code_decomp = array();
		while ($code_value = array_shift($to))	{
			$ord = hexdec($code_value);
			if ($ord > 127)	{
				continue 2;	// skip decompositions containing non-ASCII chars
			}
			array_push($code_decomp,chr($ord));
		}
			// keys look like "U+00C0": cut off the "U+" so hexdec() only sees hex digits
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = implode('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to)	{
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char]))	{
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
01204 
/**
 * Builds the case folding table of a charset ($this->caseFolding[$charset]) by mapping the
 * UTF-8 case folding table through the charset's conversion table. The ASCII letters a-z / A-Z are added on top.
 * The result has the keys 'toUpper', 'toLower' and 'toTitle' and is cached below typo3temp/cs/.
 *
 * @param	string		Charset name; used as part of the cache file name
 * @return	mixed		1 if the table was already loaded, 2 if it was read from cache, 3 if it was built, false if the charset or the Unicode data could not be initialized
 */
function initCaseFolding($charset)	{
		// Only process if the case table is not yet loaded:
	if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset]))	return 1;

		// Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->caseFolding[$charset] = $cached;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8 case folding is used as the base conversion table
	if (!$this->initUnicodeData('case'))	{
		return false;
	}

	$nochar = chr($this->noCharByteVal);
	$cases = array('toUpper','toLower','toTitle');
	$folding = array('toUpper'=>array(),'toLower'=>array(),'toTitle'=>array());
	$utf8Folding = $this->caseFolding['utf-8'];

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8,$charset);

		foreach ($cases as $case)	{
				// Most characters have no mapping for a given case - nothing to convert then
			if (!isset($utf8Folding[$case][$utf8]))	continue;

			$cc = $this->utf8_decode($utf8Folding[$case][$utf8],$charset);
				// Only keep mappings whose target exists in this charset
			if ($cc != '' && $cc != $nochar)	$folding[$case][$c] = $cc;
		}
	}

		// add the ASCII case table
	for ($i=ord('a'); $i<=ord('z'); $i++)	{
		$folding['toUpper'][chr($i)] = chr($i-32);
	}
	for ($i=ord('A'); $i<=ord('Z'); $i++)	{
		$folding['toLower'][chr($i)] = chr($i+32);
	}

	$this->caseFolding[$charset] = $folding;

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
01266 
/**
 * Builds the charset-to-ASCII transliteration table of a charset ($this->toASCII[$charset]) by
 * mapping the UTF-8 transliteration table through the charset's conversion table.
 * The result is cached below typo3temp/cs/. The table is always an array, also when no character of the charset has a transliteration.
 *
 * @param	string		Charset name; used as part of the cache file name
 * @return	mixed		1 if the table was already loaded, 2 if it was read from cache, 3 if it was built, false if the charset or the Unicode data could not be initialized
 */
function initToASCII($charset)	{
		// Only process if the table is not yet loaded:
	if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset]))	return 1;

		// Use cached version if possible (a corrupt cache file is ignored and rebuilt below)
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->toASCII[$charset] = $cached;
			return 2;
		}
	}

		// init UTF-8 conversion for this charset
	if (!$this->initCharset($charset))	{
		return false;
	}

		// UTF-8/ASCII transliteration is used as the base conversion table
	if (!$this->initUnicodeData('ascii'))	{
		return false;
	}

		// Start from an empty table so that "no mappings at all" is still a loaded (and cacheable) array
	$table = array();
	$utf8ToASCII = $this->toASCII['utf-8'];

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
		if (isset($utf8ToASCII[$utf8]))	{
				// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
			$c = $this->utf8_decode($utf8,$charset);
			$table[$c] = $utf8ToASCII[$utf8];
		}
	}

	$this->toASCII[$charset] = $table;

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
01312 
01313 
01314 
01315 
01316 
01317 
01318 
01319 
01320 
01321 
01322 
01323 
01324 
01325 
01326 
01327 
01328         /********************************************
01329          *
01330          * String operation functions
01331          *
01332          ********************************************/
01333 
/**
 * Returns a part of a string, counted in characters of the given charset (multi-byte aware substr()).
 * If $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] is 'mbstring' the mbstring extension does the work;
 * otherwise UTF-8 and EUC based charsets go to the class' own routines, fixed-width charsets are
 * cut by byte arithmetic and everything else is treated as a single-byte charset.
 *
 * @param	string		Charset of the string
 * @param	string		Input string
 * @param	integer		Start position (character position)
 * @param	integer		Length in characters; null (default) means "to the end of the string"
 * @return	string		The substring
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	$useMbstring = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
		&& $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring';

	if ($useMbstring)	{
			// The loose comparison is kept on purpose so that '' / false keep meaning "no length" for existing callers
		if ($len==null)	{
				// The length cannot be omitted when passing a charset, so switch the internal encoding for this one call
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	}
	if (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	}

		// Fixed-width charsets: scale the character positions to byte positions.
		// A missing length must NOT be scaled - null*N is 0 and would yield an empty string.
	$bytesPerChar = 1;	// everything else is treated as single-byte encoding
	if (!empty($this->twoByteSets[$charset]))	{
		$bytesPerChar = 2;
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$bytesPerChar = 4;
	}

	if ($len === null)	{
		return substr($string,$start*$bytesPerChar);
	}
	return substr($string,$start*$bytesPerChar,$len*$bytesPerChar);
}
01373 
/**
 * Counts the characters of a string in the given charset (multi-byte aware strlen()).
 * If $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] is 'mbstring' the mbstring extension does the work;
 * otherwise UTF-8 and EUC based charsets go to the class' own routines, fixed-width charsets are
 * computed from the byte length and everything else is treated as a single-byte charset.
 *
 * @param	string		Charset of the string
 * @param	string		Input string
 * @return	integer		Number of characters; for fixed-width charsets an incomplete trailing character is not counted
 */
function strlen($charset,$string)	{
	$useMbstring = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
		&& $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring';

	if ($useMbstring)	{
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	}
	if (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_strlen($string,$charset);
	}
	if (!empty($this->twoByteSets[$charset]))	{
			// cast: "/" returns a float for odd byte lengths
		return (int)(strlen($string)/2);
	}
	if (!empty($this->fourByteSets[$charset]))	{
		return (int)(strlen($string)/4);
	}

		// treat everything else as single-byte encoding
	return strlen($string);
}
01399 
/**
 * Crops a string to a number of characters and adds a marker where it was cut.
 *
 * A positive $len keeps the first $len characters and appends $crop; a
 * negative $len keeps the last abs($len) characters and prepends $crop.
 * The string is returned untouched when $len is 0 or when it is not longer
 * than abs($len) characters.
 *
 * Only 'utf-8' and the charsets listed in $this->eucBasedSets are handled
 * character-wise; every other charset is processed byte-wise.
 *
 * @param	string		Charset of $string
 * @param	string		String to crop
 * @param	integer		Number of characters to keep (negative: counted from the end)
 * @param	string		Marker to add at the cut position
 * @return	string		The cropped string, or the unchanged input
 */
function crop($charset,$string,$len,$crop='')	{
	$len = intval($len);
	if ($len == 0)	return $string;

		// Translate the character count into a byte offset
	if ($charset == 'utf-8')	{
		$i = $this->utf8_char2byte_pos($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		$i = $this->euc_char2byte_pos($string,$len,$charset);
	} elseif ($len > 0)	{
		$i = $len;
	} else {
		$i = strlen($string)+$len;
		if ($i<=0)	$i = false;
	}

	if ($i === false)	return $string;	// $len outside actual string length

	$byteLen = strlen($string);
	if ($len > 0)	{
			// Only crop if there is at least one byte left after the cut
		if ($i < $byteLen)	return substr($string,0,$i).$crop;
	} elseif ($i >= 1 && $i <= $byteLen)	{
			// Only crop if there is at least one byte in front of the cut
		return $crop.substr($string,$i);
	}

	return $string;
}
01454 
/**
 * Truncates a string to at most $len BYTES without cutting a character in half.
 *
 * Delegates to mb_strcut() when $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] is
 * set to 'mbstring'; otherwise dispatches on the charset family.
 *
 * @param	string		Charset of $string
 * @param	string		String to truncate
 * @param	integer		Maximum length in bytes
 * @return	string		The truncated string; empty string if $len <= 0
 */
function strtrunc($charset,$string,$len)	{
	if ($len <= 0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strcut($string,0,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_strtrunc($string,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
			// euc_strtrunc() expects ($str,$len,$charset); $len must not be omitted
		return $this->euc_strtrunc($string,$len,$charset);
	} elseif (!empty($this->twoByteSets[$charset]))	{
		if ($len % 2)	$len--;		// don't cut at odd positions
	} elseif (!empty($this->fourByteSets[$charset]))	{
		$len -= $len % 4;	// realign to position dividable by four
	}

		// fixed-width and single-byte encodings: plain byte cut at the (re)aligned length
	return substr($string,0,$len);
}
01483 
/**
 * Converts a string to lower or upper case in the given charset.
 *
 * Uses mb_strtolower()/mb_strtoupper() when
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] is 'mbstring' and those functions
 * are available; otherwise falls back to the class' own mapping routines.
 *
 * @param	string		Charset of $string
 * @param	string		String to convert
 * @param	string		'toLower' for lower case; any other value selects upper case in the mbstring branch and is passed on as the case-folding key otherwise
 * @return	string		The converted string
 */
function conv_case($charset,$string,$case)	{
		// function_exists() replaces the former PHP version comparison: it tests directly for what is needed
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && function_exists('mb_strtolower'))	{
		if ($case == 'toLower')	{
			return mb_strtolower($string,$charset);
		}
		return mb_strtoupper($string,$charset);
	}

	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'case',$case);
	}
	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'case',$case);
	}

		// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string,$charset,'case',$case);
}
01517 
/**
 * Maps special characters of a string to ASCII equivalents by handing the
 * string to the 'ascii' mode of the mapping routine for its charset family.
 *
 * @param	string		Charset of $string
 * @param	string		String to convert
 * @return	string		Result of the selected *_char_mapping() call
 */
function specCharsToASCII($charset,$string)	{
	if ($charset == 'utf-8')	{
		return $this->utf8_char_mapping($string,'ascii');
	}

	if (isset($this->eucBasedSets[$charset]))	{
		return $this->euc_char_mapping($string,$charset,'ascii');
	}

		// anything that is neither UTF-8 nor EUC-based is handled as a single-byte charset
	return $this->sb_char_mapping($string,$charset,'ascii');
}
01537 
01538 
01539 
01540 
01541 
01542 
01543 
01544 
01545 
01546 
01547 
01548 
01549         /********************************************
01550          *
01551          * Internal string operation functions
01552          *
01553          ********************************************/
01554 
/**
 * Maps every byte of a single-byte encoded string through a lookup table.
 *
 * @param	string		String to convert
 * @param	string		Charset of $str
 * @param	string		Mode: 'case' (uses $this->caseFolding[$charset][$opt]) or 'ascii' (uses $this->toASCII[$charset])
 * @param	string		Key into the case folding table; only used in 'case' mode
 * @return	string		The converted string; the unchanged input if the mode is unknown or the table could not be initialised
 */
function sb_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$out = '';
	$byteLen = strlen($str);	// explicit bound instead of probing past the end of the string
	for ($i=0; $i<$byteLen; $i++)	{
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;	// bytes without a table entry pass through unchanged
	}

	return $out;
}
01593 
01594 
01595 
01596 
01597 
01598 
01599 
01600 
01601 
01602 
01603         /********************************************
01604          *
01605          * Internal UTF-8 string operation functions
01606          *
01607          ********************************************/
01608 
/**
 * Returns a part of a UTF-8 string, with $start and $len counted in characters.
 *
 * @param	string		UTF-8 string
 * @param	integer		Start position in characters (negative: counted from the end)
 * @param	integer		Length in characters (negative: number of characters to leave off at the end); null for "to the end"
 * @return	mixed		The substring, or false if a positive $start lies outside the string
 */
function utf8_substr($str,$start,$len=null)	{
		// explicit zero length; the null check keeps null out of the string comparison
	if ($len !== null && (string)$len === '0')	return '';

	$byte_start = $this->utf8_char2byte_pos($str,$start);
	if ($byte_start === false)	{
		if ($start > 0)	{
			return false;	// $start outside string length
		}
			// zero/negative start reaching beyond the beginning: begin at the first byte
		$byte_start = 0;
	}

	$str = substr($str,$byte_start);

	if ($len != null)	{	// loose comparison on purpose: null and '' both mean "no length given"
		$byte_end = $this->utf8_char2byte_pos($str,$len);
		if ($byte_end === false)	{	// $len outside actual string length
			return $len<0 ? '' : $str;	// a negative length exceeding the string leaves nothing
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
01643 
/**
 * Counts the characters of a UTF-8 string.
 *
 * Every byte that is not a continuation byte (10xxxxxx) is counted as the
 * start of one character; the input is not validated.
 *
 * @param	string		UTF-8 string
 * @return	integer		Number of characters
 */
function utf8_strlen($str)	{
	$n = 0;
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++)	{
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}
	return $n;
}
01664 
/**
 * Truncates a UTF-8 string to at most $len BYTES without splitting a
 * multi-byte character.
 *
 * If the byte following the cut is a continuation byte (10xxxxxx), the cut
 * would land inside a character; the cut is then moved back to the start of
 * that character so it is dropped completely.
 *
 * @param	string		UTF-8 string
 * @param	integer		Maximum length in bytes
 * @return	string		The leading part of $str, at most $len bytes long; $str itself if it is not longer than $len bytes
 */
function utf8_strtrunc($str,$len)	{
	if ($len <= 0)	return '';

	$byteLen = strlen($str);
	if ($len >= $byteLen)	return $str;	// nothing to cut off

		// $str[$len] is the first byte that would be dropped
	$i = $len;
	while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80)	{
		$i--;	// walk back to the starting byte of the split character
	}

	return substr($str,0,$i);
}
01685 
/**
 * Finds the first occurrence of $needle in a UTF-8 string; offset and result
 * are counted in characters.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @param	integer		Character position to start the search at
 * @return	mixed		Character position of the match, or false if not found or the offset lies outside the string
 */
function utf8_strpos($haystack,$needle,$offset=0)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// signature is mb_strpos(haystack, needle, offset, encoding)
		return mb_strpos($haystack,$needle,$offset,'utf-8');
	}

	$byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
	if ($byte_offset === false)	return false; // offset beyond string length

	$byte_pos = strpos($haystack,$needle,$byte_offset);
	if ($byte_pos === false)	return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
01709 
/**
 * Finds the last occurrence of $needle in a UTF-8 string; the result is
 * counted in characters.
 *
 * @param	string		UTF-8 string to search in
 * @param	string		UTF-8 string to search for
 * @return	mixed		Character position of the match, or false if not found
 */
function utf8_strrpos($haystack,$needle)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// signature is mb_strrpos(haystack, needle, offset, encoding); the encoding is the 4th argument
		return mb_strrpos($haystack,$needle,0,'utf-8');
	}

	$byte_pos = strrpos($haystack,$needle);
	if ($byte_pos === false)	return false; // needle not found

	return $this->utf8_byte2char_pos($haystack,$byte_pos);
}
01729 
/**
 * Translates a character position in a UTF-8 string into a byte position.
 *
 * For $pos >= 0 the string is scanned forward and the byte offset following
 * $pos characters is returned. For $pos < 0 the string is scanned backward
 * and the byte offset of the abs($pos)-th character from the end is returned.
 *
 * @param	string		UTF-8 string
 * @param	integer		Character position (negative: counted from the end)
 * @return	mixed		Byte position, or false if the scan runs off either end of the string
 */
function utf8_char2byte_pos($str,$pos)	{
	$byteLen = strlen($str);
	$n = 0;				// number of characters found
	$p = abs($pos);		// number of characters wanted

	if ($pos >= 0)	{
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLen-1;
		$d = -1;
	}

		// Explicit bounds: a negative string offset must end the scan, not wrap around to the end of the string
	for ( ; $i>=0 && $i<$byteLen && $n<$p; $i+=$d)	{
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}
	if ($i<0 || $i>=$byteLen)	return false; // offset beyond string length

	if ($pos >= 0)	{
			// skip trailing multi-byte data bytes; may stop at strlen($str) if the string ends with that character
		while ($i<$byteLen && (ord($str[$i]) & 0xC0) == 0x80)	{ $i++; }
	} else {
			// correct offset: the scan stopped one byte in front of the character
		$i++;
	}

	return $i;
}
01770 
/**
 * Translates a byte position in a UTF-8 string into a character position.
 *
 * @param	string		UTF-8 string
 * @param	integer		Byte position
 * @return	mixed		Character position, or false if $pos is not a valid byte offset inside $str
 */
function utf8_byte2char_pos($str,$pos)	{
		// Reject offsets outside the string up front; reading past the end must not be counted as characters
	if ($pos < 0 || $pos >= strlen($str))	return false; // offset beyond string length

	$n = 0;	// number of characters
		// Count the character starts in bytes 1..$pos; byte 0 is the start of character 0 and is not counted
	for ($i=$pos; $i>0; $i--)	{
			// single-byte (0xxxxxxx) or multi-byte starting byte (11xxxxxx)
		if ((ord($str[$i]) & 0xC0) != 0x80)	$n++;
	}

	return $n;
}
01793 
/**
 * Maps every character of a UTF-8 string through a lookup table.
 *
 * @param	string		UTF-8 string
 * @param	string		Mode: 'case' (uses $this->caseFolding['utf-8'][$opt]) or 'ascii' (uses $this->toASCII['utf-8'])
 * @param	string		Key into the case folding table; only used in 'case' mode
 * @return	string		The converted string; the unchanged input if the mode is unknown or the table data could not be initialised
 */
function utf8_char_mapping($str,$mode,$opt='')	{
	if (!$this->initUnicodeData($mode))	return $str;	// do nothing

	switch($mode)	{
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;

		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;

		default:
			return $str;
	}

	$out = '';
	$byteLen = strlen($str);
	for ($i=0; $i<$byteLen; $i++)	{
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0)	{	// multi-byte starting byte (11xxxxxx)
			for ($bc=0; $c & 0x80; $c = $c << 1)	{ $bc++; }	// the number of leading 1-bits is the sequence length in bytes
			$mbc = substr($str,$i,$bc);
			$i += $bc-1;
		} else {
				// single-byte (0xxxxxxx); a stray continuation byte is likewise passed on as-is
				// instead of repeating the previously processed character
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}

	return $out;
}
01839 
01840 
01841 
01842 
01843 
01844 
01845 
01846 
01847 
01848 
01849 
01850 
01851 
01852 
01853 
01854 
01855 
01856 
01857         /********************************************
01858          *
01859          * Internal EUC string operation functions
01860          *
01861          * Extended Unix Code:
01862          *  ASCII compatible 7bit single bytes chars
01863          *  8bit two byte chars
01864          *
01865          * Shift-JIS is treated as a special case.
01866          *
01867          ********************************************/
01868 
/**
 * Truncates an EUC-style (or Shift-JIS) string to at most $len BYTES without
 * splitting a double-byte character.
 *
 * For 'shift_jis' the bytes 0x80-0x9F and 0xE0-0xFF are taken as the first
 * byte of a double-byte character; for all other charsets every byte
 * >= 0x80 is.
 *
 * @param	string		String to truncate
 * @param	integer		Maximum length in bytes
 * @param	string		Charset of $str
 * @return	string		The leading part of $str, at most $len bytes long; $str itself if it is not longer than $len bytes
 */
function euc_strtrunc($str,$len,$charset)	{
	if ($len <= 0)	return '';
	if ($len >= strlen($str))	return $str;	// string not longer than supplied length

	$sjis = ($charset == 'shift_jis');
		// $len < strlen($str) from here on, so $str[$i] is always inside the string
	for ($i=0; $i<$len; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} elseif ($c >= 0x80)	{
			$i++;	// advance a double-byte char
		}
	}

		// $i > $len means the last character scanned straddles the limit: cut in front of it
	return substr($str,0,($i>$len ? $len-1 : $len));
}
01897 
/**
 * Returns a part of an EUC-style (or Shift-JIS) string, with $start and $len
 * counted in characters.
 *
 * @param	string		String to cut from
 * @param	integer		Start position in characters (negative: counted from the end)
 * @param	string		Charset of $str
 * @param	integer		Length in characters (negative: number of characters to leave off at the end); null for "to the end"
 * @return	mixed		The substring, or false if $start lies outside the string
 */
function euc_substr($str,$start,$charset,$len=null)	{
		// explicit zero length yields an empty string, as in utf8_substr()
	if ($len !== null && (string)$len === '0')	return '';

	$byte_start = $this->euc_char2byte_pos($str,$start,$charset);
	if ($byte_start === false)	return false;	// $start outside string length

	$str = substr($str,$byte_start);

	if ($len != null)	{	// loose comparison on purpose: null and '' both mean "no length given"
		$byte_end = $this->euc_char2byte_pos($str,$len,$charset);
		if ($byte_end === false)	{	// $len outside actual string length
			return $len<0 ? '' : $str;	// a negative length exceeding the string leaves nothing
		}
		return substr($str,0,$byte_end);
	}

	return $str;
}
01923 
/**
 * Counts the characters of an EUC-style (or Shift-JIS) string.
 *
 * For 'shift_jis' the bytes 0x80-0x9F and 0xE0-0xFF are taken as the first
 * byte of a double-byte character; for all other charsets every byte
 * >= 0x80 is.
 *
 * @param	string		String to measure
 * @param	string		Charset of $str
 * @return	integer		Number of characters
 */
function euc_strlen($str,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$n = 0;
	for ($i=0; $i<$byteLen; $i++)	{
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} elseif ($c >= 0x80)	{
			$i++;	// advance a double-byte char
		}

		$n++;
	}

	return $n;
}
01950 
/**
 * Translates a character position in an EUC-style (or Shift-JIS) string into
 * a byte position.
 *
 * For $pos >= 0 the byte offset following $pos characters is returned. For
 * $pos < 0 the byte offset of the abs($pos)-th character from the end is
 * returned. The string is always scanned forward: the second byte of a
 * Shift-JIS character cannot be told apart from a single-byte character when
 * scanning backward.
 *
 * @param	string		String to scan
 * @param	integer		Character position (negative: counted from the end)
 * @param	string		Charset of $str
 * @return	mixed		Byte position, or false if the position lies outside the string (this includes abs($pos) being equal to the number of characters)
 */
function euc_char2byte_pos($str,$pos,$charset)	{
	$sjis = ($charset == 'shift_jis');
	$byteLen = strlen($str);
	$p = abs($pos);	// number of characters wanted

	if ($pos >= 0)	{
		$n = 0;	// number of characters seen
		for ($i=0; $i<$byteLen && $n<$p; $i++)	{
			$c = ord($str[$i]);
			if ($sjis)	{
				if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
			} elseif ($c >= 0x80)	{
				$i++;	// advance a double-byte char
			}
			$n++;
		}
		return $i<$byteLen ? $i : false;	// false: offset beyond string length
	}

		// Negative position: record the start offset of every character, then index from the end
	$starts = array();
	for ($i=0; $i<$byteLen; $i++)	{
		$starts[] = $i;
		$c = ord($str[$i]);
		if ($sjis)	{
			if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))	$i++;	// advance a double-byte char
		} elseif ($c >= 0x80)	{
			$i++;	// advance a double-byte char
		}
	}
	$count = count($starts);

	return $p<$count ? $starts[$count-$p] : false;	// false: offset beyond string length
}
01990 
/**
 * Maps every character of an EUC-style (or Shift-JIS) string through a
 * lookup table.
 *
 * @param	string		String to convert
 * @param	string		Charset of $str
 * @param	string		Mode: 'case' (uses $this->caseFolding[$charset][$opt]) or 'ascii' (uses $this->toASCII[$charset])
 * @param	string		Key into the case folding table; only used in 'case' mode
 * @return	string		The converted string; the unchanged input if the mode is unknown or the table could not be initialised
 */
function euc_char_mapping($str,$charset,$mode,$opt='')	{
	switch($mode)	{
		case 'case':
			if (!$this->initCaseFolding($charset))	return $str;	// do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;

		case 'ascii':
			if (!$this->initToASCII($charset))	return $str;	// do nothing
			$map =& $this->toASCII[$charset];
			break;

		default:
			return $str;
	}

	$sjis = ($charset == 'shift_jis');
	$out = '';
	$byteLen = strlen($str);	// explicit bound instead of probing past the end of the string
	for ($i=0; $i<$byteLen; $i++)	{
		$mbc = $str[$i];
		$c = ord($mbc);

			// Shift-JIS: 0x80-0x9F and 0xE0-0xFF start a double-byte char; other charsets: every byte >= 0x80 does
		if ($sjis)	{
			$isDoubleByte = (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0));
		} else {
			$isDoubleByte = ($c >= 0x80);
		}

		if ($isDoubleByte)	{
			$mbc = substr($str,$i,2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;	// characters without a table entry pass through unchanged
	}

	return $out;
}
02045 
02046 }
02047 
// Include the extension class file for this script, if one is configured
// under $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']. !empty() keeps the lookup
// silent when no such entry exists.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']))	{
	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
}
02051 ?>


Généré par Le spécialiste TYPO3 avec  doxygen 1.4.6