Documentation TYPO3 par Ameos

class.t3lib_cs.php

00001 <?php
00002 /***************************************************************
00003 *  Copyright notice
00004 *
00005 *  (c) 2003-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
00006 *  All rights reserved
00007 *
00008 *  This script is part of the Typo3 project. The Typo3 project is
00009 *  free software; you can redistribute it and/or modify
00010 *  it under the terms of the GNU General Public License as published by
00011 *  the Free Software Foundation; either version 2 of the License, or
00012 *  (at your option) any later version.
00013 *
00014 *  The GNU General Public License can be found at
00015 *  http://www.gnu.org/copyleft/gpl.html.
00016 *
00017 *  This script is distributed in the hope that it will be useful,
00018 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00019 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020 *  GNU General Public License for more details.
00021 *
00022 *  This copyright notice MUST APPEAR in all copies of the script!
00023 ***************************************************************/
00136 class t3lib_cs {
00137         var $noCharByteVal=63;          // ASCII Value for chars with no equivalent.
00138 
00139                 // This is the array where parsed conversion tables are stored (cached)
00140         var $parsedCharsets=array();
00141 
00142                 // An array where case folding data will be stored (cached)
00143         var $caseFolding=array();
00144 
00145                 // An array where charset-to-ASCII mappings are stored (cached)
00146         var $toASCII=array();
00147 
00148                 // This tells the converter which charsets has two bytes per char:
00149         var $twoByteSets=array(
00150                 'ucs-2'=>1,     // 2-byte Unicode
00151         );
00152 
00153                 // This tells the converter which charsets has four bytes per char:
00154         var $fourByteSets=array(
00155                 'ucs-4'=>1,     // 4-byte Unicode
00156                 'utf-32'=>1,    // 4-byte Unicode (limited to the 21-bits of UTF-16)
00157         );
00158 
00159                 // This tells the converter which charsets use a scheme like the Extended Unix Code:
00160         var $eucBasedSets=array(
00161                 'gb2312'=>1,            // Chinese, simplified.
00162                 'big5'=>1,              // Chinese, traditional.
00163                 'euc-kr'=>1,            // Korean
00164                 'shift_jis'=>1,         // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
00165         );
00166 
00167                 // see  http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
00168                 // http://czyborra.com/charsets/iso8859.html
00169         var $synonyms=array(
00170                 'us' => 'ascii',
00171                 'us-ascii'=> 'ascii',
00172                 'cp819' => 'iso-8859-1',
00173                 'ibm819' => 'iso-8859-1',
00174                 'iso-ir-100' => 'iso-8859-1',
00175                 'iso-ir-109' => 'iso-8859-2',
00176                 'iso-ir-148' => 'iso-8859-9',
00177                 'iso-ir-199' => 'iso-8859-14',
00178                 'iso-ir-203' => 'iso-8859-15',
00179                 'csisolatin1' => 'iso-8859-1',
00180                 'csisolatin2' => 'iso-8859-2',
00181                 'csisolatin3' => 'iso-8859-3',
00182                 'csisolatin5' => 'iso-8859-9',
00183                 'csisolatin8' => 'iso-8859-14',
00184                 'csisolatin9' => 'iso-8859-15',
00185                 'csisolatingreek' => 'iso-8859-7',
00186                 'iso-celtic' => 'iso-8859-14',
00187                 'latin1' => 'iso-8859-1',
00188                 'latin2' => 'iso-8859-2',
00189                 'latin3' => 'iso-8859-3',
00190                 'latin5' => 'iso-8859-9',
00191                 'latin6' => 'iso-8859-10',
00192                 'latin8' => 'iso-8859-14',
00193                 'latin9' => 'iso-8859-15',
00194                 'l1' => 'iso-8859-1',
00195                 'l2' => 'iso-8859-2',
00196                 'l3' => 'iso-8859-3',
00197                 'l5' => 'iso-8859-9',
00198                 'l6' => 'iso-8859-10',
00199                 'l8' => 'iso-8859-14',
00200                 'l9' => 'iso-8859-15',
00201                 'cyrillic' => 'iso-8859-5',
00202                 'arabic' => 'iso-8859-6',
00203                 'tis-620' => 'iso-8859-11',
00204                 'win874' => 'windows-874',
00205                 'win1250' => 'windows-1250',
00206                 'win1251' => 'windows-1251',
00207                 'win1252' => 'windows-1252',
00208                 'win1253' => 'windows-1253',
00209                 'win1254' => 'windows-1254',
00210                 'win1255' => 'windows-1255',
00211                 'win1256' => 'windows-1256',
00212                 'win1257' => 'windows-1257',
00213                 'win1258' => 'windows-1258',
00214                 'cp1250' => 'windows-1250',
00215                 'cp1251' => 'windows-1251',
00216                 'cp1252' => 'windows-1252',
00217                 'ms-ee' => 'windows-1250',
00218                 'ms-ansi' => 'windows-1252',
00219                 'ms-greek' => 'windows-1253',
00220                 'ms-turk' => 'windows-1254',
00221                 'winbaltrim' => 'windows-1257',
00222                 'koi-8ru' => 'koi-8r',
00223                 'koi8r' => 'koi-8r',
00224                 'cp878' => 'koi-8r',
00225                 'mac' => 'macroman',
00226                 'macintosh' => 'macroman',
00227                 'euc-cn' => 'gb2312',
00228                 'x-euc-cn' => 'gb2312',
00229                 'euccn' => 'gb2312',
00230                 'cp936' => 'gb2312',
00231                 'big-5' => 'big5',
00232                 'cp950' => 'big5',
00233                 'eucjp' => 'euc-jp',
00234                 'sjis' => 'shift_jis',
00235                 'shift-jis' => 'shift_jis',
00236                 'cp932' => 'shift_jis',
00237                 'cp949' => 'euc-kr',
00238                 'utf7' => 'utf-7',
00239                 'utf8' => 'utf-8',
00240                 'utf16' => 'utf-16',
00241                 'utf32' => 'utf-32',
00242                 'utf8' => 'utf-8',
00243                 'ucs2' => 'ucs-2',
00244                 'ucs4' => 'ucs-4',
00245         );
00246 
00247                 // mapping of iso-639:2 language codes to language (family) names
00248         var $lang_to_langfamily=array(
00249                         // iso-639:2 language codes, see:
00250                         //  http://www.w3.org/WAI/ER/IG/ert/iso639.htm
00251                         //  http://www.unicode.org/onlinedat/languages.html
00252                 'ar' => 'arabic',
00253                 'bg' => 'cyrillic',
00254                 'cs' => 'east_european',
00255                 'da' => 'west_european',
00256                 'de' => 'west_european',
00257                 'es' => 'west_european',
00258                 'et' => 'estonian',
00259                 'eu' => 'west_european',
00260                 'fi' => 'west_european',
00261                 'fr' => 'west_european',
00262                 'gr' => 'greek',
00263                 'hr' => 'east_european',
00264                 'hu' => 'east_european',
00265                 'iw' => 'hebrew',
00266                 'is' => 'west_european',
00267                 'it' => 'west_european',
00268                 'ja' => 'japanese',
00269                 'kl' => 'west_european',
00270                 'ko' => 'korean',
00271                 'lt' => 'lithuanian',
00272                 'lv' => 'west_european', // Latvian/Lettish
00273                 'nl' => 'west_european',
00274                 'no' => 'west_european',
00275                 'pl' => 'east_european',
00276                 'pt' => 'west_european',
00277                 'ro' => 'east_european',
00278                 'ru' => 'cyrillic',
00279                 'sk' => 'east_european',
00280                 'sl' => 'east_european',
00281                 'sv' => 'west_european',
00282                 'th' => 'thai',
00283                 'uk' => 'cyrillic',
00284                 'vi' => 'vietnamese',
00285                 'zh' => 'chinese',
00286                         // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
00287                 'chs' => 'simpl_chinese',
00288                 'cht' => 'trad_chinese',
00289                 'csy' => 'east_european',
00290                 'dan' => 'west_european',
00291                 'deu' => 'west_european',
00292                 'dea' => 'west_european',
00293                 'des' => 'west_european',
00294                 'ena' => 'west_european',
00295                 'enc' => 'west_european',
00296                 'eng' => 'west_european',
00297                 'enz' => 'west_european',
00298                 'enu' => 'west_european',
00299                 'nld' => 'west_european',
00300                 'nlb' => 'west_european',
00301                 'fin' => 'west_european',
00302                 'fra' => 'west_european',
00303                 'frb' => 'west_european',
00304                 'frc' => 'west_european',
00305                 'frs' => 'west_european',
00306                 'ell' => 'greek',
00307                 'hun' => 'east_european',
00308                 'isl' => 'west_euorpean',
00309                 'ita' => 'west_european',
00310                 'its' => 'west_european',
00311                 'jpn' => 'japanese',
00312                 'kor' => 'korean',
00313                 'nor' => 'west_european',
00314                 'non' => 'west_european',
00315                 'plk' => 'east_european',
00316                 'ptg' => 'west_european',
00317                 'ptb' => 'west_european',
00318                 'rus' => 'east_european',
00319                 'sky' => 'east_european',
00320                 'esp' => 'west_european',
00321                 'esm' => 'west_european',
00322                 'esn' => 'west_european',
00323                 'sve' => 'west_european',
00324                 'trk' => 'turkish',
00325                         // English language names
00326                 'bulgarian' => 'east_european',
00327                 'catalan' => 'west_european',
00328                 'croatian' => 'east_european',
00329                 'czech' => 'east_european',
00330                 'danish' => 'west_european',
00331                 'dutch' => 'west_european',
00332                 'english' => 'west_european',
00333                 'finnish' => 'west_european',
00334                 'french' => 'west_european',
00335                 'galician' => 'west_european',
00336                 'german' => 'west_european',
00337                 'hungarian' => 'east_european',
00338                 'icelandic' => 'west_european',
00339                 'italian' => 'west_european',
00340                 'latvian' => 'west_european',
00341                 'lettish' => 'west_european',
00342                 'norwegian' => 'west_european',
00343                 'polish' => 'east_european',
00344                 'portuguese' => 'west_european',
00345                 'russian' => 'cyrillic',
00346                 'romanian' => 'east_european',
00347                 'slovak' => 'east_european',
00348                 'slovenian' => 'east_european',
00349                 'spanish' => 'west_european',
00350                 'svedish' => 'west_european',
00351                 'turkish' => 'east_european',
00352                 'ukrainian' => 'cyrillic',
00353         );
00354 
00355                 // mapping of language (family) names to charsets on Unix
00356         var $lang_to_charset_unix=array(
00357                 'west_european' => 'iso-8859-1',
00358                 'estonian' => 'iso-8859-1',
00359                 'east_european' => 'iso-8859-2',
00360                 'baltic' => 'iso-8859-4',
00361                 'cyrillic' => 'iso-8859-5',
00362                 'arabic' => 'iso-8859-6',
00363                 'greek' => 'iso-8859-7',
00364                 'hebrew' => 'iso-8859-8',
00365                 'turkish' => 'iso-8859-9',
00366                 'thai' => 'iso-8859-11', // = TIS-620
00367                 'lithuanian' => 'iso-8859-13',
00368                 'chinese' => 'gb2312', // = euc-cn
00369                 'japanese' => 'euc-jp',
00370                 'korean' => 'euc-kr',
00371                 'simpl_chinese' => 'gb2312',
00372                 'trad_chinese' => 'big5',
00373                 'vietnamese' => '',
00374         );
00375 
00376                 // mapping of language (family) names to charsets on Windows
00377         var $lang_to_charset_windows=array(
00378                 'east_european' => 'windows-1250',
00379                 'cyrillic' => 'windows-1251',
00380                 'west_european' => 'windows-1252',
00381                 'greek' => 'windows-1253',
00382                 'turkish' => 'windows-1254',
00383                 'hebrew' => 'windows-1255',
00384                 'arabic' => 'windows-1256',
00385                 'baltic' => 'windows-1257',
00386                 'estonian' => 'windows-1257',
00387                 'lithuanian' => 'windows-1257',
00388                 'vietnamese' => 'windows-1258',
00389                 'thai' => 'cp874',
00390                 'korean' => 'cp949',
00391                 'chinese' => 'gb2312',
00392                 'japanese' => 'shift_jis',
00393                 'simpl_chinese' => 'gb2312',
00394                 'trad_chinese' => 'big5',
00395         );
00396 
00397                 // mapping of locale names to charsets
00398         var $locale_to_charset=array(
00399                 'japanese.euc' => 'euc-jp',
00400                 'ja_jp.ujis' => 'euc-jp',
00401                 'korean.euc' => 'euc-kr',
00402                 'zh_cn' => 'gb2312',
00403                 'zh_hk' => 'big5',
00404                 'zh_tw' => 'big5',
00405         );
00406 
00407                 // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
00408                 // Empty values means "iso-8859-1"
00409         var $charSetArray = array(
00410                 'dk' => '',
00411                 'de' => '',
00412                 'no' => '',
00413                 'it' => '',
00414                 'fr' => '',
00415                 'es' => '',
00416                 'nl' => '',
00417                 'cz' => 'windows-1250',
00418                 'pl' => 'iso-8859-2',
00419                 'si' => 'windows-1250',
00420                 'fi' => '',
00421                 'tr' => 'iso-8859-9',
00422                 'se' => '',
00423                 'pt' => '',
00424                 'ru' => 'windows-1251',
00425                 'ro' => 'iso-8859-2',
00426                 'ch' => 'gb2312',
00427                 'sk' => 'windows-1250',
00428                 'lt' => 'windows-1257',
00429                 'is' => 'utf-8',
00430                 'hr' => 'windows-1250',
00431                 'hu' => 'iso-8859-2',
00432                 'gl' => '',
00433                 'th' => 'iso-8859-11',
00434                 'gr' => 'iso-8859-7',
00435                 'hk' => 'big5',
00436                 'eu' => '',
00437                 'bg' => 'windows-1251',
00438                 'br' => '',
00439                 'et' => 'iso-8859-4',
00440                 'ar' => 'iso-8859-6',
00441                 'he' => 'utf-8',
00442                 'ua' => 'windows-1251',
00443                 'jp' => 'shift_jis',
00444                 'lv' => 'utf-8',
00445                 'vn' => 'utf-8',
00446                 'ca' => 'iso-8859-15',
00447                 'ba' => 'iso-8859-2',
00448                 'kr' => 'euc-kr',
00449                 'eo' => 'utf-8',
00450                 'my' => '',
00451                 'hi' => 'utf-8',
00452         );
00453 
00454                 // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
00455                 // Empty values means sames as Typo3
00456         var $isoArray = array(
00457                 'dk' => 'da',
00458                 'de' => '',
00459                 'no' => '',
00460                 'it' => '',
00461                 'fr' => '',
00462                 'es' => '',
00463                 'nl' => '',
00464                 'cz' => 'cs',
00465                 'pl' => '',
00466                 'si' => 'sl',
00467                 'fi' => '',
00468                 'tr' => '',
00469                 'se' => 'sv',
00470                 'pt' => '',
00471                 'ru' => '',
00472                 'ro' => '',
00473                 'ch' => 'zh_CN',
00474                 'sk' => '',
00475                 'lt' => '',
00476                 'is' => '',
00477                 'hr' => '',
00478                 'hu' => '',
00479                 'gl' => '', // Greenlandic
00480                 'th' => '',
00481                 'gr' => 'el',
00482                 'hk' => 'zh_HK',
00483                 'eu' => '',
00484                 'bg' => '',
00485                 'br' => 'pt_BR',
00486                 'et' => '',
00487                 'ar' => '',
00488                 'he' => 'iw',
00489                 'ua' => 'uk',
00490                 'jp' => 'ja',
00491                 'lv' => '',
00492                 'vn' => 'vi',
00493                 'ca' => '',
00494                 'ba' => '', // Bosnian
00495                 'kr' => '',
00496         );
00497 
00505         function parse_charset($charset)        {
00506                 $charset = strtolower($charset);
00507                 if (isset($this->synonyms[$charset]))   $charset = $this->synonyms[$charset];
00508 
00509                 return $charset;
00510         }
00511 
00524         function get_locale_charset($locale)    {
00525                 $locale = strtolower($locale);
00526 
00527                         // exact locale specific charset?
00528                 if (isset($this->locale_to_charset[$locale]))   return $this->locale_to_charset[$locale];
00529 
00530                         // get modifier
00531                 list($locale,$modifier) = explode('@',$locale);
00532 
00533                         // locale contains charset: use it
00534                 list($locale,$charset) = explode('.',$locale);
00535                 if ($charset)   return $this->parse_charset($charset);
00536 
00537                         // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
00538                 if ($modifier == 'euro')        return 'iso-8859-15';
00539 
00540                         // get language
00541                 list($language,$country) = explode('_',$locale);
00542                 if (isset($this->lang_to_langfamily[$language]))        $language = $this->lang_to_langfamily[$language];
00543 
00544                 if (TYPO3_OS == 'WIN')  {
00545                         $cs = $this->lang_to_charset_windows[$language];
00546                 } else {
00547                         $cs = $this->lang_to_charset_unix[$language];
00548                 }
00549 
00550                 return $cs ? $cs : 'iso-8859-1';
00551         }
00552 
00553 
00554 
00555 
00556 
00557 
00558 
00559 
00560 
00561         /********************************************
00562          *
00563          * Charset Conversion functions
00564          *
00565          ********************************************/
00566 
00577         function conv($str,$fromCS,$toCS,$useEntityForNoChar=0) {
00578                 if ($fromCS==$toCS)     return $str;
00579 
00580                         // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
00581                 if ($toCS=='utf-8' || !$useEntityForNoChar)     {
00582                         switch($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])       {
00583                         case 'mbstring':
00584                                 $conv_str = mb_convert_encoding($str,$toCS,$fromCS);
00585                                 if (false !== $conv_str)        return $conv_str; // returns false for unsupported charsets
00586                                 break;
00587 
00588                         case 'iconv':
00589                                 $conv_str = iconv($fromCS,$toCS.'//TRANSLIT',$str);
00590                                 if (false !== $conv_str)        return $conv_str;
00591                                 break;
00592 
00593                         case 'recode':
00594                                 $conv_str = recode_string($fromCS.'..'.$toCS,$str);
00595                                 if (false !== $conv_str)        return $conv_str;
00596                                 break;
00597                         }
00598                         // fallback to TYPO3 conversion
00599                 }
00600 
00601                 if ($fromCS!='utf-8')   $str=$this->utf8_encode($str,$fromCS);
00602                 if ($toCS!='utf-8')     $str=$this->utf8_decode($str,$toCS,$useEntityForNoChar);
00603                 return $str;
00604         }
00605 
00617         function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0) {
00618                 foreach($array as $key => $value)       {
00619                         if (is_array($array[$key]))     {
00620                                 $this->convArray($array[$key],$fromCS,$toCS,$useEntityForNoChar);
00621                         } else {
00622                                 $array[$key] = $this->conv($array[$key],$fromCS,$toCS,$useEntityForNoChar);
00623                         }
00624                 }
00625         }
00626 
00634         function utf8_encode($str,$charset)     {
00635 
00636                 if ($charset === 'utf-8')       return $str;
00637 
00638                         // Charset is case-insensitive.
00639                 if ($this->initCharset($charset))       {       // Parse conv. table if not already...
00640                         $strLen = strlen($str);
00641                         $outStr='';
00642 
00643                         for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in string.
00644                                 $chr=substr($str,$a,1);
00645                                 $ord=ord($chr);
00646                                 if (isset($this->twoByteSets[$charset]))        {       // If the charset has two bytes per char
00647                                         $ord2 = ord($str{$a+1});
00648                                         $ord = $ord<<8 & $ord2; // assume big endian
00649 
00650                                         if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
00651                                                 $outStr.=$this->parsedCharsets[$charset]['local'][$ord];
00652                                         } else $outStr.=chr($this->noCharByteVal);      // No char exists
00653                                         $a++;
00654                                 } elseif ($ord>127)     {       // If char has value over 127 it's a multibyte char in UTF-8
00655                                         if (isset($this->eucBasedSets[$charset]))       {       // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
00656                                                 if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))    {       // Shift-JIS: chars between 160 and 223 are single byte
00657                                                         $a++;
00658                                                         $ord2=ord(substr($str,$a,1));
00659                                                         $ord = $ord*256+$ord2;
00660                                                 }
00661                                         }
00662 
00663                                         if (isset($this->parsedCharsets[$charset]['local'][$ord]))      {       // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
00664                                                 $outStr.= $this->parsedCharsets[$charset]['local'][$ord];
00665                                         } else $outStr.= chr($this->noCharByteVal);     // No char exists
00666                                 } else $outStr.= $chr;  // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00667                         }
00668                         return $outStr;
00669                 }
00670         }
00671 
00680         function utf8_decode($str,$charset,$useEntityForNoChar=0)       {
00681 
00682                         // Charset is case-insensitive.
00683                 if ($this->initCharset($charset))       {       // Parse conv. table if not already...
00684                         $strLen = strlen($str);
00685                         $outStr='';
00686                         $buf='';
00687                         for ($a=0,$i=0;$a<$strLen;$a++,$i++)    {       // Traverse each char in UTF-8 string.
00688                                 $chr=substr($str,$a,1);
00689                                 $ord=ord($chr);
00690                                 if ($ord>127)   {       // This means multibyte! (first byte!)
00691                                         if ($ord & 64)  {       // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00692 
00693                                                 $buf=$chr;      // Add first byte
00694                                                 for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
00695                                                         $ord = $ord << 1;       // Shift it left and ...
00696                                                         if ($ord & 128) {       // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00697                                                                 $a++;   // Increase pointer...
00698                                                                 $buf.=substr($str,$a,1);        // ... and add the next char.
00699                                                         } else break;
00700                                                 }
00701 
00702                                                 if (isset($this->parsedCharsets[$charset]['utf8'][$buf]))       {       // If the UTF-8 char-sequence is found then...
00703                                                         $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
00704                                                         if ($mByte>255) {       // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
00705                                                                 $outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
00706                                                         } else $outStr.= chr($mByte);
00707                                                 } elseif ($useEntityForNoChar) {        // Create num entity:
00708                                                         $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
00709                                                 } else $outStr.=chr($this->noCharByteVal);      // No char exists
00710                                         } else $outStr.=chr($this->noCharByteVal);      // No char exists (MIDDLE of MB sequence!)
00711                                 } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00712                         }
00713                         return $outStr;
00714                 }
00715         }
00716 
00723         function utf8_to_entities($str) {
00724                 $strLen = strlen($str);
00725                 $outStr='';
00726                 $buf='';
00727                 for ($a=0;$a<$strLen;$a++)      {       // Traverse each char in UTF-8 string.
00728                         $chr=substr($str,$a,1);
00729                         $ord=ord($chr);
00730                         if ($ord>127)   {       // This means multibyte! (first byte!)
00731                                 if ($ord & 64)  {       // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
00732                                         $buf=$chr;      // Add first byte
00733                                         for ($b=0;$b<8;$b++)    {       // for each byte in multibyte string...
00734                                                 $ord = $ord << 1;       // Shift it left and ...
00735                                                 if ($ord & 128) {       // ... and with 8th bit - if that is set, then there are still bytes in sequence.
00736                                                         $a++;   // Increase pointer...
00737                                                         $buf.=substr($str,$a,1);        // ... and add the next char.
00738                                                 } else break;
00739                                         }
00740 
00741                                         $outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
00742                                 } else $outStr.=chr($this->noCharByteVal);      // No char exists (MIDDLE of MB sequence!)
00743                         } else $outStr.=$chr;   // ... otherwise it's just ASCII 0-127 and one byte. Transparent
00744                 }
00745 
00746                 return $outStr;
00747         }
00748 
00756         function entities_to_utf8($str,$alsoStdHtmlEnt=0)       {
00757                 if ($alsoStdHtmlEnt)    {
00758                         $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));             // Getting them in iso-8859-1 - but thats ok since this is observed below.
00759                 }
00760 
00761                 $token = md5(microtime());
00762                 $parts = explode($token,ereg_replace('(&([#[:alnum:]]*);)',$token.'\2'.$token,$str));
00763                 foreach($parts as $k => $v)     {
00764                         if ($k%2)       {
00765                                 if (substr($v,0,1)=='#')        {       // Dec or hex entities:
00766                                         if (substr($v,1,1)=='x')        {
00767                                                 $parts[$k] = $this->UnumberToChar(hexdec(substr($v,2)));
00768                                         } else {
00769                                                 $parts[$k] = $this->UnumberToChar(substr($v,1));
00770                                         }
00771                                 } elseif ($alsoStdHtmlEnt && $trans_tbl['&'.$v.';']) {  // Other entities:
00772                                         $parts[$k] = $this->utf8_encode($trans_tbl['&'.$v.';'],'iso-8859-1');
00773                                 } else {        // No conversion:
00774                                         $parts[$k] ='&'.$v.';';
00775                                 }
00776                         }
00777                 }
00778 
00779                 return implode('',$parts);
00780         }
00781 
/**
 * Splits a UTF-8 string into an array with one entry per character.
 *
 * Each entry is either the Unicode number of the character (default) or the character's byte sequence
 * itself ($retChar set). A byte above 127 that does not have its 7th bit set is not a valid start of a
 * sequence; it is reported as $this->noCharByteVal.
 *
 * @param	string		UTF-8 input string
 * @param	boolean		If set, entities are first converted with entities_to_utf8($str,1)
 * @param	boolean		If set, the UTF-8 characters are returned instead of their Unicode numbers
 * @return	array		One element per character found in the string
 */
function utf8_to_numberarray($str,$convEntities=0,$retChar=0)	{
	if ($convEntities)	{
		$str = $this->entities_to_utf8($str,1);
	}

	$result = array();
	$total = strlen($str);
	$pos = 0;

	while ($pos < $total)	{
		$byte = substr($str,$pos,1);
		$code = ord($byte);

		if ($code <= 127)	{
				// Plain ASCII: passed through as one byte
			$result[] = $retChar ? chr($code) : $code;
		} elseif (!($code & 64))	{
				// 10xxxxxx: we are in the middle of a sequence, no character can be produced
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
		} else {
				// Start byte: every further leading 1-bit announces one more byte of the sequence
			$sequence = $byte;
			$shifted = $code;
			for ($n=0; $n<8; $n++)	{
				$shifted = $shifted << 1;
				if (!($shifted & 128))	break;
				$pos++;
				$sequence .= substr($str,$pos,1);
			}
			$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
		}

		$pos++;
	}

	return $result;
}
00820 
/**
 * Converts a Unicode number into its UTF-8 byte sequence (1 to 6 bytes).
 *
 * Values of 0x80000000 and above cannot be expressed; for those a single byte with the
 * value $this->noCharByteVal is returned.
 *
 * @param	integer		Unicode number
 * @return	string		UTF-8 encoded character
 */
function UnumberToChar($cbyte)	{
		// 7-bit values are encoded as themselves
	if ($cbyte < 0x80)	{
		return chr($cbyte);
	}

		// Pick the marker bits of the first byte and the number of trailing bytes
	if ($cbyte < 0x800)	{
		$leadMarker = 0xC0;
		$trailing = 1;
	} elseif ($cbyte < 0x10000)	{
		$leadMarker = 0xE0;
		$trailing = 2;
	} elseif ($cbyte < 0x200000)	{
		$leadMarker = 0xF0;
		$trailing = 3;
	} elseif ($cbyte < 0x4000000)	{
		$leadMarker = 0xF8;
		$trailing = 4;
	} elseif ($cbyte < 0x80000000)	{
		$leadMarker = 0xFC;
		$trailing = 5;
	} else {	// Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

		// First byte carries the topmost bits, each trailing byte the next 6 bits
	$utf8 = chr($leadMarker | ($cbyte >> (6*$trailing)));
	for ($shift=6*($trailing-1); $shift>=0; $shift-=6)	{
		$utf8 .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $utf8;
}
00875 
/**
 * Converts a single UTF-8 character into its Unicode number.
 *
 * The number is assembled as a string of binary digits: the payload bits of the first byte followed
 * by the low 6 bits of every following byte. A first byte that is not a multibyte start byte
 * is returned as its own byte value.
 *
 * @param	string		UTF-8 character (only the first character of the string is evaluated)
 * @param	boolean		If set, the result is returned as 'x' followed by the hexadecimal number
 * @return	mixed		Unicode number, or the 'x...' hex string if $hex is set
 */
function utf8CharToUnumber($str,$hex=0)	{
	$lead = ord(substr($str,0,1));

	if (($lead & 192) != 192)	{
			// Not a start byte (11xxxxxx): the byte value is the result
		$number = $lead;
	} else {
		$payload = '';
		$probe = $lead;
		$extra = 0;
		while ($extra < 8)	{
				// Every further leading 1-bit of the first byte stands for one following byte
			$probe = $probe << 1;
			if (!($probe & 128))	break;
			$payload .= sprintf('%06b',ord(substr($str,$extra+1,1)) & 63);
			$extra++;
		}
			// The first byte contributes what is left after its length marker bits
		$leadBits = substr('00000000'.decbin($lead),-(6-$extra));

		$number = bindec($leadBits.$payload);
	}

	return $hex ? 'x'.dechex($number) : $number;
}
00903 
00904 
00905 
00906 
00907 
00908 
00909 
00910 
00911 
00912         /********************************************
00913          *
00914          * Init functions
00915          *
00916          ********************************************/
00917 
/**
 * Loads the conversion table of a charset into $this->parsedCharsets[$charset].
 *
 * The table is read from PATH_t3lib.'csconvtbl/[charset].tbl'. The parsed result is serialized to
 * 'typo3temp/cs/charset_[charset].tbl' and that copy is used on later calls instead of parsing again.
 * Only values above 127 are stored: ['local'] maps the charset value to the UTF-8 character,
 * ['utf8'] maps the UTF-8 character back to the charset value.
 *
 * Two table syntaxes are understood; which one applies is decided on the first data line:
 * "whitespaced" ("0x0A   0x000A  #LINE FEED") and "ms-token" ("B9 = U+00B9 : SUPERSCRIPT ONE").
 *
 * @param	string		Charset name (used to build the file names)
 * @return	mixed		1 if the charset was already loaded, 2 if it was loaded by this call, false if no conversion table file was found
 */
function initCharset($charset)	{
		// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset]))	return 1;

		// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib.'csconvtbl/'.$charset.'.tbl';
	if (!$charset || !t3lib_div::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile))	return false;

		// Cache file for charsets:
		// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached))	{
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
			// A damaged cache file is ignored; the table is parsed again and the cache rewritten below.
	}

		// The trailing part is optional so that a line without a comment still matches once the line has been trimmed.
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)(?:[[:space:]]+|$)/';

		// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(chr(10),t3lib_div::getUrl($charsetConvTableFile),1);
	$table = array('local'=>array(),'utf8'=>array());
	$detectedType = '';

	foreach($lines as $value)	{
			// Comment lines and blanks are ignored.
		if (!trim($value) || substr($value,0,1)=='#')	continue;

			// Detect type if not done yet: (Done on first real line)
		if (!$detectedType)	{
			$detectedType = preg_match($whitespacedPattern,$value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType=='ms-token')	{
				// Padded so that a line without "=" does not leave $utf8 undefined
			list($hexbyte,$utf8) = array_pad(preg_split('/=|:/',$value,3),2,'');
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern,$value,$regA))	continue;	// e.g. a value without a Unicode mapping
			$hexbyte = $regA[1];
			$utf8 = 'U+'.$regA[2];
		}

		$decval = hexdec(trim($hexbyte));
		if ($decval>127)	{
				// Strip the "U+" prefix before converting
			$utf8decval = hexdec(substr(trim($utf8),2));
			$utf8char = $this->UnumberToChar($utf8decval);
			$table['local'][$decval] = $utf8char;
			$table['utf8'][$utf8char] = $decval;
		}
	}

	$this->parsedCharsets[$charset] = $table;
	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($table));
	}

	return 2;
}
00980 
/**
 * Builds (or loads from cache) the UTF-8 case folding table and the UTF-8 to ASCII transliteration table.
 *
 * With $mode 'case' or 'ascii' the function returns early if the matching table is already in memory
 * or can be unserialized from its cache file in typo3temp/cs/. In every other situation BOTH tables are
 * rebuilt from PATH_t3lib.'unidata/UnicodeData.txt' (required), 'SpecialCasing.txt' and 'Translit.txt'
 * (both optional), stored in $this->caseFolding['utf-8'] / $this->toASCII['utf-8'] and written to the cache files.
 * Only the Basic Multilingual Plane (up to U+FFFF) is processed.
 *
 * @param	string		'case' or 'ascii' to enable the early returns; any other value forces a rebuild
 * @return	mixed		1 = table already loaded, 2 = table loaded from cache, 3 = tables built from the data files, false = UnicodeData.txt could not be read
 */
function initUnicodeData($mode=null)	{
		// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

		// Only process if the tables are not yet loaded
	switch($mode)	{
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8']))	return 1;

				// Use cached version if possible (a damaged cache file falls through to a rebuild)
			if ($cacheFileCase && @is_file($cacheFileCase))	{
				$cached = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cached))	{
					$this->caseFolding['utf-8'] = $cached;
					return 2;
				}
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8']))	return 1;

				// Use cached version if possible (a damaged cache file falls through to a rebuild)
			if ($cacheFileASCII && @is_file($cacheFileASCII))	{
				$cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cached))	{
					$this->toASCII['utf-8'] = $cached;
					return 2;
				}
			}
			break;
	}

		// process main Unicode data file
	$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile)))	return false;

	$fh = fopen($unicodeDataFile,'rb');
	if (!$fh)	return false;

		// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
		// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array();	// array of temp. decompositions
	$mark = array();		// array of chars that are marks (eg. composing accents)
	$number = array();		// array of chars that are numbers (eg. digits)
	$omit = array();		// array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh))	{
		$line = fgets($fh,4096);
		if ($line === false)	break;		// read error or nothing left
		if (trim($line) == '')	continue;	// blank lines carry no data

			// has a lot of info; padded so that short lines do not leave fields undefined
		$fields = array_pad(explode(';',rtrim($line)),15,'');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF)	break;	// only process the BMP

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper)	$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		if ($lower)	$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
			// store "title" only when different from "upper" (only a few)
		if ($title && $title != $upper)	$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));

			// The first letter of the general category is enough here
		switch (substr($cat,0,1))	{
			case 'M':	// mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N':	// numeric value
				if ($ord > 0x80 && $num != '')	$number["U+$char"] = $num;
				break;
		}

			// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp)	{
			$c = ord($match[2]);
			if ($match[1] == 'SMALL')	$c += 32;

			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/',$decomp,$match))	{
				// "continue 2" below leaves the switch AND skips the rest of this while-iteration
			switch($match[1])	{
				case '<circle>':	// add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 '.$match[2].' 0029';
					break;

				case '<square>':	// add square brackets as square replacement, eg [1]
					$match[2] = '005B '.$match[2].' 005D';
					break;

				case '<compat>':	// ignore multi char decompositions that start with a space
					if (substr($match[2],0,5) == '0020 ')	continue 2;
					break;

					// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ',$match[2]);
		}
	}
	fclose($fh);

		// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile))	{
		$fh = fopen($specialCasingFile,'rb');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh,4096);
				if ($line === false)	break;
				if (substr($line,0,1) == '#' || trim($line) == '')	continue;

				list($char,$lower,$title,$upper,$cond) = array_pad(t3lib_div::trimExplode(';',$line),5,'');
					// Only unconditional mappings are used; a 5th field starting with "#" is just the trailing comment
				if ($cond != '' && substr($cond,0,1) != '#')	continue;

				$utf8_char = $this->UnumberToChar(hexdec($char));
				if ($char != $lower)	{
					$utf8CaseFolding['toLower'][$utf8_char] = implode('',array_map(array($this,'UnumberToChar'),array_map('hexdec',explode(' ',$lower))));
				}
				if ($char != $title && $title != $upper)	{
					$utf8CaseFolding['toTitle'][$utf8_char] = implode('',array_map(array($this,'UnumberToChar'),array_map('hexdec',explode(' ',$title))));
				}
				if ($char != $upper)	{
					$utf8CaseFolding['toUpper'][$utf8_char] = implode('',array_map(array($this,'UnumberToChar'),array_map('hexdec',explode(' ',$upper))));
				}
			}
			fclose($fh);
		}
	}

		// process custom decompositions
	$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile))	{
		$fh = fopen($customTranslitFile,'rb');
		if ($fh)	{
			while (!feof($fh))	{
				$line = fgets($fh,4096);
				if ($line === false)	break;
				if (substr($line,0,1) == '#' || trim($line) == '')	continue;

				list($char,$translit) = array_pad(t3lib_div::trimExplode(';',$line),2,'');
					// An empty transliteration means: drop this character
				if (!$translit)	$omit["U+$char"] = 1;
				$decomposition["U+$char"] = explode(' ',$translit);
			}
			fclose($fh);
		}
	}

		// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach($decomposition as $from => $to)	{
		$code_decomp = array();

		while ($code_value = array_shift($to))	{
			if (isset($decomposition["U+$code_value"]))	{	// do recursive decomposition
				foreach(array_reverse($decomposition["U+$code_value"]) as $cv)	{
					array_unshift($to,$cv);
				}
			} elseif (!isset($mark["U+$code_value"]))	{	// remove mark
				array_push($code_decomp,$code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from]))	{
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

		// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach($decomposition as $from => $to)	{
		$code_decomp = array();
		while ($code_value = array_shift($to))	{
			$ord = hexdec($code_value);
			if ($ord > 127)	continue 2;	// skip decompositions containing non-ASCII chars
			array_push($code_decomp,chr($ord));
		}
			// Keys look like "U+00C0": strip the prefix instead of relying on hexdec() to ignore it
		$ascii[$this->UnumberToChar(hexdec(substr($from,2)))] = implode('',$code_decomp);
	}

		// add numeric decompositions
	foreach($number as $from => $to)	{
		$utf8_char = $this->UnumberToChar(hexdec(substr($from,2)));
		if (!isset($ascii[$utf8_char]))	{
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
	}

	return 3;
}
01206 
/**
 * Prepares the case folding table of a charset in $this->caseFolding[$charset].
 *
 * The table is taken from the cache file 'typo3temp/cs/cscase_[charset].tbl' when that exists. Otherwise
 * it is derived from the UTF-8 case folding table: every character of the charset's conversion table
 * is folded in UTF-8 and the result converted back with utf8_decode(). The ASCII letters a-z / A-Z
 * are added afterwards and the result is written to the cache file.
 *
 * @param	string		Charset name
 * @return	mixed		1 = already loaded, 2 = loaded from cache, 3 = built by this call, false = initCharset() or initUnicodeData() failed
 */
function initCaseFolding($charset)	{
		// Nothing to do when the table is in memory already
	if (is_array($this->caseFolding[$charset]))	return 1;

		// Cached version, if there is one
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset's conversion table and the UTF-8 case folding (the base table) are needed
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case'))	{
		return false;
	}

	$replacement = chr($this->noCharByteVal);
	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// The key is the character in the target charset (not chr() of the numeric value, it might be multi-byte)
		$local = $this->utf8_decode($utf8,$charset);

		foreach (array('toUpper','toLower','toTitle') as $direction)	{
			$folded = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8],$charset);
				// Keep the mapping only if the folded character exists in the charset
			if ($folded != '' && $folded != $replacement)	{
				$this->caseFolding[$charset][$direction][$local] = $folded;
			}
		}
	}

		// ASCII letters: lower case to upper case first, then the reverse direction
	foreach (range(ord('a'),ord('z')) as $code)	{
		$this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code-32);
	}
	foreach (range(ord('A'),ord('Z')) as $code)	{
		$this->caseFolding[$charset]['toLower'][chr($code)] = chr($code+32);
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->caseFolding[$charset]));
	}

	return 3;
}
01268 
/**
 * Prepares the to-ASCII transliteration table of a charset in $this->toASCII[$charset].
 *
 * The table is taken from the cache file 'typo3temp/cs/csascii_[charset].tbl' when that exists. Otherwise
 * every character of the charset's conversion table that has an entry in the UTF-8 to ASCII table is
 * copied over, keyed by the character as converted back with utf8_decode(), and the result is cached.
 *
 * @param	string		Charset name
 * @return	mixed		1 = already loaded, 2 = loaded from cache, 3 = built by this call, false = initCharset() or initUnicodeData() failed
 */
function initToASCII($charset)	{
		// Nothing to do when the table is in memory already
	if (is_array($this->toASCII[$charset]))	return 1;

		// Cached version, if there is one
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_'.$charset.'.tbl');
	if ($cacheFile && @is_file($cacheFile))	{
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
		return 2;
	}

		// Both the charset's conversion table and the UTF-8/ASCII transliteration (the base table) are needed
	if (!$this->initCharset($charset) || !$this->initUnicodeData('ascii'))	{
		return false;
	}

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8)	{
			// The key is the character in the target charset (not chr() of the numeric value, it might be multi-byte)
		$local = $this->utf8_decode($utf8,$charset);

		if (!isset($this->toASCII['utf-8'][$utf8]))	continue;
		$this->toASCII[$charset][$local] = $this->toASCII['utf-8'][$utf8];
	}

	if ($cacheFile)	{
		t3lib_div::writeFileToTypo3tempDir($cacheFile,serialize($this->toASCII[$charset]));
	}

	return 3;
}
01314 
01315 
01316 
01317 
01318 
01319 
01320 
01321 
01322 
01323 
01324 
01325 
01326 
01327 
01328 
01329 
01330         /********************************************
01331          *
01332          * String operation functions
01333          *
01334          ********************************************/
01335 
/**
 * Returns a part of a string, like PHP's substr() but aware of the charset.
 *
 * Depending on configuration and charset the work is done by mbstring, by utf8_substr() / euc_substr(),
 * or by byte arithmetic for the fixed-width two- and four-byte sets. Everything else is treated as a
 * single-byte encoding. $start and $len are multiplied by the character width for the fixed-width sets.
 *
 * @param	string		The character set
 * @param	string		The string to cut
 * @param	integer		Start position
 * @param	integer		Length; null means "up to the end of the string"
 * @return	string		The substring
 */
function substr($charset,$string,$start,$len=null)	{
	if ($len===0)	return '';

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
			// The length cannot be omitted when a charset is passed to mb_substr(), so the internal encoding is switched temporarily.
			// The loose comparison is kept on purpose: '' and false are treated like null here, as before.
		if ($len==null)	{
			$enc = mb_internal_encoding();	// save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string,$start);
			mb_internal_encoding($enc);	// restore internal encoding

			return $str;
		}
		return mb_substr($string,$start,$len,$charset);
	} elseif ($charset == 'utf-8')	{
		return $this->utf8_substr($string,$start,$len);
	} elseif (!empty($this->eucBasedSets[$charset]))	{
		return $this->euc_substr($string,$start,$charset,$len);
	} elseif (!empty($this->twoByteSets[$charset]))	{
			// A null length must not be turned into 0 by the multiplication
		return $len === null ? substr($string,$start*2) : substr($string,$start*2,$len*2);
	} elseif (!empty($this->fourByteSets[$charset]))	{
		return $len === null ? substr($string,$start*4) : substr($string,$start*4,$len*4);
	}

		// treat everything else as single-byte encoding
	return $len === null ? substr($string,$start) : substr($string,$start,$len);
}
01375 
/**
 * Counts the characters of a string, like PHP's strlen() but aware of the charset.
 *
 * mbstring is used when configured in TYPO3_CONF_VARS; otherwise utf8_strlen() / euc_strlen() handle
 * the variable-width sets and the byte length is divided by 2 or 4 for the fixed-width sets.
 * Everything else is treated as a single-byte encoding.
 *
 * @param	string		The character set
 * @param	string		The string to measure
 * @return	integer		The number of characters
 */
function strlen($charset,$string)	{
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring')	{
		return mb_strlen($string,$charset);
	}
	if ($charset == 'utf-8')	{
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset])	{
		return $this->euc_strlen($string,$charset);
	}

		// Fixed-width sets: byte count divided by the width of one character
	if ($this->twoByteSets[$charset])	{
		return strlen($string)/2;
	}
	if ($this->fourByteSets[$charset])	{
		return strlen($string)/4;
	}

		// Single-byte encoding: one byte is one character
	return strlen($string);
}
01401 
01414         function crop($charset,$string,$len,$crop='')   {
01415                 if (intval($len) == 0)  return $string;
01416 
01417                 if ($charset == 'utf-8')        {
01418                         $i = $this->utf8_char2byte_pos($string,$len);
01419                 } elseif ($this->eucBasedSets[$charset])        {
01420                         $i = $this->euc_char2byte_pos($string,$len,$charset);
01421                 } else {
01422                         if ($len > 0)   {
01423                                 $i = $len;
01424                         } else {
01425                                 $i = strlen($string)+$len;
01426                                 if ($i<=0)      $i = false;
01427                         }
01428                 }
01429 
01430                 if ($i === false)       {       // $len outside actual string length
01431                         return $string;
01432                 } else  {
01433                         if ($len > 0)   {
01434                                 if (strlen($string{$i}))        {
01435                                         return substr($string,0,$i).$crop;
01436 
01437                                 }
01438                         } else {
01439                                 if (strlen($string{$i-1}))      {
01440                                         return $crop.substr($string,$i);
01441                                 }
01442                         }
01443 
01444 /*
01445                         if (abs($len)<$this->strlen($charset,$string))  {       // Has to use ->strlen() - otherwise multibyte strings ending with a multibyte char will return true here (which is not a catastrophe, but...)
01446                                 if ($len > 0)   {
01447                                         return substr($string,0,$i).$crop;
01448                                 } else {
01449                                         return $crop.substr($string,$i);
01450                                 }
01451                         }
01452 */
01453                 }
01454                 return $string;
01455         }
01456 
01467         function strtrunc($charset,$string,$len)        {
01468                 if ($len <= 0)  return '';
01469 
01470                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01471                         return mb_strcut($string,0,$len,$charset);
01472                 } elseif ($charset == 'utf-8')  {
01473                         return $this->utf8_strtrunc($string,$len);
01474                 } elseif ($this->eucBasedSets[$charset])        {
01475                         return $this->euc_strtrunc($string,$charset);
01476                 } elseif ($this->twoByteSets[$charset]) {
01477                         if ($len % 2)   $len--;         // don't cut at odd positions
01478                 } elseif ($this->fourByteSets[$charset])        {
01479                         $x = $len % 4;
01480                         $len -= $x;     // realign to position dividable by four
01481                 }
01482                 // treat everything else as single-byte encoding
01483                 return substr($string,0,$len);
01484         }
01485 
01501         function conv_case($charset,$string,$case)      {
01502                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring' && (float)phpversion() >= 4.3)   {
01503                         if ($case == 'toLower') {
01504                                 return mb_strtolower($string,'utf-8');
01505                         } else {
01506                                 return mb_strtoupper($string,'utf-8');
01507                         }
01508                 } elseif ($charset == 'utf-8')  {
01509                         return $this->utf8_char_mapping($string,'case',$case);
01510                 } elseif (isset($this->eucBasedSets[$charset])) {
01511                         return $this->euc_char_mapping($string,$charset,'case',$case);
01512                 } else {
01513                                 // treat everything else as single-byte encoding
01514                         return $this->sb_char_mapping($string,$charset,'case',$case);
01515                 }
01516 
01517                 return $string;
01518         }
01519 
01527         function specCharsToASCII($charset,$string)     {
01528                 if ($charset == 'utf-8')        {
01529                         return $this->utf8_char_mapping($string,'ascii');
01530                 } elseif (isset($this->eucBasedSets[$charset])) {
01531                         return $this->euc_char_mapping($string,$charset,'ascii');
01532                 } else {
01533                                 // treat everything else as single-byte encoding
01534                         return $this->sb_char_mapping($string,$charset,'ascii');
01535                 }
01536 
01537                 return $string;
01538         }
01539 
01540 
01541 
01542 
01543 
01544 
01545 
01546 
01547 
01548 
01549 
01550 
01551         /********************************************
01552          *
01553          * Internal string operation functions
01554          *
01555          ********************************************/
01556 
01567         function sb_char_mapping($str,$charset,$mode,$opt='')   {
01568                 switch($mode)   {
01569                         case 'case':
01570                                 if (!$this->initCaseFolding($charset))  return $str;    // do nothing
01571                                 $map =& $this->caseFolding[$charset][$opt];
01572                                 break;
01573 
01574                         case 'ascii':
01575                                 if (!$this->initToASCII($charset))      return $str;    // do nothing
01576                                 $map =& $this->toASCII[$charset];
01577                                 break;
01578 
01579                         default:
01580                                 return $str;
01581                 }
01582 
01583                 $out = '';
01584                 for($i=0; strlen($str{$i}); $i++)       {
01585                         $c = $str{$i};
01586                         if (isset($map[$c]))    {
01587                                 $out .= $map[$c];
01588                         } else {
01589                                 $out .= $c;
01590                         }
01591                 }
01592 
01593                 return $out;
01594         }
01595 
01596 
01597 
01598 
01599 
01600 
01601 
01602 
01603 
01604 
01605         /********************************************
01606          *
01607          * Internal UTF-8 string operation functions
01608          *
01609          ********************************************/
01610 
01622         function utf8_substr($str,$start,$len=null)     {
01623                 if (!strcmp($len,'0'))  return '';
01624 
01625                 $byte_start = $this->utf8_char2byte_pos($str,$start);
01626                 if ($byte_start === false)      {
01627                         if ($start > 0) {
01628                                 return false;   // $start outside string length
01629                         } else {
01630                                 $start = 0;
01631                         }
01632                 }
01633 
01634                 $str = substr($str,$byte_start);
01635 
01636                 if ($len!=null) {
01637                         $byte_end = $this->utf8_char2byte_pos($str,$len);
01638                         if ($byte_end === false)        // $len outside actual string length
01639                                 return $len<0 ? '' : $str;      // When length is less than zero and exceeds, then we return blank string.
01640                         else
01641                                 return substr($str,0,$byte_end);
01642                 }
01643                 else    return $str;
01644         }
01645 
01655         function utf8_strlen($str)      {
01656                 $n=0;
01657                 for($i=0; strlen($str{$i}); $i++)       {
01658                         $c = ord($str{$i});
01659                         if (!($c & 0x80))       // single-byte (0xxxxxx)
01660                                 $n++;
01661                         elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
01662                                 $n++;
01663                 }
01664                 return $n;
01665         }
01666 
01676         function utf8_strtrunc($str,$len)       {
01677                 $i = $len-1;
01678                 if (ord($str{$i}) & 0x80) { // part of a multibyte sequence
01679                         for (; $i>0 && !(ord($str{$i}) & 0x40); $i--)   ;       // find the first byte
01680                         if ($i <= 0)    return ''; // sanity check
01681                         for ($bc=0, $mbs=ord($str{$i}); $mbs & 0x80; $mbs = $mbs << 1)  $bc++;  // calculate number of bytes
01682                         if ($bc+$i > $len)      return substr($str,0,$i);
01683                         // fallthru: multibyte char fits into length
01684                 }
01685                 return substr($str,0,$len);
01686         }
01687 
01698         function utf8_strpos($haystack,$needle,$offset=0)       {
01699                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01700                         return mb_strpos($haystack,$needle,'utf-8');
01701                 }
01702 
01703                 $byte_offset = $this->utf8_char2byte_pos($haystack,$offset);
01704                 if ($byte_offset === false)     return false; // offset beyond string length
01705 
01706                 $byte_pos = strpos($haystack,$needle,$byte_offset);
01707                 if ($byte_pos === false)        return false; // needle not found
01708 
01709                 return $this->utf8_byte2char_pos($haystack,$byte_pos);
01710         }
01711 
01721         function utf8_strrpos($haystack,$needle)        {
01722                 if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
01723                         return mb_strrpos($haystack,$needle,'utf-8');
01724                 }
01725 
01726                 $byte_pos = strrpos($haystack,$needle);
01727                 if ($byte_pos === false)        return false; // needle not found
01728 
01729                 return $this->utf8_byte2char_pos($haystack,$byte_pos);
01730         }
01731 
01741         function utf8_char2byte_pos($str,$pos)  {
01742                 $n = 0;                         // number of characters found
01743                 $p = abs($pos);         // number of characters wanted
01744 
01745                 if ($pos >= 0)  {
01746                         $i = 0;
01747                         $d = 1;
01748                 } else {
01749                         $i = strlen($str)-1;
01750                         $d = -1;
01751                 }
01752 
01753                 for( ; strlen($str{$i}) && $n<$p; $i+=$d)       {
01754                         $c = (int)ord($str{$i});
01755                         if (!($c & 0x80))       // single-byte (0xxxxxx)
01756                                 $n++;
01757                         elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
01758                                 $n++;
01759                 }
01760                 if (!strlen($str{$i}))  return false; // offset beyond string length
01761 
01762                 if ($pos >= 0)  {
01763                                 // skip trailing multi-byte data bytes
01764                         while ((ord($str{$i}) & 0x80) && !(ord($str{$i}) & 0x40)) { $i++; }
01765                 } else {
01766                                 // correct offset
01767                         $i++;
01768                 }
01769 
01770                 return $i;
01771         }
01772 
01782         function utf8_byte2char_pos($str,$pos)  {
01783                 $n = 0; // number of characters
01784                 for($i=$pos; $i>0; $i--)        {
01785                         $c = (int)ord($str{$i});
01786                         if (!($c & 0x80))       // single-byte (0xxxxxx)
01787                                 $n++;
01788                         elseif (($c & 0xC0) == 0xC0)    // multi-byte starting byte (11xxxxxx)
01789                                 $n++;
01790                 }
01791                 if (!strlen($str{$i}))  return false; // offset beyond string length
01792 
01793                 return $n;
01794         }
01795 
01805         function utf8_char_mapping($str,$mode,$opt='')  {
01806                 if (!$this->initUnicodeData($mode))     return $str;    // do nothing
01807 
01808                 $out = '';
01809                 switch($mode)   {
01810                         case 'case':
01811                                 $map =& $this->caseFolding['utf-8'][$opt];
01812                                 break;
01813 
01814                         case 'ascii':
01815                                 $map =& $this->toASCII['utf-8'];
01816                                 break;
01817 
01818                         default:
01819                                 return $str;
01820                 }
01821 
01822                 for($i=0; strlen($str{$i}); $i++)       {
01823                         $c = ord($str{$i});
01824                         if (!($c & 0x80))       // single-byte (0xxxxxx)
01825                                 $mbc = $str{$i};
01826                         elseif (($c & 0xC0) == 0xC0)    {       // multi-byte starting byte (11xxxxxx)
01827                                 for ($bc=0; $c & 0x80; $c = $c << 1) { $bc++; } // calculate number of bytes
01828                                 $mbc = substr($str,$i,$bc);
01829                                 $i += $bc-1;
01830                         }
01831 
01832                         if (isset($map[$mbc]))  {
01833                                 $out .= $map[$mbc];
01834                         } else {
01835                                 $out .= $mbc;
01836                         }
01837                 }
01838 
01839                 return $out;
01840         }
01841 
01842 
01843 
01844 
01845 
01846 
01847 
01848 
01849 
01850 
01851 
01852 
01853 
01854 
01855 
01856 
01857 
01858 
01859         /********************************************
01860          *
01861          * Internal EUC string operation functions
01862          *
01863          * Extended Unix Code:
01864          *  ASCII compatible 7bit single bytes chars
01865          *  8bit two byte chars
01866          *
01867          * Shift-JIS is treated as a special case.
01868          *
01869          ********************************************/
01870 
01881         function euc_strtrunc($str,$len,$charset)        {
01882                 $sjis = ($charset == 'shift_jis');
01883                 for ($i=0; strlen($str{$i}) && $i<$len; $i++) {
01884                         $c = ord($str{$i});
01885                         if ($sjis)      {
01886                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
01887                         }
01888                         else    {
01889                                 if ($c >= 0x80) $i++;   // advance a double-byte char
01890                         }
01891                 }
01892                 if (!strlen($str{$i}))  return $str;    // string shorter than supplied length
01893 
01894                 if ($i>$len)
01895                         return substr($str,0,$len-1);   // we ended on a first byte
01896                 else
01897                         return substr($str,0,$len);
01898         }
01899 
01910         function euc_substr($str,$start,$charset,$len=null)     {
01911                 $byte_start = $this->euc_char2byte_pos($str,$start,$charset);
01912                 if ($byte_start === false)      return false;   // $start outside string length
01913 
01914                 $str = substr($str,$byte_start);
01915 
01916                 if ($len!=null) {
01917                         $byte_end = $this->euc_char2byte_pos($str,$len,$charset);
01918                         if ($byte_end === false)        // $len outside actual string length
01919                                 return $str;
01920                         else
01921                                 return substr($str,0,$byte_end);
01922                 }
01923                 else    return $str;
01924         }
01925 
01935         function euc_strlen($str,$charset)       {
01936                 $sjis = ($charset == 'shift_jis');
01937                 $n=0;
01938                 for ($i=0; strlen($str{$i}); $i++) {
01939                         $c = ord($str{$i});
01940                         if ($sjis)      {
01941                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i++;   // advance a double-byte char
01942                         }
01943                         else    {
01944                                 if ($c >= 0x80) $i++;   // advance a double-byte char
01945                         }
01946 
01947                         $n++;
01948                 }
01949 
01950                 return $n;
01951         }
01952 
01962         function euc_char2byte_pos($str,$pos,$charset)  {
01963                 $sjis = ($charset == 'shift_jis');
01964                 $n = 0; // number of characters seen
01965                 $p = abs($pos); // number of characters wanted
01966 
01967                 if ($pos >= 0)  {
01968                         $i = 0;
01969                         $d = 1;
01970                 } else {
01971                         $i = strlen($str)-1;
01972                         $d = -1;
01973                 }
01974 
01975                 for ( ; strlen($str{$i}) && $n<$p; $i+=$d) {
01976                         $c = ord($str{$i});
01977                         if ($sjis)      {
01978                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  $i+=$d; // advance a double-byte char
01979                         }
01980                         else    {
01981                                 if ($c >= 0x80) $i+=$d; // advance a double-byte char
01982                         }
01983 
01984                         $n++;
01985                 }
01986                 if (!strlen($str{$i}))  return false; // offset beyond string length
01987 
01988                 if ($pos < 0)   $i++;   // correct offset
01989 
01990                 return $i;
01991         }
01992 
02003         function euc_char_mapping($str,$charset,$mode,$opt='')  {
02004                 switch($mode)   {
02005                         case 'case':
02006                                 if (!$this->initCaseFolding($charset))  return $str;    // do nothing
02007                                 $map =& $this->caseFolding[$charset][$opt];
02008                                 break;
02009 
02010                         case 'ascii':
02011                                 if (!$this->initToASCII($charset))      return $str;    // do nothing
02012                                 $map =& $this->toASCII[$charset];
02013                                 break;
02014 
02015                         default:
02016                                 return $str;
02017                 }
02018 
02019                 $sjis = ($charset == 'shift_jis');
02020                 $out = '';
02021                 for($i=0; strlen($str{$i}); $i++)       {
02022                         $mbc = $str{$i};
02023                         $c = ord($mbc);
02024 
02025                         if ($sjis)      {
02026                                 if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0))  {       // a double-byte char
02027                                         $mbc = substr($str,$i,2);
02028                                         $i++;
02029                                 }
02030                         }
02031                         else    {
02032                                 if ($c >= 0x80) {       // a double-byte char
02033                                         $mbc = substr($str,$i,2);
02034                                         $i++;
02035                                 }
02036                         }
02037 
02038                         if (isset($map[$mbc]))  {
02039                                 $out .= $map[$mbc];
02040                         } else {
02041                                 $out .= $mbc;
02042                         }
02043                 }
02044 
02045                 return $out;
02046         }
02047 
02048 }
02049 
02050 if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])        {
02051         include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
02052 }
02053 ?>


Généré par Les experts TYPO3 avec  doxygen 1.4.6