Advertisement
Guest User

force utf8

a guest
May 24th, 2015
19,616
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 12.21 KB | None | 0 0
  1. <?php
  2. /*
  3. Copyright (c) 2008 Sebastián Grignoli
  4. All rights reserved.
  5.  
  6. Redistribution and use in source and binary forms, with or without
  7. modification, are permitted provided that the following conditions
  8. are met:
  9. 1. Redistributions of source code must retain the above copyright
  10.    notice, this list of conditions and the following disclaimer.
  11. 2. Redistributions in binary form must reproduce the above copyright
  12.    notice, this list of conditions and the following disclaimer in the
  13.    documentation and/or other materials provided with the distribution.
  14. 3. Neither the name of copyright holders nor the names of its
  15.    contributors may be used to endorse or promote products derived
  16.    from this software without specific prior written permission.
  17.  
  18. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  20. TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  21. PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS
  22. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23. CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24. SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25. INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26. CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27. ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28. POSSIBILITY OF SUCH DAMAGE.
  29. */
  30.  
  31. /**
  32.  * @author   "Sebastián Grignoli" <grignoli@framework2.com.ar>
  33.  * @package  ForceUTF8
  34.  * @version  1.2
  35.  * @link     https://github.com/neitanod/forceutf8
  36.  * @example  https://github.com/neitanod/forceutf8
  37.  * @license  Revised BSD
  38.  */
  class ForceUTF8
  {
      /**
       * Windows-1252 byte value (decimal) => UTF-8 byte sequence, for the
       * 0x80-0x9F range where Windows-1252 differs from ISO-8859-1.
       *
       * 129, 141, 143, 144 and 157 have no entry here; toUTF8() encodes those
       * bytes with the generic two-byte formula instead.
       */
      protected static $win1252ToUtf8 = array(
          128 => "\xe2\x82\xac",
          130 => "\xe2\x80\x9a",
          131 => "\xc6\x92",
          132 => "\xe2\x80\x9e",
          133 => "\xe2\x80\xa6",
          134 => "\xe2\x80\xa0",
          135 => "\xe2\x80\xa1",
          136 => "\xcb\x86",
          137 => "\xe2\x80\xb0",
          138 => "\xc5\xa0",
          139 => "\xe2\x80\xb9",
          140 => "\xc5\x92",
          142 => "\xc5\xbd",
          145 => "\xe2\x80\x98",
          146 => "\xe2\x80\x99",
          147 => "\xe2\x80\x9c",
          148 => "\xe2\x80\x9d",
          149 => "\xe2\x80\xa2",
          150 => "\xe2\x80\x93",
          151 => "\xe2\x80\x94",
          152 => "\xcb\x9c",
          153 => "\xe2\x84\xa2",
          154 => "\xc5\xa1",
          155 => "\xe2\x80\xba",
          156 => "\xc5\x93",
          158 => "\xc5\xbe",
          159 => "\xc5\xb8"
      );

      /**
       * Two-byte UTF-8 encoding of U+0080..U+009F (what you get when
       * Windows-1252 text is converted as if it were ISO-8859-1) => the UTF-8
       * sequence of the character the Windows-1252 byte actually stands for.
       */
      protected static $brokenUtf8ToUtf8 = array(
          "\xc2\x80" => "\xe2\x82\xac",
          "\xc2\x82" => "\xe2\x80\x9a",
          "\xc2\x83" => "\xc6\x92",
          "\xc2\x84" => "\xe2\x80\x9e",
          "\xc2\x85" => "\xe2\x80\xa6",
          "\xc2\x86" => "\xe2\x80\xa0",
          "\xc2\x87" => "\xe2\x80\xa1",
          "\xc2\x88" => "\xcb\x86",
          "\xc2\x89" => "\xe2\x80\xb0",
          "\xc2\x8a" => "\xc5\xa0",
          "\xc2\x8b" => "\xe2\x80\xb9",
          "\xc2\x8c" => "\xc5\x92",
          "\xc2\x8e" => "\xc5\xbd",
          "\xc2\x91" => "\xe2\x80\x98",
          "\xc2\x92" => "\xe2\x80\x99",
          "\xc2\x93" => "\xe2\x80\x9c",
          "\xc2\x94" => "\xe2\x80\x9d",
          "\xc2\x95" => "\xe2\x80\xa2",
          "\xc2\x96" => "\xe2\x80\x93",
          "\xc2\x97" => "\xe2\x80\x94",
          "\xc2\x98" => "\xcb\x9c",
          "\xc2\x99" => "\xe2\x84\xa2",
          "\xc2\x9a" => "\xc5\xa1",
          "\xc2\x9b" => "\xe2\x80\xba",
          "\xc2\x9c" => "\xc5\x93",
          "\xc2\x9e" => "\xc5\xbe",
          "\xc2\x9f" => "\xc5\xb8"
      );

      /**
       * UTF-8 byte sequence => single Windows-1252 byte (0x80-0x9F); the
       * inverse of $win1252ToUtf8.
       */
      protected static $utf8ToWin1252 = array(
          "\xe2\x82\xac" => "\x80",
          "\xe2\x80\x9a" => "\x82",
          "\xc6\x92" => "\x83",
          "\xe2\x80\x9e" => "\x84",
          "\xe2\x80\xa6" => "\x85",
          "\xe2\x80\xa0" => "\x86",
          "\xe2\x80\xa1" => "\x87",
          "\xcb\x86" => "\x88",
          "\xe2\x80\xb0" => "\x89",
          "\xc5\xa0" => "\x8a",
          "\xe2\x80\xb9" => "\x8b",
          "\xc5\x92" => "\x8c",
          "\xc5\xbd" => "\x8e",
          "\xe2\x80\x98" => "\x91",
          "\xe2\x80\x99" => "\x92",
          "\xe2\x80\x9c" => "\x93",
          "\xe2\x80\x9d" => "\x94",
          "\xe2\x80\xa2" => "\x95",
          "\xe2\x80\x93" => "\x96",
          "\xe2\x80\x94" => "\x97",
          "\xcb\x9c" => "\x98",
          "\xe2\x84\xa2" => "\x99",
          "\xc5\xa1" => "\x9a",
          "\xe2\x80\xba" => "\x9b",
          "\xc5\x93" => "\x9c",
          "\xc5\xbe" => "\x9e",
          "\xc5\xb8" => "\x9f"
      );

      /**
       * Encodes one byte >= 0x80 as the two-byte UTF-8 sequence of the code
       * point with the same numeric value (i.e. treats it as ISO-8859-1).
       *
       * @param string $byte A single-byte string.
       * @return string Two bytes: 0xC0|(value >> 6) followed by 0x80|(value & 0x3F).
       */
      protected static function latin1ByteToUtf8($byte)
      {
          $value = ord($byte);
          // Integer shift instead of "ord / 64": passing the resulting float
          // to chr() is deprecated on current PHP versions.
          return chr(0xc0 | ($value >> 6)) . chr(0x80 | ($value & 0x3f));
      }

      /**
       * Collapses the UTF-8 sequences listed in $utf8ToWin1252 to their single
       * Windows-1252 byte and then runs utf8_decode() on the result.
       *
       * NOTE(review): utf8_decode() is deprecated as of PHP 8.2. It is kept
       * because its handling of malformed input is part of the observable
       * behaviour of toWin1252()/fixUTF8(); pick a replacement deliberately.
       *
       * @param string $utf8 UTF-8 text.
       * @return string Single-byte text.
       */
      protected static function utf8ToWin1252Bytes($utf8)
      {
          // str_replace (sequential, pair by pair) is intentional: strtr would
          // behave differently on malformed input where one replacement
          // produces a new match for a later pair.
          $collapsed = str_replace(
              array_keys(self::$utf8ToWin1252),
              array_values(self::$utf8ToWin1252),
              $utf8
          );
          return utf8_decode($collapsed);
      }

      /**
       * Leaves byte sequences that already look like UTF-8 alone and converts
       * every other byte >= 0x80 to UTF-8, assuming the source encoding is
       * Windows-1252 or ISO-8859-1.
       *
       * Detection is purely structural, so text is left unconverted when
       * single-byte characters happen to line up like a UTF-8 sequence:
       *   1) a byte in 0xC0-0xDF followed by one byte in 0x80-0xBF
       *      (e.g. "\xC9\xBB", E-acute followed by a closing guillemet),
       *   2) a byte in 0xE0-0xEF followed by two bytes in 0x80-0xBF,
       *   3) a byte in 0xF0-0xF7 followed by three bytes in 0x80-0xBF.
       *
       * Arrays are converted element by element (recursively, keys kept);
       * values that are neither string nor array are returned unchanged.
       *
       * @param mixed $text Any string, or an array of values to convert.
       * @return mixed The same value with strings UTF-8 encoded.
       */
      public static function toUTF8($text)
      {
          if (is_array($text)) {
              foreach ($text as $k => $v) {
                  $text[$k] = self::toUTF8($v);
              }
              return $text;
          }
          if (!is_string($text)) {
              return $text;
          }

          // With mbstring.func_overload bit 2 set, strlen() counts characters,
          // so the byte length has to come from mb_strlen(..., '8bit').
          $overloaded = function_exists('mb_strlen')
              && (((int) ini_get('mbstring.func_overload')) & 2);
          $max = $overloaded ? mb_strlen($text, '8bit') : strlen($text);

          $buf = "";
          for ($i = 0; $i < $max; $i++) {
              $c1 = $text[$i];

              if ($c1 >= "\xc0") {
                  // Possible UTF-8 lead byte: how many continuation bytes
                  // would a sequence starting with it need?
                  if ($c1 <= "\xdf") {
                      $extra = 1;
                  } elseif ($c1 <= "\xef") {
                      $extra = 2;
                  } elseif ($c1 <= "\xf7") {
                      $extra = 3;
                  } else {
                      $extra = 0; // 0xF8-0xFF never starts a sequence
                  }

                  $sequence = $c1;
                  $looksLikeUtf8 = $extra > 0;
                  for ($j = 1; $j <= $extra; $j++) {
                      // Bytes past the end of the input count as "\x00",
                      // which is not a continuation byte.
                      $next = $i + $j < $max ? $text[$i + $j] : "\x00";
                      if ($next < "\x80" || $next > "\xbf") {
                          $looksLikeUtf8 = false;
                          break;
                      }
                      $sequence .= $next;
                  }

                  if ($looksLikeUtf8) {
                      // Copy the whole sequence (all four bytes for a 4-byte
                      // one) and skip its continuation bytes.
                      $buf .= $sequence;
                      $i += $extra;
                  } else {
                      $buf .= self::latin1ByteToUtf8($c1);
                  }
              } elseif (($c1 & "\xc0") === "\x80") {
                  // Stray byte in 0x80-0xBF: Windows-1252 special case first,
                  // generic conversion otherwise.
                  $code = ord($c1);
                  if (isset(self::$win1252ToUtf8[$code])) {
                      $buf .= self::$win1252ToUtf8[$code];
                  } else {
                      $buf .= self::latin1ByteToUtf8($c1);
                  }
              } else {
                  // Plain ASCII byte.
                  $buf .= $c1;
              }
          }

          return $buf;
      }

      /**
       * Converts text to a single-byte encoding: the input is first normalised
       * with toUTF8(), then collapsed via utf8ToWin1252Bytes().
       *
       * Arrays are converted element by element; values that are neither
       * string nor array are returned unchanged.
       *
       * @param mixed $text String or array of values.
       * @return mixed The converted value.
       */
      public static function toWin1252($text)
      {
          if (is_array($text)) {
              foreach ($text as $k => $v) {
                  $text[$k] = self::toWin1252($v);
              }
              return $text;
          }
          if (!is_string($text)) {
              return $text;
          }
          return self::utf8ToWin1252Bytes(self::toUTF8($text));
      }

      /**
       * Alias of toWin1252().
       *
       * @param mixed $text String or array of values.
       * @return mixed The converted value.
       */
      public static function toISO8859($text)
      {
          return self::toWin1252($text);
      }

      /**
       * Alias of toWin1252().
       *
       * @param mixed $text String or array of values.
       * @return mixed The converted value.
       */
      public static function toLatin1($text)
      {
          return self::toWin1252($text);
      }

      /**
       * Repeatedly decodes the text to single-byte form and re-encodes it with
       * toUTF8() until a pass no longer changes it.
       *
       * Arrays are processed element by element. Values that are neither
       * string nor array are returned unchanged, matching toUTF8() and
       * toWin1252().
       *
       * @param mixed $text String or array of values.
       * @return mixed The processed value.
       */
      public static function fixUTF8($text)
      {
          if (is_array($text)) {
              foreach ($text as $k => $v) {
                  $text[$k] = self::fixUTF8($v);
              }
              return $text;
          }
          if (!is_string($text)) {
              return $text;
          }

          // Always run at least one pass; stop at the first fixed point.
          do {
              $last = $text;
              $text = self::toUTF8(self::utf8ToWin1252Bytes($text));
          } while ($last !== $text);

          return $text;
      }

      /**
       * Repairs UTF-8 text that was produced by converting Windows-1252 input
       * as if it were ISO-8859-1 (i.e. ignoring the Windows-1252 characters in
       * 0x80-0x9F), by replacing each sequence listed in $brokenUtf8ToUtf8.
       * See: http://en.wikipedia.org/wiki/Windows-1252
       *
       * @param string|array $text Subject passed to str_replace().
       * @return string|array The subject with the replacements applied.
       */
      public static function UTF8FixWin1252Chars($text)
      {
          return str_replace(
              array_keys(self::$brokenUtf8ToUtf8),
              array_values(self::$brokenUtf8ToUtf8),
              $text
          );
      }

      /**
       * Strips a leading UTF-8 byte order mark (EF BB BF) if there is one.
       *
       * @param string $str Input text.
       * @return string The text without a leading BOM.
       */
      public static function removeBOM($str = "")
      {
          if (strncmp($str, "\xef\xbb\xbf", 3) === 0) {
              $str = substr($str, 3);
          }
          return $str;
      }

      /**
       * Maps a loosely written encoding label to 'UTF-8' or 'ISO-8859-1'.
       *
       * The label is upper-cased and every character other than letters,
       * digits and whitespace is removed before the lookup, so "iso-8859-1",
       * "Latin_1" and "utf-8" are all recognised. Unknown labels yield 'UTF-8'.
       *
       * @param string $encodingLabel Label such as "utf8" or "Windows-1252".
       * @return string 'UTF-8' or 'ISO-8859-1'.
       */
      public static function normalizeEncoding($encodingLabel)
      {
          $key = preg_replace('/[^a-zA-Z0-9\s]/', '', strtoupper($encodingLabel));
          $equivalences = array(
              'ISO88591' => 'ISO-8859-1',
              'ISO8859' => 'ISO-8859-1',
              'ISO' => 'ISO-8859-1',
              'LATIN1' => 'ISO-8859-1',
              'LATIN' => 'ISO-8859-1',
              'UTF8' => 'UTF-8',
              'UTF' => 'UTF-8',
              'WIN1252' => 'ISO-8859-1',
              'WINDOWS1252' => 'ISO-8859-1'
          );

          // Look up the stripped label; the raw one still contains dashes and
          // would never match keys such as 'ISO88591'.
          if (isset($equivalences[$key])) {
              return $equivalences[$key];
          }

          return 'UTF-8';
      }

      /**
       * Converts $text to the encoding named by $encodingLabel.
       *
       * @param string $encodingLabel Any label accepted by normalizeEncoding().
       * @param mixed  $text          String or array of values.
       * @return mixed toLatin1($text) for ISO-8859-1 labels, toUTF8($text) otherwise.
       */
      public static function encode($encodingLabel, $text)
      {
          // normalizeEncoding() only ever returns 'UTF-8' or 'ISO-8859-1'.
          if (self::normalizeEncoding($encodingLabel) === 'ISO-8859-1') {
              return self::toLatin1($text);
          }
          return self::toUTF8($text);
      }

  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement