Guest User

TextEncodingRegistry.cpp

a guest
Feb 9th, 2013
246
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 13.92 KB | None | 0 0
  1. /*
  2.  * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
  3.  * Copyright (C) 2007-2009 Torch Mobile, Inc.
  4.  *
  5.  * Redistribution and use in source and binary forms, with or without
  6.  * modification, are permitted provided that the following conditions
  7.  * are met:
  8.  * 1. Redistributions of source code must retain the above copyright
  9.  *    notice, this list of conditions and the following disclaimer.
  10.  * 2. Redistributions in binary form must reproduce the above copyright
  11.  *    notice, this list of conditions and the following disclaimer in the
  12.  *    documentation and/or other materials provided with the distribution.
  13.  *
  14.  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  15.  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16.  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17.  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  18.  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  19.  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  20.  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  21.  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  22.  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  23.  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24.  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25.  */
  26.  
  27. #include "config.h"
  28. #include "TextEncodingRegistry.h"
  29.  
  30. #include "TextCodecLatin1.h"
  31. #include "TextCodecUserDefined.h"
  32. #include "TextCodecUTF16.h"
  33. #include "TextCodecUTF8.h"
  34. #include "TextEncoding.h"
  35. #include <wtf/ASCIICType.h>
  36. #include <wtf/HashMap.h>
  37. #include <wtf/HashSet.h>
  38. #include <wtf/MainThread.h>
  39. #include <wtf/StdLibExtras.h>
  40. #include <wtf/StringExtras.h>
  41.  
  42. #if USE(ICU_UNICODE)
  43. #include "TextCodecICU.h"
  44. #endif
  45. #if PLATFORM(MAC)
  46. #include "TextCodecMac.h"
  47. #endif
  48. #if USE(QT4_UNICODE)
  49. #include "qt/TextCodecQt.h"
  50. #endif
  51. #if USE(GLIB_UNICODE)
  52. #include "gtk/TextCodecGtk.h"
  53. #endif
  54. #if OS(WINDOWS) && USE(WCHAR_UNICODE)
  55. #include "win/TextCodecWin.h"
  56. #endif
  57.  
  58. #include <wtf/CurrentTime.h>
  59. #include <wtf/text/CString.h>
  60.  
  61. using namespace WTF;
  62.  
  63. namespace WebCore {
  64.  
  65. const size_t maxEncodingNameLength = 63;
  66.  
  67. // Hash for all-ASCII strings that does case folding.
  68. struct TextEncodingNameHash {
  69.     static bool equal(const char* s1, const char* s2)
  70.     {
  71.         char c1;
  72.         char c2;
  73.         do {
  74.             c1 = *s1++;
  75.             c2 = *s2++;
  76.             if (toASCIILower(c1) != toASCIILower(c2))
  77.                 return false;
  78.         } while (c1 && c2);
  79.         return !c1 && !c2;
  80.     }
  81.  
  82.     // This algorithm is the one-at-a-time hash from:
  83.     // http://burtleburtle.net/bob/hash/hashfaq.html
  84.     // http://burtleburtle.net/bob/hash/doobs.html
  85.     static unsigned hash(const char* s)
  86.     {
  87.         unsigned h = WTF::stringHashingStartValue;
  88.         for (;;) {
  89.             char c = *s++;
  90.             if (!c) {
  91.                 h += (h << 3);
  92.                 h ^= (h >> 11);
  93.                 h += (h << 15);
  94.                 return h;
  95.             }
  96.             h += toASCIILower(c);
  97.             h += (h << 10);
  98.             h ^= (h >> 6);
  99.         }
  100.     }
  101.  
  102.     static const bool safeToCompareToEmptyOrDeleted = false;
  103. };
  104.  
  105. struct TextCodecFactory {
  106.     NewTextCodecFunction function;
  107.     const void* additionalData;
  108.     TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
  109. };
  110.  
  111. typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
  112. typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
  113.  
  114. static Mutex& encodingRegistryMutex()
  115. {
  116.     // We don't have to use AtomicallyInitializedStatic here because
  117.     // this function is called on the main thread for any page before
  118.     // it is used in worker threads.
  119.     DEFINE_STATIC_LOCAL(Mutex, mutex, ());
  120.     return mutex;
  121. }
  122.  
  123. static TextEncodingNameMap* textEncodingNameMap;
  124. static TextCodecMap* textCodecMap;
  125. static bool didExtendTextCodecMaps;
  126. static HashSet<const char*>* japaneseEncodings;
  127. static HashSet<const char*>* nonBackslashEncodings;
  128.  
  129. static const char* const textEncodingNameBlacklist[] = { "UTF-7" };
  130.  
  131. #if ERROR_DISABLED
  132.  
  133. static inline void checkExistingName(const char*, const char*) { }
  134.  
  135. #else
  136.  
  137. static void checkExistingName(const char* alias, const char* atomicName)
  138. {
  139.     const char* oldAtomicName = textEncodingNameMap->get(alias);
  140.     if (!oldAtomicName)
  141.         return;
  142.     if (oldAtomicName == atomicName)
  143.         return;
  144.     // Keep the warning silent about one case where we know this will happen.
  145.     if (strcmp(alias, "ISO-8859-8-I") == 0
  146.             && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
  147.             && strcasecmp(atomicName, "iso-8859-8") == 0)
  148.         return;
  149.     LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
  150. }
  151.  
  152. #endif
  153.  
  // Returns true for aliases that must not be registered even though a
  // back-end reports them. |alias| must be a valid NUL-terminated string.
  static bool isUndesiredAlias(const char* alias)
  {
      // Reject aliases with version numbers that are supported by some back-ends
      // (such as "ISO_2022,locale=ja,version=0" in ICU): any comma disqualifies the alias.
      if (strchr(alias, ','))
          return true;

      // 8859_1 is known to (at least) ICU, but other browsers don't support this name -
      // and having it caused a compatibility problem, see bug 43554.
      return !strcmp(alias, "8859_1");
  }
  167.  
  168. static void addToTextEncodingNameMap(const char* alias, const char* name)
  169. {
  170.     ASSERT(strlen(alias) <= maxEncodingNameLength);
  171.     if (isUndesiredAlias(alias))
  172.         return;
  173.     const char* atomicName = textEncodingNameMap->get(name);
  174.     ASSERT(strcmp(alias, name) == 0 || atomicName);
  175.     if (!atomicName)
  176.         atomicName = name;
  177.     checkExistingName(alias, atomicName);
  178.     textEncodingNameMap->add(alias, atomicName);
  179. }
  180.  
  181. static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
  182. {
  183.     //const char* atomicName = textEncodingNameMap->get(name);
  184.     const char* atomicName;
  185.     TextEncodingNameMap::iterator pos;
  186.     for (pos = textEncodingNameMap->begin(); pos != textEncodingNameMap->end(); ++pos) {
  187.       if (strcmp(pos->key, name) == 0) {
  188.         atomicName = pos->value;
  189.         break;
  190.       }
  191.     }
  192.     ASSERT(atomicName);
  193.     textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
  194. }
  195.  
  196. static void pruneBlacklistedCodecs()
  197. {
  198.     for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
  199.         const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
  200.         if (!atomicName)
  201.             continue;
  202.  
  203.         Vector<const char*> names;
  204.         TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
  205.         TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
  206.         for (; it != end; ++it) {
  207.             if (it->value == atomicName)
  208.                 names.append(it->key);
  209.         }
  210.  
  211.         size_t length = names.size();
  212.         for (size_t j = 0; j < length; ++j)
  213.             textEncodingNameMap->remove(names[j]);
  214.  
  215.         textCodecMap->remove(atomicName);
  216.     }
  217. }
  218.  
  // Allocates the two registry maps and registers the built-in codecs
  // (Latin-1, UTF-8, UTF-16, user-defined). Asserts that it runs on the main
  // thread and that neither map exists yet. For each codec the names are
  // registered before the codec itself, and the order of registrations is
  // kept exactly as written.
  219. static void buildBaseTextCodecMaps()
  220. {
  221.     ASSERT(isMainThread());
  222.     ASSERT(!textCodecMap);
  223.     ASSERT(!textEncodingNameMap);
  224.  
  // Both maps are allocated here and never freed in this function.
  225.     textCodecMap = new TextCodecMap;
  226.     textEncodingNameMap = new TextEncodingNameMap;
  227.  
  228.     TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
  229.     TextCodecLatin1::registerCodecs(addToTextCodecMap);
  230.  
  231.     TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
  232.     TextCodecUTF8::registerCodecs(addToTextCodecMap);
  233.  
  234.     TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
  235.     TextCodecUTF16::registerCodecs(addToTextCodecMap);
  236.  
  237.     TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
  238.     TextCodecUserDefined::registerCodecs(addToTextCodecMap);
  239.  
  240. #if USE(GLIB_UNICODE)
  241.     // FIXME: This is not needed. The code above covers all the base codecs.
  242.     TextCodecGtk::registerBaseEncodingNames(addToTextEncodingNameMap);
  243.     TextCodecGtk::registerBaseCodecs(addToTextCodecMap);
  244. #endif
  245. }
  246.  
  247. static void addEncodingName(HashSet<const char*>* set, const char* name)
  248. {
  249.     // We must not use atomicCanonicalTextEncodingName() because this function is called in it.
  250.     const char* atomicName = textEncodingNameMap->get(name);
  251.     if (atomicName)
  252.         set->add(atomicName);
  253. }
  254.  
  255. static void buildQuirksSets()
  256. {
  257.     // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
  258.     // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
  259.  
  260.     ASSERT(!japaneseEncodings);
  261.     ASSERT(!nonBackslashEncodings);
  262.  
  263.     japaneseEncodings = new HashSet<const char*>;
  264.     addEncodingName(japaneseEncodings, "EUC-JP");
  265.     addEncodingName(japaneseEncodings, "ISO-2022-JP");
  266.     addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
  267.     addEncodingName(japaneseEncodings, "ISO-2022-JP-2");
  268.     addEncodingName(japaneseEncodings, "ISO-2022-JP-3");
  269.     addEncodingName(japaneseEncodings, "JIS_C6226-1978");
  270.     addEncodingName(japaneseEncodings, "JIS_X0201");
  271.     addEncodingName(japaneseEncodings, "JIS_X0208-1983");
  272.     addEncodingName(japaneseEncodings, "JIS_X0208-1990");
  273.     addEncodingName(japaneseEncodings, "JIS_X0212-1990");
  274.     addEncodingName(japaneseEncodings, "Shift_JIS");
  275.     addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000");
  276.     addEncodingName(japaneseEncodings, "cp932");
  277.     addEncodingName(japaneseEncodings, "x-mac-japanese");
  278.  
  279.     nonBackslashEncodings = new HashSet<const char*>;
  280.     // The text encodings below treat backslash as a currency symbol for IE compatibility.
  281.     // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
  282.     addEncodingName(nonBackslashEncodings, "x-mac-japanese");
  283.     addEncodingName(nonBackslashEncodings, "ISO-2022-JP");
  284.     addEncodingName(nonBackslashEncodings, "EUC-JP");
  285.     // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
  286.     addEncodingName(nonBackslashEncodings, "Shift_JIS");
  287.     addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000");
  288. }
  289.  
  290. bool isJapaneseEncoding(const char* canonicalEncodingName)
  291. {
  292.     return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName);
  293. }
  294.  
  295. bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName)
  296. {
  297.     return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName);
  298. }
  299.  
  // Registers the encoding names and codecs of every back-end enabled at
  // compile time, then removes blacklisted encodings and builds the quirks
  // sets. The statement order is significant: each back-end registers its
  // names before its codecs, and pruning and the quirks sets run only after
  // all back-ends have registered.
  300. static void extendTextCodecMaps()
  301. {
  302. #if USE(ICU_UNICODE)
  303.     TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
  304.     TextCodecICU::registerCodecs(addToTextCodecMap);
  305. #endif
  306.  
  307. #if USE(QT4_UNICODE)
  308.     TextCodecQt::registerEncodingNames(addToTextEncodingNameMap);
  309.     TextCodecQt::registerCodecs(addToTextCodecMap);
  310. #endif
  311.  
  312. #if PLATFORM(MAC)
  313.     TextCodecMac::registerEncodingNames(addToTextEncodingNameMap);
  314.     TextCodecMac::registerCodecs(addToTextCodecMap);
  315. #endif
  316.  
  317. #if USE(GLIB_UNICODE)
  318.     TextCodecGtk::registerExtendedEncodingNames(addToTextEncodingNameMap);
  319.     TextCodecGtk::registerExtendedCodecs(addToTextCodecMap);
  320. #endif
  321.  
  322. #if OS(WINDOWS) && USE(WCHAR_UNICODE)
  323.     TextCodecWin::registerExtendedEncodingNames(addToTextEncodingNameMap);
  324.     TextCodecWin::registerExtendedCodecs(addToTextCodecMap);
  325. #endif
  326.  
  // Drop blacklisted encodings before the quirks sets look names up in the map.
  327.     pruneBlacklistedCodecs();
  328.     buildQuirksSets();
  329. }
  330.  
  331. PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
  332. {
  333.     MutexLocker lock(encodingRegistryMutex());
  334.  
  335.     ASSERT(textCodecMap);
  336.     //TextCodecFactory factory = textCodecMap->get(encoding.name());
  337.     TextCodecFactory factory;
  338.     TextCodecMap::iterator pos;
  339.     for (pos = textCodecMap->begin(); pos != textCodecMap->end(); ++pos) {
  340.       if (strcmp(pos->key, encoding.name()) == 0) {
  341.         factory = pos->value;
  342.         break;
  343.       }
  344.     }
  345.     ASSERT(factory.function);
  346.     return factory.function(encoding, factory.additionalData);
  347. }
  348.  
  349. const char* atomicCanonicalTextEncodingName(const char* name)
  350. {
  351.     if (!name || !name[0])
  352.         return 0;
  353.     if (!textEncodingNameMap)
  354.         buildBaseTextCodecMaps();
  355.  
  356.     MutexLocker lock(encodingRegistryMutex());
  357.  
  358.     if (const char* atomicName = textEncodingNameMap->get(name))
  359.         return atomicName;
  360.     if (didExtendTextCodecMaps)
  361.         return 0;
  362.     extendTextCodecMaps();
  363.     didExtendTextCodecMaps = true;
  364.     return textEncodingNameMap->get(name);
  365. }
  366.  
  367. template <typename CharacterType>
  368. const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
  369. {
  370.     char buffer[maxEncodingNameLength + 1];
  371.     size_t j = 0;
  372.     for (size_t i = 0; i < length; ++i) {
  373.         CharacterType c = characters[i];
  374.         if (j == maxEncodingNameLength)
  375.             return 0;
  376.         buffer[j++] = c;
  377.     }
  378.     buffer[j] = 0;
  379.     return atomicCanonicalTextEncodingName(buffer);
  380. }
  381.  
  382. const char* atomicCanonicalTextEncodingName(const String& alias)
  383. {
  384.     if (!alias.length())
  385.         return 0;
  386.  
  387.     if (alias.is8Bit())
  388.         return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length());
  389.  
  390.     return atomicCanonicalTextEncodingName<UChar>(alias.characters(), alias.length());
  391. }
  392.  
  393. bool noExtendedTextEncodingNameUsed()
  394. {
  395.     // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
  396.     return !didExtendTextCodecMaps;
  397. }
  398.  
  399. #ifndef NDEBUG
  400. void dumpTextEncodingNameMap()
  401. {
  402.     unsigned size = textEncodingNameMap->size();
  403.     fprintf(stderr, "Dumping %u entries in WebCore::textEncodingNameMap...\n", size);
  404.  
  405.     MutexLocker lock(encodingRegistryMutex());
  406.  
  407.     TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
  408.     TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
  409.     for (; it != end; ++it)
  410.         fprintf(stderr, "'%s' => '%s'\n", it->key, it->value);
  411. }
  412. #endif
  413.  
  414. } // namespace WebCore
Advertisement
Add Comment
Please, Sign In to add comment