Advertisement
Guest User

Language name search - code point buckets

a guest
Jul 19th, 2012
257
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
PHP 3.00 KB | None | 0 0
  1. <?php
  2. /**
  3.  * Script to create the language data in JSON format for ULS.
  4.  *
  5.  * Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
  6.  * Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
  7.  * contributors. See CREDITS for a list.
  8.  *
  9.  * UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don’t
  10.  * have to do anything special to choose one license or the other and you don’t
  11.  * have to notify anyone which license you are using. You are free to use
  12.  * UniversalLanguageSelector in commercial projects as long as the copyright
  13.  * header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
  14.  *
  15.  * @file
  16.  * @ingroup Extensions
  17.  * @licence GNU General Public Licence 2.0 or later
  18.  * @licence MIT License
  19.  */
  20.  
  // Standard boilerplate to define $IP
  if (getenv('MW_INSTALL_PATH') !== false) {
      $IP = getenv('MW_INSTALL_PATH');
  } else {
      // Fall back to a path relative to this script.
      // NOTE(review): assumes the script sits three directories below the
      // MediaWiki root -- confirm against the extension layout.
      $dir = __DIR__;
      $IP = "$dir/../../..";
  }
  require_once ("$IP/maintenance/commandLine.inc");

  // Iterated below as language code => name (presumably each language's
  // own name for itself -- verify against Language::fetchLanguageNames).
  $languages = Language::fetchLanguageNames(null, 'all');

  // $all:     language code => list of lower-cased names collected for it.
  // $buckets: bucket ID (see getBucket()) => array( name => language code ).
  $all = array();
  $buckets = array();

  foreach ($languages as $sourceCode => $autonym) {
      // Store the name as a VALUE, like the names appended in the inner loop.
      // Storing it as a key (with value true) made the bucketing loop below
      // see `true` instead of the name, so this name was never searchable.
      // NOTE: strtolower() is byte-wise; it is kept deliberately so the
      // stored names stay comparable with what the lookup code expects.
      $all[$sourceCode][] = strtolower($autonym);

      // Iterated as language code => name; presumably the names of other
      // languages written in $sourceCode -- TODO confirm the meaning of the
      // (0, 2) arguments against LanguageNames::getNames().
      $translatedNames = LanguageNames::getNames($sourceCode, 0, 2);

      // Distinct loop variables: the inner keys are the codes of the
      // languages being named, not the language the names are written in.
      foreach ($translatedNames as $targetCode => $translatedName) {
          $all[$targetCode][] = strtolower($translatedName);
      }
  }

  // Spread every collected name into its bucket. When two codes share a
  // name, the one processed last wins because the name is the array key.
  foreach ($all as $code => $names) {
      foreach ($names as $name) {
          $buckets[getBucket($name)][$name] = $code;
      }
  }
  48.  
  /**
   * Return the ID of the bucket a language name belongs to.
   *
   * The ID is the code point reported by getCodepoint() for the name,
   * reduced modulo 1000, so it is always in the range 0..999. Values below
   * 1000 are unchanged by the modulo, which is why no separate branch is
   * needed for them.
   *
   * This function only computes the ID. It does not touch the script-level
   * $buckets array: a function body has its own variable scope, so the
   * earlier "create the bucket if missing" code only wrote to a throw-away
   * local variable.
   *
   * @param string $name Language name to classify.
   * @return int Bucket ID; 0 when getCodepoint() reports no code point
   *             (for example for an empty name).
   */
  function getBucket($name) {
      // Cast first so a null code point becomes 0 instead of leaking out
      // as a null bucket ID.
      return ((int) getCodepoint($name)) % 1000;
  }
  61.  
  /**
   * Return the Unicode code point of the first character of a UTF-8 string.
   *
   * Handles 1-, 2-, 3- and 4-byte sequences. The previous version treated
   * every lead byte >= 0xE0 as the start of a 3-byte sequence, so
   * characters outside the Basic Multilingual Plane were decoded to the
   * wrong value.
   *
   * @param string $str UTF-8 encoded text.
   * @return int|null Code point of the first character, or null when the
   *                  string is empty or does not start with a complete,
   *                  well-formed UTF-8 sequence.
   */
  function getCodepoint($str) {
      $length = strlen($str);
      if ($length === 0) {
          return null;
      }

      $lead = ord($str[0]);
      if ($lead < 0x80) {
          // Plain ASCII: the byte is the code point.
          return $lead;
      }

      // The lead byte tells how many continuation bytes follow and which
      // of its own low bits carry payload.
      if ($lead >= 0xC0 && $lead < 0xE0) {
          $continuationBytes = 1;
          $codepoint = $lead & 0x1F;
      } elseif ($lead >= 0xE0 && $lead < 0xF0) {
          $continuationBytes = 2;
          $codepoint = $lead & 0x0F;
      } elseif ($lead >= 0xF0 && $lead < 0xF8) {
          $continuationBytes = 3;
          $codepoint = $lead & 0x07;
      } else {
          // A continuation byte (0x80-0xBF) or an invalid lead (>= 0xF8)
          // cannot start a character.
          return null;
      }

      if ($length <= $continuationBytes) {
          // Sequence is cut off before its last byte.
          return null;
      }

      for ($i = 1; $i <= $continuationBytes; $i++) {
          $byte = ord($str[$i]);
          if (($byte & 0xC0) !== 0x80) {
              // Not of the form 10xxxxxx, so the sequence is malformed.
              return null;
          }
          // Each continuation byte contributes its low six bits.
          $codepoint = ($codepoint << 6) | ($byte & 0x3F);
      }

      return $codepoint;
  }
  echo "Total Buckets  " . count($buckets) . "\n";

  // Write the buckets to disk and read them straight back, so the lookup
  // below runs against exactly what a consumer of langnames.ser would load.
  if (file_put_contents('langnames.ser', serialize($buckets)) === false) {
      fwrite(STDERR, "Could not write langnames.ser\n");
      exit(1);
  }
  $serializedBuckets = file_get_contents('langnames.ser');
  if ($serializedBuckets === false) {
      fwrite(STDERR, "Could not read langnames.ser\n");
      exit(1);
  }
  // The payload was produced by this script a moment ago. Never point
  // unserialize() at a file that may come from an untrusted source.
  $buckets = unserialize($serializedBuckets);
  if (!is_array($buckets)) {
      fwrite(STDERR, "langnames.ser does not contain a bucket array\n");
      exit(1);
  }
  // ===========================
  // microtime(true) returns a float number of seconds. Without the
  // argument it returns a "usec sec" string, which cannot be subtracted
  // reliably.
  $start = microtime(true);
  $search = "ഹിന്ദി";
  $bucketId = getBucket($search);
  // A name whose bucket was never created must not raise an
  // undefined-index warning.
  $bucket = isset($buckets[$bucketId]) ? $buckets[$bucketId] : array();
  // Each bucket maps name => language code, so a direct key lookup
  // replaces the linear scan.
  if (isset($bucket[$search])) {
      $code = $bucket[$search];
      $end = microtime(true);
      echo "$search is language " . $code ;
      // Convert seconds to milliseconds to match the printed unit.
      echo " (Time taken: ". (($end - $start) * 1000) . "ms)\n";
  } else {
      echo "$search was not found\n";
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement