SHARE
TWEET

Language name search - code point buckets

a guest Jul 19th, 2012 84 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. <?php
  2. /**
  3.  * Script to build a code-point-bucketed index of language names (PHP-serialized) for ULS language search.
  4.  *
  5.  * Copyright (C) 2012 Alolita Sharma, Amir Aharoni, Arun Ganesh, Brandon Harris,
  6.  * Niklas Laxström, Pau Giner, Santhosh Thottingal, Siebrand Mazeland and other
  7.  * contributors. See CREDITS for a list.
  8.  *
  9.  * UniversalLanguageSelector is dual licensed GPLv2 or later and MIT. You don’t
  10.  * have to do anything special to choose one license or the other and you don’t
  11.  * have to notify anyone which license you are using. You are free to use
  12.  * UniversalLanguageSelector in commercial projects as long as the copyright
  13.  * header is left intact. See files GPL-LICENSE and MIT-LICENSE for details.
  14.  *
  15.  * @file
  16.  * @ingroup Extensions
  17.  * @licence GNU General Public Licence 2.0 or later
  18.  * @licence MIT License
  19.  */
  20.  
  // Standard boilerplate to define $IP
  if (getenv('MW_INSTALL_PATH') !== false) {
          $IP = getenv('MW_INSTALL_PATH');
  } else {
          $dir = __DIR__;
          $IP = "$dir/../../..";
  }
  require_once ("$IP/maintenance/commandLine.inc");

  // Collect, per language code, a plain list of lower-cased names for that
  // language: first its own entry from Language::fetchLanguageNames(), then
  // every name reported by LanguageNames::getNames().
  $languages = Language::fetchLanguageNames(null, 'all');
  $all = array();
  $buckets = array();
  foreach ($languages as $code => $name) {
          // Store the name as a VALUE, like every other entry below. Storing it
          // as a key (with the value true) made the bucket loop index the
          // boolean true instead of the name.
          // mb_strtolower() is used because the names are UTF-8 text and
          // strtolower() only works byte-wise.
          $all[$code][] = mb_strtolower($name, 'UTF-8');

          // NOTE(review): presumably returns (language code => name of that
          // language written in $code) - confirm against LanguageNames.
          $langnames = LanguageNames::getNames($code, 0, 2);

          // Distinct variable names so the outer $code / $name are not clobbered.
          foreach ($langnames as $translatedCode => $translatedName) {
                  $all[$translatedCode][] = mb_strtolower($translatedName, 'UTF-8');
          }
  }

  // Distribute every name into its bucket. Within a bucket the name is the key
  // and the language code the value, so a name seen more than once keeps the
  // code that was written last.
  foreach ($all as $code => $names) {
          foreach ($names as $name) {
                  $bucket = getBucket($name);
                  $buckets[$bucket][$name] = $code;
          }
  }
  48.  
  /**
   * Map a name to the id of the bucket it belongs to.
   *
   * The id is the value returned by getCodepoint() for the name, reduced
   * modulo 1000, so names are spread over at most 1000 buckets.
   *
   * This function only computes the id. It does not create the bucket: a
   * function-local $buckets array would not be the script-level one, so the
   * caller is responsible for storing into $buckets.
   *
   * @param string $name Name to classify (UTF-8).
   * @return int|null Bucket id in the range 0..999, or null when
   *         getCodepoint() returns no code point (e.g. for an empty name).
   */
  function getBucket($name) {
          $codepoint = getCodepoint($name);
          if ($codepoint === null) {
                  return null;
          }
          // For code points below 1000 the modulo is the code point itself,
          // so no separate branch is needed.
          return $codepoint % 1000;
  }
  61.  
  /**
   * Decode a Unicode code point from the start of a UTF-8 encoded string.
   *
   * Bytes are read from the beginning. An ASCII byte (< 0x80) is returned
   * as-is as soon as it is met. Otherwise the first non-ASCII byte is taken
   * as the lead byte of a 2-, 3- or 4-byte sequence, the following non-ASCII
   * bytes are collected, and the sequence is decoded once it is complete.
   * Continuation bytes are not validated.
   *
   * @param string $str UTF-8 encoded text.
   * @return int|null The code point, or null when the string is empty or
   *         ends before a multi-byte sequence is complete.
   */
  function getCodepoint($str) {
          $values = array();
          $lookingFor = 1;
          $length = strlen($str);
          for ($i = 0; $i < $length; $i++) {
                  $thisValue = ord($str[$i]);
                  if ($thisValue < 0x80) {
                          return $thisValue;
                  }
                  if (count($values) === 0) {
                          // The lead byte tells how long the sequence is:
                          // 110xxxxx = 2 bytes, 1110xxxx = 3 bytes, 11110xxx = 4 bytes.
                          if ($thisValue < 0xE0) {
                                  $lookingFor = 2;
                          } elseif ($thisValue < 0xF0) {
                                  $lookingFor = 3;
                          } else {
                                  $lookingFor = 4;
                          }
                  }
                  $values[] = $thisValue;
                  if (count($values) === $lookingFor) {
                          // Payload bits of the lead byte: 5, 4 or 3 bits for
                          // 2-, 3- or 4-byte sequences respectively.
                          $number = $values[0] & (0xFF >> ($lookingFor + 1));
                          // Every following byte contributes its low 6 bits.
                          for ($j = 1; $j < $lookingFor; $j++) {
                                  $number = ($number << 6) | ($values[$j] & 0x3F);
                          }
                          return $number;
                  }
          }
          // Empty input or truncated sequence.
          return null;
  }
  /*
  Debug aid: print how many names ended up in each bucket.
  foreach ($buckets as $bucketId => $names) {
          echo "Bucket $bucketId = " . count($names) . "\n";
  }
  */
  echo "Total Buckets  " . count($buckets) . "\n";

  // Write the index to disk and read it back, so that the lookup below runs
  // against the data as it comes out of the serialized file.
  if (file_put_contents('langnames.ser', serialize($buckets)) === false) {
          fwrite(STDERR, "Could not write langnames.ser\n");
          exit(1);
  }
  $serializedBuckets = file_get_contents('langnames.ser');
  if ($serializedBuckets === false) {
          fwrite(STDERR, "Could not read langnames.ser\n");
          exit(1);
  }
  // The file was written by this very script a moment ago; unserialize() must
  // not be pointed at a file from an untrusted source.
  $buckets = unserialize($serializedBuckets);
  if (!is_array($buckets)) {
          fwrite(STDERR, "langnames.ser does not contain a bucket array\n");
          exit(1);
  }
  // ===========================
  // microtime(true) returns a float number of seconds; the plain microtime()
  // string form ("usec sec") cannot be subtracted meaningfully.
  $start = microtime(true);
  // Names in the index are lower-cased, so a search term that has upper-case
  // letters would need the same lower-casing before the lookup.
  $search = "ഹിന്ദി";
  $bucketId = getBucket($search);
  // A bucket that was never filled simply means "no match".
  $bucket = isset($buckets[$bucketId]) ? $buckets[$bucketId] : array();
  // Names are the keys of a bucket, so the lookup is a direct key access
  // rather than a scan over the whole bucket.
  $code = isset($bucket[$search]) ? $bucket[$search] : null;
  $end = microtime(true);
  if ($code !== null) {
          echo "$search is language " . $code ;
          echo " (Time taken: ". (($end - $start) * 1000). "ms)\n";
  } else {
          echo "$search was not found\n";
  }
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top