<?php /*
Guest

collation_2_charset_table-1.php

By: thefsb on Mar 14th, 2011  |  syntax: PHP  |  size: 3.70 KB  |  hits: 273  |  expires: Never
download  |  raw  |  embed  |  report abuse
Copied
*/ ?>
  1. <?php
  2.  
  3.     // user configuration stuff
  4.     $startswith = "\tcharset_table = ";     // how to begin th first output line
  5.     $linewidth = 100;                       // limits num. chars per output line
  6.     $tabwidth = 8;                          // how wide are your tabs?
  7.     $tabsindent = 2;                        // num. tabs to indent a continuation line
  8.     $padto = 3;                             // min. num. hex digits per codepoint literal
  9.  
  10.     // figure  from config useful values for data output
  11.     $leadstr = str_repeat("\t", $tabsindent);
  12.     $leadwidth = $tabsindent*($tabwidth);
  13.  
  14.     ini_set('default_charset', 'UTF-8' );
  15.     ini_set('mbstring.func_overload', 0);
  16.  
  17.     function u($s) {
  18.         // given a hex string argument, return a sphinx config unicode char litieral
  19.         global $padto;
  20.         return preg_match('/[1-9a-fA-F][0-9a-fA-F]*$/',$s,$m)
  21.           ? 'U+' . str_pad($m[0], $padto, "0", STR_PAD_LEFT)
  22.           : $s;
  23.     }
  24.  
  25.     while ( ! feof(STDIN) )                 // read input file form stdin
  26.       $lines[] = fgets(STDIN);
  27.     if ( !$lines )                          // need at least one line
  28.         exit();
  29.  
  30.     $sets = array();                        // sets of chars collated as the same
  31.     $singles = array();                     // terminal chars sphinx will index
  32.     foreach ( $lines as $line )
  33.         if ( preg_match(                    // search each input line
  34.                 '/^(.+)\t(.+)$/u',          // for a tab
  35.                 $line, $m) )
  36.             if ( preg_match(                // if the part after the tab
  37.                     '/^[0-9a-f]{1,5}$/',    // a single hex codepoint‚…
  38.                     $m[2] ) )
  39.                 $singles[] = '0x' . $m[2];  // ‚…then it's a single, add it to them.
  40.             elseif ( preg_match(            // if a comma separatred list of codepoints‚…
  41.                         '/^[0-9a-f]{1,5}(,[0-9a-f]{1,5})+$/',
  42.                         $m[2] ) )
  43.                 $sets[] = preg_split('/,/', $m[2]); // ‚…add it to the sets.
  44.     foreach ( $sets as $codes )
  45.         $singles[] = '0x' . $codes[0];      // add the folding targets to singles
  46.  
  47.  
  48.     // encode the rules for sphinx. do the singles (folding targets) first
  49.     sort($singles);
  50.     $run = false;
  51.     $t = array();                           // collect folding rules in $t
  52.     $s =  u($singles[0]);
  53.     for ( $i = 1; $i < count($singles) - 1; $i++ ) {
  54.         // detect runs of consecutive codeponts and use sphinx's .. notation,
  55.         // e.g.: 'U+041..U+05A'
  56.         if ( $run ) {
  57.             if ( $singles[$i] != $singles[$i+1] - 1 ) {
  58.                 $s .= ".." . u($singles[$i]);
  59.                 $run = false;
  60.             }
  61.         } else {
  62.             $run = $singles[$i] == $singles[$i-1] + 1
  63.                 && $singles[$i] == $singles[$i+1] - 1;
  64.             if ( !$run ) {
  65.                 $t[] = $s;
  66.                 $s = u($singles[$i]);
  67.             }
  68.         }
  69.     }
  70.     if ( $run ) {
  71.         $s .= ".." . u($singles[$i]);
  72.         $t[] = $s;
  73.     } else {
  74.         $t[] = u($singles[$i]);
  75.     }
  76.  
  77.     // now encode the folding rules
  78.     foreach ( $sets as $codes ) {
  79.         $to = u($codes[0]);
  80.         for ( $i = 1; $i < count($codes); ++$i )
  81.             $t[] = u($codes[$i]) . "->$to";
  82.     }
  83.  
  84.     if ( !$t )
  85.         exit(0);
  86.  
  87.     // format output for the sphinx config file
  88.     print($startswith);
  89.     $w = strlen($startswith);
  90.     $last = array_pop($t);
  91.     foreach ( $t as $s ) {
  92.         $s .= ', ';
  93.         if ( $w + strlen($s) > $linewidth ) {
  94.             print("\\\n$leadstr");
  95.             $w = $leadwidth;
  96.         }
  97.         print($s);
  98.         $w += strlen($s);
  99.     }
  100.     print("$last\n");
  101.  
  102. ?>