Guest

collation_2_charset_table-2.php

By: thefsb on Apr 4th, 2011  |  syntax: PHP  |  size: 4.03 KB  |  hits: 172  |  expires: Never
download  |  raw  |  embed  |  report abuse
Copied
  1. <?php
  2.    
  // ---- user-tunable settings ----

  // text that opens the first output line
  $startswith = "\tcharset_table = ";

  // upper bound on the number of characters per output line
  $linewidth = 100;

  // display width of one tab, in columns
  $tabwidth = 8;

  // number of tabs used to indent each continuation line
  $tabsindent = 2;

  // minimum number of hex digits in each codepoint literal
  $padto = 3;

  // ---- values derived from the settings above ----

  // column width taken up by the continuation indent
  $leadwidth = $tabwidth * $tabsindent;

  // the continuation indent itself
  $leadstr = str_repeat("\t", $tabsindent);

  ini_set('default_charset', 'UTF-8');
  ini_set('mbstring.func_overload', 0);
  16.    
  /**
   * Format a hex codepoint as a unicode character literal in the form used
   * in the charset_table part of a sphinx config file, e.g. 'U+041'.
   *
   * Only the trailing run of hex digits in $s is used, so a leading '0x' is
   * ignored. Leading zeros are dropped and the digits are then left-padded
   * with zeros to at least $padto (global) digits.
   *
   * @param string $s hex codepoint, with or without a '0x' prefix
   * @return string the 'U+...' literal, or $s unchanged when it does not end
   *                in a hex digit
   */
  function u($s)
  {
      global $padto;

      if ( !preg_match('/[0-9a-fA-F]+$/', $s, $m) ) {
          return $s;
      }

      // an all-zero value (codepoint 0) must still keep one digit instead of
      // falling through and being returned as the raw input
      $digits = ltrim($m[0], '0');
      if ( $digits === '' ) {
          $digits = '0';
      }

      return 'U+' . str_pad($digits, $padto, '0', STR_PAD_LEFT);
  }
  26.    
  // input is from stdin: collect every line (trailing newline kept).
  // fgets() returns false at end of input, so testing its result directly
  // avoids appending a bogus false entry, and $lines is always an array.
  $lines = array();
  while ( ($line = fgets(STDIN)) !== false ) {
      $lines[] = $line;
  }

  // nothing was read, so there is nothing to convert
  if ( !$lines ) {
      exit();
  }
  34.    
  /**
   * Extract the codepoints from one input line.
   *
   * A usable line has a tab followed by either one hex codepoint or a comma
   * separated list of hex codepoints (1 to 5 digits each). The line ending,
   * LF or CRLF, is stripped first, and hex digits are accepted in either case
   * and returned in lower case.
   *
   * @param string $line one raw input line
   * @return array list of hex codepoint strings; empty when the line has no
   *               tab or the part after the tab is not a codepoint list
   */
  function collation_codes($line)
  {
      $line = rtrim($line, "\r\n");

      if ( !preg_match('/^(.+)\t(.+)$/u', $line, $m) ) {
          return array();
      }
      if ( !preg_match('/^[0-9a-f]{1,5}(?:,[0-9a-f]{1,5})*$/i', $m[2]) ) {
          return array();
      }

      return explode(',', strtolower($m[2]));
  }

  // sets of characters collated with equal value, to be folded by sphinx
  $sets = array();

  // $singles are terminal characters sphinx will index
  $singles = array();

  // parse each input line
  foreach ( $lines as $line ) {
      $codes = collation_codes($line);

      if ( count($codes) === 1 ) {
          // a single hex codepoint is a singleton
          $singles[] = '0x' . $codes[0];
      } elseif ( count($codes) > 1 ) {
          // a list is a set of chars to be folded to the first of them
          $sets[] = $codes;
      }
  }

  // add the folding targets to singles
  foreach ( $sets as $codes ) {
      $singles[] = '0x' . $codes[0];
  }
  68.  
  /**
   * Collapse a list of codepoints into ascending runs of consecutive values.
   *
   * @param array $codepoints integer codepoints, in any order; duplicates
   *                          are ignored
   * @return array list of array(first, last) pairs, both ends inclusive
   */
  function codepoint_runs($codepoints)
  {
      $codepoints = array_unique(array_map('intval', $codepoints));
      sort($codepoints, SORT_NUMERIC);

      $runs = array();
      $open = -1;     // index in $runs of the run currently being extended
      foreach ( $codepoints as $cp ) {
          if ( $open >= 0 && $cp === $runs[$open][1] + 1 ) {
              $runs[$open][1] = $cp;
          } else {
              $runs[] = array($cp, $cp);
              $open++;
          }
      }

      return $runs;
  }

  // encode the rules for sphinx. do the singles (folding targets) first.
  // $singles holds '0x'-prefixed hex strings; convert them to integers so that
  // ordering and adjacency are numeric (PHP 7+ does not treat hex strings as
  // numbers when sorting or doing arithmetic)
  $codepoints = array();
  foreach ( $singles as $single ) {
      $codepoints[] = intval($single, 16);
  }

  // collect the rules in $t
  $t = array();
  foreach ( codepoint_runs($codepoints) as $span ) {
      list($first, $final) = $span;

      if ( $final - $first >= 2 ) {
          // three or more consecutive codepoints use sphinx's .. notation,
          // e.g.: 'U+041..U+05A'
          $t[] = u('0x' . dechex($first)) . '..' . u('0x' . dechex($final));
      } else {
          // shorter runs are listed one codepoint at a time
          for ( $cp = $first; $cp <= $final; $cp++ ) {
              $t[] = u('0x' . dechex($cp));
          }
      }
  }
  106.    
  // append one "member->target" rule for every character that folds to the
  // first character of its set
  foreach ( $sets as $group ) {
      $target = u($group[0]);
      foreach ( array_slice($group, 1) as $member ) {
          $t[] = u($member) . '->' . $target;
      }
  }

  // stop quietly when no rule was produced
  if ( empty($t) ) {
      exit(0);
  }
  118.    
  /**
   * Lay out charset_table items as one logical config line, broken into
   * physical lines with backslash-newline continuations.
   *
   * Items are joined with ', '. Before an item (including the last one) that
   * would push the current line past $linewidth, a continuation is started.
   *
   * @param array  $items      rule strings to output
   * @param string $startswith text that begins the first line
   * @param int    $startwidth display width of $startswith
   * @param int    $linewidth  maximum width of an output line
   * @param string $leadstr    indent written at the start of a continuation
   * @param int    $leadwidth  display width of $leadstr
   * @return string the formatted text ending in a newline, or '' when there
   *                are no items
   */
  function wrap_charset_items($items, $startswith, $startwidth, $linewidth, $leadstr, $leadwidth)
  {
      if ( !$items ) {
          return '';
      }

      $out = $startswith;
      $w = $startwidth;
      $final = count($items) - 1;
      foreach ( array_values($items) as $n => $item ) {
          if ( $n < $final ) {
              $item .= ', ';
          }
          if ( $w + strlen($item) > $linewidth ) {
              $out .= "\\\n$leadstr";
              $w = $leadwidth;
          }
          $out .= $item;
          $w += strlen($item);
      }

      return $out . "\n";
  }

  // format output for the sphinx config file.
  // count each tab in the opening text as $tabwidth columns, the same way the
  // continuation indent width is derived from the tab settings
  $startwidth = strlen(str_replace("\t", str_repeat(' ', $tabwidth), $startswith));
  print(wrap_charset_items($t, $startswith, $startwidth, $linewidth, $leadstr, $leadwidth));
  133.    
  134. ?>