<?php
// ---- User-adjustable settings ----------------------------------------------
// Text printed in front of the first rule on the first output line.
$startswith = "\tcharset_table = ";
// Upper limit on the number of characters per output line.
$linewidth = 100;
// Width of one tab character, in columns.
$tabwidth = 8;
// Number of tabs that indent each continuation line.
$tabsindent = 2;
// Minimum number of hex digits in a codepoint literal.
$padto = 3;

// ---- Values derived from the settings above --------------------------------
// Columns occupied by the indent of a continuation line.
$leadwidth = $tabwidth * $tabsindent;
// The indent itself: $tabsindent tab characters.
$leadstr = str_repeat("\t", $tabsindent);

// ---- Runtime environment ----------------------------------------------------
ini_set('mbstring.func_overload', 0);
ini_set('default_charset', 'UTF-8');
/**
 * Convert a hexadecimal codepoint string into a sphinx config "U+XXX" literal.
 *
 * The run of hex digits at the end of $s is taken as the codepoint, so both
 * "41" and "0x41" are accepted. Leading zeros are dropped and the digits are
 * then left-padded with "0" to at least $padto digits (global setting).
 *
 * @param string $s hex codepoint, optionally preceded by other text such as "0x"
 * @return string the "U+..." literal, or $s unchanged when it does not end in
 *                hex digits
 */
function u($s) {
    global $padto;
    if ( !preg_match('/[0-9a-fA-F]+$/', $s, $m) ) {
        return $s;
    }
    // Strip leading zeros. An all-zero run becomes '' here and is padded back
    // to zeros below, so codepoint 0 yields e.g. "U+000" instead of being
    // passed through unconverted.
    $digits = ltrim($m[0], '0');
    // Pad to at least one digit so that codepoint 0 never collapses to "U+".
    return 'U+' . str_pad($digits, max(1, (int) $padto), '0', STR_PAD_LEFT);
}
// Read the whole input file from stdin, one array element per line.
// fgets() returns false at end of input (or on a read error); stop there
// instead of storing that false as a bogus "line", so that $lines really is
// empty when there was no input.
$lines = array();
while ( ($line = fgets(STDIN)) !== false )
    $lines[] = $line;
if ( !$lines ) // need at least one line
    exit();
// Parse the input lines into:
//   $sets    - one array of hex codepoint strings per folding set; element 0
//              is later used as the target the other elements fold to
//   $singles - '0x'-prefixed hex codepoints that sphinx will index as-is
// A useful line has the form "<anything><TAB><codes>", where <codes> is
// either one lowercase hex codepoint (1-5 digits) or a comma-separated list
// of them. Lines in any other form are ignored.
$sets = array(); // sets of chars collated as the same
$singles = array(); // terminal chars sphinx will index
foreach ( $lines as $line ) {
    if ( !is_string($line) )
        continue; // e.g. a false stored when fgets() hit end of input
    // Drop the line terminator first so that CRLF input is not rejected
    // because of a stray "\r" after the last codepoint.
    $line = rtrim($line, "\r\n");
    if ( !preg_match('/^(.+)\t(.+)$/u', $line, $m) )
        continue; // no tab-separated pair of fields on this line
    $field = $m[2]; // the part after the last tab
    if ( preg_match('/^[0-9a-f]{1,5}$/', $field) ) {
        // a single hex codepoint: it's a single, add it to them
        $singles[] = '0x' . $field;
    } elseif ( preg_match('/^[0-9a-f]{1,5}(,[0-9a-f]{1,5})+$/', $field) ) {
        // a comma-separated list of codepoints: add it to the sets
        $sets[] = explode(',', $field);
    }
}
// The first member of every set is the folding target, so it has to be an
// indexed character itself.
foreach ( $sets as $codes )
    $singles[] = '0x' . $codes[0];
/**
 * Collapse a sorted list of distinct integers into inclusive runs.
 *
 * @param array $codepoints integers in ascending order, without duplicates
 * @return array list of array(first, last) pairs; a value that has no
 *               consecutive neighbour yields a pair with first equal to last
 */
function codepoint_runs(array $codepoints) {
    $runs = array();
    foreach ( $codepoints as $cp ) {
        $tail = count($runs) - 1;
        if ( $tail >= 0 && $runs[$tail][1] + 1 === $cp )
            $runs[$tail][1] = $cp; // extends the current run
        else
            $runs[] = array($cp, $cp); // starts a new run
    }
    return $runs;
}

// encode the rules for sphinx. do the singles (folding targets) first
//
// $singles holds '0x'-prefixed hex strings. Since PHP 7 such strings are no
// longer treated as numbers, so sorting them or doing arithmetic on them
// directly gives wrong results. Convert them to integers first. Duplicates
// (e.g. a folding target that is also listed as a single) are removed so
// they cannot break up a run.
$codepoints = array();
foreach ( $singles as $hex )
    $codepoints[] = (int) hexdec(preg_replace('/^0x/i', '', $hex));
$codepoints = array_unique($codepoints);
sort($codepoints);

$t = array(); // collect the rules in $t
foreach ( codepoint_runs($codepoints) as $r ) {
    list($first, $last) = $r;
    if ( $last - $first >= 2 ) {
        // three or more consecutive codepoints: use sphinx's .. notation,
        // e.g.: 'U+041..U+05A'
        $t[] = u(dechex($first)) . '..' . u(dechex($last));
    } else {
        // one codepoint, or a pair that is listed individually
        $t[] = u(dechex($first));
        if ( $last !== $first )
            $t[] = u(dechex($last));
    }
}
// Next, the folding rules: within each set, every member after the first
// gets a "member->target" rule, the target being the set's first member.
foreach ( $sets as $members ) {
    $target = u($members[0]);
    foreach ( array_slice($members, 1) as $source ) {
        $t[] = u($source) . '->' . $target;
    }
}
if ( !$t )
    exit(0); // no rules, nothing to write
// Format the output for the sphinx config file: the rules are written as a
// comma-separated list after $startswith. Whenever the next rule would push
// the current line past $linewidth, the line is continued with a
// backslash-newline followed by $leadstr.
print($startswith);
// NOTE(review): a tab in $startswith counts as one character here, whereas
// $leadwidth counts every indent tab as $tabwidth columns - confirm which
// measure is intended for the first line.
$w = strlen($startswith);
$last = array_pop($t); // the final rule is written without a trailing ', '
foreach ( $t as $s ) {
    $s .= ', ';
    if ( $w + strlen($s) > $linewidth ) {
        print("\\\n$leadstr");
        $w = $leadwidth;
    }
    print($s);
    $w += strlen($s);
}
// The final rule is subject to the same width limit as all the others.
if ( $w + strlen($last) > $linewidth )
    print("\\\n$leadstr");
print("$last\n");
?>