<?php
// ---- user configuration ----
$startswith = "\tcharset_table = "; // text that opens the first output line
$linewidth = 100; // maximum number of characters per output line
$tabwidth = 8; // display width of one tab character
$tabsindent = 2; // number of tabs used to indent a continuation line
$padto = 3; // minimum number of hex digits per codepoint literal
// ---- values derived from the configuration, used when writing output ----
$leadwidth = $tabwidth * $tabsindent; // width of the continuation indent
$leadstr = str_repeat("\t", $tabsindent); // the continuation indent itself
// ---- runtime environment ----
ini_set('mbstring.func_overload', 0);
ini_set('default_charset', 'UTF-8' );
// Given a hex string argument, return a unicode character literal
// in the format used in a sphinx config file's charset_table part.
//   $s: a string ending in hex digits, e.g. '41', '0x41' or '0x0a41'
//   returns: 'U+' followed by the trailing hex digits with leading zeros
//     stripped and then left-padded with '0' to at least $padto digits
//     (global setting); $s itself, unchanged, if it does not end in a
//     hex digit
function u($s)
{
global $padto;
// take the run of hex digits at the end of the string; anything before it
// (such as a '0x' prefix) is ignored
if ( !preg_match('/[0-9a-fA-F]+$/', (string) $s, $m) ) {
return $s;
}
// drop leading zeros, but keep one digit so that codepoint zero is
// formatted like any other instead of falling through unformatted
$digits = ltrim($m[0], '0');
if ( $digits === '' ) {
$digits = '0';
}
return 'U+' . str_pad($digits, (int) $padto, "0", STR_PAD_LEFT);
}
// input is from stdin: collect every line (line endings included) in $lines
$lines = array();
// fgets() returns false at end of input or on a read error; stop there
// rather than storing the false as if it were a line
while ( ($line = fgets(STDIN)) !== false ) {
$lines[] = $line;
}
// nothing was read, so there is nothing to convert
if ( !$lines ) {
exit();
}
// Extract the codepoint list from one input line.
//   $line: a line of input, with or without its line ending
//   returns: the hex codepoint strings found after the last tab character
//     (one element for a single codepoint, several for a comma separated
//     list), or an empty array if the line does not have that format
function line_codepoints($line)
{
if ( !is_string($line) ) {
return array();
}
// drop the line ending; a leftover "\r" from CRLF input would otherwise
// stay attached to the last codepoint and make the line look malformed
$line = rtrim($line, "\r\n");
// the line must have something, a tab, and then something more
if ( !preg_match('/^(.+)\t(.+)$/u', $line, $m) ) {
return array();
}
// the part after the tab must be one or more comma separated hex codepoints
if ( !preg_match('/^[0-9a-f]{1,5}(,[0-9a-f]{1,5})*$/', $m[2]) ) {
return array();
}
return explode(',', $m[2]);
}

// sets of characters collated with equal value, to be folded by sphinx
$sets = array();
// $singles are terminal characters sphinx will index
$singles = array();
// parse each input line
foreach ( $lines as $line ) {
$codes = line_codepoints($line);
if ( count($codes) === 1 ) {
// a single hex codepoint is a singleton, add it to the list of singles
$singles[] = '0x' . $codes[0];
} elseif ( count($codes) > 1 ) {
// several codepoints are a set of chars to be folded to the first of them
$sets[] = $codes;
}
}
// add the folding targets to singles
foreach ( $sets as $codes ) {
$singles[] = '0x' . $codes[0];
}
// encode the rules for sphinx. do the singles (folding targets) first

// Group an ascending list of distinct integers into maximal runs of
// consecutive values.
//   $codepoints: list of integers, sorted ascending, without duplicates
//   returns: list of array(first, last) pairs, one per run; a value with
//     no neighbours yields a pair whose first and last are equal
function codepoint_runs($codepoints)
{
$runs = array();
foreach ( $codepoints as $cp ) {
$tail = count($runs) - 1;
if ( $tail >= 0 && $runs[$tail][1] + 1 === $cp ) {
// $cp directly follows the current run: extend it
$runs[$tail][1] = $cp;
} else {
// gap (or first value): start a new run
$runs[] = array($cp, $cp);
}
}
return $runs;
}

// $singles holds strings such as '0x41'. since PHP 7 such hex strings are
// no longer treated as numbers, so sorting them or doing arithmetic on them
// gives wrong results; convert them to integers first. hexdec() accepts the
// '0x' prefix.
$codepoints = array();
foreach ( $singles as $code ) {
$codepoints[] = (int) hexdec($code);
}
// the same codepoint may have been collected more than once
$codepoints = array_unique($codepoints);
sort($codepoints);
// collect the rules in $t
$t = array();
foreach ( codepoint_runs($codepoints) as $range ) {
list($first, $last) = $range;
if ( $last - $first >= 2 ) {
// three or more consecutive codepoints: use sphinx's .. notation,
// e.g.: 'U+041..U+05A'
$t[] = u(dechex($first)) . ".." . u(dechex($last));
} else {
// one or two codepoints: list them individually
for ( $cp = $first; $cp <= $last; $cp++ ) {
$t[] = u(dechex($cp));
}
}
}
// now encode the folding rules: within each set, every codepoint after the
// first is mapped onto the first one
foreach ( $sets as $codes ) {
$target = u($codes[0]);
foreach ( array_slice($codes, 1) as $code ) {
$t[] = u($code) . '->' . $target;
}
}
// is there anything to output?
if ( !$t ) {
exit(0);
}
// format output for the sphinx config file
print($startswith);
// width of the first line so far, in display columns: a tab advances to the
// next tab stop, which matches how $leadwidth counts tabs on continuation
// lines
$tab = max(1, (int) $tabwidth);
$w = 0;
for ( $k = 0, $n = strlen($startswith); $k < $n; $k++ ) {
$w += $startswith[$k] === "\t" ? $tab - ($w % $tab) : 1;
}
// print the rules separated by ', ', starting a continuation line (backslash,
// newline, indent) whenever the next rule would exceed $linewidth. the last
// rule goes through the same width check; it just gets no separator.
$remaining = count($t);
foreach ( $t as $s ) {
if ( --$remaining > 0 ) {
$s .= ', ';
}
if ( $w + strlen($s) > $linewidth ) {
print("\\\n$leadstr");
$w = $leadwidth;
}
print($s);
$w += strlen($s);
}
print("\n");
?>