p5-mberends-c.c

#include <stdio.h>   /* fgets printf stdin */
#include <stdlib.h>  /* free malloc realloc */
#include <string.h>  /* strlen */

/*
# http://strangelyconsistent.org/blog/p5-find-the-longest-common-substring
# Post-contest discussion raised the question "did anyone use the Perl 6
# string Xor function (~^)?", but the answer was no.  So p5-mberends.pl
# was not a contest entry, but a quick experiment to check its speed.
# It uncovered several wrinkles in Parrot's and Rakudo's handling of
# chr(0), and the workarounds severely reduced performance.  Fixing the
# causes of the workarounds would result in a very competitive solution.
#
# Curiosity motivated the translation of p5-mberends.pl to C, partly
# optimized by using pointers, but still based on exhaustive brute force
# loops.  C does not have a string Xor function so the characters are
# simply compared.  Admittedly this version cheats by using bytes as
# characters, so it can give the wrong answer when fed certain non-ASCII
# UTF-8 strings.  Nevertheless it shows the orders of magnitude of
# performance that remain to be optimized in Rakudo.
#
# Algorithm: calculate offsets to effectively slide string1 over string2
# one character position at a time in a loop, like:
#
#  string1         string1        string1       string1
#        string2        string2       string2      string2     etc...
#
# Scan the overlapping substrings and detect runs of consecutive
# matching characters.  Keep track of the position and length of the
# longest run, and report it at the end.
#
# Execution time is quadratic, O( (N1+N2)*min(N1,N2) ) where N1 and N2
# are the lengths of the two strings.  Startup time must be about 2.4ms
# on a 1.6GHz Atom N270 netbook and seems to be the dominant term for
# strings shorter than a few hundred characters.
#
# To run it 1000 times, try (with your data and program names):
#
#     DATA=p5-dna.data; time { N=1000; while [[ N -gt 0 ]]; do ./p5-mberends-c < $DATA; let N=N-1; done }
#
# Results (user time in seconds for 1000 runs, followed by 1 run each of
# p5-colomon.pl p5-moritz.pl p5-fox.pl p5-mberends.pl):
# Data      N1  N2       C   colomon    moritz       fox   mberends
# dna       47  53   2.440    38.282    23.994    25.030    118.759
# dna-100  100 100   2.468    71.844    71.564    88.102    455.580
# dna-200  200 200   2.580   189.352   276.821   386.332   2167.119
# dumas-1   71  84   2.468    54.403    47.363    42.451    243.171
# dumas-2  132 156   2.532    91.598   154.566   132.676    866.674
# dumas-3  289 660   5.684   786.629  3566.672  1766.514  11857.437
#
# So p5-mberends-c.c, p5-moritz.pl, p5-fox.pl and p5-mberends.pl run
# in quadratic time, and p5-colomon in sub-quadratic time (kudos!).
# There may an opportunity to speed up the C version by using strncmp()
# or even better with strstr(), only slightly changing the algorithm,
# but that was not the goal of the experiment.
*/

#define MALLOC_CHUNK_SIZE 10000
#define min(a,b) (a<b ? a : b)
#define max(a,b) (a>b ? a : b)
int chomp(char *); /* defined below */

/* main */
int main(int argc, char *argv[])
{
    char * string1, * string2, * buf1, * buf2;
    int pos1, pos2, pos3, chars1 = 0, chars2 = 0;
    int offset_min, offset_max, offset, overlap_length;
    int run_length, best_runlength = 0, best_offset = 0;

    /* read the two strings as consecutive lines from standard input */
    string1 = (char *) malloc(MALLOC_CHUNK_SIZE);
    string2 = (char *) malloc(MALLOC_CHUNK_SIZE);
    fgets(string1, MALLOC_CHUNK_SIZE, stdin); /* BUG: cannot read lines longer */
    fgets(string2, MALLOC_CHUNK_SIZE, stdin); /* than MALLOC_CHUNK_SIZE - 1 */
    chomp(string1);
    chomp(string2);
    chars1 = strlen(string1);
    chars2 = strlen(string2);
    /* printf("string1 %d chars = '%s'\n", chars1, string1); */
    /* printf("string2 %d chars = '%s'\n", chars2, string2); */


 /* # $offset is most negative when only the last character of $string1
    # overlaps with the first character of $string2, and most positive when
    # only the first character of $string1 overlaps with the last character
    # of $string2.  Somewhere in the middle, $offset is 0 when the two
    # strings overlap from the first character. */
    pos1           = chars1 - 1;
    pos2           = 0;
    overlap_length = 1;
    offset_min     = - pos1;
    offset_max     = chars2 - 1;

    /* this is the outer O(N1+N2) loop */
    for (offset = offset_min; offset <= offset_max; ++offset )
    {
        /* show a progress indicator that gets overwritten on the console */
        /* printf("%3d%%\r",  ((offset - offset_min) / (offset_max - offset_min) * 100)); */
        pos1 = max(-offset,0);
        pos2 = max(0, offset);
        overlap_length = offset < 0
            ? min( chars2, (chars1-pos1) )
            : min( chars1, (chars2-pos2  ) );
        /* printf("offset=%d pos1=%d pos2=%d overlap_chars=%d\n", offset, pos1, pos2, overlap_chars); */
        buf1 = string1 + pos1;
        buf2 = string2 + pos2;
        run_length = 0;

        /* this is the inner O(min(N1,N2) loop */
        for (pos3=0; pos3<overlap_length; ++pos3) {
            if (* buf1++ == * buf2++) {
                ++run_length;
            }
            else {
                if (run_length > best_runlength) {
                    best_runlength = run_length;
                    best_offset    = pos1 + pos3 - run_length;
                }
                run_length = 0;
            }

        }
        if (run_length > best_runlength) {
            best_runlength = run_length;
            best_offset    = pos1 + pos3 - run_length;
        }
    }
    /* Was any common substring found at all? */
    if (best_runlength>0) {
        printf("longest common substring is '%.*s' (%d characters)\n",
               best_runlength, string1+best_offset, best_runlength);
    }
    else {
        printf("No commmon substring found\n");
    }
    free( string2 );
    free( string1 );
    return 0;
}

/* chomp */
int
chomp(char * s)
{
    int status = 0, length;
    length = strlen(s);
    if ( s[length-1] == '\n' ) {
        s[length-1] = '\0';
        status = 1;
    }
    return status;
}