Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h> /* fgets printf stdin */
- #include <stdlib.h> /* free malloc realloc */
- #include <string.h> /* strlen */
- /*
- # http://strangelyconsistent.org/blog/p5-find-the-longest-common-substring
- # Post-contest discussion raised the question "did anyone use the Perl 6
- # string Xor function (~^)?", but the answer was no. So p5-mberends.pl
- # was not a contest entry, but a quick experiment to check its speed.
- # It uncovered several wrinkles in Parrot's and Rakudo's handling of
- # chr(0), and the workarounds severely reduced performance. Fixing the
- # causes of the workarounds would result in a very competitive solution.
- #
- # Curiosity motivated the translation of p5-mberends.pl to C, partly
- # optimized by using pointers, but still based on exhaustive brute force
- # loops. C does not have a string Xor function so the characters are
- # simply compared. Admittedly this version cheats by using bytes as
- # characters, so it can give the wrong answer when fed certain non-ASCII
- # UTF-8 strings. Nevertheless it shows the orders of magnitude of
- # performance that remain to be optimized in Rakudo.
- #
- # Algorithm: calculate offsets to effectively slide string1 over string2
- # one character position at a time in a loop, like:
- #
- # string1 string1 string1 string1
- # string2 string2 string2 string2 etc...
- #
- # Scan the overlapping substrings and detect runs of consecutive
- # matching characters. Keep track of the position and length of the
- # longest run, and report it at the end.
- #
- # Execution time is quadratic, O( (N1+N2)*min(N1,N2) ) where N1 and N2
- # are the lengths of the two strings. Startup time must be about 2.4ms
- # on a 1.6GHz Atom N270 netbook and seems to be the dominant term for
- # strings shorter than a few hundred characters.
- #
- # To run it 1000 times, try (with your data and program names):
- #
- # DATA=p5-dna.data; time { N=1000; while [[ N -gt 0 ]]; do ./p5-mberends-c < $DATA; let N=N-1; done }
- #
- # Results (user time in seconds for 1000 runs, followed by 1 run each of
- # p5-colomon.pl p5-moritz.pl p5-fox.pl p5-mberends.pl):
- # Data N1 N2 C colomon moritz fox mberends
- # dna 47 53 2.440 38.282 23.994 25.030 118.759
- # dna-100 100 100 2.468 71.844 71.564 88.102 455.580
- # dna-200 200 200 2.580 189.352 276.821 386.332 2167.119
- # dumas-1 71 84 2.468 54.403 47.363 42.451 243.171
- # dumas-2 132 156 2.532 91.598 154.566 132.676 866.674
- # dumas-3 289 660 5.684 786.629 3566.672 1766.514 11857.437
- #
- # So p5-mberends-c.c, p5-moritz.pl, p5-fox.pl and p5-mberends.pl run
- # in quadratic time, and p5-colomon in sub-quadratic time (kudos!).
- # There may an opportunity to speed up the C version by using strncmp()
- # or even better with strstr(), only slightly changing the algorithm,
- # but that was not the goal of the experiment.
- */
- #define MALLOC_CHUNK_SIZE 10000
- #define min(a,b) (a<b ? a : b)
- #define max(a,b) (a>b ? a : b)
- int chomp(char *); /* defined below */
- /* main */
- int main(int argc, char *argv[])
- {
- char * string1, * string2, * buf1, * buf2;
- int pos1, pos2, pos3, chars1 = 0, chars2 = 0;
- int offset_min, offset_max, offset, overlap_length;
- int run_length, best_runlength = 0, best_offset = 0;
- /* read the two strings as consecutive lines from standard input */
- string1 = (char *) malloc(MALLOC_CHUNK_SIZE);
- string2 = (char *) malloc(MALLOC_CHUNK_SIZE);
- fgets(string1, MALLOC_CHUNK_SIZE, stdin); /* BUG: cannot read lines longer */
- fgets(string2, MALLOC_CHUNK_SIZE, stdin); /* than MALLOC_CHUNK_SIZE - 1 */
- chomp(string1);
- chomp(string2);
- chars1 = strlen(string1);
- chars2 = strlen(string2);
- /* printf("string1 %d chars = '%s'\n", chars1, string1); */
- /* printf("string2 %d chars = '%s'\n", chars2, string2); */
- /* # $offset is most negative when only the last character of $string1
- # overlaps with the first character of $string2, and most positive when
- # only the first character of $string1 overlaps with the last character
- # of $string2. Somewhere in the middle, $offset is 0 when the two
- # strings overlap from the first character. */
- pos1 = chars1 - 1;
- pos2 = 0;
- overlap_length = 1;
- offset_min = - pos1;
- offset_max = chars2 - 1;
- /* this is the outer O(N1+N2) loop */
- for (offset = offset_min; offset <= offset_max; ++offset )
- {
- /* show a progress indicator that gets overwritten on the console */
- /* printf("%3d%%\r", ((offset - offset_min) / (offset_max - offset_min) * 100)); */
- pos1 = max(-offset,0);
- pos2 = max(0, offset);
- overlap_length = offset < 0
- ? min( chars2, (chars1-pos1) )
- : min( chars1, (chars2-pos2 ) );
- /* printf("offset=%d pos1=%d pos2=%d overlap_chars=%d\n", offset, pos1, pos2, overlap_chars); */
- buf1 = string1 + pos1;
- buf2 = string2 + pos2;
- run_length = 0;
- /* this is the inner O(min(N1,N2) loop */
- for (pos3=0; pos3<overlap_length; ++pos3) {
- if (* buf1++ == * buf2++) {
- ++run_length;
- }
- else {
- if (run_length > best_runlength) {
- best_runlength = run_length;
- best_offset = pos1 + pos3 - run_length;
- }
- run_length = 0;
- }
- }
- if (run_length > best_runlength) {
- best_runlength = run_length;
- best_offset = pos1 + pos3 - run_length;
- }
- }
- /* Was any common substring found at all? */
- if (best_runlength>0) {
- printf("longest common substring is '%.*s' (%d characters)\n",
- best_runlength, string1+best_offset, best_runlength);
- }
- else {
- printf("No commmon substring found\n");
- }
- free( string2 );
- free( string1 );
- return 0;
- }
- /* chomp */
- int
- chomp(char * s)
- {
- int status = 0, length;
- length = strlen(s);
- if ( s[length-1] == '\n' ) {
- s[length-1] = '\0';
- status = 1;
- }
- return status;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement