Advertisement
Guest User

p5-mberends-c.c

a guest
Mar 13th, 2011
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 6.18 KB | None | 0 0
  1. #include <stdio.h>   /* fgets printf stdin */
  2. #include <stdlib.h>  /* free malloc realloc */
  3. #include <string.h>  /* strlen */
  4.  
  5. /*
  6. # http://strangelyconsistent.org/blog/p5-find-the-longest-common-substring
  7. # Post-contest discussion raised the question "did anyone use the Perl 6
  8. # string Xor function (~^)?", but the answer was no.  So p5-mberends.pl
  9. # was not a contest entry, but a quick experiment to check its speed.
  10. # It uncovered several wrinkles in Parrot's and Rakudo's handling of
  11. # chr(0), and the workarounds severely reduced performance.  Fixing the
  12. # causes of the workarounds would result in a very competitive solution.
  13. #
  14. # Curiosity motivated the translation of p5-mberends.pl to C, partly
  15. # optimized by using pointers, but still based on exhaustive brute force
  16. # loops.  C does not have a string Xor function so the characters are
  17. # simply compared.  Admittedly this version cheats by using bytes as
  18. # characters, so it can give the wrong answer when fed certain non-ASCII
  19. # UTF-8 strings.  Nevertheless it shows the orders of magnitude of
  20. # performance that remain to be optimized in Rakudo.
  21. #
  22. # Algorithm: calculate offsets to effectively slide string1 over string2
  23. # one character position at a time in a loop, like:
  24. #
  25. #  string1         string1        string1       string1
  26. #        string2        string2       string2      string2     etc...
  27. #
  28. # Scan the overlapping substrings and detect runs of consecutive
  29. # matching characters.  Keep track of the position and length of the
  30. # longest run, and report it at the end.
  31. #
  32. # Execution time is quadratic, O( (N1+N2)*min(N1,N2) ) where N1 and N2
  33. # are the lengths of the two strings.  Startup time must be about 2.4ms
  34. # on a 1.6GHz Atom N270 netbook and seems to be the dominant term for
  35. # strings shorter than a few hundred characters.
  36. #
  37. # To run it 1000 times, try (with your data and program names):
  38. #
  39. #     DATA=p5-dna.data; time { N=1000; while [[ N -gt 0 ]]; do ./p5-mberends-c < $DATA; let N=N-1; done }
  40. #
  41. # Results (user time in seconds for 1000 runs, followed by 1 run each of
  42. # p5-colomon.pl p5-moritz.pl p5-fox.pl p5-mberends.pl):
  43. # Data      N1  N2       C   colomon    moritz       fox   mberends
  44. # dna       47  53   2.440    38.282    23.994    25.030    118.759
  45. # dna-100  100 100   2.468    71.844    71.564    88.102    455.580
  46. # dna-200  200 200   2.580   189.352   276.821   386.332   2167.119
  47. # dumas-1   71  84   2.468    54.403    47.363    42.451    243.171
  48. # dumas-2  132 156   2.532    91.598   154.566   132.676    866.674
  49. # dumas-3  289 660   5.684   786.629  3566.672  1766.514  11857.437
  50. #
  51. # So p5-mberends-c.c, p5-moritz.pl, p5-fox.pl and p5-mberends.pl run
  52. # in quadratic time, and p5-colomon in sub-quadratic time (kudos!).
  53. # There may an opportunity to speed up the C version by using strncmp()
  54. # or even better with strstr(), only slightly changing the algorithm,
  55. # but that was not the goal of the experiment.
  56. */
  57.  
  58. #define MALLOC_CHUNK_SIZE 10000
  59. #define min(a,b) (a<b ? a : b)
  60. #define max(a,b) (a>b ? a : b)
  61. int chomp(char *); /* defined below */
  62.  
  63. /* main */
  64. int main(int argc, char *argv[])
  65. {
  66.     char * string1, * string2, * buf1, * buf2;
  67.     int pos1, pos2, pos3, chars1 = 0, chars2 = 0;
  68.     int offset_min, offset_max, offset, overlap_length;
  69.     int run_length, best_runlength = 0, best_offset = 0;
  70.  
  71.     /* read the two strings as consecutive lines from standard input */
  72.     string1 = (char *) malloc(MALLOC_CHUNK_SIZE);
  73.     string2 = (char *) malloc(MALLOC_CHUNK_SIZE);
  74.     fgets(string1, MALLOC_CHUNK_SIZE, stdin); /* BUG: cannot read lines longer */
  75.     fgets(string2, MALLOC_CHUNK_SIZE, stdin); /* than MALLOC_CHUNK_SIZE - 1 */
  76.     chomp(string1);
  77.     chomp(string2);
  78.     chars1 = strlen(string1);
  79.     chars2 = strlen(string2);
  80.     /* printf("string1 %d chars = '%s'\n", chars1, string1); */
  81.     /* printf("string2 %d chars = '%s'\n", chars2, string2); */
  82.  
  83.  
  84.  /* # $offset is most negative when only the last character of $string1
  85.     # overlaps with the first character of $string2, and most positive when
  86.     # only the first character of $string1 overlaps with the last character
  87.     # of $string2.  Somewhere in the middle, $offset is 0 when the two
  88.     # strings overlap from the first character. */
  89.     pos1           = chars1 - 1;
  90.     pos2           = 0;
  91.     overlap_length = 1;
  92.     offset_min     = - pos1;
  93.     offset_max     = chars2 - 1;
  94.  
  95.     /* this is the outer O(N1+N2) loop */
  96.     for (offset = offset_min; offset <= offset_max; ++offset )
  97.     {
  98.         /* show a progress indicator that gets overwritten on the console */
  99.         /* printf("%3d%%\r",  ((offset - offset_min) / (offset_max - offset_min) * 100)); */
  100.         pos1 = max(-offset,0);
  101.         pos2 = max(0, offset);
  102.         overlap_length = offset < 0
  103.             ? min( chars2, (chars1-pos1) )
  104.             : min( chars1, (chars2-pos2  ) );
  105.         /* printf("offset=%d pos1=%d pos2=%d overlap_chars=%d\n", offset, pos1, pos2, overlap_chars); */
  106.         buf1 = string1 + pos1;
  107.         buf2 = string2 + pos2;
  108.         run_length = 0;
  109.  
  110.         /* this is the inner O(min(N1,N2) loop */
  111.         for (pos3=0; pos3<overlap_length; ++pos3) {
  112.             if (* buf1++ == * buf2++) {
  113.                 ++run_length;
  114.             }
  115.             else {
  116.                 if (run_length > best_runlength) {
  117.                     best_runlength = run_length;
  118.                     best_offset    = pos1 + pos3 - run_length;
  119.                 }
  120.                 run_length = 0;
  121.             }
  122.            
  123.         }
  124.         if (run_length > best_runlength) {
  125.             best_runlength = run_length;
  126.             best_offset    = pos1 + pos3 - run_length;
  127.         }
  128.     }
  129.     /* Was any common substring found at all? */
  130.     if (best_runlength>0) {
  131.         printf("longest common substring is '%.*s' (%d characters)\n",
  132.                best_runlength, string1+best_offset, best_runlength);
  133.     }
  134.     else {
  135.         printf("No commmon substring found\n");
  136.     }
  137.     free( string2 );
  138.     free( string1 );
  139.     return 0;
  140. }
  141.  
  142. /* chomp */
  143. int
  144. chomp(char * s)
  145. {
  146.     int status = 0, length;
  147.     length = strlen(s);
  148.     if ( s[length-1] == '\n' ) {
  149.         s[length-1] = '\0';
  150.         status = 1;
  151.     }
  152.     return status;
  153. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement