Advertisement
Guest User

SACA-K\

a guest
Nov 23rd, 2014
114
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 12.02 KB | None | 0 0
  1. // Author: Ge Nong,  Email: issng@mail.sysu.edu.cn
  2. // Department of Computer Science, Sun Yat-sen University,
  3. // Guangzhou, China
  4. // Date: December 24, 2012
  5. //
  6. // This is the demo source code for the algorithm SACA-K presented in this article:
  7. // G. Nong, Practical Linear-Time O(1)-Workspace Suffix Sorting for Constant Alphabets,
  8. // ACM Transactions on Information Systems, Scheduled to Appear in July 2013.
  9. // A draft for this article can be retrieved from http://code.google.com/p/ge-nong/.
  10.  
  11. #include <stdlib.h>
  12.  
  13. // set only the highest bit as 1, i.e. 1000...
  14. const unsigned int EMPTY=((unsigned int)1)<<(sizeof(unsigned int)*8-1);
  15.  
  16. // get s[i] at a certain level
  17. #define chr(i) ((level==0)?((unsigned char *)s)[i]:((int *)s)[i])
  18.  
  19. void getBuckets(unsigned char *s,
  20.   unsigned int *bkt, unsigned int n,
  21.   unsigned int K, bool end) {
  22.   unsigned int i, sum=0;
  23.  
  24.   // clear all buckets .
  25.   for(i=0; i<K; i++) bkt[i]=0;
  26.  
  27.   // compute the size of each bucket .
  28.   for(i=0; i<n; i++) bkt[s[i]]++;
  29.  
  30.   for(i=0; i<K; i++) {
  31.     sum+=bkt[i];
  32.     bkt[i]=end ? sum-1 : sum-bkt[i];
  33.   }
  34. }
  35.  
  36. void putSuffix0(unsigned int *SA,
  37.   unsigned char *s, unsigned int *bkt,
  38.   unsigned int n, unsigned int K, int n1) {
  39.   unsigned int i, j;
  40.  
  41.   // find the end of each bucket.
  42.   getBuckets(s, bkt, n, K, true);
  43.  
  44.   // put the suffixes into their buckets.
  45.   for(i=n1-1; i>0; i--) {
  46.     j=SA[i]; SA[i]=0;
  47.     SA[bkt[s[j]]--]=j;
  48.   }
  49.   SA[0]=n-1; // set the single sentinel suffix.
  50. }
  51.  
  52. void induceSAl0(unsigned int *SA,
  53.   unsigned char *s, unsigned int *bkt,
  54.   unsigned int n, unsigned int K, bool suffix) {
  55.   unsigned int i, j;
  56.  
  57.   // find the head of each bucket.
  58.   getBuckets(s, bkt, n, K, false);
  59.  
  60.   bkt[0]++; // skip the virtual sentinel.
  61.   for(i=0; i<n; i++)
  62.     if(SA[i]>0) {
  63.       j=SA[i]-1;
  64.       if(s[j]>=s[j+1]) {
  65.         SA[bkt[s[j]]]=j;
  66.         bkt[s[j]]++;
  67.         if(!suffix && i>0) SA[i]=0;
  68.       }
  69.     }
  70. }
  71.  
  72. void induceSAs0(unsigned int *SA,
  73.   unsigned char *s, unsigned int *bkt,
  74.   unsigned int n, unsigned int K, bool suffix) {
  75.   unsigned int i, j;
  76.  
  77.   // find the end of each bucket.
  78.   getBuckets(s, bkt, n, K, true);
  79.  
  80.   for(i=n-1; i>0; i--)
  81.     if(SA[i]>0) {
  82.       j=SA[i]-1;
  83.       if(s[j]<=s[j+1] && bkt[s[j]]<i) {
  84.         SA[bkt[s[j]]]=j;
  85.         bkt[s[j]]--;
  86.         if(!suffix) SA[i]=0;
  87.       }
  88.     }
  89. }
  90.  
  91. void putSubstr0(unsigned int *SA,
  92.   unsigned char *s, unsigned int *bkt,
  93.   unsigned int n, unsigned int K) {
  94.   unsigned int i, cur_t, succ_t;
  95.  
  96.   // find the end of each bucket.
  97.   getBuckets(s, bkt, n, K, true);
  98.  
  99.   // set each item in SA as empty.
  100.   for(i=0; i<n; i++) SA[i]=0;
  101.  
  102.   succ_t=0; // s[n-2] must be L-type.
  103.   for(i=n-2; i>0; i--) {
  104.     cur_t=(s[i-1]<s[i] ||
  105.            (s[i-1]==s[i] && succ_t==1)
  106.           )?1:0;
  107.     if(cur_t==0 && succ_t==1) SA[bkt[s[i]]--]=i;
  108.     succ_t=cur_t;
  109.   }
  110.  
  111.   // set the single sentinel LMS-substring.
  112.   SA[0]=n-1;
  113. }
  114.  
  115. void putSuffix1(int *SA, int *s, int n1) {
  116.   int i, j, pos, cur, pre=-1;
  117.  
  118.   for(i=n1-1; i>0; i--) {
  119.     j=SA[i]; SA[i]=EMPTY;
  120.     cur=s[j];
  121.     if(cur!=pre) {
  122.       pre=cur; pos=cur;
  123.     }
  124.     SA[pos--]=j;
  125.   }
  126. }
  127.  
  128. void induceSAl1(int *SA, int *s,
  129.   int n, bool suffix) {
  130.   int h, i, j, step=1;
  131.  
  132.   for(i=0; i<n; i+=step) {
  133.     step=1; j=SA[i]-1;
  134.     if(SA[i]<=0) continue;
  135.     int c=s[j], c1=s[j+1];
  136.     bool isL=c>=c1;
  137.     if(!isL) continue;
  138.  
  139.     // s[j] is L-type.
  140.  
  141.     int d=SA[c];
  142.     if(d>=0) {
  143.       // SA[c] is borrowed by the left
  144.       //   neighbor bucket.
  145.       // shift-left the items in the
  146.       //   left neighbor bucket.
  147.       int foo, bar;
  148.       foo=SA[c];
  149.       for(h=c-1; SA[h]>=0||SA[h]==EMPTY; h--)
  150.       { bar=SA[h]; SA[h]=foo; foo=bar; }
  151.       SA[h]=foo;
  152.       if(h<i) step=0;
  153.  
  154.       d=EMPTY;
  155.     }
  156.  
  157.     if(d==EMPTY) { // SA[c] is empty.
  158.       if(c<n-1 && SA[c+1]==EMPTY) {
  159.         SA[c]=-1; // init the counter.
  160.         SA[c+1]=j;
  161.       }
  162.       else        
  163.         SA[c]=j; // a size-1 bucket.
  164.     }
  165.     else { // SA[c] is reused as a counter.
  166.         int pos=c-d+1;
  167.         if(pos>n-1 || SA[pos]!=EMPTY) {
  168.           // we are running into the right
  169.           //   neighbor bucket.
  170.           // shift-left one step the items
  171.           //   of bucket(SA, S, j).
  172.           for(h=0; h<-d; h++)
  173.             SA[c+h]=SA[c+h+1];
  174.           pos--;
  175.           if(c<i) step=0;
  176.         }
  177.         else
  178.           SA[c]--;
  179.  
  180.         SA[pos]=j;
  181.     }
  182.  
  183.     int c2;
  184.     bool isL1=(j+1<n-1) && (c1>(c2=s[j+2]) || (c1==c2 && c1<i));  // is s[SA[i]] L-type?
  185.     if((!suffix || !isL1) && i>0) {
  186.       int i1=(step==0)?i-1:i;
  187.       SA[i1]=EMPTY;
  188.     }
  189.   }
  190.  
  191.   // scan to shift-left the items in each bucket
  192.   //   with its head being reused as a counter.
  193.   for(i=1; i<n; i++) {
  194.     j=SA[i];
  195.     if(j<0 && j!=EMPTY) { // is SA[i] a counter?
  196.       for(h=0; h<-j; h++)
  197.         SA[i+h]=SA[i+h+1];
  198.       SA[i+h]=EMPTY;
  199.     }
  200.   }
  201. }
  202.  
  203. void induceSAs1(int *SA, int *s,
  204.   int n, bool suffix) {
  205.   int h, i, j, step=1;
  206.  
  207.   for(i=n-1; i>0; i-=step) {
  208.     step=1; j=SA[i]-1;
  209.     if(SA[i]<=0) continue;
  210.     int c=s[j], c1=s[j+1];
  211.     bool isS=(c<c1) || (c==c1 && c>i);
  212.     if(!isS) continue;
  213.  
  214.     // s[j] is S-type
  215.  
  216.     int d=SA[c];
  217.     if(d>=0) {
  218.       // SA[c] is borrowed by the right
  219.       //   neighbor bucket.
  220.       // shift-right the items in the
  221.       //   right neighbor bucket.
  222.       int foo, bar;
  223.       foo=SA[c];
  224.       for(h=c+1; SA[h]>=0||SA[h]==EMPTY; h++)
  225.       { bar=SA[h]; SA[h]=foo; foo=bar; }
  226.       SA[h]=foo;
  227.       if(h>i) step=0;
  228.  
  229.       d=EMPTY;
  230.     }
  231.  
  232.     if(d==EMPTY) { // SA[c] is empty.
  233.       if(SA[c-1]==EMPTY) {
  234.         SA[c]=-1; // init the counter.
  235.         SA[c-1]=j;
  236.       }
  237.       else
  238.         SA[c]=j; // a size-1 bucket.
  239.     }
  240.     else { // SA[c] is reused as a counter.
  241.         int pos=c+d-1;
  242.         if(SA[pos]!=EMPTY) {
  243.           // we are running into the left
  244.           //   neighbor bucket.
  245.           // shift-right one step the items
  246.           //   of bucket(SA, S, j).
  247.           for(h=0; h<-d; h++)
  248.             SA[c-h]=SA[c-h-1];
  249.           pos++;
  250.           if(c>i) step=0;
  251.         }
  252.         else
  253.           SA[c]--;
  254.  
  255.         SA[pos]=j;
  256.     }
  257.  
  258.     if(!suffix) {
  259.       int i1=(step==0)?i+1:i;
  260.       SA[i1]=EMPTY;
  261.     }
  262.   }
  263.  
  264.   // scan to shift-right the items in each bucket
  265.   //   with its head being reused as a counter.
  266.   if(!suffix)
  267.     for(i=n-1; i>0; i--) {
  268.       j=SA[i];
  269.       if(j<0 && j!=EMPTY) { // is SA[i] a counter?
  270.         for(h=0; h<-j; h++)
  271.           SA[i-h]=SA[i-h-1];
  272.         SA[i-h]=EMPTY;
  273.       }
  274.     }
  275. }
  276.  
  277. void putSubstr1(int *SA, int *s, int n) {
  278.   int h, i, j;
  279.  
  280.   for(i=0; i<n; i++) SA[i]=EMPTY;
  281.  
  282.   int c, c1, t, t1;
  283.   c1=s[n-2];
  284.   t1=0;
  285.   for(i=n-2; i>0; i--) {
  286.     c=c1; t=t1;
  287.     c1=s[i-1];
  288.     t1=c1<c || (c1==c && t);
  289.     if(t && !t1) {
  290.       if(SA[c]>=0) {
  291.         // SA[c] is borrowed by the right
  292.         //   neighbor bucket.
  293.         // shift-right the items in the
  294.         //   right neighbor bucket.
  295.         int foo, bar;
  296.         foo=SA[c];
  297.         for(h=c+1; SA[h]>=0; h++)
  298.         { bar=SA[h]; SA[h]=foo; foo=bar; }
  299.         SA[h]=foo;
  300.  
  301.         SA[c]=EMPTY;
  302.       }
  303.  
  304.       int d=SA[c];
  305.       if(d==EMPTY) { // SA[c] is empty.
  306.         if(SA[c-1]==EMPTY) {
  307.           SA[c]=-1; // init the counter.
  308.           SA[c-1]=i;
  309.         }
  310.         else
  311.           SA[c]=i; // a size-1 bucket.
  312.       }
  313.       else { // SA[c] is reused as a counter
  314.           int pos=c+d-1;
  315.           if(SA[pos]!=EMPTY) {
  316.             // we are running into the left
  317.             //   neighbor bucket.
  318.             // shift-right one step the items
  319.             //   of bucket(SA, S, i).
  320.             for(h=0; h<-d; h++)
  321.               SA[c-h]=SA[c-h-1];
  322.             pos++;
  323.           }
  324.           else
  325.             SA[c]--;
  326.  
  327.           SA[pos]=i;
  328.       }
  329.     }
  330.   }
  331.  
  332.   // scan to shift-right the items in each bucket
  333.   //   with its head being reused as a counter.
  334.   for(i=n-1; i>0; i--) {
  335.     j=SA[i];
  336.     if(j<0 && j!=EMPTY) { // is SA[i] a counter?
  337.       for(h=0; h<-j; h++)
  338.         SA[i-h]=SA[i-h-1];
  339.       SA[i-h]=EMPTY;
  340.     }
  341.   }
  342.  
  343.   // put the single sentinel LMS-substring.
  344.   SA[0]=n-1;
  345. }
  346.  
  347. unsigned int getLengthOfLMS(unsigned char *s,
  348.   unsigned int n, int level, unsigned int x) {
  349.   if(x==n-1) return 1;  
  350.  
  351.   unsigned int dist, i=1;  
  352.   while(1) {
  353.     if(chr(x+i)<chr(x+i-1)) break;
  354.     i++;
  355.   }  
  356.   while(1) {
  357.     if(x+i>n-1 || chr(x+i)>chr(x+i-1)) break;
  358.     if(x+i==n-1 || chr(x+i)<chr(x+i-1)) dist=i;
  359.     i++;
  360.   }
  361.  
  362.   return dist+1;
  363. }
  364.  
  365. unsigned int nameSubstr(unsigned int *SA,
  366.   unsigned char *s, unsigned int *s1, unsigned int n,
  367.   unsigned int m, unsigned int n1, int level) {
  368.   unsigned int i, j, cur_t, succ_t;
  369.  
  370.   // init the name array buffer
  371.   for(i=n1; i<n; i++) SA[i]=EMPTY;
  372.  
  373.   // scan to compute the interim s1
  374.   unsigned int name, name_ctr=0;
  375.   unsigned int pre_pos, pre_len=0;
  376.   for(i=0; i<n1; i++) {
  377.     bool diff=false;
  378.     unsigned int len, pos=SA[i];
  379.  
  380.     len=getLengthOfLMS(s, n, level, pos);
  381.     if(len!=pre_len) diff=true;
  382.     else
  383.       for(unsigned int d=0; d<len; d++)
  384.         if(pos+d==n-1 || pre_pos+d==n-1 ||
  385.            chr(pos+d)!=chr(pre_pos+d)) {
  386.           diff=true; break;
  387.         }
  388.  
  389.     if(diff) {
  390.       name=i; name_ctr++;
  391.       SA[name]=1; // a new name.
  392.       pre_pos=pos; pre_len=len;
  393.     }
  394.     else
  395.       SA[name]++; // count this name.
  396.  
  397.     SA[n1+pos/2]=name;
  398.   }
  399.  
  400.   // compact the interim s1 sparsely stored
  401.   //   in SA[n1, n-1] into SA[m-n1, m-1].
  402.   for(i=n-1, j=m-1; i>=n1; i--)
  403.     if(SA[i]!=EMPTY) SA[j--]=SA[i];
  404.  
  405.   // rename each S-type character of the
  406.   //   interim s1 as the end of its bucket
  407.   //   to produce the final s1.
  408.   succ_t=1;
  409.   for(i=n1-1; i>0; i--) {
  410.     int ch=s1[i], ch1=s1[i-1];
  411.     cur_t=(ch1< ch || (ch1==ch && succ_t==1))?1:0;
  412.     if(cur_t==1) {
  413.       s1[i-1]+=SA[s1[i-1]]-1;
  414.     }
  415.     succ_t=cur_t;
  416.   }
  417.  
  418.   return name_ctr;
  419. }
  420.  
  421. void getSAlms(unsigned int *SA,
  422.   unsigned char *s,
  423.   unsigned int *s1, unsigned int n,
  424.   unsigned int n1, int level ) {
  425.   unsigned int i, j, cur_t, succ_t;
  426.  
  427.   j=n1-1; s1[j--]=n-1;
  428.   succ_t=0; // s[n-2] must be L-type
  429.   for(i=n-2; i>0; i--) {
  430.     cur_t=(chr(i-1)<chr(i) ||
  431.           (chr(i-1)==chr(i) && succ_t==1))?1:0;
  432.     if(cur_t==0 && succ_t==1) s1[j--]=i;
  433.     succ_t=cur_t;
  434.   }
  435.  
  436.   for(i=0; i<n1; i++) SA[i]=s1[SA[i]];
  437.  
  438.   // init SA[n1..n-1]
  439.   for(i=n1; i<n; i++) SA[i]=level?EMPTY:0;
  440. }
  441.  
  442.  
  443. void SACA_K(unsigned char *s, unsigned int *SA,
  444.   unsigned int n, unsigned int K,
  445.   unsigned int m, int level) {
  446.   unsigned int i;
  447.   unsigned int *bkt=NULL;
  448.  
  449.   // stage 1: reduce the problem by at least 1/2.
  450.  
  451.   if(level==0) {
  452.     bkt=(unsigned int *)malloc(sizeof(int)*K);
  453.     putSubstr0(SA, s, bkt, n, K);
  454.     induceSAl0(SA, s, bkt, n, K, false);
  455.     induceSAs0(SA, s, bkt, n, K, false);
  456.   }
  457.   else {
  458.     putSubstr1((int *)SA, (int *)s,(int)n);
  459.     induceSAl1((int *)SA, (int *)s, n ,false);
  460.     induceSAs1((int *)SA, (int *)s, n, false);
  461.   }
  462.  
  463.   // now, all the LMS-substrings are sorted and
  464.   //   stored sparsely in SA.
  465.  
  466.   // compact all the sorted substrings into
  467.   //   the first n1 items of SA.
  468.   // 2*n1 must be not larger than n.
  469.   unsigned int n1=0;
  470.   for(i=0; i<n; i++)
  471.     if((!level&&SA[i]>0) || (level&&((int *)SA)[i]>0))
  472.       SA[n1++]=SA[i];
  473.  
  474.   unsigned int *SA1=SA, *s1=SA+m-n1;
  475.   unsigned int name_ctr;
  476.   name_ctr=nameSubstr(SA,s,s1,n,m,n1,level);
  477.  
  478.   // stage 2: solve the reduced problem.
  479.  
  480.   // recurse if names are not yet unique.
  481.   if(name_ctr<n1)
  482.     SACA_K((unsigned char *)s1, SA1,
  483.           n1, 0, m-n1, level+1);
  484.   else // get the suffix array of s1 directly.
  485.     for(i=0; i<n1; i++) SA1[s1[i]]=i;
  486.  
  487.   // stage 3: induce SA(S) from SA(S1).
  488.  
  489.   getSAlms(SA, s, s1, n, n1, level);
  490.   if(level==0) {
  491.     putSuffix0(SA, s, bkt, n, K, n1);
  492.     induceSAl0(SA, s, bkt, n, K, true);
  493.     induceSAs0(SA, s, bkt, n, K, true);
  494.     free(bkt);
  495.   }
  496.   else {
  497.     putSuffix1((int *)SA, (int *)s, n1);
  498.     induceSAl1((int *)SA, (int *)s, n, true);
  499.     induceSAs1((int *)SA, (int *)s, n, true);
  500.   }
  501. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement