Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "bwt.h"
- void devoidBWT(){
- if(isBWTused){
- }
- }
- void printSA(char* msg,int16_t *sa,int n){
- printf("%sn",msg);
- for(int i=0; i<n; i++){
- printf("%4d -- %dn",i,sa[i]);
- }
- }
- void printArray(char* msg,Array ar,int n){
- printf("%sn",msg);
- for(int i=0; i<n; i++){
- printf("%4d -- %dn",ARRAY_INDEX(ar,i),ARRAY_GET(ar,i));
- }
- }
- void initFreqTable(Array buffer, Array freqTable, uint16_t symbolCount, int16_t k){
- printf("getCounts: n %d k %d buffer at %p freqTable %pn",symbolCount,k,buffer.arr,freqTable.arr);
- for(register uint16_t i=0; i < k; i++){
- ARRAY_SET(freqTable,i,0);
- }
- for(register uint16_t i=0; i < symbolCount; i++){
- ARRAY_UPDATE(freqTable,ARRAY_GET(buffer,i),1);
- }
- }
- void initBuckets(Array freqTable, Array suffixArrayHead, int16_t charset, bool isFront){
- uint32_t sum=0;
- printf("getBuckets: k %d isFront %d freqTable at %p suffixArrayHead %pn",charset,isFront,freqTable.arr,suffixArrayHead.arr);
- if(isFront){
- for(register uint16_t i=0; i < charset; i++){
- sum = sum + ARRAY_GET(freqTable,i);
- ARRAY_SET(suffixArrayHead,i,sum - ARRAY_GET(freqTable,i) );
- }
- }else{
- for(register uint16_t i=0; i < charset; i++){
- sum = sum + ARRAY_GET(freqTable,i);
- ARRAY_SET(suffixArrayHead,i,sum);
- }
- }
- }
- void lmsSort(Array buffer, Array freqTable, Array suffixArrayHead, uint16_t symbolCount, int16_t k){
- printf("lmsSort: n %d k %dn",symbolCount,k);
- int16_t index = symbolCount-1;
- int16_t c0,c1;
- int16_t bIndex;
- // Compute L suffixes
- if( freqTable.arr == suffixArrayHead.arr){
- initFreqTable(buffer,freqTable,symbolCount,k);
- }
- initBuckets(freqTable,suffixArrayHead,k,true);
- c1 = ARRAY_GET(buffer,index);
- bIndex = ARRAY_GET(suffixArrayHead,c1);
- index--;
- suffixArray[bIndex++] = ( (ARRAY_GET(buffer,index) < c1 ) ? ~index : index);
- for(register int16_t i=0; i<symbolCount; i++){
- index = suffixArray[i];
- if(index > 0){
- c0=ARRAY_GET(buffer,index);
- if( c0 != c1 ){
- ARRAY_SET(suffixArrayHead,c1,bIndex);
- c1 = c0;
- bIndex = ARRAY_GET(suffixArrayHead,c1);
- }
- index--;
- suffixArray[bIndex++] = ( (ARRAY_GET(buffer,index) < c1) ? ~index : index);
- suffixArray[i]=0;
- }else if (index < 0){
- suffixArray[i] = ~index;
- }
- }
- // Compute S suffixes
- if( freqTable.arr == suffixArrayHead.arr){
- initFreqTable(buffer,freqTable,symbolCount,k);
- }
- //printSA("lmsSort:head:before getCounts",freqTable.arr,k);
- initBuckets(freqTable,suffixArrayHead,k,false);
- //printSA("lmsSort:head:after getBuckets",suffixArrayHead.arr,k);
- //printSA("lmsSort:suffixArray",suffixArray,symbolCount);
- //mark
- c1=0; c0=0;
- bIndex = ARRAY_GET(suffixArrayHead,c1);
- for(register int16_t i=symbolCount-1; i >=0 ; i--){
- index = suffixArray[i];
- if(index > 0){
- c0 = ARRAY_GET(buffer,index);
- //printf("lmsSort:s: j %d c0 %c c1 %c b %dn",index,c0,c1,bIndex);
- if(c0 != c1){
- ARRAY_SET(suffixArrayHead,c1,bIndex);
- c1=c0;
- bIndex = ARRAY_GET(suffixArrayHead,c1);
- }
- index--;
- bIndex--;
- suffixArray[bIndex] = ( ( ARRAY_GET(buffer,index) > c1) ? ~(index+1) : index);
- suffixArray[i] = 0;
- }
- }
- //printSA("lmsSort:lastline:suffixArray",suffixArray,symbolCount);
- }
- int16_t lmsPost(Array buffer, uint16_t symbolCount, int16_t m){
- int16_t i, j, p, q, plen, qlen, name;
- uint8_t c0,c1;
- bool same;
- printf("lmsPost: n %d m %dn",symbolCount,m);
- // printSA("lmsPost:start",suffixArray,symbolCount);
- // Compact all sorted substrings
- for( i=0; (p = suffixArray[i]) < 0; i++ ){
- suffixArray[i] = ~p;
- }
- if(i < m){
- for(j=i, ++i;;i++){
- p = suffixArray[i];
- if( p < 0 ){
- suffixArray[j++] = ~p;
- suffixArray[i] = 0;
- if ( j == m ){
- break;
- }
- }
- }
- }
- // Store the length of all substrings
- i = symbolCount -1;
- j = symbolCount -1;
- c0 = ARRAY_GET(buffer,i);
- do{
- c1 = c0;
- }while((--i >= 0) && ((c0 = ARRAY_GET(buffer,i)) >= c1));
- for(; i >= 0; ){
- do {
- c1 = c0;
- }while((--i >= 0) && ((c0 = ARRAY_GET(buffer,i)) <= c1));
- if(i >= 0){
- suffixArray[m + ((i+1) >> 1)] = j - i;
- j = i+1;
- do {
- c1 = c0;
- }while( (--i >= 0) && ((c0 = ARRAY_GET(buffer,i)) >= c1));
- }
- }
- // Find the lexicographic names of all substrings
- for(i=0, name = 0, q = symbolCount, qlen=0; i < m; i++){
- p = suffixArray[i];
- plen = suffixArray[m + (p >> 1)];
- same=false;
- if((plen == qlen) && ((q + plen) < symbolCount)){
- for(j=0; (j < plen) && ( ARRAY_GET(buffer,p + j) == ARRAY_GET(buffer,q + j)); j++){}
- if(j == plen) {
- same = true;
- }
- }
- if(!same){
- ++name;
- q = p;
- qlen = plen;
- }
- suffixArray[m + (p >> 1)] = name;
- }
- //printf("lmsPost last line name = %dn",name);
- return name;
- }
- void induceSA(Array buffer, Array freqTable, Array suffixArrayHead, uint16_t symbolCount, int16_t k){
- int16_t bIndex, i, index;
- uint8_t c0,c1;
- printf("induceSA: n %d k %dn",symbolCount,k);
- // Compute L suffixes
- if( freqTable.arr == suffixArrayHead.arr){
- initFreqTable(buffer,freqTable,symbolCount,k);
- }
- initBuckets(freqTable,suffixArrayHead,k,true);
- index = symbolCount -1;
- c1 = ARRAY_GET(buffer,index);
- bIndex = ARRAY_GET(suffixArrayHead,c1);
- suffixArray[bIndex++] = ( (index > 0) && (ARRAY_GET(buffer,index -1) < c1) ) ? ~index : index;
- for(i=0; i<symbolCount; i++){
- index = suffixArray[i];
- suffixArray[i] = ~index;
- if(index > 0){
- c0 = ARRAY_GET(buffer,--index);
- if(c0 != c1){
- ARRAY_SET(suffixArrayHead,c1,bIndex);
- c1 = c0;
- bIndex = ARRAY_GET(suffixArrayHead,c1);
- }
- suffixArray[bIndex++] = ( (index > 0) && (ARRAY_GET(buffer,index-1) < c1) ) ? ~index : index;
- }
- }
- // Compute S suffixes
- if( freqTable.arr == suffixArrayHead.arr){
- initFreqTable(buffer,freqTable,symbolCount,k);
- }
- initBuckets(freqTable,suffixArrayHead,k,false);
- c1=0;
- bIndex = ARRAY_GET(suffixArrayHead,c1);
- for(i=symbolCount-1; i >= 0; i--){
- index = suffixArray[i];
- if( index > 0 ){
- c0 = ARRAY_GET(buffer,--index);
- if(c0 != c1){
- ARRAY_SET(suffixArrayHead,c1,bIndex);
- c1 = c0;
- bIndex = ARRAY_GET(suffixArrayHead,c1);
- }
- suffixArray[--bIndex] = ( (index == 0) || (ARRAY_GET(buffer,index-1) > c1 )) ? ~index : index;
- }else{
- suffixArray[i] = ~index;
- }
- }
- }
- int16_t sais(Array buffer, int16_t fs, uint16_t symbolCount, int16_t k){
- Array freqTable, suffixArrayHead, t_sa;
- register int16_t i, j, b, c, m, p, q, name, newfs;
- uint8_t c0, c1;
- int8_t flag=0;
- printf("sais: fs %d k %d n %dn",fs,k,symbolCount);
- if( k <= MAX_SYMBOLS){
- ALLOCATE_ARRAY(freqTable,k);
- if( k <= fs ){
- flag = 1;
- ARRAY_INIT_NEW(suffixArrayHead,symbolCount,symbolCount + fs - k,suffixArray);
- }else{
- flag = 3;
- ALLOCATE_ARRAY(suffixArrayHead,k);
- }
- }else if( k <= fs ){
- ARRAY_INIT_NEW(freqTable,symbolCount,symbolCount + fs - k,suffixArray);
- if( k <= (fs-k) ){
- flag = 0;
- ARRAY_INIT_NEW(suffixArrayHead,symbolCount,symbolCount + fs - k*2,suffixArray);
- }else if( k <= 1024 ){
- flag = 2;
- ALLOCATE_ARRAY(suffixArrayHead,k);
- }else {
- ARRAY_EQU(suffixArrayHead,freqTable);
- flag = 8;
- }
- }else{
- flag = 4 | 8;
- ALLOCATE_ARRAY(freqTable,k);
- ALLOCATE_ARRAY(suffixArrayHead,k);
- }
- // Sort all LMS substrings
- initFreqTable(buffer,freqTable,symbolCount,k);
- initBuckets(freqTable,suffixArrayHead,k,false);
- for(i=0; i<symbolCount; i++){
- suffixArray[i]=0;
- }
- b = -1 ;
- i = symbolCount - 1;
- j = symbolCount;
- m = 0;
- c0 = ARRAY_GET(buffer,i);
- do {
- c1 = c0;
- }while( (--i >= 0) && ( (c0 = ARRAY_GET(buffer,i)) >= c1));
- for(; i >= 0; ){
- do {
- c1 = c0;
- }while( ( --i >= 0 ) && ( (c0 = ARRAY_GET(buffer,i)) <= c1) );
- if (i >= 0){
- if (b >= 0){
- suffixArray[b] = j;
- }
- b = ARRAY_UPDATE(suffixArrayHead,c1,-1);
- j = i;
- m++;
- do {
- c1 = c0;
- }while ( (--i >= 0) && ( (c0 = ARRAY_GET(buffer,i)) >= c1) );
- }
- }
- //printf("m is %d flag is %dn",m,flag);
- // marking
- if (m > 1){
- lmsSort(buffer,freqTable,suffixArrayHead,symbolCount,k);
- name = lmsPost(buffer,symbolCount,m);
- }else if(m == 1){
- suffixArray[b] = j+1;
- name = 1;
- } else {
- name = 0;
- }
- // Solve the reduced problem, recurse if names are same
- if(name < m){
- //printf("sais:name < m flag=%d: freqTable %p sah %p sa %p flag %dn",flag,freqTable.arr,suffixArrayHead.arr,suffixArray,flag);
- if ( (flag & 4) != 0 ){
- FREE_ARRAY(freqTable)
- FREE_ARRAY(suffixArrayHead)
- }
- if ( (flag & 2) != 0 ){
- FREE_ARRAY(suffixArrayHead)
- }
- newfs = (symbolCount + fs) - (m * 2);
- if( (flag & (1 | 4 | 8) ) == 0 ){
- if( (k + name) <= newfs ){
- newfs = newfs - k;
- } else {
- flag = flag | 8;
- }
- }
- //mark
- for(i = m + (symbolCount >> 1) - 1, j = m * 2 + newfs - 1; m <= i; i--){
- if(suffixArray[i] != 0){
- suffixArray[j--] = suffixArray[i] - 1;
- }
- }
- //mark
- //printf("sais: before t_sa alloc sa %p sah %p freqTable %p t_sa %pn",suffixArray,suffixArrayHead.arr,freqTable.arr,t_sa.arr);
- ARRAY_INIT_NEW(t_sa,symbolCount,(m + newfs),suffixArray);
- // printf("sais: after t_sa alloc sa %p sah %p freqTable %p t_sa %pn",suffixArray,suffixArrayHead.arr,freqTable.arr,t_sa.arr);
- printf("Printing new buffer curr = %dn",*(t_sa.index));
- // before sais suffixArray is fine
- sais(t_sa,newfs,m,name);
- // printf("sais: before t_sa free .arr %p .index %pn",t_sa.arr,t_sa.index);
- FREE_ARRAY(t_sa)
- //printSA("sais:after recurse:suffixArray",suffixArray,symbolCount);
- //fail
- //puts("sais: after t_sa freen");
- i = symbolCount - 1;
- j = m * 2 - 1;
- c0 = ARRAY_GET(buffer,i);
- do {
- c1 = c0;
- }while( ( --i >= 0 ) && ( (c0 = ARRAY_GET(buffer,i)) >= c1) );
- for(; i >= 0; ){
- do {
- c1 = c0;
- }while( (--i >= 0) && ( (c0 = ARRAY_GET(buffer,i)) <= c1) );
- if( i >= 0 ){
- suffixArray[j--] = i+1;
- do {
- c1 = c0;
- }while ( (--i <= 0) && ( (c0 = ARRAY_GET(buffer,i)) >= c1) );
- }
- }
- for (i = 0 ; i < m; i++){
- suffixArray[i] = suffixArray[m + suffixArray[i]];
- }
- if( (flag & 4) != 0) {
- ALLOCATE_ARRAY(freqTable,k);
- ARRAY_EQU(suffixArrayHead,freqTable);
- }
- if( (flag & 2) != 0){
- ALLOCATE_ARRAY(suffixArrayHead,k);
- }
- }
- // induce the result for the original problem
- if( (flag & 8) != 0 ) {
- initFreqTable(buffer,freqTable,symbolCount,k);
- }
- // Put all S* characters into their buckets
- if (m > 1){
- initBuckets(freqTable,suffixArrayHead,k,false);
- i = m - 1;
- j = symbolCount;
- p = suffixArray[i];
- c1 = ARRAY_GET(buffer,p);
- do {
- c0 = c1;
- q = ARRAY_GET(suffixArrayHead,c0);
- while( q < j ){
- suffixArray[--j] = 0;
- }
- do {
- suffixArray[--j] = p;
- if( --i < 0 ){
- break;
- }
- p = suffixArray[i];
- } while( (c1 = ARRAY_GET(buffer,p)) == c0);
- }while( i >= 0 );
- while ( j > 0) {
- suffixArray[--j] = 0;
- }
- induceSA(buffer,freqTable,suffixArrayHead,symbolCount,k);
- FREE_ARRAY(freqTable)
- FREE_ARRAY(suffixArrayHead)
- }
- return 0;
- }
- int main(int argc, char* argv[]){
- FILE* in=fopen(argv[1],"rb");
- uint8_t* buffer = (uint8_t*)malloc(sizeof(uint8_t)*BUFFER_SIZE);
- int symbolCount = fread(buffer,sizeof(uint8_t),BUFFER_SIZE,in);
- suffixArray = calloc(symbolCount,sizeof(int16_t));
- if( (buffer == NULL) || (suffixArray == NULL) )
- return NULL;
- isBWTused=true;
- Array bufferArray;
- ALLOCATE_ARRAY(bufferArray,symbolCount);
- for(register int16_t i=0; i<symbolCount; i++){
- bufferArray.arr[i]=buffer[i];
- }
- sais(bufferArray,0,symbolCount,MAX_SYMBOLS);
- puts("Printing suffix array and BWTn");
- int key = -99;
- for(register int i=0; i<symbolCount; i++){
- int index=suffixArray[i]-1;
- if(index<0){
- key=i;
- index=index+symbolCount;
- }
- printf("%4d %cn",suffixArray[i],buffer[index]);
- }
- printf("Key is %dn",key);
- fclose(in);
- }
- #ifndef BWT_H
- #define BWT_H
- #include<stdio.h>
- #include<stdlib.h>
- #include<stdbool.h>
- #include<inttypes.h>
- #define MAX_SYMBOLS 256
- #define BUFFER_SIZE 2048
- typedef struct {
- int16_t *index;
- int16_t *arr;
- }Array;
- #define ALLOCATE_ARRAY(A,size) A.arr = calloc(size,sizeof(int16_t));
- printf("ALLOCATE_ARRAY %s at %pn",#A,A.arr);
- A.index = calloc(1,sizeof(int16_t))
- #define FREE_ARRAY(A) if( A.arr ) { printf("FREE_ARRAY %s at %pn",#A,A.arr); free(A.arr); A.arr = NULL;}
- if( A.index ) { free(A.index); A.index = NULL;}
- #define ARRAY_INIT(A,ptr,i) A.arr = ptr;
- A.index = calloc(1,sizeof(int16_t));
- *(A.index) = i
- #define ARRAY_INIT_NEW(A,size,i,ptr) ALLOCATE_ARRAY(A,size);
- *(A.index) = i;
- for(uint16_t x=0; x<size; x++){
- A.arr[x] = ptr[x];
- }
- #define ARRAY_EQU(A,B) A.arr = B.arr;
- *(A.index) = *(B.index)
- #define ARRAY_GET(A,i) A.arr[*(A.index) + i]
- #define ARRAY_INDEX(A,i) *(A.index) + i
- #define ARRAY_SET(A,i,val) A.arr[*(A.index) + i] = val
- #define ARRAY_UPDATE(A,i,val) A.arr[*(A.index) + i] = A.arr[*(A.index) + i] + val
- bool isBWTused;
- int16_t *suffixArray;
- uint16_t totalUniqueSymbols;
- //pivan
- #endif
- ACCCGAGATCGGGTAGGTAGTCGTGCAACTTTCGGAGTGTGGTACACCACAGAGCGCCGGGGGCCGGTAAGCTCGGTATCGACCGTTGAGCTCAGGCCCGTCGCCAAAGAAGAATAAGCGGTAGGAGACCGCGTACCCCGAGCAATAGTTGAGCTTCTCGGAATTACGGATGAGGGAATGAGAATTTACCACGACATGAGGAGAGTACGATGCAACCTAAGGGGACAAGTCCCGGCTTGACCGGCTTTAAGGTTGGGCTCCGACCTGGTAAGACAGACGCCTCAATACTCGCCACTCTAACACATGCGCCGCTTAGGAAAGGTTAACTCACGTCCATAAATTGCGTTGACCTCCCTCGTCCCGGTTGGTGGTTTCGGATTTTCCGCCGTTTATCGACAAGTACAACCGGGTGGATGACTGGTTAACTCCACGTCAAGGCGGGATCTGGAATTCCTGGGTTAGTTGATACACTGTTCCACCCGAGGGGTAGATGTCCCATC
- import java.lang.String;
- public class sais {
- private static interface BaseArray {
- public int get(int i);
- public void set(int i, int val);
- public int update(int i, int val);
- }
- private static class ByteArray implements BaseArray {
- private byte[] m_A = null;
- private int m_pos = 0;
- ByteArray(byte[] A, int pos) { m_A = A; m_pos = pos; }
- public int get(int i) { return m_A[m_pos + i] & 0xff; }
- public void set(int i, int val) { m_A[m_pos + i] = (byte)(val & 0xff); }
- public int update(int i, int val) { return m_A[m_pos + i] += val & 0xff; }
- }
- private static class CharArray implements BaseArray {
- private char[] m_A = null;
- private int m_pos = 0;
- CharArray(char[] A, int pos) { m_A = A; m_pos = pos; }
- public int get(int i) { return m_A[m_pos + i] & 0xffff; }
- public void set(int i, int val) { m_A[m_pos + i] = (char)(val & 0xffff); }
- public int update(int i, int val) { return m_A[m_pos + i] += val & 0xffff; }
- }
- private static class ShortArray implements BaseArray {
- private short[] m_A = null;
- private int m_pos = 0;
- ShortArray(short[] A, int pos) { m_A = A; m_pos = pos; }
- public int get(int i) { return m_A[m_pos + i] & 0xffff; }
- public void set(int i, int val) { m_A[m_pos + i] = (short)(val & 0xffff); }
- public int update(int i, int val) { return m_A[m_pos + i] += val & 0xffff; }
- }
- private static class IntArray implements BaseArray {
- public int[] m_A = null;
- public int m_pos = 0;
- IntArray(int[] A, int pos) { m_A = A; m_pos = pos; }
- public int get(int i) { return m_A[m_pos + i]; }
- public void set(int i, int val) { m_A[m_pos + i] = val; }
- public int update(int i, int val) { return m_A[m_pos + i] += val; }
- }
- private static class StringArray implements BaseArray {
- private String m_A = null;
- private int m_pos = 0;
- StringArray(String A, int pos) { m_A = A; m_pos = pos; }
- public int get(int i) { return (int)(m_A.charAt(m_pos + i) & 0xffff); }
- public void set(int i, int val) { }
- public int update(int i, int val) { return 0; }
- }
- private static void printSA(String msg,int[] SA,int n){
- System.out.println(msg);
- for(int z=0;z<n; z++){
- System.out.printf("%4d -- %dn",z,SA[z]);
- }
- }
- /* find the start or end of each bucket */
- private static
- void
- getCounts(BaseArray T, BaseArray C, int n, int k) {
- System.out.printf("getCounts: n %d k %dn",n,k);
- int i;
- for(i = 0; i < k; ++i) { C.set(i, 0); }
- for(i = 0; i < n; ++i) { C.update(T.get(i), 1); }
- }
- private static
- void
- getBuckets(BaseArray C, BaseArray B, int k, boolean end) {
- int i, sum = 0;
- System.out.printf("getBuckets: k %d front %bn",k,!end);
- if(end != false) {
- for(i = 0; i < k; ++i) {
- sum += C.get(i); B.set(i, sum);
- } }
- else {
- for(i = 0; i < k; ++i) {
- sum += C.get(i); B.set(i, sum - C.get(i));
- } }
- }
- /* sort all type LMS suffixes */
- private static
- void
- LMSsort(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) {
- System.out.printf("lmsSort: n %d k %dn",n,k);
- int b, i, j;
- int c0, c1;
- /* compute SAl */
- if(C == B) { getCounts(T, C, n, k); }
- getBuckets(C, B, k, false); /* find starts of buckets */
- j = n - 1;
- b = B.get(c1 = T.get(j));
- --j;
- SA[b++] = (T.get(j) < c1) ? ~j : j;
- for(i = 0; i < n; ++i) {
- if(0 < (j = SA[i])) {
- if((c0 = T.get(j)) != c1) { B.set(c1, b); b = B.get(c1 = c0); }
- --j;
- SA[b++] = (T.get(j) < c1) ? ~j : j;
- SA[i] = 0;
- } else if(j < 0) {
- SA[i] = ~j;
- }
- }
- //marking
- /* compute SAs */
- if(C == B) { getCounts(T, C, n, k); }
- // printSA("lmsSort:head:before getCounts",((IntArray)C).m_A,k);
- getBuckets(C, B, k, true); /* find ends of buckets */
- // printSA("lmsSort:head:after getBuckets",((IntArray)B).m_A,k);
- // printSA("lmsSort:suffixArray",SA,n);
- //mark
- c0 = 0;
- for(i = n - 1, b = B.get(c1 = 0); 0 <= i; --i) {
- if(0 < (j = SA[i])) {
- c0 = T.get(j);
- // System.out.printf("lmsSort:s: j %d c0 %c c1 %c b %dn",j,c0,c1,b);
- if(c0 != c1) { B.set(c1, b); b = B.get(c1 = c0); }
- --j;
- SA[--b] = (T.get(j) > c1) ? ~(j + 1) : j;
- SA[i] = 0;
- }
- }
- // printSA("lmsSort:lastline:suffixArray",SA,n);
- }
- private static
- int
- LMSpostproc(BaseArray T, int[] SA, int n, int m) {
- int i, j, p, q, plen, qlen, name;
- int c0, c1;
- boolean diff;
- System.out.printf("lmsPost: n %d m %dn",n,m);
- // printSA("lmsPost:start",SA,n);
- /* compact all the sorted substrings into the first m items of SA
- 2*m must be not larger than n (proveable) */
- for(i = 0; (p = SA[i]) < 0; ++i) { SA[i] = ~p; }
- if(i < m) {
- for(j = i, ++i;; ++i) {
- if((p = SA[i]) < 0) {
- SA[j++] = ~p; SA[i] = 0;
- if(j == m) { break; }
- }
- }
- }
- /* store the length of all substrings */
- i = n - 1;
- j = n - 1;
- c0 = T.get(n - 1);
- do { c1 = c0;
- } while((0 <= --i) && ((c0 = T.get(i)) >= c1));
- for(; 0 <= i;) {
- do { c1 = c0;
- } while((0 <= --i) && ((c0 = T.get(i)) <= c1));
- if(0 <= i) {
- SA[m + ((i + 1) >> 1)] = j - i;
- j = i + 1;
- do { c1 = c0;
- } while((0 <= --i) && ((c0 = T.get(i)) >= c1));
- }
- }
- /* find the lexicographic names of all substrings */
- for(i = 0, name = 0, q = n, qlen = 0; i < m; ++i) {
- p = SA[i]; plen = SA[m + (p >> 1)]; diff = true;
- if((plen == qlen) && ((q + plen) < n)) {
- for(j = 0; (j < plen) && (T.get(p + j) == T.get(q + j)); ++j) { }
- if(j == plen) { diff = false; }
- }
- if(diff != false) { ++name; q = p; qlen = plen; }
- SA[m + (p >> 1)] = name;
- }
- //System.out.printf("lmsPost last line name = %dn",name);
- return name;
- }
- /* compute SA and BWT */
- private static
- void
- induceSA(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) {
- int b, i, j;
- int c0, c1;
- System.out.printf("induceSA: n %d k %dn",n,k);
- /* compute SAl */
- if(C == B) { getCounts(T, C, n, k); }
- getBuckets(C, B, k, false); /* find starts of buckets */
- j = n - 1;
- b = B.get(c1 = T.get(j));
- SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j;
- for(i = 0; i < n; ++i) {
- j = SA[i]; SA[i] = ~j;
- if(0 < j) {
- if((c0 = T.get(--j)) != c1) { B.set(c1, b); b = B.get(c1 = c0); }
- SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j;
- }
- }
- /* compute SAs */
- if(C == B) { getCounts(T, C, n, k); }
- getBuckets(C, B, k, true); /* find ends of buckets */
- for(i = n - 1, b = B.get(c1 = 0); 0 <= i; --i) {
- if(0 < (j = SA[i])) {
- if((c0 = T.get(--j)) != c1) { B.set(c1, b); b = B.get(c1 = c0); }
- SA[--b] = ((j == 0) || (T.get(j - 1) > c1)) ? ~j : j;
- } else {
- SA[i] = ~j;
- }
- }
- }
- private static
- int
- computeBWT(BaseArray T, int[] SA, BaseArray C, BaseArray B, int n, int k) {
- int b, i, j, pidx = -1;
- int c0, c1;
- /* compute SAl */
- if(C == B) { getCounts(T, C, n, k); }
- getBuckets(C, B, k, false); /* find starts of buckets */
- j = n - 1;
- b = B.get(c1 = T.get(j));
- SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j;
- for(i = 0; i < n; ++i) {
- if(0 < (j = SA[i])) {
- SA[i] = ~(c0 = T.get(--j));
- if(c0 != c1) { B.set(c1, b); b = B.get(c1 = c0); }
- SA[b++] = ((0 < j) && (T.get(j - 1) < c1)) ? ~j : j;
- } else if(j != 0) {
- SA[i] = ~j;
- }
- }
- /* compute SAs */
- if(C == B) { getCounts(T, C, n, k); }
- getBuckets(C, B, k, true); /* find ends of buckets */
- for(i = n - 1, b = B.get(c1 = 0); 0 <= i; --i) {
- if(0 < (j = SA[i])) {
- SA[i] = (c0 = T.get(--j));
- if(c0 != c1) { B.set(c1, b); b = B.get(c1 = c0); }
- SA[--b] = ((0 < j) && (T.get(j - 1) > c1)) ? ~((int)T.get(j - 1)) : j;
- } else if(j != 0) {
- SA[i] = ~j;
- } else {
- pidx = i;
- }
- }
- return pidx;
- }
- /* find the suffix array SA of T[0..n-1] in {0..k-1}^n
- use a working space (excluding T and SA) of at most 2n+O(1) for a constant alphabet
- args = new ByteArray(T, 0), SA, 0, n, 256, false
- */
- private static
- int
- SA_IS(BaseArray T, int[] SA, int fs, int n, int k, boolean isbwt) {
- BaseArray C, B, RA;
- int i, j, b, c, m, p, q, name, pidx = 0, newfs;
- int c0, c1;
- int flags = 0;
- System.out.printf("sais: fs %d k %d n %dn",fs,k,n);
- if(k <= 256) {
- C = new IntArray(new int[k], 0);
- System.out.println("ALLOCATE_ARRAY freqTable at "+C.hashCode());
- if(k <= fs) {
- B = new IntArray(SA, n + fs - k);
- System.out.println("ALLOCATE_ARRAY suffixArrayHead at "+B.hashCode());
- flags = 1; }
- else {
- B = new IntArray(new int[k], 0); flags = 3;
- System.out.println("ALLOCATE_ARRAY suffixArrayHead at "+B.hashCode());
- }
- } else if(k <= fs) {
- C = new IntArray(SA, n + fs - k);
- System.out.println("ALLOCATE_ARRAY freqTable at "+C.hashCode());
- if(k <= (fs - k)) { B = new IntArray(SA, n + fs - k * 2);
- System.out.println("ALLOCATE_ARRAY suffixArrayHead at "+B.hashCode());
- flags = 0; }
- else if(k <= 1024) { B = new IntArray(new int[k], 0);
- System.out.println("ALLOCATE_ARRAY suffixArrayHead at "+B.hashCode());
- flags = 2; }
- else { B = C; flags = 8; }
- } else {
- C = B = new IntArray(new int[k], 0);
- flags = 4 | 8;
- }
- /* stage 1: reduce the problem by at least 1/2
- sort all the LMS-substrings */
- getCounts(T, C, n, k); getBuckets(C, B, k, true); /* find ends of buckets */
- for(i = 0; i < n; ++i) { SA[i] = 0; }
- b = -1; i = n - 1; j = n; m = 0; c0 = T.get(n - 1);
- do { c1 = c0; } while( (0 <= --i) && ((c0 = T.get(i)) >= c1));
- for(; 0 <= i;) {
- do {
- c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) <= c1));
- if(0 <= i) {
- if(0 <= b) { SA[b] = j; }
- b = B.update(c1, -1);
- j = i; ++m;
- do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) >= c1));
- }
- }
- if(1 < m) {
- LMSsort(T, SA, C, B, n, k);
- name = LMSpostproc(T, SA, n, m);
- } else if(m == 1) {
- SA[b] = j + 1;
- name = 1;
- } else {
- name = 0;
- }
- // System.out.printf("sais:after m > 1: name %d m %d flags %dn",name,m,flags);
- /* stage 2: solve the reduced problem
- recurse if names are not yet unique */
- if(name < m) {
- if((flags & 4) != 0) {
- System.out.println("Freeing freqTable at "+C.hashCode()+"nFreeing suffixArrayHead at " +B.hashCode());
- C = null; B = null;
- }
- if((flags & 2) != 0) {
- System.out.println("Freeing suffixArrayHead at " +B.hashCode());
- B = null; }
- newfs = (n + fs) - (m * 2);
- if((flags & (1 | 4 | 8)) == 0) {
- if((k + name) <= newfs) { newfs -= k; }
- else { flags |= 8; }
- }
- //mark
- for(i = m + (n >> 1) - 1, j = m * 2 + newfs - 1; m <= i; --i) {
- if(SA[i] != 0) {
- // System.out.printf("sais:before sa j--: i %d j %dn",i,j);
- SA[j--] = SA[i] - 1;
- // System.out.printf("sais:after sa j--: i %d j %dn",i,j);
- }
- }
- RA = new IntArray(SA, m + newfs);
- System.out.println("ALLOCATE_ARRAY RA at "+RA.hashCode());
- // printSA("sais:before recurse:new buffer",((IntArray)RA).m_A,n);
- System.out.println("Printing new buffer "+((IntArray)RA).m_pos);
- SA_IS(RA, SA, newfs, m, name, false);
- System.out.println("Freeing t_sa at " +RA.hashCode());
- RA = null;
- //printSA("sais:after recurse:suffixArray",SA,n);
- i = n - 1; j = m * 2 - 1; c0 = T.get(n - 1);
- do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) >= c1));
- for(; 0 <= i;) {
- do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) <= c1));
- if(0 <= i) {
- SA[j--] = i + 1;
- do { c1 = c0; } while((0 <= --i) && ((c0 = T.get(i)) >= c1));
- }
- }
- for(i = 0; i < m; ++i) { SA[i] = SA[m + SA[i]]; }
- if((flags & 4) != 0) { C = B = new IntArray(new int[k], 0);
- System.out.println("ALLOCATE_ARRAY after sais() for freqTable at "+C.hashCode()+" for suffixArrayHead "+B.hashCode());
- }
- if((flags & 2) != 0) { B = new IntArray(new int[k], 0);
- System.out.println("ALLOCATE_ARRAY suffixArrayHead at "+B.hashCode());}
- }
- /* stage 3: induce the result for the original problem */
- if((flags & 8) != 0) { getCounts(T, C, n, k); }
- /* put all left-most S characters into their buckets */
- if(1 < m) {
- getBuckets(C, B, k, true); /* find ends of buckets */
- i = m - 1; j = n; p = SA[m - 1]; c1 = T.get(p);
- do {
- q = B.get(c0 = c1);
- while(q < j) { SA[--j] = 0; }
- do {
- SA[--j] = p;
- if(--i < 0) { break; }
- p = SA[i];
- } while((c1 = T.get(p)) == c0);
- } while(0 <= i);
- while(0 < j) { SA[--j] = 0; }
- }
- if(isbwt == false) { induceSA(T, SA, C, B, n, k); }
- else { pidx = computeBWT(T, SA, C, B, n, k); }
- System.out.println("Freeing freqTable at " +C.hashCode()+" suffixArrayHead at "+B.hashCode());
- C = null; B = null;
- return pidx;
- }
- /** Suffixsorting **/
- /* byte */
- public static
- int
- suffixsort(byte[] T, int[] SA, int n) {
- if((T == null) || (SA == null) || (T.length < n) || (SA.length < n)) { return -1; }
- if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; }
- return SA_IS(new ByteArray(T, 0), SA, 0, n, 256, false);
- }
- /* char */
- public static
- int
- suffixsort(char[] T, int[] SA, int n) {
- if((T == null) || (SA == null) || (T.length < n) || (SA.length < n)) { return -1; }
- if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; }
- return SA_IS(new CharArray(T, 0), SA, 0, n, 65536, false);
- }
- /* short */
- public static
- int
- suffixsort(short[] T, int[] SA, int n, int k) {
- if((T == null) || (SA == null) ||
- (T.length < n) || (SA.length < n) ||
- (k <= 0) || (65536 < k)) { return -1; }
- if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; }
- return SA_IS(new ShortArray(T, 0), SA, 0, n, k, false);
- }
- /* int */
- public static
- int
- suffixsort(int[] T, int[] SA, int n, int k) {
- if((T == null) || (SA == null) ||
- (T.length < n) || (SA.length < n) ||
- (k <= 0)) { return -1; }
- if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; }
- return SA_IS(new IntArray(T, 0), SA, 0, n, k, false);
- }
- /* String */
- public static
- int
- suffixsort(String T, int[] SA, int n) {
- if((T == null) || (SA == null) ||
- (T.length() < n) || (SA.length < n)) { return -1; }
- if(n <= 1) { if(n == 1) { SA[0] = 0; } return 0; }
- return SA_IS(new StringArray(T, 0), SA, 0, n, 65536, false);
- }
- /** Burrows-Wheeler Transform **/
- /* byte */
- public static
- int
- bwtransform(byte[] T, byte[] U, int[] A, int n) {
- int i, pidx;
- if((T == null) || (U == null) || (A == null) ||
- (T.length < n) || (U.length < n) || (A.length < n)) { return -1; }
- if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
- pidx = SA_IS(new ByteArray(T, 0), A, 0, n, 256, true);
- U[0] = T[n - 1];
- for(i = 0; i < pidx; ++i) { U[i + 1] = (byte)(A[i] & 0xff); }
- for(i += 1; i < n; ++i) { U[i] = (byte)(A[i] & 0xff); }
- return pidx + 1;
- }
- /* char */
- public static
- int
- bwtransform(char[] T, char[] U, int[] A, int n) {
- int i, pidx;
- if((T == null) || (U == null) || (A == null) ||
- (T.length < n) || (U.length < n) || (A.length < n)) { return -1; }
- if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
- pidx = SA_IS(new CharArray(T, 0), A, 0, n, 65536, true);
- U[0] = T[n - 1];
- for(i = 0; i < pidx; ++i) { U[i + 1] = (char)(A[i] & 0xffff); }
- for(i += 1; i < n; ++i) { U[i] = (char)(A[i] & 0xffff); }
- return pidx + 1;
- }
- /* short */
- public static
- int
- bwtransform(short[] T, short[] U, int[] A, int n, int k) {
- int i, pidx;
- if((T == null) || (U == null) || (A == null) ||
- (T.length < n) || (U.length < n) || (A.length < n) ||
- (k <= 0) || (65536 < k)) { return -1; }
- if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
- pidx = SA_IS(new ShortArray(T, 0), A, 0, n, k, true);
- U[0] = T[n - 1];
- for(i = 0; i < pidx; ++i) { U[i + 1] = (short)(A[i] & 0xffff); }
- for(i += 1; i < n; ++i) { U[i] = (short)(A[i] & 0xffff); }
- return pidx + 1;
- }
- /* int */
- public static
- int
- bwtransform(int[] T, int[] U, int[] A, int n, int k) {
- int i, pidx;
- if((T == null) || (U == null) || (A == null) ||
- (T.length < n) || (U.length < n) || (A.length < n) ||
- (k <= 0)) { return -1; }
- if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
- pidx = SA_IS(new IntArray(T, 0), A, 0, n, k, true);
- U[0] = T[n - 1];
- for(i = 0; i < pidx; ++i) { U[i + 1] = A[i]; }
- for(i += 1; i < n; ++i) { U[i] = A[i]; }
- return pidx + 1;
- }
- }
- [i@localhost lab]$ ./a.out input
- ALLOCATE_ARRAY bufferArray at 0x8f2e50
- sais: fs 0 k 256 n 500
- ALLOCATE_ARRAY freqTable at 0x8f3670
- ALLOCATE_ARRAY suffixArrayHead at 0x8f38a0
- getCounts: n 500 k 256 buffer at 0x8f2e50 freqTable 0x8f3670
- getBuckets: k 256 isFront 0 freqTable at 0x8f3670 suffixArrayHead 0x8f38a0
- lmsSort: n 500 k 256
- getBuckets: k 256 isFront 1 freqTable at 0x8f3670 suffixArrayHead 0x8f38a0
- getBuckets: k 256 isFront 0 freqTable at 0x8f3670 suffixArrayHead 0x8f38a0
- lmsPost: n 500 m 138
- FREE_ARRAY suffixArrayHead at 0x8f38a0
- ALLOCATE_ARRAY t_sa at 0x8f3ad0
- Printing new buffer curr = 362
- sais: fs 224 k 95 n 138
- ALLOCATE_ARRAY freqTable at 0x8f38a0
- ALLOCATE_ARRAY suffixArrayHead at 0x8f3990
- getCounts: n 138 k 95 buffer at 0x8f3ad0 freqTable 0x8f38a0
- getBuckets: k 95 isFront 0 freqTable at 0x8f38a0 suffixArrayHead 0x8f3990
- lmsSort: n 138 k 95
- getBuckets: k 95 isFront 1 freqTable at 0x8f38a0 suffixArrayHead 0x8f3990
- getBuckets: k 95 isFront 0 freqTable at 0x8f38a0 suffixArrayHead 0x8f3990
- lmsPost: n 138 m 50
- getBuckets: k 95 isFront 0 freqTable at 0x8f38a0 suffixArrayHead 0x8f3990
- induceSA: n 138 k 95
- getBuckets: k 95 isFront 1 freqTable at 0x8f38a0 suffixArrayHead 0x8f3990
- getBuckets: k 95 isFront 0 freqTable at 0x8f38a0 suffixArrayHead 0x8f3990
- FREE_ARRAY freqTable at 0x8f38a0
- FREE_ARRAY suffixArrayHead at 0x8f3990
- FREE_ARRAY t_sa at 0x8f3ad0
- ALLOCATE_ARRAY suffixArrayHead at 0x8f3ad0
- getBuckets: k 256 isFront 0 freqTable at 0x8f3670 suffixArrayHead 0x8f3ad0
- induceSA: n 500 k 256
- getBuckets: k 256 isFront 1 freqTable at 0x8f3670 suffixArrayHead 0x8f3ad0
- getBuckets: k 256 isFront 0 freqTable at 0x8f3670 suffixArrayHead 0x8f3ad0
- FREE_ARRAY freqTable at 0x8f3670
- FREE_ARRAY suffixArrayHead at 0x8f3ad0
- Printing suffix array and BWT
- 0 C
- 0 C
- 0 C
- 0 C
- 0 C
- 0 C
- 0 C
- 0 C
- 0 C
- 0 C
- 0 C
- 0 C
- 0 C
- 429 C
- 0 C
- 0 C
- .
- .
- 0 C
- 0 C
- 0 C
- 0 C
- Key is 499
- *** Error in `./a.out': double free or corruption (!prev): 0x00000000008f1a50 ***
- ======= Backtrace: =========
- /lib64/libc.so.6(+0x7925b)[0x7f6cd9ee325b]
- /lib64/libc.so.6(+0x828ea)[0x7f6cd9eec8ea]
- /lib64/libc.so.6(cfree+0x4c)[0x7f6cd9ef031c]
- /lib64/libc.so.6(_IO_setb+0x4b)[0x7f6cd9ee79ab]
- /lib64/libc.so.6(_IO_file_close_it+0xae)[0x7f6cd9ee5a6e]
- /lib64/libc.so.6(fclose+0x1bf)[0x7f6cd9ed877f]
- ./a.out[0x402848]
- /lib64/libc.so.6(__libc_start_main+0xf1)[0x7f6cd9e8a401]
- ./a.out[0x40062a]
- ======= Memory map: ========
- 00400000-00403000 r-xp 00000000 08:08 419785 a.out
- 00602000-00603000 r--p 00002000 08:08 419785 a.out
- 00603000-00604000 rw-p 00003000 08:08 419785 a.out
- 008f1000-00912000 rw-p 00000000 00:00 0 [heap]
- 7f6cd4000000-7f6cd4021000 rw-p 00000000 00:00 0
- 7f6cd4021000-7f6cd8000000 ---p 00000000 00:00 0
- 7f6cd9c53000-7f6cd9c69000 r-xp 00000000 08:08 680007 /usr/lib64/libgcc_s-6.4.1-20170727.so.1
- 7f6cd9c69000-7f6cd9e68000 ---p 00016000 08:08 680007 /usr/lib64/libgcc_s-6.4.1-20170727.so.1
- 7f6cd9e68000-7f6cd9e69000 r--p 00015000 08:08 680007 /usr/lib64/libgcc_s-6.4.1-20170727.so.1
- 7f6cd9e69000-7f6cd9e6a000 rw-p 00016000 08:08 680007 /usr/lib64/libgcc_s-6.4.1-20170727.so.1
- 7f6cd9e6a000-7f6cda027000 r-xp 00000000 08:08 666052 /usr/lib64/libc-2.24.so
- 7f6cda027000-7f6cda226000 ---p 001bd000 08:08 666052 /usr/lib64/libc-2.24.so
- 7f6cda226000-7f6cda22a000 r--p 001bc000 08:08 666052 /usr/lib64/libc-2.24.so
- 7f6cda22a000-7f6cda22c000 rw-p 001c0000 08:08 666052 /usr/lib64/libc-2.24.so
- 7f6cda22c000-7f6cda230000 rw-p 00000000 00:00 0
- 7f6cda230000-7f6cda255000 r-xp 00000000 08:08 665877 /usr/lib64/ld-2.24.so
- 7f6cda43a000-7f6cda43c000 rw-p 00000000 00:00 0
- 7f6cda452000-7f6cda455000 rw-p 00000000 00:00 0
- 7f6cda455000-7f6cda456000 r--p 00025000 08:08 665877 /usr/lib64/ld-2.24.so
- 7f6cda456000-7f6cda457000 rw-p 00026000 08:08 665877 /usr/lib64/ld-2.24.so
- 7f6cda457000-7f6cda458000 rw-p 00000000 00:00 0
- 7ffe93214000-7ffe93235000 rw-p 00000000 00:00 0 [stack]
- 7ffe932f1000-7ffe932f3000 r--p 00000000 00:00 0 [vvar]
- 7ffe932f3000-7ffe932f5000 r-xp 00000000 00:00 0 [vdso]
- ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
- Aborted (core dumped)
- input: 500 bytes ... sais: fs 0 k 256 n 500
- ALLOCATE_ARRAY freqTable at 1550089733
- ALLOCATE_ARRAY suffixArrayHead at 865113938
- getCounts: n 500 k 256
- getBuckets: k 256 front false
- lmsSort: n 500 k 256
- getBuckets: k 256 front true
- getBuckets: k 256 front false
- lmsPost: n 500 m 138
- Freeing suffixArrayHead at 865113938
- ALLOCATE_ARRAY RA at 1442407170
- Printing new buffer 362
- sais: fs 224 k 95 n 138
- ALLOCATE_ARRAY freqTable at 1028566121
- ALLOCATE_ARRAY suffixArrayHead at 1118140819
- getCounts: n 138 k 95
- getBuckets: k 95 front false
- lmsSort: n 138 k 95
- getBuckets: k 95 front true
- getBuckets: k 95 front false
- lmsPost: n 138 m 50
- getBuckets: k 95 front false
- induceSA: n 138 k 95
- getBuckets: k 95 front true
- getBuckets: k 95 front false
- Freeing freqTable at 1028566121 suffixArrayHead at 1118140819
- Freeing t_sa at 1442407170
- ALLOCATE_ARRAY suffixArrayHead at 1975012498
- getBuckets: k 256 front false
- induceSA: n 500 k 256
- getBuckets: k 256 front true
- getBuckets: k 256 front false
- Freeing freqTable at 1550089733 suffixArrayHead at 1975012498
- 0.045 sec
- Printing T and SA
- 105 C
- 317 G
- 337 T
- 298 T
- 403 C
- 213 C
- 324 T
- 423 T
- 26 C
- 106 A
- 109 G
- .
- .
- 29 C
- 371 G
- 378 A
- Key is 41
- sufcheck: Done.
Add Comment
Please, Sign In to add comment