Advertisement
Guest User

Untitled

a guest
Feb 20th, 2017
300
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.50 KB | None | 0 0
  1. #include <algorithm>
  2. #include <iostream>
  3. #include <stdint.h>
  4. #include <unistd.h>
  5. #include <cstdlib>
  6. #include <cstring>
  7. #include <fstream>
  8. #include "xsa.cpp"
  9. #include "t1ha.c"
  10. #include <vector>
  11. #include <zlib.h>
  12. #include <omp.h>
  13. using namespace std;
  14.  
  15. class iscap{
  16. private:
  17. uint8_t code[256];
  18. vector<bool> bloom;
  19. vector<uint64_t> seeds;
  20. inline void insert_bloom(const char *S);
  21. inline bool test_bloom(const char *S);
  22. public:
  23. uint64_t kmer, prime, hashn;
  24. bool fasta(const char *F);
  25. inline bool is_on_target(char *S, size_t L);
  26. void capture(vector<string> &F);
  27. void document(void);
  28. };
  29.  
  30. void iscap::insert_bloom(const char *S){
  31. for(uint64_t i=0; i<hashn; i++) bloom[t1ha(S, kmer,seeds[i])%prime]=true;
  32. }
  33.  
  34. bool iscap::test_bloom(const char *S){
  35. for(uint64_t i=0; i<hashn; i++) if(!bloom[t1ha(S, kmer,seeds[i])%prime]) return false;
  36. return true;
  37. }
  38.  
  39. bool iscap::fasta(const char *F){
  40. seeds.resize(hashn); XSA rng; rng.set(time(NULL));
  41. for(size_t i=0; i<hashn; i++) seeds[i]=rng.get();
  42. for(;prime>1;prime--){
  43. uint64_t n=sqrt(prime)+1, ok=1;
  44. for(uint64_t i=2; i<n; i++) if(prime%i==0){ ok=0; break; }
  45. if(ok) break;
  46. }
  47. cerr<<"kmer\t"<<kmer<<'\n';
  48. cerr<<"#hash\t"<<hashn<<'\n';
  49. cerr<<"prime\t"<<prime<<'\n';
  50. bloom.resize(prime);
  51. for(size_t i=0; i<256; i++) code[i]=4;
  52. code['A']=code['a']=0; code['C']=code['c']=1; code['G']=code['g']=2; code['T']=code['t']=3;
  53. ifstream fi(F);
  54. if(!fi) return false;
  55. string n, s;
  56. for(fi>>n>>s; !fi.eof(); fi>>n>>s){
  57. if(s.size()<kmer) continue;
  58. for(size_t i=0; i<s.size(); i++) s[i]="ACGTN"[code[(uint8_t)s[i]]];
  59. for(size_t i=0; i<=s.size()-kmer; i++) insert_bloom(s.c_str()+i);
  60. for(size_t i=0; i<s.size(); i++){
  61. char c=s[i]; unsigned start, startl, startr;
  62. if(i<=kmer/2) start=0; else if(i-kmer/2+kmer>s.size()) start=s.size()-kmer; else start=i-kmer/2;
  63. if(i<=kmer-1) startl=0; else if(i-(kmer-1)+kmer>s.size()) startl=s.size()-kmer; else startl=i-(kmer-1);
  64. if(i<=0) startr=0; else if(i+kmer>s.size()) startr=s.size()-kmer; else startr=i;
  65. if(c!='A'){ s[i]='A'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
  66. if(c!='C'){ s[i]='C'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
  67. if(c!='G'){ s[i]='G'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
  68. if(c!='T'){ s[i]='T'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
  69. s[i]=c;
  70. }
  71. reverse(s.begin(), s.end());
  72. for(size_t i=0; i<s.size(); i++) s[i]="TGCAN"[code[(uint8_t)s[i]]];
  73. for(size_t i=0; i<=s.size()-kmer; i++) insert_bloom(s.c_str()+i);
  74. for(size_t i=0; i<s.size(); i++){
  75. char c=s[i]; unsigned start, startl, startr;
  76. if(i<=kmer/2) start=0; else if(i-kmer/2+kmer>s.size()) start=s.size()-kmer; else start=i-kmer/2;
  77. if(i<=kmer-1) startl=0; else if(i-(kmer-1)+kmer>s.size()) startl=s.size()-kmer; else startl=i-(kmer-1);
  78. if(i<=0) startr=0; else if(i+kmer>s.size()) startr=s.size()-kmer; else startr=i;
  79. if(c!='A'){ s[i]='A'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
  80. if(c!='C'){ s[i]='C'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
  81. if(c!='G'){ s[i]='G'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
  82. if(c!='T'){ s[i]='T'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
  83. s[i]=c;
  84. }
  85. }
  86. fi.close();
  87. return true;
  88. }
  89.  
  90. bool iscap::is_on_target(char *S, size_t L){
  91. if(L<kmer) return false;
  92. for(size_t i=0; i<=L-kmer; i++) if(test_bloom(S+i)) return true;
  93. return false;
  94. }
  95.  
  96. void iscap::capture(vector<string> &F){
  97. vector<omp_lock_t> readlock(F.size());
  98. omp_lock_t writelock; omp_init_lock(&writelock);
  99. vector<gzFile> in(F.size());
  100. for(size_t i=0; i<F.size(); i++){
  101. omp_init_lock(&readlock[i]);
  102. in[i]=gzopen(F[i].c_str(), "rt");
  103. if(in[i]==Z_NULL) return;
  104. gzbuffer(in[i], 16*1024*1024);
  105. }
  106. size_t thread=omp_get_num_procs()*2, total=0, ontar=0; omp_set_num_threads(thread);
  107. for(size_t offset=0; offset<F.size(); offset+=thread){
  108. #pragma omp parallel for
  109. for(size_t t=0; t<thread; t++){
  110. char nam[1024], seq[4096], add[1024], qua[4096];
  111. size_t n=min(F.size()-offset, (size_t)thread);
  112. size_t tid=offset+omp_get_thread_num()%n;
  113. for(;;){
  114. omp_set_lock(&readlock[tid]);
  115. if(gzeof(in[tid])){ omp_unset_lock(&readlock[tid]); break; }
  116. if(gzgets(in[tid], nam, 1024)==NULL){ omp_unset_lock(&readlock[tid]); break; }
  117. if(gzgets(in[tid], seq, 4096)==NULL){ omp_unset_lock(&readlock[tid]); break; }
  118. if(gzgets(in[tid], add, 1024)==NULL){ omp_unset_lock(&readlock[tid]); break; }
  119. if(gzgets(in[tid], qua, 4096)==NULL){ omp_unset_lock(&readlock[tid]); break; }
  120. if(*nam!='@'||*add!='+'){ cerr<<"bad fastq\n"; omp_unset_lock(&readlock[tid]); break; }
  121. omp_unset_lock(&readlock[tid]);
  122. #pragma omp atomic
  123. total++;
  124. if(is_on_target(seq, strlen(seq)-1)){
  125. #pragma omp atomic
  126. ontar++;
  127. omp_set_lock(&writelock);
  128. printf("%s%s%s%s", nam, seq, add, qua);
  129. omp_unset_lock(&writelock);
  130. }
  131. }
  132. }
  133. }
  134. omp_destroy_lock(&writelock);
  135. for(size_t i=0; i<F.size(); i++){ omp_destroy_lock(&readlock[i]); gzclose(in[i]); }
  136. cerr<<ontar<<'/'<<total<<'\t'<<100.0*ontar/total<<"%\n";
  137. }
  138.  
  139. void iscap::document(void){
  140. cerr<<"Usage: iscap [options] target.fasta in1.fq.gz [in2.fq.gz ...]\n";
  141. cerr<<"\t-k: kmer size default=24\n";
  142. cerr<<"\t-h: number of hashes default=4\n";
  143. cerr<<"\t-m: memory cost(MB) default=3000\n";
  144. exit(0);
  145. }
  146.  
  147. int main(int ac, char **av){
  148. size_t t0=time(NULL);
  149. cerr<<"***********************************\n";
  150. cerr<<"* iscap *\n";
  151. cerr<<"* author: Yi Wang *\n";
  152. cerr<<"* email: godspeed_china@yeah.net *\n";
  153. cerr<<"* date: 17/Nov/2016 *\n";
  154. cerr<<"***********************************\n";
  155. iscap xwb; xwb.kmer=24; xwb.hashn=4; xwb.prime=3000ULL*1024ULL*1024ULL*8ULL;
  156. int opt;
  157. while((opt=getopt(ac, av, "k:h:m:"))>=0){
  158. switch(opt){
  159. case 'k': xwb.kmer=atoi(optarg); break;
  160. case 'h': xwb.hashn=atoi(optarg); break;
  161. case 'm': xwb.prime=atoi(optarg)*1024ULL*1024ULL*8ULL; break;
  162. default: xwb.document();
  163. }
  164. }
  165. if(ac<optind+2) xwb.document();
  166. if(!xwb.fasta(av[optind])){ cerr<<"fail to open "<<av[1]<<'\n'; return 0; }
  167. setvbuf(stdout, NULL, _IOFBF, 16*1024*1024);
  168. vector<string> file; for(int i=optind+1; i<ac; i++) file.push_back(av[i]);
  169. xwb.capture(file);
  170. cerr<<"time\t"<<time(NULL)-t0<<" s\n";
  171. return 0;
  172. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement