Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <algorithm>
- #include <iostream>
- #include <stdint.h>
- #include <unistd.h>
- #include <cstdlib>
- #include <cstring>
- #include <fstream>
- #include "xsa.cpp"
- #include "t1ha.c"
- #include <vector>
- #include <zlib.h>
- #include <omp.h>
- using namespace std;
- class iscap{
- private:
- uint8_t code[256];
- vector<bool> bloom;
- vector<uint64_t> seeds;
- inline void insert_bloom(const char *S);
- inline bool test_bloom(const char *S);
- public:
- uint64_t kmer, prime, hashn;
- bool fasta(const char *F);
- inline bool is_on_target(char *S, size_t L);
- void capture(vector<string> &F);
- void document(void);
- };
- void iscap::insert_bloom(const char *S){
- for(uint64_t i=0; i<hashn; i++) bloom[t1ha(S, kmer,seeds[i])%prime]=true;
- }
- bool iscap::test_bloom(const char *S){
- for(uint64_t i=0; i<hashn; i++) if(!bloom[t1ha(S, kmer,seeds[i])%prime]) return false;
- return true;
- }
- bool iscap::fasta(const char *F){
- seeds.resize(hashn); XSA rng; rng.set(time(NULL));
- for(size_t i=0; i<hashn; i++) seeds[i]=rng.get();
- for(;prime>1;prime--){
- uint64_t n=sqrt(prime)+1, ok=1;
- for(uint64_t i=2; i<n; i++) if(prime%i==0){ ok=0; break; }
- if(ok) break;
- }
- cerr<<"kmer\t"<<kmer<<'\n';
- cerr<<"#hash\t"<<hashn<<'\n';
- cerr<<"prime\t"<<prime<<'\n';
- bloom.resize(prime);
- for(size_t i=0; i<256; i++) code[i]=4;
- code['A']=code['a']=0; code['C']=code['c']=1; code['G']=code['g']=2; code['T']=code['t']=3;
- ifstream fi(F);
- if(!fi) return false;
- string n, s;
- for(fi>>n>>s; !fi.eof(); fi>>n>>s){
- if(s.size()<kmer) continue;
- for(size_t i=0; i<s.size(); i++) s[i]="ACGTN"[code[(uint8_t)s[i]]];
- for(size_t i=0; i<=s.size()-kmer; i++) insert_bloom(s.c_str()+i);
- for(size_t i=0; i<s.size(); i++){
- char c=s[i]; unsigned start, startl, startr;
- if(i<=kmer/2) start=0; else if(i-kmer/2+kmer>s.size()) start=s.size()-kmer; else start=i-kmer/2;
- if(i<=kmer-1) startl=0; else if(i-(kmer-1)+kmer>s.size()) startl=s.size()-kmer; else startl=i-(kmer-1);
- if(i<=0) startr=0; else if(i+kmer>s.size()) startr=s.size()-kmer; else startr=i;
- if(c!='A'){ s[i]='A'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
- if(c!='C'){ s[i]='C'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
- if(c!='G'){ s[i]='G'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
- if(c!='T'){ s[i]='T'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
- s[i]=c;
- }
- reverse(s.begin(), s.end());
- for(size_t i=0; i<s.size(); i++) s[i]="TGCAN"[code[(uint8_t)s[i]]];
- for(size_t i=0; i<=s.size()-kmer; i++) insert_bloom(s.c_str()+i);
- for(size_t i=0; i<s.size(); i++){
- char c=s[i]; unsigned start, startl, startr;
- if(i<=kmer/2) start=0; else if(i-kmer/2+kmer>s.size()) start=s.size()-kmer; else start=i-kmer/2;
- if(i<=kmer-1) startl=0; else if(i-(kmer-1)+kmer>s.size()) startl=s.size()-kmer; else startl=i-(kmer-1);
- if(i<=0) startr=0; else if(i+kmer>s.size()) startr=s.size()-kmer; else startr=i;
- if(c!='A'){ s[i]='A'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
- if(c!='C'){ s[i]='C'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
- if(c!='G'){ s[i]='G'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
- if(c!='T'){ s[i]='T'; insert_bloom(s.c_str()+start); insert_bloom(s.c_str()+startl); insert_bloom(s.c_str()+startr); }
- s[i]=c;
- }
- }
- fi.close();
- return true;
- }
- bool iscap::is_on_target(char *S, size_t L){
- if(L<kmer) return false;
- for(size_t i=0; i<=L-kmer; i++) if(test_bloom(S+i)) return true;
- return false;
- }
- void iscap::capture(vector<string> &F){
- vector<omp_lock_t> readlock(F.size());
- omp_lock_t writelock; omp_init_lock(&writelock);
- vector<gzFile> in(F.size());
- for(size_t i=0; i<F.size(); i++){
- omp_init_lock(&readlock[i]);
- in[i]=gzopen(F[i].c_str(), "rt");
- if(in[i]==Z_NULL) return;
- gzbuffer(in[i], 16*1024*1024);
- }
- size_t thread=omp_get_num_procs()*2, total=0, ontar=0; omp_set_num_threads(thread);
- for(size_t offset=0; offset<F.size(); offset+=thread){
- #pragma omp parallel for
- for(size_t t=0; t<thread; t++){
- char nam[1024], seq[4096], add[1024], qua[4096];
- size_t n=min(F.size()-offset, (size_t)thread);
- size_t tid=offset+omp_get_thread_num()%n;
- for(;;){
- omp_set_lock(&readlock[tid]);
- if(gzeof(in[tid])){ omp_unset_lock(&readlock[tid]); break; }
- if(gzgets(in[tid], nam, 1024)==NULL){ omp_unset_lock(&readlock[tid]); break; }
- if(gzgets(in[tid], seq, 4096)==NULL){ omp_unset_lock(&readlock[tid]); break; }
- if(gzgets(in[tid], add, 1024)==NULL){ omp_unset_lock(&readlock[tid]); break; }
- if(gzgets(in[tid], qua, 4096)==NULL){ omp_unset_lock(&readlock[tid]); break; }
- if(*nam!='@'||*add!='+'){ cerr<<"bad fastq\n"; omp_unset_lock(&readlock[tid]); break; }
- omp_unset_lock(&readlock[tid]);
- #pragma omp atomic
- total++;
- if(is_on_target(seq, strlen(seq)-1)){
- #pragma omp atomic
- ontar++;
- omp_set_lock(&writelock);
- printf("%s%s%s%s", nam, seq, add, qua);
- omp_unset_lock(&writelock);
- }
- }
- }
- }
- omp_destroy_lock(&writelock);
- for(size_t i=0; i<F.size(); i++){ omp_destroy_lock(&readlock[i]); gzclose(in[i]); }
- cerr<<ontar<<'/'<<total<<'\t'<<100.0*ontar/total<<"%\n";
- }
- void iscap::document(void){
- cerr<<"Usage: iscap [options] target.fasta in1.fq.gz [in2.fq.gz ...]\n";
- cerr<<"\t-k: kmer size default=24\n";
- cerr<<"\t-h: number of hashes default=4\n";
- cerr<<"\t-m: memory cost(MB) default=3000\n";
- exit(0);
- }
- int main(int ac, char **av){
- size_t t0=time(NULL);
- cerr<<"***********************************\n";
- cerr<<"* iscap *\n";
- cerr<<"* author: Yi Wang *\n";
- cerr<<"* email: godspeed_china@yeah.net *\n";
- cerr<<"* date: 17/Nov/2016 *\n";
- cerr<<"***********************************\n";
- iscap xwb; xwb.kmer=24; xwb.hashn=4; xwb.prime=3000ULL*1024ULL*1024ULL*8ULL;
- int opt;
- while((opt=getopt(ac, av, "k:h:m:"))>=0){
- switch(opt){
- case 'k': xwb.kmer=atoi(optarg); break;
- case 'h': xwb.hashn=atoi(optarg); break;
- case 'm': xwb.prime=atoi(optarg)*1024ULL*1024ULL*8ULL; break;
- default: xwb.document();
- }
- }
- if(ac<optind+2) xwb.document();
- if(!xwb.fasta(av[optind])){ cerr<<"fail to open "<<av[1]<<'\n'; return 0; }
- setvbuf(stdout, NULL, _IOFBF, 16*1024*1024);
- vector<string> file; for(int i=optind+1; i<ac; i++) file.push_back(av[i]);
- xwb.capture(file);
- cerr<<"time\t"<<time(NULL)-t0<<" s\n";
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement