- /**
- * Author:
- * Pierre Lindenbaum PhD
- * Contact:
- * plindenbaum@yahoo.fr
- * WWW:
- * http://plindenbaum.blogspot.com
- * Reference:
- *
- * Motivation:
- * Mapping a large fasta file in memory using mmap
- * Usage:
- * g++ -Wall testmmap.cpp -lz
- * ./a.out -g /path/to/genome/indexed/with/samtools-faidx/hg.fasta file.fastq
- */
- #include <cstdio>
- #include <cstdlib>
- #include <cstring>
- #include <climits>
- #include <stdint.h>
- #include <cassert>
- #include <fstream>
- #include <iostream>
- #include <algorithm>
- #include <map>
- #include <cerrno>
- #include <sys/types.h>
- #include <sys/stat.h>
- #include <fcntl.h>
- #include <sys/mman.h>
- #include <zlib.h>
- using namespace std;
- /* it is the structure created by samtools faidx */
- typedef struct faidx1_t
- {
- int32_t line_len, line_blen;
- int64_t len;
- uint64_t offset;
- }faidx1_t,*FaidxPtr;
- /** returns the anti parallele base */
- static const char antiparralele(char c)
- {
- switch(c)
- {
- case 'A': return 'T';
- case 'T': return 'A';
- case 'G': return 'C';
- case 'C': return 'G';
- default:return 'N';
- }
- }
- /**
- * wrapper for a mmap, a fileid and some faidx indexes
- */
- class IndexedGenome
- {
- private:
- /* maps a chromosome to the samtools faidx index */
- map<string,faidx1_t> name2index;
- /* used to get the size of the file */
- struct stat buf;
- /* genome fasta file file descriptor */
- int fd;
- /* the mmap (memory mapped) pointer */
- char *mapptr;
- /** reads an fill a string */
- bool readline(gzFile in,string& line)
- {
- if(gzeof(in)) return false;
- line.clear();
- int c=-1;
- while((c=gzgetc(in))!=EOF && c!='\n') line+=(char)c;
- return true;
- }
- public:
- /** constructor
- * @param fasta: the path to the genomic fasta file indexed with samtools faidx
- */
- IndexedGenome(const char* fasta):fd(-1),mapptr(NULL)
- {
- string faidx(fasta);
- string line;
- faidx.append(".fai");
- /* open *.fai file */
- ifstream in(faidx.c_str(),ios::in);
- if(!in.is_open())
- {
- cerr << "cannot open " << faidx << endl;
- exit(EXIT_FAILURE);
- }
- /* read indexes in fai file */
- while(getline(in,line,'\n'))
- {
- if(line.empty()) continue;
- const char* p=line.c_str();
- char* tab=(char*)strchr(p,'\t');
- if(tab==NULL) continue;
- string chrom(p,tab-p);
- ++tab;
- faidx1_t index;
- if(sscanf(tab,"%ld\t%ld\t%d\t%d",
- &index.len, &index.offset, &index.line_blen,&index.line_len
- )!=4)
- {
- cerr << "Cannot read index in "<< line << endl;
- exit(EXIT_FAILURE);
- }
- /* insert in the map(chrom,faidx) */
- name2index.insert(make_pair(chrom,index));
- }
- /* close index file */
- in.close();
- /* get the whole size of the fasta file */
- if(stat(fasta, &buf)!=0)
- {
- perror("Cannot stat");
- exit(EXIT_FAILURE);
- }
- /* open the fasta file */
- fd = open(fasta, O_RDONLY);
- if (fd == -1)
- {
- perror("Error opening file for reading");
- exit(EXIT_FAILURE);
- }
- /* open a memory mapped file associated to this fasta file descriptor */
- mapptr = (char*)mmap(0, buf.st_size, PROT_READ, MAP_SHARED, fd, 0);
- if (mapptr == MAP_FAILED)
- {
- close(fd);
- perror("Error mmapping the file");
- exit(EXIT_FAILURE);
- }
- }
- /* destructor */
- ~IndexedGenome()
- {
- /* close memory mapped map */
- if(mapptr!=NULL && munmap(mapptr,buf.st_size) == -1)
- {
- perror("Error un-mmapping the file");
- }
- /* dispose fasta file descriptor */
- if(fd!=-1) close(fd);
- }
- /* return the base at position 'index' for the chromosome indexed by faidx */
- char at(const FaidxPtr faidx,int64_t index)
- {
- long pos= faidx->offset +
- index / faidx->line_blen * faidx->line_len +
- index % faidx->line_blen
- ;
- /* here is the magic: no need to fseek/fread/ftell the file */
- return toupper(mapptr[pos]);
- }
- private:
- /* search a DNA string in the genome
- * simple Boyer-Moore-Horspool_algorithm hacked from
- * http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
- */
- void boyerMoore(const string& search)
- {
- size_t bad_char_skip[UCHAR_MAX + 1];
- if (search.empty()) return;
- for (size_t scan = 0; scan <= UCHAR_MAX; scan++)
- {
- bad_char_skip[scan] = search.size();
- }
- size_t last = search.size() - 1;
- for (size_t scan = 0; scan < last; scan++)
- {
- bad_char_skip[(size_t)search[scan]] = last - scan;
- }
- /* loop over each chromosome */
- for(std::map<std::string,faidx1_t>::iterator r=name2index.begin();
- r!=name2index.end();
- ++r)
- {
- faidx1_t& index=r->second;
- size_t haystack_index=0;
- while (haystack_index+ search.size() <= (size_t)index.len)
- {
- size_t scan=last;
- while( at(&index,haystack_index+scan) == search[scan] &&
- scan>0)
- {
- --scan;
- }
- if(scan==0)
- {
- cout << search << "\t"<< r->first << "\t" << haystack_index << endl;
- haystack_index+=search.size();
- }
- else
- {
- char c=at(&index,haystack_index+last);
- haystack_index += bad_char_skip[c];
- }
- }
- }
- }
- public:
- /** read a fastq file an searches the sequences in the genome */
- void mapReads(gzFile in)
- {
- string name;
- string seq;
- string name2;
- string qual;
- for(;;)
- {
- if(!readline(in,name)) break;
- if(!readline(in,seq)) break;
- if(!readline(in,name2)) break;
- if(!readline(in,qual)) break;
- while(!seq.empty() && seq[0]=='N')
- {
- seq.erase(0,1);
- }
- while(!seq.empty() && seq[seq.size()-1]=='N')
- {
- seq.erase(seq.size()-1,1);
- }
- if(seq.size()<10) continue;
- std::transform(seq.begin(), seq.end(),seq.begin(), ::toupper);
- //search
- boyerMoore(seq);
- //rev-comp and search
- reverse(seq.begin(),seq.end());
- std::transform(seq.begin(), seq.end(),seq.begin(), ::antiparralele);
- boyerMoore(seq);
- }
- }
- };
- int main(int argc,char** argv)
- {
- char* genomePath=NULL;
- int optind=1;
- while(optind < argc)
- {
- if(strcmp(argv[optind],"-h")==0)
- {
- fprintf(stderr,"%s: Pierre Lindenbaum PHD. 2010.\n",argv[0]);
- fprintf(stderr,"Compilation: %s at %s.\n",__DATE__,__TIME__);
- fprintf(stderr," -g <path/to/genome/indexed/with/samtools/faids> FASQ[stdin|files] \n");
- exit(EXIT_FAILURE);
- }
- else if(strcmp(argv[optind],"-g")==0 && optind+1<argc)
- {
- genomePath=argv[++optind];
- }
- else if(strcmp(argv[optind],"--")==0)
- {
- ++optind;
- break;
- }
- else if(argv[optind][0]=='-')
- {
- fprintf(stderr,"unknown option '%s'\n",argv[optind]);
- exit(EXIT_FAILURE);
- }
- else
- {
- break;
- }
- ++optind;
- }
- if(genomePath==NULL)
- {
- cerr << "genome path missing"<< endl;
- return EXIT_FAILURE;
- }
- /* open IndexedGenome */
- IndexedGenome* genome=new IndexedGenome(genomePath);
- if(optind==argc)
- {
- /* read fastqs from stdin */
- genome->mapReads(gzdopen(fileno(stdin), "r"));
- }
- else
- {
- /** loop over each fastq file */
- while(optind< argc)
- {
- char* filename=argv[optind++];
- errno=0;
- gzFile in=gzopen(filename,"rb");
- if(in==NULL)
- {
- cerr << argv[0]
- << ": Cannot open "<< filename
- << ":" << strerror(errno)
- << endl;
- exit(EXIT_FAILURE);
- }
- genome->mapReads(in);
- gzclose(in);
- }
- }
- /* dispose genome */
- delete genome;
- cerr << "Done." << endl;
- return 0;
- }