Advertisement
pushrbx

Fastest File Processor ever?

Apr 24th, 2013
170
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 3.14 KB | None | 0 0
  1. /*
  2. * readfastq.h
  3. *
  4. *  Created on: Sep 27, 2011
  5. *      Author: michael
  6. *      Edited by: scarysandwich
  7. *      Edited on: April 25, 2013
  8. */
  9.  
  10. #ifndef READFASTQ_H_
  11. #define READFASTQ_H_
  12.  
  #include<stdint.h> //uint32_t
  #include<stdio.h>
  #include<stdlib.h> //to use malloc
  #include<string.h>
  #include<sys/time.h> //gettimeofday, timeval

  #include<fstream> //file stream
  #include<iostream> //to open files
  #include <istream>
  #include<stdexcept> //std::runtime_error, std::out_of_range
  #include<string>
  #include<vector>

  #include <boost/iostreams/device/file.hpp>
  #include <boost/iostreams/stream.hpp>
  #ifdef GZIP
  #include <boost/iostreams/device/file.hpp>
  #include <boost/iostreams/stream.hpp>
  #include <boost/iostreams/filtering_streambuf.hpp>
  #include <boost/iostreams/filtering_stream.hpp>
  #include <boost/iostreams/filter/gzip.hpp>
  #endif

  #include"shortread.h"
  34.  
  35.  
  // Compile-time switch: code guarded by #ifdef TIMEMEASURE reports its timing.
  #define TIMEMEASURE
  // Elapsed time from `start` to `end` in whole milliseconds (integer division
  // truncates). Both arguments must expose tv_sec and tv_usec members, as
  // struct timeval does. Arguments are parenthesized so that expressions such
  // as `*ptr` expand correctly.
  #define TIMECALC(start, end) ((((end).tv_sec - (start).tv_sec) * 1000000 + ((end).tv_usec - (start).tv_usec)) / 1000)
  38.  
  39.  
  40. using namespace std;
  41.  
  /**
   * In-memory store for the reads of a FASTQ file.
   *
   * The file is consumed as consecutive four-line records (name, sequence,
   * separator, quality). Only records whose sequence and quality lines are
   * exactly SRlength characters long are kept. Names are stored one string per
   * read; sequences and qualities are stored back to back in flat char
   * buffers, so read i occupies bytes [i*SRlength, (i+1)*SRlength). The flat
   * buffers are not NUL-terminated.
   */
  class fastqfile{

      std::vector<std::string> _names;
      std::vector<char>  _seqs;   // accepted sequences, SRlength bytes each
      std::vector<char>  _phred;  // matching quality strings, SRlength bytes each

      int _filelength;  // number of accepted reads (can be overridden via setNumberOfSRs)
      int _SRlength;    // fixed length of every stored read
      int _blocksize;   // value handed in by the caller; recorded only, not interpreted here

      // Copies read i (SRlength bytes, no terminator) from a flat buffer into ret.
      void copyRecord(const std::vector<char> &src, uint32_t i, char *ret) const {
          if (_SRlength <= 0)
              return; // zero-length reads: nothing to copy
          const size_t len = static_cast<size_t>(_SRlength);
          const size_t first = static_cast<size_t>(i) * len;
          // at() on the last byte validates the whole range and throws
          // std::out_of_range, the same failure mode as getNameAt().
          static_cast<void>(src.at(first + len - 1));
          memcpy(ret, &src[first], len);
      }

  public:

      /**
       * Builds the store by loading filename right away.
       * See readfastq() for the parsing rules and the errors it can raise.
       */
      fastqfile(const char *filename, int SRlength, uint32_t blocksize)
          : _filelength(0), _SRlength(SRlength), _blocksize(0) {
          readfastq(filename, SRlength, blocksize);
      }

      // Raw views of the stored data; NULL when nothing has been loaded.
      // The pointers are invalidated by the next readfastq() call.
      std::string *getNames(){ return _names.empty() ? NULL : &(_names[0]);}
      char        *getSeqs(){ return _seqs.empty() ? NULL : &(_seqs[0]);}
      char        *getPhred(){ return _phred.empty() ? NULL : &(_phred[0]);}
      uint32_t    getNumberOfSR() const { return _filelength;}
      uint32_t    getSRlegth() const {return _SRlength;}   // historical spelling, kept for callers
      uint32_t    getSRlength() const {return _SRlength;}  // correctly spelled alias

      // Per-read accessors. i is a read index; an index past the stored data
      // throws std::out_of_range. ret must have room for SRlength bytes and
      // receives no NUL terminator.
      std::string getNameAt(uint32_t i) const { return _names.at(i);}
      void        getSRAt(uint32_t i, char *ret) const { copyRecord(_seqs, i, ret);}
      void        getPhredAt(uint32_t i, char *ret) const { copyRecord(_phred, i, ret);}

      // Overrides the read count reported by getNumberOfSR(); the stored data
      // is left untouched.
      void        setNumberOfSRs(uint32_t num){
          _filelength = num;
      }

      /**
       * Replaces the current contents with the reads found in filename.
       *
       * @param filename  path of the FASTQ file to read
       * @param SRlength  required length of every sequence and quality line;
       *                  records with a different non-empty sequence are
       *                  reported on stdout and skipped, records with an
       *                  empty sequence line are skipped silently
       * @param blocksize stored in _blocksize and otherwise unused by this
       *                  class. NOTE(review): the intended meaning (possibly a
       *                  cap on reads per block) is not visible here - confirm.
       * @throws std::runtime_error if filename is NULL or cannot be opened
       */
      void readfastq(const char *filename, int SRlength, uint32_t blocksize = 0) {
          _names.clear();
          _seqs.clear();
          _phred.clear();
          _filelength = 0;
          _SRlength = SRlength;
          _blocksize = static_cast<int>(blocksize);

  #ifdef TIMEMEASURE
          timeval start, end;
          gettimeofday(&start, 0);
  #endif

          if (filename == NULL)
              throw std::runtime_error("fastqfile: no file name given");

          std::ifstream file(filename);
          if (!file)
              throw std::runtime_error(std::string("fastqfile: cannot open ") + filename);

          // The file size is only a capacity hint: sequence and quality data
          // each take less than half of a FASTQ file.
          std::streamoff bytes_expected = 0;
          if (file.seekg(0, std::ios::end)) {
              bytes_expected = file.tellg();
              file.seekg(0, std::ios::beg); //go back to the beginning of the file
          } else {
              file.clear(); // not seekable: skip the hint, still try to read
          }
          if (bytes_expected > 0) {
              _seqs.reserve(static_cast<size_t>(bytes_expected / 2));
              _phred.reserve(static_cast<size_t>(bytes_expected / 2));
          }

          std::string name;
          std::string seqtemp;
          std::string garbage;
          std::string phredtemp;

          while (std::getline(file, name)) {
              // A failed getline may leave its target unchanged, so a truncated
              // record must never be processed with stale data from the
              // previous one.
              if (!std::getline(file, seqtemp) ||
                  !std::getline(file, garbage) ||
                  !std::getline(file, phredtemp)) {
                  if (!name.empty())
                      printf("Error on read in fastq: truncated record at end of file\n");
                  break;
              }

              const size_t wanted = static_cast<size_t>(SRlength);
              if (seqtemp.size() != wanted) {
                  if (seqtemp.size() != 0)
                      printf("Error on read in fastq: size is invalid\n");
                  continue;
              }
              if (phredtemp.size() != wanted) {
                  // Keeping this record would break the fixed stride of _phred.
                  printf("Error on read in fastq: quality size is invalid\n");
                  continue;
              }

              _names.push_back(name);
              //do not handle special letters here, do on GPU
              _seqs.insert(_seqs.end(), seqtemp.begin(), seqtemp.end());
              _phred.insert(_phred.end(), phredtemp.begin(), phredtemp.end());
              ++_filelength;
          }

  #ifdef TIMEMEASURE
          gettimeofday(&end, 0);
          std::cerr << "FastQ block read in " << TIMECALC(start, end) << "ms" << std::endl;
  #endif
      }



      // Debug helper: prints every stored sequence byte as a decimal number
      // followed by '-', then a newline.
      void print() const {
          for(size_t i=0; i<_seqs.size(); ++i){
              printf("%i-", _seqs[i]);
          }
          printf("\n");
      }

  };
  135.  
  136. #endif /* READFASTQ_H_ */
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement