Guest User

Untitled

a guest
Jul 17th, 2018
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.72 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3.  
  4. '''
  5. Created by Brant Faircloth on 11 December 2010 11:28 PST (-0800).
  6. Copyright (c) 2010 Brant C. Faircloth. All rights reserved.
  7.  
  8. Redistribution and use in source and binary forms, with or without
  9. modification, are permitted provided that the following conditions are met:
  10.  
  11. * Redistributions of source code must retain the above copyright notice,
  12. this list of conditions and the following disclaimer.
  13.  
  14. * Redistributions in binary form must reproduce the above copyright notice,
  15. this list of conditions and the following disclaimer in the documentation
  16. and/or other materials provided with the distribution.
  17.  
  18. * Neither the name of the University of California nor the names of its
  19. contributors may be used to endorse or promote products derived from this
  20. software without specific prior written permission. THIS SOFTWARE IS
  21. PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
  22. OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  23. WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  24. DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  25. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  26. CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  27. SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  28. INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  29. CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  30. ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  31. POSSIBILITY OF SUCH DAMAGE.
  32.  
  33. '''
  34.  
  35. import os
  36. import tempfile
  37. import multiprocessing
  38.  
  39. import pdb
  40.  
  41. def file_type(input):
  42. """given an input file, determine the type and return both type and record delimiter (> or @)"""
  43. name, extension = os.path.splitext(os.path.basename(input))
  44. fastas = set(['.fsa','.fasta','.fa'])
  45. fastqs = set(['.fastq','.fq'])
  46. gffs = set(['.gff'])
  47. if extension in fastas:
  48. ft = 'fasta'
  49. delim = '>'
  50. elif extension in fastqs:
  51. ft = 'fastq'
  52. delim = '@'
  53. # TODO: sff ???
  54. #elif extension in gffs:
  55. # ft = 'sff'
  56. # delim = None
  57. else:
  58. raise IOError, "Input file not of correct extension"
  59. return ft, delim
  60.  
  61. def _get_file_chunks(input, delim, size):
  62. """given input, record delimiter, and chunk size, yield an iterator contains file seek (start)
  63. and file read (stop) positions. Return final position as (6365605, None)."""
  64. f = open(input)
  65. while 1:
  66. start = f.tell()
  67. f.seek(size, 1)
  68. line = f.readline()
  69. if not line:
  70. break
  71. # if this isn't a fasta header line, read forward until
  72. # we get to one
  73. while not line.startswith(delim):
  74. line = f.readline()
  75. else:
  76. # now that we got to a fasta header, we're at the end.
  77. # back up the length of the fasta header.
  78. f.seek(-len(line), 1)
  79. # tuple up
  80. yield start, f.tell() - start, input
  81. # make sure we catch the (start, distance) for the end of the file, too
  82. yield start, None, input
  83. f.close()
  84.  
  85. def get_chunks(input, delim, split_type, mb=1, splits = None):
  86. """return a tuple of file seek (start, distance) positions covering chunks of a file"""
  87. if split_type == 'size':
  88. size = mb * (1024**2)
  89. if split_type == 'pieces':
  90. if not splits:
  91. splits = multiprocessing.cpu_count() - 1
  92. size = int(round((os.path.getsize(input)/float(splits)), 0))
  93. return _get_file_chunks(input, delim, size)
  94.  
  95. def _split_file(chunk):
  96. """function to split a file into appropriate pieces given (start, stop) file seek coords"""
  97. f = open(chunk[2])
  98. f.seek(chunk[0])
  99. if chunk[1]:
  100. d = f.read(chunk[1])
  101. else:
  102. d = f.read()
  103. td, tf = tempfile.mkstemp(suffix='.splt')
  104. os.close(td)
  105. otf = open(tf, 'w')
  106. otf.write(d)
  107. otf.close()
  108. f.close()
  109. return tf
  110.  
  111. def make_chunks(chunks, pool = None, mp = True):
  112. """return a list of tempfiles that are the chunked input file"""
  113. if mp and not pool:
  114. # create a multiprocessing pool
  115. procs = multiprocessing.cpu_count() - 1
  116. pool = multiprocessing.Pool(procs)
  117. chunks = pool.map(_split_file, chunks)
  118. # close up the pool if we no longer want to swim
  119. pool.close()
  120. pool.join()
  121. else:
  122. chunks = map(_split_file, chunks)
  123. return chunks
  124.  
  125. if __name__ == '__main__':
  126. input = '../test/galGal3.read1.fa'
  127. f_type, delim = file_type(input)
  128. chunk_offsets = get_chunks(input, delim, split_type='size', mb=24)
  129. chunks = make_chunks(chunk_offsets, mp=False)
  130. pdb.set_trace()
Add Comment
Please, Sign In to add comment