Benedek_Danko

salmon_flength_dist

Jan 13th, 2021
642
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. This script creates Salmon flenDist.txt files from fld.gz files,
  5. which are parseable by MultiQC.
  6. """
  7.  
  8. import struct
  9. import gzip
  10. import argparse
  11. import os
  12.  
  13.  
  14. parser = argparse.ArgumentParser()
  15. parser.add_argument('dataset', help='dataset to create flenDist.txt Salmon files for')
  16. args = parser.parse_args()
  17.  
  18.  
  19. def get_fld(path):
  20.     '''
  21.    Returns a tuple containing the fragment length distribution.
  22.    Number of bins (integers): 1001 (from 0 to 1000).
  23.    '''
  24.     with gzip.open(path) as fld_file:
  25.         fld = struct.unpack('i' * 1001, fld_file.read())
  26.         return fld
  27.            
  28.  
  29. def Main():
  30.     dat = args.dataset
  31.     subfolders = [f.path for f in os.scandir(str(dat) +
  32.                                              '/expression/salmon/') if f.is_dir()]
  33.     for sub in subfolders:
  34.         if os.path.exists(sub + '/aux_info/fld.gz'):
  35.             print('Writing flenDist.txt to {}...\n'.format(sub))
  36.             fld_tuple = get_fld(sub + '/aux_info/fld.gz')
  37.             with open(sub + '/libParams/flenDist.txt', 'w') as out_file:
  38.                 for val in fld_tuple:
  39.                     out_file.write(str(val) + " ")
  40.         else:
  41.             print('{}/aux_info/fld.gz does not exists\n'.format(sub))
  42.        
  43.  
  44. if __name__ == '__main__':
  45.     Main() 
RAW Paste Data