Advertisement
Guest User

Untitled

a guest
May 30th, 2015
251
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.11 KB | None | 0 0
  1. import re # to use regular expressions
  2. import math
  3. import argparse # To allow users setting flags in order to define what to filter for.
  4. from pylab import figure, title, xlabel, ylabel, hist, axis, grid, savefig # to draw a historgram
  5.  
  6. parser = argparse.ArgumentParser(description="Creates a cumulative histogram on read coverage (excluding coverage of 0) and prints out upper and lower coverage thresholds")
  7. parser.add_argument("-p","--pileup", help="pileup file name")
  8. parser.add_argument("-ma","--max", help="maximum coverage to be shown")
  9. parser.add_argument("-mi","--min", help="minimum coverage to be shown")
  10. args=parser.parse_args()
  11. # Define the search patterns:
  12.  
  13. PileupFileNamePattern = re.compile('(.*)\.pileup')
  14.  
  15.  
  16.  
  17. def Coverages(pileupfile,maximum,minimum):
  18. '''
  19. Creates a cumulative histogram on read coverage (excluding coverage of 0) and prints out upper and lower coverage thresholds
  20. '''
  21. pileup=open(pileupfile)
  22. OutfilePrefix1 =PileupFileNamePattern.match(pileupfile) # define outfile
  23. OutfilePrefix=OutfilePrefix1.group(1)
  24. Coverage=[] #Create an empty list
  25. for line in pileup:
  26. line=line.strip()
  27. splitted=line.split('\t') # Split the lines by tab separators
  28. if abs(int(splitted[3])) > 0: # excluding all coverages of 0
  29. Coverage.append(abs(int(splitted[3]))) # Pick out the 4th column, which is the number of reads that covered this position
  30. # Remove coverages of 0
  31. CoverageAboveZero=[]
  32. for c in Coverage:
  33. if c>0:
  34. CoverageAboveZero.append(c)
  35. # sort the coverages
  36. CoverageAboveZeroSorted = CoverageAboveZero.sort()
  37. TotalBasesCovered = len(CoverageAboveZeroSorted)
  38. # Identify the lower 2% limit
  39. LowerLimit=CoverageAboveZeroSorted[int((0.02*TotalBasesCovered)-1)]
  40. # Identify the upper 2% limit
  41. UpperLimit=CoverageAboveZeroSorted[int((0.98*TotalBasesCovered)-1)]
  42. # Calculating the standard deviation
  43. mean=float(sum(CoverageAboveZeroSorted)/len(CoverageAboveZeroSorted))
  44. total = 0.0
  45. for value in CoverageAboveZeroSorted:
  46. total += (value - mean) ** 2
  47. stddev = math.sqrt(total / (len(CoverageAboveZeroSorted)-1))
  48. print pileupfile, 'Coverage - mean:', '%10.2f' % mean
  49. print pileupfile, 'Coverage - max:', '%10i' % max(CoverageAboveZeroSorted)
  50. print pileupfile, 'Coverage - min:', '%10i' % min(CoverageAboveZeroSorted)
  51. print pileupfile, 'Coverage - mean-2*stddev:', '%10i' % mean-2*stddev
  52. print pileupfile, 'Coverage - mean+2*stddev:', '%10i' % mean+2*stddev
  53. print pileupfile, 'Coverage - upper 2%:', '%10i' % UpperLimit
  54. print pileupfile, 'Coverage - lower 2%:', '%10i' % LowerLimit
  55.  
  56. # Drawing the historgram
  57. figure()
  58. hist(CoverageAboveZeroSorted, range=(minimum,maximum), bins=50, histtype='bar', facecolor='green', normed=1,cumulative=1)
  59. title('Cumulative Histogram')
  60. xlabel('Coverage')
  61. ylabel('Frequency')
  62. axis()
  63. grid(True)
  64. savefig(OutfilePrefix + 'Coverage_Histogram.png')
  65.  
  66.  
  67.  
  68. Coverages(str(args.pileup),int(args.max),int(args.min))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement