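"""BAM preparation pipeline: paired-end FASTQ files -> an analysis-ready BAM.

run_pipeline() executes the stages in order: map reads with BWA-MEM or Bowtie2
(attaching read-group information), convert and sort lane-level BAMs with
samtools, merge them with Picard MergeSamFiles, remove duplicates with Picard
MarkDuplicates, and finish with the GATK indel-realignment and base-quality
recalibration steps (RealignerTargetCreator, IndelRealigner, BaseRecalibrator,
PrintReads). Every external tool is invoked through os.system and each command
is logged to log_file.txt.
"""
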
import glob
import os


class bam_pipeline:
    # working_directory, map_type, sample_type, library_matching_id, thrds
    def __init__(self, working_directory, map_type, sample_type, library_matching_id, thrds):
        self.working_directory = working_directory
        self.map_type = map_type
        self.sample_type = sample_type
        self.library_matching_id = library_matching_id
        self.threads = thrds
        self.logs = {'function': "", 'command': "", 'start_time': "", 'end_time': "", 'threads': self.threads,
                     'success': 0}
        self.write_logs()
        self.picard_path = "/home/bioinformaticslab/Desktop/AMBRY/picard.jar"
        self.gatk_path = "/home/bioinformaticslab/Desktop/AMBRY/GenomeAnalysisTK.jar"
        self.ref_dir = "/home/bioinformaticslab/Desktop/GenomicsWorks/ref_genome_indexes/"
        self.bundle_dir = self.ref_dir + "hg19_bundle"

    def get_fastq(self):
        # Collect the gzipped FASTQ files in the working directory and return
        # their base names with the double ".fastq.gz" extension stripped.
        os.chdir(self.working_directory)
        self.main_dir = os.getcwd()

        all_fastq_files = glob.glob("*fastq.gz")

        split_names_v = [os.path.splitext(os.path.splitext(i)[0])[0] for i in all_fastq_files]

        return split_names_v

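    # get_info() assumes Illumina-style FASTQ names whose "_"-separated fields are
    # <Sample>_<Index>_<Lane>_<Read>_<Chunk> for Tumor samples and
    # <Sample>_<GermlineDNA>_<Index>_<Lane>_<Read>_<Chunk> for Germline samples;
    # e.g. a hypothetical "NB17_ATCACG_L001_R1_001.fastq.gz".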
    def get_info(self, fastq_list):
        sample_ID, germline_dna, index_seq, lanes, pairs_r, n_of_seq = (set() for i in range(6))
        if self.sample_type == "Tumor":
            for i in fastq_list:
                sample_ID.add(i.split("_")[0])
                index_seq.add(i.split("_")[1])
                lanes.add(i.split("_")[2])
                pairs_r.add(i.split("_")[3])
                n_of_seq.add(i.split("_")[4])

            list_with_info = {"Sample_ID": list(sample_ID), "Index": list(index_seq), "Lanes": list(lanes),
                              "Pairs": list(pairs_r), "Number_of_seq": list(n_of_seq)}
            return list_with_info
        elif self.sample_type == "Germline":
            for i in fastq_list:
                sample_ID.add(i.split("_")[0])
                germline_dna.add(i.split("_")[1])
                index_seq.add(i.split("_")[2])
                lanes.add(i.split("_")[3])
                pairs_r.add(i.split("_")[4])
                n_of_seq.add(i.split("_")[5])

            list_with_info = {"Sample_ID": list(sample_ID), "Germline": list(germline_dna), "Index": list(index_seq),
                              "Lanes": list(lanes), "Pairs": list(pairs_r), "Number_of_seq": list(n_of_seq)}
            return list_with_info
        else:
            raise ValueError("Invalid sample type: expected 'Tumor' or 'Germline'.")

    def mapping(self):
        import re
        import gzip
        fastq_list = self.get_fastq()
        info_dict = self.get_info(fastq_list)

        # Read-group fields shared by every lane of this sample.
        RG_SM = info_dict["Sample_ID"][0]
        RG_PL = "Illumina"
        RG_LB = self.library_matching_id
        RG_ID = ""
        RG_PU = ""

        # The flowcell ID is taken from the header line of the first read in the first FASTQ file.
        first_fastq_file_dir = self.working_directory + "/" + fastq_list[0] + ".fastq.gz"
        with gzip.open(first_fastq_file_dir) as f:
            first_line = f.readline()

        flowcell_info = str(first_line).split(":")[2]

        # Map each lane/chunk pair of FASTQ files and pipe the aligner output into a BAM.
        for i in info_dict["Lanes"]:
            for k in info_dict["Number_of_seq"]:
                r1 = re.compile(".*" + i + "_R1_" + k)
                read1 = [s + ".fastq.gz" for s in fastq_list if r1.match(s)]

                r2 = re.compile(".*" + i + "_R2_" + k)
                read2 = [s + ".fastq.gz" for s in fastq_list if r2.match(s)]

                RG_ID = flowcell_info + "." + i[-1]
                RG_PU = flowcell_info + "." + info_dict["Index"][0] + "." + i[-1]
                add_read_group = ""
                map_bam = ""
                gene_origin = self.map_type + "_" + info_dict["Sample_ID"][0] + "_" + \
                    info_dict["Index"][0] + "_" + i + "_" + k + ".bam"

                if self.map_type == "Bwa":
                    add_read_group = ' -R "@RG\\tID:' + RG_ID + '\\tSM:' + RG_SM + '\\tLB:' + RG_LB + \
                                     '\\tPL:' + RG_PL + '\\tPU:' + RG_PU + '" '

                    map_bam = "bwa mem -t " + self.threads + " " + add_read_group + self.ref_dir + \
                              "Bwa/ucsc.hg19.fasta " + read1[0] + " " + read2[0] + \
                              " | samtools view -@" + self.threads + " -bS - > " + gene_origin
                elif self.map_type == "Bowtie":
                    add_read_group = " --rg-id " + RG_ID + " --rg SM:" + RG_SM + " --rg LB:" + RG_LB + \
                                     " --rg PL:" + RG_PL + " --rg PU:" + RG_PU

                    map_bam = "bowtie2 -p" + self.threads + add_read_group + " -x " + self.ref_dir + \
                              "Bowtie/hg_19_bowtie2 -1 " + read1[0] + " -2 " + read2[0] + \
                              " | samtools view -@" + self.threads + " -bS - > " + gene_origin
                else:
                    return "Please specify the map type Bwa/Bowtie"

                self.system_command_send(map_bam, "mapping")

                self.convert_sort(gene_origin)

    def convert_sort(self, sort_gene_origin):
        # Sort the lane-level BAM with samtools, prefixing the output with "SortedBAM_".
        convert_sort = "samtools view -@" + self.threads + " -bS " + sort_gene_origin + " | samtools sort -@" + \
                       self.threads + " -o SortedBAM_" + sort_gene_origin
        self.system_command_send(convert_sort, "mapping_function;convert_sort_command")

    def merge_bams(self):
        os.chdir(self.working_directory)
        fastq_list = self.get_fastq()
        info_dict = self.get_info(fastq_list)
        all_bam_files = glob.glob("SortedBAM*")
        print(all_bam_files)
        # sorted_bam_files = [os.path.splitext(os.path.splitext(i)[0])[0] for i in all_bam_files]
        inputs_list = ""
        for i in all_bam_files:
            inputs_list = inputs_list + "I=" + i + " "

        output_name = self.map_type + "_" + info_dict["Sample_ID"][0] + "_MergedBAM.bam"

        # Merge all sorted lane-level BAMs into a single BAM with Picard MergeSamFiles.
        merge_command = "java -XX:ParallelGCThreads=" + self.threads + \
                        " -jar " + self.picard_path + " MergeSamFiles " + inputs_list + \
                        " O=" + output_name + " USE_THREADING=true"

        self.system_command_send(merge_command, "merge_bams")

    def mark_duplicate(self):
        # Remove duplicate reads from the merged BAM with Picard MarkDuplicates.
        merged_bam = glob.glob("*_MergedBAM.bam")
        mark_prefix_removed = self.map_type + "_mdup_removed_"

        picardcommand = "java -XX:ParallelGCThreads=" + self.threads + \
                        " -jar " + self.picard_path + " MarkDuplicates I=" + merged_bam[0] + \
                        " O=" + mark_prefix_removed + "_" + merged_bam[0] + \
                        " M=marked_dup_metrics.txt REMOVE_DUPLICATES=true CREATE_INDEX=true"
        self.system_command_send(picardcommand, "mark_duplicate")

    def gatk_RealignTargetCreator(self):
        # Identify intervals that need local realignment around known indels.
        bamstr = self.map_type + "_mdup_removed*.bam"
        print(bamstr)
        lastbam = glob.glob(bamstr)
        bcal = "java -jar " + self.gatk_path + " -T RealignerTargetCreator -nt " + \
               self.threads + " -R " + self.bundle_dir + "/ucsc.hg19.fasta -known " + \
               self.bundle_dir + "/Mills_and_1000G_gold_standard.indels.hg19.vcf -I " + lastbam[0] + \
               " -o realign_target.intervals"
        self.system_command_send(bcal, "GATK_RealignTargetCreator")

    def gatk_IndelRealigner(self):
        # Realign reads over the intervals produced by RealignerTargetCreator.
        bamstr = self.map_type + "_mdup_removed*.bam"
        lastbam = glob.glob(bamstr)

        realigned_last_bam = "IndelRealigned_" + lastbam[0]
        bcal = "java -jar " + self.gatk_path + " -T IndelRealigner -R " + self.bundle_dir + "/ucsc.hg19.fasta -known " + \
               self.bundle_dir + "/Mills_and_1000G_gold_standard.indels.hg19.vcf" + \
               " -targetIntervals realign_target.intervals --noOriginalAlignmentTags -I " + lastbam[0] + " -o " + \
               realigned_last_bam

        self.system_command_send(bcal, "GATK_IndelRealigner")

    def gatk_BaseRecalibrator(self):
        # Build the base-quality-score recalibration (BQSR) table from known indel sites.
        bamstr = "IndelRealigned_*.bam"
        lastbam = glob.glob(bamstr)
        basequalityscore = str(lastbam[0]).split(".")[0] + "_bqsr.grp"

        bcal = "java -jar " + self.gatk_path + " -T BaseRecalibrator -R " + self.bundle_dir + "/ucsc.hg19.fasta -I " + \
               lastbam[0] + " -knownSites " + self.bundle_dir + "/Mills_and_1000G_gold_standard.indels.hg19.vcf" + \
               " -o " + basequalityscore
        self.system_command_send(bcal, "GATK_BaseRecalibrator")

    def gatk_PrintReads(self):
        # Apply the BQSR table with PrintReads to write the recalibrated BAM.
        bamstr = "IndelRealigned_*.bam"
        lastbam = glob.glob(bamstr)
        bqsr = glob.glob("*.grp")[0]
        aftercalibratorBam = "Completed_BaseCalibrator_" + lastbam[0]
        bcal = "java -jar " + self.gatk_path + " -T PrintReads -R " + self.bundle_dir + "/ucsc.hg19.fasta -I " + \
               lastbam[0] + " --BQSR " + bqsr + " -o " + aftercalibratorBam

        self.system_command_send(bcal, "GATK_PrintReads")

    def run_gatks(self):
        self.gatk_RealignTargetCreator()
        self.gatk_IndelRealigner()
        self.gatk_BaseRecalibrator()
        self.gatk_PrintReads()

    def run_pipeline(self):
        fastqs = self.get_fastq()
        info = self.get_info(fastqs)
        self.mapping()
        self.merge_bams()
        self.mark_duplicate()
        self.run_gatks()
        self.create_folder()

    def system_command_send(self, command, from_function):
        from datetime import datetime

        self.logs["function"] = from_function
        self.logs["command"] = command
        self.logs["start_time"] = str(datetime.now())
        self.logs["threads"] = self.threads

        # Run the shell command and log its start/end times. Note that os.system
        # returns the command's exit status rather than raising on failure, so
        # 'success' is recorded as 1 whenever the call itself completes.
        try:
            os.system(command)
            self.logs["end_time"] = str(datetime.now())
            self.logs["success"] = 1
            self.write_logs()
            self.clean_log()
        except:
            self.logs["end_time"] = str(datetime.now())
            self.logs["success"] = 0
            self.write_logs()
            self.clean_log()
            return from_function + " gave an error with this command -> " + command

    def write_logs(self):
        import json
        with open('log_file.txt', 'a') as file:
            file.write(json.dumps(self.logs))
            file.write(",")

    def clean_log(self):
        self.logs = {'function': "", 'command': "", 'start_time': "", 'end_time': "", 'threads': self.threads,
                     'success': 0}

    def create_folder(self):
        import shutil

        # Move everything except the input .gz FASTQ files into a subfolder named after the mapper.
        os.chdir(self.working_directory)
        mk_dir = self.working_directory + "/" + self.map_type
        os.mkdir(mk_dir)
        all_files = glob.glob("*.*")
        for file in all_files:
            if file[-2:] != "gz":
                print(file)
                shutil.move(self.working_directory + "/" + file, mk_dir + "/" + file)


if __name__ == "__main__":
    pipeline1 = bam_pipeline("/home/bioinformaticslab/Desktop/AMBRY/Sample_NB17", "Bwa", "Tumor", "306", "7")
    pipeline1.run_pipeline()
    # print(asd)
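    # A hypothetical second invocation (placeholder path and IDs) that would exercise
    # the Bowtie2/Germline branches of get_info() and mapping():
    # pipeline2 = bam_pipeline("/path/to/Sample_XX", "Bowtie", "Germline", "307", "4")
    # pipeline2.run_pipeline()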