Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #v8
- #none-align to noalign,delete none-annotation
- #change 'expn' to 'expr'
- #add some list.txt for rmarkdown part
- #recalculate the log2
- import os
- import sys
- import pandas as pd
- import subprocess
- import shutil
- import re
- import numpy as np
- import argparse
- import datetime
- def directory_structure(ori_samplefolder, outputpath):
- os.chdir(ori_samplefolder)
- # Change the work path to original samplefolder (the path saved all *.fastq files)
- # capture the fastq file with initial name
- global path
- path = []
- for file in os.listdir(ori_samplefolder):
- if file.endswith(".fastq"):
- samplename = os.path.splitext(file)[0]
- sampleext = os.path.splitext(file)[1]
- samplename = samplename.replace('_', '')
- new_samplefolder = os.path.join(outputpath,samplename)
- if os.path.exists(new_samplefolder):
- path.append(new_samplefolder)
- print("Work folder: \n"+new_samplefolder+ "\n is exsisting..")
- else:
- os.makedirs(new_samplefolder)
- shutil.copy(file,new_samplefolder+'/'+samplename+sampleext)
- path.append(new_samplefolder)
- print("Work folder: \n"+ new_samplefolder+'\n will be created and fastq files will be copy to..')
- elif file.endswith(".gz"):
- subprocess.call(["gunzip *.gz"], shell=True)
- samplename = os.path.splitext(file)[0]
- sampleext = os.path.splitext(file)[1]
- samplename = samplename.replace('_', '')
- new_samplefolder = os.path.join(outputpath,samplename)
- if os.path.exists(new_samplefolder):
- path.append(new_samplefolder)
- print("Work folder: \n"+new_samplefolder+ "\n is exsisting..")
- else:
- os.makedirs(new_samplefolder)
- shutil.copy(file,new_samplefolder+'/'+samplename+sampleext)
- path.append(new_samplefolder)
- print("Work folder: \n"+ new_samplefolder+'\n will be created and fastq files will be copy to..')
- print("# The work folder will be: \n" + outputpath)
- def parameters(spe,outputpath):
- species = None
- refname = None
- refpath = None
- if spe == 'hsa':
- species = ' -o hsa'
- refname = 'hg38'
- refpath = '/n/data1/genomes/indexes/hg38/hg38.fa'
- elif spe == 'mmu':
- species = ' -o mmu'
- refname = 'mm9'
- refpath = '/n/data1/genomes/indexes/mm9/mm9.fa'
- elif spe == 'dme':
- species = ' -o dme'
- refname = 'dm6'
- refpath = '/n/data1/genomes/indexes/dm6/dm6.fa'
- # if ref.endswith(".fa"):
- # fullrefname = os.path.split(ref)[1]
- # refname = os.path.splitext(fullrefname)[0]
- # print(refname)
- else:
- print("invalid species code, only support hsa, mmu and dme")
- if os.path.isfile(outputpath+'/'+'ref.txt'):
- os.remove(outputpath+'/'+'ref.txt')
- f = open(outputpath + '/' + 'ref.txt', 'w+')
- f.write(refpath+'\n')
- else:
- f = open(outputpath +'/'+'ref.txt','w+')
- f.write(refpath+'\n')
- f.close()
- return(species,refname,refpath)
- def parameters_custom(spe,ref):
- species = None
- refname = None
- refpath = None
- if spe == 'hsa':
- species = ' -o hsa'
- refname = os.path.splitext(ref)[0]
- refpath = ref
- elif spe == 'mmu':
- species = ' -o mmu'
- refname = os.path.splitext(ref)[0]
- refpath = ref
- elif spe == 'dme':
- species = os.path.splitext(ref)[0]
- refname = 'dm6'
- refpath = ref
- # if ref.endswith(".fa"):
- # fullrefname = os.path.split(ref)[1]
- # refname = os.path.splitext(fullrefname)[0]
- # print(refname)
- else:
- print("invalid species code, only support hsa, mmu and dme")
- return(species,refname,refpath)
- def samplelist(ori_samplefolder,outputpath):
- if os.path.isfile(outputpath+'/'+'samplelist.txt'):
- os.remove(outputpath+'/'+'samplelist.txt')
- f = open(outputpath + '/' + 'samplelist.txt', 'w+')
- f.write(ori_samplefolder+'\n')
- for file in os.listdir(ori_samplefolder):
- if file.endswith(".fastq"):
- samplename = os.path.splitext(file)[0]
- f.write(samplename + '.fastq\n')
- f.close()
- else:
- f = open(outputpath +'/'+'samplelist.txt','w+')
- f.write(ori_samplefolder+'\n')
- for file in os.listdir(ori_samplefolder):
- if file.endswith(".fastq"):
- samplename = os.path.splitext(file)[0]
- f.write(samplename + '.fastq\n')
- f.close()
- def alignments(referencebase_bwa):
- for i in range(len(path)):
- os.chdir(path[i])
- for file in os.listdir(os.getcwd()):
- if file.endswith(".fastq"):
- samplename, extension = os.path.splitext(file)
- print(
- "# The alignment will be running as :\n /n/apps/CentOS7/bin/bwa aln -t 3 " + referencebase_bwa + " " + samplename + ".fastq >" + samplename + ".sai")
- subprocess.call(["/n/apps/CentOS7/bin/bwa aln -t 3 " + referencebase_bwa + " "+samplename + ".fastq >" + samplename + ".sai"+'\n'],
- shell=True)
- print('# building the .sam file :\n'
- "/n/apps/CentOS7/bin/bwa samse -n 10 " + referencebase_bwa + " "+samplename + ".sai " + samplename + ".fastq >" + samplename + ".sam"+'\n')
- subprocess.call([
- "/n/apps/CentOS7/bin/bwa samse -n 10 " + referencebase_bwa + " " + samplename + ".sai " + samplename + ".fastq >" + samplename + ".sam"],
- shell=True)
- def annotation(spec,referencebase,outputpath):
- # #path of tools
- runtools_annotation = 'perl /n/ngs/tools/tcga/v0.2.7/code/annotation/annotate.pl'
- runtools_annotation_alignment_stats = 'perl /n/ngs/tools/tcga/v0.2.7/code/library_stats/alignment_stats.pl'
- runtools_annotation_graph = 'perl /n/ngs/tools/tcga/v0.2.7/code/library_stats/graph_libs.pl'
- runtools_annotation_tcga = 'perl /n/ngs/tools/tcga/v0.2.7/code/custom_output/tcga/tcga.pl'
- runtools_expr = 'perl /n/ngs/tools/tcga/v0.2.7/code/library_stats/expression_matrix.pl'
- mirnabase = ' -m mirna_21a'
- fullcommand_annotation = runtools_annotation + mirnabase + " -u "+referencebase + spec + " -p " + outputpath
- fullcommand_align_stats = runtools_annotation_alignment_stats + " -p " + outputpath
- fullcommand_graph = runtools_annotation_graph + " -p " + outputpath
- fullcommand_tcga = runtools_annotation_tcga + mirnabase + spec + " -g "+ referencebase + " -p " + outputpath
- fullcommand_expr = runtools_expr + mirnabase +spec+ " -p " + outputpath
- print("# The annotation script will run as: ")
- print(fullcommand_annotation+'\n')
- subprocess.call([fullcommand_annotation], shell=True)
- print("# The statistics of annotation script will run as:")
- print(fullcommand_align_stats+'\n')
- subprocess.call([fullcommand_align_stats], shell=True)
- print("# The visualization of stats script will run as:")
- print(fullcommand_graph+'\n')
- print("# The tcga results output script will run as:")
- print(fullcommand_tcga+'\n')
- subprocess.call([fullcommand_tcga], shell=True)
- print("# The expression matrix script will run as: ")
- print(fullcommand_tcga+'\n')
- subprocess.call([fullcommand_expr], shell=True)
- def rename(outputpath):
- df = pd.read_csv(outputpath + '/' + 'alignment_stats.csv')
- df.columns = [u'Library', u'Index', u'Total Reads greater 15bp', u'Adapter dimers',
- u'Adapter dimers', u'Adapter at 1-14bp', u'Adapter at 15-25bp',
- u'Adapter at 26-35bp', u'Adapter after 35bp',
- u'Aligned Reads Post-Filter', u'percent Aligned Reads', u'Unaligned Reads',
- u'percent Unaligned Reads', u'Filtered Reads without XA',
- u'Softclipped Reads', u'Chastity Failed Reads Post-Filter',
- u'detected miRNA', u'detected miRNA Covered by 10 Reads',
- u'Total miRNA reads', u'Crossmapped miRNA reads', u'mature miRNA reads', u'star miRNA reads',
- u'precursor miRNA reads', u'miRNA loop reads', u'unannotated miRNA reads', u'snoRNA reads',
- u'tRNA reads', u'rRNA reads', u'snRNA reads', u'scRNA reads', u'srpRNA reads',
- u'Other RepeatMasker RNAs reads', u'RNA (No CDS) reads', u'3UTR reads', u'5UTR reads',
- u'Coding Exon reads', u'Intron reads', u'LINE reads', u'SINE reads', u'LTR reads', u'Satellite reads',
- u'RepeatMasker DNA reads', u'RepeatMasker Low complexity reads',
- u'RepeatMasker Simple repeat reads', u'RepeatMasker Other reads',
- u'RepeatMasker Unknown reads', u'Unknown reads', u'Total miRNA',
- u'Crossmapped miRNA', u'mature miRNA', u'star miRNA',
- u'precursor miRNA', u'miRNA loop', u'unannotated miRNA',
- u'snoRNA', u'tRNA', u'rRNA', u'snRNA', u'scRNA', u'srpRNA',
- u'Other RepeatMasker RNAs', u'RNA(No CDS)', u'3UTR',
- u'5UTR', u'Coding Exon', u'Intron', u'LINE', u'SINE',
- u'LTR', u'Satellite', u'RepeatMasker DNA',
- u'RepeatMasker Low complexity', u'RepeatMasker Simple repeat',
- u'RepeatMasker Other', u'RepeatMasker Unknown', u'Unknown']
- df1 = df.iloc[:, 0:47]
- df2 = (df.iloc[:, 47:76]).replace('%', '', regex=True).astype('float') / 100
- df3 = pd.concat([df1, df2], axis=1)
- df3.to_csv(outputpath + '/' + "alignment_stats.csv", index=False, header=True)
- if os.path.isfile(outputpath + '/expn_matrix.txt') == True:
- os.rename(outputpath + '/expn_matrix.txt', outputpath + '/miRNA_expr_raw_matrix.txt')
- if os.path.isfile(outputpath + '/expn_matrix_norm.txt') == True:
- rpm = pd.read_table(outputpath + '/expn_matrix_norm.txt', index_col=0, header=0)
- rpm = rpm.round(6)
- rpm1 = rpm + 1
- rpm = rpm.astype(str)
- rpm = rpm.replace('0.000000', '0')
- log = np.log2(rpm1)
- log = pd.DataFrame(log)
- log = log.round(6)
- log = log.astype(str)
- log = log.replace('0.000000', '0')
- log.to_csv(outputpath + '/' + 'miRNA_expr_RPM_plus_1_log2_matrix.txt', sep='\t')
- rpm.to_csv(outputpath + '/' + 'miRNA_expr_RPM_matrix.txt', sep='\t')
- os.remove(outputpath + '/expn_matrix_norm.txt')
- if os.path.isfile(outputpath + '/expn_matrix_norm_log.txt') == True:
- os.remove(outputpath + '/expn_matrix_norm_log.txt')
- def table_merge(outputpath):
- result_table_list_1 = ['3_UTR.txt', '5_UTR.txt', 'Intron.txt', 'LINE.txt', 'LTR.txt', 'rmsk_DNA.txt', 'rmsk_RNA.txt', 'rmsk_Simple_repeat.txt', 'rmsk_Unknown.txt',
- 'rRNA.txt', 'scRNA.txt', 'SINE.txt', 'snoRNA.txt', 'snRNA.txt', 'srpRNA.txt', 'tRNA.txt', 'Satellite.txt']
- merged_result = None
- #create the empty txt table file if that table not exist under sample_features
- for i in range(len(path)):
- sample_name = os.path.split(path[i])[1]
- workpath = path[i] + '/' + sample_name + '_features'
- os.chdir(workpath)
- for filename in result_table_list_1:
- if os.path.exists(workpath + '/' +filename):
- continue
- else:
- f = open(filename,'w+')
- f.close()
- for filename in result_table_list_1:
- result = None
- for i in range(len(path)):
- sample_name = os.path.split(path[i])[1]
- file_content = pd.read_csv(path[i] + '/' + sample_name + '_features' + '/' + filename, names=[sample_name], index_col=0)
- if result is None:
- result = file_content
- else:
- result = pd.concat([result, file_content], axis = 1)
- result['type'] = filename.split('.')[0]
- if merged_result is None:
- merged_result = result
- else:
- merged_result = pd.concat([merged_result, result], axis=0)
- # set the missing value
- merged_result = merged_result.fillna('NA')
- #switch the type column with the second column
- type = merged_result.pop('type')
- merged_result.insert(0,'type',type)
- merged_result.to_csv(outputpath+'/'+"Feature_summary_table.txt", sep='\t')
- return merged_result
- def filter_miRNA_table():
- for i in range(len(path)):
- sample_name = os.path.split(path[i])[1]
- workpath = path[i] + '/' + sample_name + '_features'
- os.chdir(workpath)
- name = []
- category = []
- mir_id = []
- value = []
- content = []
- file = open(workpath+"/"+ "miRNA.txt")
- reader = file.readlines()
- for line in reader:
- if (re.split('[,\s]',line)[1] == 'unannotated') or (re.split('[,\s]',line)[1]) == "precursor" or (re.split('[,\s]',line)[1] == "stemloop"):
- name.append(re.split('[,\s]', line)[0])
- category.append(re.split('[,\s]', line)[1])
- mir_id.append(re.split('[,\s]', line)[1])
- value.append(re.split('[,\s]', line)[2])
- else:
- name.append(re.split('[,\s]', line)[0])
- category.append(re.split('[,\s]', line)[1])
- mir_id.append(re.split('[,\s]', line)[2])
- value.append(re.split('[,\s]', line)[3])
- for i in range(len(value)):
- content.append(name[i] + " " + category[i] + " " + mir_id[i] + "," + value[i])
- output = open(workpath +"/" + "miRNA_filtered.txt", "w")
- for i in range(len(content)):
- output.write(content[i])
- output.write('\n')
- # def table_merge_miRNA():
- #
- # result_table_list_1 = ['miRNA_filtered.txt']
- #
- # merged_result = None
- # print(path)
- # #create the empty txt table file if that table not exist under sample_features
- # for i in range(len(path)):
- # sample_name = os.path.split(path[i])[1]
- # print(sample_name)
- # workpath = path[i] + '/' + sample_name + '_features'
- # os.chdir(workpath)
- # for filename in result_table_list_1:
- # if os.path.exists(workpath + '/' +filename):
- # continue
- # else:
- # f = open(filename,'w+')
- # f.close()
- #
- # for filename in result_table_list_1:
- # result = None
- # for i in range(len(path)):
- # sample_name = os.path.split(path[i])[1]
- # file_content = pd.read_csv(path[i] + '/' + sample_name + '_features' + '/' + filename, names=[sample_name], index_col=0)
- # print(file_content)
- # if result is None:
- # result = file_content
- # else:
- # result = pd.concat([result, file_content], axis = 1)
- # result['type'] = filename.split('.')[0]
- # if merged_result is None:
- # merged_result = result
- # else:
- # merged_result = pd.concat([merged_result, result], axis=0)
- #
- # # set the missing value
- # merged_result = merged_result.fillna('NA')
- #
- # #switch the type column with the second column
- # type = merged_result.pop('type')
- # merged_result.insert(0,'type',type)
- # return merged_result
- #
- def table_merge_miRNA(outputpath):
- result_table_list_1 = ['miRNA_filtered.txt']
- result = None
- #create the empty txt table file if that table not exist under sample_features
- for i in range(len(path)):
- sample_name = os.path.split(path[i])[1]
- workpath = path[i] + '/' + sample_name + '_features'
- os.chdir(workpath)
- for filename in result_table_list_1:
- if os.path.exists(workpath + '/' +filename):
- continue
- else:
- f = open(filename,'w+')
- f.close()
- table_head = ['name', 'type', 'mir_id']
- for filename in result_table_list_1:
- result = None
- for i in range(len(path)):
- sample_name = os.path.split(path[i])[1]
- table_head.append(sample_name)
- file_content = pd.read_csv(path[i] + '/' + sample_name + '_features' + '/' + filename, header= None, index_col=0)
- if result is None:
- result = file_content
- else:
- result = pd.concat([result, file_content], axis = 1)
- result = result.fillna(0) #########can not set the fill content as NA, since it leads misplacement in next step, to FIX!!!!!##########
- # set the missing value
- result.to_csv(outputpath+'/'+"miRNA_expr_summary_table.txt",sep='\t',header=False)
- merged_result = pd.read_csv(outputpath+'/'+"miRNA_expr_summary_table.txt", sep='\s+',names=table_head)
- merged_result.to_csv(outputpath+'/'+"miRNA_expr_summary_table.txt",sep='\t',index=False)
- return result
- #
- #
- def merge_mirna_taglen(outputpath):
- result = None
- for i in range(len(path)):
- sample_name = os.path.split(path[i])[1]
- # print(sample_name)
- workpath = path[i] + '/' + sample_name + '_features'
- os.chdir(workpath)
- file_content = pd.read_csv('filtered_taglengths.csv',header=0)
- file_content.insert(0,'sample',sample_name)
- file_content.to_csv('filtered_taglengths.txt', sep='\t',index=False, header=True)
- if result is None:
- result = file_content
- else:
- result = pd.concat([result, file_content], axis = 0)
- df = pd.DataFrame(result)
- col_list = list(df)
- col_list.remove('sample')
- col_list.remove('taglen')
- df['total'] = df[col_list].sum(axis=1)
- df.to_csv(outputpath+'/'+"merged_taglen_table.csv", index=False,header=True)
- def remove_files():
- print("# The redundant file will be transformed and removed as : ")
- for i in range(len(path)):
- os.chdir(path[i])
- for file in os.listdir(os.getcwd()):
- if file.endswith(".fastq"):
- os.remove(file)
- if file.endswith(".sam"):
- samplename = os.path.splitext(file)[0]
- print("\n samtools view -Su "+ file + ">" + samplename +".bam ")
- subprocess.call(["samtools view -Su "+ file + ">" + samplename +".bam "],shell=True)
- print("\n samtools sort " + samplename +".bam -o " + samplename + ".sorted.bam ")
- subprocess.call(["samtools sort " + samplename +".bam -o " + samplename + ".sorted.bam "], shell=True)
- print("\n samtools index "+samplename+".sorted.bam")
- subprocess.call(["samtools index "+samplename+".sorted.bam"], shell=True)
- os.remove(file)
- os.remove(samplename+'.bam')
- os.remove(samplename+'.sai')
- def main():
- parser = argparse.ArgumentParser()
- requiredNamed = parser.add_argument_group('required named arguments')
- requiredNamed.add_argument("-i", "--input", action="store",dest='fastq_dir',required=True, help="Please enter the path of sample .fastq folder")
- requiredNamed.add_argument("-o", "--ouput", action="store",dest='out_dir',required=True, help="Please enter the path of directory to save result")
- requiredNamed.add_argument("-s", "--species", action="store",dest='specie',required=True, help="Please enter the species type of your sample: (hsa, mmu or dme)")
- parser.add_argument("--noalign",action="store_true",default=False, dest='alignment', help="if you already alignment .sam files under your sample work folder, and you may skip alignment step please enter this argument")
- parser.add_argument("-r", "--reference", action="store", default=None, dest='ref', help="indicate the custom reference instead of default e.g. /n/data1/genomes/indexes/hg38/hg38.fa")
- args = parser.parse_args()
- f = open(args.fastq_dir+ '/' + "logging.txt", 'w+')
- logging_path= (args.fastq_dir+ '/' + "logging.txt")
- sys.stdout = f
- print('# The used arguments are:')
- a = 'python '
- for i in range(len(sys.argv)):
- a = a + ' ' + sys.argv[i]
- b = a.replace("'", '')
- b = b.replace("[", '')
- b = b.replace(",", '')
- b = b.replace("]", '')
- print(b)
- directory_structure(args.fastq_dir, args.out_dir)
- # parameter settings
- if args.ref is None:
- tcga_arg = parameters(args.specie,args.out_dir)
- print('# The using reference genome index is:')
- print(tcga_arg[2] + '\n')
- else:
- tcga_arg = parameters_custom(args.specie,args.ref)
- print('# The using reference genome index is:')
- print(tcga_arg[2] + '\n')
- samplelist(args.fastq_dir, args.out_dir)
- if args.alignment == False:
- alignments(tcga_arg[2])
- annotation(tcga_arg[0], tcga_arg[1], args.out_dir)
- rename(args.out_dir)
- table_merge(args.out_dir)
- filter_miRNA_table()
- table_merge_miRNA(args.out_dir)
- merge_mirna_taglen(args.out_dir)
- shutil.copy("/n/core/Bioinformatics/research/cxu/mirna_pipeline/python_pipeline/rmarkdown_mod.Rmd", args.out_dir)
- remove_files()
- today = datetime.date.today()
- print('\n')
- print('Date:')
- print(today)
- f.close()
- if os.path.isfile(args.out_dir + '/logging.txt') == True:
- os.remove(args.out_dir + '/logging.txt')
- shutil.move(logging_path,args.out_dir)
- if __name__ == '__main__':
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement