Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- Author: Darrin Schultz @conchoecia
- File: Transcriptome assembly, annotation, and db creation
- Instructions:
- - To run this script and all of its analyses: make sure that python 3,
- snakemake, and biopython are installed on your Unix computer.
- - Execute the following command: `snakemake --cores 45`, replacing `45`
- with the number of threads available on your machine.
- """
- import os
- import sys
- from Bio import SeqIO
- import shutil
- configfile: "config.yaml"
- trimmomatic = "/usr/local/bin/Trimmomatic-0.35"
- maxthreads = 90
- dockeruser = "YOUR_USERNAME_HERE"
- curr_dir = os.getcwd()
- print(curr_dir)
- config["rna_f"] = {}
- config["rna_r"] = {}
- config["input_reads"] = []
- for sample in config["samples"]:
- config["input_reads"].append(config["samples"][sample]["f"])
- config["input_reads"].append(config["samples"][sample]["r"])
- # Now we need to change how the SS_lib_type_param is handled
- for sample in config["samples"]:
- thisss_lib = config["samples"][sample]["SS_lib_type"]
- if thisss_lib in ["None", "NA", "none", "no", "nein"]:
- config["samples"][sample]["SS_lib_type"] = ""
- elif thisss_lib in ["rf", "RF", "Rf", "rF"]:
- config["samples"][sample]["SS_lib_type"] = "--SS_lib_type RF"
- elif thisss_lib in ["fr", "FR", "Fr", "fR"]:
- config["samples"][sample]["SS_lib_type"] = "--SS_lib_type FR"
- def read_number_from_file(filename):
- with open(filename, "r") as f:
- for line in f:
- if line.strip():
- return line.strip()
- def singleline_to_multiline(infile, outfile):
- """
- This function takes a fasta file and wraps it to N characters
- """
- out_handle = open(outfile, 'w') #open outfile for writing
- with open(infile, "rU") as handle:
- for record in SeqIO.parse(handle, "fasta"):
- SeqIO.write(record, out_handle, "fasta")
- out_handle.close()
- rule all:
- input:
- #make the symlinks
- expand("reads/{sample}_f.fastq.gz", sample = config["samples"]),
- expand("reads/{sample}_r.fastq.gz", sample = config["samples"]),
- ##trim the reads
- expand("trimmed/{sample}_{readtype}.trim.fastq.gz", sample = config["samples"], readtype = ["f", "r"]),
- #now assemble the transcriptomes
- expand("txomes/raw/trinity_{sample}_raw.fasta", sample = config["samples"]),
- # make it multi-line
- expand("txomes/final/{sample}.fasta", sample = config["samples"]),
- # transdecoder and translate, rename the pep files
- expand("pepfiles/final/{sample}.pep", sample = config["samples"]),
- # softlinks for /data/ncbi/db
- expand("/data/ncbi/db/{sample}.fasta", sample = config["samples"]),
- expand("/data/ncbi/db/{sample}.pep", sample = config["samples"]),
- # make the blast databases
- expand("/data/ncbi/db/{sample}.fasta.nhr", sample = config["samples"]),
- expand("/data/ncbi/db/{sample}.pep.phr", sample = config["samples"]),
- expand("/data/ncbi/db/{sample}.dmnd", sample = config["samples"]),
- # make the report
- expand("info/counts/raw/{sample}_raw_count.txt", sample = config["samples"]),
- expand("info/counts/trimmed/{sample}_trimmed_count.txt", sample = config["samples"]),
- expand("info/trimming/txome/{sample}_qual_rejects.txt", sample = config["samples"]),
- expand("info/trimming/txome/{sample}_adapter_rejects.txt", sample = config["samples"]),
- expand("info/trimming/txome/{sample}_gc_rejects.txt", sample = config["samples"]),
- expand("info/counts/fasta/{sample}_fasta_count.txt", sample = config["samples"]),
- expand("info/counts/peps/{sample}_pep_count.txt", sample = config["samples"]),
- expand("info/counts/fasta/{sample}_N50.txt", sample = config["samples"]),
- expand("info/counts/peps/{sample}_N50.txt", sample = config["samples"]),
- expand("info/counts/fasta/{sample}_meanlen.txt", sample = config["samples"]),
- expand("info/counts/fasta/{sample}_N90.txt", sample = config["samples"]),
- expand("info/counts/fasta/{sample}_mb.txt", sample = config["samples"]),
- "report/final_report.txt"
- ###############################################################
- # __ __ ___ __ __ __ __ ___ __ __ _ __
- # |__) |__) |__ |__) |__) / \ / ` |__ /__` /__` | |\ | / _`
- # | | \ |___ | | \ \__/ \__, |___ .__/ .__/ | | \| \__>
- #
- # These are the rules for preprocessing the inputs in order
- # to prepare for the rest of the pipeline.
- # preprocessing rule 1
- rule make_symlink_f:
- output:
- illumina_f = "reads/{sample}_f.fastq.gz",
- run:
- if not os.path.exists("reads"):
- print("Making a directory called 'reads'.", file = sys.stderr)
- os.makedirs("reads")
- else:
- print("The 'reads' directory exists already.", file = sys.stderr)
- #make symlinks for the illumina reads
- print("Making read symlinks.", file =sys.stderr)
- print(" - Checking files for sample {}".format(wildcards.sample), file=sys.stderr)
- rna_f = "reads/{}_f.fastq.gz".format(wildcards.sample)
- if not os.path.exists(rna_f):
- print(" - Making a symlink for {}".format(rna_f), file=sys.stderr)
- reads_path = config["samples"][wildcards.sample]["f"]
- if not os.path.exists(reads_path):
- raise Exception("{} does not exist.".format(reads_path))
- os.symlink(reads_path, rna_f)
- else:
- print(" - A symlink already exists for {}".format(rna_f), file=sys.stderr)
- rule make_symlink_r:
- output:
- illumina_r = "reads/{sample}_r.fastq.gz",
- run:
- if not os.path.exists("reads"):
- print("Making a directory called 'reads'.", file = sys.stderr)
- os.makedirs("reads")
- else:
- print("The 'reads' directory exists already.", file = sys.stderr)
- #make symlinks for the illumina reads
- print("Making read symlinks.", file =sys.stderr)
- for sample in config["samples"]:
- print(" - Checking files for sample {}".format(wildcards.sample), file=sys.stderr)
- rna_r = "reads/{}_r.fastq.gz".format(wildcards.sample)
- if not os.path.exists(rna_r):
- print(" - Making a symlink for {}".format(rna_r), file=sys.stderr)
- reads_path = config["samples"][wildcards.sample]["r"]
- if not os.path.exists(reads_path):
- raise Exception("{} does not exist.".format(reads_path))
- os.symlink(reads_path, rna_r)
- else:
- print(" - A symlink already exists for {}".format(rna_r), file=sys.stderr)
- #
- # end of rules for preprocessing
- #
- ###########################################################
- ##############################################################
- # _ ____ _
- # (_) / /_ ______ ___ (_)___ ____ _
- # / / / / / / / __ `__ \/ / __ \/ __ `/
- # / / / / /_/ / / / / / / / / / / /_/ /
- # /_/_/_/\__,_/_/ /_/ /_/_/_/ /_/\__,_/
- #
- # These are the rules for working with the Illumina reads.
- # illumina rule 1
- rule trim_pairs:
- input:
- f = "reads/{sample}_f.fastq.gz",
- r = "reads/{sample}_r.fastq.gz",
- trim_jar = os.path.join(trimmomatic, "trimmomatic-0.35.jar"),
- adapter_path = os.path.join(trimmomatic, "adapters/TruSeq3-PE-2.fa")
- output:
- f = "trimmed/{sample}_f.trim.fastq.gz",
- r = "trimmed/{sample}_r.trim.fastq.gz",
- u1= temp("trimmed/{sample}_f.trim.unpaired.fastq.gz"),
- u2= temp("trimmed/{sample}_r.trim.unpaired.fastq.gz")
- threads:
- maxthreads
- shell:
- """java -jar {input.trim_jar} PE \
- -phred33 -threads {threads} \
- {input.f} {input.r} \
- {output.f} \
- {output.u1} \
- {output.r} \
- {output.u2} \
- ILLUMINACLIP:{input.adapter_path}:2:30:10:1:TRUE \
- LEADING:3 TRAILING:3 \
- SLIDINGWINDOW:4:15 MINLEN:36"""
- # illumina rule 2
- rule assemble_txome:
- input:
- f_paired = "trimmed/{sample}_f.trim.fastq.gz",
- r_paired = "trimmed/{sample}_r.trim.fastq.gz"
- output:
- assemblypath = "txomes/raw/trinity_{sample}_raw.fasta"
- params:
- outpath = "txomes/trinity_{sample}",
- outfasta = "txomes/trinity_{sample}/Trinity.fasta",
- dockeruser = dockeruser,
- sslibtype = lambda wildcards: config["samples"][wildcards.sample]["SS_lib_type"]
- threads:
- maxthreads
- shell:
- """
- docker run \
- -u $(id -u):$(id -g) --rm \
- -v `pwd`:`pwd` \
- -v /etc/passwd:/etc/passwd \
- trinity_bowtie2.3.4.1 Trinity \
- --seqType fq \
- --left `pwd`/{input.f_paired} \
- --right `pwd`/{input.r_paired} \
- --max_memory 100G \
- --CPU {threads} \
- --trimmomatic \
- {params.sslibtype} \
- --output `pwd`/{params.outpath};
- mv {params.outfasta} {output.assemblypath}
- rm -rf {params.outpath}
- """
- # illumina rule 3
- rule correct_txome_names:
- input:
- assem = "txomes/raw/trinity_{sample}_raw.fasta"
- output:
- fassem = temp("txomes/singleline/singleline_trinity_{sample}.fasta")
- params:
- samplename = "{sample}"
- shell:
- """
- sed 's/^>TRINITY/>{params.samplename}/' {input.assem} > {output.fassem}
- """
- # illumina rule 4
- rule illumina_txome_single_line_to_multiline:
- """
- This turns the single line output of Trinity into a multi-line fasta
- file that can be indexed and used in something like bwa.
- """
- input:
- fassem = "txomes/singleline/singleline_trinity_{sample}.fasta"
- output:
- assem = "txomes/final/{sample}.fasta"
- run:
- singleline_to_multiline(input.fassem, output.assem)
- # illumina rule 5
- rule translate_txome:
- input:
- txome = "txomes/final/{sample}.fasta"
- output:
- outpeps = temp("{sample}.fasta.transdecoder_dir/longest_orfs.pep")
- shell:
- """
- TransDecoder.LongOrfs -t {input.txome}
- """
- # illumina rule 5.1
- rule trim_transdecoder_names:
- """
- This trims transdecoder names from an extended string like:
- >CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1.p1 type:internal len:122 gc:universal CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1:1-363(+)
- to:
- >CHA_Caecosagitta_macrocephala_DS244_DN57546_c0_g1_i1.p1
- """
- input:
- pepfile = "{sample}.fasta.transdecoder_dir/longest_orfs.pep"
- output:
- renamed = temp("pepfiles/temp/{sample}_longest_orfs_renamed.pep")
- shell:
- """
- bioawk -cfastx '{{printf(">%s\\n%s\\n", $name, $seq)}}' {input.pepfile} > {output.renamed}
- """
- # illumina rule 6
- rule move_and_multiline_peps:
- """
- This moves the translated txome to its final resting place and
- turns it from singleline to multiline.
- """
- input:
- pepfile = "pepfiles/temp/{sample}_longest_orfs_renamed.pep"
- output:
- pepfinal = "pepfiles/final/{sample}.pep"
- params:
- rmdir = lambda wildcards: "{}.fasta.transdecoder_dir".format(wildcards.sample),
- checkpoints = lambda wildcards: "{}.fasta.transdecoder_dir.__checkpoints_longorfs".format(wildcards.sample)
- run:
- singleline_to_multiline(input.pepfile, output.pepfinal)
- if os.path.exists(params.rmdir):
- shutil.rmtree(params.rmdir)
- if os.path.exists(params.checkpoints):
- shutil.rmtree(params.checkpoints)
- # illumina rule 7 fasta softlink to db
- rule softlink_fasta_data:
- input:
- assem = "txomes/final/{sample}.fasta"
- output:
- outlink = "/data/ncbi/db/{sample}.fasta"
- params:
- absln = lambda wildcards: "{}/txomes/final/{}.fasta".format(curr_dir, wildcards.sample)
- shell:
- """
- ln -s {params.absln} {output.outlink}
- """
- # illumina rule 8 pep softlink to db
- rule softlink_pep_data:
- input:
- pep = "pepfiles/final/{sample}.pep"
- output:
- outlink = "/data/ncbi/db/{sample}.pep"
- params:
- absln = lambda wildcards: "{}/pepfiles/final/{}.pep".format(curr_dir, wildcards.sample)
- shell:
- """
- ln -s {params.absln} {output.outlink}
- """
- # illumina rule 9
- rule nucl_db_data:
- input:
- inp = "/data/ncbi/db/{sample}.fasta"
- output:
- out = "/data/ncbi/db/{sample}.fasta.nhr"
- shell:
- """
- makeblastdb -in {input.inp} -input_type fasta -dbtype nucl -parse_seqids
- """
- # illumina rule 10
- rule prot_db_data:
- input:
- inp = "/data/ncbi/db/{sample}.pep"
- output:
- out = "/data/ncbi/db/{sample}.pep.phr"
- shell:
- """
- makeblastdb -in {input.inp} -input_type fasta -dbtype prot -parse_seqids
- """
- # ILLUMINA rule 11
- rule diamond_data:
- input:
- pepfile="/data/ncbi/db/{sample}.pep"
- output:
- ddb = "/data/ncbi/db/{sample}.dmnd"
- shell:
- """
- diamond makedb --in {input.pepfile} -d {output.ddb}
- """
- # report
- # this section handles writing a report on all of the samples.
- # Fields to acquire:
- # - number of reads in untrimmed
- # - number of reads in trimmed
- # - number of transcripts
- # - number of peps
- rule number_reads_untrimmed:
- input:
- raw = "reads/{sample}_f.fastq.gz"
- output:
- raw_count = "info/counts/raw/{sample}_raw_count.txt"
- shell:
- """
- echo $(bioawk -cfastx 'END{{print NR}}' {input.raw}) > {output.raw_count}
- """
- rule number_reads_trimmed:
- input:
- trimmed = "trimmed/{sample}_f.trim.fastq.gz"
- output:
- trimmed_count = "info/counts/trimmed/{sample}_trimmed_count.txt"
- shell:
- """
- echo $(bioawk -cfastx 'END{{print NR}}' {input.trimmed}) > {output.trimmed_count}
- """
- rule number_transcripts:
- input:
- fasta = "txomes/final/{sample}.fasta"
- output:
- fasta_counts = "info/counts/fasta/{sample}_fasta_count.txt"
- shell:
- """
- echo $(bioawk -cfastx 'END{{print NR}}' {input.fasta}) > {output.fasta_counts}
- """
- rule number_peps:
- input:
- pep = "pepfiles/final/{sample}.pep"
- output:
- pep_count = "info/counts/peps/{sample}_pep_count.txt"
- shell:
- """
- echo $(bioawk -cfastx 'END{{print NR}}' {input.pep}) > {output.pep_count}
- """
- rule calcN50_tx:
- input:
- fasta = "txomes/final/{sample}.fasta"
- output:
- fasta_N50 = "info/counts/fasta/{sample}_N50.txt"
- shell:
- """
- bioawk -cfastx '{{print(length($seq))}}' {input.fasta} | \
- sort -n | \
- awk '{{len[i++]=$1;sum+=$1}} \
- END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
- if (csum>=sum/2) {{print len[j];break}} }} }}' > {output.fasta_N50}
- """
- rule calcN90_tx:
- input:
- fasta = "txomes/final/{sample}.fasta"
- output:
- fasta_N90 = "info/counts/fasta/{sample}_N90.txt"
- shell:
- """
- bioawk -cfastx '{{print(length($seq))}}' {input.fasta} | \
- sort -n | \
- awk '{{len[i++]=$1;sum+=$1}} \
- END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
- if (csum>=(sum*0.9)) {{print len[j];break}} }} }}' > {output.fasta_N90}
- """
- rule calcN50_pep:
- input:
- pep = "pepfiles/final/{sample}.pep"
- output:
- pep_N50 = "info/counts/peps/{sample}_N50.txt"
- shell:
- """
- bioawk -cfastx '{{print(length($seq))}}' {input.pep} | \
- sort -n | \
- awk '{{len[i++]=$1;sum+=$1}} \
- END {{for (j=0;j<i+1;j++) {{csum+=len[j]; \
- if (csum>=sum/2) {{print len[j];break}} }} }}' > {output.pep_N50}
- """
- rule quality_rejects:
- input:
- fasta = "txomes/final/{sample}.fasta"
- output:
- q_rejects = "info/trimming/txome/{sample}_qual_rejects.txt"
- shell:
- """
- echo "0" > {output.q_rejects}
- """
- rule adapter_rejects:
- input:
- fasta = "txomes/final/{sample}.fasta"
- output:
- a_rejects = "info/trimming/txome/{sample}_adapter_rejects.txt"
- shell:
- """
- echo "0" > {output.a_rejects}
- """
- rule GC_rejects:
- input:
- fasta = "txomes/final/{sample}.fasta"
- output:
- g_rejects = "info/trimming/txome/{sample}_gc_rejects.txt"
- shell:
- """
- echo "0" > {output.g_rejects}
- """
- rule calcmean:
- input:
- fasta = "txomes/final/{sample}.fasta"
- output:
- fasta_meanlen = "info/counts/fasta/{sample}_meanlen.txt"
- shell:
- """
- bioawk -cfastx '{{sum += length($seq)}} \
- END{{printf("%.2f", sum/NR)}}' {input.fasta} > {output.fasta_meanlen}
- """
- rule calcMB:
- input:
- fasta = "txomes/final/{sample}.fasta"
- output:
- fasta_numMB = "info/counts/fasta/{sample}_mb.txt"
- shell:
- """
- bioawk -cfastx '{{sum += length($seq)}} \
- END{{printf("%.2f", sum/1000000)}}' {input.fasta} > {output.fasta_numMB}
- """
- rule collate_report:
- """ this method prints out the final qc report of all the libraries."""
- input:
- raw_num = expand("info/counts/raw/{sample}_raw_count.txt", sample = config["samples"]),
- qual_reject = expand("info/trimming/txome/{sample}_qual_rejects.txt", sample = config["samples"]),
- adap_reject = expand("info/trimming/txome/{sample}_adapter_rejects.txt", sample = config["samples"]),
- gc_reject = expand("info/trimming/txome/{sample}_gc_rejects.txt", sample = config["samples"]),
- trim_num = expand("info/counts/trimmed/{sample}_trimmed_count.txt", sample = config["samples"]),
- txome_count = expand("info/counts/fasta/{sample}_fasta_count.txt", sample = config["samples"]),
- pep_count = expand("info/counts/peps/{sample}_pep_count.txt", sample = config["samples"]),
- txome_N50 = expand("info/counts/fasta/{sample}_N50.txt", sample = config["samples"]),
- pep_N50 = expand("info/counts/peps/{sample}_N50.txt", sample = config["samples"]),
- tx_meanlen = expand("info/counts/fasta/{sample}_meanlen.txt", sample = config["samples"]),
- tx_N90 = expand("info/counts/fasta/{sample}_N90.txt", sample = config["samples"]),
- tx_MB = expand("info/counts/fasta/{sample}_mb.txt", sample = config["samples"]),
- output:
- "report/final_report.txt"
- run:
- finalout_handle = open(output[0], "w")
- #print("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
- print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
- "sample",
- "trim_efficiency",
- "num_reads",
- "num_reads_trimmed",
- "quality_rejects",
- "adapter_rejects",
- "gc_rejects",
- "transcripts_count",
- "transcripts_meanlen",
- "transcripts_N50",
- "transcripts_N90",
- "transcripts_MB",
- "peps_count",
- "peps_N50"),
- file = finalout_handle)
- for thiss in sorted(config["samples"]):
- raw_num = read_number_from_file("info/counts/raw/{}_raw_count.txt".format(thiss))
- trim_num = read_number_from_file("info/counts/trimmed/{}_trimmed_count.txt".format(thiss))
- txome_count = read_number_from_file("info/counts/fasta/{}_fasta_count.txt".format(thiss))
- pep_count = read_number_from_file("info/counts/peps/{}_pep_count.txt".format(thiss))
- txome_N50 = read_number_from_file("info/counts/fasta/{}_N50.txt".format(thiss))
- pep_N50 = read_number_from_file("info/counts/peps/{}_N50.txt".format(thiss))
- qual_reject = read_number_from_file("info/trimming/txome/{}_qual_rejects.txt".format(thiss))
- adap_reject = read_number_from_file("info/trimming/txome/{}_adapter_rejects.txt".format(thiss))
- gc_reject = read_number_from_file("info/trimming/txome/{}_gc_rejects.txt".format(thiss))
- tx_meanlen = read_number_from_file("info/counts/fasta/{}_meanlen.txt".format(thiss))
- txome_N90 = read_number_from_file("info/counts/fasta/{}_N90.txt".format(thiss))
- txome_MB = read_number_from_file("info/counts/fasta/{}_mb.txt".format(thiss))
- print("{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}".format(
- thiss,
- int(trim_num)/int(raw_num),
- raw_num,
- trim_num,
- qual_reject,
- adap_reject,
- gc_reject,
- txome_count,
- tx_meanlen,
- txome_N50,
- txome_N90,
- txome_MB,
- pep_count,
- pep_N50
- ), file = finalout_handle)
- finalout_handle.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement