Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
#
# Queue the download -> alignment -> expression pipeline on SLURM for every
# GSE series listed in a tab-separated metadata file.
#
# Usage:        CELL=<name> $0 <metadata.tsv>
# Arguments:    $1 - metadata file; its header row must contain one column
#                    whose name contains "GSE" and one containing "SRR".
# Environment:  CELL - used to build the working directory data/$CELL/<GSE>.
#                      It is not assigned anywhere in this script, so it must
#                      come from the caller's environment.
# Files (current directory):
#   processing.txt - series already queued; appended to by this script
#   finished.txt   - series already finished; only read here
# Exit status:  1 on error and also when the series limit is reached.
set -euo pipefail
IFS=$'\n\t'

# Maximum number of series listed in processing.txt before queueing stops.
readonly MAX_SERIES=10

# Print an error message to stderr and abort.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Print the 1-based index of the first header column of file $1 whose name
# contains the literal string $2; return non-zero if there is none.
find_column() {
  awk -F '\t' -v label="$2" '
    NR == 1 {
      for (i = 1; i <= NF; i++) if (index($i, label)) { print i; found = 1; break }
      exit
    }
    END { exit(found ? 0 : 1) }
  ' "$1"
}

# Submit a job with sbatch (all arguments are passed through) and print its
# job ID. The ID is taken from the 4th space-separated field of sbatch's
# standard output. stdin is detached so sbatch can never consume the loop
# input of the caller.
submit_job() {
  local reply
  reply=$(sbatch "$@" < /dev/null) || return
  printf '%s\n' "$reply" | cut -d ' ' -f 4
}

# Check for missing metadata file input
if (( $# < 1 )); then
  die "missing input file; aborting."
fi
INPUT=$1
[[ -r "$INPUT" ]] || die "cannot read input file '$INPUT'; aborting."

# Fail early with a clear message instead of an "unbound variable" abort in
# the middle of the loop.
[[ -n "${CELL:-}" ]] || die "environment variable CELL is not set; aborting."

# Create / touch lists of series under processing / finished processing
touch processing.txt finished.txt

# Find GSE and SRR column numbers in metadata file
GSECOL=$(find_column "$INPUT" GSE) || die "no GSE column in '$INPUT'; aborting."
SRRCOL=$(find_column "$INPUT" SRR) || die "no SRR column in '$INPUT'; aborting."

# Number of series already recorded in processing.txt. The arithmetic
# expansion strips any padding some wc implementations add.
UNDER_PROCESSING=$(( $(wc -l < processing.txt) ))

# Unique series IDs. Collected up front so that a failure anywhere in the
# pipeline still aborts the script (set -e / pipefail).
SERIES=$(tail -n +2 -- "$INPUT" | cut -f "$GSECOL" | sort -u)

# Main loop: for every unique GSE in metadata file
while IFS= read -r GSE; do
  [[ -n "$GSE" ]] || continue

  # Skip if GSE is already finished or is currently being processed.
  # Commands in an `if` condition are exempt from errexit, so no set +e here.
  if grep -Fwq -- "$GSE" processing.txt || grep -Fwq -- "$GSE" finished.txt; then
    continue
  fi

  # Stop once the limit is reached. Checked before any directory is created
  # so that no empty tree is left behind for a series that is not started.
  if (( UNDER_PROCESSING >= MAX_SERIES )); then
    echo "Started $MAX_SERIES different GSE series; stopping."
    echo ""
    exit 1
  fi
  UNDER_PROCESSING=$(( UNDER_PROCESSING + 1 ))

  # Get all SRRs for current GSE (exact match on the GSE column only)
  RUNS=$(awk -F '\t' -v gse="$GSE" -v gcol="$GSECOL" -v scol="$SRRCOL" \
    'NR > 1 && $gcol == gse && $scol != "" { print $scol }' "$INPUT")
  [[ -n "$RUNS" ]] || die "no SRR runs found for $GSE; aborting."

  # Working directory
  WORKDIR=data/$CELL/$GSE

  # Create directory structure
  mkdir -p -- "$WORKDIR/01_fastq" "$WORKDIR/02_alignment" "$WORKDIR/03_expression"

  # Queue downloads, one job per run, remembering every job ID
  DOWNLOAD_IDS=()
  while IFS= read -r SRR; do
    JOB_ID=$(submit_job --job-name "$SRR.download" \
      --error "$WORKDIR/01_fastq/slurm.download.fastq.$SRR.err" \
      --output "$WORKDIR/01_fastq/slurm.download.fastq.$SRR.out" \
      --mail-type=NONE \
      scripts/01_download_fastq.sh "$GSE" "$SRR")
    DOWNLOAD_IDS+=("$JOB_ID")
  done <<< "$RUNS"

  # Colon-separated list of download job IDs for --dependency
  DOWNLOAD_DEP=$(IFS=:; printf '%s' "${DOWNLOAD_IDS[*]}")

  # Queue alignment; starts only after every download finished successfully
  ALIGNMENT_ID=$(submit_job --job-name "$GSE.alignment" \
    --error "$WORKDIR/02_alignment/slurm.alignment.err" \
    --output "$WORKDIR/02_alignment/slurm.alignment.out" \
    --mail-type=NONE \
    --dependency="afterok:$DOWNLOAD_DEP" \
    scripts/02_alignment.sh "$GSE")

  # Queue expression estimation; starts only after the alignment succeeded
  submit_job --job-name "$GSE.expression" \
    --error "$WORKDIR/03_expression/slurm.expression.err" \
    --output "$WORKDIR/03_expression/slurm.expression.out" \
    --mail-type=NONE \
    --dependency="afterok:$ALIGNMENT_ID" \
    scripts/03_estimate_expression.sh "$GSE" \
    > /dev/null

  # Add GSE to [processing.txt] file
  printf '%s\n' "$GSE" >> processing.txt
  echo "Queued pipeline for $GSE ($CELL); $UNDER_PROCESSING in queue."
done <<< "$SERIES"
#!/bin/bash -l
#SBATCH --account XXXXXXXX
#SBATCH --partition core
#SBATCH --ntasks 1
#SBATCH --job-name download.fastq
#SBATCH --time 15:00:00
#SBATCH --mail-user XXXXXXXXXXX
#SBATCH --mail-type=FAIL
#SBATCH --error log.download.fastq.err
#SBATCH --output log.download.fastq.out
#
# Download the FASTQ files of one SRA run into data/$CELL/<GSE>/01_fastq.
#
# Arguments:    $1 - GSE series ID
#               $2 - SRR run ID
# Environment:  CELL - used to build the working directory; it is not
#                      assigned in this script.
# Exit status:  that of fastq-dump, so a job dependency such as
#               "afterok" only fires after a successful download.

# Get GSE and SRR IDs from input arguments 1 and 2
GSE=${1:?missing GSE argument}
SRR=${2:?missing SRR argument}
: "${CELL:?environment variable CELL is not set}"

# Working directory
WORKDIR=data/$CELL/$GSE

# Modules
module load bioinfo-tools sratools/2.8.0

# Download FASTQ files; all tool output goes to a per-run log file
STATUS=0
fastq-dump --outdir "$WORKDIR/01_fastq" --gzip --skip-technical --split-files --readids --clip -v "$SRR" \
  > "$WORKDIR/01_fastq/log.download.fastq.$SRR.txt" 2>&1 || STATUS=$?

# Delete SRA file (presumably the copy cached by the SRA toolkit - confirm).
# Done regardless of the download result, as before.
if [ -f "/proj/ncbi/sra/$SRR.sra" ]; then
  rm -- "/proj/ncbi/sra/$SRR.sra"
fi

# Report the download result as the job's exit status; previously the
# trailing `if` masked a fastq-dump failure with status 0.
exit "$STATUS"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement