Advertisement
Guest User

Untitled

a guest
Dec 27th, 2016
234
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.36 KB | None | 0 0
  1. #!/bin/bash
  2. set -euo pipefail
  3. IFS=$'nt'
  4.  
  5. # Check for missing metadata file input
  6. if [ -z ${1+x} ]; then
  7. echo "ERROR: missing input file; aborting."
  8. exit 1
  9. fi
  10.  
  11. INPUT=$1
  12.  
  13. # Create / touch lists of series under processing / finished processing
  14. touch processing.txt finished.txt
  15.  
  16. # Find GSE and SRR column numbers in metadata file
  17. GSECOL=$(head -1 $INPUT | tr 't' 'n' | nl | grep GSE | cut -f 1 | tr -d ' ')
  18. SRRCOL=$(head -1 $INPUT | tr 't' 'n' | nl | grep SRR | cut -f 1 | tr -d ' ')
  19.  
  20. # Main loop: for every unique GSE in metadata file
  21. UNDER_PROCESSING=$(wc -l processing.txt | cut -d ' ' -f 1)
  22. tail -n +2 $INPUT | cut -f $GSECOL | sort | uniq | while read GSE; do
  23.  
  24. # Get all SRRs for current GSE
  25. RUNS=$(grep -w $GSE $INPUT | cut -f $SRRCOL)
  26.  
  27. # Working directory
  28. WORKDIR=data/$CELL/$GSE
  29.  
  30. # Skip if GSE is already finished or is currently being processed
  31. set +e
  32. if grep -wq $GSE processing.txt || grep -wq $GSE finished.txt; then
  33. continue
  34. fi
  35. set -e
  36.  
  37. # Create directory structure
  38. mkdir -p $WORKDIR/01_fastq $WORKDIR/02_alignment $WORKDIR/03_expression
  39.  
  40. # Start a maximum of 10 different series at once, then exit
  41. if [ $UNDER_PROCESSING -eq 10 ]; then
  42. echo "Started 10 different GSE series; stopping."
  43. echo ""
  44. exit 1
  45. else
  46. UNDER_PROCESSING=$(( $UNDER_PROCESSING + 1 ))
  47. fi
  48.  
  49. # Queue downloads
  50. for SRR in $RUNS; do
  51.  
  52. # Queue download
  53. sbatch --job-name $SRR.download
  54. --error $WORKDIR/01_fastq/slurm.download.fastq.$SRR.err
  55. --output $WORKDIR/01_fastq/slurm.download.fastq.$SRR.out
  56. --mail-type=NONE
  57. scripts/01_download_fastq.sh $GSE $SRR
  58. >> sbatch.download.ids 2>&1
  59. done
  60.  
  61. # Queue alignment
  62. sbatch --job-name $GSE.alignment
  63. --error $WORKDIR/02_alignment/slurm.alignment.err
  64. --output $WORKDIR/02_alignment/slurm.alignment.out
  65. --mail-type=NONE
  66. --dependency=afterok:$(cat sbatch.download.ids | cut -d ' ' -f 4 | xargs | tr ' ' ':')
  67. scripts/02_alignment.sh $GSE
  68. > sbatch.alignment.id 2>&1
  69.  
  70. rm sbatch.download.ids
  71.  
  72. # Queue expression estimation
  73. sbatch --job-name $GSE.expression
  74. --error $WORKDIR/03_expression/slurm.expression.err
  75. --output $WORKDIR/03_expression/slurm.expression.out
  76. --mail-type=NONE
  77. --dependency=afterok:$(cat sbatch.alignment.id | cut -d ' ' -f 4)
  78. scripts/03_estimate_expression.sh $GSE
  79. > /dev/null
  80.  
  81. rm sbatch.alignment.id
  82.  
  83. # Add GSE to [processing.txt] file
  84. echo "$GSE" >> processing.txt
  85. echo "Queued pipeline for $GSE ($CELL); $UNDER_PROCESSING in queue."
  86. done
  87.  
  88. #!/bin/bash -l
  89. #SBATCH --account XXXXXXXX
  90. #SBATCH --partition core
  91. #SBATCH --ntasks 1
  92. #SBATCH --job-name download.fastq
  93. #SBATCH --time 15:00:00
  94. #SBATCH --mail-user XXXXXXXXXXX
  95. #SBATCH --mail-type=FAIL
  96. #SBATCH --error log.download.fastq.err
  97. #SBATCH --output log.download.fastq.out
  98.  
  99. # Get GSE and SRR IDs from input argument 1
  100. GSE=$1
  101. SRR=$2
  102.  
  103. # Working directory
  104. WORKDIR=data/$CELL/$GSE
  105.  
  106. # Modules
  107. module load bioinfo-tools sratools/2.8.0
  108.  
  109. # Download FASTQ files
  110. fastq-dump --outdir $WORKDIR/01_fastq --gzip --skip-technical --split-files --readids --clip -v $SRR
  111. > $WORKDIR/01_fastq/log.download.fastq.$SRR.txt 2>&1
  112.  
  113. # Delete SRA file
  114. if [ -f /proj/ncbi/sra/$SRR.sra ]; then
  115. rm /proj/ncbi/sra/$SRR.sra
  116. fi
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement