Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
# Step 0 of the imputation procedure.
#
# Splits the input PLINK data set and draws a random subset of individuals
# for step 1 of imputation.
# Adapted from an original script written by Jacki Buros.
#
# Arguments:
#   1) PLINK source file basename (if <basename>.ped exists it is first
#      converted to binary bed/bim/fam files)
#   2) prefix for output files
#
# Results:
#   ${OUTDIR}/<prefix>_step0.tar.gz  complete + subset PLINK files and the
#                                    per-group id lists
#   ${OUTDIR}/rsync/                 everything else left in the scratch dir
#
# Exit status: 0 on success, 65 on bad arguments, 1 on any other failure
# (including a failed final rsync, in which case scratch is kept).

readonly E_BADARGS=65

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# check number of arguments: both the source basename and the output prefix
# are required, since the prefix is part of every output file name.
if [[ -z "${1:-}" || -z "${2:-}" ]]; then
  echo "Usage: $(basename "$0") <file basename> <output prefix> " >&2
  exit "$E_BADARGS"
fi

orig_dir=$(pwd)
prefix="$2"

# ---- settings ------
# Plink bed source file (used in step0 script)
SRCDIR="$orig_dir"
PLINK="$1"
# Inside the scratch directory the copied files are addressed by file name
# only, so strip any leading directory components from the basename.
PLINKBASE="${PLINK##*/}"
# store output here
OUTDIR="$orig_dir"
# num individuals per group in step 1 impute
SUBSETSIZE=300
# num individuals (total) per subset in step 2 impute
STEP2SIZE=200
# paths
PLINKBIN=(/usr/local/plink/plink --nonfounders --noweb) #CHANGE
GAWKBIN="/usr/bin/gawk" #CHANGE
TARBIN="/bin/tar" #CHANGE
# SUBSET and COMPLETE prefixes
COMPLETE="_${prefix}_complete" # plink files containing the complete bed files where parents are set to 0 0
SUBSET="_${prefix}_subset"     # plink files containing subsets of the above files
GROUP="_${prefix}_group"       # prefix for per-group id lists
USERNAME=$(whoami)

# prepare dirs for output & scratch
mkdir -p "$OUTDIR" || die "Cannot create output directory ${OUTDIR}"
scratch="/scratch/${USERNAME}/${prefix}_impute" #CHANGE???
mkdir -p "$scratch" || die "Cannot create scratch directory ${scratch}"

# if passing a pedigree file need to convert it to a binary file
if [[ -f "${PLINK}.ped" ]]; then
  echo "Creating PLINK binary files"
  "${PLINKBIN[@]}" --file "$PLINK" --map3 --allow-no-sex --make-bed --out "$PLINK" \
    || die "PLINK failed to convert ${PLINK}.ped to binary files"
fi

echo "$(date) | Copying source files in $SRCDIR to ${scratch}"
cd "$SRCDIR" || die "Cannot change to ${SRCDIR}"
cp -a --dereference -- "$PLINK".* "$scratch" \
  || die "Cannot copy ${PLINK}.* to ${scratch}"
# All remaining work (including the rm calls below) must happen on the
# copies in scratch, never on the source files.
cd "$scratch" || die "Cannot change to ${scratch}"

echo "$(date) | Preparing base PLINK bed file (named ${COMPLETE})"
# pull family ids and individual ids from the fam file and zero the parents
"$GAWKBIN" '{print $1,$2,"0","0"}' "${PLINKBASE}.fam" > _update_parents \
  || die "Cannot build _update_parents from ${PLINKBASE}.fam"
# plink command to update the parental info with zeros
"${PLINKBIN[@]}" --bfile "$PLINKBASE" --update-parents _update_parents \
  --set-hh-missing --allow-no-sex --make-bed --out "$COMPLETE" \
  || die "PLINK failed to create ${COMPLETE}"

FAMFILE="temp.${PLINKBASE}.fam"
cp -- "${PLINKBASE}.fam" "$FAMFILE" || die "Cannot copy ${PLINKBASE}.fam"
rm -f -- "$PLINKBASE".*

echo "$(date) | Preparing subset PLINK bed file (named ${SUBSET}) to be used in Step 1"
# Randomly selects iids from fam file for use in model estimation.
# Each "fid iid" pair gets a random numeric key, the list is sorted on that
# key and the first SUBSETSIZE pairs are kept. `read` splits on any
# whitespace, so both space- and tab-delimited fam files work. Two $RANDOM
# draws are combined to make duplicate keys unlikely. awk (instead of head)
# consumes the whole stream, so no upstream stage is cut off by a closed pipe.
while read -r fid iid _ || [[ -n "$fid" ]]; do
  printf '%s %s %s\n' "$(( RANDOM * 32768 + RANDOM ))" "$fid" "$iid"
done < "$FAMFILE" \
  | sort -k1,1n \
  | "$GAWKBIN" -v n="$SUBSETSIZE" 'NR <= n {print $2, $3}' > subset.iids
"${PLINKBIN[@]}" --bfile "$COMPLETE" --keep subset.iids --make-bed --out "$SUBSET" \
  || die "PLINK failed to create ${SUBSET}"
# -f: these files may legitimately not exist
rm -f -- *.ped *.map *.log *.hh

echo "$(date) | Preparing list of ids per subset (named ${GROUP}*) to be used in step 2"
"$GAWKBIN" '{print $1,$2}' "${COMPLETE}.fam" > _idlist \
  || die "Cannot build _idlist from ${COMPLETE}.fam"
split -d -l "$STEP2SIZE" _idlist "$GROUP" || die "Cannot split _idlist"

echo "$(date) | Preparation complete; copy files to $OUTDIR"
"$TARBIN" cfz "${prefix}_step0.tar.gz" "$COMPLETE".* "$SUBSET".* "$GROUP"* \
  || die "Cannot create ${prefix}_step0.tar.gz"
rm -f -- "$GROUP"*
mv -- "${prefix}_step0.tar.gz" "$OUTDIR" \
  || die "Cannot move ${prefix}_step0.tar.gz to ${OUTDIR}"

echo "$(date) | copy remaining files to $OUTDIR/rsync & clean up"
mkdir -p "${OUTDIR}/rsync" || die "Cannot create ${OUTDIR}/rsync"

# Let's clean up -- but only once the remaining files are safely copied
status=0
if rsync -avz "${scratch}/" "${OUTDIR}/rsync/"; then
  cd "$orig_dir" || die "Cannot change to ${orig_dir}"
  # :? refuses to run rm -rf with an empty path
  rm -rf -- "${scratch:?}"
else
  status=1
  echo "Unable to sync remaining files." >&2
  echo "Please ssh to $(hostname) and" >&2
  echo "look at the content of ${scratch}" >&2
fi

echo "=========================================================="
echo "Finished on : $(date)"
echo "=========================================================="
exit "$status"
Add Comment
Please, Sign In to add comment