- #!/usr/bin/ruby
- # == Synopsis
- #
- # aloft: searches for operons in a given genome based on a given set of known patterns
- #
- # == Usage
- #
- # ./aloft.rb -g [genbank file] -c [csv file] {OPTIONS}
- #
- # == REQUIRED FLAGS
- #
- # -c [file], --csv [file]:
- # csv file containing known code
- #
- # -f [accession number], --fetch [accession number]
- # automatically fetch the genbank file from the
- # internet and save it to <accessionNumber>.gb
- # required if --genbank is not set
- #
- # -g [file], --genbank [file]:
- # genbank file containing genome to search
- # required if --fetcha is not set
- #
- # == OPTIONS
- #
- # -d, --delete-files:
- # do not leave temporary files around
- # not recommended
- #
- # -e, --expect [value]:
- # cutoff value below to limit blast results to
- # default: 1.0e-5
- #
- # -h, --help:
- # show help
- #
- # -o, --output [file]:
- # file to save results
- # default: stdout
- #
- # -q, --quiet:
- # suppress console output
- #
- # -r, --reuse-results:
- # do not perform blast searches, assumes you
- # ran with previously without --delete-files
- # unsets --delete-files
- #
- # -s, --spread [value]:
- # range to look at on either side of a gene
- # default: 3.0e+4
- #
- # -t, --temp-dir [directory]:
- # directory to keep temporary files
- # will automatically create this directory
- #
- # -v, --outputCSV [file]:
- # csv file to save results
- # default: output.csv
- #
- # -x, --cutoff [value]:
- # suppress regions with score less then [integer]
- # default: 0
- #
- # this function handles calculating the scoring of each region
- # arg orfArr Array of hashes (1 per orf) in the region
- # return integer score
- def sumScore(orfArr)
- tot = 0 #start some total at 0
- ##Sample scoring method
- orfArr.each do |orf| #for every ORF in this region
- ##we say that for each
- tot += orf[:hits].inject(1) do |sum,hit|
- ##for each hit y and the sum (starting at 1)
- sum *= hit[:csvscore].to_i #sum = sum * the hit's score (from CSV)
- end
- end
- #Sample 2
- ## Same as before but now the score of each orf is attained
- ## by multiplaying the csv score * the length of the hit
- ## for each hit, and then multiplying all of those together
- #orfArr.each{|x|
- # tot += x[:hits].inject(1){|sum,y|
- # sum *= y[:csvscore].to_i * y[:length]
- # }
- #}
- ##Sample 3
- ##the total score for the region is just
- ##multiply the blast score * the csv score
- ##for all hits in the region, and then just sum them
- #orfArr.each{|x|
- # x[:hits].each{|y|
- # tot += y[:csvscore].to_i * y[:bscore].to_i
- # }
- #}
- return tot #return result
- end
- # these four are a standard part of ruby, should be available in any sane install
- require 'getoptlong'
- require 'rdoc/usage'
- require 'tmpdir'
- require 'fileutils'
- #import the bioruby modules and error if not found
- hasbio = require 'bio'
- unless hasbio
- puts "Error, you must install bioruby first"
- puts "You should be able to this by running"
- puts "gem install bio"
- puts "as an administrator on the system"
- Process.exit
- end
- #check the bioruby version and error less then 1.3.0
- if Bio::BIORUBY_VERSION < [1,3,0] then
- puts "Error, this script was written with for BioRuby 1.3.0"
- puts "Your version is " + Bio::BIORUBY_VERSION.join(".")
- puts "Please upgrage bioruby by running"
- puts "gem update bio"
- Process.exit
- end
- #import the fastercsv modules and error if not found
- hascsv = require 'fastercsv'
- unless hascsv
- puts "Error, you must install a CSV parser first"
- puts "You should be able to this by running"
- puts "gem install fastercsv"
- puts "as an administrator on the system"
- Process.exit
- end
- #############################################
- ########## GENBANK FILE LENGTH FIX ##########
- #############################################
- # Ignore this section.. In order to get around
- # a problem of improperly written genbank files
- # I have to redefine a function of bioruby
- module Bio
- class NCBIDB
- module Common
- def features
- unless @data['FEATURES']
- ary = []
- in_quote = false
- get('FEATURES').each_line do |line|
- next if line =~ /^FEATURES/
- head = line[0,20].to_s.strip
- body = line[20,line.length].to_s.chomp
- if line =~ /^ {5}\S/
- ary.push([ head, body ])
- elsif body =~ /^ \// and not in_quote # gb:IRO125195
- ary.last.push(body)
- if body =~ /="/ and body !~ /"$/
- in_quote = true
- end
- else
- ary.last.last << body
- if body =~ /"$/
- in_quote = false
- end
- end
- end
- ary.collect! do |subary|
- parse_qualifiers(subary)
- end
- @data['FEATURES'] = ary.extend(Bio::Features::BackwardCompatibility)
- end
- if block_given?
- @data['FEATURES'].each do |f|
- yield f
- end
- else
- @data['FEATURES']
- end
- end
- end # Common
- end # GenBank
- end # Bio
- ##############################
- ###### END #########
- ##############################
- # make a string of the input command so we can print it out later for
- # the user's sanity
- actualProgramArgs = ARGV.entries.join(" ")
- # command line options parser
- opts = GetoptLong.new(
- [ '--csv', '-c', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--delete-files', '-d', GetoptLong::NO_ARGUMENT ],
- [ '--expect', '-e', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--fetch', '-f', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--genbank', '-g', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
- [ '--output', '-o', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--quiet', '-q', GetoptLong::NO_ARGUMENT ],
- [ '--reuse-results', '-r', GetoptLong::NO_ARGUMENT ],
- [ '--spread', '-s', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--temp-dir', '-t', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--outputCSV', '-v', GetoptLong::REQUIRED_ARGUMENT ],
- [ '--cutoff', '-x', GetoptLong::REQUIRED_ARGUMENT ])
- # default values
- autofetch = false
- accessionNum = nil
- output = $stdout
- spread = 30_000
- cutoff = 0
- database = nil
- csvFile = nil
- quiet = false
- tempDir = Dir::tmpdir
- keepTmp = true
- reuseOld = false
- tempDir_was_created = false
- reportDir = tempDir
- reportDir_was_created = false
- outputcsv = "output.csv"
- expect_value_cutoff = 1.0e-5
- # actually parse the input options
- begin
- opts.each do |opt, arg|
- case opt
- when '--csv'
- csvFile = arg
- when '--cutoff'
- cutoff = arg.to_f
- when '--delete-files'
- keepTmp = false
- when '--expect'
- expect_value_cutoff = arg
- when '--fetch'
- accessionNum = arg
- autofetch = true
- when '--genbank'
- database = arg
- when '--help'
- RDoc::usage
- Process.exit
- when '--output'
- output = File.open(arg, 'a')
- when '--outputCSV'
- outputcsv = arg
- when '--quiet'
- quiet = true
- when '--reuse-results'
- reuseOld = true
- when '--spread'
- #odd but we have to use to_f so they
- #can use exp. notation for input
- spread = arg.to_f.to_i
- when '--temp-dir'
- tempDir = arg
- end
- end
- rescue
- #should have already output an error
- puts "please run with --help for usage"
- Process.exit
- end
- # if they gave the reuse-results option is enabled
- # we force them to also enable keep-files
- if reuseOld then keepTmp = true end
- if autofetch then
- server = Bio::Fetch.new()
- puts "Fetching genbank file."
- puts "This may take several minutes depending on your internet speed."
- puts "Please wait..."
- tempGB = server.fetch('gb', accessionNum)
- puts "Fetch complete, writing file"
- #write out the file rather then using the temp file that bioruby uses
- #by default
- File.open(accessionNum + '.gb', 'w') do |file|
- file.write tempGB
- file.close
- end
- puts "File " + accessionNum + '.gb written to current directory'
- #set this file as our database file
- database = accessionNum + '.gb'
- end
- # if they didn't input the required inputs
- if database == nil
- puts "No genbank file provided"
- RDoc::usage
- Process.exit
- elsif csvFile == nil
- puts "No CSV File provided"
- RDoc::usage
- Process.exit
- end
- # determine the directories of our files to create
- tempDir = File.join(tempDir, File.split(csvFile)[1].tr(" ","_") )
- reportDir = File.join(tempDir, File.split(database)[1].tr(" ","_") )
- # create the directories if they are not already there
- unless File.exist? tempDir
- FileUtils::mkdir_p(tempDir)
- tempDir_was_created = true
- end
- unless File.exist? reportDir
- FileUtils::mkdir_p(reportDir)
- reportDir_was_created = true
- end
- #actually open our database
- dbfile = Bio::FlatFile.new(Bio::GenBank, File.open(database))
- # this is silly, if there is an extra newline at the end of the genbank
- # file we seem to get some extra genbank object. Instead we just take the
- # first genbank object returned
- tmp = []
- dbfile.each{|x| tmp << x}
- mygb = tmp[0]
- # get the length of the genome we are working with
- largest = mygb.length
- # test to make sure they have a valid genbank file, its possible to download
- # withouth the genbank file
- if mygb.seq.empty? then
- puts "Error, it seems your genbank file does not include
- the nucleotide sequence in it, this is needed."
- if autofetch then
- puts "This is likely due to the fact that your accession number was not found"
- puts "Check the downloaded file: " + database
- else
- puts "You can have this script fetch this file for you using the --fetch flag"
- end
- Process.exit
- end
- #for blast we need to write out a fasta file from our database
- dbtempfile = File.join(tempDir, File.split(database)[1].tr(" ","_") + ".faa" )
- File.open(dbtempfile, 'w') do |file|
- file.write(mygb.seq.to_fasta(database, 60))
- file.close
- end
- print "Parsing CSV file" unless quiet
- #csv reading
- mycsv = FasterCSV.read(csvFile)
- #we assume the first line is the column names
- names = mycsv[0]
- #seperate the actual data
- therest = mycsv[1,mycsv.size]
- #now we want to convert all the data from arrays to hashs
- # since we do this, the column order in the csv doesn't matter
- hasharray=[]
- # hasharray is how we will store all our data for the query sequences
- # each entry in the array contains a hash object (named array)
- # with any columns from the CSV file, but also
- # :filename the exact name/location of the fasta file
- # :reportFile the exact name/location of the report file
- # :report the report object
- therest.each_with_index{|row, seqnum| #for each sequence in the csv
- hasht = {}
- row.each_with_index{|data, index| #for each column
- #we assign things as symbols all in lowercase so
- # the column name "Sequence" becomes :sequence
- hasht[names[index].downcase.to_sym] = data
- }
- #now we want to write all our seqences to fasta files so we can use bl2seq
- hasht[:filename] = File.join(tempDir, "seq" + seqnum.to_s + ".faa")
- #we store the filename in our hash too, and write out the file
- # I assume :sequence and :designation are 2 fields given in the CSV
- # TODO: check that these names exist first
- unless reuseOld
- File.open(hasht[:filename], 'w') do |file|
- file.write(Bio::Sequence::NA.new(hasht[:sequence]).to_fasta(hasht[:designation], 60))
- file.close
- end
- end
- hasht[:reportFile] = File.join(reportDir, "seq" + seqnum.to_s + ".report")
- #add our hash to the full array
- hasharray << hasht
- print "." unless quiet
- }
- print "Done\n" unless quiet
- #for each sequence we run blastx against the user given database
- # this assumes the database is proteins and the seqences are animo acids
- # TODO don't require blastx only
- if (reuseOld) #we don't have to run blast, just read in the old files
- print "Reading in old BLAST results" unless quiet
- hasharray.each_with_index{|searchhash, index|
- File.open(searchhash[:reportFile], 'r') do |file|
- hasharray[index][:report] = Bio::Blast::Bl2seq::Report.new(file.read)
- file.close
- end
- print "." unless quiet
- }
- print "Done\n" unless quiet
- else
- print "Performing BLAST searches" unless quiet
- hasharray.each_with_index{|searchhash, index|
- cmd = ['bl2seq', '-i', dbtempfile, '-j', searchhash[:filename], '-p', 'blastx']
- #build up our command then run it, storing each result in our has as well
- reportText = Bio::Command.query_command(cmd)
- #write out the results to a report file
- File.open(searchhash[:reportFile], 'w') do |file|
- file.write(reportText)
- file.close
- end
- hasharray[index][:report] = Bio::Blast::Bl2seq::Report.new(reportText)
- print "." unless quiet
- }
- print "Done\n" unless quiet
- end
- # knownORFs will be an array of hashs that will hold information about our
- # open reading frames
- # each hash will have data about that orf and any hits that fall in the orf
- # :direction will only be -1 or 1 for what side of the genome it is on
- # :locus_tag copied from the genbank file
- # :hits an array of hashes storing information about hits (described later)
- knownORFs = []
- print "Getting open reading frames from genbank file..." unless quiet
- mygb.features.select{|x| x.feature == "CDS"}.each{|y|
- knownORFs << {:range =>y.locations.range,
- :direction => y.locations[0].strand,
- :locus_tag => y.qualifiers.select{|x| x.qualifier == "locus_tag"}[0].value,
- :hits => [] }
- }
- print "Done\n" unless quiet
- #each ORF is put on as an array, its range, the gene name (as a string) and
- # an empty array for results
- print "Collecting hits" unless quiet
- hithash = []
- hasharray.each{|searchObj|
- searchObj[:report].each{|hit|
- hit.hsps.each{ |hsp|
- #if our expect value is greater then the cutoff
- # move on to the next hsp
- next if hsp.evalue > expect_value_cutoff
- frame = hsp.query_frame
- min = hsp.query_from
- max = hsp.query_to
- if frame < 0 then
- min,max = max, min #swap values
- end
- ## this is the actual hash for a hit, any of these will be available
- # in the scoring function
- #
- # :min, :max, :length obvious
- # :csvscore score given in the CSV file
- # :bscore blast score
- # :frame -3, -2, -1, 1, 2, or 3
- # :expect expect value given by blast
- # :desig designation of the corresponding item from the CSV file
- hithash << {:min => min,
- :max => max,
- :length => hsp.align_len,
- :csvscore => searchObj[:score],
- :bscore => hsp.score,
- :frame => hsp.query_frame,
- :expect => hsp.evalue,
- :desig => searchObj[:designation]}
- }
- }
- print "." unless quiet
- }
- print "Done\n" unless quiet
- #build the ORF array for each hit
- print "Sorting hits into Orfs" unless quiet
- knownORFs.each{|orfArr|
- hithash.select{|hit|
- orfArr[:range] === hit[:min] or #should be OR??
- orfArr[:range] === hit[:max]}.each{|hit|
- orfArr[:hits] << hit
- print "." unless quiet
- }
- }
- print "Done\n" unless quiet
- #we want to ignore any ORFs with no hits in them
- print "Removing useless ORFs.." unless quiet
- knownORFs.reject!{|orf| orf[:hits].empty?}
- print "Done\n" unless quiet
- #we sort the remaining ORFs by their start value
- # this is important for creating regions effeciently
- print "Sorting Orfs.." unless quiet
- knownORFs = knownORFs.sort{|a,b| a[:range].min <=> b[:range].min }
- print "Done\n" unless quiet
- # find sort our hits within each orf, this is nice for output reasons
- print "Sorting Hits within Orfs.." unless quiet
- knownORFs.each{|orf|
- orf[:hits] = orf[:hits].sort{|a,b| a[:min] <=> b[:min]}
- }
- print "Done\n" unless quiet
- #now we actually build our regions
- # since we know that our ORFs are sorted already, we start at the first ORF
- # and say that our region starts at that min value minus the spread
- # we say that the region may end at that max value plus the spread
- # if there the next ORF falls in this region, we say the region now
- # ends at the new ORFs max value plus the spread
- # continue until we find that the next ORF is not in the region
- # then this is our region
- print "Calculating Regions" unless quiet
- regions= []
- start = knownORFs[0][:range].min - spread
- final = knownORFs[0][:range].max + spread
- if start < 0 then start = 0 end
- if final > largest then final = largest end
- init = [start, final, [ knownORFs[0] ]]
- #for each ORF
- knownORFs[1..knownORFs.length].inject(init){|currentReg, orf|
- #see if this orf is in our region
- if orf[:range].min < currentReg[1] then
- #extend our region
- newmax = orf[:range].max + spread
- if newmax > largest then newmax = largest end
- currentReg[1] = newmax
- # add the current orf to the region
- currentReg[2] << orf
- else
- #this orf is on in this region
- #so add this region to the list of regions
- regions << {:range => (currentReg[0])..(currentReg[1]),
- :orfs => currentReg[2]}
- # start a new region from this orf
- start = orf[:range].min - spread
- final = orf[:range].max + spread
- if final > largest then final = largest end
- currentReg = [start, final, [orf]]
- print "." unless quiet
- end
- currentReg #return the current region to the next loop iteration
- }
- print "Done\n" unless quiet
- #now we actually calculate the scores, this is passed off to the sumScore
- # function that is at the top of the file
- print "Calculating scores" unless quiet
- regions.each{|reg|
- reg[:score] = sumScore(reg[:orfs])
- print "." unless quiet
- }
- print "Done\n" unless quiet
- #if we said to cleanup the temp files, we can get rid of them now
- unless keepTmp then
- print "Cleaning up temp files" unless quiet
- hasharray.each{|x|
- File.delete(x[:filename], x[:reportFile])
- print "." unless quiet
- }
- if reportDir_was_created then Dir.rmdir(reportDir) end
- if tempDir_was_created then Dir.rmdir(tempDir) end
- print "Done\n" unless quiet
- end
- print "\n" unless quiet
- #OUTPUT
- FasterCSV.open(outputcsv, "w") do |csvout| #we always write an output csv file
- #columns for the csv file
- csvout << ['range', 'score','orfs', 'sequence']
- #for keeping things nice, we output the command they used
- output.puts("Results from command \"" + actualProgramArgs + "\"\n")
- #for each region where its score is greater then the cutoff value
- regions.select{|r| r[:score] >= cutoff}.each{|reg|
- #print our range and score
- output.puts "Range: " + reg[:range].to_s
- output.puts "Score: " + reg[:score].prec_f.to_s
- #print our orf locus_tag and then the list of hits
- reg[:orfs].each {|orf|
- output.puts orf[:locus_tag].to_s + ": " +
- orf[:hits].map{|hit| hit[:desig].to_s + "(" + hit[:min].to_s + ".." + hit[:max].to_s + ")" }.join(" ")
- }
- output.puts ""
- #now do the same for the csv file
- csvout << [reg[:range].to_s,
- reg[:score].prec_f.to_s,
- reg[:orfs].map{|x| x[:locus_tag] + " [ " + x[:hits].map{|y| y[:desig].to_s}.join(" ") + " ] " }.join(" "),
- mygb.seq[reg[:range]].to_s]
- }
- end
- #unless our output is just stdout, close the file
- output.close if output.class == File