Got an iPhone or iPad? We have a brand new Pastebin App for both devices, and it's totally free! Click here to download the new Pastebin App for iOS.
Guest

Untitled

By: a guest on Feb 12th, 2012  |  syntax: None  |  size: 20.00 KB  |  hits: 32  |  expires: Never
download  |  raw  |  embed  |  report abuse
Copied
  1. #!/usr/bin/ruby
  2.  
  3. # == Synopsis
  4. #
  5. # aloft: searches for operons in a given genome based on a given set of known patterns
  6. #
  7. # == Usage
  8. #
  9. # ./aloft.rb -g [genbank file] -c [csv file] {OPTIONS}
  10. #
  11. # == REQUIRED FLAGS
  12. #
  13. # -c [file], --csv [file]:
  14. #    csv file containing known code
  15. #
  16. # -f [accession number], --fetch [accession number]
  17. #    automatically fetch the genbank file from the
  18. #    internet and save it to <accessionNumber>.gb
  19. #    required if --genbank is not set
  20. #
  21. # -g [file], --genbank [file]:
  22. #    genbank file containing genome to search
  23. #    required if --fetcha is not set
  24. #
  25. # == OPTIONS
  26. #
  27. # -d, --delete-files:
  28. #    do not leave temporary files around
  29. #    not recommended
  30. #
  31. # -e, --expect [value]:
  32. #    cutoff value below to limit blast results to
  33. #    default: 1.0e-5
  34. #
  35. # -h, --help:
  36. #    show help
  37. #
  38. # -o, --output [file]:
  39. #    file to save results
  40. #    default: stdout
  41. #
  42. # -q, --quiet:
  43. #    suppress console output
  44. #
  45. # -r, --reuse-results:
  46. #    do not perform blast searches, assumes you
  47. #    ran with previously without --delete-files
  48. #    unsets --delete-files
  49. #
  50. # -s, --spread [value]:
  51. #    range to look at on either side of a gene
  52. #    default: 3.0e+4
  53. #
  54. # -t, --temp-dir [directory]:
  55. #    directory to keep temporary files
  56. #    will automatically create this directory
  57. #
  58. # -v, --outputCSV [file]:
  59. #    csv file to save results
  60. #    default: output.csv
  61. #
  62. # -x, --cutoff [value]:
  63. #    suppress regions with score less then [integer]
  64. #    default: 0
  65. #
  66.  
  67. # this function handles calculating the scoring of each region
  68. #   arg orfArr  Array of hashes (1 per orf) in the region
  69. #   return  integer score
  70. def sumScore(orfArr)
  71.     tot = 0  #start some total at 0
  72.  
  73.     ##Sample scoring method
  74.     orfArr.each do |orf|  #for every ORF in this region
  75.         ##we say that for each
  76.         tot += orf[:hits].inject(1) do |sum,hit|
  77.             ##for each hit y and the sum (starting at 1)
  78.             sum *= hit[:csvscore].to_i  #sum = sum * the hit's score (from CSV)
  79.         end
  80.     end
  81.  
  82.     #Sample 2
  83.     ## Same as before but now the score of each orf is attained
  84.     ## by multiplaying the csv score * the length of the hit
  85.     ## for each hit, and then multiplying all of those together
  86.  
  87.     #orfArr.each{|x|
  88.     #    tot += x[:hits].inject(1){|sum,y|
  89.     #        sum *= y[:csvscore].to_i * y[:length]
  90.     #    }
  91.     #}
  92.  
  93.     ##Sample 3
  94.     ##the total score for the region is just
  95.     ##multiply the blast score * the csv score
  96.     ##for all hits in the region, and then just sum them
  97.    
  98.     #orfArr.each{|x|
  99.     #    x[:hits].each{|y|
  100.     #        tot += y[:csvscore].to_i * y[:bscore].to_i
  101.     #    }
  102.     #}
  103.  
  104.  
  105.     return tot  #return result
  106. end
  107.  
  108.  
  109.  
  110. # these four are a standard part of ruby, should be available in any sane install
  111. require 'getoptlong'
  112. require 'rdoc/usage'
  113. require 'tmpdir'
  114. require 'fileutils'
  115.  
  116. #import the bioruby modules and error if not found
  117. hasbio = require 'bio'
  118. unless hasbio
  119.     puts "Error, you must install bioruby first"
  120.     puts "You should be able to this by running"
  121.     puts "gem install bio"
  122.     puts "as an administrator on the system"
  123.     Process.exit
  124. end
  125.  
  126. #check the bioruby version and error less then 1.3.0
  127. if Bio::BIORUBY_VERSION < [1,3,0] then
  128.     puts "Error, this script was written with for BioRuby 1.3.0"
  129.     puts "Your version is " + Bio::BIORUBY_VERSION.join(".")
  130.     puts "Please upgrage bioruby by running"
  131.     puts "gem update bio"
  132.     Process.exit
  133. end
  134.  
  135. #import the fastercsv modules and error if not found
  136. hascsv = require 'fastercsv'
  137. unless hascsv
  138.     puts "Error, you must install a CSV parser first"
  139.     puts "You should be able to this by running"
  140.     puts "gem install fastercsv"
  141.     puts "as an administrator on the system"
  142.     Process.exit
  143. end
  144.  
  145.  
  146. #############################################
  147. ########## GENBANK FILE LENGTH FIX ##########
  148. #############################################
  149. # Ignore this section.. In order to get around
  150. #  a problem of improperly written genbank files
  151. #  I have to redefine a function of bioruby
  152. module Bio
  153. class NCBIDB
  154. module Common
  155.   def features
  156.     unless @data['FEATURES']
  157.       ary = []
  158.       in_quote = false
  159.       get('FEATURES').each_line do |line|
  160.         next if line =~ /^FEATURES/
  161.         head = line[0,20].to_s.strip
  162.         body = line[20,line.length].to_s.chomp
  163.         if line =~ /^ {5}\S/
  164.           ary.push([ head, body ])
  165.         elsif body =~ /^ \// and not in_quote       # gb:IRO125195
  166.           ary.last.push(body)
  167.           if body =~ /="/ and body !~ /"$/
  168.             in_quote = true
  169.           end
  170.         else
  171.           ary.last.last << body
  172.           if body =~ /"$/
  173.             in_quote = false
  174.           end
  175.         end
  176.       end
  177.       ary.collect! do |subary|
  178.         parse_qualifiers(subary)
  179.       end
  180.       @data['FEATURES'] = ary.extend(Bio::Features::BackwardCompatibility)
  181.     end
  182.     if block_given?
  183.       @data['FEATURES'].each do |f|
  184.         yield f
  185.       end
  186.     else
  187.       @data['FEATURES']
  188.     end
  189.   end
  190. end # Common
  191. end # GenBank
  192. end # Bio
  193. ##############################
  194. ######      END      #########
  195. ##############################
  196.  
  197.  
  198. # make a string of the input command so we can print it out later for
  199. #  the user's sanity
  200. actualProgramArgs = ARGV.entries.join(" ")
  201.  
  202. # command line options parser
  203. opts = GetoptLong.new(
  204.     [ '--csv', '-c', GetoptLong::REQUIRED_ARGUMENT ],
  205.     [ '--delete-files', '-d', GetoptLong::NO_ARGUMENT ],
  206.     [ '--expect', '-e', GetoptLong::REQUIRED_ARGUMENT ],
  207.     [ '--fetch', '-f', GetoptLong::REQUIRED_ARGUMENT ],
  208.     [ '--genbank', '-g',  GetoptLong::REQUIRED_ARGUMENT ],
  209.     [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
  210.     [ '--output', '-o', GetoptLong::REQUIRED_ARGUMENT ],
  211.     [ '--quiet', '-q', GetoptLong::NO_ARGUMENT ],
  212.     [ '--reuse-results', '-r', GetoptLong::NO_ARGUMENT ],
  213.     [ '--spread', '-s', GetoptLong::REQUIRED_ARGUMENT ],
  214.     [ '--temp-dir', '-t', GetoptLong::REQUIRED_ARGUMENT ],
  215.     [ '--outputCSV', '-v', GetoptLong::REQUIRED_ARGUMENT ],
  216.     [ '--cutoff', '-x', GetoptLong::REQUIRED_ARGUMENT ])
  217.  
  218. # default values
  219. autofetch = false
  220. accessionNum = nil
  221. output = $stdout
  222. spread = 30_000
  223. cutoff = 0
  224. database = nil
  225. csvFile = nil
  226. quiet = false
  227. tempDir = Dir::tmpdir
  228. keepTmp = true
  229. reuseOld = false
  230. tempDir_was_created = false
  231. reportDir = tempDir
  232. reportDir_was_created = false
  233. outputcsv = "output.csv"
  234. expect_value_cutoff = 1.0e-5
  235.  
  236. # actually parse the input options
  237. begin
  238.     opts.each do |opt, arg|
  239.         case opt
  240.         when '--csv'
  241.             csvFile = arg
  242.         when '--cutoff'
  243.             cutoff = arg.to_f
  244.         when '--delete-files'
  245.             keepTmp = false
  246.         when '--expect'
  247.             expect_value_cutoff = arg
  248.         when '--fetch'
  249.             accessionNum = arg
  250.             autofetch = true
  251.         when '--genbank'
  252.             database = arg
  253.         when '--help'
  254.             RDoc::usage
  255.             Process.exit
  256.         when '--output'
  257.             output = File.open(arg, 'a')
  258.         when '--outputCSV'
  259.             outputcsv = arg
  260.         when '--quiet'
  261.             quiet = true
  262.         when '--reuse-results'
  263.             reuseOld = true
  264.         when '--spread'
  265.             #odd but we have to use to_f so they
  266.             #can use exp. notation for input
  267.             spread = arg.to_f.to_i
  268.         when '--temp-dir'
  269.             tempDir = arg
  270.         end
  271.     end
  272. rescue
  273.     #should have already output an error
  274.     puts "please run with --help for usage"
  275.     Process.exit
  276. end
  277.  
  278.  
  279. # if they gave the reuse-results option is enabled
  280. #   we force them to also enable keep-files
  281. if reuseOld then keepTmp = true end
  282.  
  283. if autofetch then
  284.     server = Bio::Fetch.new()
  285.     puts "Fetching genbank file."
  286.     puts "This may take several minutes depending on your internet speed."
  287.     puts "Please wait..."
  288.  
  289.     tempGB = server.fetch('gb', accessionNum)
  290.     puts "Fetch complete, writing file"
  291.  
  292.     #write out the file rather then using the temp file that bioruby uses
  293.     #by default
  294.     File.open(accessionNum + '.gb', 'w') do |file|
  295.         file.write tempGB
  296.         file.close
  297.     end
  298.     puts "File " + accessionNum + '.gb written to current directory'
  299.     #set this file as our database file
  300.     database = accessionNum + '.gb'
  301.  
  302. end
  303.  
  304.  
  305. # if they didn't input the required inputs
  306. if database == nil
  307.     puts "No genbank file provided"
  308.     RDoc::usage
  309.     Process.exit
  310. elsif csvFile == nil
  311.     puts "No CSV File provided"
  312.     RDoc::usage
  313.     Process.exit
  314. end
  315.  
  316. # determine the directories of our files to create
  317. tempDir = File.join(tempDir, File.split(csvFile)[1].tr(" ","_") )
  318. reportDir = File.join(tempDir, File.split(database)[1].tr(" ","_") )
  319.  
  320. # create the directories if they are not already there
  321. unless File.exist? tempDir
  322.     FileUtils::mkdir_p(tempDir)
  323.     tempDir_was_created = true
  324. end
  325. unless File.exist? reportDir
  326.     FileUtils::mkdir_p(reportDir)
  327.     reportDir_was_created = true
  328. end
  329.  
  330. #actually open our database
  331. dbfile = Bio::FlatFile.new(Bio::GenBank, File.open(database))
  332.  
  333. # this is silly, if there is an extra newline at the end of the genbank
  334. # file we seem to get some extra genbank object.  Instead we just take the
  335. # first genbank object returned
  336. tmp = []
  337. dbfile.each{|x| tmp << x}
  338. mygb = tmp[0]
  339.  
  340. # get the length of the genome we are working with
  341. largest = mygb.length
  342.  
  343. # test to make sure they have a valid genbank file, its possible to download
  344. # withouth the genbank file
  345. if mygb.seq.empty? then
  346.     puts "Error, it seems your genbank file does not include
  347.     the nucleotide sequence in it, this is needed."
  348.     if autofetch then
  349.         puts "This is likely due to the fact that your accession number was not found"
  350.         puts "Check the downloaded file: " + database
  351.     else
  352.         puts "You can have this script fetch this file for you using the --fetch flag"
  353.     end
  354.     Process.exit
  355. end
  356.  
  357. #for blast we need to write out a fasta file from our database
  358. dbtempfile = File.join(tempDir, File.split(database)[1].tr(" ","_") + ".faa" )
  359. File.open(dbtempfile, 'w') do |file|
  360.     file.write(mygb.seq.to_fasta(database, 60))
  361.     file.close
  362. end
  363.  
  364. print "Parsing CSV file" unless quiet
  365. #csv reading
  366. mycsv = FasterCSV.read(csvFile)
  367. #we assume the first line is the column names
  368. names = mycsv[0]
  369. #seperate the actual data
  370. therest = mycsv[1,mycsv.size]
  371.  
  372. #now we want to convert all the data from arrays to hashs
  373. # since we do this, the column order in the csv doesn't matter
  374. hasharray=[]
  375.  
  376. # hasharray is how we will store all our data for the query sequences
  377. # each entry in the array contains a hash object (named array)
  378. # with any columns from the CSV file, but also
  379. #  :filename    the exact name/location of the fasta file
  380. #  :reportFile  the exact name/location of the report file
  381. #  :report      the report object
  382.  
  383. therest.each_with_index{|row, seqnum| #for each sequence in the csv
  384.     hasht = {}
  385.     row.each_with_index{|data, index| #for each column
  386.         #we assign things as symbols all in lowercase so
  387.         # the column name "Sequence" becomes :sequence
  388.         hasht[names[index].downcase.to_sym] = data
  389.     }
  390.     #now we want to write all our seqences to fasta files so we can use bl2seq
  391.     hasht[:filename] = File.join(tempDir, "seq" + seqnum.to_s + ".faa")
  392.     #we store the filename in our hash too, and write out the file
  393.     #  I assume :sequence and :designation are 2 fields given in the CSV
  394.     #  TODO: check that these names exist first
  395.     unless reuseOld
  396.         File.open(hasht[:filename], 'w') do |file|
  397.             file.write(Bio::Sequence::NA.new(hasht[:sequence]).to_fasta(hasht[:designation], 60))
  398.             file.close
  399.         end
  400.     end
  401.     hasht[:reportFile] = File.join(reportDir, "seq" + seqnum.to_s + ".report")
  402.     #add our hash to the full array
  403.     hasharray << hasht
  404.     print "." unless quiet
  405. }
  406. print "Done\n" unless quiet
  407.  
  408. #for each sequence we run blastx against the user given database
  409. # this assumes the database is proteins and the seqences are animo acids
  410. # TODO don't require blastx only
  411.  
  412. if (reuseOld)  #we don't have to run blast, just read in the old files
  413.     print "Reading in old BLAST results" unless quiet
  414.     hasharray.each_with_index{|searchhash, index|
  415.         File.open(searchhash[:reportFile], 'r') do |file|
  416.             hasharray[index][:report] = Bio::Blast::Bl2seq::Report.new(file.read)
  417.             file.close
  418.         end
  419.         print "." unless quiet
  420.     }
  421.     print "Done\n" unless quiet
  422. else
  423.     print "Performing BLAST searches" unless quiet
  424.     hasharray.each_with_index{|searchhash, index|
  425.  
  426.         cmd = ['bl2seq', '-i', dbtempfile, '-j', searchhash[:filename], '-p', 'blastx']
  427.         #build up our command then run it, storing each result in our has as well
  428.         reportText = Bio::Command.query_command(cmd)
  429.  
  430.         #write out the results to a report file
  431.         File.open(searchhash[:reportFile], 'w') do |file|
  432.             file.write(reportText)
  433.             file.close
  434.         end
  435.         hasharray[index][:report] = Bio::Blast::Bl2seq::Report.new(reportText)
  436.         print "." unless quiet
  437.     }
  438.     print "Done\n" unless quiet
  439. end
  440.  
  441. # knownORFs will be an array of hashs that will hold information about our
  442. # open reading frames
  443. # each hash will have data about that orf and any hits that fall in the orf
  444. # :direction   will only be -1 or 1 for what side of the genome it is on
  445. # :locus_tag   copied from the genbank file
  446. # :hits        an array of hashes storing information about hits (described later)
  447. knownORFs = []
  448.  
  449. print "Getting open reading frames from genbank file..." unless quiet
  450. mygb.features.select{|x| x.feature == "CDS"}.each{|y|
  451.         knownORFs << {:range =>y.locations.range,
  452.             :direction => y.locations[0].strand,
  453.             :locus_tag => y.qualifiers.select{|x| x.qualifier == "locus_tag"}[0].value,
  454.             :hits => [] }
  455. }
  456. print "Done\n" unless quiet
  457. #each ORF is put on as an array, its range, the gene name (as a string) and
  458. #  an empty array for results
  459.  
  460. print "Collecting hits" unless quiet
  461. hithash = []
  462. hasharray.each{|searchObj|
  463.     searchObj[:report].each{|hit|
  464.         hit.hsps.each{ |hsp|
  465.             #if our expect value is greater then the cutoff
  466.             # move on to the next hsp
  467.             next if hsp.evalue > expect_value_cutoff
  468.  
  469.             frame = hsp.query_frame
  470.             min = hsp.query_from
  471.             max = hsp.query_to
  472.             if frame < 0 then
  473.                 min,max = max, min #swap values
  474.             end
  475.  
  476.             ## this is the actual hash for a hit, any of these will be available
  477.             # in the scoring function
  478.             #
  479.             # :min, :max, :length    obvious
  480.             # :csvscore     score given in the CSV file
  481.             # :bscore       blast score
  482.             # :frame        -3, -2, -1, 1, 2, or 3
  483.             # :expect       expect value given by blast
  484.             # :desig        designation of the corresponding item from the CSV file
  485.             hithash << {:min => min,
  486.                         :max => max,
  487.                         :length => hsp.align_len,
  488.                         :csvscore => searchObj[:score],
  489.                         :bscore => hsp.score,
  490.                         :frame => hsp.query_frame,
  491.                         :expect => hsp.evalue,
  492.                         :desig => searchObj[:designation]}
  493.         }
  494.     }
  495.     print "." unless quiet
  496. }
  497. print "Done\n" unless quiet
  498.  
  499. #build the ORF array for each hit
  500. print "Sorting hits into Orfs" unless quiet
  501. knownORFs.each{|orfArr|
  502.     hithash.select{|hit|
  503.         orfArr[:range] === hit[:min] or #should be OR??
  504.         orfArr[:range] === hit[:max]}.each{|hit|
  505.             orfArr[:hits] << hit
  506.             print "." unless quiet
  507.        }
  508. }
  509. print "Done\n" unless quiet
  510.  
  511. #we want to ignore any ORFs with no hits in them
  512. print "Removing useless ORFs.." unless quiet
  513. knownORFs.reject!{|orf| orf[:hits].empty?}
  514. print "Done\n" unless quiet
  515.  
  516. #we sort the remaining ORFs by their start value
  517. # this is important for creating regions effeciently
  518. print "Sorting Orfs.." unless quiet
  519. knownORFs = knownORFs.sort{|a,b| a[:range].min <=> b[:range].min }
  520. print "Done\n" unless quiet
  521.  
  522. # find sort our hits within each orf, this is nice for output reasons
  523. print "Sorting Hits within Orfs.." unless quiet
  524. knownORFs.each{|orf|
  525.     orf[:hits] = orf[:hits].sort{|a,b| a[:min] <=> b[:min]}
  526. }
  527. print "Done\n" unless quiet
  528.  
  529. #now we actually build our regions
  530. # since we know that our ORFs are sorted already, we start at the first ORF
  531. # and say that our region starts at that min value minus the spread
  532. # we say that the region may end at that max value plus the spread
  533. #  if there the next ORF falls in this region, we say the region now
  534. #  ends at the new ORFs max value plus the spread
  535. # continue until we find that the next ORF is not in the region
  536. # then this is our region
  537. print "Calculating Regions" unless quiet
  538. regions= []
  539. start = knownORFs[0][:range].min - spread
  540. final = knownORFs[0][:range].max + spread
  541. if start < 0 then start = 0 end
  542. if final > largest then final = largest end
  543.  
  544. init = [start, final, [ knownORFs[0] ]]
  545.  
  546. #for each ORF
  547. knownORFs[1..knownORFs.length].inject(init){|currentReg, orf|
  548.     #see if this orf is in our region
  549.     if orf[:range].min < currentReg[1] then
  550.         #extend our region
  551.         newmax = orf[:range].max + spread
  552.         if newmax > largest then newmax = largest end
  553.         currentReg[1] = newmax
  554.         # add the current orf to the region
  555.         currentReg[2] << orf
  556.     else
  557.         #this orf is on in this region
  558.         #so add this region to the list of regions
  559.         regions << {:range => (currentReg[0])..(currentReg[1]),
  560.                 :orfs => currentReg[2]}
  561.  
  562.         # start a new region from this orf
  563.         start = orf[:range].min - spread
  564.         final = orf[:range].max + spread
  565.         if final > largest then final = largest end
  566.        
  567.         currentReg = [start, final, [orf]]
  568.         print "." unless quiet
  569.        
  570.     end
  571.     currentReg  #return the current region to the next loop iteration
  572. }
  573. print "Done\n" unless quiet
  574.  
  575. #now we actually calculate the scores, this is passed off to the sumScore
  576. # function that is at the top of the file
  577. print "Calculating scores" unless quiet
  578. regions.each{|reg|
  579.     reg[:score] = sumScore(reg[:orfs])
  580.     print "." unless quiet
  581. }
  582. print "Done\n" unless quiet
  583.  
  584. #if we said to cleanup the temp files, we can get rid of them now
  585. unless keepTmp then
  586.     print "Cleaning up temp files" unless quiet
  587.     hasharray.each{|x|
  588.         File.delete(x[:filename], x[:reportFile])
  589.         print "." unless quiet
  590.     }
  591.     if reportDir_was_created then Dir.rmdir(reportDir) end
  592.     if tempDir_was_created then Dir.rmdir(tempDir) end
  593.     print "Done\n" unless quiet
  594. end
  595.  
  596. print "\n" unless quiet
  597.  
  598. #OUTPUT
  599. FasterCSV.open(outputcsv, "w") do |csvout| #we always write an output csv file
  600.     #columns for the csv file
  601.     csvout << ['range', 'score','orfs', 'sequence']
  602.  
  603.     #for keeping things nice, we output the command they used
  604.     output.puts("Results from command \"" + actualProgramArgs + "\"\n")
  605.  
  606.     #for each region where its score is greater then the cutoff value
  607.     regions.select{|r| r[:score] >= cutoff}.each{|reg|
  608.        
  609.         #print our range and score
  610.         output.puts "Range: " + reg[:range].to_s
  611.         output.puts "Score: " + reg[:score].prec_f.to_s
  612.         #print our orf locus_tag and then the list of hits
  613.         reg[:orfs].each {|orf|
  614.             output.puts orf[:locus_tag].to_s + ": " +
  615.                 orf[:hits].map{|hit| hit[:desig].to_s + "(" + hit[:min].to_s + ".." + hit[:max].to_s + ")" }.join(" ")
  616.         }
  617.         output.puts ""
  618.        
  619.         #now do the same for the csv file
  620.         csvout << [reg[:range].to_s,
  621.                    reg[:score].prec_f.to_s,
  622.                    reg[:orfs].map{|x| x[:locus_tag] + " [ " + x[:hits].map{|y| y[:desig].to_s}.join(" ") + " ] " }.join(" "),
  623.                    mygb.seq[reg[:range]].to_s]
  624.     }
  625. end
  626.  
  627. #unless our output is just stdout, close the file
  628. output.close if output.class == File