Guest User

Untitled

a guest
Jun 22nd, 2018
160
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 16.44 KB | None | 0 0
  1. maasha@cletus:~/maasha_install/lib/ruby/gems/1.9.1/gems/bioruby-0.6.4/lib/bio$ cat sequence.rb
  2. #
  3. # bio/sequence.rb - biological sequence class
  4. #
  5. # Copyright (C) 2000-2005 KATAYAMA Toshiaki <k@bioruby.org>
  6. # Copyright (C) 2001 Yoshinori K. Okuji <o@bioruby.org>
  7. # Copyright (C) 2003 GOTO Naohisa <ng@bioruby.org>
  8. #
  9. # This library is free software; you can redistribute it and/or
  10. # modify it under the terms of the GNU Lesser General Public
  11. # License as published by the Free Software Foundation; either
  12. # version 2 of the License, or (at your option) any later version.
  13. #
  14. # This library is distributed in the hope that it will be useful,
  15. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. # Lesser General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Lesser General Public
  20. # License along with this library; if not, write to the Free Software
  21. # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  22. #
  23. # $Id: sequence.rb,v 0.40 2005/08/10 12:53:02 k Exp $
  24. #
  25.  
  26. require 'bio/data/na'
  27. require 'bio/data/aa'
  28. require 'bio/data/codontable'
  29. require 'bio/location'
  30.  
  31. module Bio
  32.  
  33. # Nucleic/Amino Acid sequence
  34.  
  35. class Sequence < String
  36.  
  37. def to_s
  38. "%s" % self
  39. end
  40. alias :to_str :to_s
  41.  
  42. def seq
  43. self.class.new(self)
  44. end
  45.  
  46. def normalize!
  47. initialize(self)
  48. self
  49. end
  50. alias :seq! :normalize!
  51.  
  52. def <<(*arg)
  53. super(self.class.new(*arg))
  54. end
  55. alias :concat :<<
  56.  
  57. def +(*arg)
  58. self.class.new(super(*arg))
  59. end
  60.  
  61.  
  62. def subseq(s = 1, e = self.length)
  63. return nil if s < 1 or e < 1
  64. s -= 1
  65. e -= 1
  66. self[s..e]
  67. end
  68.  
  69. def to_fasta(header = '', width = nil)
  70. ">#{header}\n" +
  71. if width
  72. self.to_s.gsub(Regexp.new(".{1,#{width}}"), "\\0\n")
  73. else
  74. self.to_s + "\n"
  75. end
  76. end
  77.  
  78. def fasta(factory, header = nil)
  79. factory.query(self.to_fasta(header))
  80. end
  81.  
  82. def blast(factory, header = nil)
  83. factory.query(self.to_fasta(header))
  84. end
  85.  
  86. def window_search(window_size, step_size = 1)
  87. i = 0
  88. 0.step(self.length - window_size, step_size) do |i|
  89. yield self[i, window_size]
  90. end
  91. return self[i + window_size .. -1]
  92. end
  93.  
  94. def total(hash)
  95. hash.default = 0.0 unless hash.default
  96. sum = 0.0
  97. self.each_byte do |x|
  98. begin
  99. sum += hash[x.chr]
  100. end
  101. end
  102. return sum
  103. end
  104.  
  105. def composition
  106. count = Hash.new(0)
  107. self.scan(/./) do |x|
  108. count[x] += 1
  109. end
  110. return count
  111. end
  112.  
  113. def randomize(hash = nil)
  114. length = self.length
  115. if hash
  116. count = hash.clone
  117. count.each_value {|x| length += x}
  118. else
  119. count = self.composition
  120. end
  121.  
  122. seq = ''
  123. tmp = {}
  124. length.times do
  125. count.each do |k, v|
  126. tmp[k] = v * rand
  127. end
  128. max = tmp.max {|a, b| a[1] <=> b[1]}
  129. count[max.first] -= 1
  130.  
  131. if block_given?
  132. yield max.first
  133. else
  134. seq += max.first
  135. end
  136. end
  137. return self.class.new(seq)
  138. end
  139.  
  140. def self.randomize(*arg, &block)
  141. self.new('').randomize(*arg, &block)
  142. end
  143.  
  144. # This method depends on Locations class, see bio/location.rb
  145. def splicing(position)
  146. unless position.is_a?(Locations) then
  147. position = Locations.new(position)
  148. end
  149. s = ''
  150. position.each do |location|
  151. if location.sequence
  152. s << location.sequence
  153. else
  154. exon = self.subseq(location.from, location.to)
  155. begin
  156. exon.complement! if location.strand < 0
  157. rescue NameError
  158. end
  159. s << exon
  160. end
  161. end
  162. return self.class.new(s)
  163. end
  164.  
  165.  
  166. # Nucleic Acid sequence
  167.  
  168. class NA < Sequence
  169.  
  170. def initialize(str)
  171. super
  172. self.downcase!
  173. self.tr!(" \t\n\r",'')
  174. end
  175.  
  176. # This method depends on Locations class, see bio/location.rb
  177. def splicing(position)
  178. mRNA = super
  179. if mRNA.rna?
  180. mRNA.tr!('t', 'u')
  181. else
  182. mRNA.tr!('u', 't')
  183. end
  184. mRNA
  185. end
  186.  
  187. def complement
  188. s = self.class.new(self)
  189. s.complement!
  190. s
  191. end
  192.  
  193. def complement!
  194. if self.rna?
  195. self.reverse!
  196. self.tr!('augcrymkdhvbswn', 'uacgyrkmhdbvswn')
  197. else
  198. self.reverse!
  199. self.tr!('atgcrymkdhvbswn', 'tacgyrkmhdbvswn')
  200. end
  201. self
  202. end
  203.  
  204. def translate(frame = 1, table = 1, unknown = 'X')
  205. if table.is_a?(Bio::CodonTable)
  206. ct = table
  207. else
  208. ct = Bio::CodonTable[table]
  209. end
  210. naseq = self.dna
  211. case frame
  212. when 1, 2, 3
  213. frame -= 1
  214. when 4, 5, 6
  215. frame -= 4
  216. naseq.complement!
  217. when -1, -2, -3
  218. frame = -1 - frame
  219. naseq.complement!
  220. else
  221. frame = 0
  222. end
  223. nalen = naseq.length - (naseq.length - frame) % 3
  224. aaseq = naseq[frame, nalen].gsub(/.{3}/) {|codon| ct[codon] or unknown}
  225. return Bio::Sequence::AA.new(aaseq)
  226. end
  227.  
  228. def gc_percent
  229. count = self.composition
  230. at = count['a'] + count['t'] + count['u']
  231. gc = count['g'] + count['c']
  232. gc = format("%.1f", gc.to_f / (at + gc) * 100)
  233. return gc.to_f
  234. end
  235. alias :gc :gc_percent
  236.  
  237. def illegal_bases
  238. self.scan(/[^atgcu]/).sort.uniq
  239. end
  240.  
  241. # NucleicAcid is defined in bio/data/na.rb
  242. def molecular_weight
  243. if self.rna?
  244. NucleicAcid.weight(self, true)
  245. else
  246. NucleicAcid.weight(self)
  247. end
  248. end
  249.  
  250. def to_re
  251. if self.rna?
  252. NucleicAcid.to_re(self.dna, true)
  253. else
  254. NucleicAcid.to_re(self)
  255. end
  256. end
  257.  
  258. def names
  259. array = []
  260. self.each_byte do |x|
  261. array.push(NucleicAcid.names[x.chr.upcase])
  262. end
  263. return array
  264. end
  265.  
  266. def dna
  267. self.tr('u', 't')
  268. end
  269.  
  270. def dna!
  271. self.tr!('u', 't')
  272. end
  273.  
  274. def rna
  275. self.tr('t', 'u')
  276. end
  277.  
  278. def rna!
  279. self.tr!('t', 'u')
  280. end
  281.  
  282. def rna?
  283. self.index('u')
  284. end
  285. protected :rna?
  286.  
  287. def pikachu
  288. self.dna.tr("atgc", "pika") # joke, of course :-)
  289. end
  290.  
  291. end
  292.  
  293.  
  294. # Amino Acid sequence
  295.  
  296. class AA < Sequence
  297.  
  298. def initialize(str)
  299. super
  300. self.upcase!
  301. self.tr!(" \t\n\r",'')
  302. end
  303.  
  304. # AminoAcid is defined in bio/data/aa.rb
  305. def molecular_weight
  306. AminoAcid.weight(self)
  307. end
  308.  
  309. def to_re
  310. AminoAcid.to_re(self)
  311. end
  312.  
  313. def codes
  314. array = []
  315. self.each_byte do |x|
  316. array.push(AminoAcid.names[x.chr])
  317. end
  318. return array
  319. end
  320.  
  321. def names
  322. self.codes.map do |x|
  323. AminoAcid.names[x]
  324. end
  325. end
  326.  
  327. end
  328.  
  329. end
  330.  
  331.  
  332. class Seq < Sequence
  333. attr_accessor :entry_id, :definition, :features, :references, :comments,
  334. :date, :keywords, :dblinks, :taxonomy, :moltype
  335. end
  336.  
  337. end
  338.  
  339.  
  340. if __FILE__ == $0
  341.  
  342. puts "== Test Bio::Sequence::NA.new"
  343. p Bio::Sequence::NA.new('')
  344. p na = Bio::Sequence::NA.new('atgcatgcATGCATGCAAAA')
  345. p rna = Bio::Sequence::NA.new('augcaugcaugcaugcaaaa')
  346.  
  347. puts "\n== Test Bio::Sequence::AA.new"
  348. p Bio::Sequence::AA.new('')
  349. p aa = Bio::Sequence::AA.new('ACDEFGHIKLMNPQRSTVWYU')
  350.  
  351. puts "\n== Test Bio::Sequence#to_s"
  352. p na.to_s
  353. p aa.to_s
  354.  
  355. puts "\n== Test Bio::Sequence#subseq(2,6)"
  356. p na
  357. p na.subseq(2,6)
  358.  
  359. puts "\n== Test Bio::Sequence#[2,6]"
  360. p na
  361. p na[2,6]
  362.  
  363. puts "\n== Test Bio::Sequence#to_fasta('hoge', 8)"
  364. puts na.to_fasta('hoge', 8)
  365.  
  366. puts "\n== Test Bio::Sequence#window_search(15)"
  367. p na
  368. na.window_search(15) {|x| p x}
  369.  
  370. puts "\n== Test Bio::Sequence#total({'a'=>0.1,'t'=>0.2,'g'=>0.3,'c'=>0.4})"
  371. p na.total({'a'=>0.1,'t'=>0.2,'g'=>0.3,'c'=>0.4})
  372.  
  373. puts "\n== Test Bio::Sequence#composition"
  374. p na
  375. p na.composition
  376. p rna
  377. p rna.composition
  378.  
  379. puts "\n== Test Bio::Sequence::NA#splicing('complement(join(1..5,16..20))')"
  380. p na
  381. p na.splicing("complement(join(1..5,16..20))")
  382. p rna
  383. p rna.splicing("complement(join(1..5,16..20))")
  384.  
  385. puts "\n== Test Bio::Sequence::NA#complement"
  386. p na.complement
  387. p rna.complement
  388. p Bio::Sequence::NA.new('tacgyrkmhdbvswn').complement
  389. p Bio::Sequence::NA.new('uacgyrkmhdbvswn').complement
  390.  
  391. puts "\n== Test Bio::Sequence::NA#translate"
  392. p na
  393. p na.translate
  394. p rna
  395. p rna.translate
  396.  
  397. puts "\n== Test Bio::Sequence::NA#gc_percent"
  398. p na.gc
  399. p rna.gc
  400.  
  401. puts "\n== Test Bio::Sequence::NA#illegal_bases"
  402. p na.illegal_bases
  403. p Bio::Sequence::NA.new('tacgyrkmhdbvswn').illegal_bases
  404. p Bio::Sequence::NA.new('abcdefghijklmnopqrstuvwxyz-!%#$@').illegal_bases
  405.  
  406. puts "\n== Test Bio::Sequence::NA#molecular_weight"
  407. p na
  408. p na.molecular_weight
  409. p rna
  410. p rna.molecular_weight
  411.  
  412. puts "\n== Test Bio::Sequence::NA#to_re"
  413. p Bio::Sequence::NA.new('atgcrymkdhvbswn')
  414. p Bio::Sequence::NA.new('atgcrymkdhvbswn').to_re
  415. p Bio::Sequence::NA.new('augcrymkdhvbswn')
  416. p Bio::Sequence::NA.new('augcrymkdhvbswn').to_re
  417.  
  418. puts "\n== Test Bio::Sequence::NA#names"
  419. p na.names
  420.  
  421. puts "\n== Test Bio::Sequence::NA#pikachu"
  422. p na.pikachu
  423.  
  424. puts "\n== Test Bio::Sequence::NA#randomize"
  425. print "Orig : "; p na
  426. print "Rand : "; p na.randomize
  427. print "Rand : "; p na.randomize
  428. print "Rand : "; p na.randomize.randomize
  429. print "Block : "; na.randomize do |x| print x end; puts
  430.  
  431. print "Orig : "; p rna
  432. print "Rand : "; p rna.randomize
  433. print "Rand : "; p rna.randomize
  434. print "Rand : "; p rna.randomize.randomize
  435. print "Block : "; rna.randomize do |x| print x end; puts
  436.  
  437. puts "\n== Test Bio::Sequence::NA.randomize(counts)"
  438. print "Count : "; p counts = {'a'=>10,'c'=>20,'g'=>30,'t'=>40}
  439. print "Rand : "; p Bio::Sequence::NA.randomize(counts)
  440. print "Count : "; p counts = {'a'=>10,'c'=>20,'g'=>30,'u'=>40}
  441. print "Rand : "; p Bio::Sequence::NA.randomize(counts)
  442. print "Block : "; Bio::Sequence::NA.randomize(counts) {|x| print x}; puts
  443.  
  444. puts "\n== Test Bio::Sequence::AA#codes"
  445. p aa
  446. p aa.codes
  447.  
  448. puts "\n== Test Bio::Sequence::AA#names"
  449. p aa
  450. p aa.names
  451.  
  452. puts "\n== Test Bio::Sequence::AA#molecular_weight"
  453. p aa.subseq(1,20)
  454. p aa.subseq(1,20).molecular_weight
  455.  
  456. puts "\n== Test Bio::Sequence::AA#randomize"
  457. aaseq = 'MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDA'
  458. s = Bio::Sequence::AA.new(aaseq)
  459. print "Orig : "; p s
  460. print "Rand : "; p s.randomize
  461. print "Rand : "; p s.randomize
  462. print "Rand : "; p s.randomize.randomize
  463. print "Block : "; s.randomize {|x| print x}; puts
  464.  
  465. puts "\n== Test Bio::Sequence::AA.randomize(counts)"
  466. print "Count : "; p counts = s.composition
  467. print "Rand : "; puts Bio::Sequence::AA.randomize(counts)
  468. print "Block : "; Bio::Sequence::AA.randomize(counts) {|x| print x}; puts
  469.  
  470. end
  471.  
  472.  
  473. =begin
  474.  
  475. = Bio::Sequence
  476.  
  477. You can use Bio::Seq instead of Bio::Sequence for short.
  478.  
  479. --- Bio::Sequence#seq
  480.  
  481. Force self to re-initialize for clean up (remove white spaces,
  482. case unification).
  483.  
  484. --- Bio::Sequence#seq!
  485. --- Bio::Sequence#normalize!
  486.  
  487. Similar to the 'seq' method, but changes the self object destructively.
  488.  
  489. --- Bio::Sequence#subseq(start = 1, end = self.length)
  490.  
  491. Returns the subsequence of the self string.
  492.  
  493. --- Bio::Sequence#to_fasta(header = '', width = nil)
  494.  
  495. Output the FASTA format string of the sequence. The 1st argument is
  496. used as the comment string. If the 2nd option is given, the output
  497. sequence will be folded.
  498.  
  499. --- Bio::Sequence#fasta(factory, header = '')
  500.  
  501. Execute fasta by the factory (Bio::Fasta object) and returns
  502. Bio::Fasta::Report object. See Bio::Fasta for more details.
  503.  
  504. --- Bio::Sequence#blast(factory, header = '')
  505.  
  506. Execute blast by the factory (Bio::Blast object) and returns
  507. Bio::Blast::Report object. See Bio::Blast for more details.
  508.  
  509. --- Bio::Sequence#splicing(position)
  510.  
  511. Receive a GenBank style position string and convert it to the Locations
  512. objects to splice the sequence itself. See also: bio/location.rb
  513.  
  514. --- Bio::Sequence#window_search(window_size, step_size = 1)
  515.  
  516. This method iterates on sub string with specified length 'window_size'.
  517. By specifing 'step_size', codon sized shifting or spliting genome
  518. sequence with ovelapping each end can easily be yielded.
  519.  
  520. The remainder sequence at the terminal end will be returned.
  521.  
  522. Example:
  523. # prints average GC% on each 100bp
  524. seq.window_search(100) do |subseq|
  525. puts subseq.gc
  526. end
  527. # prints every translated peptide (length 5aa) in the same frame
  528. seq.window_search(15, 3) do |subseq|
  529. puts subseq.translate
  530. end
  531. # split genome sequence by 10000bp with 1000bp overlap in fasta format
  532. i = 1
  533. remainder = seq.window_search(10000, 9000) do |subseq|
  534. puts subseq.to_fasta("segment #{i}", 60)
  535. i += 1
  536. end
  537. puts remainder.to_fasta("segment #{i}", 60)
  538.  
  539. --- Bio::Sequence#total(hash)
  540.  
  541. This method receive a hash of residues/bases to the particular values,
  542. and sum up the value along with the self sequence. Especially useful
  543. to use with the window_search method and amino acid indices etc.
  544.  
  545. --- Bio::Sequence#composition
  546.  
  547. Returns a hash of the occurrence counts for each residue or base.
  548.  
  549. --- Bio::Sequence#randomize(count = nil)
  550.  
  551. Returns a randomized sequence keeping its composition by default.
  552. The argument is required when generating a random sequence from the empty
  553. sequence (used by the class methods NA.randomize, AA.randomize).
  554. If the block is given, yields for each random residue/base.
  555.  
  556. --- Bio::Sequence.randomize(composition)
  557.  
  558. Generate a new random sequence with the given frequency of bases
  559. or residues. The sequence length is determined by the sum of each
  560. base/residue occurences.
  561.  
  562.  
  563. == Bio::Sequence::NA
  564.  
  565. --- Bio::Sequence::NA.new(str)
  566.  
  567. Generate a nucleic acid sequence object from a string.
  568.  
  569. --- Bio::Sequence::NA#complement
  570. --- Bio::Sequence::NA#complement!
  571.  
  572. Returns a reverse complement sequence (including the universal codes).
  573.  
  574. --- Bio::Sequence::NA#translate(frame = 1, table = 1, unknown = 'X')
  575.  
  576. Translate into the amino acid sequence from the given frame and the
  577. selected codon table. The table also can be a Bio::CodonTable object.
  578. The 'unknown' character is used for invalid/unknown codon (can be
  579. used for 'nnn' and/or gap translation in practice).
  580.  
  581. Frame can be 1, 2 or 3 for the forward strand and -1, -2 or -3
  582. (4, 5 or 6 is also accepted) for the reverse strand.
  583.  
  584. --- Bio::Sequence::NA#gc_percent
  585. --- Bio::Sequence::NA#gc
  586.  
  587. Calculate the ratio of GC / ATGC bases in percent.
  588.  
  589. --- Bio::Sequence::NA#illegal_bases
  590.  
  591. Show abnormal bases other than 'atgcu'.
  592.  
  593. --- Bio::Sequence::NA#molecular_weight
  594.  
  595. Estimate the weight of this biological string molecule.
  596.  
  597. --- Bio::Sequence::NA#to_re
  598.  
  599. Convert the universal code string into the regular expression.
  600.  
  601. --- Bio::Sequence::NA#names
  602.  
  603. Convert the self string into the list of the names of the each base.
  604.  
  605. --- Bio::Sequence::NA#dna
  606. --- Bio::Sequence::NA#dna!
  607.  
  608. Output a DNA string by substituting 'u' to 't'.
  609.  
  610. --- Bio::Sequence::NA#rna
  611. --- Bio::Sequence::NA#rna!
  612.  
  613. Output a RNA string by substituting 't' to 'u'.
  614.  
  615.  
  616. == Bio::Sequence::AA
  617.  
  618. --- Bio::Sequence::AA.new(str)
  619.  
  620. Generate a amino acid sequence object from a string.
  621.  
  622. --- Bio::Sequence::AA#codes
  623.  
  624. Generate the list of the names of the each residue along with the
  625. sequence (3 letters code).
  626.  
  627. --- Bio::Sequence::NA#names
  628.  
  629. Similar to codes but returns long names.
  630.  
  631. --- Bio::Sequence::AA#molecular_weight
  632.  
  633. Estimate the weight of this protein.
  634.  
  635. =end
  636.  
  637. maasha@cletus:~/maasha_install/lib/ruby/gems/1.9.1/gems/bioruby-0.6.4/lib/bio$
Add Comment
Please, Sign In to add comment