Guest User

Untitled

a guest
Mar 19th, 2018
104
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.46 KB | None | 0 0
  1. #!/usr/bin/env python
  2. """Identify low-complexity regions in a fasta file.
  3.  
  4. Outputs a bed file with annotated homopolymers and dinucleotide streches.
  5. """
  6. import re
  7. import argparse
  8. import csv
  9.  
  10. from multiprocessing import Pool, cpu_count
  11. from Bio import SeqIO
  12. from Bio.Alphabet import IUPAC
  13.  
  14.  
  15. parser = argparse.ArgumentParser(description=__doc__)
  16. parser.add_argument('reference', help='path for input fasta',
  17. type=str)
  18. parser.add_argument('--minhrun', help='minimum homopolymer run length (0=disabled)',
  19. type=int, default=0)
  20. parser.add_argument('--minrepeats', help='minimum dinucleotide repeats (0=disabled)',
  21. type=int, default=0)
  22. parser.add_argument('--threads', help='number of threads (1)',
  23. type=int, default=1, choices=range(1, cpu_count()))
  24. args = parser.parse_args()
  25.  
  26. if not (args.minhrun or args.minrepeats):
  27. parser.error('No action requested, add --minhrun or --minrepeats')
  28.  
  29.  
  30. # generate regex
  31. BASES = IUPAC.IUPACUnambiguousDNA.letters
  32. DINUC_REPEATS = [x + y for x in BASES for y in BASES if x != y]
  33. PATTERN = ''
  34. if args.minrepeats:
  35. PATTERN = ('|').join(["(%s){%s,}" % (x, args.minrepeats)
  36. for x in DINUC_REPEATS])
  37. if args.minhrun:
  38. if PATTERN: PATTERN += '|'
  39. PATTERN = ('|').join(["(%s){%s,}" % (x, args.minhrun)
  40. for x in BASES])
  41. print PATTERN
  42.  
  43.  
  44. def find_repeats(seq_record, pattern=PATTERN):
  45. """Find and record all occurrences of PATTERN in seq_record.
  46. Case-insensitive.
  47.  
  48. Outputs list of 4-tuples.
  49. Example:
  50. [('chr1', 11019, 11026, 'Gx7'),
  51. ('chr1', 36793, 36803, 'TGx5'),...]
  52. """
  53. repeat_list = list()
  54. for repeat in re.finditer(pattern, str(seq_record.seq), flags=re.I):
  55. unit = [x for x in repeat.groups() if x][0].upper() # 'AC'
  56. size = len(repeat.group())/len(unit) # len('ACACAC')/len('AC')
  57. repeat_list.append((seq_record.id,
  58. repeat.start(), repeat.end(),
  59. unit + 'x' + str(size)))
  60. return repeat_list
  61.  
  62.  
  63. p = Pool(args.threads)
  64. repeat_list = p.map(find_repeats, SeqIO.parse(args.reference, 'fasta'))
  65. repeat_list = [item for items in repeat_list for item in items] # unpack
  66.  
  67. outfile = args.reference + \
  68. '.dinuc_repeats_ge' + str(args.minrepeats) + \
  69. '.hrun_ge' + str(args.minhrun) + \
  70. '.bed'
  71.  
  72. with open(outfile, 'wb') as f:
  73. writer = csv.writer(f, delimiter='\t')
  74. writer.writerows(repeat_list)
Add Comment
Please, Sign In to add comment