joseleeph

Untitled

Dec 1st, 2020
34
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.80 KB | None | 0 0
  1. from sys import argv, exit
  2. import re
  3. if len(argv) < 2:
  4. print("mising command-line argument")
  5. exit(1)
  6.  
  7.  
  8. # take a sequence of dna and
  9. #determine the person
  10. #it probably belongs to
  11.  
  12. # a dna sequence is a sequence of
  13. # nucleotide bases with 4 bases
  14. # with the initials cgta
  15.  
  16. #a short tandem repeat, str,
  17. #a sequence of dna that repeats consecutively at particular location
  18. #strs vary a lot in the population
  19. #different people have different strs repeated a different number of times
  20.  
  21. #For any STR, people in a population
  22. #vary in how many times that
  23. #particular STR repeats consecutively
  24.  
  25. #matching str counts can be used to identify who a sample of dna belongs to
  26.  
  27. #if we count for each str how many times each str repeats consecutively from a sample of dna, and it
  28. #matches an existing piece of dna from a data base
  29. # it's very likely that the two pieces of dna came from the same person
  30.  
  31. # you will do something similar
  32. # you will have a DNA database formatted as a csv file
  33. # each row corresponds to a person, each column to an str, a sequence of DNA that repeats
  34. # we can take the csv file and format it as a table
  35.  
  36. # the table will have columns labeled for the DNA sequences AGAT, AATG, and TATC
  37. # the rows will be labeled with names of the dna sequence
  38. # the intersection will be the number of times the sequence appears
  39.  
  40.  
  41.  
  42. # note to self, text 5 probably belongs to alice
  43. # think about matching substrings to the sequence
  44.  
  45. # what does small.csv represent the count of?
  46. # i think because it's small it's the number of consecutive times that sequence
  47. # will appear for that person in the small sequence? which sequence?
  48. # the number of times it matches in the large dictionary?
  49. # try to pull up the small dictionary and count how many times the sequence repeats
  50. # check the start and end of sequence for each sequence and see how long it is
  51.  
  52.  
  53. # you will have a csv file representing the data
  54. # you will also have a dna sequence formatted as a text file called sequence.txt
  55. # you must take the dna sequence and figure out if it matches all the str counts
  56. # for any person in the dna database given
  57. pattern1 = re.compile(r'AGAT')
  58. pattern2 = re.compile(r'AATG')
  59. pattern3 = re.compile(r'TATC')
  60. #with open("small.csv", "r") as file:
  61. #with open(sequences/"1.txt", "r") as f:
  62. #with open(argv[1], "r") as f:
  63. #contents = f.read()
  64. #matches = pattern2.finditer(contents)
  65. #for match in matches:
  66. #print(match)
  67. #with open(argv[1], "r") as f:
  68. #contents = f.read()
  69. #matches = pattern3.finditer(contents)
  70. #for match in matches:
  71. #print(match)
  72.  
  73. #print("Here is a list of all the times the string TATC occurs")
  74. with open(argv[1], "r") as f:
  75. count = 0
  76. contents = f.read()
  77. print("contents prints: ")
  78. print(contents)
  79. #print(contents[0:4])
  80. i = 0
  81. j = i + 4
  82. while contents[i:j]: # will read contents until end of file
  83. span = contents[i:j]
  84. #count = 0
  85. #sample output for this count
  86.  
  87. #span GAAA reapeats 0 consecutive times
  88. #span GGAG reapeats 0 consecutive times
  89. #span GGAT reapeats 0 consecutive times
  90.  
  91. while contents[i+4:j+4] == span:
  92. count += 1
  93. #print("span reapeats " + str(count) + " times" )
  94. i += 4
  95. j += 4
  96. count = 0 # resetting count to 0 as suggested
  97.  
  98. #sample output for this count
  99.  
  100. #span GAAA reapeats 0 consecutive times
  101. #span GGAG reapeats 0 consecutive times
  102. #span GGAT reapeats 0 consecutive times
  103. #count = 0
  104. #count +=1
  105. i += 4
  106. j += 4
  107. print("span " + span + " reapeats " + str(count) + " consecutive times" )
  108. #count = 0
  109. count = 0
  110. #print("span " + span + " reapeats " + str(count) + " consecutive times" )
  111.  
  112.  
Advertisement
Add Comment
Please, Sign In to add comment