Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from sys import argv, exit
- import re
def require_argument(args):
    """Exit with status 1 unless a command-line argument was supplied.

    Args:
        args: Argument vector to check (normally ``sys.argv``). Its first
            entry is the script name, so fewer than two entries means
            nothing was passed on the command line.

    Raises:
        SystemExit: With exit code 1, after printing a short message, when
            no argument is present.
    """
    if len(args) < 2:
        print("missing command-line argument")
        exit(1)


# Enforce the check only when executed as a script, so that importing this
# module can never terminate the importing process.
if __name__ == "__main__":
    require_argument(argv)
- # take a sequence of dna and
- #determine the person
- #it probably belongs to
- # a dna sequence is a sequence of
- # nucleotide bases with 4 bases
- # with the initials cgta
- #a short tandem repeat, str,
- #a sequence of dna that repeats consecutively at particular location
- #strs vary a lot in the population
- #different people have different strs repeated a different number of times
- #For any STR, people in a population
- #vary in how many times that
- #particular STR repeats consecutively
- #matching str counts can be used to identify who a sample of dna belongs to
- #if we count for each str how many times each str repeats consecutively from a sample of dna, and it
- #matches an existing piece of dna from a data base
- # it's very likely that the two pieces of dna came from the same person
- # you will do something similar
- # you will have a DNA database formatted as a csv file
- # each row corresponds to a person, each column to an str, a sequence of DNA that repeats
- # we can take the csv file and format it as a table
- # the table will have columns labeled for the DNA sequences AGAT, AATG, and TATC
- # the rows will be labeled with names of the dna sequence
- # the intersection will be the number of times the sequence appears
- # note to self, text 5 probably belongs to alice
- # think about matching substrings to the sequence
- # what does small.csv represent the count of?
- # i think because it's small it's the number of consecutive times that sequence
- # will appear for that person in the small sequence? which sequence?
- # the number of matches in the large dictionary?
- # try to pull up the small dictionary and count how many times the sequence repeats
- # check the start and end of sequence for each sequence and see how long it is
- # you will have a csv file representing the data
- # you will also have a DNA sequence formatted as a text file called sequence.txt
- # you must take the dna sequence and figure out if it matches all the str counts
- # for any person in the dna database given
# Pre-compiled regular expressions, one per four-base motif (AGAT, AATG,
# TATC). In the code visible here they are referenced only by the
# commented-out finditer experiments that follow.
pattern1 = re.compile(r'AGAT')
pattern2 = re.compile(r'AATG')
pattern3 = re.compile(r'TATC')
- #with open("small.csv", "r") as file:
- #with open(sequences/"1.txt", "r") as f:
- #with open(argv[1], "r") as f:
- #contents = f.read()
- #matches = pattern2.finditer(contents)
- #for match in matches:
- #print(match)
- #with open(argv[1], "r") as f:
- #contents = f.read()
- #matches = pattern3.finditer(contents)
- #for match in matches:
- #print(match)
- #print("Here is a list of all the times the string TATC occurs")
def count_repeats(contents, start, width=4):
    """Count the back-to-back copies that follow the span at ``start``.

    The span is ``contents[start:start + width]``. The result is the number
    of additional, immediately adjacent copies of that span, so a span that
    occurs once on its own yields 0 and the total run length is the result
    plus one.

    Args:
        contents: The text to examine.
        start: Index at which the span begins.
        width: Number of characters in a span.

    Returns:
        The number of extra consecutive copies (0 if the span is empty).
    """
    span = contents[start:start + width]
    if not span:
        # An empty span would compare equal to the empty slices past the end
        # of the text forever; there is nothing to count.
        return 0
    repeats = 0
    following = start + width
    while contents[following:following + width] == span:
        repeats += 1
        following += width
    return repeats


def scan_spans(contents, width=4):
    """Walk ``contents`` window by window and measure each span's repeats.

    Windows start at index 0; after a span is measured the walk resumes
    right after the last copy of that span. A final window shorter than
    ``width`` is still reported.

    NOTE(review): because every window starts ``width`` characters (or a
    whole run) after the previous one, a run that begins at any other
    offset is not detected.

    Args:
        contents: The text to scan.
        width: Number of characters per window; must be at least 1.

    Returns:
        A list of ``(span, repeats)`` tuples in scan order, where
        ``repeats`` is the value returned by ``count_repeats``.

    Raises:
        ValueError: If ``width`` is smaller than 1.
    """
    if width < 1:
        raise ValueError("width must be at least 1")
    results = []
    position = 0
    while contents[position:position + width]:
        span = contents[position:position + width]
        # The counter starts fresh for every span and is read before
        # anything clears it; clearing it ahead of the report is what made
        # every span show 0 repeats.
        repeats = count_repeats(contents, position, width)
        results.append((span, repeats))
        position += (repeats + 1) * width
    return results


def main():
    """Print the file named by ``argv[1]`` and the repeats of each span."""
    # Read everything up front so the file is closed before the scan starts.
    with open(argv[1], "r") as f:
        contents = f.read()
    print("contents prints: ")
    print(contents)
    # Surrounding whitespace (typically the trailing newline) is not part of
    # the sequence and must not be reported as a span.
    for span, repeats in scan_spans(contents.strip()):
        print("span " + span + " repeats " + str(repeats) + " consecutive times")


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment