# Untitled

from sys import argv, exit
import re
# Bail out early when no input path was supplied: argv[0] is the script
# itself, so anything after it being empty means there is nothing to read.
if not argv[1:]:
    print("mising command-line argument")
    exit(1)


# take a sequence of dna and
#determine the person
#it probably belongs to

# a dna sequence is a sequence of
# nucleotide bases with 4 bases
# with the initials cgta

#a short tandem repeat, str,
#a sequence of dna that repeats consecutively at particular location
#strs vary a lot in the population
#different people have different strs repeated a different number of times

#For any STR, people in a population
#vary in how many times that
#particular STR repeats consecutively

#matching str counts can be used to identify who a sample of dna belongs to

#if we count for each str how many times each str repeats consecutively from a sample of dna, and it
#matches an existing piece of dna from a data base
# it's very likely that the two pieces of dna came from the same person

# you will do something similar
# you will have a DNA database formatted as a csv file
# each row corresponds to a person, each column to an str, a sequence of DNA that repeats
# we can take the csv file and format it as a table

#  the table will have columns labeled for the DNA sequences AGAT, AATG, and TATC
# the rows will be labeled with the names of the people
# the intersection will be the number of times the sequence appears


# note to self, text 5 probably belongs to alice
# think about matching substrings to the sequence

# what does small.csv represent the count of?
# i think because it's small it's the number of consecutive times that sequence
# will appear for that person in the small sequence? which sequence?
# the number of times it matches in the large dictionary?
# try to pull up the small dictionary and count how many times the sequence repeats
# check the start and end of sequence for each sequence and see how long it is


# you will have a csv file representing the data
# you will also have a dna sequence formatted as a text file called sequence.txt
# you must take the dna sequence and figure out if it matches all the str counts
# for any person in the dna database given
# Pre-compiled matchers for the three STRs of interest, in the order
# AGAT, AATG, TATC.
pattern1, pattern2, pattern3 = map(
    re.compile,
    (r"AGAT", r"AATG", r"TATC"),
)
#with open("small.csv", "r") as file:
#with open(sequences/"1.txt", "r") as f:
#with open(argv[1], "r") as f:
    #contents = f.read()
    #matches = pattern2.finditer(contents)
    #for match in matches:
        #print(match)
#with open(argv[1], "r") as f:
    #contents = f.read()
    #matches = pattern3.finditer(contents)
    #for match in matches:
        #print(match)

#print("Here is a list of all the times the string TATC occurs")
def count_span_repeats(contents: str, width: int = 4) -> list:
    """Scan *contents* in fixed-width windows and count immediate repeats.

    The text is walked left to right in windows of ``width`` characters,
    starting at offset 0 (so only repeats aligned to that grid are seen).
    For each window ("span"), the windows that directly follow it are
    consumed for as long as they are identical to the span.

    Args:
        contents: The text to scan. It is used as-is; any whitespace such
            as a trailing newline is treated as ordinary characters and
            may end up inside the last span.
        width: Number of characters per window. Must be at least 1.

    Returns:
        A list of ``(span, count)`` tuples in scan order, where ``count``
        is how many times the span is repeated immediately after its first
        occurrence (0 when the next window differs). A final span shorter
        than ``width`` is reported with a count of 0.

    Raises:
        ValueError: If ``width`` is less than 1.
    """
    if width < 1:
        raise ValueError("width must be at least 1")

    results = []
    i = 0
    while contents[i:i + width]:  # an empty slice means end of text
        span = contents[i:i + width]
        # Reset once per span, *before* looking ahead. The earlier version
        # reset the counter inside the look-ahead loop, right after
        # incrementing it, so every span was reported with 0 repeats.
        count = 0
        while contents[i + width:i + 2 * width] == span:
            count += 1
            i += width  # skip over the repeat that was just counted
        results.append((span, count))
        i += width  # move on to the first window that differs
    return results


if __name__ == "__main__":
    # Read the whole sequence file up front so the handle is released
    # before any processing or printing happens.
    with open(argv[1], "r") as f:
        contents = f.read()

    print("contents prints: ")
    print(contents)

    for span, count in count_span_repeats(contents):
        print("span " + span + " reapeats " + str(count) + " consecutive times")