# Untitled

from sys import argv, exit
import csv
import re
# Two command line arguments are required: argv[1] is opened as the sequence
# file and argv[2] as the CSV file further down in this script.
if len(argv) < 3:
    print("missing command line argument")
    # Stop here; carrying on would only fail later with an IndexError when
    # argv[1] / argv[2] are read.
    exit(1)

    #INSTRUCTIONS
    #1. Read the sequence file into a string (you've got this).

    #2. Open the csv file and get each sub string ("AGATC", "TCTG", etc)
    #from the first row into a list (you've essentially done this too).

    #3. For each item in the list of sub strings,
    #look through the sequence you read in step 1
    #and see what the longest sequential run is.
    #Store this number in a list so you have the longest runs of all sub strings.

    #4. Iterate through the rest of the CSV file,
    #comparing the numbers in each row to the list
    #of numbers you created in step 3. When all
    #numbers match, the name from this row is the one to print.
def longest_run(sequence, pattern):
    """Return the longest run of back-to-back copies of *pattern* in *sequence*.

    A run is counted from a start index by stepping forward ``len(pattern)``
    characters at a time for as long as the next slice equals *pattern*.
    Every start index is tried, so a run that begins in the middle of an
    earlier, shorter run is still found.

    Returns 0 when *pattern* never occurs, or when *pattern* is empty.

    >>> longest_run("AGATCAGATCTT", "AGATC")
    2
    >>> longest_run("ABABAABA", "ABA")
    2
    """
    width = len(pattern)
    if width == 0:
        # An empty pattern "matches" everywhere without ever advancing, which
        # would loop forever below; treat it as having no run at all.
        return 0

    longest = 0
    for start in range(len(sequence) - width + 1):
        repeats = 0
        pos = start
        # The window is always exactly `width` characters wide; only its
        # position moves.  (Widening the window instead of moving it means
        # the slice can never equal the pattern.)
        while sequence[pos:pos + width] == pattern:
            repeats += 1
            pos += width
        # Keep the best run seen so far, not just the most recent one.
        longest = max(longest, repeats)
    return longest


def main():
    """Print the CSV contents and the longest run of each header sub string.

    The sequence text is read from the file named by ``argv[1]`` and the CSV
    data from the file named by ``argv[2]``.  Every value of the first CSV
    row except the first one is treated as a sub string to look for in the
    sequence.
    """
    # newline="" is how the csv module documentation says to open a file that
    # is handed to csv.reader.
    with open(argv[1], "r") as file, open(argv[2], "r", newline="") as csvfile:
        # 1. Read the sequence file into a string.
        contents = file.read()

        # 2. Get each sub string ("AGATC", "TCTG", etc) from the first row.
        # A csv reader is an iterator: next() hands back one row at a time,
        # and since the file has just been opened that is the first row.  The
        # row is already a list of values, so appending those values to
        # another list would be redundant.
        csvcontents = csv.reader(csvfile)
        header = next(csvcontents, [])  # [] when the CSV file has no rows
        print("header prints")
        print(header)

        # A for loop keeps calling next() on the reader internally, so this
        # walks the rows that remain after the header.
        print("attempting to print rows of csvfile")
        for row in csvcontents:
            print(row)
        print("now header prints:")
        print(header)

        # 3. For each sub string, find its longest sequential run in the
        # sequence.  header[1:] is a copy of the list starting from element 1
        # (the second element), i.e. the header without its first value.
        for item in header[1:]:
            longest = longest_run(contents, item)
            print(item + " repeats " + str(longest) + "times")

        # TODO: step 4 is not implemented yet -- compare the numbers in each
        # remaining CSV row with the longest runs and print the name from the
        # row where all of them match.  Note that the row loop above has
        # already consumed the reader, so the rows must be kept (for example
        # in a list) to be compared here.


if __name__ == "__main__":
    main()