joseleeph

Untitled

Dec 10th, 2020
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.18 KB | None | 0 0
  1. from sys import argv, exit
  2. import csv
  3. import re
  4. if len(argv) < 3:
  5. print("mising command-line argument")
  6. exit(1)
  7.  
  8. #INSTRUCTIONS
  9.  
  10. #1. Read the sequence file into a string (you've got this).
  11.  
  12. #2. Open the csv file and get each sub string ("AGATC", "TCTG", etc) from the first row into a list (you've essentially done this too).
  13.  
  14. #3. For each item in the list of sub strings, look through the sequence you read in step 1 and see what the longest sequential run is. Store this number in a list so you have the longest runs of all sub strings.
  15.  
  16. #4. Iterate through the rest of the CSV file, comparing the numbers in each row to the list of numbers you created in step 3. When all numbers match, the name from this row is the one to print.
  17.  
  18.  
  19. with open(argv[1],"r") as file, open(argv[2],"r") as csvfile: # you may need to swap incdices
  20. count = 0
  21. contents = file.read() #1. Read the sequence file into a string (you've got this).
  22. csvcontents = csv.reader(csvfile)
  23.  
  24. #2. Open the csv file and get each sub string
  25. #("AGATC", "TCTG", etc) from the first row
  26. #into a list (you've essentially done this too).
  27.  
  28. header = next(csvcontents) # this is just a way to get the next row one time
  29. # see what next header does
  30. print("header prints:")
  31. print(header)
  32.  
  33.  
  34. #SENTOX ADVICE:
  35. # when you loop through anything that iterates in python
  36. # objects usually have a next method that provides the next
  37.  
  38. # when you call something like
  39.  
  40. # for row in csvcontents:
  41. # internally python will keep calling next() from csvcontents to get each row one at a time
  42.  
  43. # since the file has just been opened, it is the first row by definintion
  44. # with csv reader, the returns a lot of values
  45. # the header is already a list of values from the first row. appending these values into another list is reduntant
  46.  
  47. print("attempting to print rows of csv file")
  48. for row in csvcontents:
  49. print(row)
  50.  
  51.  
  52. #SENTOX ADVICE:
  53. #you can use the subscript [1:] which means:
  54. #make a copy of this list starting from element 1 (which is the second element)
  55. #to the end of the list... in other words, drop the first element
  56.  
  57. # this subscript cn be used directly on the list returned by next
  58.  
  59. #complist = next(csvcontents)[1:]
  60.  
  61. #3. For each item in the list of sub strings,
  62. #look through the sequence you read
  63. #in step 1 and see what the longest
  64. #sequential run is. Store this number
  65. #in a list so you have the longest runs of all sub strings.
  66.  
  67.  
  68.  
  69. #for item in sublist[1:]:
  70.  
  71. #while contents[beg:end]:
  72.  
  73. for item in contents: # look at each item in the list
  74. beg = 0 # beginning index
  75. end = len(item) # ending index
  76. seqrun = 0 # number of times the sequence runs/repeats
  77. longest = 0
  78. while contents[beg:beg+len(item)]: # while the substring of contents from beginning to end have values
  79. #while contents[beg:end]: # while the substring of contents from beginning to end have values
  80. if contents[beg:beg+len(item)] == item: # if the span of contents from beg to beg plus the length of the item
  81. seqrun = 1 # it occurs at least once
  82. while contents[beg + len(item):end + len(item)] == item: # as long as the contents span from beginning to end matches the contents span when incremented by end
  83. seqrun += 1
  84. beg += len(item)
  85. end += len(item)
  86. if seqrun > longest:
  87. longest = seqrun
  88. if seqrun > 1:
  89. print(item + " repeats " + str(seqrun) + " times")
  90. beg += 1
  91. end += 1
  92. else:
  93. beg += 1
  94. end += 1
  95. # SENTOX ADVICE
  96.  
  97. # you are adding end to the beginning and itself, treating it as a proxy for len(item)
  98. # this would work, except end is no longer len(item)
  99. # it has been incremented through string over time
  100. # meaning we could be at any number... it could be index 500 of the string
  101. # so instead of 500 + 4 (if the item were 4 characters long)
  102.  
  103. # once that is fixed, a variable is needed to hold the largest
  104. # seqrun found, otherwise you will just have the most recent seqrun instead
  105.  
  106.  
Advertisement
Add Comment
Please, Sign In to add comment