Advertisement
joseleeph

Untitled

Dec 10th, 2020
45
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.21 KB | None | 0 0
  1. from sys import argv, exit
  2. import csv
  3. import re
  4. if len(argv) < 3:
  5. print("missing command line argument")
  6.  
  7. #INSTRUCTIONS
  8. #1. Read the sequence file into a string (you've got this).
  9.  
  10. #2. Open the csv file and get each sub string ("AGATC", "TCTG", etc)
  11. #from the first row into a list (you've essentially done this too).
  12.  
  13. #3. For each item in the list of sub strings,
  14. #look through the sequence you read in step 1
  15. #and see what the longest sequential run is.
  16. #Store this number in a list so you have the #longest runs of all sub strings.
  17.  
  18. #4. Iterate through the rest of the CSV file,
  19. #comparing the numbers in each row to the list
  20. #of numbers you created in step 3. When all
  21. #numbers match, the name from this row is the one to print.
  22. with open(argv[1],"r") as file, open(argv[2],"r") as csvfile:
  23. count = 0
  24. contents = file.read() #1. Read the sequence file into a string (you've got this).
  25. csvcontents = csv.reader(csvfile)
  26. #2. Open the csv file and get each sub string
  27. #("AGATC", "TCTG", etc) from the first row
  28. #into a list (you've essentially done this too).
  29.  
  30. header = next(csvcontents)
  31. print("header prints")
  32. print(header)
  33.  
  34. #SENTOX ADVICE:
  35. # when you loop through anything that iterates in python
  36. # objects usually have a next method that provides the next
  37.  
  38. # when you call something like
  39.  
  40. # for row in csvcontents:
  41.  
  42. # internally python will keep calling next() from csvcontents
  43. # to get each row one at a time
  44.  
  45. # since the file has just been opened, it is the first row by definintion
  46. # with csv reader, the returns a lot of values
  47. # the header is already a list of values from the first row.
  48. #appending these values into another list is reduntant
  49.  
  50. print("attempting to print rows of csvfile")
  51. for row in csvcontents:
  52. print(row)
  53. print("now header prints:")
  54. print(header)
  55. #SENTOX ADVICE:
  56. #you can use the subscript [1:] which means:
  57. #make a copy of this list starting from element 1 (which is the second element)
  58. #to the end of the list... in other words, drop the first element
  59.  
  60. # this subscript cn be used directly on the list returned by next
  61.  
  62. #complist = next(csvcontents)[1:]
  63.  
  64. #3. For each item in the list of sub strings,
  65. #look through the sequence you read
  66. #in step 1 and see what the longest
  67. #sequential run is. Store this number
  68. #in a list so you have the longest runs of all sub strings.
  69.  
  70. #complist = next(csvcontents)#[1:]
  71. #print("complist prints: ")
  72. #print(complist)
  73.  
  74. #for item in contents:
  75. for item in header[1:]:
  76. #print("item prints:")
  77. #print(item)
  78. beg = 0 # beginning index
  79. end = len(item) # item length
  80. seqrun = 0
  81. longest = 0
  82. #while contents[beg:beg+len(item)]:
  83. while beg + end <= len(contents):
  84. #if contents[beg:beg+len(item)] == item: # trying to solve the issue of end being incremented incorrectly here
  85. seqrun = 0
  86. while contents[beg + len(item): beg + len(item)] == item:
  87. seqrun += 1
  88. beg += len(item)
  89. #end += len(item) i don't need to increment end as well?
  90. if seqrun > longest:
  91. longest = seqrun
  92. end += 1
  93. #print(item + " repeats " + str(seqrun) + "times")
  94. print(item + " repeats " + str(longest) + "times") # why is it longest?
  95.  
  96.  
  97. #if seqrun > 1:
  98. #print(item + " repeats " + str(seqrun) + " times")
  99. #beg += 1
  100. #end += 1
  101. #else:
  102. #beg += 1
  103. #end += 1
  104.  
  105. # you are adding end to the beginning and itself, treating it as a proxy for len(item)
  106. # this would work, except end is no longer len(item)
  107. # it has been incremented through string over time
  108. # meaning we could be at any number... it could be index 500 of the string
  109. # so instead of 500 + 4 (if the item were 4 characters long)
  110.  
  111. # once that is fixed, a variable is needed to hold the largest
  112. # seqrun found, otherwise you will just have the most recent seqrun instead
  113.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement