duquesne9

Michael's DNA with notes

Oct 6th, 2020 (edited)
373
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.80 KB | None | 0 0
  1. import sys
  2. if len(sys.argv) != 3:  # asks for a valid input
  3.     print(f"Usage: python dna.py data.csv sequence.txt")
  4.     exit(1)
  5.  
  6. ''' unlike C, you don't have to declare everything at the top. I personally prefer to declare
  7. things close to where they are implemented, but I'm honestly not sure what the PEP8 standard is.
  8. I've adjusted things to fit my style, FWIW. '''
  9. # defines all the variables
  10. ''' document_file = sys.argv[1]
  11. dna_file = sys.argv[2]
  12. text = open(dna_file)
  13. people = open(document_file) '''
  14.  
  15. ''' "open(file)" is almost always a sniff, instead use a context manager
  16. https://www.youtube.com/watch?v=C-gEQdGVXbk&feature=emb_logo&t=4m25s '''
  17.  
  18. with open(sys.argv[2], newline='') as f:
  19.     text = f.read()
  20. with open(sys.argv[1], newline='') as f:
  21.     people = f.read()
  22.  
  23. ''' these are unused
  24. temp_list = []
  25. list_of_people = [] '''
  26.  
  27. ''' dna_list = []
  28. for line in people:  # adds all of the sentences into a list
  29.    dna_list.append(line) '''
  30. ''' This is a list comprehension, one of the handier tools in python
  31.    https://www.pythonforbeginners.com/basics/list-comprehensions-in-python '''
  32. dna_list = [line for line in people]
  33.  
  34. ''' letters = []
  35. for words in text:  # makes a list of all the dna letters in the letters file
  36.    for letter in words:
  37.        letters.append(letter) '''
  38.  
  39. ''' list comprehensions are normally more readable, but I'm not sure that is true when nesting like this.
  40. I decided to leave it more for exposure than suggesting it's best practices.
  41. I found the syntax here:
  42. https://www.geeksforgeeks.org/nested-list-comprehensions-in-python/ '''
  43. letters = [letter for word in text for letter in word]
  44.  
  45. ''' for i in range(len(dna_list)):  # makes the dna_list into a string
  46.    peoples_dna += dna_list[i]
  47. peoples_dna = peoples_dna.strip() '''
  48.  
  49. ''' peoples_dna = ''
  50. for _, val in enumerate(dna_list):  # makes the dna_list into a string
  51.    peoples_dna += val
  52. peoples_dna = peoples_dna.strip() '''
  53.  
  54. ''' .join method - https://stackoverflow.com/questions/493819/why-is-it-string-joinlist-instead-of-list-joinstring '''
  55. peoples_dna = ''.join([val for _, val in enumerate(dna_list)]).strip()
  56.  
  57. ''' Don't overwrite reserved keywords like 'list', 'dict', 'str', it can have bad consequences
  58. list = [] '''
  59. '''lst = []
  60. for sentence in peoples_dna.splitlines():  # makes a double list where there is few list in a list
  61.    lst.append(sentence.split(',')) '''
  62.  
  63. lst = [sentence.split(',') for sentence in peoples_dna.splitlines()]
  64.  
  65.  
  66. def get_repeat_list(letters=[], lst=[]):
  67.  
  68.     dna_repeated_list = []
  69.     for m in range(len(lst[0]) - 1):
  70.         ''' by putting total_appear_time_list here you automatically reset it on each loop, meaning you don't
  71.        have to do it manually at the end. The same applies for total_times '''
  72.         total_appear_time_list = []
  73.         m = m + 1
  74.         # checks the number of times something appeared in the list
  75.         for i in range(len(letters) - 3):
  76.             total_appear_times = 0
  77.             word = ""  # makes it go be nothing
  78.  
  79.             try:  # if it gets to the end of the dna line it will make a list of range not found
  80.                 ''' for a in range(len(lst[0][m])):
  81.  
  82.                    word += letters[a + i] '''
  83.                 for idx, _ in enumerate(lst[0][m]):
  84.  
  85.                     word += letters[idx + i]
  86.             except:
  87.                 break
  88.             while word == lst[0][m]:  # does it until the word is not the dna part
  89.  
  90.                 total_appear_times += 1
  91.  
  92.                 word = ''
  93.  
  94.                 try:
  95.                     i = i + len(lst[0][m])
  96.  
  97.                     for a in range(len(lst[0][m])):
  98.  
  99.                         word += letters[a + i]
  100.  
  101.                 except:  # remember to check if there is a error 'list index out of range'
  102.                     break  # exits the loop becuase it is at the end of the dna line
  103.             # makes the number of repetion into a list
  104.             total_appear_time_list.append(total_appear_times)
  105.  
  106.             ''' this sort is costing you a lot, and is unnecessary because of the max() method for lists, see below
  107.            # sorts it from largest to lowest
  108.            sorted_list = sorted(total_appear_time_list, reverse=True) '''
  109.  
  110.             # makes the count back to zero for the amount of words in the dna file
  111.             '''total_appear_times = 0 '''
  112.  
  113.         # makes another list to store the biggest number of times the dna string has appeared
  114.         ''' dna_repeated_list.append(sorted_list[0]) '''
  115.         dna_repeated_list.append(max(total_appear_time_list))
  116.  
  117.         # it sets the list of all the times the dna pattern has been found
  118.         ''' total_appear_time_list = [] '''
  119.  
  120.     return dna_repeated_list
  121.  
  122.  
  123. dna_repeated_list = get_repeat_list(letters=letters, lst=lst)
  124.  
  125.  
  126. def get_match(lst=[], dna_repeated_list=[]):
  127.  
  128.     # loop through the dna file with the people on it checking all the verticle rows except the first
  129.     for m in range(len(lst) - 1):
  130.         adding = 0
  131.         m = m + 1
  132.         # loop through the dna file with the people on it to checking the numbers but doesn't check the names
  133.         for q in range(len(lst[1]) - 1):
  134.             q = q + 1  # makes the variable q start at 1
  135.             lst[m][q] = int(lst[m][q])  # makes it an int to compare it
  136.             # compares the double list to the singular list with all the maximumun number of times the pattern appears in a row
  137.             if lst[m][q] != dna_repeated_list[q - 1]:
  138.                 adding += 1  # if it does not match then add it by one
  139.         if adding == 0:  # if it all matches then print the name of the person
  140.             print(f"{lst[m][0]}")
  141.             exit(1)
  142.         # adding = 0  # makes the variable adding go back to zero
  143.  
  144.     print(f"No match")  # if it has not been found
  145.  
  146.  
  147. get_match(lst=lst, dna_repeated_list=dna_repeated_list)
  148.  
Add Comment
Please, Sign In to add comment