Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- import sys
def main():
    """Identify whose STR profile matches a DNA sequence.

    Expects two command-line arguments: the path to a CSV database and the
    path to a text file holding the DNA sequence. The first CSV row is the
    header (a name column followed by one column per STR); every later row
    is a person's name followed by their STR repeat counts.

    Prints the name of the first person whose counts all equal the longest
    runs found in the sequence, or "No match" if nobody qualifies.
    Exits with a usage message when the argument count is wrong.
    """
    if len(sys.argv) != 3:
        sys.exit("Usage: python dna.py data.csv sequence.txt")

    # newline="" lets the csv module handle line endings itself.
    with open(sys.argv[1], newline="") as csv_file:
        database = list(csv.reader(csv_file, delimiter=",", quotechar='"'))

    # Use a context manager so the sequence file is always closed.
    with open(sys.argv[2], "r") as sequence_file:
        sequence = sequence_file.read()

    # Without a header row there is nothing to compare against.
    if not database:
        print("No match")
        return

    header = database[0]
    people = database[1:]

    # Longest consecutive run of each STR named in the header (column 0 is the name).
    profile = [longest_match(sequence, str_name) for str_name in header[1:]]

    # With no STR columns every row would match vacuously, so skip the search.
    if profile:
        for person in people:
            # Blank or malformed rows cannot be compared column-for-column.
            if len(person) != len(header):
                continue
            # zip keeps each count aligned with its own STR for every row;
            # all() stops at the first mismatching column.
            if all(int(count) == expected
                   for count, expected in zip(person[1:], profile)):
                print(person[0])
                return

    print("No match")
def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A run is a series of back-to-back, non-overlapping copies of
    ``subsequence``. Every position in ``sequence`` is tried as a possible
    start of a run, and the largest number of consecutive copies found is
    returned. Returns 0 when ``subsequence`` never occurs or is empty.
    """
    subsequence_length = len(subsequence)

    # An empty pattern matches with zero width at every position, so the
    # scan below would never advance; treat it as "no run".
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # Try each character of the sequence as the start of a run.
    for first in range(len(sequence)):
        count = 0
        position = first

        # Keep stepping one subsequence-width at a time while copies continue.
        while sequence[position:position + subsequence_length] == subsequence:
            count += 1
            position += subsequence_length

        longest_run = max(longest_run, count)

    return longest_run
# Run the program only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement