Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import csv
- import sys
def main():
    """Identify which person in a CSV database matches a DNA sequence.

    Expects exactly two command-line arguments: the path of a CSV database
    whose first column identifies the person (read back via the ``name``
    key) and whose remaining columns are STR names holding repeat counts,
    and the path of a text file containing the DNA sequence.

    Prints the ``name`` of the first database row whose count for every STR
    equals the longest consecutive run of that STR in the sequence, or
    ``No match`` when no row qualifies.

    Raises:
        SystemExit: with a usage message when the argument count is wrong.
    """
    # Check for command-line usage
    if len(sys.argv) != 3:
        sys.exit("Usage: python dna.py data.csv sequence.txt")

    # Read database file into a variable
    with open(sys.argv[1], "r", newline="") as db_file:
        reader = csv.DictReader(db_file)
        db_rows = list(reader)
        # Slice a copy instead of pop(0) so reader.fieldnames is not mutated;
        # the first column is the person's name, the rest are STR names.
        str_names = list(reader.fieldnames or [])[1:]

    # Read DNA sequence file into a variable
    with open(sys.argv[2], "r") as seq_file:
        dna = seq_file.read()

    # Find longest match of each STR in DNA sequence
    str_counts = {name: longest_match(dna, name) for name in str_names}

    # Check database for matching profiles. CSV values are strings, so they
    # are converted to int before comparing with the computed run lengths.
    for row in db_rows:
        if all(int(row[name]) == count for name, count in str_counts.items()):
            print(row["name"])
            return

    print("No match")
def longest_match(sequence, subsequence):
    """Return the length of the longest run of subsequence in sequence.

    A "run" is the number of back-to-back, non-overlapping copies of
    ``subsequence`` starting at some position in ``sequence``.

    Args:
        sequence: The string to scan.
        subsequence: The repeating unit to look for.

    Returns:
        The largest number of consecutive repeats found, or 0 when there is
        no occurrence or when ``subsequence`` is empty.
    """
    subsequence_length = len(subsequence)

    # An empty subsequence equals every empty slice, so the inner loop below
    # would never terminate; treat it as "no run".
    if subsequence_length == 0:
        return 0

    longest_run = 0

    # Check each character in sequence for most consecutive runs of subsequence
    for i in range(len(sequence)):
        # Count of consecutive repeats starting at position i
        count = 0

        # Slide a window forward one subsequence at a time while it keeps
        # matching; slicing past the end yields a shorter string, which
        # simply fails the comparison and ends the run.
        while True:
            start = i + count * subsequence_length
            end = start + subsequence_length
            if sequence[start:end] != subsequence:
                break
            count += 1

        # Update most consecutive matches found
        longest_run = max(longest_run, count)

    # After checking for runs at each character in sequence, return longest run found
    return longest_run
# Run the program only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement