Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# The folder can be unzipped in bash with: unzip 'folder name here'. Make sure unzip is installed first.
# If the unzip command is not found, install it with 'sudo apt-get install unzip' and run it again.
# Note for Paul: I had to do this the long-winded way. I will set a reminder to talk to you about finding a shorter method.
- import os # this is the module that I need to open the file
def bar_code_length(
    directory='C:\\Users\\dhaka\\OneDrive\\Desktop\\Exam one\\pauls_dna_seqs\\pauls_dna_seqs',
):
    """Report the FASTA files whose bar code has fewer than five bases.

    The bar code is taken from the first line of each ``.fasta`` file in
    *directory*, with surrounding whitespace and the leading ``>`` marker
    removed. The result is printed; nothing is returned.

    Args:
        directory: Folder that holds the FASTA files. Defaults to the
            author's local copy of the data; pass your own location.

    Raises:
        OSError: If *directory* or one of its FASTA files cannot be read.
    """
    short_bar_codes = []
    # Files are opened by full path so the process working directory is left
    # alone; sorting keeps the report order stable between runs.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.fasta'):
            continue
        # The context manager closes the file even if reading fails.
        with open(os.path.join(directory, file_name)) as fasta_file:
            header = fasta_file.readline()
        # Drop the newline and stray whitespace first, so that they are not
        # counted as bases, then remove the '>' marker and its padding.
        bar_code = header.strip().strip('> ')
        if len(bar_code) < 5:
            short_bar_codes.append(file_name)

    if not short_bar_codes:
        print('1.Seems all of them have five bases\n')
    else:
        print('These are the files that have less than five bases in the bar code\n')
        print(short_bar_codes)
def sequence_lenght(
    directory='C:\\Users\\dhaka\\OneDrive\\Desktop\\Exam one\\pauls_dna_seqs\\pauls_dna_seqs',
):
    """Report the FASTA files whose sequence has fewer than fifty bases.

    Only the second line of each ``.fasta`` file in *directory* is measured
    (the first line is skipped). The result is printed; nothing is returned.

    Args:
        directory: Folder that holds the FASTA files. Defaults to the
            author's local copy of the data; pass your own location.

    Raises:
        OSError: If *directory* or one of its FASTA files cannot be read.
    """
    short_sequences = []
    # Files are opened by full path so the process working directory is left
    # alone; sorting keeps the report order stable between runs.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.fasta'):
            continue
        # The context manager closes the file even if reading fails.
        with open(os.path.join(directory, file_name)) as fasta_file:
            fasta_file.readline()  # skip the first line
            sequence = fasta_file.readline().strip()
        if len(sequence) < 50:
            short_sequences.append(file_name)

    if not short_sequences:
        print('2.Seems all of them have fifty bases\n')
    else:
        print('2.These are the files that have less than fifty bases in the sequence\n')
        print(short_sequences)
def non_canonical_bases(
    directory='C:\\Users\\dhaka\\OneDrive\\Desktop\\Exam one\\pauls_dna_seqs\\pauls_dna_seqs',
):
    """Report the FASTA files whose sequence contains the base ``N``.

    Only the second line of each ``.fasta`` file in *directory* is inspected
    (the first line is skipped). Each offending file is listed once, however
    many ``N`` bases it contains. The result is printed; nothing is returned.

    Args:
        directory: Folder that holds the FASTA files. Defaults to the
            author's local copy of the data; pass your own location.

    Raises:
        OSError: If *directory* or one of its FASTA files cannot be read.
    """
    files_with_n = []
    # Files are opened by full path so the process working directory is left
    # alone; sorting keeps the report order stable between runs.
    for file_name in sorted(os.listdir(directory)):
        # Match on '.fasta' (with the dot) so that only real FASTA files are
        # picked up, not any name that merely ends in the letters 'fasta'.
        if not file_name.endswith('.fasta'):
            continue
        # The context manager closes the file even if reading fails.
        with open(os.path.join(directory, file_name)) as fasta_file:
            fasta_file.readline()  # skip the first line
            sequence = fasta_file.readline().strip('\n')
        # A membership test records the file once instead of once per 'N'.
        if 'N' in sequence:
            files_with_n.append(file_name)

    if not files_with_n:
        print('3.Seems non of them have non canonical bases\n')
    else:
        print('3.Here are the fasta files with non canonical bases\n')
        print(files_with_n)
# Run the three checks in order; each one prints its own report.
for run_check in (bar_code_length, sequence_lenght, non_canonical_bases):
    run_check()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement