Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## Instructions
- Run this notebook from the same directory as the EMBL files that will be submitted to the Otting Lab for IPD submission. The output file will be named 'IPD_submission3_TIMESTAMP.csv', where TIMESTAMP is the run time formatted as YYYYMMDD_HHMMSS (which keeps the name unique). To use a different file-name prefix, set the `outputName` variable.
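- The only required input is a tab-delimited file with the representative animal identifiers and comments. Its first line is a header row (it is skipped), and each record line starts with '>'; other lines, such as sequence lines, are ignored. Note that `parseAnimalIDFile` below splits each record on commas, so verify that the delimiter in your file matches the code. The file is formatted as: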
- Working genomic allele name IPD Accession No. Representative Animal BLAST comments
- >Mamu-B11L*01:04:01:01 NHP02117 MD103 7 identical fosmids (Rh22777)
- CTCCCCGGACGCCTAGGATGGGGTCATGGCGCCTCGAGCCCTCCTCCTGCTGCTCTCGGGGGCCCTGGCCCTGACCGAGACCTGGG
- Enter this filename in the next cell for the `animalID_file` variable.
- To run the notebook, run the next two cells, or select 'Run All Cells' from the menu above.
- If the notebook finishes with no errors, you will see 'job done!' printed.
- **Code**
- animalID_file = '22943_MESmerizer_MHC-I_IPD_cDNA-Identical_renamed_Mamu_gDNA_alleles_9Sep19.txt'  # path of the annotation file passed to createCSVfile()
- dirname = ''  # directory searched for *.embl files; an empty string means the current working directory
- outputName = ''  # output file-name prefix; an empty string means 'IPD_submission3_'
- import glob
- import os
- import sys
- from Bio import SeqIO
- import time
- import datetime
def parseAnimalIDFile(f):
    """Parse the animal-ID annotation file into (allele name, fields) pairs.

    The first line of the file is treated as a header and skipped. Only
    lines beginning with '>' are records; every other line (for example a
    sequence line or a blank line) is ignored. The text after '>' is split
    on commas: the first field is the allele name and the remaining fields
    are kept together as a list.

    Args:
        f: Path of the annotation file to read.

    Returns:
        A list of ``(allele_name, [field, ...])`` tuples in file order, or
        ``False`` (after printing an error) if a record line has no fields
        after the allele name.
    """
    # NOTE(review): the notebook instructions describe this file as
    # tab-delimited, but records are split on commas here -- confirm which
    # delimiter the real input files use.
    records = []
    with open(f, 'r') as handle:
        next(handle, None)  # skip the header row (no-op on an empty file)
        for raw_line in handle:
            line = raw_line.rstrip('\r\n')
            # startswith() is safe on empty lines, whereas indexing line[0]
            # raised IndexError whenever the file contained a blank line.
            if not line.startswith('>'):
                continue
            name, *fields = line[1:].split(',')
            if not fields:
                print('Error with file formatting for animalID_file!')
                return False
            records.append((name, fields))
    return records
def createCSVfile(animalID_file, dirname, outputName):
    """Write an IPD submission CSV with one row per EMBL file in a directory.

    Each ``*.embl`` file in ``dirname`` contributes one row. The allele name
    is the file's base name up to the first '.', and it is looked up in the
    records parsed from ``animalID_file`` to fill in the accession number,
    animal ID and comments columns. The sequence column is taken from the
    last record that ``SeqIO.parse`` yields for the file.

    The output file is created relative to the current working directory
    (not ``dirname``) and is named ``outputName`` + timestamp + '.csv'.

    Args:
        animalID_file: Path of the annotation file, read with
            ``parseAnimalIDFile``.
        dirname: Directory searched for ``*.embl`` files; if empty, the
            current working directory is used.
        outputName: Prefix for the output file name; if empty,
            'IPD_submission3_' is used.

    Returns:
        ``True`` once the CSV has been written, or ``False`` if
        ``parseAnimalIDFile`` returned nothing usable (an error or an
        empty list).
    """
    if not outputName:
        outputName = 'IPD_submission3_'
    # A timestamp suffix keeps the output file name unique between runs.
    t_stamp_string = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d_%H%M%S')
    outputFileName = outputName + t_stamp_string + '.csv'

    # Import animalID, IPD accession, and comments.
    animalID_list = parseAnimalIDFile(animalID_file)
    if not animalID_list:
        return False

    # Index the records by allele name once instead of scanning the list for
    # every EMBL file. setdefault keeps the first record when a name repeats.
    fields_by_allele = {}
    for allele, fields in animalID_list:
        fields_by_allele.setdefault(allele, fields)

    headerString = 'Section,Submittor ID,submission number,local name,Sequence type:,Accession Number(s),Release date,status,Current Non-human Primate species,sequence ,Cell/Animal ID/Code:,Material Available:,Primary Sequencing,Secondary Sequencing,Types of PCR primers:,Sequenced in isolation,Comments'

    if not dirname:
        dirname = os.getcwd()
    # glob returns matches in arbitrary order; sorting makes the submission
    # numbers reproducible from run to run.
    files = sorted(glob.glob(os.path.join(dirname, '*.embl')))

    # Open the output once for the header and all rows.
    with open(outputFileName, 'a') as fWrite:
        fWrite.write(headerString + '\n')
        for ct, emblPath in enumerate(files, start=1):
            # The allele name is the file name up to the first '.'.
            iNameParsed = os.path.basename(emblPath).split('.')[0]
            fields = fields_by_allele.get(iNameParsed)
            if fields is None:
                print('Warning! no matching animalID found from animalID_list for ' + str(iNameParsed) + '!')
                fields = []
            # Pad so that records with fewer than three fields yield empty
            # columns instead of raising IndexError.
            accession, animal, comments = (list(fields) + ['', '', ''])[:3]

            # Import the EMBL flatfile and take the sequence of its last record.
            seq_record = list(SeqIO.parse(emblPath, "embl"))
            seq_string = str(seq_record[-1].seq)

            # The trailing empty element reproduces the row's trailing comma.
            outputString = ','.join([
                'Non-human Primates(NHP)',
                '10560G23',
                str(ct),
                str(iNameParsed),
                'Full Length genomic CDS',
                str(accession),
                'As soon as possible',
                'unpublished',
                'Rhesus macaque (Mamu)',
                seq_string,
                str(animal),
                'No Material Available',
                'Illumina NovaSeq 6000',
                '',
                'Locus specific',
                'yes',
                str(comments),
                '',
            ])
            fWrite.write(outputString + '\n')
    return True
# Notebook driver: build the CSV when an annotation file name has been
# configured, otherwise prompt the user to supply one.
if animalID_file:
    r = createCSVfile(animalID_file, dirname, outputName)
    if r:
        print('job done!')
else:
    print('Please enter an animal ID before proceeding.')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement