Advertisement
Guest User

Untitled

a guest
Oct 16th, 2019
93
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.04 KB | None | 0 0
  1. ## Instructions
  2. Run this notebook from the same directory as the EMBL files that will be submitted to the Otting Lab for IPD submissions. The output file name will be 'IPD_submission3_TIMESTAMP.csv', where TIMESTAMP is a unique identifier, but you may modify this to another filename if you wish.
  3.  
  4. The only required input is a tab-delimited file with the representative animal Identifiers and Comments, formatted as:
  5.  
  6. Working genomic allele name IPD Accession No. Representative Animal BLAST comments
  7. >Mamu-B11L*01:04:01:01 NHP02117 MD103 7 identical fosmids (Rh22777)
  8. CTCCCCGGACGCCTAGGATGGGGTCATGGCGCCTCGAGCCCTCCTCCTGCTGCTCTCGGGGGCCCTGGCCCTGACCGAGACCTGGG
  9.  
  10. Enter this filename in the next cell for the `animalID_file` variable.
  11.  
  12. To run the notebook, run the next two cells, or select 'Run All Cells' from the menu above.
  13.  
  14. If the notebook finishes with no errors, you will see 'job done!' appear.
  15.  
  16. **Code**
  17.  
# --- User settings -----------------------------------------------------------
# Name of the animal-ID table that is passed to createCSVfile() below.
# If left empty, the driver code at the bottom prints a prompt and does nothing.
animalID_file = '22943_MESmerizer_MHC-I_IPD_cDNA-Identical_renamed_Mamu_gDNA_alleles_9Sep19.txt'
# Directory searched for '*.embl' files. When empty, createCSVfile() uses the
# current working directory (os.getcwd()).
dirname = ''
# Prefix of the output CSV file name. When empty, createCSVfile() uses
# 'IPD_submission3_'; a timestamp and '.csv' are always appended.
outputName = ''
  21.  
  22. import glob
  23. import os
  24. import sys
  25. from Bio import SeqIO
  26. import time
  27. import datetime
  28.  
  29. def parseAnimalIDFile(f):
  30. l = []
  31. header = True
  32. with open(f, 'r') as fOpen:
  33. for i in fOpen:
  34. if header:
  35. header = False
  36. continue
  37. i = i.rstrip('\r\n')
  38. if i[0] == '>':
  39. iLine = i[1:]
  40. iSplit = iLine.split(',')
  41. if len(iSplit) == 0 or len(iSplit) == 1:
  42. print('Error with file formatting for animalID_file!')
  43. return False
  44. else:
  45. l.append((iSplit[0], iSplit[1:]))
  46. return l
  47.  
  48.  
  49.  
  50.  
  51. def createCSVfile(animalID_file, dirname, outputName):
  52. if not outputName:
  53. outputName = 'IPD_submission3_'
  54. # create file name with timestamp so that it is unique
  55. # If you wish to use a different directory (NOT RECOMMENDED), then enter a value for dirname in the next line.
  56. t_stamp = time.time()
  57. t_stamp_string = datetime.datetime.fromtimestamp(t_stamp).strftime('%Y%m%d_%H%M%S')
  58. outputFileName = outputName + t_stamp_string + '.csv'
  59.  
  60. # import animalID, IPD accession, and comments
  61. animalID_list = parseAnimalIDFile(animalID_file)
  62. if not animalID_list:
  63. return False
  64.  
  65. # write header to output csv file
  66. headerString = 'Section,Submittor ID,submission number,local name,Sequence type:,Accession Number(s),Release date,status,Current Non-human Primate species,sequence ,Cell/Animal ID/Code:,Material Available:,Primary Sequencing,Secondary Sequencing,Types of PCR primers:,Sequenced in isolation,Comments'
  67. with open(outputFileName, 'a') as fWrite:
  68. fWrite.write(headerString + '\n')
  69.  
  70. # get a list of the EMBL files in the directory
  71. if not dirname:
  72. dirname = os.getcwd()
  73. dirnamePath = dirname + '/*.embl'
  74. files = glob.glob(dirnamePath)
  75.  
  76. ct = 1
  77. for i in files:
  78. # extract the allele name for use in the output file and matching to the animalID_list
  79. iName = os.path.basename(i)
  80. iNameList = iName.split('.')
  81. iNameParsed = iNameList[0]
  82. res_tuple = list(filter(lambda x: x[0] == iNameParsed, animalID_list))
  83. tupleParsedAsList = []
  84. if not res_tuple:
  85. print('Warning! no matching animalID found from animalID_list for ' + str(iNameParsed) + '!')
  86. tupleParsedAsList = ['','','']
  87. else:
  88. tupleParsedAsList = res_tuple[0][1]
  89. # import the EMBL flatfile, then parse out the sequence from it
  90. seq_record = list(SeqIO.parse(i, "embl"))
  91. seq_string = str(seq_record[-1].seq)
  92. # create the csv line for the allele, then write it to the output file
  93. outputString = 'Non-human Primates(NHP),10560G23,' + str(ct) + ',' + str(iNameParsed) + ',' + 'Full Length genomic CDS,' + str(tupleParsedAsList[0]) + ',As soon as possible,unpublished,Rhesus macaque (Mamu),' + str(seq_string) + ',' + str(tupleParsedAsList[1]) + ',' + 'No Material Available,Illumina NovaSeq 6000,,Locus specific,yes,' + str(tupleParsedAsList[2]) + ','
  94. with open(outputFileName, 'a') as fWrite:
  95. fWrite.write(outputString + '\n')
  96. ct += 1
  97. return True
  98.  
  99. if not animalID_file:
  100. print('Please enter an animal ID before proceeding.')
  101. else:
  102. r = createCSVfile(animalID_file, dirname, outputName)
  103. if r:
  104. print('job done!')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement