Untitled

## Instructions
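Run this notebook from the same directory as the EMBL files that will be sent to the Otting Lab for IPD submission. The output file is named 'IPD_submission3_TIMESTAMP.csv', where TIMESTAMP is the time of the run (formatted as YYYYMMDD_HHMMSS), which keeps each output file name unique. To use a different file name prefix, set the `outputName` variable in the next cell; the timestamp and the '.csv' extension are still appended.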

The only required input is a tab-delimited file with the representative animal Identifiers and Comments, formatted as:

      Working genomic allele name	IPD Accession No.	Representative Animal	BLAST comments
      >Mamu-B11L*01:04:01:01	NHP02117	MD103	7 identical fosmids (Rh22777)
      CTCCCCGGACGCCTAGGATGGGGTCATGGCGCCTCGAGCCCTCCTCCTGCTGCTCTCGGGGGCCCTGGCCCTGACCGAGACCTGGG

Enter this filename in the next cell for the `animalID_file` variable.

To run the notebook, run the next two cells, or select 'Run All Cells' from the menu above.

If the notebook finishes with no errors, 'job done!' is printed.

**Code**

        # Path of the animal ID file (required); it is passed to createCSVfile,
        # which reads it with parseAnimalIDFile.
        animalID_file = '22943_MESmerizer_MHC-I_IPD_cDNA-Identical_renamed_Mamu_gDNA_alleles_9Sep19.txt'
        # Directory searched for *.embl files; when left empty, the current
        # working directory is used.
        dirname = ''
        # Prefix of the output CSV file name; when left empty, 'IPD_submission3_'
        # is used. A timestamp and '.csv' are appended to the prefix.
        outputName = ''

        import glob
        import os
        import sys
        from Bio import SeqIO
        import time
        import datetime

        def parseAnimalIDFile(f):
            """Read the allele records from the animal ID file.

            The first line of the file is treated as a header and skipped. Only
            lines starting with '>' are parsed; every other line (sequence lines,
            blank lines) is ignored. A record line is split on tabs when it
            contains a tab, otherwise on commas.

            Args:
                f: path of the animal ID file.

            Returns:
                A list of (allele_name, remaining_fields) tuples, one per record
                line, where remaining_fields is the list of fields after the
                allele name. Returns False (after printing an error) as soon as
                a record line has no field besides the allele name.
            """
            entries = []
            with open(f, 'r') as fOpen:
                # Skip the header line; the default keeps an empty file from raising.
                next(fOpen, None)
                for line in fOpen:
                    line = line.rstrip('\r\n')
                    # startswith() is safe on blank lines, unlike indexing line[0].
                    if not line.startswith('>'):
                        continue
                    record = line[1:]
                    # The documented format is tab-delimited; comma-delimited
                    # records are still accepted for backward compatibility.
                    delimiter = '\t' if '\t' in record else ','
                    fields = record.split(delimiter)
                    if len(fields) < 2:
                        print('Error with file formatting for animalID_file!')
                        return False
                    entries.append((fields[0], fields[1:]))
            return entries


        def createCSVfile(animalID_file, dirname, outputName):
            """Write the IPD submission CSV, one row per EMBL file found in dirname.

            The output file is created in the current working directory and is
            named outputName + timestamp + '.csv'. Each '*.embl' file contributes
            one row; the part of its file name before the first '.' is used as
            the allele name and is matched against the entries returned by
            parseAnimalIDFile to fill in the accession number, animal ID and
            comments. Files are processed in sorted order so that submission
            numbers are reproducible between runs.

            Args:
                animalID_file: path of the animal ID file.
                dirname: directory searched for '*.embl' files; an empty value
                    means the current working directory (using another
                    directory is not recommended).
                outputName: prefix of the output file name; an empty value means
                    'IPD_submission3_'.

            Returns:
                True once the CSV has been written, False when the animal ID file
                could not be parsed or contained no entries.
            """
            # Local import: csv is needed only here, to quote fields correctly.
            import csv

            if not outputName:
                outputName = 'IPD_submission3_'
            # The timestamp suffix keeps the output file name unique per run.
            t_stamp_string = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
            outputFileName = outputName + t_stamp_string + '.csv'

            # import animalID, IPD accession, and comments
            animalID_list = parseAnimalIDFile(animalID_file)
            if not animalID_list:
                return False

            # Index the entries by allele name; setdefault keeps the first entry
            # when an allele name occurs more than once.
            details_by_allele = {}
            for allele, details in animalID_list:
                details_by_allele.setdefault(allele, details)

            # get a list of the EMBL files in the directory
            if not dirname:
                dirname = os.getcwd()
            # glob returns files in arbitrary order; sort for stable numbering.
            files = sorted(glob.glob(os.path.join(dirname, '*.embl')))

            headerString = 'Section,Submittor ID,submission number,local name,Sequence type:,Accession Number(s),Release date,status,Current Non-human Primate species,sequence ,Cell/Animal ID/Code:,Material Available:,Primary Sequencing,Secondary Sequencing,Types of PCR primers:,Sequenced in isolation,Comments'
            with open(outputFileName, 'a') as fWrite:
                fWrite.write(headerString + '\n')
                # lineterminator='\n' keeps the same line endings as the header line.
                writer = csv.writer(fWrite, lineterminator='\n')

                ct = 1
                for emblPath in files:
                    # the allele name is the file name up to its first '.'
                    iNameParsed = os.path.basename(emblPath).split('.')[0]
                    details = details_by_allele.get(iNameParsed)
                    if details is None:
                        print('Warning! no matching animalID found from animalID_list for ' + str(iNameParsed) + '!')
                        details = []
                    # Pad so that entries with fewer than three fields do not
                    # raise IndexError; missing fields are written as empty.
                    accession, animal, comments = (list(details) + ['', '', ''])[:3]

                    # import the EMBL flatfile, then take the sequence of its last record
                    seq_records = list(SeqIO.parse(emblPath, "embl"))
                    if not seq_records:
                        print('Warning! no sequence record found in ' + str(emblPath) + '; file skipped.')
                        continue
                    seq_string = str(seq_records[-1].seq)

                    # csv.writer quotes any field containing a comma or quote, so
                    # such a field cannot shift the remaining columns.
                    writer.writerow([
                        'Non-human Primates(NHP)',
                        '10560G23',
                        str(ct),
                        str(iNameParsed),
                        'Full Length genomic CDS',
                        str(accession),
                        'As soon as possible',
                        'unpublished',
                        'Rhesus macaque (Mamu)',
                        seq_string,
                        str(animal),
                        'No Material Available',
                        'Illumina NovaSeq 6000',
                        '',
                        'Locus specific',
                        'yes',
                        str(comments),
                        '',  # keeps the trailing comma that each row has always had
                    ])
                    ct += 1
            return True

        # Entry point of the notebook: build the CSV when an animal ID file name
        # has been set, otherwise ask the user to provide one.
        if animalID_file:
            r = createCSVfile(animalID_file, dirname, outputName)
            # createCSVfile returns a falsy value when nothing was written.
            if r:
                print('job done!')
        else:
            print('Please enter an animal ID before proceeding.')