Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "import os\n",
- "import pandas as pd\n",
- "# requires dammit env\n",
- "# source activate dammit\n",
- "from dammit.fileio.gff3 import GFF3Parser"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "L_goodei_transfer_2\n",
- "L_goodei_BW_3\n",
- "L_goodei_BW_1\n",
- "L_goodei_transfer_3\n",
- "L_goodei_transfer_1\n",
- "L_goodei_BW_2\n",
- "L_goodei_FW_2\n",
- "L_goodei_FW_1\n",
- "L_goodei_FW_3\n",
- "F_notti_FW_2\n",
- "F_notti_FW_1\n",
- "L_parva_FW_3\n",
- "L_parva_FW_1\n",
- "L_parva_transfer_3\n",
- "L_parva_transfer_1\n",
- "L_parva_FW_2\n",
- "L_parva_transfer_2\n",
- "L_parva_BW_2\n",
- "L_parva_BW_1\n",
- "L_parva_BW_3\n",
- "F_similis_transfer_1\n",
- "F_similis_BW_1\n",
- "F_similis_transfer_3\n",
- "F_similis_BW_3\n",
- "F_similis_BW_2\n",
- "F_similis_transfer_2\n",
- "F_similis_FW_2\n",
- "F_similis_FW_3\n",
- "F_similis_FW_1\n",
- "F_olivaceous_FW_2\n",
- "F_olivaceous_transfer_2\n",
- "F_olivaceous_FW_1\n",
- "F_olivaceous_transfer_1\n",
- "F_olivaceous_FW_3\n",
- "F_olivaceous_BW_3\n",
- "F_olivaceous_BW_1\n",
- "F_olivaceous_BW_2\n",
- "F_notatus_FW_2\n",
- "F_notatus_FW_3\n",
- "F_notatus_FW_1\n",
- "F_notatus_transfer_1\n",
- "F_notatus_transfer_3\n",
- "F_notatus_BW_1\n",
- "F_notatus_BW_3\n",
- "F_notatus_transfer_2\n",
- "F_notatus_BW_2\n",
- "F_heteroclitusMDPP_FW_3\n",
- "F_heteroclitusMDPP_FW_1\n",
- "F_heteroclitusMDPP_FW_2\n",
- "F_heteroclitusMDPP_BW_2\n",
- "F_heteroclitusMDPP_transfer_2\n",
- "F_heteroclitusMDPP_transfer_3\n",
- "F_heteroclitusMDPP_BW_1\n",
- "F_heteroclitusMDPP_transfer_1\n",
- "F_heteroclitusMDPP_BW_3\n",
- "F_heteroclitusMDPL_transfer_3\n",
- "F_heteroclitusMDPL_FW_2\n",
- "F_heteroclitusMDPL_transfer_1\n",
- "F_heteroclitusMDPL_FW_3\n",
- "F_heteroclitusMDPL_transfer_2\n",
- "F_heteroclitusMDPL_FW_1\n",
- "F_heteroclitusMDPL_BW_1\n",
- "F_heteroclitusMDPL_BW_3\n",
- "F_heteroclitusMDPL_BW_2\n",
- "F_parvapinis_FW_1\n",
- "F_parvapinis_transfer_2\n",
- "F_parvapinis_FW_3\n",
- "F_parvapinis_transfer_1\n",
- "F_parvapinis_FW_2\n",
- "F_parvapinis_BW_2\n",
- "F_parvapinis_BW_3\n",
- "F_parvapinis_BW_1\n",
- "F_diaphanus_BW_1\n",
- "F_diaphanus_BW_2\n",
- "F_diaphanus_transfer_2\n",
- "F_diaphanus_FW_2\n",
- "F_diaphanus_transfer_1\n",
- "F_diaphanus_FW_3\n",
- "F_catanatus_BW_2\n",
- "F_catanatus_transfer_1\n",
- "F_catanatus_BW_3\n",
- "F_catanatus_transfer_2\n",
- "F_catanatus_BW_1\n",
- "F_catanatus_FW_1\n",
- "F_catanatus_FW_2\n",
- "F_zebrinus_FW_2\n",
- "F_zebrinus_FW_1\n",
- "F_zebrinus_BW_1\n",
- "F_zebrinus_BW_2\n",
- "F_sciadicus_transfer_1\n",
- "F_sciadicus_BW_1\n",
- "F_sciadicus_FW_1\n",
- "F_sciadicus_FW_2\n",
- "F_grandis_transfer_3\n",
- "F_grandis_FW_2\n",
- "F_grandis_transfer_1\n",
- "F_grandis_FW_3\n",
- "F_grandis_transfer_2\n",
- "F_grandis_FW_1\n",
- "F_grandis_BW_1\n",
- "F_grandis_BW_3\n",
- "F_grandis_BW_2\n",
- "F_rathbuni_BW_2\n",
- "F_rathbuni_BW_3\n",
- "F_rathbuni_BW_1\n",
- "F_rathbuni_FW_1\n",
- "F_rathbuni_transfer_2\n",
- "F_rathbuni_FW_3\n",
- "F_rathbuni_transfer_1\n",
- "F_rathbuni_FW_2\n",
- "F_rathbuni_transfer_3\n",
- "F_chrysotus_BW_2\n",
- "F_chrysotus_BW_1\n",
- "F_chrysotus_BW_3\n",
- "F_chrysotus_FW_3\n",
- "F_chrysotus_transfer_1\n",
- "F_chrysotus_FW_1\n",
- "F_chrysotus_FW_2\n",
- "F_chrysotus_transfer_2\n",
- "A_xenica_FW_2\n",
- "A_xenica_transfer_2\n",
- "A_xenica_FW_1\n",
- "A_xenica_transfer_3\n",
- "A_xenica_transfer_1\n",
- "A_xenica_FW_3\n",
- "A_xenica_BW_3\n",
- "A_xenica_BW_1\n",
- "A_xenica_BW_2\n"
- ]
- }
- ],
- "source": [
- "# Build one transcript -> gene lookup table per species from its salmon quant files.\n",
- "# Reads every *.sf file under <species_dirs>/<species>/<quant_dir>/ and writes\n",
- "# <out_dir>/<species>_gene_transcript_table.txt with tab-separated transcript and gene columns.\n",
- "species_dirs = \"salmon_denovo_by_species\"\n",
- "out_dir = \"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/\"\n",
- "for species in os.listdir(species_dirs):\n",
- "    if species == \".DS_Store\":\n",
- "        continue\n",
- "    species_path = os.path.join(species_dirs, species)\n",
- "    table_path = os.path.join(out_dir, species + \"_gene_transcript_table.txt\")\n",
- "    for quant_dir in os.listdir(species_path):\n",
- "        if quant_dir == \".DS_Store\":\n",
- "            continue\n",
- "        # The replicate label is the directory name up to its first dot.\n",
- "        print(quant_dir.split(\".\")[0])\n",
- "        quant_path = os.path.join(species_path, quant_dir)\n",
- "        for quant_file in os.listdir(quant_path):\n",
- "            if not quant_file.endswith(\".sf\"):\n",
- "                continue\n",
- "            with open(os.path.join(quant_path, quant_file)) as qf:\n",
- "                next(qf, None)  # skip the column header row; tolerate an empty file\n",
- "                transcripts = [row.split(\"\\t\")[0] for row in qf]\n",
- "            # Every replicate rewrites the same per-species table, so the last one read wins.\n",
- "            with open(table_path, \"w\") as gt:\n",
- "                for transcript in transcripts:\n",
- "                    # Strip the isoform suffix instead of a fixed 3 characters, so isoform\n",
- "                    # numbers of 10 or more no longer leave a stray piece of the suffix behind.\n",
- "                    # NOTE(review): assumes Trinity-style IDs ending in _i<N> - confirm.\n",
- "                    gene = transcript.rsplit(\"_i\", 1)[0]\n",
- "                    gt.write(transcript + \"\\t\" + gene + \"\\n\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "['F_notatus.trinity_out.Trinity.fasta.dammit.gff3', '.DS_Store', 'F_rathbuni.trinity_out.Trinity.fasta.dammit.gff3', 'F_diaphanus.trinity_out.Trinity.fasta.dammit.gff3', 'F_chrysotus.trinity_out.Trinity.fasta.dammit.gff3', 'L_goodei.trinity_out.Trinity.fasta.dammit.gff3', 'F_heteroclitusMDPL.trinity_out.Trinity.fasta.dammit.gff3', 'F_heteroclitusMDPP.trinity_out.Trinity.fasta.dammit.gff3', 'F_similis.trinity_out.Trinity.fasta.dammit.gff3', 'L_parva.trinity_out.Trinity.fasta.dammit.gff3', 'A_xenica.trinity_out.Trinity.fasta.dammit.gff3', 'F_catanatus.trinity_out.Trinity.fasta.dammit.gff3', 'F_zebrinus.trinity_out.Trinity.fasta.dammit.gff3', 'F_parvapinis.trinity_out.Trinity.fasta.dammit.gff3', 'F_grandis.trinity_out.Trinity.fasta.dammit.gff3', 'F_sciadicus.trinity_out.Trinity.fasta.dammit.gff3', 'F_olivaceous.trinity_out.Trinity.fasta.dammit.gff3', 'F_notti.trinity_out.Trinity.fasta.dammit.gff3']\n",
- "F_notatus.trinity_out.Trinity.fasta.dammit.gff3\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/johnsolk/anaconda3/lib/python3.6/site-packages/dammit/fileio/gff3.py:73: ParserWarning: Both a converter and dtype were specified for column attributes - only the converter will be used\n",
- " dtype=dict(self.columns)):\n",
- "/Users/johnsolk/anaconda3/lib/python3.6/site-packages/dammit/fileio/gff3.py:73: ParserWarning: Both a converter and dtype were specified for column attributes - only the converter will be used\n",
- " dtype=dict(self.columns)):\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "F_rathbuni.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_diaphanus.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_chrysotus.trinity_out.Trinity.fasta.dammit.gff3\n",
- "L_goodei.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_heteroclitusMDPL.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_heteroclitusMDPP.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_similis.trinity_out.Trinity.fasta.dammit.gff3\n",
- "L_parva.trinity_out.Trinity.fasta.dammit.gff3\n",
- "A_xenica.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_catanatus.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_zebrinus.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_parvapinis.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_grandis.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_sciadicus.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_olivaceous.trinity_out.Trinity.fasta.dammit.gff3\n",
- "F_notti.trinity_out.Trinity.fasta.dammit.gff3\n"
- ]
- }
- ],
- "source": [
- "# Export two gene-name tables per species from its dammit GFF3 annotation file:\n",
- "#   <species>_Fhet_genenames.csv            - every row whose Name starts with 'gi'\n",
- "#   <species>_onegenenamepertranscript.csv  - one Name per seqid\n",
- "annotations_dir = \"/Users/johnsolk/Documents/UCDavis/Whitehead/gff_annotations/\"\n",
- "intermediate_dir = \"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/\"\n",
- "annotations = os.listdir(annotations_dir)\n",
- "print(annotations)\n",
- "for annotation in annotations:\n",
- "    if annotation == \".DS_Store\":\n",
- "        continue\n",
- "    species = annotation.split(\".\")[0]\n",
- "    print(annotation)\n",
- "    # Keep the parsed frame under its own name rather than rebinding `annotations`,\n",
- "    # which is the list this loop is iterating over.\n",
- "    gff = GFF3Parser(filename=annotations_dir + annotation).read()\n",
- "    named = gff.dropna(subset=['Name'])\n",
- "    # Rows are sorted by seqid then ascending score, so after the 1e-05 cutoff the first\n",
- "    # row kept for each seqid is its lowest-scoring hit.\n",
- "    pickonename = (\n",
- "        named.sort_values(by=['seqid', 'score'], ascending=True)\n",
- "        .query('score < 1e-05')\n",
- "        .drop_duplicates(subset='seqid')[['seqid', 'Name']]\n",
- "        .dropna(axis=0, how=\"all\")\n",
- "    )\n",
- "    fund = named[named['Name'].str.startswith(\"gi\")]\n",
- "    names = fund.sort_values(by=['seqid'], ascending=True)[['seqid', 'Name']]\n",
- "    # to_csv keeps the default index column; the downstream cell drops it as 'Unnamed: 0'.\n",
- "    names.to_csv(intermediate_dir + species + \"_Fhet_genenames.csv\")\n",
- "    pickonename.to_csv(intermediate_dir + species + \"_onegenenamepertranscript.csv\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_diaphanus_namemap2.csv\n",
- "(384218, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_diaphanus_Fhet_genenames.csv\n",
- "(134129, 3)\n",
- "(384218, 6)\n",
- "(455093, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_chrysotus_namemap2.csv\n",
- "(396400, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_chrysotus_Fhet_genenames.csv\n",
- "(168861, 3)\n",
- "(396400, 6)\n",
- "(485204, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/L_goodei_namemap2.csv\n",
- "(385476, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/L_goodei_Fhet_genenames.csv\n",
- "(168019, 3)\n",
- "(385476, 6)\n",
- "(478428, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_heteroclitusMDPL_namemap2.csv\n",
- "(592419, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_heteroclitusMDPL_Fhet_genenames.csv\n",
- "(149975, 3)\n",
- "(592419, 6)\n",
- "(663574, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_grandis_namemap2.csv\n",
- "(809060, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_grandis_Fhet_genenames.csv\n",
- "(182607, 3)\n",
- "(809060, 6)\n",
- "(896995, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_olivaceous_namemap2.csv\n",
- "(350265, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_olivaceous_Fhet_genenames.csv\n",
- "(157612, 3)\n",
- "(350265, 6)\n",
- "(438933, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_zebrinus_namemap2.csv\n",
- "(266978, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_zebrinus_Fhet_genenames.csv\n",
- "(119383, 3)\n",
- "(266978, 6)\n",
- "(329326, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_catanatus_namemap2.csv\n",
- "(405866, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_catanatus_Fhet_genenames.csv\n",
- "(157130, 3)\n",
- "(405866, 6)\n",
- "(484458, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_sciadicus_namemap2.csv\n",
- "(241279, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_sciadicus_Fhet_genenames.csv\n",
- "(118097, 3)\n",
- "(241279, 6)\n",
- "(304539, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_heteroclitusMDPP_namemap2.csv\n",
- "(668487, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_heteroclitusMDPP_Fhet_genenames.csv\n",
- "(146014, 3)\n",
- "(668487, 6)\n",
- "(736813, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_similis_namemap2.csv\n",
- "(520319, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_similis_Fhet_genenames.csv\n",
- "(161532, 3)\n",
- "(520319, 6)\n",
- "(607296, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/A_xenica_namemap2.csv\n",
- "(362783, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/A_xenica_Fhet_genenames.csv\n",
- "(161802, 3)\n",
- "(362783, 6)\n",
- "(451127, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_parvapinis_namemap2.csv\n",
- "(352346, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_parvapinis_Fhet_genenames.csv\n",
- "(146111, 3)\n",
- "(352346, 6)\n",
- "(435605, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_notatus_namemap2.csv\n",
- "(416299, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_notatus_Fhet_genenames.csv\n",
- "(178608, 3)\n",
- "(416299, 6)\n",
- "(512589, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/L_parva_namemap2.csv\n",
- "(409543, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/L_parva_Fhet_genenames.csv\n",
- "(159764, 3)\n",
- "(409543, 6)\n",
- "(495881, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_notti_namemap2.csv\n",
- "(159771, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_notti_Fhet_genenames.csv\n",
- "(91102, 3)\n",
- "(159771, 6)\n",
- "(209036, 7)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_rathbuni_namemap2.csv\n",
- "(501222, 4)\n",
- "/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/F_rathbuni_Fhet_genenames.csv\n",
- "(176180, 3)\n",
- "(501222, 6)\n",
- "(595482, 7)\n"
- ]
- }
- ],
- "source": [
- "# For every <species>.*dammit.namemap.csv file:\n",
- "#   1. write <species>_namemap2.csv with columns <original header>,seqid,Name, where both\n",
- "#      added columns hold the gene ID derived from the Trinity contig ID;\n",
- "#   2. outer-merge it with the one-name-per-transcript table and the 'gi' name table;\n",
- "#   3. write the merged table to annotation_gene_names/<species>_gene_names.csv.\n",
- "namemap_dir = \"/Users/johnsolk/Documents/UCDavis/Whitehead/intermediate_denovo_annotation_files/\"\n",
- "gene_names_dir = \"/Users/johnsolk/Documents/UCDavis/Whitehead/annotation_gene_names/\"\n",
- "maps = os.listdir(namemap_dir)\n",
- "for namemap in maps:\n",
- "    if not namemap.endswith(\"dammit.namemap.csv\"):\n",
- "        continue\n",
- "    species = namemap.split(\".\")[0]\n",
- "    namemap2 = namemap_dir + species + \"_namemap2.csv\"\n",
- "    with open(namemap_dir + namemap) as f:\n",
- "        header = next(f)\n",
- "        ids = f.readlines()\n",
- "    with open(namemap2, \"w\") as n:\n",
- "        n.write(header.strip() + \",seqid,Name\\n\")\n",
- "        for line in ids:\n",
- "            fields = line.split(\",\")\n",
- "            renamed = fields[-1].strip()\n",
- "            # The first field is the full Trinity header; keep only the ID before the first space.\n",
- "            trinity_contig = fields[0].split(\" \")[0].strip('\"')\n",
- "            # Strip the isoform suffix instead of a fixed 3 characters, so isoform numbers\n",
- "            # of 10 or more no longer leave a stray piece of the suffix behind.\n",
- "            # NOTE(review): assumes Trinity-style IDs ending in _i<N> - confirm.\n",
- "            gene = trinity_contig.rsplit(\"_i\", 1)[0]\n",
- "            n.write(\",\".join([trinity_contig, renamed, gene, gene]) + \"\\n\")\n",
- "    annotations = namemap_dir + species + \"_onegenenamepertranscript.csv\"\n",
- "    Fhet = namemap_dir + species + \"_Fhet_genenames.csv\"\n",
- "    ann = pd.read_csv(annotations)\n",
- "    dammit_Trinity = pd.read_csv(namemap2)\n",
- "    print(namemap2)\n",
- "    print(dammit_Trinity.shape)\n",
- "    species_Fhet = pd.read_csv(Fhet)\n",
- "    print(Fhet)\n",
- "    print(species_Fhet.shape)\n",
- "    # Both tables were saved with their index, which read_csv returns as 'Unnamed: 0'.\n",
- "    species_Fhet = species_Fhet.drop(['Unnamed: 0'], axis=1)\n",
- "    ann = ann.drop(['Unnamed: 0'], axis=1)\n",
- "    # Both frames carry seqid and Name columns, so this merge suffixes them with _x and _y.\n",
- "    combined = pd.merge(dammit_Trinity, ann, how='outer', left_on=\"renamed\", right_on=\"seqid\")\n",
- "    print(combined.shape)\n",
- "    combined = combined.drop(['renamed'], axis=1)\n",
- "    combined = pd.merge(combined, species_Fhet, how=\"outer\", left_on=\"seqid_y\", right_on=\"seqid\")\n",
- "    print(combined.shape)\n",
- "    # Rows without an annotation name fall back to the Trinity-derived gene ID.\n",
- "    combined['Name_y'] = combined['Name_y'].fillna(combined['seqid_x'])\n",
- "    combined = combined.drop(['Name_x'], axis=1)\n",
- "    combined.to_csv(gene_names_dir + species + \"_gene_names.csv\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
Add Comment
Please, Sign In to add comment