Advertisement
Guest User

Untitled

a guest
Feb 13th, 2016
65
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.25 KB | None | 0 0
  1. from __future__ import print_function
  2. import cPickle as pickle
  3. import os
  4. import csv
  5.  
  6. __author__ = 'galaxy'
  7. #This script was written by Philip Weiss and Megan Shortridge.
  8. #A special thanks to Phil for the help.
  9.  
  10. mydir = os.getcwd()
  11. li = pickle.load( open( "li.p", "rb" ))
  12. filelist = []
  13.  
  14. for files in os.listdir(mydir):
  15. if files.endswith("headeradded.csv"):
  16. filelist.append(files)
  17.  
  18.  
  19. for files in filelist:
  20. print(files)
  21. readobj = open(files, 'r')
  22. reader = csv.reader(readobj, "excel") #csv reader of the file
  23.  
  24. gi_list = [] #initialize a list of GI numbers
  25. taxonomicranks = ["superkingdom", "kingdom", "phylum", "subphylum", "superclass", "class", "subclass", "infraclass", "superorder", "order", "infraorder", "suborder", "superfamily", "family", "subfamily", "tribe", "genus", "species"]
  26.  
  27.  
  28. GI_field = ""
  29.  
  30. for row in reader:
  31. if "qseqid" in row: #then this is the header line
  32. for x in range (0, len(row)):
  33. if row[x]=="gi" or row[x]=="GI":
  34. GI_field = int(x) #then the column containing GI numbers is this one.
  35. elif "qseqid" not in row: #then it is not a headerline
  36. gi_list.append(row[GI_field])
  37.  
  38. readobj.close()
  39.  
  40. readobj = open(files, 'r')
  41. reader = csv.reader(readobj, "excel") #csv reader of the file
  42. newfilename = files.replace(".csv", "_taxonomy.csv")
  43. writeobj = open(newfilename, 'w', newline='')
  44. writer = csv.writer(writeobj, "excel")
  45.  
  46. rownum = 0
  47. for row in reader:
  48. if row[0] != "qseqid": #this is only if this is not a header containing line
  49. try:
  50. if row[0] != "":
  51. tax=[]
  52. gi = int(gi_list[rownum])
  53. rownum+=1
  54. #print("Loading GI database")
  55. for x in range(len(li)):
  56. if gi >= li[x][0] and gi <= li[x][1]:
  57.  
  58. fn = "gi_" + str(li[x][0]) + "_" + str(li[x][1]) + ".p"
  59. gi_to_taxon = pickle.load(open(fn, "rb"))
  60. tax.append(gi_to_taxon[gi])
  61.  
  62. c=0
  63.  
  64. #print("Obtaining Taxonomy Structure")
  65.  
  66. rank = []
  67. while tax[c] != 1:
  68.  
  69. for line in open("nodes.dmp", "rU"):
  70. ln = line.lstrip().split()
  71. if tax[c] == int(ln[0]):
  72. tax.append(int(ln[2]))
  73. if ln[4] == 'no':
  74. tmp = 'N/A'
  75. else:
  76. tmp = ln[4]
  77. rank.append(tmp)
  78. c+=1
  79. elif tax[c] < int(ln[0]):
  80. break
  81.  
  82. #print("Converting nodes to names")
  83.  
  84. name= []
  85. for x in range(len(tax)-1):
  86. if x == 0:
  87. offset = 3
  88. else:
  89. offset = 2
  90. for line in open("names.dmp", "rU"):
  91. ln = line.lstrip().split()
  92. if tax[x] == int(ln[0]):
  93. name.append(ln[offset])
  94. #print(ln[offset])
  95. break
  96. for x in range (0, len(taxonomicranks)):
  97. rank_type = taxonomicranks[x]
  98. #print(rank_type)
  99. rank_found = False #initialize True-False sequence
  100. for x in range(0, len(rank)):
  101. rank_name = rank[x]
  102. name_value = name[x]
  103. if rank_name == rank_type:
  104. rank_found = True
  105. row.append(name_value)
  106.  
  107. if rank_found==False:
  108. row.append("N/A")
  109. elif rank_found ==True:
  110. #print("rank found")
  111. pass
  112. writer.writerow(row)
  113. except IndexError:
  114. print("Oops")
  115. elif row[0]=="qseqid": #this will write the header line when it first finds it.
  116. for x in range (0, len(taxonomicranks)):
  117. row.append(taxonomicranks[x])
  118. writer.writerow(row)
  119.  
  120. writeobj.close()
  121. readobj.close()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement