Advertisement
reeps

ass v2

Apr 25th, 2019
104
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.73 KB | None | 0 0
  1. import os, sys, time, gzip, re, psutil, random
  2. import numpy as np
  3. import pandas as pd
  4. from scipy.spatial import distance
  5. import xgboost as xgb
  6. from termcolor import colored
  7. from sklearn.metrics import accuracy_score as acs
  8.  
  9.  
  10.  
  11. acids = ['ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', 'ILE', 'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', 'THR', 'VAL', 'TRP', 'TYR']
  12.  
  13. def openPDBFile(protein_name, resLim = 2.0):
  14.     pdb = []
  15.     res = None
  16.     flag = False
  17.     name = pathFinder(protein_name)
  18.     if os.path.exists(name):
  19.         with gzip.open(name,'rt') as f:
  20.             for line in f:
  21.                 if flag == False:
  22.                     res = checkResolution(line)
  23.                     flag = True
  24.                     if res > resLim:
  25.                         return None
  26.                 pdb.append(line)
  27.                    
  28.         return pdb     
  29.     else:
  30.         return None
  31. def checkResolution(line):
  32.     res = 0
  33.     if ('RESOLUTION.' in line) and ('ANGSTROMS' in line):
  34.         res = [float(s) for s in re.findall(r'-?\d+\.?\d*', line)][1]
  35.     return res
  36.        
  37. def pathFinder(protein_name):
  38.     path = '/home/artur/a/pdb/' + protein_name[1].lower() + protein_name[2].lower() + '/pdb' + protein_name.lower() + '.ent.gz'
  39.     return path
  40.  
  41. def getXYZ(line):
  42.  
  43.     x = float(line[30:38])
  44.     y = float(line[38:46])
  45.     z = float(line[46:54])
  46.    
  47.     return [x, y, z]
  48.  
  49.  
  50. def getID(line):
  51.     i = int(line[6:11])
  52.     return i
  53.    
  54. #Get atom's name    
  55. def getAtomID(line):
  56.     i = line[13:16]
  57.     return i
  58.  
  59. #Get amino acids name
  60. def getResName(line):
  61.     i = line[17:20]
  62.     return i    
  63.  
  64.  
  65. def getProteinAndWater(pdb):
  66.    
  67.     protein = []
  68.     water = []
  69.     proteinCoords = []
  70.    
  71.     for line in pdb:
  72.         if len(line) < 78:
  73.             continue
  74.         if (line[17:20] == "HOH" and line[77] == "O"):
  75.             water.append(getXYZ(line))
  76.         if (line[16] != "A" and line[16] != " ") or (line[27] != "A" and line[27] != " "):
  77.             continue
  78.         if (line[17:20] != "HOH" and line[0:4] == "ATOM"):
  79.             protein.append(line)
  80.        
  81.    
  82.     return protein, water
  83.  
  84. def toPdb(xyz,B= 0, atom_id=None, res_id="  0", element= "U"):
  85.     if (B>999):
  86.         B = 999.0
  87.     x = "{:8.3f}".format(xyz[0])
  88.     y = "{:8.3f}".format(xyz[1])
  89.     z = "{:8.3f}".format(xyz[2])
  90.     B = "{:6.2f}".format(float(B))
  91.    
  92.     if res_id != "  0":
  93.         if res_id>999:
  94.             res_id-=int(res_id/1000)*1000
  95.         res_id = str(res_id)
  96.         res_id = " ".join([""]*(3-len(res_id)))+res_id         
  97.     line = "ATOM         "+element+"   "+element * 3+"    "+res_id+"     42.931 -14.533  18.887        0.00           "+element
  98.     line_ = line[:30]+x+y+z+line[54:60]+B+line[66:]+"\n"
  99.     return line_
  100.  
  101.  
  102. def getDistanceBetween(chosenMolecule, molecules):
  103.     return distance.cdist(chosenMolecule, molecules)
  104.    
  105. def getWatersArray(water):
  106.  
  107.     cd = getDistanceBetween(water, water)
  108.     id_cd = np.where((cd < 4.0) & (cd > 0.0))
  109.     cd_ = cd[id_cd]
  110.     cd_.sort()
  111.     return cd_
  112.  
  113. #get all the carbon, oxygen, nitro atoms from protein
  114. def getCON(protein): #carbon oxygen nitro
  115.     C = []
  116.     O = []
  117.     N = []
  118.     id_C = id_O = id_N = []
  119.    
  120.     for i in protein:
  121.         if i[13] == 'O':
  122.             O.append(getXYZ(i))
  123.             id_O.append(getID(i))
  124.         if i[13] == 'C':
  125.             C.append(getXYZ(i))
  126.             id_C.append(getID(i))
  127.         if i[13] == 'N':
  128.             N.append(getXYZ(i))
  129.             id_N.append(getID(i))
  130.            
  131.     return [C, O, N, id_C, id_O, id_N]
  132.    
  133. #get nearly located C O N atoms from each water molecule
  134. def getAtomsArrays(protein, water):
  135.     global acids
  136.     [C, O, N,  id_C, id_O, id_N] = getCON(protein)
  137.    
  138.     cd_C = getDistanceBetween(water, C)
  139.     cd_O = getDistanceBetween(water, O)
  140.     cd_N = getDistanceBetween(water, N)
  141.    
  142.     id_cd_C = np.where(cd_C < 4.0)
  143.     id_cd_O = np.where(cd_O < 4.0)
  144.     id_cd_N = np.where(cd_N < 4.0) 
  145.    
  146.     cd_C_ = np.array(cd_C[id_cd_C])
  147.     cd_O_ = np.array(cd_O[id_cd_O])
  148.     cd_N_ = np.array(cd_N[id_cd_N])
  149.    
  150.     cd_C_.sort()
  151.     cd_O_.sort()
  152.     cd_N_.sort()
  153.    
  154.     return cd_C_, cd_N_, cd_O_
  155.  
  156. #get fake water molecules
  157. def getFakeWater(water):   
  158.     cd = getDistanceBetween(water, water)
  159.     fwater = []
  160.     id_cd = np.where((cd < 4.0) & (cd > 0))
  161.    
  162.     idx = id_cd[0].tolist()
  163.     jdx = id_cd[1].tolist()
  164.        
  165.     for i in range(len(idx)):
  166.                 if (jdx[i] > idx[i]):                      
  167.                     fwater.append(((np.array(water[idx[i]]) + np.array(water[jdx[i]]) )/ 2))
  168.     return fwater
  169.  
  170.  
  171. def filterProts(lim):
  172.     file = open('bc-90.out', 'r')
  173.     global forTest
  174.     fprots = []
  175.    
  176.     count = 0
  177.     for line in file:
  178.         count += 1
  179.         prots = line.split(" ")
  180.         for i in prots:
  181.             if len(i) > 6:
  182.                 prots.remove(i) #delete proteins like 3y3q_ai
  183.                
  184.         if lim >= len(prots):      
  185.             fprots.extend(prots)   
  186.         else:
  187.             random = np.random.randint(len(prots), size= lim)
  188.             for i in random:
  189.                 fprots.append(prots[i])
  190.    
  191.     return fprots
  192.  
  193.  
  194.  
  195. def filterWater(water, lim = 100):
  196.     water_ = []
  197.     if len(water) <= lim:
  198.         water_ = water
  199.     else:
  200.         random = np.random.randint(len(water), size= lim)
  201.         for i in random:
  202.             water_.append(water[i])
  203.     return water_
  204.  
  205.  
  206. def prepareData(lim = 2):  
  207.     global protsUsed
  208.     prots = filterProts(lim)
  209.     random.shuffle(prots)
  210.     count = 0
  211.     error = 0
  212.     posData = []
  213.     negData = []
  214.     for i in prots:
  215.         count += 1
  216.         if count == 20:
  217.             break
  218.         name = i[0:4]
  219.         pdb = openPDBFile(name)
  220.         if pdb == None:
  221.             continue
  222.         else:
  223.             try:               
  224.                 [protein, water] = getProteinAndWater(pdb)
  225.                 water_ = filterWater(water)
  226.                 fwater = getFakeWater(water_)
  227.                 [C, O, N] = getAtomsArrays(protein, water_)
  228.                 posData.append([C, O, N])
  229.                 [C1, O1, N1] = getAtomsArrays(protein, fwater)
  230.                 negData.append([C, O, N])
  231.                 protsUsed.append(name)         
  232.             except ValueError:
  233.                 #error += 1
  234.                 #print colored('Value error number ' + str(error), 'red')
  235.                 continue
  236.     return posData, negData
  237.  
  238.  
  239. #def balanceXY(x, y):
  240.     #print colored(x_, 'yellow')
  241.    
  242. #   return x_, y_
  243.    
  244.     #ids = []
  245.     #if len(x) == len(y):
  246.     #   return x, y
  247.     #if len(x) > len(y):
  248.     #   ids = range(len(y))
  249.     #   random.shuffle(ids)
  250.     #   return x[ids], y
  251.     #if len(y) > len(x):
  252.     #   ids = range(len(x))
  253.     #   random.shuffle(ids)
  254.     #   return x, y[ids]
  255.  
  256. def balanceX(x):
  257.     maxLen = max([len(i) for i in x])
  258.     for i in x:
  259.         if len(i) < maxLen:
  260.             i = np.concatenate((np.zeros(maxLen - len(i)),i))
  261.     print x.shape      
  262.     return x
  263.  
  264. def createHistXY(posData, negData):
  265.     x = posData + negData
  266.     y = np.zeros(len(x))
  267.     x_ = np.empty((len(x), 3))
  268.     for i in range(len(posData)):
  269.         y[i][0] = np.histogram
  270.     for i in range(len(x)):
  271.         pass   
  272.    
  273.    
  274. def createXY(posData, negData):
  275.     x = posData + negData
  276.     maxlen = 0
  277.     for i in x:
  278.         maxx = len(max(i, key = len))
  279.         if maxx > maxlen:
  280.             print colored(maxx, 'red')
  281.             maxlen = maxx
  282.     x_= []
  283.     foo = []
  284.     for i in range(len(x)):        
  285.         foo = np.zeros(maxlen - len(x[i][0])).tolist() + x[i][0].tolist() + np.zeros(maxlen - len(x[i][1])).tolist() + x[i][1].tolist() + np.zeros(maxlen - len(x[i][2])).tolist() + x[i][2].tolist()
  286.         print colored(len(foo), 'blue')
  287.         x_.append(np.array(foo))
  288.     y = np.zeros(len(x))
  289.     for i in range(len(posData)):
  290.         y[i] = 1   
  291.     return np.array(x_), y
  292.     #ids = range(len(x))
  293.    
  294.     #random.shuffle(ids)
  295.    
  296.  
  297.  
  298.     #return x_[ids],y[ids]
  299.     #return x[ids], y[ids]
  300.     """
  301.     #tut [[[C[d1..], O[d1..], N[d1..]], [C[d1..] ...], ...]
  302.     x = posData + negData
  303.     x_ = np.array(x)
  304.     y = np.zeros(len(x))
  305.     for i in range(len(posData)):
  306.         y[i] = 1
  307.     ids = range(len(x))
  308.     random.shuffle(ids)
  309.     return x_[ids],y[ids]
  310.     """
  311. def prepareTest():
  312.     global protsUsed
  313.    
  314.  
  315. def checkPred(prots = protsUsed):
  316.     pass
  317.    
  318. def train(posData, negData):
  319.     [x, y] = createXY(posData, negData)
  320.     #x = balanceX(x)   
  321.     model = xgb.XGBClassifier()
  322.     model.fit(x, y)
  323.     print 'train complete'
  324.     return model
  325.    
  326.    
  327. #x vectors : c , o, n
  328. #y vector : fake water  
  329. timer = time.time()
  330. protsUsed = []
  331. [posData, negData] = prepareData()
  332. train(posData, negData)
  333. print "These", len(protsUsed), "proteins are used for preparing of data"
  334. print colored(protsUsed, 'cyan')
  335.  
  336.    
  337. print "My program took", time.time() - timer, "to run and memory is used", psutil.virtual_memory()
  338. #fwater = getFakeWater(protein, water)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement