Advertisement
reeps

ass

Apr 24th, 2019
128
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.62 KB | None | 0 0
  1. import os, sys, time, gzip, re, psutil, random
  2. import numpy as np
  3. import pandas as pd
  4. from scipy.spatial import distance
  5. import xgboost as xgb
  6. from termcolor import colored
  7.  
  8.  
  9.  
  10. acids = ['ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', 'ILE', 'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', 'THR', 'VAL', 'TRP', 'TYR']
  11.  
  12. def openPDBFile(protein_name, resLim = 2.0):
  13.     pdb = []
  14.     res = None
  15.     flag = False
  16.     name = pathFinder(protein_name)
  17.     if os.path.exists(name):
  18.         with gzip.open(name,'rt') as f:
  19.             for line in f:
  20.                 if flag == False:
  21.                     res = checkResolution(line)
  22.                     flag = True
  23.                     if res > resLim:
  24.                         return None
  25.                 pdb.append(line)
  26.                    
  27.         return pdb     
  28.     else:
  29.         return None
  30. def checkResolution(line):
  31.     res = 0
  32.     if ('RESOLUTION.' in line) and ('ANGSTROMS' in line):
  33.         res = [float(s) for s in re.findall(r'-?\d+\.?\d*', line)][1]
  34.     return res
  35.        
  36. def pathFinder(protein_name):
  37.     path = '/home/artur/a/pdb/' + protein_name[1].lower() + protein_name[2].lower() + '/pdb' + protein_name.lower() + '.ent.gz'
  38.     return path
  39.  
  40. def getXYZ(line):
  41.  
  42.     x = float(line[30:38])
  43.     y = float(line[38:46])
  44.     z = float(line[46:54])
  45.    
  46.     return [x, y, z]
  47.  
  48.  
  49. def getID(line):
  50.     i = int(line[6:11])
  51.     return i
  52.    
  53. #Get atom's name    
  54. def getAtomID(line):
  55.     i = line[13:16]
  56.     return i
  57.  
  58. #Get amino acids name
  59. def getResName(line):
  60.     i = line[17:20]
  61.     return i    
  62.  
  63.  
  64. def getProteinAndWater(pdb):
  65.    
  66.     protein = []
  67.     water = []
  68.     proteinCoords = []
  69.    
  70.     for line in pdb:
  71.         if len(line) < 78:
  72.             continue
  73.         if (line[17:20] == "HOH" and line[77] == "O"):
  74.             water.append(getXYZ(line))
  75.         if (line[16] != "A" and line[16] != " ") or (line[27] != "A" and line[27] != " "):
  76.             continue
  77.         if (line[17:20] != "HOH" and line[0:4] == "ATOM"):
  78.             protein.append(line)
  79.        
  80.    
  81.     return protein, water
  82.  
  83. def toPdb(xyz,B= 0, atom_id=None, res_id="  0", element= "U"):
  84.     if (B>999):
  85.         B = 999.0
  86.     x = "{:8.3f}".format(xyz[0])
  87.     y = "{:8.3f}".format(xyz[1])
  88.     z = "{:8.3f}".format(xyz[2])
  89.     B = "{:6.2f}".format(float(B))
  90.    
  91.     if res_id != "  0":
  92.         if res_id>999:
  93.             res_id-=int(res_id/1000)*1000
  94.         res_id = str(res_id)
  95.         res_id = " ".join([""]*(3-len(res_id)))+res_id         
  96.     line = "ATOM         "+element+"   "+element * 3+"    "+res_id+"     42.931 -14.533  18.887        0.00           "+element
  97.     line_ = line[:30]+x+y+z+line[54:60]+B+line[66:]+"\n"
  98.     return line_
  99.  
  100.  
  101. def getDistanceBetween(chosenMolecule, molecules):
  102.     return distance.cdist(chosenMolecule, molecules)
  103.    
  104. def getWatersArray(water):
  105.  
  106.     cd = getDistanceBetween(water, water)
  107.     id_cd = np.where((cd < 4.0) & (cd > 0.0))
  108.     cd_ = cd[id_cd]
  109.     cd_.sort()
  110.     return cd_
  111.  
  112. #get all the carbon, oxygen, nitro atoms from protein
  113. def getCON(protein): #carbon oxygen nitro
  114.     C = []
  115.     O = []
  116.     N = []
  117.     id_C = id_O = id_N = []
  118.    
  119.     for i in protein:
  120.         if i[13] == 'O':
  121.             O.append(getXYZ(i))
  122.             id_O.append(getID(i))
  123.         if i[13] == 'C':
  124.             C.append(getXYZ(i))
  125.             id_C.append(getID(i))
  126.         if i[13] == 'N':
  127.             N.append(getXYZ(i))
  128.             id_N.append(getID(i))
  129.            
  130.     return [C, O, N, id_C, id_O, id_N]
  131.    
  132. #get nearly located C O N atoms from each water molecule
  133. def getAtomsArrays(protein, water):
  134.     global acids
  135.     [C, O, N,  id_C, id_O, id_N] = getCON(protein)
  136.    
  137.     cd_C = getDistanceBetween(water, C)
  138.     cd_O = getDistanceBetween(water, O)
  139.     cd_N = getDistanceBetween(water, N)
  140.    
  141.     id_cd_C = np.where(cd_C < 4.0)
  142.     id_cd_O = np.where(cd_O < 4.0)
  143.     id_cd_N = np.where(cd_N < 4.0) 
  144.    
  145.     cd_C_ = cd_C[id_cd_C]
  146.     cd_O_ = cd_O[id_cd_O]
  147.     cd_N_ = cd_N[id_cd_N]
  148.    
  149.     cd_C_.sort()
  150.     cd_O_.sort()
  151.     cd_N_.sort()
  152.    
  153.     return cd_C_, cd_N_, cd_O_
  154.  
  155. #get fake water molecules
  156. def getFakeWater(water):   
  157.     cd = getDistanceBetween(water, water)
  158.     fwater = []
  159.     id_cd = np.where((cd < 4.0) & (cd > 0))
  160.    
  161.     idx = id_cd[0].tolist()
  162.     jdx = id_cd[1].tolist()
  163.        
  164.     for i in range(len(idx)):
  165.                 if (jdx[i] > idx[i]):                      
  166.                     fwater.append(((np.array(water[idx[i]]) + np.array(water[jdx[i]]) )/ 2))
  167.     return fwater
  168.  
  169.  
  170. def filterProts(lim):
  171.     file = open('bc-90.out', 'r')
  172.     fprots = []
  173.    
  174.     count = 0
  175.     for line in file:
  176.         count += 1
  177.         prots = line.split(" ")
  178.         for i in prots:
  179.             if len(i) > 6:
  180.                 prots.remove(i) #delete proteins like 3y3q_ai
  181.                
  182.         if lim >= len(prots):      
  183.             fprots.extend(prots)   
  184.         else:
  185.             random = np.random.randint(len(prots), size= lim)
  186.             for i in random:
  187.                 fprots.append(prots[i])
  188.    
  189.     return fprots
  190.  
  191.  
  192.  
  193. def filterWater(water, lim = 100):
  194.     water_ = []
  195.     if len(water) <= lim:
  196.         water_ = water
  197.     else:
  198.         random = np.random.randint(len(water), size= lim)
  199.         for i in random:
  200.             water_.append(water[i])
  201.     return water_
  202.  
  203.  
  204. def prepareData(lim = 2):  
  205.     global protsUsed
  206.     prots = filterProts(lim)
  207.     random.shuffle(prots)
  208.     count = 0
  209.     error = 0
  210.     posData = []
  211.     negData = []
  212.     for i in prots:
  213.         count += 1
  214.         if count == 15:
  215.             break
  216.         name = i[0:4]
  217.         pdb = openPDBFile(name)
  218.         if pdb == None:
  219.             continue
  220.         else:
  221.             try:               
  222.                 [protein, water] = getProteinAndWater(pdb)
  223.                 water_ = filterWater(water)
  224.                 fwater = getFakeWater(water_)
  225.                 [C, O, N] = getAtomsArrays(protein, water_)
  226.                 posData.append([C, O, N])
  227.                 [C1, O1, N1] = getAtomsArrays(protein, fwater)
  228.                 negData.append([C, O, N])
  229.                 protsUsed.append(name)         
  230.             except ValueError:
  231.                 #error += 1
  232.                 #print colored('Value error number ' + str(error), 'red')
  233.                 continue
  234.     return posData, negData
  235.  
  236.  
  237. #def balanceXY(x, y):
  238.     #print colored(x_, 'yellow')
  239.    
  240. #   return x_, y_
  241.    
  242.     #ids = []
  243.     #if len(x) == len(y):
  244.     #   return x, y
  245.     #if len(x) > len(y):
  246.     #   ids = range(len(y))
  247.     #   random.shuffle(ids)
  248.     #   return x[ids], y
  249.     #if len(y) > len(x):
  250.     #   ids = range(len(x))
  251.     #   random.shuffle(ids)
  252.     #   return x, y[ids]
  253.  
  254. def createXY(posData, negData):
  255.     x = posData + negData
  256.     x_ = np.array(x)
  257.     y = np.zeros(len(x))
  258.     for i in range(len(posData)):
  259.         y[i] = 1
  260.     ids = range(len(x))
  261.    
  262.     random.shuffle(ids)
  263.    
  264.     return x_[ids],y[ids]
  265.     #return x[ids], y[ids]
  266.    
  267.        
  268. def prepareTest(protsUsed):
  269.     pass
  270.    
  271. def train(posData, negData):
  272.     [x, y] = createXY(posData, negData)
  273.     #print x.shape, y.shape
  274.     #x_, y_ = balanceXY(x,y)
  275.     #print x_.shape, y_.shape
  276.     #print x_, y_
  277.    
  278.     print x.shape, y.shape
  279.     model = xgb.XGBClassifier()
  280.    
  281.     model.fit(x, y)
  282.    
  283.     print 'train complete'
  284.    
  285. #x vectors : c , o, n
  286. #y vector : fake water  
  287. timer = time.time()
  288. protsUsed = []
  289. [posData, negData] = prepareData()
  290. train(posData, negData)
  291. print "These", len(protsUsed), "proteins are used for preparing of data"
  292. print colored(protsUsed, 'cyan')
  293.  
  294.    
  295. print "My program took", time.time() - timer, "to run and memory is used", psutil.virtual_memory()
  296. #fwater = getFakeWater(protein, water)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement