Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os, sys, time, gzip, re, psutil, random
- import numpy as np
- import pandas as pd
- from scipy.spatial import distance
- import xgboost as xgb
- from termcolor import colored
- acids = ['ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', 'ILE', 'LYS', 'LEU', 'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', 'THR', 'VAL', 'TRP', 'TYR']
- def openPDBFile(protein_name, resLim = 2.0):
- pdb = []
- res = None
- flag = False
- name = pathFinder(protein_name)
- if os.path.exists(name):
- with gzip.open(name,'rt') as f:
- for line in f:
- if flag == False:
- res = checkResolution(line)
- flag = True
- if res > resLim:
- return None
- pdb.append(line)
- return pdb
- else:
- return None
- def checkResolution(line):
- res = 0
- if ('RESOLUTION.' in line) and ('ANGSTROMS' in line):
- res = [float(s) for s in re.findall(r'-?\d+\.?\d*', line)][1]
- return res
- def pathFinder(protein_name):
- path = '/home/artur/a/pdb/' + protein_name[1].lower() + protein_name[2].lower() + '/pdb' + protein_name.lower() + '.ent.gz'
- return path
- def getXYZ(line):
- x = float(line[30:38])
- y = float(line[38:46])
- z = float(line[46:54])
- return [x, y, z]
- def getID(line):
- i = int(line[6:11])
- return i
- #Get atom's name
- def getAtomID(line):
- i = line[13:16]
- return i
- #Get amino acids name
- def getResName(line):
- i = line[17:20]
- return i
- def getProteinAndWater(pdb):
- protein = []
- water = []
- proteinCoords = []
- for line in pdb:
- if len(line) < 78:
- continue
- if (line[17:20] == "HOH" and line[77] == "O"):
- water.append(getXYZ(line))
- if (line[16] != "A" and line[16] != " ") or (line[27] != "A" and line[27] != " "):
- continue
- if (line[17:20] != "HOH" and line[0:4] == "ATOM"):
- protein.append(line)
- return protein, water
- def toPdb(xyz,B= 0, atom_id=None, res_id=" 0", element= "U"):
- if (B>999):
- B = 999.0
- x = "{:8.3f}".format(xyz[0])
- y = "{:8.3f}".format(xyz[1])
- z = "{:8.3f}".format(xyz[2])
- B = "{:6.2f}".format(float(B))
- if res_id != " 0":
- if res_id>999:
- res_id-=int(res_id/1000)*1000
- res_id = str(res_id)
- res_id = " ".join([""]*(3-len(res_id)))+res_id
- line = "ATOM "+element+" "+element * 3+" "+res_id+" 42.931 -14.533 18.887 0.00 "+element
- line_ = line[:30]+x+y+z+line[54:60]+B+line[66:]+"\n"
- return line_
- def getDistanceBetween(chosenMolecule, molecules):
- return distance.cdist(chosenMolecule, molecules)
- def getWatersArray(water):
- cd = getDistanceBetween(water, water)
- id_cd = np.where((cd < 4.0) & (cd > 0.0))
- cd_ = cd[id_cd]
- cd_.sort()
- return cd_
- #get all the carbon, oxygen, nitro atoms from protein
- def getCON(protein): #carbon oxygen nitro
- C = []
- O = []
- N = []
- id_C = id_O = id_N = []
- for i in protein:
- if i[13] == 'O':
- O.append(getXYZ(i))
- id_O.append(getID(i))
- if i[13] == 'C':
- C.append(getXYZ(i))
- id_C.append(getID(i))
- if i[13] == 'N':
- N.append(getXYZ(i))
- id_N.append(getID(i))
- return [C, O, N, id_C, id_O, id_N]
- #get nearly located C O N atoms from each water molecule
- def getAtomsArrays(protein, water):
- global acids
- [C, O, N, id_C, id_O, id_N] = getCON(protein)
- cd_C = getDistanceBetween(water, C)
- cd_O = getDistanceBetween(water, O)
- cd_N = getDistanceBetween(water, N)
- id_cd_C = np.where(cd_C < 4.0)
- id_cd_O = np.where(cd_O < 4.0)
- id_cd_N = np.where(cd_N < 4.0)
- cd_C_ = cd_C[id_cd_C]
- cd_O_ = cd_O[id_cd_O]
- cd_N_ = cd_N[id_cd_N]
- cd_C_.sort()
- cd_O_.sort()
- cd_N_.sort()
- return cd_C_, cd_N_, cd_O_
- #get fake water molecules
- def getFakeWater(water):
- cd = getDistanceBetween(water, water)
- fwater = []
- id_cd = np.where((cd < 4.0) & (cd > 0))
- idx = id_cd[0].tolist()
- jdx = id_cd[1].tolist()
- for i in range(len(idx)):
- if (jdx[i] > idx[i]):
- fwater.append(((np.array(water[idx[i]]) + np.array(water[jdx[i]]) )/ 2))
- return fwater
- def filterProts(lim):
- file = open('bc-90.out', 'r')
- fprots = []
- count = 0
- for line in file:
- count += 1
- prots = line.split(" ")
- for i in prots:
- if len(i) > 6:
- prots.remove(i) #delete proteins like 3y3q_ai
- if lim >= len(prots):
- fprots.extend(prots)
- else:
- random = np.random.randint(len(prots), size= lim)
- for i in random:
- fprots.append(prots[i])
- return fprots
- def filterWater(water, lim = 100):
- water_ = []
- if len(water) <= lim:
- water_ = water
- else:
- random = np.random.randint(len(water), size= lim)
- for i in random:
- water_.append(water[i])
- return water_
- def prepareData(lim = 2):
- global protsUsed
- prots = filterProts(lim)
- random.shuffle(prots)
- count = 0
- error = 0
- posData = []
- negData = []
- for i in prots:
- count += 1
- if count == 15:
- break
- name = i[0:4]
- pdb = openPDBFile(name)
- if pdb == None:
- continue
- else:
- try:
- [protein, water] = getProteinAndWater(pdb)
- water_ = filterWater(water)
- fwater = getFakeWater(water_)
- [C, O, N] = getAtomsArrays(protein, water_)
- posData.append([C, O, N])
- [C1, O1, N1] = getAtomsArrays(protein, fwater)
- negData.append([C, O, N])
- protsUsed.append(name)
- except ValueError:
- #error += 1
- #print colored('Value error number ' + str(error), 'red')
- continue
- return posData, negData
- #def balanceXY(x, y):
- #print colored(x_, 'yellow')
- # return x_, y_
- #ids = []
- #if len(x) == len(y):
- # return x, y
- #if len(x) > len(y):
- # ids = range(len(y))
- # random.shuffle(ids)
- # return x[ids], y
- #if len(y) > len(x):
- # ids = range(len(x))
- # random.shuffle(ids)
- # return x, y[ids]
- def createXY(posData, negData):
- x = posData + negData
- x_ = np.array(x)
- y = np.zeros(len(x))
- for i in range(len(posData)):
- y[i] = 1
- ids = range(len(x))
- random.shuffle(ids)
- return x_[ids],y[ids]
- #return x[ids], y[ids]
- def prepareTest(protsUsed):
- pass
- def train(posData, negData):
- [x, y] = createXY(posData, negData)
- #print x.shape, y.shape
- #x_, y_ = balanceXY(x,y)
- #print x_.shape, y_.shape
- #print x_, y_
- print x.shape, y.shape
- model = xgb.XGBClassifier()
- model.fit(x, y)
- print 'train complete'
- #x vectors : c , o, n
- #y vector : fake water
- timer = time.time()
- protsUsed = []
- [posData, negData] = prepareData()
- train(posData, negData)
- print "These", len(protsUsed), "proteins are used for preparing of data"
- print colored(protsUsed, 'cyan')
- print "My program took", time.time() - timer, "to run and memory is used", psutil.virtual_memory()
- #fwater = getFakeWater(protein, water)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement