Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- """
- This script pulls in homologs of proteins from PDB
- as determined by HHSuite. It then employs UCSF Chimera to
- structurally match them and get an indication of how well
- they score (RMSD) in order to pick the best simulation.
- """
- from __future__ import print_function
- import os
- import subprocess
- import sys
- import argparse
- import traceback
- import warnings
- import glob
- import fnmatch
- import pychimera
- import chimera
- from chimera import openModels,\
- Molecule,\
- replyobj
- from chimera import runCommand as rc
- from MatchMaker import\
- match,\
- CP_SPECIFIC_SPECIFIC,\
- GAP_OPEN,\
- GAP_EXTEND,\
- defaults,\
- MATRIX,\
- ITER_CUTOFF
- # If running using python interpreter and not pychimera:
- # os.environ['CHIMERADIR'] = '/home/wms_joe/Applications/CHIMERA1.11'
- # CHIMERADIR should point to the application root directory.
- # This can be found with: `chimera --root`
- # Only needed if running via normal python interpreter, not pychimera
- # pychimera.patch_environ()
- # pychimera.enable_chimera()
- ##### Main code begins #####
- # Step 1, parse the output of HHpred to get the nearest homolog.
- def main():
- def_cpus=12
- try:
- parser = argparse.ArgumentParser(description='This script compares protein structural homologs as determined with HHpred, to PDB models using UCSF CHIMERA, to gather metrics of structural similarity.')
- parser.add_argument(
- '-d',\
- '--database',\
- action='store',
- default=None,\
- help='You can specify a different HHpred database (filepath) to use if you offer this parameter. Otherwise it defaults to PDB.')
- parser.add_argument(
- '-f',\
- '--fasta',\
- action='store',\
- help='The fasta amino acid sequence that corresponds to the simulated structure and the sequence you wish to query.')
- parser.add_argument(
- '-s',\
- '--simulations',\
- action='store',\
- help='The directory where the protein structure simulations are stored. They must be in PDB format and be named logically.')
- parser.add_argument(
- '-r',\
- '--rmsd',\
- action='store',\
- help='The file to store the RMSD values for each matching.')
- parser.add_argument(
- '-t',\
- '--threads',\
- action='store',\
- default=def_cpus,\
- help='The number of threads that HHsearch can execute on.')
- args = parser.parse_args()
- except:
- print("An exception occured with argument parsing. Check your provided options.")
- traceback.print_exc()
- # Acquire HHPred Database
- if args.database is None:
- db_cmd = 'find ~/Applications/HHSuite/databases/pdb70 -type f -name "pdb70_hhm.ffdata"'
- db_path = subprocess.Popen(
- db_cmd,\
- shell=True,\
- stdin=subprocess.PIPE,\
- stdout=subprocess.PIPE,\
- stderr=subprocess.PIPE)
- (stdout,stderr) = db_path.communicate()
- filelist = stdout.decode().split()
- args.database = stdout
- else:
- print("No default database found and you haven't provided one directly.")
- if args.fasta is not None:
- split = os.path.splitext(args.fasta)
- basename = os.path.basename(split[0])
- else:
- print('No input fasta was provided. Check your input parameters')
- sys.exit(1)
- # Run the HHsearch
- if args.fasta is not None and args.database is not None:
- print("Database in use is:")
- print("Running " + args.fasta + " in HHsearch, against " + args.database + " on " + args.threads + " CPUs.")
- hhresult_file = '{0}.hhr.fasta'.format(basename)
- search_cmd = 'hhsearch -dbstrlen 50 -B 1 -b 1 -p 60 -Z 1 -E 1E-03 -nocons -nopred -nodssp -Ofas {3} -cpu {0} -i {1} -d {2}'.format(args.threads,args.fasta, args.database, hhresult_file)
- print("Executing HHsearch with the command:")
- print(search_cmd)
- hh_process = subprocess.Popen(
- search_cmd,\
- shell=True,\
- stdin=subprocess.PIPE,\
- stderr=subprocess.PIPE)
- hh_process.wait()
- else:
- print("No fasta or database has been detected. Check your input parameters.")
- sys.exit(1)
- # Parse HHpred output fasta and acquire the best hit
- headers = []
- with open(hhresult_file) as result_fasta:
- for line in result_fasta:
- if line.startswith(">"):
- headers.append(line.replace(">",""))
- top_hit = headers[1] # HHpred puts the query seq in index 0, so 1 is your top hit.
- print("Your best hit was the PDB id:")
- print(top_hit)
- print("Full results are found in: " + hhresult_file)
- hhresult_file.close()
- # Acquire the protein simulations
- model_list = []
- for root, dirnames, filenames in os.walk(directory):
- for filename in fnmatch.filter(filenames, '*.pdb'):
- if filename.startswith(basename):
- model_list.append(os.path.join(root, filename))
- print("---------------------------")
- print("Found the following models:")
- for model_path in model_list:
- print(model_path)
- # Get reference structure from PDB
- rc("open " + top_hit)
- # Open model structures
- for model_path in model_list:
- rc("open " + model_path)
- print("Opened: " + model_path)
- rmsd_file = '{0}.tsv'.format(rmsd)
- with open(rmsd_file, "a") as rmsd_output_file:
- for j in len(model_list):
- # reference, simulation, rmsd = match(
- # match each simulation loaded in vs the top_hit PDB fetched from the web
- #
- #
- rmsd_output_file.write(reference + "\t" + simulation + "\t" + rmsd)
- # Chimera Script from Eric's suggestion:
- # s1, s2 = openModels.list(modelTypes=[Molecule])
- # c1, c2 = s1.sequence('A'), s2.sequence('A')
- #
- # atoms1, atoms2, rmsd = match(CP_SPECIFIC_SPECIFIC, [(c1, c2)], defaults[MATRIX],
- # "nw", defaults[GAP_OPEN], defaults[GAP_EXTEND],
- # iterate=defaults[ITER_CUTOFF])[0]
- # for a1, a2 in zip(atoms1, atoms2):
- # print a1.residue, a2.residue, a1.xformCoord().distance(a2.xformCoord())
- #
- if __name__ == '__main__' :
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement