Source code for openawsem.openAWSEM

#!/usr/bin/env python3
try:
    from openmm.app import *
    from openmm import *
    from openmm.unit import *
except ModuleNotFoundError:
    from simtk.openmm.app import *
    from simtk.openmm import *
    from simtk.unit import *
import sys
from pdbfixer import *
import mdtraj as md
from Bio.PDB.Polypeptide import *
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB import PDBList
from Bio.PDB import PDBIO
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import textwrap
import shutil
from pathlib import Path
import openawsem.helperFunctions
import openawsem.functionTerms

__author__ = 'Carlos Bueno'

# Application location
__location__ = Path(__file__).resolve().parent

data_path = openawsem.helperFunctions.DataPath(default_location = __location__,
                                               default_config_path = __location__ / 'config.ini', 
                                               custom_config_path = Path.home() / '.awsem' / 'config.ini')

_AWSEMresidues = ['IPR', 'IGL', 'NGP']
xml = data_path.topology/'awsem.xml'

three_to_one = {'ALA':'A', 'ARG':'R', 'ASN':'N', 'ASP':'D', 'CYS':'C',
                'GLU':'E', 'GLN':'Q', 'GLY':'G', 'HIS':'H', 'ILE':'I',
                'LEU':'L', 'LYS':'K', 'MET':'M', 'PHE':'F', 'PRO':'P',
                'SER':'S', 'THR':'T', 'TRP':'W', 'TYR':'Y', 'VAL':'V'}


def parsePDB(pdb_file):
    '''Reads a pdb file and outputs a pandas DataFrame'''
    def pdb_line(line):
        return dict(recname=str(line[0:6]).strip(),
                    serial=int(line[6:11]),
                    name=str(line[12:16]).strip(),
                    altLoc=str(line[16:17]),
                    resname=str(line[17:20]).strip(),
                    chainID=str(line[21:22]),
                    resSeq=int(line[22:26]),
                    iCode=str(line[26:27]),
                    x=float(line[30:38]),
                    y=float(line[38:46]),
                    z=float(line[46:54]),
                    occupancy=0.0 if line[54:60].strip() == '' else float(line[54:60]),
                    tempFactor=0.0 if line[60:66].strip() == '' else float(line[60:66]),
                    element=str(line[76:78]),
                    charge=str(line[78:80]))

    with open(pdb_file, 'r') as pdb:
        lines = []
        for line in pdb:
            if len(line) > 6 and line[:6] in ['ATOM  ', 'HETATM']:
                lines += [pdb_line(line)]
    pdb_atoms = pd.DataFrame(lines)
    pdb_atoms = pdb_atoms[['recname', 'serial', 'name', 'altLoc',
                           'resname', 'chainID', 'resSeq', 'iCode',
                           'x', 'y', 'z', 'occupancy', 'tempFactor',
                           'element', 'charge']]
    return pdb_atoms
    
def writePDB(atoms,pdb_file):
    '''Reads a pandas DataFrame of atoms and outputs a pdb file'''
    with open(pdb_file, 'w+') as pdb:
        for i, atom in atoms.iterrows():
            pdb_line = f'{atom.recname:<6}{atom.serial:>5} {atom["name"]:^4}{atom.altLoc:1}'+\
                       f'{atom.resname:<3} {atom.chainID:1}{atom.resSeq:>4}{atom.iCode:1}   '+\
                       f'{atom.x:>8.3f}{atom.y:>8.3f}{atom.z:>8.3f}' +\
                       f'{atom.occupancy:>6.2f}{atom.occupancy:>6.2f}'+' ' * 10 +\
                       f'{atom.element:>2}{atom.charge:>2}'
            assert len(pdb_line) == 80, f'An item in the atom table is longer than expected ({len(pdb_line)})\n{pdb_line}'
            pdb.write(pdb_line + '\n')


def parseConfigTable(config_section):
    """Parses a section of the configuration file as a table"""

    def readData(config_section, a):
        """Filters comments and returns values as a list"""
        temp = config_section.get(a).split('#')[0].split()
        l = []
        for val in temp:
            val = val.strip()
            try:
                x = int(val)
                l += [x]
            except ValueError:
                try:
                    y = float(val)
                    l += [y]
                except ValueError:
                    l += [val]
        return l

    data = []
    for a in config_section:
        if a == 'name':
            columns = readData(config_section, a)
        elif len(a) > 3 and a[:3] == 'row':
            data += [readData(config_section, a)]
        else:
            logging.warning(f'Unexpected row {readData(config_section, a)}')
    return pd.DataFrame(data, columns=columns)


def copy_parameter_files():
    src = __location__ / "parameters"
    dest = '.'
    src_files = os.listdir(src)
    for file_name in src_files:
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)

def create_single_memory(fixed, memory_file="fixed.pdb", chain=-1):
    """Creates a single memory file from a openmm pdb file"""
    PDBFile.writeFile(fixed.topology, fixed.positions, open(memory_file, 'w'))
    openawsem.helperFunctions.create_single_memory(memory_file,chain)
    
def save_protein_sequence(Coarse,sequence_file='protein.seq'):
    """Saves protein sequence to a file from table"""
    protein_data=Coarse[Coarse.resname.isin(_AWSEMresidues)].copy()
    resix = (protein_data.chainID + '_' + protein_data.resSeq.astype(str))
    res_unique = resix.unique()
    protein_data['resID'] = resix.replace(dict(zip(res_unique, range(len(res_unique)))))
    protein_sequence=[r.iloc[0]['real_resname'] for i, r in protein_data.groupby('resID')]
    protein_sequence_one = [three_to_one[a] for a in protein_sequence]

    with open(sequence_file,'w+') as ps:
        ps.write(''.join(protein_sequence_one))

class BaseError(Exception):
    pass


[docs] class Protein(object):
[docs] def __init__(self, atoms, sequence, k_awsem=1): self.atoms = atoms #Include real residue name in atoms atoms = self.atoms.copy() atoms['chain_res'] = atoms['chainID'].astype(str) + '_' + atoms['resSeq'].astype(str) sel = atoms[atoms['resname'].isin(_AWSEMresidues)] resix = sel['chain_res'].unique() assert len(resix) == len(sequence), \ f'The number of residues {len(resix)} does not agree with the length of the sequence {len(sequence)}' atoms.index = atoms['chain_res'] for r, s in zip(resix, sequence): atoms.loc[r, 'real_resname'] = s atoms.index = range(len(atoms)) self.atoms = atoms protein_data = atoms[atoms.resname.isin(_AWSEMresidues)].copy() # renumber residues resix = (protein_data.chainID + '_' + protein_data.resSeq.astype(str)) res_unique = resix.unique() protein_data['resID'] = resix.replace(dict(zip(res_unique, range(len(res_unique))))) # renumber atom types atom_types_table = {'N': 'n', 'H': 'h', 'CA': 'ca', 'C': 'c', 'O': 'o', 'CB': 'cb'} protein_data['atom_list'] = protein_data['name'].replace(atom_types_table) protein_data['idx'] = protein_data.index.astype(int) self.protein_data = protein_data self.atom_lists = protein_data.pivot(index='resID', columns='atom_list', values='idx').fillna(-1).astype(int) self.n = self.atom_lists['n'].tolist() self.h = self.atom_lists['h'].tolist() self.ca = self.atom_lists['ca'].tolist() self.c = self.atom_lists['c'].tolist() self.o = self.atom_lists['o'].tolist() self.cb = self.atom_lists['cb'].tolist() self.nres = len(self.atom_lists) self.k_awsem = k_awsem self.res_type = [r.iloc[0]['resname'] for i, r in protein_data.groupby('resID')] self.chain_starts = [c.iloc[0].resID for i, c in protein_data.groupby('chainID')] self.chain_ends = [c.iloc[-1].resID for i, c in protein_data.groupby('chainID')] self.natoms = len(atoms) self.bonds = self._setup_bonds() self.seq = sequence self.resi = pd.merge(self.atoms, self.protein_data, how='left').resID.fillna(-1).astype(int).tolist() pass
def _setup_bonds(self): bonds = [] for i in range(self.nres): bonds.append((self.ca[i], self.o[i])) if not self.res_type[i] == "IGL": bonds.append((self.ca[i], self.cb[i])) if i not in self.chain_ends: bonds.append((self.ca[i], self.ca[i + 1])) bonds.append((self.o[i], self.ca[i + 1])) for i in range(self.nres): if i not in self.chain_starts and not self.res_type[i] == "IGL": bonds.append((self.n[i], self.cb[i])) if i not in self.chain_ends and not self.res_type[i] == "IGL": bonds.append((self.c[i], self.cb[i])) if i not in self.chain_starts and i not in self.chain_ends: bonds.append((self.n[i], self.c[i])) return bonds def setup_virtual_sites(self, system, ): # set virtual sites for i in range(self.nres): if i not in self.chain_starts: n_virtual_site = ThreeParticleAverageSite(self.ca[i - 1], self.ca[i], self.o[i - 1], 0.48318, 0.70328, -0.18643) system.setVirtualSite(self.n[i], n_virtual_site) if not self.res_type[i] == "IPR": h_virtual_site = ThreeParticleAverageSite(self.ca[i - 1], self.ca[i], self.o[i - 1], 0.84100, 0.89296, -0.73389) system.setVirtualSite(self.h[i], h_virtual_site) if i not in self.chain_ends: c_virtual_site = ThreeParticleAverageSite(self.ca[i], self.ca[i + 1], self.o[i], 0.44365, 0.23520, 0.32115) # print("Virtual", c[i]) system.setVirtualSite(self.c[i], c_virtual_site) @classmethod def fromPDB(cls, pdb, pdbout='CoarseProtein.pdb'): """ Initializes a protein form a pdb, making all the atoms coarse-grained""" pass @classmethod def fromCoarsePDB(cls, pdb_file, sequence): """ Initializes the protein from an already coarse grained pdb""" atoms = parsePDB(pdb_file) return cls(atoms, sequence) def parseConfigurationFile(self): """ Parses the AWSEM configuration file to use for the topology and to set the forces""" pass def computeTopology(self): """ Compute the bonds and angles from the pdb""" pass @staticmethod def CoarseGrain(pdb_table): """ Selects AWSEM atoms from a pdb table and returns a table containing only the coarse-grained atoms for AWSEM """ protein_residues = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL'] awsem_atoms = ["N", "H", "CA", "C", "O", "CB"] # Select coarse grained atoms selection = pdb_table[pdb_table.resname.isin(protein_residues) & pdb_table.name.isin(awsem_atoms)].copy() # Remove virtual atoms at the end or begining of the chain drop_list = [] for chain in selection.chainID.unique(): sel = selection[selection.chainID == chain] drop_list += list(sel[(sel.resSeq == sel.resSeq.min()) & sel['name'].isin(['N', 'H'])].index) drop_list += list(sel[(sel.resSeq == sel.resSeq.max()) & sel['name'].isin(['C'])].index) selection = selection.drop(drop_list) # Replace resnames selection['real_resname'] = selection.resname.copy() resname = selection.resname.copy() resname[:] = 'NGP' resname[selection.resname == 'PRO'] = 'IPR' resname[selection.resname == 'GLY'] = 'IGL' selection.resname = resname # CB element is B selection.loc[selection['name'] == 'CB', 'element'] = 'B' # Reorder atoms selection.name = pd.Categorical(selection.name, awsem_atoms) selection.sort_values(['chainID', 'resSeq', 'name']) # Prepare virtual sites for c, chain in selection.groupby('chainID'): first = chain.resSeq.min() last = chain.resSeq.max() for i, residue in chain.groupby('resSeq'): idx = dict(zip(residue.name, residue.index)) pos = dict(zip(residue.name, [residue.loc[i, ['x', 'y', 'z']] for i in residue.index])) if i != first: if 'N' in idx.keys(): selection.loc[idx['N'], ['x', 'y', 'z']] = 0.48318 * pos_im['CA'] + 0.70328 * pos[ 'CA'] - 0.18643 * pos_im['O'] if 'H' in idx.keys(): selection.loc[idx['H'], ['x', 'y', 'z']] = 0.84100 * pos_im['CA'] + 0.89296 * pos[ 'CA'] - 0.73389 * pos_im['O'] if 'C' in idx.keys(): selection.loc[idx_im['C'], ['x', 'y', 'z']] = 0.44365 * pos_im['CA'] + 0.23520 * pos[ 'CA'] + 0.32115 * pos_im['O'] pos_im = pos.copy() idx_im = idx.copy() # Renumber selection['serial'] = range(len(selection)) return selection @staticmethod def write_sequence(Coarse, seq_file='protein.seq'): protein_data = Coarse[Coarse.resname.isin(_AWSEMresidues)].copy() resix = (protein_data.chainID + '_' + protein_data.resSeq.astype(str)) res_unique = resix.unique() protein_data['resID'] = resix.replace(dict(zip(res_unique, range(len(res_unique))))) protein_sequence = [r.iloc[0]['real_resname'] for i, r in protein_data.groupby('resID')] protein_sequence_one = [three_to_one[a] for a in protein_sequence] with open(seq_file, 'w+') as ps: ps.write(''.join(protein_sequence_one)) def corrected_resid(self, gap=100): """Return per-atom residue IDs with a guaranteed gap between chains. Inserts `gap` unused indices between consecutive chains so that cross-chain residue pairs always satisfy abs(resId1-resId2) > gap. Args: oa: OpenAWSEM system object. gap (int): Number of indices to leave between chains (must exceed the largest sequence-separation threshold used in energy expressions, i.e. gap > 5 for excl_term). Returns: np.ndarray[int]: Adjusted residue ID per atom (length oa.natoms). Non-protein atoms (oa.resi == -1) are returned as -1. Raises: ValueError: If residues within any chain are not contiguous. """ resi = np.array(self.resi) for start in self.chain_starts[1:][::-1]: resi += gap * (resi >= start) return resi
def addNonBondedExclusions(oa, force): cb_fixed = [x if x > 0 else y for x, y in zip(oa.cb, oa.ca)] none_cb_fixed = [i for i in range(oa.natoms) if i not in cb_fixed] for e1 in none_cb_fixed: for e2 in none_cb_fixed: if e1 > e2: continue force.addExclusion(e1, e2) for e1 in none_cb_fixed: for e2 in cb_fixed: force.addExclusion(e1, e2) for e1 in none_cb_fixed: for e2 in none_cb_fixed: if e1 > e2: continue force.addExclusion(e1, e2) for e1 in none_cb_fixed: for e2 in cb_fixed: force.addExclusion(e1, e2) def identify_terminal_residues(pdb_filename): # identify terminal residues parser = PDBParser() structure = parser.get_structure('X', pdb_filename) terminal_residues = {} for model in structure: for chain in model: residues = list(chain.get_residues()) terminal_residues[chain.id] = (residues[0].id[1], residues[-1].id[1]) return terminal_residues def line_number(): return sys._getframe(1).f_lineno def prepare_pdb(pdb_filename, chains_to_simulate, use_cis_proline=False, keepIds=False, removeHeterogens=True): # for more information about PDB Fixer, see: # http://htmlpreview.github.io/?https://raw.github.com/pandegroup/pdbfixer/master/Manual.html # fix up input pdb cleaned_pdb_filename = "%s-cleaned.pdb" % pdb_filename[:-4] input_pdb_filename = "%s-openmmawsem.pdb" % pdb_filename[:-4] fixer = PDBFixer(filename=pdb_filename) # remove unwanted chains chains = list(fixer.topology.chains()) chains_to_remove = [i for i, x in enumerate(chains) if x.id not in chains_to_simulate] fixer.removeChains(chains_to_remove) #Identify Missing Residues fixer.findMissingResidues() fixer.missingResidues = {} #Replace Nonstandard Residues fixer.findNonstandardResidues() fixer.replaceNonstandardResidues() #Remove Heterogens if removeHeterogens: fixer.removeHeterogens(keepWater=False) #Add Missing Heavy Atoms fixer.findMissingAtoms() fixer.addMissingAtoms() #Add Missing Hydrogens fixer.addMissingHydrogens(7.0) PDBFile.writeFile(fixer.topology, fixer.positions, open(cleaned_pdb_filename, 'w'), keepIds=keepIds) #Read sequence structure = PDBParser().get_structure('X', cleaned_pdb_filename) # identify terminal residues terminal_residues = identify_terminal_residues(cleaned_pdb_filename) # process pdb for input into OpenMM #Selects only atoms needed for the awsem topology output = open(input_pdb_filename, 'w') counter=0 for line in open(cleaned_pdb_filename): # print(line) splitline = line.split() if len(line)>4 and line[0:4] == "ATOM": try: atom_index=line[6:11].strip() atom_type=line[12:16].strip() res_type=line[17:20].strip() chain=line[21].strip() res_index=line[22:26].strip() x=line[30:38].strip() y=line[38:46].strip() z=line[46:54].strip() element = line[76:78].strip() except ValueError: print(line) raise else: continue awsem_atoms = ["CA", "O", "CB", "C", "H", "N"] if int(res_index) == terminal_residues[chain][0]: awsem_atoms.remove("N") awsem_atoms.remove("H") if int(res_index) == terminal_residues[chain][1]: awsem_atoms.remove("C") # GLY should not has CB. if res_type == "GLY": awsem_atoms.remove("CB") if atom_type in awsem_atoms: line=list(line) if res_type == "GLY": line[17:20] = "IGL" elif res_type == "PRO": line[17:20] = "IPR" else: line[17:20] = "NGP" if atom_type == "CB": line[77] = "B" line=''.join(line) output.write(line) counter+=1 #print("The system contains %i atoms"%counter) output.close() #Fix Virtual Site Coordinates: # prepare_virtual_sites(input_pdb_filename, use_cis_proline=use_cis_proline) prepare_virtual_sites_v2(input_pdb_filename, use_cis_proline=use_cis_proline) return input_pdb_filename, cleaned_pdb_filename def prepare_virtual_sites_v2(pdb_file, use_cis_proline=False): parser = PDBParser(QUIET=True) structure=parser.get_structure('X',pdb_file,) res = list(structure.get_residues()) model = structure[0] output_file = pdb_file f = open(pdb_file) all_lines = f.readlines() f.close() output = open(output_file, "w") index = 0 for line in all_lines: # print(line) splitline = line.split() if len(line)>4 and line[0:4] == "ATOM": try: atom_index=line[6:11].strip() atom_type=line[12:16].strip() res_type=line[17:20].strip() chain=line[21].strip() res_index=line[22:26].strip() x=line[30:38].strip() y=line[38:46].strip() z=line[46:54].strip() element = line[76:78].strip() except ValueError: print(line) raise else: continue res_index = int(res_index) try: r_im = model[chain][res_index-1] except KeyError: r_im = model[chain][res_index] # won't be used r_i = model[chain][res_index] try: r_ip = model[chain][res_index+1] except KeyError: r_ip = model[chain][res_index] # won't be used if use_cis_proline and res_type == "IPR": n_coord = -0.2094*r_im['CA'].get_coord()+ 0.6908*r_i['CA'].get_coord() + 0.5190*r_im['O'].get_coord() c_coord = 0.2196*r_i['CA'].get_coord()+ 0.2300*r_ip['CA'].get_coord() + 0.5507*r_i['O'].get_coord() h_coord = -0.9871*r_im['CA'].get_coord()+ 0.9326*r_i['CA'].get_coord() + 1.0604*r_im['O'].get_coord() else: n_coord = 0.48318*r_im['CA'].get_coord()+ 0.70328*r_i['CA'].get_coord()- 0.18643 *r_im['O'].get_coord() c_coord = 0.44365*r_i['CA'].get_coord()+ 0.23520*r_ip['CA'].get_coord()+ 0.32115 *r_i['O'].get_coord() h_coord = 0.84100*r_im['CA'].get_coord()+ 0.89296*r_i['CA'].get_coord()- 0.73389 *r_im['O'].get_coord() line_list=list(line) index += 1 line_list[6:11] = "{:5d}".format(index) if atom_type == "N": line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(n_coord[0])) line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(n_coord[1])) line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(n_coord[2]))) if atom_type == "C": line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(c_coord[0])) line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(c_coord[1])) line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(c_coord[2]))) if atom_type == "H": line_list[30:38] = '{:.8s}'.format('{:8.3f}'.format(h_coord[0])) line_list[38:46] = '{:.8s}'.format('{:8.3f}'.format(h_coord[1])) line_list[46:54] = list('{:.8s}'.format('{:8.3f}'.format(h_coord[2]))) new_line=''.join(line_list) output.write(new_line) index += 1 line_list[6:11] = "{:5d}".format(index) line_list[0:4] = "TER " line_list[30:78] = " " * 48 new_line=''.join(line_list) output.write(new_line) output.write("END\n") output.close() def prepare_virtual_sites(pdb_file, use_cis_proline=False): parser = PDBParser(QUIET=True) structure=parser.get_structure('X',pdb_file,) for model in structure: for chain in model: r_im={} r_i={} for residue in chain: r_im=r_i r_i={} for atom in residue: r_i[atom.get_name()]=atom if use_cis_proline and residue.get_resname() == "IPR": if 'N' in r_i: r_i['N'].set_coord(-0.2094*r_im['CA'].get_coord()+ 0.6908*r_i['CA'].get_coord() + 0.5190*r_im['O'].get_coord()) if 'C' in r_im: r_im['C'].set_coord(0.2196*r_im['CA'].get_coord()+ 0.2300*r_i['CA'].get_coord() + 0.5507*r_im['O'].get_coord()) if 'H' in r_i: r_i['H'].set_coord(-0.9871*r_im['CA'].get_coord()+ 0.9326*r_i['CA'].get_coord() + 1.0604*r_im['O'].get_coord()) else: if 'N' in r_i: r_i['N'].set_coord(0.48318*r_im['CA'].get_coord()+ 0.70328*r_i['CA'].get_coord()- 0.18643 *r_im['O'].get_coord()) if 'C' in r_im: r_im['C'].set_coord(0.44365*r_im['CA'].get_coord()+ 0.23520*r_i['CA'].get_coord()+ 0.32115 *r_im['O'].get_coord()) if 'H' in r_i: r_i['H'].set_coord(0.84100*r_im['CA'].get_coord()+ 0.89296*r_i['CA'].get_coord()- 0.73389 *r_im['O'].get_coord()) io = PDBIO() io.set_structure(structure) io.save(pdb_file) def build_lists_of_atoms(nres, residues): # build lists of atoms, residue types, and bonds atom_types=['n', 'h', 'ca', 'c', 'o', 'cb'] res_types=[] atom_lists=dict(zip(atom_types,[[] for i in range(len(atom_types))])) for residue in residues: res_types.append(residue.name) atom_types=['n', 'h', 'ca', 'c', 'o', 'cb'] residue_atoms = [x.index for x in residue.atoms()] if residue.index == 0: atom_types.remove('n') atom_lists['n'].append(-1) atom_types.remove('h') atom_lists['h'].append(-1) if residue.index == nres-1: atom_types.remove('c') atom_lists['c'].append(-1) pass if residue.name == "IGL" and 'cb' in atom_types: atom_types.remove('cb') atom_lists['cb'].append(-1) if residue.name in "IPR" and 'h' in atom_types: atom_types.remove('h') atom_lists['h'].append(-1) assert len(residue_atoms)==len(atom_types), '%s\n%s'%(str(residue_atoms),str(atom_types)) atom_types=[a.name.lower() for a in residue._atoms] #Sometimes the atom order may be different for atom, atype in zip(residue_atoms, atom_types): atom_lists[atype].append(atom) return atom_lists, res_types def build_lists_of_atoms_2(nres, residues, atoms): res_id = 0 n = h = ca = c = o = cb = -1 atom_types=['n', 'h', 'ca', 'c', 'o', 'cb'] atom_types_table = {'N':'n', 'H':'h', 'CA':'ca', 'C':'c', 'O':'o', 'CB':'cb'} res_types=[] for residue in residues: res_types.append(residue.name) atom_lists=dict(zip(atom_types,[[-1]*nres for i in range(len(atom_types))])) for atom in atoms: atom_lists[atom_types_table[atom.name]][atom.residue.index] = atom.index return atom_lists, res_types def ensure_atom_order(input_pdb_filename, quiet=1): # ensure order of ['n', 'h', 'ca', 'c', 'o', 'cb'] # to be more specific, this ensure 'ca' always show up before 'c'. def first(t): return t[0] order_table = {'N':0, 'H':1, 'CA':2, 'C':3, 'O':4, 'CB':5} one_residue = [] with open("tmp.pdb", "w") as out: with open(input_pdb_filename, "r") as f: all_lines = f.readlines() pre_res_id = all_lines[0].split()[5] for line in all_lines: info = line.split() if info[0]!="ATOM": sorted_residue = sorted(one_residue, key=first) for a in sorted_residue: out.write(a[1]) one_residue = [] out.write(line) continue res_id = info[5] if res_id != pre_res_id: pre_res_id = res_id # sort them in order sorted_residue = sorted(one_residue, key=first) for a in sorted_residue: out.write(a[1]) if sorted_residue != one_residue: if not quiet: print("Reorder atom position") print("Original, Changed to") for i, t in zip(one_residue, sorted_residue): if not quiet: print(i[2], i[3], ", ", t[2], t[3]) one_residue = [] atomType = info[2] one_residue.append((order_table[atomType], line, info[1], atomType)) sorted_residue = sorted(one_residue, key=first) for a in sorted_residue: out.write(a[1]) os.system(f"mv tmp.pdb {input_pdb_filename}") def get_chain_starts_and_ends(all_res): chain_starts = [] chain_ends = [] pos = -1 for i, res in enumerate(all_res): if res.chain.index != pos: chain_starts.append(i) pos = res.chain.index if i > 0: chain_ends.append(i-1) chain_ends.append(len(all_res)-1) return chain_starts, chain_ends def setup_virtual_sites(nres, system, n, h, ca, c, o, cb, res_type, chain_starts, chain_ends, use_cis_proline=False): # set virtual sites for i in range(nres): if use_cis_proline and res_type[i] == "IPR": if i not in chain_starts: n_virtual_site = ThreeParticleAverageSite(ca[i-1], ca[i], o[i-1], -0.2094, 0.6908, 0.5190) system.setVirtualSite(n[i], n_virtual_site) if not res_type[i] == "IPR": h_virtual_site = ThreeParticleAverageSite(ca[i-1], ca[i], o[i-1], -0.9871, 0.9326, 1.0604) system.setVirtualSite(h[i], h_virtual_site) if i not in chain_ends: c_virtual_site = ThreeParticleAverageSite(ca[i], ca[i+1], o[i], 0.2196, 0.2300, 0.5507) # print("Virtual", c[i]) system.setVirtualSite(c[i], c_virtual_site) else: if i not in chain_starts: n_virtual_site = ThreeParticleAverageSite(ca[i-1], ca[i], o[i-1], 0.48318, 0.70328, -0.18643) system.setVirtualSite(n[i], n_virtual_site) if not res_type[i] == "IPR": h_virtual_site = ThreeParticleAverageSite(ca[i-1], ca[i], o[i-1], 0.84100, 0.89296, -0.73389) system.setVirtualSite(h[i], h_virtual_site) if i not in chain_ends: c_virtual_site = ThreeParticleAverageSite(ca[i], ca[i+1], o[i], 0.44365, 0.23520, 0.32115) # print("Virtual", c[i]) system.setVirtualSite(c[i], c_virtual_site) def setup_bonds(nres, n, h, ca, c, o, cb, res_type, chain_starts, chain_ends): bonds = [] for i in range(nres): bonds.append((ca[i], o[i])) if not res_type[i] == "IGL": bonds.append((ca[i], cb[i])) if i not in chain_ends: bonds.append((ca[i], ca[i+1])) bonds.append((o[i], ca[i+1])) for i in range(nres): if i not in chain_starts and not res_type[i] == "IGL": bonds.append((n[i], cb[i])) if i not in chain_ends and not res_type[i] == "IGL": bonds.append((c[i], cb[i])) if i not in chain_starts and i not in chain_ends: bonds.append((n[i], c[i])) return bonds def read_fasta(fastaFile): with open(fastaFile) as input_data: data = "" for line in input_data: if(line[0] == ">"): print(line) elif(line == "\n"): pass else: data += line.strip("\n") return data def formatResidue_ThreeLetterCodeToOne(residue): residue_name = residue.get_resname() try: oneLetter = three_to_one[residue_name] except: print(f"Unknown residue: {residue.get_full_id()}, treat as ALA") residue_name = "ALA" return three_to_one[residue_name] def getSeqFromCleanPdb(input_pdb_filename, chains='A', writeFastaFile=False): cleaned_pdb_filename = input_pdb_filename.replace("openmmawsem.pdb", "cleaned.pdb") pdb = input_pdb_filename.replace("-openmmawsem.pdb", "") fastaFile = pdb + ".fasta" s = PDBParser().get_structure("X", cleaned_pdb_filename) m = s[0] # model 0 seq = "" if writeFastaFile: with open(fastaFile, "w") as out: for chain in chains: out.write(f">{pdb.upper()}:{chain}\n") c = m[chain] chain_seq = "" for residue in c: chain_seq += formatResidue_ThreeLetterCodeToOne(residue) out.write("\n".join(textwrap.wrap(chain_seq, width=80))+"\n") seq += chain_seq else: for chain in chains: c = m[chain] chain_seq = "" for residue in c: chain_seq += formatResidue_ThreeLetterCodeToOne(residue) seq += chain_seq return seq def getSeq(input_pdb_filename, chains='A', writeFastaFile=False, fromPdb=False, fromFasta=None): if fromPdb: cleaned_pdb_filename = input_pdb_filename.replace("openmmawsem.pdb", "cleaned.pdb") pdb = input_pdb_filename.replace("-openmmawsem.pdb", "") fastaFile = pdb + ".fasta" s = PDBParser().get_structure("X", cleaned_pdb_filename) m = s[0] # model 0 seq = "" if writeFastaFile: with open(fastaFile, "w") as out: for chain in chains: out.write(f">{pdb.upper()}:{chain}\n") c = m[chain] chain_seq = "" for residue in c: chain_seq += formatResidue_ThreeLetterCodeToOne(residue) out.write("\n".join(textwrap.wrap(chain_seq, width=80))+"\n") seq += chain_seq else: for chain in chains: c = m[chain] chain_seq = "" for residue in c: chain_seq += formatResidue_ThreeLetterCodeToOne(residue) seq += chain_seq elif fromFasta: seq = "" with open(fromFasta) as f: for line in f: if line[0] == ">": pass else: # print(line) seq += line.strip() return seq def fixPymolPdb(location): # location = "1r69.pdb" with open("tmp", "w") as out: with open(location, "r") as f: for line in f: info = list(line) if len(info) > 21: info[21] = "A" out.write("".join(info)) os.system(f"mv tmp {location}") def download(pdb_id): if not os.path.isfile(f"{pdb_id}.pdb"): PDBList().retrieve_pdb_file(pdb_id.lower(), pdir='.', file_format='pdb') os.rename("pdb%s.ent" % pdb_id, f"{pdb_id}.pdb")
[docs] class OpenMMAWSEMSystem:
[docs] def __init__(self, pdb_filename, chains='A', xml_filename=xml, k_awsem=1.0, seqFromPdb=None, includeLigands=False, periodic_box=None, fixed_residue_indices=[]): # read PDB self.pdb = PDBFile(str(pdb_filename)) self.forcefield = ForceField(str(xml_filename)) self.periodic_box = periodic_box self.fixed_residue_indices = fixed_residue_indices if self.fixed_residue_indices: self.fixed_atom_indices = [] for residue in self.pdb.topology.residues(): if residue.index in self.fixed_residue_indices: for atom in residue.atoms(): self.fixed_atom_indices.append(atom.index) else: self.fixed_atom_indices = [] if not includeLigands: self.system = self.forcefield.createSystem(self.pdb.topology) # define convenience variables self.nres = self.pdb.topology.getNumResidues() self.natoms = self.pdb.topology.getNumAtoms() self.residues = list(self.pdb.topology.residues()) self.resi = [x.residue.index for x in list(self.pdb.topology.atoms())] # build lists of atoms and residue types # self.atom_lists,self.res_type=build_lists_of_atoms(self.nres, self.residues) self.atom_lists,self.res_type=build_lists_of_atoms_2(self.nres, self.residues, self.pdb.topology.atoms()) if includeLigands: print(self.pdb.topology) [templates, names] = self.forcefield.generateTemplatesForUnmatchedResidues(self.pdb.topology) # a = templates[0] for a in templates: for a1 in a.atoms: # we can define the types of ligand atoms using their names. a1.type = a1.name # if a1.element.symbol == "C": # a1.type = "CA" # else: # a1.type = a1.element.symbol # a1.type = a1.element.symbol self.forcefield.registerResidueTemplate(a) res_list = list(self.pdb.topology.residues()) atom_list = list(self.pdb.topology.atoms()) protein_resNames = ["NGP", "IGL", "IPR"] DNA_resNames = ["DA", "DC", "DT", "DG"] protein_res_list = [] DNA_res_list = [] ligand_res_list = [] for res in res_list: if res.name in protein_resNames: protein_res_list.append(res) elif res.name in DNA_resNames: DNA_res_list.append(res) else: ligand_res_list.append(res) protein_atom_list = [] DNA_atom_list = [] ligand_atom_list = [] for atom in atom_list: if atom.residue.name in protein_resNames: protein_atom_list.append(atom) elif atom.residue.name in DNA_resNames: DNA_atom_list.append(atom) else: ligand_atom_list.append(atom) self.ligand_res_list = ligand_res_list self.ligand_atom_list = ligand_atom_list self.protein_atom_list = protein_atom_list self.system = self.forcefield.createSystem(self.pdb.topology) self.nres = len(protein_res_list) self.residues = protein_res_list print(f"Number of residues: {self.nres}, Number of ligands: {len(ligand_res_list)}") # self.natoms = len(protein_atom_list) self.natoms = self.pdb.topology.getNumAtoms() self.resi = [x.residue.index if x in protein_atom_list else -1 for x in atom_list] self.atom_lists,self.res_type=build_lists_of_atoms_2(self.nres, self.residues, protein_atom_list) if self.periodic_box: self.system.setDefaultPeriodicBoxVectors(Vec3(self.periodic_box[0],0,0), Vec3(0,self.periodic_box[1],0), Vec3(0,0,self.periodic_box[2])) for atom_index in self.fixed_atom_indices: self.system.setParticleMass(atom_index,0) # print(self.atom_lists,self.res_type) self.n =self.atom_lists['n'] self.h =self.atom_lists['h'] self.ca=self.atom_lists['ca'] self.c =self.atom_lists['c'] self.o =self.atom_lists['o'] self.cb=self.atom_lists['cb'] self.chain_starts, self.chain_ends = get_chain_starts_and_ends(self.residues) # setup virtual sites # if Segmentation fault in setup_virtual_sites, use ensure_atom_order setup_virtual_sites(self.nres, self.system, self.n, self.h, self.ca, self.c, self.o, self.cb, self.res_type, self.chain_starts, self.chain_ends) # setup bonds self.bonds = setup_bonds(self.nres, self.n, self.h, self.ca, self.c, self.o, self.cb, self.res_type, self.chain_starts, self.chain_ends) # identify terminal_residues # self.terminal_residues = identify_terminal_residues(pdb_filename) # set overall scaling self.k_awsem = k_awsem # keep track of force names for output purposes self.force_names = [] # save seq info if seqFromPdb is None: self.seq = getSeq(pdb_filename, chains=chains, fromPdb=True) else: self.seq = seqFromPdb
# elif seqFromPdb == 0: # self.seq = getSeq(pdb_filename, chains=chains, fromFasta=True) def addForces(self, forces): for i, (force) in enumerate(forces): self.addForce(force) force.setForceGroup(i+1) def addForce(self, force): self.system.addForce(force) def addForcesWithDefaultForceGroup(self, forces): for i, (force) in enumerate(forces): self.addForce(force) def corrected_resid(self, gap=100): """Return per-atom residue IDs with a guaranteed gap between chains. Inserts `gap` unused indices between consecutive chains so that cross-chain residue pairs always satisfy abs(resId1-resId2) > gap. Args: oa: OpenAWSEM system object. gap (int): Number of indices to leave between chains (must exceed the largest sequence-separation threshold used in energy expressions, i.e. gap > 5 for excl_term). Returns: np.ndarray[int]: Adjusted residue ID per atom (length oa.natoms). Non-protein atoms (oa.resi == -1) are returned as -1. Raises: ValueError: If residues within any chain are not contiguous. """ resi = np.array(self.resi) for start in self.chain_starts[1:][::-1]: resi += gap * (resi >= start) return resi
def read_trajectory_pdb_positions(pdb_trajectory_filename): import uuid, os pdb_trajectory_contents = open(pdb_trajectory_filename).read().split("MODEL")[1:] pdb_trajectory_contents = ['\n'.join(x.split('\n')[1:]) for x in pdb_trajectory_contents] pdb_trajectory = [] for i, pdb_contents in enumerate(pdb_trajectory_contents): temporary_file_name = str(uuid.uuid4()) temporary_pdb_file = open(temporary_file_name, 'w') temporary_pdb_file.write(pdb_contents) temporary_pdb_file.close() pdb = PDBFile(temporary_file_name) pdb_trajectory.append(pdb) os.remove(temporary_file_name) return pdb_trajectory def compute_order_parameters(openmm_awsem_pdb_file, pdb_trajectory_filename, order_parameters, platform_name='CPU', k_awsem=1.0, compute_mdtraj=False, rmsd_reference_structure=None, compute_total_energy=True, energy_columns=None, xml_filename="awsem.xml"): pdb_trajectory = read_trajectory_pdb_positions(pdb_trajectory_filename) order_parameter_values = [] for i, order_parameter in enumerate(order_parameters): order_parameter_values.append([]) oa = OpenMMAWSEMSystem(openmm_awsem_pdb_file, k_awsem=k_awsem, xml_filename=xml_filename) platform = Platform.getPlatformByName(platform_name) # OpenCL, CUDA, CPU, or Reference integrator = VerletIntegrator(2*femtoseconds) oa.addForce(order_parameter) simulation = Simulation(oa.pdb.topology, oa.system, integrator, platform) for pdb in pdb_trajectory: simulation.context.setPositions(pdb.positions) state = simulation.context.getState(getEnergy=True) order_parameter_values[i].append(state.getPotentialEnergy().value_in_unit(kilojoule_per_mole)) if compute_mdtraj: md_traj_order_parameters = compute_mdtraj_order_parmeters(pdb_trajectory_filename, rmsd_reference_structure=rmsd_reference_structure) for key, value in md_traj_order_parameters.items(): if len(value.flatten()) == len(pdb_trajectory): order_parameter_values.append(value) if compute_total_energy: order_parameter_values.append([]) for i in range(len(pdb_trajectory)): order_parameter_values[-1].append(np.sum([order_parameter_values[x-1][i] for x in energy_columns])) if not compute_mdtraj: return np.array(order_parameter_values) else: return np.array(order_parameter_values), md_traj_order_parameters def compute_mdtraj_order_parmeters(trajectory_file, rmsd_reference_structure=None): # documentation: http://mdtraj.org/1.8.0/analysis.html# trajectory = md.load(trajectory_file) return_values = [] return_value_names = [] if not rmsd_reference_structure == None: reference = md.load(rmsd_reference_structure) rmsd = md.rmsd(trajectory, reference) return_values.append(rmsd) return_value_names.append("RMSD") hydrogen_bonds = np.array([np.sum(x) for x in md.kabsch_sander(trajectory)]) return_values.append(hydrogen_bonds) return_value_names.append("HBondEnergy") ss = md.compute_dssp(trajectory) shape = ss.shape transdict = dict(zip(list(set(list(ss.flatten()))),range(len(list(set(list(ss.flatten()))))))) ss = np.array([transdict[x] for x in ss.flatten()]).reshape(shape).T return_values.append(ss) return_value_names.append("SecondaryStructure") rg = md.compute_rg(trajectory) return_values.append(rg) return_value_names.append("Rg") distances, residue_pairs = md.compute_contacts(trajectory, scheme='ca') contacts = md.geometry.squareform(distances, residue_pairs) return_values.append(contacts) return_value_names.append("Contacts") return dict(zip(return_value_names, return_values)) def plot_free_energy(order_parameter_file, labels, bins=20, discard=.2, twodlimits=[[0,1],[0,1]]): import matplotlib as mpl import scipy.ndimage mpl.rcParams['font.size'] = 24 if len(labels) > 2: print("Too many labels to plot.") return order_parameters = np.loadtxt(order_parameter_file).T data_labels = open(order_parameter_file).readlines()[0].split()[1:] data = [] for label in labels: raw_data = order_parameters[data_labels.index(label)] data_after_discard = raw_data[int(len(raw_data)*discard):] comparison_data_set_size = (1.0-discard)/2 data_set_1 = raw_data[int(len(raw_data)*discard):int(len(raw_data)*(discard+comparison_data_set_size))] data_set_2 = raw_data[int(len(raw_data)*(discard+comparison_data_set_size)):] data.append(data_after_discard) if len(labels) == 1: hist, bins = np.histogram(data[0], density=True, bins=bins) hist /= np.sum(hist) f = -np.log(hist) bin_centers = (bins[:-1]+bins[1:])/2 hist, bins = np.histogram(data_set_1, density=True, bins=bins) hist /= np.sum(hist) f_1 = -np.log(hist) bin_centers_1 = (bins[:-1]+bins[1:])/2 hist, bins = np.histogram(data_set_2, density=True, bins=bins) hist /= np.sum(hist) f_2 = -np.log(hist) bin_centers_2 = (bins[:-1]+bins[1:])/2 plt.figure(figsize=(12,8)) plt.plot(bin_centers, f, label="All data") plt.plot(bin_centers_1, f_1, label="Split data 1") plt.plot(bin_centers_2, f_2, label="Split data 2") plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) plt.title(labels[0]) plt.xlabel(labels[0]) plt.ylabel("Free Energy (kT)") if len(labels) == 2: H, xedges, yedges = np.histogram2d(data[0], data[1], bins=bins) xcenters = (xedges[:-1] + xedges[1:])/2.0 ycenters = (yedges[:-1] + yedges[1:])/2.0 H /= np.sum(H) H = -np.log(H) H -= np.min(H) fig = plt.figure(figsize=(12, 10)) ax = fig.add_subplot(111) ax.set_title("%s vs. %s" % (labels[0], labels[1])) ax.set_xlim(twodlimits[0]) ax.set_ylim(twodlimits[1]) ax.set_xlabel(labels[0]) ax.set_ylabel(labels[1]) X, Y = np.meshgrid(xcenters, ycenters) plt.contourf(X, Y, H, cmap=plt.cm.get_cmap('jet')) plt.colorbar() plt.tight_layout() plt.show() def compute_perturbed_energies(openmm_awsem_pdb_file, pdb_trajectory_filename, perturbations, order_parameter_values, platform_name='CPU', k_awsem=1.0, total_energy_column=15): pdb_trajectory = read_trajectory_pdb_positions(pdb_trajectory_filename) all_perturbed_energies = [] for i, perturbation in enumerate(perturbations): # compute new energies perturbed_energies = [] for j, energy_term in enumerate(perturbation): perturbed_energies.append([]) oa = OpenMMAWSEMSystem(openmm_awsem_pdb_file, k_awsem=k_awsem) platform = Platform.getPlatformByName(platform_name) # OpenCL, CUDA, CPU, or Reference integrator = VerletIntegrator(2*femtoseconds) oa.addForce(energy_term[1]) simulation = Simulation(oa.pdb.topology, oa.system, integrator, platform) for pdb in pdb_trajectory: simulation.context.setPositions(pdb.positions) state = simulation.context.getState(getEnergy=True) perturbed_energies[j].append(state.getPotentialEnergy().value_in_unit(kilojoule_per_mole)) perturbed_energies = np.array(perturbed_energies) total_perturbed_energies = np.sum(perturbed_energies, axis=0) # compute new total energy by subtracting original value and adding new value new_total_energy = np.array(order_parameter_values[:, total_energy_column-1]) for j, energy_term in enumerate(perturbation): new_total_energy -= order_parameter_values[:, energy_term[0]-1] new_total_energy += total_perturbed_energies all_perturbed_energies.append(new_total_energy) all_perturbed_energies = np.array(all_perturbed_energies) return all_perturbed_energies.T def pick_structures(label, conditions_string, metadata_file, reference_structure, num_snapshots=6000, order_parameter_file_name="order_parameters.txt", pdb_trajectory_filename="output.pdb"): # parse restrictions (including which file to pull columns from) # read in data (including extra files if necessary) # go through the data and filter out snapshots that do not satisfy the criteria # select a subset of the structures that satisfy the constraints def parse_conditions_string(conditions_string): conditions = [] condition_signs = [] conditions_string = conditions_string.split() for condition in conditions_string: if "gt" in condition: condition_signs.append("+") condition = condition.split("gt") if "lt" in condition: condition_signs.append("-") condition = condition.split("lt") conditions.append(condition) return conditions, condition_signs def load_all_data(metadata_file, conditions): data_files = list(np.loadtxt(metadata_file, dtype=str)[:,0]) num_files = len(data_files) num_conditions = len(conditions) # Load all data into array data_array = np.zeros((num_files, num_conditions, num_snapshots)) for i, data_file in enumerate(data_files): all_order_parameters = np.loadtxt(data_file) for j, condition in enumerate(conditions): data_array[i][j] = all_order_parameters[0:num_snapshots,int(condition[0])-1] data_array = np.swapaxes(data_array,1,2) return data_array def write_selected_pdbs(pdb_files, snapshots_in_pdb_files): structure_index = 1 selected_pdbs = open("%s.pdb" % label, 'w') for pdb_file in pdb_files: pdb_trajectory_contents = open(pdb_file).read().split("MODEL")[1:] pdb_trajectory_contents = ['\n'.join(x.split('\n')[1:]) for x in pdb_trajectory_contents] for snapshot in snapshots_in_pdb_files[pdb_files.index(pdb_file)]: selected_pdbs.write("MODEL %d\n" % structure_index) selected_pdbs.write(pdb_trajectory_contents[snapshot]) structure_index += 1 selected_pdbs.close() # Lists files_array = list(np.loadtxt(metadata_file, dtype=str)[:,0]) conditions, condition_signs = parse_conditions_string(conditions_string) data_array = load_all_data(metadata_file, conditions) # File names and parameters output_file_name = label + ".dat" structure_index = 1 # loop over data and output those points that satisfy all conditions output_file = open(output_file_name, "w") pdb_files = [] snapshots_in_pdb_files = [] # loop over files for i, data_file in enumerate(files_array): # loop over snapshots for j, snapshot in enumerate(data_array[i]): bad_condition = False # loop over conditions for k, condition in enumerate(conditions): condition_boundary = float(condition[1]) # If all conditions are satisfied, print out the data if condition_signs[k] == "+": if not data_array[i][j][k] > condition_boundary: bad_condition = True elif condition_signs[k] == "-": if not data_array[i][j][k] < condition_boundary: bad_condition = True else: print("Bad condition argument.") sys.exit() if not bad_condition: output_file.write("%d %s %s\n" % (structure_index, data_file, j+1)) pdb_file = data_file.replace(order_parameter_file_name, pdb_trajectory_filename) if not pdb_file in pdb_files: pdb_files.append(pdb_file) snapshots_in_pdb_files.append([]) snapshots_in_pdb_files[pdb_files.index(pdb_file)].append(j) structure_index += 1 output_file.close() write_selected_pdbs(pdb_files, snapshots_in_pdb_files) pymol_script = open("%s.pml" % label, 'w') pymol_script.write("load %s\n" % reference_structure) pymol_script.write("load_traj %s.pdb\n" % label) object_name = os.path.basename(reference_structure)[:-4] pymol_script.write("intra_fit %s\n" % object_name) pymol_script.write("smooth\n") pymol_script.close() def test_Protein_fromCoarsePDB(): pass def test_Protein_fromPDB(): pass def test_Protein_parseConfigurationFile(): pass def test_Protein_computeTopology(): pass