Minor patch

stracquadaniolab · Mar 7, 2024 · bc9db61 · bc9db61
1 parent 56324fa
commit bc9db61
Show file tree

Hide file tree

Showing 5 changed files with 229 additions and 82 deletions.
diff --git a/bin/mutant_maker.py b/bin/mutant_maker.py
@@ -1,16 +1,70 @@
 #!/usr/bin/env python3
+"""mutant_maker
+
+Fix PDB files, adding missing residues, atoms and hydrogens, then create mutations in desired locations   
+Usage:
+mutant_maker.py [--incsv=<incsv>] [--from-col=<col>] [--in-pdb=<pdb>] [--chain=<chain>] [--pH=<pH>]
+
+Options:
+--incsv=<incsv>    Read data from csv file
+--from-col=<col>   Select column title containing mutations
+--in-pdb=<pdb>     PDB file of protein wildtype
+--chain=<chain>    Select chain upon which to perform mutation
+--pH=<ph>          Set pH of the protein  
+"""
+
+from docopt import docopt
+import pandas as pd
+import re
 from openmm.app import PDBFile
 import sys
 from sys import stdout
 from pdbfixer import PDBFixer
 
-pdb = PDBFixer(sys.argv[1])
-pdb.addMutations(mutationChain='A', mutationResidues='19', newResidues='ALA')
-pdb.findMissingResidues()
-pdb.findNonstandardResidues()
-pdb.replaceNonstandardResidues()
-pdb.removeHeterogens(True)
-pdb.findMissingAtoms()
-pdb.addMissingAtoms()
-pdb.addMissingHydrogens(7.0)
-PDBFile.writeFile(pdb.topology, pdb.positions, open('modified_protein.pdb', 'w'))
+def get_data(csvname: str, col: str):
+    data = pd.read_csv(csvname)
+    clipped_data=pd.DataFrame(data=data, columns=[col])
+    divided_data = clipped_data.Mutation.str.extract(r'([a-zA-Z]+)([^a-zA-Z]+)([a-zA-Z]+)')
+    divided_cleaned = divided_data.to_string(header=None, index=False)
+    amino_index = {'G': 'GLY' , 'L': 'LEU', 'R': 'ARG', 'A': 'ALA', 'Y': 'TYR', 'S': 'SER', 'M': 'MET', 'I': 'ILE', 'T': 'THR', 'C': 'CYS', 'V': 'VAL', 'P': 'PRO', 'F': 'PHE', 'W': 'TRP', 'H': 'HIS', 'K': 'LYS', 'D': 'ASP', 'E': 'GLU', 'N': 'ASN', 'Q': 'GLN', ' ': '-'}
+    new_rep_codes = re.sub(r"[GLRAYSMITCVPFWHKDENQ ]", lambda x: amino_index[x.group(0)], divided_cleaned)
+    new_rep_cleaned = re.sub("--","-", new_rep_codes)
+    return new_rep_cleaned
+
+
+def clean_wildtype(pdbname: str, pH: str):
+    pH_fl = float(pH)
+    pdb = PDBFixer(pdbname)
+    pdb.findMissingResidues()
+    pdb.findNonstandardResidues()
+    pdb.replaceNonstandardResidues()
+    pdb.removeHeterogens(False)
+    pdb.findMissingAtoms()
+    pdb.addMissingAtoms()
+    pdb.addMissingHydrogens(pH_fl)
+    PDBFile.writeFile(pdb.topology, pdb.positions, open("wildtype_fixed.pdb", 'w'))
+
+
+def create_mutants(pdbname: str, new_rep_cleaned, chain: str, pH: str):
+    pH_fl = float(pH)
+    for mutant in new_rep_cleaned.splitlines():
+        mutpdb = PDBFixer(pdbname)
+        mutpdb.applyMutations([mutant], chain)
+        mutpdb.findMissingResidues()
+        mutpdb.findNonstandardResidues()
+        mutpdb.replaceNonstandardResidues()
+        mutpdb.removeHeterogens(False)
+        mutpdb.findMissingAtoms()
+        mutpdb.addMissingAtoms()
+        mutpdb.addMissingHydrogens(pH_fl)
+        PDBFile.writeFile(pdb.topology, pdb.positions, open(mutant + "_fixed.pdb", 'w'))
+
+
+def main():
+    arguments = docopt(__doc__, version='mutant_maker.py')
+    new_rep_cleaned = get_data(arguments['--incsv'], arguments['--from-col'])
+    clean_wildtype(arguments['--inpdb'], arguments['--pH'])
+    create_mutants(arguments['--inpdb'], new_rep_cleaned, arguments['--chain'], arguments['--pH'])
+
+if __name__ == '__main__':
+    main()
diff --git a/bin/openmm-minimise.py b/bin/openmm-minimise.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+"""openmm-minimise
+
+Minimise the potential energy of the wildtype protein structure and mutated variants, according to AMBER14 potentials 
+Usage:
+openmm-minimise.py [--i=<pdb>] [--max-iter=<iter>] [--tol==<tol>]
+Options:
+--i=<pdb>          Input the fixed PDB files from the Mutant Maker process
+--max-iter=<iter>  Maximum number of iterations to arrive at minimised energy
+--tol=<tol>        Tolerance level of energy change after which the protein's structure is considered minimised
+
+"""
+import logging
+from docopt import docopt
+from openmm.app import *
+from openmm import *
+from openmm.unit import *
+import sys
+from sys import stdout
+
+def get_pdb(filename: str):
+    pdb = PDBFile(filename)
+    return pdb
+
+def setup_forcefield():
+    forcefield = ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')
+    return forcefield
+
+def setup_system(pdb, forcefield):
+    system = forcefield.createSystem(pdb.topology, nonbondedMethod=NoCutoff)
+    return system
+
+def setup_simulation(pdb, system):
+    integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.001*picoseconds)
+    simulation = Simulation(pdb.topology, system, integrator)
+    simulation.context.setPositions(pdb.positions)
+    return simulation
+
+def main():
+    arguments = docopt(__doc__, version='openmm-minimise.py')
+    pdb = get_pdb(arguments['--i'])
+    forcefield = setup_forcefield()
+    system = setup_system(pdb, forcefield, arguments['--no-restraints'])
+    str_name = arguments['--i']
+    stem = str_name.replace("_fixed.pdb","")
+    #csvunfolded = str(stem + "_unfolded.csv")
+    pdbunfolded = str(stem + "_unfolded.pdb")
+    pdbfolded = str(stem + "_folded.pdb")
+    simulation = setup_simulation(pdb, system)
+    init_state = simulation.context.getState(getEnergy=True, getPositions=True)
+    logging.info("Starting potential energy = %.9f kcal/mol"
+            % init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
+    simulation.minimizeEnergy(tolerance=0.01)
+    final_state = simulation.context.getState(getEnergy=True, getPositions=True)
+    logging.info("Minimised potential energy = %.9f kcal/mol"
+            % final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
+    PDBFile.writeFile(
+        simulation.topology, final_state.getPositions(), open(pdbunfolded, "w"))
+    pdbII = get_pdb(pdbunfolded)
+    forcefield = setup_forcefield()
+    systemII = setup_system(pdbII, forcefield)
+    simulationII = setup_simulation(pdbII, systemII)
+    init_state = simulationII.context.getState(getEnergy=True, getPositions=True)
+    logging.info("Starting potential energy = %.9f kcal/mol"
+            % init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
+    simulationII.minimizeEnergy(tolerance=0.00000001)
+    final_state = simulationII.context.getState(getEnergy=True, getPositions=True)
+    logging.info("Minimised potential energy = %.9f kcal/mol"
+            % final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
+    PDBFile.writeFile(
+        simulationII.topology, final_state.getPositions(), open(pdbfolded, "w"))
+if __name__ == '__main__':
+    arguments = docopt(__doc__, version='openmm-minimise.py')
+    logging.basicConfig(stream=stem + ".out", level=logging.INFO)
+    main()
diff --git a/bin/openmm-runner.py b/bin/openmm-runner.py
@@ -1,46 +1,77 @@
 #!/usr/bin/env python3
+"""openmm-md
+
+Minimise the potential energy of the wildtype protein structure and mutated variants, according to AMBER14 potentials 
+Usage:
+openmm-md.py [--i=<pdb>] [--solv-mol=<mol>] [--temp==<temp>] [--pres=<pres>] [--nvt=<nvt>] [--npt=<npt] [--rep=<rep>]
+Options:
+--i=<pdb>          Input the fixed PDB files from the Mutant Maker process
+--solv-mol=<mol>   Number of solvent molecules (default: solvation with H2O) 
+--temp=<temp>      Temperature of MD thermostat 
+--pres=<pres>      Pressure of MD barostat (NPT MD only)
+--nvt=<nvt>        Number of NVT trajectory steps to initiate MD (each step = 1fs)
+--npt=<npt>        Number of NPT steps in principal MD trajectory (each step = 1fs)
+--rep=<rep>        Reporting rate wherefrom PDB coordinates and therodynamic conditions are outputted to trajectory (PDB) and log (CSV) files respectively
+"""
+import logging
+from docopt import docopt
 from openmm.app import *
 from openmm import *
 from openmm.unit import *
 import sys
 from sys import stdout
-from pdbfixer import PDBFixer
-
-#pdb = PDBFile(sys.argv[1])
-pdb = PDBFixer(sys.argv[1])
-pdb.applyMutations(["SER-19-ALA"], "A")
-pdb.findMissingResidues()
-pdb.findNonstandardResidues()
-pdb.replaceNonstandardResidues()
-pdb.removeHeterogens(True)
-pdb.findMissingAtoms()
-pdb.addMissingAtoms()
-pdb.addMissingHydrogens(7.0)
-forcefield = ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')
-modeller = Modeller(pdb.topology, pdb.positions)
-#modeller.deleteWater()
-#residues=modeller.addHydrogens(forcefield)
-solvmol=int(sys.argv[4])
-modeller.addSolvent(forcefield, numAdded=solvmol)
-system = forcefield.createSystem(modeller.topology, nonbondedMethod=PME, nonbondedCutoff=1.0*nanometer, constraints=HBonds)
-temp=int(sys.argv[2])
-integrator = LangevinMiddleIntegrator(temp*kelvin, 1/picosecond, 0.001*picoseconds)
-simulation = Simulation(modeller.topology, system, integrator)
-simulation.context.setPositions(modeller.positions)
-print("Minimizing energy")
-simulation.minimizeEnergy()
-print("Running NVT")
-reprate=int(sys.argv[7])
-simulation.reporters.append(PDBReporter(sys.argv[8], reprate))
-simulation.reporters.append(StateDataReporter(stdout, reprate, step=True,
-        potentialEnergy=True, temperature=True, volume=True))
-simulation.reporters.append(StateDataReporter(sys.argv[9], reprate, step=True,
+
+
+def get_pdb(filename: str):
+    pdb = PDBFile(filename)
+    return pdb
+
+def setup_forcefield():
+    forcefield = ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')
+    return forcefield
+
+def setup_modeller(pdb):
+    modeller = Modeller(pdb.topology, pdb.positions)
+    return modeller
+
+def setup_system(modeller, forcefield, solvmol: str):
+    mol=int(solvmol)
+    modeller.addSolvent(forcefield, numAdded=mol)
+    system = forcefield.createSystem(modeller.topology, nonbondedMethod=PME, nonbondedCutoff=1.0*nanometer, constraints=HBonds)
+    return system
+
+def setup_simulation(modeller, system, temp: str):
+    temp_fl = int(temp)
+    integrator = LangevinMiddleIntegrator(temp_fl*kelvin, 1/picosecond, 0.001*picoseconds)
+    simulation = Simulation(modeller.topology, system, integrator)
+    simulation.context.setPositions(modeller.positions)
+    return simulation
+
+def main():
+    arguments = docopt(__doc__, version='openmm-md.py')
+    pdb = get_pdb(arguments['--i'])
+    forcefield = setup_forcefield()
+    modeller = setup_modeller(pdb)
+    system = setup_system(modeller, forcefield, arguments['--solv-mol'])
+    simulation = setup_simulation(modeller, system, arguments['--temp'])
+    print("Minimizing energy")
+    simulation.minimizeEnergy()
+    print("Running NVT")
+    reprate = int(arguments['--rep'])
+    nvt = int(arguments['--nvt'])
+    npt = int(arguments['--npt'])
+    pres = int(arguments['--pres'])
+    temp = int(arguments['--temp'])
+    str_name = arguments['--i']
+    stem = str_name.replace("_opt.pdb", "")
+    simulation.reporters.append(PDBReporter(stem + "_MD.pdb", reprate))
+    simulation.reporters.append(StateDataReporter(stdout, reprate, step=True, potentialEnergy=True, temperature=True, volume=True))
+    simulation.reporters.append(StateDataReporter(stem + "_log.csv", reprate, step=True,
         potentialEnergy=True, temperature=True, volume=True))
-nvtsteps=int(sys.argv[5])
-pres=int(sys.argv[3])
-simulation.step(nvtsteps)
-system.addForce(MonteCarloBarostat(pres*bar, temp*kelvin))
-simulation.context.reinitialize(preserveState=True)
-print("Running NPT")
-nptsteps=int(sys.argv[6])
-simulation.step(nptsteps)
+    simulation.step(nvt)
+    system.addForce(MonteCarloBarostat(pres*bar, temp*kelvin))
+    simulation.context.reinitialize(preserveState=True)
+    print("Running NPT")
+    simulation.step(npt)
+if __name__ == '__main__':
+    main()
diff --git a/main.nf b/main.nf
@@ -1,50 +1,35 @@
 // enabling nextflow DSL v2
 nextflow.enable.dsl=2
 
-process openmm {
-
-    publishDir "${params.resultsDir}",  mode: 'copy'
-
+process pdbfixer-mutants {
     input:
+        path incsv
         path inpdb
-        val  temp
-        val  pres
-        val  solvmol
-        val  nptrun
-        val  nvtrun
-        val  reprate
     output:
-        path 'traj.pdb'
-        path 'md_log.txt', emit: md_log
-
-shell:
+        path '*_fixed.pdb', emit: fixed_pdbs
+    shell:
     """
-        openmm-runner.py $inpdb $temp $pres $solvmol $nptrun $nvtrun $reprate  traj.pdb md_log.txt
+    mutant_maker.py --incsv $incsv --from-col ${params.csv.col} --in-pdb $inpdb ${params.mutant.maker.args}
     """
-
 }
 
-process hydro_plot {
+process openmm-minimise {
     input:
-      path md_log
-      val  skipsteps
-
+       path fixed_pdbs
+    output: 
+       path '*_unfolded.pdb', emit: unfolded_pdbs
+       path '*_folded.pdb', emit: folded_pdbs
     shell:
-      """
-      plot.py $md_log $skipsteps
-      """
+    """
+    openmm-minimise.py --i $fixed_pdbs
+    """
 }
 
+
 workflow {
     inpath_ch = channel.fromPath("${params.inputFile}")
-    temp = Channel.value(params.temp)
-    pres = Channel.value(params.pres)
-    solvmol = Channel.value(params.solvmol)
-    nvtrun = Channel.value(params.nvtrun)
-    nptrun = Channel.value(params.nptrun)
-    reprate = Channel.value(params.reprate)
-    skipsteps = Channel.value(params.skipsteps)
-    openmm(inpath_ch, temp, pres, solvmol, nvtrun, nptrun, reprate)
-    hydro_plot(openmm.out.md_log, skipsteps)
+    incsv_ch = channel.fromPath("${params.inputCsv}")
+    pdbfixer-mutants(inpath_ch, incsv_ch)
+    openmm-minimise(pdbfixer-mutants.out.fixed_pdbs)
 }
 
diff --git a/nextflow.config b/nextflow.config
@@ -2,7 +2,9 @@
 params {
   inputFile  = "./testdata/2JIE.pdb"
   resultsDir = "./results/"
-}
+  inputCsv = "./testdata/tableExport-2JIEIII.csv"
+  csv.col = "Mutation"
+  mutant.maker.args = "--chain A --pH 7.5"
 
 // include basic process configuration options
 includeConfig 'conf/base.config'