Spaces:

simonduerr
/

diffdock

Runtime error

App Files Files Community

Simon Duerr committed on Oct 8, 2022

Commit

486fd8a

1 Parent(s): 3e6dce4

gradio update

Browse files

Files changed (9) hide show

app.py +461 -0
datasets/esm_embedding_preparation.py +73 -72
datasets/pdbbind.py +432 -133
datasets/process_mols.py +6 -1
examples/1a46_ligand.sdf +179 -0
examples/1a46_protein_processed.pdb +0 -0
examples/1cbr_ligand.sdf +119 -0
examples/1cbr_protein.pdb +0 -0
requirements.txt +29 -0

app.py ADDED Viewed

	@@ -0,0 +1,461 @@

+import gradio as gr
+import os
+import copy
+import os
+import torch
+import time
+from argparse import ArgumentParser, Namespace, FileType
+from rdkit.Chem import RemoveHs
+from functools import partial
+import numpy as np
+import pandas as pd
+from rdkit import RDLogger
+from rdkit.Chem import MolFromSmiles, AddHs
+from torch_geometric.loader import DataLoader
+import yaml
+from datasets.process_mols import (
+    read_molecule,
+    generate_conformer,
+    write_mol_with_coords,
+)
+from datasets.pdbbind import PDBBind
+from utils.diffusion_utils import t_to_sigma as t_to_sigma_compl, get_t_schedule
+from utils.sampling import randomize_position, sampling
+from utils.utils import get_model
+from utils.visualise import PDBFile
+from tqdm import tqdm
+from datasets.esm_embedding_preparation import esm_embedding_prep
+import subprocess
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+with open(f"workdir/paper_score_model/model_parameters.yml") as f:
+    score_model_args = Namespace(**yaml.full_load(f))
+with open(f"workdir/paper_confidence_model/model_parameters.yml") as f:
+    confidence_args = Namespace(**yaml.full_load(f))
+t_to_sigma = partial(t_to_sigma_compl, args=score_model_args)
+model = get_model(score_model_args, device, t_to_sigma=t_to_sigma, no_parallel=True)
+state_dict = torch.load(
+    f"workdir/paper_score_model/best_ema_inference_epoch_model.pt",
+    map_location=torch.device("cpu"),
+)
+model.load_state_dict(state_dict, strict=True)
+model = model.to(device)
+model.eval()
+confidence_model = get_model(
+    confidence_args,
+    device,
+    t_to_sigma=t_to_sigma,
+    no_parallel=True,
+    confidence_mode=True,
+)
+state_dict = torch.load(
+    f"workdir/paper_confidence_model/best_model_epoch75.pt",
+    map_location=torch.device("cpu"),
+)
+confidence_model.load_state_dict(state_dict, strict=True)
+confidence_model = confidence_model.to(device)
+confidence_model.eval()
+tr_schedule = get_t_schedule(inference_steps=10)
+rot_schedule = tr_schedule
+tor_schedule = tr_schedule
+print("common t schedule", tr_schedule)
+failures, skipped, confidences_list, names_list, run_times, min_self_distances_list = (
+    0,
+    0,
+    [],
+    [],
+    [],
+    [],
+)
+N = 10
+def get_pdb(pdb_code="", filepath=""):
+    if pdb_code is None or pdb_code == "":
+        try:
+            return filepath.name
+        except AttributeError as e:
+            return None
+    else:
+        os.system(f"wget -qnc https://files.rcsb.org/view/{pdb_code}.pdb")
+        return f"{pdb_code}.pdb"
+def get_ligand(smiles="", filepath=""):
+    if smiles is None or smiles == "":
+        try:
+            return filepath.name
+        except AttributeError as e:
+            return None
+    else:
+        return smiles
+def read_mol(molpath):
+    with open(molpath, "r") as fp:
+        lines = fp.readlines()
+    mol = ""
+    for l in lines:
+        mol += l
+    return mol
+def molecule(input_pdb, ligand_pdb):
+    structure = read_mol(input_pdb)
+    mol = read_mol(ligand_pdb)
+    x = (
+        """<!DOCTYPE html>
+        <html>
+        <head>
+    <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
+    <style>
+    body{
+        font-family:sans-serif
+    }
+    .mol-container {
+    width: 600px;
+    height: 600px;
+    position: relative;
+    mx-auto:0
+    }
+    .mol-container select{
+        background-image:None;
+    }
+    </style>
+    <script src="https://3Dmol.csb.pitt.edu/build/3Dmol-min.js"></script>
+    </head>
+    <body>
+     <button id="startanimation">Replay diffusion process</button>
+    <div id="container" class="mol-container"></div>
+            <script>
+               let ligand = `"""
+        + mol
+        + """`
+        let structure = `"""
+        + structure
+        + """`
+             let viewer = null;
+             $(document).ready(function () {
+                let element = $("#container");
+                let config = { backgroundColor: "white" };
+                viewer = $3Dmol.createViewer(element, config);
+                viewer.addModel( structure, "pdb" );
+                viewer.setStyle({}, {cartoon: {color: "gray"}});
+                viewer.zoomTo();
+                viewer.zoom(0.7);
+                viewer.addModelsAsFrames(ligand, "pdb");
+                viewer.animate({loop: "forward",reps: 1});
+                viewer.getModel(1).setStyle({stick:{colorscheme:"magentaCarbon"}});
+                viewer.render();
+              })
+              $("#startanimation").click(function() {
+                viewer.animate({loop: "forward",reps: 1});
+              });
+        </script>
+        </body></html>"""
+    )
+    return f"""<iframe style="width: 100%; height: 700px" name="result" allow="midi; geolocation; microphone; camera;
+    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
+    allow-scripts allow-same-origin allow-popups
+    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
+    allowpaymentrequest="" frameborder="0" srcdoc='{x}'></iframe>"""
+def esm(protein_path, out_file):
+    esm_embedding_prep(out_file, protein_path)
+    # create args object with defaults
+    os.environ["HOME"] = "esm/model_weights"
+    subprocess.call(
+        f"python esm/scripts/extract.py esm2_t33_650M_UR50D {out_file} data/esm2_output --repr_layers 33 --include per_tok",
+        shell=True,
+    )
+def update(inp, file, ligand_inp, ligand_file):
+    pdb_path = get_pdb(inp, file)
+    ligand_path = get_ligand(ligand_inp, ligand_file)
+    esm(
+        pdb_path,
+        f"data/{os.path.basename(pdb_path)}_prepared_for_esm.fasta",
+    )
+    protein_path_list = [pdb_path]
+    ligand_descriptions = [ligand_path]
+    no_random = False
+    ode = False
+    no_final_step_noise = False
+    out_dir = "results/test"
+    test_dataset = PDBBind(
+        transform=None,
+        root="",
+        protein_path_list=protein_path_list,
+        ligand_descriptions=ligand_descriptions,
+        receptor_radius=score_model_args.receptor_radius,
+        cache_path="data/cache",
+        remove_hs=score_model_args.remove_hs,
+        max_lig_size=None,
+        c_alpha_max_neighbors=score_model_args.c_alpha_max_neighbors,
+        matching=False,
+        keep_original=False,
+        popsize=score_model_args.matching_popsize,
+        maxiter=score_model_args.matching_maxiter,
+        all_atoms=score_model_args.all_atoms,
+        atom_radius=score_model_args.atom_radius,
+        atom_max_neighbors=score_model_args.atom_max_neighbors,
+        esm_embeddings_path="data/esm2_output",
+        require_ligand=True,
+        num_workers=1,
+        keep_local_structures=False,
+    )
+    test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
+    confidence_test_dataset = PDBBind(
+        transform=None,
+        root="",
+        protein_path_list=protein_path_list,
+        ligand_descriptions=ligand_descriptions,
+        receptor_radius=confidence_args.receptor_radius,
+        cache_path="data/cache",
+        remove_hs=confidence_args.remove_hs,
+        max_lig_size=None,
+        c_alpha_max_neighbors=confidence_args.c_alpha_max_neighbors,
+        matching=False,
+        keep_original=False,
+        popsize=confidence_args.matching_popsize,
+        maxiter=confidence_args.matching_maxiter,
+        all_atoms=confidence_args.all_atoms,
+        atom_radius=confidence_args.atom_radius,
+        atom_max_neighbors=confidence_args.atom_max_neighbors,
+        esm_embeddings_path="data/esm2_output",
+        require_ligand=True,
+        num_workers=1,
+    )
+    confidence_complex_dict = {d.name: d for d in confidence_test_dataset}
+    for idx, orig_complex_graph in tqdm(enumerate(test_loader)):
+        if (
+            confidence_model is not None
+            and not (
+                confidence_args.use_original_model_cache
+                or confidence_args.transfer_weights
+            )
+            and orig_complex_graph.name[0] not in confidence_complex_dict.keys()
+        ):
+            skipped += 1
+            print(
+                f"HAPPENING | The confidence dataset did not contain {orig_complex_graph.name[0]}. We are skipping this complex."
+            )
+            continue
+        try:
+            data_list = [copy.deepcopy(orig_complex_graph) for _ in range(N)]
+            randomize_position(
+                data_list,
+                score_model_args.no_torsion,
+                no_random,
+                score_model_args.tr_sigma_max,
+            )
+            pdb = None
+            lig = orig_complex_graph.mol[0]
+            visualization_list = []
+            for graph in data_list:
+                pdb = PDBFile(lig)
+                pdb.add(lig, 0, 0)
+                pdb.add(
+                    (
+                        orig_complex_graph["ligand"].pos
+                        + orig_complex_graph.original_center
+                    )
+                    .detach()
+                    .cpu(),
+                    1,
+                    0,
+                )
+                pdb.add(
+                    (graph["ligand"].pos + graph.original_center).detach().cpu(),
+                    part=1,
+                    order=1,
+                )
+                visualization_list.append(pdb)
+            start_time = time.time()
+            if confidence_model is not None and not (
+                confidence_args.use_original_model_cache
+                or confidence_args.transfer_weights
+            ):
+                confidence_data_list = [
+                    copy.deepcopy(confidence_complex_dict[orig_complex_graph.name[0]])
+                    for _ in range(N)
+                ]
+            else:
+                confidence_data_list = None
+            data_list, confidence = sampling(
+                data_list=data_list,
+                model=model,
+                inference_steps=10,
+                tr_schedule=tr_schedule,
+                rot_schedule=rot_schedule,
+                tor_schedule=tor_schedule,
+                device=device,
+                t_to_sigma=t_to_sigma,
+                model_args=score_model_args,
+                no_random=no_random,
+                ode=ode,
+                visualization_list=visualization_list,
+                confidence_model=confidence_model,
+                confidence_data_list=confidence_data_list,
+                confidence_model_args=confidence_args,
+                batch_size=1,
+                no_final_step_noise=no_final_step_noise,
+            )
+            ligand_pos = np.asarray(
+                [
+                    complex_graph["ligand"].pos.cpu().numpy()
+                    + orig_complex_graph.original_center.cpu().numpy()
+                    for complex_graph in data_list
+                ]
+            )
+            run_times.append(time.time() - start_time)
+            if confidence is not None and isinstance(
+                confidence_args.rmsd_classification_cutoff, list
+            ):
+                confidence = confidence[:, 0]
+            if confidence is not None:
+                confidence = confidence.cpu().numpy()
+                re_order = np.argsort(confidence)[::-1]
+                confidence = confidence[re_order]
+                confidences_list.append(confidence)
+                ligand_pos = ligand_pos[re_order]
+            write_dir = (
+                f'{out_dir}/index{idx}_{data_list[0]["name"][0].replace("/","-")}'
+            )
+            os.makedirs(write_dir, exist_ok=True)
+            for rank, pos in enumerate(ligand_pos):
+                mol_pred = copy.deepcopy(lig)
+                if score_model_args.remove_hs:
+                    mol_pred = RemoveHs(mol_pred)
+                if rank == 0:
+                    write_mol_with_coords(
+                        mol_pred, pos, os.path.join(write_dir, f"rank{rank+1}.sdf")
+                    )
+                write_mol_with_coords(
+                    mol_pred,
+                    pos,
+                    os.path.join(
+                        write_dir, f"rank{rank+1}_confidence{confidence[rank]:.2f}.sdf"
+                    ),
+                )
+            self_distances = np.linalg.norm(
+                ligand_pos[:, :, None, :] - ligand_pos[:, None, :, :], axis=-1
+            )
+            self_distances = np.where(
+                np.eye(self_distances.shape[2]), np.inf, self_distances
+            )
+            min_self_distances_list.append(np.min(self_distances, axis=(1, 2)))
+            filenames = []
+            if confidence is not None:
+                for rank, batch_idx in enumerate(re_order):
+                    visualization_list[batch_idx].write(
+                        os.path.join(write_dir, f"rank{rank+1}_reverseprocess.pdb")
+                    )
+                    filenames.append(
+                        os.path.join(write_dir, f"rank{rank+1}_reverseprocess.pdb")
+                    )
+            else:
+                for rank, batch_idx in enumerate(ligand_pos):
+                    visualization_list[batch_idx].write(
+                        os.path.join(write_dir, f"rank{rank+1}_reverseprocess.pdb")
+                    )
+                    filenames.append(
+                        os.path.join(write_dir, f"rank{rank+1}_reverseprocess.pdb")
+                    )
+            names_list.append(orig_complex_graph.name[0])
+        except Exception as e:
+            print("Failed on", orig_complex_graph["name"], e)
+            failures += 1
+            return None
+    labels = [f"rank {i+1}" for i in range(len(filenames))]
+    return (
+        molecule(pdb_path, filenames[0]),
+        gr.Dropdown.update(choices=labels, value="rank 1"),
+        filenames,
+        pdb_path,
+    )
+def updateView(out, filenames, pdb):
+    i = int(out.replace("rank", ""))
+    return molecule(pdb, filenames[i])
+# Gradio user interface. Components appear in the order in which they are
+# created inside the nested context managers below.
+demo = gr.Blocks()
+with demo:
+    gr.Markdown("# DiffDock")
+    gr.Markdown(
+        ">**DiffDock: Diffusion Steps, Twists, and Turns for Molecular Docking**, Corso, Gabriele and Stärk, Hannes and Jing, Bowen and Barzilay, Regina and Jaakkola, Tommi, arXiv:2210.01776  [GitHub](https://github.com/gcorso/diffdock)"
+    )
+    gr.Markdown("Runs the diffusion model `10` times with `10` inference steps")
+    # Inputs: protein on the left, ligand on the right. Each side accepts
+    # either a text entry (PDB code / SMILES) or an uploaded file.
+    with gr.Box():
+        with gr.Row():
+            with gr.Column():
+                gr.Markdown("## Protein")
+                inp = gr.Textbox(
+                    placeholder="PDB Code or upload file below", label="Input structure"
+                )
+                file = gr.File(file_count="single", label="Input PDB")
+            with gr.Column():
+                gr.Markdown("## Ligand")
+                ligand_inp = gr.Textbox(
+                    placeholder="Provide SMILES input or upload mol2/sdf file below",
+                    label="SMILES string",
+                )
+                ligand_file = gr.File(file_count="single", label="Input Ligand")
+    btn = gr.Button("Run predictions")
+    gr.Markdown("## Output")
+    # Non-visual values written by `update` and read back by `updateView`:
+    # the protein path and the list of per-rank trajectory files.
+    pdb = gr.Variable()
+    filenames = gr.Variable()
+    out = gr.Dropdown(interactive=True, label="Ranked samples")
+    mol = gr.HTML()
+    # One example row, in the same order as the four input components.
+    gr.Examples(
+        [
+            [
+                None,
+                "examples/1a46_protein_processed.pdb",
+                None,
+                "examples/1a46_ligand.sdf",
+            ]
+        ],
+        [inp, file, ligand_inp, ligand_file],
+        [mol, out],
+        # cache_examples=True,
+    )
+    # Run docking on click; the four outputs match the tuple `update` returns.
+    btn.click(
+        fn=update,
+        inputs=[inp, file, ligand_inp, ligand_file],
+        outputs=[mol, out, filenames, pdb],
+    )
+    # Re-render the viewer whenever another ranked sample is selected.
+    out.change(fn=updateView, inputs=[out, filenames, pdb], outputs=mol)
+# Starts the app as soon as this module is executed.
+demo.launch()

datasets/esm_embedding_preparation.py CHANGED Viewed

@@ -9,79 +9,80 @@ from Bio.SeqRecord import SeqRecord
 from tqdm import tqdm
 from Bio import SeqIO
-parser = ArgumentParser()
-parser.add_argument('--out_file', type=str, default="data/prepared_for_esm.fasta")
-parser.add_argument('--protein_ligand_csv', type=str, default='data/protein_ligand_example_csv.csv', help='Path to a .csv specifying the input as described in the main README')
-parser.add_argument('--protein_path', type=str, default=None, help='Path to a single PDB file. If this is not None then it will be used instead of the --protein_ligand_csv')
-args = parser.parse_args()
-biopython_parser = PDBParser()
-three_to_one = {'ALA':	'A',
-'ARG':	'R',
-'ASN':	'N',
-'ASP':	'D',
-'CYS':	'C',
-'GLN':	'Q',
-'GLU':	'E',
-'GLY':	'G',
-'HIS':	'H',
-'ILE':	'I',
-'LEU':	'L',
-'LYS':	'K',
-'MET':	'M',
-'MSE':  'M', # MSE this is almost the same AA as MET. The sulfur is just replaced by Selen
-'PHE':	'F',
-'PRO':	'P',
-'PYL':	'O',
-'SER':	'S',
-'SEC':	'U',
-'THR':	'T',
-'TRP':	'W',
-'TYR':	'Y',
-'VAL':	'V',
-'ASX':	'B',
-'GLX':	'Z',
-'XAA':	'X',
-'XLE':	'J'}
-if args.protein_path is not None:
-    file_paths = [args.protein_path]
-else:
-    df = pd.read_csv(args.protein_ligand_csv)
-    file_paths = list(set(df['protein_path'].tolist()))
-sequences = []
-ids = []
-for file_path in tqdm(file_paths):
-    structure = biopython_parser.get_structure('random_id', file_path)
-    structure = structure[0]
-    for i, chain in enumerate(structure):
-        seq = ''
-        for res_idx, residue in enumerate(chain):
-            if residue.get_resname() == 'HOH':
-                continue
-            residue_coords = []
-            c_alpha, n, c = None, None, None
-            for atom in residue:
-                if atom.name == 'CA':
-                    c_alpha = list(atom.get_vector())
-                if atom.name == 'N':
-                    n = list(atom.get_vector())
-                if atom.name == 'C':
-                    c = list(atom.get_vector())
-            if c_alpha != None and n != None and c != None:  # only append residue if it is an amino acid
-                try:
-                    seq += three_to_one[residue.get_resname()]
-                except Exception as e:
-                    seq += '-'
-                    print("encountered unknown AA: ", residue.get_resname(), ' in the complex ', file_path, '. Replacing it with a dash - .')
-        sequences.append(seq)
-        ids.append(f'{os.path.basename(file_path)}_chain_{i}')
-records = []
-for (index, seq) in zip(ids,sequences):
-    record = SeqRecord(Seq(seq), str(index))
-    record.description = ''
-    records.append(record)
-SeqIO.write(records, args.out_file, "fasta")

 from tqdm import tqdm
 from Bio import SeqIO
+def esm_embedding_prep(out_file, protein_path):
+    biopython_parser = PDBParser()
+    three_to_one = {
+        "ALA": "A",
+        "ARG": "R",
+        "ASN": "N",
+        "ASP": "D",
+        "CYS": "C",
+        "GLN": "Q",
+        "GLU": "E",
+        "GLY": "G",
+        "HIS": "H",
+        "ILE": "I",
+        "LEU": "L",
+        "LYS": "K",
+        "MET": "M",
+        "MSE": "M",  # MSE this is almost the same AA as MET. The sulfur is just replaced by Selen
+        "PHE": "F",
+        "PRO": "P",
+        "PYL": "O",
+        "SER": "S",
+        "SEC": "U",
+        "THR": "T",
+        "TRP": "W",
+        "TYR": "Y",
+        "VAL": "V",
+        "ASX": "B",
+        "GLX": "Z",
+        "XAA": "X",
+        "XLE": "J",
+    }
+    file_paths = [protein_path]
+    sequences = []
+    ids = []
+    for file_path in tqdm(file_paths):
+        structure = biopython_parser.get_structure("random_id", file_path)
+        structure = structure[0]
+        for i, chain in enumerate(structure):
+            seq = ""
+            for res_idx, residue in enumerate(chain):
+                if residue.get_resname() == "HOH":
+                    continue
+                residue_coords = []
+                c_alpha, n, c = None, None, None
+                for atom in residue:
+                    if atom.name == "CA":
+                        c_alpha = list(atom.get_vector())
+                    if atom.name == "N":
+                        n = list(atom.get_vector())
+                    if atom.name == "C":
+                        c = list(atom.get_vector())
+                if (
+                    c_alpha != None and n != None and c != None
+                ):  # only append residue if it is an amino acid
+                    try:
+                        seq += three_to_one[residue.get_resname()]
+                    except Exception as e:
+                        seq += "-"
+                        print(
+                            "encountered unknown AA: ",
+                            residue.get_resname(),
+                            " in the complex ",
+                            file_path,
+                            ". Replacing it with a dash - .",
+                        )
+            sequences.append(seq)
+            ids.append(f"{os.path.basename(file_path)}_chain_{i}")
+    records = []
+    for (index, seq) in zip(ids, sequences):
+        record = SeqRecord(Seq(seq), str(index))
+        record.description = ""
+        records.append(record)
+    SeqIO.write(records, out_file, "fasta")

datasets/pdbbind.py CHANGED Viewed

@@ -16,8 +16,15 @@ from torch_geometric.loader import DataLoader, DataListLoader
 from torch_geometric.transforms import BaseTransform
 from tqdm import tqdm
-from datasets.process_mols import read_molecule, get_rec_graph, generate_conformer, \
-    get_lig_graph_with_matching, extract_receptor_structure, parse_receptor, parse_pdb_from_path
 from utils.diffusion_utils import modify_conformer, set_time
 from utils.utils import read_strings_from_txt
 from utils import so3, torus
@@ -34,32 +41,87 @@ class NoiseTransform(BaseTransform):
         t_tr, t_rot, t_tor = t, t, t
         return self.apply_noise(data, t_tr, t_rot, t_tor)
-    def apply_noise(self, data, t_tr, t_rot, t_tor, tr_update = None, rot_update=None, torsion_updates=None):
-        if not torch.is_tensor(data['ligand'].pos):
-            data['ligand'].pos = random.choice(data['ligand'].pos)
         tr_sigma, rot_sigma, tor_sigma = self.t_to_sigma(t_tr, t_rot, t_tor)
         set_time(data, t_tr, t_rot, t_tor, 1, self.all_atom, device=None)
-        tr_update = torch.normal(mean=0, std=tr_sigma, size=(1, 3)) if tr_update is None else tr_update
         rot_update = so3.sample_vec(eps=rot_sigma) if rot_update is None else rot_update
-        torsion_updates = np.random.normal(loc=0.0, scale=tor_sigma, size=data['ligand'].edge_mask.sum()) if torsion_updates is None else torsion_updates
         torsion_updates = None if self.no_torsion else torsion_updates
-        modify_conformer(data, tr_update, torch.from_numpy(rot_update).float(), torsion_updates)
-        data.tr_score = -tr_update / tr_sigma ** 2
-        data.rot_score = torch.from_numpy(so3.score_vec(vec=rot_update, eps=rot_sigma)).float().unsqueeze(0)
-        data.tor_score = None if self.no_torsion else torch.from_numpy(torus.score(torsion_updates, tor_sigma)).float()
-        data.tor_sigma_edge = None if self.no_torsion else np.ones(data['ligand'].edge_mask.sum()) * tor_sigma
         return data
 class PDBBind(Dataset):
-    def __init__(self, root, transform=None, cache_path='data/cache', split_path='data/', limit_complexes=0,
-                 receptor_radius=30, num_workers=1, c_alpha_max_neighbors=None, popsize=15, maxiter=15,
-                 matching=True, keep_original=False, max_lig_size=None, remove_hs=False, num_conformers=1, all_atoms=False,
-                 atom_radius=5, atom_max_neighbors=None, esm_embeddings_path=None, require_ligand=False,
-                 ligands_list=None, protein_path_list=None, ligand_descriptions=None, keep_local_structures=False):
         super(PDBBind, self).__init__(root, transform)
         self.pdbbind_dir = root
@@ -75,37 +137,67 @@ class PDBBind(Dataset):
         self.protein_path_list = protein_path_list
         self.ligand_descriptions = ligand_descriptions
         self.keep_local_structures = keep_local_structures
-        if matching or protein_path_list is not None and ligand_descriptions is not None:
-            cache_path += '_torsion'
         if all_atoms:
-            cache_path += '_allatoms'
-        self.full_cache_path = os.path.join(cache_path, f'limit{self.limit_complexes}'
-                                                        f'_INDEX{os.path.splitext(os.path.basename(self.split_path))[0]}'
-                                                        f'_maxLigSize{self.max_lig_size}_H{int(not self.remove_hs)}'
-                                                        f'_recRad{self.receptor_radius}_recMax{self.c_alpha_max_neighbors}'
-                                            + ('' if not all_atoms else f'_atomRad{atom_radius}_atomMax{atom_max_neighbors}')
-                                            + ('' if not matching or num_conformers == 1 else f'_confs{num_conformers}')
-                                            + ('' if self.esm_embeddings_path is None else f'_esmEmbeddings')
-                                            + ('' if not keep_local_structures else f'_keptLocalStruct')
-                                            + ('' if protein_path_list is None or ligand_descriptions is None else str(binascii.crc32(''.join(ligand_descriptions + protein_path_list).encode()))))
         self.popsize, self.maxiter = popsize, maxiter
         self.matching, self.keep_original = matching, keep_original
         self.num_conformers = num_conformers
         self.all_atoms = all_atoms
         self.atom_radius, self.atom_max_neighbors = atom_radius, atom_max_neighbors
-        if not os.path.exists(os.path.join(self.full_cache_path, "heterographs.pkl"))\
-                or (require_ligand and not os.path.exists(os.path.join(self.full_cache_path, "rdkit_ligands.pkl"))):
             os.makedirs(self.full_cache_path, exist_ok=True)
             if protein_path_list is None or ligand_descriptions is None:
                 self.preprocessing()
             else:
                 self.inference_preprocessing()
-        print('loading data from memory: ', os.path.join(self.full_cache_path, "heterographs.pkl"))
-        with open(os.path.join(self.full_cache_path, "heterographs.pkl"), 'rb') as f:
             self.complex_graphs = pickle.load(f)
         if require_ligand:
-            with open(os.path.join(self.full_cache_path, "rdkit_ligands.pkl"), 'rb') as f:
                 self.rdkit_ligands = pickle.load(f)
         print_statistics(self.complex_graphs)
@@ -122,18 +214,20 @@ class PDBBind(Dataset):
             return copy.deepcopy(self.complex_graphs[idx])
     def preprocessing(self):
-        print(f'Processing complexes from [{self.split_path}] and saving it to [{self.full_cache_path}]')
         complex_names_all = read_strings_from_txt(self.split_path)
         if self.limit_complexes is not None and self.limit_complexes != 0:
-            complex_names_all = complex_names_all[:self.limit_complexes]
-        print(f'Loading {len(complex_names_all)} complexes.')
         if self.esm_embeddings_path is not None:
             id_to_embeddings = torch.load(self.esm_embeddings_path)
             chain_embeddings_dictlist = defaultdict(list)
             for key, embedding in id_to_embeddings.items():
-                key_name = key.split('_')[0]
                 if key_name in complex_names_all:
                     chain_embeddings_dictlist[key_name].append(embedding)
             lm_embeddings_chains_all = []
@@ -144,58 +238,98 @@ class PDBBind(Dataset):
         if self.num_workers > 1:
             # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
-            for i in range(len(complex_names_all)//1000+1):
-                if os.path.exists(os.path.join(self.full_cache_path, f"heterographs{i}.pkl")):
                     continue
-                complex_names = complex_names_all[1000*i:1000*(i+1)]
-                lm_embeddings_chains = lm_embeddings_chains_all[1000*i:1000*(i+1)]
                 complex_graphs, rdkit_ligands = [], []
                 if self.num_workers > 1:
                     p = Pool(self.num_workers, maxtasksperchild=1)
                     p.__enter__()
-                with tqdm(total=len(complex_names), desc=f'loading complexes {i}/{len(complex_names_all)//1000+1}') as pbar:
                     map_fn = p.imap_unordered if self.num_workers > 1 else map
-                    for t in map_fn(self.get_complex, zip(complex_names, lm_embeddings_chains, [None] * len(complex_names), [None] * len(complex_names))):
                         complex_graphs.extend(t[0])
                         rdkit_ligands.extend(t[1])
                         pbar.update()
-                if self.num_workers > 1: p.__exit__(None, None, None)
-                with open(os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), 'wb') as f:
                     pickle.dump((complex_graphs), f)
-                with open(os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), 'wb') as f:
                     pickle.dump((rdkit_ligands), f)
             complex_graphs_all = []
-            for i in range(len(complex_names_all)//1000+1):
-                with open(os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), 'rb') as f:
                     l = pickle.load(f)
                     complex_graphs_all.extend(l)
-            with open(os.path.join(self.full_cache_path, f"heterographs.pkl"), 'wb') as f:
                 pickle.dump((complex_graphs_all), f)
             rdkit_ligands_all = []
             for i in range(len(complex_names_all) // 1000 + 1):
-                with open(os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), 'rb') as f:
                     l = pickle.load(f)
                     rdkit_ligands_all.extend(l)
-            with open(os.path.join(self.full_cache_path, f"rdkit_ligands.pkl"), 'wb') as f:
                 pickle.dump((rdkit_ligands_all), f)
         else:
             complex_graphs, rdkit_ligands = [], []
-            with tqdm(total=len(complex_names_all), desc='loading complexes') as pbar:
-                for t in map(self.get_complex, zip(complex_names_all, lm_embeddings_chains_all, [None] * len(complex_names_all), [None] * len(complex_names_all))):
                     complex_graphs.extend(t[0])
                     rdkit_ligands.extend(t[1])
                     pbar.update()
-            with open(os.path.join(self.full_cache_path, "heterographs.pkl"), 'wb') as f:
                 pickle.dump((complex_graphs), f)
-            with open(os.path.join(self.full_cache_path, "rdkit_ligands.pkl"), 'wb') as f:
                 pickle.dump((rdkit_ligands), f)
     def inference_preprocessing(self):
         ligands_list = []
-        print('Reading molecules and generating local structures with RDKit')
         for ligand_description in tqdm(self.ligand_descriptions):
             mol = MolFromSmiles(ligand_description)  # check if it is a smiles or a path
             if mol is not None:
@@ -211,70 +345,126 @@ class PDBBind(Dataset):
                 ligands_list.append(mol)
         if self.esm_embeddings_path is not None:
-            print('Reading language model embeddings.')
             lm_embeddings_chains_all = []
-            if not os.path.exists(self.esm_embeddings_path): raise Exception('ESM embeddings path does not exist: ',self.esm_embeddings_path)
             for protein_path in self.protein_path_list:
-                embeddings_paths = sorted(glob.glob(os.path.join(self.esm_embeddings_path, os.path.basename(protein_path)) + '*'))
                 lm_embeddings_chains = []
                 for embeddings_path in embeddings_paths:
-                    lm_embeddings_chains.append(torch.load(embeddings_path)['representations'][33])
                 lm_embeddings_chains_all.append(lm_embeddings_chains)
         else:
             lm_embeddings_chains_all = [None] * len(self.protein_path_list)
-        print('Generating graphs for ligands and proteins')
         if self.num_workers > 1:
             # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
-            for i in range(len(self.protein_path_list)//1000+1):
-                if os.path.exists(os.path.join(self.full_cache_path, f"heterographs{i}.pkl")):
                     continue
-                protein_paths_chunk = self.protein_path_list[1000*i:1000*(i+1)]
-                ligand_description_chunk = self.ligand_descriptions[1000*i:1000*(i+1)]
-                ligands_chunk = ligands_list[1000 * i:1000 * (i + 1)]
-                lm_embeddings_chains = lm_embeddings_chains_all[1000*i:1000*(i+1)]
                 complex_graphs, rdkit_ligands = [], []
                 if self.num_workers > 1:
                     p = Pool(self.num_workers, maxtasksperchild=1)
                     p.__enter__()
-                with tqdm(total=len(protein_paths_chunk), desc=f'loading complexes {i}/{len(protein_paths_chunk)//1000+1}') as pbar:
                     map_fn = p.imap_unordered if self.num_workers > 1 else map
-                    for t in map_fn(self.get_complex, zip(protein_paths_chunk, lm_embeddings_chains, ligands_chunk,ligand_description_chunk)):
                         complex_graphs.extend(t[0])
                         rdkit_ligands.extend(t[1])
                         pbar.update()
-                if self.num_workers > 1: p.__exit__(None, None, None)
-                with open(os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), 'wb') as f:
                     pickle.dump((complex_graphs), f)
-                with open(os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), 'wb') as f:
                     pickle.dump((rdkit_ligands), f)
             complex_graphs_all = []
-            for i in range(len(self.protein_path_list)//1000+1):
-                with open(os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), 'rb') as f:
                     l = pickle.load(f)
                     complex_graphs_all.extend(l)
-            with open(os.path.join(self.full_cache_path, f"heterographs.pkl"), 'wb') as f:
                 pickle.dump((complex_graphs_all), f)
             rdkit_ligands_all = []
             for i in range(len(self.protein_path_list) // 1000 + 1):
-                with open(os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), 'rb') as f:
                     l = pickle.load(f)
                     rdkit_ligands_all.extend(l)
-            with open(os.path.join(self.full_cache_path, f"rdkit_ligands.pkl"), 'wb') as f:
                 pickle.dump((rdkit_ligands_all), f)
         else:
             complex_graphs, rdkit_ligands = [], []
-            with tqdm(total=len(self.protein_path_list), desc='loading complexes') as pbar:
-                for t in map(self.get_complex, zip(self.protein_path_list, lm_embeddings_chains_all, ligands_list, self.ligand_descriptions)):
                     complex_graphs.extend(t[0])
                     rdkit_ligands.extend(t[1])
                     pbar.update()
-            with open(os.path.join(self.full_cache_path, "heterographs.pkl"), 'wb') as f:
                 pickle.dump((complex_graphs), f)
-            with open(os.path.join(self.full_cache_path, "rdkit_ligands.pkl"), 'wb') as f:
                 pickle.dump((rdkit_ligands), f)
     def get_complex(self, par):
@@ -285,51 +475,94 @@ class PDBBind(Dataset):
         if ligand is not None:
             rec_model = parse_pdb_from_path(name)
-            name = f'{name}____{ligand_description}'
             ligs = [ligand]
         else:
             try:
                 rec_model = parse_receptor(name, self.pdbbind_dir)
             except Exception as e:
-                print(f'Skipping {name} because of the error:')
                 print(e)
                 return [], []
             ligs = read_mols(self.pdbbind_dir, name, remove_hs=False)
         complex_graphs = []
         for i, lig in enumerate(ligs):
-            if self.max_lig_size is not None and lig.GetNumHeavyAtoms() > self.max_lig_size:
-                print(f'Ligand with {lig.GetNumHeavyAtoms()} heavy atoms is larger than max_lig_size {self.max_lig_size}. Not including {name} in preprocessed data.')
                 continue
             complex_graph = HeteroData()
-            complex_graph['name'] = name
             try:
-                get_lig_graph_with_matching(lig, complex_graph, self.popsize, self.maxiter, self.matching, self.keep_original,
-                                            self.num_conformers, remove_hs=self.remove_hs)
-                rec, rec_coords, c_alpha_coords, n_coords, c_coords, lm_embeddings = extract_receptor_structure(copy.deepcopy(rec_model), lig, lm_embedding_chains=lm_embedding_chains)
-                if lm_embeddings is not None and len(c_alpha_coords) != len(lm_embeddings):
-                    print(f'LM embeddings for complex {name} did not have the right length for the protein. Skipping {name}.')
                     continue
-                get_rec_graph(rec, rec_coords, c_alpha_coords, n_coords, c_coords, complex_graph, rec_radius=self.receptor_radius,
-                              c_alpha_max_neighbors=self.c_alpha_max_neighbors, all_atoms=self.all_atoms,
-                              atom_radius=self.atom_radius, atom_max_neighbors=self.atom_max_neighbors, remove_hs=self.remove_hs, lm_embeddings=lm_embeddings)
             except Exception as e:
-                print(f'Skipping {name} because of the error:')
                 print(e)
                 raise e
                 continue
-            protein_center = torch.mean(complex_graph['receptor'].pos, dim=0, keepdim=True)
-            complex_graph['receptor'].pos -= protein_center
             if self.all_atoms:
-                complex_graph['atom'].pos -= protein_center
             if (not self.matching) or self.num_conformers == 1:
-                complex_graph['ligand'].pos -= protein_center
             else:
-                for p in complex_graph['ligand'].pos:
                     p -= protein_center
             complex_graph.original_center = protein_center
@@ -341,11 +574,18 @@ def print_statistics(complex_graphs):
     statistics = ([], [], [], [])
     for complex_graph in complex_graphs:
-        lig_pos = complex_graph['ligand'].pos if torch.is_tensor(complex_graph['ligand'].pos) else complex_graph['ligand'].pos[0]
-        radius_protein = torch.max(torch.linalg.vector_norm(complex_graph['receptor'].pos, dim=1))
         molecule_center = torch.mean(lig_pos, dim=0)
         radius_molecule = torch.max(
-            torch.linalg.vector_norm(lig_pos - molecule_center.unsqueeze(0), dim=1))
         distance_center = torch.linalg.vector_norm(molecule_center)
         statistics[0].append(radius_protein)
         statistics[1].append(radius_molecule)
@@ -355,52 +595,111 @@ def print_statistics(complex_graphs):
         else:
             statistics[3].append(0)
-    name = ['radius protein', 'radius molecule', 'distance protein-mol', 'rmsd matching']
-    print('Number of complexes: ', len(complex_graphs))
     for i in range(4):
         array = np.asarray(statistics[i])
-        print(f"{name[i]}: mean {np.mean(array)}, std {np.std(array)}, max {np.max(array)}")
 def construct_loader(args, t_to_sigma):
-    transform = NoiseTransform(t_to_sigma=t_to_sigma, no_torsion=args.no_torsion,
-                               all_atom=args.all_atoms)
-    common_args = {'transform': transform, 'root': args.data_dir, 'limit_complexes': args.limit_complexes,
-                   'receptor_radius': args.receptor_radius,
-                   'c_alpha_max_neighbors': args.c_alpha_max_neighbors,
-                   'remove_hs': args.remove_hs, 'max_lig_size': args.max_lig_size,
-                   'matching': not args.no_torsion, 'popsize': args.matching_popsize, 'maxiter': args.matching_maxiter,
-                   'num_workers': args.num_workers, 'all_atoms': args.all_atoms,
-                   'atom_radius': args.atom_radius, 'atom_max_neighbors': args.atom_max_neighbors,
-                   'esm_embeddings_path': args.esm_embeddings_path}
-    train_dataset = PDBBind(cache_path=args.cache_path, split_path=args.split_train, keep_original=True,
-                            num_conformers=args.num_conformers, **common_args)
-    val_dataset = PDBBind(cache_path=args.cache_path, split_path=args.split_val, keep_original=True, **common_args)
     loader_class = DataListLoader if torch.cuda.is_available() else DataLoader
-    train_loader = loader_class(dataset=train_dataset, batch_size=args.batch_size, num_workers=args.num_dataloader_workers, shuffle=True, pin_memory=args.pin_memory)
-    val_loader = loader_class(dataset=val_dataset, batch_size=args.batch_size, num_workers=args.num_dataloader_workers, shuffle=True, pin_memory=args.pin_memory)
     return train_loader, val_loader
 def read_mol(pdbbind_dir, name, remove_hs=False):
-    lig = read_molecule(os.path.join(pdbbind_dir, name, f'{name}_ligand.sdf'), remove_hs=remove_hs, sanitize=True)
     if lig is None:  # read mol2 file if sdf file cannot be sanitized
-        lig = read_molecule(os.path.join(pdbbind_dir, name, f'{name}_ligand.mol2'), remove_hs=remove_hs, sanitize=True)
     return lig
 def read_mols(pdbbind_dir, name, remove_hs=False):
     ligs = []
     for file in os.listdir(os.path.join(pdbbind_dir, name)):
-        if file.endswith(".sdf") and 'rdkit' not in file:
-            lig = read_molecule(os.path.join(pdbbind_dir, name, file), remove_hs=remove_hs, sanitize=True)
-            if lig is None and os.path.exists(os.path.join(pdbbind_dir, name, file[:-4] + ".mol2")):  # read mol2 file if sdf file cannot be sanitized
-                print('Using the .sdf file failed. We found a .mol2 file instead and are trying to use that.')
-                lig = read_molecule(os.path.join(pdbbind_dir, name, file[:-4] + ".mol2"), remove_hs=remove_hs, sanitize=True)
             if lig is not None:
                 ligs.append(lig)
-    return ligs

 from torch_geometric.transforms import BaseTransform
 from tqdm import tqdm
+from datasets.process_mols import (
+    read_molecule,
+    get_rec_graph,
+    generate_conformer,
+    get_lig_graph_with_matching,
+    extract_receptor_structure,
+    parse_receptor,
+    parse_pdb_from_path,
+)
 from utils.diffusion_utils import modify_conformer, set_time
 from utils.utils import read_strings_from_txt
 from utils import so3, torus
         t_tr, t_rot, t_tor = t, t, t
         return self.apply_noise(data, t_tr, t_rot, t_tor)
+    def apply_noise(
+        self,
+        data,
+        t_tr,
+        t_rot,
+        t_tor,
+        tr_update=None,
+        rot_update=None,
+        torsion_updates=None,
+    ):
+        if not torch.is_tensor(data["ligand"].pos):
+            data["ligand"].pos = random.choice(data["ligand"].pos)
         tr_sigma, rot_sigma, tor_sigma = self.t_to_sigma(t_tr, t_rot, t_tor)
         set_time(data, t_tr, t_rot, t_tor, 1, self.all_atom, device=None)
+        tr_update = (
+            torch.normal(mean=0, std=tr_sigma, size=(1, 3))
+            if tr_update is None
+            else tr_update
+        )
         rot_update = so3.sample_vec(eps=rot_sigma) if rot_update is None else rot_update
+        torsion_updates = (
+            np.random.normal(
+                loc=0.0, scale=tor_sigma, size=data["ligand"].edge_mask.sum()
+            )
+            if torsion_updates is None
+            else torsion_updates
+        )
         torsion_updates = None if self.no_torsion else torsion_updates
+        modify_conformer(
+            data, tr_update, torch.from_numpy(rot_update).float(), torsion_updates
+        )
+        data.tr_score = -tr_update / tr_sigma**2
+        data.rot_score = (
+            torch.from_numpy(so3.score_vec(vec=rot_update, eps=rot_sigma))
+            .float()
+            .unsqueeze(0)
+        )
+        data.tor_score = (
+            None
+            if self.no_torsion
+            else torch.from_numpy(torus.score(torsion_updates, tor_sigma)).float()
+        )
+        data.tor_sigma_edge = (
+            None
+            if self.no_torsion
+            else np.ones(data["ligand"].edge_mask.sum()) * tor_sigma
+        )
         return data
 class PDBBind(Dataset):
+    def __init__(
+        self,
+        root,
+        transform=None,
+        cache_path="data/cache",
+        split_path="data/",
+        limit_complexes=0,
+        receptor_radius=30,
+        num_workers=1,
+        c_alpha_max_neighbors=None,
+        popsize=15,
+        maxiter=15,
+        matching=True,
+        keep_original=False,
+        max_lig_size=None,
+        remove_hs=False,
+        num_conformers=1,
+        all_atoms=False,
+        atom_radius=5,
+        atom_max_neighbors=None,
+        esm_embeddings_path=None,
+        require_ligand=False,
+        ligands_list=None,
+        protein_path_list=None,
+        ligand_descriptions=None,
+        keep_local_structures=False,
+    ):
         super(PDBBind, self).__init__(root, transform)
         self.pdbbind_dir = root
         self.protein_path_list = protein_path_list
         self.ligand_descriptions = ligand_descriptions
         self.keep_local_structures = keep_local_structures
+        if (
+            matching
+            or protein_path_list is not None
+            and ligand_descriptions is not None
+        ):
+            cache_path += "_torsion"
         if all_atoms:
+            cache_path += "_allatoms"
+        self.full_cache_path = os.path.join(
+            cache_path,
+            f"limit{self.limit_complexes}"
+            f"_INDEX{os.path.splitext(os.path.basename(self.split_path))[0]}"
+            f"_maxLigSize{self.max_lig_size}_H{int(not self.remove_hs)}"
+            f"_recRad{self.receptor_radius}_recMax{self.c_alpha_max_neighbors}"
+            + (
+                ""
+                if not all_atoms
+                else f"_atomRad{atom_radius}_atomMax{atom_max_neighbors}"
+            )
+            + ("" if not matching or num_conformers == 1 else f"_confs{num_conformers}")
+            + ("" if self.esm_embeddings_path is None else f"_esmEmbeddings")
+            + ("" if not keep_local_structures else f"_keptLocalStruct")
+            + (
+                ""
+                if protein_path_list is None or ligand_descriptions is None
+                else str(
+                    binascii.crc32(
+                        "".join(ligand_descriptions + protein_path_list).encode()
+                    )
+                )
+            ),
+        )
         self.popsize, self.maxiter = popsize, maxiter
         self.matching, self.keep_original = matching, keep_original
         self.num_conformers = num_conformers
         self.all_atoms = all_atoms
         self.atom_radius, self.atom_max_neighbors = atom_radius, atom_max_neighbors
+        if not os.path.exists(
+            os.path.join(self.full_cache_path, "heterographs.pkl")
+        ) or (
+            require_ligand
+            and not os.path.exists(
+                os.path.join(self.full_cache_path, "rdkit_ligands.pkl")
+            )
+        ):
             os.makedirs(self.full_cache_path, exist_ok=True)
             if protein_path_list is None or ligand_descriptions is None:
                 self.preprocessing()
             else:
                 self.inference_preprocessing()
+        print(
+            "loading data from memory: ",
+            os.path.join(self.full_cache_path, "heterographs.pkl"),
+        )
+        with open(os.path.join(self.full_cache_path, "heterographs.pkl"), "rb") as f:
             self.complex_graphs = pickle.load(f)
         if require_ligand:
+            with open(
+                os.path.join(self.full_cache_path, "rdkit_ligands.pkl"), "rb"
+            ) as f:
                 self.rdkit_ligands = pickle.load(f)
         print_statistics(self.complex_graphs)
             return copy.deepcopy(self.complex_graphs[idx])
     def preprocessing(self):
+        print(
+            f"Processing complexes from [{self.split_path}] and saving it to [{self.full_cache_path}]"
+        )
         complex_names_all = read_strings_from_txt(self.split_path)
         if self.limit_complexes is not None and self.limit_complexes != 0:
+            complex_names_all = complex_names_all[: self.limit_complexes]
+        print(f"Loading {len(complex_names_all)} complexes.")
         if self.esm_embeddings_path is not None:
             id_to_embeddings = torch.load(self.esm_embeddings_path)
             chain_embeddings_dictlist = defaultdict(list)
             for key, embedding in id_to_embeddings.items():
+                key_name = key.split("_")[0]
                 if key_name in complex_names_all:
                     chain_embeddings_dictlist[key_name].append(embedding)
             lm_embeddings_chains_all = []
         if self.num_workers > 1:
             # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
+            for i in range(len(complex_names_all) // 1000 + 1):
+                if os.path.exists(
+                    os.path.join(self.full_cache_path, f"heterographs{i}.pkl")
+                ):
                     continue
+                complex_names = complex_names_all[1000 * i : 1000 * (i + 1)]
+                lm_embeddings_chains = lm_embeddings_chains_all[
+                    1000 * i : 1000 * (i + 1)
+                ]
                 complex_graphs, rdkit_ligands = [], []
                 if self.num_workers > 1:
                     p = Pool(self.num_workers, maxtasksperchild=1)
                     p.__enter__()
+                with tqdm(
+                    total=len(complex_names),
+                    desc=f"loading complexes {i}/{len(complex_names_all)//1000+1}",
+                ) as pbar:
                     map_fn = p.imap_unordered if self.num_workers > 1 else map
+                    for t in map_fn(
+                        self.get_complex,
+                        zip(
+                            complex_names,
+                            lm_embeddings_chains,
+                            [None] * len(complex_names),
+                            [None] * len(complex_names),
+                        ),
+                    ):
                         complex_graphs.extend(t[0])
                         rdkit_ligands.extend(t[1])
                         pbar.update()
+                if self.num_workers > 1:
+                    p.__exit__(None, None, None)
+                with open(
+                    os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), "wb"
+                ) as f:
                     pickle.dump((complex_graphs), f)
+                with open(
+                    os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), "wb"
+                ) as f:
                     pickle.dump((rdkit_ligands), f)
             complex_graphs_all = []
+            for i in range(len(complex_names_all) // 1000 + 1):
+                with open(
+                    os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), "rb"
+                ) as f:
                     l = pickle.load(f)
                     complex_graphs_all.extend(l)
+            with open(
+                os.path.join(self.full_cache_path, f"heterographs.pkl"), "wb"
+            ) as f:
                 pickle.dump((complex_graphs_all), f)
             rdkit_ligands_all = []
             for i in range(len(complex_names_all) // 1000 + 1):
+                with open(
+                    os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), "rb"
+                ) as f:
                     l = pickle.load(f)
                     rdkit_ligands_all.extend(l)
+            with open(
+                os.path.join(self.full_cache_path, f"rdkit_ligands.pkl"), "wb"
+            ) as f:
                 pickle.dump((rdkit_ligands_all), f)
         else:
             complex_graphs, rdkit_ligands = [], []
+            with tqdm(total=len(complex_names_all), desc="loading complexes") as pbar:
+                for t in map(
+                    self.get_complex,
+                    zip(
+                        complex_names_all,
+                        lm_embeddings_chains_all,
+                        [None] * len(complex_names_all),
+                        [None] * len(complex_names_all),
+                    ),
+                ):
                     complex_graphs.extend(t[0])
                     rdkit_ligands.extend(t[1])
                     pbar.update()
+            with open(
+                os.path.join(self.full_cache_path, "heterographs.pkl"), "wb"
+            ) as f:
                 pickle.dump((complex_graphs), f)
+            with open(
+                os.path.join(self.full_cache_path, "rdkit_ligands.pkl"), "wb"
+            ) as f:
                 pickle.dump((rdkit_ligands), f)
     def inference_preprocessing(self):
         ligands_list = []
+        print("Reading molecules and generating local structures with RDKit")
         for ligand_description in tqdm(self.ligand_descriptions):
             mol = MolFromSmiles(ligand_description)  # check if it is a smiles or a path
             if mol is not None:
                 ligands_list.append(mol)
         if self.esm_embeddings_path is not None:
+            print("Reading language model embeddings.")
             lm_embeddings_chains_all = []
+            if not os.path.exists(self.esm_embeddings_path):
+                raise Exception(
+                    "ESM embeddings path does not exist: ", self.esm_embeddings_path
+                )
             for protein_path in self.protein_path_list:
+                embeddings_paths = sorted(
+                    glob.glob(
+                        os.path.join(
+                            self.esm_embeddings_path, os.path.basename(protein_path)
+                        )
+                        + "*"
+                    )
+                )
                 lm_embeddings_chains = []
                 for embeddings_path in embeddings_paths:
+                    lm_embeddings_chains.append(
+                        torch.load(embeddings_path)["representations"][33]
+                    )
                 lm_embeddings_chains_all.append(lm_embeddings_chains)
         else:
             lm_embeddings_chains_all = [None] * len(self.protein_path_list)
+        print("Generating graphs for ligands and proteins")
         if self.num_workers > 1:
             # running preprocessing in parallel on multiple workers and saving the progress every 1000 complexes
+            for i in range(len(self.protein_path_list) // 1000 + 1):
+                if os.path.exists(
+                    os.path.join(self.full_cache_path, f"heterographs{i}.pkl")
+                ):
                     continue
+                protein_paths_chunk = self.protein_path_list[1000 * i : 1000 * (i + 1)]
+                ligand_description_chunk = self.ligand_descriptions[
+                    1000 * i : 1000 * (i + 1)
+                ]
+                ligands_chunk = ligands_list[1000 * i : 1000 * (i + 1)]
+                lm_embeddings_chains = lm_embeddings_chains_all[
+                    1000 * i : 1000 * (i + 1)
+                ]
                 complex_graphs, rdkit_ligands = [], []
                 if self.num_workers > 1:
                     p = Pool(self.num_workers, maxtasksperchild=1)
                     p.__enter__()
+                with tqdm(
+                    total=len(protein_paths_chunk),
+                    desc=f"loading complexes {i}/{len(protein_paths_chunk)//1000+1}",
+                ) as pbar:
                     map_fn = p.imap_unordered if self.num_workers > 1 else map
+                    for t in map_fn(
+                        self.get_complex,
+                        zip(
+                            protein_paths_chunk,
+                            lm_embeddings_chains,
+                            ligands_chunk,
+                            ligand_description_chunk,
+                        ),
+                    ):
                         complex_graphs.extend(t[0])
                         rdkit_ligands.extend(t[1])
                         pbar.update()
+                if self.num_workers > 1:
+                    p.__exit__(None, None, None)
+                with open(
+                    os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), "wb"
+                ) as f:
                     pickle.dump((complex_graphs), f)
+                with open(
+                    os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), "wb"
+                ) as f:
                     pickle.dump((rdkit_ligands), f)
             complex_graphs_all = []
+            for i in range(len(self.protein_path_list) // 1000 + 1):
+                with open(
+                    os.path.join(self.full_cache_path, f"heterographs{i}.pkl"), "rb"
+                ) as f:
                     l = pickle.load(f)
                     complex_graphs_all.extend(l)
+            with open(
+                os.path.join(self.full_cache_path, f"heterographs.pkl"), "wb"
+            ) as f:
                 pickle.dump((complex_graphs_all), f)
             rdkit_ligands_all = []
             for i in range(len(self.protein_path_list) // 1000 + 1):
+                with open(
+                    os.path.join(self.full_cache_path, f"rdkit_ligands{i}.pkl"), "rb"
+                ) as f:
                     l = pickle.load(f)
                     rdkit_ligands_all.extend(l)
+            with open(
+                os.path.join(self.full_cache_path, f"rdkit_ligands.pkl"), "wb"
+            ) as f:
                 pickle.dump((rdkit_ligands_all), f)
         else:
             complex_graphs, rdkit_ligands = [], []
+            with tqdm(
+                total=len(self.protein_path_list), desc="loading complexes"
+            ) as pbar:
+                for t in map(
+                    self.get_complex,
+                    zip(
+                        self.protein_path_list,
+                        lm_embeddings_chains_all,
+                        ligands_list,
+                        self.ligand_descriptions,
+                    ),
+                ):
                     complex_graphs.extend(t[0])
                     rdkit_ligands.extend(t[1])
                     pbar.update()
+            with open(
+                os.path.join(self.full_cache_path, "heterographs.pkl"), "wb"
+            ) as f:
                 pickle.dump((complex_graphs), f)
+            with open(
+                os.path.join(self.full_cache_path, "rdkit_ligands.pkl"), "wb"
+            ) as f:
                 pickle.dump((rdkit_ligands), f)
     def get_complex(self, par):
         if ligand is not None:
             rec_model = parse_pdb_from_path(name)
+            name = f"{name}____{ligand_description}"
             ligs = [ligand]
         else:
             try:
                 rec_model = parse_receptor(name, self.pdbbind_dir)
             except Exception as e:
+                print(f"Skipping {name} because of the error:")
                 print(e)
                 return [], []
             ligs = read_mols(self.pdbbind_dir, name, remove_hs=False)
         complex_graphs = []
         for i, lig in enumerate(ligs):
+            if (
+                self.max_lig_size is not None
+                and lig.GetNumHeavyAtoms() > self.max_lig_size
+            ):
+                print(
+                    f"Ligand with {lig.GetNumHeavyAtoms()} heavy atoms is larger than max_lig_size {self.max_lig_size}. Not including {name} in preprocessed data."
+                )
                 continue
             complex_graph = HeteroData()
+            complex_graph["name"] = name
             try:
+                get_lig_graph_with_matching(
+                    lig,
+                    complex_graph,
+                    self.popsize,
+                    self.maxiter,
+                    self.matching,
+                    self.keep_original,
+                    self.num_conformers,
+                    remove_hs=self.remove_hs,
+                )
+                print(lm_embedding_chains)
+                (
+                    rec,
+                    rec_coords,
+                    c_alpha_coords,
+                    n_coords,
+                    c_coords,
+                    lm_embeddings,
+                ) = extract_receptor_structure(
+                    copy.deepcopy(rec_model),
+                    lig,
+                    lm_embedding_chains=lm_embedding_chains,
+                )
+                if lm_embeddings is not None and len(c_alpha_coords) != len(
+                    lm_embeddings
+                ):
+                    print(
+                        f"LM embeddings for complex {name} did not have the right length for the protein. Skipping {name}."
+                    )
                     continue
+                get_rec_graph(
+                    rec,
+                    rec_coords,
+                    c_alpha_coords,
+                    n_coords,
+                    c_coords,
+                    complex_graph,
+                    rec_radius=self.receptor_radius,
+                    c_alpha_max_neighbors=self.c_alpha_max_neighbors,
+                    all_atoms=self.all_atoms,
+                    atom_radius=self.atom_radius,
+                    atom_max_neighbors=self.atom_max_neighbors,
+                    remove_hs=self.remove_hs,
+                    lm_embeddings=lm_embeddings,
+                )
             except Exception as e:
+                print(f"Skipping {name} because of the error:")
                 print(e)
                 raise e
                 continue
+            protein_center = torch.mean(
+                complex_graph["receptor"].pos, dim=0, keepdim=True
+            )
+            complex_graph["receptor"].pos -= protein_center
             if self.all_atoms:
+                complex_graph["atom"].pos -= protein_center
             if (not self.matching) or self.num_conformers == 1:
+                complex_graph["ligand"].pos -= protein_center
             else:
+                for p in complex_graph["ligand"].pos:
                     p -= protein_center
             complex_graph.original_center = protein_center
     statistics = ([], [], [], [])
     for complex_graph in complex_graphs:
+        lig_pos = (
+            complex_graph["ligand"].pos
+            if torch.is_tensor(complex_graph["ligand"].pos)
+            else complex_graph["ligand"].pos[0]
+        )
+        radius_protein = torch.max(
+            torch.linalg.vector_norm(complex_graph["receptor"].pos, dim=1)
+        )
         molecule_center = torch.mean(lig_pos, dim=0)
         radius_molecule = torch.max(
+            torch.linalg.vector_norm(lig_pos - molecule_center.unsqueeze(0), dim=1)
+        )
         distance_center = torch.linalg.vector_norm(molecule_center)
         statistics[0].append(radius_protein)
         statistics[1].append(radius_molecule)
         else:
             statistics[3].append(0)
+    name = [
+        "radius protein",
+        "radius molecule",
+        "distance protein-mol",
+        "rmsd matching",
+    ]
+    print("Number of complexes: ", len(complex_graphs))
     for i in range(4):
         array = np.asarray(statistics[i])
+        print(
+            f"{name[i]}: mean {np.mean(array)}, std {np.std(array)}, max {np.max(array)}"
+        )
 def construct_loader(args, t_to_sigma):
     """Build the training and validation loaders configured by ``args``.

     A ``NoiseTransform`` created from ``t_to_sigma`` is attached to two
     ``PDBBind`` datasets (``args.split_train`` and ``args.split_val``), both
     with ``keep_original=True``; only the training dataset receives
     ``args.num_conformers``. ``DataListLoader`` is used when CUDA is
     available, ``DataLoader`` otherwise, and both loaders shuffle.

     Returns:
         The tuple ``(train_loader, val_loader)``.
     """
     noise_transform = NoiseTransform(
         t_to_sigma=t_to_sigma, no_torsion=args.no_torsion, all_atom=args.all_atoms
     )
     # Options shared by the training and validation datasets.
     common_args = dict(
         transform=noise_transform,
         root=args.data_dir,
         limit_complexes=args.limit_complexes,
         receptor_radius=args.receptor_radius,
         c_alpha_max_neighbors=args.c_alpha_max_neighbors,
         remove_hs=args.remove_hs,
         max_lig_size=args.max_lig_size,
         matching=not args.no_torsion,
         popsize=args.matching_popsize,
         maxiter=args.matching_maxiter,
         num_workers=args.num_workers,
         all_atoms=args.all_atoms,
         atom_radius=args.atom_radius,
         atom_max_neighbors=args.atom_max_neighbors,
         esm_embeddings_path=args.esm_embeddings_path,
     )
     train_dataset = PDBBind(
         cache_path=args.cache_path,
         split_path=args.split_train,
         keep_original=True,
         num_conformers=args.num_conformers,
         **common_args,
     )
     val_dataset = PDBBind(
         cache_path=args.cache_path,
         split_path=args.split_val,
         keep_original=True,
         **common_args,
     )
     if torch.cuda.is_available():
         loader_class = DataListLoader
     else:
         loader_class = DataLoader

     def make_loader(dataset):
         # Both loaders use identical batching options and shuffle their data.
         return loader_class(
             dataset=dataset,
             batch_size=args.batch_size,
             num_workers=args.num_dataloader_workers,
             shuffle=True,
             pin_memory=args.pin_memory,
         )

     train_loader = make_loader(train_dataset)
     val_loader = make_loader(val_dataset)
     return train_loader, val_loader
 def read_mol(pdbbind_dir, name, remove_hs=False):
     """Read the ligand ``<name>_ligand`` stored under ``pdbbind_dir/name``.

     The ``.sdf`` file is tried first; the ``.mol2`` file is read only when
     ``read_molecule`` returned ``None`` for the ``.sdf``. Both reads use
     ``sanitize=True`` and forward ``remove_hs``.

     Returns:
         Whatever ``read_molecule`` returned for the last file tried, which is
         ``None`` when both reads returned ``None``.
     """
     lig = None
     # Order matters: .mol2 is only the fallback for an unusable .sdf.
     for extension in ("sdf", "mol2"):
         ligand_path = os.path.join(pdbbind_dir, name, f"{name}_ligand.{extension}")
         lig = read_molecule(ligand_path, remove_hs=remove_hs, sanitize=True)
         if lig is not None:
             break
     return lig
 def read_mols(pdbbind_dir, name, remove_hs=False):
     """Read every usable ligand found in the directory ``pdbbind_dir/name``.

     Each ``.sdf`` file whose name does not contain ``"rdkit"`` is read with
     ``read_molecule`` (``sanitize=True``). When that returns ``None`` and a
     ``.mol2`` file with the same stem exists, the ``.mol2`` file is read
     instead. Entries that still yield ``None`` are left out.

     Returns:
         List of the molecules read, in the order ``os.listdir`` produced the
         files.
     """
     complex_dir = os.path.join(pdbbind_dir, name)
     ligs = []
     for file in os.listdir(complex_dir):
         if not file.endswith(".sdf") or "rdkit" in file:
             continue
         lig = read_molecule(
             os.path.join(complex_dir, file), remove_hs=remove_hs, sanitize=True
         )
         if lig is None:
             # Fall back to the sibling .mol2 file when the .sdf could not be used.
             mol2_path = os.path.join(complex_dir, file[:-4] + ".mol2")
             if os.path.exists(mol2_path):
                 print(
                     "Using the .sdf file failed. We found a .mol2 file instead and are trying to use that."
                 )
                 lig = read_molecule(mol2_path, remove_hs=remove_hs, sanitize=True)
         if lig is not None:
             ligs.append(lig)
     return ligs

datasets/process_mols.py CHANGED Viewed

@@ -490,8 +490,10 @@ def read_molecule(molecule_file, sanitize=False, calc_charges=False, remove_hs=F
     if molecule_file.endswith('.mol2'):
         mol = Chem.MolFromMol2File(molecule_file, sanitize=False, removeHs=False)
     elif molecule_file.endswith('.sdf'):
         supplier = Chem.SDMolSupplier(molecule_file, sanitize=False, removeHs=False)
         mol = supplier[0]
     elif molecule_file.endswith('.pdbqt'):
         with open(molecule_file) as file:
             pdbqt_data = file.readlines()
@@ -505,6 +507,8 @@ def read_molecule(molecule_file, sanitize=False, calc_charges=False, remove_hs=F
         return ValueError('Expect the format of the molecule_file to be '
                           'one of .mol2, .sdf, .pdbqt and .pdb, got {}'.format(molecule_file))
     try:
         if sanitize or calc_charges:
             Chem.SanitizeMol(mol)
@@ -518,7 +522,8 @@ def read_molecule(molecule_file, sanitize=False, calc_charges=False, remove_hs=F
         if remove_hs:
             mol = Chem.RemoveHs(mol, sanitize=sanitize)
-    except:
         return None
     return mol

     if molecule_file.endswith('.mol2'):
         mol = Chem.MolFromMol2File(molecule_file, sanitize=False, removeHs=False)
     elif molecule_file.endswith('.sdf'):
+        print(molecule_file)
         supplier = Chem.SDMolSupplier(molecule_file, sanitize=False, removeHs=False)
         mol = supplier[0]
+        print(mol)
     elif molecule_file.endswith('.pdbqt'):
         with open(molecule_file) as file:
             pdbqt_data = file.readlines()
         return ValueError('Expect the format of the molecule_file to be '
                           'one of .mol2, .sdf, .pdbqt and .pdb, got {}'.format(molecule_file))
+    print(sanitize, calc_charges, remove_hs)
     try:
         if sanitize or calc_charges:
             Chem.SanitizeMol(mol)
         if remove_hs:
             mol = Chem.RemoveHs(mol, sanitize=sanitize)
+    except Exception as e:
+        print(e)
         return None
     return mol

examples/1a46_ligand.sdf ADDED Viewed

	@@ -0,0 +1,179 @@

+1a46_ligand
+  -I-interpret-
+ 85 88  0  0  0  0  0  0  0  0999 V2000
+   17.8330  -13.0420   21.6620 C   0  0  0  0  0
+   18.8870  -13.0710   20.5870 C   0  0  0  0  0
+   19.8510  -14.2200   21.1170 C   0  0  0  0  0
+   19.3270  -16.4440   22.1560 C   0  0  0  0  0
+   18.1340  -17.2300   22.7620 C   0  0  0  0  0
+   17.2230  -16.3290   23.5970 C   0  0  0  0  0
+   17.0320  -14.9230   23.0460 C   0  0  0  0  0
+   18.8520  -15.2420   21.4440 N   0  3  0  0  0
+   17.7750  -14.5090   22.0480 N   0  0  0  0  0
+   15.9850  -14.2900   23.3800 O   0  0  0  0  0
+   16.6380  -13.0610   20.7550 C   0  0  0  0  0
+   16.4620  -13.9620   19.8370 O   0  0  0  0  0
+   15.8090  -16.7300   23.6610 N   0  3  0  0  0
+   17.4150  -16.4170   25.1230 C   0  0  0  0  0
+   18.7640  -15.9840   25.5820 C   0  0  0  0  0
+   19.0510  -14.6340   25.7600 C   0  0  0  0  0
+   20.3910  -14.2520   26.0760 C   0  0  0  0  0
+   21.4290  -15.1780   26.2150 C   0  0  0  0  0
+   21.0990  -16.5480   26.0980 C   0  0  0  0  0
+   19.7890  -16.9510   25.7560 C   0  0  0  0  0
+   15.6470  -12.0890   20.7690 N   0  0  0  0  0
+   14.4940  -11.8920   19.9090 C   0  0  0  0  0
+   14.4960  -10.9450   18.7130 C   0  0  0  0  0
+   13.3800  -10.6840   18.0770 O   0  0  0  0  0
+   13.1950  -11.6150   20.6280 C   0  0  0  0  0
+   12.8670  -12.5040   21.7570 C   0  0  0  0  0
+   11.5610  -12.2200   22.4370 C   0  0  0  0  0
+   11.1700  -13.3510   23.3530 C   0  0  0  0  0
+   10.0380  -13.1110   24.2350 N   0  3  0  0  0
+   14.8040  -11.9210   16.4570 N   0  0  0  0  0
+   15.3450  -11.4350   17.5510 C   0  0  0  0  0
+   16.4740  -11.0890   17.7310 O   0  0  0  0  0
+   15.6510  -12.3330   15.3350 C   0  0  0  0  0
+   16.0390  -13.7960   15.2500 C   0  0  0  0  0
+   14.9560  -14.6030   14.5390 C   0  0  0  0  0
+   14.5990  -13.9990   13.1800 C   0  0  0  0  0
+   14.1680  -12.5610   13.3540 C   0  0  0  0  0
+   15.2770  -11.7400   13.9980 C   0  0  0  0  0
+   17.9332  -12.2994   22.4536 H   0  0  0  0  0
+   19.3882  -12.1140   20.4420 H   0  0  0  0  0
+   18.4882  -13.2617   19.5906 H   0  0  0  0  0
+   20.4926  -13.9283   21.9484 H   0  0  0  0  0
+   20.6127  -14.5392   20.4056 H   0  0  0  0  0
+   19.8508  -17.0880   21.4496 H   0  0  0  0  0
+   19.9921  -16.1358   22.9627 H   0  0  0  0  0
+   18.5327  -18.0092   23.4116 H   0  0  0  0  0
+   17.5467  -17.6450   21.9429 H   0  0  0  0  0
+   18.5389  -15.7277   20.6035 H   0  0  0  0  0
+   15.7428  -17.6818   24.0216 H   0  0  0  0  0
+   15.3044  -16.0949   24.2794 H   0  0  0  0  0
+   15.4029  -16.6903   22.7262 H   0  0  0  0  0
+   17.2937  -17.4623   25.4072 H   0  0  0  0  0
+   16.6848  -15.7509   25.5825 H   0  0  0  0  0
+   18.2682  -13.8821   25.6602 H   0  0  0  0  0
+   20.6133  -13.1939   26.2145 H   0  0  0  0  0
+   22.4528  -14.8565   26.4061 H   0  0  0  0  0
+   21.8654  -17.3029   26.2740 H   0  0  0  0  0
+   19.5640  -18.0094   25.6250 H   0  0  0  0  0
+   15.7457  -11.3948   21.5098 H   0  0  0  0  0
+   14.5905  -12.8910   19.4839 H   0  0  0  0  0
+   14.8425  -10.0689   19.2612 H   0  0  0  0  0
+   13.5584  -10.0751   17.3566 H   0  0  0  0  0
+   12.4050  -11.7585   19.8909 H   0  0  0  0  0
+   13.2901  -10.6141   21.0491 H   0  0  0  0  0
+   13.6465  -12.3595   22.5050 H   0  0  0  0  0
+   12.7942  -13.5124   21.3496 H   0  0  0  0  0
+   10.7892  -12.1043   21.6761 H   0  0  0  0  0
+   11.6663  -11.3113   23.0296 H   0  0  0  0  0
+   12.0278  -13.5229   24.0031 H   0  0  0  0  0
+   10.8774  -14.1769   22.7046 H   0  0  0  0  0
+    9.8690  -13.9413   24.8029 H   0  0  0  0  0
+   10.2441  -12.3181   24.8427 H   0  0  0  0  0
+    9.2101  -12.9059   23.6756 H   0  0  0  0  0
+   13.7904  -12.0118   16.3885 H   0  0  0  0  0
+   16.5871  -11.8550   15.6237 H   0  0  0  0  0
+   16.1623  -14.1864   16.2602 H   0  0  0  0  0
+   16.9681  -13.8812   14.6864 H   0  0  0  0  0
+   14.0613  -14.5994   15.1616 H   0  0  0  0  0
+   15.3317  -15.6133   14.3772 H   0  0  0  0  0
+   13.7819  -14.5683   12.7368 H   0  0  0  0  0
+   15.4725  -14.0364   12.5291 H   0  0  0  0  0
+   13.2893  -12.5323   13.9983 H   0  0  0  0  0
+   13.9420  -12.1402   12.3742 H   0  0  0  0  0
+   16.1510  -11.7459   13.3467 H   0  0  0  0  0
+   14.9268  -10.7183   14.1449 H   0  0  0  0  0
+  2  1  1  0  0  0
+  1  9  1  0  0  0
+  1 11  1  0  0  0
+  3  2  1  0  0  0
+  8  3  1  0  0  0
+  4  5  1  0  0  0
+  4  8  1  0  0  0
+  5  6  1  0  0  0
+  6  7  1  0  0  0
+  6 13  1  0  0  0
+  6 14  1  0  0  0
+  7  9  1  0  0  0
+  7 10  2  0  0  0
+  8  9  1  0  0  0
+ 11 12  2  0  0  0
+ 11 21  1  0  0  0
+ 14 15  1  0  0  0
+ 15 16  4  0  0  0
+ 15 20  4  0  0  0
+ 16 17  4  0  0  0
+ 17 18  4  0  0  0
+ 18 19  4  0  0  0
+ 19 20  4  0  0  0
+ 21 22  1  0  0  0
+ 22 23  1  0  0  0
+ 22 25  1  0  0  0
+ 23 24  1  0  0  0
+ 23 31  1  0  0  0
+ 25 26  1  0  0  0
+ 26 27  1  0  0  0
+ 27 28  1  0  0  0
+ 28 29  1  0  0  0
+ 31 30  1  0  0  0
+ 30 33  1  0  0  0
+ 31 32  2  0  0  0
+ 33 34  1  0  0  0
+ 33 38  1  0  0  0
+ 34 35  1  0  0  0
+ 35 36  1  0  0  0
+ 36 37  1  0  0  0
+ 37 38  1  0  0  0
+  1 39  1  0  0  0
+  2 40  1  0  0  0
+  2 41  1  0  0  0
+  3 42  1  0  0  0
+  3 43  1  0  0  0
+  4 44  1  0  0  0
+  4 45  1  0  0  0
+  5 46  1  0  0  0
+  5 47  1  0  0  0
+  8 48  1  0  0  0
+ 13 49  1  0  0  0
+ 13 50  1  0  0  0
+ 13 51  1  0  0  0
+ 14 52  1  0  0  0
+ 14 53  1  0  0  0
+ 16 54  1  0  0  0
+ 17 55  1  0  0  0
+ 18 56  1  0  0  0
+ 19 57  1  0  0  0
+ 20 58  1  0  0  0
+ 21 59  1  0  0  0
+ 22 60  1  0  0  0
+ 23 61  1  0  0  0
+ 24 62  1  0  0  0
+ 25 63  1  0  0  0
+ 25 64  1  0  0  0
+ 26 65  1  0  0  0
+ 26 66  1  0  0  0
+ 27 67  1  0  0  0
+ 27 68  1  0  0  0
+ 28 69  1  0  0  0
+ 28 70  1  0  0  0
+ 29 71  1  0  0  0
+ 29 72  1  0  0  0
+ 29 73  1  0  0  0
+ 30 74  1  0  0  0
+ 33 75  1  0  0  0
+ 34 76  1  0  0  0
+ 34 77  1  0  0  0
+ 35 78  1  0  0  0
+ 35 79  1  0  0  0
+ 36 80  1  0  0  0
+ 36 81  1  0  0  0
+ 37 82  1  0  0  0
+ 37 83  1  0  0  0
+ 38 84  1  0  0  0
+ 38 85  1  0  0  0
+M  END
+$$$$

examples/1a46_protein_processed.pdb ADDED Viewed

The diff for this file is too large to render. See raw diff

examples/1cbr_ligand.sdf ADDED Viewed

	@@ -0,0 +1,119 @@

+1cbr_ligand
+Created by X-TOOL on Fri Nov 18 12:01:53 2016
+ 49 49  0  0  0  0  0  0  0  0999 V2000
+    5.0920    2.4270  -10.7940  C 0  0  0  1  0  4
+    6.0790    1.2390  -10.8790  C 0  0  0  3  0  4
+    7.4570    1.5880  -11.3400  C 0  0  0  3  0  4
+    8.1090    2.6160  -10.4790  C 0  0  0  3  0  4
+    7.1710    3.7700  -10.1040  C 0  0  0  1  0  3
+    5.8090    3.6640  -10.1590  C 0  0  0  1  0  3
+    4.8670    4.7410   -9.7870  C 0  0  0  2  0  3
+    5.0090    5.6850   -8.8490  C 0  0  0  2  0  3
+    4.0490    6.7120   -8.5120  C 0  0  0  1  0  3
+    4.3830    7.6020   -7.5550  C 0  0  0  2  0  3
+    3.5130    8.6700   -7.1050  C 0  0  0  2  0  3
+    3.9620    9.5090   -6.1670  C 0  0  0  2  0  3
+    3.1640   10.5920   -5.6370  C 0  0  0  1  0  3
+    3.7030   11.3990   -4.6890  C 0  0  0  2  0  3
+    3.0710   12.5430   -4.0160  C 0  5  0  1  0  3
+    3.9070    2.0000   -9.9190  C 0  0  0  4  0  4
+    4.5820    2.7980  -12.2130  C 0  0  0  4  0  4
+    7.9800    4.9390   -9.5360  C 0  0  0  4  0  4
+    2.7160    6.8010   -9.2660  C 0  0  0  4  0  4
+    1.7300   10.7780   -6.1620  C 0  0  0  4  0  4
+    2.5240   13.4330   -4.7040  O 0  0  0  1  0  1
+    3.0900   12.6020   -2.7660  O 0  0  0  1  0  1
+    5.6628    0.5003  -11.5797  H 0  0  0  1  0  1
+    6.1586    0.7905   -9.8778  H 0  0  0  1  0  1
+    7.3965    1.9765  -12.3673  H 0  0  0  1  0  1
+    8.0733    0.6769  -11.3282  H 0  0  0  1  0  1
+    8.9730    3.0290  -11.0202  H 0  0  0  1  0  1
+    8.4536    2.1305   -9.5541  H 0  0  0  1  0  1
+    3.9353    4.7700  -10.3501  H 0  0  0  1  0  1
+    5.9398    5.6789   -8.2837  H 0  0  0  1  0  1
+    5.3651    7.5126   -7.0930  H 0  0  0  1  0  1
+    2.5140    8.7864   -7.5226  H 0  0  0  1  0  1
+    4.9725    9.3712   -5.7852  H 0  0  0  1  0  1
+    4.7256   11.1723   -4.3911  H 0  0  0  1  0  1
+    3.1893    2.8302   -9.8432  H 0  0  0  1  0  1
+    4.2693    1.7357   -8.9146  H 0  0  0  1  0  1
+    3.4124    1.1280  -10.3717  H 0  0  0  1  0  1
+    5.4325    3.1041  -12.8399  H 0  0  0  1  0  1
+    3.8636    3.6277  -12.1392  H 0  0  0  1  0  1
+    4.0887    1.9250  -12.6652  H 0  0  0  1  0  1
+    7.2992    5.7611   -9.2702  H 0  0  0  1  0  1
+    8.6994    5.2884  -10.2913  H 0  0  0  1  0  1
+    8.5226    4.6076   -8.6384  H 0  0  0  1  0  1
+    2.6523    5.9808   -9.9962  H 0  0  0  1  0  1
+    2.6558    7.7653   -9.7917  H 0  0  0  1  0  1
+    1.8841    6.7206   -8.5508  H 0  0  0  1  0  1
+    1.5151   10.0113   -6.9209  H 0  0  0  1  0  1
+    1.6308   11.7769   -6.6117  H 0  0  0  1  0  1
+    1.0187   10.6787   -5.3288  H 0  0  0  1  0  1
+  1  2  1  0  0  1
+  1  6  1  0  0  1
+  1 16  1  0  0  2
+  1 17  1  0  0  2
+  2  3  1  0  0  1
+  3  4  1  0  0  1
+  4  5  1  0  0  1
+  5  6  2  0  0  1
+  5 18  1  0  0  2
+  6  7  1  0  0  2
+  7  8  2  0  0  2
+  8  9  1  0  0  2
+  9 10  2  0  0  2
+  9 19  1  0  0  2
+ 10 11  1  0  0  2
+ 11 12  2  0  0  2
+ 12 13  1  0  0  2
+ 13 14  2  0  0  2
+ 13 20  1  0  0  2
+ 14 15  1  0  0  2
+ 15 21  2  0  0  2
+ 15 22  2  0  0  2
+  2 23  1  0  0  2
+  2 24  1  0  0  2
+  3 25  1  0  0  2
+  3 26  1  0  0  2
+  4 27  1  0  0  2
+  4 28  1  0  0  2
+  7 29  1  0  0  2
+  8 30  1  0  0  2
+ 10 31  1  0  0  2
+ 11 32  1  0  0  2
+ 12 33  1  0  0  2
+ 14 34  1  0  0  2
+ 16 35  1  0  0  2
+ 16 36  1  0  0  2
+ 16 37  1  0  0  2
+ 17 38  1  0  0  2
+ 17 39  1  0  0  2
+ 17 40  1  0  0  2
+ 18 41  1  0  0  2
+ 18 42  1  0  0  2
+ 18 43  1  0  0  2
+ 19 44  1  0  0  2
+ 19 45  1  0  0  2
+ 19 46  1  0  0  2
+ 20 47  1  0  0  2
+ 20 48  1  0  0  2
+ 20 49  1  0  0  2
+M  END
+> <MOLECULAR_FORMULA>
+C20H27O2
+> <MOLECULAR_WEIGHT>
+299.2
+> <NUM_HB_ATOMS>
+2
+> <NUM_ROTOR>
+0
+> <XLOGP2>
+3.40
+$$$$

examples/1cbr_protein.pdb ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,29 @@

+biopandas==0.4.1
+biopython==1.79
+e3nn==0.5.0
+jinja2==3.1.2
+joblib==1.2.0
+markupsafe==2.1.1
+mpmath==1.2.1
+networkx==2.8.7
+opt-einsum==3.3.0
+opt-einsum-fx==0.1.4
+packaging==21.3
+pandas==1.5.0
+scikit-learn==1.1.2
+scipy==1.9.1
+spyrmsd==0.5.2
+sympy==1.11.1
+spyrmsd==0.5.2
+sympy==1.11.1
+torch==1.12.1
+numpy==1.23.1
+torchaudio==0.12.1
+torchvision==0.13.1
+rdkit-pypi==2022.3.5
+torch-scatter
+torch-sparse
+torch-cluster
+torch-spline-conv
+torch-geometric
+-f https://data.pyg.org/whl/torch-1.12.0+cu102.html