Spaces:

ThorbenF
/

CryptoBank

Running

App Files Files Community

ThorbenFroehlking committed 30 days ago

Commit

78b2c3a

1 Parent(s): af13564

Update

Browse files

Files changed (2) hide show

app.py +197 -112
model_loader.py +26 -19

app.py CHANGED Viewed

@@ -19,6 +19,11 @@ from torch.utils.data import DataLoader
 import re
 import pandas as pd
 import copy
 import transformers
 from transformers import AutoTokenizer, DataCollatorForTokenClassification
@@ -27,13 +32,26 @@ from datasets import Dataset
 from scipy.special import expit
 # Load model and move to device
-#checkpoint = 'ThorbenF/prot_t5_xl_uniref50'
-#checkpoint = 'ThorbenF/prot_t5_xl_uniref50_cryptic'
-#checkpoint = 'ThorbenF/prot_t5_xl_uniref50_database'
-#checkpoint = 'ThorbenF/prot_t5_xl_uniref50_full'
-#checkpoint = 'ThorbenF/prot_t5_xl_uniref50_0925'
-#checkpoint = 'ThorbenF/prot_t5_xl_uniref50_0925_v2'
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50_full_v2'
 max_length = 1500
 model, tokenizer = load_model(checkpoint, max_length)
@@ -41,21 +59,33 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model.to(device)
 model.eval()
 def normalize_scores(scores):
     min_score = np.min(scores)
     max_score = np.max(scores)
-    return (scores - min_score) / (max_score - min_score) if max_score > min_score else scores
 def read_mol(pdb_path):
     """Read PDB file and return its content as a string"""
     with open(pdb_path, 'r') as f:
         return f.read()
-def fetch_structure(pdb_id: str, output_dir: str = ".") -> str:
     """
     Fetch the structure file for a given PDB ID. Prioritizes CIF files.
     If a structure file already exists locally, it uses that.
     """
     file_path = download_structure(pdb_id, output_dir)
     return file_path
@@ -76,23 +106,29 @@ def download_structure(pdb_id: str, output_dir: str) -> str:
             return file_path
     return None
-def convert_cif_to_pdb(cif_path: str, output_dir: str = ".") -> str:
     """
     Convert a CIF file to PDB format using BioPython and return the PDB file path.
     """
     pdb_path = os.path.join(output_dir, os.path.basename(cif_path).replace('.cif', '.pdb'))
     parser = MMCIFParser(QUIET=True)
     structure = parser.get_structure('protein', cif_path)
     io = PDBIO()
     io.set_structure(structure)
     io.save(pdb_path)
     return pdb_path
 def fetch_pdb(pdb_id):
-    pdb_path = fetch_structure(pdb_id)
     _, ext = os.path.splitext(pdb_path)
     if ext == '.cif':
-        pdb_path = convert_cif_to_pdb(pdb_path)
     return pdb_path
 def create_chain_specific_pdb(input_pdb: str, chain_id: str, residue_scores: list, protein_residues: list) -> str:
@@ -102,7 +138,7 @@ def create_chain_specific_pdb(input_pdb: str, chain_id: str, residue_scores: lis
     parser = PDBParser(QUIET=True)
     structure = parser.get_structure('protein', input_pdb)
-    output_pdb = f"{os.path.splitext(input_pdb)[0]}_{chain_id}_predictions_scores.pdb"
     # Create scores dictionary for easy lookup
     scores_dict = {resi: score for resi, score in residue_scores}
@@ -132,6 +168,9 @@ def create_chain_specific_pdb(input_pdb: str, chain_id: str, residue_scores: lis
     io.set_structure(structure[0])
     io.save(output_pdb, selector)
     return output_pdb
 def generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time, score_type):
@@ -157,7 +196,7 @@ def generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time,
     # Add PyMOL commands for each score bracket
     for bracket, residues in residues_by_bracket.items():
-        if residues:  # Only add commands if there are residues in this bracket
             color = bracket_colors[bracket]
             resi_list = '+'.join(map(str, residues))
             pymol_commands += f"""
@@ -184,9 +223,6 @@ def generate_results_text(pdb_id, segment, residues_by_bracket, protein_residues
     return result_str
 def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
     # Determine if input is a PDB ID or file path
     if pdb_id_or_file.endswith('.pdb'):
@@ -211,13 +247,23 @@ def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
     sequence_id = [res.id[1] for res in protein_residues]
     input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
-        outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
     # Calculate scores and normalize them
-    raw_scores = expit(outputs[:, 1] - outputs[:, 0])
     normalized_scores = normalize_scores(raw_scores)
     # Choose which scores to use based on score_type
     display_scores = normalized_scores if score_type == 'normalized' else raw_scores
@@ -263,13 +309,17 @@ def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
     mol_vis = molecule(pdb_path, residue_scores, segment)
     # Create prediction file
-    prediction_file = f"{pdb_id}_{display_score_type.lower()}_binding_site_residues.txt"
     with open(prediction_file, "w") as f:
         f.write(result_str)
-    scored_pdb_name = f"{pdb_id}_{segment}_{display_score_type.lower()}_predictions_scores.pdb"
     os.rename(scored_pdb, scored_pdb_name)
     return pymol_commands, mol_vis, [prediction_file, scored_pdb_name], raw_residue_scores, norm_residue_scores, pdb_id, segment
 def molecule(input_pdb, residue_scores=None, segment='A'):
@@ -411,6 +461,9 @@ def molecule(input_pdb, residue_scores=None, segment='A'):
     </html>
     """
     # Return the HTML content within an iframe safely encoded for special characters
     return f'<iframe width="100%" height="700" srcdoc="{html_content.replace(chr(34), "&quot;").replace(chr(39), "&#39;")}"></iframe>'
@@ -487,98 +540,114 @@ with gr.Blocks(css="""
     last_pdb_id = gr.State(None)
     def process_interface(mode, pdb_id, pdb_file, chain_id, score_type_val):
-        selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
-        # First get the actual PDB file path
-        if mode == "PDB ID":
-            pdb_path = fetch_pdb(pdb_id)  # Get the actual file path
-            pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_id_result, segment = process_pdb(pdb_path, chain_id, selected_score_type)
-            # Store the actual file path, not just the PDB ID
-            return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id, pdb_id_result
-        elif mode == "Upload File":
-            _, ext = os.path.splitext(pdb_file.name)
-            file_path = os.path.join('./', f"{_}{ext}")
-            if ext == '.cif':
-                pdb_path = convert_cif_to_pdb(file_path)
-            else:
-                pdb_path = file_path
-            pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_id_result, segment = process_pdb(pdb_path, chain_id, selected_score_type)
-            return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id, pdb_id_result
     def update_visualization_and_files(score_type_val, raw_scores, norm_scores, pdb_path, segment, pdb_id):
         if raw_scores is None or norm_scores is None or pdb_path is None or segment is None or pdb_id is None:
             return None, None, None
-        # Choose scores based on radio button selection
-        selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
-        selected_scores = norm_scores if selected_score_type == 'normalized' else raw_scores
-        # Generate visualization with selected scores
-        mol_vis = molecule(pdb_path, selected_scores, segment)
-        # Generate PyMOL commands and downloadable files
-        # Get structure for residue info
-        _, ext = os.path.splitext(pdb_path)
-        parser = MMCIFParser(QUIET=True) if ext == '.cif' else PDBParser(QUIET=True)
-        structure = parser.get_structure('protein', pdb_path)
-        chain = structure[0][segment]
-        protein_residues = [res for res in chain if is_aa(res)]
-        sequence = "".join(seq1(res.resname) for res in protein_residues)
-        # Define score brackets
-        score_brackets = {
-            "0.0-0.2": (0.0, 0.2),
-            "0.2-0.4": (0.2, 0.4),
-            "0.4-0.6": (0.4, 0.6),
-            "0.6-0.8": (0.6, 0.8),
-            "0.8-1.0": (0.8, 1.0)
-        }
-        # Initialize a dictionary to store residues by bracket
-        residues_by_bracket = {bracket: [] for bracket in score_brackets}
-        # Categorize residues into brackets
-        for resi, score in selected_scores:
-            for bracket, (lower, upper) in score_brackets.items():
-                if lower <= score < upper:
-                    residues_by_bracket[bracket].append(resi)
-                    break
-        # Generate timestamp
-        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        # Generate result text and PyMOL commands based on score type
-        display_score_type = "Normalized" if selected_score_type == 'normalized' else "Raw"
-        scores_array = [score for _, score in selected_scores]
-        result_str = generate_results_text(pdb_id, segment, residues_by_bracket, protein_residues, sequence,
-                                           scores_array, current_time, display_score_type)
-        pymol_commands = generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time, display_score_type)
-        # Create chain-specific PDB with scores in B-factor
-        scored_pdb = create_chain_specific_pdb(pdb_path, segment, selected_scores, protein_residues)
-        # Create prediction file
-        prediction_file = f"{pdb_id}_{display_score_type.lower()}_binding_site_residues.txt"
-        with open(prediction_file, "w") as f:
-            f.write(result_str)
-        scored_pdb_name = f"{pdb_id}_{segment}_{display_score_type.lower()}_predictions_scores.pdb"
-        os.rename(scored_pdb, scored_pdb_name)
-        return mol_vis, pymol_commands, [prediction_file, scored_pdb_name]
     def fetch_interface(mode, pdb_id, pdb_file):
         if mode == "PDB ID":
             return fetch_pdb(pdb_id)
         elif mode == "Upload File":
             _, ext = os.path.splitext(pdb_file.name)
-            file_path = os.path.join('./', f"{_}{ext}")
             if ext == '.cif':
-                pdb_path = convert_cif_to_pdb(file_path)
             else:
-                pdb_path= file_path
             return pdb_path
     def toggle_mode(selected_mode):
@@ -586,8 +655,6 @@ with gr.Blocks(css="""
             return gr.update(visible=True), gr.update(visible=False)
         else:
             return gr.update(visible=False), gr.update(visible=True)
     mode.change(
         toggle_mode,
@@ -628,17 +695,35 @@ with gr.Blocks(css="""
     )
     def predict_utils(sequence):
-        input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
-        with torch.no_grad():
-            outputs = model(input_ids).logits.detach().cpu().numpy().squeeze()
-        raw_scores = expit(outputs[:, 1] - outputs[:, 0])
-        normalized_scores = normalize_scores(raw_scores)
-        return {
-        "raw_scores": raw_scores.tolist(),
-        "normalized_scores": normalized_scores.tolist()
-    }
     dummy_input = gr.Textbox(visible=False)
     dummy_output = gr.Textbox(visible=False)
@@ -650,4 +735,4 @@ with gr.Blocks(css="""
         outputs=[dummy_output]
     )
-demo.launch(share=True)

 import re
 import pandas as pd
 import copy
+import gc
+import tempfile
+import shutil
+import atexit
+import weakref
 import transformers
 from transformers import AutoTokenizer, DataCollatorForTokenClassification
 from scipy.special import expit
+# Create a temporary directory for this session
+# (tempfile.mkdtemp creates the directory as soon as this statement runs)
+TEMP_DIR = tempfile.mkdtemp(prefix="protein_binding_")
+print(f"Using temporary directory: {TEMP_DIR}")
+# Registry to track created files for cleanup
+# NOTE(review): nothing in the visible code ever adds to this set, and plain
+# str paths cannot be stored in a WeakSet (str objects do not support weak
+# references) -- confirm whether this registry is still needed.
+_file_registry = weakref.WeakSet()
def cleanup_temp_files():
    """Remove the session temporary directory ``TEMP_DIR`` if it exists.

    Any failure is reported on stdout and swallowed, so the function never
    raises.
    """
    try:
        if not os.path.exists(TEMP_DIR):
            return
        shutil.rmtree(TEMP_DIR)
        print(f"Cleaned up temporary directory: {TEMP_DIR}")
    except Exception as e:
        print(f"Error cleaning up temp directory: {e}")
+# Register cleanup function
+atexit.register(cleanup_temp_files)
 # Load model and move to device
 checkpoint = 'ThorbenF/prot_t5_xl_uniref50_full_v2'
 max_length = 1500
 model, tokenizer = load_model(checkpoint, max_length)
 model.to(device)
 model.eval()
def cleanup_files(*file_paths):
    """Delete each given file, skipping falsy entries and files already gone.

    Args:
        *file_paths: Paths to remove. ``None`` and empty strings are skipped.

    Failures other than a missing file (for example a permission error, or a
    path that is a directory) are reported on stdout and do not stop the
    remaining paths from being processed.
    """
    for path in file_paths:
        if not path:
            continue
        try:
            # Attempt the removal directly rather than testing os.path.exists
            # first: the file may disappear between the check and the delete.
            os.remove(path)
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"Could not remove {path}: {e}")
 def normalize_scores(scores):
     """Min-max scale ``scores`` so the smallest maps to 0 and the largest to 1.

     Args:
         scores: Array-like of numeric scores (a list or a NumPy array).

     Returns:
         A NumPy array. Empty input and input whose values are all equal are
         returned unscaled, because there is no range to scale by.
     """
     # Coerce first so plain lists support the element-wise arithmetic below.
     scores = np.asarray(scores)
     if scores.size == 0:
         # np.min / np.max raise on empty input; there is nothing to scale.
         return scores
     min_score = np.min(scores)
     max_score = np.max(scores)
     if max_score > min_score:
         return (scores - min_score) / (max_score - min_score)
     return scores
 def read_mol(pdb_path):
     """Return the complete text content of the file at ``pdb_path``."""
     with open(pdb_path, 'r') as handle:
         contents = handle.read()
     return contents
def fetch_structure(pdb_id: str, output_dir: str = None) -> str:
    """
    Obtain the structure file for ``pdb_id`` and return its path.

    The work is delegated to ``download_structure``; when ``output_dir`` is
    not given, the session temporary directory ``TEMP_DIR`` is used.
    Whatever ``download_structure`` returns is passed through unchanged.
    """
    target_dir = TEMP_DIR if output_dir is None else output_dir
    return download_structure(pdb_id, target_dir)
             return file_path
     return None
def convert_cif_to_pdb(cif_path: str, output_dir: str = None) -> str:
    """
    Convert a CIF file to PDB format using BioPython and return the PDB file path.

    Args:
        cif_path: Path of the mmCIF file to convert. The file is deleted once
            the PDB file has been written.
        output_dir: Directory for the PDB file; defaults to ``TEMP_DIR``.

    Returns:
        Path of the written PDB file: the input's base name with its final
        extension replaced by ``.pdb``, inside ``output_dir``.
    """
    if output_dir is None:
        output_dir = TEMP_DIR
    # Swap only the final extension. str.replace('.cif', '.pdb') would also
    # rewrite '.cif' occurring earlier in the name, and would leave a name
    # without a lowercase '.cif' unchanged (output path == input path).
    stem = os.path.splitext(os.path.basename(cif_path))[0]
    pdb_path = os.path.join(output_dir, f"{stem}.pdb")
    parser = MMCIFParser(QUIET=True)
    structure = parser.get_structure('protein', cif_path)
    io = PDBIO()
    io.set_structure(structure)
    io.save(pdb_path)
    # Clean up CIF file after conversion -- but never delete the file that
    # was just written if input and output resolve to the same path.
    if os.path.abspath(pdb_path) != os.path.abspath(cif_path):
        cleanup_files(cif_path)
    return pdb_path
 def fetch_pdb(pdb_id):
     """Fetch the structure for ``pdb_id`` into ``TEMP_DIR`` and return a PDB path.

     A fetched file with a ``.cif`` extension is converted to PDB format first.

     Raises:
         FileNotFoundError: If no structure file could be obtained.
     """
     pdb_path = fetch_structure(pdb_id, TEMP_DIR)
     if pdb_path is None:
         # The download helper signals failure by returning None; fail here
         # with a clear message instead of a TypeError from os.path.splitext.
         raise FileNotFoundError(f"No structure file could be obtained for PDB ID {pdb_id!r}")
     _, ext = os.path.splitext(pdb_path)
     if ext == '.cif':
         pdb_path = convert_cif_to_pdb(pdb_path, TEMP_DIR)
     return pdb_path
 def create_chain_specific_pdb(input_pdb: str, chain_id: str, residue_scores: list, protein_residues: list) -> str:
     parser = PDBParser(QUIET=True)
     structure = parser.get_structure('protein', input_pdb)
+    output_pdb = os.path.join(TEMP_DIR, f"{os.path.splitext(os.path.basename(input_pdb))[0]}_{chain_id}_predictions_scores.pdb")
     # Create scores dictionary for easy lookup
     scores_dict = {resi: score for resi, score in residue_scores}
     io.set_structure(structure[0])
     io.save(output_pdb, selector)
+    # Clear references
+    del structure, io, selector, scores_dict
     return output_pdb
 def generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time, score_type):
     # Add PyMOL commands for each score bracket
     for bracket, residues in residues_by_bracket.items():
+        if residues:
             color = bracket_colors[bracket]
             resi_list = '+'.join(map(str, residues))
             pymol_commands += f"""
     return result_str
 def process_pdb(pdb_id_or_file, segment, score_type='normalized'):
     # Determine if input is a PDB ID or file path
     if pdb_id_or_file.endswith('.pdb'):
     sequence_id = [res.id[1] for res in protein_residues]
     input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
     with torch.no_grad():
+        outputs = model(input_ids).logits
+        outputs_cpu = outputs.detach().cpu().numpy().squeeze()
+        # Explicitly delete GPU tensors
+        del outputs, input_ids
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
     # Calculate scores and normalize them
+    raw_scores = expit(outputs_cpu[:, 1] - outputs_cpu[:, 0])
     normalized_scores = normalize_scores(raw_scores)
+    # Clear outputs_cpu
+    del outputs_cpu
     # Choose which scores to use based on score_type
     display_scores = normalized_scores if score_type == 'normalized' else raw_scores
     mol_vis = molecule(pdb_path, residue_scores, segment)
     # Create prediction file
+    prediction_file = os.path.join(TEMP_DIR, f"{pdb_id}_{display_score_type.lower()}_binding_site_residues.txt")
     with open(prediction_file, "w") as f:
         f.write(result_str)
+    scored_pdb_name = os.path.join(TEMP_DIR, f"{pdb_id}_{segment}_{display_score_type.lower()}_predictions_scores.pdb")
     os.rename(scored_pdb, scored_pdb_name)
+    # Clear large objects from memory
+    del structure, chain, protein_residues, raw_scores, normalized_scores, display_scores
+    gc.collect()
     return pymol_commands, mol_vis, [prediction_file, scored_pdb_name], raw_residue_scores, norm_residue_scores, pdb_id, segment
 def molecule(input_pdb, residue_scores=None, segment='A'):
     </html>
     """
+    # Clear mol from memory after use
+    del mol
     # Return the HTML content within an iframe safely encoded for special characters
     return f'<iframe width="100%" height="700" srcdoc="{html_content.replace(chr(34), "&quot;").replace(chr(39), "&#39;")}"></iframe>'
     last_pdb_id = gr.State(None)
     def process_interface(mode, pdb_id, pdb_file, chain_id, score_type_val):
         """Resolve the input structure, run the prediction and return the UI outputs.

         Args:
             mode: "PDB ID" to fetch by identifier, or "Upload File" to use
                 the uploaded file.
             pdb_id: Identifier used in "PDB ID" mode.
             pdb_file: Uploaded file object (its ``.name`` is the path) used
                 in "Upload File" mode.
             chain_id: Chain to score.
             score_type_val: "Normalized Scores" selects normalized scores;
                 any other value selects raw scores.

         Returns:
             The tuple ``(pymol_cmd, mol_vis, files, raw_scores, norm_scores,
             pdb_path, chain_id, pdb_id_result)``, or ``None`` when ``mode``
             is neither of the two supported values.
         """
         try:
             selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
             # Path resolution (fetch by ID, or copy the upload into TEMP_DIR
             # and convert CIF to PDB) is the same work fetch_interface does,
             # so reuse it instead of repeating it in each branch.
             pdb_path = fetch_interface(mode, pdb_id, pdb_file)
             if pdb_path is None:
                 # Unsupported mode: nothing to process.
                 return None
             pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_id_result, _segment = process_pdb(
                 pdb_path, chain_id, selected_score_type
             )
             # Return the actual file path (not just the ID) so later
             # callbacks can re-read the structure.
             return pymol_cmd, mol_vis, files, raw_scores, norm_scores, pdb_path, chain_id, pdb_id_result
         finally:
             # Force garbage collection after processing
             gc.collect()
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
     def update_visualization_and_files(score_type_val, raw_scores, norm_scores, pdb_path, segment, pdb_id):
         """Rebuild the viewer, PyMOL commands and download files for a score type.

         Args:
             score_type_val: "Normalized Scores" selects ``norm_scores``; any
                 other value selects ``raw_scores``.
             raw_scores: Sequence of ``(residue_id, score)`` pairs.
             norm_scores: Sequence of ``(residue_id, score)`` pairs.
             pdb_path: Path of the structure file (``.cif`` or PDB).
             segment: Chain identifier.
             pdb_id: Identifier used in the output file names.

         Returns:
             ``(mol_vis, pymol_commands, [prediction_file, scored_pdb_name])``,
             or ``(None, None, None)`` when any stored input is missing.
         """
         if raw_scores is None or norm_scores is None or pdb_path is None or segment is None or pdb_id is None:
             return None, None, None
         try:
             # Choose scores based on radio button selection
             selected_score_type = 'normalized' if score_type_val == "Normalized Scores" else 'raw'
             selected_scores = norm_scores if selected_score_type == 'normalized' else raw_scores
             # Generate visualization with selected scores
             mol_vis = molecule(pdb_path, selected_scores, segment)
             # Get structure for residue info
             _, ext = os.path.splitext(pdb_path)
             parser = MMCIFParser(QUIET=True) if ext == '.cif' else PDBParser(QUIET=True)
             structure = parser.get_structure('protein', pdb_path)
             chain = structure[0][segment]
             protein_residues = [res for res in chain if is_aa(res)]
             sequence = "".join(seq1(res.resname) for res in protein_residues)
             # Define score brackets
             score_brackets = {
                 "0.0-0.2": (0.0, 0.2),
                 "0.2-0.4": (0.2, 0.4),
                 "0.4-0.6": (0.4, 0.6),
                 "0.6-0.8": (0.6, 0.8),
                 "0.8-1.0": (0.8, 1.0)
             }
             top_bracket = "0.8-1.0"
             # Initialize a dictionary to store residues by bracket
             residues_by_bracket = {bracket: [] for bracket in score_brackets}
             # Categorize residues into brackets. Brackets are half-open
             # [lower, upper), except the top one, which also includes its
             # upper bound: min-max normalization gives the best residue a
             # score of exactly 1.0, and it must not be dropped.
             for resi, score in selected_scores:
                 for bracket, (lower, upper) in score_brackets.items():
                     if lower <= score < upper or (bracket == top_bracket and score == upper):
                         residues_by_bracket[bracket].append(resi)
                         break
             # Generate timestamp
             current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
             # Generate result text and PyMOL commands based on score type
             display_score_type = "Normalized" if selected_score_type == 'normalized' else "Raw"
             scores_array = [score for _, score in selected_scores]
             result_str = generate_results_text(pdb_id, segment, residues_by_bracket, protein_residues, sequence,
                                                scores_array, current_time, display_score_type)
             pymol_commands = generate_pymol_commands(pdb_id, segment, residues_by_bracket, current_time, display_score_type)
             # Create chain-specific PDB from the selected scores
             scored_pdb = create_chain_specific_pdb(pdb_path, segment, selected_scores, protein_residues)
             # Create prediction file
             prediction_file = os.path.join(TEMP_DIR, f"{pdb_id}_{display_score_type.lower()}_binding_site_residues.txt")
             with open(prediction_file, "w") as f:
                 f.write(result_str)
             scored_pdb_name = os.path.join(TEMP_DIR, f"{pdb_id}_{segment}_{display_score_type.lower()}_predictions_scores.pdb")
             # os.replace overwrites a file left by an earlier run on every
             # platform; os.rename fails on Windows when the target exists.
             os.replace(scored_pdb, scored_pdb_name)
             # Clear memory
             del structure, chain, protein_residues, scores_array
             return mol_vis, pymol_commands, [prediction_file, scored_pdb_name]
         finally:
             gc.collect()
     def fetch_interface(mode, pdb_id, pdb_file):
         """Return a local PDB path for the selected input mode.

         "PDB ID" fetches the structure via ``fetch_pdb``. "Upload File"
         copies the uploaded file into ``TEMP_DIR`` and converts it to PDB
         when its extension is ``.cif``. Any other mode yields ``None``.
         """
         if mode == "PDB ID":
             return fetch_pdb(pdb_id)
         if mode != "Upload File":
             return None
         uploaded = pdb_file.name
         extension = os.path.splitext(uploaded)[1]
         local_copy = os.path.join(TEMP_DIR, f"{os.path.basename(uploaded)}")
         shutil.copy(uploaded, local_copy)
         if extension == '.cif':
             return convert_cif_to_pdb(local_copy, TEMP_DIR)
         return local_copy
     def toggle_mode(selected_mode):
             return gr.update(visible=True), gr.update(visible=False)
         else:
             return gr.update(visible=False), gr.update(visible=True)
     mode.change(
         toggle_mode,
     )
     def predict_utils(sequence):
         """Score ``sequence`` with the loaded model and return both score lists.

         Args:
             sequence: Amino-acid sequence; its characters are joined with
                 spaces before tokenization.

         Returns:
             dict with keys ``"raw_scores"`` and ``"normalized_scores"``,
             each a list of floats. Raw scores are the logistic sigmoid of
             (logit column 1 - logit column 0).
         """
         try:
             input_ids = tokenizer(" ".join(sequence), return_tensors="pt").input_ids.to(device)
             with torch.no_grad():
                 outputs = model(input_ids).logits
                 # assumes logits are (1, seq_len, 2), so squeeze() yields
                 # (seq_len, 2) -- TODO confirm for very short inputs
                 outputs_cpu = outputs.detach().cpu().numpy().squeeze()
                 # Drop the device tensors before the cache is emptied in the
                 # finally block, so their memory can actually be released.
                 del outputs, input_ids
             raw_scores = expit(outputs_cpu[:, 1] - outputs_cpu[:, 0])
             normalized_scores = normalize_scores(raw_scores)
             return {
                 "raw_scores": raw_scores.tolist(),
                 "normalized_scores": normalized_scores.tolist()
             }
         finally:
             # Runs on success and on failure alike; an exception propagates
             # unchanged with its original traceback.
             gc.collect()
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
     dummy_input = gr.Textbox(visible=False)
     dummy_output = gr.Textbox(visible=False)
         outputs=[dummy_output]
     )
+demo.launch(share=True)

model_loader.py CHANGED Viewed

@@ -11,6 +11,7 @@ import numpy as np
 import os
 import pandas as pd
 import copy
 import transformers, datasets
 from transformers.modeling_outputs import TokenClassifierOutput
@@ -279,27 +280,25 @@ def load_T5_model_classification(checkpoint, num_labels, half_precision, full =
     # Load model and tokenizer
     if "ankh" in checkpoint :
-        model = T5EncoderModel.from_pretrained(checkpoint,resume_download=True)
-        tokenizer = AutoTokenizer.from_pretrained(checkpoint,resume_download=True)
     elif "prot_t5" in checkpoint:
         # possible to load the half precision model (thanks to @pawel-rezo for pointing that out)
         if half_precision and deepspeed:
-            #tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
-            #model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc", torch_dtype=torch.float16)#.to(torch.device('cuda')
-            tokenizer = T5Tokenizer.from_pretrained(checkpoint, do_lower_case=False,resume_download=True)
-            model = T5EncoderModel.from_pretrained(checkpoint, torch_dtype=torch.float16).to(torch.device('cuda'),resume_download=True)
         else:
-            model = T5EncoderModel.from_pretrained(checkpoint)
-            tokenizer = T5Tokenizer.from_pretrained(checkpoint)
     elif "ProstT5" in checkpoint:
         if half_precision and deepspeed:
-            tokenizer = T5Tokenizer.from_pretrained(checkpoint, do_lower_case=False,resume_download=True)
-            model = T5EncoderModel.from_pretrained(checkpoint, torch_dtype=torch.float16).to(torch.device('cuda'),resume_download=True)
         else:
-            model = T5EncoderModel.from_pretrained(checkpoint,resume_download=True)
-            tokenizer = T5Tokenizer.from_pretrained(checkpoint,resume_download=True)
     # Create new Classifier model with PT5 dimensions
     class_config=ClassConfig(num_labels=num_labels)
@@ -309,8 +308,13 @@ def load_T5_model_classification(checkpoint, num_labels, half_precision, full =
     class_model.shared=model.shared
     class_model.encoder=model.encoder
-    # Delete the checkpoint model
-    model=class_model
     del class_model
     if full == True:
@@ -613,9 +617,7 @@ def load_esm_model_classification(checkpoint, num_labels, half_precision, full=F
     return model, tokenizer
-def load_model(checkpoint,max_length):
-    #checkpoint='ThorbenF/prot_t5_xl_uniref50'
-    #best_model_path='ThorbenF/prot_t5_xl_uniref50/cpt.pth'
     full=False
     deepspeed=False
     mixed=False
@@ -629,12 +631,17 @@ def load_model(checkpoint,max_length):
     else:
         model, tokenizer = load_T5_model_classification(checkpoint, num_labels, mixed, full, deepspeed)
     # Download the file
     local_file = hf_hub_download(repo_id=checkpoint, filename="cpt.pth")
-    # Load the best model state
     state_dict = torch.load(local_file, map_location=torch.device('cpu'), weights_only=True)
     model.load_state_dict(state_dict)
     return model, tokenizer

 import os
 import pandas as pd
 import copy
+import gc
 import transformers, datasets
 from transformers.modeling_outputs import TokenClassifierOutput
     # Load model and tokenizer
     if "ankh" in checkpoint :
+        model = T5EncoderModel.from_pretrained(checkpoint, resume_download=True)
+        tokenizer = AutoTokenizer.from_pretrained(checkpoint, resume_download=True)
     elif "prot_t5" in checkpoint:
         # possible to load the half precision model (thanks to @pawel-rezo for pointing that out)
         if half_precision and deepspeed:
+            tokenizer = T5Tokenizer.from_pretrained(checkpoint, do_lower_case=False, resume_download=True)
+            model = T5EncoderModel.from_pretrained(checkpoint, torch_dtype=torch.float16, resume_download=True).to(torch.device('cuda'))
         else:
+            model = T5EncoderModel.from_pretrained(checkpoint, resume_download=True)
+            tokenizer = T5Tokenizer.from_pretrained(checkpoint, resume_download=True)
     elif "ProstT5" in checkpoint:
         if half_precision and deepspeed:
+            tokenizer = T5Tokenizer.from_pretrained(checkpoint, do_lower_case=False, resume_download=True)
+            model = T5EncoderModel.from_pretrained(checkpoint, torch_dtype=torch.float16, resume_download=True).to(torch.device('cuda'))
         else:
+            model = T5EncoderModel.from_pretrained(checkpoint, resume_download=True)
+            tokenizer = T5Tokenizer.from_pretrained(checkpoint, resume_download=True)
     # Create new Classifier model with PT5 dimensions
     class_config=ClassConfig(num_labels=num_labels)
     class_model.shared=model.shared
     class_model.encoder=model.encoder
+    # Delete the checkpoint model and clear memory
+    del model
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    model = class_model
     del class_model
     if full == True:
     return model, tokenizer
+def load_model(checkpoint, max_length):
     full=False
     deepspeed=False
     mixed=False
     else:
         model, tokenizer = load_T5_model_classification(checkpoint, num_labels, mixed, full, deepspeed)
     # Download the file
     local_file = hf_hub_download(repo_id=checkpoint, filename="cpt.pth")
+    # Load the best model state with memory mapping for efficiency
     state_dict = torch.load(local_file, map_location=torch.device('cpu'), weights_only=True)
     model.load_state_dict(state_dict)
+    # Clear state_dict from memory immediately after loading
+    del state_dict
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
     return model, tokenizer