"""Gradio app that folds protein sequences with ESMFold and renders them in 3D.

Two tabs are exposed: one folds a single pasted sequence, the other folds
every record of an uploaded FASTA file. Predicted structures are written as
PDB files and displayed through an embedded 3Dmol.js viewer.
"""

import json
import re
import tempfile
from html import escape
from pathlib import Path
from typing import Dict, List, Tuple

import gradio as gr
import spaces
import torch
from Bio import SeqIO
from gradio_molecule3d import Molecule3D
from transformers import AutoTokenizer, EsmForProteinFolding
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37
from transformers.models.esm.openfold_utils.protein import Protein as OFProtein
from transformers.models.esm.openfold_utils.protein import to_pdb

# Default representation handed to the Molecule3D output component.
reps = [
    {
        "model": 0,
        "chain": "",
        "resname": "",
        "style": "stick",
        "color": "whiteCarbon",
        "residue_range": "",
        "around": 0,
        "byres": False,
        "visible": False,
    }
]

# Characters outside this set are replaced when a FASTA id becomes a filename.
_UNSAFE_FILENAME_CHARS = re.compile(r"[^A-Za-z0-9._-]+")

# Stand-alone HTML page hosting the 3Dmol.js viewer. The PDB text is inserted
# between the two halves as a JavaScript string literal.
_VIEWER_HEAD = """<!DOCTYPE html>
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8" />
<style>
body{
    font-family:sans-serif
}
.mol-container {
    width: 100%;
    height: 600px;
    position: relative;
}
.mol-container select{
    background-image:None;
}
</style>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.3/jquery.min.js" integrity="sha512-STof4xm1wgkfm7heWqFJVn58Hm3EtS31XFaagaa8VMReCXAkQnJZ+jEy8PCC/iT18dFy95WcExNHFTqLyp72eQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<script src="https://3Dmol.csb.pitt.edu/build/3Dmol-min.js"></script>
</head>
<body>
<div id="container" class="mol-container"></div>
<script>
let pdb = """

_VIEWER_TAIL = """;
$(document).ready(function () {
    let element = $("#container");
    let config = { backgroundColor: "white" };
    let viewer = $3Dmol.createViewer(element, config);
    viewer.addModel(pdb, "pdb");
    viewer.getModel(0).setStyle({}, { cartoon: { colorscheme:"whiteCarbon" } });
    viewer.zoomTo();
    viewer.render();
    viewer.zoom(0.8, 2000);
});
</script>
</body></html>"""

_IFRAME_OPEN = (
    '<iframe style="width: 100%; height: 600px" name="result" '
    'allow="midi; geolocation; microphone; camera; display-capture; encrypted-media;" '
    'sandbox="allow-modals allow-forms allow-scripts allow-same-origin allow-popups '
    'allow-top-navigation-by-user-activation allow-downloads" '
    'allowfullscreen="" allowpaymentrequest="" frameborder="0" '
)


def read_mol(molpath: str) -> str:
    """Return the full text content of the file at *molpath*."""
    with open(molpath, "r", encoding="utf-8") as fp:
        return fp.read()


def _render_pdb_html(pdb_text: str) -> str:
    """Build the ``<iframe>`` markup that shows *pdb_text* in a 3Dmol.js viewer."""
    # json.dumps yields a valid JavaScript string literal, so backticks, "${"
    # or quotes in the PDB text cannot break out of it. "</" is additionally
    # escaped so the text can never close the surrounding <script> element.
    pdb_literal = json.dumps(pdb_text).replace("</", "<\\/")
    document = _VIEWER_HEAD + pdb_literal + _VIEWER_TAIL
    # The page travels inside an HTML attribute, so it must be entity-escaped;
    # the browser decodes it again before parsing the srcdoc document.
    return f"{_IFRAME_OPEN}srcdoc='{escape(document, quote=True)}'></iframe>"


def molecule(input_pdb: str) -> str:
    """Return iframe HTML that displays the PDB file at path *input_pdb*."""
    return _render_pdb_html(read_mol(input_pdb))


def convert_outputs_to_pdb(outputs) -> List[str]:
    """Convert a model output mapping into a list of PDB-format strings.

    One string is produced per entry along the first axis of
    ``outputs["aatype"]``. Residue indices are shifted by one and
    ``outputs["plddt"]`` is stored as the B-factor column.

    Args:
        outputs: Mapping of tensors returned by the folding model; every value
            must support ``.to("cpu").numpy()``.

    Returns:
        The PDB text of each predicted structure.
    """
    # Must run before the tensors are converted to NumPy arrays below.
    # NOTE(review): positions[-1] is presumably the final refinement step — confirm.
    atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    outputs = {key: value.to("cpu").numpy() for key, value in outputs.items()}
    atom_positions = atom_positions.cpu().numpy()
    atom_mask = outputs["atom37_atom_exists"]
    has_chain_index = "chain_index" in outputs

    pdbs = []
    for i in range(outputs["aatype"].shape[0]):
        protein = OFProtein(
            aatype=outputs["aatype"][i],
            atom_positions=atom_positions[i],
            atom_mask=atom_mask[i],
            residue_index=outputs["residue_index"][i] + 1,
            b_factors=outputs["plddt"][i],
            chain_index=outputs["chain_index"][i] if has_chain_index else None,
        )
        pdbs.append(to_pdb(protein))
    return pdbs


# Model setup happens once at import time so every request reuses the weights.
tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=True)
model = model.cuda()
# Run the ESM submodule in float16.
model.esm = model.esm.half()
torch.backends.cuda.matmul.allow_tf32 = True
# NOTE(review): chunking presumably lowers peak memory in the trunk — confirm.
model.trunk.set_chunk_size(64)


def _predict_pdb_text(test_protein: str) -> str:
    """Fold *test_protein* and return the prediction as PDB text.

    Whitespace (including line breaks from pasted input) is removed first.

    Raises:
        gr.Error: If no sequence characters remain after stripping whitespace.
    """
    sequence = "".join((test_protein or "").split())
    if not sequence:
        raise gr.Error("Please provide a non-empty protein sequence.")
    input_ids = tokenizer(
        [sequence], return_tensors="pt", add_special_tokens=False
    )["input_ids"].cuda()
    with torch.no_grad():
        output = model(input_ids)
    return "".join(convert_outputs_to_pdb(output))


@spaces.GPU(duration=120)
def fold_protein(test_protein: str) -> Tuple[str, str]:
    """Fold one sequence and return ``(viewer_html, pdb_file_path)``.

    The PDB file is named ``output_structure.pdb`` and placed in a fresh
    temporary directory so concurrent requests cannot overwrite each other.
    """
    pdb_text = _predict_pdb_text(test_protein)
    pdb_path = Path(tempfile.mkdtemp(prefix="proteinviz_")) / "output_structure.pdb"
    pdb_path.write_text(pdb_text, encoding="utf-8")
    return _render_pdb_html(pdb_text), str(pdb_path)


@spaces.GPU(duration=180)
def fold_protein_wpdb(test_protein: str, pdb_path: str) -> str:
    """Fold one sequence, write the PDB text to *pdb_path*, return viewer HTML."""
    pdb_text = _predict_pdb_text(test_protein)
    with open(pdb_path, "w", encoding="utf-8") as f:
        f.write(pdb_text)
    # Render from memory rather than re-reading the file that was just written.
    return _render_pdb_html(pdb_text)


def load_protein_sequences(fasta_file) -> Dict[str, str]:
    """Parse *fasta_file* and return a mapping of record id to sequence string.

    Records sharing an id overwrite earlier ones.
    """
    return {record.id: str(record.seq) for record in SeqIO.parse(fasta_file, "fasta")}


def _pdb_filename(record_id: str) -> str:
    """Derive a safe ``.pdb`` file name from a FASTA record id."""
    stem = record_id.replace(" ", "_").replace(",", "")
    # Drop path separators and other special characters so the file always
    # lands in the working directory.
    stem = _UNSAFE_FILENAME_CHARS.sub("_", stem).strip(".")
    return f"{stem or 'sequence'}.pdb"


def show_split(inputfile) -> str:
    """Fold every record of a FASTA file and return the stacked viewer HTML.

    Raises:
        gr.Error: If no file was provided.
    """
    if inputfile is None:
        raise gr.Error("Please upload a FASTA file.")
    sections = []
    for seq_id, sequence in load_protein_sequences(inputfile).items():
        viewer = fold_protein_wpdb(sequence, _pdb_filename(seq_id))
        # The id comes from the uploaded file, so escape it before display.
        sections.append(f"<h3>>{escape(seq_id)}</h3> <br> " + viewer)
    return "<div>\n" + "\n<br>\n".join(sections) + "\n</div>"


iface = gr.Interface(
    title="SingleProteinviz",
    fn=fold_protein,
    inputs=gr.Textbox(
        label="Protein Sequence",
        info="Find sequences examples below, and complete examples with images at: https://github.com/AstraBert/proteinviz/tree/main/examples.md; if you input a sequence, you're gonna get the static image and the 3D model to explore and play with",
        lines=5,
        value="Paste or write amino-acidic sequence here",
    ),
    outputs=[
        gr.HTML(label="Protein 3D model"),
        Molecule3D(label="Molecular 3D model", reps=reps),
    ],
    examples=[
        "MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH",
        "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDLAARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPGCMSCKCVLS",
        "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG",
    ],
)

demo1 = gr.Interface(
    title="BulkProteinviz",
    fn=show_split,
    inputs=gr.File(
        label="FASTA File With Protein Sequences",
    ),
    outputs=gr.HTML(label="Protein 3D models"),
    examples=["proteins.fasta"],
)

demo = gr.TabbedInterface(
    [iface, demo1],
    ["Single Protein Structure Prediction", "Bulk Protein Structure Prediction"],
    title="""<h1 align='center'>Proteinviz</h1> <br> <h2 align='center'>Support this space with a ⭐ on <a href='https://github.com/AstraBert/proteinviz'>GitHub</a></h2>""",
)

demo.launch()