% --- Repository metadata accidentally pasted above the preamble; commented out so the file compiles ---
% Recommend_system / project.tex
% tong
% revise streamlit path
% 806895c
\documentclass[a4paper,11pt]{article}
% --- Packages ---
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc} % proper hyphenation of accented words and copyable PDF text
\usepackage{geometry}
\geometry{left=2.5cm, right=2.5cm, top=2.5cm, bottom=2.5cm}
\usepackage{graphicx}
\usepackage{amsmath, amssymb}
\usepackage{booktabs} % For professional tables
\usepackage{float}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{cite}
\usepackage[hidelinks]{hyperref} % load last (after cite) to avoid package conflicts; hidelinks suppresses colored link boxes
% --- Code Listing Style ---
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}
\lstdefinestyle{mystyle}{
backgroundcolor=\color{backcolour},
commentstyle=\color{codegreen},
keywordstyle=\color{magenta},
numberstyle=\tiny\color{codegray},
stringstyle=\color{codepurple},
basicstyle=\ttfamily\footnotesize,
breakatwhitespace=false,
breaklines=true,
captionpos=b,
keepspaces=true,
numbers=left,
numbersep=5pt,
showspaces=false,
showstringspaces=false,
showtabs=false,
tabsize=2
}
\lstset{style=mystyle}
% --- Title Information ---
\title{\textbf{Bidirectional Protein-Molecule Retrieval via Dual-Tower Architecture with SaProt and ChemBERTa}}
\author{
\textbf{Team Name / ID} \\
Member 1 Name (Contribution: e.g., Model Training) \\
Member 2 Name (Contribution: e.g., Streamlit UI) \\
Member 3 Name (Contribution: e.g., Data Processing) \\
\textit{Course: AI for Life Science - Final Project}
}
\date{\today}
\begin{document}
\maketitle
\begin{abstract}
Efficient retrieval of potential drug candidates for specific protein targets (and vice versa) is a critical task in early-stage drug discovery. In this project, we present a \textbf{Dual-Tower} deep learning framework capable of bidirectional retrieval between protein sequences and molecular SMILES. We leverage \textbf{SaProt}, a structure-aware protein language model, and \textbf{ChemBERTa}, a transformer-based molecular encoder, to map both modalities into a shared latent space. The model is trained using contrastive learning to align positive drug-target pairs. Furthermore, we implement a full-pipeline online service using \textbf{Streamlit}, enabling users to perform real-time inference and fine-tune the model on custom datasets. Our system demonstrates the practical application of large-scale pre-trained models in cross-modal biological information retrieval.
\end{abstract}
\section{Introduction}
The identification of interactions between small molecules (drugs) and proteins (targets) is fundamental to pharmacology. Traditional methods, such as high-throughput screening (HTS) and molecular docking, are often computationally expensive or time-consuming. Recently, deep learning approaches have shown promise in accelerating this process by learning rich representations of biological entities.
This project focuses on the \textbf{Bidirectional Retrieval} task:
\begin{enumerate}
\item \textbf{Protein $\to$ Molecule:} Given a target protein, recommend potential active molecules.
\item \textbf{Molecule $\to$ Protein:} Given a molecule, identify potential protein targets (target fishing).
\end{enumerate}
We adopt a Dual-Tower architecture, which allows for efficient indexing and retrieval compared to cross-encoder architectures. By utilizing state-of-the-art pre-trained encoders—specifically SaProt \cite{su2023saprot} for proteins and ChemBERTa \cite{chithrananda2020chemberta} for molecules—we aim to capture both the structural semantics of proteins and the chemical properties of molecules.
\section{Methodology}
\subsection{System Architecture}
Our system follows a Siamese-like Dual-Tower architecture consisting of two independent encoders that project inputs into a shared $d$-dimensional embedding space.
\subsubsection{Protein Encoder: SaProt}
For the protein tower, we employ \textbf{SaProt} (Structure-aware Protein Language Model) \cite{su2023saprot}. Unlike standard protein language models that only utilize amino acid sequences, SaProt incorporates structural information by using a Foldseek-derived alphabet.
\begin{itemize}
\item \textbf{Input:} A sequence of structure-residue tokens (e.g., combined amino acid and 3Di structural tokens).
\item \textbf{Backbone:} We utilize the \texttt{SaProt\_650M\_AF2} checkpoint.
\item \textbf{Pooling:} We use the representation of the \texttt{[CLS]} token as the global protein embedding $E_p$.
\end{itemize}
\subsubsection{Molecule Encoder: ChemBERTa}
For the molecular tower, we selected \textbf{ChemBERTa} \cite{chithrananda2020chemberta}, a RoBERTa-based model pre-trained on millions of SMILES strings from the ZINC database.
\begin{itemize}
\item \textbf{Input:} SMILES (Simplified Molecular Input Line Entry System) strings.
\item \textbf{Backbone:} We utilize the \texttt{ChemBERTa-zinc-base-v1} checkpoint.
\item \textbf{Pooling:} The \texttt{[CLS]} token embedding serves as the molecular representation $E_m$.
\end{itemize}
\subsubsection{Projection Head}
To align the dimensions of $E_p$ and $E_m$ and enhance representational capability, both embeddings pass through a non-linear projection head:
\begin{equation}
z = W_2 \, \mathrm{ReLU}(W_1 E),
\end{equation}
where $z \in \mathbb{R}^{128}$ is the final vector used for retrieval.
\subsection{Training Objective: Contrastive Learning}
We treat the retrieval task as a metric learning problem. The goal is to maximize the cosine similarity between interacting protein-molecule pairs (positives) while minimizing the similarity for non-interacting pairs (negatives). We employ the InfoNCE loss (or NT-Xent loss):
\begin{equation}
\mathcal{L} = - \log \frac{\exp(\text{sim}(z_p, z_m) / \tau)}{\sum_{j=1}^{N} \exp(\text{sim}(z_p, z_{m_j}) / \tau)}
\end{equation}
where $\text{sim}(\cdot)$ denotes cosine similarity, $\tau$ is the temperature parameter, and the denominator sums over one positive and $N-1$ negative samples in the mini-batch.
\subsection{Online Service Implementation}
To meet the requirements of "Track B: Full Pipeline," we developed an interactive web interface using \textbf{Streamlit}. The service is designed to be deployed on Google Colab or Hugging Face Spaces.
\begin{figure}[H]
\centering
% \includegraphics[width=0.8\textwidth]{your_screenshot.png}
% Note: Replace the above line with your actual screenshot
\caption{Overview of the BioRetrieval Streamlit Interface.}
\label{fig:interface}
\end{figure}
Key features include:
\begin{itemize}
\item \textbf{Inference Module:} Allows users to input a Protein sequence/ID or a Molecule SMILES. The system encodes the query and performs a k-Nearest Neighbors (k-NN) search against a pre-computed FAISS/Vector index of candidates.
\item \textbf{Training Module:} Enables users to upload a custom dataset (Parquet format) to fine-tune the model. The system automatically updates the vector database after training.
\end{itemize}
\section{Experiments and Results}
\subsection{Experimental Setup}
\begin{itemize}
\item \textbf{Dataset:} [Insert Dataset Name, e.g., BindingDB, BioSNAP, or the course provided dataset].
\item \textbf{Split:} 80\% Training, 10\% Validation, 10\% Testing.
\item \textbf{Hyperparameters:} Batch size = [Insert], Learning rate = [Insert], Epochs = [Insert].
\item \textbf{Environment:} Trained on [e.g., NVIDIA T4 GPU via Google Colab].
\end{itemize}
\subsection{Performance Metrics}
We evaluate the retrieval performance using Recall at K (R@K), which measures the probability that the true positive target is found within the top K retrieved results.
\begin{table}[H]
\centering
\caption{Retrieval Performance on Test Set}
\label{tab:results}
\begin{tabular}{lccc}
\toprule
\textbf{Task} & \textbf{R@1} & \textbf{R@5} & \textbf{R@10} \\
\midrule
Protein $\to$ Molecule & [0.XX] & [0.XX] & [0.XX] \\
Molecule $\to$ Protein & [0.XX] & [0.XX] & [0.XX] \\
\bottomrule
\end{tabular}
\end{table}
\textit{[Analysis: Briefly describe your results here. For example: "The model shows strong performance in retrieving molecules for proteins, likely due to the rich structural information provided by SaProt..."]}
\section{Conclusion}
In this project, we successfully implemented a bidirectional retrieval system for drug discovery. By combining the structural awareness of SaProt with the chemical understanding of ChemBERTa, our Dual-Tower model effectively bridges the gap between biological and chemical modalities. The developed Streamlit application provides a user-friendly interface for researchers to leverage this technology, satisfying the requirements for a full-pipeline online service.
% --- Bibliography ---
\begin{filecontents}[overwrite]{references.bib} % [overwrite] ensures bib edits take effect on recompile
@article{su2023saprot,
title={SaProt: Protein Language Modeling with Structure-aware Vocabulary},
author={Su, Jin and Han, Chenchen and Zhou, Yuyang and Shan, Jun and Zhou, Xibin and Yuan, Fajie},
journal={bioRxiv},
pages={2023--10},
year={2023},
publisher={Cold Spring Harbor Laboratory},
url={https://www.biorxiv.org/content/10.1101/2023.10.01.560349v3.full.pdf}
}
@article{chithrananda2020chemberta,
title={ChemBERTa: Large-scale self-supervised pretraining for molecular property prediction},
author={Chithrananda, Seyone and Grand, Gabriel and Ramsundar, Bharath},
journal={arXiv preprint arXiv:2010.09885},
year={2020},
url={https://arxiv.org/pdf/2010.09885}
}
@misc{westlake_repl_saprot,
author = {Westlake-Repl},
title = {SaProt GitHub Repository},
year = {2023},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/westlake-repl/SaProt}}
}
\end{filecontents}
\bibliographystyle{ieeetr}
\bibliography{references}
\end{document}