% --- Repository metadata accidentally pasted above the preamble; commented out so the file compiles ---
% Recommend_system / project.tex
% tong
% revise streamlit path
% 806895c
\documentclass[a4paper,11pt]{article}
% --- Packages ---
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc} % proper hyphenation of accented words and copyable PDF text
\usepackage{geometry}
\geometry{left=2.5cm, right=2.5cm, top=2.5cm, bottom=2.5cm}
\usepackage{graphicx}
\usepackage{amsmath, amssymb}
\usepackage{booktabs} % For professional tables
\usepackage{float}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{cite}
\usepackage[hidelinks]{hyperref} % load last (after cite) to avoid package conflicts; hidelinks suppresses colored link boxes
% --- Code Listing Style ---
\definecolor{codegreen}{rgb}{0,0.6,0}
\definecolor{codegray}{rgb}{0.5,0.5,0.5}
\definecolor{codepurple}{rgb}{0.58,0,0.82}
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}
\lstdefinestyle{mystyle}{
backgroundcolor=\color{backcolour},
commentstyle=\color{codegreen},
keywordstyle=\color{magenta},
numberstyle=\tiny\color{codegray},
stringstyle=\color{codepurple},
basicstyle=\ttfamily\footnotesize,
breakatwhitespace=false,
breaklines=true,
captionpos=b,
keepspaces=true,
numbers=left,
numbersep=5pt,
showspaces=false,
showstringspaces=false,
showtabs=false,
tabsize=2
}
\lstset{style=mystyle}
% --- Title Information ---
\title{\textbf{Bidirectional Protein-Molecule Retrieval via Dual-Tower Architecture with SaProt and ChemBERTa}}
\author{
\textbf{Team Name / ID} \\
Member 1 Name (Contribution: e.g., Model Training) \\
Member 2 Name (Contribution: e.g., Streamlit UI) \\
Member 3 Name (Contribution: e.g., Data Processing) \\
\textit{Course: AI for Life Science - Final Project}
}
\date{\today}
\begin{document}
\maketitle
\begin{abstract}
Efficient retrieval of potential drug candidates for specific protein targets (and vice versa) is a critical task in early-stage drug discovery. In this project, we present a \textbf{Dual-Tower} deep learning framework capable of bidirectional retrieval between protein sequences and molecular SMILES. We leverage \textbf{SaProt}, a structure-aware protein language model, and \textbf{ChemBERTa}, a transformer-based molecular encoder, to map both modalities into a shared latent space. The model is trained using contrastive learning to align positive drug-target pairs. Furthermore, we implement a full-pipeline online service using \textbf{Streamlit}, enabling users to perform real-time inference and fine-tune the model on custom datasets. Our system demonstrates the practical application of large-scale pre-trained models in cross-modal biological information retrieval.
\end{abstract}
\section{Introduction}
The identification of interactions between small molecules (drugs) and proteins (targets) is fundamental to pharmacology. Traditional methods, such as high-throughput screening (HTS) and molecular docking, are often computationally expensive or time-consuming. Recently, deep learning approaches have shown promise in accelerating this process by learning rich representations of biological entities.
This project focuses on the \textbf{Bidirectional Retrieval} task:
\begin{enumerate}
\item \textbf{Protein $\to$ Molecule:} Given a target protein, recommend potential active molecules.
\item \textbf{Molecule $\to$ Protein:} Given a molecule, identify potential protein targets (target fishing).
\end{enumerate}
We adopt a Dual-Tower architecture, which allows for efficient indexing and retrieval compared to cross-encoder architectures. By utilizing state-of-the-art pre-trained encoders—specifically SaProt \cite{su2023saprot} for proteins and ChemBERTa \cite{chithrananda2020chemberta} for molecules—we aim to capture both the structural semantics of proteins and the chemical properties of molecules.
\section{Methodology}
\subsection{System Architecture}
Our system follows a Siamese-like Dual-Tower architecture consisting of two independent encoders that project inputs into a shared $d$-dimensional embedding space.
\subsubsection{Protein Encoder: SaProt}
For the protein tower, we employ \textbf{SaProt} (Structure-aware Protein Language Model) \cite{su2023saprot}. Unlike standard protein language models that only utilize amino acid sequences, SaProt incorporates structural information by using a Foldseek-derived alphabet.
\begin{itemize}
\item \textbf{Input:} A sequence of structure-residue tokens (e.g., combined amino acid and 3Di structural tokens).
\item \textbf{Backbone:} We utilize the \texttt{SaProt\_650M\_AF2} checkpoint.
\item \textbf{Pooling:} We use the representation of the \texttt{[CLS]} token as the global protein embedding $E_p$.
\end{itemize}
\subsubsection{Molecule Encoder: ChemBERTa}
For the molecular tower, we selected \textbf{ChemBERTa} \cite{chithrananda2020chemberta}, a RoBERTa-based model pre-trained on millions of SMILES strings from the ZINC database.
\begin{itemize}
\item \textbf{Input:} SMILES (Simplified Molecular Input Line Entry System) strings.
\item \textbf{Backbone:} We utilize the \texttt{ChemBERTa-zinc-base-v1} checkpoint.
\item \textbf{Pooling:} The \texttt{[CLS]} token embedding serves as the molecular representation $E_m$.
\end{itemize}
\subsubsection{Projection Head}
To align the dimensions of $E_p$ and $E_m$ and enhance representational capability, both embeddings pass through a non-linear projection head:
\begin{equation}
z = W_2 \, \mathrm{ReLU}(W_1 E),
\end{equation}
where $z \in \mathbb{R}^{128}$ is the final vector used for retrieval.
\subsection{Training Objective: Contrastive Learning}
We treat the retrieval task as a metric learning problem. The goal is to maximize the cosine similarity between interacting protein-molecule pairs (positives) while minimizing the similarity for non-interacting pairs (negatives). We employ the InfoNCE loss (or NT-Xent loss):
\begin{equation}
\mathcal{L} = - \log \frac{\exp(\text{sim}(z_p, z_m) / \tau)}{\sum_{j=1}^{N} \exp(\text{sim}(z_p, z_{m_j}) / \tau)}
\end{equation}
where $\text{sim}(\cdot)$ denotes cosine similarity, $\tau$ is the temperature parameter, and the denominator sums over one positive and $N-1$ negative samples in the mini-batch.
\subsection{Online Service Implementation}
To meet the requirements of "Track B: Full Pipeline," we developed an interactive web interface using \textbf{Streamlit}. The service is designed to be deployed on Google Colab or Hugging Face Spaces.
\begin{figure}[H]
\centering
% \includegraphics[width=0.8\textwidth]{your_screenshot.png}
% Note: Replace the above line with your actual screenshot
\caption{Overview of the BioRetrieval Streamlit Interface.}
\label{fig:interface}
\end{figure}
Key features include:
\begin{itemize}
\item \textbf{Inference Module:} Allows users to input a Protein sequence/ID or a Molecule SMILES. The system encodes the query and performs a k-Nearest Neighbors (k-NN) search against a pre-computed FAISS/Vector index of candidates.
\item \textbf{Training Module:} Enables users to upload a custom dataset (Parquet format) to fine-tune the model. The system automatically updates the vector database after training.
\end{itemize}
\section{Experiments and Results}
\subsection{Experimental Setup}
\begin{itemize}
\item \textbf{Dataset:} [Insert Dataset Name, e.g., BindingDB, BioSNAP, or the course provided dataset].
\item \textbf{Split:} 80\% Training, 10\% Validation, 10\% Testing.
\item \textbf{Hyperparameters:} Batch size = [Insert], Learning rate = [Insert], Epochs = [Insert].
\item \textbf{Environment:} Trained on [e.g., NVIDIA T4 GPU via Google Colab].
\end{itemize}
\subsection{Performance Metrics}
We evaluate the retrieval performance using Recall at K (R@K), which measures the probability that the true positive target is found within the top K retrieved results.
\begin{table}[H]
\centering
\caption{Retrieval Performance on Test Set}
\label{tab:results}
\begin{tabular}{lccc}
\toprule
\textbf{Task} & \textbf{R@1} & \textbf{R@5} & \textbf{R@10} \\
\midrule
Protein $\to$ Molecule & [0.XX] & [0.XX] & [0.XX] \\
Molecule $\to$ Protein & [0.XX] & [0.XX] & [0.XX] \\
\bottomrule
\end{tabular}
\end{table}
\textit{[Analysis: Briefly describe your results here. For example: "The model shows strong performance in retrieving molecules for proteins, likely due to the rich structural information provided by SaProt..."]}
\section{Conclusion}
In this project, we successfully implemented a bidirectional retrieval system for drug discovery. By combining the structural awareness of SaProt with the chemical understanding of ChemBERTa, our Dual-Tower model effectively bridges the gap between biological and chemical modalities. The developed Streamlit application provides a user-friendly interface for researchers to leverage this technology, satisfying the requirements for a full-pipeline online service.
% --- Bibliography ---
\begin{filecontents}[overwrite]{references.bib} % [overwrite] ensures bib edits take effect on recompile
@article{su2023saprot,
title={SaProt: Protein Language Modeling with Structure-aware Vocabulary},
author={Su, Jin and Han, Chenchen and Zhou, Yuyang and Shan, Jun and Zhou, Xibin and Yuan, Fajie},
journal={bioRxiv},
pages={2023--10},
year={2023},
publisher={Cold Spring Harbor Laboratory},
url={https://www.biorxiv.org/content/10.1101/2023.10.01.560349v3.full.pdf}
}
@article{chithrananda2020chemberta,
title={ChemBERTa: Large-scale self-supervised pretraining for molecular property prediction},
author={Chithrananda, Seyone and Grand, Gabriel and Ramsundar, Bharath},
journal={arXiv preprint arXiv:2010.09885},
year={2020},
url={https://arxiv.org/pdf/2010.09885}
}
@misc{westlake_repl_saprot,
author = {Westlake-Repl},
title = {SaProt GitHub Repository},
year = {2023},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/westlake-repl/SaProt}}
}
\end{filecontents}
\bibliographystyle{ieeetr}
\bibliography{references}
\end{document}