yrshi committed
Commit 8b4913f · 1 Parent(s): c55da95

first commit

Files changed (6)
  1. app.py +212 -53
  2. infer.py +150 -0
  3. install_cuda.sh +7 -0
  4. install_env.sh +9 -0
  5. retreival_launch.sh +11 -0
  6. retrieval_server.py +390 -0
app.py CHANGED
@@ -1,70 +1,229 @@
  import gradio as gr
- from huggingface_hub import InferenceClient


  def respond(
      message,
      history: list[dict[str, str]],
-     system_message,
      max_tokens,
      temperature,
      top_p,
-     hf_token: gr.OAuthToken,
  ):
      """
-     For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
      """
-     client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-
-     messages = [{"role": "system", "content": system_message}]
-
-     messages.extend(history)
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         choices = message.choices
-         token = ""
-         if len(choices) and choices[0].delta.content:
-             token = choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- chatbot = gr.ChatInterface(
-     respond,
-     type="messages",
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )

  with gr.Blocks() as demo:
-     with gr.Sidebar():
-         gr.LoginButton()
-     chatbot.render()

  if __name__ == "__main__":
-     demo.launch()

+ import transformers
+ import torch
+ import requests
+ import re
  import gradio as gr
+ from threading import Thread

+ # --- Configuration --------------------------------------------------
+
+ # 1. DEFINE YOUR MODEL
+ model_id = "yrshi/AutoRefine-Qwen2.5-3B-Base"
+
+ # 2. !!! CRITICAL: UPDATE THIS URL !!!
+ # Your local 'http://127.0.0.1:8000/retrieve' will NOT work on Hugging Face.
+ # You must deploy your retrieval service and provide its public URL here.
+ RETRIEVER_URL = "http://127.0.0.1:8000/retrieve"  # <-- UPDATE ME
+
+ # 3. MODEL & SEARCH CONSTANTS
+ curr_eos = [151645, 151643]  # for Qwen2.5 series models
+ curr_search_template = '\n\n{output_text}<documents>{search_results}</documents>\n\n'
+ target_sequences = ["</search>", " </search>", "</search>\n", " </search>\n", "</search>\n\n", " </search>\n\n"]
+
+ # --- Global Model & Tokenizer Loading -------------------------------
+ # This happens once when the Space starts.
+ # Ensure your Space has a GPU assigned (e.g., T4, A10G).
+
+ print("Loading model and tokenizer...")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+ model = transformers.AutoModelForCausalLM.from_pretrained(
+     model_id,
+     torch_dtype=torch.bfloat16,
+     device_map="auto"
+ )
+ print("Model and tokenizer loaded successfully.")
+
+ # --- Custom Stopping Criteria Class ---------------------------------
+
+ class StopOnSequence(transformers.StoppingCriteria):
+     def __init__(self, target_sequences, tokenizer):
+         self.target_ids = [tokenizer.encode(target_sequence, add_special_tokens=False) for target_sequence in target_sequences]
+         self.target_lengths = [len(target_id) for target_id in self.target_ids]
+         self._tokenizer = tokenizer
+
+     def __call__(self, input_ids, scores, **kwargs):
+         targets = [torch.as_tensor(target_id, device=input_ids.device) for target_id in self.target_ids]
+         if input_ids.shape[1] < min(self.target_lengths):
+             return False
+         for i, target in enumerate(targets):
+             if torch.equal(input_ids[0, -self.target_lengths[i]:], target):
+                 return True
+         return False
+
+ # Initialize stopping criteria globally
+ stopping_criteria = transformers.StoppingCriteriaList([StopOnSequence(target_sequences, tokenizer)])
+
+ # --- Helper Functions (Search & Parse) ------------------------------
+
+ def get_query(text):
+     pattern = re.compile(r"<search>(.*?)</search>", re.DOTALL)
+     matches = pattern.findall(text)
+     return matches[-1] if matches else None
+
+ def search(query: str):
+     """
+     Calls your deployed retriever service.
+     """
+     payload = {"queries": [query], "topk": 3, "return_scores": True}
+
+     if RETRIEVER_URL == "http://127.0.0.1:8000/retrieve":
+         print("WARNING: Using default local retriever URL. This will likely fail.")
+         print("Please update RETRIEVER_URL in app.py to your deployed service.")
+
+     try:
+         response = requests.post(RETRIEVER_URL, json=payload, timeout=10)
+         response.raise_for_status()  # Raise an error for bad responses
+         results = response.json()['result']
+
+         format_reference = ''
+         for idx, doc_item in enumerate(results[0]):
+             content = doc_item['document']['contents']
+             title = content.split("\n")[0]
+             text = "\n".join(content.split("\n")[1:])
+             format_reference += f"Doc {idx+1}(Title: {title}) {text}\n"
+         return format_reference
+
+     except requests.exceptions.RequestException as e:
+         print(f"Error calling retriever: {e}")
+         return f"Error: Could not retrieve search results for query: {query}"
+     except (KeyError, IndexError):
+         print("Error parsing retriever response")
+         return "Error: Malformed response from retriever."
+
+ # --- Main Gradio 'respond' Function ---------------------------------

  def respond(
      message,
      history: list[dict[str, str]],
+     system_message,  # This is now our base prompt
      max_tokens,
      temperature,
      top_p,
+     hf_token: gr.OAuthToken = None,  # Not used here, but in template
  ):
      """
+     This function implements your local multi-turn search logic as a
+     streaming generator for the Gradio interface.
      """
+
+     question = message.strip()
+
+     # Use the system_message from the UI as the base prompt
+     # Or, if empty, use your default.
+     if not system_message:
+         system_message = """You are a helpful assistant excel at answering questions with multi-turn search engine calling. \
+ To answer questions, you must first reason through the available information using <think> and </think>. \
+ If you identify missing knowledge, you may issue a search request using <search> query </search> at any time. The retrieval system will provide you with the three most relevant documents enclosed in <documents> and </documents>. \
+ After each search, you need to summarize and refine the existing documents in <refine> and </refine>. \
+ You may send multiple search requests if needed. \
+ Once you have sufficient information, provide a concise final answer using <answer> and </answer>. For example, <answer> Donald Trump </answer>."""
+
+     prompt = f"{system_message} Question: {question}\n"
+
+     if tokenizer.chat_template:
+         # Apply chat template if it exists
+         # Note: Your logic builds the prompt manually, but this ensures
+         # correct special tokens if the model needs them.
+         chat_prompt = [{"role": "user", "content": prompt}]
+         prompt = tokenizer.apply_chat_template(chat_prompt, add_generation_prompt=True, tokenize=False)
+
+     # This string will accumulate the full agent trajectory
+     full_response_trajectory = ""
+
+     while True:
+         input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
+         attention_mask = torch.ones_like(input_ids)
+
+         # Check for context overflow
+         if input_ids.shape[1] > model.config.max_position_embeddings - max_tokens:
+             print("Context limit reached.")
+             full_response_trajectory += "\n\n[Error: Context limit reached. Aborting.]"
+             yield full_response_trajectory
+             break
+
+         # Generate text with the stopping criteria
+         outputs = model.generate(
+             input_ids,
+             attention_mask=attention_mask,
+             max_new_tokens=max_tokens,
+             stopping_criteria=stopping_criteria,
+             pad_token_id=tokenizer.eos_token_id,
+             do_sample=True,
+             temperature=temperature,
+             top_p=top_p
+         )
+
+         # Decode the *newly* generated tokens
+         generated_token_ids = outputs[0][input_ids.shape[1]:]
+         output_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True)
+
+         # Check if generation ended with an EOS token
+         if outputs[0][-1].item() in curr_eos:
+             full_response_trajectory += output_text
+             yield full_response_trajectory  # Yield the final text
+             break  # Exit the loop
+
+         # --- Generation stopped at </search> ---
+
+         # Get the full text (prompt + new generation) to parse the *last* query
+         full_generation_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+         query_text = get_query(full_generation_text)
+
+         if query_text:
+             search_results = search(query_text)
+         else:
+             search_results = 'Error: Stop token found but no <search> query was parsed.'
+
+         # Construct the text to append to the prompt
+         search_text = curr_search_template.format(
+             output_text=output_text,
+             search_results=search_results
+         )
+
+         # Append to the prompt for the next loop
+         prompt += search_text
+
+         # Append to the trajectory string and yield to the UI
+         full_response_trajectory += search_text
+         yield full_response_trajectory
+
+
+ # --- Gradio UI (Example) -------------------------------------------
+ # This part is just to make the file runnable.
+ # You can customize your Gradio UI as needed.

  with gr.Blocks() as demo:
+     gr.Markdown("# Multi-Turn Search Agent")
+     gr.Markdown(f"Running model: `{model_id}`")
+
+     with gr.Accordion("Prompt & Parameters"):
+         system_message = gr.Textbox(
+             label="System Message",
+             value="""You are a helpful assistant... (full prompt from code)""",
+             lines=10
+         )
+         max_tokens = gr.Slider(50, 2048, value=1024, label="Max New Tokens")
+         temperature = gr.Slider(0.1, 1.0, value=0.7, label="Temperature")
+         top_p = gr.Slider(0.1, 1.0, value=1.0, label="Top-p")
+
+     chatbot = gr.Chatbot(label="Agent Trajectory")
+     msg = gr.Textbox(label="Your Question")
+
+     def user_turn(user_message, history):
+         return "", history + [[user_message, None]]

+     msg.submit(
+         user_turn,
+         [msg, chatbot],
+         [msg, chatbot],
+         queue=False
+     ).then(
+         respond,
+         [msg, chatbot, system_message, max_tokens, temperature, top_p],
+         chatbot
+     )

  if __name__ == "__main__":
+     demo.queue().launch(debug=True)
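Because respond() is a plain Python generator, it can also be exercised without the Gradio UI. The following is a minimal sketch (not part of the commit), assuming the model and tokenizer above loaded successfully and RETRIEVER_URL points at a reachable retrieval service; the `app` import path is illustrative:

# Sketch only: drive app.respond() outside Gradio and print the final trajectory.
from app import respond  # illustrative import path

final_trajectory = ""
for partial in respond(
    "Who is the sibling of the author of Kapalkundala?",  # message
    [],    # history (not consumed by the search loop)
    "",    # empty system_message -> the default multi-turn search prompt is used
    1024,  # max_tokens
    0.7,   # temperature
    0.95,  # top_p
):
    final_trajectory = partial  # each yield is the trajectory accumulated so far

print(final_trajectory)  # should end with <answer> ... </answer>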
infer.py ADDED
@@ -0,0 +1,150 @@
+ import transformers
+ import torch
+ import requests
+ import re
+
+ question_list = [
+     "Who was born first out of Cameron Mitchell (Singer) and Léopold De Saussure?",  # Ground Truth: "Léopold De Saussure"
+     "The Clavivox was invented by an American composer who was born Harry Warnow in what year?",  # Ground Truth: "1908"
+     "Which movie did Disney produce first, The Many Adventures of Winnie the Pooh or Ride a Wild Pony?",  # Ground Truth: "Ride a Wild Pony"
+     "Who is the sibling of the author of Kapalkundala?",  # Ground Truth: "Sanjib Chandra" or "Sanjib Chandra Chattopadhyay"
+ ]
+
+ # Model ID and device setup
+ model_id = "yrshi/AutoRefine-Qwen2.5-3B-Base"
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ curr_eos = [151645, 151643]  # for Qwen2.5 series models
+ curr_search_template = '{output_text}\n\n<documents>{search_results}</documents>\n\n'
+
+ # Initialize the tokenizer and model
+ tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
+ model = transformers.AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
+
+ # Define the custom stopping criterion
+ class StopOnSequence(transformers.StoppingCriteria):
+     def __init__(self, target_sequences, tokenizer):
+         # Encode the string so we have the exact token-IDs pattern
+         self.target_ids = [tokenizer.encode(target_sequence, add_special_tokens=False) for target_sequence in target_sequences]
+         self.target_lengths = [len(target_id) for target_id in self.target_ids]
+         self._tokenizer = tokenizer
+
+     def __call__(self, input_ids, scores, **kwargs):
+         # Make sure the target IDs are on the same device
+         targets = [torch.as_tensor(target_id, device=input_ids.device) for target_id in self.target_ids]
+
+         if input_ids.shape[1] < min(self.target_lengths):
+             return False
+
+         # Compare the tail of input_ids with our target_ids
+         for i, target in enumerate(targets):
+             if torch.equal(input_ids[0, -self.target_lengths[i]:], target):
+                 return True
+
+         return False
+
+ def get_query(text):
+     import re
+     pattern = re.compile(r"<search>(.*?)</search>", re.DOTALL)
+     matches = pattern.findall(text)
+     if matches:
+         return matches[-1]
+     else:
+         return None
+
+ def search(query: str):
+     payload = {
+         "queries": [query],
+         "topk": 3,
+         "return_scores": True
+     }
+     results = requests.post("http://127.0.0.1:8000/retrieve", json=payload).json()['result']
+
+     def _passages2string(retrieval_result):
+         format_reference = ''
+         for idx, doc_item in enumerate(retrieval_result):
+             content = doc_item['document']['contents']
+             title = content.split("\n")[0]
+             text = "\n".join(content.split("\n")[1:])
+             format_reference += f"Doc {idx+1}(Title: {title}) {text}\n"
+         return format_reference
+
+     return _passages2string(results[0])
+
+
+ # Initialize the stopping criteria
+ target_sequences = ["</search>", " </search>", "</search>\n", " </search>\n", "</search>\n\n", " </search>\n\n"]
+ stopping_criteria = transformers.StoppingCriteriaList([StopOnSequence(target_sequences, tokenizer)])
+
+
+ def run_search(question):
+     question = question.strip()
+     cnt = 0
+     trajectory = []
+
+     # Prepare the message
+     prompt = f"""You are a helpful assistant excel at answering questions with multi-turn search engine calling. \
+ To answer questions, you must first reason through the available information using <think> and </think>. \
+ If you identify missing knowledge, you may issue a search request using <search> query </search> at any time. The retrieval system will provide you with the three most relevant documents enclosed in <documents> and </documents>. \
+ After each search, you need to summarize and refine the existing documents in <refine> and </refine>. \
+ You may send multiple search requests if needed. \
+ Once you have sufficient information, provide a concise final answer using <answer> and </answer>. For example, <answer> Donald Trump </answer>. Question: {question}\n"""
+
+
+     if tokenizer.chat_template:
+         prompt = tokenizer.apply_chat_template([{"role": "user", "content": prompt}], add_generation_prompt=True, tokenize=False)
+
+     print(prompt)
+     # Encode the chat-formatted prompt and move it to the correct device
+     while True:
+         input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
+         attention_mask = torch.ones_like(input_ids)
+
+         # Generate text with the stopping criteria
+         outputs = model.generate(
+             input_ids,
+             attention_mask=attention_mask,
+             max_new_tokens=1024,
+             stopping_criteria=stopping_criteria,
+             pad_token_id=tokenizer.eos_token_id,
+             do_sample=True,
+             temperature=0.7
+         )
+
+         if outputs[0][-1].item() in curr_eos:
+             generated_tokens = outputs[0][input_ids.shape[1]:]
+             output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+             trajectory.append(output_text)
+             print(output_text)
+             break
+
+         generated_tokens = outputs[0][input_ids.shape[1]:]
+         output_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+         query_text = get_query(tokenizer.decode(outputs[0], skip_special_tokens=True))
+         if query_text:
+             search_results = search(query_text)
+         else:
+             search_results = ''
+
+         search_text = curr_search_template.format(output_text=output_text.strip(), search_results=search_results.strip())
+         prompt += search_text
+         cnt += 1
+         print(search_text)
+         trajectory.append(search_text)
+     print(f"Total iterations: {cnt}")
+     answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)
+     answer_match = answer_pattern.search(trajectory[-1])
+     if answer_match:
+         final_answer = answer_match.group(1).strip()
+         print(f"Final answer found: {final_answer}")
+     else:
+         print("No final answer found in the output.")
+         final_answer = "No final answer found."
+     return ''.join([text for text in trajectory]), final_answer
+
+ if __name__ == "__main__":
+     output_text, final_answer = run_search(question_list[0])
+     print(f"Output trajectory: {output_text}")
+     print(f"Final answer: {final_answer}")
install_cuda.sh ADDED
@@ -0,0 +1,7 @@
+ mkdir -p ~/miniconda3
+ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
+ bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
+ rm ~/miniconda3/miniconda.sh
+
+ source ~/miniconda3/bin/activate
+ conda init --all
install_env.sh ADDED
@@ -0,0 +1,9 @@
+ conda create -n faiss_env python=3.10
+ conda activate faiss_env
+
+ conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.1 -c pytorch -c nvidia
+ pip install transformers datasets pyserini
+
+ conda install -c pytorch -c nvidia faiss-gpu=1.8.0
+
+ pip install uvicorn fastapi
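A quick way to confirm that the faiss_env environment came up with a working GPU stack is a short Python check (a sketch, not part of the commit):

# Run inside `conda activate faiss_env` to sanity-check the installed packages.
import torch
import faiss
import transformers

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("faiss GPUs visible:", faiss.get_num_gpus())
print("transformers", transformers.__version__)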
retreival_launch.sh ADDED
@@ -0,0 +1,11 @@
+
+ file_path=./data
+ index_file=$file_path/e5_Flat.index
+ corpus_file=$file_path/wiki-18.jsonl
+ retriever=intfloat/e5-base-v2
+
+ export CUDA_VISIBLE_DEVICES="1,3"
+ python search_r1/search/retrieval_server.py --index_path $index_file \
+     --corpus_path $corpus_file \
+     --topk 3 \
+     --retriever_model $retriever
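Once the launch script has the server listening (uvicorn defaults to port 8000), it accepts the same payload that infer.py and app.py send. A minimal query sketch, assuming the index and corpus loaded successfully:

# Sketch only: query the local retrieval server the way infer.py's search() does.
import requests

payload = {"queries": ["Who invented the Clavivox?"], "topk": 3, "return_scores": True}
resp = requests.post("http://127.0.0.1:8000/retrieve", json=payload, timeout=10)
resp.raise_for_status()

for hit in resp.json()["result"][0]:
    title = hit["document"]["contents"].split("\n")[0]
    print(f"{hit['score']:.3f}  {title}")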
retrieval_server.py ADDED
@@ -0,0 +1,390 @@
+ import json
+ import os
+ import warnings
+ from typing import List, Dict, Optional
+ import argparse
+
+ import faiss
+ import torch
+ import numpy as np
+ from transformers import AutoConfig, AutoTokenizer, AutoModel
+ from tqdm import tqdm
+ import datasets
+
+ import uvicorn
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+
+
+ parser = argparse.ArgumentParser(description="Launch the local faiss retriever.")
+ parser.add_argument("--index_path", type=str, help="Corpus indexing file.")
+ parser.add_argument("--corpus_path", type=str, help="Local corpus file.")
+ parser.add_argument("--topk", type=int, default=3, help="Number of retrieved passages for one query.")
+ parser.add_argument("--retriever_model", type=str, default="intfloat/e5-base-v2", help="Name of the retriever model.")
+
+ args = parser.parse_args()
+
+ def load_corpus(corpus_path: str):
+     corpus = datasets.load_dataset(
+         'json',
+         data_files=corpus_path,
+         split="train",
+         num_proc=4
+     )
+     return corpus
+
+ def read_jsonl(file_path):
+     data = []
+     with open(file_path, "r") as f:
+         for line in f:
+             data.append(json.loads(line))
+     return data
+
+ def load_docs(corpus, doc_idxs):
+     results = [corpus[int(idx)] for idx in doc_idxs]
+     return results
+
+ def load_model(model_path: str, use_fp16: bool = False):
+     model_config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+     model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+     model.eval()
+     model.cuda()
+     if use_fp16:
+         model = model.half()
+     tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True)
+     return model, tokenizer
+
+ def pooling(
+     pooler_output,
+     last_hidden_state,
+     attention_mask = None,
+     pooling_method = "mean"
+ ):
+     if pooling_method == "mean":
+         last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
+         return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
+     elif pooling_method == "cls":
+         return last_hidden_state[:, 0]
+     elif pooling_method == "pooler":
+         return pooler_output
+     else:
+         raise NotImplementedError("Pooling method not implemented!")
+
+ class Encoder:
+     def __init__(self, model_name, model_path, pooling_method, max_length, use_fp16):
+         self.model_name = model_name
+         self.model_path = model_path
+         self.pooling_method = pooling_method
+         self.max_length = max_length
+         self.use_fp16 = use_fp16
+
+         self.model, self.tokenizer = load_model(model_path=model_path, use_fp16=use_fp16)
+         self.model.eval()
+
+     @torch.no_grad()
+     def encode(self, query_list: List[str], is_query=True) -> np.ndarray:
+         # processing query for different encoders
+         if isinstance(query_list, str):
+             query_list = [query_list]
+
+         if "e5" in self.model_name.lower():
+             if is_query:
+                 query_list = [f"query: {query}" for query in query_list]
+             else:
+                 query_list = [f"passage: {query}" for query in query_list]
+
+         if "bge" in self.model_name.lower():
+             if is_query:
+                 query_list = [f"Represent this sentence for searching relevant passages: {query}" for query in query_list]
+
+         inputs = self.tokenizer(query_list,
+                                 max_length=self.max_length,
+                                 padding=True,
+                                 truncation=True,
+                                 return_tensors="pt"
+                                 )
+         inputs = {k: v.cuda() for k, v in inputs.items()}
+
+         if "T5" in type(self.model).__name__:
+             # T5-based retrieval model
+             decoder_input_ids = torch.zeros(
+                 (inputs['input_ids'].shape[0], 1), dtype=torch.long
+             ).to(inputs['input_ids'].device)
+             output = self.model(
+                 **inputs, decoder_input_ids=decoder_input_ids, return_dict=True
+             )
+             query_emb = output.last_hidden_state[:, 0, :]
+         else:
+             output = self.model(**inputs, return_dict=True)
+             query_emb = pooling(output.pooler_output,
+                                 output.last_hidden_state,
+                                 inputs['attention_mask'],
+                                 self.pooling_method)
+             if "dpr" not in self.model_name.lower():
+                 query_emb = torch.nn.functional.normalize(query_emb, dim=-1)
+
+         query_emb = query_emb.detach().cpu().numpy()
+         query_emb = query_emb.astype(np.float32, order="C")
+
+         del inputs, output
+         torch.cuda.empty_cache()
+
+         return query_emb
+
+ class BaseRetriever:
+     def __init__(self, config):
+         self.config = config
+         self.retrieval_method = config.retrieval_method
+         self.topk = config.retrieval_topk
+
+         self.index_path = config.index_path
+         self.corpus_path = config.corpus_path
+
+     def _search(self, query: str, num: int, return_score: bool):
+         raise NotImplementedError
+
+     def _batch_search(self, query_list: List[str], num: int, return_score: bool):
+         raise NotImplementedError
+
+     def search(self, query: str, num: int = None, return_score: bool = False):
+         return self._search(query, num, return_score)
+
+     def batch_search(self, query_list: List[str], num: int = None, return_score: bool = False):
+         return self._batch_search(query_list, num, return_score)
+
+ class BM25Retriever(BaseRetriever):
+     def __init__(self, config):
+         super().__init__(config)
+         from pyserini.search.lucene import LuceneSearcher
+         self.searcher = LuceneSearcher(self.index_path)
+         self.contain_doc = self._check_contain_doc()
+         if not self.contain_doc:
+             self.corpus = load_corpus(self.corpus_path)
+         self.max_process_num = 8
+
+     def _check_contain_doc(self):
+         return self.searcher.doc(0).raw() is not None
+
+     def _search(self, query: str, num: int = None, return_score: bool = False):
+         if num is None:
+             num = self.topk
+         hits = self.searcher.search(query, num)
+         if len(hits) < 1:
+             if return_score:
+                 return [], []
+             else:
+                 return []
+         scores = [hit.score for hit in hits]
+         if len(hits) < num:
+             warnings.warn('Not enough documents retrieved!')
+         else:
+             hits = hits[:num]
+
+         if self.contain_doc:
+             all_contents = [
+                 json.loads(self.searcher.doc(hit.docid).raw())['contents']
+                 for hit in hits
+             ]
+             results = [
+                 {
+                     'title': content.split("\n")[0].strip("\""),
+                     'text': "\n".join(content.split("\n")[1:]),
+                     'contents': content
+                 }
+                 for content in all_contents
+             ]
+         else:
+             results = load_docs(self.corpus, [hit.docid for hit in hits])
+
+         if return_score:
+             return results, scores
+         else:
+             return results
+
+     def _batch_search(self, query_list: List[str], num: int = None, return_score: bool = False):
+         results = []
+         scores = []
+         for query in query_list:
+             item_result, item_score = self._search(query, num, True)
+             results.append(item_result)
+             scores.append(item_score)
+         if return_score:
+             return results, scores
+         else:
+             return results
+
+ class DenseRetriever(BaseRetriever):
+     def __init__(self, config):
+         super().__init__(config)
+         self.index = faiss.read_index(self.index_path)
+         if config.faiss_gpu:
+             co = faiss.GpuMultipleClonerOptions()
+             co.useFloat16 = True
+             co.shard = True
+             self.index = faiss.index_cpu_to_all_gpus(self.index, co=co)
+
+         self.corpus = load_corpus(self.corpus_path)
+         self.encoder = Encoder(
+             model_name = self.retrieval_method,
+             model_path = config.retrieval_model_path,
+             pooling_method = config.retrieval_pooling_method,
+             max_length = config.retrieval_query_max_length,
+             use_fp16 = config.retrieval_use_fp16
+         )
+         self.topk = config.retrieval_topk
+         self.batch_size = config.retrieval_batch_size
+
+     def _search(self, query: str, num: int = None, return_score: bool = False):
+         if num is None:
+             num = self.topk
+         query_emb = self.encoder.encode(query)
+         scores, idxs = self.index.search(query_emb, k=num)
+         idxs = idxs[0]
+         scores = scores[0]
+         results = load_docs(self.corpus, idxs)
+         if return_score:
+             return results, scores.tolist()
+         else:
+             return results
+
+     def _batch_search(self, query_list: List[str], num: int = None, return_score: bool = False):
+         if isinstance(query_list, str):
+             query_list = [query_list]
+         if num is None:
+             num = self.topk
+
+         results = []
+         scores = []
+         for start_idx in tqdm(range(0, len(query_list), self.batch_size), desc='Retrieval process: '):
+             query_batch = query_list[start_idx:start_idx + self.batch_size]
+             batch_emb = self.encoder.encode(query_batch)
+             batch_scores, batch_idxs = self.index.search(batch_emb, k=num)
+             batch_scores = batch_scores.tolist()
+             batch_idxs = batch_idxs.tolist()
+
+             # load_docs is not vectorized, but is a python list approach
+             flat_idxs = sum(batch_idxs, [])
+             batch_results = load_docs(self.corpus, flat_idxs)
+             # chunk them back
+             batch_results = [batch_results[i*num : (i+1)*num] for i in range(len(batch_idxs))]
+
+             results.extend(batch_results)
+             scores.extend(batch_scores)
+
+             del batch_emb, batch_scores, batch_idxs, query_batch, flat_idxs, batch_results
+             torch.cuda.empty_cache()
+
+         if return_score:
+             return results, scores
+         else:
+             return results
+
+ def get_retriever(config):
+     if config.retrieval_method == "bm25":
+         return BM25Retriever(config)
+     else:
+         return DenseRetriever(config)
+
+
+ #####################################
+ # FastAPI server below
+ #####################################
+
+ class Config:
+     """
+     Minimal config class (simulating your argparse)
+     Replace this with your real arguments or load them dynamically.
+     """
+     def __init__(
+         self,
+         retrieval_method: str = "bm25",
+         retrieval_topk: int = 10,
+         index_path: str = "./index/bm25",
+         corpus_path: str = "./data/corpus.jsonl",
+         dataset_path: str = "./data",
+         data_split: str = "train",
+         faiss_gpu: bool = True,
+         retrieval_model_path: str = "./model",
+         retrieval_pooling_method: str = "mean",
+         retrieval_query_max_length: int = 256,
+         retrieval_use_fp16: bool = False,
+         retrieval_batch_size: int = 128
+     ):
+         self.retrieval_method = retrieval_method
+         self.retrieval_topk = retrieval_topk
+         self.index_path = index_path
+         self.corpus_path = corpus_path
+         self.dataset_path = dataset_path
+         self.data_split = data_split
+         self.faiss_gpu = faiss_gpu
+         self.retrieval_model_path = retrieval_model_path
+         self.retrieval_pooling_method = retrieval_pooling_method
+         self.retrieval_query_max_length = retrieval_query_max_length
+         self.retrieval_use_fp16 = retrieval_use_fp16
+         self.retrieval_batch_size = retrieval_batch_size
+
+
+ class QueryRequest(BaseModel):
+     queries: List[str]
+     topk: Optional[int] = None
+     return_scores: bool = False
+
+
+ app = FastAPI()
+
+ # 1) Build a config (could also parse from arguments).
+ #    In real usage, you'd parse your CLI arguments or environment variables.
+ config = Config(
+     retrieval_method = "e5",  # or "dense"
+     index_path=args.index_path,
+     corpus_path=args.corpus_path,
+     retrieval_topk=args.topk,
+     faiss_gpu=True,
+     retrieval_model_path=args.retriever_model,
+     retrieval_pooling_method="mean",
+     retrieval_query_max_length=256,
+     retrieval_use_fp16=True,
+     retrieval_batch_size=512,
+ )
+
+ # 2) Instantiate a global retriever so it is loaded once and reused.
+ retriever = get_retriever(config)
+
+ @app.post("/retrieve")
+ def retrieve_endpoint(request: QueryRequest):
+     """
+     Endpoint that accepts queries and performs retrieval.
+     Input format:
+     {
+       "queries": ["What is Python?", "Tell me about neural networks."],
+       "topk": 3,
+       "return_scores": true
+     }
+     """
+     if not request.topk:
+         request.topk = config.retrieval_topk  # fallback to default
+
+     # Perform batch retrieval
+     results, scores = retriever.batch_search(
+         query_list=request.queries,
+         num=request.topk,
+         return_score=request.return_scores
+     )
+
+     # Format response
+     resp = []
+     for i, single_result in enumerate(results):
+         if request.return_scores:
+             # If scores are returned, combine them with results
+             combined = []
+             for doc, score in zip(single_result, scores[i]):
+                 combined.append({"document": doc, "score": score})
+             resp.append(combined)
+         else:
+             resp.append(single_result)
+     return {"result": resp}
+
+
+ if __name__ == "__main__":
+     # 3) Launch the server. By default, it listens on http://127.0.0.1:8000
+     uvicorn.run(app, host="0.0.0.0", port=8000)
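Both retrievers, and the clients in app.py and infer.py, assume each corpus record exposes a `contents` field whose first line is the title and whose remaining lines are the passage text. A sketch of one wiki-18.jsonl line under that assumption (field names other than `contents`, and the text itself, are illustrative):

# Sketch only: the shape of a single corpus line expected by load_corpus()/load_docs().
import json

example_line = {
    "id": "0",  # illustrative extra field
    "contents": "Example Article Title\nFirst sentence of the passage. Second sentence of the passage.",
}
print(json.dumps(example_line))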