fix: update generate_stream to support async streaming for Modal and ZeroGPU
src/unpredictable_lord/chat/chat.py
CHANGED
@@ -30,15 +30,19 @@ if USE_MODAL:
     APP_NAME = "unpredictable-lord"
     _generate_stream = modal.Function.from_name(APP_NAME, "generate_stream")
 
-    def generate_stream(input_tokens):
-        logger.info("Calling Modal LLM generate_stream")
-        return _generate_stream.remote_gen(input_tokens)
+    async def generate_stream(input_tokens):
+        logger.info("Calling Modal LLM generate_stream (async)")
+        async for token in _generate_stream.remote_gen.aio(input_tokens):
+            yield token
 else:
     from unpredictable_lord.chat.llm_zerogpu import generate_stream as _generate_stream
 
-    def generate_stream(input_tokens):
-        logger.info("Calling ZeroGPU LLM generate_stream")
-        return _generate_stream(input_tokens)
+    async def generate_stream(input_tokens):
+        logger.info("Calling ZeroGPU LLM generate_stream (sync wrapper)")
+        # Note: This blocks the event loop, but is acceptable for ZeroGPU/Spaces
+        # where concurrency is limited anyway.
+        for token in _generate_stream(input_tokens):
+            yield token
 
 
 def _get_encoding():
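Both branches now expose the same async-generator interface, so downstream code can consume tokens uniformly with `async for`, whether they come from a remote Modal function or an in-process ZeroGPU generator. A minimal consumer sketch, assuming only the `generate_stream` defined above (the `collect_tokens` helper is hypothetical):

import asyncio

# Hypothetical helper: drain generate_stream into a list. The same code path
# works for both backends because each now yields tokens asynchronously.
async def collect_tokens(input_tokens):
    tokens = []
    async for token in generate_stream(input_tokens):
        tokens.append(token)
    return tokens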
@@ -46,16 +50,8 @@ def _get_encoding():
     return oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
 
 
-def _build_system_message(session_id: str, personality: str) -> oh.Message:
-    """Build developer message with system prompt and tool definitions.
-
-    Args:
-        session_id: The game session ID.
-        personality: The lord's personality type.
-
-    Returns:
-        Developer message with instructions and tool definitions.
-    """
+def _build_system_message(session_id: str, personality: str) -> oh.Message:
+    """Build developer message with system prompt and tool definitions."""
     personality_desc = PERSONALITY_DESCRIPTIONS.get(personality, "")
 
     system_prompt = f"""You are a {personality} lord of a medieval fantasy kingdom.
@@ -96,14 +92,7 @@ What counsel do you offer, advisor? Shall we address their grievances or press o
 
 
 def _convert_history_to_messages(chat_history: list[dict]) -> list[oh.Message]:
-    """Convert Gradio chat history to openai-harmony messages.
-
-    Args:
-        chat_history: Chat history in Gradio format.
-
-    Returns:
-        List of openai-harmony messages.
-    """
+    """Convert Gradio chat history to openai-harmony messages."""
     messages = []
     for msg in chat_history:
         if msg["role"] == "user":
@@ -117,67 +106,51 @@
     return messages
 
 
+class StreamResult:
+    """Holder for streaming result."""
+
+    def __init__(self):
+        self.response_text = ""
+        self.parsed_messages = []
+
+
+async def _stream_response(
+    messages: list[oh.Message],
+    encoding: oh.HarmonyEncoding,
+    partial_history: list[dict],
+    accumulated_response: str,
+    result: StreamResult,
+) -> AsyncGenerator[list[dict], None]:
+    """Stream LLM response and yield history updates."""
     convo = oh.Conversation.from_messages(messages)
     input_tokens = encoding.render_conversation_for_completion(convo, oh.Role.ASSISTANT)
 
     parser = oh.StreamableParser(encoding, role=oh.Role.ASSISTANT)
+    current_iteration_response = ""
 
-    parser_error = False
-    all_content = ""  # Capture all content regardless of channel
-
-    for token in generate_stream(input_tokens):
+    # Stream LLM response
+    async for token in generate_stream(input_tokens):
         if token is None:
             continue
 
         try:
             parser.process(token)
-        except oh.HarmonyError as e:
-            # Parser error - LLM generated invalid format (e.g., after tool result)
-            # This can happen when LLM copies tool call patterns in its response
-            logger.warning(f"Parser error at token {token_count}: {e}")
-            logger.warning("Treating this as end of valid response")
-            parser_error = True
-            break
         except Exception as e:
-            logger.error(
+            logger.error(f"Parser error: {e}")
+            break
+
+        if parser.current_channel == "final":
+            delta = parser.last_content_delta
+            if delta:
+                current_iteration_response += delta
+                partial_history[-1]["content"] = (
+                    accumulated_response + current_iteration_response
+                )
+                yield partial_history
 
-    # Finish parsing and return parsed messages
-    if not parser_error:
-        parser.process_eos()
-        parsed_messages = parser.messages
-    else:
-        # On parser error, return empty list to stop tool calling loop
-        # Use all_content as fallback if response_text is empty
-        if not response_text and all_content:
-            response_text = all_content
-            logger.info(f"Using fallback content (length: {len(all_content)})")
-        parsed_messages = []
-    yield response_text, parsed_messages
+    # Store results
+    result.response_text = current_iteration_response
+    result.parsed_messages = parser.messages
 
 
 async def chat_with_mcp_tools(
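Two details of this hunk are worth calling out. First, only deltas on the parser's "final" channel are surfaced to the UI, so model output on other channels never reaches the chat history. Second, StreamResult exists because an async generator cannot hand back a final value with `return`; the caller instead passes in a mutable holder that the generator fills before it finishes. A self-contained sketch of that holder pattern (all names here are illustrative, not from the codebase):

import asyncio

class Holder:
    def __init__(self):
        self.total = 0

async def stream_numbers(holder):
    # Yield items to the consumer as they arrive; accumulate side data
    # in the holder for use after iteration ends.
    for i in range(3):
        yield i
        holder.total += i

async def main():
    holder = Holder()
    async for n in stream_numbers(holder):
        print("streamed:", n)
    print("available after the stream:", holder.total)

asyncio.run(main())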
@@ -189,8 +162,6 @@ async def chat_with_mcp_tools(
     """
     Chat with LLM with MCP tool support (async streaming version).
 
-    This version includes tool calling capabilities for game interactions.
-
     Args:
         user_message: User's message
         chat_history: Past chat history (list of dictionaries in Gradio format)
@@ -203,8 +174,8 @@
     try:
         encoding = _get_encoding()
 
-        # Build messages
-        messages = [
+        # Build messages
+        messages = [_build_system_message(session_id, personality)]
         messages.extend(_convert_history_to_messages(chat_history))
         messages.append(oh.Message.from_role_and_content(oh.Role.USER, user_message))
 
@@ -220,29 +191,17 @@
         for iteration in range(MAX_TOOL_CALL_ITERATIONS):
             logger.info(f"Tool calling iteration {iteration + 1}")
 
-            current_iteration_response = response_text
-            if parsed is not None:
-                parsed_messages = parsed
-
-            partial_history[-1]["content"] = (
-                accumulated_response + current_iteration_response
-            )
-            yield partial_history
-
-            if parsed_messages is None:
-                logger.warning("No parsed messages returned from LLM")
-                break
-
-            # Update accumulated response with the final text from this iteration
-            accumulated_response += current_iteration_response
+            result = StreamResult()
+            async for history in _stream_response(
+                messages, encoding, partial_history, accumulated_response, result
+            ):
+                yield history
+
+            # Update accumulated response
+            accumulated_response += result.response_text
 
             # Check for tool calls
-            tool_calls = extract_tool_calls(parsed_messages)
+            tool_calls = extract_tool_calls(result.parsed_messages)
 
             if not tool_calls:
                 logger.info("No tool calls found, ending loop")
@@ -251,14 +210,22 @@
             logger.info(f"Found {len(tool_calls)} tool call(s)")
 
             # Add parsed messages to conversation
-            messages.extend(parsed_messages)
+            messages.extend(result.parsed_messages)
+
+            # Indicate tool execution in UI
+            partial_history[-1]["content"] += "\n\n*(Executing orders...)*"
+            yield partial_history
 
             # Execute tools via MCP
             tool_result_messages = await execute_tool_calls(tool_calls)
 
             messages.extend(tool_result_messages)
 
+            # Remove status message
+            partial_history[-1]["content"] = accumulated_response
+            yield partial_history
+
+            # Ensure final response is yielded
             yield partial_history
 
         except Exception:
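Taken together, chat_with_mcp_tools remains an async generator that yields progressively updated chat histories, which Gradio can consume directly as a streaming handler. A hedged sketch of that wiring (the handler name and argument order are assumptions; only chat_with_mcp_tools appears in this commit):

async def respond(user_message, chat_history, session_id, personality):
    # Each yielded history re-renders the chat, including the transient
    # "(Executing orders...)" status added around tool execution.
    async for history in chat_with_mcp_tools(
        user_message, chat_history, session_id, personality
    ):
        yield history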