Spaces:

rubenroy
/

Zurich-14B

Running

App Files Community

rubenroy committed on Jan 31

Commit

4706d9e

verified ·

1 Parent(s): 7786282

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -58

app.py CHANGED Viewed

@@ -12,10 +12,10 @@ model = AutoModelForCausalLM.from_pretrained(
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 @spaces.GPU
-def generate(prompt, history, temperature, top_p, top_k, max_new_tokens, repetition_penalty):
     messages = [
         {"role": "system", "content": "You are Zurich, a 7 billion parameter Large Language model built on the Qwen 2.5 7B model developed by Alibaba Cloud, and fine-tuned by Ruben Roy. You have been fine-tuned with the GammaCorpus v2 dataset, a dataset filled with structured and filtered multi-turn conversations and was also created by Ruben Roy. You are a helpful assistant."},
-        {"role": "user", "content": prompt}
     ]
     text = tokenizer.apply_chat_template(
         messages,
@@ -25,12 +25,12 @@ def generate(prompt, history, temperature, top_p, top_k, max_new_tokens, repetit
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
     generated_ids = model.generate(
         **model_inputs,
-        temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        max_new_tokens=max_new_tokens,
-        repetition_penalty=repetition_penalty,
-        do_sample=True if temperature > 0 else False
     )
     generated_ids = [
         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
@@ -162,63 +162,71 @@ examples = [
     ["What are the key differences between machine learning and deep learning?"]
 ]
-def create_generation_settings():
-    with gr.Group():
-        with gr.Accordion("Generation Settings", open=False):
-            temperature = gr.Slider(
-                minimum=0.0,
-                maximum=2.0,
-                value=0.7,
-                step=0.1,
-                label="Temperature",
-                info="Higher values make the output more random, lower values make it more focused and deterministic"
-            )
-            top_p = gr.Slider(
-                minimum=0.0,
-                maximum=1.0,
-                value=0.9,
-                step=0.05,
-                label="Top P",
-                info="Used for nucleus sampling - controls the cumulative probability of tokens to consider"
-            )
-            top_k = gr.Slider(
-                minimum=1,
-                maximum=100,
-                value=50,
-                step=1,
-                label="Top K",
-                info="Limits the number of tokens to consider for each step of text generation"
-            )
-            max_new_tokens = gr.Slider(
-                minimum=1,
-                maximum=2048,
-                value=512,
-                step=1,
-                label="Max New Tokens",
-                info="Maximum number of tokens to generate in the response"
-            )
-            repetition_penalty = gr.Slider(
-                minimum=1.0,
-                maximum=2.0,
-                value=1.1,
-                step=0.1,
-                label="Repetition Penalty",
-                info="Higher values prevent the model from repeating the same information"
-            )
-    return temperature, top_p, top_k, max_new_tokens, repetition_penalty
 with gr.Blocks() as demo:
     gr.HTML(TITLE_HTML)
-    # Create generation settings
-    temperature, top_p, top_k, max_new_tokens, repetition_penalty = create_generation_settings()
-    # Create the chat interface with the additional parameters
     chatbot = gr.ChatInterface(
-        fn=lambda msg, history: generate(msg, history, temperature.value, top_p.value, top_k.value, max_new_tokens.value, repetition_penalty.value),
         examples=examples,
         title="Chat with Zurich",
-        description="Ask me anything! I'm here to help with explanations, coding, math, writing, and more.",
     )
 demo.launch(share=True)

 tokenizer = AutoTokenizer.from_pretrained(model_name)
 @spaces.GPU
+def generate(message, chat_history, temperature=0.7, top_p=0.9, top_k=50, max_new_tokens=512, repetition_penalty=1.1):
     messages = [
         {"role": "system", "content": "You are Zurich, a 7 billion parameter Large Language model built on the Qwen 2.5 7B model developed by Alibaba Cloud, and fine-tuned by Ruben Roy. You have been fine-tuned with the GammaCorpus v2 dataset, a dataset filled with structured and filtered multi-turn conversations and was also created by Ruben Roy. You are a helpful assistant."},
+        {"role": "user", "content": message}
     ]
     text = tokenizer.apply_chat_template(
         messages,
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
     generated_ids = model.generate(
         **model_inputs,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        top_k=int(top_k),
+        max_new_tokens=int(max_new_tokens),
+        repetition_penalty=float(repetition_penalty),
+        do_sample=True if float(temperature) > 0 else False
     )
     generated_ids = [
         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
     ["What are the key differences between machine learning and deep learning?"]
 ]
 with gr.Blocks() as demo:
     gr.HTML(TITLE_HTML)
+    with gr.Accordion("Generation Settings", open=False):
+        with gr.Row():
+            with gr.Column():
+                temperature = gr.Slider(
+                    minimum=0.0,
+                    maximum=2.0,
+                    value=0.7,
+                    step=0.1,
+                    label="Temperature",
+                    info="Higher values make the output more random, lower values make it more focused and deterministic",
+                    interactive=True
+                )
+                top_p = gr.Slider(
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.9,
+                    step=0.05,
+                    label="Top P",
+                    info="Controls the cumulative probability threshold for nucleus sampling",
+                    interactive=True
+                )
+                top_k = gr.Slider(
+                    minimum=1,
+                    maximum=100,
+                    value=50,
+                    step=1,
+                    label="Top K",
+                    info="Limits the number of tokens to consider for each generation step",
+                    interactive=True
+                )
+            with gr.Column():
+                max_new_tokens = gr.Slider(
+                    minimum=1,
+                    maximum=2048,
+                    value=512,
+                    step=1,
+                    label="Max New Tokens",
+                    info="Maximum number of tokens to generate in the response",
+                    interactive=True
+                )
+                repetition_penalty = gr.Slider(
+                    minimum=1.0,
+                    maximum=2.0,
+                    value=1.1,
+                    step=0.1,
+                    label="Repetition Penalty",
+                    info="Higher values prevent the model from repeating the same information",
+                    interactive=True
+                )
     chatbot = gr.ChatInterface(
+        fn=generate,
+        additional_inputs=[
+            temperature,
+            top_p,
+            top_k,
+            max_new_tokens,
+            repetition_penalty
+        ],
         examples=examples,
         title="Chat with Zurich",
+        description="Ask me anything! I'm here to help with explanations, coding, math, writing, and more."
     )
 demo.launch(share=True)