# app.py
from threading import Thread

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# load model and tokenizer
model_name = "inclusionAI/Ling-lite-1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
).eval()


# define chat function
def chat(user_input, max_new_tokens=2048):
    # single-turn chat history: a fixed system prompt plus the user message
    messages = [
        {"role": "system", "content": "You are Ling, an assistant created by inclusionAI"},
        {"role": "user", "content": user_input}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # encode the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # create a streamer that yields decoded text chunks as they are generated
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

    # run generation in a background thread so this function can consume the stream
    def generate():
        model.generate(**inputs, max_new_tokens=max_new_tokens, streamer=streamer)

    thread = Thread(target=generate)
    thread.start()

    # the streamer echoes the prompt (with special tokens stripped), leaving the
    # chat template's role markers SYSTEM/HUMAN/ASSISTANT as plain text; skip past
    # them and the prompt text so only the model's reply is yielded
    start_idx = len("SYSTEM") + len(messages[0]["content"]) + len("HUMAN") + len(user_input) + len("ASSISTANT")
    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        yield generated_text[start_idx:]
    thread.join()


# build a custom layout using Blocks
with gr.Blocks(css="""
#markdown-output {
    height: 300px;
    overflow-y: auto;
    border: 1px solid #ddd;
    padding: 10px;
}
""") as demo:
    gr.Markdown(
        "## Ling-lite-1.5 AI Assistant\n"
        "Based on [inclusionAI/Ling-lite-1.5](https://huggingface.co/inclusionAI/Ling-lite-1.5)"
    )
    with gr.Row():
        max_tokens_slider = gr.Slider(minimum=128, maximum=2048, step=16, label="Generated length")

    output_box = gr.Markdown(label="Response", elem_id="markdown-output")
    input_box = gr.Textbox(lines=8, label="Input your question")

    examples = gr.Examples(
        examples=[
            ["Introduce the basic concepts of large language models"],
            ["How to solve long context dependencies in math problems?"]
        ],
        inputs=input_box
    )

    interface = gr.Interface(
        fn=chat,
        inputs=[input_box, max_tokens_slider],
        outputs=output_box,
        live=False  # disable auto-triggering on input change
    )

# launch the Gradio service
demo.queue()
demo.launch()
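
# ---------------------------------------------------------------------------
# Optional alternative (a sketch, not part of the demo above): instead of
# slicing the echoed prompt off with `start_idx`, `TextIteratorStreamer` can
# be constructed with `skip_prompt=True`, which omits the input prompt from
# the stream entirely. A minimal variant of `chat()` reusing the module-level
# `tokenizer` and `model`, with a hypothetical name `chat_skip_prompt`:
#
# def chat_skip_prompt(user_input, max_new_tokens=2048):
#     messages = [
#         {"role": "system", "content": "You are Ling, an assistant created by inclusionAI"},
#         {"role": "user", "content": user_input},
#     ]
#     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     # skip_prompt=True drops the echoed prompt, so no manual slicing is needed
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     Thread(target=model.generate,
#            kwargs=dict(**inputs, max_new_tokens=max_new_tokens, streamer=streamer)).start()
#     generated_text = ""
#     for new_text in streamer:
#         generated_text += new_text
#         yield generated_text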