Spaces:

sagar007
/

lama_storm_8b

Sleeping

App Files Files Community

lama_storm_8b / app.py

sagar007

Update app.py

cc1b568 verified about 1 year ago

raw

history blame

1.74 kB

	import os
	import subprocess

	import gradio as gr
	import spaces
	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM

	# Install flash-attn at startup, before the model is loaded.
	# `env=` replaces the child's whole environment instead of extending it, so
	# the current environment is merged in; otherwise PATH, HOME and any
	# proxy/virtualenv settings would be missing when the shell looks up `pip`.
	# A failed install is still not fatal here (the return code is not checked).
	subprocess.run(
	    'pip install flash-attn --no-build-isolation',
	    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
	    shell=True,
	)

	# Load the tokenizer and model once at module import; generate_text() reuses
	# these module-level objects on every call.
	model_name = "akjindal53244/Llama-3.1-Storm-8B"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForCausalLM.from_pretrained(
	    model_name,
	    torch_dtype=torch.bfloat16,
	    # `use_flash_attention_2=True` is the deprecated spelling in transformers;
	    # `attn_implementation` is the supported way to request FlashAttention-2.
	    attn_implementation="flash_attention_2",
	    device_map="auto",
	)

	@spaces.GPU(duration=120)
	def generate_text(prompt: str, max_length: int, temperature: float) -> str:
	    """Generate an assistant reply to *prompt* with the module-level model.

	    The prompt is wrapped in a two-message chat (fixed system message plus the
	    user's text), rendered with the tokenizer's chat template, and sampled
	    with top-k / top-p sampling.

	    Args:
	        prompt: User text placed in the "user" message.
	        max_length: Upper bound on newly generated tokens (passed to
	            ``max_new_tokens``; it does not include the prompt).
	        temperature: Sampling temperature forwarded to ``model.generate``.

	    Returns:
	        The decoded continuation only: prompt tokens are sliced off and
	        special tokens are skipped.
	    """
	    messages = [
	        {"role": "system", "content": "You are a helpful assistant."},
	        {"role": "user", "content": prompt},
	    ]
	    formatted_prompt = tokenizer.apply_chat_template(
	        messages, add_generation_prompt=True, tokenize=False
	    )

	    # The rendered template already carries the chat special tokens, so the
	    # tokenizer must not add its own again (this would duplicate the BOS token).
	    inputs = tokenizer(
	        formatted_prompt,
	        return_tensors="pt",
	        add_special_tokens=False,
	    ).to("cuda")

	    outputs = model.generate(
	        **inputs,
	        max_new_tokens=max_length,
	        do_sample=True,
	        temperature=temperature,
	        top_k=100,
	        top_p=0.95,
	    )

	    # generate() returns prompt + continuation; drop the prompt part.
	    prompt_token_count = inputs['input_ids'].shape[1]
	    return tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

	def _build_interface():
	    """Assemble the Gradio UI that drives generate_text.

	    Inputs, in the order generate_text receives them: a multi-line prompt
	    box, a "Max Length" slider (1-500, default 128) and a "Temperature"
	    slider (0.1-2.0, default 0.7). The output is a single text box.
	    """
	    prompt_box = gr.Textbox(lines=5, label="Prompt")
	    length_slider = gr.Slider(
	        minimum=1, maximum=500, value=128, step=1, label="Max Length"
	    )
	    temperature_slider = gr.Slider(
	        minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"
	    )
	    result_box = gr.Textbox(lines=10, label="Generated Text")

	    return gr.Interface(
	        fn=generate_text,
	        inputs=[prompt_box, length_slider, temperature_slider],
	        outputs=result_box,
	        title="Llama-3.1-Storm-8B Text Generation",
	        description="Enter a prompt to generate text using the Llama-3.1-Storm-8B model.",
	    )


	# Build the UI and start serving it.
	iface = _build_interface()

	iface.launch()