# llama158_chatbot.py
# 🧪 INSTALLATION (run these separately in a terminal before launching)
# pip install torch --index-url https://download.pytorch.org/whl/cu121
# pip install git+https://github.com/huggingface/transformers.git@refs/pull/33410/head
# pip install gradio
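# Optional sanity check (a hedged suggestion, not part of the original script):
# the 1.58-bit kernels for this checkpoint live in the transformers PR installed
# above, so it can be worth confirming the dev build is the one that loaded:
#
#   import transformers
#   print(transformers.__version__)  # should report the dev build from the PR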
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
# 🧠 Load tokenizer and model
model_id = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
tokenizer_id = "meta-llama/Meta-Llama-3-8B-Instruct"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
print("Loading 1.58-bit model...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,  # requires a GPU with BF16 support (e.g. A100 / RTX 4090)
)
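# Optional sanity check (not in the original script): with device_map="auto",
# accelerate decides where each layer lives. Both attributes below are standard
# transformers APIs and help confirm the model actually fits on the GPU.
print(f"Device map: {model.hf_device_map}")
print(f"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")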
# 🗣️ Chat function
def chat(user_input, history):
    # Assemble the prompt from prior turns
    full_input = ""
    for turn in history:
        full_input += f"User: {turn[0]}\nAssistant: {turn[1]}\n"
    full_input += f"User: {user_input}\nAssistant:"

    # Tokenize, truncating long conversations to stay within the context window.
    # With device_map="auto" the model is already dispatched by accelerate, so we
    # move the inputs to model.device instead of calling model.to(), which would
    # raise a RuntimeError on a dispatched model.
    input_ids = tokenizer.encode(
        full_input, return_tensors="pt", truncation=True, max_length=4000
    ).to(model.device)

    try:
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=100,
                do_sample=True,
                temperature=0.7,
                pad_token_id=tokenizer.eos_token_id,  # Llama 3 defines no pad token
            )
        response = tokenizer.decode(output[0], skip_special_tokens=True)
        # The decoded text contains the whole prompt; the reply is whatever
        # follows the final "Assistant:" marker.
        reply = response.split("Assistant:")[-1].strip()
    except Exception as e:
        reply = f"⚠️ Error: {e}"

    history.append((user_input, reply))
    return reply, history
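# A hedged alternative (not in the original script): the Meta-Llama-3-8B-Instruct
# tokenizer ships a chat template, so the manual "User:/Assistant:" prompt above
# could instead use tokenizer.apply_chat_template, which emits the special tokens
# the instruct model was trained on. Sketch only, left commented out:
#
#   messages = [{"role": "user", "content": user_input}]
#   input_ids = tokenizer.apply_chat_template(
#       messages, add_generation_prompt=True, return_tensors="pt"
#   ).to(model.device)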
# Launch the Gradio chat interface
with gr.Blocks(title="🦙 Llama3-8B-1.58 Chatbot") as demo:
    gr.Markdown("## 🦙 Llama3-8B-1.58 Chatbot\nChat with a super-efficient 1.58-bit model!")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your message", placeholder="Ask me anything...")
    clear = gr.Button("Clear")
    state = gr.State([])
    def respond(user_message, history):
        _reply, new_history = chat(user_message, history)
        # Clear the textbox and refresh both the chat display and the state
        return "", new_history, new_history

    msg.submit(respond, [msg, state], [msg, chatbot, state])
    clear.click(lambda: ([], []), None, [chatbot, state])
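# Optional tweak (hedged, not in the original script): demo.queue() serializes
# incoming requests so concurrent users don't contend for the single GPU.
# demo.queue()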
demo.launch(share=True, debug=True)