# llama158_chatbot.py

# 🧪 INSTALLATION (run this separately in a terminal before launching)
# pip install torch --index-url https://download.pytorch.org/whl/cu121
# pip install git+https://github.com/huggingface/transformers.git@refs/pull/33410/head
# pip install gradio

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr

# 🧠 Load tokenizer and model
model_id = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
tokenizer_id = "meta-llama/Meta-Llama-3-8B-Instruct"

print("🔄 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

print("🧠 Loading 1.58-bit model...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # accelerate handles placement; do NOT call model.to() afterwards
    torch_dtype=torch.bfloat16  # ensure the GPU supports BF16 (e.g. A100/4090)
)

# 🗣️ Chat function
def chat(user_input, history):
    # Assemble the prompt from previous (user, assistant) turns
    full_input = ""
    for turn in history:
        full_input += f"User: {turn[0]}\nAssistant: {turn[1]}\n"
    full_input += f"User: {user_input}\nAssistant:"

    # Tokenize, truncating long histories, and move inputs to the model's device
    input_ids = tokenizer.encode(
        full_input, return_tensors="pt", truncation=True, max_length=4000
    ).to(model.device)

    try:
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=100,
                do_sample=True,
                temperature=0.7
            )
        response = tokenizer.decode(output[0], skip_special_tokens=True)
        # Keep only the text after the final "Assistant:" marker
        reply = response.split("Assistant:")[-1].strip()
    except Exception as e:
        reply = f"⚠️ Error: {str(e)}"

    history.append((user_input, reply))
    return reply, history

# 🧙🏾‍♂️ Launch Gradio chat interface
with gr.Blocks(title="🦙 Llama3-8B-1.58 Chatbot") as demo:
    gr.Markdown("## 🦙 Llama3-8B-1.58 Chatbot\nChat with a super-efficient 1.58-bit model!")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(label="Your message", placeholder="Ask me anything...")
    clear = gr.Button("Clear")
    state = gr.State([])

    def respond(user_message, history):
        _reply, new_history = chat(user_message, history)
        return new_history, new_history

    msg.submit(respond, [msg, state], [chatbot, state])
    clear.click(lambda: ([], []), None, [chatbot, state])

demo.launch(share=True, debug=True)