Spaces:

jawakja
/

Mulktimodal_chatbot

Sleeping

App Files Files Community

jawakja committed on May 20

Commit

99f77c1

verified ·

1 Parent(s): 83b24f2

Update app.py

Browse files

Files changed (1) hide show

app.py +178 -91

app.py CHANGED Viewed

@@ -5,21 +5,50 @@ import cv2
 import os
 import tempfile
 import shutil
-from transformers import AutoTokenizer, AutoModelForCausalLM
 from sentence_transformers import SentenceTransformer
 import faiss
-# Load Qwen-VL-Chat
-tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    "Qwen/Qwen-VL-Chat",
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-    trust_remote_code=True
-).eval()
-# Embedding model
-embed_model = SentenceTransformer('all-MiniLM-L6-v2')
 # Global state for FAISS
 chunks = []
@@ -27,98 +56,153 @@ index = None
 # PDF processing
 def extract_chunks_from_pdf(pdf_path, chunk_size=1000, overlap=200):
-    doc = fitz.open(pdf_path)
-    text = ""
-    for page in doc:
-        text += page.get_text()
-    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - overlap)]
 def build_faiss_index(chunks):
-    embeddings = embed_model.encode(chunks, convert_to_numpy=True)
-    dim = embeddings.shape[1]
-    idx = faiss.IndexFlatL2(dim)
-    idx.add(embeddings)
-    return idx
 def rag_query(query, chunks, index, top_k=3):
-    q_emb = embed_model.encode([query], convert_to_numpy=True)
-    D, I = index.search(q_emb, top_k)
-    return "\n\n".join([chunks[i] for i in I[0]])
-# Vision/Text chat
 def chat_with_qwen(text=None, image=None):
-    elements = []
-    if image:
-        elements.append({"image": image})
-    if text:
-        elements.append({"text": text})
-    if not elements:
-        return "Please upload or type something."
-    query = tokenizer.from_list_format(elements)
-    response, _ = model.chat(tokenizer, query, history=None)
-    return response
-# Video frame extraction
-def extract_video_frames(video_path, max_frames=3):
-    cap = cv2.VideoCapture(video_path)
-    frames, count = [], 0
-    while len(frames) < max_frames:
-        success, frame = cap.read()
-        if not success:
-            break
-        frames.append(frame)
-        count += 1
-        cap.set(cv2.CAP_PROP_POS_FRAMES, count * 30)
-    cap.release()
-    return frames
-# Main chatbot logic
 def multimodal_chat(message, history, image=None, video=None, pdf=None):
     global chunks, index
-    # PDF-based RAG
-    if pdf:
-        chunks = extract_chunks_from_pdf(pdf.name)
-        index = build_faiss_index(chunks)
-        context = rag_query(message, chunks, index)
-        final_prompt = f"Context:\n{context}\n\nQuestion: {message}"
-        response = chat_with_qwen(final_prompt)
-        return response
-    # Image
-    if image:
-        response = chat_with_qwen(message, image)
-        return response
-    # Video (extract frames and send all in one call)
-    if video:
-        temp_dir = tempfile.mkdtemp()
-        video_path = os.path.join(temp_dir, "vid.mp4")
-        shutil.copy(video, video_path)
-        frames = extract_video_frames(video_path)
-        # Save and collect image paths
-        images = []
-        for i, frame in enumerate(frames):
-            temp_img_path = os.path.join(temp_dir, f"frame_{i}.jpg")
-            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            cv2.imwrite(temp_img_path, frame_rgb)
-            images.append(temp_img_path)
-        # Combine all frames and text into one query
-        elements = [{"image": img} for img in images]
-        if message:
-            elements.append({"text": message})
-        query = tokenizer.from_list_format(elements)
-        response, _ = model.chat(tokenizer, query, history=None)
-        return response
-    # Text only
-    if message:
-        return chat_with_qwen(message)
-    return "Please input a message, image, video, or PDF."
 # ---- Gradio UI ---- #
 with gr.Blocks(css="""
@@ -148,7 +232,7 @@ padding: 16px;
 footer {display: none !important;}
 """) as demo:
     gr.Markdown(
-        "<h1 style='text-align: center;'>Multimodal Chatbot powered by LLAVACMVRL and QWEN-VL</h1>"
         "<p style='text-align: center;'>Ask questions with text, images, videos, or PDFs in a smart and multimodal way.</p>"
     )
@@ -165,6 +249,8 @@ footer {display: none !important;}
         pdf_input = gr.File(file_types=[".pdf"], label="Upload PDF")
     def user_send(message, history, image, video, pdf):
         response = multimodal_chat(message, history, image, video, pdf)
         history.append((message, response))
         return "", history
@@ -172,5 +258,6 @@ footer {display: none !important;}
     send_btn.click(user_send, [txt, state, image_input, video_input, pdf_input], [txt, chatbot])
     txt.submit(user_send, [txt, state, image_input, video_input, pdf_input], [txt, chatbot])
-# Launch the app
 demo.launch()

 import os
 import tempfile
 import shutil
+import logging
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from sentence_transformers import SentenceTransformer
 import faiss
+# Set up logging: INFO-level basicConfig plus a module-level logger used by every function below
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Log whether CUDA is available and, if so, the name and total memory of device 0
+logger.info(f"CUDA available: {torch.cuda.is_available()}")
+if torch.cuda.is_available():
+    logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
+    logger.info(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")
+# Configure 4-bit NF4 quantization with float16 compute for lower memory usage
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+)
+try:
+    # Load Qwen-2.5-Omni-3B quantized; any failure is logged and leaves model/tokenizer as None. NOTE(review): confirm AutoModelForCausalLM is the correct loader class for this checkpoint
+    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Omni-3B", trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        "Qwen/Qwen2.5-Omni-3B",
+        device_map="auto",
+        quantization_config=bnb_config,
+        trust_remote_code=True
+    ).eval()
+    logger.info("Model loaded successfully")
+except Exception as e:
+    logger.error(f"Error loading model: {e}")
+    model = None
+    tokenizer = None
+# Use a smaller embedding model; a load failure is logged and leaves embed_model as None
+try:
+    embed_model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
+    logger.info("Embedding model loaded successfully")
+except Exception as e:
+    logger.error(f"Error loading embedding model: {e}")
+    embed_model = None
 # Global state for FAISS
 chunks = []
 # PDF processing
 def extract_chunks_from_pdf(pdf_path, chunk_size=1000, overlap=200):
+    """Extract the text of a PDF and split it into overlapping chunks.
+
+    Args:
+        pdf_path: Path of the PDF file, passed to ``fitz.open``.
+        chunk_size: Maximum number of characters per chunk.
+        overlap: Number of characters shared by consecutive chunks; the
+            window advances by ``chunk_size - overlap`` characters.
+
+    Returns:
+        A list of text chunks. On any failure the error is logged and a
+        single placeholder chunk is returned instead of raising.
+    """
+    try:
+        # The context manager closes the document even when a page fails
+        # to parse; previously the handle was never closed.
+        with fitz.open(pdf_path) as doc:
+            # join() avoids re-copying the accumulated text on every page.
+            text = "".join(page.get_text() for page in doc)
+        step = chunk_size - overlap
+        return [text[i:i + chunk_size] for i in range(0, len(text), step)]
+    except Exception as e:
+        logger.error(f"PDF extraction error: {e}")
+        return ["Error extracting PDF content"]
 def build_faiss_index(chunks):
+    """Embed the chunks and store them in an exact L2 FAISS index.
+
+    Args:
+        chunks: List of text chunks to embed with ``embed_model``.
+
+    Returns:
+        A ``faiss.IndexFlatL2`` holding one vector per chunk, or ``None``
+        when the embedding model is unavailable, there is nothing to
+        index, or embedding/indexing fails (the failure is logged).
+    """
+    # Guard explicitly instead of relying on an IndexError from
+    # ``embeddings.shape[1]`` when there are no chunks to embed.
+    if embed_model is None or not chunks:
+        return None
+    try:
+        embeddings = embed_model.encode(chunks, convert_to_numpy=True)
+        idx = faiss.IndexFlatL2(embeddings.shape[1])
+        idx.add(embeddings)
+        return idx
+    except Exception as e:
+        logger.error(f"FAISS index error: {e}")
+        return None
 def rag_query(query, chunks, index, top_k=3):
+    """Return the chunks most similar to the query, joined by blank lines.
+
+    Args:
+        query: The user's question; embedded with ``embed_model``.
+        chunks: The text chunks that were added to ``index``, in order.
+        index: FAISS index built from ``chunks``, or ``None``.
+        top_k: Maximum number of chunks to retrieve.
+
+    Returns:
+        The retrieved chunks separated by blank lines, or a short status
+        string when the index/embedding model is missing or the search
+        fails (the failure is logged).
+    """
+    if index is None or embed_model is None:
+        return "Embedding model not available"
+    try:
+        # Never ask for more neighbours than there are chunks.
+        k = min(top_k, len(chunks))
+        if k <= 0:
+            return ""
+        q_emb = embed_model.encode([query], convert_to_numpy=True)
+        _distances, neighbours = index.search(q_emb, k)
+        # FAISS pads missing results with -1; without this filter
+        # ``chunks[-1]`` would silently repeat the last chunk.
+        hits = [chunks[i] for i in neighbours[0] if 0 <= i < len(chunks)]
+        return "\n\n".join(hits)
+    except Exception as e:
+        logger.error(f"RAG query error: {e}")
+        return "Error retrieving context"
+# Vision/Text chat with Qwen-2.5-Omni
 def chat_with_qwen(text=None, image=None):
+    """Send a single-turn text or image+text request to the loaded model.
+
+    Args:
+        text: Prompt text. Optional when an image is supplied, in which
+            case a default "describe this image" prompt is used.
+        image: Image reference forwarded to the model under the
+            ``"image"`` key; callers in this file pass a file path.
+
+    Returns:
+        The model's response, or a human-readable message when the model
+        is unavailable, no input was given, or generation fails (the
+        failure is logged).
+    """
+    if model is None or tokenizer is None:
+        return "Model failed to load due to resource constraints. Try a smaller model or upgrade your space."
+    # Restore the empty-input guard of the previous implementation so the
+    # model is never called with ``content: None``.
+    if not text and not image:
+        return "Please upload or type something."
+    try:
+        if image:
+            content = [
+                {"image": image},
+                {"text": text if text else "Please describe this image."},
+            ]
+        else:
+            # Text-only query
+            content = text
+        messages = [{"role": "user", "content": content}]
+        # NOTE(review): assumes the remote-code model exposes
+        # ``chat(tokenizer, messages)`` -- confirm against the model card.
+        response = model.chat(tokenizer, messages)
+        return response
+    except Exception as e:
+        logger.error(f"Chat error: {e}")
+        return f"Error generating response: {str(e)}"
+# Video frame extraction - more memory efficient
+def extract_video_frames(video_path, max_frames=2):
+    try:
+        cap = cv2.VideoCapture(video_path)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        frames = []
+        # Take fewer, evenly distributed frames
+        if total_frames > 0:
+            frame_indices = [int(i * total_frames / max_frames) for i in range(max_frames)]
+            for idx in frame_indices:
+                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+                success, frame = cap.read()
+                if success:
+                    frames.append(frame)
+        cap.release()
+        return frames
+    except Exception as e:
+        logger.error(f"Video frame extraction error: {e}")
+        return []
+# Main chatbot logic with error handling
 def multimodal_chat(message, history, image=None, video=None, pdf=None):
+    """Route one chat request to the PDF, image, video or text-only path.
+
+    Inputs are checked in the order PDF, image, video, text; the first
+    one present decides how the request is answered.
+
+    Args:
+        message: The user's text (may be empty).
+        history: Chat history; accepted for the UI callback signature but
+            not used here.
+        image: Image reference forwarded to ``chat_with_qwen``.
+        video: Path of an uploaded video file.
+        pdf: Uploaded PDF, either a path or an object with a ``name``
+            attribute holding the path.
+
+    Returns:
+        The response text, or a human-readable error message; exceptions
+        are logged and never propagate to the caller.
+    """
     global chunks, index
+    if model is None:
+        return "Model not loaded due to memory constraints. Try upgrading your Hugging Face space."
+    try:
+        # PDF-based RAG
+        if pdf:
+            # NOTE(review): the upload may arrive as a tempfile-like object
+            # or as a plain path depending on the Gradio version -- accept both.
+            pdf_path = getattr(pdf, "name", pdf)
+            chunks = extract_chunks_from_pdf(pdf_path)
+            index = build_faiss_index(chunks)
+            if index is None:
+                return "Could not process PDF due to resource constraints"
+            context = rag_query(message, chunks, index)
+            final_prompt = f"I'll provide some context, then ask a question. Context:\n{context}\n\nQuestion: {message}"
+            return chat_with_qwen(final_prompt)
+        # Image
+        if image:
+            return chat_with_qwen(message, image)
+        # Video (extract frames and describe them one by one)
+        if video:
+            temp_dir = tempfile.mkdtemp()
+            try:
+                video_path = os.path.join(temp_dir, "vid.mp4")
+                shutil.copy(video, video_path)
+                frames = extract_video_frames(video_path)
+                if not frames:
+                    return "Could not extract video frames"
+                frame_descriptions = []
+                for i, frame in enumerate(frames):
+                    temp_img_path = os.path.join(temp_dir, f"frame_{i}.jpg")
+                    # cv2.imwrite expects BGR channel order, which is what
+                    # VideoCapture already delivers; converting to RGB first
+                    # saved the frames with red and blue swapped.
+                    cv2.imwrite(temp_img_path, frame)
+                    frame_description = chat_with_qwen("Describe this video frame in detail.", temp_img_path)
+                    frame_descriptions.append(f"Frame {i+1}: {frame_description}")
+                # Combine frame descriptions and answer the user's question
+                combined_context = "\n\n".join(frame_descriptions)
+                # Built outside the f-string: the apostrophe in the default
+                # request made the original nested literal a syntax error.
+                request = message if message else "please describe what's happening in this video."
+                final_prompt = (
+                    "I analyzed some video frames and here's what I found:\n\n"
+                    f"{combined_context}\n\nBased on these video frames, {request}"
+                )
+                return chat_with_qwen(final_prompt)
+            finally:
+                # Cleanup temp files
+                shutil.rmtree(temp_dir, ignore_errors=True)
+        # Text only
+        if message:
+            return chat_with_qwen(message)
+        return "Please input a message, image, video, or PDF."
+    except Exception as e:
+        logger.error(f"General error in multimodal_chat: {e}")
+        return f"Error processing your request: {str(e)}. This may be due to memory constraints."
 # ---- Gradio UI ---- #
 with gr.Blocks(css="""
 footer {display: none !important;}
 """) as demo:
     gr.Markdown(
+        "<h1 style='text-align: center;'>Multimodal Chatbot powered by Qwen-2.5-Omni-3B</h1>"
         "<p style='text-align: center;'>Ask questions with text, images, videos, or PDFs in a smart and multimodal way.</p>"
     )
         pdf_input = gr.File(file_types=[".pdf"], label="Upload PDF")
     def user_send(message, history, image, video, pdf):
+        """Gradio callback: answer the submission and clear the textbox.
+
+        Returns an empty string (new textbox value) and the history list,
+        extended in place with the ``(message, response)`` pair unless
+        nothing at all was submitted.
+        """
+        nothing_submitted = not (message or image or video or pdf)
+        if nothing_submitted:
+            return "", history
+        reply = multimodal_chat(message, history, image, video, pdf)
+        history.append((message, reply))
         return "", history
     send_btn.click(user_send, [txt, state, image_input, video_input, pdf_input], [txt, chatbot])
     txt.submit(user_send, [txt, state, image_input, video_input, pdf_input], [txt, chatbot])
+# Launch the app with memory logging
+logger.info("Starting Gradio app")
 demo.launch()