luke9705 committed
Commit c1164ec · 1 Parent(s): b107f85

Refactor load_file function to return paths for images and audio; add caption_image tool for generating image descriptions using Gemma3

Files changed (1):
  1. app.py +40 -16
app.py CHANGED
@@ -24,12 +24,11 @@ def is_image_extension(filename: str) -> bool:
 def load_file(path: str) -> list | dict:
     """Based on the file extension, load the file into a suitable object."""
 
-    image = None
     text = None
     ext = Path(path).suffix.lower() # same as os.path.splitext(filename)[1].lower()
 
     if ext.endswith(".png") or ext.endswith(".jpg") or ext.endswith(".jpeg"):
-        image = Image.open(path).convert("RGB") # pillow object
+        return {"image path": path}
     elif ext.endswith(".xlsx") or ext.endswith(".xls"):
         text = pd.read_excel(path) # DataFrame
     elif ext.endswith(".csv"):
@@ -39,10 +38,7 @@ def load_file(path: str) -> list | dict:
         text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
     elif ext.endswith(".py") or ext.endswith(".txt"):
         with open(path, 'r') as f:
-            text = f.read() # plain text str
-
-    if image is not None:
-        return [image]
+            text = f.read() # plain text str
     elif ext.endswith(".mp3") or ext.endswith(".wav"):
         return {"audio path": path}
     else:
@@ -197,6 +193,38 @@ def generate_audio_from_sample(prompt: str, duration: int, sample_path: str = No
     sound = client(prompt, duration, sample_path)
 
     return gr.Audio(value=sound)
+
+@tool
+def caption_image(img_path: str, prompt: str) -> str:
+    """
+    Generate a caption for an image at the given path using Gemma3.
+    Args:
+        img_path: The file path to the image to be captioned.
+        prompt: A text prompt describing what you want the model to focus on or ask about the image.
+    Returns:
+        str: A description of the image.
+    """
+    client_2 = HfApiModel("google/gemma-3-27b-it",
+                          provider="nebius",
+                          api_key=os.getenv("NEBIUS_API_KEY"))
+
+    with open(img_path, "rb") as f:
+        encoded = base64.b64encode(f.read()).decode("utf-8")
+    data_uri = f"data:image/jpeg;base64,{encoded}"
+    messages = [{"role": "user", "content": [
+        {
+            "type": "text",
+            "text": prompt,
+        },
+        {
+            "type": "image_url",
+            "image_url": {
+                "url": data_uri
+            }
+        }
+    ]}]
+    resp = client_2(messages)
+    return resp.content
 
 
 ## agent definition
@@ -204,6 +232,7 @@ class Agent:
     def __init__(self, ):
         #client = HfApiModel("deepseek-ai/DeepSeek-R1-0528", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
         client = HfApiModel("Qwen/Qwen3-32B", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
+
         """client = OpenAIServerModel(
             model_id="claude-opus-4-20250514",
             api_base="https://api.anthropic.com/v1/",
@@ -216,6 +245,7 @@ class Agent:
                 generate_image,
                 generate_audio_from_sample,
                 generate_audio,
+                caption_image,
                 download_images,
                 transcribe_audio],
             additional_authorized_imports=["pandas", "PIL", "io"],
@@ -237,6 +267,7 @@ class Agent:
         answer = self.agent.run(message, images = images, additional_args={"files": files, "conversation_history": conversation_history})
         return answer
 
+
 ## gradio functions
 def respond(message: str, history : dict, web_search: bool = False):
 
@@ -251,14 +282,7 @@ def respond(message: str, history : dict, web_search: bool = False):
         message = agent(text, conversation_history=history)
     else:
         files = message.get("files", [])
-        print(f"files received: {files}")
-        if is_image_extension(files[0]) and not web_search:
-            image = load_file(files[0]) # assuming only one file is uploaded at a time (gradio default behavior)
-            message = agent(text + "\nADDITIONAL CONTRAINT: Don't use web search", images=image, conversation_history=history)
-        elif is_image_extension(files[0]) and web_search:
-            image = load_file(files[0])
-            message = agent(text, images=image, conversation_history=history)
-        elif not web_search:
+        if not web_search:
             file = load_file(files[0])
             message = agent(text + "\nADDITIONAL CONTRAINT: Don't use web search", files=file, conversation_history=history)
         else:
@@ -276,7 +300,6 @@ def initialize_agent():
     return agent
 
 ## gradio interface
-
 global agent
 agent = initialize_agent()
 demo = gr.ChatInterface(
@@ -289,13 +312,14 @@ demo = gr.ChatInterface(
     fill_width=True,
     save_history=True,
     autoscroll=True,
+    #css = css_snippet,
     additional_inputs=[
         gr.Checkbox(value=False, label="Web Search",
                     info="Enable web search to find information online. If disabled, the agent will only use the provided files and images.",
                     render=False),
     ],
     additional_inputs_accordion=gr.Accordion(label="Tools available: ", open=True, render=False)
-)
+).queue()
 
 
 if __name__ == "__main__":
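
For context on the load_file half of the commit: images are no longer decoded into PIL objects inside load_file; like audio, they now come back as a path wrapped in a small dict, and decoding is deferred to whichever tool actually needs the bytes. A minimal sketch of the resulting contract (describe_payload is a hypothetical helper written for illustration, not part of app.py):

def describe_payload(payload) -> str:
    # Hypothetical helper illustrating load_file's post-commit return shapes.
    if isinstance(payload, dict) and "image path" in payload:
        return f"image file at {payload['image path']}"  # ready for caption_image
    if isinstance(payload, dict) and "audio path" in payload:
        return f"audio file at {payload['audio path']}"  # ready for transcribe_audio
    return "inline textual payload (the xlsx/csv/pdf/py/txt branches)"

# e.g. describe_payload(load_file("photo.png")) -> "image file at photo.png"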
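And a usage sketch for the new caption_image tool, assuming NEBIUS_API_KEY is set in the Space's environment and that smolagents @tool objects remain directly callable (they forward to the wrapped function); "photo.jpg" is a placeholder path:

# Hypothetical direct call, outside the agent loop.
caption = caption_image(
    img_path="photo.jpg",  # placeholder; any local image the Space can read
    prompt="Describe the main subject and setting in two sentences.",
)
print(caption)  # Gemma3's description, i.e. resp.content from the tool

One detail worth noting: the tool labels every upload as image/jpeg in the data URI regardless of the actual extension. Most vision backends ignore the declared subtype and sniff the bytes, but strictly speaking PNGs are mislabeled; that simplification is left in place by this commit.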