luke9705 committed on
Commit 5090fe0 · 1 Parent(s): 5a4500e

lack image generation

Files changed (1)
  1. app.py +57 -19
app.py CHANGED
@@ -1,5 +1,6 @@
import gradio as gr
import os
+ import base64
import pandas as pd
from PIL import Image
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, VisitWebpageTool, OpenAIServerModel, tool
@@ -11,12 +12,12 @@ from pathlib import Path
import openai

## utilty functions
- def is_image_extension(filename: str) -> bool: # not used in the code, but useful to have
+ def is_image_extension(filename: str) -> bool:
    IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.svg'}
    ext = os.path.splitext(filename)[1].lower() # os.path.splitext(path) returns (root, ext)
    return ext in IMAGE_EXTS

- def load_file(path: list) -> dict:
+ def load_file(path: str) -> list | dict:
    """Based on the file extension, load the file into a suitable object."""

    image = None
@@ -24,7 +25,6 @@ def load_file(path: list) -> dict:
    csv = None
    text = None
    ext = Path(path).suffix.lower() # same as os.path.splitext(filename)[1].lower()
-     print(f"ext: {ext}")

    if ext.endswith(".png") or ext.endswith(".jpg") or ext.endswith(".jpeg"):
        image = Image.open(path).convert("RGB") # pillow object
@@ -35,11 +35,11 @@ def load_file(path: list) -> dict:
    elif ext.endswith(".py") or ext.endswith(".txt"):
        with open(path, 'r') as f:
            text = f.read() # plain text str
-     elif ext.endswith(".mp3") or ext.endswith(".wav"):
-         with open(path, 'wb') as f:
-             f.write("output.mp3") # binary data (leave it hardcoded for now)
-
-     return {"image" : image, "excel": excel, "csv": csv, "raw text": text}
+
+     if image is not None:
+         return [image]
+     else:
+         return {"excel": excel, "csv": csv, "raw text": text, "audio path": path}


## tools definition
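
With this change load_file returns a list of PIL images for image files and a dict for everything else, so callers can branch on the return type. A minimal sketch of that contract in use (file names are illustrative):

    result = load_file("storyboard.png")        # -> [<PIL.Image.Image>] for images
    if isinstance(result, list):
        answer = agent("describe this frame", images=result)
    else:                                        # e.g. "script.txt", "scenes.csv", "take1.mp3"
        answer = agent("summarize this file", files=result)  # {"excel": ..., "csv": ..., "raw text": ..., "audio path": ...}
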
@@ -69,15 +69,16 @@ def download_images(image_urls: str) -> list:
    return images

@tool # since they gave us OpenAI API credits, we can keep using it
- def transcribe_audio() -> str:
+ def transcribe_audio(audio_path: str) -> str:
    """
    Transcribe audio file using OpenAI Whisper API.
-     The path to the audio file is hardcoded as "output.mp3". Don't need to pass it as an argument.
+     Args:
+         audio_path (str): path to the audio file to be transcribed.
    Returns:
-         str: Transcription of the audio.
+         str : Transcription of the audio.
    """
    client = openai.Client(api_key=os.getenv("OPEN_AI_API_KEY"))
-     with open("output.mp3", "rb") as audio: # to modify path because it is arriving from gradio
+     with open(audio_path, "rb") as audio: # to modify path because it is arriving from gradio
        transcript = client.audio.transcriptions.create(
            file=audio,
            model="whisper-1",
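
For reference, a self-contained sketch of the Whisper call that transcribe_audio wraps, assuming OPEN_AI_API_KEY is set; the try/except mirrors the error handling visible in the next hunk:

    import os
    import openai

    def transcribe(audio_path: str) -> str | None:
        client = openai.Client(api_key=os.getenv("OPEN_AI_API_KEY"))
        try:
            with open(audio_path, "rb") as audio:
                transcript = client.audio.transcriptions.create(
                    file=audio,
                    model="whisper-1",
                )
            return transcript.text           # plain transcription text
        except Exception as e:
            print(f"Error transcribing audio: {e}")
            return None
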
@@ -89,6 +90,39 @@
    except Exception as e:
        print(f"Error transcribing audio: {e}")

+ @tool
+ def generate_image(prompt: str, neg_prompt: str) -> Image.Image:
+     """
+     Generate an image based on a text prompt using Flux Dev.
+     Args:
+         prompt (str): The text prompt to generate the image from.
+         neg_prompt (str): The negative prompt to avoid certain elements in the image.
+     Returns:
+         Image.Image: The generated image as a PIL Image object.
+     """
+     client = OpenAI(base_url="https://api.studio.nebius.com/v1",
+                     api_key=os.environ.get("NEBIUS_API_KEY"),
+                     )
+
+     completion = client.images.generate(
+         model="black-forest-labs/flux-dev",
+         prompt=prompt,
+         response_format="b64_json",
+         extra_body={
+             "response_extension": "png",
+             "width": 1024,
+             "height": 1024,
+             "num_inference_steps": 30,
+             "seed": -1,
+             "negative_prompt": neg_prompt,
+         }
+     )
+
+     image_data = base64.b64decode(completion.to_dict()['data'][0]['b64_json'])
+     image = Image.open(BytesIO(image_data))
+     return image
+
+

## agent definition
class Agent:
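
The new tool builds the request with the OpenAI client pointed at Nebius AI Studio and decodes the base64 payload with BytesIO; a self-contained sketch with those imports spelled out (assuming NEBIUS_API_KEY is set and black-forest-labs/flux-dev is enabled for the account; flux_image is a hypothetical wrapper name):

    import base64
    import os
    from io import BytesIO

    from openai import OpenAI
    from PIL import Image

    def flux_image(prompt: str, neg_prompt: str = "") -> Image.Image:
        client = OpenAI(base_url="https://api.studio.nebius.com/v1",
                        api_key=os.environ.get("NEBIUS_API_KEY"))
        result = client.images.generate(
            model="black-forest-labs/flux-dev",
            prompt=prompt,
            response_format="b64_json",
            extra_body={                     # provider-specific parameters passed through as-is
                "width": 1024,
                "height": 1024,
                "num_inference_steps": 30,
                "negative_prompt": neg_prompt,
            },
        )
        raw = base64.b64decode(result.data[0].b64_json)
        return Image.open(BytesIO(raw))

    # flux_image("rain-soaked neon alley, cinematic lighting").save("frame.png")
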
@@ -96,7 +130,7 @@ class Agent:
        client = HfApiModel("google/gemma-3-27b-it", provider="nebius", api_key=os.getenv("NEBIUS_API_KEY"))
        self.agent = CodeAgent(
            model=client,
-             tools=[DuckDuckGoSearchTool(max_results=5), VisitWebpageTool(max_output_length=20000), download_images, transcribe_audio],
+             tools=[DuckDuckGoSearchTool(max_results=5), VisitWebpageTool(max_output_length=20000), generate_image, download_images, transcribe_audio],
            additional_authorized_imports=["pandas", "PIL", "io"],
            planning_interval=1,
            max_steps=5,
@@ -105,21 +139,25 @@ class Agent:
        #print("System prompt:", self.agent.prompt_templates["system_prompt"])

    def __call__(self, message: str, images: Optional[list[Image.Image]] = None, files: Optional[str] = None) -> str:
-         answer = self.agent.run(message, additional_args={"images": images ,"files": files})
+         answer = self.agent.run(message, images = images, additional_args={"files": files})
        return answer

## gradio functions
def respond(message, history):

    text = message.get("text", "")
-     if not message.get("files"):
+     if not message.get("files"): # no files uploaded
        print("No files received.")
        message = agent(text)
    else:
        files = message.get("files", [])
        print(f"files received: {files}")
-         file = load_file(files[0])
-         message = agent(text, files=file)
+         if is_image_extension(files[0]):
+             image = load_file(files[0]) # assuming only one file is uploaded at a time (gradio default behavior)
+             message = agent(text, images=image)
+         else:
+             file = load_file(files[0])
+             message = agent(text, files=file)

    return message

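
For context, gr.ChatInterface(multimodal=True) hands respond() a dict of the typed text plus local paths of the uploaded files, which is what the new branch keys off; a sketch of the shape involved (paths are illustrative):

    message = {
        "text": "describe this storyboard frame",
        "files": ["/tmp/gradio/abc123/frame.png"],   # local copies of the uploads
    }
    # is_image_extension(files[0]) -> agent(text, images=load_file(path))
    # otherwise                    -> agent(text, files=load_file(path))
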
@@ -128,7 +166,7 @@ def initialize_agent():
    print("Agent initialized.")
    return agent

-
+ ## gradio interface
with gr.Blocks() as demo:
    global agent
    agent = initialize_agent()
@@ -136,7 +174,7 @@ with gr.Blocks() as demo:
        fn=respond,
        type='messages',
        multimodal=True,
-         title='MultiAgent_System_for_Screenplay_Creation_and_Editing',
+         title='MultiAgent System for Screenplay Creation and Editing',
        show_progress='full'
    )
 