MultiAgent_System_for_Screenplay_Creation

Running

App Files Community

DMMP committed on Jun 8

Commit

ef8b2df

verified ·

1 Parent(s): 0421ba3

updated loadfile

Browse files

match + docx + odt + dataframes

Files changed (1) hide show

app.py +30 -19

app.py CHANGED Viewed

@@ -22,28 +22,39 @@ def is_image_extension(filename: str) -> bool:
     ext = os.path.splitext(filename)[1].lower() # os.path.splitext(path) returns (root, ext)
     return ext in IMAGE_EXTS
-def load_file(path: str) -> list | dict:
     """Based on the file extension, load the file into a suitable object."""
     text = None
-    ext = Path(path).suffix.lower() # same as os.path.splitext(filename)[1].lower()
-    if ext.endswith(".png") or ext.endswith(".jpg") or ext.endswith(".jpeg"):
-        return {"image path": path}
-    elif ext.endswith(".xlsx") or ext.endswith(".xls"):
-        text = pd.read_excel(path)                        # DataFrame
-    elif ext.endswith(".csv"):
-        text = pd.read_csv(path)                          # DataFrame
-    elif ext.endswith(".pdf"):
-        with pdfplumber.open(path) as pdf:
-            text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
-    elif ext.endswith(".py") or ext.endswith(".txt"):
-        with open(path, 'r') as f:
-            text = f.read()                               # plain text str
-    elif ext.endswith(".mp3") or ext.endswith(".wav"):
-        return {"audio path": path}
-    else:
-        return {"raw document text": text, "file path": path}
 def check_format(answer: str | list, *args, **kwargs) -> list:
     """Check if the answer is a list and not a nested list."""

     ext = os.path.splitext(filename)[1].lower() # os.path.splitext(path) returns (root, ext)
     return ext in IMAGE_EXTS
+def load_file(path: str) -> dict:
     """Based on the file extension, load the file into a suitable object."""
     text = None
+    ext = Path(path).suffix.lower()  # same as os.path.splitext(filename)[1].lower()
+    match ext:
+        case '.jpg'| '.jpeg'| '.png'| '.gif'| '.bmp'| '.tiff'| '.webp'| '.svg':
+            return {"image path": path}
+        case '.docx':
+            text = docx2txt.process(path)
+        case ".xlsx" | ".xls" :
+            text = pd.read_excel(path)  # DataFrame
+            text = str(text).strip()
+        case '.odt':
+            text = load_odt(path)
+            text = str(text.body).strip()
+            pass
+        case ".csv":
+            text = pd.read_csv(path)  # DataFrame
+            text = str(text).strip()
+        case ".pdf":
+            with pdfplumber.open(path) as pdf:
+                text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
+        case '.py' | '.txt':
+            with open(path, 'r') as f:
+                text = f.read()  # plain text str
+        case '.mp3' | '.wav':
+            return {"audio path": path}
+        case _: # default case
+            text = None
+    return {"raw document text": text, "file path": path}
 def check_format(answer: str | list, *args, **kwargs) -> list:
     """Check if the answer is a list and not a nested list."""