DMMP committed on
Commit
ef8b2df
·
verified ·
1 Parent(s): 0421ba3

updated loadfile

Browse files

match + docx + odt + dataframes

Files changed (1) hide show
  1. app.py +30 -19
app.py CHANGED
@@ -22,28 +22,39 @@ def is_image_extension(filename: str) -> bool:
22
  ext = os.path.splitext(filename)[1].lower() # os.path.splitext(path) returns (root, ext)
23
  return ext in IMAGE_EXTS
24
 
25
- def load_file(path: str) -> list | dict:
26
  """Based on the file extension, load the file into a suitable object."""
27
-
28
  text = None
29
- ext = Path(path).suffix.lower() # same as os.path.splitext(filename)[1].lower()
30
 
31
- if ext.endswith(".png") or ext.endswith(".jpg") or ext.endswith(".jpeg"):
32
- return {"image path": path}
33
- elif ext.endswith(".xlsx") or ext.endswith(".xls"):
34
- text = pd.read_excel(path) # DataFrame
35
- elif ext.endswith(".csv"):
36
- text = pd.read_csv(path) # DataFrame
37
- elif ext.endswith(".pdf"):
38
- with pdfplumber.open(path) as pdf:
39
- text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
40
- elif ext.endswith(".py") or ext.endswith(".txt"):
41
- with open(path, 'r') as f:
42
- text = f.read() # plain text str
43
- elif ext.endswith(".mp3") or ext.endswith(".wav"):
44
- return {"audio path": path}
45
- else:
46
- return {"raw document text": text, "file path": path}
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  def check_format(answer: str | list, *args, **kwargs) -> list:
49
  """Check if the answer is a list and not a nested list."""
 
22
  ext = os.path.splitext(filename)[1].lower() # os.path.splitext(path) returns (root, ext)
23
  return ext in IMAGE_EXTS
24
 
25
+ def load_file(path: str) -> dict:
26
  """Based on the file extension, load the file into a suitable object."""
27
+
28
  text = None
29
+ ext = Path(path).suffix.lower() # same as os.path.splitext(filename)[1].lower()
30
 
31
+ match ext:
32
+ case '.jpg'| '.jpeg'| '.png'| '.gif'| '.bmp'| '.tiff'| '.webp'| '.svg':
33
+ return {"image path": path}
34
+ case '.docx':
35
+ text = docx2txt.process(path)
36
+ case ".xlsx" | ".xls" :
37
+ text = pd.read_excel(path) # DataFrame
38
+ text = str(text).strip()
39
+ case '.odt':
40
+ text = load_odt(path)
41
+ text = str(text.body).strip()
42
+ pass
43
+ case ".csv":
44
+ text = pd.read_csv(path) # DataFrame
45
+ text = str(text).strip()
46
+ case ".pdf":
47
+ with pdfplumber.open(path) as pdf:
48
+ text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
49
+ case '.py' | '.txt':
50
+ with open(path, 'r') as f:
51
+ text = f.read() # plain text str
52
+ case '.mp3' | '.wav':
53
+ return {"audio path": path}
54
+ case _: # default case
55
+ text = None
56
+
57
+ return {"raw document text": text, "file path": path}
58
 
59
  def check_format(answer: str | list, *args, **kwargs) -> list:
60
  """Check if the answer is a list and not a nested list."""