Spaces:

PD03
/

talk_to_data

Sleeping

App Files Community

PD03 committed 22 days ago

Commit

e9af0b4

verified ·

1 Parent(s): 1b29e74

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -34

app.py CHANGED Viewed

@@ -3,64 +3,73 @@
 import gradio as gr
 import pandas as pd
 import torch
-import duckdb
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-# 1) Load data and register it in DuckDB
 df = pd.read_csv('synthetic_profit.csv')
-conn = duckdb.connect(database=':memory:')
-conn.register('sap', df)
-# 2) Build a one-line schema description
-schema = ", ".join(df.columns)  # e.g. "Region, Product, FiscalYear, ..."
-# 3) Load TAPEX (WikiSQL) for SQL generation
 MODEL_ID = "microsoft/tapex-base-finetuned-wikisql"
 device   = 0 if torch.cuda.is_available() else -1
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
-sql_generator = pipeline(
-    "text2text-generation",
     model=model,
     tokenizer=tokenizer,
     framework="pt",
-    device=device,
-    # limit length so it doesn’t try to output the entire table!
-    max_length=128,
 )
-# 4) Your new QA function
 def answer_profitability(question: str) -> str:
-    # 4a) Prompt the model to generate SQL
-    prompt = (
-        f"Translate to SQL for table `sap` with columns ({schema}):\n"
-        f"Question: {question}\n"
-        "SQL:"
-    )
-    sql = sql_generator(prompt)[0]['generated_text'].strip()
-    # 4b) Execute the generated SQL and return results
     try:
-        result_df = conn.execute(sql).df()
-        # pretty-print as text
-        if result_df.empty:
-            return f"No rows returned. Generated SQL was:\n{sql}"
-        return result_df.to_string(index=False)
     except Exception as e:
-        # if something goes wrong, show you the SQL so you can debug
-        return f"Error executing SQL: {e}\n\nGenerated SQL:\n{sql}"
-# 5) Gradio interface
 iface = gr.Interface(
     fn=answer_profitability,
-    inputs=gr.Textbox(lines=2, placeholder="Ask about your SAP data…"),
-    outputs="textbox",
-    title="SAP Profitability Q&A (SQL-Generation)",
     description=(
-        "Uses TAPEX to translate your natural-language question "
-        "into a SQL query over the `sap` table, then runs it via DuckDB."
     )
 )

 import gradio as gr
 import pandas as pd
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+# 1) Load your synthetic profitability dataset
 df = pd.read_csv('synthetic_profit.csv')
+# 2) Ensure numeric types for Revenue, Profit, ProfitMargin
+for col in ["Revenue", "Profit", "ProfitMargin"]:
+    df[col] = pd.to_numeric(df[col], errors='coerce')
+# 3) Build the schema description
+schema_lines = [f"- {col}: {dtype.name}" for col, dtype in df.dtypes.items()]
+schema_text = "Table schema:\n" + "\n".join(schema_lines)
+# 4) Few-shot examples teaching SUM and AVERAGE
+few_shot = """
+Example 1
+Q: Total profit by region?
+A: Group “Profit” by “Region” and sum → EMEA: 30172183.37; APAC: 32301788.32; Latin America: 27585378.50; North America: 25473893.34
+Example 2
+Q: Average profit margin for Product B in Americas?
+A: Filter Product=B & Region=Americas, take mean of “ProfitMargin” → 0.18
+""".strip()
+# 5) Load TAPEX-WikiSQL for table-QA
 MODEL_ID = "microsoft/tapex-base-finetuned-wikisql"
 device   = 0 if torch.cuda.is_available() else -1
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
+table_qa = pipeline(
+    "table-question-answering",
     model=model,
     tokenizer=tokenizer,
     framework="pt",
+    device=device
 )
+# 6) QA function using schema-aware prompting
 def answer_profitability(question: str) -> str:
+    # Cast all values to strings so TAPEX can ingest them
+    table = df.astype(str).to_dict(orient="records")
+    # Assemble prompt with schema + examples + user question
+    prompt = f"""{schema_text}
+{few_shot}
+Q: {question}
+A:"""
     try:
+        out = table_qa(table=table, query=prompt)
+        return out.get("answer", "No answer found.")
     except Exception as e:
+        return f"Error: {e}"
+# 7) Gradio interface
 iface = gr.Interface(
     fn=answer_profitability,
+    inputs=gr.Textbox(lines=2, placeholder="Ask a question about profitability…"),
+    outputs="text",
+    title="SAP Profitability Q&A (Schema-Aware TAPEX)",
     description=(
+        "Every query is prefixed with your table’s schema and two few-shot examples, "
+        "so the model learns to SUM, AVERAGE, FILTER, etc., without hard-coded fallbacks."
     )
 )