Spaces:

oberbics
/

HistorySpace

Sleeping

App Files Files Community

oberbics committed on Apr 13

Commit

be096d1

verified ·

1 Parent(s): a89d538

Update app.py

Browse files

Files changed (1) hide show

app.py +150 -16

app.py CHANGED Viewed

@@ -1,27 +1,161 @@
-def extract_structure(template, text):
-    if not MODEL_LOADED:
-        return "❌ Model not loaded", {}, "<p style='color:red'>Model failed to initialize</p>"
-    # Using the correct format for NuExtract-1.5
-    prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"
     try:
-        print(f"Generating with prompt: {prompt[:100]}...")
-        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-        outputs = model.generate(**inputs, max_new_tokens=512)
         result = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        print(f"Raw result: {result[:100]}...")
-        # Extract result after the output marker
         if "<|output|>" in result:
             json_text = result.split("<|output|>")[1].strip()
         else:
-            json_text = result
-        # Try to parse as JSON
-        extracted = json.loads(json_text)
-        return "✅ Success", extracted, f"<pre>{json.dumps(extracted, indent=2)}</pre>"
     except Exception as e:
-        print(f"Error in extraction: {str(e)}")
-        return f"❌ Error: {str(e)}", {}, f"<p style='color:red'>{str(e)}</p>"

+import gradio as gr
+import torch
+import json
+import re
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from itertools import cycle
+from urllib.parse import unquote
+# Load the tokenizer and model once, when the module is imported
+model_name = "numind/NuExtract-1.5"
+try:
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        device_map="auto",
+        torch_dtype=torch.float16,
+        trust_remote_code=True,
+    )
+except Exception as e:
+    # Do not abort the import: extract_structure checks MODEL_LOADED and
+    # reports the failure to the user instead
+    MODEL_LOADED = False
+    print(f"Model loading failed: {e}")
+else:
+    # Both objects were created, so extraction may use them
+    MODEL_LOADED = True
+# Collect the non-blank string leaves of a parsed JSON value
+def extract_leaves(json_data):
+    """Return ``(path, value)`` pairs for the string leaves of ``json_data``.
+
+    ``path`` is the list of dict keys and list indices that leads from the
+    root to the leaf. Dicts and lists are walked depth-first in their own
+    iteration order. Leaves that are not strings, or that are empty or
+    whitespace-only, are skipped; a root that is neither a dict nor a list
+    yields no leaves.
+    """
+    def _walk(node, trail):
+        # Dicts and lists differ only in how (step, child) pairs are produced
+        if isinstance(node, dict):
+            children = node.items()
+        elif isinstance(node, list):
+            children = enumerate(node)
+        else:
+            return
+        for step, child in children:
+            here = trail + [step]
+            if isinstance(child, (dict, list)):
+                yield from _walk(child, here)
+            elif isinstance(child, str) and child.strip():
+                yield here, child
+    return list(_walk(json_data, []))
+# Highlight extracted values inside the input text
+def highlight_words(input_text, json_output):
+    """Return ``input_text`` as HTML with the extracted values highlighted.
+
+    Every string leaf of ``json_output`` (as returned by ``extract_leaves``)
+    is searched for in ``input_text``, ignoring case and treating each space
+    in the value as any run of whitespace. A hit must not be glued to a word
+    character on either side, so values at the very start or end of the text
+    or wrapped in brackets or quotes are found as well. Each leaf path gets
+    the next colour of a fixed, cycling palette.
+
+    All matching is done on the original text and the markup is assembled in
+    a single pass afterwards, so a later value can never match inside a
+    ``<span>`` inserted for an earlier one. When two hits overlap, the leaf
+    that comes first in ``extract_leaves`` order keeps the span. The matched
+    source text is shown verbatim, and every piece of text is HTML-escaped
+    because the result is rendered as HTML.
+
+    Args:
+        input_text: The raw text the extraction was run on.
+        json_output: The parsed JSON extraction result.
+
+    Returns:
+        An HTML string: the escaped input text with ``<span>`` wrappers
+        around the highlighted values.
+    """
+    import html  # local import: the file-level import block is left untouched
+    colors = cycle(["#90ee90", "#add8e6", "#ffb6c1", "#ffff99", "#ffa07a"])
+    color_map = {}
+    claimed = []  # accepted, mutually non-overlapping (start, end, color) spans
+    for path, value in extract_leaves(json_output):
+        path_key = tuple(path)
+        if path_key not in color_map:
+            color_map[path_key] = next(colors)
+        color = color_map[path_key]
+        # re.escape makes the value a literal pattern (it always compiles);
+        # escaped spaces are then relaxed to "any whitespace run"
+        escaped_value = re.escape(value).replace(r"\ ", r"\s+")
+        # Negative look-arounds instead of required neighbours: they also
+        # succeed at the text boundaries and next to punctuation
+        pattern = rf"(?<!\w){escaped_value}(?!\w)"
+        for match in re.finditer(pattern, input_text, flags=re.IGNORECASE):
+            start, end = match.span()
+            if all(end <= s or start >= e for s, e, _ in claimed):
+                claimed.append((start, end, color))
+    claimed.sort()
+    pieces = []
+    cursor = 0
+    for start, end, color in claimed:
+        pieces.append(html.escape(input_text[cursor:start]))
+        pieces.append(
+            f"<span style='background-color: {color};'>"
+            f"{html.escape(input_text[start:end])}</span>"
+        )
+        cursor = end
+    pieces.append(html.escape(input_text[cursor:]))
+    return "".join(pieces)
+# Process function
+def extract_structure(template, text, size="4000"):
+    """Run the model on ``text`` with ``template`` and format the result.
+
+    Args:
+        template: JSON template string placed under "### Template:" in the
+            prompt.
+        text: Input text placed under "### Text:" in the prompt.
+        size: Window-size value from the UI. Kept so existing callers keep
+            working; this simplified version has no sliding window and does
+            not use it.
+
+    Returns:
+        A ``(status, json_text, html)`` tuple: a status message, the
+        extracted JSON pretty-printed (``"{}"`` on failure), and either the
+        highlighted input text or a red error paragraph.
+    """
+    import html  # local import: the file-level import block is left untouched
+    if not MODEL_LOADED:
+        return "❌ Model not loaded", "{}", "<p style='color:red'>Model failed to initialize</p>"
     try:
+        # Format the input (simplified version without sliding window)
+        prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"
+        # Generate prediction
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=2000,  # Reduced for testing
+            do_sample=False
+        )
         result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # The decoded sequence repeats the prompt; keep what follows the marker
         if "<|output|>" in result:
             json_text = result.split("<|output|>")[1].strip()
         else:
+            json_text = result.strip()
+        # Try to parse and format JSON
+        json_data = json.loads(json_text)
+        formatted_json = json.dumps(json_data, indent=2)
+        # Create highlighted version
+        html_content = highlight_words(text, json_data)
+        return "✅ Success", formatted_json, html_content
     except Exception as e:
+        # The third value is rendered as HTML, so the message is escaped there;
+        # the status value stays plain text
+        return f"❌ Error: {str(e)}", "{}", f"<p style='color:red'>{html.escape(str(e))}</p>"
+# Create interface
+# One row with two columns: the first holds the inputs and the Extract
+# button, the second holds the three outputs of extract_structure.
+with gr.Blocks() as demo:
+    gr.Markdown("# NuExtract-1.5 Structured Data Extractor")
+    with gr.Row():
+        with gr.Column():
+            # JSON template whose empty fields the model is asked to fill
+            template = gr.Textbox(
+                label="Template (JSON)",
+                value='{"name": "", "email": ""}',
+                lines=5
+            )
+            # Text to extract from
+            text = gr.TextArea(
+                label="Input Text",
+                value="Contact: John Smith (john@example.com)",
+                lines=10
+            )
+            # Passed to extract_structure as its `size` argument
+            size = gr.Textbox(
+                label="Window Size",
+                value="4000",
+                visible=True
+            )
+            btn = gr.Button("Extract", variant="primary")
+        with gr.Column():
+            # One component per element of the tuple extract_structure returns
+            status = gr.Textbox(label="Status")
+            json_out = gr.Textbox(label="Extracted JSON", lines=10)
+            html_out = gr.HTML(label="Highlighted Text")
+    # Connect the button: a click calls extract_structure with the three
+    # input components and writes its result to the three output components
+    btn.click(
+        fn=extract_structure,
+        inputs=[template, text, size],
+        outputs=[status, json_out, html_out]
+    )
+    # Add examples that match format: each example row lists its values in
+    # the same order as the component list that follows it
+    gr.Examples(
+        [
+            [
+                '{"name": "", "email": ""}',
+                'Contact: John Smith (john@example.com)',
+                "4000"
+            ]
+        ],
+        [template, text, size]
+    )
+# Start the app
+demo.launch()