oberbics committed on
Commit
be096d1
·
verified ·
1 Parent(s): a89d538

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +150 -16
app.py CHANGED
@@ -1,27 +1,161 @@
1
- def extract_structure(template, text):
2
- if not MODEL_LOADED:
3
- return "❌ Model not loaded", {}, "<p style='color:red'>Model failed to initialize</p>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
- # Using the correct format for NuExtract-1.5
6
- prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  try:
9
- print(f"Generating with prompt: {prompt[:100]}...")
10
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
11
- outputs = model.generate(**inputs, max_new_tokens=512)
 
 
 
 
 
 
 
 
 
 
 
 
12
  result = tokenizer.decode(outputs[0], skip_special_tokens=True)
13
- print(f"Raw result: {result[:100]}...")
14
 
15
- # Extract result after the output marker
16
  if "<|output|>" in result:
17
  json_text = result.split("<|output|>")[1].strip()
18
  else:
19
- json_text = result
20
 
21
- # Try to parse as JSON
22
- extracted = json.loads(json_text)
 
23
 
24
- return "✅ Success", extracted, f"<pre>{json.dumps(extracted, indent=2)}</pre>"
 
 
 
25
  except Exception as e:
26
- print(f"Error in extraction: {str(e)}")
27
- return f"❌ Error: {str(e)}", {}, f"<p style='color:red'>{str(e)}</p>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import json
4
+ import re
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
+ from itertools import cycle
7
+ from urllib.parse import unquote
8
+
9
# Load the tokenizer and model once at import time. MODEL_LOADED records
# whether loading succeeded so request handlers can fail gracefully.
model_name = "numind/NuExtract-1.5"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
except Exception as load_error:
    # Top-level boundary: report the failure and keep the app importable.
    MODEL_LOADED = False
    print(f"Model loading failed: {load_error}")
else:
    MODEL_LOADED = True
23
+
24
+ # Extract leaf values from JSON (simplified)
25
def extract_leaves(json_data):
    """Collect every non-blank string leaf of a parsed JSON value.

    Nested dicts and lists are walked depth-first, in container order.

    Args:
        json_data: A parsed JSON value. Only dicts and lists are descended
            into; any other top-level value yields no leaves.

    Returns:
        A list of ``(path, value)`` tuples. ``path`` is the list of dict
        keys / list indices leading to the leaf and ``value`` is the string
        stored there. Non-string scalars (numbers, booleans, ``None``) and
        empty or whitespace-only strings are skipped.
    """
    leaves = []

    def _walk(node, path):
        # Dicts and lists are handled uniformly as (key-or-index, child) pairs.
        if isinstance(node, dict):
            children = node.items()
        elif isinstance(node, list):
            children = enumerate(node)
        else:
            return

        for key, child in children:
            child_path = path + [key]
            if isinstance(child, (dict, list)):
                _walk(child, child_path)
            elif isinstance(child, str) and child.strip():
                leaves.append((child_path, child))

    _walk(json_data, [])
    return leaves
49
+
50
+ # Highlight words in text
51
def highlight_words(input_text, json_output):
    """Return ``input_text`` as HTML with the extracted values highlighted.

    Every string leaf of ``json_output`` (as returned by ``extract_leaves``)
    is searched for in ``input_text``, case-insensitively and with any run of
    whitespace in the value matching any run of whitespace in the text. Each
    leaf gets the next colour of a fixed, repeating palette.

    All occurrences are located on the original text and the markup is
    written in a single pass afterwards, so a later value can never match
    inside a ``<span>`` inserted for an earlier one. The highlighted text is
    the text as it appears in the input (original casing and spacing), not
    the extracted value.

    Args:
        input_text: The raw text the extraction was run on.
        json_output: Parsed JSON produced by the model.

    Returns:
        An HTML string. Everything taken from ``input_text`` is HTML-escaped
        because it is user-supplied and is rendered as HTML by the caller;
        matches are wrapped in ``<span style='background-color: ...;'>``.
    """
    import html  # function-scope import keeps this block self-contained

    palette = cycle(["#90ee90", "#add8e6", "#ffb6c1", "#ffff99", "#ffa07a"])
    accepted = []  # (start, end, color) spans, mutually non-overlapping

    for (_path, value), color in zip(extract_leaves(json_output), palette):
        # Escape each word literally; join with \s+ so that line breaks or
        # repeated spaces in the text do not prevent a match.
        body = r"\s+".join(re.escape(word) for word in value.split())
        try:
            # Word-boundary style guards: they also allow a match at the very
            # start/end of the text or next to punctuation such as "(" / ")".
            pattern = re.compile(rf"(?<!\w){body}(?!\w)", re.IGNORECASE)
        except re.error:
            # Best effort: skip a value whose pattern cannot be compiled.
            continue

        for match in pattern.finditer(input_text):
            start, end = match.span()
            # Earlier leaves win: ignore a match overlapping an accepted span.
            if all(end <= s or start >= e for s, e, _ in accepted):
                accepted.append((start, end, color))

    pieces = []
    cursor = 0
    for start, end, color in sorted(accepted):
        pieces.append(html.escape(input_text[cursor:start]))
        pieces.append(
            f"<span style='background-color: {color};'>"
            f"{html.escape(input_text[start:end])}</span>"
        )
        cursor = end
    pieces.append(html.escape(input_text[cursor:]))
    return "".join(pieces)
73
+
74
+ # Process function
75
def extract_structure(template, text, size="4000"):
    """Run NuExtract on ``text`` using ``template`` and format the result.

    Args:
        template: JSON template string describing the fields to extract.
        text: Input text to extract from.
        size: Value of the "Window Size" UI field. Kept so existing callers
            and the UI wiring keep working; this simplified version has no
            sliding window, so the value is currently not used.

    Returns:
        A 3-tuple ``(status, json_text, html)``:
        ``status`` is a human-readable status line, ``json_text`` is the
        extracted data pretty-printed as JSON (``"{}"`` on failure) and
        ``html`` is the input text with extracted values highlighted, or a
        red error paragraph on failure.
    """
    import html  # function-scope import keeps this block self-contained

    if not MODEL_LOADED:
        return "❌ Model not loaded", "{}", "<p style='color:red'>Model failed to initialize</p>"

    try:
        # Format the input (simplified version without sliding window).
        prompt = f"<|input|>\n### Template:\n{template}\n### Text:\n{text}\n\n<|output|>"

        # Generate prediction (greedy decoding).
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=2000,  # Reduced for testing
            do_sample=False,
        )
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # The decoded sequence contains the prompt; keep what follows the marker.
        if "<|output|>" in result:
            json_text = result.split("<|output|>")[1].strip()
        else:
            json_text = result.strip()

        # Parse and pretty-print; keep non-ASCII characters readable in the UI.
        json_data = json.loads(json_text)
        formatted_json = json.dumps(json_data, indent=2, ensure_ascii=False)

        # Create highlighted version of the input text.
        html_content = highlight_words(text, json_data)

        return "✅ Success", formatted_json, html_content
    except Exception as e:
        # UI boundary: report any failure instead of crashing the handler.
        # The message is escaped because it is rendered as HTML.
        return f"❌ Error: {str(e)}", "{}", f"<p style='color:red'>{html.escape(str(e))}</p>"
113
+
114
+ # Create interface
115
# Build the Gradio interface: inputs in the first column, results in the second.
with gr.Blocks() as demo:
    gr.Markdown("# NuExtract-1.5 Structured Data Extractor")

    with gr.Row():
        with gr.Column():
            template = gr.Textbox(
                label="Template (JSON)",
                value='{"name": "", "email": ""}',
                lines=5,
            )
            text = gr.TextArea(
                label="Input Text",
                value="Contact: John Smith (john@example.com)",
                lines=10,
            )
            size = gr.Textbox(label="Window Size", value="4000", visible=True)
            btn = gr.Button("Extract", variant="primary")

        with gr.Column():
            status = gr.Textbox(label="Status")
            json_out = gr.Textbox(label="Extracted JSON", lines=10)
            html_out = gr.HTML(label="Highlighted Text")

    # Connect the button to the extraction handler.
    btn.click(
        fn=extract_structure,
        inputs=[template, text, size],
        outputs=[status, json_out, html_out],
    )

    # One example row, in the same order as the handler's inputs.
    gr.Examples(
        examples=[
            [
                '{"name": "", "email": ""}',
                "Contact: John Smith (john@example.com)",
                "4000",
            ],
        ],
        inputs=[template, text, size],
    )

demo.launch()