|
import gradio as gr |
|
import json |
|
import torch |
|
import os |
|
from transformers import AutoModelForCausalLM, AutoTokenizer |
|
import spaces |
|
|
|
# Markdown heading text for the app; create_demo() hands it to gr.Blocks(title=...).
title = """ # 🙋🏻♂️Welcome to 🌟Tonic's 🌊 Osmosis Structure - Text to JSON Converter
"""

# Markdown shown in the left-hand info column of the UI.
description = """
Convert unstructured text into well-formatted JSON using the Osmosis Structure 0.6B model.
This model is specifically trained for structured data extraction and format conversion.

### ℹ️ About Osmosis Structure

- **Model**: Osmosis Structure 0.6B parameters
- **Architecture**: Qwen3 (specialized for structured data)
- **Purpose**: Converting unstructured text to structured JSON format
- **Optimizations**: Fine-tuned for data extraction and format conversion tasks

The model automatically identifies key information in your text and organizes it into logical JSON structures.
"""

# Markdown shown in the right-hand community column of the UI.
# The Discord link previously had empty link text (`[](url)`), which Markdown
# renders as an invisible, unclickable anchor; it now has visible text.
joinus = """
## Join us :
🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [Discord](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [MultiTonic](https://github.com/MultiTonic)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""

# Hugging Face Hub repository id that load_model() downloads.
MODEL_NAME = "osmosis-ai/Osmosis-Structure-0.6B"

# Module-level handles; both stay None until load_model() assigns them.
model = None
tokenizer = None
|
|
|
def load_model():
    """Load the tokenizer and model named by ``MODEL_NAME`` into the module globals.

    The ``HF_KEY`` environment variable, when set to a non-empty value, is
    forwarded as the Hub ``token``; otherwise loading is attempted without
    one. On CUDA machines the model is requested in float16 with
    ``device_map="auto"``; elsewhere in float32 with no device map.

    Any exception raised while loading is caught, reported on stdout
    (with extra hints when the message looks like a 401/authentication or
    404/not-found failure), and turned into a ``False`` return value.

    Returns:
        bool: True when both tokenizer and model were loaded, False otherwise.

    Example:
        >>> success = load_model()
        >>> if success:
        ...     print("Model loaded successfully!")
        ... else:
        ...     print("Failed to load model")
    """
    global model, tokenizer

    try:
        print("Loading Osmosis Structure model...")

        # An unset or empty HF_KEY is normalised to None (anonymous access).
        hf_token = os.environ.get("HF_KEY") or None
        if hf_token is None:
            print("⚠️ Warning: HF_KEY not found in environment variables")
            print("Attempting to load without token...")
        else:
            print("✅ HF token found, accessing gated repository...")

        # Options shared by the tokenizer and the model download.
        hub_options = {"trust_remote_code": True, "token": hf_token}

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **hub_options)

        print("Loading model...")
        cuda_ready = torch.cuda.is_available()
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16 if cuda_ready else torch.float32,
            device_map="auto" if cuda_ready else None,
            **hub_options,
        )

        print("✅ Osmosis Structure model loaded successfully!")
        return True

    except Exception as e:
        print(f"❌ Error loading model: {e}")

        # Classify the failure from its message text to print a targeted hint.
        reason = str(e)
        reason_lower = reason.lower()

        if "401" in reason or "authentication" in reason_lower:
            hints = (
                "💡 This appears to be an authentication error.",
                "Please ensure:",
                "1. HF_KEY is set correctly in your Space secrets",
                "2. Your token has access to the gated repository",
                "3. You have accepted the model's license agreement",
            )
        elif "404" in reason or "not found" in reason_lower:
            hints = (
                "💡 Model repository not found.",
                "Please check if the model name is correct and accessible",
            )
        else:
            hints = ()

        for hint in hints:
            print(hint)

        return False
|
|
|
# Wrappers that chat models commonly put around a JSON payload.
_JSON_LEAD_INS = ("```json", "```", "Here's the JSON:", "JSON:")
_JSON_TRAILERS = ("```",)


def _strip_wrappers(text):
    """Peel Markdown code fences and chatty lead-ins off both ends of *text*."""
    text = text.strip()
    peeled = True
    # Repeat until stable so stacked wrappers ("JSON: ```json {...} ```")
    # are removed regardless of the order they appear in.
    while peeled:
        peeled = False
        for lead_in in _JSON_LEAD_INS:
            if text.startswith(lead_in):
                text = text[len(lead_in):].strip()
                peeled = True
        for trailer in _JSON_TRAILERS:
            if text.endswith(trailer):
                text = text[:-len(trailer)].strip()
                peeled = True
    return text


def _find_json_value(text):
    """Return the first JSON value that can be decoded from *text*.

    The whole string is tried first. Failing that, decoding is attempted at
    each ``{`` or ``[`` from left to right, so a top-level array or an object
    followed by trailing prose is still recovered intact.

    Raises:
        ValueError: If *text* contains no decodable JSON object or array.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    decoder = json.JSONDecoder()
    for start, char in enumerate(text):
        if char not in "{[":
            continue
        try:
            value, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            continue
        return value
    raise ValueError("no JSON object or array found in model output")


@spaces.GPU
def text_to_json(input_text, schema_text, max_tokens=512, temperature=0.6, top_p=0.95, top_k=20):
    """Convert plain text to structured JSON using Osmosis Structure model.

    Builds a system + user chat prompt (the system prompt embeds the schema
    when one is supplied), samples a completion from the loaded model, and
    returns the first JSON value found in the completion, pretty-printed.

    Args:
        input_text (str): The unstructured text to convert to JSON.
        schema_text (str): Optional JSON schema that defines the desired output structure.
        max_tokens (int, optional): Maximum number of tokens to generate. Defaults to 512.
        temperature (float, optional): Controls randomness in generation. Defaults to 0.6.
        top_p (float, optional): Nucleus sampling parameter. Defaults to 0.95.
        top_k (int, optional): Number of highest probability tokens to consider. Defaults to 20.

    Returns:
        str: A JSON string containing the structured data. If the model is not
        loaded, the input is empty, generation fails, or no valid JSON can be
        recovered, a human-readable message is returned instead (no exception
        propagates to the caller).

    Example:
        >>> input_text = "The conference will be held on June 10-12, 2024 at the Grand Hotel."
        >>> schema = '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}}}'
        >>> result = text_to_json(input_text, schema)
        >>> print(result)
        {
          "event_start_date": "2024-06-10"
        }
    """
    if model is None or tokenizer is None:
        return "❌ Model not loaded. Please check the console for loading errors."

    # Nothing to convert: answer immediately instead of spending a generation.
    if not input_text or not input_text.strip():
        return "❌ Please provide some input text to convert."

    try:
        if schema_text and schema_text.strip():
            system_prompt = f"You are a helpful assistant that understands and translates text to JSON format according to the following schema. {schema_text}"
        else:
            system_prompt = "You are a helpful assistant that converts unstructured text into well-formatted JSON. Extract key information and organize it into a logical, structured format. Always respond with valid JSON."

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Convert this text to JSON format:\n\n{input_text}"},
        ]

        formatted_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # NOTE: prompts longer than 2048 tokens are truncated by the tokenizer.
        inputs = tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=2048,
        )
        # Always place inputs on the model's device (a no-op on CPU).
        inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
        prompt_length = inputs["input_ids"].shape[-1]

        # Some tokenizers define no pad token; generate() then needs a fallback.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id

        generation_config = {
            # UI sliders / API clients may deliver numbers as floats.
            "max_new_tokens": int(max_tokens),
            "temperature": float(temperature),
            "top_p": float(top_p),
            "top_k": int(top_k),
            "do_sample": True,
            "pad_token_id": pad_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "repetition_penalty": 1.1,
        }

        with torch.no_grad():
            outputs = model.generate(**inputs, **generation_config)

        # Keep only the newly generated tokens, not the echoed prompt.
        generated_tokens = outputs[0][prompt_length:]
        generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        # NOTE(review): the UI text describes this model as Qwen3-based, and such
        # chat templates can emit a "<think>...</think>" preamble - confirm whether
        # this model does. Dropping it is harmless when the marker is absent.
        if "</think>" in generated_text:
            generated_text = generated_text.rpartition("</think>")[2]

        json_text = _strip_wrappers(generated_text)

        try:
            parsed_json = _find_json_value(json_text)
        except ValueError:
            return f"Generated response (may need manual cleanup):\n\n{json_text}"
        return json.dumps(parsed_json, indent=2, ensure_ascii=False)

    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"❌ Error generating JSON: {str(e)}"
|
|
|
def create_demo():
    """Assemble the Gradio Blocks interface for the text-to-JSON converter.

    The page consists of:
    - a row with the ``description`` and ``joinus`` Markdown side by side,
    - a row with the inputs (text box, optional schema box, a collapsed
      accordion of generation sliders, the convert button) next to a
      read-only output box with a copy button,
    - clickable examples that fill the text and schema boxes.

    Both the convert button's click event and the text box's submit event
    call ``text_to_json`` with the same inputs and write to the output box.

    Returns:
        gr.Blocks: The assembled interface, not yet launched.

    Example:
        >>> demo = create_demo()
        >>> demo.launch()
    """
    # Sample data reused by the placeholders and the examples table.
    conference_text = "The conference will be held on June 10-12, 2024 at the Grand Hotel in San Francisco. Registration fee is $500 for early bird (before May 1) and $650 for regular registration. Contact info@conference.com for questions."
    workshop_text = "The workshop is scheduled for March 15-16, 2024 at Tech Hub in Seattle. Early bird tickets cost $299 until February 15, after which regular tickets will be $399. For inquiries, email workshop@techhub.com"
    product_text = "Product: Wireless Headphones Model XYZ-100. Price: $199.99. Features: Bluetooth 5.0, 30-hour battery, noise cancellation, wireless charging case. Colors available: Black, White, Blue. Warranty: 2 years. Rating: 4.5/5 stars (324 reviews)."
    festival_text = "The summer festival runs from July 1-5, 2024 at Central Park. VIP passes are $150 until June 1, then $200. General admission is $75 early bird (until June 15) and $100 regular. Contact tickets@summerfest.com"

    event_schema = '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}, "event_end_date": {"type": "string", "format": "date"}, "location": {"type": "string"}, "registration_fees": {"type": "object", "properties": {"early_bird_price": {"type": "number"}, "regular_price": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}, "contact_email": {"type": "string"}}}'
    product_schema = '{"type": "object", "properties": {"product_name": {"type": "string"}, "price": {"type": "number"}, "features": {"type": "array", "items": {"type": "string"}}, "colors": {"type": "array", "items": {"type": "string"}}, "warranty_years": {"type": "number"}, "rating": {"type": "object", "properties": {"score": {"type": "number"}, "reviews": {"type": "number"}}}}}'
    festival_schema = '{"type": "object", "properties": {"event_start_date": {"type": "string", "format": "date"}, "event_end_date": {"type": "string", "format": "date"}, "location": {"type": "string"}, "ticket_prices": {"type": "object", "properties": {"vip": {"type": "object", "properties": {"early_bird": {"type": "number"}, "regular": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}, "general": {"type": "object", "properties": {"early_bird": {"type": "number"}, "regular": {"type": "number"}, "early_bird_deadline": {"type": "string", "format": "date"}}}}}, "contact_email": {"type": "string"}}}'

    example_rows = [
        [conference_text, event_schema],
        [workshop_text, event_schema],
        [product_text, product_schema],
        [festival_text, festival_schema],
    ]

    page_css = """
        .gradio-container {
            max-width: 1200px !important;
        }
        """

    with gr.Blocks(title=title, theme=gr.themes.Monochrome(), css=page_css) as demo:

        # Info row: model description on the left, community links on the right.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(description)
            with gr.Column(scale=1):
                gr.Markdown(joinus)

        # Work row: inputs on the left, generated JSON on the right.
        with gr.Row():
            with gr.Column(scale=1):
                input_text = gr.Textbox(
                    label="📝 Input Text",
                    placeholder=f"Enter your unstructured text here...\n\nExample: '{conference_text}'",
                    lines=8,
                    max_lines=15,
                )

                schema_text = gr.Textbox(
                    label="📋 JSON Schema (Optional)",
                    placeholder=f"Enter your JSON schema here...\n\nExample: {event_schema}",
                    lines=8,
                    max_lines=15,
                )

                with gr.Accordion("⚙️ Generation Settings", open=False):
                    max_tokens_slider = gr.Slider(
                        label="Max Tokens",
                        info="Maximum number of tokens to generate",
                        minimum=50, maximum=1000, value=512, step=10,
                    )
                    temperature_slider = gr.Slider(
                        label="Temperature",
                        info="Controls randomness (lower = more focused)",
                        minimum=0.1, maximum=1.0, value=0.6, step=0.1,
                    )
                    top_p_slider = gr.Slider(
                        label="Top-p",
                        info="Nucleus sampling parameter",
                        minimum=0.1, maximum=1.0, value=0.95, step=0.05,
                    )
                    top_k_slider = gr.Slider(
                        label="Top-k",
                        info="Limits vocabulary for generation",
                        minimum=1, maximum=100, value=20, step=1,
                    )

                convert_btn = gr.Button("🔄 Convert to JSON", variant="primary", size="lg")

            with gr.Column(scale=1):
                output_json = gr.Textbox(
                    label="📋 Generated JSON",
                    lines=15,
                    max_lines=20,
                    interactive=False,
                    show_copy_button=True,
                )

        gr.Examples(
            examples=example_rows,
            inputs=[input_text, schema_text],
            label="Click on any example to try it",
        )

        # Same handler for the button click and the text box submit event.
        handler_inputs = [
            input_text,
            schema_text,
            max_tokens_slider,
            temperature_slider,
            top_p_slider,
            top_k_slider,
        ]
        for register_event in (convert_btn.click, input_text.submit):
            register_event(
                fn=text_to_json,
                inputs=list(handler_inputs),
                outputs=output_json,
                show_progress=True,
            )

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: report token presence, load the model, then serve the UI.
    print("🌊 Initializing Osmosis Structure Demo...")

    # Informational only; load_model() reads HF_KEY itself.
    hf_token = os.environ.get("HF_KEY")
    if not hf_token:
        print("⚠️ HF_KEY not found - this may cause issues with gated repositories")
    else:
        print("✅ HF_KEY found in environment")

    if not load_model():
        # Without a model there is nothing to serve; report and fall through.
        print("❌ Failed to load model. Please check your HF_KEY and model access permissions.")
    else:
        print("🚀 Creating Gradio interface...")
        demo = create_demo()
        demo.launch(ssr_mode=False, mcp_server=True)