|
|
|
import gradio as gr |
|
import os |
|
import spaces |
|
import tempfile |
|
import requests |
|
import time |
|
from huggingface_hub import InferenceClient |
|
from pathlib import Path |
|
|
|
|
|
def _read_secret(name):
    """Return environment variable *name* without surrounding whitespace.

    Unset, empty, and whitespace-only values all yield ``None`` so that
    truthiness checks and ``is not None`` checks agree with each other.
    """
    value = os.getenv(name)
    if value is None:
        return None
    return value.strip() or None


# Credentials come from the environment (e.g. Space secrets). Secrets pasted
# into a web form often carry a trailing newline, hence the stripping.
hf_token = _read_secret("HF_TOKEN")
cloudconvert_token = _read_secret("CLOUDCONVERT_API_KEY")

# Log only whether each secret is present, never its value.
print(f"Debug: HF Token exists = {hf_token is not None}")
print(f"Debug: CloudConvert Token exists = {cloudconvert_token is not None}")
|
|
|
|
|
# Module-wide chat client: Llama 3.3 70B Instruct served through the
# Cerebras provider, authenticated with the Hugging Face token read above.
client = InferenceClient(
    model="meta-llama/Llama-3.3-70B-Instruct",
    provider="cerebras",
    token=hf_token,
)
|
|
|
def convert_pages_to_text(file_path, api_key, *, timeout=60, max_attempts=30, poll_interval=2):
    """Convert a .pages file to plain text with the CloudConvert v2 jobs API.

    One job chains three tasks (upload -> convert pages to txt -> export URL).
    The file is posted to the upload form CloudConvert returns, the job is
    polled until it finishes, and the exported text file is downloaded.

    Args:
        file_path: Path of the local .pages file to upload.
        api_key: CloudConvert API key, sent as a Bearer token.
        timeout: Seconds allowed for each individual HTTP request.
        max_attempts: Maximum number of status polls before giving up.
        poll_interval: Seconds to wait between two status polls.

    Returns:
        The converted document text.

    Raises:
        Exception: On any failure. HTTP-level problems are reported as
            "CloudConvert HTTP error: ..." and everything else as
            "CloudConvert error: ...", with the original exception chained.
    """
    base_url = "https://api.cloudconvert.com/v2"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    job_data = {
        "tasks": {
            "import-file": {
                "operation": "import/upload",
            },
            "convert-file": {
                "operation": "convert",
                "input": "import-file",
                "input_format": "pages",
                "output_format": "txt",
            },
            "export-file": {
                "operation": "export/url",
                "input": "convert-file",
            },
        }
    }

    try:
        # Step 1: create the job.
        print("Creating CloudConvert job...")
        response = requests.post(
            f"{base_url}/jobs", headers=headers, json=job_data, timeout=timeout
        )
        print(f"Job creation response: {response.status_code}")
        if not response.ok:
            print(f"Job creation failed: {response.text}")
            response.raise_for_status()

        job = response.json()
        job_id = job["data"]["id"]
        print(f"Job created successfully: {job_id}")

        # Step 2: upload the file to the form attached to the import task.
        upload_task = next(
            (task for task in job["data"]["tasks"] if task["operation"] == "import/upload"),
            None,
        )
        if not upload_task:
            raise Exception("Upload task not found in job")

        upload_form = (upload_task.get("result") or {}).get("form")
        if not upload_form:
            raise Exception("Upload form not provided for import task")

        print("Uploading file to CloudConvert...")
        with open(file_path, "rb") as f:
            # No API headers here: the form parameters carry the authorisation.
            upload_response = requests.post(
                upload_form["url"],
                data=upload_form["parameters"],
                files={"file": f},
                timeout=timeout,
            )
        if not upload_response.ok:
            print(f"Upload failed: {upload_response.text}")
            upload_response.raise_for_status()
        print("File uploaded successfully")

        # Step 3: poll until the job finishes, fails, or attempts run out.
        print(f"Waiting for job {job_id} to complete...")
        job_status = None
        for attempt in range(max_attempts):
            status_response = requests.get(
                f"{base_url}/jobs/{job_id}", headers=headers, timeout=timeout
            )
            status_response.raise_for_status()
            job_status = status_response.json()

            status = job_status["data"]["status"]
            print(f"Job status: {status}")

            if status == "finished":
                print("Conversion completed successfully")
                break

            if status == "error":
                error_msg = job_status["data"].get("message", "Unknown error")
                print(f"Conversion failed: {error_msg}")
                # Per-task messages usually say which stage actually failed.
                for task in job_status.get("data", {}).get("tasks", []):
                    if task.get("status") == "error":
                        task_error = task.get("message", "Unknown task error")
                        print(f"Task {task.get('operation')} error: {task_error}")
                raise Exception(f"Conversion failed: {error_msg}")

            # Sleeping after the final poll would only delay the timeout error.
            if attempt < max_attempts - 1:
                time.sleep(poll_interval)
        else:
            raise Exception("Conversion timeout - job took too long")

        # Step 4: download the exported text file.
        for task in job_status["data"]["tasks"]:
            if task["operation"] == "export/url" and task["status"] == "finished":
                download_url = task["result"]["files"][0]["url"]
                print(f"Downloading result from: {download_url}")

                download_response = requests.get(download_url, timeout=timeout)
                download_response.raise_for_status()

                # requests falls back to ISO-8859-1 for text/* responses that
                # declare no charset, which garbles UTF-8 output. Try UTF-8
                # first and only then defer to requests' own decoding.
                try:
                    text_content = download_response.content.decode("utf-8-sig")
                except UnicodeDecodeError:
                    text_content = download_response.text

                print(f"Downloaded {len(text_content)} characters")
                return text_content

        raise Exception("No converted file found in completed job")

    except requests.exceptions.RequestException as e:
        print(f"HTTP error: {e}")
        raise Exception(f"CloudConvert HTTP error: {str(e)}") from e
    except Exception as e:
        print(f"General error: {e}")
        raise Exception(f"CloudConvert error: {str(e)}") from e
|
|
|
@spaces.GPU
def convert_pages_document(file, output_format, progress=gr.Progress()):
    """Convert an uploaded .pages document to the requested output format.

    Pipeline: CloudConvert extracts plain text from the upload, the
    module-level chat ``client`` (Cerebras provider) reformats that text, and
    ``create_output_file`` writes the result to a temporary file.

    Args:
        file: The upload from the Gradio file input; either a path string or
            an object exposing the path as ``.name``.
        output_format: Target format label chosen in the UI (e.g. "PDF").
        progress: Gradio progress tracker, injected by Gradio.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is ``None``
        whenever the conversion did not produce a file; failures are reported
        through the message rather than raised.
    """
    if not file:
        return None, "❌ Please upload a .pages file"

    if not cloudconvert_token:
        return None, "❌ CloudConvert API key not configured. Please add CLOUDCONVERT_API_KEY to secrets."

    # Accept both a bare path string and an object carrying the path in .name.
    file_path = file if isinstance(file, str) else file.name

    try:
        progress(0.1, desc="📤 Converting with CloudConvert...")

        # Step 1: extract the raw text.
        print(f"Converting file: {file_path}")
        text_content = convert_pages_to_text(file_path, cloudconvert_token)

        # Fewer than 10 non-blank characters is treated as a failed extraction.
        if not text_content or len(text_content.strip()) < 10:
            return None, "❌ Could not extract content from .pages file"

        print(f"Extracted text preview: {text_content[:200]}...")

        progress(0.5, desc="🤖 Converting format with Cerebras AI...")

        # Step 2: ask the model to reformat the text.
        prompt = create_conversion_prompt(text_content, output_format)

        progress(0.7, desc="⚡ Processing with ZeroGPU...")

        try:
            # NOTE: the reply is capped at 4096 tokens, so long documents may
            # come back truncated.
            response = client.chat_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=4096,
                temperature=0.1,
            )
            converted_text = response.choices[0].message.content
        except Exception as e:
            print(f"Cerebras error: {e}")
            return None, f"❌ AI conversion error: {str(e)}"

        # An empty reply would otherwise fail later with an unhelpful error.
        if not converted_text or not converted_text.strip():
            return None, "❌ AI conversion error: model returned no content"

        progress(0.9, desc="💾 Creating output file...")

        # Step 3: write the result in the requested format.
        output_path = create_output_file(converted_text, output_format)

        progress(1.0, desc="✅ Conversion complete!")

        return output_path, f"✅ Successfully converted to {output_format}!"

    except Exception as e:
        # UI boundary: report the failure instead of crashing the request.
        print(f"Conversion error: {e}")
        return None, f"❌ Error: {str(e)}"
|
|
|
def create_conversion_prompt(content, output_format):
    """Build the instruction prompt asking the model to reformat *content*.

    The prompt names the target format, forbids summarising or dropping
    text, embeds the original text, and ends with a cue for the output.
    """
    prompt_lines = (
        f"You are a document formatter. Convert the following text to {output_format} format.",
        "",
        "IMPORTANT:",
        "1. Keep ALL original content - do not summarize or remove text",
        f"2. Only adjust formatting for {output_format}",
        "3. Preserve all important information, names, and details",
        "",
        "Original text:",
        f"{content}",
        "",
        f"Formatted {output_format} output:",
    )
    return "\n".join(prompt_lines)
|
|
|
def create_output_file(content, output_format):
    """Write *content* to a new temporary file in the requested format.

    Args:
        content: Text to write; surrounding whitespace is stripped.
        output_format: "PDF", "DOCX", "TXT", "HTML" or "Markdown". Any other
            value is written as plain text with a ``.txt`` extension.

    Returns:
        Path of the created file. The file is not deleted automatically.

    Notes:
        ``reportlab`` (PDF) and ``docx`` (DOCX) are imported only when their
        format is requested. Blank lines separate paragraphs in both formats.
    """
    content = content.strip()

    if output_format == "PDF":
        from reportlab.pdfgen import canvas
        from reportlab.lib.pagesizes import letter
        import textwrap

        # Reserve a path and close our handle before reportlab writes to it.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f:
            output_path = f.name

        pdf = canvas.Canvas(output_path, pagesize=letter)
        _, height = letter
        top_y = height - 50
        y = top_y

        for paragraph in content.split("\n\n"):
            paragraph = paragraph.strip()
            if not paragraph:
                continue
            # Wrap each source line on its own: wrapping the whole paragraph
            # would merge single line breaks (lists, headings) into one run,
            # which the DOCX and text outputs do not do.
            for source_line in paragraph.splitlines():
                for line in textwrap.wrap(source_line.rstrip(), width=90):
                    if y < 50:
                        # Bottom margin reached: continue on a fresh page.
                        pdf.showPage()
                        y = top_y
                    pdf.drawString(50, y, line)
                    y -= 20
            # Extra gap between paragraphs.
            y -= 10

        pdf.save()
        return output_path

    elif output_format == "DOCX":
        from docx import Document

        # Reserve a path and close our handle before the document is saved.
        with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f:
            output_path = f.name

        doc = Document()
        for paragraph in content.split("\n\n"):
            if paragraph.strip():
                doc.add_paragraph(paragraph.strip())

        doc.save(output_path)
        return output_path

    else:
        # Text-based formats are written verbatim; only the extension varies.
        ext_map = {"TXT": ".txt", "HTML": ".html", "Markdown": ".md"}
        ext = ext_map.get(output_format, ".txt")

        with tempfile.NamedTemporaryFile(mode="w", suffix=ext, delete=False, encoding="utf-8") as f:
            f.write(content)
            return f.name
|
|
|
|
|
# Gradio UI: header, API-key status banner, upload/convert controls, a feature
# panel, the download slot, a status line, and a footer.
# NOTE(review): the icon glyphs in these strings were re-entered after an
# encoding mix-up (UTF-8 emoji had been decoded as ISO-8859-7). Glyphs that
# lost all distinguishing bytes (page/folder/rocket/lock icons) were chosen
# from context - confirm them against the intended design.
with gr.Blocks(title="Pages Converter Pro - CloudConvert", theme=gr.themes.Soft()) as app:

    gr.HTML("""
    <div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 1rem; margin-bottom: 2rem;">
        <h1>📄 Pages Converter Pro</h1>
        <p>Convert Apple Pages documents using CloudConvert + Cerebras AI</p>
        <p style="font-size: 0.9em; opacity: 0.9;">✨ Professional .pages parsing + AI-powered format conversion</p>
    </div>
    """)

    # Banner is green when a CloudConvert key was found at start-up, red otherwise.
    with gr.Row():
        gr.HTML(f"""
        <div style="background: {'#d4edda' if cloudconvert_token else '#f8d7da'}; color: {'#155724' if cloudconvert_token else '#721c24'}; padding: 1rem; border-radius: 0.5rem; text-align: center;">
            <strong>CloudConvert API:</strong> {'✅ Connected and Ready' if cloudconvert_token else '❌ API Key Missing'}
        </div>
        """)

    with gr.Row():
        with gr.Column(scale=2):
            gr.HTML("<h3>📁 Upload & Convert</h3>")

            file_input = gr.File(
                label="Select .pages file",
                file_types=[".pages"],
            )

            output_format = gr.Radio(
                choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
                value="PDF",
                label="🎯 Output Format",
            )

            convert_btn = gr.Button(
                "🚀 Convert Document",
                variant="primary",
                size="lg",
            )

        with gr.Column(scale=1):
            gr.HTML("""
            <div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1);">
                <h3>✨ Features</h3>
                <ul style="color: #666;">
                    <li>✅ <strong>100% reliable</strong> .pages parsing</li>
                    <li>⚡ ZeroGPU acceleration</li>
                    <li>🤖 AI-powered formatting</li>
                    <li>🎨 Professional output quality</li>
                    <li>🔒 Secure processing</li>
                </ul>

                <div style="background: #f5f5f5; padding: 1rem; border-radius: 0.5rem; margin-top: 1rem;">
                    <h4 style="margin-top: 0;">💡 How it works:</h4>
                    <ol style="font-size: 0.9em; color: #555; margin-bottom: 0;">
                        <li>CloudConvert extracts text from .pages</li>
                        <li>Cerebras AI formats for your chosen output</li>
                        <li>Download your professionally converted file</li>
                    </ol>
                </div>
            </div>
            """)

    with gr.Row():
        output_file = gr.File(
            label="📁 Download Your Converted File"
        )

    with gr.Row():
        status_html = gr.HTML(
            value="<div style='text-align: center; padding: 1rem; color: #666; background: #f8f9fa; border-radius: 0.5rem;'>Upload a .pages file to get started</div>"
        )

    # The handler returns (file path, status message), matching the two outputs.
    convert_btn.click(
        fn=convert_pages_document,
        inputs=[file_input, output_format],
        outputs=[output_file, status_html],
        show_progress=True,
    )

    gr.HTML("""
    <div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f8f9fa; border-radius: 0.5rem;">
        <p style="margin-bottom: 0.5rem;">🔧 <strong>Technical Stack:</strong></p>
        <p style="font-size: 0.9em; color: #666; margin-bottom: 0;">
            CloudConvert API for reliable .pages parsing • HuggingFace ZeroGPU for AI processing • Cerebras for lightning-fast inference
        </p>
    </div>
    """)


if __name__ == "__main__":
    app.launch()