Spaces:

SlouchyBuffalo
/

pages-converter-pro

Sleeping

App Files Files Community

pages-converter-pro / app.py

SlouchyBuffalo

Update app.py

3b6bffb verified 27 days ago

raw

history blame contribute delete

13.7 kB

	# app.py - Corrected CloudConvert API Integration
	import gradio as gr
	import os
	import spaces
	import tempfile
	import requests
	import time
	from huggingface_hub import InferenceClient
	from pathlib import Path

	# Read API credentials from the environment (configured as Space secrets).
	hf_token = os.getenv("HF_TOKEN")
	# Strip stray whitespace/newlines that often sneak in when pasting a key, and
	# treat a blank value as "not configured" so the debug line below and the
	# later truthiness checks agree with each other.
	cloudconvert_token = (os.getenv("CLOUDCONVERT_API_KEY") or "").strip() or None
	# Only report presence, never the secret values themselves.
	print(f"Debug: HF Token exists = {hf_token is not None}")
	print(f"Debug: CloudConvert Token exists = {cloudconvert_token is not None}")

	# Module-wide chat client: the Llama 3.3 70B Instruct model routed through
	# the "cerebras" inference provider, authenticated with hf_token.
	client = InferenceClient(
	    model="meta-llama/Llama-3.3-70B-Instruct",
	    provider="cerebras",
	    token=hf_token,
	)

	def _decode_text_response(response):
	    """Return the body of a downloaded text file as ``str``.

	    ``requests`` falls back to ISO-8859-1 for ``text/*`` responses that
	    declare no charset, which garbles UTF-8 output. When the server names no
	    charset, try UTF-8 first (tolerating a BOM) and only fall back to
	    requests' own decoding when the bytes are not valid UTF-8.
	    """
	    content_type = response.headers.get("Content-Type", "")
	    if "charset" not in content_type.lower():
	        try:
	            return response.content.decode("utf-8-sig")
	        except UnicodeDecodeError:
	            pass
	    return response.text


	def convert_pages_to_text(file_path, api_key, *, request_timeout=30,
	                          poll_interval=2, max_attempts=30):
	    """Convert a .pages file to plain text using the CloudConvert v2 API.

	    Creates one job with three chained tasks (import/upload -> convert
	    pages to txt -> export/url), uploads the file to the form URL returned
	    for the upload task, polls the job until it finishes, then downloads the
	    exported text file.

	    Args:
	        file_path: Path of the local .pages file to upload.
	        api_key: CloudConvert API key, sent as a Bearer token.
	        request_timeout: Timeout in seconds applied to every HTTP request so
	            a stalled connection cannot hang the app forever.
	        poll_interval: Seconds to wait between job status checks.
	        max_attempts: Maximum number of status checks before giving up
	            (defaults give roughly one minute of polling).

	    Returns:
	        The converted document text.

	    Raises:
	        Exception: With a "CloudConvert HTTP error: ..." message for
	            transport/HTTP failures, or "CloudConvert error: ..." for
	            everything else (job error, timeout, unexpected response shape).
	            The original exception is attached as ``__cause__``.
	    """
	    base_url = "https://api.cloudconvert.com/v2"
	    headers = {
	        "Authorization": f"Bearer {api_key}",
	        "Content-Type": "application/json",
	    }

	    try:
	        # Step 1: Create a job; task names are referenced by the "input" keys.
	        job_data = {
	            "tasks": {
	                "import-file": {
	                    "operation": "import/upload",
	                },
	                "convert-file": {
	                    "operation": "convert",
	                    "input": "import-file",
	                    "input_format": "pages",
	                    "output_format": "txt",
	                },
	                "export-file": {
	                    "operation": "export/url",
	                    "input": "convert-file",
	                },
	            }
	        }

	        print("Creating CloudConvert job...")
	        response = requests.post(
	            f"{base_url}/jobs",
	            headers=headers,
	            json=job_data,
	            timeout=request_timeout,
	        )
	        print(f"Job creation response: {response.status_code}")

	        if not response.ok:
	            # Log the body first: raise_for_status() only carries the status line.
	            print(f"Job creation failed: {response.text}")
	            response.raise_for_status()

	        job = response.json()
	        job_id = job["data"]["id"]
	        print(f"Job created successfully: {job_id}")

	        # Step 2: Upload the file to the form URL attached to the upload task.
	        upload_task = next(
	            (
	                task
	                for task in job["data"]["tasks"]
	                if task["operation"] == "import/upload"
	            ),
	            None,
	        )
	        if not upload_task:
	            raise Exception("Upload task not found in job")

	        upload_form = upload_task["result"]["form"]
	        upload_url = upload_form["url"]
	        form_data = upload_form["parameters"]

	        print("Uploading file to CloudConvert...")
	        with open(file_path, "rb") as f:
	            # The upload target takes form fields, not JSON or the API
	            # headers, so no `headers=` here.
	            upload_response = requests.post(
	                upload_url,
	                data=form_data,
	                files={"file": f},
	                timeout=request_timeout,
	            )

	        if not upload_response.ok:
	            print(f"Upload failed: {upload_response.text}")
	            upload_response.raise_for_status()

	        print("File uploaded successfully")

	        # Step 3: Poll until the job finishes, fails, or we run out of attempts.
	        print(f"Waiting for job {job_id} to complete...")

	        for attempt in range(max_attempts):
	            status_response = requests.get(
	                f"{base_url}/jobs/{job_id}",
	                headers=headers,
	                timeout=request_timeout,
	            )
	            status_response.raise_for_status()
	            job_status = status_response.json()
	            status = job_status["data"]["status"]

	            print(f"Job status: {status}")

	            if status == "finished":
	                print("Conversion completed successfully")
	                break

	            if status == "error":
	                # Collect task-level errors; they usually carry the real cause.
	                task_errors = []
	                for task in job_status.get("data", {}).get("tasks", []):
	                    if task.get("status") == "error":
	                        task_error = task.get("message") or "Unknown task error"
	                        print(f"Task {task.get('operation')} error: {task_error}")
	                        task_errors.append(task_error)

	                # Prefer a job-level message if present, otherwise surface the
	                # task errors instead of a bare "Unknown error".
	                error_msg = (
	                    job_status["data"].get("message")
	                    or "; ".join(task_errors)
	                    or "Unknown error"
	                )
	                print(f"Conversion failed: {error_msg}")
	                raise Exception(f"Conversion failed: {error_msg}")

	            # No point sleeping after the final check; we are about to time out.
	            if attempt < max_attempts - 1:
	                time.sleep(poll_interval)
	        else:
	            # Loop ended without `break`: the job never reached "finished".
	            raise Exception("Conversion timeout - job took too long")

	        # Step 4: Download the converted text from the finished export task.
	        for task in job_status["data"]["tasks"]:
	            if task["operation"] == "export/url" and task["status"] == "finished":
	                download_url = task["result"]["files"][0]["url"]
	                print(f"Downloading result from: {download_url}")

	                download_response = requests.get(
	                    download_url, timeout=request_timeout
	                )
	                download_response.raise_for_status()

	                text_content = _decode_text_response(download_response)
	                print(f"Downloaded {len(text_content)} characters")
	                return text_content

	        raise Exception("No converted file found in completed job")

	    except requests.exceptions.RequestException as e:
	        print(f"HTTP error: {e}")
	        raise Exception(f"CloudConvert HTTP error: {str(e)}") from e
	    except Exception as e:
	        print(f"General error: {e}")
	        raise Exception(f"CloudConvert error: {str(e)}") from e

	@spaces.GPU
	def convert_pages_document(file, output_format, progress=gr.Progress()):
	    """Convert an uploaded .pages document to the requested output format.

	    Pipeline: CloudConvert extracts plain text from the .pages file, the
	    Cerebras-hosted chat model (module-level ``client``) reformats that text
	    for ``output_format``, and the result is written to a temporary file.

	    Args:
	        file: The upload from the Gradio File input; either a path string or
	            an object exposing the path as ``.name``. Falsy when nothing was
	            uploaded.
	        output_format: One of the format labels offered by the UI
	            ("PDF", "DOCX", "TXT", "HTML", "Markdown").
	        progress: Gradio progress tracker used to report pipeline stages.

	    Returns:
	        A ``(output_path, status_message)`` tuple. ``output_path`` is ``None``
	        on any failure; this function reports errors through the status
	        message instead of raising.
	    """
	    # Local import keeps this block self-contained; used to escape error text
	    # because the status message is rendered by an HTML component.
	    import html

	    if not file:
	        return None, "❌ Please upload a .pages file"

	    if not cloudconvert_token:
	        return None, "❌ CloudConvert API key not configured. Please add CLOUDCONVERT_API_KEY to secrets."

	    try:
	        progress(0.1, desc="📤 Converting with CloudConvert...")

	        # NOTE(review): depending on the Gradio version/config the File input
	        # yields a path string or a tempfile-like wrapper; accept both.
	        file_path = file if isinstance(file, str) else file.name

	        # Use CloudConvert to extract text from .pages file
	        print(f"Converting file: {file_path}")
	        text_content = convert_pages_to_text(file_path, cloudconvert_token)

	        # Fewer than 10 non-whitespace-trimmed characters is treated as a
	        # failed extraction rather than a real document.
	        if not text_content or len(text_content.strip()) < 10:
	            return None, "❌ Could not extract content from .pages file"

	        print(f"Extracted text preview: {text_content[:200]}...")

	        progress(0.5, desc="🤖 Converting format with Cerebras AI...")

	        # Create format-specific prompt
	        prompt = create_conversion_prompt(text_content, output_format)

	        progress(0.7, desc="⚡ Processing with ZeroGPU...")

	        # Convert using Cerebras. Kept in its own try so model failures get a
	        # distinct message from extraction/file errors.
	        try:
	            messages = [{"role": "user", "content": prompt}]
	            response = client.chat_completion(
	                messages=messages,
	                max_tokens=4096,
	                temperature=0.1,
	            )
	            converted_text = response.choices[0].message.content
	        except Exception as e:
	            print(f"Cerebras error: {e}")
	            return None, f"❌ AI conversion error: {html.escape(str(e))}"

	        # Guard against an empty/None completion instead of failing later
	        # with an opaque AttributeError while writing the file.
	        if not converted_text or not converted_text.strip():
	            return None, "❌ AI conversion returned no content"

	        progress(0.9, desc="💾 Creating output file...")

	        # Create output file
	        output_path = create_output_file(converted_text, output_format)

	        progress(1.0, desc="✅ Conversion complete!")

	        return output_path, f"✅ Successfully converted to {output_format}!"

	    except Exception as e:
	        # Top-level UI boundary: log and report rather than crash the request.
	        print(f"Conversion error: {e}")
	        return None, f"❌ Error: {html.escape(str(e))}"

	def create_conversion_prompt(content, output_format):
	    """Build the chat prompt asking the model to reformat ``content``.

	    The target format name is interpolated three times and the document text
	    once; the instructions tell the model to keep all content and change
	    formatting only.
	    """
	    template = """You are a document formatter. Convert the following text to {output_format} format.

	IMPORTANT:
	1. Keep ALL original content - do not summarize or remove text
	2. Only adjust formatting for {output_format}
	3. Preserve all important information, names, and details

	Original text:
	{content}

	Formatted {output_format} output:"""
	    # str.format only parses the template, so braces inside `content` are
	    # inserted verbatim, exactly as an f-string would.
	    return template.format(content=content, output_format=output_format)

	def _new_temp_path(suffix):
	    """Create an empty temporary file, close it, and return its path.

	    Closing the handle before another library writes to the path avoids
	    having two writers on one file (which fails outright on some platforms).
	    """
	    fd, path = tempfile.mkstemp(suffix=suffix)
	    os.close(fd)
	    return path


	def create_output_file(content, output_format):
	    """Write ``content`` to a new temporary file in the requested format.

	    Args:
	        content: The text to store; surrounding whitespace is stripped.
	        output_format: "PDF" and "DOCX" are rendered with reportlab and
	            python-docx respectively; "TXT", "HTML" and "Markdown" are
	            written verbatim as UTF-8 with a matching extension. Any other
	            value falls back to a ``.txt`` file.

	    Returns:
	        Path of the created file. The file is not deleted automatically.
	    """
	    content = content.strip()

	    if output_format == "PDF":
	        from reportlab.pdfgen import canvas
	        from reportlab.lib.pagesizes import letter
	        import textwrap

	        path = _new_temp_path(".pdf")
	        pdf = canvas.Canvas(path, pagesize=letter)
	        _, height = letter
	        y = height - 50

	        # Normalise line endings so CRLF text still splits into paragraphs.
	        normalized = content.replace("\r\n", "\n").replace("\r", "\n")
	        for paragraph in normalized.split("\n\n"):
	            if not paragraph.strip():
	                continue
	            # Wrap each source line separately so single line breaks (lists,
	            # headings) are kept instead of being merged into one run of text.
	            for source_line in paragraph.strip().split("\n"):
	                for line in textwrap.wrap(source_line.strip(), width=90):
	                    if y < 50:
	                        # Out of vertical room: start a new page at the top.
	                        pdf.showPage()
	                        y = height - 50
	                    pdf.drawString(50, y, line)
	                    y -= 20
	            y -= 10  # Space between paragraphs

	        pdf.save()
	        return path

	    elif output_format == "DOCX":
	        from docx import Document

	        path = _new_temp_path(".docx")
	        doc = Document()

	        # One Word paragraph per blank-line-separated block of text.
	        normalized = content.replace("\r\n", "\n").replace("\r", "\n")
	        for paragraph in normalized.split("\n\n"):
	            if paragraph.strip():
	                doc.add_paragraph(paragraph.strip())

	        doc.save(path)
	        return path

	    else:
	        # For TXT, HTML, Markdown (and unknown formats, which default to .txt)
	        ext_map = {"TXT": ".txt", "HTML": ".html", "Markdown": ".md"}
	        ext = ext_map.get(output_format, ".txt")

	        path = _new_temp_path(ext)
	        with open(path, "w", encoding="utf-8") as f:
	            f.write(content)
	        return path

	# Create the Gradio interface.
	# Everything below builds the `app` Blocks object at import time: a header,
	# an API-key status banner, the upload/format/convert controls, the output
	# widgets, the click wiring and a footer.
	# NOTE(review): this copy of the file has lost its indentation, so the exact
	# nesting of the `with` blocks is inferred from statement order - confirm
	# against the original source.
	with gr.Blocks(title="Pages Converter Pro - CloudConvert", theme=gr.themes.Soft()) as app:
	# Header: static title banner rendered as raw HTML.
	gr.HTML("""
	<div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; border-radius: 1rem; margin-bottom: 2rem;">
	<h1>📄 Pages Converter Pro</h1>
	<p>Convert Apple Pages documents using CloudConvert + Cerebras AI</p>
	<p style="font-size: 0.9em; opacity: 0.9;">✨ Professional .pages parsing + AI-powered format conversion</p>
	</div>
	""")

	# Status indicator: background, text colour and message each switch on
	# whether cloudconvert_token is truthy. The f-string is evaluated once when
	# the UI is built, so it only reflects whether a key is configured - no
	# request is made to CloudConvert here.
	with gr.Row():
	gr.HTML(f"""
	<div style="background: {'#d4edda' if cloudconvert_token else '#f8d7da'}; color: {'#155724' if cloudconvert_token else '#721c24'}; padding: 1rem; border-radius: 0.5rem; text-align: center;">
	<strong>CloudConvert API:</strong> {'✅ Connected and Ready' if cloudconvert_token else '❌ API Key Missing'}
	</div>
	""")

	# Main interface: a scale=2 column with the inputs and a scale=1 column with
	# static feature/how-it-works text.
	with gr.Row():
	with gr.Column(scale=2):
	gr.HTML("<h3>📎 Upload & Convert</h3>")

	# Upload widget restricted to the .pages extension.
	file_input = gr.File(
	label="Select .pages file",
	file_types=[".pages"]
	)

	# Target format selector; these labels are the values passed to
	# convert_pages_document as output_format. PDF is preselected.
	output_format = gr.Radio(
	choices=["PDF", "DOCX", "TXT", "HTML", "Markdown"],
	value="PDF",
	label="🎯 Output Format"
	)

	# Button that triggers the conversion (wired up further below).
	convert_btn = gr.Button(
	"🚀 Convert Document",
	variant="primary",
	size="lg"
	)

	# Informational side panel; static HTML only.
	with gr.Column(scale=1):
	gr.HTML("""
	<div style="background: white; padding: 1.5rem; border-radius: 1rem; box-shadow: 0 5px 15px rgba(0,0,0,0.1);">
	<h3>✨ Features</h3>
	<ul style="color: #666;">
	<li>✅ <strong>100% reliable</strong> .pages parsing</li>
	<li>⚡ ZeroGPU acceleration</li>
	<li>🤖 AI-powered formatting</li>
	<li>🎨 Professional output quality</li>
	<li>🔒 Secure processing</li>
	</ul>

	<div style="background: #f5f5f5; padding: 1rem; border-radius: 0.5rem; margin-top: 1rem;">
	<h4 style="margin-top: 0;">💡 How it works:</h4>
	<ol style="font-size: 0.9em; color: #555; margin-bottom: 0;">
	<li>CloudConvert extracts text from .pages</li>
	<li>Cerebras AI formats for your chosen output</li>
	<li>Download your professionally converted file</li>
	</ol>
	</div>
	</div>
	""")

	# Output section: receives the first element (file path or None) returned
	# by convert_pages_document.
	with gr.Row():
	output_file = gr.File(
	label="📁 Download Your Converted File"
	)

	# Status area: starts with a hint and later shows the second element
	# (status message string) returned by convert_pages_document.
	with gr.Row():
	status_html = gr.HTML(
	value="<div style='text-align: center; padding: 1rem; color: #666; background: #f8f9fa; border-radius: 0.5rem;'>Upload a .pages file to get started</div>"
	)

	# Connect the interface: clicking the button calls convert_pages_document
	# with (file_input, output_format) and routes its two return values to
	# (output_file, status_html), in that order.
	convert_btn.click(
	fn=convert_pages_document,
	inputs=[file_input, output_format],
	outputs=[output_file, status_html],
	show_progress=True
	)

	# Footer: static description of the technology stack.
	gr.HTML("""
	<div style="text-align: center; margin-top: 2rem; padding: 1rem; background: #f8f9fa; border-radius: 0.5rem;">
	<p style="margin-bottom: 0.5rem;">🔧 <strong>Technical Stack:</strong></p>
	<p style="font-size: 0.9em; color: #666; margin-bottom: 0;">
	CloudConvert API for reliable .pages parsing • HuggingFace ZeroGPU for AI processing • Cerebras for lightning-fast inference
	</p>
	</div>
	""")

	# Launch the app only when this file is run as a script, not when imported.
	if __name__ == "__main__":
	app.launch()