Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Alina Lozovskaya
committed on
Commit
·
d1ed69b
1
Parent(s):
4111351
First commit
Browse files- .gitignore +45 -0
- Dockerfile +30 -0
- app.py +152 -0
- pyproject.toml +26 -0
- uv.lock +0 -0
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# App-generated artifacts and local scratch files
logs/
log_backups/
plots/
files/
.gradio/
fr.sh
repo_contents.txt
# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Use Python 3.12.1 slim image as base
FROM python:3.12.1-slim

# Install dependencies required for UV and Python packages
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl ca-certificates git && \
    rm -rf /var/lib/apt/lists/*

# Install UV (fast Python dependency manager)
RUN curl -LsSf https://astral.sh/uv/install.sh | sh

# Ensure UV is available in PATH
ENV PATH="/root/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy the project manifest AND the committed lockfile so `uv sync --frozen`
# installs the exact resolved dependency set instead of re-resolving on every
# build (the repo ships uv.lock, but it was never copied into the image).
COPY pyproject.toml uv.lock ./
RUN uv venv && uv sync --frozen

# Copy application code last so source edits don't invalidate the cached
# dependency layer above
COPY app.py .

# Expose Gradio app port
EXPOSE 7860
ENV GRADIO_SERVER_NAME="0.0.0.0"

# Entrypoint to run the Gradio app
ENTRYPOINT ["uv", "run", "python", "app.py"]
app.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
import pathlib
|
| 4 |
+
import threading
|
| 5 |
+
import shutil
|
| 6 |
+
import gradio as gr
|
| 7 |
+
import yaml
|
| 8 |
+
import io
|
| 9 |
+
|
| 10 |
+
from loguru import logger
|
| 11 |
+
from yourbench.pipeline import run_pipeline
|
| 12 |
+
|
# Directory where user-uploaded documents are stored; created eagerly so the
# Gradio FileExplorer below always has a valid root to browse.
UPLOAD_DIRECTORY = pathlib.Path("/app/uploaded_files")
UPLOAD_DIRECTORY.mkdir(parents=True, exist_ok=True)

# Path the generated YAML config is written to (save_config) and read from
# when the pipeline is launched.
CONFIG_PATH = pathlib.Path("/app/yourbench_config.yml")

# In-memory buffer that accumulates yourbench log lines for live display in the UI.
yourbench_log_stream = io.StringIO()
def custom_log_handler(message):
    """Loguru sink: append one log record to the shared in-memory buffer.

    NOTE(review): loguru sinks typically receive messages that already end
    with a newline, so the extra "\n" appended here may double-space the
    displayed log — confirm against the loguru sink docs.
    """
    yourbench_log_stream.write(f"{message}\n")
def get_log_content():
    """Return all text accumulated in the yourbench log buffer so far.

    Rewinds the shared StringIO and reads it fully; after the read the stream
    position sits at the end again, so subsequent sink writes still append.
    Called by the UI timer every 0.5 s to refresh the log panel.

    Fix: removed the leftover debug `print(len(content))`, which wrote to
    stdout on every timer tick.
    """
    yourbench_log_stream.seek(0)
    return yourbench_log_stream.read()
# Route yourbench's loguru records into the in-memory buffer so the UI can
# poll them; the filter keeps this app's own logging out of the panel.
logger.add(custom_log_handler, filter="yourbench")
def start_task():
    """Launch the yourbench pipeline in a background daemon thread.

    The thread is deliberately NOT joined: the original code called
    `task_thread.join()` right after `start()`, which blocked the Gradio
    callback until the whole pipeline finished — defeating the stated purpose
    of running it in a separate thread. The UI's 0.5 s timer polls the log
    buffer for progress instead, so the callback can return immediately.
    """
    task_thread = threading.Thread(target=run_pipeline, args=(CONFIG_PATH,), daemon=True)
    task_thread.start()
def generate_config(
    hf_token,
    hf_org,
    model_name,
    provider,
    base_url,
    api_key,
    max_concurrent_requests,
    ingestion_source,
    ingestion_output,
    run_ingestion,
    summarization_source,
    summarization_output,
    run_summarization
):
    """Build a YourBench YAML configuration string from the UI form values.

    Assembles the HF credentials section, a single-entry model list, and the
    ingestion/summarization pipeline stages, then serializes the whole
    mapping with yaml.dump in block style.
    """
    hf_section = {
        "token": hf_token,
        "private": True,
        "hf_organization": hf_org,
    }
    model_entry = {
        "model_name": model_name,
        "provider": provider,
        "base_url": base_url,
        "api_key": api_key,
        "max_concurrent_requests": max_concurrent_requests,
    }
    pipeline_section = {
        "ingestion": {
            "source_documents_dir": ingestion_source,
            "output_dir": ingestion_output,
            "run": run_ingestion,
        },
        "summarization": {
            "source_dataset_name": summarization_source,
            "output_dataset_name": summarization_output,
            "run": run_summarization,
        },
    }
    config = {
        "hf_configuration": hf_section,
        "model_list": [model_entry],
        "pipeline": pipeline_section,
    }
    return yaml.dump(config, default_flow_style=False)
def save_config(yaml_text):
    """Persist the YAML text from the UI to CONFIG_PATH.

    Writes with an explicit UTF-8 encoding so the file (and the emoji in the
    status message below) round-trip regardless of the container's locale.

    Fix: the old success message claimed the file was saved as "config.yaml",
    but the path actually written is yourbench_config.yml — report the real
    filename instead of a hard-coded wrong one.
    """
    CONFIG_PATH.write_text(yaml_text, encoding="utf-8")
    return f"✅ Config saved as {CONFIG_PATH.name}!"
def save_files(files: list[str]):
    """Move uploaded temp files into UPLOAD_DIRECTORY and report where they went.

    Gradio hands us the temp-file paths of the uploads; each one is moved
    into the shared upload directory under its original basename.
    """
    stored = []
    for uploaded in files:
        source = pathlib.Path(uploaded)
        destination = UPLOAD_DIRECTORY / source.name
        shutil.move(str(source), str(destination))
        stored.append(str(destination))
    return f"Files have been successfully saved to: {', '.join(stored)}"
def start_youbench():
    """Run the full yourbench pipeline synchronously with the saved config.

    NOTE(review): the name looks like a typo of "yourbench", but it is kept
    unchanged so any existing callers keep working.
    """
    run_pipeline(CONFIG_PATH, debug=False)
# --- Gradio UI -------------------------------------------------------------
app = gr.Blocks()

with app:
    gr.Markdown("## YourBench Configuration")

    # Credentials feeding the generated config's hf_configuration section.
    with gr.Tab("HF Configuration"):
        hf_token = gr.Textbox(label="HF Token")
        hf_org = gr.Textbox(label="HF Organization")

    # Settings for the single entry in the config's model_list.
    with gr.Tab("Model Settings"):
        model_name = gr.Textbox(label="Model Name")
        provider = gr.Dropdown(["openrouter", "openai", "huggingface"], value="huggingface", label="Provider")
        base_url = gr.Textbox(label="Base URL")
        api_key = gr.Textbox(label="API Key")
        max_concurrent_requests = gr.Dropdown([8, 16, 32], value=16, label="Max Concurrent Requests")

    # Per-stage paths and run toggles for the config's pipeline section.
    with gr.Tab("Pipeline Stages"):
        ingestion_source = gr.Textbox(label="Ingestion Source Directory")
        ingestion_output = gr.Textbox(label="Ingestion Output Directory")
        run_ingestion = gr.Checkbox(label="Run Ingestion", value=False)
        summarization_source = gr.Textbox(label="Summarization Source Dataset")
        summarization_output = gr.Textbox(label="Summarization Output Dataset")
        run_summarization = gr.Checkbox(label="Run Summarization", value=False)

    # Preview the YAML (generate_config) and persist it (save_config).
    with gr.Tab("Config"):
        config_output = gr.Code(label="Generated Config", language="yaml")
        preview_button = gr.Button("Generate Config")
        save_button = gr.Button("Save Config")

        preview_button.click(generate_config,
                             inputs=[hf_token, hf_org, model_name, provider, base_url, api_key,
                                     max_concurrent_requests, ingestion_source, ingestion_output,
                                     run_ingestion, summarization_source, summarization_output, run_summarization],
                             outputs=config_output)

        save_button.click(save_config, inputs=[config_output], outputs=[gr.Textbox(label="Save Status")])

    # Upload source documents into UPLOAD_DIRECTORY; save_files moves them
    # out of Gradio's temp dir and the explorer shows what is already there.
    with gr.Tab("Files"):
        file_input = gr.File(label="Upload text files", file_count="multiple", file_types=[".txt", ".md", ".html"])
        file_explorer = gr.FileExplorer(root_dir=UPLOAD_DIRECTORY, interactive=False, label="Current Files")
        output = gr.Textbox(label="Log")
        file_input.upload(save_files, file_input, output)


    # Start the pipeline thread and poll the in-memory log buffer every 0.5 s.
    with gr.Tab("Run Generation"):
        log_output = gr.Code(label="Log Output", language=None,lines=20, interactive=False)
        start_button = gr.Button("Start Long-Running Task")
        timer = gr.Timer(0.5, active=True)
        timer.tick(get_log_content, outputs=log_output)
        start_button.click(start_task)

app.launch()
[project]
name = "yourbench-space"
version = "0.1.0"
requires-python = ">=3.12, <3.13"

dependencies = [
    "yourbench @ git+https://github.com/huggingface/yourbench.git@v0.2-alpha-summarization",
    # NOTE: the PyPI "asyncio" package was removed from this list. asyncio is
    # part of the standard library; the third-party distribution is a stale
    # Python-3.4-era snapshot that can shadow/conflict with the real stdlib
    # module on Python 3.12.
    "datasets>=3.3.0",
    "gradio>=5.20.0",
    "hf-transfer>=0.1.9",
    "langfuse>=2.59.3",
    "litellm>=1.61.16",
    "loguru>=0.7.3",
    "markitdown>=0.0.1a4",
    "matplotlib>=3.10.0",
    "openai>=1.63.0",
    "python-dotenv>=1.0.1",
    "torch>=2.6.0",
    "tqdm>=4.67.1",
    "transformers>=4.48.3",
]

[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|