import inspect
from typing import Any, Callable, get_type_hints

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig

model_name = "wolfofbackstreet/SmolLM2-135M-int4-qptq-v2"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Optional GPTQ configuration (the checkpoint is already quantized, so this is
# only needed to override defaults such as disabling the ExLlama kernels):
# gptq_config = GPTQConfig(bits=4, use_exllama=False, use_cuda_fp16=False)

# Load pre-quantized model on CPU
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cpu"  # Explicitly enforce CPU execution
    # quantization_config=gptq_config,
)
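
# A quick CPU smoke test (hypothetical prompt, kept commented out so it does
# not slow startup) to confirm the model generates before wiring up the UI:
# ids = tokenizer("Hello", return_tensors="pt")
# print(tokenizer.decode(model.generate(**ids, max_new_tokens=5)[0]))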


def parse_docstring(func):
    """Extract 'Title:' and 'Description:' lines from a function's docstring."""
    doc = inspect.getdoc(func)
    if not doc:
        return {"title": "Untitled", "description": ""}

    lines = doc.splitlines()
    title = next((line.replace("Title:", "").strip() for line in lines if line.startswith("Title:")), "Untitled")
    description = "\n".join(line.strip() for line in lines if line.startswith("Description:"))
    description = description.replace("Description:", "").strip()

    return {"title": title, "description": description}

def gradio_app_with_docs(func: Callable) -> Callable:
    """
    A decorator that automatically builds and launches a Gradio interface
    based on function type hints.

    Args:
        func: A callable with type-hinted parameters and return type.

    Returns:
        The wrapped function with a `.launch()` method to start the app.
    """
    metadata = parse_docstring(func)

    # Infer Gradio components from type hints
    def _map_type(t: type) -> gr.components.Component:
        if t is str:
            return gr.Textbox()
        elif t is int:
            return gr.Number(precision=0)
        elif t is float:
            return gr.Number()
        elif t is bool:
            return gr.Checkbox()
        elif hasattr(t, "__origin__") and t.__origin__ is list:  # Handle List[type]
            elem_type = t.__args__[0]
            if elem_type is str:
                return gr.Dropdown(choices=["Option1", "Option2"])  # placeholder choices
            else:
                raise ValueError(f"Unsupported list element type: {elem_type}")
        else:
            raise ValueError(f"Unsupported type: {t}")

    # Extract function signature and type hints
    sig = inspect.signature(func)
    type_hints = get_type_hints(func)

    # Map parameters to Gradio inputs
    inputs = []
    for name, param in sig.parameters.items():
        if name == "self":
            continue  # Skip self in class methods
        param_type = type_hints.get(name, Any)
        component = _map_type(param_type)
        component.label = name.replace("_", " ").title()
        inputs.append(component)

    # Map return type to Gradio output
    return_type = type_hints.get("return", Any)
    outputs = _map_type(return_type)
    outputs.label = "Output"

    # Wrap the function with an auto-generated Gradio interface
    interface = gr.Interface(fn=func, inputs=inputs, outputs=outputs)

    # Compose the final app: a Markdown header followed by the interface
    with gr.Blocks() as demo:
        gr.Markdown(f"## {metadata['title']}\n{metadata['description']}")
        interface.render()

    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)

    wrapper.launch = lambda **launch_kwargs: demo.launch(**launch_kwargs)
    return wrapper
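
# Sketch of what the decorator infers (hypothetical function, not used below):
#     @gradio_app_with_docs
#     def repeat(text: str, times: int) -> str: ...
# would map to a Textbox and an integer Number as inputs, plus a Textbox output.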


@gradio_app_with_docs
def generate_response(prompt: str) -> str:
    """
    Title: Super Tiny GPTQ V2 Model on CPU
    Description: A simple app to test the potential of a small GPTQ LLM.

    Args:
        prompt (str): A simple prompt.

    Returns:
        str: Simplified response.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")  # Move inputs to CPU
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,  # required for temperature/top_p to take effect
        temperature=0.7,
        top_p=0.9,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
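
# For reproducible output, a greedy variant (an alternative, not the app's
# current behavior) would drop do_sample/temperature/top_p and keep only
# max_new_tokens.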

# Example usage (bypasses the UI; the wrapper still calls the raw function):
# prompt = "Explain quantum computing in simple terms."
# response = generate_response(prompt)
# print(response)


if __name__ == "__main__":
    generate_response.launch()