#!/bin/bash
# Test script for TEI with the converted ONNX model
echo "Testing Qwen3-Embedding-0.6B INT8 ONNX with TEI..."
echo "This model is quantized for faster CPU inference"
MODEL_PATH=$(pwd)
echo "Model path: $MODEL_PATH"
echo "Files in model directory:"
ls -la "$MODEL_PATH"
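# Hedged sanity check: the filename below is an assumption -- adjust it if the
# conversion wrote the quantized graph under a different name or subdirectory.
if [ ! -f "$MODEL_PATH/model_quantized.onnx" ] && [ ! -f "$MODEL_PATH/onnx/model_quantized.onnx" ]; then
  echo "Warning: model_quantized.onnx not found -- check the conversion output"
fi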
echo ""
echo "Expected performance improvement: 2-4x faster on CPU"
echo "Note: There may be a small accuracy drop (1-3%)"
echo ""
echo "To use this model with TEI:"
echo "1. Upload to HuggingFace Hub, or"
echo "2. Mount this directory in your TEI container"
echo "3. Update model-id in porter.yaml to point to this model"
echo ""
echo "For optimal CPU performance, set these environment variables:"
echo "export OMP_NUM_THREADS=$(nproc) # Use all physical cores"
echo "export KMP_AFFINITY=granularity=fine,compact,1,0"
echo "export ORT_THREAD_POOL_SIZE=$(nproc)"