#!/bin/bash
# MLX Model Conversion Utility for Dragon M3 Ultra
# Updated: January 2025 for MLX 0.26+ and modern uv workflow
# Supports Q5 quantization and M3 Ultra optimizations
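# Added safety net: abort on errors and unset variables
# (drop this line if you prefer the script to continue after a failed step)
set -euo pipefail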
# Text formatting
BOLD="\033[1m"
BLUE="\033[34m"
GREEN="\033[32m"
YELLOW="\033[33m"
RED="\033[31m"
CYAN="\033[36m"
MAGENTA="\033[35m"
RESET="\033[0m"
# Detect system specs
TOTAL_MEMORY=$(sysctl -n hw.memsize 2>/dev/null || echo 0)
TOTAL_MEMORY_GB=$((TOTAL_MEMORY / 1073741824))
CPU_BRAND=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")
# Check if running on M3 Ultra
if [[ "$CPU_BRAND" == *"M3 Ultra"* ]] || [[ "$TOTAL_MEMORY_GB" -ge 400 ]]; then
IS_M3_ULTRA=true
echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra detected! (${TOTAL_MEMORY_GB}GB RAM)${RESET}"
else
IS_M3_ULTRA=false
fi
echo -e "${BOLD}${BLUE}=====================================${RESET}"
echo -e "${BOLD}${BLUE} MLX Model Conversion Utility v2.0 ${RESET}"
echo -e "${BOLD}${BLUE}=====================================${RESET}"
echo -e "Updated for MLX 0.26+ with Q5 support and M3 Ultra optimizations\n"
# Default values
DEFAULT_HF_PATH="meta-llama/Llama-3.1-405B"
DEFAULT_OUTPUT_DIR="models/Llama-3.1-405B-MLX-Q5"
DEFAULT_QUANTIZE="y"
DEFAULT_BITS="5" # Changed to Q5 as default for better quality/size ratio
DEFAULT_GROUP_SIZE="64"
DEFAULT_DTYPE="float16"
# hf-xet tuning for Dragon M3 Ultra
# (note: supported variable names vary across hf-xet versions - check the hf-xet docs)
if [[ "$IS_M3_ULTRA" == true ]]; then
export HF_XET_HIGH_PERFORMANCE_MODE=1
export HF_XET_CHUNK_CACHE_SIZE_BYTES=107374182400 # 100GB cache
export HF_XET_CONCURRENT_DOWNLOADS=32
echo -e "${CYAN}✓ hf-xet optimizations enabled for Dragon${RESET}"
fi
# Get HF Path
echo -e "${BOLD}Hugging Face model path or local directory:${RESET}"
echo -e "(Default: ${DEFAULT_HF_PATH})"
echo -e "${CYAN}Examples:${RESET}"
echo -e " HF repo: meta-llama/Llama-3.1-405B"
echo -e " Local: /Users/polyversai/.lmstudio/models/mlx-community/model-name"
read -p "> " HF_PATH
HF_PATH=${HF_PATH:-$DEFAULT_HF_PATH}
# Check if it's a local path
if [[ -d "$HF_PATH" ]]; then
echo -e "${GREEN}✓ Local model detected: ${HF_PATH}${RESET}"
IS_LOCAL=true
else
IS_LOCAL=false
# Ask about hf-xet for remote models
echo -e "\n${BOLD}Use hf-xet for faster download? [y/n]${RESET}"
echo -e "(10x faster downloads with chunk deduplication)"
echo -e "Default: y"
read -p "> " USE_HF_XET
USE_HF_XET=${USE_HF_XET:-y}
if [[ "$USE_HF_XET" == "y" || "$USE_HF_XET" == "Y" ]]; then
# Check if hf-xet is installed
if ! uv run python -c "import hf_xet" 2>/dev/null; then
echo -e "${YELLOW}⚠️ hf-xet not installed. Installing...${RESET}"
echo -e "Run: uv add 'huggingface_hub[hf_xet]'"
echo -e "${CYAN}Note: hf-xet only works with Xet-backed repos${RESET}"
else
echo -e "${GREEN}✓ hf-xet enabled for download${RESET}"
fi
fi
fi
# Get output directory
echo -e "\n${BOLD}Output MLX model directory:${RESET}"
echo -e "(Default: ${DEFAULT_OUTPUT_DIR})"
read -p "> " MLX_PATH
MLX_PATH=${MLX_PATH:-$DEFAULT_OUTPUT_DIR}
# Ask about data type
echo -e "\n${BOLD}Model data type:${RESET}"
echo -e "(Default: ${DEFAULT_DTYPE}, Options: float16, bfloat16, float32)"
read -p "> " DTYPE
DTYPE=${DTYPE:-$DEFAULT_DTYPE}
# Ask about quantization
echo -e "\n${BOLD}Quantize the model? [y/n]${RESET}"
echo -e "(Default: ${DEFAULT_QUANTIZE})"
read -p "> " QUANTIZE
QUANTIZE=${QUANTIZE:-$DEFAULT_QUANTIZE}
# If quantizing, get more details
if [[ "$QUANTIZE" == "y" || "$QUANTIZE" == "Y" ]]; then
echo -e "\n${BOLD}Quantization bits:${RESET}"
echo -e "${CYAN}Options:${RESET}"
echo -e " 2 - Extreme compression (lowest quality)"
echo -e " 3 - High compression"
echo -e " 4 - Standard compression (good balance)"
echo -e " ${GREEN}5 - Recommended (best quality/size ratio)${RESET}"
echo -e " 6 - Low compression"
echo -e " 8 - Minimal compression (highest quality)"
echo -e "(Default: ${DEFAULT_BITS})"
read -p "> " BITS
BITS=${BITS:-$DEFAULT_BITS}
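# Sanity check (added): fall back to the default if the input isn't a supported
# bit width; this also guarantees the size estimates below always resolve
case "$BITS" in
2|3|4|5|6|8) ;;
*)
echo -e "${YELLOW}Unsupported bit width '${BITS}' - using ${DEFAULT_BITS}${RESET}"
BITS=$DEFAULT_BITS
;;
esac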
echo -e "\n${BOLD}Group size:${RESET}"
echo -e "(Default: ${DEFAULT_GROUP_SIZE}, Options: 32, 64, 128)"
if [[ "$IS_M3_ULTRA" == true ]]; then
echo -e "${CYAN}💡 M3 Ultra tip: Use 64 or 128 for better performance${RESET}"
fi
read -p "> " GROUP_SIZE
GROUP_SIZE=${GROUP_SIZE:-$DEFAULT_GROUP_SIZE}
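# Same sanity check (added) for the group size
case "$GROUP_SIZE" in
32|64|128) ;;
*)
echo -e "${YELLOW}Unsupported group size '${GROUP_SIZE}' - using ${DEFAULT_GROUP_SIZE}${RESET}"
GROUP_SIZE=$DEFAULT_GROUP_SIZE
;;
esac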
echo -e "\n${BOLD}Quantization strategy:${RESET}"
echo -e "${CYAN}Options:${RESET}"
echo -e " none - Uniform quantization (default)"
echo -e " mixed_2_6 - Mix of 2 and 6 bit"
echo -e " ${GREEN}mixed_3_4 - Mix of 3 and 4 bit${RESET}"
echo -e " mixed_3_6 - Mix of 3 and 6 bit"
echo -e " mixed_4_6 - Mix of 4 and 6 bit"
echo -e "Leave empty for uniform quantization"
read -p "> " QUANT_PREDICATE
QUANT_OPTIONS="-q --q-bits ${BITS} --q-group-size ${GROUP_SIZE}"
if [[ -n "$QUANT_PREDICATE" ]]; then
QUANT_OPTIONS="${QUANT_OPTIONS} --quant-predicate ${QUANT_PREDICATE}"
fi
else
QUANT_OPTIONS=""
fi
# Memory note for M3 Ultra (MLX needs no extra conversion flags; unified memory is used automatically)
if [[ "$IS_M3_ULTRA" == true ]]; then
echo -e "\n${BOLD}${MAGENTA}M3 Ultra optimization note:${RESET}"
echo -e "${CYAN}MLX will automatically optimize for your ${TOTAL_MEMORY_GB}GB system${RESET}"
echo -e "${CYAN}The framework uses unified memory efficiently${RESET}"
fi
# Ask about upload repository (optional)
echo -e "\n${BOLD}Upload to Hugging Face Hub? (optional):${RESET}"
echo -e "(Leave empty to skip upload)"
read -p "> " UPLOAD_REPO
if [[ -n "$UPLOAD_REPO" ]]; then
UPLOAD_OPTION="--upload-repo ${UPLOAD_REPO}"
else
UPLOAD_OPTION=""
fi
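# Light sanity check (added): Hub repo ids look like "user/repo-name"
if [[ -n "$UPLOAD_REPO" && ! "$UPLOAD_REPO" =~ ^[^/]+/[^/]+$ ]]; then
echo -e "${YELLOW}⚠️ '${UPLOAD_REPO}' doesn't look like user/repo - double-check before uploading${RESET}"
fi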
# Build the command - UV is the default; paths are quoted in case they contain spaces
UV_CMD="uv run mlx_lm.convert --hf-path \"${HF_PATH}\" --mlx-path \"${MLX_PATH}\" --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}"
# Alternative commands
DIRECT_CMD="mlx_lm.convert --hf-path \"${HF_PATH}\" --mlx-path \"${MLX_PATH}\" --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}"
PYTHON_CMD="python -m mlx_lm.convert --hf-path \"${HF_PATH}\" --mlx-path \"${MLX_PATH}\" --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}"
# Print the preview
echo -e "\n${BOLD}${YELLOW}Command Preview:${RESET}"
echo -e "$UV_CMD"
# Expected outcomes based on options
echo -e "\n${BOLD}${YELLOW}Expected outcomes:${RESET}"
if [[ "$QUANTIZE" == "y" || "$QUANTIZE" == "Y" ]]; then
MODEL_SIZE_GB=810 # ~405B params x 2 bytes (float16); scale down for smaller models
case "$BITS" in
2)
EXPECTED_SIZE=$((MODEL_SIZE_GB / 8))
echo -e "- ${GREEN}Q2: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
echo -e "- ${YELLOW}⚠️ Significant quality loss expected${RESET}"
;;
3)
EXPECTED_SIZE=$((MODEL_SIZE_GB * 3 / 16))
echo -e "- ${GREEN}Q3: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
echo -e "- ${YELLOW}Moderate quality loss${RESET}"
;;
4)
EXPECTED_SIZE=$((MODEL_SIZE_GB / 4))
echo -e "- ${GREEN}Q4: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
echo -e "- ${GREEN}Good balance of quality and size${RESET}"
;;
5)
EXPECTED_SIZE=$((MODEL_SIZE_GB * 5 / 16))
echo -e "- ${GREEN}Q5: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
echo -e "- ${GREEN}✨ Excellent quality/size ratio${RESET}"
;;
6)
EXPECTED_SIZE=$((MODEL_SIZE_GB * 6 / 16))
echo -e "- ${GREEN}Q6: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
echo -e "- ${GREEN}High quality preservation${RESET}"
;;
8)
EXPECTED_SIZE=$((MODEL_SIZE_GB / 2))
echo -e "- ${GREEN}Q8: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
echo -e "- ${GREEN}Near-lossless quality${RESET}"
;;
esac
if [[ -n "$QUANT_PREDICATE" ]]; then
echo -e "- ${CYAN}Using mixed precision: ${QUANT_PREDICATE}${RESET}"
fi
if [[ "$IS_M3_ULTRA" == true ]]; then
echo -e "- ${MAGENTA}Expected memory usage: ${EXPECTED_SIZE}-$((EXPECTED_SIZE * 2))GB peak${RESET}"
echo -e "- ${MAGENTA}M3 Ultra can handle this comfortably${RESET}"
else
echo -e "- ${YELLOW}Expected memory usage: High - monitor closely${RESET}"
fi
else
echo -e "- ${GREEN}No quantization - model remains in ${DTYPE} format${RESET}"
echo -e "- ${YELLOW}Very high memory requirements (400-500GB)${RESET}"
fi
echo -e "- ${CYAN}Expected conversion time: 2-6 hours${RESET}"
# Ask for command format choice
echo -e "\n${BOLD}${GREEN}Choose command format:${RESET}"
echo -e "1. ${YELLOW}UV (recommended): ${RESET}${UV_CMD}"
echo -e "2. ${YELLOW}Direct command: ${RESET}${DIRECT_CMD}"
echo -e "3. ${YELLOW}Python module: ${RESET}${PYTHON_CMD}"
read -p "> " FORMAT_CHOICE
case "$FORMAT_CHOICE" in
2)
FINAL_CMD="${DIRECT_CMD}"
;;
3)
FINAL_CMD="${PYTHON_CMD}"
;;
*)
FINAL_CMD="${UV_CMD}"
;;
esac
# M3 Ultra specific preparation tips
if [[ "$IS_M3_ULTRA" == true ]]; then
echo -e "\n${BOLD}${MAGENTA}🐉 Dragon M3 Ultra Preparation:${RESET}"
echo -e "1. ${CYAN}Your 512GB RAM can handle even 405B models${RESET}"
echo -e "2. ${CYAN}Enable High Power Mode in Energy Saver${RESET}"
echo -e "3. ${CYAN}Consider using Activity Monitor to track memory${RESET}"
echo -e "4. ${CYAN}MLX will use unified memory efficiently${RESET}"
else
echo -e "\n${BOLD}${BLUE}Preparation tips:${RESET}"
echo -e "1. ${YELLOW}Ensure Mac is plugged in and won't sleep${RESET}"
echo -e "2. ${YELLOW}Close other memory-intensive applications${RESET}"
echo -e "3. ${YELLOW}Be prepared for high fan speeds${RESET}"
echo -e "4. ${YELLOW}The process may appear to hang - this is normal${RESET}"
fi
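# Optional tip (added): macOS ships caffeinate, which keeps the machine awake
# for the duration of a command
echo -e "\n${CYAN}Tip: prefix the final command with 'caffeinate -i' to prevent idle sleep during conversion${RESET}"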
# Print the final command
echo -e "\n${BOLD}${RED}Your conversion command:${RESET}"
echo -e "${FINAL_CMD}"
# Copy to clipboard option
echo -e "\n${BOLD}${GREEN}Copy command to clipboard? [y/n]${RESET}"
read -p "> " COPY_CMD
if [[ "$COPY_CMD" == "y" || "$COPY_CMD" == "Y" ]]; then
echo "${FINAL_CMD}" | pbcopy
echo -e "${GREEN}✓ Command copied to clipboard!${RESET}"
fi
# Download command if using a remote model (the command is the same either way;
# hf-xet accelerates it automatically when installed)
if [[ "$IS_LOCAL" == false ]]; then
echo -e "\n${BOLD}${CYAN}Optional: Download model first (if needed):${RESET}"
if [[ "$USE_HF_XET" == "y" || "$USE_HF_XET" == "Y" ]]; then
echo -e "# hf-xet accelerates this automatically when installed (Xet-backed repos only):"
else
echo -e "# Standard download:"
fi
echo -e "uv run huggingface-cli download ${HF_PATH} --local-dir ./downloads/${HF_PATH##*/}"
fi
# Test commands
echo -e "\n${BOLD}${BLUE}After conversion, test with:${RESET}"
echo -e "uv run mlx_lm.generate --model ${MLX_PATH} --prompt \"Hello, I am\" --max-tokens 50"
# Memory monitoring for M3 Ultra
if [[ "$IS_M3_ULTRA" == true ]]; then
echo -e "\n${BOLD}${MAGENTA}Monitor Dragon performance:${RESET}"
echo -e "uv run python -c \"import mlx.core as mx; print(f'Peak: {mx.metal.get_peak_memory()/1e9:.2f}GB of ${TOTAL_MEMORY_GB}GB')\""
echo -e "\n${BOLD}${CYAN}Pro tip for large models:${RESET}"
echo -e "# Set memory limit before conversion (optional):"
echo -e "export MLX_METAL_MEMORY_LIMIT=$((TOTAL_MEMORY_GB * 95 / 100))GB"
fi
# Benchmark command
echo -e "\n${BOLD}${CYAN}Benchmark the converted model:${RESET}"
echo -e "uv run mlx_lm.generate --model ${MLX_PATH} --prompt \"The\" --max-tokens 100 --verbose"
echo -e "\n${BOLD}${BLUE}=====================================${RESET}"
echo -e "${BOLD}${GREEN}✨ Conversion setup complete!${RESET}"
if [[ "$IS_M3_ULTRA" == true ]]; then
echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra ready to roar!${RESET}"
fi
echo -e "${BOLD}${BLUE}=====================================${RESET}"