#!/bin/bash
#
# MLX Model Conversion Utility v2.0
# Interactive helper that assembles an mlx_lm.convert command for converting
# (and optionally quantizing) a Hugging Face model to MLX format.
# Updated for MLX 0.26+ with Q5 support and M3 Ultra optimizations.

# ANSI color codes for terminal output
BOLD="\033[1m"
BLUE="\033[34m"
GREEN="\033[32m"
YELLOW="\033[33m"
RED="\033[31m"
CYAN="\033[36m"
MAGENTA="\033[35m"
RESET="\033[0m"
|
# Detect system resources (macOS): total RAM in bytes and the CPU brand string.
# 1073741824 bytes = 1 GiB.
TOTAL_MEMORY=$(sysctl -n hw.memsize 2>/dev/null || echo 0)
TOTAL_MEMORY_GB=$((TOTAL_MEMORY / 1073741824))
CPU_BRAND=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")

# Treat the machine as the "Dragon" M3 Ultra if the CPU brand matches or the
# system has roughly 400GB+ of unified memory.
if [[ "$CPU_BRAND" == *"M3 Ultra"* ]] || [[ "$TOTAL_MEMORY_GB" -ge 400 ]]; then
    IS_M3_ULTRA=true
    echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra detected! (${TOTAL_MEMORY_GB}GB RAM)${RESET}"
else
    IS_M3_ULTRA=false
fi
|
echo -e "${BOLD}${BLUE}=====================================${RESET}" |
|
echo -e "${BOLD}${BLUE} MLX Model Conversion Utility v2.0 ${RESET}" |
|
echo -e "${BOLD}${BLUE}=====================================${RESET}" |
|
echo -e "Updated for MLX 0.26+ with Q5 support and M3 Ultra optimizations\n" |
|
|
|
|
|
# Defaults offered at each prompt below
DEFAULT_HF_PATH="meta-llama/Llama-3.1-405B"
DEFAULT_OUTPUT_DIR="models/Llama-3.1-405B-MLX-Q5"
DEFAULT_QUANTIZE="y"
DEFAULT_BITS="5"
DEFAULT_GROUP_SIZE="64"
DEFAULT_DTYPE="float16"

# On the M3 Ultra, tune hf-xet downloads: high-performance mode, a 100 GiB
# chunk cache (107374182400 bytes), and 32 concurrent downloads.
if [[ "$IS_M3_ULTRA" == true ]]; then
    export HF_XET_HIGH_PERFORMANCE_MODE=1
    export HF_XET_CHUNK_CACHE_SIZE_BYTES=107374182400
    export HF_XET_CONCURRENT_DOWNLOADS=32
    echo -e "${CYAN}✓ hf-xet optimizations enabled for Dragon${RESET}"
fi
|
echo -e "${BOLD}Hugging Face model path or local directory:${RESET}" |
|
echo -e "(Default: ${DEFAULT_HF_PATH})" |
|
echo -e "${CYAN}Examples:${RESET}" |
|
echo -e " HF repo: meta-llama/Llama-3.1-405B" |
|
echo -e " Local: /Users/polyversai/.lmstudio/models/mlx-community/model-name" |
|
read -p "> " HF_PATH |
|
HF_PATH=${HF_PATH:-$DEFAULT_HF_PATH} |
|
|
|
|
|
if [[ -d "$HF_PATH" ]]; then |
|
echo -e "${GREEN}✓ Local model detected: ${HF_PATH}${RESET}" |
|
IS_LOCAL=true |
|
else |
|
IS_LOCAL=false |
|
|
|
echo -e "\n${BOLD}Use hf-xet for faster download? [y/n]${RESET}" |
|
echo -e "(10x faster downloads with chunk deduplication)" |
|
echo -e "Default: y" |
|
read -p "> " USE_HF_XET |
|
USE_HF_XET=${USE_HF_XET:-y} |
|
|
|
if [[ "$USE_HF_XET" == "y" || "$USE_HF_XET" == "Y" ]]; then |
|
|
|
if ! uv run python -c "import hf_xet" 2>/dev/null; then |
|
echo -e "${YELLOW}⚠️ hf-xet not installed. Installing...${RESET}" |
|
echo -e "Run: uv add 'huggingface_hub[hf_xet]'" |
|
echo -e "${CYAN}Note: hf-xet only works with Xet-backed repos${RESET}" |
|
else |
|
echo -e "${GREEN}✓ hf-xet enabled for download${RESET}" |
|
fi |
|
fi |
|
fi |
|
|
|
|
|
echo -e "\n${BOLD}Output MLX model directory:${RESET}" |
|
echo -e "(Default: ${DEFAULT_OUTPUT_DIR})" |
|
read -p "> " MLX_PATH |
|
MLX_PATH=${MLX_PATH:-$DEFAULT_OUTPUT_DIR} |
|
|
|
|
|
echo -e "\n${BOLD}Model data type:${RESET}" |
|
echo -e "(Default: ${DEFAULT_DTYPE}, Options: float16, bfloat16, float32)" |
|
read -p "> " DTYPE |
|
DTYPE=${DTYPE:-$DEFAULT_DTYPE} |
|
|
|
|
|
echo -e "\n${BOLD}Quantize the model? [y/n]${RESET}" |
|
echo -e "(Default: ${DEFAULT_QUANTIZE})" |
|
read -p "> " QUANTIZE |
|
QUANTIZE=${QUANTIZE:-$DEFAULT_QUANTIZE} |
|
|
|
|
|
if [[ "$QUANTIZE" == "y" || "$QUANTIZE" == "Y" ]]; then |
|
echo -e "\n${BOLD}Quantization bits:${RESET}" |
|
echo -e "${CYAN}Options:${RESET}" |
|
echo -e " 2 - Extreme compression (lowest quality)" |
|
echo -e " 3 - High compression" |
|
echo -e " 4 - Standard compression (good balance)" |
|
echo -e " ${GREEN}5 - Recommended (best quality/size ratio)${RESET}" |
|
echo -e " 6 - Low compression" |
|
echo -e " 8 - Minimal compression (highest quality)" |
|
echo -e "(Default: ${DEFAULT_BITS})" |
|
read -p "> " BITS |
|
BITS=${BITS:-$DEFAULT_BITS} |
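    # Optional guard, added as a minimal sketch (not part of the original flow):
    # if the entered value is not one of the bit widths listed above, fall back
    # to the default so mlx_lm.convert is not handed an unsupported --q-bits.
    case "$BITS" in
        2|3|4|5|6|8) ;;
        *)
            echo -e "${YELLOW}Unrecognized bit width '${BITS}'; using default (${DEFAULT_BITS})${RESET}"
            BITS="$DEFAULT_BITS"
            ;;
    esac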
|
|
|
echo -e "\n${BOLD}Group size:${RESET}" |
|
echo -e "(Default: ${DEFAULT_GROUP_SIZE}, Options: 32, 64, 128)" |
|
if [[ "$IS_M3_ULTRA" == true ]]; then |
|
echo -e "${CYAN}💡 M3 Ultra tip: Use 64 or 128 for better performance${RESET}" |
|
fi |
|
read -p "> " GROUP_SIZE |
|
GROUP_SIZE=${GROUP_SIZE:-$DEFAULT_GROUP_SIZE} |
|
|
|
echo -e "\n${BOLD}Quantization strategy:${RESET}" |
|
echo -e "${CYAN}Options:${RESET}" |
|
echo -e " none - Uniform quantization (default)" |
|
echo -e " mixed_2_6 - Mix of 2 and 6 bit" |
|
echo -e " ${GREEN}mixed_3_4 - Mix of 3 and 4 bit${RESET}" |
|
echo -e " mixed_3_6 - Mix of 3 and 6 bit" |
|
echo -e " mixed_4_6 - Mix of 4 and 6 bit" |
|
echo -e "Leave empty for uniform quantization" |
|
read -p "> " QUANT_PREDICATE |
|
|
|
QUANT_OPTIONS="-q --q-bits ${BITS} --q-group-size ${GROUP_SIZE}" |
|
|
|
if [[ -n "$QUANT_PREDICATE" ]]; then |
|
QUANT_OPTIONS="${QUANT_OPTIONS} --quant-predicate ${QUANT_PREDICATE}" |
|
fi |
|
else |
|
QUANT_OPTIONS="" |
|
fi |
|
|
|
|
|
if [[ "$IS_M3_ULTRA" == true ]]; then |
|
echo -e "\n${BOLD}${MAGENTA}M3 Ultra optimization note:${RESET}" |
|
echo -e "${CYAN}MLX will automatically optimize for your 512GB system${RESET}" |
|
echo -e "${CYAN}The framework uses unified memory efficiently${RESET}" |
|
M3_ULTRA_FLAGS="" |
|
else |
|
M3_ULTRA_FLAGS="" |
|
fi |
|
|
|
|
|
echo -e "\n${BOLD}Upload to Hugging Face Hub? (optional):${RESET}" |
|
echo -e "(Leave empty to skip upload)" |
|
read -p "> " UPLOAD_REPO |
|
|
|
if [[ -n "$UPLOAD_REPO" ]]; then |
|
UPLOAD_OPTION="--upload-repo ${UPLOAD_REPO}" |
|
else |
|
UPLOAD_OPTION="" |
|
fi |
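# Note (assumption based on standard Hub behavior, not stated in this script):
# uploading requires an authenticated Hugging Face session, e.g. via
# `huggingface-cli login` or an HF_TOKEN environment variable.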
|
|
|
|
|
# Build the same conversion command in three invocation styles.
UV_CMD="uv run mlx_lm.convert --hf-path ${HF_PATH} --mlx-path ${MLX_PATH} --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}"
DIRECT_CMD="mlx_lm.convert --hf-path ${HF_PATH} --mlx-path ${MLX_PATH} --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}"
PYTHON_CMD="python -m mlx_lm.convert --hf-path ${HF_PATH} --mlx-path ${MLX_PATH} --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}"
|
echo -e "\n${BOLD}${YELLOW}Command Preview:${RESET}" |
|
echo -e "$UV_CMD" |
|
|
|
|
|
echo -e "\n${BOLD}${YELLOW}Expected outcomes:${RESET}" |
|
if [[ "$QUANTIZE" == "y" || "$QUANTIZE" == "Y" ]]; then |
|
MODEL_SIZE_GB=500 |
|
|
|
case "$BITS" in |
|
2) |
|
EXPECTED_SIZE=$((MODEL_SIZE_GB / 8)) |
|
echo -e "- ${GREEN}Q2: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" |
|
echo -e "- ${YELLOW}⚠️ Significant quality loss expected${RESET}" |
|
;; |
|
3) |
|
EXPECTED_SIZE=$((MODEL_SIZE_GB * 3 / 16)) |
|
echo -e "- ${GREEN}Q3: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" |
|
echo -e "- ${YELLOW}Moderate quality loss${RESET}" |
|
;; |
|
4) |
|
EXPECTED_SIZE=$((MODEL_SIZE_GB / 4)) |
|
echo -e "- ${GREEN}Q4: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" |
|
echo -e "- ${GREEN}Good balance of quality and size${RESET}" |
|
;; |
|
5) |
|
EXPECTED_SIZE=$((MODEL_SIZE_GB * 5 / 16)) |
|
echo -e "- ${GREEN}Q5: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" |
|
echo -e "- ${GREEN}✨ Excellent quality/size ratio${RESET}" |
|
;; |
|
6) |
|
EXPECTED_SIZE=$((MODEL_SIZE_GB * 6 / 16)) |
|
echo -e "- ${GREEN}Q6: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" |
|
echo -e "- ${GREEN}High quality preservation${RESET}" |
|
;; |
|
8) |
|
EXPECTED_SIZE=$((MODEL_SIZE_GB / 2)) |
|
echo -e "- ${GREEN}Q8: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}" |
|
echo -e "- ${GREEN}Near-lossless quality${RESET}" |
|
;; |
|
esac |
|
|
|
if [[ -n "$QUANT_PREDICATE" ]]; then |
|
echo -e "- ${CYAN}Using mixed precision: ${QUANT_PREDICATE}${RESET}" |
|
fi |
|
|
|
if [[ "$IS_M3_ULTRA" == true ]]; then |
|
echo -e "- ${MAGENTA}Expected memory usage: ${EXPECTED_SIZE}-$((EXPECTED_SIZE * 2))GB peak${RESET}" |
|
echo -e "- ${MAGENTA}M3 Ultra can handle this comfortably${RESET}" |
|
else |
|
echo -e "- ${YELLOW}Expected memory usage: High - monitor closely${RESET}" |
|
fi |
|
else |
|
echo -e "- ${GREEN}No quantization - model remains in ${DTYPE} format${RESET}" |
|
echo -e "- ${YELLOW}Very high memory requirements (400-500GB)${RESET}" |
|
fi |
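# The estimates above follow a simple rule of thumb (an assumption, not a
# measurement): 16-bit weights shrink roughly in proportion to bits/16, so for
# example Q5 ≈ 500GB * 5/16 ≈ 156GB, before tokenizer/config overhead.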
|
|
|
echo -e "- ${CYAN}Expected conversion time: 2-6 hours${RESET}" |
|
|
|
|
|
echo -e "\n${BOLD}${GREEN}Choose command format:${RESET}" |
|
echo -e "1. ${YELLOW}UV (recommended): ${RESET}${UV_CMD}" |
|
echo -e "2. ${YELLOW}Direct command: ${RESET}${DIRECT_CMD}" |
|
echo -e "3. ${YELLOW}Python module: ${RESET}${PYTHON_CMD}" |
|
read -p "> " FORMAT_CHOICE |
|
|
|
case "$FORMAT_CHOICE" in |
|
2) |
|
FINAL_CMD="${DIRECT_CMD}" |
|
;; |
|
3) |
|
FINAL_CMD="${PYTHON_CMD}" |
|
;; |
|
*) |
|
FINAL_CMD="${UV_CMD}" |
|
;; |
|
esac |
|
|
|
|
|
if [[ "$IS_M3_ULTRA" == true ]]; then |
|
echo -e "\n${BOLD}${MAGENTA}🐉 Dragon M3 Ultra Preparation:${RESET}" |
|
echo -e "1. ${CYAN}Your 512GB RAM can handle even 405B models${RESET}" |
|
echo -e "2. ${CYAN}Enable High Power Mode in Energy Saver${RESET}" |
|
echo -e "3. ${CYAN}Consider using Activity Monitor to track memory${RESET}" |
|
echo -e "4. ${CYAN}MLX will use unified memory efficiently${RESET}" |
|
else |
|
echo -e "\n${BOLD}${BLUE}Preparation tips:${RESET}" |
|
echo -e "1. ${YELLOW}Ensure Mac is plugged in and won't sleep${RESET}" |
|
echo -e "2. ${YELLOW}Close other memory-intensive applications${RESET}" |
|
echo -e "3. ${YELLOW}Be prepared for high fan speeds${RESET}" |
|
echo -e "4. ${YELLOW}The process may appear to hang - this is normal${RESET}" |
|
fi |
|
|
|
|
|
echo -e "\n${BOLD}${RED}Your conversion command:${RESET}" |
|
echo -e "${FINAL_CMD}" |
|
|
|
|
|
echo -e "\n${BOLD}${GREEN}Copy command to clipboard? [y/n]${RESET}" |
|
read -p "> " COPY_CMD |
|
|
|
if [[ "$COPY_CMD" == "y" || "$COPY_CMD" == "Y" ]]; then |
|
echo "${FINAL_CMD}" | pbcopy |
|
echo -e "${GREEN}✓ Command copied to clipboard!${RESET}" |
|
fi |
|
|
|
|
|
if [[ "$IS_LOCAL" == false ]]; then |
|
echo -e "\n${BOLD}${CYAN}Optional: Download model first (if needed):${RESET}" |
|
if [[ "$USE_HF_XET" == "y" || "$USE_HF_XET" == "Y" ]]; then |
|
echo -e "# With hf-xet (10x faster):" |
|
echo -e "uv run huggingface-cli download ${HF_PATH} --local-dir ./downloads/${HF_PATH##*/}" |
|
else |
|
echo -e "# Standard download:" |
|
echo -e "uv run huggingface-cli download ${HF_PATH} --local-dir ./downloads/${HF_PATH##*/}" |
|
fi |
|
fi |
|
|
|
|
|
echo -e "\n${BOLD}${BLUE}After conversion, test with:${RESET}" |
|
echo -e "uv run mlx_lm.generate --model ${MLX_PATH} --prompt \"Hello, I am\" --max-tokens 50" |
|
|
|
|
|
if [[ "$IS_M3_ULTRA" == true ]]; then |
|
echo -e "\n${BOLD}${MAGENTA}Monitor Dragon performance:${RESET}" |
|
echo -e "uv run python -c \"import mlx.core as mx; print(f'Peak: {mx.metal.get_peak_memory()/1e9:.2f}GB of ${TOTAL_MEMORY_GB}GB')\"" |
|
|
|
echo -e "\n${BOLD}${CYAN}Pro tip for large models:${RESET}" |
|
echo -e "# Set memory limit before conversion (optional):" |
|
echo -e "export MLX_METAL_MEMORY_LIMIT=$((TOTAL_MEMORY_GB * 95 / 100))GB" |
|
fi |
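# Note: recent MLX releases expose the same counter as mx.get_peak_memory(),
# with the mx.metal.* variant kept as a deprecated alias; the exact cutover
# version is not verified here, so the command above is left unchanged.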
|
|
|
|
|
echo -e "\n${BOLD}${CYAN}Benchmark the converted model:${RESET}" |
|
echo -e "uv run mlx_lm.generate --model ${MLX_PATH} --prompt \"The\" --max-tokens 100 --verbose" |
|
|
|
echo -e "\n${BOLD}${BLUE}=====================================${RESET}" |
|
echo -e "${BOLD}${GREEN}✨ Conversion setup complete!${RESET}" |
|
if [[ "$IS_M3_ULTRA" == true ]]; then |
|
echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra ready to roar!${RESET}" |
|
fi |
|
echo -e "${BOLD}${BLUE}=====================================${RESET}" |