#!/bin/bash
# MLX Model Conversion Utility for Dragon M3 Ultra
# Updated: January 2025 for MLX 0.26+ and modern uv workflow
# Supports Q5 quantization and M3 Ultra optimizations

# Text formatting (ANSI escape sequences; rendered later via `echo -e`).
# Marked readonly so an accidental reassignment cannot corrupt output.
readonly BOLD="\033[1m"
readonly BLUE="\033[34m"
readonly GREEN="\033[32m"
readonly YELLOW="\033[33m"
readonly RED="\033[31m"
readonly CYAN="\033[36m"
readonly MAGENTA="\033[35m"
readonly RESET="\033[0m"

# Detect system specs. These sysctl keys are macOS-specific; fall back to
# 0 / "Unknown" so the arithmetic and string tests below stay well-defined
# on other platforms.
TOTAL_MEMORY=$(sysctl -n hw.memsize 2>/dev/null || echo 0)
TOTAL_MEMORY_GB=$((TOTAL_MEMORY / 1073741824))
CPU_BRAND=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")

# Check if running on M3 Ultra: either the CPU brand says so, or the
# machine has >=400GB RAM (the 512GB "Dragon" configuration).
if [[ "$CPU_BRAND" == *"M3 Ultra"* ]] || [[ "$TOTAL_MEMORY_GB" -ge 400 ]]; then
  IS_M3_ULTRA=true
  echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra detected! (${TOTAL_MEMORY_GB}GB RAM)${RESET}"
else
  IS_M3_ULTRA=false
fi
# ---- Banner -----------------------------------------------------------
echo -e "${BOLD}${BLUE}=====================================${RESET}"
echo -e "${BOLD}${BLUE} MLX Model Conversion Utility v2.0 ${RESET}"
echo -e "${BOLD}${BLUE}=====================================${RESET}"
echo -e "Updated for MLX 0.26+ with Q5 support and M3 Ultra optimizations\n"

# ---- Conversion defaults ----------------------------------------------
# Q5 is the default bit width: best quality/size trade-off for big models.
DEFAULT_HF_PATH="meta-llama/Llama-3.1-405B"
DEFAULT_OUTPUT_DIR="models/Llama-3.1-405B-MLX-Q5"
DEFAULT_QUANTIZE="y"
DEFAULT_BITS="5"
DEFAULT_GROUP_SIZE="64"
DEFAULT_DTYPE="float16"

# ---- hf-xet tuning (Dragon only) --------------------------------------
# Large chunk cache plus aggressive download parallelism.
case "$IS_M3_ULTRA" in
  true)
    export HF_XET_HIGH_PERFORMANCE_MODE=1
    export HF_XET_CHUNK_CACHE_SIZE_BYTES=107374182400 # 100GB cache
    export HF_XET_CONCURRENT_DOWNLOADS=32
    echo -e "${CYAN}✓ hf-xet optimizations enabled for Dragon${RESET}"
    ;;
esac
# ---- Model source -----------------------------------------------------
# Accept either a Hugging Face repo id or an existing local directory.
echo -e "${BOLD}Hugging Face model path or local directory:${RESET}"
echo -e "(Default: ${DEFAULT_HF_PATH})"
echo -e "${CYAN}Examples:${RESET}"
echo -e " HF repo: meta-llama/Llama-3.1-405B"
echo -e " Local: /Users/polyversai/.lmstudio/models/mlx-community/model-name"
# -r keeps backslashes literal in pasted paths.
read -r -p "> " HF_PATH
HF_PATH=${HF_PATH:-$DEFAULT_HF_PATH}

# A local directory needs no download step.
if [[ -d "$HF_PATH" ]]; then
  echo -e "${GREEN}✓ Local model detected: ${HF_PATH}${RESET}"
  IS_LOCAL=true
else
  IS_LOCAL=false
  # Ask about hf-xet for remote models
  echo -e "\n${BOLD}Use hf-xet for faster download? [y/n]${RESET}"
  echo -e "(10x faster downloads with chunk deduplication)"
  echo -e "Default: y"
  read -r -p "> " USE_HF_XET
  USE_HF_XET=${USE_HF_XET:-y}
  if [[ "$USE_HF_XET" == "y" || "$USE_HF_XET" == "Y" ]]; then
    # Probe inside the uv environment; a failed import (or missing uv)
    # just prints an install hint rather than aborting.
    if ! uv run python -c "import hf_xet" 2>/dev/null; then
      echo -e "${YELLOW}⚠️ hf-xet not installed. Installing...${RESET}"
      echo -e "Run: uv add 'huggingface_hub[hf_xet]'"
      echo -e "${CYAN}Note: hf-xet only works with Xet-backed repos${RESET}"
    else
      echo -e "${GREEN}✓ hf-xet enabled for download${RESET}"
    fi
  fi
fi
# ---- Output location & conversion knobs -------------------------------
echo -e "\n${BOLD}Output MLX model directory:${RESET}"
echo -e "(Default: ${DEFAULT_OUTPUT_DIR})"
read -r -p "> " MLX_PATH
MLX_PATH=${MLX_PATH:-$DEFAULT_OUTPUT_DIR}

# Data type for the converted weights.
echo -e "\n${BOLD}Model data type:${RESET}"
echo -e "(Default: ${DEFAULT_DTYPE}, Options: float16, bfloat16, float32)"
read -r -p "> " DTYPE
DTYPE=${DTYPE:-$DEFAULT_DTYPE}

# Whether to quantize; details are collected below only on "y"/"Y".
echo -e "\n${BOLD}Quantize the model? [y/n]${RESET}"
echo -e "(Default: ${DEFAULT_QUANTIZE})"
read -r -p "> " QUANTIZE
QUANTIZE=${QUANTIZE:-$DEFAULT_QUANTIZE}
# ---- Quantization parameters (only when requested) --------------------
if [[ "$QUANTIZE" == "y" || "$QUANTIZE" == "Y" ]]; then
  echo -e "\n${BOLD}Quantization bits:${RESET}"
  echo -e "${CYAN}Options:${RESET}"
  echo -e " 2 - Extreme compression (lowest quality)"
  echo -e " 3 - High compression"
  echo -e " 4 - Standard compression (good balance)"
  echo -e " ${GREEN}5 - Recommended (best quality/size ratio)${RESET}"
  echo -e " 6 - Low compression"
  echo -e " 8 - Minimal compression (highest quality)"
  echo -e "(Default: ${DEFAULT_BITS})"
  read -r -p "> " BITS
  BITS=${BITS:-$DEFAULT_BITS}
  # Reject bit widths the menu (and mlx_lm) does not support; fall back
  # to the default instead of building a command that fails much later.
  case "$BITS" in
    2|3|4|5|6|8) ;;
    *)
      echo -e "${YELLOW}⚠️ Unsupported bit width '${BITS}', using ${DEFAULT_BITS}${RESET}"
      BITS=$DEFAULT_BITS
      ;;
  esac

  echo -e "\n${BOLD}Group size:${RESET}"
  echo -e "(Default: ${DEFAULT_GROUP_SIZE}, Options: 32, 64, 128)"
  if [[ "$IS_M3_ULTRA" == true ]]; then
    echo -e "${CYAN}💡 M3 Ultra tip: Use 64 or 128 for better performance${RESET}"
  fi
  read -r -p "> " GROUP_SIZE
  GROUP_SIZE=${GROUP_SIZE:-$DEFAULT_GROUP_SIZE}

  echo -e "\n${BOLD}Quantization strategy:${RESET}"
  echo -e "${CYAN}Options:${RESET}"
  echo -e " none - Uniform quantization (default)"
  echo -e " mixed_2_6 - Mix of 2 and 6 bit"
  echo -e " ${GREEN}mixed_3_4 - Mix of 3 and 4 bit${RESET}"
  echo -e " mixed_3_6 - Mix of 3 and 6 bit"
  echo -e " mixed_4_6 - Mix of 4 and 6 bit"
  echo -e "Leave empty for uniform quantization"
  read -r -p "> " QUANT_PREDICATE
  # The menu documents "none" as uniform quantization, so treat a literal
  # "none" like empty input rather than passing `--quant-predicate none`.
  if [[ "$QUANT_PREDICATE" == "none" ]]; then
    QUANT_PREDICATE=""
  fi
  QUANT_OPTIONS="-q --q-bits ${BITS} --q-group-size ${GROUP_SIZE}"
  if [[ -n "$QUANT_PREDICATE" ]]; then
    QUANT_OPTIONS="${QUANT_OPTIONS} --quant-predicate ${QUANT_PREDICATE}"
  fi
else
  QUANT_OPTIONS=""
fi
# ---- M3 Ultra note ----------------------------------------------------
# No extra CLI flags are needed either way: MLX manages unified memory
# itself. Kept as a variable for backward compatibility.
M3_ULTRA_FLAGS=""
if [[ "$IS_M3_ULTRA" == true ]]; then
  echo -e "\n${BOLD}${MAGENTA}M3 Ultra optimization note:${RESET}"
  echo -e "${CYAN}MLX will automatically optimize for your 512GB system${RESET}"
  echo -e "${CYAN}The framework uses unified memory efficiently${RESET}"
fi

# ---- Optional Hub upload ----------------------------------------------
echo -e "\n${BOLD}Upload to Hugging Face Hub? (optional):${RESET}"
echo -e "(Leave empty to skip upload)"
read -r -p "> " UPLOAD_REPO
if [[ -n "$UPLOAD_REPO" ]]; then
  UPLOAD_OPTION="--upload-repo ${UPLOAD_REPO}"
else
  UPLOAD_OPTION=""
fi

# ---- Candidate command lines ------------------------------------------
# One shared argument string keeps the three launcher variants in sync.
CONVERT_ARGS="--hf-path ${HF_PATH} --mlx-path ${MLX_PATH} --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}"
UV_CMD="uv run mlx_lm.convert ${CONVERT_ARGS}"
DIRECT_CMD="mlx_lm.convert ${CONVERT_ARGS}"
PYTHON_CMD="python -m mlx_lm.convert ${CONVERT_ARGS}"
# ---- Preview & size estimates -----------------------------------------
echo -e "\n${BOLD}${YELLOW}Command Preview:${RESET}"
echo -e "$UV_CMD"

# Rough size table scaled from a ~500GB fp16 405B model: a b-bit quant
# comes out near b/16 of the original.
echo -e "\n${BOLD}${YELLOW}Expected outcomes:${RESET}"
if [[ "$QUANTIZE" == "y" || "$QUANTIZE" == "Y" ]]; then
  MODEL_SIZE_GB=500 # Approximate for 405B model
  case "$BITS" in
    2)
      EXPECTED_SIZE=$((MODEL_SIZE_GB / 8))
      echo -e "- ${GREEN}Q2: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
      echo -e "- ${YELLOW}⚠️ Significant quality loss expected${RESET}"
      ;;
    3)
      EXPECTED_SIZE=$((MODEL_SIZE_GB * 3 / 16))
      echo -e "- ${GREEN}Q3: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
      echo -e "- ${YELLOW}Moderate quality loss${RESET}"
      ;;
    4)
      EXPECTED_SIZE=$((MODEL_SIZE_GB / 4))
      echo -e "- ${GREEN}Q4: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
      echo -e "- ${GREEN}Good balance of quality and size${RESET}"
      ;;
    5)
      EXPECTED_SIZE=$((MODEL_SIZE_GB * 5 / 16))
      echo -e "- ${GREEN}Q5: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
      echo -e "- ${GREEN}✨ Excellent quality/size ratio${RESET}"
      ;;
    6)
      EXPECTED_SIZE=$((MODEL_SIZE_GB * 6 / 16))
      echo -e "- ${GREEN}Q6: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
      echo -e "- ${GREEN}High quality preservation${RESET}"
      ;;
    8)
      EXPECTED_SIZE=$((MODEL_SIZE_GB / 2))
      echo -e "- ${GREEN}Q8: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
      echo -e "- ${GREEN}Near-lossless quality${RESET}"
      ;;
    *)
      # Previously an unlisted bit width fell through with EXPECTED_SIZE
      # unset, yielding a blank memory forecast below. Use the generic
      # b/16 estimate for numeric input; otherwise assume no shrinkage.
      if [[ "$BITS" =~ ^[0-9]+$ ]]; then
        EXPECTED_SIZE=$((MODEL_SIZE_GB * BITS / 16))
      else
        EXPECTED_SIZE=$MODEL_SIZE_GB
      fi
      echo -e "- ${YELLOW}Q${BITS}: ~${EXPECTED_SIZE}GB (rough estimate)${RESET}"
      ;;
  esac
  if [[ -n "$QUANT_PREDICATE" ]]; then
    echo -e "- ${CYAN}Using mixed precision: ${QUANT_PREDICATE}${RESET}"
  fi
  if [[ "$IS_M3_ULTRA" == true ]]; then
    echo -e "- ${MAGENTA}Expected memory usage: ${EXPECTED_SIZE}-$((EXPECTED_SIZE * 2))GB peak${RESET}"
    echo -e "- ${MAGENTA}M3 Ultra can handle this comfortably${RESET}"
  else
    echo -e "- ${YELLOW}Expected memory usage: High - monitor closely${RESET}"
  fi
else
  echo -e "- ${GREEN}No quantization - model remains in ${DTYPE} format${RESET}"
  echo -e "- ${YELLOW}Very high memory requirements (400-500GB)${RESET}"
fi
echo -e "- ${CYAN}Expected conversion time: 2-6 hours${RESET}"
# ---- Launcher selection -----------------------------------------------
echo -e "\n${BOLD}${GREEN}Choose command format:${RESET}"
echo -e "1. ${YELLOW}UV (recommended): ${RESET}${UV_CMD}"
echo -e "2. ${YELLOW}Direct command: ${RESET}${DIRECT_CMD}"
echo -e "3. ${YELLOW}Python module: ${RESET}${PYTHON_CMD}"
read -r -p "> " FORMAT_CHOICE
# Anything other than 2 or 3 (including empty input) selects uv.
case "$FORMAT_CHOICE" in
  2)
    FINAL_CMD="${DIRECT_CMD}"
    ;;
  3)
    FINAL_CMD="${PYTHON_CMD}"
    ;;
  *)
    FINAL_CMD="${UV_CMD}"
    ;;
esac
# ---- Preparation tips -------------------------------------------------
if [[ "$IS_M3_ULTRA" == true ]]; then
  echo -e "\n${BOLD}${MAGENTA}🐉 Dragon M3 Ultra Preparation:${RESET}"
  echo -e "1. ${CYAN}Your 512GB RAM can handle even 405B models${RESET}"
  echo -e "2. ${CYAN}Enable High Power Mode in Energy Saver${RESET}"
  echo -e "3. ${CYAN}Consider using Activity Monitor to track memory${RESET}"
  echo -e "4. ${CYAN}MLX will use unified memory efficiently${RESET}"
else
  echo -e "\n${BOLD}${BLUE}Preparation tips:${RESET}"
  echo -e "1. ${YELLOW}Ensure Mac is plugged in and won't sleep${RESET}"
  echo -e "2. ${YELLOW}Close other memory-intensive applications${RESET}"
  echo -e "3. ${YELLOW}Be prepared for high fan speeds${RESET}"
  echo -e "4. ${YELLOW}The process may appear to hang - this is normal${RESET}"
fi

# ---- Final command & clipboard ----------------------------------------
echo -e "\n${BOLD}${RED}Your conversion command:${RESET}"
echo -e "${FINAL_CMD}"

echo -e "\n${BOLD}${GREEN}Copy command to clipboard? [y/n]${RESET}"
read -r -p "> " COPY_CMD
if [[ "$COPY_CMD" == "y" || "$COPY_CMD" == "Y" ]]; then
  # printf avoids echo's option/escape pitfalls for arbitrary command
  # text; guard pbcopy so a non-macOS host degrades gracefully.
  if command -v pbcopy >/dev/null 2>&1; then
    printf '%s\n' "${FINAL_CMD}" | pbcopy
    echo -e "${GREEN}✓ Command copied to clipboard!${RESET}"
  else
    echo -e "${YELLOW}⚠️ pbcopy not available - copy the command manually${RESET}"
  fi
fi
# ---- Follow-up commands -----------------------------------------------
# For remote models, print a download command first. The command itself
# is identical either way; only the comment label reflects the hf-xet
# choice.
if [[ "$IS_LOCAL" == false ]]; then
  echo -e "\n${BOLD}${CYAN}Optional: Download model first (if needed):${RESET}"
  if [[ "$USE_HF_XET" == [yY] ]]; then
    download_label="# With hf-xet (10x faster):"
  else
    download_label="# Standard download:"
  fi
  echo -e "$download_label"
  echo -e "uv run huggingface-cli download ${HF_PATH} --local-dir ./downloads/${HF_PATH##*/}"
fi

# Quick generation smoke test against the converted model.
echo -e "\n${BOLD}${BLUE}After conversion, test with:${RESET}"
echo -e "uv run mlx_lm.generate --model ${MLX_PATH} --prompt \"Hello, I am\" --max-tokens 50"

# Dragon-only extras: peak-memory monitoring and an optional Metal cap.
if [[ "$IS_M3_ULTRA" == true ]]; then
  echo -e "\n${BOLD}${MAGENTA}Monitor Dragon performance:${RESET}"
  echo -e "uv run python -c \"import mlx.core as mx; print(f'Peak: {mx.metal.get_peak_memory()/1e9:.2f}GB of ${TOTAL_MEMORY_GB}GB')\""
  echo -e "\n${BOLD}${CYAN}Pro tip for large models:${RESET}"
  echo -e "# Set memory limit before conversion (optional):"
  echo -e "export MLX_METAL_MEMORY_LIMIT=$((TOTAL_MEMORY_GB * 95 / 100))GB"
fi

# Benchmark suggestion and closing banner.
echo -e "\n${BOLD}${CYAN}Benchmark the converted model:${RESET}"
echo -e "uv run mlx_lm.generate --model ${MLX_PATH} --prompt \"The\" --max-tokens 100 --verbose"

echo -e "\n${BOLD}${BLUE}=====================================${RESET}"
echo -e "${BOLD}${GREEN}✨ Conversion setup complete!${RESET}"
[[ "$IS_M3_ULTRA" != true ]] || echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra ready to roar!${RESET}"
echo -e "${BOLD}${BLUE}=====================================${RESET}"