File size: 11,622 Bytes
2d31fd4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
#!/bin/bash
# MLX Model Conversion Utility for Dragon M3 Ultra
# Updated: January 2025 for MLX 0.26+ and modern uv workflow
# Supports Q5 quantization and M3 Ultra optimizations

# ANSI formatting codes, interpreted later via `echo -e`.
# readonly: these are constants used throughout the script.
readonly BOLD="\033[1m"
readonly BLUE="\033[34m"
readonly GREEN="\033[32m"
readonly YELLOW="\033[33m"
readonly RED="\033[31m"
readonly CYAN="\033[36m"
readonly MAGENTA="\033[35m"
readonly RESET="\033[0m"

# Detect system specs. sysctl is macOS-specific, so fall back to safe
# defaults when it fails; also guard against an empty (but successful)
# result so the arithmetic below never sees an empty operand.
TOTAL_MEMORY=$(sysctl -n hw.memsize 2>/dev/null || echo 0)
TOTAL_MEMORY=${TOTAL_MEMORY:-0}
TOTAL_MEMORY_GB=$((TOTAL_MEMORY / 1073741824))
CPU_BRAND=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")

# Treat either an explicit "M3 Ultra" brand string or a very large RAM
# machine (>= 400GB) as the Dragon M3 Ultra target.
if [[ "$CPU_BRAND" == *"M3 Ultra"* ]] || [[ "$TOTAL_MEMORY_GB" -ge 400 ]]; then
    IS_M3_ULTRA=true
    echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra detected! (${TOTAL_MEMORY_GB}GB RAM)${RESET}"
else
    IS_M3_ULTRA=false
fi

# Script banner.
BANNER_LINE="${BOLD}${BLUE}=====================================${RESET}"
printf '%b\n' "$BANNER_LINE"
printf '%b\n' "${BOLD}${BLUE}   MLX Model Conversion Utility v2.0 ${RESET}"
printf '%b\n' "$BANNER_LINE"
printf '%b\n' "Updated for MLX 0.26+ with Q5 support and M3 Ultra optimizations\n"

# Prompt defaults; Q5 is the default bit width for its quality/size trade-off.
DEFAULT_HF_PATH="meta-llama/Llama-3.1-405B"
DEFAULT_OUTPUT_DIR="models/Llama-3.1-405B-MLX-Q5"
DEFAULT_QUANTIZE="y"
DEFAULT_BITS="5"
DEFAULT_GROUP_SIZE="64"
DEFAULT_DTYPE="float16"

# On the Dragon, tune hf-xet downloads: big chunk cache + high concurrency.
if [[ "$IS_M3_ULTRA" == true ]]; then
    export HF_XET_HIGH_PERFORMANCE_MODE=1
    export HF_XET_CHUNK_CACHE_SIZE_BYTES=107374182400  # 100GB cache
    export HF_XET_CONCURRENT_DOWNLOADS=32
    printf '%b\n' "${CYAN}✓ hf-xet optimizations enabled for Dragon${RESET}"
fi

# Prompt for the source model: either a Hugging Face repo id or a local
# directory that already contains the model files.
echo -e "${BOLD}Hugging Face model path or local directory:${RESET}"
echo -e "(Default: ${DEFAULT_HF_PATH})"
echo -e "${CYAN}Examples:${RESET}"
echo -e "  HF repo: meta-llama/Llama-3.1-405B"
echo -e "  Local: /Users/polyversai/.lmstudio/models/mlx-community/model-name"
read -r -p "> " HF_PATH  # -r: keep backslashes literal in pasted paths (SC2162)
HF_PATH=${HF_PATH:-$DEFAULT_HF_PATH}

# An existing directory means the model is local and needs no download.
if [[ -d "$HF_PATH" ]]; then
    echo -e "${GREEN}✓ Local model detected: ${HF_PATH}${RESET}"
    IS_LOCAL=true
else
    IS_LOCAL=false
    # Remote model: offer hf-xet accelerated downloads.
    echo -e "\n${BOLD}Use hf-xet for faster download? [y/n]${RESET}"
    echo -e "(10x faster downloads with chunk deduplication)"
    echo -e "Default: y"
    read -r -p "> " USE_HF_XET
    USE_HF_XET=${USE_HF_XET:-y}

    if [[ "$USE_HF_XET" == "y" || "$USE_HF_XET" == "Y" ]]; then
        # Probe for the hf_xet Python package; only suggest install when missing.
        if ! uv run python -c "import hf_xet" 2>/dev/null; then
            echo -e "${YELLOW}⚠️  hf-xet not installed. Installing...${RESET}"
            echo -e "Run: uv add 'huggingface_hub[hf_xet]'"
            echo -e "${CYAN}Note: hf-xet only works with Xet-backed repos${RESET}"
        else
            echo -e "${GREEN}✓ hf-xet enabled for download${RESET}"
        fi
    fi
fi

# Get output directory for the converted MLX model.
echo -e "\n${BOLD}Output MLX model directory:${RESET}"
echo -e "(Default: ${DEFAULT_OUTPUT_DIR})"
read -r -p "> " MLX_PATH  # -r: keep backslashes literal (SC2162)
MLX_PATH=${MLX_PATH:-$DEFAULT_OUTPUT_DIR}

# Ask about the unquantized weight data type.
echo -e "\n${BOLD}Model data type:${RESET}"
echo -e "(Default: ${DEFAULT_DTYPE}, Options: float16, bfloat16, float32)"
read -r -p "> " DTYPE
DTYPE=${DTYPE:-$DEFAULT_DTYPE}

# Ask whether to quantize at all; details are collected below if yes.
echo -e "\n${BOLD}Quantize the model? [y/n]${RESET}"
echo -e "(Default: ${DEFAULT_QUANTIZE})"
read -r -p "> " QUANTIZE
QUANTIZE=${QUANTIZE:-$DEFAULT_QUANTIZE}

# If quantizing, collect bit width, group size, and mixing strategy;
# the answers become mlx_lm.convert flags in QUANT_OPTIONS.
if [[ "$QUANTIZE" == "y" || "$QUANTIZE" == "Y" ]]; then
  echo -e "\n${BOLD}Quantization bits:${RESET}"
  echo -e "${CYAN}Options:${RESET}"
  echo -e "  2 - Extreme compression (lowest quality)"
  echo -e "  3 - High compression"
  echo -e "  4 - Standard compression (good balance)"
  echo -e "  ${GREEN}5 - Recommended (best quality/size ratio)${RESET}"
  echo -e "  6 - Low compression"
  echo -e "  8 - Minimal compression (highest quality)"
  echo -e "(Default: ${DEFAULT_BITS})"
  read -r -p "> " BITS  # -r on every read: don't interpret backslashes (SC2162)
  BITS=${BITS:-$DEFAULT_BITS}

  echo -e "\n${BOLD}Group size:${RESET}"
  echo -e "(Default: ${DEFAULT_GROUP_SIZE}, Options: 32, 64, 128)"
  if [[ "$IS_M3_ULTRA" == true ]]; then
    echo -e "${CYAN}💡 M3 Ultra tip: Use 64 or 128 for better performance${RESET}"
  fi
  read -r -p "> " GROUP_SIZE
  GROUP_SIZE=${GROUP_SIZE:-$DEFAULT_GROUP_SIZE}

  echo -e "\n${BOLD}Quantization strategy:${RESET}"
  echo -e "${CYAN}Options:${RESET}"
  echo -e "  none - Uniform quantization (default)"
  echo -e "  mixed_2_6 - Mix of 2 and 6 bit"
  echo -e "  ${GREEN}mixed_3_4 - Mix of 3 and 4 bit${RESET}"
  echo -e "  mixed_3_6 - Mix of 3 and 6 bit"
  echo -e "  mixed_4_6 - Mix of 4 and 6 bit"
  echo -e "Leave empty for uniform quantization"
  read -r -p "> " QUANT_PREDICATE

  # Base converter flags; --quant-predicate only when a mix was requested.
  QUANT_OPTIONS="-q --q-bits ${BITS} --q-group-size ${GROUP_SIZE}"

  if [[ -n "$QUANT_PREDICATE" ]]; then
    QUANT_OPTIONS="${QUANT_OPTIONS} --quant-predicate ${QUANT_PREDICATE}"
  fi
else
  QUANT_OPTIONS=""
fi

# MLX sizes unified memory automatically, so no extra flags are needed on
# any machine; the variable is kept (always empty) for future use.  The
# original if/else set "" in both branches — collapsed here.
M3_ULTRA_FLAGS=""
if [[ "$IS_M3_ULTRA" == true ]]; then
  echo -e "\n${BOLD}${MAGENTA}M3 Ultra optimization note:${RESET}"
  echo -e "${CYAN}MLX will automatically optimize for your 512GB system${RESET}"
  echo -e "${CYAN}The framework uses unified memory efficiently${RESET}"
fi

# Optionally push the converted model to the Hugging Face Hub.
echo -e "\n${BOLD}Upload to Hugging Face Hub? (optional):${RESET}"
echo -e "(Leave empty to skip upload)"
read -r -p "> " UPLOAD_REPO  # -r: keep backslashes literal (SC2162)

if [[ -n "$UPLOAD_REPO" ]]; then
  UPLOAD_OPTION="--upload-repo ${UPLOAD_REPO}"
else
  UPLOAD_OPTION=""
fi

# Assemble the three equivalent invocations (uv / direct / python -m).
# They differ only in launcher; the converter arguments are shared.
CONVERT_ARGS="--hf-path ${HF_PATH} --mlx-path ${MLX_PATH} --dtype ${DTYPE} ${QUANT_OPTIONS} ${UPLOAD_OPTION}"

UV_CMD="uv run mlx_lm.convert ${CONVERT_ARGS}"
DIRECT_CMD="mlx_lm.convert ${CONVERT_ARGS}"
PYTHON_CMD="python -m mlx_lm.convert ${CONVERT_ARGS}"

# Show the recommended (uv) form before asking for a choice.
printf '%b\n' "\n${BOLD}${YELLOW}Command Preview:${RESET}"
printf '%b\n' "$UV_CMD"

# Report expected output size and quality for the chosen settings.
# Sizes are rough estimates scaled from a ~500GB fp16 405B model:
# quantized size ≈ MODEL_SIZE_GB * bits / 16.
# Globals read:  QUANTIZE, BITS, DTYPE, QUANT_PREDICATE, IS_M3_ULTRA
# Globals set:   MODEL_SIZE_GB, EXPECTED_SIZE
print_expected_outcomes() {
  echo -e "\n${BOLD}${YELLOW}Expected outcomes:${RESET}"
  if [[ "$QUANTIZE" == "y" || "$QUANTIZE" == "Y" ]]; then
    MODEL_SIZE_GB=500  # Approximate for 405B model

    case "$BITS" in
      2)
        EXPECTED_SIZE=$((MODEL_SIZE_GB / 8))
        echo -e "- ${GREEN}Q2: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
        echo -e "- ${YELLOW}⚠️  Significant quality loss expected${RESET}"
        ;;
      3)
        EXPECTED_SIZE=$((MODEL_SIZE_GB * 3 / 16))
        echo -e "- ${GREEN}Q3: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
        echo -e "- ${YELLOW}Moderate quality loss${RESET}"
        ;;
      4)
        EXPECTED_SIZE=$((MODEL_SIZE_GB / 4))
        echo -e "- ${GREEN}Q4: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
        echo -e "- ${GREEN}Good balance of quality and size${RESET}"
        ;;
      5)
        EXPECTED_SIZE=$((MODEL_SIZE_GB * 5 / 16))
        echo -e "- ${GREEN}Q5: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
        echo -e "- ${GREEN}✨ Excellent quality/size ratio${RESET}"
        ;;
      6)
        EXPECTED_SIZE=$((MODEL_SIZE_GB * 6 / 16))
        echo -e "- ${GREEN}Q6: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
        echo -e "- ${GREEN}High quality preservation${RESET}"
        ;;
      8)
        EXPECTED_SIZE=$((MODEL_SIZE_GB / 2))
        echo -e "- ${GREEN}Q8: ~${EXPECTED_SIZE}GB (from ~${MODEL_SIZE_GB}GB)${RESET}"
        echo -e "- ${GREEN}Near-lossless quality${RESET}"
        ;;
      *)
        # Previously unhandled: an off-menu bit width printed nothing and
        # left EXPECTED_SIZE unset for the memory estimate below.  Fall
        # back to the generic bits/16 formula (guard against non-numeric
        # input, which would crash bash arithmetic).
        if [[ "$BITS" =~ ^[0-9]+$ ]]; then
          EXPECTED_SIZE=$((MODEL_SIZE_GB * BITS / 16))
        else
          EXPECTED_SIZE=$MODEL_SIZE_GB
        fi
        echo -e "- ${YELLOW}⚠️  Unsupported bit width '${BITS}': ~${EXPECTED_SIZE}GB estimated${RESET}"
        ;;
    esac

    if [[ -n "$QUANT_PREDICATE" ]]; then
      echo -e "- ${CYAN}Using mixed precision: ${QUANT_PREDICATE}${RESET}"
    fi

    if [[ "$IS_M3_ULTRA" == true ]]; then
      echo -e "- ${MAGENTA}Expected memory usage: ${EXPECTED_SIZE}-$((EXPECTED_SIZE * 2))GB peak${RESET}"
      echo -e "- ${MAGENTA}M3 Ultra can handle this comfortably${RESET}"
    else
      echo -e "- ${YELLOW}Expected memory usage: High - monitor closely${RESET}"
    fi
  else
    echo -e "- ${GREEN}No quantization - model remains in ${DTYPE} format${RESET}"
    echo -e "- ${YELLOW}Very high memory requirements (400-500GB)${RESET}"
  fi
}

print_expected_outcomes

echo -e "- ${CYAN}Expected conversion time: 2-6 hours${RESET}"

# Let the user pick an invocation style; the default (or any
# unrecognized answer) is the recommended uv form.
echo -e "\n${BOLD}${GREEN}Choose command format:${RESET}"
echo -e "1. ${YELLOW}UV (recommended): ${RESET}${UV_CMD}"
echo -e "2. ${YELLOW}Direct command: ${RESET}${DIRECT_CMD}"
echo -e "3. ${YELLOW}Python module: ${RESET}${PYTHON_CMD}"
read -r -p "> " FORMAT_CHOICE  # -r: keep backslashes literal (SC2162)

case "$FORMAT_CHOICE" in
  2)
    FINAL_CMD="${DIRECT_CMD}"
    ;;
  3)
    FINAL_CMD="${PYTHON_CMD}"
    ;;
  *)
    FINAL_CMD="${UV_CMD}"
    ;;
esac

# Hardware-specific preparation advice.
if [[ "$IS_M3_ULTRA" == true ]]; then
  echo -e "\n${BOLD}${MAGENTA}🐉 Dragon M3 Ultra Preparation:${RESET}"
  echo -e "1. ${CYAN}Your 512GB RAM can handle even 405B models${RESET}"
  echo -e "2. ${CYAN}Enable High Power Mode in Energy Saver${RESET}"
  echo -e "3. ${CYAN}Consider using Activity Monitor to track memory${RESET}"
  echo -e "4. ${CYAN}MLX will use unified memory efficiently${RESET}"
else
  echo -e "\n${BOLD}${BLUE}Preparation tips:${RESET}"
  echo -e "1. ${YELLOW}Ensure Mac is plugged in and won't sleep${RESET}"
  echo -e "2. ${YELLOW}Close other memory-intensive applications${RESET}"
  echo -e "3. ${YELLOW}Be prepared for high fan speeds${RESET}"
  echo -e "4. ${YELLOW}The process may appear to hang - this is normal${RESET}"
fi

# Print the final command.
echo -e "\n${BOLD}${RED}Your conversion command:${RESET}"
echo -e "${FINAL_CMD}"

# Offer to copy the command to the macOS clipboard.
echo -e "\n${BOLD}${GREEN}Copy command to clipboard? [y/n]${RESET}"
read -r -p "> " COPY_CMD  # -r: keep backslashes literal (SC2162)

if [[ "$COPY_CMD" == "y" || "$COPY_CMD" == "Y" ]]; then
    # printf (not echo) so no trailing newline lands in the clipboard —
    # a trailing newline would make the pasted command execute immediately.
    printf '%s' "${FINAL_CMD}" | pbcopy
    echo -e "${GREEN}✓ Command copied to clipboard!${RESET}"
fi

# For remote models, show an optional pre-download command.  The download
# invocation is identical with or without hf-xet (hf-xet activates via the
# installed package/env vars); only the leading comment differs.
if [[ "$IS_LOCAL" == false ]]; then
    echo -e "\n${BOLD}${CYAN}Optional: Download model first (if needed):${RESET}"
    if [[ "$USE_HF_XET" == "y" || "$USE_HF_XET" == "Y" ]]; then
        echo -e "# With hf-xet (10x faster):"
    else
        echo -e "# Standard download:"
    fi
    echo -e "uv run huggingface-cli download ${HF_PATH} --local-dir ./downloads/${HF_PATH##*/}"
fi

# Smoke-test command for the converted model.
echo -e "\n${BOLD}${BLUE}After conversion, test with:${RESET}"
echo -e "uv run mlx_lm.generate --model ${MLX_PATH} --prompt \"Hello, I am\" --max-tokens 50"

# Memory monitoring for M3 Ultra.
if [[ "$IS_M3_ULTRA" == true ]]; then
  echo -e "\n${BOLD}${MAGENTA}Monitor Dragon performance:${RESET}"
  # mx.get_peak_memory() replaced the deprecated mx.metal.get_peak_memory()
  # in the MLX 0.26+ releases this script targets.
  echo -e "uv run python -c \"import mlx.core as mx; print(f'Peak: {mx.get_peak_memory()/1e9:.2f}GB of ${TOTAL_MEMORY_GB}GB')\""

  echo -e "\n${BOLD}${CYAN}Pro tip for large models:${RESET}"
  echo -e "# Set memory limit before conversion (optional):"
  # NOTE(review): MLX_METAL_MEMORY_LIMIT is not a documented MLX env var —
  # confirm it, or prefer mx.set_memory_limit() from Python.
  echo -e "export MLX_METAL_MEMORY_LIMIT=$((TOTAL_MEMORY_GB * 95 / 100))GB"
fi

# Benchmark command.
echo -e "\n${BOLD}${CYAN}Benchmark the converted model:${RESET}"
echo -e "uv run mlx_lm.generate --model ${MLX_PATH} --prompt \"The\" --max-tokens 100 --verbose"

echo -e "\n${BOLD}${BLUE}=====================================${RESET}"
echo -e "${BOLD}${GREEN}✨ Conversion setup complete!${RESET}"
if [[ "$IS_M3_ULTRA" == true ]]; then
  echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra ready to roar!${RESET}"
fi
echo -e "${BOLD}${BLUE}=====================================${RESET}"