#!/bin/bash
# MLX Server Launcher for Dragon M3 Ultra
# Created: January 2025 for MLX 0.26+
# Supports local/remote models with full parameter control
# Text formatting
BOLD="\033[1m"
BLUE="\033[34m"
GREEN="\033[32m"
YELLOW="\033[33m"
RED="\033[31m"
CYAN="\033[36m"
MAGENTA="\033[35m"
RESET="\033[0m"
# Detect system specs
TOTAL_MEMORY=$(sysctl -n hw.memsize 2>/dev/null || echo 0)
TOTAL_MEMORY_GB=$((TOTAL_MEMORY / 1073741824))
CPU_BRAND=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")
# Check if running on M3 Ultra
if [[ "$CPU_BRAND" == *"M3 Ultra"* ]] || [[ "$TOTAL_MEMORY_GB" -ge 400 ]]; then
IS_M3_ULTRA=true
echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra detected! (${TOTAL_MEMORY_GB}GB RAM)${RESET}"
else
IS_M3_ULTRA=false
fi
echo -e "${BOLD}${BLUE}=====================================${RESET}"
echo -e "${BOLD}${BLUE} MLX Server Launcher v1.0 ${RESET}"
echo -e "${BOLD}${BLUE}=====================================${RESET}"
echo -e "Launch MLX model server with custom parameters\n"
# Default values
DEFAULT_MODEL="/Users/polyversai/.lmstudio/models/LibraxisAI/c4ai-command-a-03-2025-q5-mlx"
DEFAULT_HOST="0.0.0.0"
DEFAULT_PORT="12345"
DEFAULT_TEMP="0.7"
DEFAULT_TOP_P="0.95"
DEFAULT_TOP_K="0"
DEFAULT_MIN_P="0.0"
DEFAULT_MAX_TOKENS="2048"
DEFAULT_LOG_LEVEL="INFO"
# Get model path
echo -e "${BOLD}Model path (local or HF repo):${RESET}"
echo -e "(Default: ${DEFAULT_MODEL})"
echo -e "${CYAN}Examples:${RESET}"
echo -e " Local: /Users/polyversai/.lmstudio/models/mlx-community/model-name"
echo -e " HF: mlx-community/Llama-3.2-3B-Instruct-4bit"
read -p "> " MODEL_PATH
MODEL_PATH=${MODEL_PATH:-$DEFAULT_MODEL}
# Check if it's a local path
if [[ -d "$MODEL_PATH" ]]; then
echo -e "${GREEN}✓ Local model detected: ${MODEL_PATH}${RESET}"
else
echo -e "${GREEN}✓ Remote model specified: ${MODEL_PATH}${RESET}"
fi
# Network configuration
echo -e "\n${BOLD}Host IP address:${RESET}"
echo -e "(Default: ${DEFAULT_HOST} - accessible from network)"
echo -e "Use 127.0.0.1 for localhost only"
read -p "> " HOST
HOST=${HOST:-$DEFAULT_HOST}
echo -e "\n${BOLD}Port number:${RESET}"
echo -e "(Default: ${DEFAULT_PORT})"
read -p "> " PORT
PORT=${PORT:-$DEFAULT_PORT}
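# Optional sanity check (a sketch; assumes lsof is available, as on stock macOS):
# warn if something is already listening on the chosen TCP port.
if command -v lsof >/dev/null 2>&1 && lsof -nP -iTCP:"${PORT}" -sTCP:LISTEN >/dev/null 2>&1; then
echo -e "${YELLOW}⚠ Port ${PORT} already appears to be in use${RESET}"
fi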
# Sampling parameters
echo -e "\n${BOLD}${CYAN}=== Sampling Parameters ===${RESET}"
echo -e "\n${BOLD}Temperature (creativity):${RESET}"
echo -e "Range: 0.0-2.0 (Default: ${DEFAULT_TEMP})"
echo -e "${YELLOW}0.0 = deterministic, 1.0 = balanced, 2.0 = very creative${RESET}"
read -p "> " TEMP
TEMP=${TEMP:-$DEFAULT_TEMP}
echo -e "\n${BOLD}Top-p (nucleus sampling):${RESET}"
echo -e "Range: 0.0-1.0 (Default: ${DEFAULT_TOP_P})"
echo -e "${YELLOW}Lower = more focused, Higher = more diverse${RESET}"
read -p "> " TOP_P
TOP_P=${TOP_P:-$DEFAULT_TOP_P}
echo -e "\n${BOLD}Top-k (vocabulary limit):${RESET}"
echo -e "Default: ${DEFAULT_TOP_K} (0 = disabled)"
echo -e "${YELLOW}Limits selection to top K tokens${RESET}"
read -p "> " TOP_K
TOP_K=${TOP_K:-$DEFAULT_TOP_K}
echo -e "\n${BOLD}Min-p (minimum probability):${RESET}"
echo -e "Range: 0.0-1.0 (Default: ${DEFAULT_MIN_P})"
echo -e "${YELLOW}0.0 = disabled, higher = filter low probability tokens${RESET}"
read -p "> " MIN_P
MIN_P=${MIN_P:-$DEFAULT_MIN_P}
echo -e "\n${BOLD}Max tokens per response:${RESET}"
echo -e "(Default: ${DEFAULT_MAX_TOKENS})"
if [[ "$IS_M3_ULTRA" == true ]]; then
echo -e "${MAGENTA}Dragon can handle 8192+ tokens easily${RESET}"
fi
read -p "> " MAX_TOKENS
MAX_TOKENS=${MAX_TOKENS:-$DEFAULT_MAX_TOKENS}
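# Optional guard (sketch): fall back to the default if the value is not a plain integer.
if ! [[ "$MAX_TOKENS" =~ ^[0-9]+$ ]]; then
echo -e "${YELLOW}⚠ '${MAX_TOKENS}' is not a number, using ${DEFAULT_MAX_TOKENS}${RESET}"
MAX_TOKENS=$DEFAULT_MAX_TOKENS
fi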
# Optional adapter
echo -e "\n${BOLD}LoRA adapter path (optional):${RESET}"
echo -e "(Leave empty if not using adapters)"
read -p "> " ADAPTER_PATH
if [[ -n "$ADAPTER_PATH" ]]; then
ADAPTER_OPTION="--adapter-path ${ADAPTER_PATH}"
else
ADAPTER_OPTION=""
fi
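# Optional check (sketch): warn if a local adapter path was given but does not exist.
if [[ -n "$ADAPTER_PATH" && ! -d "$ADAPTER_PATH" ]]; then
echo -e "${YELLOW}⚠ Adapter path not found locally: ${ADAPTER_PATH}${RESET}"
fi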
# Chat template args
echo -e "\n${BOLD}Chat template args (optional JSON):${RESET}"
echo -e "Example: {\"enable_thinking\":false}"
echo -e "(Leave empty for defaults)"
read -p "> " CHAT_TEMPLATE_ARGS
if [[ -n "$CHAT_TEMPLATE_ARGS" ]]; then
# Single quotes keep the JSON's double quotes intact through the later eval
CHAT_TEMPLATE_OPTION="--chat-template-args '${CHAT_TEMPLATE_ARGS}'"
else
CHAT_TEMPLATE_OPTION=""
fi
# Log level
echo -e "\n${BOLD}Log level:${RESET}"
echo -e "(Default: ${DEFAULT_LOG_LEVEL}, Options: DEBUG, INFO, WARNING, ERROR, CRITICAL)"
read -p "> " LOG_LEVEL
LOG_LEVEL=${LOG_LEVEL:-$DEFAULT_LOG_LEVEL}
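# Normalize to uppercase (sketch) so "info" and "INFO" are treated the same.
LOG_LEVEL=$(echo "$LOG_LEVEL" | tr '[:lower:]' '[:upper:]')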
# Build the command
SERVER_CMD="uv run mlx_lm.server --model ${MODEL_PATH} --host ${HOST} --port ${PORT} --temp ${TEMP} --top-p ${TOP_P} --top-k ${TOP_K} --min-p ${MIN_P} --max-tokens ${MAX_TOKENS} --log-level ${LOG_LEVEL} ${ADAPTER_OPTION} ${CHAT_TEMPLATE_OPTION}"
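# With the defaults above, the assembled command looks roughly like:
#   uv run mlx_lm.server --model <model-path> --host 0.0.0.0 --port 12345 \
#     --temp 0.7 --top-p 0.95 --top-k 0 --min-p 0.0 --max-tokens 2048 --log-level INFO
# Note: the string is executed later via eval / bash -c, so a model path containing
# spaces would need extra quoting.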
# Print preview
echo -e "\n${BOLD}${YELLOW}Command Preview:${RESET}"
echo -e "$SERVER_CMD"
# Launch mode selection
echo -e "\n${BOLD}${GREEN}Launch mode:${RESET}"
echo -e "1. ${YELLOW}Foreground${RESET} - See logs in terminal (Ctrl+C to stop)"
echo -e "2. ${YELLOW}Background with logging${RESET} - Logs to mlx-server.log"
echo -e "3. ${YELLOW}Background detached${RESET} - Run with nohup"
echo -e "4. ${YELLOW}Just copy command${RESET} - Don't launch"
read -p "> " LAUNCH_MODE
# Create logs directory if needed
if [[ "$LAUNCH_MODE" == "2" || "$LAUNCH_MODE" == "3" ]]; then
mkdir -p logs
LOG_FILE="logs/mlx-server-$(date +%Y%m%d-%H%M%S).log"
fi
case "$LAUNCH_MODE" in
1)
echo -e "\n${BOLD}${GREEN}Starting server in foreground...${RESET}"
echo -e "${YELLOW}Press Ctrl+C to stop${RESET}\n"
eval "$SERVER_CMD"
;;
2)
echo -e "\n${BOLD}${GREEN}Starting server in background...${RESET}"
echo -e "Logs: ${LOG_FILE}"
eval "$SERVER_CMD" > "${LOG_FILE}" 2>&1 &
SERVER_PID=$!
echo -e "${GREEN}✓ Server started with PID: ${SERVER_PID}${RESET}"
echo -e "\nTo monitor: tail -f ${LOG_FILE}"
echo -e "To stop: kill ${SERVER_PID}"
# Save PID for easy stopping
echo $SERVER_PID > logs/mlx-server.pid
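# The PID file lets you stop the server later without remembering the PID,
# e.g.: kill "$(cat logs/mlx-server.pid)"
echo -e "Or later: kill \$(cat logs/mlx-server.pid)"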
;;
3)
echo -e "\n${BOLD}${GREEN}Starting server with nohup...${RESET}"
echo -e "Logs: ${LOG_FILE}"
nohup bash -c "$SERVER_CMD" > "${LOG_FILE}" 2>&1 &
SERVER_PID=$!
echo -e "${GREEN}✓ Server started with PID: ${SERVER_PID}${RESET}"
echo -e "\nTo monitor: tail -f ${LOG_FILE}"
echo -e "To stop: kill ${SERVER_PID}"
# Save PID
echo $SERVER_PID > logs/mlx-server.pid
;;
4)
echo -e "\n${BOLD}${GREEN}Command copied to clipboard!${RESET}"
echo "$SERVER_CMD" | pbcopy
;;
*)
echo -e "\n${RED}Invalid choice. Exiting.${RESET}"
exit 1
;;
esac
# Print API examples
if [[ "$LAUNCH_MODE" != "4" ]]; then
echo -e "\n${BOLD}${BLUE}=== API Usage Examples ===${RESET}"
echo -e "\n${CYAN}1. Chat completion:${RESET}"
echo -e "curl http://${HOST}:${PORT}/v1/chat/completions \\"
echo -e " -H \"Content-Type: application/json\" \\"
echo -e " -d '{"
echo -e " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
echo -e " \"temperature\": ${TEMP},"
echo -e " \"max_tokens\": 100"
echo -e " }'"
echo -e "\n${CYAN}2. Check models:${RESET}"
echo -e "curl http://${HOST}:${PORT}/v1/models"
echo -e "\n${CYAN}3. Health check:${RESET}"
echo -e "curl http://${HOST}:${PORT}/health"
if [[ "$IS_M3_ULTRA" == true ]]; then
echo -e "\n${BOLD}${MAGENTA}Dragon Performance Monitoring:${RESET}"
echo -e "# In another terminal:"
echo -e "watch -n 1 'curl -s http://${HOST}:${PORT}/health | jq .'"
fi
fi
echo -e "\n${BOLD}${BLUE}=====================================${RESET}"
echo -e "${BOLD}${GREEN}✨ MLX Server ready!${RESET}"
if [[ "$IS_M3_ULTRA" == true ]]; then
echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra serving at full power!${RESET}"
fi
echo -e "${BOLD}${BLUE}=====================================${RESET}"