#!/bin/bash
#
# Interactive launcher for an MLX model server (`uv run mlx_lm.server`).
# Prompts for model, network and sampling settings, previews the resulting
# command and starts it in the launch mode the user selects.

# ANSI colour/style sequences. They are stored in backslash form and only
# turned into real escape characters by `echo -e` / `printf '%b'` at the
# point of use. Marked readonly because nothing may reassign them.
readonly BOLD='\033[1m'
readonly BLUE='\033[34m'
readonly GREEN='\033[32m'
readonly YELLOW='\033[33m'
readonly RED='\033[31m'
readonly CYAN='\033[36m'
readonly MAGENTA='\033[35m'
readonly RESET='\033[0m'

# --- Hardware detection ------------------------------------------------------
# Query sysctl for installed memory (bytes) and the CPU brand string. When
# sysctl is missing or does not know the key, fall back to 0 / "Unknown".
TOTAL_MEMORY=$(sysctl -n hw.memsize 2>/dev/null || echo 0)

# The value feeds shell arithmetic below, so anything that is not a plain
# decimal integer is treated as "unknown" (0) instead of aborting the script.
if [[ ! "$TOTAL_MEMORY" =~ ^[0-9]+$ ]]; then
  TOTAL_MEMORY=0
fi

# 1073741824 = 1024^3 bytes per GiB; 10# forces base 10 even with leading zeros.
TOTAL_MEMORY_GB=$((10#$TOTAL_MEMORY / 1073741824))

CPU_BRAND=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || echo "Unknown")

# The machine counts as an "M3 Ultra" when the brand string says so or when
# at least 400 GB of memory is reported. Later sections read IS_M3_ULTRA.
if [[ "$CPU_BRAND" == *"M3 Ultra"* ]] || (( TOTAL_MEMORY_GB >= 400 )); then
  IS_M3_ULTRA=true
  echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra detected! (${TOTAL_MEMORY_GB}GB RAM)${RESET}"
else
  IS_M3_ULTRA=false
fi

# --- Banner ------------------------------------------------------------------
# The horizontal rule is printed twice, so build it once.
banner_rule="${BOLD}${BLUE}=====================================${RESET}"

echo -e "${banner_rule}"
echo -e "${BOLD}${BLUE} MLX Server Launcher v1.0 ${RESET}"
echo -e "${banner_rule}"
echo -e "Launch MLX model server with custom parameters\n"

# --- Defaults ----------------------------------------------------------------
# Values used whenever the user just presses Enter at a prompt.
# NOTE: DEFAULT_MODEL is an absolute path under one specific user's home
# directory; adjust it when running this script on another machine.
readonly DEFAULT_MODEL="/Users/polyversai/.lmstudio/models/LibraxisAI/c4ai-command-a-03-2025-q5-mlx"
readonly DEFAULT_HOST="0.0.0.0"     # listen on all interfaces
readonly DEFAULT_PORT="12345"
readonly DEFAULT_TEMP="0.7"
readonly DEFAULT_TOP_P="0.95"
readonly DEFAULT_TOP_K="0"          # 0 = disabled (see the Top-k prompt)
readonly DEFAULT_MIN_P="0.0"        # 0.0 = disabled (see the Min-p prompt)
readonly DEFAULT_MAX_TOKENS="2048"
readonly DEFAULT_LOG_LEVEL="INFO"

# --- Model selection ---------------------------------------------------------
echo -e "${BOLD}Model path (local or HF repo):${RESET}"
echo -e "(Default: ${DEFAULT_MODEL})"
echo -e "${CYAN}Examples:${RESET}"
echo -e " Local: /Users/polyversai/.lmstudio/models/mlx-community/model-name"
echo -e " HF: mlx-community/Llama-3.2-3B-Instruct-4bit"

# -r keeps backslashes in the typed path literal.
read -r -p "> " MODEL_PATH
MODEL_PATH=${MODEL_PATH:-$DEFAULT_MODEL}

# `read` does not perform tilde expansion, so "~/models/x" would never match
# the directory test below. Expand a leading "~" or "~/" by hand.
case "$MODEL_PATH" in
  "~")   MODEL_PATH=$HOME ;;
  "~/"*) MODEL_PATH="$HOME/${MODEL_PATH:2}" ;;
esac

# An existing directory is a local model. Something that is written like a
# filesystem path but does not exist gets a warning instead of being silently
# labelled "remote". Everything else is taken as a remote model identifier.
# printf with %s is used so the user's text is printed verbatim.
if [[ -d "$MODEL_PATH" ]]; then
  printf '%b✓ Local model detected: %s%b\n' "$GREEN" "$MODEL_PATH" "$RESET"
elif [[ "$MODEL_PATH" == /* || "$MODEL_PATH" == ./* || "$MODEL_PATH" == ../* ]]; then
  printf '%b⚠ Local path not found: %s%b\n' "$YELLOW" "$MODEL_PATH" "$RESET" >&2
else
  printf '%b✓ Remote model specified: %s%b\n' "$GREEN" "$MODEL_PATH" "$RESET"
fi

# --- Network settings --------------------------------------------------------
echo -e "\n${BOLD}Host IP address:${RESET}"
echo -e "(Default: ${DEFAULT_HOST} - accessible from network)"
echo -e "Use 127.0.0.1 for localhost only"
read -r -p "> " HOST
HOST=${HOST:-$DEFAULT_HOST}

echo -e "\n${BOLD}Port number:${RESET}"
echo -e "(Default: ${DEFAULT_PORT})"
read -r -p "> " PORT
PORT=${PORT:-$DEFAULT_PORT}

# Accept only an integer in 1..65535. The regex runs first so that the
# arithmetic test never sees non-numeric text; 10# avoids octal parsing of
# inputs such as "08080". Anything else falls back to the default port.
port_pattern='^[0-9]{1,5}$'
if [[ "$PORT" =~ $port_pattern ]] && (( 10#$PORT >= 1 && 10#$PORT <= 65535 )); then
  PORT=$((10#$PORT))
else
  printf '%b⚠ Invalid port "%s"; using %s%b\n' \
    "$YELLOW" "$PORT" "$DEFAULT_PORT" "$RESET" >&2
  PORT=$DEFAULT_PORT
fi

# --- Sampling parameters -----------------------------------------------------

#######################################
# Prompt with "> " and store a validated number in a global variable.
# Globals:   YELLOW, RESET (read); the variable named by $1 (written)
# Arguments: $1 - name of the variable to assign
#            $2 - value used for an empty or invalid reply
#            $3 - "int" for non-negative integers, anything else for
#                 non-negative decimals (e.g. 0.7, 1, .5)
# Outputs:   a warning on stderr when the reply is not a valid number
#######################################
read_number() {
  local target=$1 fallback=$2 kind=$3 reply pattern

  if [[ "$kind" == int ]]; then
    pattern='^[0-9]+$'
  else
    pattern='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
  fi

  read -r -p "> " reply
  reply=${reply:-$fallback}

  if [[ ! "$reply" =~ $pattern ]]; then
    printf '%b⚠ "%s" is not a valid number; using %s%b\n' \
      "$YELLOW" "$reply" "$fallback" "$RESET" >&2
    reply=$fallback
  fi

  printf -v "$target" '%s' "$reply"
}

echo -e "\n${BOLD}${CYAN}=== Sampling Parameters ===${RESET}"

echo -e "\n${BOLD}Temperature (creativity):${RESET}"
echo -e "Range: 0.0-2.0 (Default: ${DEFAULT_TEMP})"
echo -e "${YELLOW}0.0 = deterministic, 1.0 = balanced, 2.0 = very creative${RESET}"
read_number TEMP "$DEFAULT_TEMP" float

echo -e "\n${BOLD}Top-p (nucleus sampling):${RESET}"
echo -e "Range: 0.0-1.0 (Default: ${DEFAULT_TOP_P})"
echo -e "${YELLOW}Lower = more focused, Higher = more diverse${RESET}"
read_number TOP_P "$DEFAULT_TOP_P" float

echo -e "\n${BOLD}Top-k (vocabulary limit):${RESET}"
echo -e "Default: ${DEFAULT_TOP_K} (0 = disabled)"
echo -e "${YELLOW}Limits selection to top K tokens${RESET}"
read_number TOP_K "$DEFAULT_TOP_K" int

echo -e "\n${BOLD}Min-p (minimum probability):${RESET}"
echo -e "Range: 0.0-1.0 (Default: ${DEFAULT_MIN_P})"
echo -e "${YELLOW}0.0 = disabled, higher = filter low probability tokens${RESET}"
read_number MIN_P "$DEFAULT_MIN_P" float

echo -e "\n${BOLD}Max tokens per response:${RESET}"
echo -e "(Default: ${DEFAULT_MAX_TOKENS})"
if [[ "$IS_M3_ULTRA" == true ]]; then
  echo -e "${MAGENTA}Dragon can handle 8192+ tokens easily${RESET}"
fi
read_number MAX_TOKENS "$DEFAULT_MAX_TOKENS" int

# --- Optional extras ---------------------------------------------------------
echo -e "\n${BOLD}LoRA adapter path (optional):${RESET}"
echo -e "(Leave empty if not using adapters)"
read -r -p "> " ADAPTER_PATH

echo -e "\n${BOLD}Chat template args (optional JSON):${RESET}"
echo -e "Example: {\"enable_thinking\":false}"
echo -e "(Leave empty for defaults)"
# -r matters here: JSON may legitimately contain backslash escapes.
read -r -p "> " CHAT_TEMPLATE_ARGS

echo -e "\n${BOLD}Log level:${RESET}"
echo -e "(Default: ${DEFAULT_LOG_LEVEL}, Options: DEBUG, INFO, WARNING, ERROR, CRITICAL)"
read -r -p "> " LOG_LEVEL
LOG_LEVEL=${LOG_LEVEL:-$DEFAULT_LOG_LEVEL}

# --- Command construction ----------------------------------------------------

# expand_tilde PATH — print PATH with a leading "~" or "~/" replaced by $HOME.
# The command is no longer re-parsed by the shell, so nothing else would
# expand a typed "~/..." path.
expand_tilde() {
  case "$1" in
    "~")   printf '%s\n' "$HOME" ;;
    "~/"*) printf '%s\n' "$HOME/${1:2}" ;;
    *)     printf '%s\n' "$1" ;;
  esac
}

# The command is kept as an argv array and executed directly. Building one
# string and passing it through eval / bash -c re-parses user input: the
# quotes inside the JSON chat-template args were stripped, values containing
# spaces were split, and shell metacharacters in any answer were executed.
SERVER_ARGS=(
  uv run mlx_lm.server
  --model "$(expand_tilde "$MODEL_PATH")"
  --host "$HOST"
  --port "$PORT"
  --temp "$TEMP"
  --top-p "$TOP_P"
  --top-k "$TOP_K"
  --min-p "$MIN_P"
  --max-tokens "$MAX_TOKENS"
  --log-level "$LOG_LEVEL"
)
if [[ -n "$ADAPTER_PATH" ]]; then
  SERVER_ARGS+=(--adapter-path "$(expand_tilde "$ADAPTER_PATH")")
fi
if [[ -n "$CHAT_TEMPLATE_ARGS" ]]; then
  SERVER_ARGS+=(--chat-template-args "$CHAT_TEMPLATE_ARGS")
fi

# Shell-quoted rendering of the array: safe to display and to paste into a
# terminal. %q leaves a trailing space after the last word; trim it.
SERVER_CMD=$(printf '%q ' "${SERVER_ARGS[@]}")
SERVER_CMD=${SERVER_CMD% }

echo -e "\n${BOLD}${YELLOW}Command Preview:${RESET}"
# printf '%s' rather than echo -e, so backslashes in the command stay literal.
printf '%s\n' "$SERVER_CMD"

# --- Launch mode -------------------------------------------------------------
echo -e "\n${BOLD}${GREEN}Launch mode:${RESET}"
echo -e "1. ${YELLOW}Foreground${RESET} - See logs in terminal (Ctrl+C to stop)"
echo -e "2. ${YELLOW}Background with logging${RESET} - Logs to logs/mlx-server-<timestamp>.log"
echo -e "3. ${YELLOW}Background detached${RESET} - Run with nohup"
echo -e "4. ${YELLOW}Just copy command${RESET} - Don't launch"
read -r -p "> " LAUNCH_MODE

# Modes 1-3 actually execute the command, so fail early when uv is missing.
case "$LAUNCH_MODE" in
  1|2|3)
    if ! command -v uv >/dev/null 2>&1; then
      echo -e "\n${RED}'uv' was not found in PATH; cannot launch the server.${RESET}" >&2
      exit 1
    fi
    ;;
esac

# Both background modes log to a timestamped file under ./logs.
if [[ "$LAUNCH_MODE" == "2" || "$LAUNCH_MODE" == "3" ]]; then
  if ! mkdir -p logs; then
    echo -e "\n${RED}Cannot create the logs directory. Exiting.${RESET}" >&2
    exit 1
  fi
  LOG_FILE="logs/mlx-server-$(date +%Y%m%d-%H%M%S).log"
fi

# announce_background PID — print follow-up hints for a background server
# and record its PID in logs/mlx-server.pid.
announce_background() {
  echo -e "${GREEN}✓ Server started with PID: ${1}${RESET}"
  echo -e "\nTo monitor: tail -f ${LOG_FILE}"
  echo -e "To stop: kill ${1}"
  printf '%s\n' "$1" > logs/mlx-server.pid
}

case "$LAUNCH_MODE" in
  1)
    echo -e "\n${BOLD}${GREEN}Starting server in foreground...${RESET}"
    echo -e "${YELLOW}Press Ctrl+C to stop${RESET}\n"
    "${SERVER_ARGS[@]}"
    ;;
  2)
    echo -e "\n${BOLD}${GREEN}Starting server in background...${RESET}"
    echo -e "Logs: ${LOG_FILE}"
    # Run directly (no eval subshell) so $! is the launched command itself.
    "${SERVER_ARGS[@]}" > "${LOG_FILE}" 2>&1 &
    SERVER_PID=$!
    announce_background "$SERVER_PID"
    ;;
  3)
    echo -e "\n${BOLD}${GREEN}Starting server with nohup...${RESET}"
    echo -e "Logs: ${LOG_FILE}"
    nohup "${SERVER_ARGS[@]}" > "${LOG_FILE}" 2>&1 &
    SERVER_PID=$!
    announce_background "$SERVER_PID"
    ;;
  4)
    # Report success only after pbcopy has really accepted the text.
    if command -v pbcopy >/dev/null 2>&1 && printf '%s\n' "$SERVER_CMD" | pbcopy; then
      echo -e "\n${BOLD}${GREEN}Command copied to clipboard!${RESET}"
    else
      echo -e "\n${YELLOW}pbcopy is unavailable; copy the command from the preview above.${RESET}" >&2
    fi
    ;;
  *)
    echo -e "\n${RED}Invalid choice. Exiting.${RESET}"
    exit 1
    ;;
esac

# --- API usage examples and closing banner -----------------------------------

# A wildcard bind address tells the server where to listen; it is not an
# address a client should connect to. Use the loopback address in examples.
case "${HOST:-}" in
  ""|"0.0.0.0") API_HOST="127.0.0.1" ;;
  "::"|"[::]")  API_HOST="[::1]" ;;
  *)            API_HOST=$HOST ;;
esac

if [[ "$LAUNCH_MODE" != "4" ]]; then
  echo -e "\n${BOLD}${BLUE}=== API Usage Examples ===${RESET}"

  echo -e "\n${CYAN}1. Chat completion:${RESET}"
  echo -e "curl http://${API_HOST}:${PORT}/v1/chat/completions \\"
  echo -e " -H \"Content-Type: application/json\" \\"
  echo -e " -d '{"
  echo -e " \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}],"
  echo -e " \"temperature\": ${TEMP},"
  echo -e " \"max_tokens\": 100"
  echo -e " }'"

  echo -e "\n${CYAN}2. Check models:${RESET}"
  echo -e "curl http://${API_HOST}:${PORT}/v1/models"

  echo -e "\n${CYAN}3. Health check:${RESET}"
  echo -e "curl http://${API_HOST}:${PORT}/health"

  if [[ "$IS_M3_ULTRA" == true ]]; then
    echo -e "\n${BOLD}${MAGENTA}Dragon Performance Monitoring:${RESET}"
    echo -e "# In another terminal:"
    echo -e "watch -n 1 'curl -s http://${API_HOST}:${PORT}/health | jq .'"
  fi
fi

echo -e "\n${BOLD}${BLUE}=====================================${RESET}"

# Only claim the server is up when that can be true for the chosen mode.
case "$LAUNCH_MODE" in
  2|3)
    # Give a command that fails at startup a moment to exit, then probe the
    # recorded PID with signal 0 (existence check, nothing is delivered).
    sleep 1
    if [[ -n "${SERVER_PID:-}" ]] && kill -0 "$SERVER_PID" 2>/dev/null; then
      echo -e "${BOLD}${GREEN}✨ MLX Server ready!${RESET}"
      if [[ "$IS_M3_ULTRA" == true ]]; then
        echo -e "${BOLD}${MAGENTA}🐉 Dragon M3 Ultra serving at full power!${RESET}"
      fi
    else
      echo -e "${BOLD}${RED}MLX Server is not running - check ${LOG_FILE:-the log file}${RESET}" >&2
    fi
    ;;
  1)
    # The foreground command has already returned by the time we get here.
    echo -e "${BOLD}${YELLOW}MLX Server stopped.${RESET}"
    ;;
  4)
    echo -e "${BOLD}${GREEN}Command prepared - server not launched.${RESET}"
    ;;
esac

echo -e "${BOLD}${BLUE}=====================================${RESET}"