velocity-ai
/

phi-3.5-address-validation-pretrained

+import os
+import json
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import logging
+logger = logging.getLogger(__name__)
+# Test CUDA device availability and names with:
+# python -c "import torch; print('\n'.join([f'{i}: {torch.cuda.get_device_name(i)}' for i in range(torch.cuda.device_count())]))"
+# Can specify GPU device with:
+# CUDA_VISIBLE_DEVICES="1" python script.py
def model_fn(model_dir):
    """Load the tokenizer and sequence-classification model for inference.

    The checkpoint location is read from the ``HF_MODEL_ID`` environment
    variable; when that variable is unset or empty, ``model_dir`` is used
    instead (previously ``None`` was handed to ``from_pretrained``).

    Args:
        model_dir: Path of the model artifacts directory. Only used as the
            checkpoint location when ``HF_MODEL_ID`` is not set.

    Returns:
        dict: ``{"model": ..., "tokenizer": ..., "device": ...}`` where the
        model is in eval mode on ``device`` (``cuda:0`` when CUDA is
        available, otherwise ``cpu``).

    Raises:
        Exception: Any error raised while loading the tokenizer or model is
            logged with its traceback and re-raised unchanged.
    """
    model_id = os.getenv("HF_MODEL_ID") or model_dir

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    on_gpu = device.type == 'cuda'
    if on_gpu:
        torch.cuda.set_device(device)
        torch.cuda.empty_cache()
    logger.info(f"Using device: {device}")

    try:
        # NOTE(review): trust_remote_code=True executes Python code shipped
        # with the checkpoint. Confirm the source is trusted (and ideally
        # pinned to a revision) before deploying.
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_id,
            num_labels=2,
            # bfloat16 on GPU, full precision on CPU.
            torch_dtype=torch.bfloat16 if on_gpu else torch.float32,
            trust_remote_code=True,
            device_map=None,
        )

        # A single .to() moves every parameter and buffer. The process-wide
        # default tensor type is intentionally left alone: changing it is
        # deprecated and affects all code in the process, not just this model.
        model = model.to(device)
        model.eval()

        if on_gpu:
            torch.backends.cudnn.benchmark = True

        logger.info(f"Model loaded successfully on {device}")
        logger.info(f"Model device map: {getattr(model, 'hf_device_map', 'N/A')}")
        logger.info(f"Default dtype: {torch.get_default_dtype()}")

        # Sanity check: report any parameter or buffer (including those owned
        # directly by the root module) that did not end up on the target device.
        misplaced = [
            f"{name} on {tensor.device}"
            for name, tensor in [*model.named_parameters(), *model.named_buffers()]
            if tensor.device != device
        ]
        if misplaced:
            logger.warning("Found model components on wrong device:")
            for issue in misplaced:
                logger.warning(issue)

        return {
            "model": model,
            "tokenizer": tokenizer,
            "device": device,
        }
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Error loading model: {str(e)}")
        raise
def predict_fn(data, model_dict):
    """Run the classifier on one request and return class probabilities.

    Args:
        data: Request payload. A ``str`` is used as-is; ``bytes`` /
            ``bytearray`` are decoded as UTF-8; a ``dict`` contributes its
            ``"inputs"`` value, else its ``"text"`` value, else ``str(data)``;
            any other object is converted with ``str()``.
        model_dict: Mapping with the keys ``"model"``, ``"tokenizer"`` and
            ``"device"``.

    Returns:
        numpy.ndarray: float32 softmax of the model's logits along axis 1
        (presumably shaped ``(batch, num_labels)`` — confirm against the
        loaded model).

    Raises:
        Exception: Any failure is logged (with device diagnostics when
            available) and re-raised unchanged.
    """
    # Pre-bind so the error handler can never fail on an unbound name and
    # hide the original exception.
    model = None
    inputs = {}
    try:
        logger.info("Starting prediction")
        model = model_dict["model"]
        tokenizer = model_dict["tokenizer"]
        device = model_dict["device"]
        logger.info(f"Model is on device: {device}")

        # Parse input
        if isinstance(data, str):
            input_text = data
        elif isinstance(data, (bytes, bytearray)):
            # str(b"...") would yield the literal "b'...'" text.
            input_text = bytes(data).decode("utf-8", errors="replace")
        elif isinstance(data, dict):
            input_text = data.get("inputs", data.get("text", str(data)))
        else:
            input_text = str(data)
        logger.debug(f"Parsed input text: {input_text}")

        encoded = tokenizer(
            input_text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Move to the exact device the model lives on. The global default
        # tensor type is not touched: it is process-wide state and was
        # previously left switched to CUDA whenever a request failed.
        inputs = {k: v.to(device) for k, v in encoded.items()}
        logger.debug(f"Inputs moved to device: {device}")
        for k, v in inputs.items():
            logger.debug(f"Input '{k}' - Device: {v.device}, Shape: {v.shape}, Dtype: {v.dtype}")

        logger.info("Generating prediction")
        with torch.no_grad():
            outputs = model(**inputs)
            # Cast to float32 first: NumPy has no bfloat16, so calling
            # .numpy() on bfloat16 probabilities raises TypeError.
            predictions = torch.softmax(outputs.logits.float(), dim=1)

        return predictions.cpu().numpy()
    except Exception as e:
        logger.error(f"Error during prediction: {str(e)}")
        if model is not None:
            first_param = next(model.parameters(), None)
            model_device = first_param.device if first_param is not None else 'N/A'
            logger.error(f"Model device: {model_device}")
        logger.error(f"Input devices: {[f'{k}: {v.device}' for k, v in inputs.items()]}")
        raise
def input_fn(request_body, request_content_type):
    """Deserialize the request body.

    Args:
        request_body: Raw request payload (``str`` or ``bytes``).
        request_content_type: Content type of the request. Matching against
            ``application/json`` ignores case and any parameters such as
            ``; charset=utf-8``.

    Returns:
        The decoded JSON value when the content type is JSON and the body
        parses; otherwise ``request_body`` unchanged.
    """
    media_type = str(request_content_type or "").split(";", 1)[0].strip().lower()
    if media_type != "application/json":
        # For non-JSON content, treat as raw text
        return request_body

    try:
        return json.loads(request_body)
    except (TypeError, ValueError):
        # Deliberate best effort: a body that is not valid JSON is passed on
        # as raw text. json.JSONDecodeError and UnicodeDecodeError are both
        # ValueError subclasses; KeyboardInterrupt/SystemExit now propagate.
        return request_body
def output_fn(prediction, response_content_type):
    """Serialize a prediction as a JSON string.

    Args:
        prediction: Object exposing ``tolist()`` (e.g. a NumPy array), or a
            value ``json.dumps`` can serialize directly (e.g. a nested list).
        response_content_type: Requested response content type. Only
            ``application/json`` is supported; case and parameters such as
            ``; charset=utf-8`` are ignored when matching.

    Returns:
        str: JSON text of the prediction.

    Raises:
        ValueError: If the requested content type is not JSON.
    """
    media_type = str(response_content_type or "").split(";", 1)[0].strip().lower()
    if media_type != "application/json":
        raise ValueError(f"Unsupported content type: {response_content_type}")

    # Arrays/tensors are converted via tolist(); plain Python containers are
    # serialized as they are instead of failing with AttributeError.
    to_list = getattr(prediction, "tolist", None)
    payload = to_list() if callable(to_list) else prediction
    return json.dumps(payload)