Update code/inference.py
code/inference.py (+35 -62)
@@ -1,7 +1,8 @@
 import os
 import json
 import torch
-from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import torch.nn as nn
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import logging
 
 logger = logging.getLogger(__name__)
@@ -11,72 +12,55 @@ logger = logging.getLogger(__name__)
 # Can specify GPU device with:
 # CUDA_VISIBLE_DEVICES="1" python script.py
 
-def model_fn(model_dir):
+class PhiForSequenceClassification(nn.Module):
+    def __init__(self, base_model, num_labels=2):
+        super().__init__()
+        self.phi = base_model
+        self.classifier = nn.Linear(self.phi.config.hidden_size, num_labels)
+
+    def forward(self, **inputs):
+        outputs = self.phi(**inputs, output_hidden_states=True)
+        # Use the last hidden state of the last token for classification
+        last_hidden_state = outputs.hidden_states[-1][:, -1, :]
+        logits = self.classifier(last_hidden_state)
+        return type('Outputs', (), {'logits': logits})()
+
+def model_fn(model_dir, context=None):
     """Load the model for inference"""
-    model_id = os.getenv("HF_MODEL_ID")
-
-    # Set specific GPU device if available
-    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
-    if device.type == 'cuda':
-        torch.cuda.set_device(device)
-        torch.cuda.empty_cache()
-    logger.info(f"Using device: {device}")
-
     try:
+        model_id = os.getenv("HF_MODEL_ID")
+
+        # Set specific GPU device if available
+        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
+        if device.type == 'cuda':
+            torch.cuda.set_device(device)
+            torch.cuda.empty_cache()
+        logger.info(f"Using device: {device}")
+
         # Load tokenizer
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
 
-        # Load model
-        model = AutoModelForSequenceClassification.from_pretrained(
+        # Load base model
+        base_model = AutoModelForCausalLM.from_pretrained(
             model_id,
-            num_labels=2,
             torch_dtype=torch.bfloat16 if device.type == 'cuda' else torch.float32,
-            trust_remote_code=True,
-            device_map=None
+            trust_remote_code=True
         )
 
-        #
+        # Create classification model
+        model = PhiForSequenceClassification(base_model, num_labels=2)
+
+        # Move model to device
         model = model.to(device)
 
-        #
+        # Set memory optimizations
         if device.type == 'cuda':
-            torch.
-
-            for param in model.parameters():
-                param.data = param.data.to(device)
-            for buffer in model.buffers():
-                buffer.data = buffer.data.to(device)
+            torch.backends.cudnn.benchmark = True
 
         # Ensure model is in eval mode
        model.eval()
-
-        # Set memory optimizations
-        if device.type == 'cuda':
-            torch.backends.cudnn.benchmark = True
 
         logger.info(f"Model loaded successfully on {device}")
-        logger.info(f"Model device map: {model.hf_device_map if hasattr(model, 'hf_device_map') else 'N/A'}")
-        logger.info(f"Default tensor type: {torch.get_default_tensor_type()}")
-
-        # Verify all model components are on correct device
-        def verify_module_devices(module, prefix=''):
-            issues = []
-            for name, child in module.named_children():
-                child_prefix = f"{prefix}.{name}" if prefix else name
-                if hasattr(child, 'device'):
-                    if child.device != device:
-                        issues.append(f"{child_prefix} on {child.device}")
-                for param_name, param in child.named_parameters(recurse=False):
-                    if param.device != device:
-                        issues.append(f"{child_prefix}.{param_name} on {param.device}")
-                issues.extend(verify_module_devices(child, child_prefix))
-            return issues
-
-        device_issues = verify_module_devices(model)
-        if device_issues:
-            logger.warning("Found model components on wrong device:")
-            for issue in device_issues:
-                logger.warning(issue)
 
         return {
             "model": model,
@@ -97,10 +81,6 @@ def predict_fn(data, model_dict):
 
     logger.info(f"Model is on device: {device}")
 
-    # Set default tensor type for any new tensors
-    if device.type == 'cuda':
-        torch.set_default_tensor_type('torch.cuda.FloatTensor')
-
     # Parse input
     if isinstance(data, str):
         input_text = data
@@ -120,7 +100,7 @@ def predict_fn(data, model_dict):
         return_tensors='pt'
     )
 
-    # Move inputs to
+    # Move inputs to device
     if device.type == 'cuda':
         inputs = {k: v.cuda() for k, v in inputs.items()}
 
@@ -150,30 +130,23 @@ def predict_fn(data, model_dict):
         # Move predictions to CPU for numpy conversion
         predictions = predictions.cpu().numpy()
 
-        # Reset default tensor type
-        torch.set_default_tensor_type('torch.FloatTensor')
-
         return predictions
 
     except Exception as e:
         logger.error(f"Error during prediction: {str(e)}")
         logger.error(f"Model device: {next(model.parameters()).device}")
         logger.error(f"Input devices: {[f'{k}: {v.device}' for k, v in inputs.items()]}")
-        logger.error(f"Default tensor type: {torch.get_default_tensor_type()}")
         raise
 
 def input_fn(request_body, request_content_type):
     """Parse input request"""
     if request_content_type == "application/json":
-        # Try to parse as JSON
         try:
             data = json.loads(request_body)
         except:
-            # If JSON parsing fails, treat as raw text
             data = request_body
         return data
     else:
-        # For non-JSON content, treat as raw text
         return request_body
 
 def output_fn(prediction, response_content_type):
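The new PhiForSequenceClassification wrapper classifies by pooling the hidden state of the last token from the base model's final layer. A minimal shape walk-through of that pooling (illustrative only; hidden_size=2560 is an assumed stand-in for base_model.config.hidden_size):

import torch
import torch.nn as nn

batch, seq_len, hidden_size, num_labels = 1, 10, 2560, 2  # hidden_size assumed for illustration

hidden_states = torch.randn(batch, seq_len, hidden_size)  # final layer, i.e. outputs.hidden_states[-1]
last_token = hidden_states[:, -1, :]                      # (1, 2560): representation of the last token
classifier = nn.Linear(hidden_size, num_labels)
logits = classifier(last_token)                           # (1, 2): one score per label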
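For a quick local smoke test of the updated handlers, something along these lines should work. This is a sketch, not part of the commit: the model id and import path are assumptions, and model_dir is passed as None on the assumption that model_fn only reads HF_MODEL_ID from the environment.

import os
os.environ["HF_MODEL_ID"] = "microsoft/phi-2"  # assumed: any Phi-style causal LM id

from inference import model_fn, input_fn, predict_fn, output_fn  # import path assumed

model_dict = model_fn(model_dir=None)  # model_dir unused here; the model id comes from the env var
data = input_fn("text to classify", "application/json")  # non-JSON body falls back to raw text
predictions = predict_fn(data, model_dict)  # numpy array with two scores (num_labels=2)
print(output_fn(predictions, "application/json"))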