# -*- coding: utf-8 -*-
"""text_classification.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1D25W7EYF5v1a0FoSHKAcyVhwMMIU6yg4

Fine-tunes an Arabic BERT checkpoint to map product names to accounting
categories, evaluates it, then exports the model for download.

The script is meant to run inside Colab/IPython: shell commands are issued
through ``get_ipython().system(...)`` (the plain-Python spelling of the
notebook ``!command`` escape) so that the file still parses as Python.
"""

get_ipython().system('pip install transformers datasets')
get_ipython().system('pip install torch')

# Ultra-Simple Arabic Product Classifier with Enhanced Training
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np
from collections import Counter

# ---------------------------------------------------------------------------
# Load and preprocess the data
# ---------------------------------------------------------------------------
print("Loading and preprocessing data...")
df = pd.read_excel('/content/Copy ofمنتجات مقاهي (1).xlsx', sheet_name='products')
df = df[['اسم المنتج', 'التصنيف المحاسبي']].dropna()

# Prepare text and labels: category names become integer ids.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['التصنيف المحاسبي'])
texts = df['اسم المنتج'].tolist()

print(f"Loaded {len(texts)} products with {len(set(labels))} unique categories.")
print(f"Categories: {list(label_encoder.classes_)}")

# Check class distribution and handle single-sample classes
from collections import Counter

label_counts = Counter(labels)
print(f"Class distribution:")
for label_id, count in sorted(label_counts.items()):
    label_name = label_encoder.inverse_transform([label_id])[0]
    print(f" {label_name}: {count} samples")

# Separate single-sample classes from multi-sample classes: a class with one
# member cannot be stratified across a train/validation split.
single_sample_mask = np.array([label_counts[label] == 1 for label in labels])
multi_sample_mask = ~single_sample_mask

# Get indices for single and multi sample data
single_indices = np.where(single_sample_mask)[0]
multi_indices = np.where(multi_sample_mask)[0]

print(f"\nSingle-sample classes: {np.sum(single_sample_mask)} samples")
print(f"Multi-sample classes: {np.sum(multi_sample_mask)} samples")

if np.sum(multi_sample_mask) > 0:
    # Split multi-sample data with stratification
    multi_texts = [texts[i] for i in multi_indices]
    multi_labels = [labels[i] for i in multi_indices]

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        multi_texts,
        multi_labels,
        test_size=0.2,
        random_state=42,
        stratify=multi_labels,
    )

    # Add single-sample data to training set (can't split them)
    if np.sum(single_sample_mask) > 0:
        single_texts = [texts[i] for i in single_indices]
        single_labels = [labels[i] for i in single_indices]
        train_texts.extend(single_texts)
        train_labels.extend(single_labels)
        print(f"Added {len(single_texts)} single-sample items to training set")
else:
    # If all classes have single samples, use simple split without stratification
    print("Warning: All or most classes have single samples. Using simple split.")
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

print(f"Training set: {len(train_texts)} samples")
print(f"Validation set: {len(val_texts)} samples")

# ---------------------------------------------------------------------------
# Model, dataset and trainer
# ---------------------------------------------------------------------------
# Load Arabic BERT
model_name = "asafaya/bert-base-arabic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(labels)))


# Define Enhanced Dataset class
class SimpleDataset(torch.utils.data.Dataset):
    """Tokenise (text, label) pairs on demand for the Hugging Face Trainer.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``;
    texts are padded/truncated to 128 tokens.
    """

    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the default collator can stack items into a batch.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }


# Create datasets
train_dataset = SimpleDataset(train_texts, train_labels, tokenizer)
val_dataset = SimpleDataset(val_texts, val_labels, tokenizer)


# Define compute metrics function for evaluation
def compute_metrics(eval_pred):
    """Return ``{'accuracy': ...}`` for a ``(logits, labels)`` evaluation pair."""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}


# Enhanced Training setup with evaluation
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=50,
    per_device_train_batch_size=16,         # raised the batch size from 8 to 16
    per_device_eval_batch_size=16,          # batch size used for evaluation
    eval_strategy="epoch",                  # evaluate after every epoch
    save_strategy="epoch",                  # checkpoint after every epoch
    logging_steps=10,                       # log more frequently
    save_total_limit=2,                     # keep at most 2 checkpoints
    load_best_model_at_end=True,            # restore the best model when done
    metric_for_best_model="eval_accuracy",  # metric that selects the best model
    greater_is_better=True,                 # higher accuracy is better
    # The string "none" is the documented way to disable logging integrations;
    # the Python value None does not opt out of them.
    report_to="none",
    warmup_steps=100,                       # warm-up steps for training
    weight_decay=0.01,                      # regularisation against overfitting
    learning_rate=2e-5,                     # tuned learning rate
)

# Enhanced Trainer instance with evaluation
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour
# of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,         # evaluation data
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # metric function
)

# Start training with evaluation
print("Training started with evaluation...")
trainer.train()

# Save model, tokenizer, and label encoder
trainer.save_model('./model')
tokenizer.save_pretrained('./model')
joblib.dump(label_encoder, './model/labels.pkl')

print("Training complete! Model saved to './model'")

# ---------------------------------------------------------------------------
# Prediction helpers backed by the saved './model' directory
# ---------------------------------------------------------------------------
# Cache of (tokenizer, model, label_encoder) keyed by directory, so repeated
# predictions do not reload the model from disk on every call.
_INFERENCE_CACHE = {}


def _load_inference_artifacts(model_dir='./model'):
    """Return the (tokenizer, model, label_encoder) saved in *model_dir*, loading once."""
    if model_dir not in _INFERENCE_CACHE:
        _INFERENCE_CACHE[model_dir] = (
            AutoTokenizer.from_pretrained(model_dir),
            AutoModelForSequenceClassification.from_pretrained(model_dir),
            joblib.load(f'{model_dir}/labels.pkl'),
        )
    return _INFERENCE_CACHE[model_dir]


# Enhanced prediction function with batch processing capability
def predict(text):
    """Predict single product classification.

    Returns a ``(category_name, confidence)`` tuple, where confidence is the
    highest softmax probability.
    """
    saved_tokenizer, saved_model, saved_encoder = _load_inference_artifacts()

    inputs = saved_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    with torch.no_grad():
        outputs = saved_model(**inputs)

    predicted_id = outputs.logits.argmax().item()
    confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
    classification = saved_encoder.inverse_transform([predicted_id])[0]
    return classification, confidence


def predict_batch(texts):
    """Predict multiple products at once for faster processing.

    Returns a list of ``(category_name, confidence)`` tuples in input order;
    an empty input yields an empty list.
    """
    if not texts:
        return []

    saved_tokenizer, saved_model, saved_encoder = _load_inference_artifacts()

    inputs = saved_tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=128)

    with torch.no_grad():
        outputs = saved_model(**inputs)

    predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
    confidences = torch.nn.functional.softmax(outputs.logits, dim=-1).max(dim=-1)[0].cpu().numpy()
    classifications = saved_encoder.inverse_transform(predictions)
    return list(zip(classifications, confidences))


# Evaluate on validation set
print("\nEvaluating on validation set...")
val_predictions = []
val_confidences = []
for text in val_texts:
    pred, conf = predict(text)
    val_predictions.append(pred)
    val_confidences.append(conf)

# Convert back to numeric for comparison
val_pred_numeric = label_encoder.transform(val_predictions)
accuracy = accuracy_score(val_labels, val_pred_numeric)
print(f"Validation Accuracy: {accuracy:.4f}")

# Detailed classification report
val_true_labels = label_encoder.inverse_transform(val_labels)
print("\nDetailed Classification Report:")
# Both arguments already hold category names. Passing every known class as
# target_names raises ValueError when the validation split lacks some class
# (single-sample classes never reach it), so let the report use the labels
# that are actually present.
print(classification_report(val_true_labels, val_predictions, zero_division=0))

# Test examples
test_products = [
    "نادك حليب طويل الأجل 1 لتر",
    "قهوة عربية محمصة",
    "شاي أحمر ليبتون",
    "عصير برتقال طبيعي",
]

print("\n" + "="*50)
print("Testing on sample products:")
print("="*50)

for product in test_products:
    result, confidence = predict(product)
    print(f"Product: {product}")
    print(f"Classification: {result}")
    print(f"Confidence: {confidence:.3f}")
    print("-" * 30)

# Batch prediction example
print("\nBatch prediction example:")
batch_results = predict_batch(test_products)
for product, (classification, confidence) in zip(test_products, batch_results):
    print(f"{product} -> {classification} ({confidence:.3f})")

print(f"\nModel training complete!")
print(f"- Single prediction: predict('product name')")
print(f"- Batch prediction: predict_batch(['product1', 'product2', ...])")
print(f"- Validation accuracy: {accuracy:.4f}")
print(f"- Model saved to: './model'")

# ---------------------------------------------------------------------------
# Using the trained model (without retraining)
# ---------------------------------------------------------------------------
# Imports are repeated so this notebook cell can run on its own.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib

print("Loading trained model...")

# Load model and tools (only once)
try:
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')

    print("Model loaded successfully!")
    print(f"Number of available categories: {len(label_encoder.classes_)}")

    # Display available categories
    print("\nAvailable categories:")
    for i, category in enumerate(label_encoder.classes_, 1):
        print(f"{i:2d}. {category}")

except Exception as e:
    print(f"Error loading model: {e}")
    print("Make sure './model' folder exists and contains required files")
    # Nothing below can work without the model; stop with a failure status.
    raise SystemExit(1) from e


# Basic classification function
def classify_product(product_name):
    """Classify a single product with the module-level model.

    Returns a dict with ``product``, ``classification``, ``confidence`` and
    ``success``; on failure ``success`` is False and ``error`` holds the
    message instead of the exception propagating.
    """
    try:
        # Prepare text
        inputs = tokenizer(
            product_name,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128,
        )

        # Prediction
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract result
        predicted_id = outputs.logits.argmax().item()
        confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
        classification = label_encoder.inverse_transform([predicted_id])[0]

        return {
            'product': product_name,
            'classification': classification,
            'confidence': confidence,
            'success': True,
        }

    except Exception as e:
        # Deliberate best-effort: report the failure to the caller as data.
        return {
            'product': product_name,
            'classification': None,
            'confidence': 0,
            'success': False,
            'error': str(e),
        }


# Function to classify multiple products
def classify_multiple_products(product_list):
    """Classify a list of products, printing each outcome, and return the result dicts."""
    results = []

    print(f"Classifying {len(product_list)} products...")

    for i, product in enumerate(product_list, 1):
        result = classify_product(product)
        results.append(result)

        if result['success']:
            print(f"{i:3d}. {product}")
            print(f" → {result['classification']}")
            print(f" → Confidence: {result['confidence']:.3f}")
        else:
            print(f"{i:3d}. {product} - Error: {result['error']}")
        print()

    return results


# Test examples
test_products = [
    "نادك حليب طويل الأجل 1 لتر",
    "قهوة عربية محمصة",
    "شاي أحمر ليبتون",
    "منظف أرضيات فلاش",
    "سكر أبيض ناعم",
    "عصير برتقال طبيعي",
]

print("\n" + "="*60)
print("Testing model on sample products")
print("="*60)

# Classify test products
test_results = classify_multiple_products(test_products)

# Quick statistics
successful_predictions = [r for r in test_results if r['success']]
# Guard the mean: every classification may have failed.
if successful_predictions:
    avg_confidence = sum(r['confidence'] for r in successful_predictions) / len(successful_predictions)
else:
    avg_confidence = 0.0

print("="*60)
print("Results summary:")
print(f"Successfully classified {len(successful_predictions)} products")
print(f"Average confidence level: {avg_confidence:.3f}")

# Display unique classifications
unique_classifications = set(r['classification'] for r in successful_predictions)
print(f"Number of categories used: {len(unique_classifications)}")
print("Categories:")
for classification in sorted(unique_classifications):
    count = sum(1 for r in successful_predictions if r['classification'] == classification)
    print(f" • {classification} ({count} products)")

print("\n" + "="*60)
print("Model ready for use!")
print("="*60)
print("Usage:")
print("result = classify_product('product name')")
print("print(f\"Classification: {result['classification']}\")")
print("print(f\"Confidence: {result['confidence']:.3f}\")")
print("\nFor multiple products:")
print("products = ['product 1', 'product 2', 'product 3']")
print("results = classify_multiple_products(products)")

test_product = 'عطر كروم ليجند للرجال او دي تواليت من ازارو 125 مل'
result, confidence = predict(test_product)
print(f"\nTest: {test_product}")
print(f"Result: {result}")
print(f"Confidence: {confidence:.3f}")

# ---------------------------------------------------------------------------
# Saving The model
# ---------------------------------------------------------------------------
import joblib

# Write everything inference needs BEFORE building the archive, so the
# downloaded zip contains the model, the tokenizer and the label encoder.
model.save_pretrained('/content/my_model/')
tokenizer.save_pretrained('/content/my_model')
joblib.dump(label_encoder, '/content/my_model/labels.pkl')

# Later, to load it again:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('/content/my_model/')

get_ipython().system('zip -r my_model.zip /content/my_model/')

from google.colab import files
files.download('my_model.zip')

# ---------------------------------------------------------------------------
# Testing
# ---------------------------------------------------------------------------
get_ipython().system('ls /content/my_model')

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import joblib

# Define the path where files are saved
save_path = '/content/my_model'

# Load the tokenizer, model, and label encoder
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForSequenceClassification.from_pretrained(save_path)
label_encoder = joblib.load(f'{save_path}/labels.pkl')


def predict(text):
    """Classify *text* with the artefacts loaded from ``save_path``.

    Returns a ``(category_name, confidence)`` tuple. This replaces the earlier
    ``predict`` that read from './model'.
    """
    # Preprocess the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted class ID and confidence
    predicted_id = outputs.logits.argmax().item()
    confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()

    # Map the ID back to the label name
    classification = label_encoder.inverse_transform([predicted_id])[0]

    return classification, confidence


# Test a few products
for test_product in ("نادك حليب طويل الأجل 1 لتر", "زبادى", "بترول"):
    result, confidence = predict(test_product)
    print(f"Test Product: {test_product}")
    print(f"Predicted Category: {result}")
    print(f"Confidence: {confidence:.3f}")

from google.colab import files
uploaded = files.upload()