AllaaSaboukh committed
Commit f5a01c5 · verified · 1 Parent(s): adfe619

Upload text_classification.py

Files changed (1): text_classification.py (+465 lines, new file)
text_classification.py ADDED
# -*- coding: utf-8 -*-
"""text_classification.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1D25W7EYF5v1a0FoSHKAcyVhwMMIU6yg4
"""

!pip install transformers datasets
!pip install torch

# Ultra-Simple Arabic Product Classifier with Enhanced Training
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np
from collections import Counter

# Load and preprocess the data
print("Loading and preprocessing data...")
df = pd.read_excel('/content/Copy ofمنتجات مقاهي (1).xlsx', sheet_name='products')
df = df[['اسم المنتج', 'التصنيف المحاسبي']].dropna()

# Prepare text and labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['التصنيف المحاسبي'])
texts = df['اسم المنتج'].tolist()

print(f"Loaded {len(texts)} products with {len(set(labels))} unique categories.")
print(f"Categories: {list(label_encoder.classes_)}")

# Check class distribution and handle single-sample classes
label_counts = Counter(labels)
print("Class distribution:")
for label_id, count in sorted(label_counts.items()):
    label_name = label_encoder.inverse_transform([label_id])[0]
    print(f"  {label_name}: {count} samples")

# Separate single-sample classes from multi-sample classes
single_sample_mask = np.array([label_counts[label] == 1 for label in labels])
multi_sample_mask = ~single_sample_mask

# Get indices for single- and multi-sample data
single_indices = np.where(single_sample_mask)[0]
multi_indices = np.where(multi_sample_mask)[0]

print(f"\nSingle-sample classes: {np.sum(single_sample_mask)} samples")
print(f"Multi-sample classes: {np.sum(multi_sample_mask)} samples")

if np.sum(multi_sample_mask) > 0:
    # Split multi-sample data with stratification
    multi_texts = [texts[i] for i in multi_indices]
    multi_labels = [labels[i] for i in multi_indices]

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        multi_texts, multi_labels, test_size=0.2, random_state=42, stratify=multi_labels
    )

    # Add single-sample classes to the training set (they cannot be split)
    if np.sum(single_sample_mask) > 0:
        single_texts = [texts[i] for i in single_indices]
        single_labels = [labels[i] for i in single_indices]

        train_texts.extend(single_texts)
        train_labels.extend(single_labels)

        print(f"Added {len(single_texts)} single-sample items to training set")
else:
    # If all classes have single samples, use a simple split without stratification
    print("Warning: All or most classes have single samples. Using simple split.")
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

print(f"Training set: {len(train_texts)} samples")
print(f"Validation set: {len(val_texts)} samples")

# Load Arabic BERT
model_name = "asafaya/bert-base-arabic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(labels)))
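
# Optional (sketch, not in the original notebook): storing the label names in
# the model config makes the saved checkpoint self-describing, so downstream
# loaders do not strictly need labels.pkl to show readable class names.
# model.config.id2label = {i: c for i, c in enumerate(label_encoder.classes_)}
# model.config.label2id = {c: i for i, c in enumerate(label_encoder.classes_)}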

# Define the Dataset class
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize one product name; pad/truncate to a fixed length of 128
        encoding = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets
train_dataset = SimpleDataset(train_texts, train_labels, tokenizer)
val_dataset = SimpleDataset(val_texts, val_labels, tokenizer)
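
# Alternative (sketch, not in the original notebook): padding every example to
# max_length wastes compute on short product names. Tokenizing without padding
# and letting a collator pad each batch to its longest member is usually faster:
# from transformers import DataCollatorWithPadding
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# # ...then pass data_collator=data_collator to the Trainer below.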

# Define the compute_metrics function for evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}
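
# Optional richer metric (sketch): with several rare classes, macro-averaged
# F1 weights every class equally and is often more informative than accuracy.
# from sklearn.metrics import f1_score
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return {'accuracy': accuracy_score(labels, predictions),
#             'f1_macro': f1_score(labels, predictions, average='macro')}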

# Training setup with evaluation
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=50,
    per_device_train_batch_size=16,         # batch size increased from 8 to 16
    per_device_eval_batch_size=16,          # evaluation batch size
    eval_strategy="epoch",                  # evaluate after every epoch
    save_strategy="epoch",                  # save a checkpoint after every epoch
    logging_steps=10,                       # log more frequently
    save_total_limit=2,                     # keep at most 2 checkpoints
    load_best_model_at_end=True,            # reload the best checkpoint when training ends
    metric_for_best_model="eval_accuracy",  # metric used to pick the best model
    greater_is_better=True,                 # higher accuracy is better
    report_to=None,
    warmup_steps=100,                       # learning-rate warmup steps
    weight_decay=0.01,                      # regularization to reduce overfitting
    learning_rate=2e-5,                     # tuned learning rate
)

# Trainer instance with evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,               # evaluation data
    tokenizer=tokenizer,
    compute_metrics=compute_metrics         # metric computation hook
)
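
# Optional (sketch, not part of the original run): 50 epochs on a small dataset
# invites overfitting even with weight decay. Transformers ships an
# early-stopping callback that pairs with load_best_model_at_end above:
# from transformers import EarlyStoppingCallback
# trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))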

# Start training with evaluation
print("Training started with evaluation...")
trainer.train()

# Save model, tokenizer, and label encoder
trainer.save_model('./model')
tokenizer.save_pretrained('./model')
joblib.dump(label_encoder, './model/labels.pkl')

print("Training complete! Model saved to './model'")

# Prediction functions, single and batch
def predict(text):
    """Predict the classification of a single product."""
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    predicted_id = outputs.logits.argmax().item()
    confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
    classification = label_encoder.inverse_transform([predicted_id])[0]

    return classification, confidence

def predict_batch(texts):
    """Predict multiple products at once for faster processing."""
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')

    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = outputs.logits.argmax(dim=-1).cpu().numpy()
    confidences = torch.nn.functional.softmax(outputs.logits, dim=-1).max(dim=-1)[0].cpu().numpy()
    classifications = label_encoder.inverse_transform(predictions)

    return list(zip(classifications, confidences))

# Evaluate on the validation set
# (note: predict() reloads the model from disk on every call, so this loop is slow)
print("\nEvaluating on validation set...")
val_predictions = []
val_confidences = []

for text in val_texts:
    pred, conf = predict(text)
    val_predictions.append(pred)
    val_confidences.append(conf)

# Convert back to numeric IDs for comparison
val_pred_numeric = label_encoder.transform(val_predictions)
accuracy = accuracy_score(val_labels, val_pred_numeric)
print(f"Validation Accuracy: {accuracy:.4f}")

# Detailed classification report
# (pass labels= so the report covers every class even if some are missing from
#  the validation split, which would otherwise break target_names)
val_true_labels = label_encoder.inverse_transform(val_labels)
print("\nDetailed Classification Report:")
print(classification_report(val_true_labels, val_predictions,
                            labels=label_encoder.classes_,
                            target_names=label_encoder.classes_))
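
# Faster whole-set alternative (sketch, assumes the Trainer is still in
# memory): a single forward pass over the dataset instead of reloading the
# model from disk for every product.
# pred_output = trainer.predict(val_dataset)
# fast_preds = np.argmax(pred_output.predictions, axis=1)
# print(f"Validation Accuracy (batch): {accuracy_score(val_labels, fast_preds):.4f}")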

# Test examples
test_products = [
    "نادك حليب طويل الأجل 1 لتر",
    "قهوة عربية محمصة",
    "شاي أحمر ليبتون",
    "عصير برتقال طبيعي"
]

print("\n" + "="*50)
print("Testing on sample products:")
print("="*50)

for product in test_products:
    result, confidence = predict(product)
    print(f"Product: {product}")
    print(f"Classification: {result}")
    print(f"Confidence: {confidence:.3f}")
    print("-" * 30)

# Batch prediction example
print("\nBatch prediction example:")
batch_results = predict_batch(test_products)
for product, (classification, confidence) in zip(test_products, batch_results):
    print(f"{product} -> {classification} ({confidence:.3f})")

print("\nModel training complete!")
print("- Single prediction: predict('product name')")
print("- Batch prediction: predict_batch(['product1', 'product2', ...])")
print(f"- Validation accuracy: {accuracy:.4f}")
print("- Model saved to: './model'")

# Using the trained model (without retraining)
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import joblib

print("Loading trained model...")

# Load the model and tools (only once)
try:
    tokenizer = AutoTokenizer.from_pretrained('./model')
    model = AutoModelForSequenceClassification.from_pretrained('./model')
    label_encoder = joblib.load('./model/labels.pkl')
    print("Model loaded successfully!")
    print(f"Number of available categories: {len(label_encoder.classes_)}")

    # Display available categories
    print("\nAvailable categories:")
    for i, category in enumerate(label_encoder.classes_, 1):
        print(f"{i:2d}. {category}")

except Exception as e:
    print(f"Error loading model: {e}")
    print("Make sure the './model' folder exists and contains the required files")
    exit()

# Basic classification function
def classify_product(product_name):
    """Classify a single product."""
    try:
        # Prepare text
        inputs = tokenizer(
            product_name,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )

        # Prediction
        with torch.no_grad():
            outputs = model(**inputs)

        # Extract result
        predicted_id = outputs.logits.argmax().item()
        confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()
        classification = label_encoder.inverse_transform([predicted_id])[0]

        return {
            'product': product_name,
            'classification': classification,
            'confidence': confidence,
            'success': True
        }

    except Exception as e:
        return {
            'product': product_name,
            'classification': None,
            'confidence': 0,
            'success': False,
            'error': str(e)
        }

# Function to classify multiple products
def classify_multiple_products(product_list):
    """Classify a list of products."""
    results = []

    print(f"Classifying {len(product_list)} products...")

    for i, product in enumerate(product_list, 1):
        result = classify_product(product)
        results.append(result)

        if result['success']:
            print(f"{i:3d}. {product}")
            print(f"     → {result['classification']}")
            print(f"     → Confidence: {result['confidence']:.3f}")
        else:
            print(f"{i:3d}. {product} - Error: {result['error']}")
        print()

    return results

# Test examples
test_products = [
    "نادك حليب طويل الأجل 1 لتر",
    "قهوة عربية محمصة",
    "شاي أحمر ليبتون",
    "منظف أرضيات فلاش",
    "سكر أبيض ناعم",
    "عصير برتقال طبيعي"
]

print("\n" + "="*60)
print("Testing model on sample products")
print("="*60)

# Classify test products
test_results = classify_multiple_products(test_products)

# Quick statistics
successful_predictions = [r for r in test_results if r['success']]
avg_confidence = sum(r['confidence'] for r in successful_predictions) / len(successful_predictions)

print("="*60)
print("Results summary:")
print(f"Successfully classified {len(successful_predictions)} products")
print(f"Average confidence level: {avg_confidence:.3f}")

# Display unique classifications
unique_classifications = set(r['classification'] for r in successful_predictions)
print(f"Number of categories used: {len(unique_classifications)}")
print("Categories:")
for classification in sorted(unique_classifications):
    count = sum(1 for r in successful_predictions if r['classification'] == classification)
    print(f"  • {classification} ({count} products)")

print("\n" + "="*60)
print("Model ready for use!")
print("="*60)
print("Usage:")
print("result = classify_product('product name')")
print("print(f\"Classification: {result['classification']}\")")
print("print(f\"Confidence: {result['confidence']:.3f}\")")

print("\nFor multiple products:")
print("products = ['product 1', 'product 2', 'product 3']")
print("results = classify_multiple_products(products)")

# An out-of-domain example (a men's perfume, not a café product)
test_product = 'عطر كروم ليجند للرجال او دي تواليت من ازارو 125 مل'
result, confidence = predict(test_product)

print(f"\nTest: {test_product}")
print(f"Result: {result}")
print(f"Confidence: {confidence:.3f}")

"""# Saving the model"""

# Save the model, tokenizer, and label encoder together so the archive below
# contains everything needed for inference
tokenizer.save_pretrained('/content/my_model')
model.save_pretrained('/content/my_model')
joblib.dump(label_encoder, '/content/my_model/labels.pkl')

# Later, to load the model again:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('/content/my_model/')

# Zip only after everything has been saved, otherwise the archive would miss
# the tokenizer and label-encoder files
!zip -r my_model.zip /content/my_model/

from google.colab import files
files.download('my_model.zip')

"""# Testing"""

!ls /content/my_model

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import joblib

# Define the path where the files are saved
save_path = '/content/my_model'

# Load the tokenizer, model, and label encoder
tokenizer = AutoTokenizer.from_pretrained(save_path)
model = AutoModelForSequenceClassification.from_pretrained(save_path)
label_encoder = joblib.load(f'{save_path}/labels.pkl')

def predict(text):
    """Predict the category of a product name using the saved model."""
    # Preprocess the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class ID and confidence
    predicted_id = outputs.logits.argmax().item()
    confidence = torch.nn.functional.softmax(outputs.logits, dim=-1).max().item()

    # Map the ID back to the label name
    classification = label_encoder.inverse_transform([predicted_id])[0]

    return classification, confidence

# Test a product
test_product = "نادك حليب طويل الأجل 1 لتر"
result, confidence = predict(test_product)

print(f"Test Product: {test_product}")
print(f"Predicted Category: {result}")
print(f"Confidence: {confidence:.3f}")

# Test a product
test_product = "زبادى"
result, confidence = predict(test_product)

print(f"Test Product: {test_product}")
print(f"Predicted Category: {result}")
print(f"Confidence: {confidence:.3f}")

# Test a product
test_product = "بترول"
result, confidence = predict(test_product)

print(f"Test Product: {test_product}")
print(f"Predicted Category: {result}")
print(f"Confidence: {confidence:.3f}")

from google.colab import files
uploaded = files.upload()
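
# Sketch of a follow-up for the upload above (assumption: the uploaded sheet
# has the same 'اسم المنتج' column as the training data; adjust the column
# name to match your file):
# uploaded_name = next(iter(uploaded))
# new_df = pd.read_excel(uploaded_name)
# for name in new_df['اسم المنتج'].dropna():
#     cls, conf = predict(name)
#     print(f"{name} -> {cls} ({conf:.3f})")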