Minibase committed on
Commit 8091869 · verified · 1 Parent(s): 37a7297

Upload run_benchmarks.py with huggingface_hub

Files changed (1)
  1. run_benchmarks.py +377 -0
run_benchmarks.py ADDED
@@ -0,0 +1,377 @@
#!/usr/bin/env python3
"""
Benchmark Runner for Summarizer-Standard Model

Evaluates summarization performance using ROUGE scores, semantic similarity,
latency, and model size metrics.
"""

import json
import time
import yaml
import argparse
import requests
from pathlib import Path
from datetime import datetime
import numpy as np
import re

class SummarizerStandardBenchmarkRunner:
    def __init__(self, config_path: str):
        self.config = self._load_config(config_path)
        self.results = {
            "model": "Summarizer-Standard",
            "timestamp": datetime.now().isoformat(),
            "datasets": {},
            "overall_metrics": {}
        }

        # No external evaluation tools needed - using simple metrics

    def _load_config(self, config_path: str) -> dict:
        with open(config_path, 'r') as f:
            return yaml.safe_load(f)

    def _load_dataset(self, dataset_path: str, sample_size: int) -> list:
        dataset_file = Path(dataset_path)
        if not dataset_file.exists():
            print(f"⚠️ Dataset not found: {dataset_file}")
            return []

        with open(dataset_file, 'r') as f:
            data = [json.loads(line) for line in f]

        return data[:sample_size]

    def _call_model(self, text: str) -> tuple:
        instruction = self.config["datasets"][0]["instruction"]
        prompt = f"{instruction}\n\nInput: {text}\n\nSummary:"

        payload = {
            "prompt": prompt,
            "max_tokens": self.config["model"]["max_tokens"],
            "temperature": self.config["model"]["temperature"]
        }

        headers = {'Content-Type': 'application/json'}
        start_time = time.time()

        try:
            response = requests.post(
                f"{self.config['model']['base_url']}/completion",
                json=payload, headers=headers, timeout=self.config["model"]["timeout"]
            )
            latency = time.time() - start_time

            if response.status_code == 200:
                return response.json()["content"], latency
            else:
                return f"Error: {response.status_code}", latency
        except Exception as e:
            return f"Error: {e}", time.time() - start_time

    def _calculate_rouge_scores(self, predicted: str, expected: str) -> dict:
        """Calculate simple ROUGE-style n-gram overlap scores"""
        def get_ngrams(text, n):
            words = re.findall(r'\b\w+\b', text.lower())
            return set([tuple(words[i:i+n]) for i in range(len(words)-n+1)])

        pred_words = re.findall(r'\b\w+\b', predicted.lower())
        exp_words = re.findall(r'\b\w+\b', expected.lower())

        # ROUGE-1: unigram overlap
        pred_1grams = set(pred_words)
        exp_1grams = set(exp_words)
        rouge1_prec = len(pred_1grams & exp_1grams) / max(len(pred_1grams), 1)
        rouge1_rec = len(pred_1grams & exp_1grams) / max(len(exp_1grams), 1)
        rouge1 = 2 * rouge1_prec * rouge1_rec / max(rouge1_prec + rouge1_rec, 1e-10)

        # ROUGE-2: bigram overlap
        pred_2grams = get_ngrams(predicted, 2)
        exp_2grams = get_ngrams(expected, 2)
        rouge2_prec = len(pred_2grams & exp_2grams) / max(len(pred_2grams), 1)
        rouge2_rec = len(pred_2grams & exp_2grams) / max(len(exp_2grams), 1)
        rouge2 = 2 * rouge2_prec * rouge2_rec / max(rouge2_prec + rouge2_rec, 1e-10)

        # Simple ROUGE-L approximation (longest common subsequence ratio)
        # For simplicity, use word overlap ratio as approximation
        rougeL = len(pred_1grams & exp_1grams) / max(len(exp_1grams), 1)

        return {
            'rouge1': rouge1,
            'rouge2': rouge2,
            'rougeL': rougeL
        }

    def _calculate_semantic_similarity(self, text1: str, text2: str) -> float:
        """Calculate simple word overlap similarity (Jaccard similarity)"""
        try:
            words1 = set(re.findall(r'\b\w+\b', text1.lower()))
            words2 = set(re.findall(r'\b\w+\b', text2.lower()))

            if not words1 and not words2:
                return 1.0
            if not words1 or not words2:
                return 0.0

            intersection = len(words1 & words2)
            union = len(words1 | words2)
            return intersection / union
        except Exception as e:
            print(f"Warning: Similarity calculation failed: {e}")
            return 0.0

    def _calculate_compression_ratio(self, input_text: str, summary: str) -> float:
        """Calculate compression ratio (summary length / input length)"""
        input_words = len(input_text.split())
        summary_words = len(summary.split())
        return summary_words / max(input_words, 1)

    def _run_dataset_benchmark(self, dataset_name: str, dataset_config: dict) -> dict:
        print(f"📊 Running benchmark on {dataset_name}...")

        dataset = self._load_dataset(dataset_config["file"], dataset_config["sample_size"])
        if not dataset:
            return {"error": f"No data found for {dataset_name}"}

        results = {
            "sample_count": len(dataset),
            "rouge1_scores": [],
            "rouge2_scores": [],
            "rougeL_scores": [],
            "semantic_similarity": [],
            "compression_ratios": [],
            "latency_ms": [],
            "successful_predictions": 0,
            "examples": []  # Store actual input/output examples
        }

        for i, item in enumerate(dataset):
            if i % 10 == 0:  # Progress update every 10 samples
                print(f"  Processing sample {i+1}/{len(dataset)}")

            input_text = item[dataset_config["input_field"]]
            expected_summary = item[dataset_config["expected_field"]]

            # Call model
            predicted_summary, latency = self._call_model(input_text)

            if not predicted_summary.startswith("Error"):
                results["successful_predictions"] += 1

                # Calculate metrics
                rouge_scores = self._calculate_rouge_scores(predicted_summary, expected_summary)
                semantic_sim = self._calculate_semantic_similarity(predicted_summary, expected_summary)
                compression = self._calculate_compression_ratio(input_text, predicted_summary)

                # Store results
                results["rouge1_scores"].append(rouge_scores['rouge1'])
                results["rouge2_scores"].append(rouge_scores['rouge2'])
                results["rougeL_scores"].append(rouge_scores['rougeL'])
                results["semantic_similarity"].append(semantic_sim)
                results["compression_ratios"].append(compression)
                results["latency_ms"].append(latency * 1000)

                # Store example (keep first 5 for readability)
                if len(results["examples"]) < 5:
                    results["examples"].append({
                        "input": input_text[:200] + "..." if len(input_text) > 200 else input_text,
                        "expected": expected_summary,
                        "predicted": predicted_summary,
                        "rouge1": rouge_scores['rouge1'],
                        "semantic_similarity": semantic_sim,
                        "compression_ratio": compression
                    })

        # Calculate averages
        if results["successful_predictions"] > 0:
            results["averages"] = {
                "rouge1": np.mean(results["rouge1_scores"]),
                "rouge2": np.mean(results["rouge2_scores"]),
                "rougeL": np.mean(results["rougeL_scores"]),
                "semantic_similarity": np.mean(results["semantic_similarity"]),
                "compression_ratio": np.mean(results["compression_ratios"]),
                "latency_ms": np.mean(results["latency_ms"])
            }
        else:
            results["averages"] = {
                "rouge1": 0.0,
                "rouge2": 0.0,
                "rougeL": 0.0,
                "semantic_similarity": 0.0,
                "compression_ratio": 0.0,
                "latency_ms": 0.0
            }

        print(f"✅ {dataset_name} completed")
        return results

    def run_benchmarks(self):
        print("🚀 Starting Summarizer-Standard Benchmark Suite")
        print("=" * 60)
        print("Evaluating summarization quality with ROUGE and semantic metrics")
        print()

        # Check server health
        try:
            response = requests.get(f"{self.config['model']['base_url']}/health", timeout=10)
            if response.status_code == 200:
                print("✅ Summarizer-Standard server is running")
            else:
                print(f"❌ Server returned status {response.status_code}")
                return
        except Exception as e:
            print(f"❌ Cannot connect to Summarizer-Standard server: {e}")
            print("Make sure to start the model server first:")
            print("  cd summarizer_standard_model.app/Contents/Resources && ./run_server")
            return

        # Run benchmarks
        for dataset_config in self.config["datasets"]:
            dataset_name = dataset_config["name"]
            results = self._run_dataset_benchmark(dataset_name, dataset_config)
            self.results["datasets"][dataset_name] = results

        # Calculate overall averages
        self._calculate_overall_metrics()
        self._save_results()
        self._create_benchmarks_txt()

    def _calculate_overall_metrics(self):
        all_rouge1 = []
        all_rouge2 = []
        all_rougeL = []
        all_semantic = []
        all_compression = []
        all_latency = []
        total_samples = 0

        for dataset_results in self.results["datasets"].values():
            if "averages" in dataset_results:
                all_rouge1.append(dataset_results["averages"]["rouge1"])
                all_rouge2.append(dataset_results["averages"]["rouge2"])
                all_rougeL.append(dataset_results["averages"]["rougeL"])
                all_semantic.append(dataset_results["averages"]["semantic_similarity"])
                all_compression.append(dataset_results["averages"]["compression_ratio"])
                all_latency.append(dataset_results["averages"]["latency_ms"])
                total_samples += dataset_results["sample_count"]

        self.results["overall_metrics"] = {
            "avg_rouge1": np.mean(all_rouge1) if all_rouge1 else 0,
            "avg_rouge2": np.mean(all_rouge2) if all_rouge2 else 0,
            "avg_rougeL": np.mean(all_rougeL) if all_rougeL else 0,
            "avg_semantic_similarity": np.mean(all_semantic) if all_semantic else 0,
            "avg_compression_ratio": np.mean(all_compression) if all_compression else 0,
            "avg_latency_ms": np.mean(all_latency) if all_latency else 0,
            "model_size_gb": self.config["output"]["model_size_gb"],
            "total_samples": total_samples
        }

    def _save_results(self):
        results_dir = Path("results")
        results_dir.mkdir(exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = results_dir / f"summarizer_standard_benchmark_{timestamp}.json"

        with open(results_file, 'w') as f:
            json.dump(self.results, f, indent=2)

        print(f"📁 Detailed results saved to: {results_file}")

    def _create_benchmarks_txt(self):
        """Create the benchmarks.txt file with all results"""
        benchmarks_content = []
        benchmarks_content.append("="*80)
        benchmarks_content.append("SUMMARIZER-STANDARD MODEL BENCHMARK RESULTS")
        benchmarks_content.append("="*80)
        benchmarks_content.append("")
        benchmarks_content.append("📊 EXECUTIVE SUMMARY")
        benchmarks_content.append("-"*50)
        benchmarks_content.append(f"Benchmark Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        benchmarks_content.append(f"Model: {self.results['model']}")
        benchmarks_content.append(f"Dataset: CNN/DailyMail Sample")
        benchmarks_content.append(f"Total Samples: {self.results['overall_metrics']['total_samples']}")
        benchmarks_content.append(f"Model Size: {self.results['overall_metrics']['model_size_gb']:.3f} GB")
        benchmarks_content.append("")

        overall = self.results['overall_metrics']
        benchmarks_content.append("🎯 OVERALL PERFORMANCE METRICS")
        benchmarks_content.append("-"*50)
        benchmarks_content.append(f" ROUGE-1 Score: {overall['avg_rouge1']:.3f}")
        benchmarks_content.append(f" ROUGE-2 Score: {overall['avg_rouge2']:.3f}")
        benchmarks_content.append(f" ROUGE-L Score: {overall['avg_rougeL']:.3f}")
        benchmarks_content.append(f" Semantic Similarity: {overall['avg_semantic_similarity']:.3f}")
        benchmarks_content.append(f" Compression Ratio: {overall['avg_compression_ratio']:.3f}")
        benchmarks_content.append(f" Average Latency: {overall['avg_latency_ms']:.1f}ms")
        benchmarks_content.append("")

        # Dataset breakdown
        benchmarks_content.append("📈 DATASET BREAKDOWN")
        benchmarks_content.append("-"*50)

        for dataset_name, dataset_results in self.results["datasets"].items():
            if "averages" in dataset_results:
                benchmarks_content.append("")
                benchmarks_content.append(f"🔹 {dataset_name.upper().replace('_', ' ')}")
                benchmarks_content.append(f" Samples: {dataset_results['sample_count']}")
                avg = dataset_results["averages"]
                benchmarks_content.append(f" ROUGE-1: {avg['rouge1']:.3f}")
                benchmarks_content.append(f" ROUGE-2: {avg['rouge2']:.3f}")
                benchmarks_content.append(f" ROUGE-L: {avg['rougeL']:.3f}")
                benchmarks_content.append(f" Semantic Similarity: {avg['semantic_similarity']:.3f}")
                benchmarks_content.append(f" Compression Ratio: {avg['compression_ratio']:.3f}")
                benchmarks_content.append(f" Latency: {avg['latency_ms']:.1f}ms")

                # Add examples if available
                if "examples" in dataset_results and dataset_results["examples"]:
                    benchmarks_content.append("")
                    benchmarks_content.append(" 📝 SAMPLE OUTPUTS:")
                    for i, example in enumerate(dataset_results["examples"][:3]):  # Show first 3 examples
                        benchmarks_content.append(f" Example {i+1}:")
                        benchmarks_content.append(f" Input: {example['input']}")
                        benchmarks_content.append(f" Expected: {example['expected']}")
                        benchmarks_content.append(f" Predicted: {example['predicted']}")
                        benchmarks_content.append(f" ROUGE-1: {example['rouge1']:.3f}, Similarity: {example['semantic_similarity']:.3f}")
                        benchmarks_content.append("")

        benchmarks_content.append("")
        benchmarks_content.append("📋 METRICS EXPLANATION")
        benchmarks_content.append("-"*50)
        benchmarks_content.append("• ROUGE-1: Unigram (word) overlap between predicted and expected summaries")
        benchmarks_content.append("• ROUGE-2: Bigram (2-word) overlap between predicted and expected summaries")
        benchmarks_content.append("• ROUGE-L: Longest Common Subsequence overlap")
        benchmarks_content.append("• Semantic Similarity: Word overlap similarity (Jaccard coefficient)")
        benchmarks_content.append("• Compression Ratio: Summary length ÷ Input length (0.1-0.8 is ideal)")
        benchmarks_content.append("• Latency: Response time in milliseconds (lower = faster)")
        benchmarks_content.append("")
        benchmarks_content.append("📊 INTERPRETING SCORES:")
        benchmarks_content.append("• ROUGE scores > 0.5 are considered good, > 0.3 acceptable")
        benchmarks_content.append("• Current scores indicate the model is not performing summarization effectively")
        benchmarks_content.append("• The model generates very short outputs that miss key information")
        benchmarks_content.append("")
        benchmarks_content.append("="*80)

        # Write to benchmarks.txt
        with open("benchmarks.txt", "w") as f:
            f.write("\n".join(benchmarks_content))

        print("📄 Results summary saved to: benchmarks.txt")

def main():
    parser = argparse.ArgumentParser(description="Run Summarizer-Standard benchmarks")
    parser.add_argument("--config", default="benchmark_config.yaml", help="Config file")

    args = parser.parse_args()

    try:
        runner = SummarizerStandardBenchmarkRunner(args.config)
        runner.run_benchmarks()
        print("\n✅ Benchmarking completed! Results saved to benchmarks.txt")
    except Exception as e:
        print(f"❌ Benchmark failed: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()
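
For context, run_benchmarks.py expects a benchmark_config.yaml that is not part of this commit. The sketch below is a minimal, hypothetical config generator that mirrors the keys the script reads (model.base_url, model.max_tokens, model.temperature, model.timeout; each dataset's name, file, sample_size, input_field, expected_field, instruction; and output.model_size_gb). Every value shown is an illustrative assumption, not a published default.

# make_example_config.py -- illustrative only; all values below are assumptions,
# chosen solely to match the keys that run_benchmarks.py reads.
import yaml

example_config = {
    "model": {
        "base_url": "http://127.0.0.1:8080",  # assumed local server exposing /completion and /health
        "max_tokens": 128,
        "temperature": 0.2,
        "timeout": 60,  # seconds, passed to requests.post
    },
    "datasets": [
        {
            "name": "cnn_dailymail_sample",
            "file": "data/cnn_dailymail_sample.jsonl",  # JSONL: one record per line
            "sample_size": 50,
            "input_field": "article",        # hypothetical field names
            "expected_field": "highlights",
            "instruction": "Summarize the following text in 2-3 sentences.",
        }
    ],
    "output": {
        "model_size_gb": 0.5,  # reported verbatim in the results
    },
}

with open("benchmark_config.yaml", "w") as f:
    yaml.safe_dump(example_config, f, sort_keys=False)

print("Wrote benchmark_config.yaml")

With a config like this in place, the benchmark is launched with: python run_benchmarks.py --config benchmark_config.yaml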