chore: resample-prep
Browse files
Uses OpenCV's Haar cascades for face detection and re-samples roughly 20% of the original eval set.
- scripts/resample_evalset.py +77 -0
scripts/resample_evalset.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import random
|
| 3 |
+
import cv2
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import logging
|
| 6 |
+
|
| 7 |
+
# Send INFO-and-above records, timestamped, to a log file in the working directory.
log_file = "sample_images.log"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=log_file,
)
|
| 11 |
+
|
| 12 |
+
# Lazily created classifier shared by every detect_faces() call, so the
# cascade XML is parsed once instead of once per image.
_FACE_CASCADE = None


def _get_face_cascade():
    """Return the frontal-face Haar cascade, loading it on first use."""
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        cascade = cv2.CascadeClassifier(cascade_path)
        # An unreadable cascade file yields an empty classifier; fail loudly
        # here rather than on the first detectMultiScale call.
        if cascade.empty():
            raise RuntimeError(f"Could not load Haar cascade from {cascade_path}")
        _FACE_CASCADE = cascade
    return _FACE_CASCADE


def detect_faces(image_path):
    """Report whether the image at *image_path* contains at least one face.

    The image is read in grayscale and scanned with OpenCV's pre-trained
    frontal-face Haar cascade.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if the cascade reports one or more detections; False if it
        reports none or if ``cv2.imread`` could not load the file.

    Raises:
        RuntimeError: If the Haar cascade file cannot be loaded.
    """
    # Read the image in grayscale; imread signals failure by returning None.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        return False

    # Detect faces in the image
    faces = _get_face_cascade().detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )

    # Return True if at least one face is detected
    return len(faces) > 0
|
| 26 |
+
|
| 27 |
+
def sample_images(input_folder, output_folder, sample_rate=0.2):
    """Mirror the face-containing images of *input_folder* into *output_folder*.

    The directory tree under ``input_folder`` is recreated under
    ``output_folder``. Every file with an image extension for which
    ``detect_faces`` returns True is hard-linked to the matching location,
    or copied when a hard link cannot be created. Destinations that already
    exist are left untouched, so the function can be re-run safely.
    Progress is printed after each directory and totals are logged at the end.

    Args:
        input_folder: Root directory that is walked for images.
        output_folder: Root directory that receives the selected images.
        sample_rate: Accepted for interface compatibility but not applied;
            selection is decided solely by face detection.
            NOTE(review): confirm whether random sub-sampling was intended.
    """
    import shutil  # local import: only needed for the copy fallback below

    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    output_root = os.path.abspath(output_folder)

    # Ensure the output folder exists (no check-then-create race)
    os.makedirs(output_folder, exist_ok=True)

    # Initialize counters and start time
    total_files = 0
    sampled_files = 0
    start_time = datetime.now()

    # Walk through the input folder structure
    for root, dirs, files in os.walk(input_folder):
        # If the output tree lives inside the input tree, do not walk into it;
        # otherwise previously written results would be sampled again.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != output_root
        ]

        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        # Counts every file in the directory, not only images.
        total_files += len(files)

        # Keep the images in this directory that contain a face
        selected = [
            name for name in files
            if name.lower().endswith(image_suffixes)
            and detect_faces(os.path.join(root, name))
        ]
        sampled_files += len(selected)

        for name in selected:
            input_file_path = os.path.join(root, name)
            output_file_path = os.path.join(output_subfolder, name)
            try:
                os.link(input_file_path, output_file_path)
            except FileExistsError:
                # Left over from an earlier run; keep it and move on.
                logging.info("Skipped %s: %s already exists",
                             input_file_path, output_file_path)
                continue
            except OSError:
                # Hard links fail across filesystems or where unsupported.
                shutil.copy2(input_file_path, output_file_path)

            # Log the action
            logging.info("Sampled and copied %s to %s",
                         input_file_path, output_file_path)

        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    total_time = datetime.now() - start_time
    logging.info("Total time taken: %s", total_time)
    logging.info("Sampled %s out of %s files.", sampled_files, total_files)
|
| 73 |
+
|
| 74 |
+
if __name__ == "__main__":
    # Script entry point: read images from EvalSet and write the
    # selected ones to resampledEvalSet.
    sample_images(input_folder="EvalSet", output_folder="resampledEvalSet")
|