scripts/resample_evalset.py
| 1 | import os |
| 2 | import random |
| 3 | import cv2 |
| 4 | from datetime import datetime |
| 5 | import logging |
| 6 | |
# Logging setup: INFO-and-above records go to a file, each line carrying
# the timestamp, the level name and the message.
log_file = "sample_images.log"
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    filename=log_file,
)
| 11 | |
def detect_faces(image_path):
    """Return True if at least one frontal face is detected in the image.

    The file at ``image_path`` is read in grayscale and scanned with the
    ``haarcascade_frontalface_default.xml`` Haar cascade located under
    ``cv2.data.haarcascades``.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        bool: ``True`` when ``detectMultiScale`` reports one or more
        detections; ``False`` when it reports none, or when ``cv2.imread``
        returns ``None`` for the path (the file could not be loaded).
    """
    # Build the classifier on the first call only and stash it on the
    # function object, so the cascade XML is not re-read for every image.
    face_cascade = getattr(detect_faces, "_face_cascade", None)
    if face_cascade is None:
        face_cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
        detect_faces._face_cascade = face_cascade

    # Read the image in grayscale; imread signals failure by returning None.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        return False

    faces = face_cascade.detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )

    return len(faces) > 0
| 26 | |
def _link_or_copy(src_path, dst_path):
    """Hard-link src_path to dst_path, copying the file if linking fails.

    Returns:
        bool: True if dst_path was created by this call, False if it
        already existed and was left untouched.
    """
    import shutil  # local import: only needed for the copy fallback

    try:
        os.link(src_path, dst_path)
    except FileExistsError:
        # Destination is already there (e.g. from a previous run); keep it.
        return False
    except OSError:
        # Hard links can fail, for example across filesystems; fall back
        # to a regular copy so the file still ends up in the output tree.
        shutil.copy2(src_path, dst_path)
    return True


def sample_images(input_folder, output_folder, sample_rate=0.2):
    """Mirror input_folder into output_folder, keeping only images with faces.

    Walks ``input_folder`` recursively and recreates every sub-directory
    under ``output_folder``. Each file whose name ends in .png, .jpg, .jpeg,
    .bmp or .gif (case-insensitive) is passed to ``detect_faces``; files for
    which it returns True are hard-linked (or copied, if linking fails) into
    the matching output sub-directory. Progress is printed once per
    directory and a summary is written to the log.

    Args:
        input_folder: Root directory to scan.
        output_folder: Root directory to populate; created if missing.
        sample_rate: Currently unused. Every image with a detected face is
            kept regardless of this value; the parameter is retained so
            existing callers keep working.

    Returns:
        None.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(output_folder, exist_ok=True)

    total_files = 0
    sampled_files = 0
    start_time = datetime.now()

    for root, _dirs, files in os.walk(input_folder):
        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        # Counts every file in the directory, not only images.
        total_files += len(files)

        # Select the image files in this directory that contain a face.
        face_files = [
            name for name in files
            if name.lower().endswith(image_extensions)
            and detect_faces(os.path.join(root, name))
        ]
        sampled_files += len(face_files)

        for name in face_files:
            input_file_path = os.path.join(root, name)
            output_file_path = os.path.join(output_subfolder, name)
            if _link_or_copy(input_file_path, output_file_path):
                logging.info(
                    "Sampled and copied %s to %s",
                    input_file_path, output_file_path,
                )
            else:
                logging.info(
                    "Skipped %s: %s already exists",
                    input_file_path, output_file_path,
                )

        # Running progress after each directory.
        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    total_time = datetime.now() - start_time
    logging.info("Total time taken: %s", total_time)
    logging.info("Sampled %s out of %s files.", sampled_files, total_files)
| 73 | |
if __name__ == "__main__":
    # Script entry point: read from "EvalSet" and populate
    # "resampledEvalSet" with the selected images.
    input_folder, output_folder = "EvalSet", "resampledEvalSet"
    sample_images(input_folder=input_folder, output_folder=output_folder)
| 78 | |