scripts/resample_evalset.py
2.7 KB · 78 lines · python Raw
1 import os
2 import random
3 import cv2
4 from datetime import datetime
5 import logging
6
# Route INFO-and-above records from the root logger into a file that is
# created relative to the current working directory.
log_file = "sample_images.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
11
# Shared classifier instance; populated lazily by _get_face_cascade().
_face_cascade = None


def _get_face_cascade():
    """Return the frontal-face Haar cascade, loading it on first use only."""
    global _face_cascade
    if _face_cascade is None:
        # Parsing the cascade XML is expensive, so do it once per process
        # instead of once per image.
        _face_cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
    return _face_cascade


def detect_faces(image_path: str) -> bool:
    """Report whether at least one frontal face is detected in an image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if the Haar cascade detector finds one or more faces.
        False if none are found or if the image could not be read
        (``cv2.imread`` returned ``None``).
    """
    # Read the image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        return False

    # Candidate regions smaller than 30x30 pixels are ignored.
    faces = _get_face_cascade().detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )

    return len(faces) > 0
26
def sample_images(input_folder: str, output_folder: str, sample_rate: float = 0.2) -> None:
    """Mirror ``input_folder`` into ``output_folder``, hard-linking face images.

    Walks ``input_folder`` recursively, recreates each visited directory under
    ``output_folder`` and hard-links every image file for which
    ``detect_faces`` returns True. A progress line is printed after each
    directory, and every link (plus a final summary) is written to the log.

    Args:
        input_folder: Root directory to scan.
        output_folder: Root directory that receives the links; created if
            it does not exist.
        sample_rate: Currently ignored. Selection is driven purely by face
            detection; the parameter is kept so existing callers keep working.

    Raises:
        OSError: If a directory cannot be created, or a hard link cannot be
            made for a reason other than the destination already existing.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Initialize counters and start time
    total_files = 0
    sampled_files = 0
    start_time = datetime.now()

    # Walk through the input folder structure
    for root, _dirs, files in os.walk(input_folder):
        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        # Every file counts towards the total, not only images.
        total_files += len(files)

        for file in files:
            if not file.lower().endswith(image_extensions):
                continue

            input_file_path = os.path.join(root, file)
            if not detect_faces(input_file_path):
                continue

            sampled_files += 1
            output_file_path = os.path.join(output_subfolder, file)
            try:
                os.link(input_file_path, output_file_path)
            except FileExistsError:
                # A previous run already produced this entry; keep it so the
                # script can be re-run over an existing output tree.
                logging.info(f"Skipped {input_file_path}: {output_file_path} already exists")
                continue

            # Log the action
            logging.info(f"Sampled and copied {input_file_path} to {output_file_path}")

        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    end_time = datetime.now()
    total_time = end_time - start_time
    logging.info(f"Total time taken: {total_time}")
    logging.info(f"Sampled {sampled_files} out of {total_files} files.")
73
if __name__ == "__main__":
    # Script entry point: scan the EvalSet tree and populate resampledEvalSet,
    # both resolved relative to the current working directory.
    sample_images(input_folder="EvalSet", output_folder="resampledEvalSet")
78