""" Validate dataset for training - check for problematic images and annotations """ import os import json from PIL import Image import cv2 def validate_coco_json(json_path, data_dir): """ Validate a COCO JSON file and check all images Args: json_path: Path to COCO JSON file data_dir: Directory where images are located Returns: dict with validation results """ print(f"\n{'='*60}") print(f"Validating: {json_path}") print(f"{'='*60}\n") issues = { 'missing_images': [], 'corrupted_images': [], 'zero_dimension_images': [], 'invalid_annotations': [], 'zero_area_boxes': [] } try: with open(json_path, 'r') as f: coco_data = json.load(f) except Exception as e: print(f"❌ Failed to load JSON: {e}") return issues images = coco_data.get('images', []) annotations = coco_data.get('annotations', []) print(f"📊 Dataset Stats:") print(f" Images: {len(images)}") print(f" Annotations: {len(annotations)}") print(f" Categories: {len(coco_data.get('categories', []))}") print() # Validate images print("🔍 Validating images...") for idx, img_info in enumerate(images): img_id = img_info.get('id') file_name = img_info.get('file_name', '') width = img_info.get('width', 0) height = img_info.get('height', 0) # Check if image file exists # Try to construct the full path if os.path.isabs(file_name): img_path = file_name else: img_path = os.path.join(data_dir, file_name) if not os.path.exists(img_path): issues['missing_images'].append({ 'id': img_id, 'file_name': file_name, 'expected_path': img_path }) continue # Check if image can be loaded try: # Try with PIL with Image.open(img_path) as pil_img: pil_width, pil_height = pil_img.size # Check if dimensions match JSON if pil_width != width or pil_height != height: print(f"⚠️ Image {img_id}: Dimension mismatch - JSON: {width}x{height}, Actual: {pil_width}x{pil_height}") # Check for zero dimensions if pil_width == 0 or pil_height == 0: issues['zero_dimension_images'].append({ 'id': img_id, 'file_name': file_name, 'dimensions': f"{pil_width}x{pil_height}" }) except Exception as e: issues['corrupted_images'].append({ 'id': img_id, 'file_name': file_name, 'error': str(e) }) # Progress indicator if (idx + 1) % 100 == 0: print(f" Checked {idx + 1}/{len(images)} images...") print(f"✅ Image validation complete\n") # Validate annotations print("🔍 Validating annotations...") for idx, ann in enumerate(annotations): ann_id = ann.get('id') img_id = ann.get('image_id') bbox = ann.get('bbox', []) if len(bbox) != 4: issues['invalid_annotations'].append({ 'id': ann_id, 'image_id': img_id, 'reason': f'Invalid bbox length: {len(bbox)}' }) continue x, y, w, h = bbox # Check for zero or negative dimensions if w <= 0 or h <= 0: issues['zero_area_boxes'].append({ 'id': ann_id, 'image_id': img_id, 'bbox': bbox, 'reason': f'Zero or negative dimensions: w={w}, h={h}' }) # Check for extremely small boxes (potential issue with mixup) if w < 1 or h < 1: issues['zero_area_boxes'].append({ 'id': ann_id, 'image_id': img_id, 'bbox': bbox, 'reason': f'Extremely small box: w={w}, h={h}' }) # Progress indicator if (idx + 1) % 1000 == 0: print(f" Checked {idx + 1}/{len(annotations)} annotations...") print(f"✅ Annotation validation complete\n") # Print summary print(f"\n{'='*60}") print("VALIDATION SUMMARY") print(f"{'='*60}\n") total_issues = sum(len(v) for v in issues.values()) if total_issues == 0: print("✅ No issues found! Dataset is ready for training.") else: print(f"⚠️ Found {total_issues} total issues:\n") if issues['missing_images']: print(f" ❌ Missing images: {len(issues['missing_images'])}") for item in issues['missing_images'][:5]: # Show first 5 print(f" - {item['file_name']}") if len(issues['missing_images']) > 5: print(f" ... and {len(issues['missing_images']) - 5} more") if issues['corrupted_images']: print(f" ❌ Corrupted images: {len(issues['corrupted_images'])}") for item in issues['corrupted_images'][:5]: print(f" - {item['file_name']}: {item['error']}") if len(issues['corrupted_images']) > 5: print(f" ... and {len(issues['corrupted_images']) - 5} more") if issues['zero_dimension_images']: print(f" ❌ Zero dimension images: {len(issues['zero_dimension_images'])}") for item in issues['zero_dimension_images'][:5]: print(f" - {item['file_name']}: {item['dimensions']}") if len(issues['zero_dimension_images']) > 5: print(f" ... and {len(issues['zero_dimension_images']) - 5} more") if issues['invalid_annotations']: print(f" ❌ Invalid annotations: {len(issues['invalid_annotations'])}") for item in issues['invalid_annotations'][:5]: print(f" - Ann ID {item['id']}: {item['reason']}") if len(issues['invalid_annotations']) > 5: print(f" ... and {len(issues['invalid_annotations']) - 5} more") if issues['zero_area_boxes']: print(f" ⚠️ Zero/tiny area boxes: {len(issues['zero_area_boxes'])}") print(f" These may cause issues with mixup augmentation!") for item in issues['zero_area_boxes'][:5]: print(f" - Ann ID {item['id']}, bbox: {item['bbox']}") if len(issues['zero_area_boxes']) > 5: print(f" ... and {len(issues['zero_area_boxes']) - 5} more") print() return issues def validate_training_dataset(training_id): """ Validate all COCO JSON files for a training Args: training_id: The training ID to validate """ from models.training import Training from models.TrainingProject import TrainingProject from services.settings_service import get_setting training = Training.query.get(training_id) if not training: print(f"❌ Training {training_id} not found") return # Get paths from models.TrainingProjectDetails import TrainingProjectDetails details = TrainingProjectDetails.query.get(training.project_details_id) training_project = TrainingProject.query.get(details.project_id) project_name = training_project.title.replace(' ', '_') if training_project else f'project_{details.project_id}' training_folder_name = f"{training.exp_name or training.training_name or 'training'}_{training_id}" training_folder_name = training_folder_name.replace(' ', '_') output_base_path = get_setting('yolox_output_path', './backend') data_dir = get_setting('yolox_data_dir', '/home/kitraining/To_Annotate/') annotations_dir = os.path.join(output_base_path, project_name, training_folder_name, 'annotations') # Validate each split splits = ['train', 'valid', 'test'] all_issues = {} for split in splits: json_file = os.path.join(annotations_dir, f'coco_project_{training_id}_{split}.json') if os.path.exists(json_file): all_issues[split] = validate_coco_json(json_file, data_dir) else: print(f"⚠️ JSON file not found: {json_file}") return all_issues if __name__ == '__main__': import sys if len(sys.argv) > 1: training_id = int(sys.argv[1]) validate_training_dataset(training_id) else: print("Usage: python validate_dataset.py ")