Files
Abschluss-Projekt/backend/services/validate_dataset.py
2025-12-08 12:26:34 +01:00

245 lines
8.6 KiB
Python
Executable File

"""
Validate dataset for training - check for problematic images and annotations
"""
import os
import json
from PIL import Image
import cv2
def validate_coco_json(json_path, data_dir):
    """
    Validate a COCO JSON file and check all images and annotations.

    Verifies that every image listed in the JSON exists on disk, can be
    opened, and has non-zero dimensions, and that every annotation has a
    well-formed, non-degenerate bounding box. Progress and a summary are
    printed to stdout.

    Args:
        json_path: Path to COCO JSON file
        data_dir: Directory where images are located (joined with relative
            file_name entries; absolute file_name entries are used as-is)

    Returns:
        dict with validation results: keys 'missing_images',
        'corrupted_images', 'zero_dimension_images', 'invalid_annotations',
        'zero_area_boxes', each mapping to a list of per-item issue dicts.
        Returned empty-valued if the JSON file itself cannot be loaded.
    """
    print(f"\n{'='*60}")
    print(f"Validating: {json_path}")
    print(f"{'='*60}\n")

    issues = {
        'missing_images': [],
        'corrupted_images': [],
        'zero_dimension_images': [],
        'invalid_annotations': [],
        'zero_area_boxes': []
    }

    try:
        with open(json_path, 'r') as f:
            coco_data = json.load(f)
    except Exception as e:
        # Unreadable/invalid JSON: report and return the empty issue dict
        # so callers can still iterate it uniformly.
        print(f"❌ Failed to load JSON: {e}")
        return issues

    images = coco_data.get('images', [])
    annotations = coco_data.get('annotations', [])

    print(f"📊 Dataset Stats:")
    print(f" Images: {len(images)}")
    print(f" Annotations: {len(annotations)}")
    print(f" Categories: {len(coco_data.get('categories', []))}")
    print()

    # --- Validate images -------------------------------------------------
    print("🔍 Validating images...")
    for idx, img_info in enumerate(images):
        img_id = img_info.get('id')
        file_name = img_info.get('file_name', '')
        width = img_info.get('width', 0)
        height = img_info.get('height', 0)

        # Resolve the on-disk path; COCO file_name may be absolute or
        # relative to data_dir.
        if os.path.isabs(file_name):
            img_path = file_name
        else:
            img_path = os.path.join(data_dir, file_name)

        if not os.path.exists(img_path):
            issues['missing_images'].append({
                'id': img_id,
                'file_name': file_name,
                'expected_path': img_path
            })
            continue

        # Check the image can actually be decoded (PIL raises on corrupt
        # or truncated files when opening/reading the header).
        try:
            with Image.open(img_path) as pil_img:
                pil_width, pil_height = pil_img.size
                # Dimension mismatch is a warning only (training configs
                # typically trust the actual file), so it is printed but
                # not recorded in `issues`.
                if pil_width != width or pil_height != height:
                    print(f"⚠️ Image {img_id}: Dimension mismatch - JSON: {width}x{height}, Actual: {pil_width}x{pil_height}")
                if pil_width == 0 or pil_height == 0:
                    issues['zero_dimension_images'].append({
                        'id': img_id,
                        'file_name': file_name,
                        'dimensions': f"{pil_width}x{pil_height}"
                    })
        except Exception as e:
            issues['corrupted_images'].append({
                'id': img_id,
                'file_name': file_name,
                'error': str(e)
            })

        # Progress indicator
        if (idx + 1) % 100 == 0:
            print(f" Checked {idx + 1}/{len(images)} images...")
    print(f"✅ Image validation complete\n")

    # --- Validate annotations --------------------------------------------
    print("🔍 Validating annotations...")
    for idx, ann in enumerate(annotations):
        ann_id = ann.get('id')
        img_id = ann.get('image_id')
        bbox = ann.get('bbox', [])

        if len(bbox) != 4:
            issues['invalid_annotations'].append({
                'id': ann_id,
                'image_id': img_id,
                'reason': f'Invalid bbox length: {len(bbox)}'
            })
            continue

        x, y, w, h = bbox
        # BUGFIX: these two checks used to be independent `if`s, so a box
        # with w<=0 or h<=0 was appended to zero_area_boxes TWICE (it also
        # satisfies w<1 or h<1), inflating the issue count. `elif` ensures
        # each degenerate box is reported exactly once.
        if w <= 0 or h <= 0:
            issues['zero_area_boxes'].append({
                'id': ann_id,
                'image_id': img_id,
                'bbox': bbox,
                'reason': f'Zero or negative dimensions: w={w}, h={h}'
            })
        elif w < 1 or h < 1:
            # Sub-pixel boxes are a potential issue with mixup augmentation.
            issues['zero_area_boxes'].append({
                'id': ann_id,
                'image_id': img_id,
                'bbox': bbox,
                'reason': f'Extremely small box: w={w}, h={h}'
            })

        # Progress indicator
        if (idx + 1) % 1000 == 0:
            print(f" Checked {idx + 1}/{len(annotations)} annotations...")
    print(f"✅ Annotation validation complete\n")

    # --- Print summary ----------------------------------------------------
    print(f"\n{'='*60}")
    print("VALIDATION SUMMARY")
    print(f"{'='*60}\n")

    total_issues = sum(len(v) for v in issues.values())
    if total_issues == 0:
        print("✅ No issues found! Dataset is ready for training.")
    else:
        print(f"⚠️ Found {total_issues} total issues:\n")
        if issues['missing_images']:
            print(f" ❌ Missing images: {len(issues['missing_images'])}")
            for item in issues['missing_images'][:5]:  # Show first 5
                print(f" - {item['file_name']}")
            if len(issues['missing_images']) > 5:
                print(f" ... and {len(issues['missing_images']) - 5} more")
        if issues['corrupted_images']:
            print(f" ❌ Corrupted images: {len(issues['corrupted_images'])}")
            for item in issues['corrupted_images'][:5]:
                print(f" - {item['file_name']}: {item['error']}")
            if len(issues['corrupted_images']) > 5:
                print(f" ... and {len(issues['corrupted_images']) - 5} more")
        if issues['zero_dimension_images']:
            print(f" ❌ Zero dimension images: {len(issues['zero_dimension_images'])}")
            for item in issues['zero_dimension_images'][:5]:
                print(f" - {item['file_name']}: {item['dimensions']}")
            if len(issues['zero_dimension_images']) > 5:
                print(f" ... and {len(issues['zero_dimension_images']) - 5} more")
        if issues['invalid_annotations']:
            print(f" ❌ Invalid annotations: {len(issues['invalid_annotations'])}")
            for item in issues['invalid_annotations'][:5]:
                print(f" - Ann ID {item['id']}: {item['reason']}")
            if len(issues['invalid_annotations']) > 5:
                print(f" ... and {len(issues['invalid_annotations']) - 5} more")
        if issues['zero_area_boxes']:
            print(f" ⚠️ Zero/tiny area boxes: {len(issues['zero_area_boxes'])}")
            print(f" These may cause issues with mixup augmentation!")
            for item in issues['zero_area_boxes'][:5]:
                print(f" - Ann ID {item['id']}, bbox: {item['bbox']}")
            if len(issues['zero_area_boxes']) > 5:
                print(f" ... and {len(issues['zero_area_boxes']) - 5} more")
    print()
    return issues
def validate_training_dataset(training_id):
    """
    Validate all COCO JSON files (train/valid/test splits) for a training.

    Looks up the training and its project in the database, reconstructs the
    annotations directory used by the training pipeline, and runs
    validate_coco_json on each split's JSON file that exists.

    Args:
        training_id: The training ID to validate

    Returns:
        dict mapping split name ('train'/'valid'/'test') to that split's
        validation-issues dict, or None if the training or its project
        details cannot be found.
    """
    # Imported lazily so the module can be used without the Flask app /
    # DB context when only validate_coco_json is needed.
    from models.training import Training
    from models.TrainingProject import TrainingProject
    from models.TrainingProjectDetails import TrainingProjectDetails
    from services.settings_service import get_setting

    training = Training.query.get(training_id)
    if not training:
        print(f"❌ Training {training_id} not found")
        return

    details = TrainingProjectDetails.query.get(training.project_details_id)
    # BUGFIX: guard against missing project details; previously a None
    # result crashed below with an opaque AttributeError on .project_id.
    if not details:
        print(f"❌ Project details for training {training_id} not found")
        return

    training_project = TrainingProject.query.get(details.project_id)
    # Fall back to a generic folder name when the project row is missing.
    project_name = training_project.title.replace(' ', '_') if training_project else f'project_{details.project_id}'

    training_folder_name = f"{training.exp_name or training.training_name or 'training'}_{training_id}"
    training_folder_name = training_folder_name.replace(' ', '_')

    output_base_path = get_setting('yolox_output_path', './backend')
    data_dir = get_setting('yolox_data_dir', '/home/kitraining/To_Annotate/')
    annotations_dir = os.path.join(output_base_path, project_name, training_folder_name, 'annotations')

    # Validate each split; missing split files are reported but not fatal.
    splits = ['train', 'valid', 'test']
    all_issues = {}
    for split in splits:
        json_file = os.path.join(annotations_dir, f'coco_project_{training_id}_{split}.json')
        if os.path.exists(json_file):
            all_issues[split] = validate_coco_json(json_file, data_dir)
        else:
            print(f"⚠️ JSON file not found: {json_file}")
    return all_issues
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        # BUGFIX: a non-numeric argument previously crashed with an
        # unhandled ValueError traceback; fail with a clear message instead.
        try:
            training_id = int(sys.argv[1])
        except ValueError:
            print(f"❌ Invalid training_id: {sys.argv[1]!r} (must be an integer)")
            sys.exit(1)
        validate_training_dataset(training_id)
    else:
        print("Usage: python validate_dataset.py <training_id>")