cleanup add training bell
This commit is contained in:
244
backend/services/validate_dataset.py
Executable file
244
backend/services/validate_dataset.py
Executable file
@@ -0,0 +1,244 @@
|
||||
"""
|
||||
Validate dataset for training - check for problematic images and annotations
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
from PIL import Image
|
||||
import cv2
|
||||
|
||||
def validate_coco_json(json_path, data_dir):
    """
    Validate a COCO JSON file and check all images.

    Checks that every referenced image file exists, can be opened, and has
    sane dimensions, and that every annotation has a well-formed, non-empty
    bbox. Progress and a summary are printed to stdout.

    Args:
        json_path: Path to COCO JSON file
        data_dir: Directory where images are located (used for relative
            file_name entries; absolute file_name entries are used as-is)

    Returns:
        dict with validation results, keyed by issue category. Each value is
        a list of per-item dicts describing the problem. All lists are empty
        when the dataset is clean (also returned empty if the JSON itself
        fails to load).
    """
    print(f"\n{'='*60}")
    print(f"Validating: {json_path}")
    print(f"{'='*60}\n")

    issues = {
        'missing_images': [],
        'corrupted_images': [],
        'zero_dimension_images': [],
        'invalid_annotations': [],
        'zero_area_boxes': []
    }

    try:
        with open(json_path, 'r') as f:
            coco_data = json.load(f)
    except Exception as e:
        # Can't validate anything without the JSON; report and bail out
        # with the (empty) issues dict so callers get a uniform shape.
        print(f"❌ Failed to load JSON: {e}")
        return issues

    images = coco_data.get('images', [])
    annotations = coco_data.get('annotations', [])

    print(f"📊 Dataset Stats:")
    print(f"   Images: {len(images)}")
    print(f"   Annotations: {len(annotations)}")
    print(f"   Categories: {len(coco_data.get('categories', []))}")
    print()

    # Validate images
    print("🔍 Validating images...")
    for idx, img_info in enumerate(images):
        img_id = img_info.get('id')
        file_name = img_info.get('file_name', '')
        width = img_info.get('width', 0)
        height = img_info.get('height', 0)

        # Resolve the image path: absolute file_name wins, otherwise it is
        # taken relative to data_dir.
        if os.path.isabs(file_name):
            img_path = file_name
        else:
            img_path = os.path.join(data_dir, file_name)

        if not os.path.exists(img_path):
            issues['missing_images'].append({
                'id': img_id,
                'file_name': file_name,
                'expected_path': img_path
            })
            continue

        # Check if image can be loaded
        try:
            # Try with PIL; opening is lazy, but .size forces a header read
            with Image.open(img_path) as pil_img:
                pil_width, pil_height = pil_img.size

                # Dimension mismatch against the JSON is only warned about,
                # not recorded as an issue (training can still proceed).
                if pil_width != width or pil_height != height:
                    print(f"⚠️  Image {img_id}: Dimension mismatch - JSON: {width}x{height}, Actual: {pil_width}x{pil_height}")

                # Check for zero dimensions
                if pil_width == 0 or pil_height == 0:
                    issues['zero_dimension_images'].append({
                        'id': img_id,
                        'file_name': file_name,
                        'dimensions': f"{pil_width}x{pil_height}"
                    })
        except Exception as e:
            issues['corrupted_images'].append({
                'id': img_id,
                'file_name': file_name,
                'error': str(e)
            })

        # Progress indicator
        if (idx + 1) % 100 == 0:
            print(f"   Checked {idx + 1}/{len(images)} images...")

    print(f"✅ Image validation complete\n")

    # Validate annotations
    print("🔍 Validating annotations...")
    for idx, ann in enumerate(annotations):
        ann_id = ann.get('id')
        img_id = ann.get('image_id')
        bbox = ann.get('bbox', [])

        if len(bbox) != 4:
            issues['invalid_annotations'].append({
                'id': ann_id,
                'image_id': img_id,
                'reason': f'Invalid bbox length: {len(bbox)}'
            })
            continue

        x, y, w, h = bbox

        # Check for zero or negative dimensions.
        # BUGFIX: this used to be two independent `if` blocks, so a box with
        # w <= 0 or h <= 0 also matched the `w < 1 or h < 1` check below and
        # was recorded twice, inflating the issue count. `elif` makes the
        # two categories mutually exclusive.
        if w <= 0 or h <= 0:
            issues['zero_area_boxes'].append({
                'id': ann_id,
                'image_id': img_id,
                'bbox': bbox,
                'reason': f'Zero or negative dimensions: w={w}, h={h}'
            })
        # Check for extremely small boxes (potential issue with mixup)
        elif w < 1 or h < 1:
            issues['zero_area_boxes'].append({
                'id': ann_id,
                'image_id': img_id,
                'bbox': bbox,
                'reason': f'Extremely small box: w={w}, h={h}'
            })

        # Progress indicator
        if (idx + 1) % 1000 == 0:
            print(f"   Checked {idx + 1}/{len(annotations)} annotations...")

    print(f"✅ Annotation validation complete\n")

    # Print summary
    print(f"\n{'='*60}")
    print("VALIDATION SUMMARY")
    print(f"{'='*60}\n")

    total_issues = sum(len(v) for v in issues.values())

    if total_issues == 0:
        print("✅ No issues found! Dataset is ready for training.")
    else:
        print(f"⚠️  Found {total_issues} total issues:\n")

        if issues['missing_images']:
            print(f"   ❌ Missing images: {len(issues['missing_images'])}")
            for item in issues['missing_images'][:5]:  # Show first 5
                print(f"      - {item['file_name']}")
            if len(issues['missing_images']) > 5:
                print(f"      ... and {len(issues['missing_images']) - 5} more")

        if issues['corrupted_images']:
            print(f"   ❌ Corrupted images: {len(issues['corrupted_images'])}")
            for item in issues['corrupted_images'][:5]:
                print(f"      - {item['file_name']}: {item['error']}")
            if len(issues['corrupted_images']) > 5:
                print(f"      ... and {len(issues['corrupted_images']) - 5} more")

        if issues['zero_dimension_images']:
            print(f"   ❌ Zero dimension images: {len(issues['zero_dimension_images'])}")
            for item in issues['zero_dimension_images'][:5]:
                print(f"      - {item['file_name']}: {item['dimensions']}")
            if len(issues['zero_dimension_images']) > 5:
                print(f"      ... and {len(issues['zero_dimension_images']) - 5} more")

        if issues['invalid_annotations']:
            print(f"   ❌ Invalid annotations: {len(issues['invalid_annotations'])}")
            for item in issues['invalid_annotations'][:5]:
                print(f"      - Ann ID {item['id']}: {item['reason']}")
            if len(issues['invalid_annotations']) > 5:
                print(f"      ... and {len(issues['invalid_annotations']) - 5} more")

        if issues['zero_area_boxes']:
            print(f"   ⚠️  Zero/tiny area boxes: {len(issues['zero_area_boxes'])}")
            print(f"      These may cause issues with mixup augmentation!")
            for item in issues['zero_area_boxes'][:5]:
                print(f"      - Ann ID {item['id']}, bbox: {item['bbox']}")
            if len(issues['zero_area_boxes']) > 5:
                print(f"      ... and {len(issues['zero_area_boxes']) - 5} more")

    print()
    return issues
|
||||
|
||||
|
||||
def validate_training_dataset(training_id):
    """
    Validate all COCO JSON files for a training.

    Looks up the training record and its project, reconstructs the on-disk
    annotations directory, and runs validate_coco_json on each split JSON
    that exists, warning about any that are missing.

    Args:
        training_id: The training ID to validate

    Returns:
        dict mapping split name ('train'/'valid'/'test') to that split's
        validation issues, or None if the training record does not exist.
    """
    from models.training import Training
    from models.TrainingProject import TrainingProject
    from services.settings_service import get_setting

    record = Training.query.get(training_id)
    if not record:
        print(f"❌ Training {training_id} not found")
        return

    # Get paths
    from models.TrainingProjectDetails import TrainingProjectDetails
    proj_details = TrainingProjectDetails.query.get(record.project_details_id)
    project = TrainingProject.query.get(proj_details.project_id)
    if project:
        project_name = project.title.replace(' ', '_')
    else:
        project_name = f'project_{proj_details.project_id}'

    # Folder name mirrors what the training pipeline wrote out:
    # "<exp or training name>_<id>", spaces replaced with underscores.
    base_name = record.exp_name or record.training_name or 'training'
    folder = f"{base_name}_{training_id}".replace(' ', '_')

    output_root = get_setting('yolox_output_path', './backend')
    images_root = get_setting('yolox_data_dir', '/home/kitraining/To_Annotate/')

    ann_dir = os.path.join(output_root, project_name, folder, 'annotations')

    # Validate each split
    results = {}
    for split in ('train', 'valid', 'test'):
        split_json = os.path.join(ann_dir, f'coco_project_{training_id}_{split}.json')
        if not os.path.exists(split_json):
            print(f"⚠️  JSON file not found: {split_json}")
            continue
        results[split] = validate_coco_json(split_json, images_root)

    return results
|
||||
|
||||
|
||||
if __name__ == '__main__':
    import sys

    if len(sys.argv) > 1:
        # BUGFIX: guard the int() conversion so a non-numeric argument
        # produces a clear error and exit code instead of a ValueError
        # traceback.
        try:
            training_id = int(sys.argv[1])
        except ValueError:
            print(f"❌ Invalid training_id: {sys.argv[1]!r} (must be an integer)")
            sys.exit(1)
        validate_training_dataset(training_id)
    else:
        print("Usage: python validate_dataset.py <training_id>")
|
||||
Reference in New Issue
Block a user