cleanup: add training bell

2025-12-08 12:26:34 +01:00
parent 036f3b178a
commit ccfb40a2b3
3070 changed files with 671040 additions and 68602 deletions

0
backend/services/__init__.py Normal file → Executable file

0
backend/services/fetch_labelstudio.py Normal file → Executable file

85
backend/services/generate_json_yolox.py Normal file → Executable file

@@ -7,12 +7,30 @@ from models.Images import Image
from models.Annotation import Annotation
def generate_training_json(training_id):
"""Generate COCO JSON for training, validation, and test sets"""
# training_id is now project_details_id
training_project_details = TrainingProjectDetails.query.get(training_id)
"""Generate COCO JSON for training, validation, and test sets
Args:
training_id: Can be either a Training.id or TrainingProjectDetails.id
Function will automatically detect which one and find the correct details_id
"""
from models.training import Training
# First, try to get as a Training record
training_record = Training.query.get(training_id)
if training_record:
# It's a Training.id - use its project_details_id
details_id = training_record.project_details_id
print(f'[generate_training_json] Using training_id={training_id}, mapped to project_details_id={details_id}')
else:
# Try as TrainingProjectDetails.id directly
details_id = training_id
print(f'[generate_training_json] Using training_id={training_id} as project_details_id directly')
training_project_details = TrainingProjectDetails.query.get(details_id)
if not training_project_details:
raise Exception(f'No TrainingProjectDetails found for project_details_id {training_id}')
raise Exception(f'No TrainingProjectDetails found for id {training_id} (details_id: {details_id})')
details_obj = training_project_details.to_dict()
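In short, the new lookup tries Training.id first and falls back to treating the value as a TrainingProjectDetails.id. A minimal sketch of that resolution as a standalone helper, assuming the SQLAlchemy models imported in the diff (resolve_details_id is a hypothetical name):

def resolve_details_id(training_id):
    """Map an ambiguous id to a TrainingProjectDetails.id.

    Tries Training.id first; falls back to treating the value as a
    TrainingProjectDetails.id. Raises if neither resolves.
    """
    from models.training import Training
    from models.TrainingProjectDetails import TrainingProjectDetails

    training = Training.query.get(training_id)
    details_id = training.project_details_id if training else training_id
    details = TrainingProjectDetails.query.get(details_id)
    if details is None:
        raise Exception(
            f'No TrainingProjectDetails found for id {training_id} '
            f'(details_id: {details_id})'
        )
    return details_id, details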
@@ -110,22 +128,35 @@ def generate_training_json(training_id):
break
# Construct ABSOLUTE path using data_dir
# Normalize data_dir - ensure it uses backslashes for Windows
normalized_data_dir = data_dir.rstrip('/\\').replace('/', '\\')
# Detect platform for proper path handling
import platform
is_windows = platform.system() == 'Windows'
# Normalize data_dir and file_name based on platform
if is_windows:
# Windows: use backslashes
normalized_data_dir = data_dir.rstrip('/\\').replace('/', '\\')
file_name = file_name.replace('/', '\\')
else:
# Linux/Mac: use forward slashes
normalized_data_dir = data_dir.rstrip('/\\').replace('\\', '/')
file_name = file_name.replace('\\', '/')
# Check if already absolute path
if not (file_name.startswith('\\\\') or (len(file_name) > 1 and file_name[1] == ':')):
# It's a relative path, combine with data_dir
# For UNC paths, we need to manually concatenate to preserve \\
if normalized_data_dir.startswith('\\\\'):
# UNC path
file_name = normalized_data_dir + '\\' + file_name.replace('/', '\\')
else:
# Regular path
file_name = os.path.join(normalized_data_dir, file_name.replace('/', '\\'))
else:
# Already absolute, just normalize separators
file_name = file_name.replace('/', '\\')
is_absolute = False
if is_windows:
is_absolute = file_name.startswith('\\\\') or (len(file_name) > 1 and file_name[1] == ':')
else:
is_absolute = file_name.startswith('/')
if not is_absolute:
# It's a relative path, combine with data_dir
if is_windows and normalized_data_dir.startswith('\\\\'):
# Windows UNC path
file_name = normalized_data_dir + '\\' + file_name
else:
# Regular path (Windows or Linux)
file_name = os.path.join(normalized_data_dir, file_name)
# Get annotations for this image
annotations = Annotation.query.filter_by(image_id=image.image_id).all()
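Pulled out of the loop, the platform-aware branch above amounts to a pure function; a sketch for illustration only, with normalize_image_path as a hypothetical name:

import os
import platform

def normalize_image_path(file_name, data_dir):
    """Join a possibly-relative file_name onto data_dir with
    platform-appropriate separators, preserving Windows UNC prefixes."""
    is_windows = platform.system() == 'Windows'
    if is_windows:
        data_dir = data_dir.rstrip('/\\').replace('/', '\\')
        file_name = file_name.replace('/', '\\')
        is_absolute = file_name.startswith('\\\\') or (len(file_name) > 1 and file_name[1] == ':')
    else:
        data_dir = data_dir.rstrip('/\\').replace('\\', '/')
        file_name = file_name.replace('\\', '/')
        is_absolute = file_name.startswith('/')
    if not is_absolute:
        if is_windows and data_dir.startswith('\\\\'):
            # manual concatenation keeps the leading \\ of a UNC share (the diff's rationale)
            file_name = data_dir + '\\' + file_name
        else:
            file_name = os.path.join(data_dir, file_name)
    return file_name

# e.g. on Linux: normalize_image_path('imgs\\a.jpg', '/data/') -> '/data/imgs/a.jpg'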
@@ -218,13 +249,19 @@ def generate_training_json(training_id):
project_name = training_project.title.replace(' ', '_') if training_project and training_project.title else f'project_{details_obj["project_id"]}'
# Get training record to use its name for folder
training_record = Training.query.filter_by(project_details_id=training_id).first()
training_folder_name = f"{training_record.exp_name or training_record.training_name or 'training'}_{training_record.id}" if training_record else str(training_id)
training_folder_name = training_folder_name.replace(' ', '_')
# Get training record to use its name and ID for folder and file names
# Use the same training_id that was passed in (if it was a Training.id)
# or find the first training for this details_id
if not training_record:
training_record = Training.query.filter_by(project_details_id=details_id).first()
# Use training_record.id for file names to match what generate_yolox_exp expects
training_file_id = training_record.id if training_record else training_id
if training_record:
training_folder_name = f"{training_record.exp_name or training_record.training_name or 'training'}_{training_record.id}"
training_folder_name = training_folder_name.replace(' ', '_')
training_file_id = training_record.id
else:
training_folder_name = str(details_id)
training_file_id = details_id
# Save annotations to the configured output folder
annotations_dir = os.path.join(output_base_path, project_name, training_folder_name, 'annotations')
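The folder and file-id derivation reduces to a small helper; a sketch assuming a Training row with exp_name, training_name, and id attributes, as in the diff:

def build_training_folder_name(training_record, details_id):
    """Return (folder_name, file_id); fall back to details_id when
    no Training row exists for this project_details_id."""
    if training_record is None:
        return str(details_id), details_id
    base = training_record.exp_name or training_record.training_name or 'training'
    return f'{base}_{training_record.id}'.replace(' ', '_'), training_record.id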
@@ -242,7 +279,7 @@ def generate_training_json(training_id):
with open(test_path, 'w') as f:
json.dump(test_json, f, indent=2)
print(f'COCO JSON splits written to {annotations_dir} for trainingId {training_id}')
print(f'COCO JSON splits written to {annotations_dir} for training_id={training_file_id} (details_id={details_id})')
# Also generate inference exp.py
from services.generate_yolox_exp import generate_yolox_inference_exp
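For context, the write step is three json.dump calls into that directory; a hedged sketch (train_json, valid_json, and test_json are assumed split payloads, and the file-name pattern is taken from validate_dataset.py below):

import json
import os

os.makedirs(annotations_dir, exist_ok=True)
for split, payload in (('train', train_json), ('valid', valid_json), ('test', test_json)):
    path = os.path.join(annotations_dir, f'coco_project_{training_file_id}_{split}.json')
    with open(path, 'w') as f:
        json.dump(payload, f, indent=2)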

68
backend/services/generate_yolox_exp.py Normal file → Executable file

@@ -220,6 +220,10 @@ def generate_yolox_inference_exp(training_id, options=None, use_base_config=Fals
annotations_parent_dir = os.path.join(output_base_path, project_name, training_folder_name)
annotations_parent_escaped = annotations_parent_dir.replace('\\', '\\\\')
# Set output directory for checkpoints - models subdirectory
models_dir = os.path.join(annotations_parent_dir, 'models')
models_dir_escaped = models_dir.replace('\\', '\\\\')
# Build exp content
exp_content = f'''#!/usr/bin/env python3
# -*- coding:utf-8 -*-
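The *_escaped variables exist because these Windows paths are spliced into generated Python source, where a single backslash would start an escape sequence; a tiny illustration:

models_dir = r'\\nas\share\proj\models'        # hypothetical UNC path
models_dir_escaped = models_dir.replace('\\', '\\\\')
print(f'self.output_dir = "{models_dir_escaped}"')
# prints: self.output_dir = "\\\\nas\\share\\proj\\models"
# which, parsed as Python in the generated exp.py, evaluates back to \\nas\share\proj\models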
@@ -235,6 +239,7 @@ class Exp(MyExp):
super(Exp, self).__init__()
self.data_dir = "{data_dir_escaped}" # Where images are located
self.annotations_dir = "{annotations_parent_escaped}" # Where annotation JSONs are located
self.output_dir = "{models_dir_escaped}" # Where checkpoints will be saved
self.train_ann = "{train_ann}"
self.val_ann = "{val_ann}"
self.test_ann = "{test_ann}"
@@ -252,21 +257,46 @@ class Exp(MyExp):
if selected_model:
exp_content += f" self.pretrained_ckpt = r'{yolox_base_dir}/pretrained/{selected_model}.pth'\n"
# Format arrays
def format_value(val):
# Format arrays and values for Python code generation
# Integer-only parameters (sizes, epochs, intervals)
integer_params = {
'input_size', 'test_size', 'random_size', 'max_epoch', 'warmup_epochs',
'no_aug_epochs', 'print_interval', 'eval_interval', 'multiscale_range',
'data_num_workers', 'num_classes'
}
def format_value(val, param_name=''):
if isinstance(val, (list, tuple)):
return '(' + ', '.join(map(str, val)) + ')'
# Check if this parameter should have integer values
if param_name in integer_params:
# Convert all values to integers
formatted_items = [str(int(float(item))) if isinstance(item, (int, float)) else str(item) for item in val]
else:
# Keep as floats or original type
formatted_items = []
for item in val:
if isinstance(item, float):
formatted_items.append(str(item))
elif isinstance(item, int):
formatted_items.append(str(item))
else:
formatted_items.append(str(item))
return '(' + ', '.join(formatted_items) + ')'
elif isinstance(val, bool):
return str(val)
elif isinstance(val, str):
return f'"{val}"'
elif isinstance(val, int):
return str(val)
elif isinstance(val, float):
return str(val)
else:
return str(val)
# Add all config parameters to exp
for key, value in config.items():
if key not in ['exp_name']: # exp_name is handled separately
exp_content += f" self.{key} = {format_value(value)}\n"
exp_content += f" self.{key} = {format_value(value, key)}\n"
# Add get_dataset override using name parameter for image directory
exp_content += '''
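The effect of the param-aware formatter is easiest to see on concrete inputs; expected results, assuming the format_value defined above:

format_value((640.0, 640.0), 'input_size')  # '(640, 640)'   -> coerced to ints
format_value((0.5, 1.5), 'mixup_scale')     # '(0.5, 1.5)'   -> floats preserved
format_value(True)                          # 'True'
format_value('silu')                        # '"silu"'       -> quoted for codegen
format_value(0.01)                          # '0.01'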
@@ -289,7 +319,7 @@ class Exp(MyExp):
def get_eval_dataset(self, **kwargs):
"""Override eval dataset using name parameter"""
from yolox.data import COCODataset
from yolox.data import COCODataset, ValTransform
testdev = kwargs.get("testdev", False)
legacy = kwargs.get("legacy", False)
@@ -299,8 +329,34 @@ class Exp(MyExp):
json_file=self.val_ann if not testdev else self.test_ann,
name="",
img_size=self.test_size,
preproc=None, # No preprocessing for evaluation
preproc=ValTransform(legacy=legacy), # Use proper validation transform
)
def get_eval_loader(self, batch_size, is_distributed, **kwargs):
"""Standard YOLOX eval loader - matches official implementation"""
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader
valdataset = self.get_eval_dataset(**kwargs)
if is_distributed:
batch_size = batch_size // dist.get_world_size()
sampler = torch.utils.data.distributed.DistributedSampler(
valdataset, shuffle=False
)
else:
sampler = torch.utils.data.SequentialSampler(valdataset)
dataloader_kwargs = {
"num_workers": self.data_num_workers,
"pin_memory": True,
"sampler": sampler,
}
dataloader_kwargs["batch_size"] = batch_size
val_loader = DataLoader(valdataset, **dataloader_kwargs)
return val_loader
'''
# Add exp_name at the end (uses dynamic path)
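To sanity-check a generated file end to end, it can be loaded the way YOLOX's own tools load experiment files; a sketch assuming a pip-installed yolox and the attributes emitted above:

from yolox.exp import get_exp

exp = get_exp('path/to/generated/exp.py', None)   # placeholder path
print(exp.output_dir, exp.train_ann, exp.test_size)
val_loader = exp.get_eval_loader(batch_size=8, is_distributed=False)
print(f'{len(val_loader.dataset)} eval images')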

0
backend/services/push_yolox_exp.py Normal file → Executable file

0
backend/services/seed_label_studio.py Normal file → Executable file

0
backend/services/settings_service.py Normal file → Executable file

46
backend/services/training_queue.py Normal file → Executable file

@@ -112,16 +112,35 @@ class TrainingQueueManager:
if line:
print(line.strip())
# Parse iteration from YOLOX output
# Example: "2025-12-02 07:30:15 | INFO | yolox.core.trainer:78 - Epoch: [5/300]"
match = re.search(r'Epoch:\s*\[(\d+)/(\d+)\]', line)
if match:
current_epoch = int(match.group(1))
total_epochs = int(match.group(2))
# Parse epoch and iteration from YOLOX output
# Example: "epoch: 3/300, iter: 90/101"
epoch_match = re.search(r'epoch:\s*(\d+)/(\d+)', line, re.IGNORECASE)
iter_match = re.search(r'iter:\s*(\d+)/(\d+)', line, re.IGNORECASE)
if epoch_match:
current_epoch = int(epoch_match.group(1))
total_epochs = int(epoch_match.group(2))
if self.current_training:
self.current_training['iteration'] = current_epoch
self.current_training['current_epoch'] = current_epoch
self.current_training['max_epoch'] = total_epochs
print(f'Progress: {current_epoch}/{total_epochs}')
# Debug log
print(f'[PROGRESS] Parsed epoch: {current_epoch}/{total_epochs}')
if iter_match:
current_iter = int(iter_match.group(1))
total_iters = int(iter_match.group(2))
if self.current_training:
self.current_training['current_iter'] = current_iter
self.current_training['total_iters'] = total_iters
# Calculate overall progress percentage
if 'current_epoch' in self.current_training and 'max_epoch' in self.current_training:
epoch_progress = (self.current_training['current_epoch'] - 1) / self.current_training['max_epoch']
iter_progress = current_iter / total_iters / self.current_training['max_epoch']
total_progress = (epoch_progress + iter_progress) * 100
self.current_training['progress'] = round(total_progress, 2)
# Debug log
print(f'[PROGRESS] Epoch {self.current_training["current_epoch"]}/{self.current_training["max_epoch"]}, Iter {current_iter}/{total_iters}, Progress: {self.current_training["progress"]}%')
# Wait for completion
self.current_process.wait()
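The two regexes and the weighting formula can be verified in isolation; a self-contained check against a sample trainer line:

import re

line = 'epoch: 3/300, iter: 90/101'  # sample YOLOX trainer output
current_epoch, max_epoch = map(int, re.search(r'epoch:\s*(\d+)/(\d+)', line, re.I).groups())
current_iter, total_iters = map(int, re.search(r'iter:\s*(\d+)/(\d+)', line, re.I).groups())

# Completed epochs plus the fraction of the epoch currently running.
epoch_progress = (current_epoch - 1) / max_epoch
iter_progress = current_iter / total_iters / max_epoch
print(round((epoch_progress + iter_progress) * 100, 2))  # 0.96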
@@ -158,11 +177,18 @@ class TrainingQueueManager:
}
if self.current_training:
current_epoch = self.current_training.get('current_epoch', 0)
max_epoch = self.current_training.get('max_epoch', 300)
result['current'] = {
'training_id': self.current_training['training_id'],
'name': self.current_training.get('name', f'Training {self.current_training["training_id"]}'),
'iteration': self.current_training.get('iteration', 0),
'max_epoch': self.current_training.get('max_epoch', 300)
'epoch': current_epoch, # For backward compatibility
'current_epoch': current_epoch,
'max_epoch': max_epoch,
'current_iter': self.current_training.get('current_iter', 0),
'total_iters': self.current_training.get('total_iters', 0),
'progress': self.current_training.get('progress', 0.0),
'iteration': current_epoch # For backward compatibility
}
return result
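A client polling the queue status would now receive a current entry shaped like this (values illustrative):

{
    'training_id': 42,
    'name': 'Training 42',
    'current_epoch': 3,
    'max_epoch': 300,
    'current_iter': 90,
    'total_iters': 101,
    'progress': 0.96,
    'epoch': 3,      # backward compatibility
    'iteration': 3,  # backward compatibility
}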

244
backend/services/validate_dataset.py Normal file

@@ -0,0 +1,244 @@
"""
Validate dataset for training - check for problematic images and annotations
"""
import os
import json
from PIL import Image
def validate_coco_json(json_path, data_dir):
"""
Validate a COCO JSON file and check all images
Args:
json_path: Path to COCO JSON file
data_dir: Directory where images are located
Returns:
dict with validation results
"""
print(f"\n{'='*60}")
print(f"Validating: {json_path}")
print(f"{'='*60}\n")
issues = {
'missing_images': [],
'corrupted_images': [],
'zero_dimension_images': [],
'invalid_annotations': [],
'zero_area_boxes': []
}
try:
with open(json_path, 'r') as f:
coco_data = json.load(f)
except Exception as e:
print(f"❌ Failed to load JSON: {e}")
return issues
images = coco_data.get('images', [])
annotations = coco_data.get('annotations', [])
print(f"📊 Dataset Stats:")
print(f" Images: {len(images)}")
print(f" Annotations: {len(annotations)}")
print(f" Categories: {len(coco_data.get('categories', []))}")
print()
# Validate images
print("🔍 Validating images...")
for idx, img_info in enumerate(images):
img_id = img_info.get('id')
file_name = img_info.get('file_name', '')
width = img_info.get('width', 0)
height = img_info.get('height', 0)
# Check if image file exists
# Try to construct the full path
if os.path.isabs(file_name):
img_path = file_name
else:
img_path = os.path.join(data_dir, file_name)
if not os.path.exists(img_path):
issues['missing_images'].append({
'id': img_id,
'file_name': file_name,
'expected_path': img_path
})
continue
# Check if image can be loaded
try:
# Try with PIL
with Image.open(img_path) as pil_img:
pil_width, pil_height = pil_img.size
# Check if dimensions match JSON
if pil_width != width or pil_height != height:
print(f"⚠️ Image {img_id}: Dimension mismatch - JSON: {width}x{height}, Actual: {pil_width}x{pil_height}")
# Check for zero dimensions
if pil_width == 0 or pil_height == 0:
issues['zero_dimension_images'].append({
'id': img_id,
'file_name': file_name,
'dimensions': f"{pil_width}x{pil_height}"
})
except Exception as e:
issues['corrupted_images'].append({
'id': img_id,
'file_name': file_name,
'error': str(e)
})
# Progress indicator
if (idx + 1) % 100 == 0:
print(f" Checked {idx + 1}/{len(images)} images...")
print(f"✅ Image validation complete\n")
# Validate annotations
print("🔍 Validating annotations...")
for idx, ann in enumerate(annotations):
ann_id = ann.get('id')
img_id = ann.get('image_id')
bbox = ann.get('bbox', [])
if len(bbox) != 4:
issues['invalid_annotations'].append({
'id': ann_id,
'image_id': img_id,
'reason': f'Invalid bbox length: {len(bbox)}'
})
continue
x, y, w, h = bbox
# Check for zero or negative dimensions
if w <= 0 or h <= 0:
issues['zero_area_boxes'].append({
'id': ann_id,
'image_id': img_id,
'bbox': bbox,
'reason': f'Zero or negative dimensions: w={w}, h={h}'
})
continue  # skip the tiny-box check so the same box is not reported twice
# Check for extremely small boxes (potential issue with mixup)
if w < 1 or h < 1:
issues['zero_area_boxes'].append({
'id': ann_id,
'image_id': img_id,
'bbox': bbox,
'reason': f'Extremely small box: w={w}, h={h}'
})
# Progress indicator
if (idx + 1) % 1000 == 0:
print(f" Checked {idx + 1}/{len(annotations)} annotations...")
print(f"✅ Annotation validation complete\n")
# Print summary
print(f"\n{'='*60}")
print("VALIDATION SUMMARY")
print(f"{'='*60}\n")
total_issues = sum(len(v) for v in issues.values())
if total_issues == 0:
print("✅ No issues found! Dataset is ready for training.")
else:
print(f"⚠️ Found {total_issues} total issues:\n")
if issues['missing_images']:
print(f" ❌ Missing images: {len(issues['missing_images'])}")
for item in issues['missing_images'][:5]: # Show first 5
print(f" - {item['file_name']}")
if len(issues['missing_images']) > 5:
print(f" ... and {len(issues['missing_images']) - 5} more")
if issues['corrupted_images']:
print(f" ❌ Corrupted images: {len(issues['corrupted_images'])}")
for item in issues['corrupted_images'][:5]:
print(f" - {item['file_name']}: {item['error']}")
if len(issues['corrupted_images']) > 5:
print(f" ... and {len(issues['corrupted_images']) - 5} more")
if issues['zero_dimension_images']:
print(f" ❌ Zero dimension images: {len(issues['zero_dimension_images'])}")
for item in issues['zero_dimension_images'][:5]:
print(f" - {item['file_name']}: {item['dimensions']}")
if len(issues['zero_dimension_images']) > 5:
print(f" ... and {len(issues['zero_dimension_images']) - 5} more")
if issues['invalid_annotations']:
print(f" ❌ Invalid annotations: {len(issues['invalid_annotations'])}")
for item in issues['invalid_annotations'][:5]:
print(f" - Ann ID {item['id']}: {item['reason']}")
if len(issues['invalid_annotations']) > 5:
print(f" ... and {len(issues['invalid_annotations']) - 5} more")
if issues['zero_area_boxes']:
print(f" ⚠️ Zero/tiny area boxes: {len(issues['zero_area_boxes'])}")
print(f" These may cause issues with mixup augmentation!")
for item in issues['zero_area_boxes'][:5]:
print(f" - Ann ID {item['id']}, bbox: {item['bbox']}")
if len(issues['zero_area_boxes']) > 5:
print(f" ... and {len(issues['zero_area_boxes']) - 5} more")
print()
return issues
def validate_training_dataset(training_id):
"""
Validate all COCO JSON files for a training
Args:
training_id: The training ID to validate
"""
from models.training import Training
from models.TrainingProject import TrainingProject
from services.settings_service import get_setting
training = Training.query.get(training_id)
if not training:
print(f"❌ Training {training_id} not found")
return
# Get paths
from models.TrainingProjectDetails import TrainingProjectDetails
details = TrainingProjectDetails.query.get(training.project_details_id)
training_project = TrainingProject.query.get(details.project_id)
project_name = training_project.title.replace(' ', '_') if training_project else f'project_{details.project_id}'
training_folder_name = f"{training.exp_name or training.training_name or 'training'}_{training_id}"
training_folder_name = training_folder_name.replace(' ', '_')
output_base_path = get_setting('yolox_output_path', './backend')
data_dir = get_setting('yolox_data_dir', '/home/kitraining/To_Annotate/')
annotations_dir = os.path.join(output_base_path, project_name, training_folder_name, 'annotations')
# Validate each split
splits = ['train', 'valid', 'test']
all_issues = {}
for split in splits:
json_file = os.path.join(annotations_dir, f'coco_project_{training_id}_{split}.json')
if os.path.exists(json_file):
all_issues[split] = validate_coco_json(json_file, data_dir)
else:
print(f"⚠️ JSON file not found: {json_file}")
return all_issues
if __name__ == '__main__':
import sys
if len(sys.argv) > 1:
training_id = int(sys.argv[1])
validate_training_dataset(training_id)
else:
print("Usage: python validate_dataset.py <training_id>")