# Abschluss-Projekt/backend/services/generate_json_yolox.py

import json
import os
import math
from models.TrainingProject import TrainingProject
from models.TrainingProjectDetails import TrainingProjectDetails
from models.Images import Image
from models.Annotation import Annotation

# Prefix Label Studio puts in front of locally served files.
_LOCAL_FILES_PREFIX = '/data/local-files/?d='

# Storage prefixes that may be baked into stored image paths. Only the first
# matching prefix is stripped, so order matters.
_KNOWN_PATH_PREFIXES = (
    '//192.168.1.19/home/kitraining/To_Annotate/',
    '192.168.1.19/home/kitraining/To_Annotate/',
    '/home/kitraining/home/kitraining/',
    'home/kitraining/To_Annotate/',
    '/home/kitraining/To_Annotate/',
)


def _value_or_default(mapping, key, default):
    """Return mapping[key], or default when the key is missing or its value is None."""
    value = mapping.get(key)
    return default if value is None else value


def _resolve_base_dir(data_dir, is_windows):
    """Return data_dir without trailing separators, using the platform's separator style."""
    import re

    data_dir = data_dir or ''

    # A UNC path saved without its leading "\\" looks like "192.168.1.19\share\...";
    # restore the prefix so it is treated as a network path again.
    if not data_dir.startswith(('\\\\', '/')) and re.match(r'^\d+\.\d+\.\d+\.\d+[/\\]', data_dir):
        data_dir = '\\\\' + data_dir

    stripped = data_dir.rstrip('/\\')
    if is_windows:
        return stripped.replace('/', '\\')
    return stripped.replace('\\', '/')


def _clean_image_path(raw_path):
    """Strip URL escapes for spaces and known Label Studio/storage prefixes from a stored path."""
    file_name = (raw_path or '').replace('%20', ' ')
    if file_name.startswith(_LOCAL_FILES_PREFIX):
        file_name = file_name[len(_LOCAL_FILES_PREFIX):]

    for prefix in _KNOWN_PATH_PREFIXES:
        if file_name.startswith(prefix):
            return file_name[len(prefix):]
    return file_name


def _to_absolute_path(file_name, base_dir, is_windows):
    """Normalize separators in file_name and prepend base_dir unless it is already absolute."""
    if is_windows:
        file_name = file_name.replace('/', '\\')
        # Absolute on Windows: UNC path ("\\host\...") or drive letter ("C:...")
        is_absolute = file_name.startswith('\\\\') or (len(file_name) > 1 and file_name[1] == ':')
    else:
        file_name = file_name.replace('\\', '/')
        is_absolute = file_name.startswith('/')

    if is_absolute:
        return file_name
    if is_windows and base_dir.startswith('\\\\'):
        # Concatenate manually so the UNC base is kept verbatim
        return base_dir + '\\' + file_name
    return os.path.join(base_dir, file_name)


def _seeded_shuffle(items, seed):
    """Shuffle items in place with a deterministic, sin-based pseudo-random sequence."""
    for i in range(len(items) - 1, 0, -1):
        x = math.sin(seed + i) * 10000
        # Fractional part of x scaled to [0, i]; clamp guards against float rounding to 1.0
        j = min(int((x - math.floor(x)) * (i + 1)), i)
        items[i], items[j] = items[j], items[i]


def generate_training_json(training_id):
    """Generate COCO JSON files for the training, validation, and test splits.

    Collects all images and annotations of the annotation projects referenced by
    the training project details, maps source classes to target classes, shuffles
    the images deterministically, and writes one COCO JSON file per split into
    ``<yolox_output_path>/<project>/<training folder>/annotations``. Afterwards it
    tries to write an inference ``exp_infer.py``; a failure there is only printed.

    The train and validation sizes come from ``train_percent`` / ``valid_percent``
    (defaults 85 / 10); the test split receives all remaining images.

    Args:
        training_id: Either a Training.id or a TrainingProjectDetails.id. If a
            Training record with this id exists, its project_details_id is used;
            otherwise the value is treated as the details id directly.

    Raises:
        Exception: If no TrainingProjectDetails record exists for the resolved id.
    """
    import platform

    from models.ClassMapping import ClassMapping
    from models.training import Training
    from services.settings_service import get_setting

    # Resolve which TrainingProjectDetails record the id refers to
    training_record = Training.query.get(training_id)
    if training_record:
        details_id = training_record.project_details_id
        print(f'[generate_training_json] Using training_id={training_id}, mapped to project_details_id={details_id}')
    else:
        details_id = training_id
        print(f'[generate_training_json] Using training_id={training_id} as project_details_id directly')

    training_project_details = TrainingProjectDetails.query.get(details_id)
    if not training_project_details:
        raise Exception(f'No TrainingProjectDetails found for id {training_id} (details_id: {details_id})')

    details_obj = training_project_details.to_dict()

    # Parent project, used only for the output folder name
    training_project = TrainingProject.query.get(details_obj['project_id'])

    # Platform and base directory are the same for every image, so resolve them once
    is_windows = platform.system() == 'Windows'
    base_dir = _resolve_base_dir(
        get_setting('yolox_data_dir', '/home/kitraining/To_Annotate/'), is_windows
    )

    # A key stored with a null value must fall back to the default as well
    train_percent = _value_or_default(details_obj, 'train_percent', 85)
    valid_percent = _value_or_default(details_obj, 'valid_percent', 10)

    coco_categories = []
    category_map = {}

    def register_category(name):
        # Category ids are assigned sequentially from 0 in order of first appearance
        if name and name not in category_map:
            category_map[name] = len(coco_categories)
            coco_categories.append({'id': category_map[name], 'name': name, 'supercategory': ''})

    # Categories from the class_map dictionary {source: target} stored on the details
    for target_class in (details_obj.get('class_map') or {}).values():
        register_category(target_class)

    # Class mappings from the database, grouped by Label Studio project.
    # The lookup uses the resolved details_id: training_id may be a Training.id,
    # which is not the key the mappings are filtered on.
    mappings_by_project = {}
    for mapping in ClassMapping.query.filter_by(project_details_id=details_id).all():
        project_map = mappings_by_project.setdefault(mapping.label_studio_project_id, {})
        project_map[mapping.source_class] = mapping.target_class
        register_category(mapping.target_class)

    # Collect images and annotations of every annotation project
    coco_images = []
    coco_annotations = []
    for ls_project_id in details_obj.get('annotation_projects') or []:
        project_class_map = mappings_by_project.get(ls_project_id, {})

        for image in Image.query.filter_by(project_id=ls_project_id).all():
            image_id = len(coco_images) + 1  # COCO image ids are 1-based here
            file_name = _to_absolute_path(
                _clean_image_path(image.image_path), base_dir, is_windows
            )

            coco_images.append({
                'id': image_id,
                'file_name': file_name,  # absolute path
                'width': image.width or 0,
                'height': image.height or 0
            })

            for annotation in Annotation.query.filter_by(image_id=image.image_id).all():
                # Unmapped labels fall through unchanged and are kept only if
                # they happen to be a known target category
                original_class = annotation.Label
                mapped_class = project_class_map.get(original_class, original_class)
                if not mapped_class or mapped_class not in category_map:
                    continue

                area = 0
                if annotation.width and annotation.height:
                    area = annotation.width * annotation.height

                coco_annotations.append({
                    'id': len(coco_annotations) + 1,
                    'image_id': image_id,
                    'category_id': category_map[mapped_class],
                    'bbox': [annotation.x, annotation.y, annotation.width, annotation.height],
                    'area': area,
                    'iscrowd': 0
                })

    # Deterministic shuffle so the same seed always yields the same split
    split_seed = details_obj.get('seed')
    split_seed = 42 if split_seed is None else int(split_seed)
    _seeded_shuffle(coco_images, split_seed)

    # Split images; the test split takes whatever is left after train and valid
    total_images = len(coco_images)
    train_count = int(total_images * train_percent / 100)
    valid_count = int(total_images * valid_percent / 100)
    split_images = {
        'train': coco_images[:train_count],
        'valid': coco_images[train_count:train_count + valid_count],
        'test': coco_images[train_count + valid_count:],
    }

    # Build one COCO document per split; annotations follow their image
    split_jsons = {}
    for split_name, images in split_images.items():
        image_ids = {img['id'] for img in images}
        split_jsons[split_name] = {
            'images': images,
            'annotations': [ann for ann in coco_annotations if ann['image_id'] in image_ids],
            'categories': coco_categories
        }

    # Resolve output location
    output_base_path = get_setting('yolox_output_path', './backend')

    if training_project and training_project.title:
        project_name = training_project.title.replace(' ', '_')
    else:
        project_name = f'project_{details_obj["project_id"]}'

    # Folder and file names come from the training record: the one matching the
    # given id, else the first training that belongs to these details
    if not training_record:
        training_record = Training.query.filter_by(project_details_id=details_id).first()

    if training_record:
        training_folder_name = f"{training_record.exp_name or training_record.training_name or 'training'}_{training_record.id}"
        training_folder_name = training_folder_name.replace(' ', '_')
        training_file_id = training_record.id
    else:
        training_folder_name = str(details_id)
        training_file_id = details_id

    annotations_dir = os.path.join(output_base_path, project_name, training_folder_name, 'annotations')
    os.makedirs(annotations_dir, exist_ok=True)

    for split_name, coco_json in split_jsons.items():
        split_path = os.path.join(annotations_dir, f'coco_project_{training_file_id}_{split_name}.json')
        with open(split_path, 'w', encoding='utf-8') as f:
            json.dump(coco_json, f, indent=2)

    print(f'COCO JSON splits written to {annotations_dir} for training_id={training_file_id} (details_id={details_id})')

    # Also generate inference exp.py
    from services.generate_yolox_exp import generate_yolox_inference_exp

    # NOTE(review): this folder is named after the raw training_id, not
    # training_folder_name used for the annotations above - confirm this is intended.
    project_folder = os.path.join(output_base_path, project_name, str(training_id))
    os.makedirs(project_folder, exist_ok=True)

    inference_exp_path = os.path.join(project_folder, 'exp_infer.py')
    try:
        exp_content = generate_yolox_inference_exp(training_id)
        with open(inference_exp_path, 'w', encoding='utf-8') as f:
            f.write(exp_content)
        print(f'Inference exp.py written to {inference_exp_path}')
    except Exception as err:
        # Best effort: the COCO files are already written, so only report the failure
        print(f'Failed to generate inference exp.py: {err}')