que and training status
This commit is contained in:
@@ -73,7 +73,7 @@ def generate_yolox_json():
|
|||||||
|
|
||||||
@api_bp.route('/start-yolox-training', methods=['POST'])
|
@api_bp.route('/start-yolox-training', methods=['POST'])
|
||||||
def start_yolox_training():
|
def start_yolox_training():
|
||||||
"""Generate JSONs, exp.py, and start YOLOX training"""
|
"""Generate JSONs, exp.py, and add training to queue"""
|
||||||
try:
|
try:
|
||||||
data = request.get_json()
|
data = request.get_json()
|
||||||
project_id = data.get('project_id')
|
project_id = data.get('project_id')
|
||||||
@@ -113,8 +113,8 @@ def start_yolox_training():
|
|||||||
print(f'Generating exp.py at {exp_file_path}...')
|
print(f'Generating exp.py at {exp_file_path}...')
|
||||||
save_yolox_exp(training_id, exp_file_path)
|
save_yolox_exp(training_id, exp_file_path)
|
||||||
|
|
||||||
# Step 3: Start training
|
# Step 3: Build training command
|
||||||
print(f'Starting YOLOX training for training {training_id}...')
|
print(f'Preparing training command for training {training_id}...')
|
||||||
|
|
||||||
# Get YOLOX configuration from settings
|
# Get YOLOX configuration from settings
|
||||||
yolox_main_dir = get_setting('yolox_path', '/home/kitraining/Yolox/YOLOX-main')
|
yolox_main_dir = get_setting('yolox_path', '/home/kitraining/Yolox/YOLOX-main')
|
||||||
@@ -162,11 +162,12 @@ def start_yolox_training():
|
|||||||
|
|
||||||
print(f'Training command: {cmd}')
|
print(f'Training command: {cmd}')
|
||||||
|
|
||||||
# Start training in background
|
# Step 4: Add to training queue
|
||||||
subprocess.Popen(cmd, shell=True, cwd=yolox_main_dir)
|
from services.training_queue import training_queue
|
||||||
|
training_queue.add_to_queue(training_id, cmd, yolox_main_dir)
|
||||||
|
|
||||||
return jsonify({
|
return jsonify({
|
||||||
'message': f'JSONs and exp.py generated, training started for training {training_id}',
|
'message': f'Training {training_id} added to queue',
|
||||||
'exp_path': exp_file_path
|
'exp_path': exp_file_path
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -176,6 +177,18 @@ def start_yolox_training():
|
|||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
return jsonify({'message': 'Failed to start training', 'error': str(err)}), 500
|
return jsonify({'message': 'Failed to start training', 'error': str(err)}), 500
|
||||||
|
|
||||||
|
@api_bp.route('/training-status', methods=['GET'])
def get_training_status():
    """Return the current training queue status as JSON.

    On success the response body is whatever ``training_queue.get_status()``
    returns. If anything fails, the handler answers with HTTP 500 and an
    empty status payload (``{'current': None, 'queue': []}``).
    """
    try:
        # Imported lazily, as in start_yolox_training.
        from services.training_queue import training_queue

        return jsonify(training_queue.get_status())

    except Exception as err:
        # Local import keeps this handler self-contained.
        import traceback

        print(f'Error getting training status: {err}')
        # Print the full stack trace as the start-training handler does, so
        # failures here are diagnosable from the server log.
        traceback.print_exc()
        return jsonify({'current': None, 'queue': []}), 500
|
||||||
|
|
||||||
@api_bp.route('/training-log', methods=['GET'])
|
@api_bp.route('/training-log', methods=['GET'])
|
||||||
def training_log():
|
def training_log():
|
||||||
"""Get YOLOX training log"""
|
"""Get YOLOX training log"""
|
||||||
|
|||||||
180
backend/services/training_queue.py
Normal file
180
backend/services/training_queue.py
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
"""
|
||||||
|
Training Queue Manager
|
||||||
|
Manages a queue of training jobs and tracks their progress
|
||||||
|
"""
|
||||||
|
import threading
|
||||||
|
import queue
|
||||||
|
import subprocess
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
from services.settings_service import get_setting
|
||||||
|
from models.training import Training
|
||||||
|
|
||||||
|
class TrainingQueueManager:
    """Singleton that runs queued training commands one at a time.

    Jobs are plain dicts holding ``training_id``, ``command``, ``cwd``,
    ``iteration``, ``max_epoch`` and ``name``. A daemon worker thread pops
    jobs from ``self.queue``, runs each command in a shell and updates the
    job's progress from the process output.
    """

    _instance = None
    _lock = threading.Lock()

    # Epoch count used when the training record provides none.
    _DEFAULT_MAX_EPOCH = 300

    # Matches YOLOX progress output, e.g.
    # "2025-12-02 07:30:15 | INFO | yolox.core.trainer:78 - Epoch: [5/300]"
    _EPOCH_PATTERN = re.compile(r'Epoch:\s*\[(\d+)/(\d+)\]')

    def __new__(cls):
        # Check, lock, check again: only one instance is ever created.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # __init__ runs on every TrainingQueueManager() call; hold the class
        # lock so the state is set up (and the worker started) exactly once.
        with self._lock:
            if self._initialized:
                return

            self.queue = queue.Queue()
            self.current_training = None
            self.current_process = None
            self.worker_thread = None
            self.running = False
            self._initialized = True

            # Start the worker thread
            self.start_worker()

    def start_worker(self):
        """Start the background worker thread unless one is already alive."""
        if self.worker_thread is None or not self.worker_thread.is_alive():
            self.running = True
            self.worker_thread = threading.Thread(target=self._process_queue, daemon=True)
            self.worker_thread.start()

    def add_to_queue(self, training_id, command, cwd):
        """Add a training job to the queue.

        Args:
            training_id: Identifier of the training record.
            command: Shell command line that starts the training.
            cwd: Working directory the command is run in.
        """
        job = {
            'training_id': training_id,
            'command': command,
            'cwd': cwd,
            'iteration': 0,
            'max_epoch': self._DEFAULT_MAX_EPOCH,
            'name': f'Training {training_id}',
        }

        # Best effort: enrich the job from the training record. A failed
        # lookup must not prevent the job from being queued, so fall back to
        # the defaults above and report the problem instead of hiding it.
        try:
            training = Training.query.get(training_id)
            if training:
                job['max_epoch'] = training.max_epoch or self._DEFAULT_MAX_EPOCH
                job['name'] = training.exp_name or job['name']
        except Exception as err:
            print(f'Could not load training record {training_id}, using defaults: {err}')

        self.queue.put(job)
        print(f'Added training {training_id} to queue. Queue size: {self.queue.qsize()}')

    def _process_queue(self):
        """Worker loop: run queued jobs one after another while ``running``."""
        while self.running:
            try:
                # Block with a timeout so a cleared ``running`` flag is noticed.
                job = self.queue.get(timeout=1)
            except queue.Empty:
                continue

            try:
                print(f'Starting training {job["training_id"]} from queue')
                self.current_training = job
                self._run_training(job)
            except Exception as e:
                print(f'Error processing training job: {e}')
            finally:
                # Always balance the get() above and clear the "current"
                # state, even when the job failed.
                self.queue.task_done()
                self.current_training = None
                self.current_process = None

    @classmethod
    def _parse_progress(cls, line):
        """Return ``(current_epoch, total_epochs)`` parsed from *line*, or None."""
        match = cls._EPOCH_PATTERN.search(line)
        if match is None:
            return None
        return int(match.group(1)), int(match.group(2))

    def _run_training(self, job):
        """Run the job's command and track its progress from the output.

        stderr is merged into stdout; every output line is echoed and, when
        it carries an epoch marker, the job's ``iteration`` and ``max_epoch``
        are updated. Errors are reported via print and not raised.
        """
        try:
            process = subprocess.Popen(
                job['command'],
                shell=True,
                cwd=job['cwd'],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                bufsize=1
            )
            self.current_process = process

            # The context manager closes the pipe once the output is consumed.
            with process.stdout:
                for line in iter(process.stdout.readline, ''):
                    print(line.strip())

                    progress = self._parse_progress(line)
                    if progress:
                        current_epoch, total_epochs = progress
                        # ``job`` is the dict exposed as current_training by
                        # the worker, so get_status() sees these updates.
                        job['iteration'] = current_epoch
                        job['max_epoch'] = total_epochs
                        print(f'Progress: {current_epoch}/{total_epochs}')

            # Wait for completion
            process.wait()
            print(f'Training {job["training_id"]} completed with exit code {process.returncode}')

        except Exception as e:
            print(f'Error running training: {e}')

    def get_status(self):
        """Return the running job and the waiting jobs.

        Returns:
            dict: ``{'current': <dict or None>, 'queue': [<dict>, ...]}``.
            ``current`` has ``training_id``, ``name``, ``iteration`` and
            ``max_epoch``; each queue entry has ``training_id``, ``name``
            and ``max_epoch``.
        """
        # Copy the pending jobs under the queue's own lock rather than
        # popping and re-adding them: draining the live queue races with the
        # worker and with add_to_queue(), and every re-put would inflate the
        # queue's unfinished-task counter.
        with self.queue.mutex:
            pending = list(self.queue.queue)

        queue_items = [
            {
                'training_id': item['training_id'],
                'name': item.get('name', f'Training {item["training_id"]}'),
                'max_epoch': item.get('max_epoch', self._DEFAULT_MAX_EPOCH),
            }
            for item in pending
        ]

        result = {
            'current': None,
            'queue': queue_items
        }

        # Read the attribute once: the worker thread may reset it to None
        # between a truthiness check and the later lookups.
        current = self.current_training
        if current:
            result['current'] = {
                'training_id': current['training_id'],
                'name': current.get('name', f'Training {current["training_id"]}'),
                'iteration': current.get('iteration', 0),
                'max_epoch': current.get('max_epoch', self._DEFAULT_MAX_EPOCH)
            }

        return result

    def stop(self):
        """Ask the worker loop to exit and terminate the running process, if any."""
        self.running = False
        process = self.current_process
        if process:
            # Best effort: failing to terminate is reported, not raised.
            try:
                process.terminate()
            except Exception as err:
                print(f'Failed to terminate training process: {err}')
|
||||||
|
|
||||||
|
# Global instance
|
||||||
|
training_queue = TrainingQueueManager()
|
||||||
@@ -30,6 +30,14 @@
|
|||||||
<label id="project-title-label"
|
<label id="project-title-label"
|
||||||
style="display: block; text-align: left; font-weight: bold; font-size: x-large;">Project</label>
|
style="display: block; text-align: left; font-weight: bold; font-size: x-large;">Project</label>
|
||||||
<div class="button-row">
|
<div class="button-row">
|
||||||
|
<!-- Training Notification Bell -->
|
||||||
|
<button id="training-bell" onclick="toggleTrainingModal()" class="button" title="Training Status"
|
||||||
|
style="padding: 8px 16px; margin-right: 10px; position: relative; background: #999;">
|
||||||
|
🔔
|
||||||
|
<span id="bell-badge" style="display: none; position: absolute; top: -5px; right: -5px; background: #ff4d4f;
|
||||||
|
color: white; border-radius: 50%; width: 20px; height: 20px; font-size: 12px; line-height: 20px;
|
||||||
|
text-align: center; font-weight: bold;">0</span>
|
||||||
|
</button>
|
||||||
<button id="Add Training Project" onclick="window.location.href='/add-project.html'" class="button-red">Add
|
<button id="Add Training Project" onclick="window.location.href='/add-project.html'" class="button-red">Add
|
||||||
Training Project</button>
|
Training Project</button>
|
||||||
<button id="seed-db-btn" class="button">
|
<button id="seed-db-btn" class="button">
|
||||||
@@ -317,7 +325,133 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Training Status Modal -->
|
||||||
|
<div id="training-status-modal" class="modal" style="display: none;">
|
||||||
|
<div class="modal-content" style="max-width: 700px; max-height: 90vh; overflow-y: auto;">
|
||||||
|
<div class="modal-header">
|
||||||
|
<h2>Training Status</h2>
|
||||||
|
<button class="close-btn" onclick="toggleTrainingModal()">×</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="modal-body">
|
||||||
|
<!-- Current Training -->
|
||||||
|
<div class="settings-section" id="current-training-section" style="display: none;">
|
||||||
|
<h3 style="color: #009eac;">Current Training</h3>
|
||||||
|
<div id="current-training-info" style="background: #eaf7fa; padding: 16px; border-radius: 8px; margin-bottom: 16px;">
|
||||||
|
<!-- Populated by JS -->
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Queued Trainings -->
|
||||||
|
<div class="settings-section" id="queue-section" style="display: none;">
|
||||||
|
<h3 style="color: #666;">Queue (<span id="queue-count">0</span>)</h3>
|
||||||
|
<div id="queue-list" style="display: flex; flex-direction: column; gap: 12px;">
|
||||||
|
<!-- Populated by JS -->
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- No trainings message -->
|
||||||
|
<div id="no-trainings-msg" style="text-align: center; padding: 32px; color: #666;">
|
||||||
|
No trainings running or queued.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<script src="js/settings.js"></script>
|
<script src="js/settings.js"></script>
|
||||||
|
<script>
|
||||||
|
// Training status polling
|
||||||
|
let trainingStatusPoller = null;
|
||||||
|
|
||||||
|
// Show the training status modal if it is hidden, otherwise hide it.
function toggleTrainingModal() {
    const statusModal = document.getElementById('training-status-modal');
    const isHidden = statusModal.style.display === 'none';

    if (!isHidden) {
        statusModal.style.display = 'none';
        return;
    }

    statusModal.style.display = 'flex';
    // Refresh right away so the modal does not wait for the next poll.
    updateTrainingStatus();
}
|
||||||
|
|
||||||
|
// Fetch the training queue status and refresh the bell badge and the modal.
function updateTrainingStatus() {
    // Values from the API response are interpolated into innerHTML below, so
    // escape them to keep them from being interpreted as markup.
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    }[ch]));

    fetch('/api/training-status')
        .then(res => res.json())
        .then(data => {
            const bell = document.getElementById('training-bell');
            const badge = document.getElementById('bell-badge');
            const currentSection = document.getElementById('current-training-section');
            const queueSection = document.getElementById('queue-section');
            const noTrainingsMsg = document.getElementById('no-trainings-msg');

            // Tolerate a payload without a usable queue list.
            const current = data.current || null;
            const queued = Array.isArray(data.queue) ? data.queue : [];
            const totalCount = (current ? 1 : 0) + queued.length;

            // Update bell appearance
            if (totalCount > 0) {
                bell.style.background = '#009eac';
                badge.style.display = 'block';
                badge.textContent = totalCount;
            } else {
                bell.style.background = '#999';
                badge.style.display = 'none';
            }

            // Update modal content
            if (current) {
                currentSection.style.display = 'block';
                noTrainingsMsg.style.display = 'none';

                const iteration = Number(current.iteration) || 0;
                const maxEpoch = Number(current.max_epoch) || 0;
                // Avoid NaN/Infinity when max_epoch is 0 or missing, and keep
                // the progress bar width within 0-100%.
                const percentage = maxEpoch > 0
                    ? Math.min(100, Math.max(0, Math.round((iteration / maxEpoch) * 100)))
                    : 0;

                document.getElementById('current-training-info').innerHTML = `
                    <div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;">
                        <strong>${escapeHtml(current.name || 'Training')}</strong>
                        <span style="font-weight: bold; color: #009eac;">${percentage}%</span>
                    </div>
                    <div style="background: #ddd; border-radius: 4px; height: 24px; overflow: hidden; margin-bottom: 8px;">
                        <div style="background: #009eac; height: 100%; width: ${percentage}%; transition: width 0.3s;"></div>
                    </div>
                    <div style="font-size: 14px; color: #666;">
                        Epoch ${iteration} / ${maxEpoch}
                    </div>
                `;
            } else {
                currentSection.style.display = 'none';
            }

            if (queued.length > 0) {
                queueSection.style.display = 'block';
                noTrainingsMsg.style.display = 'none';
                document.getElementById('queue-count').textContent = queued.length;

                document.getElementById('queue-list').innerHTML = queued.map((t, idx) => `
                    <div style="background: #f5f5f5; padding: 12px; border-radius: 8px; border-left: 4px solid #009eac;">
                        <strong>#${idx + 1}: ${escapeHtml(t.name || 'Training')}</strong>
                        <div style="font-size: 13px; color: #666; margin-top: 4px;">
                            ${escapeHtml(t.max_epoch)} epochs • Waiting...
                        </div>
                    </div>
                `).join('');
            } else {
                queueSection.style.display = 'none';
            }

            if (totalCount === 0) {
                noTrainingsMsg.style.display = 'block';
            }
        })
        .catch(err => console.error('Failed to fetch training status:', err));
}
|
||||||
|
|
||||||
|
// Refresh once when the DOM is ready, then poll every 5 seconds
window.addEventListener('DOMContentLoaded', () => {
    updateTrainingStatus();
    trainingStatusPoller = setInterval(updateTrainingStatus, 5000);
});

// Stop polling when the page unloads
window.addEventListener('beforeunload', () => {
    if (trainingStatusPoller) {
        clearInterval(trainingStatusPoller);
    }
});
|
||||||
|
</script>
|
||||||
</body>
|
</body>
|
||||||
|
|
||||||
</html>
|
</html>
|
||||||
Reference in New Issue
Block a user