From 4e1eefad852f97c85c43f51cd721b893e2930857 Mon Sep 17 00:00:00 2001
From: AybH26 <58746253+AybH26@users.noreply.github.com>
Date: Tue, 16 Jun 2026 18:47:40 +0200
Subject: [PATCH] fix(scoring): re-enqueue scoring after commit to avoid stuck SCORING rows

When the compute worker PATCHes a submission to status=SCORING, the API
serializer used to call run_submission() synchronously inside the same
DB transaction. If the broker (RabbitMQ) was unreachable at that exact
moment, the status row would commit but the scoring task would never be
published, leaving the submission stuck in SCORING forever (no recovery:
the 24h cleanup only rescues RUNNING rows).

Move the enqueue into transaction.on_commit so the task is only
published after the SCORING status is durably committed, and explicitly
mark the submission as Failed (with a clear status_details) if the
publish still fails, so the row never stays in a non-terminal limbo
state.

Wrap update() in @transaction.atomic to make the commit boundary
explicit.
--- src/apps/api/serializers/submissions.py | 33 ++++++++++++++++++++----- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/src/apps/api/serializers/submissions.py b/src/apps/api/serializers/submissions.py index 9c91737ca..769ef0131 100644 --- a/src/apps/api/serializers/submissions.py +++ b/src/apps/api/serializers/submissions.py @@ -1,11 +1,15 @@ import asyncio +import logging from os.path import basename from django.core.cache import cache from django.core.exceptions import ValidationError +from django.db import transaction from rest_framework import serializers from rest_framework.exceptions import PermissionDenied +logger = logging.getLogger(__name__) + from api.mixins import DefaultUserCreateMixin from api.serializers import leaderboards from api.serializers.tasks import TaskSerializer @@ -163,6 +167,7 @@ def validate(self, attrs): return data + @transaction.atomic def update(self, submission, validated_data): # Cannot change submission if secret key is not valid @@ -198,13 +203,29 @@ def update(self, submission, validated_data): self.instance.parent.save() if validated_data.get("status") == Submission.SCORING: - # Start scoring because we're "SCORING" status now from compute worker + # Re-enqueue scoring AFTER the new status is committed: otherwise the + # site-worker may pick the task up before the row reflects SCORING, + # and a broker error here would leave the row stuck in SCORING forever. from competitions.tasks import run_submission - # task = validated_data.get('task_pk') - # if not task: - # raise ValidationError('Cannot update submission. 
Task pk was not provided') - # task = Task.objects.get(id=task) - run_submission(submission.pk, tasks=[submission.task], is_scoring=True) + submission_pk = submission.pk + scoring_task = submission.task + + def _enqueue_scoring(): + try: + run_submission(submission_pk, tasks=[scoring_task], is_scoring=True) + except Exception: + logger.exception( + "Failed to re-enqueue scoring for submission %s; marking Failed", + submission_pk, + ) + Submission.objects.filter( + pk=submission_pk, status=Submission.SCORING, + ).update( + status=Submission.FAILED, + status_details="Broker unavailable when re-enqueuing scoring task", + ) + + transaction.on_commit(_enqueue_scoring) elif validated_data.get("status") == Submission.FINISHED: # We finished submission, no longer need to store submission stuff in Redis, free it up!