Skip to content

Commit 0b8364e

Browse files
authored
Merge pull request #2940 from thaJeztah/19.03_backport_fix_leaking_task_db
[19.03 backport] Fix leaking tasks.db
2 parents 062b694 + 875d503 commit 0b8364e

File tree

4 files changed

+31
-15
lines changed

4 files changed

+31
-15
lines changed

agent/storage.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,9 @@ func PutTask(tx *bolt.Tx, task *api.Task) error {
131131

132132
// PutTaskStatus updates the status for the task with id.
133133
func PutTaskStatus(tx *bolt.Tx, id string, status *api.TaskStatus) error {
134-
return withCreateTaskBucketIfNotExists(tx, id, func(bkt *bolt.Bucket) error {
134+
// this used to be withCreateTaskBucketIfNotExists, but that could lead
135+
// to weird race conditions, and was not necessary.
136+
return withTaskBucket(tx, id, func(bkt *bolt.Bucket) error {
135137
p, err := proto.Marshal(status)
136138
if err != nil {
137139
return err

agent/storage_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,8 @@ func TestStoragePutGetStatusAssigned(t *testing.T) {
6666
// set task, status and assignment for all tasks.
6767
assert.NoError(t, db.Update(func(tx *bolt.Tx) error {
6868
for _, task := range tasks {
69-
assert.NoError(t, PutTaskStatus(tx, task.ID, &task.Status))
7069
assert.NoError(t, PutTask(tx, task))
70+
assert.NoError(t, PutTaskStatus(tx, task.ID, &task.Status))
7171
assert.NoError(t, SetTaskAssignment(tx, task.ID, true))
7272
}
7373

agent/worker.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -278,10 +278,15 @@ func reconcileTaskState(ctx context.Context, w *worker, assignments []*api.Assig
278278

279279
removeTaskAssignment := func(taskID string) error {
280280
ctx := log.WithLogger(ctx, log.G(ctx).WithField("task.id", taskID))
281-
if err := SetTaskAssignment(tx, taskID, false); err != nil {
282-
log.G(ctx).WithError(err).Error("error setting task assignment in database")
281+
// if a task is no longer assigned, then we do not have to keep track
282+
// of it. a task will only be unassigned when it is deleted on the
283+
// manager. instead of calling SetTaskAssignment with false, we'll just remove the
284+
// task now.
285+
if err := DeleteTask(tx, taskID); err != nil {
286+
log.G(ctx).WithError(err).Error("error removing de-assigned task")
287+
return err
283288
}
284-
return err
289+
return nil
285290
}
286291

287292
// If this was a complete set of assignments, we're going to remove all the remaining
@@ -500,6 +505,21 @@ func (w *worker) newTaskManager(ctx context.Context, tx *bolt.Tx, task *api.Task
500505
// updateTaskStatus reports statuses to listeners, read lock must be held.
501506
func (w *worker) updateTaskStatus(ctx context.Context, tx *bolt.Tx, taskID string, status *api.TaskStatus) error {
502507
if err := PutTaskStatus(tx, taskID, status); err != nil {
508+
// we shouldn't fail to put a task status. however, there exists the
509+
// possibility of a race in which we try to put a task status after the
510+
// task has been deleted. because this whole contraption is a careful
511+
// dance of too-tightly-coupled concurrent parts, fixing that race is
512+
// fraught with hazards. instead, we'll recognize that it can occur,
513+
// log the error, and then ignore it.
514+
if err == errTaskUnknown {
515+
// log at info level. debug logging in docker is already really
516+
// verbose, so many people disable it. the race that causes this
517+
// behavior should be very rare, but if it occurs, we should know
518+
// about it, because if there is some case where it is _not_ rare,
519+
// then knowing about it will go a long way toward debugging.
520+
log.G(ctx).Info("attempted to update status for a task that has been removed")
521+
return nil
522+
}
503523
log.G(ctx).WithError(err).Error("failed writing status to disk")
504524
return err
505525
}

agent/worker_test.go

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ func TestWorkerAssign(t *testing.T) {
143143
},
144144
},
145145
expectedTasks: []*api.Task{
146-
{ID: "task-1"},
147146
{ID: "task-2"},
148147
},
149148
expectedSecrets: []*api.Secret{
@@ -153,15 +152,14 @@ func TestWorkerAssign(t *testing.T) {
153152
{ID: "config-2"},
154153
},
155154
expectedAssigned: []*api.Task{
155+
// task-1 should be cleaned up and deleted.
156156
{ID: "task-2"},
157157
},
158158
},
159159
{
160160
// remove assigned tasks, secret and config no longer present
161-
expectedTasks: []*api.Task{
162-
{ID: "task-1"},
163-
{ID: "task-2"},
164-
},
161+
// there should be no tasks in the tasks db after this.
162+
expectedTasks: nil,
165163
},
166164

167165
// TODO(stevvooe): There are a few more states here we need to get
@@ -173,6 +171,7 @@ func TestWorkerAssign(t *testing.T) {
173171
tasks []*api.Task
174172
assigned []*api.Task
175173
)
174+
176175
assert.NoError(t, worker.db.View(func(tx *bolt.Tx) error {
177176
return WalkTasks(tx, func(task *api.Task) error {
178177
tasks = append(tasks, task)
@@ -491,7 +490,6 @@ func TestWorkerUpdate(t *testing.T) {
491490
},
492491
},
493492
expectedTasks: []*api.Task{
494-
{ID: "task-1"},
495493
{ID: "task-2"},
496494
},
497495
expectedSecrets: []*api.Secret{
@@ -556,10 +554,6 @@ func TestWorkerUpdate(t *testing.T) {
556554
Action: api.AssignmentChange_AssignmentActionRemove,
557555
},
558556
},
559-
expectedTasks: []*api.Task{
560-
{ID: "task-1"},
561-
{ID: "task-2"},
562-
},
563557
},
564558
} {
565559
assert.NoError(t, worker.Update(ctx, testcase.changeSet))

0 commit comments

Comments
 (0)