Skip to content

Commit 8752d38

Browse files
committed
Fix race condition
There is a small race condition between acknowledge and recording the error in which the reporter may see the queue empty and no error recorded and wrongly thinks it's a success.
1 parent e954b24 commit 8752d38

File tree

5 files changed

+25
-24
lines changed

5 files changed

+25
-24
lines changed

redis/acknowledge.lua

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,18 @@
11
local zset_key = KEYS[1]
22
local processed_key = KEYS[2]
33
local owners_key = KEYS[3]
4+
local error_reports_key = KEYS[4]
45

56
local test = ARGV[1]
6-
7+
local error = ARGV[2]
8+
local ttl = ARGV[3]
79
redis.call('zrem', zset_key, test)
810
redis.call('hdel', owners_key, test) -- Doesn't matter if it was reclaimed by another workers
9-
return redis.call('sadd', processed_key, test)
11+
local acknowledged = redis.call('sadd', processed_key, test)
12+
13+
if acknowledged and error ~= "" then
14+
redis.call('hset', error_reports_key, test, error)
15+
redis.call('expire', error_reports_key, ttl)
16+
end
17+
18+
return acknowledged

ruby/lib/ci/queue/redis/base.rb

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,8 @@ def queue_initializing?
181181
master_status == 'setup'
182182
end
183183

184-
def increment_test_failed
185-
redis.incr(key('test_failed_count'))
184+
def increment_test_failed(pipeline: redis)
185+
pipeline.incr(key('test_failed_count'))
186186
end
187187

188188
def test_failed
@@ -226,7 +226,8 @@ def master_status
226226
end
227227

228228
def eval_script(script, *args)
229-
redis.evalsha(load_script(script), *args)
229+
pipeline = args.delete(:pipeline) || redis
230+
pipeline.evalsha(load_script(script), *args)
230231
end
231232

232233
def load_script(script)

ruby/lib/ci/queue/redis/build_record.rb

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -59,20 +59,10 @@ def record_warning(type, attributes)
5959
Test = Struct.new(:id) # Hack
6060

6161
def record_error(id, payload, stats: nil)
62-
# FIXME: the ack and hset should be atomic
63-
# otherwise the reporter may see the empty queue before the error
64-
# is appended and wrongly think it's a success.
65-
if @queue.acknowledge(id)
66-
redis.pipelined do |pipeline|
67-
pipeline.hset(
68-
key('error-reports'),
69-
id,
70-
payload,
71-
)
72-
pipeline.expire(key('error-reports'), config.redis_ttl)
73-
record_stats(stats, pipeline: pipeline)
74-
@queue.increment_test_failed
75-
end
62+
redis.pipelined do |pipeline|
63+
@queue.acknowledge(id, error: payload, pipeline: pipeline)
64+
record_stats(stats, pipeline: pipeline)
65+
@queue.increment_test_failed(pipeline: pipeline)
7666
end
7767
nil
7868
end

ruby/lib/ci/queue/redis/worker.rb

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,12 +97,13 @@ def report_worker_error(error)
9797
build.report_worker_error(error)
9898
end
9999

100-
def acknowledge(test_key)
100+
def acknowledge(test_key, error: nil, pipeline: redis)
101101
raise_on_mismatching_test(test_key)
102102
eval_script(
103103
:acknowledge,
104-
keys: [key('running'), key('processed'), key('owners')],
105-
argv: [test_key],
104+
keys: [key('running'), key('processed'), key('owners'), key('error-reports')],
105+
argv: [test_key, error.to_s, config.redis_ttl],
106+
pipeline: pipeline,
106107
) == 1
107108
end
108109

ruby/lib/ci/queue/static.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,12 @@ def exhausted?
103103
@queue.empty?
104104
end
105105

106-
def acknowledge(test)
106+
def acknowledge(*)
107107
@progress += 1
108108
true
109109
end
110110

111-
def increment_test_failed
111+
def increment_test_failed(*)
112112
@test_failed = test_failed + 1
113113
end
114114

0 commit comments

Comments
 (0)