Relaxed exchange appears stronger than on real hardware

I've been testing my data structures for correctness with relacy, and found an odd behavior where `atomic.exchange(..., rl::memory_order_relaxed)` appears to be stronger than expected. I then checked with real hardware (apple m1) and confirmed that it is indeed much weaker.

As far as I under the standard says this on the topic:

> Atomic read-modify-write operations shall always read the last value (in the modification order) written before the write associated with the read-modify-write operation.

And relacy always returns the last written value. However it seems the last relaxed store is always considered the last value, where on real hardware (at least apple m1) it might get reordered with other stores and really happen later in the order.

Here's a minimal example test where relacy passes. Note how all atomic operations are relaxed:

<details><summary>check_relaxed_lock.cpp</summary>
<p>

```c++
#include <relacy/relacy.hpp>

struct test_relaxed_lock : rl::test_suite<test_relaxed_lock, 2> {
    rl::atomic<int> lock{ 0 };
    rl::atomic<int> counter{ 0 };

    void do_lock() {
        int current = 0;
        while (!lock.compare_exchange_weak(current, 1, rl::memory_order_relaxed)) {
            // spin waiting for the lock
            rl::yield(1, $);
            current = 0;
        }
    }

    void do_unlock() {
        lock.store(0, rl::memory_order_relaxed);
    }

    void thread(unsigned /*thread_index*/) {
        do_lock();
        // When the modification order of counter is considered this exchange
        // is not guaranteed to be before a store in another thread, however
        // by the time we unlock relacy has already stored the new value as
        // the "last value" and exchange loads that instead of some older
        // value. Real hardware behaves in a more relaxed way and store is
        // not always flushed before the exchange.
        int prev = counter.exchange(0, rl::memory_order_relaxed);
        counter.store(prev + 1, rl::memory_order_relaxed);
        do_unlock();
    }

    void after() {
        RL_ASSERT(lock($) == 0);
        RL_ASSERT(counter($) == params::thread_count);
    }
};

int main() {
    rl::test_params params;
    params.search_type = rl::sched_full;
    rl::simulate<test_relaxed_lock>(params);
}
```

</p>
</details>

And here's an example test that runs on real hardware and quickly fails, because counters don't match:

<details><summary>realcheck_relaxed_lock.cpp</summary>
<p>

```c++
#include <atomic>
#include <chrono>
#include <thread>
#include <cstdio>

template<class base>
class padded : public base {
public:
    using base::base;

private:
    char padding[256 - sizeof(base)];
};

struct test_relaxed {
    static constexpr size_t thread_count = 8;
    static constexpr int iterations_per_thread = 1000000/thread_count;

    padded<std::atomic<int>> ready;
    padded<std::atomic<int>> lock;
    padded<std::atomic<int>> counter;

    void enter_barrier() {
        ++ready;
        while (ready.load() != thread_count) {
            // spin
        }
    }

    void do_lock() {
        int current = 0;
        while (!lock.compare_exchange_weak(current, 1, std::memory_order_relaxed)) {
            asm volatile ("yield");
            current = 0;
        }
    }

    void do_unlock() {
        lock.store(0, std::memory_order_relaxed);
    }

    void do_once() {
        do_lock();
        // When the modification order of counter is considered this exchange
        // is not guaranteed to be before a store in another thread, and by
        // the we unlock it might not happen yet on real hardware (apple m1).
        // So this test exchanges some stale value to 0 and then the final
        // counter does not match.
        int prev = counter.exchange(0, std::memory_order_relaxed);
        counter.store(prev + 1, std::memory_order_relaxed);
        do_unlock();
    }

    void thread_func(size_t thread_index) {
        enter_barrier();
        for (int i = 0; i < iterations_per_thread; ++i) {
            do_once();
        }
    }

    void run() {
        ready.store(0);
        lock.store(0);
        counter.store(0);

        std::vector<std::thread> threads;
        threads.reserve(thread_count);
        for (size_t i = 0; i < thread_count; ++i) {
            threads.emplace_back([this, i]{ thread_func(i); });
        }
        for (auto& thread : threads) {
            thread.join();
        }

        int count = counter.load(std::memory_order_relaxed);
        if (count != iterations_per_thread * thread_count) {
            printf(" got counter=%d (expected %d)...",
                count, iterations_per_thread * thread_count);
            fflush(stdout);
            assert(false && "Iteration count did not match");
        }
    }
};

int main() {
    auto* t = new test_relaxed;
    for (int i = 0; ; ++i) {
        printf("Running iteration %d...", i);
        fflush(stdout);
        auto start = std::chrono::steady_clock::now();
        t->run();
        auto end = std::chrono::steady_clock::now();
        auto elapsed = end - start;
        printf(" done in %dms\n",
            (int)std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count());
    }
    delete t;
    return 0;
}
```

</p>
</details> 

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Relaxed exchange appears stronger than on real hardware #16

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Relaxed exchange appears stronger than on real hardware #16

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions