Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
033d2bc
Added initial static_reduction_map files.
jrhemstad Dec 16, 2020
fe606cd
Add template parameter for reduction binary op.
jrhemstad Jan 4, 2021
fd3b98f
Fix static_assert for ReductionOp::value_type.
jrhemstad Jan 4, 2021
a3678fb
CG reduction insert implementation.
jrhemstad Jan 5, 2021
5a65bf6
Cleanup of CG insert.
jrhemstad Jan 5, 2021
28e0995
Pass reduction op to device view ctors.
jrhemstad Jan 5, 2021
8dc64ee
Add pair ctor for constructing from two elements.
jrhemstad Jan 5, 2021
573bce2
Allow bulk insert kernel to work on iterators over tuples.
jrhemstad Jan 5, 2021
d9236e5
Add device decorator to reduction op definition.
jrhemstad Jan 5, 2021
89ed44e
Add get_op function to allow accessing the op from
jrhemstad Jan 5, 2021
e28db80
Make insert return a bool after all.
jrhemstad Jan 5, 2021
0eeac20
Use get_op in implementation.
jrhemstad Jan 5, 2021
fa31c81
Make insert return a bool.
jrhemstad Jan 5, 2021
ab81b2b
Correct insert to return if the key was the first key inserted.
jrhemstad Jan 5, 2021
46f9b73
First test verifying size passed.
jrhemstad Jan 5, 2021
8aebabb
Update CG insert logic.
jrhemstad Jan 6, 2021
9fb930e
Add more tests.
jrhemstad Jan 6, 2021
24261b2
Add test for inserting all unique keys.
jrhemstad Jan 7, 2021
e635e31
Use relaxed fetch_add.
jrhemstad Jan 7, 2021
d749445
Update the slot references each iteration.
jrhemstad Jan 7, 2021
ca9f7d6
Increase size of unique key test.
jrhemstad Jan 7, 2021
9eebd17
Make map size function of number of keys.
jrhemstad Jan 7, 2021
212b8f6
Add other agg ops.
jrhemstad Jan 7, 2021
cda527a
Add custom binary op.
jrhemstad Jan 7, 2021
7c1af0f
Return old value in custom op.
jrhemstad Jan 7, 2021
3f1b59d
reduction map benchmarks.
jrhemstad Apr 8, 2021
71a0122
Merge remote-tracking branch 'origin/dev' into reduction-map
jrhemstad May 13, 2021
2a38d70
Remove redundant ctor.
jrhemstad May 13, 2021
f2d1a26
Add initial static_reduction_map example.
jrhemstad May 13, 2021
3c79701
Remove cuda_memcmp header.
jrhemstad May 13, 2021
8261d93
Add unsafe accessors to raw slots via reinterpret_cast.
jrhemstad May 19, 2021
c6daa09
Add retrieve_all implementation.
jrhemstad May 19, 2021
62a99ab
Add retrieve_all to example.
jrhemstad May 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,16 @@ if("${GPU_ARCHS}" STREQUAL "")
evaluate_gpu_archs(GPU_ARCHS)
endif()

message("GPU_ARCHS = ${GPU_ARCHS}")

###################################################################################################
# - compiler function -----------------------------------------------------------------------------

function(ConfigureBench BENCH_NAME BENCH_SRC)
add_executable(${BENCH_NAME} "${BENCH_SRC}")
set_target_properties(${BENCH_NAME} PROPERTIES
POSITION_INDEPENDENT_CODE ON
CUDA_ARCHITECTURES ${GPU_ARCHS}
CUDA_ARCHITECTURES "${GPU_ARCHS}"
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gbenchmarks")
target_include_directories(${BENCH_NAME} PRIVATE
"${CMAKE_CURRENT_SOURCE_DIR}")
Expand All @@ -58,6 +60,9 @@ ConfigureBench(DYNAMIC_MAP_BENCH "${DYNAMIC_MAP_BENCH_SRC}")
# static_map benchmark: source path is held in a variable, then handed to ConfigureBench.
set(STATIC_MAP_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_map_bench.cu")
ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}")

###################################################################################################
# static_reduction_map benchmark: source path is passed inline rather than via a *_SRC variable.
ConfigureBench(STATIC_REDUCTION_MAP_BENCH "${CMAKE_CURRENT_SOURCE_DIR}/hash_table/static_reduction_map_bench.cu")

###################################################################################################
# reduce_by_key benchmark.
set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu")
ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}")
132 changes: 68 additions & 64 deletions benchmarks/hash_table/static_map_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,40 +15,38 @@
*/

#include <benchmark/benchmark.h>
#include "cuco/static_map.cuh"
#include <thrust/for_each.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <synchronization.hpp>
#include <thrust/for_each.h>
#include <fstream>
#include <iostream>
#include <random>
#include "cuco/static_map.cuh"

/**
 * @brief Selects how `generate_keys` fills the benchmark key range.
 */
enum class dist_type {
  UNIQUE,   ///< Key `i` is the index `i` itself, so every key is distinct.
  UNIFORM,  ///< Absolute value of the raw `std::mt19937` output cast to the key type.
  GAUSSIAN  ///< Absolute value of a sample from `std::normal_distribution<>{1e9, 1e7}`.
};

template<dist_type Dist, typename Key, typename OutputIt>
static void generate_keys(OutputIt output_begin, OutputIt output_end) {
template <dist_type Dist, typename Key, typename OutputIt>
static void generate_keys(OutputIt output_begin, OutputIt output_end)
{
auto num_keys = std::distance(output_begin, output_end);

std::random_device rd;
std::mt19937 gen{rd()};

switch(Dist) {
switch (Dist) {
case dist_type::UNIQUE:
for(auto i = 0; i < num_keys; ++i) {
for (auto i = 0; i < num_keys; ++i) {
output_begin[i] = i;
}
break;
case dist_type::UNIFORM:
for(auto i = 0; i < num_keys; ++i) {
for (auto i = 0; i < num_keys; ++i) {
output_begin[i] = std::abs(static_cast<Key>(gen()));
}
break;
case dist_type::GAUSSIAN:
std::normal_distribution<> dg{1e9, 1e7};
for(auto i = 0; i < num_keys; ++i) {
for (auto i = 0; i < num_keys; ++i) {
output_begin[i] = std::abs(static_cast<Key>(dg(gen)));
}
break;
Expand All @@ -59,141 +57,147 @@ static void generate_keys(OutputIt output_begin, OutputIt output_end) {
* @brief Generates input sizes and hash table occupancies
*
*/
static void generate_size_and_occupancy(benchmark::internal::Benchmark* b)
{
  // Number of keys: every power of two from 2^12 up to and including 2^28.
  constexpr int min_size = 4096;
  constexpr int max_size = 1 << 28;

  // Occupancy in percent. The bounds are currently equal, so exactly one occupancy (60%) is
  // registered per size; widen the range to sweep several occupancies in steps of 10.
  constexpr int min_occupancy  = 60;
  constexpr int max_occupancy  = 60;
  constexpr int occupancy_step = 10;

  for (auto size = min_size; size <= max_size; size *= 2) {
    for (auto occupancy = min_occupancy; occupancy <= max_occupancy; occupancy += occupancy_step) {
      // Read back by the benchmarks as state.range(0) (size) and state.range(1) (occupancy).
      b->Args({size, occupancy});
    }
  }
}



/**
 * @brief Benchmarks bulk insertion of key/value pairs into a freshly constructed
 * `cuco::static_map`.
 *
 * `state.range(0)` is the number of keys and `state.range(1)` the target occupancy in percent;
 * the map capacity is `num_keys / occupancy`. Every value is set equal to its key.
 *
 * A new map is constructed on every iteration, outside the `cuda_event_timer` scope; only the
 * `insert` call sits inside that scope.
 *
 * @tparam Key   Key type of the map
 * @tparam Value Mapped type of the map
 * @tparam Dist  Key distribution forwarded to `generate_keys`
 *
 * @param state Benchmark state supplying the arguments and receiving the results
 */
template <typename Key, typename Value, dist_type Dist>
static void BM_static_map_insert(::benchmark::State& state)
{
  using map_type = cuco::static_map<Key, Value>;

  std::size_t num_keys = state.range(0);
  float occupancy      = state.range(1) / float{100};
  std::size_t size     = num_keys / occupancy;

  std::vector<Key> h_keys(num_keys);
  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);

  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());

  // Index with std::size_t so the comparison against num_keys is not signed/unsigned.
  for (std::size_t i = 0; i < num_keys; ++i) {
    Key key           = h_keys[i];
    Value val         = h_keys[i];
    h_pairs[i].first  = key;
    h_pairs[i].second = val;
  }

  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);

  for (auto _ : state) {
    // -1 is passed for both the empty-key and empty-value arguments of the map constructor.
    map_type map{size, -1, -1};
    {
      // Scope limits the timer object to the insert call alone.
      cuda_event_timer raii{state};
      map.insert(d_pairs.begin(), d_pairs.end());
    }
  }

  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
                          int64_t(state.range(0)));
}



/**
 * @brief Benchmarks bulk lookup of every key previously inserted into a `cuco::static_map`.
 *
 * `state.range(0)` is the number of keys and `state.range(1)` the target occupancy in percent;
 * the map capacity is `num_keys / occupancy`. Every value is set equal to its key. The map is
 * populated once before the benchmark loop; each iteration then looks up all keys.
 *
 * @tparam Key   Key type of the map
 * @tparam Value Mapped type of the map
 * @tparam Dist  Key distribution forwarded to `generate_keys`
 *
 * @param state Benchmark state supplying the arguments and receiving the results
 */
template <typename Key, typename Value, dist_type Dist>
static void BM_static_map_search_all(::benchmark::State& state)
{
  using map_type = cuco::static_map<Key, Value>;

  std::size_t num_keys = state.range(0);
  float occupancy      = state.range(1) / float{100};
  std::size_t size     = num_keys / occupancy;

  // -1 is passed for both the empty-key and empty-value arguments of the map constructor.
  map_type map{size, -1, -1};

  std::vector<Key> h_keys(num_keys);
  std::vector<cuco::pair_type<Key, Value>> h_pairs(num_keys);

  generate_keys<Dist, Key>(h_keys.begin(), h_keys.end());

  // Index with std::size_t so the comparison against num_keys is not signed/unsigned.
  for (std::size_t i = 0; i < num_keys; ++i) {
    Key key           = h_keys[i];
    Value val         = h_keys[i];
    h_pairs[i].first  = key;
    h_pairs[i].second = val;
  }

  thrust::device_vector<Key> d_keys(h_keys);
  thrust::device_vector<Value> d_results(num_keys);
  thrust::device_vector<cuco::pair_type<Key, Value>> d_pairs(h_pairs);

  // Populate once, outside the benchmark loop.
  map.insert(d_pairs.begin(), d_pairs.end());

  for (auto _ : state) {
    // This benchmark is registered with UseManualTime(), so each iteration has to report its own
    // time. Use the same timer object as BM_static_map_insert; presumably it reports the elapsed
    // time to `state` when it goes out of scope.
    cuda_event_timer raii{state};
    map.find(d_keys.begin(), d_keys.end(), d_results.begin());
  }

  state.SetBytesProcessed((sizeof(Key) + sizeof(Value)) * int64_t(state.iterations()) *
                          int64_t(state.range(0)));
}



// Benchmark registrations.
//
// Insert and search-all are each registered for every dist_type and for two key/value widths.
// Every registration reports in milliseconds, uses manually reported iteration time
// (UseManualTime), and takes its (size, occupancy) argument pairs from
// generate_size_and_occupancy.

// --- 32-bit keys and values ---
BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_insert, int32_t, int32_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_search_all, int32_t, int32_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

// --- 64-bit keys and values ---
BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIQUE)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::UNIFORM)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_insert, int64_t, int64_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);

BENCHMARK_TEMPLATE(BM_static_map_search_all, int64_t, int64_t, dist_type::GAUSSIAN)
->Unit(benchmark::kMillisecond)
->UseManualTime()
->Apply(generate_size_and_occupancy);
Loading