From 4ce9c353f35f8e13a4f8722e0713ca02794163a1 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 14 May 2025 15:29:53 +0000 Subject: [PATCH 01/24] Begin work on coordinate-based intersection builder --- mdio/CMakeLists.txt | 23 +++ mdio/intersection.h | 163 ++++++++++++++++++++ mdio/intersection_test.cc | 317 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 503 insertions(+) create mode 100644 mdio/intersection.h create mode 100644 mdio/intersection_test.cc diff --git a/mdio/CMakeLists.txt b/mdio/CMakeLists.txt index 0d53797..f57e284 100644 --- a/mdio/CMakeLists.txt +++ b/mdio/CMakeLists.txt @@ -334,3 +334,26 @@ mdio_cc_test( tensorstore::util_status_testutil nlohmann_json_schema_validator ) + +mdio_cc_test( + NAME + intersection_test + SRCS + intersection_test.cc + COPTS + ${mdio_DEFAULT_COPTS} + LINKOPTS + ${mdio_DEFAULT_LINKOPTS} + DEPS + GTest::gmock_main + tensorstore::driver_array + tensorstore::driver_zarr + tensorstore::driver_json + tensorstore::kvstore_file + tensorstore::tensorstore + tensorstore::stack + tensorstore::index_space_dim_expression + tensorstore::index_space_index_transform + tensorstore::util_status_testutil + nlohmann_json_schema_validator +) diff --git a/mdio/intersection.h b/mdio/intersection.h new file mode 100644 index 0000000..355037d --- /dev/null +++ b/mdio/intersection.h @@ -0,0 +1,163 @@ +#ifndef MDIO_INTERSECTION_H +#define MDIO_INTERSECTION_H + +#include "mdio/variable.h" +#include "mdio/dataset.h" +#include "mdio/impl.h" + +namespace mdio { + +/// \brief Collects valid index selections per dimension for a Dataset without +/// performing slicing immediately. +/// +/// Only dimensions explicitly filtered via add_selection appear in the map; +/// any dimension not present should be treated as having its full index range. +class IndexSelection { +public: + /// Construct from an existing Dataset (captures its full domain). + explicit IndexSelection(const Dataset& dataset) + : dataset_(dataset), base_domain_(dataset.domain) {} + + /// Add or intersect selection of indices where the coordinate variable equals + /// descriptor.value. Returns a Future for async compatibility. + /// + /// For each dimension of the coordinate variable, records the index positions + /// where the value matches, then intersects with any prior selections on that + /// dimension. + template + mdio::Future add_selection(const ValueDescriptor& descriptor) { + using Index = mdio::Index; + using Interval = typename Variable::Interval; + + // Lookup coordinate variable by label + MDIO_ASSIGN_OR_RETURN(auto var, + dataset_.variables.get(std::string(descriptor.label.label()))); + + // Get per-dimension intervals + MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); + + // Read coordinate data + auto fut = var.Read(); + if (!fut.status().ok()) { + return fut.status(); + } + auto data = fut.value(); + + // Access flattened buffer + const T* data_ptr = data.get_data_accessor().data(); + Index offset = data.get_flattened_offset(); + Index n_samples = data.num_samples(); + std::vector current_pos = intervals; + + // Local map: dimension label -> matching indices + std::map> local_matched; + + // Scan all elements + for (Index idx = offset; idx < offset + n_samples; ++idx) { + if (data_ptr[idx] == descriptor.value) { + // On match, capture each dimension's index + for (const auto& pos : current_pos) { + std::string dim(pos.label.label()); + local_matched[dim].push_back(pos.inclusive_min); + } + } + _current_position_increment(current_pos, intervals); + } + + if (local_matched.empty()) { + return absl::NotFoundError( + "No matches for coordinate '" + std::string(descriptor.label.label()) + "'"); + } + + // Merge into global selections_ + for (auto& kv : local_matched) { + const std::string& dim = kv.first; + auto& vec = kv.second; + + // Deduplicate + std::sort(vec.begin(), vec.end()); + vec.erase(std::unique(vec.begin(), vec.end()), vec.end()); + + // Intersect or assign + auto it = selections_.find(dim); + if (it == selections_.end()) { + selections_.emplace(dim, std::move(vec)); + } else { + auto& existing = it->second; + std::vector intersected; + std::set_intersection( + existing.begin(), existing.end(), + vec.begin(), vec.end(), + std::back_inserter(intersected)); + existing = std::move(intersected); + } + } + + return absl::OkStatus(); + } + + /// \brief Build contiguous RangeDescriptors (stride=1) from selections. + /// + /// For each dimension in selections_, coalesces consecutive indices into + /// RangeDescriptor with step=1, emitting the minimal set covering all. + std::vector> range_descriptors() const { + using Index = mdio::Index; + std::vector> result; + + for (const auto& kv : selections_) { + const auto& dim = kv.first; + const auto& idxs = kv.second; + if (idxs.empty()) continue; + + Index start = idxs.front(); + Index prev = start; + + for (size_t i = 1; i < idxs.size(); ++i) { + if (idxs[i] == prev + 1) { + prev = idxs[i]; + } else { + // close out the current range [start, prev+1) + result.emplace_back(RangeDescriptor{dim, start, prev + 1, 1}); + // begin new range + start = idxs[i]; + prev = idxs[i]; + } + } + // flush last range + result.emplace_back(RangeDescriptor{dim, start, prev + 1, 1}); + } + + return result; + } + + /// \brief Get the current selection map: dimension name -> indices. + /// Dimensions not present imply full range. + const std::map>& selections() const { + return selections_; + } + +private: + const Dataset& dataset_; + tensorstore::IndexDomain<> base_domain_; + std::map> selections_; + + /// Advance a multidimensional odometer position by one step. + template + void _current_position_increment( + std::vector::Interval>& position, + const std::vector::Interval>& interval) const { + // Walk dimensions from fastest (last) to slowest (first) + for (std::size_t d = position.size(); d-- > 0; ) { + if (position[d].inclusive_min + 1 < interval[d].exclusive_max) { + ++position[d].inclusive_min; + return; + } + position[d].inclusive_min = interval[d].inclusive_min; + } + // Should never reach here + } +}; + +} // namespace mdio + +#endif // MDIO_INTERSECTION_H diff --git a/mdio/intersection_test.cc b/mdio/intersection_test.cc new file mode 100644 index 0000000..697ceee --- /dev/null +++ b/mdio/intersection_test.cc @@ -0,0 +1,317 @@ +// Copyright 2025 TGS + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mdio/dataset.h" +#include "mdio/intersection.h" + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "mdio/dataset_factory.h" +#include "tensorstore/driver/driver.h" +#include "tensorstore/driver/registry.h" +#include "tensorstore/index_space/dim_expression.h" +#include "tensorstore/index_space/index_domain_builder.h" +#include "tensorstore/kvstore/kvstore.h" +#include "tensorstore/kvstore/operations.h" +#include "tensorstore/open.h" +#include "tensorstore/tensorstore.h" +#include "tensorstore/util/future.h" +#include "tensorstore/util/status_testutil.h" + +// clang-format off +#include // NOLINT +// clang-format on + +namespace { + +mdio::Result SetupDataset() { + std::string ds_path = "generic_with_coords.mdio"; + std::string schema_str = R"( + { + "metadata": { + "name": "generic_with_coords", + "apiVersion": "1.0.0", + "createdOn": "2025-05-13T12:00:00.000000-05:00", + "attributes": { + "generic type" : true + } + }, + "variables": [ + { + "name": "DataVariable", + "dataType": "float32", + "dimensions": ["task", "trace", "sample"], + "coordinates": ["inline", "crossline", "live_mask"], + "metadata": { + "chunkGrid": { + "name": "regular", + "configuration": { "chunkShape": [1, 64, 128] } + } + } + }, + { + "name": "inline", + "dataType": "int32", + "dimensions": ["task", "trace"], + "coordinates": ["live_mask"], + "metadata": { + "chunkGrid": { + "name": "regular", + "configuration": { "chunkShape": [1, 64] } + } + } + }, + { + "name": "crossline", + "dataType": "int32", + "dimensions": ["task", "trace"], + "coordinates": ["live_mask"], + "metadata": { + "chunkGrid": { + "name": "regular", + "configuration": { "chunkShape": [1, 64] } + } + } + }, + { + "name": "live_mask", + "dataType": "bool", + "dimensions": ["task", "trace"], + "coordinates": ["inline", "crossline"], + "metadata": { + "chunkGrid": { + "name": "regular", + "configuration": { "chunkShape": [1, 64] } + } + } + }, + { + "name": "task", + "dataType": "uint32", + "dimensions": [{"name": "task", "size": 25}], + "metadata": { + "chunkGrid": { + "name": "regular", + "configuration": { "chunkShape": [1] } + } + } + }, + { + "name": "trace", + "dataType": "uint32", + "dimensions": [{"name": "trace", "size": 256}], + "metadata": { + "chunkGrid": { + "name": "regular", + "configuration": { "chunkShape": [64] } + } + } + }, + { + "name": "sample", + "dataType": "uint32", + "dimensions": [{"name": "sample", "size": 512}], + "metadata": { + "chunkGrid": { + "name": "regular", + "configuration": { "chunkShape": [128] } + } + } + } + ] + })"; + + auto schema = ::nlohmann::json::parse(schema_str); + auto dsFut = mdio::Dataset::from_json(schema, ds_path, mdio::constants::kCreate); + if (!dsFut.status().ok()) { + return ds_path; + } + auto ds = dsFut.value(); + + // Populate the dataset with data + MDIO_ASSIGN_OR_RETURN(auto dataVar, ds.variables.get("DataVariable")); + MDIO_ASSIGN_OR_RETURN(auto inlineVar, ds.variables.get("inline")); + MDIO_ASSIGN_OR_RETURN(auto crosslineVar, ds.variables.get("crossline")); + MDIO_ASSIGN_OR_RETURN(auto liveMaskVar, ds.variables.get("live_mask")); + + MDIO_ASSIGN_OR_RETURN(auto varData, mdio::from_variable(dataVar)); + MDIO_ASSIGN_OR_RETURN(auto inlineData, mdio::from_variable(inlineVar)); + MDIO_ASSIGN_OR_RETURN(auto crosslineData, mdio::from_variable(crosslineVar)); + MDIO_ASSIGN_OR_RETURN(auto liveMaskData, mdio::from_variable(liveMaskVar)); + + auto varDataPtr = varData.get_data_accessor().data(); + auto inlineDataPtr = inlineData.get_data_accessor().data(); + auto crosslineDataPtr = crosslineData.get_data_accessor().data(); + auto liveMaskDataPtr = liveMaskData.get_data_accessor().data(); + + auto varOffset = varData.get_flattened_offset(); + auto inlineOffset = inlineData.get_flattened_offset(); + auto crosslineOffset = crosslineData.get_flattened_offset(); + auto liveMaskOffset = liveMaskData.get_flattened_offset(); + + std::size_t coords = 0; + std::size_t var = 0; + + for (int i=0; i<15; ++i) { // Only 15 of the 25 tasks were "assigned" + for (int j=0; j<256; ++j) { + inlineDataPtr[coords + inlineOffset] = coords; + crosslineDataPtr[coords + crosslineOffset] = coords*4; + liveMaskDataPtr[coords + liveMaskOffset] = true; + for (int k=0; k<512; ++k) { + varDataPtr[var + varOffset] = i * 256 * 512 + j * 512 + k; + var++; + } + coords++; + } + } + + auto varDataFut = dataVar.Write(varData); + auto inlineDataFut = inlineVar.Write(inlineData); + auto crosslineDataFut = crosslineVar.Write(crosslineData); + auto liveMaskDataFut = liveMaskVar.Write(liveMaskData); + + if (!varDataFut.status().ok()) { + return varDataFut.status(); + } + if (!inlineDataFut.status().ok()) { + return inlineDataFut.status(); + } + if (!crosslineDataFut.status().ok()) { + return crosslineDataFut.status(); + } + if (!liveMaskDataFut.status().ok()) { + return liveMaskDataFut.status(); + } + + return ds_path; +} + +TEST(Intersection, SETUP) { + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); +} + +TEST(Intersection, constructor) { + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto path = pathResult.value(); + + auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); + ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); + auto ds = dsFut.value(); + + mdio::IndexSelection is(ds); +} + +TEST(Intersection, add_selection) { + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto path = pathResult.value(); + + auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); + ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); + auto ds = dsFut.value(); + + mdio::IndexSelection is(ds); + mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; + auto isFut = is.add_selection(liveMaskDesc); + ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. + + auto selections = is.selections(); + ASSERT_EQ(selections.size(), 2) << "Expected 2 dimensions in the selection map but got " << selections.size(); +} + +TEST(Intersection, range_descriptors) { + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto path = pathResult.value(); + + auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); + ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); + auto ds = dsFut.value(); + + mdio::IndexSelection is(ds); + mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; + auto isFut = is.add_selection(liveMaskDesc); + ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. + + auto rangeDescriptors = is.range_descriptors(); + + ASSERT_EQ(rangeDescriptors.size(), 2); + EXPECT_EQ(rangeDescriptors[0].label.label(), "task") << "Expected first RangeDescriptor to be for the 'task' dimension"; + EXPECT_EQ(rangeDescriptors[1].label.label(), "trace") << "Expected second RangeDescriptor to be for the 'trace' dimension"; + + EXPECT_EQ(rangeDescriptors[0].start, 0) << "Expected first RangeDescriptor to start at index 0"; + EXPECT_EQ(rangeDescriptors[0].stop, 15) << "Expected first RangeDescriptor to stop at index 15"; + EXPECT_EQ(rangeDescriptors[0].step, 1) << "Expected first RangeDescriptor to have a step of 1"; + + EXPECT_EQ(rangeDescriptors[1].start, 0) << "Expected second RangeDescriptor to start at index 0"; + EXPECT_EQ(rangeDescriptors[1].stop, 256) << "Expected second RangeDescriptor to stop at index 256"; + EXPECT_EQ(rangeDescriptors[1].step, 1) << "Expected second RangeDescriptor to have a step of 1"; +} + +TEST(Intersection, get_inline_range) { + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto path = pathResult.value(); + + auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); + ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); + auto ds = dsFut.value(); + + mdio::IndexSelection is(ds); + mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; + auto isFut = is.add_selection(liveMaskDesc); + ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. + isFut = is.add_selection(mdio::ValueDescriptor{"inline", 18}); + ASSERT_TRUE(isFut.status().ok()) << isFut.status(); + auto rangeDescriptors = is.range_descriptors(); + + for (const auto& desc : rangeDescriptors) { + std::cout << "Dimension: " << desc.label.label() << " Start: " << desc.start << " Stop: " << desc.stop << " Step: " << desc.step << std::endl; + } + +} + +TEST(Intersection, get_inline_range_dead) { + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto path = pathResult.value(); + + auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); + ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); + auto ds = dsFut.value(); + + mdio::IndexSelection is(ds); + mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; + auto isFut = is.add_selection(liveMaskDesc); + ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. + isFut = is.add_selection(mdio::ValueDescriptor{"inline", 5000}); + EXPECT_FALSE(isFut.status().ok()) << "Expected an error when adding a selection for an invalid inline index"; + + auto rangeDescriptors = is.range_descriptors(); + for (const auto& desc : rangeDescriptors) { + std::cout << "Dimension: " << desc.label.label() << " Start: " << desc.start << " Stop: " << desc.stop << " Step: " << desc.step << std::endl; + } +} + +} // namespace From f3f325f2fe307c22df77bd88b60e375dfe436ae2 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 14 May 2025 16:24:29 +0000 Subject: [PATCH 02/24] Enhance slicing logic and flexibility --- mdio/dataset.h | 122 +++++++++++++++++++++---- mdio/variable.h | 233 ++++++++++++++++++++++-------------------------- 2 files changed, 213 insertions(+), 142 deletions(-) diff --git a/mdio/dataset.h b/mdio/dataset.h index 815fb58..3b66a7d 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -609,6 +609,55 @@ class Dataset { return isel(slices[I]...); } + std::vector> merge_adjacent( + std::vector> descs) { + // 1) bucket by (label, step) using a string_view key + using Key = std::pair; + std::map>> buckets; + + for (auto &d : descs) { + buckets[{ d.label.label(), d.step }].push_back(d); + } + + std::vector> result; + result.reserve(descs.size()); + + // 2) sort & merge each bucket + for (auto &kv : buckets) { + auto &vec = kv.second; + std::sort(vec.begin(), vec.end(), + [](auto const &a, auto const &b) { + return a.start < b.start; + }); + + Index cur_start = vec[0].start; + Index cur_stop = vec[0].stop; + + for (size_t i = 1; i < vec.size(); ++i) { + if (vec[i].start <= cur_stop) { + cur_stop = std::max(cur_stop, vec[i].stop); + } else { + // copy the *original* descriptor (with a safe label) + auto run = vec[0]; + run.start = cur_start; + run.stop = cur_stop; + result.push_back(std::move(run)); + + cur_start = vec[i].start; + cur_stop = vec[i].stop; + } + } + + // emit the last run + auto run = vec[0]; + run.start = cur_start; + run.stop = cur_stop; + result.push_back(std::move(run)); + } + + return result; +} + // Wrapper function that generates the index sequence /** * @brief This version of isel is only expected to be used interally. @@ -623,24 +672,63 @@ class Dataset { return absl::InvalidArgumentError("No slices provided."); } - if (slices.size() > internal::kMaxNumSlices) { - return absl::InvalidArgumentError( - absl::StrCat("Too many slices provided or implicitly generated. " - "Maximum number of slices is ", - internal::kMaxNumSlices, " but ", slices.size(), - " were provided.\n\tUse -DMAX_NUM_SLICES cmake flag to " - "increase the maximum number of slices.")); + auto reducedSlices = merge_adjacent(slices); + + bool do_simple_slice = true; + + // Build a set of just the labels + std::set labels; + labels.insert(reducedSlices[0].label.label()); + for (auto i=1; i 0) { + do_simple_slice = false; + // break; // Don't break here, we can check all the labels and see if we are left with a single dimension. + } + labels.insert(reducedSlices[i].label.label()); } - std::vector> slicesCopy = slices; - for (int i = slices.size(); i <= internal::kMaxNumSlices; i++) { - slicesCopy.emplace_back( - RangeDescriptor({internal::kInertSliceKey, 0, 1, 1})); + // If we are left with a single dimension, we can just do a simple slice. + if (labels.size() == 1) { + do_simple_slice = true; + } + + // Pre-emptively split the RangeDescriptors if there are too many. + // This is not the final logic that we want. + if (reducedSlices.size() > internal::kMaxNumSlices) { + std::size_t halfElements = reducedSlices.size() / 2; + if (halfElements % 2 != 0) { + halfElements += 1; + } + std::vector> firstHalf(reducedSlices.begin(), reducedSlices.begin() + halfElements); + std::vector> secondHalf(reducedSlices.begin() + halfElements, reducedSlices.end()); + MDIO_ASSIGN_OR_RETURN(auto ds, isel(static_cast>&>(firstHalf))); + return ds.isel(static_cast>&>(secondHalf)); } - // Generate the index sequence and call the implementation - return call_isel_with_vector_impl( - slicesCopy, std::make_index_sequence{}); + if (do_simple_slice) { + std::vector> slicesCopy = reducedSlices; + for (int i = reducedSlices.size(); i <= internal::kMaxNumSlices; i++) { + slicesCopy.emplace_back( + RangeDescriptor({internal::kInertSliceKey, 0, 1, 1})); + } + + // Generate the index sequence and call the implementation + return call_isel_with_vector_impl( + slicesCopy, std::make_index_sequence{}); + } else { + std::vector> simpleSlices; + std::vector> complexSlices; + auto simpleLabel = reducedSlices[0].label.label(); + for (auto &slice : reducedSlices) { + if (slice.label.label() != simpleLabel) { + complexSlices.push_back(slice); + } else { + simpleSlices.push_back(slice); + } + } + MDIO_ASSIGN_OR_RETURN(auto ds, isel(static_cast>&>(simpleSlices))); + return ds.isel(static_cast>&>(complexSlices)); + } } /** @@ -849,7 +937,7 @@ class Dataset { // The map 'label_to_indices' is now populated with all the relevant // indices. You can now proceed with further processing based on this map. - return isel(slices); + return isel(static_cast>&>(slices)); } else if constexpr ((std::is_same_v>&>(slices)); } else { std::map> label_to_range; // pair.first = start, pair.second = stop @@ -976,7 +1064,7 @@ class Dataset { "No slices could be made from the given descriptors."); } - return isel(slices); + return isel(static_cast>&>(slices)); } return absl::OkStatus(); diff --git a/mdio/variable.h b/mdio/variable.h index 14cad32..2cf084d 100644 --- a/mdio/variable.h +++ b/mdio/variable.h @@ -1002,6 +1002,8 @@ class Variable { } if (slices.size() > internal::kMaxNumSlices) { + // We are expecting the only entry point for this method to be fro mthe Dataset::isel method. + // That method should handle the partitioning of the slices. return absl::InvalidArgumentError( absl::StrCat("Too many slices provided or implicitly generated. " "Maximum number of slices is ", @@ -1040,141 +1042,122 @@ class Variable { * @return An `mdio::Result` object containing the resulting sub-Variable. */ template - Result slice(const Descriptors&... descriptors) const { - constexpr size_t numDescriptors = sizeof...(descriptors); - - auto tuple_descs = std::make_tuple(descriptors...); - - std::vector labels; - labels.reserve(numDescriptors); - - std::vector start, stop, step; - start.reserve(numDescriptors); - stop.reserve(numDescriptors); - step.reserve(numDescriptors); - // -1 Everything is ok - // >=0 Error: Start is greater than or equal to stop - int8_t preconditionStatus = -1; +Result slice(const Descriptors&... descriptors) const { + // 1) Pack descriptors + constexpr size_t N = sizeof...(Descriptors); + std::array, N> descs = { descriptors... }; + + // 2) Clamp + precondition check + std::vector labels; + std::vector starts, stops, steps; + labels.reserve(N); + starts.reserve(N); + stops.reserve(N); + steps.reserve(N); + + int8_t bad_idx = -1; + for (size_t i = 0; i < N; ++i) { + auto d = sliceInRange(descs[i]); + if (d.start > d.stop) { + bad_idx = static_cast(i); + break; + } + if (this->hasLabel(d.label)) { + labels.push_back(d.label); + starts.push_back(d.start); + stops.push_back(d.stop); + steps.push_back(d.step); + } + } + if (bad_idx >= 0) { + auto& err = descs[bad_idx]; + return Result{absl::InvalidArgumentError( + std::string("Slice descriptor for ") + std::string(err.label.label()) + + " is invalid: start=" + std::to_string(err.start) + + " > stop=" + std::to_string(err.stop) + )}; + } - std::apply( - [&](const auto&... desc) { - size_t idx = 0; - (( - [&] { - auto clampedDesc = sliceInRange(desc); - if (clampedDesc.start > clampedDesc.stop) { - preconditionStatus = idx; - return 1; - } - if (this->hasLabel(clampedDesc.label)) { - labels.push_back(clampedDesc.label); - start.push_back(clampedDesc.start); - stop.push_back(clampedDesc.stop); - step.push_back(clampedDesc.step); - } - return 0; // Return a dummy value to satisfy the comma - // operator - }(), - idx++), - ...); - }, - tuple_descs); + // 3) Fast path: all labels (or axis indices) are unique + if (!labels.empty()) { + std::set labelSet; + std::set indexSet; + for (auto& lab : labels) { + labelSet.insert(lab.label()); + indexSet.insert(lab.index()); + } + if (labelSet.size() == labels.size() || + indexSet.size() == labels.size()) { + MDIO_ASSIGN_OR_RETURN( + auto slice_store, + store | tensorstore::Dims(labels) + .HalfOpenInterval(starts, stops, steps) + ); + return Variable{ + variableName, + longName, + metadata, + std::move(slice_store), + attributes + }; + } + } - if (preconditionStatus >= 0) { - mdio::RangeDescriptor err; - std::apply( - [&](const auto&... desc) { - size_t idx = 0; - (([&] { - if (idx == preconditionStatus) { - err = desc; - } - idx++; - }()), - ...); - }, - tuple_descs); - return absl::InvalidArgumentError( - std::string("Slice descriptor for ") + - std::string(err.label.label()) + - " had an illegal configuration.\n\tStart '" + - std::to_string(err.start) + "' greater than or equal to stop '" + - std::to_string(err.stop) + "'."); + // 4) Group by label to find any duplicates + std::map>> by_label; + for (auto& d : descs) { + if (d.label.label() != internal::kInertSliceKey) { + by_label[d.label.label()].push_back(d); } + } - auto labelSize = labels.size(); - if (labelSize) { - std::set labelSet; - std::set indexSet; - for (const auto& label : labels) { - labelSet.insert(label.label()); - indexSet.insert(label.index()); + // 5) Handle the first label that has >1 descriptor + for (auto& [label, vec] : by_label) { + if (vec.size() > 1) { + // 5a) Unwrap the Spec so we can ask for transform().input_labels() + MDIO_ASSIGN_OR_RETURN(auto spec, store.spec()); + auto spec_labels = spec.transform().input_labels(); + + // find the numeric axis for this label + auto it = std::find(spec_labels.begin(), + spec_labels.end(), + label); + if (it == spec_labels.end()) { + // no-op if the label isn't in the spec; skip it + continue; } - - if (labelSet.size() == labelSize || indexSet.size() == labelSize) { - MDIO_ASSIGN_OR_RETURN( - auto slice_store, - store | - tensorstore::Dims(labels).HalfOpenInterval(start, stop, step)); - // return a new variable with the sliced store - return Variable{variableName, longName, metadata, slice_store, - attributes}; - } else if (labelSet.size() != labelSize) { - // Concat the sliced Variable together if there are duplicate - // labels(dimensions) - std::vector fragments; - absl::Status trueStatus = absl::OkStatus(); - auto fragmentStore = [&](auto& descriptor) -> absl::Status { - if (descriptor.label.label() == internal::kInertSliceKey) { - // pass on kInertSliceKey - return absl::OkStatus(); - } - auto sliceRes = slice(descriptor); - if (!sliceRes.status().ok()) { - trueStatus = sliceRes.status(); - return trueStatus; - } - fragments.push_back(sliceRes.value()); - return absl::OkStatus(); - }; - - auto status = (fragmentStore(descriptors).ok() && ...); - if (!status) { - return trueStatus; - } - - if (!fragments.empty()) { - // Concat appears to only work with void types, so we'll strip the - // type away - tensorstore::TensorStore catStore; - // Initialize catStore with the first fragment's store - catStore = fragments.front().get_store(); - - // Concatenate remaining fragments - for (size_t i = 1; i < fragments.size(); ++i) { - MDIO_ASSIGN_OR_RETURN( - catStore, - tensorstore::Concat({catStore, fragments[i].get_store()}, - /*axis=*/0)); - } - // Recast to the original type - tensorstore::TensorStore typedCatStore = - tensorstore::TensorStore(tensorstore::unchecked, - catStore); - // Return a new Variable with the concatenated store - return Variable{variableName, longName, metadata, typedCatStore, - attributes}; - } - return absl::InternalError("No fragments to concatenate."); + int axis = static_cast(std::distance(spec_labels.begin(), it)); + + // 5b) Slice each sub‑range in isolation + std::vector> pieces; + pieces.reserve(vec.size()); + for (auto& r : vec) { + auto sub = slice(r); + if (!sub.status().ok()) return sub.status(); + pieces.push_back(sub.value().get_store()); } - return absl::InvalidArgumentError( - "Unexpected error occured while trying to slice the Variable."); + // 5c) Concatenate them along the correct axis + MDIO_ASSIGN_OR_RETURN( + auto cat_store, + tensorstore::Concat(pieces, axis) + ); + + return Variable{ + variableName, + longName, + metadata, + std::move(cat_store), + attributes + }; } - // the slice didn't change anything in the variables dimensions. - return *this; } + // 6) No descriptors matched → no change + return *this; +} + /** * @brief Retrieves the spec of the Variable as a JSON object. * @return A JSON object representing the spec of the Variable. From 535f7d1a378d615ffdcee3141328ab0adb2348ea Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 14 May 2025 20:11:26 +0000 Subject: [PATCH 03/24] Fix elements getting dropped from slice operation --- mdio/dataset.h | 138 ++++++++++++++----------------------------------- 1 file changed, 39 insertions(+), 99 deletions(-) diff --git a/mdio/dataset.h b/mdio/dataset.h index 3b66a7d..ab3fbc5 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -609,55 +609,6 @@ class Dataset { return isel(slices[I]...); } - std::vector> merge_adjacent( - std::vector> descs) { - // 1) bucket by (label, step) using a string_view key - using Key = std::pair; - std::map>> buckets; - - for (auto &d : descs) { - buckets[{ d.label.label(), d.step }].push_back(d); - } - - std::vector> result; - result.reserve(descs.size()); - - // 2) sort & merge each bucket - for (auto &kv : buckets) { - auto &vec = kv.second; - std::sort(vec.begin(), vec.end(), - [](auto const &a, auto const &b) { - return a.start < b.start; - }); - - Index cur_start = vec[0].start; - Index cur_stop = vec[0].stop; - - for (size_t i = 1; i < vec.size(); ++i) { - if (vec[i].start <= cur_stop) { - cur_stop = std::max(cur_stop, vec[i].stop); - } else { - // copy the *original* descriptor (with a safe label) - auto run = vec[0]; - run.start = cur_start; - run.stop = cur_stop; - result.push_back(std::move(run)); - - cur_start = vec[i].start; - cur_stop = vec[i].stop; - } - } - - // emit the last run - auto run = vec[0]; - run.start = cur_start; - run.stop = cur_stop; - result.push_back(std::move(run)); - } - - return result; -} - // Wrapper function that generates the index sequence /** * @brief This version of isel is only expected to be used interally. @@ -668,67 +619,56 @@ class Dataset { * number of descriptors. */ Result isel(const std::vector>& slices) { + // Validate input if (slices.empty()) { - return absl::InvalidArgumentError("No slices provided."); + return absl::InvalidArgumentError("No slices provided."); } - auto reducedSlices = merge_adjacent(slices); - - bool do_simple_slice = true; + // If we exceed the maximum descriptors, split into two calls + if (slices.size() > internal::kMaxNumSlices) { + std::size_t half = slices.size() / 2; + if (half % 2 != 0) { + ++half; + } - // Build a set of just the labels - std::set labels; - labels.insert(reducedSlices[0].label.label()); - for (auto i=1; i 0) { - do_simple_slice = false; - // break; // Don't break here, we can check all the labels and see if we are left with a single dimension. - } - labels.insert(reducedSlices[i].label.label()); - } + std::vector> firstHalf( + slices.begin(), slices.begin() + half); + std::vector> secondHalf( + slices.begin() + half, slices.end()); - // If we are left with a single dimension, we can just do a simple slice. - if (labels.size() == 1) { - do_simple_slice = true; + MDIO_ASSIGN_OR_RETURN(auto firstResult, isel(firstHalf)); + return firstResult.isel(secondHalf); } - // Pre-emptively split the RangeDescriptors if there are too many. - // This is not the final logic that we want. - if (reducedSlices.size() > internal::kMaxNumSlices) { - std::size_t halfElements = reducedSlices.size() / 2; - if (halfElements % 2 != 0) { - halfElements += 1; - } - std::vector> firstHalf(reducedSlices.begin(), reducedSlices.begin() + halfElements); - std::vector> secondHalf(reducedSlices.begin() + halfElements, reducedSlices.end()); - MDIO_ASSIGN_OR_RETURN(auto ds, isel(static_cast>&>(firstHalf))); - return ds.isel(static_cast>&>(secondHalf)); + // Group slices by their dimension label + std::map>> groups; + for (const auto& slice : slices) { + groups[slice.label.label()].push_back(slice); } - if (do_simple_slice) { - std::vector> slicesCopy = reducedSlices; - for (int i = reducedSlices.size(); i <= internal::kMaxNumSlices; i++) { - slicesCopy.emplace_back( - RangeDescriptor({internal::kInertSliceKey, 0, 1, 1})); - } - - // Generate the index sequence and call the implementation - return call_isel_with_vector_impl( - slicesCopy, std::make_index_sequence{}); - } else { - std::vector> simpleSlices; - std::vector> complexSlices; - auto simpleLabel = reducedSlices[0].label.label(); - for (auto &slice : reducedSlices) { - if (slice.label.label() != simpleLabel) { - complexSlices.push_back(slice); - } else { - simpleSlices.push_back(slice); + // Apply slices for each group sequentially + Dataset current = *this; + for (auto& [label, group] : groups) { + // Pad with inert slices up to kMaxNumSlices + std::vector> padded = group; + padded.reserve(internal::kMaxNumSlices); + for (std::size_t i = padded.size(); i < internal::kMaxNumSlices; ++i) { + padded.emplace_back(RangeDescriptor({ + internal::kInertSliceKey, 0, 1, 1 + })); } - } - MDIO_ASSIGN_OR_RETURN(auto ds, isel(static_cast>&>(simpleSlices))); - return ds.isel(static_cast>&>(complexSlices)); + + // Invoke the core vector-based slice for this dimension + MDIO_ASSIGN_OR_RETURN( + current, + current.call_isel_with_vector_impl( + padded, + std::make_index_sequence{} + ) + ); } + + return current; } /** From 2f6e23fa6ef9c0652c005894f8b71ae0d0e0ae6a Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Fri, 16 May 2025 14:53:32 +0000 Subject: [PATCH 04/24] Compilable version -- Runtime issues --- mdio/intersection.h | 92 +++++++++++++++++++++++++++++++++++++++++++++ mdio/mdio.h | 1 + mdio/variable.h | 3 ++ 3 files changed, 96 insertions(+) diff --git a/mdio/intersection.h b/mdio/intersection.h index 355037d..17f40a1 100644 --- a/mdio/intersection.h +++ b/mdio/intersection.h @@ -5,6 +5,15 @@ #include "mdio/dataset.h" #include "mdio/impl.h" +#include "tensorstore/index_space/index_transform.h" +#include "tensorstore/index_space/index_domain_builder.h" +#include "tensorstore/index_space/index_domain.h" +#include "tensorstore/index_space/dim_expression.h" +#include "tensorstore/array.h" +#include "tensorstore/util/span.h" +#include "tensorstore/index_space/index_transform_builder.h" +#include "tensorstore/box.h" + namespace mdio { /// \brief Collects valid index selections per dimension for a Dataset without @@ -136,6 +145,89 @@ class IndexSelection { return selections_; } + /* + Future SliceAndSort(std::vector to_sort_by) { + // Ensure that all the keys in to_sort_by are present in the Dataset. + // If not, return an error. + + // isel the dataset by `range_descriptors` + // Begin reading in the Variables and VariableData contained by to_sort_by. + + // Solicit the futures. + + // Perform the appropriate sorts. What I need to do for the sorting is map the actual values to the indices. + // That means that I should be able to sort by value AND have an index mapping which Tensorstore should be able to take as an IndexTransform. + // If it doesn't accept the full list for sorting, then we are in for sad times! + // I'm not sure what it means for this method if + } + */ + + Future SliceAndSort(std::string to_sort_by) { + auto non_const_ds = dataset_; + MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(static_cast>&>(range_descriptors()))); + MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(to_sort_by)); + auto fut = var.Read(); + MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); + auto currentPosition = intervals; + if (!fut.status().ok()) { + return fut.status(); + } + auto dataVar = fut.value(); + auto accessor = dataVar.get_data_accessor().data(); + auto numSamples = dataVar.num_samples(); + auto flattenedOffset = dataVar.get_flattened_offset(); + + std::vector::Interval>>> indexed_values; + indexed_values.reserve(numSamples); + for (mdio::Index i = 0; i < numSamples; ++i) { + indexed_values.emplace_back(accessor[i], currentPosition); + _current_position_increment(currentPosition, intervals); + } + std::stable_sort( + indexed_values.begin(), + indexed_values.end(), + [](const auto& a, const auto& b) { + return a.first < b.first; + } + ); + + std::vector trace_indices; + MDIO_ASSIGN_OR_RETURN(auto transformVar, ds.variables.at("depth_data")); + auto numTraceSamples = transformVar.num_samples(); + trace_indices.reserve(numTraceSamples); + for (mdio::Index i = 0; i < numTraceSamples; ++i) { + trace_indices.push_back(i); + } + + std::vector idx0; + std::vector idx1; + std::vector idx2; + for (const auto& [_coord, intervals] : indexed_values) { + idx0.push_back(intervals[0].inclusive_min); + idx1.push_back(intervals[1].inclusive_min); + idx2.push_back(intervals[2].inclusive_min); + } + + auto store = transformVar.get_mutable_store(); + auto input_domain = store.domain(); + auto rank = static_cast(input_domain.rank()); + using ::tensorstore::MakeArrayView; + + // Build an index transform here using array mapping for sorted dimensions + MDIO_ASSIGN_OR_RETURN(auto transform, + tensorstore::IndexTransformBuilder<>(rank, rank) + .input_domain(input_domain) + .output_index_array(0, 0, 1, ::tensorstore::MakeCopy(::tensorstore::MakeArrayView(idx0))) + .output_index_array(1, 0, 1, ::tensorstore::MakeCopy(::tensorstore::MakeArrayView(idx1))) + .output_index_array(2, 0, 1, ::tensorstore::MakeCopy(::tensorstore::MakeArrayView(idx2))) + .output_single_input_dimension(3, 3) + .Finalize()); + + MDIO_ASSIGN_OR_RETURN(auto new_store, store | transform); + transformVar.set_store(new_store); + return ds; + } + private: const Dataset& dataset_; tensorstore::IndexDomain<> base_domain_; diff --git a/mdio/mdio.h b/mdio/mdio.h index 1b3b9f3..716ae10 100644 --- a/mdio/mdio.h +++ b/mdio/mdio.h @@ -22,5 +22,6 @@ #define MDIO_MDIO_H_ #include "mdio/dataset.h" +#include "mdio/intersection.h" #endif // MDIO_MDIO_H_ diff --git a/mdio/variable.h b/mdio/variable.h index 2cf084d..2880b5a 100644 --- a/mdio/variable.h +++ b/mdio/variable.h @@ -1512,6 +1512,9 @@ Result slice(const Descriptors&... descriptors) const { */ const tensorstore::TensorStore& get_store() const { return store; } + tensorstore::TensorStore& get_mutable_store() { return store; } + void set_store(tensorstore::TensorStore& new_store) { store = new_store; } + // The data that should remain static, but MAY need to be updated. std::shared_ptr> attributes; From 74c334ea7221e72dd5ca81998aa7251e4d6bdadd Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 19 May 2025 13:35:33 +0000 Subject: [PATCH 05/24] Checkpoint --- mdio/dataset.h | 79 ++++++------- mdio/intersection.h | 283 ++++++++++++++++++++++++-------------------- 2 files changed, 188 insertions(+), 174 deletions(-) diff --git a/mdio/dataset.h b/mdio/dataset.h index ab3fbc5..c0c3f7d 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -619,58 +619,45 @@ class Dataset { * number of descriptors. */ Result isel(const std::vector>& slices) { - // Validate input - if (slices.empty()) { - return absl::InvalidArgumentError("No slices provided."); - } - - // If we exceed the maximum descriptors, split into two calls - if (slices.size() > internal::kMaxNumSlices) { - std::size_t half = slices.size() / 2; - if (half % 2 != 0) { - ++half; - } - - std::vector> firstHalf( - slices.begin(), slices.begin() + half); - std::vector> secondHalf( - slices.begin() + half, slices.end()); - - MDIO_ASSIGN_OR_RETURN(auto firstResult, isel(firstHalf)); - return firstResult.isel(secondHalf); - } + if (slices.empty()) { + return absl::InvalidArgumentError("No slices provided."); + } - // Group slices by their dimension label - std::map>> groups; - for (const auto& slice : slices) { - groups[slice.label.label()].push_back(slice); - } + // 1) Group descriptors by their dimension label + std::map>> groups; + for (auto& desc : slices) { + groups[desc.label.label()].push_back(desc); + } - // Apply slices for each group sequentially - Dataset current = *this; - for (auto& [label, group] : groups) { - // Pad with inert slices up to kMaxNumSlices - std::vector> padded = group; - padded.reserve(internal::kMaxNumSlices); - for (std::size_t i = padded.size(); i < internal::kMaxNumSlices; ++i) { - padded.emplace_back(RangeDescriptor({ - internal::kInertSliceKey, 0, 1, 1 - })); - } + // 2) Walk through each dimension-group and break it into kMax-sized windows + Dataset current = *this; + for (auto& [label, descs] : groups) { + for (size_t i = 0; i < descs.size(); i += internal::kMaxNumSlices) { + size_t end = std::min(i + internal::kMaxNumSlices, descs.size()); + std::vector> window( + descs.begin() + i, descs.begin() + end); + + // 3) Pad this window up to kMax (if your impl still needs padding) + window.reserve(internal::kMaxNumSlices); + for (size_t p = window.size(); p < internal::kMaxNumSlices; ++p) { + window.emplace_back(RangeDescriptor{ + internal::kInertSliceKey, 0, 1, 1 + }); + } - // Invoke the core vector-based slice for this dimension - MDIO_ASSIGN_OR_RETURN( - current, - current.call_isel_with_vector_impl( - padded, - std::make_index_sequence{} - ) - ); + MDIO_ASSIGN_OR_RETURN( + current, + current.call_isel_with_vector_impl( + window, + std::make_index_sequence{} + ) + ); } - - return current; } + return current; +} + /** * @brief Internal use only. * Converts the `sel` descriptors to their `isel` equivalents. diff --git a/mdio/intersection.h b/mdio/intersection.h index 17f40a1..279a745 100644 --- a/mdio/intersection.h +++ b/mdio/intersection.h @@ -16,27 +16,25 @@ namespace mdio { + /// \brief Collects valid index selections per dimension for a Dataset without /// performing slicing immediately. /// /// Only dimensions explicitly filtered via add_selection appear in the map; /// any dimension not present should be treated as having its full index range. class IndexSelection { +struct IndexedInterval; // Forward declaration public: /// Construct from an existing Dataset (captures its full domain). explicit IndexSelection(const Dataset& dataset) : dataset_(dataset), base_domain_(dataset.domain) {} - /// Add or intersect selection of indices where the coordinate variable equals - /// descriptor.value. Returns a Future for async compatibility. - /// - /// For each dimension of the coordinate variable, records the index positions - /// where the value matches, then intersects with any prior selections on that - /// dimension. + /// Add or intersect selection of indices where the coordinate variable equals descriptor.value, + /// preserving full N-D tuple of IndexedInterval metadata. template mdio::Future add_selection(const ValueDescriptor& descriptor) { using Index = mdio::Index; - using Interval = typename Variable::Interval; + using VarInterval = typename Variable::Interval; // Lookup coordinate variable by label MDIO_ASSIGN_OR_RETURN(auto var, @@ -47,103 +45,124 @@ class IndexSelection { // Read coordinate data auto fut = var.Read(); - if (!fut.status().ok()) { - return fut.status(); - } + if (!fut.status().ok()) return fut.status(); auto data = fut.value(); - // Access flattened buffer const T* data_ptr = data.get_data_accessor().data(); Index offset = data.get_flattened_offset(); Index n_samples = data.num_samples(); - std::vector current_pos = intervals; + auto current_pos = intervals; - // Local map: dimension label -> matching indices - std::map> local_matched; + // Collect all matching N-D IndexedInterval tuples in this pass + std::vector> local_tuples; + local_tuples.reserve(n_samples); - // Scan all elements for (Index idx = offset; idx < offset + n_samples; ++idx) { if (data_ptr[idx] == descriptor.value) { - // On match, capture each dimension's index - for (const auto& pos : current_pos) { - std::string dim(pos.label.label()); - local_matched[dim].push_back(pos.inclusive_min); + std::vector tuple; + tuple.reserve(current_pos.size()); + for (auto const& pos : current_pos) { + IndexedInterval iv; + iv.label = pos.label; + iv.inclusive_min = pos.inclusive_min; + tuple.push_back(iv); } + local_tuples.push_back(std::move(tuple)); } _current_position_increment(current_pos, intervals); } - if (local_matched.empty()) { - return absl::NotFoundError( - "No matches for coordinate '" + std::string(descriptor.label.label()) + "'"); + if (local_tuples.empty()) { + std::stringstream ss; + ss << "No matches for coordinate '" << descriptor.label.label() << "'"; + return absl::NotFoundError(ss.str()); } - // Merge into global selections_ - for (auto& kv : local_matched) { - const std::string& dim = kv.first; - auto& vec = kv.second; - - // Deduplicate - std::sort(vec.begin(), vec.end()); - vec.erase(std::unique(vec.begin(), vec.end()), vec.end()); - - // Intersect or assign - auto it = selections_.find(dim); - if (it == selections_.end()) { - selections_.emplace(dim, std::move(vec)); - } else { - auto& existing = it->second; - std::vector intersected; - std::set_intersection( - existing.begin(), existing.end(), - vec.begin(), vec.end(), - std::back_inserter(intersected)); - existing = std::move(intersected); + // Comparator for IndexedInterval tuples: lexicographic by (label, inclusive_min) + auto tuple_cmp = [](auto const& a, auto const& b) { + size_t na = a.size(), nb = b.size(); + for (size_t i = 0; i < na && i < nb; ++i) { + const auto& ai = a[i]; + const auto& bi = b[i]; + auto al = ai.label.label(); + auto bl = bi.label.label(); + if (al != bl) return al < bl; + if (ai.inclusive_min != bi.inclusive_min) return ai.inclusive_min < bi.inclusive_min; } + return na < nb; + }; + + if (kept_tuples_.empty()) { + // First descriptor seeds the full tuple set + kept_tuples_ = std::move(local_tuples); + } else { + std::sort(kept_tuples_.begin(), kept_tuples_.end(), tuple_cmp); + std::sort(local_tuples.begin(), local_tuples.end(), tuple_cmp); + std::vector> new_kept; + std::set_intersection( + kept_tuples_.begin(), kept_tuples_.end(), + local_tuples.begin(), local_tuples.end(), + std::back_inserter(new_kept), + tuple_cmp); + kept_tuples_.swap(new_kept); } return absl::OkStatus(); } - /// \brief Build contiguous RangeDescriptors (stride=1) from selections. - /// - /// For each dimension in selections_, coalesces consecutive indices into - /// RangeDescriptor with step=1, emitting the minimal set covering all. + /// \brief Emit a RangeDescriptor per surviving tuple coordinate, without coalescing. std::vector> range_descriptors() const { using Index = mdio::Index; std::vector> result; + if (kept_tuples_.empty()) return result; + + size_t rank = kept_tuples_[0].size(); + // For each dimension from fastest (last) to slowest (0) + for (int d = static_cast(rank) - 1; d >= 0; --d) { + // Map context of higher dims -> list of positions in dim d + std::map, std::vector> context_map; + for (auto const& tup : kept_tuples_) { + std::vector context; + context.reserve(rank - d - 1); + for (size_t k = d + 1; k < rank; ++k) + context.push_back(tup[k].inclusive_min); + context_map[context].push_back(tup[d].inclusive_min); + } - for (const auto& kv : selections_) { - const auto& dim = kv.first; - const auto& idxs = kv.second; - if (idxs.empty()) continue; - - Index start = idxs.front(); - Index prev = start; - - for (size_t i = 1; i < idxs.size(); ++i) { - if (idxs[i] == prev + 1) { - prev = idxs[i]; - } else { - // close out the current range [start, prev+1) - result.emplace_back(RangeDescriptor{dim, start, prev + 1, 1}); - // begin new range - start = idxs[i]; - prev = idxs[i]; + // For each context, coalesce runs and emit RangeDescriptors + for (auto& [ctx, coords] : context_map) { + std::sort(coords.begin(), coords.end()); + coords.erase(std::unique(coords.begin(), coords.end()), coords.end()); + if (coords.empty()) continue; + Index start = coords.front(); + Index prev = start; + for (size_t i = 1; i < coords.size(); ++i) { + if (coords[i] == prev + 1) { + prev = coords[i]; + } else { + result.emplace_back(RangeDescriptor{ + tup_label(d), start, prev + 1, 1}); + start = coords[i]; prev = start; + } } + result.emplace_back(RangeDescriptor{ + tup_label(d), start, prev + 1, 1}); } - // flush last range - result.emplace_back(RangeDescriptor{dim, start, prev + 1, 1}); } - return result; } + // Helper to get dimension label by position d + std::string_view tup_label(size_t d) const { + // All tuples have same order of labels + return kept_tuples_[0][d].label.label(); + } + /// \brief Get the current selection map: dimension name -> indices. /// Dimensions not present imply full range. - const std::map>& selections() const { - return selections_; - } + // const std::map>& selections() const { + // return selections_; + // } /* Future SliceAndSort(std::vector to_sort_by) { @@ -162,83 +181,92 @@ class IndexSelection { } */ - Future SliceAndSort(std::string to_sort_by) { + /// \brief Slice the dataset by current selections and then stable-sort by one coordinate + Future SliceAndSort(const std::string& sort_key) { + // 1. Apply current selections auto non_const_ds = dataset_; - MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(static_cast>&>(range_descriptors()))); - MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(to_sort_by)); - auto fut = var.Read(); + const auto& descs = range_descriptors(); + std::cout << "Number of range descriptors: " << descs.size() << std::endl; + MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(descs)); + + // 2. Read the sort variable + MDIO_ASSIGN_OR_RETURN( + auto var, ds.variables.get(sort_key)); MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); - auto currentPosition = intervals; - if (!fut.status().ok()) { - return fut.status(); - } + auto fut = var.Read(); + if (!fut.status().ok()) return fut.status(); auto dataVar = fut.value(); - auto accessor = dataVar.get_data_accessor().data(); - auto numSamples = dataVar.num_samples(); - auto flattenedOffset = dataVar.get_flattened_offset(); - - std::vector::Interval>>> indexed_values; - indexed_values.reserve(numSamples); - for (mdio::Index i = 0; i < numSamples; ++i) { - indexed_values.emplace_back(accessor[i], currentPosition); - _current_position_increment(currentPosition, intervals); - } - std::stable_sort( - indexed_values.begin(), - indexed_values.end(), - [](const auto& a, const auto& b) { - return a.first < b.first; + + // 3. Gather values and corresponding tuples + std::vector::Interval> pos = intervals; + const int32_t* acc = dataVar.get_data_accessor().data(); + mdio::Index offset = dataVar.get_flattened_offset(); + mdio::Index n = dataVar.num_samples(); + + std::vector>> indexed; + indexed.reserve(n); + for (mdio::Index i = 0; i < n; ++i) { + // build tuple of IndexedInterval + std::vector tup; + tup.reserve(pos.size()); + for (auto const& iv : pos) { + IndexedInterval tmp{iv.label, iv.inclusive_min}; + tup.push_back(tmp); } - ); - - std::vector trace_indices; - MDIO_ASSIGN_OR_RETURN(auto transformVar, ds.variables.at("depth_data")); - auto numTraceSamples = transformVar.num_samples(); - trace_indices.reserve(numTraceSamples); - for (mdio::Index i = 0; i < numTraceSamples; ++i) { - trace_indices.push_back(i); + indexed.emplace_back(acc[i + offset], tup); + _current_position_increment(pos, intervals); } - std::vector idx0; - std::vector idx1; - std::vector idx2; - for (const auto& [_coord, intervals] : indexed_values) { - idx0.push_back(intervals[0].inclusive_min); - idx1.push_back(intervals[1].inclusive_min); - idx2.push_back(intervals[2].inclusive_min); + // 4. Stable sort by value + std::stable_sort( + indexed.begin(), indexed.end(), + [](auto const& a, auto const& b){ return a.first < b.first; }); + + // 5. Extract new tuples into kept_tuples_ + kept_tuples_.clear(); + kept_tuples_.reserve(indexed.size()); + for (auto& pr : indexed) { + kept_tuples_.push_back(std::move(pr.second)); } - auto store = transformVar.get_mutable_store(); - auto input_domain = store.domain(); - auto rank = static_cast(input_domain.rank()); - using ::tensorstore::MakeArrayView; - - // Build an index transform here using array mapping for sorted dimensions - MDIO_ASSIGN_OR_RETURN(auto transform, - tensorstore::IndexTransformBuilder<>(rank, rank) - .input_domain(input_domain) - .output_index_array(0, 0, 1, ::tensorstore::MakeCopy(::tensorstore::MakeArrayView(idx0))) - .output_index_array(1, 0, 1, ::tensorstore::MakeCopy(::tensorstore::MakeArrayView(idx1))) - .output_index_array(2, 0, 1, ::tensorstore::MakeCopy(::tensorstore::MakeArrayView(idx2))) - .output_single_input_dimension(3, 3) - .Finalize()); - - MDIO_ASSIGN_OR_RETURN(auto new_store, store | transform); - transformVar.set_store(new_store); return ds; } + /// \brief Access the surviving N-D IndexedInterval tuples directly. + const std::vector>& tuples() const { + return kept_tuples_; + } + private: const Dataset& dataset_; tensorstore::IndexDomain<> base_domain_; - std::map> selections_; + std::vector> kept_tuples_; + + /// A minimal struct carrying label and position + struct IndexedInterval { + mdio::DimensionIdentifier label; + mdio::Index inclusive_min; + IndexedInterval() = default; + IndexedInterval(mdio::DimensionIdentifier l, mdio::Index i) + : label(l), inclusive_min(i) {} + + bool operator<(IndexedInterval const& o) const { + auto a = label.label(); + auto b = o.label.label(); + if (a != b) return a < b; + return inclusive_min < o.inclusive_min; + } + bool operator==(IndexedInterval const& o) const { + return label.label() == o.label.label() && + inclusive_min == o.inclusive_min; + } + }; /// Advance a multidimensional odometer position by one step. template void _current_position_increment( std::vector::Interval>& position, const std::vector::Interval>& interval) const { - // Walk dimensions from fastest (last) to slowest (first) for (std::size_t d = position.size(); d-- > 0; ) { if (position[d].inclusive_min + 1 < interval[d].exclusive_max) { ++position[d].inclusive_min; @@ -246,7 +274,6 @@ class IndexSelection { } position[d].inclusive_min = interval[d].inclusive_min; } - // Should never reach here } }; From 83ea50a3f591fa29ed39652f054218dfe57641c1 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 20 May 2025 13:05:04 +0000 Subject: [PATCH 06/24] Working one-slice-one-sort --- mdio/intersection.h | 348 ++++++++++++++++++++------------------------ 1 file changed, 161 insertions(+), 187 deletions(-) diff --git a/mdio/intersection.h b/mdio/intersection.h index 279a745..2e0978d 100644 --- a/mdio/intersection.h +++ b/mdio/intersection.h @@ -23,88 +23,106 @@ namespace mdio { /// Only dimensions explicitly filtered via add_selection appear in the map; /// any dimension not present should be treated as having its full index range. class IndexSelection { -struct IndexedInterval; // Forward declaration public: /// Construct from an existing Dataset (captures its full domain). explicit IndexSelection(const Dataset& dataset) : dataset_(dataset), base_domain_(dataset.domain) {} - /// Add or intersect selection of indices where the coordinate variable equals descriptor.value, - /// preserving full N-D tuple of IndexedInterval metadata. + template mdio::Future add_selection(const ValueDescriptor& descriptor) { - using Index = mdio::Index; - using VarInterval = typename Variable::Interval; - - // Lookup coordinate variable by label - MDIO_ASSIGN_OR_RETURN(auto var, - dataset_.variables.get(std::string(descriptor.label.label()))); + using Interval = typename Variable::Interval; - // Get per-dimension intervals - MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); - - // Read coordinate data + MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.get(std::string(descriptor.label.label()))); auto fut = var.Read(); + MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); if (!fut.status().ok()) return fut.status(); - auto data = fut.value(); - const T* data_ptr = data.get_data_accessor().data(); - Index offset = data.get_flattened_offset(); - Index n_samples = data.num_samples(); - auto current_pos = intervals; + auto data = fut.value(); + const T* data_ptr = data.get_data_accessor().data(); + Index offset = data.get_flattened_offset(); + Index n_samples = data.num_samples(); - // Collect all matching N-D IndexedInterval tuples in this pass - std::vector> local_tuples; - local_tuples.reserve(n_samples); + auto current_pos = intervals; + bool isInRun = false; + std::vector> local_runs; - for (Index idx = offset; idx < offset + n_samples; ++idx) { + for (mdio::Index idx = offset; idx < offset + n_samples; ++idx) { if (data_ptr[idx] == descriptor.value) { - std::vector tuple; - tuple.reserve(current_pos.size()); - for (auto const& pos : current_pos) { - IndexedInterval iv; - iv.label = pos.label; - iv.inclusive_min = pos.inclusive_min; - tuple.push_back(iv); - } - local_tuples.push_back(std::move(tuple)); + if (!isInRun) { + isInRun = true; + std::vector run = current_pos; + for (auto& pos : run) { + pos.exclusive_max = pos.inclusive_min + 1; + } + local_runs.push_back(std::move(run)); + } else { + auto& run = local_runs.back(); + for (auto i=0; i(current_pos, intervals); } - if (local_tuples.empty()) { + if (local_runs.empty()) { std::stringstream ss; ss << "No matches for coordinate '" << descriptor.label.label() << "'"; return absl::NotFoundError(ss.str()); } - // Comparator for IndexedInterval tuples: lexicographic by (label, inclusive_min) - auto tuple_cmp = [](auto const& a, auto const& b) { - size_t na = a.size(), nb = b.size(); - for (size_t i = 0; i < na && i < nb; ++i) { - const auto& ai = a[i]; - const auto& bi = b[i]; - auto al = ai.label.label(); - auto bl = bi.label.label(); - if (al != bl) return al < bl; - if (ai.inclusive_min != bi.inclusive_min) return ai.inclusive_min < bi.inclusive_min; - } - return na < nb; - }; + auto new_runs = _from_intervals(local_runs); - if (kept_tuples_.empty()) { - // First descriptor seeds the full tuple set - kept_tuples_ = std::move(local_tuples); + // First time calling add_selection_2 + if (kept_runs_.empty()) { + kept_runs_ = std::move(new_runs); } else { - std::sort(kept_tuples_.begin(), kept_tuples_.end(), tuple_cmp); - std::sort(local_tuples.begin(), local_tuples.end(), tuple_cmp); - std::vector> new_kept; - std::set_intersection( - kept_tuples_.begin(), kept_tuples_.end(), - local_tuples.begin(), local_tuples.end(), - std::back_inserter(new_kept), - tuple_cmp); - kept_tuples_.swap(new_kept); + // now intersect each kept_run with each new local run + std::vector>> new_kept; + new_kept.reserve(kept_runs_.size() * local_runs.size()); + + for (const auto& kept : kept_runs_) { + for (const auto& run : new_runs) { + // start from the old run + auto intersection = kept; + bool empty = false; + + // for each descriptor in the new run... + for (const auto& d_new : run) { + // try to find the same label in the kept run + auto it = std::find_if( + intersection.begin(), intersection.end(), + [&](auto const& d_old) { + return d_old.label.label() == d_new.label.label(); + }); + + if (it != intersection.end()) { + // intersect intervals + auto& d_old = *it; + auto new_min = std::max(d_old.start, d_new.start); + auto new_max = std::min(d_old.stop, d_new.stop); + if (new_min >= new_max) { + empty = true; + break; + } + d_old.start = new_min; + d_old.stop = new_max; + } else { + // brand-new dimension: append it + intersection.push_back(d_new); + } + } + + if (!empty) { + new_kept.push_back(std::move(intersection)); + } + } + } + + kept_runs_ = std::move(new_kept); } return absl::OkStatus(); @@ -112,155 +130,111 @@ struct IndexedInterval; // Forward declaration /// \brief Emit a RangeDescriptor per surviving tuple coordinate, without coalescing. std::vector> range_descriptors() const { - using Index = mdio::Index; - std::vector> result; - if (kept_tuples_.empty()) return result; - - size_t rank = kept_tuples_[0].size(); - // For each dimension from fastest (last) to slowest (0) - for (int d = static_cast(rank) - 1; d >= 0; --d) { - // Map context of higher dims -> list of positions in dim d - std::map, std::vector> context_map; - for (auto const& tup : kept_tuples_) { - std::vector context; - context.reserve(rank - d - 1); - for (size_t k = d + 1; k < rank; ++k) - context.push_back(tup[k].inclusive_min); - context_map[context].push_back(tup[d].inclusive_min); - } - - // For each context, coalesce runs and emit RangeDescriptors - for (auto& [ctx, coords] : context_map) { - std::sort(coords.begin(), coords.end()); - coords.erase(std::unique(coords.begin(), coords.end()), coords.end()); - if (coords.empty()) continue; - Index start = coords.front(); - Index prev = start; - for (size_t i = 1; i < coords.size(); ++i) { - if (coords[i] == prev + 1) { - prev = coords[i]; - } else { - result.emplace_back(RangeDescriptor{ - tup_label(d), start, prev + 1, 1}); - start = coords[i]; prev = start; - } - } - result.emplace_back(RangeDescriptor{ - tup_label(d), start, prev + 1, 1}); + std::vector> ret; + ret.reserve(kept_runs_.size() * kept_runs_[0].size()); + for (auto const& run : kept_runs_) { + for (auto const& interval : run) { // TODO: This is not an interval! + ret.emplace_back(RangeDescriptor{interval.label, interval.start, interval.stop, 1}); } } - return result; + return ret; } - // Helper to get dimension label by position d - std::string_view tup_label(size_t d) const { - // All tuples have same order of labels - return kept_tuples_[0][d].label.label(); - } + template + Future> run_view(const std::string& sort_key, const std::string& output_variable) { + auto non_const_ds = dataset_; + // const auto& descs = range_descriptors(); + const auto& descs = kept_runs_; - /// \brief Get the current selection map: dimension name -> indices. - /// Dimensions not present imply full range. - // const std::map>& selections() const { - // return selections_; - // } - - /* - Future SliceAndSort(std::vector to_sort_by) { - // Ensure that all the keys in to_sort_by are present in the Dataset. - // If not, return an error. - - // isel the dataset by `range_descriptors` - // Begin reading in the Variables and VariableData contained by to_sort_by. - - // Solicit the futures. - - // Perform the appropriate sorts. What I need to do for the sorting is map the actual values to the indices. - // That means that I should be able to sort by value AND have an index mapping which Tensorstore should be able to take as an IndexTransform. - // If it doesn't accept the full list for sorting, then we are in for sad times! - // I'm not sure what it means for this method if - } - */ + std::vector ret; - /// \brief Slice the dataset by current selections and then stable-sort by one coordinate - Future SliceAndSort(const std::string& sort_key) { - // 1. Apply current selections - auto non_const_ds = dataset_; - const auto& descs = range_descriptors(); - std::cout << "Number of range descriptors: " << descs.size() << std::endl; - MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(descs)); - - // 2. Read the sort variable - MDIO_ASSIGN_OR_RETURN( - auto var, ds.variables.get(sort_key)); - MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); - auto fut = var.Read(); - if (!fut.status().ok()) return fut.status(); - auto dataVar = fut.value(); - - // 3. Gather values and corresponding tuples - std::vector::Interval> pos = intervals; - const int32_t* acc = dataVar.get_data_accessor().data(); - mdio::Index offset = dataVar.get_flattened_offset(); - mdio::Index n = dataVar.num_samples(); - - std::vector>> indexed; - indexed.reserve(n); - for (mdio::Index i = 0; i < n; ++i) { - // build tuple of IndexedInterval - std::vector tup; - tup.reserve(pos.size()); - for (auto const& iv : pos) { - IndexedInterval tmp{iv.label, iv.inclusive_min}; - tup.push_back(tmp); + std::size_t descs_idx = 0; + std::vector>>> run_list_fut; + std::vector> run_list; + + bool sort_key_is_dimension = false; + std::size_t total_volume = 0; + + run_list.reserve(descs.size()); + std::cout << "Grabbing " << descs.size() << " coordinates from " << sort_key << std::endl; + for (const auto& desc : descs) { + MDIO_ASSIGN_OR_RETURN(auto segmentedDs, non_const_ds.isel(desc)); + MDIO_ASSIGN_OR_RETURN(auto var, segmentedDs.variables.get(sort_key)); + auto fut = var.Read(); + run_list_fut.emplace_back(descs_idx, fut); + descs_idx++; + + total_volume += var.num_samples(); + + if (var.rank() == 1) { + sort_key_is_dimension = true; + break; } - indexed.emplace_back(acc[i + offset], tup); - _current_position_increment(pos, intervals); } - // 4. Stable sort by value - std::stable_sort( - indexed.begin(), indexed.end(), - [](auto const& a, auto const& b){ return a.first < b.first; }); + ret.reserve(total_volume); + run_list.reserve(run_list_fut.size()); - // 5. Extract new tuples into kept_tuples_ - kept_tuples_.clear(); - kept_tuples_.reserve(indexed.size()); - for (auto& pr : indexed) { - kept_tuples_.push_back(std::move(pr.second)); + // std::cout << "Awaiting futures..." << std::endl; + for (auto& fut : run_list_fut) { + if (!fut.second.status().ok()) return fut.second.status(); + auto data = fut.second.value(); + auto val = data.get_data_accessor().data()[data.get_flattened_offset()]; + run_list.emplace_back(fut.first, val); } + run_list_fut.clear(); // Clear to save memory + + // std::cout << "Sorting..." << std::endl; + std::stable_sort(run_list.begin(), run_list.end(), [](auto const& a, auto const& b) { + return a.second < b.second; + }); + + // std::cout << "Building longest-run based vector..." << std::endl; + // Now we build the Run object. + for (const auto& run : run_list) { + auto desc = descs[run.first]; + MDIO_ASSIGN_OR_RETURN(auto segmentedDs, non_const_ds.isel(desc)); + MDIO_ASSIGN_OR_RETURN(auto var, segmentedDs.variables.template get(output_variable)); + auto fut = var.Read(); + if (!fut.status().ok()) return fut.status(); + auto data = fut.value(); + OutputType* data_ptr = data.get_data_accessor().data(); + Index offset = data.get_flattened_offset(); + Index n = data.num_samples(); + std::vector buffer(n); + std::memcpy(buffer.data(), data_ptr + offset, n * sizeof(OutputType)); + ret.insert(ret.end(), buffer.begin(), buffer.end()); + + if (sort_key_is_dimension) { + // std::cout << "Returning early due to sort key being dimension" << std::endl; + return ret; + } - return ds; - } + } - /// \brief Access the surviving N-D IndexedInterval tuples directly. - const std::vector>& tuples() const { - return kept_tuples_; + // std::cout << "Done!" << std::endl; + return ret; } private: const Dataset& dataset_; tensorstore::IndexDomain<> base_domain_; - std::vector> kept_tuples_; - - /// A minimal struct carrying label and position - struct IndexedInterval { - mdio::DimensionIdentifier label; - mdio::Index inclusive_min; - IndexedInterval() = default; - IndexedInterval(mdio::DimensionIdentifier l, mdio::Index i) - : label(l), inclusive_min(i) {} - - bool operator<(IndexedInterval const& o) const { - auto a = label.label(); - auto b = o.label.label(); - if (a != b) return a < b; - return inclusive_min < o.inclusive_min; - } - bool operator==(IndexedInterval const& o) const { - return label.label() == o.label.label() && - inclusive_min == o.inclusive_min; + std::vector>> kept_runs_; + + template + std::vector>> _from_intervals(std::vector::Interval>>& intervals) { + std::vector>> ret; + ret.reserve(intervals.size()); + for (auto const& run : intervals) { + std::vector> run_descs; + run_descs.reserve(run.size()); + for (auto const& interval : run) { + run_descs.emplace_back(mdio::RangeDescriptor{interval.label, interval.inclusive_min, interval.exclusive_max, 1}); + } + ret.push_back(std::move(run_descs)); } - }; + return ret; + } /// Advance a multidimensional odometer position by one step. template From e9c17719538ab4c289dbedd9c5af3eac4d91f961 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 20 May 2025 15:11:59 +0000 Subject: [PATCH 07/24] Split sorter and getter --- mdio/intersection.h | 110 ++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 59 deletions(-) diff --git a/mdio/intersection.h b/mdio/intersection.h index 2e0978d..c28e77c 100644 --- a/mdio/intersection.h +++ b/mdio/intersection.h @@ -140,79 +140,71 @@ class IndexSelection { return ret; } - template - Future> run_view(const std::string& sort_key, const std::string& output_variable) { + template + Future sort_runs(const std::string& sort_key) { auto non_const_ds = dataset_; - // const auto& descs = range_descriptors(); - const auto& descs = kept_runs_; - - std::vector ret; - - std::size_t descs_idx = 0; - std::vector>>> run_list_fut; - std::vector> run_list; - - bool sort_key_is_dimension = false; - std::size_t total_volume = 0; - - run_list.reserve(descs.size()); - std::cout << "Grabbing " << descs.size() << " coordinates from " << sort_key << std::endl; - for (const auto& desc : descs) { - MDIO_ASSIGN_OR_RETURN(auto segmentedDs, non_const_ds.isel(desc)); - MDIO_ASSIGN_OR_RETURN(auto var, segmentedDs.variables.get(sort_key)); - auto fut = var.Read(); - run_list_fut.emplace_back(descs_idx, fut); - descs_idx++; + const size_t n = kept_runs_.size(); + + // 1) Fire off all reads in parallel and gather the key values + std::vector>> reads; + reads.reserve(n); + for (auto const& desc : kept_runs_) { + MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(desc)); + MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(sort_key)); + reads.push_back(var.Read()); + } - total_volume += var.num_samples(); + std::vector keys; + keys.reserve(n); + for (auto &f : reads) { + if (!f.status().ok()) return f.status(); + auto data = f.value(); + keys.push_back(data.get_data_accessor().data()[data.get_flattened_offset()] + ); + } - if (var.rank() == 1) { - sort_key_is_dimension = true; - break; - } + // 2) Build and stable-sort an index array [0…n-1] by key + std::vector idx(n); + std::iota(idx.begin(), idx.end(), 0); + std::stable_sort( + idx.begin(), idx.end(), + [&](size_t a, size_t b) { return keys[a] < keys[b]; } + ); + + // 3) One linear, move-only pass into a temp buffer + using Desc = std::decay_t::value_type; + std::vector tmp; + tmp.reserve(n); + for (size_t new_pos = 0; new_pos < n; ++new_pos) { + tmp.emplace_back(std::move(kept_runs_[ idx[new_pos] ])); } - ret.reserve(total_volume); - run_list.reserve(run_list_fut.size()); + // 4) Steal the buffer back + kept_runs_ = std::move(tmp); + return absl::OkStatus(); + } + + template + Future> run_values(const std::string& output_variable) { + auto non_const_ds = dataset_; + std::vector ret; - // std::cout << "Awaiting futures..." << std::endl; - for (auto& fut : run_list_fut) { - if (!fut.second.status().ok()) return fut.second.status(); - auto data = fut.second.value(); - auto val = data.get_data_accessor().data()[data.get_flattened_offset()]; - run_list.emplace_back(fut.first, val); - } - run_list_fut.clear(); // Clear to save memory - - // std::cout << "Sorting..." << std::endl; - std::stable_sort(run_list.begin(), run_list.end(), [](auto const& a, auto const& b) { - return a.second < b.second; - }); - - // std::cout << "Building longest-run based vector..." << std::endl; - // Now we build the Run object. - for (const auto& run : run_list) { - auto desc = descs[run.first]; - MDIO_ASSIGN_OR_RETURN(auto segmentedDs, non_const_ds.isel(desc)); - MDIO_ASSIGN_OR_RETURN(auto var, segmentedDs.variables.template get(output_variable)); + for (const auto& desc : kept_runs_) { + MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(desc)); + MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(output_variable)); auto fut = var.Read(); if (!fut.status().ok()) return fut.status(); auto data = fut.value(); - OutputType* data_ptr = data.get_data_accessor().data(); - Index offset = data.get_flattened_offset(); + T* data_ptr = data.get_data_accessor().data(); Index n = data.num_samples(); - std::vector buffer(n); - std::memcpy(buffer.data(), data_ptr + offset, n * sizeof(OutputType)); + Index offset = data.get_flattened_offset(); + std::vector buffer(n); + std::memcpy(buffer.data(), data_ptr + offset, n * sizeof(T)); ret.insert(ret.end(), buffer.begin(), buffer.end()); - - if (sort_key_is_dimension) { - // std::cout << "Returning early due to sort key being dimension" << std::endl; + if (var.rank() == 1) { return ret; } - } - - // std::cout << "Done!" << std::endl; return ret; } From 0c3c2e24051d0f9634fee6b90e2c6f81348df0e0 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 20 May 2025 15:37:28 +0000 Subject: [PATCH 08/24] Begin modularizing --- mdio/intersection.h | 273 ++++++++++++++++++++++++++++---------------- 1 file changed, 173 insertions(+), 100 deletions(-) diff --git a/mdio/intersection.h b/mdio/intersection.h index c28e77c..85735dc 100644 --- a/mdio/intersection.h +++ b/mdio/intersection.h @@ -31,101 +31,107 @@ class IndexSelection { template mdio::Future add_selection(const ValueDescriptor& descriptor) { - using Interval = typename Variable::Interval; - - MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.get(std::string(descriptor.label.label()))); - auto fut = var.Read(); - MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); - if (!fut.status().ok()) return fut.status(); - - auto data = fut.value(); - const T* data_ptr = data.get_data_accessor().data(); - Index offset = data.get_flattened_offset(); - Index n_samples = data.num_samples(); - - auto current_pos = intervals; - bool isInRun = false; - std::vector> local_runs; + // using Interval = typename Variable::Interval; + + // MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.get(std::string(descriptor.label.label()))); + // auto fut = var.Read(); + // MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); + // if (!fut.status().ok()) return fut.status(); + + // auto data = fut.value(); + // const T* data_ptr = data.get_data_accessor().data(); + // Index offset = data.get_flattened_offset(); + // Index n_samples = data.num_samples(); + + // auto current_pos = intervals; + // bool isInRun = false; + // std::vector> local_runs; + + // for (mdio::Index idx = offset; idx < offset + n_samples; ++idx) { + // if (data_ptr[idx] == descriptor.value) { + // if (!isInRun) { + // isInRun = true; + // std::vector run = current_pos; + // for (auto& pos : run) { + // pos.exclusive_max = pos.inclusive_min + 1; + // } + // local_runs.push_back(std::move(run)); + // } else { + // auto& run = local_runs.back(); + // for (auto i=0; i(current_pos, intervals); + // } + + // if (local_runs.empty()) { + // std::stringstream ss; + // ss << "No matches for coordinate '" << descriptor.label.label() << "'"; + // return absl::NotFoundError(ss.str()); + // } + + // auto new_runs = _from_intervals(local_runs); + + // // First time calling add_selection_2 + // if (kept_runs_.empty()) { + // kept_runs_ = std::move(new_runs); + // } else { + // // now intersect each kept_run with each new local run + // std::vector>> new_kept; + // new_kept.reserve(kept_runs_.size() * local_runs.size()); + + // for (const auto& kept : kept_runs_) { + // for (const auto& run : new_runs) { + // // start from the old run + // auto intersection = kept; + // bool empty = false; + + // // for each descriptor in the new run... + // for (const auto& d_new : run) { + // // try to find the same label in the kept run + // auto it = std::find_if( + // intersection.begin(), intersection.end(), + // [&](auto const& d_old) { + // return d_old.label.label() == d_new.label.label(); + // }); + + // if (it != intersection.end()) { + // // intersect intervals + // auto& d_old = *it; + // auto new_min = std::max(d_old.start, d_new.start); + // auto new_max = std::min(d_old.stop, d_new.stop); + // if (new_min >= new_max) { + // empty = true; + // break; + // } + // d_old.start = new_min; + // d_old.stop = new_max; + // } else { + // // brand-new dimension: append it + // intersection.push_back(d_new); + // } + // } + + // if (!empty) { + // new_kept.push_back(std::move(intersection)); + // } + // } + // } + + // kept_runs_ = std::move(new_kept); + // } + + // return absl::OkStatus(); - for (mdio::Index idx = offset; idx < offset + n_samples; ++idx) { - if (data_ptr[idx] == descriptor.value) { - if (!isInRun) { - isInRun = true; - std::vector run = current_pos; - for (auto& pos : run) { - pos.exclusive_max = pos.inclusive_min + 1; - } - local_runs.push_back(std::move(run)); - } else { - auto& run = local_runs.back(); - for (auto i=0; i(current_pos, intervals); - } - - if (local_runs.empty()) { - std::stringstream ss; - ss << "No matches for coordinate '" << descriptor.label.label() << "'"; - return absl::NotFoundError(ss.str()); - } - - auto new_runs = _from_intervals(local_runs); - - // First time calling add_selection_2 if (kept_runs_.empty()) { - kept_runs_ = std::move(new_runs); + return _init_runs(descriptor); } else { - // now intersect each kept_run with each new local run - std::vector>> new_kept; - new_kept.reserve(kept_runs_.size() * local_runs.size()); - - for (const auto& kept : kept_runs_) { - for (const auto& run : new_runs) { - // start from the old run - auto intersection = kept; - bool empty = false; - - // for each descriptor in the new run... - for (const auto& d_new : run) { - // try to find the same label in the kept run - auto it = std::find_if( - intersection.begin(), intersection.end(), - [&](auto const& d_old) { - return d_old.label.label() == d_new.label.label(); - }); - - if (it != intersection.end()) { - // intersect intervals - auto& d_old = *it; - auto new_min = std::max(d_old.start, d_new.start); - auto new_max = std::min(d_old.stop, d_new.stop); - if (new_min >= new_max) { - empty = true; - break; - } - d_old.start = new_min; - d_old.stop = new_max; - } else { - // brand-new dimension: append it - intersection.push_back(d_new); - } - } - - if (!empty) { - new_kept.push_back(std::move(intersection)); - } - } - } - - kept_runs_ = std::move(new_kept); + return _add_new_run(descriptor); } - - return absl::OkStatus(); } /// \brief Emit a RangeDescriptor per surviving tuple coordinate, without coalescing. @@ -157,10 +163,15 @@ class IndexSelection { std::vector keys; keys.reserve(n); for (auto &f : reads) { - if (!f.status().ok()) return f.status(); - auto data = f.value(); - keys.push_back(data.get_data_accessor().data()[data.get_flattened_offset()] - ); + // if (!f.status().ok()) return f.status(); + // auto data = f.value(); + // keys.push_back(data.get_data_accessor().data()[data.get_flattened_offset()]); + MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(f)); + auto data = std::get<0>(resolution); + auto data_ptr = std::get<1>(resolution); + auto offset = std::get<2>(resolution); + // auto n = std::get<3>(resolution); // Not required + keys.push_back(data_ptr[offset]); } // 2) Build and stable-sort an index array [0…n-1] by key @@ -193,11 +204,11 @@ class IndexSelection { MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(desc)); MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(output_variable)); auto fut = var.Read(); - if (!fut.status().ok()) return fut.status(); - auto data = fut.value(); - T* data_ptr = data.get_data_accessor().data(); - Index n = data.num_samples(); - Index offset = data.get_flattened_offset(); + MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(fut)); + auto data = std::get<0>(resolution); + auto data_ptr = std::get<1>(resolution); + auto offset = std::get<2>(resolution); + auto n = std::get<3>(resolution); std::vector buffer(n); std::memcpy(buffer.data(), data_ptr + offset, n * sizeof(T)); ret.insert(ret.end(), buffer.begin(), buffer.end()); @@ -213,6 +224,58 @@ class IndexSelection { tensorstore::IndexDomain<> base_domain_; std::vector>> kept_runs_; + template + Future _init_runs(const ValueDescriptor& descriptor) { + using Interval = typename Variable::Interval; + MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.get(std::string(descriptor.label.label()))); + auto fut = var.Read(); + MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); + MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(fut)); + auto data = std::get<0>(resolution); + auto data_ptr = std::get<1>(resolution); + auto offset = std::get<2>(resolution); + auto n_samples = std::get<3>(resolution); + + auto current_pos = intervals; + bool isInRun = false; + std::vector> local_runs; + + for (mdio::Index idx = offset; idx < offset + n_samples; ++idx) { + if (data_ptr[idx] == descriptor.value) { + if (!isInRun) { + isInRun = true; + std::vector run = current_pos; + for (auto& pos : run) { + pos.exclusive_max = pos.inclusive_min + 1; + } + local_runs.push_back(std::move(run)); + } else { + auto& run = local_runs.back(); + for (auto i=0; i(current_pos, intervals); + } + + if (local_runs.empty()) { + std::stringstream ss; + ss << "No matches for coordinate '" << descriptor.label.label() << "'"; + return absl::NotFoundError(ss.str()); + } + + kept_runs_ = _from_intervals(local_runs); + return absl::OkStatus(); + } + + template + Future _add_new_run(const ValueDescriptor& descriptor) { + return absl::UnimplementedError("Adding selection to an existing IndexSelection is not yet implemented"); + } + template std::vector>> _from_intervals(std::vector::Interval>>& intervals) { std::vector>> ret; @@ -241,6 +304,16 @@ class IndexSelection { position[d].inclusive_min = interval[d].inclusive_min; } } + + template + Result, const T*, Index, Index>> _resolve_future(Future>& fut) { + if (!fut.status().ok()) return fut.status(); + auto data = fut.value(); + const T* data_ptr = data.get_data_accessor().data(); + Index offset = data.get_flattened_offset(); + Index n_samples = data.num_samples(); + return std::make_tuple(std::move(data), data_ptr, offset, n_samples); + } }; } // namespace mdio From 364b9ed2ca6db79e63eea8b2743ec5936b7d5e99 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 20 May 2025 16:24:09 +0000 Subject: [PATCH 09/24] Cleanup old code --- mdio/intersection.h | 160 +++++++++++++------------------------------- 1 file changed, 48 insertions(+), 112 deletions(-) diff --git a/mdio/intersection.h b/mdio/intersection.h index 85735dc..1122ece 100644 --- a/mdio/intersection.h +++ b/mdio/intersection.h @@ -31,102 +31,6 @@ class IndexSelection { template mdio::Future add_selection(const ValueDescriptor& descriptor) { - // using Interval = typename Variable::Interval; - - // MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.get(std::string(descriptor.label.label()))); - // auto fut = var.Read(); - // MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); - // if (!fut.status().ok()) return fut.status(); - - // auto data = fut.value(); - // const T* data_ptr = data.get_data_accessor().data(); - // Index offset = data.get_flattened_offset(); - // Index n_samples = data.num_samples(); - - // auto current_pos = intervals; - // bool isInRun = false; - // std::vector> local_runs; - - // for (mdio::Index idx = offset; idx < offset + n_samples; ++idx) { - // if (data_ptr[idx] == descriptor.value) { - // if (!isInRun) { - // isInRun = true; - // std::vector run = current_pos; - // for (auto& pos : run) { - // pos.exclusive_max = pos.inclusive_min + 1; - // } - // local_runs.push_back(std::move(run)); - // } else { - // auto& run = local_runs.back(); - // for (auto i=0; i(current_pos, intervals); - // } - - // if (local_runs.empty()) { - // std::stringstream ss; - // ss << "No matches for coordinate '" << descriptor.label.label() << "'"; - // return absl::NotFoundError(ss.str()); - // } - - // auto new_runs = _from_intervals(local_runs); - - // // First time calling add_selection_2 - // if (kept_runs_.empty()) { - // kept_runs_ = std::move(new_runs); - // } else { - // // now intersect each kept_run with each new local run - // std::vector>> new_kept; - // new_kept.reserve(kept_runs_.size() * local_runs.size()); - - // for (const auto& kept : kept_runs_) { - // for (const auto& run : new_runs) { - // // start from the old run - // auto intersection = kept; - // bool empty = false; - - // // for each descriptor in the new run... - // for (const auto& d_new : run) { - // // try to find the same label in the kept run - // auto it = std::find_if( - // intersection.begin(), intersection.end(), - // [&](auto const& d_old) { - // return d_old.label.label() == d_new.label.label(); - // }); - - // if (it != intersection.end()) { - // // intersect intervals - // auto& d_old = *it; - // auto new_min = std::max(d_old.start, d_new.start); - // auto new_max = std::min(d_old.stop, d_new.stop); - // if (new_min >= new_max) { - // empty = true; - // break; - // } - // d_old.start = new_min; - // d_old.stop = new_max; - // } else { - // // brand-new dimension: append it - // intersection.push_back(d_new); - // } - // } - - // if (!empty) { - // new_kept.push_back(std::move(intersection)); - // } - // } - // } - - // kept_runs_ = std::move(new_kept); - // } - - // return absl::OkStatus(); - if (kept_runs_.empty()) { return _init_runs(descriptor); } else { @@ -240,25 +144,42 @@ class IndexSelection { bool isInRun = false; std::vector> local_runs; + std::size_t run_idx = offset; + for (mdio::Index idx = offset; idx < offset + n_samples; ++idx) { - if (data_ptr[idx] == descriptor.value) { - if (!isInRun) { - isInRun = true; - std::vector run = current_pos; - for (auto& pos : run) { - pos.exclusive_max = pos.inclusive_min + 1; - } - local_runs.push_back(std::move(run)); - } else { - auto& run = local_runs.back(); - for (auto i=0; i(current_pos, intervals); + } + // _current_position_stride(current_pos, intervals, idx - run_idx); + run_idx = idx; + std::vector run = current_pos; + local_runs.push_back(std::move(run)); + } else if (is_match && isInRun) { + // Somewhere in the middle of a run + // do nothing TODO: Remove me + } else if (!is_match && isInRun) { + // The end of a run isInRun = false; + for (auto i=run_idx; i(current_pos, intervals); + } + // _current_position_stride(current_pos, intervals, idx - run_idx); + run_idx = idx; + auto& last_run = local_runs.back(); + for (auto i=0; i(current_pos, intervals); } if (local_runs.empty()) { @@ -305,6 +226,21 @@ class IndexSelection { } } + template + void _current_position_stride( + std::vector::Interval>& position, + const std::vector::Interval>& interval, + const std::size_t num_elements) { + auto dims = position.size(); + if (position[dims-1].exclusive_max < position[dims-1].inclusive_min + num_elements) { + position[dims-1].inclusive_min = position[dims-1].inclusive_min + num_elements; + return; + } + for (auto i=0; i(position, interval); + } + } + template Result, const T*, Index, Index>> _resolve_future(Future>& fut) { if (!fut.status().ok()) return fut.status(); From 01260320890cf4957108c4b77d299ed0b7c1ff70 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 20 May 2025 19:18:51 +0000 Subject: [PATCH 10/24] Solicit entire read before building vector --- mdio/intersection.h | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mdio/intersection.h b/mdio/intersection.h index 1122ece..4e05dfb 100644 --- a/mdio/intersection.h +++ b/mdio/intersection.h @@ -102,13 +102,23 @@ class IndexSelection { template Future> run_values(const std::string& output_variable) { auto non_const_ds = dataset_; + std::vector>> reads; + reads.reserve(kept_runs_.size()); std::vector ret; for (const auto& desc : kept_runs_) { MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(desc)); MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(output_variable)); auto fut = var.Read(); - MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(fut)); + reads.push_back(fut); + if (var.rank() == 1) { + break; + } + } + + + for (auto& f : reads) { + MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(f)); auto data = std::get<0>(resolution); auto data_ptr = std::get<1>(resolution); auto offset = std::get<2>(resolution); @@ -116,9 +126,6 @@ class IndexSelection { std::vector buffer(n); std::memcpy(buffer.data(), data_ptr + offset, n * sizeof(T)); ret.insert(ret.end(), buffer.begin(), buffer.end()); - if (var.rank() == 1) { - return ret; - } } return ret; } From dcdd41d31d5c89f008271c8699d7bd0476874992 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 21 May 2025 13:53:34 +0000 Subject: [PATCH 11/24] Begin prep for final API --- .../{intersection.h => coordinate_selector.h} | 168 ++++++++++++++++-- ...on_test.cc => coordinate_selector_test.cc} | 64 +++---- 2 files changed, 181 insertions(+), 51 deletions(-) rename mdio/{intersection.h => coordinate_selector.h} (59%) rename mdio/{intersection_test.cc => coordinate_selector_test.cc} (81%) diff --git a/mdio/intersection.h b/mdio/coordinate_selector.h similarity index 59% rename from mdio/intersection.h rename to mdio/coordinate_selector.h index 4e05dfb..a113f2b 100644 --- a/mdio/intersection.h +++ b/mdio/coordinate_selector.h @@ -14,23 +14,32 @@ #include "tensorstore/index_space/index_transform_builder.h" #include "tensorstore/box.h" +#define MDIO_INTERNAL_PROFILING 0 // TODO(BrianMichell): Remove simple profiling code once we approach a more mature API access. + namespace mdio { +#ifdef MDIO_INTERNAL_PROFILING +void timer(std::chrono::high_resolution_clock::time_point start) { + auto end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end - start); + std::cout << "Time taken: " << duration.count() << " microseconds" << std::endl; +} +#endif /// \brief Collects valid index selections per dimension for a Dataset without /// performing slicing immediately. /// -/// Only dimensions explicitly filtered via add_selection appear in the map; +/// Only dimensions explicitly filtered via filterByCoordinate appear in the map; /// any dimension not present should be treated as having its full index range. -class IndexSelection { +class CoordinateSelector { public: /// Construct from an existing Dataset (captures its full domain). - explicit IndexSelection(const Dataset& dataset) + explicit CoordinateSelector(const Dataset& dataset) : dataset_(dataset), base_domain_(dataset.domain) {} template - mdio::Future add_selection(const ValueDescriptor& descriptor) { + mdio::Future filterByCoordinate(const ValueDescriptor& descriptor) { if (kept_runs_.empty()) { return _init_runs(descriptor); } else { @@ -38,20 +47,11 @@ class IndexSelection { } } - /// \brief Emit a RangeDescriptor per surviving tuple coordinate, without coalescing. - std::vector> range_descriptors() const { - std::vector> ret; - ret.reserve(kept_runs_.size() * kept_runs_[0].size()); - for (auto const& run : kept_runs_) { - for (auto const& interval : run) { // TODO: This is not an interval! - ret.emplace_back(RangeDescriptor{interval.label, interval.start, interval.stop, 1}); - } - } - return ret; - } - template - Future sort_runs(const std::string& sort_key) { + Future sortSelectionByKey(const std::string& sort_key) { + #ifdef MDIO_INTERNAL_PROFILING + auto start = std::chrono::high_resolution_clock::now(); + #endif auto non_const_ds = dataset_; const size_t n = kept_runs_.size(); @@ -66,6 +66,11 @@ class IndexSelection { std::vector keys; keys.reserve(n); + #ifdef MDIO_INTERNAL_PROFILING + std::cout << "Set up sorting of " << sort_key << " ... "; + timer(start); + start = std::chrono::high_resolution_clock::now(); + #endif for (auto &f : reads) { // if (!f.status().ok()) return f.status(); // auto data = f.value(); @@ -77,6 +82,11 @@ class IndexSelection { // auto n = std::get<3>(resolution); // Not required keys.push_back(data_ptr[offset]); } + #ifdef MDIO_INTERNAL_PROFILING + std::cout << "Waiting for reads to complete for " << sort_key << " ... "; + timer(start); + start = std::chrono::high_resolution_clock::now(); + #endif // 2) Build and stable-sort an index array [0…n-1] by key std::vector idx(n); @@ -85,6 +95,11 @@ class IndexSelection { idx.begin(), idx.end(), [&](size_t a, size_t b) { return keys[a] < keys[b]; } ); + #ifdef MDIO_INTERNAL_PROFILING + std::cout << "Sorting time for " << sort_key << " ... "; + timer(start); + start = std::chrono::high_resolution_clock::now(); + #endif // 3) One linear, move-only pass into a temp buffer using Desc = std::decay_t::value_type; @@ -96,11 +111,18 @@ class IndexSelection { // 4) Steal the buffer back kept_runs_ = std::move(tmp); + #ifdef MDIO_INTERNAL_PROFILING + std::cout << "Stealing buffer back time for " << sort_key << " ... "; + timer(start); + #endif return absl::OkStatus(); } template - Future> run_values(const std::string& output_variable) { + Future> readSelection(const std::string& output_variable) { + #ifdef MDIO_INTERNAL_PROFILING + auto start = std::chrono::high_resolution_clock::now(); + #endif auto non_const_ds = dataset_; std::vector>> reads; reads.reserve(kept_runs_.size()); @@ -116,6 +138,11 @@ class IndexSelection { } } + #ifdef MDIO_INTERNAL_PROFILING + std::cout << "Set up reading of " << output_variable << " ... "; + timer(start); + start = std::chrono::high_resolution_clock::now(); + #endif for (auto& f : reads) { MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(f)); @@ -127,6 +154,11 @@ class IndexSelection { std::memcpy(buffer.data(), data_ptr + offset, n * sizeof(T)); ret.insert(ret.end(), buffer.begin(), buffer.end()); } + + #ifdef MDIO_INTERNAL_PROFILING + std::cout << "Reading time for " << output_variable << " ... "; + timer(start); + #endif return ret; } @@ -135,9 +167,23 @@ class IndexSelection { tensorstore::IndexDomain<> base_domain_; std::vector>> kept_runs_; + /* + TODO: The built RangeDescriptors aren't behaving as I hoped. + They are building the longest runs possible properly, however + as it becomes disjointed we start to lose some info. + + e.g. We can have [0,1], [0, 25], [0, 120] but + the last dimension is actually [0, 1000]. + + What we should get instead is [0, 1], [0, 24], [0, 1000] and [0, 1], [24, 25], [0, 120] + */ + template Future _init_runs(const ValueDescriptor& descriptor) { using Interval = typename Variable::Interval; + #ifdef MDIO_INTERNAL_PROFILING + auto start = std::chrono::high_resolution_clock::now(); + #endif MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.get(std::string(descriptor.label.label()))); auto fut = var.Read(); MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); @@ -153,6 +199,12 @@ class IndexSelection { std::size_t run_idx = offset; + #ifdef MDIO_INTERNAL_PROFILING + std::cout << "Initialize and read time... "; + timer(start); + start = std::chrono::high_resolution_clock::now(); + #endif + for (mdio::Index idx = offset; idx < offset + n_samples; ++idx) { bool is_match = data_ptr[idx] == descriptor.value; @@ -189,6 +241,12 @@ class IndexSelection { } } + #ifdef MDIO_INTERNAL_PROFILING + std::cout << "Build runs time... "; + timer(start); + start = std::chrono::high_resolution_clock::now(); + #endif + if (local_runs.empty()) { std::stringstream ss; ss << "No matches for coordinate '" << descriptor.label.label() << "'"; @@ -196,12 +254,84 @@ class IndexSelection { } kept_runs_ = _from_intervals(local_runs); + #ifdef MDIO_INTERNAL_PROFILING + std::cout << "Finalize time... "; + timer(start); + #endif return absl::OkStatus(); } + /** + * @brief Using the existing runs, further filter the Dataset by the new coordiante. + */ template Future _add_new_run(const ValueDescriptor& descriptor) { - return absl::UnimplementedError("Adding selection to an existing IndexSelection is not yet implemented"); + using Interval = typename Variable::Interval; + std::vector> new_runs; + + std::vector> stored_intervals; // Use this to ensure everything remains in memory until the Intervals are no longer needed. + stored_intervals.reserve(kept_runs_.size()); + + auto non_const_ds = dataset_; + + for (const auto& desc : kept_runs_) { + MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(desc)); + MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(std::string(descriptor.label.label()))); + auto fut = var.Read(); + MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); + stored_intervals.push_back(std::move(intervals)); // Just to ensure nothing gets freed prematurely. + MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(fut)); + auto data = std::get<0>(resolution); + auto data_ptr = std::get<1>(resolution); + auto offset = std::get<2>(resolution); + auto n = std::get<3>(resolution); + + auto current_pos = intervals; + bool isInRun = false; + + std::size_t run_idx = offset; + + for (Index idx = offset; idx < offset + n; ++idx) { + bool is_match = data_ptr[idx] == descriptor.value; + if (is_match && !isInRun) { + isInRun = true; + for (auto i=run_idx; i(current_pos, intervals); + } + run_idx = idx; + std::vector run = current_pos; + new_runs.push_back(std::move(run)); + } else if (is_match && isInRun) { + // Somewhere in the middle of a run + // do nothing TODO: Remove me + } else if (!is_match && isInRun) { + // The end of a run + isInRun = false; + for (auto i=run_idx; i(current_pos, intervals); + } + run_idx = idx; + auto& last_run = new_runs.back(); + for (auto i=0; i(new_runs); // TODO: We need to ensure we don't accidentally drop any pre-sliced dimensions... + return absl::OkStatus(); } template diff --git a/mdio/intersection_test.cc b/mdio/coordinate_selector_test.cc similarity index 81% rename from mdio/intersection_test.cc rename to mdio/coordinate_selector_test.cc index 697ceee..239340c 100644 --- a/mdio/intersection_test.cc +++ b/mdio/coordinate_selector_test.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "mdio/dataset.h" -#include "mdio/intersection.h" +#include "mdio/coordinate_selector.h" #include #include @@ -219,7 +219,7 @@ TEST(Intersection, constructor) { ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); auto ds = dsFut.value(); - mdio::IndexSelection is(ds); + mdio::CoordinateSelector cs(ds); } TEST(Intersection, add_selection) { @@ -231,13 +231,13 @@ TEST(Intersection, add_selection) { ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); auto ds = dsFut.value(); - mdio::IndexSelection is(ds); + mdio::CoordinateSelector cs(ds); mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; - auto isFut = is.add_selection(liveMaskDesc); + auto isFut = cs.filterByCoordinate(liveMaskDesc); ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. - auto selections = is.selections(); - ASSERT_EQ(selections.size(), 2) << "Expected 2 dimensions in the selection map but got " << selections.size(); + // auto selections = cs.selections(); + // ASSERT_EQ(selections.size(), 2) << "Expected 2 dimensions in the selection map but got " << selections.size(); } TEST(Intersection, range_descriptors) { @@ -249,24 +249,24 @@ TEST(Intersection, range_descriptors) { ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); auto ds = dsFut.value(); - mdio::IndexSelection is(ds); + mdio::CoordinateSelector cs(ds); mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; - auto isFut = is.add_selection(liveMaskDesc); + auto isFut = cs.filterByCoordinate(liveMaskDesc); ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. - auto rangeDescriptors = is.range_descriptors(); + // auto rangeDescriptors = cs.range_descriptors(); - ASSERT_EQ(rangeDescriptors.size(), 2); - EXPECT_EQ(rangeDescriptors[0].label.label(), "task") << "Expected first RangeDescriptor to be for the 'task' dimension"; - EXPECT_EQ(rangeDescriptors[1].label.label(), "trace") << "Expected second RangeDescriptor to be for the 'trace' dimension"; + // ASSERT_EQ(rangeDescriptors.size(), 2); + // EXPECT_EQ(rangeDescriptors[0].label.label(), "task") << "Expected first RangeDescriptor to be for the 'task' dimension"; + // EXPECT_EQ(rangeDescriptors[1].label.label(), "trace") << "Expected second RangeDescriptor to be for the 'trace' dimension"; - EXPECT_EQ(rangeDescriptors[0].start, 0) << "Expected first RangeDescriptor to start at index 0"; - EXPECT_EQ(rangeDescriptors[0].stop, 15) << "Expected first RangeDescriptor to stop at index 15"; - EXPECT_EQ(rangeDescriptors[0].step, 1) << "Expected first RangeDescriptor to have a step of 1"; + // EXPECT_EQ(rangeDescriptors[0].start, 0) << "Expected first RangeDescriptor to start at index 0"; + // EXPECT_EQ(rangeDescriptors[0].stop, 15) << "Expected first RangeDescriptor to stop at index 15"; + // EXPECT_EQ(rangeDescriptors[0].step, 1) << "Expected first RangeDescriptor to have a step of 1"; - EXPECT_EQ(rangeDescriptors[1].start, 0) << "Expected second RangeDescriptor to start at index 0"; - EXPECT_EQ(rangeDescriptors[1].stop, 256) << "Expected second RangeDescriptor to stop at index 256"; - EXPECT_EQ(rangeDescriptors[1].step, 1) << "Expected second RangeDescriptor to have a step of 1"; + // EXPECT_EQ(rangeDescriptors[1].start, 0) << "Expected second RangeDescriptor to start at index 0"; + // EXPECT_EQ(rangeDescriptors[1].stop, 256) << "Expected second RangeDescriptor to stop at index 256"; + // EXPECT_EQ(rangeDescriptors[1].step, 1) << "Expected second RangeDescriptor to have a step of 1"; } TEST(Intersection, get_inline_range) { @@ -278,17 +278,17 @@ TEST(Intersection, get_inline_range) { ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); auto ds = dsFut.value(); - mdio::IndexSelection is(ds); + mdio::CoordinateSelector cs(ds); mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; - auto isFut = is.add_selection(liveMaskDesc); + auto isFut = cs.filterByCoordinate(liveMaskDesc); ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. - isFut = is.add_selection(mdio::ValueDescriptor{"inline", 18}); + isFut = cs.filterByCoordinate(mdio::ValueDescriptor{"inline", 18}); ASSERT_TRUE(isFut.status().ok()) << isFut.status(); - auto rangeDescriptors = is.range_descriptors(); + // auto rangeDescriptors = cs.range_descriptors(); - for (const auto& desc : rangeDescriptors) { - std::cout << "Dimension: " << desc.label.label() << " Start: " << desc.start << " Stop: " << desc.stop << " Step: " << desc.step << std::endl; - } + // for (const auto& desc : rangeDescriptors) { + // std::cout << "Dimension: " << desc.label.label() << " Start: " << desc.start << " Stop: " << desc.stop << " Step: " << desc.step << std::endl; + // } } @@ -301,17 +301,17 @@ TEST(Intersection, get_inline_range_dead) { ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); auto ds = dsFut.value(); - mdio::IndexSelection is(ds); + mdio::CoordinateSelector cs(ds); mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; - auto isFut = is.add_selection(liveMaskDesc); + auto isFut = cs.filterByCoordinate(liveMaskDesc); ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. - isFut = is.add_selection(mdio::ValueDescriptor{"inline", 5000}); + isFut = cs.filterByCoordinate(mdio::ValueDescriptor{"inline", 5000}); EXPECT_FALSE(isFut.status().ok()) << "Expected an error when adding a selection for an invalid inline index"; - auto rangeDescriptors = is.range_descriptors(); - for (const auto& desc : rangeDescriptors) { - std::cout << "Dimension: " << desc.label.label() << " Start: " << desc.start << " Stop: " << desc.stop << " Step: " << desc.step << std::endl; - } + // auto rangeDescriptors = is.range_descriptors(); + // for (const auto& desc : rangeDescriptors) { + // std::cout << "Dimension: " << desc.label.label() << " Start: " << desc.start << " Stop: " << desc.stop << " Step: " << desc.step << std::endl; + // } } } // namespace From 5d77a9e6c7bc0de7da6d7ed9c76f6007cc0c47d7 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 21 May 2025 13:54:15 +0000 Subject: [PATCH 12/24] Fix coordinate selector internal import --- mdio/mdio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdio/mdio.h b/mdio/mdio.h index 716ae10..2435723 100644 --- a/mdio/mdio.h +++ b/mdio/mdio.h @@ -22,6 +22,6 @@ #define MDIO_MDIO_H_ #include "mdio/dataset.h" -#include "mdio/intersection.h" +#include "mdio/coordinate_selector.h" #endif // MDIO_MDIO_H_ From a6908cd8e3300680021eef8c4f6b0328561ea27e Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 21 May 2025 14:00:00 +0000 Subject: [PATCH 13/24] Add warning --- mdio/coordinate_selector.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mdio/coordinate_selector.h b/mdio/coordinate_selector.h index a113f2b..de1e997 100644 --- a/mdio/coordinate_selector.h +++ b/mdio/coordinate_selector.h @@ -274,11 +274,22 @@ class CoordinateSelector { auto non_const_ds = dataset_; + bool is_first_run = true; + for (const auto& desc : kept_runs_) { MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(desc)); MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(std::string(descriptor.label.label()))); auto fut = var.Read(); MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); + + if (is_first_run) { + is_first_run = false; + if (intervals.size() != kept_runs_[0].size()) { + std::cout << "WARNING: Different coordinate dimensions detected. This behavior is not yet supported." << std::endl; + std::cout << "\tFor expected behavior, please ensure all previous dimensions are less than or equal to the current dimension." << std::endl; + } + } + stored_intervals.push_back(std::move(intervals)); // Just to ensure nothing gets freed prematurely. MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(fut)); auto data = std::get<0>(resolution); From b2b5fdcca81f9787573970088cadc79b769cc85b Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 21 May 2025 15:09:42 +0000 Subject: [PATCH 14/24] Add a composible "read" function --- mdio/coordinate_selector.h | 58 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/mdio/coordinate_selector.h b/mdio/coordinate_selector.h index de1e997..5d80f87 100644 --- a/mdio/coordinate_selector.h +++ b/mdio/coordinate_selector.h @@ -26,6 +26,25 @@ void timer(std::chrono::high_resolution_clock::time_point start) { } #endif +//— helper to tag multi-key sorts —— +template +struct SortKey { + std::string key; + using value_type = T; +}; + +//— trait to detect ValueDescriptor —— +template struct is_value_descriptor : std::false_type {}; +template struct is_value_descriptor> : std::true_type {}; +template +inline constexpr bool is_value_descriptor_v = is_value_descriptor>::value; + +//— trait to detect SortKey —— +template struct is_sort_key : std::false_type {}; +template struct is_sort_key> : std::true_type {}; +template +inline constexpr bool is_sort_key_v = is_sort_key>::value; + /// \brief Collects valid index selections per dimension for a Dataset without /// performing slicing immediately. /// @@ -37,7 +56,24 @@ class CoordinateSelector { explicit CoordinateSelector(const Dataset& dataset) : dataset_(dataset), base_domain_(dataset.domain) {} +template +Future> ReadDataVariable(const std::string& data_variable, Ops const&... ops) { + // 1) apply filters & sorts in the exact order given + absl::Status st = absl::OkStatus(); + ((st = st.ok() ? _applyOp(ops).status() : st), ...); + if (!st.ok()) return st; + // 2) finally read out the requested variable + return readSelection(data_variable); +} + + /** + * @brief Filter the Dataset by the given coordinate. + * Limitations: + * - Only a single filter is currently tested. + * - A bug exists if the filter value does not make a perfect hyper-rectangle within its dimensions. + * + */ template mdio::Future filterByCoordinate(const ValueDescriptor& descriptor) { if (kept_runs_.empty()) { @@ -167,6 +203,28 @@ class CoordinateSelector { tensorstore::IndexDomain<> base_domain_; std::vector>> kept_runs_; + template + Future _applyOp(D const& op) { + if constexpr (is_value_descriptor_v) { + auto fut = filterByCoordinate(op); + if (!fut.status().ok()) { + return fut.status(); + } + return absl::OkStatus(); + } else if constexpr (is_sort_key_v) { + using SortT = typename std::decay_t::value_type; + auto fut = sortSelectionByKey(op.key); + if (!fut.status().ok()) { + return fut.status(); + } + return absl::OkStatus(); + } else { + return absl::UnimplementedError( + "query(): RangeDescriptor and ListDescriptor not supported"); + } + } + + /* TODO: The built RangeDescriptors aren't behaving as I hoped. They are building the longest runs possible properly, however From 26b2a3c514d3f6efc6b5f8ec554644d85592cf9a Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 21 May 2025 15:11:27 +0000 Subject: [PATCH 15/24] Begin formatting --- mdio/coordinate_selector.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mdio/coordinate_selector.h b/mdio/coordinate_selector.h index 5d80f87..fd8b812 100644 --- a/mdio/coordinate_selector.h +++ b/mdio/coordinate_selector.h @@ -14,7 +14,7 @@ #include "tensorstore/index_space/index_transform_builder.h" #include "tensorstore/box.h" -#define MDIO_INTERNAL_PROFILING 0 // TODO(BrianMichell): Remove simple profiling code once we approach a more mature API access. +// #define MDIO_INTERNAL_PROFILING 0 // TODO(BrianMichell): Remove simple profiling code once we approach a more mature API access. namespace mdio { @@ -56,16 +56,16 @@ class CoordinateSelector { explicit CoordinateSelector(const Dataset& dataset) : dataset_(dataset), base_domain_(dataset.domain) {} -template -Future> ReadDataVariable(const std::string& data_variable, Ops const&... ops) { - // 1) apply filters & sorts in the exact order given - absl::Status st = absl::OkStatus(); - ((st = st.ok() ? _applyOp(ops).status() : st), ...); - if (!st.ok()) return st; + template + Future> ReadDataVariable(const std::string& data_variable, Ops const&... ops) { + // 1) apply filters & sorts in the exact order given + absl::Status st = absl::OkStatus(); + ((st = st.ok() ? _applyOp(ops).status() : st), ...); + if (!st.ok()) return st; - // 2) finally read out the requested variable - return readSelection(data_variable); -} + // 2) finally read out the requested variable + return readSelection(data_variable); + } /** * @brief Filter the Dataset by the given coordinate. From 9f03ca85e3f58b83057249c22b7cf6d6b48e5702 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 21 May 2025 16:18:10 +0000 Subject: [PATCH 16/24] Resolve off-by-one error in run builder --- mdio/coordinate_selector.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mdio/coordinate_selector.h b/mdio/coordinate_selector.h index fd8b812..17dd2ed 100644 --- a/mdio/coordinate_selector.h +++ b/mdio/coordinate_selector.h @@ -282,7 +282,8 @@ class CoordinateSelector { } else if (!is_match && isInRun) { // The end of a run isInRun = false; - for (auto i=run_idx; i(current_pos, intervals); } // _current_position_stride(current_pos, intervals, idx - run_idx); @@ -291,6 +292,8 @@ class CoordinateSelector { for (auto i=0; i(current_pos, intervals); } else if (!is_match && !isInRun) { // No run at all // do nothing TODO: Remove me @@ -375,6 +378,7 @@ class CoordinateSelector { // do nothing TODO: Remove me } else if (!is_match && isInRun) { // The end of a run + // TODO(BrianMichell): Ensure we are using the correct index (see above) isInRun = false; for (auto i=run_idx; i(current_pos, intervals); From e2692819fcbfe310e32a58525b3c5b233d4775be Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 21 May 2025 16:28:08 +0000 Subject: [PATCH 17/24] clang-formatting --- mdio/coordinate_selector.h | 229 +++++++++++---------- mdio/coordinate_selector_test.cc | 332 +++++++++++++++++-------------- mdio/dataset.h | 71 ++++--- mdio/mdio.h | 2 +- mdio/variable.h | 202 +++++++++---------- 5 files changed, 437 insertions(+), 399 deletions(-) diff --git a/mdio/coordinate_selector.h b/mdio/coordinate_selector.h index 17dd2ed..95f4b69 100644 --- a/mdio/coordinate_selector.h +++ b/mdio/coordinate_selector.h @@ -1,63 +1,72 @@ #ifndef MDIO_INTERSECTION_H #define MDIO_INTERSECTION_H -#include "mdio/variable.h" #include "mdio/dataset.h" #include "mdio/impl.h" - -#include "tensorstore/index_space/index_transform.h" -#include "tensorstore/index_space/index_domain_builder.h" -#include "tensorstore/index_space/index_domain.h" -#include "tensorstore/index_space/dim_expression.h" +#include "mdio/variable.h" #include "tensorstore/array.h" -#include "tensorstore/util/span.h" -#include "tensorstore/index_space/index_transform_builder.h" #include "tensorstore/box.h" +#include "tensorstore/index_space/dim_expression.h" +#include "tensorstore/index_space/index_domain.h" +#include "tensorstore/index_space/index_domain_builder.h" +#include "tensorstore/index_space/index_transform.h" +#include "tensorstore/index_space/index_transform_builder.h" +#include "tensorstore/util/span.h" -// #define MDIO_INTERNAL_PROFILING 0 // TODO(BrianMichell): Remove simple profiling code once we approach a more mature API access. +// #define MDIO_INTERNAL_PROFILING 0 // TODO(BrianMichell): Remove simple +// profiling code once we approach a more mature API access. namespace mdio { #ifdef MDIO_INTERNAL_PROFILING void timer(std::chrono::high_resolution_clock::time_point start) { auto end = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end - start); - std::cout << "Time taken: " << duration.count() << " microseconds" << std::endl; + auto duration = + std::chrono::duration_cast(end - start); + std::cout << "Time taken: " << duration.count() << " microseconds" + << std::endl; } #endif -//— helper to tag multi-key sorts —— -template +// — helper to tag multi-key sorts —— +template struct SortKey { std::string key; using value_type = T; }; -//— trait to detect ValueDescriptor —— -template struct is_value_descriptor : std::false_type {}; -template struct is_value_descriptor> : std::true_type {}; -template -inline constexpr bool is_value_descriptor_v = is_value_descriptor>::value; - -//— trait to detect SortKey —— -template struct is_sort_key : std::false_type {}; -template struct is_sort_key> : std::true_type {}; -template +// — trait to detect ValueDescriptor —— +template +struct is_value_descriptor : std::false_type {}; +template +struct is_value_descriptor> : std::true_type {}; +template +inline constexpr bool is_value_descriptor_v = + is_value_descriptor>::value; + +// — trait to detect SortKey —— +template +struct is_sort_key : std::false_type {}; +template +struct is_sort_key> : std::true_type {}; +template inline constexpr bool is_sort_key_v = is_sort_key>::value; /// \brief Collects valid index selections per dimension for a Dataset without /// performing slicing immediately. /// -/// Only dimensions explicitly filtered via filterByCoordinate appear in the map; -/// any dimension not present should be treated as having its full index range. +/// Only dimensions explicitly filtered via filterByCoordinate appear in the +/// map; any dimension not present should be treated as having its full index +/// range. class CoordinateSelector { -public: + public: /// Construct from an existing Dataset (captures its full domain). explicit CoordinateSelector(const Dataset& dataset) : dataset_(dataset), base_domain_(dataset.domain) {} template - Future> ReadDataVariable(const std::string& data_variable, Ops const&... ops) { + Future> ReadDataVariable(const std::string& data_variable, + Ops const&... ops) { // 1) apply filters & sorts in the exact order given absl::Status st = absl::OkStatus(); ((st = st.ok() ? _applyOp(ops).status() : st), ...); @@ -69,10 +78,11 @@ class CoordinateSelector { /** * @brief Filter the Dataset by the given coordinate. - * Limitations: + * Limitations: * - Only a single filter is currently tested. - * - A bug exists if the filter value does not make a perfect hyper-rectangle within its dimensions. - * + * - A bug exists if the filter value does not make a perfect hyper-rectangle + * within its dimensions. + * */ template mdio::Future filterByCoordinate(const ValueDescriptor& descriptor) { @@ -85,9 +95,9 @@ class CoordinateSelector { template Future sortSelectionByKey(const std::string& sort_key) { - #ifdef MDIO_INTERNAL_PROFILING +#ifdef MDIO_INTERNAL_PROFILING auto start = std::chrono::high_resolution_clock::now(); - #endif +#endif auto non_const_ds = dataset_; const size_t n = kept_runs_.size(); @@ -102,12 +112,12 @@ class CoordinateSelector { std::vector keys; keys.reserve(n); - #ifdef MDIO_INTERNAL_PROFILING +#ifdef MDIO_INTERNAL_PROFILING std::cout << "Set up sorting of " << sort_key << " ... "; timer(start); start = std::chrono::high_resolution_clock::now(); - #endif - for (auto &f : reads) { +#endif + for (auto& f : reads) { // if (!f.status().ok()) return f.status(); // auto data = f.value(); // keys.push_back(data.get_data_accessor().data()[data.get_flattened_offset()]); @@ -118,47 +128,45 @@ class CoordinateSelector { // auto n = std::get<3>(resolution); // Not required keys.push_back(data_ptr[offset]); } - #ifdef MDIO_INTERNAL_PROFILING +#ifdef MDIO_INTERNAL_PROFILING std::cout << "Waiting for reads to complete for " << sort_key << " ... "; timer(start); start = std::chrono::high_resolution_clock::now(); - #endif +#endif // 2) Build and stable-sort an index array [0…n-1] by key std::vector idx(n); std::iota(idx.begin(), idx.end(), 0); - std::stable_sort( - idx.begin(), idx.end(), - [&](size_t a, size_t b) { return keys[a] < keys[b]; } - ); - #ifdef MDIO_INTERNAL_PROFILING + std::stable_sort(idx.begin(), idx.end(), + [&](size_t a, size_t b) { return keys[a] < keys[b]; }); +#ifdef MDIO_INTERNAL_PROFILING std::cout << "Sorting time for " << sort_key << " ... "; timer(start); start = std::chrono::high_resolution_clock::now(); - #endif +#endif // 3) One linear, move-only pass into a temp buffer using Desc = std::decay_t::value_type; std::vector tmp; tmp.reserve(n); for (size_t new_pos = 0; new_pos < n; ++new_pos) { - tmp.emplace_back(std::move(kept_runs_[ idx[new_pos] ])); + tmp.emplace_back(std::move(kept_runs_[idx[new_pos]])); } // 4) Steal the buffer back kept_runs_ = std::move(tmp); - #ifdef MDIO_INTERNAL_PROFILING +#ifdef MDIO_INTERNAL_PROFILING std::cout << "Stealing buffer back time for " << sort_key << " ... "; timer(start); - #endif +#endif return absl::OkStatus(); } template Future> readSelection(const std::string& output_variable) { - #ifdef MDIO_INTERNAL_PROFILING +#ifdef MDIO_INTERNAL_PROFILING auto start = std::chrono::high_resolution_clock::now(); - #endif +#endif auto non_const_ds = dataset_; std::vector>> reads; reads.reserve(kept_runs_.size()); @@ -174,11 +182,11 @@ class CoordinateSelector { } } - #ifdef MDIO_INTERNAL_PROFILING +#ifdef MDIO_INTERNAL_PROFILING std::cout << "Set up reading of " << output_variable << " ... "; timer(start); start = std::chrono::high_resolution_clock::now(); - #endif +#endif for (auto& f : reads) { MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(f)); @@ -191,19 +199,19 @@ class CoordinateSelector { ret.insert(ret.end(), buffer.begin(), buffer.end()); } - #ifdef MDIO_INTERNAL_PROFILING +#ifdef MDIO_INTERNAL_PROFILING std::cout << "Reading time for " << output_variable << " ... "; timer(start); - #endif +#endif return ret; } -private: + private: const Dataset& dataset_; tensorstore::IndexDomain<> base_domain_; std::vector>> kept_runs_; - template + template Future _applyOp(D const& op) { if constexpr (is_value_descriptor_v) { auto fut = filterByCoordinate(op); @@ -220,29 +228,30 @@ class CoordinateSelector { return absl::OkStatus(); } else { return absl::UnimplementedError( - "query(): RangeDescriptor and ListDescriptor not supported"); + "query(): RangeDescriptor and ListDescriptor not supported"); } } - /* TODO: The built RangeDescriptors aren't behaving as I hoped. They are building the longest runs possible properly, however as it becomes disjointed we start to lose some info. e.g. We can have [0,1], [0, 25], [0, 120] but - the last dimension is actually [0, 1000]. + the last dimension is actually [0, 1000]. - What we should get instead is [0, 1], [0, 24], [0, 1000] and [0, 1], [24, 25], [0, 120] + What we should get instead is [0, 1], [0, 24], [0, 1000] and [0, 1], [24, 25], + [0, 120] */ template Future _init_runs(const ValueDescriptor& descriptor) { using Interval = typename Variable::Interval; - #ifdef MDIO_INTERNAL_PROFILING +#ifdef MDIO_INTERNAL_PROFILING auto start = std::chrono::high_resolution_clock::now(); - #endif - MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.get(std::string(descriptor.label.label()))); +#endif + MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.get( + std::string(descriptor.label.label()))); auto fut = var.Read(); MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(fut)); @@ -257,11 +266,11 @@ class CoordinateSelector { std::size_t run_idx = offset; - #ifdef MDIO_INTERNAL_PROFILING +#ifdef MDIO_INTERNAL_PROFILING std::cout << "Initialize and read time... "; timer(start); start = std::chrono::high_resolution_clock::now(); - #endif +#endif for (mdio::Index idx = offset; idx < offset + n_samples; ++idx) { bool is_match = data_ptr[idx] == descriptor.value; @@ -269,7 +278,7 @@ class CoordinateSelector { if (is_match && !isInRun) { // The start of a new run isInRun = true; - for (auto i=run_idx; i(current_pos, intervals); } // _current_position_stride(current_pos, intervals, idx - run_idx); @@ -282,14 +291,15 @@ class CoordinateSelector { } else if (!is_match && isInRun) { // The end of a run isInRun = false; - // Use 1 less than the current index to ensure we get the correct end location. - for (auto i=run_idx; i(current_pos, intervals); } // _current_position_stride(current_pos, intervals, idx - run_idx); run_idx = idx; auto& last_run = local_runs.back(); - for (auto i=0; i(local_runs); - #ifdef MDIO_INTERNAL_PROFILING +#ifdef MDIO_INTERNAL_PROFILING std::cout << "Finalize time... "; - timer(start); - #endif + timer(start); +#endif return absl::OkStatus(); } /** - * @brief Using the existing runs, further filter the Dataset by the new coordiante. + * @brief Using the existing runs, further filter the Dataset by the new + * coordiante. */ template Future _add_new_run(const ValueDescriptor& descriptor) { using Interval = typename Variable::Interval; std::vector> new_runs; - std::vector> stored_intervals; // Use this to ensure everything remains in memory until the Intervals are no longer needed. + std::vector> + stored_intervals; // Use this to ensure everything remains in memory + // until the Intervals are no longer needed. stored_intervals.reserve(kept_runs_.size()); auto non_const_ds = dataset_; @@ -339,19 +352,26 @@ class CoordinateSelector { for (const auto& desc : kept_runs_) { MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(desc)); - MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(std::string(descriptor.label.label()))); + MDIO_ASSIGN_OR_RETURN( + auto var, ds.variables.get(std::string(descriptor.label.label()))); auto fut = var.Read(); MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); if (is_first_run) { is_first_run = false; if (intervals.size() != kept_runs_[0].size()) { - std::cout << "WARNING: Different coordinate dimensions detected. This behavior is not yet supported." << std::endl; - std::cout << "\tFor expected behavior, please ensure all previous dimensions are less than or equal to the current dimension." << std::endl; + std::cout << "WARNING: Different coordinate dimensions detected. " + "This behavior is not yet supported." + << std::endl; + std::cout + << "\tFor expected behavior, please ensure all previous " + "dimensions are less than or equal to the current dimension." + << std::endl; } } - stored_intervals.push_back(std::move(intervals)); // Just to ensure nothing gets freed prematurely. + stored_intervals.push_back(std::move( + intervals)); // Just to ensure nothing gets freed prematurely. MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(fut)); auto data = std::get<0>(resolution); auto data_ptr = std::get<1>(resolution); @@ -367,7 +387,7 @@ class CoordinateSelector { bool is_match = data_ptr[idx] == descriptor.value; if (is_match && !isInRun) { isInRun = true; - for (auto i=run_idx; i(current_pos, intervals); } run_idx = idx; @@ -378,14 +398,15 @@ class CoordinateSelector { // do nothing TODO: Remove me } else if (!is_match && isInRun) { // The end of a run - // TODO(BrianMichell): Ensure we are using the correct index (see above) + // TODO(BrianMichell): Ensure we are using the correct index (see + // above) isInRun = false; - for (auto i=run_idx; i(current_pos, intervals); } run_idx = idx; auto& last_run = new_runs.back(); - for (auto i=0; i(new_runs); // TODO: We need to ensure we don't accidentally drop any pre-sliced dimensions... + kept_runs_ = _from_intervals( + new_runs); // TODO: We need to ensure we don't accidentally drop any + // pre-sliced dimensions... return absl::OkStatus(); } template - std::vector>> _from_intervals(std::vector::Interval>>& intervals) { + std::vector>> _from_intervals( + std::vector::Interval>>& + intervals) { std::vector>> ret; ret.reserve(intervals.size()); for (auto const& run : intervals) { std::vector> run_descs; run_descs.reserve(run.size()); for (auto const& interval : run) { - run_descs.emplace_back(mdio::RangeDescriptor{interval.label, interval.inclusive_min, interval.exclusive_max, 1}); + run_descs.emplace_back(mdio::RangeDescriptor{ + interval.label, interval.inclusive_min, interval.exclusive_max, 1}); } ret.push_back(std::move(run_descs)); } @@ -427,7 +453,7 @@ class CoordinateSelector { void _current_position_increment( std::vector::Interval>& position, const std::vector::Interval>& interval) const { - for (std::size_t d = position.size(); d-- > 0; ) { + for (std::size_t d = position.size(); d-- > 0;) { if (position[d].inclusive_min + 1 < interval[d].exclusive_max) { ++position[d].inclusive_min; return; @@ -438,21 +464,24 @@ class CoordinateSelector { template void _current_position_stride( - std::vector::Interval>& position, - const std::vector::Interval>& interval, - const std::size_t num_elements) { - auto dims = position.size(); - if (position[dims-1].exclusive_max < position[dims-1].inclusive_min + num_elements) { - position[dims-1].inclusive_min = position[dims-1].inclusive_min + num_elements; - return; - } - for (auto i=0; i(position, interval); - } + std::vector::Interval>& position, + const std::vector::Interval>& interval, + const std::size_t num_elements) { + auto dims = position.size(); + if (position[dims - 1].exclusive_max < + position[dims - 1].inclusive_min + num_elements) { + position[dims - 1].inclusive_min = + position[dims - 1].inclusive_min + num_elements; + return; + } + for (auto i = 0; i < num_elements; ++i) { + _current_position_increment(position, interval); + } } template - Result, const T*, Index, Index>> _resolve_future(Future>& fut) { + Result, const T*, Index, Index>> _resolve_future( + Future>& fut) { if (!fut.status().ok()) return fut.status(); auto data = fut.value(); const T* data_ptr = data.get_data_accessor().data(); @@ -462,6 +491,6 @@ class CoordinateSelector { } }; -} // namespace mdio +} // namespace mdio -#endif // MDIO_INTERSECTION_H +#endif // MDIO_INTERSECTION_H diff --git a/mdio/coordinate_selector_test.cc b/mdio/coordinate_selector_test.cc index 239340c..93eaec9 100644 --- a/mdio/coordinate_selector_test.cc +++ b/mdio/coordinate_selector_test.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mdio/dataset.h" #include "mdio/coordinate_selector.h" #include @@ -25,6 +24,7 @@ #include #include +#include "mdio/dataset.h" #include "mdio/dataset_factory.h" #include "tensorstore/driver/driver.h" #include "tensorstore/driver/registry.h" @@ -44,8 +44,8 @@ namespace { mdio::Result SetupDataset() { - std::string ds_path = "generic_with_coords.mdio"; - std::string schema_str = R"( + std::string ds_path = "generic_with_coords.mdio"; + std::string schema_str = R"( { "metadata": { "name": "generic_with_coords", @@ -140,178 +140,202 @@ mdio::Result SetupDataset() { ] })"; - auto schema = ::nlohmann::json::parse(schema_str); - auto dsFut = mdio::Dataset::from_json(schema, ds_path, mdio::constants::kCreate); - if (!dsFut.status().ok()) { - return ds_path; - } - auto ds = dsFut.value(); - - // Populate the dataset with data - MDIO_ASSIGN_OR_RETURN(auto dataVar, ds.variables.get("DataVariable")); - MDIO_ASSIGN_OR_RETURN(auto inlineVar, ds.variables.get("inline")); - MDIO_ASSIGN_OR_RETURN(auto crosslineVar, ds.variables.get("crossline")); - MDIO_ASSIGN_OR_RETURN(auto liveMaskVar, ds.variables.get("live_mask")); - - MDIO_ASSIGN_OR_RETURN(auto varData, mdio::from_variable(dataVar)); - MDIO_ASSIGN_OR_RETURN(auto inlineData, mdio::from_variable(inlineVar)); - MDIO_ASSIGN_OR_RETURN(auto crosslineData, mdio::from_variable(crosslineVar)); - MDIO_ASSIGN_OR_RETURN(auto liveMaskData, mdio::from_variable(liveMaskVar)); - - auto varDataPtr = varData.get_data_accessor().data(); - auto inlineDataPtr = inlineData.get_data_accessor().data(); - auto crosslineDataPtr = crosslineData.get_data_accessor().data(); - auto liveMaskDataPtr = liveMaskData.get_data_accessor().data(); - - auto varOffset = varData.get_flattened_offset(); - auto inlineOffset = inlineData.get_flattened_offset(); - auto crosslineOffset = crosslineData.get_flattened_offset(); - auto liveMaskOffset = liveMaskData.get_flattened_offset(); - - std::size_t coords = 0; - std::size_t var = 0; - - for (int i=0; i<15; ++i) { // Only 15 of the 25 tasks were "assigned" - for (int j=0; j<256; ++j) { - inlineDataPtr[coords + inlineOffset] = coords; - crosslineDataPtr[coords + crosslineOffset] = coords*4; - liveMaskDataPtr[coords + liveMaskOffset] = true; - for (int k=0; k<512; ++k) { - varDataPtr[var + varOffset] = i * 256 * 512 + j * 512 + k; - var++; - } - coords++; - } - } - - auto varDataFut = dataVar.Write(varData); - auto inlineDataFut = inlineVar.Write(inlineData); - auto crosslineDataFut = crosslineVar.Write(crosslineData); - auto liveMaskDataFut = liveMaskVar.Write(liveMaskData); - - if (!varDataFut.status().ok()) { - return varDataFut.status(); - } - if (!inlineDataFut.status().ok()) { - return inlineDataFut.status(); - } - if (!crosslineDataFut.status().ok()) { - return crosslineDataFut.status(); - } - if (!liveMaskDataFut.status().ok()) { - return liveMaskDataFut.status(); - } - + auto schema = ::nlohmann::json::parse(schema_str); + auto dsFut = + mdio::Dataset::from_json(schema, ds_path, mdio::constants::kCreate); + if (!dsFut.status().ok()) { return ds_path; + } + auto ds = dsFut.value(); + + // Populate the dataset with data + MDIO_ASSIGN_OR_RETURN(auto dataVar, ds.variables.get("DataVariable")); + MDIO_ASSIGN_OR_RETURN(auto inlineVar, ds.variables.get("inline")); + MDIO_ASSIGN_OR_RETURN(auto crosslineVar, + ds.variables.get("crossline")); + MDIO_ASSIGN_OR_RETURN(auto liveMaskVar, ds.variables.get("live_mask")); + + MDIO_ASSIGN_OR_RETURN(auto varData, mdio::from_variable(dataVar)); + MDIO_ASSIGN_OR_RETURN(auto inlineData, + mdio::from_variable(inlineVar)); + MDIO_ASSIGN_OR_RETURN(auto crosslineData, + mdio::from_variable(crosslineVar)); + MDIO_ASSIGN_OR_RETURN(auto liveMaskData, + mdio::from_variable(liveMaskVar)); + + auto varDataPtr = varData.get_data_accessor().data(); + auto inlineDataPtr = inlineData.get_data_accessor().data(); + auto crosslineDataPtr = crosslineData.get_data_accessor().data(); + auto liveMaskDataPtr = liveMaskData.get_data_accessor().data(); + + auto varOffset = varData.get_flattened_offset(); + auto inlineOffset = inlineData.get_flattened_offset(); + auto crosslineOffset = crosslineData.get_flattened_offset(); + auto liveMaskOffset = liveMaskData.get_flattened_offset(); + + std::size_t coords = 0; + std::size_t var = 0; + + for (int i = 0; i < 15; ++i) { // Only 15 of the 25 tasks were "assigned" + for (int j = 0; j < 256; ++j) { + inlineDataPtr[coords + inlineOffset] = coords; + crosslineDataPtr[coords + crosslineOffset] = coords * 4; + liveMaskDataPtr[coords + liveMaskOffset] = true; + for (int k = 0; k < 512; ++k) { + varDataPtr[var + varOffset] = i * 256 * 512 + j * 512 + k; + var++; + } + coords++; + } + } + + auto varDataFut = dataVar.Write(varData); + auto inlineDataFut = inlineVar.Write(inlineData); + auto crosslineDataFut = crosslineVar.Write(crosslineData); + auto liveMaskDataFut = liveMaskVar.Write(liveMaskData); + + if (!varDataFut.status().ok()) { + return varDataFut.status(); + } + if (!inlineDataFut.status().ok()) { + return inlineDataFut.status(); + } + if (!crosslineDataFut.status().ok()) { + return crosslineDataFut.status(); + } + if (!liveMaskDataFut.status().ok()) { + return liveMaskDataFut.status(); + } + + return ds_path; } TEST(Intersection, SETUP) { - auto pathResult = SetupDataset(); - ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); } TEST(Intersection, constructor) { - auto pathResult = SetupDataset(); - ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); - auto path = pathResult.value(); + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto path = pathResult.value(); - auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); - ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); - auto ds = dsFut.value(); + auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); + ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); + auto ds = dsFut.value(); - mdio::CoordinateSelector cs(ds); + mdio::CoordinateSelector cs(ds); } TEST(Intersection, add_selection) { - auto pathResult = SetupDataset(); - ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); - auto path = pathResult.value(); - - auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); - ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); - auto ds = dsFut.value(); - - mdio::CoordinateSelector cs(ds); - mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; - auto isFut = cs.filterByCoordinate(liveMaskDesc); - ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. - - // auto selections = cs.selections(); - // ASSERT_EQ(selections.size(), 2) << "Expected 2 dimensions in the selection map but got " << selections.size(); + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto path = pathResult.value(); + + auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); + ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); + auto ds = dsFut.value(); + + mdio::CoordinateSelector cs(ds); + mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; + auto isFut = cs.filterByCoordinate(liveMaskDesc); + ASSERT_TRUE(isFut.status().ok()) + << isFut + .status(); // When this resolves, the selection object is updated. + + // auto selections = cs.selections(); + // ASSERT_EQ(selections.size(), 2) << "Expected 2 dimensions in the selection + // map but got " << selections.size(); } TEST(Intersection, range_descriptors) { - auto pathResult = SetupDataset(); - ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); - auto path = pathResult.value(); - - auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); - ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); - auto ds = dsFut.value(); - - mdio::CoordinateSelector cs(ds); - mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; - auto isFut = cs.filterByCoordinate(liveMaskDesc); - ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. - - // auto rangeDescriptors = cs.range_descriptors(); - - // ASSERT_EQ(rangeDescriptors.size(), 2); - // EXPECT_EQ(rangeDescriptors[0].label.label(), "task") << "Expected first RangeDescriptor to be for the 'task' dimension"; - // EXPECT_EQ(rangeDescriptors[1].label.label(), "trace") << "Expected second RangeDescriptor to be for the 'trace' dimension"; - - // EXPECT_EQ(rangeDescriptors[0].start, 0) << "Expected first RangeDescriptor to start at index 0"; - // EXPECT_EQ(rangeDescriptors[0].stop, 15) << "Expected first RangeDescriptor to stop at index 15"; - // EXPECT_EQ(rangeDescriptors[0].step, 1) << "Expected first RangeDescriptor to have a step of 1"; - - // EXPECT_EQ(rangeDescriptors[1].start, 0) << "Expected second RangeDescriptor to start at index 0"; - // EXPECT_EQ(rangeDescriptors[1].stop, 256) << "Expected second RangeDescriptor to stop at index 256"; - // EXPECT_EQ(rangeDescriptors[1].step, 1) << "Expected second RangeDescriptor to have a step of 1"; + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto path = pathResult.value(); + + auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); + ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); + auto ds = dsFut.value(); + + mdio::CoordinateSelector cs(ds); + mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; + auto isFut = cs.filterByCoordinate(liveMaskDesc); + ASSERT_TRUE(isFut.status().ok()) + << isFut + .status(); // When this resolves, the selection object is updated. + + // auto rangeDescriptors = cs.range_descriptors(); + + // ASSERT_EQ(rangeDescriptors.size(), 2); + // EXPECT_EQ(rangeDescriptors[0].label.label(), "task") << "Expected first + // RangeDescriptor to be for the 'task' dimension"; + // EXPECT_EQ(rangeDescriptors[1].label.label(), "trace") << "Expected second + // RangeDescriptor to be for the 'trace' dimension"; + + // EXPECT_EQ(rangeDescriptors[0].start, 0) << "Expected first RangeDescriptor + // to start at index 0"; EXPECT_EQ(rangeDescriptors[0].stop, 15) << "Expected + // first RangeDescriptor to stop at index 15"; + // EXPECT_EQ(rangeDescriptors[0].step, 1) << "Expected first RangeDescriptor + // to have a step of 1"; + + // EXPECT_EQ(rangeDescriptors[1].start, 0) << "Expected second RangeDescriptor + // to start at index 0"; EXPECT_EQ(rangeDescriptors[1].stop, 256) << "Expected + // second RangeDescriptor to stop at index 256"; + // EXPECT_EQ(rangeDescriptors[1].step, 1) << "Expected second RangeDescriptor + // to have a step of 1"; } TEST(Intersection, get_inline_range) { - auto pathResult = SetupDataset(); - ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); - auto path = pathResult.value(); - - auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); - ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); - auto ds = dsFut.value(); - - mdio::CoordinateSelector cs(ds); - mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; - auto isFut = cs.filterByCoordinate(liveMaskDesc); - ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. - isFut = cs.filterByCoordinate(mdio::ValueDescriptor{"inline", 18}); - ASSERT_TRUE(isFut.status().ok()) << isFut.status(); - // auto rangeDescriptors = cs.range_descriptors(); - - // for (const auto& desc : rangeDescriptors) { - // std::cout << "Dimension: " << desc.label.label() << " Start: " << desc.start << " Stop: " << desc.stop << " Step: " << desc.step << std::endl; - // } - + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto path = pathResult.value(); + + auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); + ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); + auto ds = dsFut.value(); + + mdio::CoordinateSelector cs(ds); + mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; + auto isFut = cs.filterByCoordinate(liveMaskDesc); + ASSERT_TRUE(isFut.status().ok()) + << isFut + .status(); // When this resolves, the selection object is updated. + isFut = cs.filterByCoordinate(mdio::ValueDescriptor{"inline", 18}); + ASSERT_TRUE(isFut.status().ok()) << isFut.status(); + // auto rangeDescriptors = cs.range_descriptors(); + + // for (const auto& desc : rangeDescriptors) { + // std::cout << "Dimension: " << desc.label.label() << " Start: " << + // desc.start << " Stop: " << desc.stop << " Step: " << desc.step << + // std::endl; + // } } TEST(Intersection, get_inline_range_dead) { - auto pathResult = SetupDataset(); - ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); - auto path = pathResult.value(); - - auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); - ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); - auto ds = dsFut.value(); - - mdio::CoordinateSelector cs(ds); - mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; - auto isFut = cs.filterByCoordinate(liveMaskDesc); - ASSERT_TRUE(isFut.status().ok()) << isFut.status(); // When this resolves, the selection object is updated. - isFut = cs.filterByCoordinate(mdio::ValueDescriptor{"inline", 5000}); - EXPECT_FALSE(isFut.status().ok()) << "Expected an error when adding a selection for an invalid inline index"; - - // auto rangeDescriptors = is.range_descriptors(); - // for (const auto& desc : rangeDescriptors) { - // std::cout << "Dimension: " << desc.label.label() << " Start: " << desc.start << " Stop: " << desc.stop << " Step: " << desc.step << std::endl; - // } + auto pathResult = SetupDataset(); + ASSERT_TRUE(pathResult.status().ok()) << pathResult.status(); + auto path = pathResult.value(); + + auto dsFut = mdio::Dataset::Open(path, mdio::constants::kOpen); + ASSERT_TRUE(dsFut.status().ok()) << dsFut.status(); + auto ds = dsFut.value(); + + mdio::CoordinateSelector cs(ds); + mdio::ValueDescriptor liveMaskDesc = {"live_mask", true}; + auto isFut = cs.filterByCoordinate(liveMaskDesc); + ASSERT_TRUE(isFut.status().ok()) + << isFut + .status(); // When this resolves, the selection object is updated. + isFut = cs.filterByCoordinate(mdio::ValueDescriptor{"inline", 5000}); + EXPECT_FALSE(isFut.status().ok()) << "Expected an error when adding a " + "selection for an invalid inline index"; + + // auto rangeDescriptors = is.range_descriptors(); + // for (const auto& desc : rangeDescriptors) { + // std::cout << "Dimension: " << desc.label.label() << " Start: " << + // desc.start << " Stop: " << desc.stop << " Step: " << desc.step << + // std::endl; + // } } } // namespace diff --git a/mdio/dataset.h b/mdio/dataset.h index c0c3f7d..457b09b 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -619,44 +619,40 @@ class Dataset { * number of descriptors. */ Result isel(const std::vector>& slices) { - if (slices.empty()) { - return absl::InvalidArgumentError("No slices provided."); - } + if (slices.empty()) { + return absl::InvalidArgumentError("No slices provided."); + } - // 1) Group descriptors by their dimension label - std::map>> groups; - for (auto& desc : slices) { - groups[desc.label.label()].push_back(desc); - } + // 1) Group descriptors by their dimension label + std::map>> groups; + for (auto& desc : slices) { + groups[desc.label.label()].push_back(desc); + } - // 2) Walk through each dimension-group and break it into kMax-sized windows - Dataset current = *this; - for (auto& [label, descs] : groups) { - for (size_t i = 0; i < descs.size(); i += internal::kMaxNumSlices) { - size_t end = std::min(i + internal::kMaxNumSlices, descs.size()); - std::vector> window( - descs.begin() + i, descs.begin() + end); - - // 3) Pad this window up to kMax (if your impl still needs padding) - window.reserve(internal::kMaxNumSlices); - for (size_t p = window.size(); p < internal::kMaxNumSlices; ++p) { - window.emplace_back(RangeDescriptor{ - internal::kInertSliceKey, 0, 1, 1 - }); - } + // 2) Walk through each dimension-group and break it into kMax-sized windows + Dataset current = *this; + for (auto& [label, descs] : groups) { + for (size_t i = 0; i < descs.size(); i += internal::kMaxNumSlices) { + size_t end = std::min(i + internal::kMaxNumSlices, descs.size()); + std::vector> window(descs.begin() + i, + descs.begin() + end); + + // 3) Pad this window up to kMax (if your impl still needs padding) + window.reserve(internal::kMaxNumSlices); + for (size_t p = window.size(); p < internal::kMaxNumSlices; ++p) { + window.emplace_back( + RangeDescriptor{internal::kInertSliceKey, 0, 1, 1}); + } - MDIO_ASSIGN_OR_RETURN( - current, - current.call_isel_with_vector_impl( - window, - std::make_index_sequence{} - ) - ); + MDIO_ASSIGN_OR_RETURN( + current, + current.call_isel_with_vector_impl( + window, std::make_index_sequence{})); + } } - } - return current; -} + return current; + } /** * @brief Internal use only. @@ -864,7 +860,8 @@ class Dataset { // The map 'label_to_indices' is now populated with all the relevant // indices. You can now proceed with further processing based on this map. - return isel(static_cast>&>(slices)); + return isel( + static_cast>&>(slices)); } else if constexpr ((std::is_same_v>&>(slices)); + return isel( + static_cast>&>(slices)); } else { std::map> label_to_range; // pair.first = start, pair.second = stop @@ -991,7 +989,8 @@ class Dataset { "No slices could be made from the given descriptors."); } - return isel(static_cast>&>(slices)); + return isel( + static_cast>&>(slices)); } return absl::OkStatus(); diff --git a/mdio/mdio.h b/mdio/mdio.h index 2435723..082bc0d 100644 --- a/mdio/mdio.h +++ b/mdio/mdio.h @@ -21,7 +21,7 @@ #ifndef MDIO_MDIO_H_ #define MDIO_MDIO_H_ -#include "mdio/dataset.h" #include "mdio/coordinate_selector.h" +#include "mdio/dataset.h" #endif // MDIO_MDIO_H_ diff --git a/mdio/variable.h b/mdio/variable.h index 2880b5a..1991e93 100644 --- a/mdio/variable.h +++ b/mdio/variable.h @@ -1002,8 +1002,9 @@ class Variable { } if (slices.size() > internal::kMaxNumSlices) { - // We are expecting the only entry point for this method to be fro mthe Dataset::isel method. - // That method should handle the partitioning of the slices. + // We are expecting the only entry point for this method to be fro mthe + // Dataset::isel method. That method should handle the partitioning of the + // slices. return absl::InvalidArgumentError( absl::StrCat("Too many slices provided or implicitly generated. " "Maximum number of slices is ", @@ -1042,121 +1043,104 @@ class Variable { * @return An `mdio::Result` object containing the resulting sub-Variable. */ template -Result slice(const Descriptors&... descriptors) const { - // 1) Pack descriptors - constexpr size_t N = sizeof...(Descriptors); - std::array, N> descs = { descriptors... }; - - // 2) Clamp + precondition check - std::vector labels; - std::vector starts, stops, steps; - labels.reserve(N); - starts.reserve(N); - stops.reserve(N); - steps.reserve(N); - - int8_t bad_idx = -1; - for (size_t i = 0; i < N; ++i) { - auto d = sliceInRange(descs[i]); - if (d.start > d.stop) { - bad_idx = static_cast(i); - break; + Result slice(const Descriptors&... descriptors) const { + // 1) Pack descriptors + constexpr size_t N = sizeof...(Descriptors); + std::array, N> descs = {descriptors...}; + + // 2) Clamp + precondition check + std::vector labels; + std::vector starts, stops, steps; + labels.reserve(N); + starts.reserve(N); + stops.reserve(N); + steps.reserve(N); + + int8_t bad_idx = -1; + for (size_t i = 0; i < N; ++i) { + auto d = sliceInRange(descs[i]); + if (d.start > d.stop) { + bad_idx = static_cast(i); + break; + } + if (this->hasLabel(d.label)) { + labels.push_back(d.label); + starts.push_back(d.start); + stops.push_back(d.stop); + steps.push_back(d.step); + } } - if (this->hasLabel(d.label)) { - labels.push_back(d.label); - starts.push_back(d.start); - stops.push_back(d.stop); - steps.push_back(d.step); + if (bad_idx >= 0) { + auto& err = descs[bad_idx]; + return Result{absl::InvalidArgumentError( + std::string("Slice descriptor for ") + + std::string(err.label.label()) + " is invalid: start=" + + std::to_string(err.start) + " > stop=" + std::to_string(err.stop))}; } - } - if (bad_idx >= 0) { - auto& err = descs[bad_idx]; - return Result{absl::InvalidArgumentError( - std::string("Slice descriptor for ") + std::string(err.label.label()) + - " is invalid: start=" + std::to_string(err.start) + - " > stop=" + std::to_string(err.stop) - )}; - } - // 3) Fast path: all labels (or axis indices) are unique - if (!labels.empty()) { - std::set labelSet; - std::set indexSet; - for (auto& lab : labels) { - labelSet.insert(lab.label()); - indexSet.insert(lab.index()); - } - if (labelSet.size() == labels.size() || - indexSet.size() == labels.size()) { - MDIO_ASSIGN_OR_RETURN( - auto slice_store, - store | tensorstore::Dims(labels) - .HalfOpenInterval(starts, stops, steps) - ); - return Variable{ - variableName, - longName, - metadata, - std::move(slice_store), - attributes - }; + // 3) Fast path: all labels (or axis indices) are unique + if (!labels.empty()) { + std::set labelSet; + std::set indexSet; + for (auto& lab : labels) { + labelSet.insert(lab.label()); + indexSet.insert(lab.index()); + } + if (labelSet.size() == labels.size() || + indexSet.size() == labels.size()) { + MDIO_ASSIGN_OR_RETURN( + auto slice_store, + store | tensorstore::Dims(labels).HalfOpenInterval(starts, stops, + steps)); + return Variable{variableName, longName, metadata, + std::move(slice_store), attributes}; + } } - } - // 4) Group by label to find any duplicates - std::map>> by_label; - for (auto& d : descs) { - if (d.label.label() != internal::kInertSliceKey) { - by_label[d.label.label()].push_back(d); + // 4) Group by label to find any duplicates + std::map>> by_label; + for (auto& d : descs) { + if (d.label.label() != internal::kInertSliceKey) { + by_label[d.label.label()].push_back(d); + } } - } - // 5) Handle the first label that has >1 descriptor - for (auto& [label, vec] : by_label) { - if (vec.size() > 1) { - // 5a) Unwrap the Spec so we can ask for transform().input_labels() - MDIO_ASSIGN_OR_RETURN(auto spec, store.spec()); - auto spec_labels = spec.transform().input_labels(); - - // find the numeric axis for this label - auto it = std::find(spec_labels.begin(), - spec_labels.end(), - label); - if (it == spec_labels.end()) { - // no-op if the label isn't in the spec; skip it - continue; - } - int axis = static_cast(std::distance(spec_labels.begin(), it)); - - // 5b) Slice each sub‑range in isolation - std::vector> pieces; - pieces.reserve(vec.size()); - for (auto& r : vec) { - auto sub = slice(r); - if (!sub.status().ok()) return sub.status(); - pieces.push_back(sub.value().get_store()); - } + // 5) Handle the first label that has >1 descriptor + for (auto& [label, vec] : by_label) { + if (vec.size() > 1) { + // 5a) Unwrap the Spec so we can ask for transform().input_labels() + MDIO_ASSIGN_OR_RETURN(auto spec, store.spec()); + auto spec_labels = spec.transform().input_labels(); + + // find the numeric axis for this label + auto it = std::find(spec_labels.begin(), spec_labels.end(), label); + if (it == spec_labels.end()) { + // no-op if the label isn't in the spec; skip it + continue; + } + int axis = static_cast(std::distance(spec_labels.begin(), it)); + + // 5b) Slice each sub‑range in isolation + std::vector> pieces; + pieces.reserve(vec.size()); + for (auto& r : vec) { + auto sub = slice(r); + if (!sub.status().ok()) return sub.status(); + pieces.push_back(sub.value().get_store()); + } + + // 5c) Concatenate them along the correct axis + MDIO_ASSIGN_OR_RETURN(auto cat_store, + tensorstore::Concat(pieces, axis)); - // 5c) Concatenate them along the correct axis - MDIO_ASSIGN_OR_RETURN( - auto cat_store, - tensorstore::Concat(pieces, axis) - ); - - return Variable{ - variableName, - longName, - metadata, - std::move(cat_store), - attributes - }; + return Variable{variableName, longName, metadata, std::move(cat_store), + attributes}; + } } - } - // 6) No descriptors matched → no change - return *this; -} + // 6) No descriptors matched → no change + return *this; + } /** * @brief Retrieves the spec of the Variable as a JSON object. @@ -1513,7 +1497,9 @@ Result slice(const Descriptors&... descriptors) const { const tensorstore::TensorStore& get_store() const { return store; } tensorstore::TensorStore& get_mutable_store() { return store; } - void set_store(tensorstore::TensorStore& new_store) { store = new_store; } + void set_store(tensorstore::TensorStore& new_store) { + store = new_store; + } // The data that should remain static, but MAY need to be updated. std::shared_ptr> attributes; From 2ef5f9c35d619bb90008dd12e7a8e8b8e50eab47 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 21 May 2025 16:36:56 +0000 Subject: [PATCH 18/24] Linting --- mdio/coordinate_selector.h | 37 +++++++++++++++++++++++++++++-------- mdio/dataset.h | 1 + mdio/variable.h | 4 +++- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/mdio/coordinate_selector.h b/mdio/coordinate_selector.h index 95f4b69..4fb3877 100644 --- a/mdio/coordinate_selector.h +++ b/mdio/coordinate_selector.h @@ -1,5 +1,24 @@ -#ifndef MDIO_INTERSECTION_H -#define MDIO_INTERSECTION_H +// Copyright 2025 TGS + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MDIO_COORDINATE_SELECTOR_H_ +#define MDIO_COORDINATE_SELECTOR_H_ + +#include +#include +#include +#include #include "mdio/dataset.h" #include "mdio/impl.h" @@ -425,8 +444,8 @@ class CoordinateSelector { } kept_runs_ = _from_intervals( - new_runs); // TODO: We need to ensure we don't accidentally drop any - // pre-sliced dimensions... + new_runs); // TODO(BrianMichell): We need to ensure we don't + // accidentally drop any pre-sliced dimensions... return absl::OkStatus(); } @@ -451,7 +470,8 @@ class CoordinateSelector { /// Advance a multidimensional odometer position by one step. template void _current_position_increment( - std::vector::Interval>& position, + std::vector::Interval>& + position, // NOLINT (non-const) const std::vector::Interval>& interval) const { for (std::size_t d = position.size(); d-- > 0;) { if (position[d].inclusive_min + 1 < interval[d].exclusive_max) { @@ -464,7 +484,8 @@ class CoordinateSelector { template void _current_position_stride( - std::vector::Interval>& position, + std::vector::Interval>& + position, // NOLINT (non-const) const std::vector::Interval>& interval, const std::size_t num_elements) { auto dims = position.size(); @@ -481,7 +502,7 @@ class CoordinateSelector { template Result, const T*, Index, Index>> _resolve_future( - Future>& fut) { + Future>& fut) { // NOLINT (non-const) if (!fut.status().ok()) return fut.status(); auto data = fut.value(); const T* data_ptr = data.get_data_accessor().data(); @@ -493,4 +514,4 @@ class CoordinateSelector { } // namespace mdio -#endif // MDIO_INTERSECTION_H +#endif // MDIO_COORDINATE_SELECTOR_H_ diff --git a/mdio/dataset.h b/mdio/dataset.h index 457b09b..da96a59 100644 --- a/mdio/dataset.h +++ b/mdio/dataset.h @@ -16,6 +16,7 @@ #define MDIO_API_VERSION "1.0.0" +#include #include #include #include diff --git a/mdio/variable.h b/mdio/variable.h index 1991e93..c1142b1 100644 --- a/mdio/variable.h +++ b/mdio/variable.h @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -1497,7 +1498,8 @@ class Variable { const tensorstore::TensorStore& get_store() const { return store; } tensorstore::TensorStore& get_mutable_store() { return store; } - void set_store(tensorstore::TensorStore& new_store) { + void set_store( + tensorstore::TensorStore& new_store) { // NOLINT (non-const) store = new_store; } From d2a6264900309c88dfe46892c27ce834608d4a0d Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 21 May 2025 18:47:07 +0000 Subject: [PATCH 19/24] Fix test build --- mdio/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mdio/CMakeLists.txt b/mdio/CMakeLists.txt index f57e284..47fc89f 100644 --- a/mdio/CMakeLists.txt +++ b/mdio/CMakeLists.txt @@ -337,9 +337,9 @@ mdio_cc_test( mdio_cc_test( NAME - intersection_test + coordinate_selector_test SRCS - intersection_test.cc + coordinate_selector_test.cc COPTS ${mdio_DEFAULT_COPTS} LINKOPTS From 35975d7f718ce5bafd4cd0df75fc5ab7be4f3165 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 21 May 2025 18:53:46 +0000 Subject: [PATCH 20/24] Run coordinate selector tests --- .github/workflows/cmake_build.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cmake_build.yaml b/.github/workflows/cmake_build.yaml index a5cb813..e7c65e9 100644 --- a/.github/workflows/cmake_build.yaml +++ b/.github/workflows/cmake_build.yaml @@ -51,4 +51,5 @@ jobs: && ./mdio_stats_test \ && ./mdio_utils_trim_test \ && ./mdio_utils_delete_test \ - && ./mdio_variable_collection_test + && ./mdio_variable_collection_test \ + && ./mdio_coordinate_selector_test \ No newline at end of file From 0dcd219e735a57d45059cd0f15b0752b5f993a4c Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 22 May 2025 13:34:21 +0000 Subject: [PATCH 21/24] Remove const qualifier for dataset --- mdio/coordinate_selector.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mdio/coordinate_selector.h b/mdio/coordinate_selector.h index 4fb3877..6de49a6 100644 --- a/mdio/coordinate_selector.h +++ b/mdio/coordinate_selector.h @@ -80,7 +80,7 @@ inline constexpr bool is_sort_key_v = is_sort_key>::value; class CoordinateSelector { public: /// Construct from an existing Dataset (captures its full domain). - explicit CoordinateSelector(const Dataset& dataset) + explicit CoordinateSelector(Dataset& dataset) : dataset_(dataset), base_domain_(dataset.domain) {} template @@ -117,14 +117,13 @@ class CoordinateSelector { #ifdef MDIO_INTERNAL_PROFILING auto start = std::chrono::high_resolution_clock::now(); #endif - auto non_const_ds = dataset_; const size_t n = kept_runs_.size(); // 1) Fire off all reads in parallel and gather the key values std::vector>> reads; reads.reserve(n); for (auto const& desc : kept_runs_) { - MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(desc)); + MDIO_ASSIGN_OR_RETURN(auto ds, dataset_.isel(desc)); MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(sort_key)); reads.push_back(var.Read()); } @@ -186,13 +185,12 @@ class CoordinateSelector { #ifdef MDIO_INTERNAL_PROFILING auto start = std::chrono::high_resolution_clock::now(); #endif - auto non_const_ds = dataset_; std::vector>> reads; reads.reserve(kept_runs_.size()); std::vector ret; for (const auto& desc : kept_runs_) { - MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(desc)); + MDIO_ASSIGN_OR_RETURN(auto ds, dataset_.isel(desc)); MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(output_variable)); auto fut = var.Read(); reads.push_back(fut); @@ -226,7 +224,7 @@ class CoordinateSelector { } private: - const Dataset& dataset_; + Dataset& dataset_; tensorstore::IndexDomain<> base_domain_; std::vector>> kept_runs_; @@ -365,12 +363,11 @@ class CoordinateSelector { // until the Intervals are no longer needed. stored_intervals.reserve(kept_runs_.size()); - auto non_const_ds = dataset_; bool is_first_run = true; for (const auto& desc : kept_runs_) { - MDIO_ASSIGN_OR_RETURN(auto ds, non_const_ds.isel(desc)); + MDIO_ASSIGN_OR_RETURN(auto ds, dataset_.isel(desc)); MDIO_ASSIGN_OR_RETURN( auto var, ds.variables.get(std::string(descriptor.label.label()))); auto fut = var.Read(); From 2f2eb4be0948647ad779b0ad8984b6d991efce32 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 22 May 2025 13:34:39 +0000 Subject: [PATCH 22/24] Update API to read multiple Variables --- mdio/coordinate_selector.h | 59 ++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/mdio/coordinate_selector.h b/mdio/coordinate_selector.h index 6de49a6..684a442 100644 --- a/mdio/coordinate_selector.h +++ b/mdio/coordinate_selector.h @@ -83,16 +83,26 @@ class CoordinateSelector { explicit CoordinateSelector(Dataset& dataset) : dataset_(dataset), base_domain_(dataset.domain) {} - template - Future> ReadDataVariable(const std::string& data_variable, - Ops const&... ops) { - // 1) apply filters & sorts in the exact order given + template + Future...>> ReadDataVariables( + std::vector const& data_variables, + Ops const&... ops) + { + if (data_variables.size() != sizeof...(OutTs)) { + return absl::InvalidArgumentError( + "ReadDataVariables: number of names must match number of OutTs"); + } + // 1) apply all filters & sorts in order absl::Status st = absl::OkStatus(); ((st = st.ok() ? _applyOp(ops).status() : st), ...); if (!st.ok()) return st; - // 2) finally read out the requested variable - return readSelection(data_variable); + // 2) kick off and await all reads + return _readMultiple(data_variables); + } + + void reset() { + kept_runs_.clear(); } /** @@ -249,6 +259,43 @@ class CoordinateSelector { } } + // helper: expands readSelection(vars[I])... + template + Future...>> _readMultipleImpl( + std::vector const& vars, + std::index_sequence) + { + // 1) start all reads + auto futs = std::make_tuple( + readSelection(vars[I])... + ); + + // 2) wait on them in order + absl::Status st = absl::OkStatus(); + std::tuple...> results; + // fold over I... + ( + [&](){ + if (!st.ok()) return; + auto& f = std::get(futs); + st = f.status(); + if (st.ok()) std::get(results) = std::move(f.value()); + }(), + ... + ); + if (!st.ok()) return st; + return results; + } + + template + Future...>> _readMultiple( + std::vector const& vars) + { + return _readMultipleImpl( + vars, std::index_sequence_for{} + ); + } + /* TODO: The built RangeDescriptors aren't behaving as I hoped. They are building the longest runs possible properly, however From 901d3285e17e21271518e1231bdba3a402a9a702 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 22 May 2025 15:35:23 +0000 Subject: [PATCH 23/24] Begin type erasure and local caching optimizations --- mdio/coordinate_selector.h | 59 ++++++++++++++++++++++++-------------- mdio/variable.h | 3 ++ 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/mdio/coordinate_selector.h b/mdio/coordinate_selector.h index 684a442..8b1e2f3 100644 --- a/mdio/coordinate_selector.h +++ b/mdio/coordinate_selector.h @@ -130,11 +130,11 @@ class CoordinateSelector { const size_t n = kept_runs_.size(); // 1) Fire off all reads in parallel and gather the key values - std::vector>> reads; + std::vector>> reads; reads.reserve(n); for (auto const& desc : kept_runs_) { MDIO_ASSIGN_OR_RETURN(auto ds, dataset_.isel(desc)); - MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(sort_key)); + MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.at(sort_key)); reads.push_back(var.Read()); } @@ -149,9 +149,9 @@ class CoordinateSelector { // if (!f.status().ok()) return f.status(); // auto data = f.value(); // keys.push_back(data.get_data_accessor().data()[data.get_flattened_offset()]); - MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(f)); + MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(f)); auto data = std::get<0>(resolution); - auto data_ptr = std::get<1>(resolution); + auto data_ptr = static_cast(std::get<1>(resolution)); auto offset = std::get<2>(resolution); // auto n = std::get<3>(resolution); // Not required keys.push_back(data_ptr[offset]); @@ -195,13 +195,13 @@ class CoordinateSelector { #ifdef MDIO_INTERNAL_PROFILING auto start = std::chrono::high_resolution_clock::now(); #endif - std::vector>> reads; + std::vector>> reads; reads.reserve(kept_runs_.size()); std::vector ret; for (const auto& desc : kept_runs_) { MDIO_ASSIGN_OR_RETURN(auto ds, dataset_.isel(desc)); - MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.get(output_variable)); + MDIO_ASSIGN_OR_RETURN(auto var, ds.variables.at(output_variable)); auto fut = var.Read(); reads.push_back(fut); if (var.rank() == 1) { @@ -216,9 +216,9 @@ class CoordinateSelector { #endif for (auto& f : reads) { - MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(f)); + MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(f)); auto data = std::get<0>(resolution); - auto data_ptr = std::get<1>(resolution); + auto data_ptr = static_cast(std::get<1>(resolution)); auto offset = std::get<2>(resolution); auto n = std::get<3>(resolution); std::vector buffer(n); @@ -237,6 +237,7 @@ class CoordinateSelector { Dataset& dataset_; tensorstore::IndexDomain<> base_domain_; std::vector>> kept_runs_; + std::map> cached_variables_; template Future _applyOp(D const& op) { @@ -310,19 +311,35 @@ class CoordinateSelector { template Future _init_runs(const ValueDescriptor& descriptor) { - using Interval = typename Variable::Interval; + using Interval = typename Variable::Interval; #ifdef MDIO_INTERNAL_PROFILING auto start = std::chrono::high_resolution_clock::now(); #endif - MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.get( + MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.at( std::string(descriptor.label.label()))); - auto fut = var.Read(); + + const T* data_ptr; + Index offset; + Index n_samples; MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); - MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(fut)); - auto data = std::get<0>(resolution); - auto data_ptr = std::get<1>(resolution); - auto offset = std::get<2>(resolution); - auto n_samples = std::get<3>(resolution); + if (cached_variables_.find(descriptor.label.label()) == cached_variables_.end()) { + // TODO(BrianMichell): Ensure that the domain has not changed. + std::cout << "Reading VariableData" << std::endl; + auto fut = var.Read(); + MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(fut)); + auto dataToCache = std::get<0>(resolution); + cached_variables_.insert_or_assign(descriptor.label.label(), std::move(dataToCache)); + } + auto it = cached_variables_.find(descriptor.label.label()); + if (it == cached_variables_.end()) { + std::stringstream ss; + ss << "Cached variable not found for coordinate '" << descriptor.label.label() << "'"; + return absl::NotFoundError(ss.str()); + } + auto& data = it->second; + data_ptr = static_cast(data.get_data_accessor().data()); + offset = data.get_flattened_offset(); + n_samples = data.num_samples(); auto current_pos = intervals; bool isInRun = false; @@ -343,7 +360,8 @@ class CoordinateSelector { // The start of a new run isInRun = true; for (auto i = run_idx; i < idx; ++i) { - _current_position_increment(current_pos, intervals); + // _current_position_increment(current_pos, intervals); + _current_position_increment(current_pos, intervals); } // _current_position_stride(current_pos, intervals, idx - run_idx); run_idx = idx; @@ -358,16 +376,15 @@ class CoordinateSelector { // Use 1 less than the current index to ensure we get the correct end // location. for (auto i = run_idx; i < idx - 1; ++i) { - _current_position_increment(current_pos, intervals); + _current_position_increment(current_pos, intervals); } - // _current_position_stride(current_pos, intervals, idx - run_idx); run_idx = idx; auto& last_run = local_runs.back(); for (auto i = 0; i < current_pos.size(); ++i) { last_run[i].exclusive_max = current_pos[i].inclusive_min + 1; } // We need to advance to the actual current position - _current_position_increment(current_pos, intervals); + _current_position_increment(current_pos, intervals); } else if (!is_match && !isInRun) { // No run at all // do nothing TODO: Remove me @@ -388,7 +405,7 @@ class CoordinateSelector { return absl::NotFoundError(ss.str()); } - kept_runs_ = _from_intervals(local_runs); + kept_runs_ = _from_intervals(local_runs); #ifdef MDIO_INTERNAL_PROFILING std::cout << "Finalize time... "; timer(start); diff --git a/mdio/variable.h b/mdio/variable.h index c1142b1..5d2a24c 100644 --- a/mdio/variable.h +++ b/mdio/variable.h @@ -576,6 +576,9 @@ Future> OpenVariable(const nlohmann::json& json_store, } // the negative of this is valid for tensorstore ... + // TODO(BrianMichell): Look into making the recheck_cached_data an open option. + // store_spec["recheck_cached_data"] = false; // This could become problematic if we are doing read/write operations. + auto spec = tensorstore::MakeReadyFuture<::nlohmann::json>(store_spec); // open a store: From 483b2ed5b4679f0ce8a0a93e63b18a0f0ccd619e Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 22 May 2025 15:46:39 +0000 Subject: [PATCH 24/24] Linting --- mdio/coordinate_selector.h | 66 +++++++++++++++++--------------------- mdio/variable.h | 5 +-- 2 files changed, 32 insertions(+), 39 deletions(-) diff --git a/mdio/coordinate_selector.h b/mdio/coordinate_selector.h index 8b1e2f3..43e952c 100644 --- a/mdio/coordinate_selector.h +++ b/mdio/coordinate_selector.h @@ -15,6 +15,7 @@ #ifndef MDIO_COORDINATE_SELECTOR_H_ #define MDIO_COORDINATE_SELECTOR_H_ +#include #include #include #include @@ -80,17 +81,15 @@ inline constexpr bool is_sort_key_v = is_sort_key>::value; class CoordinateSelector { public: /// Construct from an existing Dataset (captures its full domain). - explicit CoordinateSelector(Dataset& dataset) + explicit CoordinateSelector(Dataset& dataset) // NOLINT (non-const) : dataset_(dataset), base_domain_(dataset.domain) {} - template + template Future...>> ReadDataVariables( - std::vector const& data_variables, - Ops const&... ops) - { + std::vector const& data_variables, Ops const&... ops) { if (data_variables.size() != sizeof...(OutTs)) { return absl::InvalidArgumentError( - "ReadDataVariables: number of names must match number of OutTs"); + "ReadDataVariables: number of names must match number of OutTs"); } // 1) apply all filters & sorts in order absl::Status st = absl::OkStatus(); @@ -101,9 +100,7 @@ class CoordinateSelector { return _readMultiple(data_variables); } - void reset() { - kept_runs_.clear(); - } + void reset() { kept_runs_.clear(); } /** * @brief Filter the Dataset by the given coordinate. @@ -261,40 +258,33 @@ class CoordinateSelector { } // helper: expands readSelection(vars[I])... - template + template Future...>> _readMultipleImpl( - std::vector const& vars, - std::index_sequence) - { + std::vector const& vars, std::index_sequence) { // 1) start all reads - auto futs = std::make_tuple( - readSelection(vars[I])... - ); + auto futs = std::make_tuple(readSelection(vars[I])...); // 2) wait on them in order absl::Status st = absl::OkStatus(); std::tuple...> results; // fold over I... ( - [&](){ - if (!st.ok()) return; - auto& f = std::get(futs); - st = f.status(); - if (st.ok()) std::get(results) = std::move(f.value()); - }(), - ... - ); + [&]() { + if (!st.ok()) return; + auto& f = std::get(futs); + st = f.status(); + if (st.ok()) std::get(results) = std::move(f.value()); + }(), + ...); if (!st.ok()) return st; return results; } - template + template Future...>> _readMultiple( - std::vector const& vars) - { - return _readMultipleImpl( - vars, std::index_sequence_for{} - ); + std::vector const& vars) { + return _readMultipleImpl(vars, + std::index_sequence_for{}); } /* @@ -315,28 +305,31 @@ class CoordinateSelector { #ifdef MDIO_INTERNAL_PROFILING auto start = std::chrono::high_resolution_clock::now(); #endif - MDIO_ASSIGN_OR_RETURN(auto var, dataset_.variables.at( - std::string(descriptor.label.label()))); + MDIO_ASSIGN_OR_RETURN( + auto var, dataset_.variables.at(std::string(descriptor.label.label()))); const T* data_ptr; Index offset; Index n_samples; MDIO_ASSIGN_OR_RETURN(auto intervals, var.get_intervals()); - if (cached_variables_.find(descriptor.label.label()) == cached_variables_.end()) { + if (cached_variables_.find(descriptor.label.label()) == + cached_variables_.end()) { // TODO(BrianMichell): Ensure that the domain has not changed. std::cout << "Reading VariableData" << std::endl; auto fut = var.Read(); MDIO_ASSIGN_OR_RETURN(auto resolution, _resolve_future(fut)); auto dataToCache = std::get<0>(resolution); - cached_variables_.insert_or_assign(descriptor.label.label(), std::move(dataToCache)); + cached_variables_.insert_or_assign(descriptor.label.label(), + std::move(dataToCache)); } auto it = cached_variables_.find(descriptor.label.label()); if (it == cached_variables_.end()) { std::stringstream ss; - ss << "Cached variable not found for coordinate '" << descriptor.label.label() << "'"; + ss << "Cached variable not found for coordinate '" + << descriptor.label.label() << "'"; return absl::NotFoundError(ss.str()); } - auto& data = it->second; + auto& data = it->second; data_ptr = static_cast(data.get_data_accessor().data()); offset = data.get_flattened_offset(); n_samples = data.num_samples(); @@ -427,7 +420,6 @@ class CoordinateSelector { // until the Intervals are no longer needed. stored_intervals.reserve(kept_runs_.size()); - bool is_first_run = true; for (const auto& desc : kept_runs_) { diff --git a/mdio/variable.h b/mdio/variable.h index 5d2a24c..a8b5b2e 100644 --- a/mdio/variable.h +++ b/mdio/variable.h @@ -576,8 +576,9 @@ Future> OpenVariable(const nlohmann::json& json_store, } // the negative of this is valid for tensorstore ... - // TODO(BrianMichell): Look into making the recheck_cached_data an open option. - // store_spec["recheck_cached_data"] = false; // This could become problematic if we are doing read/write operations. + // TODO(BrianMichell): Look into making the recheck_cached_data an open + // option. store_spec["recheck_cached_data"] = false; // This could become + // problematic if we are doing read/write operations. auto spec = tensorstore::MakeReadyFuture<::nlohmann::json>(store_spec);