From 6b6e675022ad05bb2e289c631cebf087daf84f31 Mon Sep 17 00:00:00 2001
From: Hyukjin Kwon
Date: Fri, 9 Jan 2026 11:07:21 +0900
Subject: [PATCH] [C++] Handle only relevant slices of child arrays when hashing scalars from ListArrays

In ScalarHashImpl, FIXED_SIZE_LIST parents now hash only the child range
that starts at offset * list_size and spans length * list_size elements.
Previously they fell through to the default branch, which hashes every
child array in full without considering the parent's offset and length.

Add C++ and Python regression tests asserting that equal nested
fixed-size-list scalars taken from different arrays hash equally.

---
 cpp/src/arrow/scalar.cc              | 16 ++++++++++++----
 cpp/src/arrow/scalar_test.cc         | 12 ++++++++++++
 python/pyarrow/tests/test_scalars.py |  9 +++++++++
 3 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index ad2c66a928a..b00362e5672 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -179,17 +179,25 @@ struct ScalarHashImpl {
 
     // Hash the relevant child arrays for each type taking offset and length
     // from the parent array into account if necessary.
+    // - STRUCT: children share parent's offset/length
+    // - FIXED_SIZE_LIST: children use offset*list_size, length*list_size
+    // - Others (LIST, MAP, UNION, LIST_VIEW): have their own offset mechanisms
     switch (a.type->id()) {
       case Type::STRUCT:
         for (const auto& child : a.child_data) {
           RETURN_NOT_OK(ArrayHash(child, offset, length));
         }
         break;
-        // TODO(GH-35830): Investigate what should be the correct behavior for
-        // each nested type.
+      case Type::FIXED_SIZE_LIST: {
+        const auto& list_type = checked_cast<const FixedSizeListType&>(*a.type);
+        const int32_t list_size = list_type.list_size();
+        for (const auto& child : a.child_data) {
+          RETURN_NOT_OK(ArrayHash(child, offset * list_size, length * list_size));
+        }
+        break;
+      }
       default:
-        // By default, just hash the arrays without considering
-        // the offset and length of the parent.
+        // LIST, MAP, UNION, LIST_VIEW have their own offset mechanisms
         for (const auto& child : a.child_data) {
           RETURN_NOT_OK(ArrayHash(child));
         }
diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc
index 4a34e5d13c2..fc3fa283ba1 100644
--- a/cpp/src/arrow/scalar_test.cc
+++ b/cpp/src/arrow/scalar_test.cc
@@ -1419,6 +1419,18 @@ TEST(TestFixedSizeListScalar, Cast) {
   ASSERT_EQ(casted_str->ToString(), scalar.ToString());
 }
 
+TEST(TestFixedSizeListScalar, Hashing) {
+  auto inner_type = fixed_size_list(int32(), 2);
+  auto outer_type = fixed_size_list(inner_type, 3);
+  auto g = ArrayFromJSON(outer_type,
+                         "[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]");
+  auto h = ArrayFromJSON(outer_type, "[[[7, 8], [9, 10], [11, 12]]]");
+  ASSERT_OK_AND_ASSIGN(auto g1, g->GetScalar(1));
+  ASSERT_OK_AND_ASSIGN(auto h0, h->GetScalar(0));
+  ASSERT_EQ(*g1, *h0);
+  ASSERT_EQ(g1->hash(), h0->hash());
+}
+
 TEST(TestMapScalar, Basics) {
   auto value =
       ArrayFromJSON(struct_({field("key", utf8(), false), field("value", int8())}),
diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py
index 65f0c608136..ec023ad6614 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -193,6 +193,15 @@ def test_hashing_struct_scalar():
     assert hash1 == hash2
 
 
+def test_hashing_fixed_size_list_scalar():
+    # GH-35830
+    inner = pa.list_(pa.int32(), 2)
+    outer = pa.list_(inner, 3)
+    g = pa.array([[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]], type=outer)
+    h = pa.array([[[7, 8], [9, 10], [11, 12]]], type=outer)
+    assert hash(g[1]) == hash(h[0])
+
+
 @pytest.mark.timezone_data
 def test_timestamp_scalar():
     a = repr(pa.scalar("0000-01-01").cast(pa.timestamp("s")))