From 3f9a883897e996aaf04b220f73d3b532528fa241 Mon Sep 17 00:00:00 2001
From: Sutou Kouhei
Date: Tue, 16 Dec 2025 14:20:28 +0900
Subject: [PATCH] GH-48551: [Ruby] Add support for reading large UTF-8 array

---
Review notes (text between "---" and the first "diff --git" is not part
of the commit):

* array.rb: adds LargeUTF8Array, a VariableSizeBinaryLayoutArray subclass
  whose private buffer_type returns :s64 (big-endian support is still a
  TODO) and whose private encoding returns Encoding::UTF_8.
* type.rb: adds LargeUTF8Type, a VariableSizeBinaryType subclass named
  "LargeUTF8" with a memoized LargeUTF8Type.singleton; its build_array
  returns a LargeUTF8Array.
* file-reader.rb: requires the fixed_size_binary and large_utf8 flatbuf
  definitions and makes read_field map Flatbuf::LargeUtf8 to
  LargeUTF8Type.singleton.
* test-file-reader.rb: adds a "LargeUTF8" case that builds an
  Arrow::LargeStringArray of ["Hello", nil, "World"] and asserts that
  reading returns those values.
* NOTE(review): this patch was recovered from a copy whose line breaks
  and leading whitespace had been collapsed. Indentation below was
  reconstructed from the hunk line counts and 2-space Ruby style; confirm
  it against the tree before applying.

 .../lib/arrow-format/array.rb                 | 11 ++++++++++
 .../lib/arrow-format/file-reader.rb           |  4 ++++
 .../red-arrow-format/lib/arrow-format/type.rb | 20 +++++++++++++++++++
 .../red-arrow-format/test/test-file-reader.rb | 11 ++++++++++
 4 files changed, 46 insertions(+)

diff --git a/ruby/red-arrow-format/lib/arrow-format/array.rb b/ruby/red-arrow-format/lib/arrow-format/array.rb
index 23969bc24bd..edf8326f3d4 100644
--- a/ruby/red-arrow-format/lib/arrow-format/array.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/array.rb
@@ -235,6 +235,17 @@ def encoding
     end
   end
 
+  class LargeUTF8Array < VariableSizeBinaryLayoutArray
+    private
+    def buffer_type
+      :s64 # TODO: big endian support
+    end
+
+    def encoding
+      Encoding::UTF_8
+    end
+  end
+
   class FixedSizeBinaryArray < Array
     def initialize(type, size, validity_buffer, values_buffer)
       super(type, size, validity_buffer)
diff --git a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
index 8b149cd1756..f08e3e448e3 100644
--- a/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/file-reader.rb
@@ -26,11 +26,13 @@
 require_relative "org/apache/arrow/flatbuf/bool"
 require_relative "org/apache/arrow/flatbuf/date"
 require_relative "org/apache/arrow/flatbuf/date_unit"
+require_relative "org/apache/arrow/flatbuf/fixed_size_binary"
 require_relative "org/apache/arrow/flatbuf/floating_point"
 require_relative "org/apache/arrow/flatbuf/footer"
 require_relative "org/apache/arrow/flatbuf/int"
 require_relative "org/apache/arrow/flatbuf/large_binary"
 require_relative "org/apache/arrow/flatbuf/large_list"
+require_relative "org/apache/arrow/flatbuf/large_utf8"
 require_relative "org/apache/arrow/flatbuf/list"
 require_relative "org/apache/arrow/flatbuf/map"
 require_relative "org/apache/arrow/flatbuf/message"
@@ -232,6 +234,8 @@ def read_field(fb_field)
         type = LargeBinaryType.singleton
       when Org::Apache::Arrow::Flatbuf::Utf8
         type = UTF8Type.singleton
+      when Org::Apache::Arrow::Flatbuf::LargeUtf8
+        type = LargeUTF8Type.singleton
       when Org::Apache::Arrow::Flatbuf::FixedSizeBinary
         type = FixedSizeBinaryType.new(fb_type.byte_width)
       end
diff --git a/ruby/red-arrow-format/lib/arrow-format/type.rb b/ruby/red-arrow-format/lib/arrow-format/type.rb
index 055c0890c4b..a66fe08f25a 100644
--- a/ruby/red-arrow-format/lib/arrow-format/type.rb
+++ b/ruby/red-arrow-format/lib/arrow-format/type.rb
@@ -353,6 +353,26 @@ def build_array(size, validity_buffer, offsets_buffer, values_buffer)
     end
   end
 
+  class LargeUTF8Type < VariableSizeBinaryType
+    class << self
+      def singleton
+        @singleton ||= new
+      end
+    end
+
+    def initialize
+      super("LargeUTF8")
+    end
+
+    def build_array(size, validity_buffer, offsets_buffer, values_buffer)
+      LargeUTF8Array.new(self,
+                         size,
+                         validity_buffer,
+                         offsets_buffer,
+                         values_buffer)
+    end
+  end
+
   class FixedSizeBinaryType < Type
     attr_reader :byte_width
     def initialize(byte_width)
diff --git a/ruby/red-arrow-format/test/test-file-reader.rb b/ruby/red-arrow-format/test/test-file-reader.rb
index b31e8940458..c66948518bb 100644
--- a/ruby/red-arrow-format/test/test-file-reader.rb
+++ b/ruby/red-arrow-format/test/test-file-reader.rb
@@ -358,6 +358,17 @@ def test_read
     end
   end
 
+  sub_test_case("LargeUTF8") do
+    def build_array
+      Arrow::LargeStringArray.new(["Hello", nil, "World"])
+    end
+
+    def test_read
+      assert_equal([{"value" => ["Hello", nil, "World"]}],
+                   read)
+    end
+  end
+
   sub_test_case("FixedSizeBinary") do
     def build_array
       data_type = Arrow::FixedSizeBinaryDataType.new(4)