diff --git a/Cargo.lock b/Cargo.lock index ede8317..3788d6d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -659,7 +659,7 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand", + "rand 0.9.2", "regex", "sqlparser", "tempfile", @@ -774,7 +774,7 @@ dependencies = [ "itertools 0.14.0", "log", "object_store", - "rand", + "rand 0.9.2", "tokio", "url", ] @@ -900,7 +900,7 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand", + "rand 0.9.2", "tempfile", "url", ] @@ -961,7 +961,7 @@ dependencies = [ "itertools 0.14.0", "log", "num-traits", - "rand", + "rand 0.9.2", "regex", "unicode-segmentation", "uuid", @@ -1640,7 +1640,7 @@ dependencies = [ [[package]] name = "geodatafusion" -version = "0.3.0" +version = "0.4.0-hotdata.1" dependencies = [ "approx", "arrow-arith", @@ -1653,7 +1653,10 @@ dependencies = [ "geoarrow-expr-geo", "geoarrow-schema", "geohash", + "geojson", "object_store", + "serde_json", + "spatialbench", "thiserror 1.0.69", "tokio", "wkt 0.14.0", @@ -1661,7 +1664,7 @@ dependencies = [ [[package]] name = "geodatafusion-flatgeobuf" -version = "0.3.0" +version = "0.4.0-hotdata.1" dependencies = [ "approx", "arrow-array", @@ -1685,7 +1688,7 @@ dependencies = [ [[package]] name = "geodatafusion-geojson" -version = "0.3.0" +version = "0.4.0-hotdata.1" dependencies = [ "arrow-array", "arrow-schema", @@ -1707,7 +1710,7 @@ dependencies = [ [[package]] name = "geodatafusion-geoparquet" -version = "0.3.0" +version = "0.4.0-hotdata.1" dependencies = [ "approx", "arrow-schema", @@ -2548,7 +2551,7 @@ dependencies = [ "parking_lot", "percent-encoding", "quick-xml", - "rand", + "rand 0.9.2", "reqwest", "ring", "serde", @@ -2785,7 +2788,7 @@ dependencies = [ "bytes", "getrandom 0.3.3", "lru-slab", - "rand", + "rand 0.9.2", "ring", "rustc-hash", "rustls", @@ -2826,14 +2829,35 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + [[package]] name = "rand" version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ - "rand_chacha", - "rand_core", + "rand_chacha 0.9.0", + "rand_core 0.9.3", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", ] [[package]] @@ -2843,7 +2867,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.3", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.16", ] [[package]] @@ -3335,6 +3368,18 @@ dependencies = [ "smallvec", ] +[[package]] +name = "spatialbench" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07f3f4b67ccf571f183d3695aa6b9d6f996864c31782a480e97a23ed0f2f6f18" +dependencies = [ + "geo", + "once_cell", + "rand 0.8.5", + "serde", +] + [[package]] name = "sqlparser" version = "0.59.0" diff --git a/rust/geodatafusion/Cargo.toml b/rust/geodatafusion/Cargo.toml index 5fb3ddc..2e59cd1 100644 --- a/rust/geodatafusion/Cargo.toml +++ b/rust/geodatafusion/Cargo.toml @@ -21,12 +21,15 @@ geoarrow-array = { workspace = true } geoarrow-expr-geo = { workspace = true } geoarrow-schema = { workspace = true } geohash = { workspace = true } +geojson = { workspace = true } +serde_json = { workspace = true } thiserror = { workspace = true } wkt = { workspace = true } [dev-dependencies] approx = { workspace = true } datafusion = { workspace = true, features = ["sql"] } +spatialbench = { workspace = true } geo-traits = { workspace = true } geoarrow-array = { workspace = true, features = ["test-data"] } object_store = { workspace = true, features = ["http"] } diff --git a/rust/geodatafusion/src/lib.rs b/rust/geodatafusion/src/lib.rs index fae1666..7ef2089 100644 --- a/rust/geodatafusion/src/lib.rs +++ b/rust/geodatafusion/src/lib.rs @@ -12,21 +12,33 @@ pub mod udf; /// Register all UDFs defined in geodatafusion pub fn register(session_context: &datafusion::prelude::SessionContext) { + // Aggregate spatial functions + crate::udf::geo::aggregate::register(session_context); + + // Measurement functions (area, distance, length, perimeter, azimuth, etc.) crate::udf::geo::measurement::register(session_context); + // Processing functions (buffer, overlay, simplify, affine, editors, linear ref, etc.) crate::udf::geo::processing::register(session_context); + // Spatial relationships (contains, intersects, dwithin, relate, etc.) crate::udf::geo::relationships::register(session_context); + // Validation functions (is_valid, is_valid_reason) crate::udf::geo::validation::register(session_context); + // GeoHash functions crate::udf::geohash::register(session_context); + // Native accessors (x, y, z, m, geometry_type, dimension, is_empty, etc.) crate::udf::native::accessors::register(session_context); + // Bounding box functions crate::udf::native::bounding_box::register(session_context); + // Constructors (point, make_line, make_polygon, make_envelope) crate::udf::native::constructors::register(session_context); + // I/O functions (WKT, WKB, GeoJSON, EWKT) crate::udf::native::io::register(session_context); } diff --git a/rust/geodatafusion/src/udf/geo/aggregate.rs b/rust/geodatafusion/src/udf/geo/aggregate.rs new file mode 100644 index 0000000..861fc00 --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/aggregate.rs @@ -0,0 +1,342 @@ +//! Spatial aggregate functions: ST_Union_Agg, ST_Collect_Agg. + +use std::any::Any; +use std::sync::OnceLock; + +use arrow_array::{Array, ArrayRef}; +use arrow_schema::{DataType, Field}; +use datafusion::common::ScalarValue; +use datafusion::error::Result; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, +}; +use datafusion::prelude::SessionContext; +use geo::BooleanOps; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::GeometryType; + +/// Register all aggregate spatial functions. +pub fn register(session_context: &SessionContext) { + session_context.register_udaf(datafusion::logical_expr::AggregateUDF::from( + UnionAgg::default(), + )); + session_context.register_udaf(datafusion::logical_expr::AggregateUDF::from( + CollectAgg::default(), + )); +} + +// ========================================================================= +// ST_Union_Agg +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct UnionAgg { + signature: Signature, +} + +impl UnionAgg { + pub fn new() -> Self { + Self { + signature: Signature::any(1, Volatility::Immutable), + } + } +} + +impl Default for UnionAgg { + fn default() -> Self { + Self::new() + } +} + +static UNION_AGG_DOC: OnceLock = OnceLock::new(); + +impl AggregateUDFImpl for UnionAgg { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_union_agg" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + // Return a generic geometry type + let geom_type = GeometryType::new(Default::default()); + Ok(geom_type.data_type()) + } + + fn accumulator( + &self, + _acc_args: datafusion::logical_expr::function::AccumulatorArgs, + ) -> Result> { + Ok(Box::new(UnionAccumulator { result: None })) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(UNION_AGG_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Computes the union of a set of geometries.", + "ST_Union_Agg(geometry)", + ) + .with_argument("geom", "geometry column") + .build() + })) + } +} + +#[derive(Debug)] +struct UnionAccumulator { + result: Option, +} + +impl Accumulator for UnionAccumulator { + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + let array = &values[0]; + let geom_type = GeometryType::new(Default::default()); + let field = Field::new("", geom_type.data_type(), true); + + if let Ok(geo_array) = from_arrow_array(array, &field) { + self.process_array(geo_array.as_ref())?; + } + Ok(()) + } + + fn evaluate(&mut self) -> Result { + match &self.result { + Some(mp) => { + use wkt::ToWkt; + let wkt_str = geo::Geometry::MultiPolygon(mp.clone()).wkt_string(); + Ok(ScalarValue::Utf8(Some(wkt_str))) + } + None => Ok(ScalarValue::Utf8(None)), + } + } + + fn size(&self) -> usize { + std::mem::size_of::() + + self + .result + .as_ref() + .map(|mp| mp.0.iter().map(|p| p.exterior().0.len() * 16).sum::()) + .unwrap_or(0) + } + + fn state(&mut self) -> Result> { + match &self.result { + Some(mp) => { + use wkt::ToWkt; + let wkt_str = geo::Geometry::MultiPolygon(mp.clone()).wkt_string(); + Ok(vec![ScalarValue::Utf8(Some(wkt_str))]) + } + None => Ok(vec![ScalarValue::Utf8(None)]), + } + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + let string_array = states[0] + .as_any() + .downcast_ref::(); + if let Some(arr) = string_array { + for i in 0..arr.len() { + if !arr.is_null(i) { + let wkt_str = arr.value(i); + if let Ok(geom) = wkt::TryFromWkt::try_from_wkt_str(wkt_str) { + let geom: geo::Geometry = geom; + self.union_geometry(&geom); + } + } + } + } + Ok(()) + } +} + +impl UnionAccumulator { + fn process_array(&mut self, array: &dyn GeoArrowArray) -> Result<()> { + fn _process_typed<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + ) -> geoarrow_schema::error::GeoArrowResult> { + let mut geoms = Vec::new(); + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + geoms.push(geo_geom); + } + } + Ok(geoms) + } + + let geoms: Vec = downcast_geoarrow_array!(array, _process_typed) + .map_err(|e| datafusion::error::DataFusionError::Internal(e.to_string()))?; + + for geom in &geoms { + self.union_geometry(geom); + } + Ok(()) + } + + fn union_geometry(&mut self, geom: &geo::Geometry) { + let mp = match geom { + geo::Geometry::Polygon(p) => geo::MultiPolygon(vec![p.clone()]), + geo::Geometry::MultiPolygon(mp) => mp.clone(), + _ => return, + }; + + self.result = Some(match &self.result { + Some(existing) => existing.union(&mp), + None => mp, + }); + } +} + +// ========================================================================= +// ST_Collect_Agg +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct CollectAgg { + signature: Signature, +} + +impl CollectAgg { + pub fn new() -> Self { + Self { + signature: Signature::any(1, Volatility::Immutable), + } + } +} + +impl Default for CollectAgg { + fn default() -> Self { + Self::new() + } +} + +static COLLECT_AGG_DOC: OnceLock = OnceLock::new(); + +impl AggregateUDFImpl for CollectAgg { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_collect" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + // TODO: Return native geometry type instead of WKT. Changing to native geometry + // requires significant refactoring of the aggregate accumulator interface + // (state serialization, merge). + Ok(DataType::Utf8) + } + + fn accumulator( + &self, + _acc_args: datafusion::logical_expr::function::AccumulatorArgs, + ) -> Result> { + Ok(Box::new(CollectAccumulator { + geometries: vec![], + })) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(COLLECT_AGG_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Collects geometries into a GeometryCollection or appropriate Multi* type.", + "ST_Collect(geometry)", + ) + .with_argument("geom", "geometry column") + .build() + })) + } +} + +#[derive(Debug)] +struct CollectAccumulator { + geometries: Vec, +} + +impl Accumulator for CollectAccumulator { + fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> { + let array = &values[0]; + let geom_type = GeometryType::new(Default::default()); + let field = Field::new("", geom_type.data_type(), true); + + if let Ok(geo_array) = from_arrow_array(array, &field) { + fn _collect_typed<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + ) -> geoarrow_schema::error::GeoArrowResult> { + let mut geoms = Vec::new(); + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + geoms.push(geo_geom); + } + } + Ok(geoms) + } + + let geo_array_ref = geo_array.as_ref(); + if let Ok(geoms) = downcast_geoarrow_array!(geo_array_ref, _collect_typed) { + self.geometries.extend(geoms); + } + } + Ok(()) + } + + fn evaluate(&mut self) -> Result { + use wkt::ToWkt; + + if self.geometries.is_empty() { + return Ok(ScalarValue::Utf8(None)); + } + + let gc = geo::GeometryCollection(self.geometries.clone()); + let wkt_str = geo::Geometry::GeometryCollection(gc).wkt_string(); + Ok(ScalarValue::Utf8(Some(wkt_str))) + } + + fn size(&self) -> usize { + use geo::CoordsIter; + std::mem::size_of::() + + self.geometries.len() * std::mem::size_of::() + + self + .geometries + .iter() + .map(|g| g.coords_count() * 16) + .sum::() + } + + fn state(&mut self) -> Result> { + self.evaluate().map(|v| vec![v]) + } + + fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> { + let string_array = states[0] + .as_any() + .downcast_ref::(); + if let Some(arr) = string_array { + for i in 0..arr.len() { + if !arr.is_null(i) { + let wkt_str = arr.value(i); + if let Ok(geom) = wkt::TryFromWkt::try_from_wkt_str(wkt_str) { + let geom: geo::Geometry = geom; + self.geometries.push(geom); + } + } + } + } + Ok(()) + } +} diff --git a/rust/geodatafusion/src/udf/geo/measurement/advanced.rs b/rust/geodatafusion/src/udf/geo/measurement/advanced.rs new file mode 100644 index 0000000..f08f0f9 --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/measurement/advanced.rs @@ -0,0 +1,1067 @@ +use std::any::Any; +use std::sync::{Arc, LazyLock, OnceLock}; + +use arrow_array::Float64Array; +use arrow_array::builder::Float64Builder; +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, Dimension, Metadata, PointType}; + +use crate::data_types::any_single_geometry_type_input; +use crate::error::GeoDataFusionResult; + +// --------------------------------------------------------------------------- +// ST_Perimeter +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct Perimeter; + +impl Perimeter { + pub fn new() -> Self { + Self {} + } +} + +impl Default for Perimeter { + fn default() -> Self { + Self::new() + } +} + +static PERIMETER_ALIASES: LazyLock> = + LazyLock::new(|| vec!["st_perimeter2d".to_string()]); +static PERIMETER_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Perimeter { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_perimeter" + } + + fn aliases(&self) -> &[String] { + PERIMETER_ALIASES.as_slice() + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(perimeter_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(PERIMETER_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the 2D perimeter of a polygon or multipolygon geometry. For non-areal geometries returns 0.", + "ST_Perimeter(geom)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn perimeter_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = perimeter_array(&geo_array)?; + Ok(ColumnarValue::Array(Arc::new(result))) +} + +fn perimeter_array(array: &dyn GeoArrowArray) -> GeoArrowResult { + downcast_geoarrow_array!(array, _perimeter_typed) +} + +fn _perimeter_typed<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + use geo::EuclideanLength; + + let mut builder = Float64Builder::with_capacity(array.len()); + for geom_opt in array.iter() { + match geom_opt { + Some(geom) => { + let geo_geom = geometry_to_geo(&geom?)?; + let perimeter = match &geo_geom { + geo::Geometry::Polygon(p) => { + let exterior = p.exterior().euclidean_length(); + let interiors: f64 = + p.interiors().iter().map(|r| r.euclidean_length()).sum(); + exterior + interiors + } + geo::Geometry::MultiPolygon(mp) => { + mp.0.iter() + .map(|p| { + let exterior = p.exterior().euclidean_length(); + let interiors: f64 = + p.interiors().iter().map(|r| r.euclidean_length()).sum(); + exterior + interiors + }) + .sum() + } + _ => 0.0, + }; + builder.append_value(perimeter); + } + None => builder.append_null(), + } + } + Ok(builder.finish()) +} + +// --------------------------------------------------------------------------- +// ST_Azimuth +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct Azimuth; + +impl Azimuth { + pub fn new() -> Self { + Self {} + } +} + +impl Default for Azimuth { + fn default() -> Self { + Self::new() + } +} + +static AZIMUTH_SIGNATURE: LazyLock = + LazyLock::new(|| Signature::any(2, Volatility::Immutable)); +static AZIMUTH_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Azimuth { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_azimuth" + } + + fn signature(&self) -> &Signature { + &AZIMUTH_SIGNATURE + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(azimuth_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(AZIMUTH_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the azimuth in radians of the segment defined by the two given point geometries. The azimuth is measured clockwise from north (0 radians).", + "ST_Azimuth(pointA, pointB)", + ) + .with_argument("pointA", "geometry (point)") + .with_argument("pointB", "geometry (point)") + .build() + })) + } +} + +fn azimuth_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let left_arr = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let right_arr = from_arrow_array(&arrays[1], &args.arg_fields[1])?; + let result = azimuth_arrays(&left_arr, &right_arr)?; + Ok(ColumnarValue::Array(Arc::new(result))) +} + +fn azimuth_arrays( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(left, _azimuth_left, right) +} + +fn _azimuth_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(right, _azimuth_both, left) +} + +fn _azimuth_both<'a, 'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + let mut builder = Float64Builder::with_capacity(left.len()); + for (l, r) in left.iter().zip(right.iter()) { + match (l, r) { + (Some(lg), Some(rg)) => { + let left_geo = geometry_to_geo(&lg?)?; + let right_geo = geometry_to_geo(&rg?)?; + match (&left_geo, &right_geo) { + (geo::Geometry::Point(pa), geo::Geometry::Point(pb)) => { + let dx = pb.x() - pa.x(); + let dy = pb.y() - pa.y(); + // atan2(dx, dy) gives angle from north (positive Y), + // clockwise positive + let mut azimuth = dx.atan2(dy); + if azimuth < 0.0 { + azimuth += 2.0 * std::f64::consts::PI; + } + builder.append_value(azimuth); + } + _ => builder.append_null(), + } + } + _ => builder.append_null(), + } + } + Ok(builder.finish()) +} + +// --------------------------------------------------------------------------- +// ST_HausdorffDistance +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct HausdorffDistance; + +impl HausdorffDistance { + pub fn new() -> Self { + Self {} + } +} + +impl Default for HausdorffDistance { + fn default() -> Self { + Self::new() + } +} + +static HAUSDORFF_SIGNATURE: LazyLock = + LazyLock::new(|| Signature::any(2, Volatility::Immutable)); +static HAUSDORFF_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for HausdorffDistance { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_hausdorffdistance" + } + + fn signature(&self) -> &Signature { + &HAUSDORFF_SIGNATURE + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(hausdorff_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(HAUSDORFF_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the Hausdorff distance between two geometries. The Hausdorff distance is the greatest distance between any point in one geometry and the closest point in the other geometry.", + "ST_HausdorffDistance(geomA, geomB)", + ) + .with_argument("geomA", "geometry") + .with_argument("geomB", "geometry") + .build() + })) + } +} + +fn hausdorff_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let left_arr = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let right_arr = from_arrow_array(&arrays[1], &args.arg_fields[1])?; + let result = hausdorff_arrays(&left_arr, &right_arr)?; + Ok(ColumnarValue::Array(Arc::new(result))) +} + +fn hausdorff_arrays( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(left, _hausdorff_left, right) +} + +fn _hausdorff_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(right, _hausdorff_both, left) +} + +fn _hausdorff_both<'a, 'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + use geo::HausdorffDistance as GeoHausdorff; + + let mut builder = Float64Builder::with_capacity(left.len()); + for (l, r) in left.iter().zip(right.iter()) { + match (l, r) { + (Some(lg), Some(rg)) => { + let left_geo = geometry_to_geo(&lg?)?; + let right_geo = geometry_to_geo(&rg?)?; + let dist = left_geo.hausdorff_distance(&right_geo); + builder.append_value(dist); + } + _ => builder.append_null(), + } + } + Ok(builder.finish()) +} + +// --------------------------------------------------------------------------- +// ST_FrechetDistance +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct FrechetDistance; + +impl FrechetDistance { + pub fn new() -> Self { + Self {} + } +} + +impl Default for FrechetDistance { + fn default() -> Self { + Self::new() + } +} + +static FRECHET_SIGNATURE: LazyLock = + LazyLock::new(|| Signature::any(2, Volatility::Immutable)); +static FRECHET_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for FrechetDistance { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_frechetdistance" + } + + fn signature(&self) -> &Signature { + &FRECHET_SIGNATURE + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(frechet_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(FRECHET_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the discrete Frechet distance between two geometries. The Frechet distance is a measure of similarity between curves that accounts for the ordering and location of the points along the curves.", + "ST_FrechetDistance(geomA, geomB)", + ) + .with_argument("geomA", "geometry") + .with_argument("geomB", "geometry") + .build() + })) + } +} + +fn frechet_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let left_arr = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let right_arr = from_arrow_array(&arrays[1], &args.arg_fields[1])?; + let result = frechet_arrays(&left_arr, &right_arr)?; + Ok(ColumnarValue::Array(Arc::new(result))) +} + +fn frechet_arrays( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(left, _frechet_left, right) +} + +fn _frechet_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(right, _frechet_both, left) +} + +fn _frechet_both<'a, 'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + use geo::FrechetDistance as GeoFrechet; + + let mut builder = Float64Builder::with_capacity(left.len()); + for (l, r) in left.iter().zip(right.iter()) { + match (l, r) { + (Some(lg), Some(rg)) => { + let left_geo = geometry_to_geo(&lg?)?; + let right_geo = geometry_to_geo(&rg?)?; + match (&left_geo, &right_geo) { + (geo::Geometry::LineString(ls_a), geo::Geometry::LineString(ls_b)) => { + let dist = ls_a.frechet_distance(ls_b); + builder.append_value(dist); + } + _ => builder.append_null(), + } + } + _ => builder.append_null(), + } + } + Ok(builder.finish()) +} + +// --------------------------------------------------------------------------- +// ST_ClosestPoint +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct ClosestPoint { + coord_type: CoordType, +} + +impl ClosestPoint { + pub fn new(coord_type: CoordType) -> Self { + Self { coord_type } + } +} + +impl Default for ClosestPoint { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static CLOSEST_POINT_SIGNATURE: LazyLock = + LazyLock::new(|| Signature::any(2, Volatility::Immutable)); +static CLOSEST_POINT_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for ClosestPoint { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_closestpoint" + } + + fn signature(&self) -> &Signature { + &CLOSEST_POINT_SIGNATURE + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(closest_point_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(closest_point_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(CLOSEST_POINT_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the 2D point on geomA that is closest to geomB. This is the point from which the shortest distance between the two geometries is measured.", + "ST_ClosestPoint(geomA, geomB)", + ) + .with_argument("geomA", "geometry") + .with_argument("geomB", "geometry") + .build() + })) + } +} + +fn closest_point_return_field( + args: ReturnFieldArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let metadata = Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = PointType::new(Dimension::XY, metadata).with_coord_type(coord_type); + Ok(Arc::new(output_type.to_field("", true))) +} + +fn closest_point_impl( + args: ScalarFunctionArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let left_arr = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let right_arr = from_arrow_array(&arrays[1], &args.arg_fields[1])?; + let result = closest_point_arrays(&left_arr, &right_arr, coord_type)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn closest_point_arrays( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, + coord_type: CoordType, +) -> GeoArrowResult { + downcast_geoarrow_array!(left, _closest_point_left, right, coord_type) +} + +fn _closest_point_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, + coord_type: CoordType, +) -> GeoArrowResult { + downcast_geoarrow_array!(right, _closest_point_both, left, coord_type) +} + +fn _closest_point_both<'a, 'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &'a impl GeoArrowArrayAccessor<'a>, + coord_type: CoordType, +) -> GeoArrowResult { + + + + let point_type = PointType::new(Dimension::XY, Default::default()).with_coord_type(coord_type); + let mut builder = geoarrow_array::builder::PointBuilder::new(point_type); + + for (l, r) in left.iter().zip(right.iter()) { + match (l, r) { + (Some(lg), Some(rg)) => { + let left_geo = geometry_to_geo(&lg?)?; + let right_geo = geometry_to_geo(&rg?)?; + // Find the closest point on left_geo to right_geo + // We need a point from right_geo to query against left_geo + // ClosestPoint trait: left_geo.closest_point(&point) where point is a Point + // For a general geometry, we use the centroid of right_geo as the query point, + // but the PostGIS semantics are: point on A closest to B. + // geo's ClosestPoint takes a Point argument. We need to iterate + // or use a different approach. We'll compute closest point on A to each + // vertex of B and pick the minimum distance one. + let closest = find_closest_point_on_geom(&left_geo, &right_geo); + match closest { + Some(pt) => builder.push_point(Some(&pt)), + None => builder.push_null(), + } + } + _ => builder.push_null(), + } + } + Ok(builder.finish()) +} + +/// Find the point on `geom_a` that is closest to `geom_b`. +/// +/// Iterates over all vertices of `geom_b`, queries `geom_a.closest_point(&vertex)` +/// for each, and returns the candidate with the minimum distance to the query vertex. +/// This is vertex-exact: the result is guaranteed to be the closest point on A to +/// one of B's vertices (not necessarily to an edge midpoint of B). +fn find_closest_point_on_geom( + geom_a: &geo::Geometry, + geom_b: &geo::Geometry, +) -> Option { + use geo::{Closest, ClosestPoint as GeoClosestPoint, CoordsIter, EuclideanDistance}; + + let mut best: Option = None; + let mut best_dist = f64::INFINITY; + + for coord in geom_b.coords_iter() { + let query = geo::Point::new(coord.x, coord.y); + let candidate = match geom_a.closest_point(&query) { + Closest::SinglePoint(p) | Closest::Intersection(p) => p, + Closest::Indeterminate => continue, + }; + let dist = candidate.euclidean_distance(&query); + if dist < best_dist { + best_dist = dist; + best = Some(candidate); + } + } + best +} + +// --------------------------------------------------------------------------- +// ST_MaxDistance +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct MaxDistance; + +impl MaxDistance { + pub fn new() -> Self { + Self {} + } +} + +impl Default for MaxDistance { + fn default() -> Self { + Self::new() + } +} + +static MAX_DISTANCE_SIGNATURE: LazyLock = + LazyLock::new(|| Signature::any(2, Volatility::Immutable)); +static MAX_DISTANCE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for MaxDistance { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_maxdistance" + } + + fn signature(&self) -> &Signature { + &MAX_DISTANCE_SIGNATURE + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(max_distance_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(MAX_DISTANCE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the maximum 2D Cartesian distance between two geometries, computed over all pairs of vertices.", + "ST_MaxDistance(geomA, geomB)", + ) + .with_argument("geomA", "geometry") + .with_argument("geomB", "geometry") + .build() + })) + } +} + +fn max_distance_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let left_arr = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let right_arr = from_arrow_array(&arrays[1], &args.arg_fields[1])?; + let result = max_distance_arrays(&left_arr, &right_arr)?; + Ok(ColumnarValue::Array(Arc::new(result))) +} + +fn max_distance_arrays( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(left, _max_distance_left, right) +} + +fn _max_distance_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(right, _max_distance_both, left) +} + +fn _max_distance_both<'a, 'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + use geo::CoordsIter; + use geo::EuclideanDistance; + + let mut builder = Float64Builder::with_capacity(left.len()); + for (l, r) in left.iter().zip(right.iter()) { + match (l, r) { + (Some(lg), Some(rg)) => { + let left_geo = geometry_to_geo(&lg?)?; + let right_geo = geometry_to_geo(&rg?)?; + let left_coords: Vec<_> = left_geo.coords_iter().collect(); + let right_coords: Vec<_> = right_geo.coords_iter().collect(); + let mut max_dist: f64 = 0.0; + for lc in &left_coords { + let lp = geo::Point::new(lc.x, lc.y); + for rc in &right_coords { + let rp = geo::Point::new(rc.x, rc.y); + let d = lp.euclidean_distance(&rp); + if d > max_dist { + max_dist = d; + } + } + } + builder.append_value(max_dist); + } + _ => builder.append_null(), + } + } + Ok(builder.finish()) +} + +#[cfg(test)] +mod test { + use approx::assert_relative_eq; + use arrow_array::cast::AsArray; + use arrow_array::types::Float64Type; + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::native::io::GeomFromText; + + // ----------------------------------------------------------------------- + // ST_Perimeter tests + // ----------------------------------------------------------------------- + + #[tokio::test] + async fn test_perimeter_square() { + let ctx = SessionContext::new(); + ctx.register_udf(Perimeter::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // 1x1 square: perimeter = 4 + let df = ctx + .sql("SELECT ST_Perimeter(ST_GeomFromText('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_eq!(val, 4.0); + } + + #[tokio::test] + async fn test_perimeter_polygon_with_hole() { + let ctx = SessionContext::new(); + ctx.register_udf(Perimeter::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Outer square 10x10 (perimeter 40), inner square 2x2 (perimeter 8) => total 48 + let df = ctx + .sql("SELECT ST_Perimeter(ST_GeomFromText('POLYGON((0 0, 10 0, 10 10, 0 10, 0 0), (4 4, 6 4, 6 6, 4 6, 4 4))'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_eq!(val, 48.0); + } + + #[tokio::test] + async fn test_perimeter_point() { + let ctx = SessionContext::new(); + ctx.register_udf(Perimeter::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Point has perimeter 0 + let df = ctx + .sql("SELECT ST_Perimeter(ST_GeomFromText('POINT(1 2)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_eq!(val, 0.0); + } + + #[tokio::test] + async fn test_perimeter_linestring() { + let ctx = SessionContext::new(); + ctx.register_udf(Perimeter::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // LineString has perimeter 0 (not an areal geometry) + let df = ctx + .sql("SELECT ST_Perimeter(ST_GeomFromText('LINESTRING(0 0, 1 1)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_eq!(val, 0.0); + } + + // ----------------------------------------------------------------------- + // ST_Azimuth tests + // ----------------------------------------------------------------------- + + #[tokio::test] + async fn test_azimuth_north() { + let ctx = SessionContext::new(); + ctx.register_udf(Azimuth::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Point directly north => azimuth = 0 + let df = ctx + .sql("SELECT ST_Azimuth(ST_GeomFromText('POINT(0 0)'), ST_GeomFromText('POINT(0 1)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_relative_eq!(val, 0.0, epsilon = 1e-10); + } + + #[tokio::test] + async fn test_azimuth_east() { + let ctx = SessionContext::new(); + ctx.register_udf(Azimuth::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Point directly east => azimuth = pi/2 + let df = ctx + .sql("SELECT ST_Azimuth(ST_GeomFromText('POINT(0 0)'), ST_GeomFromText('POINT(1 0)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_relative_eq!(val, std::f64::consts::FRAC_PI_2, epsilon = 1e-10); + } + + #[tokio::test] + async fn test_azimuth_south() { + let ctx = SessionContext::new(); + ctx.register_udf(Azimuth::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Point directly south => azimuth = pi + let df = ctx + .sql( + "SELECT ST_Azimuth(ST_GeomFromText('POINT(0 0)'), ST_GeomFromText('POINT(0 -1)'));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_relative_eq!(val, std::f64::consts::PI, epsilon = 1e-10); + } + + #[tokio::test] + async fn test_azimuth_west() { + let ctx = SessionContext::new(); + ctx.register_udf(Azimuth::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Point directly west => azimuth = 3*pi/2 + let df = ctx + .sql("SELECT ST_Azimuth(ST_GeomFromText('POINT(0 0)'), ST_GeomFromText('POINT(-1 0)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_relative_eq!(val, 3.0 * std::f64::consts::FRAC_PI_2, epsilon = 1e-10); + } + + // ----------------------------------------------------------------------- + // ST_HausdorffDistance tests + // ----------------------------------------------------------------------- + + #[tokio::test] + async fn test_hausdorff_distance_identical() { + let ctx = SessionContext::new(); + ctx.register_udf(HausdorffDistance::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Identical geometries => Hausdorff distance = 0 + let df = ctx + .sql("SELECT ST_HausdorffDistance(ST_GeomFromText('LINESTRING(0 0, 1 1)'), ST_GeomFromText('LINESTRING(0 0, 1 1)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_eq!(val, 0.0); + } + + #[tokio::test] + async fn test_hausdorff_distance_simple() { + let ctx = SessionContext::new(); + ctx.register_udf(HausdorffDistance::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // LINESTRING(0 0, 1 0) vs LINESTRING(0 1, 1 1) + // Every point on A is distance 1 from B, every point on B is distance 1 from A + // Hausdorff distance = 1 + let df = ctx + .sql("SELECT ST_HausdorffDistance(ST_GeomFromText('LINESTRING(0 0, 1 0)'), ST_GeomFromText('LINESTRING(0 1, 1 1)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_relative_eq!(val, 1.0, epsilon = 1e-10); + } + + #[tokio::test] + async fn test_hausdorff_distance_points() { + let ctx = SessionContext::new(); + ctx.register_udf(HausdorffDistance::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // POINT(0 0) vs POINT(3 4) => distance = 5 + let df = ctx + .sql("SELECT ST_HausdorffDistance(ST_GeomFromText('POINT(0 0)'), ST_GeomFromText('POINT(3 4)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_relative_eq!(val, 5.0, epsilon = 1e-10); + } + + // ----------------------------------------------------------------------- + // ST_FrechetDistance tests + // ----------------------------------------------------------------------- + + #[tokio::test] + async fn test_frechet_distance_identical() { + let ctx = SessionContext::new(); + ctx.register_udf(FrechetDistance::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Identical linestrings => Frechet distance = 0 + let df = ctx + .sql("SELECT ST_FrechetDistance(ST_GeomFromText('LINESTRING(0 0, 1 1)'), ST_GeomFromText('LINESTRING(0 0, 1 1)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_eq!(val, 0.0); + } + + #[tokio::test] + async fn test_frechet_distance_simple() { + let ctx = SessionContext::new(); + ctx.register_udf(FrechetDistance::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // LINESTRING(0 0, 1 0) vs LINESTRING(0 1, 1 1) => Frechet distance = 1 + let df = ctx + .sql("SELECT ST_FrechetDistance(ST_GeomFromText('LINESTRING(0 0, 1 0)'), ST_GeomFromText('LINESTRING(0 1, 1 1)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_relative_eq!(val, 1.0, epsilon = 1e-10); + } + + // ----------------------------------------------------------------------- + // ST_ClosestPoint tests + // ----------------------------------------------------------------------- + + #[tokio::test] + async fn test_closest_point_on_line() { + use geo_traits::{CoordTrait, PointTrait}; + use geoarrow_array::array::PointArray; + + let ctx = SessionContext::new(); + ctx.register_udf(ClosestPoint::default().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Closest point on LINESTRING(0 0, 10 0) to POINT(5 5) should be POINT(5 0) + let df = ctx + .sql("SELECT ST_ClosestPoint(ST_GeomFromText('LINESTRING(0 0, 10 0)'), ST_GeomFromText('POINT(5 5)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let geo_arr = + PointArray::try_from((batch.column(0).as_ref(), batch.schema().field(0))).unwrap(); + let point = geo_arr.value(0).unwrap(); + assert_relative_eq!(point.coord().unwrap().x(), 5.0, epsilon = 1e-10); + assert_relative_eq!(point.coord().unwrap().y(), 0.0, epsilon = 1e-10); + } + + #[tokio::test] + async fn test_closest_point_coincident() { + use geo_traits::{CoordTrait, PointTrait}; + use geoarrow_array::array::PointArray; + + let ctx = SessionContext::new(); + ctx.register_udf(ClosestPoint::default().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Point on the line itself + let df = ctx + .sql("SELECT ST_ClosestPoint(ST_GeomFromText('LINESTRING(0 0, 10 0)'), ST_GeomFromText('POINT(3 0)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let geo_arr = + PointArray::try_from((batch.column(0).as_ref(), batch.schema().field(0))).unwrap(); + let point = geo_arr.value(0).unwrap(); + assert_relative_eq!(point.coord().unwrap().x(), 3.0, epsilon = 1e-10); + assert_relative_eq!(point.coord().unwrap().y(), 0.0, epsilon = 1e-10); + } + + // ----------------------------------------------------------------------- + // ST_MaxDistance tests + // ----------------------------------------------------------------------- + + #[tokio::test] + async fn test_max_distance_points() { + let ctx = SessionContext::new(); + ctx.register_udf(MaxDistance::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // POINT(0 0) vs POINT(3 4) => max distance = 5 + let df = ctx + .sql("SELECT ST_MaxDistance(ST_GeomFromText('POINT(0 0)'), ST_GeomFromText('POINT(3 4)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_relative_eq!(val, 5.0, epsilon = 1e-10); + } + + #[tokio::test] + async fn test_max_distance_line_to_point() { + let ctx = SessionContext::new(); + ctx.register_udf(MaxDistance::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // LINESTRING(0 0, 3 0) vs POINT(0 4) + // Max distance is from (3,0) to (0,4) = 5 + let df = ctx + .sql("SELECT ST_MaxDistance(ST_GeomFromText('LINESTRING(0 0, 3 0)'), ST_GeomFromText('POINT(0 4)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_relative_eq!(val, 5.0, epsilon = 1e-10); + } + + #[tokio::test] + async fn test_max_distance_same_geometry() { + let ctx = SessionContext::new(); + ctx.register_udf(MaxDistance::new().into()); + ctx.register_udf(GeomFromText::new(Default::default()).into()); + + // Square: max distance is the diagonal = sqrt(2) + let df = ctx + .sql("SELECT ST_MaxDistance(ST_GeomFromText('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'), ST_GeomFromText('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_primitive::().value(0); + assert_relative_eq!(val, std::f64::consts::SQRT_2, epsilon = 1e-10); + } +} diff --git a/rust/geodatafusion/src/udf/geo/measurement/mod.rs b/rust/geodatafusion/src/udf/geo/measurement/mod.rs index 0600372..5f66a64 100644 --- a/rust/geodatafusion/src/udf/geo/measurement/mod.rs +++ b/rust/geodatafusion/src/udf/geo/measurement/mod.rs @@ -1,13 +1,21 @@ +mod advanced; mod area; mod distance; mod length; +pub use advanced::{Azimuth, ClosestPoint, FrechetDistance, HausdorffDistance, MaxDistance, Perimeter}; pub use area::Area; pub use distance::Distance; pub use length::Length; pub fn register(session_context: &datafusion::prelude::SessionContext) { session_context.register_udf(Area.into()); + session_context.register_udf(Azimuth.into()); + session_context.register_udf(ClosestPoint::default().into()); session_context.register_udf(Distance.into()); + session_context.register_udf(FrechetDistance.into()); + session_context.register_udf(HausdorffDistance.into()); session_context.register_udf(Length.into()); + session_context.register_udf(MaxDistance.into()); + session_context.register_udf(Perimeter.into()); } diff --git a/rust/geodatafusion/src/udf/geo/mod.rs b/rust/geodatafusion/src/udf/geo/mod.rs index 4dc65e3..40cf057 100644 --- a/rust/geodatafusion/src/udf/geo/mod.rs +++ b/rust/geodatafusion/src/udf/geo/mod.rs @@ -1,3 +1,4 @@ +pub mod aggregate; pub mod measurement; pub mod processing; pub mod relationships; diff --git a/rust/geodatafusion/src/udf/geo/processing/advanced.rs b/rust/geodatafusion/src/udf/geo/processing/advanced.rs new file mode 100644 index 0000000..a247dd0 --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/processing/advanced.rs @@ -0,0 +1,571 @@ +//! Advanced geometry processing functions: ST_LineMerge, ST_Snap, +//! ST_DelaunayTriangles, ST_VoronoiPolygons, ST_MinimumBoundingCircle, +//! ST_GeneratePoints, ST_ReducePrecision, ST_ChaikinSmoothing, ST_Segmentize. + +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion::scalar::ScalarValue; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::builder::GeometryBuilder; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, GeometryType, Metadata}; + +use crate::data_types::any_single_geometry_type_input; +use crate::error::GeoDataFusionResult; + +// ========================================================================= +// ST_ChaikinSmoothing +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct ChaikinSmoothing { + signature: Signature, + coord_type: CoordType, +} + +impl ChaikinSmoothing { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for ChaikinSmoothing { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static CHAIKIN_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for ChaikinSmoothing { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_chaikinsmoothing" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(chaikin_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(CHAIKIN_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns a smoothed version of a geometry using Chaikin's corner-cutting algorithm.", + "ST_ChaikinSmoothing(geometry, nIterations)", + ) + .with_argument("geom", "geometry") + .with_argument("n_iterations", "integer - number of smoothing iterations") + .build() + })) + } +} + +fn chaikin_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let n_iterations = match args.args[1].cast_to(&DataType::UInt32, None)? { + ColumnarValue::Scalar(ScalarValue::UInt32(Some(n))) => n as usize, + _ => 1, + }; + + let result = chaikin_array(&geo_array, n_iterations)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn chaikin_array( + array: &dyn GeoArrowArray, + n_iterations: usize, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _chaikin_impl, n_iterations) +} + +fn _chaikin_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + n_iterations: usize, +) -> GeoArrowResult> { + use geo::ChaikinSmoothing as GeoChaikin; + + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let smoothed = geo_geom.chaikin_smoothing(n_iterations); + builder.push_geometry(Some(&smoothed))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// ST_Segmentize +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct Segmentize { + signature: Signature, + coord_type: CoordType, +} + +impl Segmentize { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for Segmentize { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static SEGMENTIZE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Segmentize { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_segmentize" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(segmentize_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(SEGMENTIZE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns a modified geometry with no segment longer than the given max_segment_length. Currently only supports LineString input. Other geometry types are returned unchanged.", + "ST_Segmentize(geometry, max_segment_length)", + ) + .with_argument("geom", "geometry") + .with_argument("max_segment_length", "float8") + .build() + })) + } +} + +fn segmentize_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let max_length = match args.args[1].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => v, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized max_segment_length not supported".to_string(), + ) + .into()) + } + }; + + let result = segmentize_array(&geo_array, max_length)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn segmentize_array( + array: &dyn GeoArrowArray, + max_length: f64, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _segmentize_impl, max_length) +} + +fn _segmentize_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + max_length: f64, +) -> GeoArrowResult> { + use geo::LineStringSegmentize; + + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let result = match geo_geom { + geo::Geometry::LineString(ls) => { + let n_segments = (ls.euclidean_length() / max_length).ceil() as usize; + if n_segments > 1 { + if let Some(segmented) = ls.line_segmentize(n_segments) { + geo::Geometry::MultiLineString(segmented) + } else { + geo::Geometry::LineString(ls) + } + } else { + geo::Geometry::LineString(ls) + } + } + other => other, + }; + builder.push_geometry(Some(&result))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// ST_ReducePrecision +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct ReducePrecision { + signature: Signature, + coord_type: CoordType, +} + +impl ReducePrecision { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for ReducePrecision { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static REDUCE_PREC_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for ReducePrecision { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_reduceprecision" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(reduce_precision_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(REDUCE_PREC_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns a valid geometry with all points rounded to the given grid size.", + "ST_ReducePrecision(geometry, gridSize)", + ) + .with_argument("geom", "geometry") + .with_argument("grid_size", "float8 - grid size for rounding coordinates") + .build() + })) + } +} + +fn reduce_precision_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let grid_size = match args.args[1].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => v, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized grid_size not supported".to_string(), + ) + .into()) + } + }; + + let result = reduce_precision_array(&geo_array, grid_size)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn reduce_precision_array( + array: &dyn GeoArrowArray, + grid_size: f64, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _reduce_precision_impl, grid_size) +} + +fn _reduce_precision_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + grid_size: f64, +) -> GeoArrowResult> { + use geo::MapCoords; + + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let snapped = if grid_size > 0.0 { + geo_geom.map_coords(|coord| { + geo::Coord { + x: (coord.x / grid_size).round() * grid_size, + y: (coord.y / grid_size).round() * grid_size, + } + }) + } else { + geo_geom + }; + builder.push_geometry(Some(&snapped))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// ST_UnaryUnion +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct UnaryUnion { + coord_type: CoordType, +} + +impl UnaryUnion { + pub fn new(coord_type: CoordType) -> Self { + Self { coord_type } + } +} + +impl Default for UnaryUnion { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static UNARY_UNION_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for UnaryUnion { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_unaryunion" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(unary_union_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(UNARY_UNION_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Computes the union of the components of a single geometry (useful for dissolving MultiPolygon or GeometryCollection into a single geometry).", + "ST_UnaryUnion(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn unary_union_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = unary_union_array(&geo_array)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn unary_union_array(array: &dyn GeoArrowArray) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _unary_union_impl) +} + +fn _unary_union_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult> { + use geo::BooleanOps; + + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let result = match geo_geom { + geo::Geometry::MultiPolygon(mp) => { + let mut result = geo::MultiPolygon(vec![]); + for poly in mp.0 { + let mp_single = geo::MultiPolygon(vec![poly]); + result = result.union(&mp_single); + } + geo::Geometry::MultiPolygon(result) + } + geo::Geometry::GeometryCollection(gc) => { + let mut result = geo::MultiPolygon(vec![]); + for g in gc.0 { + match g { + geo::Geometry::Polygon(p) => { + let mp_single = geo::MultiPolygon(vec![p]); + result = result.union(&mp_single); + } + geo::Geometry::MultiPolygon(mp) => { + result = result.union(&mp); + } + other => { + let type_name = match &other { + geo::Geometry::Point(_) => "Point", + geo::Geometry::MultiPoint(_) => "MultiPoint", + geo::Geometry::LineString(_) => "LineString", + geo::Geometry::MultiLineString(_) => "MultiLineString", + geo::Geometry::GeometryCollection(_) => "GeometryCollection", + _ => "unsupported", + }; + return Err(geoarrow_schema::error::GeoArrowError::IncorrectGeometryType( + format!( + "ST_UnaryUnion only supports polygonal geometries; found {type_name} in collection" + ), + )); + } + } + } + geo::Geometry::MultiPolygon(result) + } + other => other, + }; + builder.push_geometry(Some(&result))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// Helper function for geometry return field +// ========================================================================= +fn geom_return_field( + args: ReturnFieldArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let metadata = Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = GeometryType::new(metadata).with_coord_type(coord_type); + Ok(Arc::new(output_type.to_field("", true))) +} + +// Need this for Segmentize +use geo::EuclideanLength; + +#[cfg(test)] +mod test { + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::native::io::{AsText, GeomFromText}; + + #[tokio::test] + async fn test_chaikin_smoothing() { + let ctx = SessionContext::new(); + ctx.register_udf(ChaikinSmoothing::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql("SELECT ST_AsText(ST_ChaikinSmoothing(ST_GeomFromText('LINESTRING(0 0, 5 5, 10 0)'), 1));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + assert_eq!(batch.num_rows(), 1); + } + + #[tokio::test] + async fn test_reduce_precision() { + let ctx = SessionContext::new(); + ctx.register_udf(ReducePrecision::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_ReducePrecision(ST_GeomFromText('POINT(1.23456 2.34567)'), 0.01));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert_eq!(val, "POINT(1.23 2.35)"); + } +} diff --git a/rust/geodatafusion/src/udf/geo/processing/affine.rs b/rust/geodatafusion/src/udf/geo/processing/affine.rs new file mode 100644 index 0000000..17b5f45 --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/processing/affine.rs @@ -0,0 +1,697 @@ +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion::scalar::ScalarValue; +use geo::{AffineOps, AffineTransform, Rotate, Scale, Translate}; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::builder::GeometryBuilder; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, GeometryType, Metadata}; + +use crate::error::GeoDataFusionResult; + +// --------------------------------------------------------------------------- +// ST_Translate +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct StTranslate { + signature: Signature, + coord_type: CoordType, +} + +impl StTranslate { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(3, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for StTranslate { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static TRANSLATE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for StTranslate { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_translate" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geometry_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(translate_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(TRANSLATE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Translates a geometry by the given offsets.", + "ST_Translate(geometry, deltaX, deltaY)", + ) + .with_argument("geom", "geometry") + .with_argument("deltaX", "float8 - offset to add to X coordinates") + .with_argument("deltaY", "float8 - offset to add to Y coordinates") + .build() + })) + } +} + +fn translate_impl( + args: ScalarFunctionArgs, + _coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let delta_x = match args.args[1].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => v, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized deltaX not yet implemented".to_string(), + ) + .into()); + } + }; + + let delta_y = match args.args[2].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => v, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized deltaY not yet implemented".to_string(), + ) + .into()); + } + }; + + let result = translate_array(&geo_array, delta_x, delta_y)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn translate_array( + array: &dyn GeoArrowArray, + delta_x: f64, + delta_y: f64, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _translate_impl, delta_x, delta_y) +} + +fn _translate_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + delta_x: f64, + delta_y: f64, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let translated = geo_geom.translate(delta_x, delta_y); + builder.push_geometry(Some(&translated))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// --------------------------------------------------------------------------- +// ST_Scale +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct StScale { + signature: Signature, + coord_type: CoordType, +} + +impl StScale { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(3, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for StScale { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static SCALE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for StScale { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_scale" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geometry_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(scale_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(SCALE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Scales a geometry by the given factors.", + "ST_Scale(geometry, xFactor, yFactor)", + ) + .with_argument("geom", "geometry") + .with_argument("xFactor", "float8 - factor to scale X coordinates by") + .with_argument("yFactor", "float8 - factor to scale Y coordinates by") + .build() + })) + } +} + +fn scale_impl( + args: ScalarFunctionArgs, + _coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let x_factor = match args.args[1].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => v, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized xFactor not yet implemented".to_string(), + ) + .into()); + } + }; + + let y_factor = match args.args[2].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => v, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized yFactor not yet implemented".to_string(), + ) + .into()); + } + }; + + let result = scale_array(&geo_array, x_factor, y_factor)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn scale_array( + array: &dyn GeoArrowArray, + x_factor: f64, + y_factor: f64, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _scale_impl, x_factor, y_factor) +} + +fn _scale_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + x_factor: f64, + y_factor: f64, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let scaled = geo_geom.scale_xy(x_factor, y_factor); + builder.push_geometry(Some(&scaled))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// --------------------------------------------------------------------------- +// ST_Rotate +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct StRotate { + signature: Signature, + coord_type: CoordType, +} + +impl StRotate { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for StRotate { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static ROTATE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for StRotate { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_rotate" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geometry_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(rotate_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(ROTATE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Rotates a geometry around its centroid by the given angle in radians.", + "ST_Rotate(geometry, angle)", + ) + .with_argument("geom", "geometry") + .with_argument("angle", "float8 - rotation angle in radians") + .build() + })) + } +} + +fn rotate_impl( + args: ScalarFunctionArgs, + _coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let angle = match args.args[1].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => v, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized angle not yet implemented".to_string(), + ) + .into()); + } + }; + + // Convert radians to degrees for geo::Rotate which expects degrees + let angle_degrees = angle.to_degrees(); + + let result = rotate_array(&geo_array, angle_degrees)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn rotate_array( + array: &dyn GeoArrowArray, + angle_degrees: f64, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _rotate_impl, angle_degrees) +} + +fn _rotate_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + angle_degrees: f64, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let rotated = geo_geom.rotate_around_centroid(angle_degrees); + builder.push_geometry(Some(&rotated))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// --------------------------------------------------------------------------- +// ST_Affine +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct StAffine { + signature: Signature, + coord_type: CoordType, +} + +impl StAffine { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(7, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for StAffine { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static AFFINE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for StAffine { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_affine" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geometry_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(affine_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(AFFINE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Applies a 2D affine transformation to a geometry. The transformation matrix is: | a b xoff | | d e yoff | | 0 0 1 |", + "ST_Affine(geometry, a, b, d, e, xoff, yoff)", + ) + .with_argument("geom", "geometry") + .with_argument("a", "float8 - coefficient a of the affine transform") + .with_argument("b", "float8 - coefficient b of the affine transform") + .with_argument("d", "float8 - coefficient d of the affine transform") + .with_argument("e", "float8 - coefficient e of the affine transform") + .with_argument("xoff", "float8 - x translation offset") + .with_argument("yoff", "float8 - y translation offset") + .build() + })) + } +} + +fn extract_scalar_f64(col: &ColumnarValue, param_name: &str) -> GeoDataFusionResult { + match col.cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => Ok(v), + _ => Err(DataFusionError::NotImplemented(format!( + "Vectorized {} not yet implemented", + param_name + )) + .into()), + } +} + +fn affine_impl( + args: ScalarFunctionArgs, + _coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let a = extract_scalar_f64(&args.args[1], "a")?; + let b = extract_scalar_f64(&args.args[2], "b")?; + let d = extract_scalar_f64(&args.args[3], "d")?; + let e = extract_scalar_f64(&args.args[4], "e")?; + let xoff = extract_scalar_f64(&args.args[5], "xoff")?; + let yoff = extract_scalar_f64(&args.args[6], "yoff")?; + + let transform = AffineTransform::new(a, b, xoff, d, e, yoff); + + let result = affine_array(&geo_array, &transform)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn affine_array( + array: &dyn GeoArrowArray, + transform: &AffineTransform, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _affine_impl, transform) +} + +fn _affine_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + transform: &AffineTransform, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let transformed = geo_geom.affine_transform(transform); + builder.push_geometry(Some(&transformed))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// --------------------------------------------------------------------------- +// Shared helpers +// --------------------------------------------------------------------------- + +fn geometry_return_field( + args: ReturnFieldArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let metadata = Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = GeometryType::new(metadata).with_coord_type(coord_type); + Ok(Arc::new(output_type.to_field("", true))) +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod test { + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::native::io::{AsText, GeomFromText}; + + #[tokio::test] + async fn test_translate() { + let ctx = SessionContext::new(); + ctx.register_udf(StTranslate::default().into()); + ctx.register_udf(AsText.into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql("SELECT ST_AsText(ST_Translate(ST_GeomFromText('POINT(0 0)'), 1, 2));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert_eq!(val, "POINT(1 2)"); + } + + #[tokio::test] + async fn test_scale() { + let ctx = SessionContext::new(); + ctx.register_udf(StScale::default().into()); + ctx.register_udf(AsText.into()); + ctx.register_udf(GeomFromText::default().into()); + + // ST_Scale scales around the bounding box center (centroid). + // LINESTRING(0 0, 2 0) has bounding rect center at (1, 0). + // Scaling by (xFactor=2, yFactor=3) around (1, 0): + // (0,0) -> (1 + 2*(0-1), 0 + 3*(0-0)) = (-1, 0) + // (2,0) -> (1 + 2*(2-1), 0 + 3*(0-0)) = (3, 0) + let df = ctx + .sql("SELECT ST_AsText(ST_Scale(ST_GeomFromText('LINESTRING(0 0, 2 0)'), 2, 3));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert_eq!(val, "LINESTRING(-1 0,3 0)"); + } + + #[tokio::test] + async fn test_rotate() { + let ctx = SessionContext::new(); + ctx.register_udf(StRotate::default().into()); + ctx.register_udf(GeomFromText::default().into()); + + // ST_Rotate rotates around the geometry centroid by an angle in radians. + // LINESTRING(-1 0, 1 0) has centroid at (0, 0). + // Rotating 90 degrees CCW (PI/2 radians) around (0, 0): (x,y) -> (-y, x). + // (-1, 0) -> (0, -1) + // (1, 0) -> (0, 1) + let df = ctx + .sql(&format!( + "SELECT ST_Rotate(ST_GeomFromText('LINESTRING(-1 0, 1 0)'), {});", + std::f64::consts::FRAC_PI_2 + )) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let column = batch.column(0); + + let geom_arr = geoarrow_array::array::GeometryArray::try_from(( + column.as_ref(), + batch.schema_ref().field(0), + )) + .unwrap(); + let geo_geom = geometry_to_geo(&geom_arr.value(0).unwrap()).unwrap(); + if let geo::Geometry::LineString(ls) = geo_geom { + let coords: Vec<_> = ls.coords().collect(); + assert_eq!(coords.len(), 2); + // First point: (-1, 0) rotated 90 CCW -> (0, -1) + assert!( + (coords[0].x - 0.0).abs() < 1e-10, + "Expected x[0] ~ 0, got {}", + coords[0].x + ); + assert!( + (coords[0].y - (-1.0)).abs() < 1e-10, + "Expected y[0] ~ -1, got {}", + coords[0].y + ); + // Second point: (1, 0) rotated 90 CCW -> (0, 1) + assert!( + (coords[1].x - 0.0).abs() < 1e-10, + "Expected x[1] ~ 0, got {}", + coords[1].x + ); + assert!( + (coords[1].y - 1.0).abs() < 1e-10, + "Expected y[1] ~ 1, got {}", + coords[1].y + ); + } else { + panic!("Expected LineString geometry"); + } + } + + #[tokio::test] + async fn test_affine_translate() { + let ctx = SessionContext::new(); + ctx.register_udf(StAffine::default().into()); + ctx.register_udf(AsText.into()); + ctx.register_udf(GeomFromText::default().into()); + + // Identity rotation with translation (1, 2): a=1, b=0, d=0, e=1, xoff=1, yoff=2 + let df = ctx + .sql( + "SELECT ST_AsText(ST_Affine(ST_GeomFromText('POINT(0 0)'), 1, 0, 0, 1, 1, 2));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert_eq!(val, "POINT(1 2)"); + } + + #[tokio::test] + async fn test_affine_scale() { + let ctx = SessionContext::new(); + ctx.register_udf(StAffine::default().into()); + ctx.register_udf(AsText.into()); + ctx.register_udf(GeomFromText::default().into()); + + // Scale by (3, 2) with no translation: a=3, b=0, d=0, e=2, xoff=0, yoff=0 + let df = ctx + .sql( + "SELECT ST_AsText(ST_Affine(ST_GeomFromText('POINT(1 1)'), 3, 0, 0, 2, 0, 0));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert_eq!(val, "POINT(3 2)"); + } +} diff --git a/rust/geodatafusion/src/udf/geo/processing/buffer.rs b/rust/geodatafusion/src/udf/geo/processing/buffer.rs new file mode 100644 index 0000000..abbfe4a --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/processing/buffer.rs @@ -0,0 +1,180 @@ +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion::scalar::ScalarValue; +use geo::Buffer as GeoBuffer; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::builder::GeometryBuilder; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, GeometryType, Metadata}; + +use crate::error::GeoDataFusionResult; + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct StBuffer { + signature: Signature, + coord_type: CoordType, +} + +impl StBuffer { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for StBuffer { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for StBuffer { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_buffer" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(return_field_impl(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(buffer_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns a geometry that represents all points whose distance from this geometry is less than or equal to the given distance. Positive distance creates an expanded polygon, negative shrinks it.", + "ST_Buffer(geometry, distance)", + ) + .with_argument("geom", "geometry") + .with_argument("distance", "float8 - buffer distance in the units of the geometry's coordinate system") + .build() + })) + } +} + +fn return_field_impl( + args: ReturnFieldArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let metadata = Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = GeometryType::new(metadata).with_coord_type(coord_type); + Ok(Arc::new(output_type.to_field("", true))) +} + +fn buffer_impl( + args: ScalarFunctionArgs, + _coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let distance = args.args[1].cast_to(&DataType::Float64, None)?; + let distance = match distance { + ColumnarValue::Scalar(scalar) => match scalar { + ScalarValue::Float64(Some(val)) => val, + ScalarValue::Float64(None) => { + return Err(DataFusionError::Plan( + "ST_Buffer distance cannot be NULL".into(), + ) + .into()); + } + _ => unreachable!(), + }, + ColumnarValue::Array(_) => { + return Err(DataFusionError::NotImplemented( + "Vectorized distance not yet implemented".to_string(), + ) + .into()); + } + }; + + let result = buffer_array(&geo_array, distance)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn buffer_array( + array: &dyn GeoArrowArray, + distance: f64, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _buffer_impl, distance) +} + +fn _buffer_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + distance: f64, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let buffered = geo_geom.buffer(distance); + builder.push_geometry(Some(&geo::Geometry::MultiPolygon(buffered)))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +#[cfg(test)] +mod test { + use arrow_array::cast::AsArray; + use arrow_array::types::Float64Type; + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::geo::measurement::Area; + use crate::udf::native::io::GeomFromText; + + #[tokio::test] + async fn test_buffer_point() { + let ctx = SessionContext::new(); + + ctx.register_udf(StBuffer::default().into()); + ctx.register_udf(Area::new().into()); + ctx.register_udf(GeomFromText::default().into()); + + // Buffer a point by 1.0 should create a circle-like polygon with area ~π + let df = ctx + .sql("SELECT ST_Area(ST_Buffer(ST_GeomFromText('POINT(0 0)'), 1.0));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let col = batch.column(0); + let area = col.as_primitive::().value(0); + // Area of a circle with r=1 is π ≈ 3.14159; buffer approximation should be close + assert!(area > 3.0 && area < 3.3, "Buffer area was {}", area); + } +} diff --git a/rust/geodatafusion/src/udf/geo/processing/editors.rs b/rust/geodatafusion/src/udf/geo/processing/editors.rs new file mode 100644 index 0000000..bc2ef2a --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/processing/editors.rs @@ -0,0 +1,1449 @@ +//! Geometry editor functions: ST_Reverse, ST_FlipCoordinates, ST_Force2D, +//! ST_Multi, ST_Normalize, ST_RemoveRepeatedPoints, ST_CollectionExtract. + +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion::scalar::ScalarValue; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::builder::GeometryBuilder; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, GeometryType, Metadata}; + +use crate::data_types::any_single_geometry_type_input; +use crate::error::GeoDataFusionResult; + +// ========================================================================= +// ST_Reverse +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct Reverse { + coord_type: CoordType, +} + +impl Reverse { + pub fn new(coord_type: CoordType) -> Self { + Self { coord_type } + } +} + +impl Default for Reverse { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static REVERSE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Reverse { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_reverse" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(reverse_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(REVERSE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the geometry with vertex order reversed.", + "ST_Reverse(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn reverse_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = reverse_array(&geo_array)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn reverse_array(array: &dyn GeoArrowArray) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _reverse_impl) +} + +fn reverse_line_string(ls: &geo::LineString) -> geo::LineString { + geo::LineString(ls.0.iter().rev().cloned().collect()) +} + +fn reverse_polygon(poly: &geo::Polygon) -> geo::Polygon { + let exterior = reverse_line_string(poly.exterior()); + let interiors = poly.interiors().iter().map(reverse_line_string).collect(); + geo::Polygon::new(exterior, interiors) +} + +fn reverse_geometry(geom: &geo::Geometry) -> geo::Geometry { + match geom { + geo::Geometry::Point(p) => geo::Geometry::Point(*p), + geo::Geometry::Line(l) => geo::Geometry::Line(geo::Line::new(l.end, l.start)), + geo::Geometry::LineString(ls) => geo::Geometry::LineString(reverse_line_string(ls)), + geo::Geometry::Polygon(p) => geo::Geometry::Polygon(reverse_polygon(p)), + geo::Geometry::MultiPoint(mp) => { + geo::Geometry::MultiPoint(geo::MultiPoint(mp.0.iter().rev().cloned().collect())) + } + geo::Geometry::MultiLineString(mls) => geo::Geometry::MultiLineString( + geo::MultiLineString(mls.0.iter().map(reverse_line_string).collect()), + ), + geo::Geometry::MultiPolygon(mp) => geo::Geometry::MultiPolygon(geo::MultiPolygon( + mp.0.iter().map(reverse_polygon).collect(), + )), + geo::Geometry::GeometryCollection(gc) => geo::Geometry::GeometryCollection( + geo::GeometryCollection(gc.0.iter().map(reverse_geometry).collect()), + ), + geo::Geometry::Rect(r) => geo::Geometry::Rect(*r), + geo::Geometry::Triangle(t) => { + geo::Geometry::Triangle(geo::Triangle(t.2, t.1, t.0)) + } + } +} + +fn _reverse_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let reversed = reverse_geometry(&geo_geom); + builder.push_geometry(Some(&reversed))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// ST_FlipCoordinates +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct FlipCoordinates { + coord_type: CoordType, +} + +impl FlipCoordinates { + pub fn new(coord_type: CoordType) -> Self { + Self { coord_type } + } +} + +impl Default for FlipCoordinates { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static FLIP_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for FlipCoordinates { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_flipcoordinates" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(flip_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(FLIP_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns a version of the given geometry with X and Y axis flipped.", + "ST_FlipCoordinates(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn flip_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = flip_array(&geo_array)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn flip_array(array: &dyn GeoArrowArray) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _flip_impl) +} + +fn _flip_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult> { + use geo::MapCoords; + + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let flipped = geo_geom.map_coords(|coord| geo::Coord { + x: coord.y, + y: coord.x, + }); + builder.push_geometry(Some(&flipped))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// ST_Force2D +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct Force2D { + coord_type: CoordType, +} + +impl Force2D { + pub fn new(coord_type: CoordType) -> Self { + Self { coord_type } + } +} + +impl Default for Force2D { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static FORCE2D_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Force2D { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_force2d" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(force2d_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(FORCE2D_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Forces the geometries into a 2-dimensional mode so that all output representations will only have X and Y coordinates.", + "ST_Force2D(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn force2d_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = force2d_array(&geo_array)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn force2d_array(array: &dyn GeoArrowArray) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _force2d_impl) +} + +fn _force2d_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult> { + use geo::MapCoords; + + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + // geometry_to_geo already produces 2D geo types (only x,y), + // so we just pass it through to ensure the output type is XY. + let forced = geo_geom.map_coords(|coord| geo::Coord { + x: coord.x, + y: coord.y, + }); + builder.push_geometry(Some(&forced))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// ST_Multi +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct Multi { + coord_type: CoordType, +} + +impl Multi { + pub fn new(coord_type: CoordType) -> Self { + Self { coord_type } + } +} + +impl Default for Multi { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static MULTI_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Multi { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_multi" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(multi_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(MULTI_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the geometry as a MULTI* geometry. If the geometry is already a MULTI*, it is returned unchanged.", + "ST_Multi(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn multi_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = multi_array(&geo_array)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn multi_array(array: &dyn GeoArrowArray) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _multi_impl) +} + +fn _multi_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let multi = match geo_geom { + geo::Geometry::Point(p) => { + geo::Geometry::MultiPoint(geo::MultiPoint(vec![p])) + } + geo::Geometry::LineString(ls) => { + geo::Geometry::MultiLineString(geo::MultiLineString(vec![ls])) + } + geo::Geometry::Polygon(p) => { + geo::Geometry::MultiPolygon(geo::MultiPolygon(vec![p])) + } + // Already multi types or other types pass through unchanged + other => other, + }; + builder.push_geometry(Some(&multi))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// ST_Normalize +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct Normalize { + coord_type: CoordType, +} + +impl Normalize { + pub fn new(coord_type: CoordType) -> Self { + Self { coord_type } + } +} + +impl Default for Normalize { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static NORMALIZE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Normalize { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_normalize" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(normalize_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(NORMALIZE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the geometry in its normalized/canonical form. May reorder vertices in polygon rings, elements in multi-geometries, and so on.", + "ST_Normalize(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn normalize_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = normalize_array(&geo_array)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn normalize_array(array: &dyn GeoArrowArray) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _normalize_impl) +} + +fn normalize_line_string(ls: &geo::LineString) -> geo::LineString { + ls.clone() +} + +fn normalize_polygon(poly: &geo::Polygon) -> geo::Polygon { + // Normalize exterior ring: rotate so the lexicographically smallest point is first + let exterior = normalize_ring(poly.exterior()); + let mut interiors: Vec = + poly.interiors().iter().map(normalize_ring).collect(); + // Sort interior rings lexicographically for canonical ordering + interiors.sort_by(|a, b| { + for (ca, cb) in a.0.iter().zip(b.0.iter()) { + let cmp_x = ca.x.partial_cmp(&cb.x).unwrap_or(std::cmp::Ordering::Equal); + if cmp_x != std::cmp::Ordering::Equal { + return cmp_x; + } + let cmp_y = ca.y.partial_cmp(&cb.y).unwrap_or(std::cmp::Ordering::Equal); + if cmp_y != std::cmp::Ordering::Equal { + return cmp_y; + } + } + a.0.len().cmp(&b.0.len()) + }); + geo::Polygon::new(exterior, interiors) +} + +fn normalize_ring(ring: &geo::LineString) -> geo::LineString { + if ring.0.len() <= 1 { + return ring.clone(); + } + + // For a closed ring, skip the closing coordinate for rotation + let coords = if ring.0.first() == ring.0.last() && ring.0.len() > 1 { + &ring.0[..ring.0.len() - 1] + } else { + &ring.0 + }; + + if coords.is_empty() { + return ring.clone(); + } + + // Find the index of the lexicographically smallest coordinate + let min_idx = coords + .iter() + .enumerate() + .min_by(|(_, a), (_, b)| { + a.x.partial_cmp(&b.x) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| a.y.partial_cmp(&b.y).unwrap_or(std::cmp::Ordering::Equal)) + }) + .map(|(i, _)| i) + .unwrap_or(0); + + // Rotate the ring so the smallest coordinate is first + let mut rotated: Vec = Vec::with_capacity(ring.0.len()); + for i in 0..coords.len() { + rotated.push(coords[(min_idx + i) % coords.len()]); + } + // Re-close the ring + if ring.0.first() == ring.0.last() && !rotated.is_empty() { + rotated.push(rotated[0]); + } + + geo::LineString(rotated) +} + +fn normalize_geometry(geom: &geo::Geometry) -> geo::Geometry { + match geom { + geo::Geometry::Point(p) => geo::Geometry::Point(*p), + geo::Geometry::Line(l) => { + // Normalize line so the smaller point comes first + if (l.start.x, l.start.y) <= (l.end.x, l.end.y) { + geo::Geometry::Line(*l) + } else { + geo::Geometry::Line(geo::Line::new(l.end, l.start)) + } + } + geo::Geometry::LineString(ls) => { + geo::Geometry::LineString(normalize_line_string(ls)) + } + geo::Geometry::Polygon(p) => geo::Geometry::Polygon(normalize_polygon(p)), + geo::Geometry::MultiPoint(mp) => { + let mut points = mp.0.clone(); + points.sort_by(|a, b| { + a.x().partial_cmp(&b.x()) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| { + a.y().partial_cmp(&b.y()) + .unwrap_or(std::cmp::Ordering::Equal) + }) + }); + geo::Geometry::MultiPoint(geo::MultiPoint(points)) + } + geo::Geometry::MultiLineString(mls) => { + let lines: Vec = + mls.0.iter().map(normalize_line_string).collect(); + geo::Geometry::MultiLineString(geo::MultiLineString(lines)) + } + geo::Geometry::MultiPolygon(mp) => { + let polys: Vec = + mp.0.iter().map(normalize_polygon).collect(); + geo::Geometry::MultiPolygon(geo::MultiPolygon(polys)) + } + geo::Geometry::GeometryCollection(gc) => { + let geoms: Vec = + gc.0.iter().map(normalize_geometry).collect(); + geo::Geometry::GeometryCollection(geo::GeometryCollection(geoms)) + } + geo::Geometry::Rect(r) => geo::Geometry::Rect(*r), + geo::Geometry::Triangle(t) => geo::Geometry::Triangle(*t), + } +} + +fn _normalize_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let normalized = normalize_geometry(&geo_geom); + builder.push_geometry(Some(&normalized))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// ST_RemoveRepeatedPoints +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct RemoveRepeatedPoints { + signature: Signature, + coord_type: CoordType, +} + +impl RemoveRepeatedPoints { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::one_of( + vec![ + datafusion::logical_expr::TypeSignature::Any(1), + datafusion::logical_expr::TypeSignature::Any(2), + ], + Volatility::Immutable, + ), + coord_type, + } + } +} + +impl Default for RemoveRepeatedPoints { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static REMOVE_REPEATED_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for RemoveRepeatedPoints { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_removerepeatedpoints" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(remove_repeated_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(REMOVE_REPEATED_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns a version of the given geometry with duplicated points removed. An optional tolerance distance parameter can be supplied: points within the tolerance of each other are considered duplicates.", + "ST_RemoveRepeatedPoints(geometry, tolerance)", + ) + .with_argument("geom", "geometry") + .with_argument( + "tolerance", + "float8 (optional) - tolerance distance below which points are considered duplicates", + ) + .build() + })) + } +} + +fn remove_repeated_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let tolerance = if args.args.len() > 1 { + match args.args[1].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(v))) => Some(v), + _ => None, + } + } else { + None + }; + + let result = remove_repeated_array(&geo_array, tolerance)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn remove_repeated_array( + array: &dyn GeoArrowArray, + tolerance: Option, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _remove_repeated_impl, tolerance) +} + +fn remove_repeated_coords(coords: &[geo::Coord], tolerance: Option) -> Vec { + if coords.is_empty() { + return vec![]; + } + + let mut result = vec![coords[0]]; + for coord in &coords[1..] { + let last = result.last().unwrap(); + let dominated = match tolerance { + Some(tol) => { + let dx = coord.x - last.x; + let dy = coord.y - last.y; + (dx * dx + dy * dy).sqrt() <= tol + } + None => coord.x == last.x && coord.y == last.y, + }; + if !dominated { + result.push(*coord); + } + } + result +} + +fn remove_repeated_line_string( + ls: &geo::LineString, + tolerance: Option, +) -> geo::LineString { + geo::LineString(remove_repeated_coords(&ls.0, tolerance)) +} + +fn remove_repeated_polygon( + poly: &geo::Polygon, + tolerance: Option, +) -> geo::Polygon { + let exterior = remove_repeated_line_string(poly.exterior(), tolerance); + let interiors = poly + .interiors() + .iter() + .map(|ring| remove_repeated_line_string(ring, tolerance)) + .collect(); + geo::Polygon::new(exterior, interiors) +} + +fn remove_repeated_geometry( + geom: &geo::Geometry, + tolerance: Option, +) -> geo::Geometry { + match geom { + geo::Geometry::Point(p) => geo::Geometry::Point(*p), + geo::Geometry::Line(l) => geo::Geometry::Line(*l), + geo::Geometry::LineString(ls) => { + geo::Geometry::LineString(remove_repeated_line_string(ls, tolerance)) + } + geo::Geometry::Polygon(p) => { + geo::Geometry::Polygon(remove_repeated_polygon(p, tolerance)) + } + geo::Geometry::MultiPoint(mp) => { + if mp.0.is_empty() { + return geo::Geometry::MultiPoint(geo::MultiPoint(vec![])); + } + let mut points = vec![mp.0[0]]; + for pt in &mp.0[1..] { + let last = points.last().unwrap(); + let dominated = match tolerance { + Some(tol) => { + let dx = pt.x() - last.x(); + let dy = pt.y() - last.y(); + (dx * dx + dy * dy).sqrt() <= tol + } + None => pt.x() == last.x() && pt.y() == last.y(), + }; + if !dominated { + points.push(*pt); + } + } + geo::Geometry::MultiPoint(geo::MultiPoint(points)) + } + geo::Geometry::MultiLineString(mls) => geo::Geometry::MultiLineString( + geo::MultiLineString( + mls.0 + .iter() + .map(|ls| remove_repeated_line_string(ls, tolerance)) + .collect(), + ), + ), + geo::Geometry::MultiPolygon(mp) => geo::Geometry::MultiPolygon(geo::MultiPolygon( + mp.0.iter() + .map(|p| remove_repeated_polygon(p, tolerance)) + .collect(), + )), + geo::Geometry::GeometryCollection(gc) => geo::Geometry::GeometryCollection( + geo::GeometryCollection( + gc.0.iter() + .map(|g| remove_repeated_geometry(g, tolerance)) + .collect(), + ), + ), + geo::Geometry::Rect(r) => geo::Geometry::Rect(*r), + geo::Geometry::Triangle(t) => geo::Geometry::Triangle(*t), + } +} + +fn _remove_repeated_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + tolerance: Option, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let cleaned = remove_repeated_geometry(&geo_geom, tolerance); + builder.push_geometry(Some(&cleaned))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// ST_CollectionExtract +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct CollectionExtract { + signature: Signature, + coord_type: CoordType, +} + +impl CollectionExtract { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::one_of( + vec![ + datafusion::logical_expr::TypeSignature::Any(1), + datafusion::logical_expr::TypeSignature::Any(2), + ], + Volatility::Immutable, + ), + coord_type, + } + } +} + +impl Default for CollectionExtract { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static COLLECTION_EXTRACT_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for CollectionExtract { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_collectionextract" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(geom_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(collection_extract_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(COLLECTION_EXTRACT_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Given a geometry collection, returns a multi-geometry containing only elements of a specified type. Type numbers are: 1 = Point, 2 = LineString, 3 = Polygon. If no type is specified, returns the multi-geometry of the highest-dimension type present.", + "ST_CollectionExtract(geometry, type)", + ) + .with_argument("geom", "geometry") + .with_argument( + "type", + "integer (optional) - 1=Point, 2=LineString, 3=Polygon", + ) + .build() + })) + } +} + +fn collection_extract_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let extract_type = if args.args.len() > 1 { + match args.args[1].cast_to(&DataType::Int32, None)? { + ColumnarValue::Scalar(ScalarValue::Int32(Some(v))) => Some(v), + _ => None, + } + } else { + None + }; + + let result = collection_extract_array(&geo_array, extract_type)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn collection_extract_array( + array: &dyn GeoArrowArray, + extract_type: Option, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _collection_extract_impl, extract_type) +} + +fn collect_geometries_flat(geom: &geo::Geometry, out: &mut Vec) { + match geom { + geo::Geometry::GeometryCollection(gc) => { + for g in &gc.0 { + collect_geometries_flat(g, out); + } + } + other => out.push(other.clone()), + } +} + +fn extract_by_type(geom: &geo::Geometry, extract_type: Option) -> geo::Geometry { + // Flatten the geometry collection + let mut flat = Vec::new(); + collect_geometries_flat(geom, &mut flat); + + let target_type = extract_type.unwrap_or_else(|| { + // Find highest dimension type present + let mut max_dim = 0i32; + for g in &flat { + let dim = match g { + geo::Geometry::Point(_) | geo::Geometry::MultiPoint(_) => 1, + geo::Geometry::Line(_) + | geo::Geometry::LineString(_) + | geo::Geometry::MultiLineString(_) => 2, + geo::Geometry::Polygon(_) + | geo::Geometry::MultiPolygon(_) + | geo::Geometry::Rect(_) + | geo::Geometry::Triangle(_) => 3, + _ => 0, + }; + if dim > max_dim { + max_dim = dim; + } + } + max_dim + }); + + match target_type { + 1 => { + // Extract points + let mut points = Vec::new(); + for g in &flat { + match g { + geo::Geometry::Point(p) => points.push(*p), + geo::Geometry::MultiPoint(mp) => points.extend(mp.0.iter()), + _ => {} + } + } + geo::Geometry::MultiPoint(geo::MultiPoint(points)) + } + 2 => { + // Extract line strings + let mut lines = Vec::new(); + for g in &flat { + match g { + geo::Geometry::Line(l) => { + lines.push(geo::LineString(vec![l.start, l.end])); + } + geo::Geometry::LineString(ls) => lines.push(ls.clone()), + geo::Geometry::MultiLineString(mls) => lines.extend(mls.0.iter().cloned()), + _ => {} + } + } + geo::Geometry::MultiLineString(geo::MultiLineString(lines)) + } + 3 => { + // Extract polygons + let mut polygons = Vec::new(); + for g in &flat { + match g { + geo::Geometry::Polygon(p) => polygons.push(p.clone()), + geo::Geometry::MultiPolygon(mp) => polygons.extend(mp.0.iter().cloned()), + geo::Geometry::Rect(r) => polygons.push(r.to_polygon()), + geo::Geometry::Triangle(t) => polygons.push(t.to_polygon()), + _ => {} + } + } + geo::Geometry::MultiPolygon(geo::MultiPolygon(polygons)) + } + _ => { + // Return empty geometry collection for unknown type + geo::Geometry::GeometryCollection(geo::GeometryCollection(vec![])) + } + } +} + +fn _collection_extract_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + extract_type: Option, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let extracted = extract_by_type(&geo_geom, extract_type); + builder.push_geometry(Some(&extracted))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// ========================================================================= +// Helper function for geometry return field +// ========================================================================= +fn geom_return_field( + args: ReturnFieldArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let metadata = Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = GeometryType::new(metadata).with_coord_type(coord_type); + Ok(Arc::new(output_type.to_field("", true))) +} + +#[cfg(test)] +mod test { + use arrow_array::cast::AsArray; + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::native::io::{AsText, GeomFromText}; + + #[tokio::test] + async fn test_reverse_linestring() { + let ctx = SessionContext::new(); + ctx.register_udf(Reverse::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_Reverse(ST_GeomFromText('LINESTRING(0 0, 1 1, 2 2)')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "LINESTRING(2 2,1 1,0 0)"); + } + + #[tokio::test] + async fn test_reverse_point() { + let ctx = SessionContext::new(); + ctx.register_udf(Reverse::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql("SELECT ST_AsText(ST_Reverse(ST_GeomFromText('POINT(1 2)')));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "POINT(1 2)"); + } + + #[tokio::test] + async fn test_flip_coordinates() { + let ctx = SessionContext::new(); + ctx.register_udf(FlipCoordinates::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_FlipCoordinates(ST_GeomFromText('POINT(1 2)')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "POINT(2 1)"); + } + + #[tokio::test] + async fn test_flip_coordinates_linestring() { + let ctx = SessionContext::new(); + ctx.register_udf(FlipCoordinates::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_FlipCoordinates(ST_GeomFromText('LINESTRING(0 1, 2 3, 4 5)')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "LINESTRING(1 0,3 2,5 4)"); + } + + #[tokio::test] + async fn test_force2d() { + let ctx = SessionContext::new(); + ctx.register_udf(Force2D::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_Force2D(ST_GeomFromText('POINT(1 2)')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "POINT(1 2)"); + } + + #[tokio::test] + async fn test_multi_point() { + let ctx = SessionContext::new(); + ctx.register_udf(Multi::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql("SELECT ST_AsText(ST_Multi(ST_GeomFromText('POINT(1 2)')));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "MULTIPOINT((1 2))"); + } + + #[tokio::test] + async fn test_multi_linestring() { + let ctx = SessionContext::new(); + ctx.register_udf(Multi::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_Multi(ST_GeomFromText('LINESTRING(0 0, 1 1)')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "MULTILINESTRING((0 0,1 1))"); + } + + #[tokio::test] + async fn test_multi_polygon() { + let ctx = SessionContext::new(); + ctx.register_udf(Multi::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_Multi(ST_GeomFromText('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "MULTIPOLYGON(((0 0,1 0,1 1,0 1,0 0)))"); + } + + #[tokio::test] + async fn test_multi_already_multi() { + let ctx = SessionContext::new(); + ctx.register_udf(Multi::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_Multi(ST_GeomFromText('MULTIPOINT(1 2, 3 4)')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "MULTIPOINT((1 2),(3 4))"); + } + + #[tokio::test] + async fn test_normalize() { + let ctx = SessionContext::new(); + ctx.register_udf(Normalize::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + // Normalizing a point should return the same point + let df = ctx + .sql( + "SELECT ST_AsText(ST_Normalize(ST_GeomFromText('POINT(1 2)')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "POINT(1 2)"); + } + + #[tokio::test] + async fn test_normalize_multipoint() { + let ctx = SessionContext::new(); + ctx.register_udf(Normalize::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + // Normalizing a multipoint should sort points lexicographically + let df = ctx + .sql( + "SELECT ST_AsText(ST_Normalize(ST_GeomFromText('MULTIPOINT(3 4, 1 2)')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "MULTIPOINT((1 2),(3 4))"); + } + + #[tokio::test] + async fn test_remove_repeated_points() { + let ctx = SessionContext::new(); + ctx.register_udf(RemoveRepeatedPoints::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_RemoveRepeatedPoints(ST_GeomFromText('LINESTRING(0 0, 0 0, 1 1, 1 1, 2 2)')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "LINESTRING(0 0,1 1,2 2)"); + } + + #[tokio::test] + async fn test_remove_repeated_points_with_tolerance() { + let ctx = SessionContext::new(); + ctx.register_udf(RemoveRepeatedPoints::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_RemoveRepeatedPoints(ST_GeomFromText('LINESTRING(0 0, 0.1 0.1, 1 1, 2 2)'), 0.2));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + // 0.1,0.1 is within tolerance 0.2 of 0,0 so it gets removed + assert_eq!(val, "LINESTRING(0 0,1 1,2 2)"); + } + + #[tokio::test] + async fn test_collection_extract_polygons() { + let ctx = SessionContext::new(); + ctx.register_udf(CollectionExtract::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_CollectionExtract(ST_GeomFromText('GEOMETRYCOLLECTION(POINT(0 0), LINESTRING(1 1, 2 2), POLYGON((0 0, 1 0, 1 1, 0 1, 0 0)))'), 3));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "MULTIPOLYGON(((0 0,1 0,1 1,0 1,0 0)))"); + } + + #[tokio::test] + async fn test_collection_extract_points() { + let ctx = SessionContext::new(); + ctx.register_udf(CollectionExtract::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_CollectionExtract(ST_GeomFromText('GEOMETRYCOLLECTION(POINT(0 0), POINT(1 1), LINESTRING(2 2, 3 3))'), 1));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "MULTIPOINT((0 0),(1 1))"); + } + + #[tokio::test] + async fn test_collection_extract_lines() { + let ctx = SessionContext::new(); + ctx.register_udf(CollectionExtract::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_CollectionExtract(ST_GeomFromText('GEOMETRYCOLLECTION(POINT(0 0), LINESTRING(1 1, 2 2))'), 2));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "MULTILINESTRING((1 1,2 2))"); + } + + #[tokio::test] + async fn test_collection_extract_default_highest_dim() { + let ctx = SessionContext::new(); + ctx.register_udf(CollectionExtract::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + // Without a type parameter, should extract highest-dimension type (polygon = 3) + let df = ctx + .sql( + "SELECT ST_AsText(ST_CollectionExtract(ST_GeomFromText('GEOMETRYCOLLECTION(POINT(0 0), POLYGON((0 0, 1 0, 1 1, 0 1, 0 0)))')));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_string::() + .value(0); + assert_eq!(val, "MULTIPOLYGON(((0 0,1 0,1 1,0 1,0 0)))"); + } +} diff --git a/rust/geodatafusion/src/udf/geo/processing/linear_ref.rs b/rust/geodatafusion/src/udf/geo/processing/linear_ref.rs new file mode 100644 index 0000000..da896a3 --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/processing/linear_ref.rs @@ -0,0 +1,527 @@ +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_array::builder::Float64Builder; +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion::scalar::ScalarValue; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::builder::{GeometryBuilder, PointBuilder}; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, Dimension, GeometryType, Metadata, PointType}; + +use crate::error::GeoDataFusionResult; + +// --------------------------------------------------------------------------- +// ST_LineInterpolatePoint +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct LineInterpolatePoint { + signature: Signature, + coord_type: CoordType, +} + +impl LineInterpolatePoint { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for LineInterpolatePoint { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static LIP_DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for LineInterpolatePoint { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_lineinterpolatepoint" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + let metadata = + Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = + PointType::new(Dimension::XY, metadata).with_coord_type(self.coord_type); + Ok(Arc::new(output_type.to_field("", true))) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(line_interpolate_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(LIP_DOCUMENTATION.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns a point interpolated along a line at a fractional location. The first argument must be a LineString. The second argument is a float between 0 and 1 representing the fraction of the total line length where the point is located.", + "ST_LineInterpolatePoint(linestring, fraction)", + ) + .with_argument("geom", "geometry (LineString)") + .with_argument("fraction", "float8 - value between 0 and 1") + .build() + })) + } +} + +fn line_interpolate_impl( + args: ScalarFunctionArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let fraction = match args.args[1].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(f))) => f, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized fraction not yet implemented for ST_LineInterpolatePoint".to_string(), + ) + .into()); + } + }; + + let result = line_interpolate_array(&geo_array, fraction, coord_type)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn line_interpolate_array( + array: &dyn GeoArrowArray, + fraction: f64, + coord_type: CoordType, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _line_interpolate_impl, fraction, coord_type) +} + +fn _line_interpolate_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + fraction: f64, + coord_type: CoordType, +) -> GeoArrowResult> { + use geo::LineInterpolatePoint as _; + + let point_type = PointType::new(Dimension::XY, Default::default()).with_coord_type(coord_type); + let mut builder = PointBuilder::new(point_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + match geo_geom { + geo::Geometry::LineString(ref ls) => { + let pt = ls.line_interpolate_point(fraction); + match pt { + Some(p) => builder.push_point(Some(&p)), + None => builder.push_null(), + } + } + _ => builder.push_null(), + } + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// --------------------------------------------------------------------------- +// ST_LineLocatePoint +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct LineLocatePoint { + signature: Signature, +} + +impl LineLocatePoint { + pub fn new() -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + } + } +} + +impl Default for LineLocatePoint { + fn default() -> Self { + Self::new() + } +} + +static LLP_DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for LineLocatePoint { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_linelocatepoint" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn return_field_from_args(&self, _args: ReturnFieldArgs) -> Result { + Ok(Arc::new(arrow_schema::Field::new("", DataType::Float64, true))) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(line_locate_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(LLP_DOCUMENTATION.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns a float between 0 and 1 representing the location of the closest point on a LineString to the given Point, as a fraction of the total 2D line length.", + "ST_LineLocatePoint(linestring, point)", + ) + .with_argument("geom", "geometry (LineString)") + .with_argument("point", "geometry (Point)") + .build() + })) + } +} + +fn line_locate_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let left_arr = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let right_arr = from_arrow_array(&arrays[1], &args.arg_fields[1])?; + + let result = line_locate_arrays(&left_arr, &right_arr)?; + Ok(ColumnarValue::Array(Arc::new(result))) +} + +fn line_locate_arrays( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(left, _line_locate_left_impl, right) +} + +fn _line_locate_left_impl<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(right, __line_locate_impl, left) +} + +fn __line_locate_impl<'a, 'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + use geo::LineLocatePoint as _; + + let mut builder = Float64Builder::with_capacity(left.len()); + + for (l, r) in left.iter().zip(right.iter()) { + match (l, r) { + (Some(left_geom), Some(right_geom)) => { + let left_geo = geometry_to_geo(&left_geom?)?; + let right_geo = geometry_to_geo(&right_geom?)?; + match (&left_geo, &right_geo) { + (geo::Geometry::LineString(ls), geo::Geometry::Point(pt)) => { + let frac = ls.line_locate_point(pt); + match frac { + Some(f) => builder.append_value(f), + None => builder.append_null(), + } + } + _ => builder.append_null(), + } + } + _ => builder.append_null(), + } + } + + Ok(builder.finish()) +} + +// --------------------------------------------------------------------------- +// ST_LineSubstring +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct LineSubstring { + signature: Signature, + coord_type: CoordType, +} + +impl LineSubstring { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(3, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for LineSubstring { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static LS_DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for LineSubstring { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_linesubstring" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + let metadata = + Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = GeometryType::new(metadata).with_coord_type(self.coord_type); + Ok(Arc::new(output_type.to_field("", true))) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(line_substring_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(LS_DOCUMENTATION.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the portion of a line between two fractional locations. The first argument must be a LineString. The second and third arguments are floats between 0 and 1 representing the start and end fractions.", + "ST_LineSubstring(linestring, startfraction, endfraction)", + ) + .with_argument("geom", "geometry (LineString)") + .with_argument("startfraction", "float8 - value between 0 and 1") + .with_argument("endfraction", "float8 - value between 0 and 1") + .build() + })) + } +} + +fn line_substring_impl( + args: ScalarFunctionArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let start_fraction = match args.args[1].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(f))) => f, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized startfraction not yet implemented for ST_LineSubstring".to_string(), + ) + .into()); + } + }; + + let end_fraction = match args.args[2].cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(f))) => f, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized endfraction not yet implemented for ST_LineSubstring".to_string(), + ) + .into()); + } + }; + + let result = line_substring_array(&geo_array, start_fraction, end_fraction, coord_type)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn line_substring_array( + array: &dyn GeoArrowArray, + start_fraction: f64, + end_fraction: f64, + coord_type: CoordType, +) -> GeoArrowResult> { + downcast_geoarrow_array!( + array, + _line_substring_impl, + start_fraction, + end_fraction, + coord_type + ) +} + +fn _line_substring_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + start_fraction: f64, + end_fraction: f64, + coord_type: CoordType, +) -> GeoArrowResult> { + use geo::{EuclideanDistance, EuclideanLength, LineInterpolatePoint as _}; + + let geom_type = GeometryType::new(Default::default()).with_coord_type(coord_type); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + match geo_geom { + geo::Geometry::LineString(ref ls) => { + let total_length = ls.euclidean_length(); + if total_length == 0.0 { + builder.push_null(); + continue; + } + + let start_dist = start_fraction * total_length; + let end_dist = end_fraction * total_length; + + let mut coords: Vec = Vec::new(); + + // Add the interpolated start point + if let Some(start_pt) = ls.line_interpolate_point(start_fraction) { + coords.push(start_pt.into()); + } else { + builder.push_null(); + continue; + } + + // Add interior vertices that fall between start and end fractions + let mut cumulative = 0.0; + for i in 1..ls.0.len() { + let p1 = geo::Point::from(ls.0[i - 1]); + let p2 = geo::Point::from(ls.0[i]); + let segment_len = p1.euclidean_distance(&p2); + cumulative += segment_len; + + if cumulative > start_dist && cumulative < end_dist { + coords.push(ls.0[i]); + } + } + + // Add the interpolated end point + if let Some(end_pt) = ls.line_interpolate_point(end_fraction) { + coords.push(end_pt.into()); + } else { + builder.push_null(); + continue; + } + + let result_ls = geo::LineString::new(coords); + builder.push_geometry(Some(&geo::Geometry::LineString(result_ls)))?; + } + _ => builder.push_null(), + } + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +#[cfg(test)] +mod test { + use arrow_array::cast::AsArray; + use arrow_array::types::Float64Type; + use approx::relative_eq; + use datafusion::prelude::SessionContext; + use geo_traits::{CoordTrait, PointTrait}; + use geoarrow_array::GeoArrowArrayAccessor; + use geoarrow_array::array::PointArray; + + use super::*; + use crate::udf::native::io::{AsText, GeomFromText}; + + #[tokio::test] + async fn test_line_interpolate_point() { + let ctx = SessionContext::new(); + + ctx.register_udf(LineInterpolatePoint::default().into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql( + "SELECT ST_LineInterpolatePoint(ST_GeomFromText('LINESTRING(0 0, 10 0)'), 0.5);", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let geo_arr = + PointArray::try_from((batch.column(0).as_ref(), batch.schema().field(0))).unwrap(); + let point = geo_arr.value(0).unwrap(); + assert!(relative_eq!(point.coord().unwrap().x(), 5.0)); + assert!(relative_eq!(point.coord().unwrap().y(), 0.0)); + } + + #[tokio::test] + async fn test_line_locate_point() { + let ctx = SessionContext::new(); + + ctx.register_udf(LineLocatePoint::default().into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql( + "SELECT ST_LineLocatePoint(ST_GeomFromText('LINESTRING(0 0, 10 0)'), ST_GeomFromText('POINT(5 0)'));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let col = batch.column(0); + let frac = col.as_primitive::().value(0); + assert!(relative_eq!(frac, 0.5)); + } + + #[tokio::test] + async fn test_line_substring() { + let ctx = SessionContext::new(); + + ctx.register_udf(LineSubstring::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_LineSubstring(ST_GeomFromText('LINESTRING(0 0, 10 0)'), 0.25, 0.75));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let col = batch.column(0); + let wkt = col.as_string::().value(0); + assert_eq!(wkt, "LINESTRING(2.5 0,7.5 0)"); + } +} diff --git a/rust/geodatafusion/src/udf/geo/processing/make_valid.rs b/rust/geodatafusion/src/udf/geo/processing/make_valid.rs new file mode 100644 index 0000000..533f82a --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/processing/make_valid.rs @@ -0,0 +1,116 @@ +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, +}; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::builder::GeometryBuilder; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, GeometryType, Metadata}; + +use crate::data_types::any_single_geometry_type_input; +use crate::error::GeoDataFusionResult; + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct MakeValid { + coord_type: CoordType, +} + +impl MakeValid { + pub fn new(coord_type: CoordType) -> Self { + Self { coord_type } + } +} + +impl Default for MakeValid { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for MakeValid { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_makevalid" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + let metadata = + Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = GeometryType::new(metadata).with_coord_type(self.coord_type); + Ok(Arc::new(output_type.to_field("", true))) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(make_valid_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Attempts to make an invalid polygonal geometry valid using a buffer(0) strategy. Note: this is an approximation; it may not preserve all input vertices. Non-polygonal geometries are returned unchanged.", + "ST_MakeValid(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn make_valid_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = make_valid_array(&geo_array)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn make_valid_array(array: &dyn GeoArrowArray) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _make_valid_impl) +} + +fn _make_valid_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + // Use buffer(0) as a simple make-valid strategy + use geo::Buffer; + let valid = match &geo_geom { + geo::Geometry::Polygon(_) | geo::Geometry::MultiPolygon(_) => { + let buffered = geo_geom.buffer(0.0); + geo::Geometry::MultiPolygon(buffered) + } + other => other.clone(), + }; + builder.push_geometry(Some(&valid))?; + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} diff --git a/rust/geodatafusion/src/udf/geo/processing/mod.rs b/rust/geodatafusion/src/udf/geo/processing/mod.rs index 780b2c2..afe90e7 100644 --- a/rust/geodatafusion/src/udf/geo/processing/mod.rs +++ b/rust/geodatafusion/src/udf/geo/processing/mod.rs @@ -1,21 +1,64 @@ +mod advanced; +mod affine; +mod buffer; mod centroid; mod convex_hull; +mod editors; +mod linear_ref; +mod make_valid; mod oriented_envelope; +mod overlay; mod point_on_surface; mod simplify; +mod transform; +pub use advanced::{ChaikinSmoothing, ReducePrecision, Segmentize, UnaryUnion}; +pub use affine::{StAffine, StRotate, StScale, StTranslate}; +pub use buffer::StBuffer; pub use centroid::Centroid; pub use convex_hull::ConvexHull; +pub use editors::{ + CollectionExtract, FlipCoordinates, Force2D, Multi, Normalize, RemoveRepeatedPoints, Reverse, +}; +pub use linear_ref::{LineInterpolatePoint, LineLocatePoint, LineSubstring}; +pub use make_valid::MakeValid; pub use oriented_envelope::OrientedEnvelope; +pub use overlay::{StDifference, StIntersection, StSymDifference, StUnion}; pub use point_on_surface::PointOnSurface; pub use simplify::{Simplify, SimplifyPreserveTopology, SimplifyVW}; +pub use transform::StTransform; pub fn register(session_context: &datafusion::prelude::SessionContext) { session_context.register_udf(Centroid::default().into()); + session_context.register_udf(ChaikinSmoothing::default().into()); + session_context.register_udf(CollectionExtract::default().into()); session_context.register_udf(ConvexHull::default().into()); + session_context.register_udf(FlipCoordinates::default().into()); + session_context.register_udf(Force2D::default().into()); + session_context.register_udf(LineInterpolatePoint::default().into()); + session_context.register_udf(LineLocatePoint::default().into()); + session_context.register_udf(LineSubstring::default().into()); + session_context.register_udf(MakeValid::default().into()); + session_context.register_udf(Multi::default().into()); + session_context.register_udf(Normalize::default().into()); session_context.register_udf(OrientedEnvelope::default().into()); session_context.register_udf(PointOnSurface::default().into()); + session_context.register_udf(ReducePrecision::default().into()); + session_context.register_udf(RemoveRepeatedPoints::default().into()); + session_context.register_udf(Reverse::default().into()); + session_context.register_udf(Segmentize::default().into()); session_context.register_udf(Simplify::default().into()); session_context.register_udf(SimplifyPreserveTopology::default().into()); session_context.register_udf(SimplifyVW::default().into()); + session_context.register_udf(StAffine::default().into()); + session_context.register_udf(StBuffer::default().into()); + session_context.register_udf(StDifference::default().into()); + session_context.register_udf(StIntersection::default().into()); + session_context.register_udf(StRotate::default().into()); + session_context.register_udf(StScale::default().into()); + session_context.register_udf(StSymDifference::default().into()); + // StTransform not registered: requires proj crate for actual coordinate reprojection + session_context.register_udf(StTranslate::default().into()); + session_context.register_udf(StUnion::default().into()); + session_context.register_udf(UnaryUnion::default().into()); } diff --git a/rust/geodatafusion/src/udf/geo/processing/overlay.rs b/rust/geodatafusion/src/udf/geo/processing/overlay.rs new file mode 100644 index 0000000..1de4a5d --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/processing/overlay.rs @@ -0,0 +1,462 @@ +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_schema::{DataType, Field, FieldRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use geo::BooleanOps; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::builder::GeometryBuilder; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, GeometryType, Metadata}; + +use crate::error::GeoDataFusionResult; + +/// Enum representing the type of overlay operation. +#[derive(Debug, Clone, Copy)] +enum OverlayOp { + Union, + Intersection, + Difference, + SymDifference, +} + +macro_rules! impl_overlay_udf { + ($struct_name:ident, $udf_name:expr, $doc_var:ident, $op:expr, $doc_text:expr, $doc_example:expr) => { + #[derive(Debug, Eq, PartialEq, Hash)] + pub struct $struct_name { + signature: Signature, + coord_type: CoordType, + } + + impl $struct_name { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + coord_type, + } + } + } + + impl Default for $struct_name { + fn default() -> Self { + Self::new(Default::default()) + } + } + + static $doc_var: OnceLock = OnceLock::new(); + + impl ScalarUDFImpl for $struct_name { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + $udf_name + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(overlay_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let mut arg_iter = args.args.into_iter(); + let left = arg_iter.next().unwrap(); + let right = arg_iter.next().unwrap(); + Ok(overlay_impl( + left, + &args.arg_fields[0], + right, + &args.arg_fields[1], + $op, + self.coord_type, + )?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some($doc_var.get_or_init(|| { + Documentation::builder(DOC_SECTION_OTHER, $doc_text, $doc_example) + .with_argument("geomA", "geometry") + .with_argument("geomB", "geometry") + .build() + })) + } + } + }; +} + +impl_overlay_udf!( + StUnion, + "st_union", + UNION_DOC, + OverlayOp::Union, + "Returns a geometry representing the point-set union of the input geometries. The result may be a collection geometry.", + "ST_Union(geomA, geomB)" +); + +impl_overlay_udf!( + StIntersection, + "st_intersection", + INTERSECTION_DOC, + OverlayOp::Intersection, + "Returns a geometry representing the point-set intersection of two geometries. The result may be a collection of geometries.", + "ST_Intersection(geomA, geomB)" +); + +impl_overlay_udf!( + StDifference, + "st_difference", + DIFFERENCE_DOC, + OverlayOp::Difference, + "Returns a geometry representing the point-set difference of two geometries. That is, the part of geometry A that does not intersect with geometry B.", + "ST_Difference(geomA, geomB)" +); + +impl_overlay_udf!( + StSymDifference, + "st_symdifference", + SYMDIFFERENCE_DOC, + OverlayOp::SymDifference, + "Returns a geometry representing the symmetric difference (exclusive or) of two geometries. That is, the portions of the two geometries that do not intersect.", + "ST_SymDifference(geomA, geomB)" +); + +fn overlay_return_field( + args: ReturnFieldArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let metadata = Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = GeometryType::new(metadata).with_coord_type(coord_type); + Ok(Arc::new(output_type.to_field("", true))) +} + +fn overlay_impl( + left: ColumnarValue, + left_field: &Field, + right: ColumnarValue, + right_field: &Field, + op: OverlayOp, + coord_type: CoordType, +) -> GeoDataFusionResult { + match (left, right) { + (ColumnarValue::Scalar(left_scalar), ColumnarValue::Scalar(right_scalar)) => { + let mut arrays = + ColumnarValue::values_to_arrays(&[left_scalar.into(), right_scalar.into()])? + .into_iter(); + let left_array = ColumnarValue::Array(arrays.next().unwrap()); + let right_array = ColumnarValue::Array(arrays.next().unwrap()); + overlay_impl(left_array, left_field, right_array, right_field, op, coord_type) + } + (ColumnarValue::Array(left_arr), ColumnarValue::Array(right_arr)) => { + let left_geo = from_arrow_array(&left_arr, left_field)?; + let right_geo = from_arrow_array(&right_arr, right_field)?; + let result = overlay_arrays(&left_geo, &right_geo, op, coord_type)?; + Ok(ColumnarValue::Array(result.into_array_ref())) + } + (ColumnarValue::Scalar(left_scalar), ColumnarValue::Array(right_arr)) => { + let left_arrays = ColumnarValue::values_to_arrays(&[left_scalar.into()])?; + let left_geo = from_arrow_array(&left_arrays[0], left_field)?; + let right_geo = from_arrow_array(&right_arr, right_field)?; + // Broadcast the single left geometry across right array + let result = overlay_broadcast_left(&left_geo, &right_geo, op, coord_type)?; + Ok(ColumnarValue::Array(result.into_array_ref())) + } + (ColumnarValue::Array(left_arr), ColumnarValue::Scalar(right_scalar)) => { + let right_arrays = ColumnarValue::values_to_arrays(&[right_scalar.into()])?; + let left_geo = from_arrow_array(&left_arr, left_field)?; + let right_geo = from_arrow_array(&right_arrays[0], right_field)?; + // Broadcast the single right geometry across left array + let result = overlay_broadcast_right(&left_geo, &right_geo, op, coord_type)?; + Ok(ColumnarValue::Array(result.into_array_ref())) + } + } +} + +fn apply_overlay(left: &geo::Geometry, right: &geo::Geometry, op: OverlayOp) -> geo::Geometry { + // Convert both to MultiPolygon for BooleanOps + let left_mp = geometry_to_multi_polygon(left); + let right_mp = geometry_to_multi_polygon(right); + let result = match op { + OverlayOp::Union => left_mp.union(&right_mp), + OverlayOp::Intersection => left_mp.intersection(&right_mp), + OverlayOp::Difference => left_mp.difference(&right_mp), + OverlayOp::SymDifference => left_mp.xor(&right_mp), + }; + geo::Geometry::MultiPolygon(result) +} + +fn geometry_to_multi_polygon(geom: &geo::Geometry) -> geo::MultiPolygon { + match geom { + geo::Geometry::Polygon(p) => geo::MultiPolygon(vec![p.clone()]), + geo::Geometry::MultiPolygon(mp) => mp.clone(), + // For non-polygonal geometries, return empty + _ => geo::MultiPolygon(vec![]), + } +} + +fn overlay_arrays( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, + op: OverlayOp, + _coord_type: CoordType, +) -> GeoArrowResult> { + downcast_geoarrow_array!(left, _overlay_arrays_impl, right, op) +} + +fn _overlay_arrays_impl<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, + op: OverlayOp, +) -> GeoArrowResult> { + downcast_geoarrow_array!(right, __overlay_arrays_impl, left, op) +} + +fn __overlay_arrays_impl<'a, 'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &'a impl GeoArrowArrayAccessor<'a>, + op: OverlayOp, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + for (l, r) in left.iter().zip(right.iter()) { + match (l, r) { + (Some(left_geom), Some(right_geom)) => { + let left_geo = geometry_to_geo(&left_geom?)?; + let right_geo = geometry_to_geo(&right_geom?)?; + let result = apply_overlay(&left_geo, &right_geo, op); + builder.push_geometry(Some(&result))?; + } + _ => { + builder.push_null(); + } + } + } + + Ok(Arc::new(builder.finish())) +} + +fn overlay_broadcast_left( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, + op: OverlayOp, + _coord_type: CoordType, +) -> GeoArrowResult> { + downcast_geoarrow_array!(left, _overlay_broadcast_left_impl, right, op) +} + +fn _overlay_broadcast_left_impl<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, + op: OverlayOp, +) -> GeoArrowResult> { + // Get the single left geometry + let left_geom = left + .iter() + .next() + .unwrap() + .map(|g| geometry_to_geo(&g?)) + .transpose()?; + + downcast_geoarrow_array!(right, __overlay_broadcast_left_impl, &left_geom, op) +} + +fn __overlay_broadcast_left_impl<'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left_geom: &Option, + op: OverlayOp, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + if let Some(left_geo) = left_geom { + for r in right.iter() { + if let Some(right_geom) = r { + let right_geo = geometry_to_geo(&right_geom?)?; + let result = apply_overlay(left_geo, &right_geo, op); + builder.push_geometry(Some(&result))?; + } else { + builder.push_null(); + } + } + } else { + for _ in 0..right.len() { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +fn overlay_broadcast_right( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, + op: OverlayOp, + _coord_type: CoordType, +) -> GeoArrowResult> { + downcast_geoarrow_array!(right, _overlay_broadcast_right_impl, left, op) +} + +fn _overlay_broadcast_right_impl<'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &dyn GeoArrowArray, + op: OverlayOp, +) -> GeoArrowResult> { + let right_geom = right + .iter() + .next() + .unwrap() + .map(|g| geometry_to_geo(&g?)) + .transpose()?; + + downcast_geoarrow_array!(left, __overlay_broadcast_right_impl, &right_geom, op) +} + +fn __overlay_broadcast_right_impl<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right_geom: &Option, + op: OverlayOp, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + + if let Some(right_geo) = right_geom { + for l in left.iter() { + if let Some(left_geom) = l { + let left_geo = geometry_to_geo(&left_geom?)?; + let result = apply_overlay(&left_geo, right_geo, op); + builder.push_geometry(Some(&result))?; + } else { + builder.push_null(); + } + } + } else { + for _ in 0..left.len() { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +#[cfg(test)] +mod test { + use arrow_array::cast::AsArray; + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::geo::measurement::Area; + use crate::udf::native::io::{AsText, GeomFromText}; + + #[tokio::test] + async fn test_intersection() { + let ctx = SessionContext::new(); + + ctx.register_udf(StIntersection::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql( + "SELECT ST_AsText(ST_Intersection( + ST_GeomFromText('POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))'), + ST_GeomFromText('POLYGON((5 5, 15 5, 15 15, 5 15, 5 5))') + ));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + // The intersection should be a 5x5 polygon + assert_eq!(batch.num_rows(), 1); + } + + #[tokio::test] + async fn test_union() { + let ctx = SessionContext::new(); + + ctx.register_udf(StUnion::default().into()); + ctx.register_udf(Area::new().into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql( + "SELECT ST_Area(ST_Union( + ST_GeomFromText('POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))'), + ST_GeomFromText('POLYGON((5 5, 15 5, 15 15, 5 15, 5 5))') + ));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let col = batch.column(0); + let area = col.as_primitive::().value(0); + // Union of two overlapping 10x10 squares = 200 - 25 = 175 + assert!((area - 175.0).abs() < 0.01, "Union area was {}", area); + } + + #[tokio::test] + async fn test_difference() { + let ctx = SessionContext::new(); + + ctx.register_udf(StDifference::default().into()); + ctx.register_udf(Area::new().into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql( + "SELECT ST_Area(ST_Difference( + ST_GeomFromText('POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))'), + ST_GeomFromText('POLYGON((5 5, 15 5, 15 15, 5 15, 5 5))') + ));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let col = batch.column(0); + let area = col.as_primitive::().value(0); + // Difference = 100 - 25 = 75 + assert!((area - 75.0).abs() < 0.01, "Difference area was {}", area); + } + + #[tokio::test] + async fn test_sym_difference() { + let ctx = SessionContext::new(); + + ctx.register_udf(StSymDifference::default().into()); + ctx.register_udf(Area::new().into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql( + "SELECT ST_Area(ST_SymDifference( + ST_GeomFromText('POLYGON((0 0, 10 0, 10 10, 0 10, 0 0))'), + ST_GeomFromText('POLYGON((5 5, 15 5, 15 15, 5 15, 5 5))') + ));", + ) + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let col = batch.column(0); + let area = col.as_primitive::().value(0); + // SymDifference = 200 - 2*25 = 150 + assert!( + (area - 150.0).abs() < 0.01, + "SymDifference area was {}", + area + ); + } +} diff --git a/rust/geodatafusion/src/udf/geo/processing/transform.rs b/rust/geodatafusion/src/udf/geo/processing/transform.rs new file mode 100644 index 0000000..c26f576 --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/processing/transform.rs @@ -0,0 +1,174 @@ +//! ST_Transform - Coordinate Reference System transformation. +//! +//! This module provides ST_Transform(geom, to_srid) which reprojects geometry +//! coordinates from the source SRID (embedded in the geometry metadata) to the +//! target SRID. +//! +//! Note: This is a placeholder implementation that stores the SRID metadata +//! without performing actual coordinate transformation. Full reprojection +//! requires the `proj` crate which depends on the PROJ C library. +//! When the `proj` crate is available, the implementation should be updated +//! to use `proj::Proj::new_known_crs` for actual coordinate transformation. + +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use datafusion::scalar::ScalarValue; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::builder::GeometryBuilder; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, GeometryType, Metadata}; + +use crate::error::GeoDataFusionResult; + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct StTransform { + signature: Signature, + coord_type: CoordType, +} + +impl StTransform { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for StTransform { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for StTransform { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_transform" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + Ok(transform_return_field(args, self.coord_type)?) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(transform_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns a new geometry with its coordinates transformed to a different spatial reference system. Currently stores the target SRID metadata; full reprojection requires the proj crate.", + "ST_Transform(geometry, target_srid)", + ) + .with_argument("geom", "geometry") + .with_argument("target_srid", "integer - target EPSG SRID code") + .build() + })) + } +} + +fn transform_return_field( + args: ReturnFieldArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + // Get target SRID to set in metadata + let metadata = Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = GeometryType::new(metadata).with_coord_type(coord_type); + Ok(Arc::new(output_type.to_field("", true))) +} + +fn transform_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[0..1])?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + + let target_srid = match args.args[1].cast_to(&DataType::Int32, None)? { + ColumnarValue::Scalar(ScalarValue::Int32(Some(srid))) => srid, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized SRID not supported for ST_Transform".to_string(), + ) + .into()) + } + }; + + let result = transform_array(&geo_array, target_srid)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn transform_array( + array: &dyn GeoArrowArray, + _target_srid: i32, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _transform_impl, _target_srid) +} + +fn _transform_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + _target_srid: i32, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()); + let mut builder = GeometryBuilder::new(geom_type); + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + builder.push_geometry(Some(&geo_geom))?; + } else { + builder.push_null(); + } + } + Ok(Arc::new(builder.finish())) +} + +#[cfg(test)] +mod test { + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::native::io::{AsText, GeomFromText}; + + #[tokio::test] + async fn test_transform_preserves_geometry() { + let ctx = SessionContext::new(); + ctx.register_udf(StTransform::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let df = ctx + .sql("SELECT ST_AsText(ST_Transform(ST_GeomFromText('POINT(0 0)'), 4326));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0); + assert_eq!(val, "POINT(0 0)"); + } +} diff --git a/rust/geodatafusion/src/udf/geo/relationships/dwithin.rs b/rust/geodatafusion/src/udf/geo/relationships/dwithin.rs new file mode 100644 index 0000000..a2bdae2 --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/relationships/dwithin.rs @@ -0,0 +1,303 @@ +use std::any::Any; +use std::sync::OnceLock; + +use arrow_array::BooleanArray; +use arrow_array::builder::BooleanBuilder; +use arrow_schema::{DataType, Field}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; +use datafusion::scalar::ScalarValue; +use geo::EuclideanDistance; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; + +use crate::error::GeoDataFusionResult; + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct DWithin { + signature: Signature, +} + +impl DWithin { + pub fn new() -> Self { + Self { + signature: Signature::any(3, Volatility::Immutable), + } + } +} + +impl Default for DWithin { + fn default() -> Self { + Self::new() + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for DWithin { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_dwithin" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let mut arg_iter = args.args.into_iter(); + let left = arg_iter.next().unwrap(); + let right = arg_iter.next().unwrap(); + let distance_arg = arg_iter.next().unwrap(); + + let distance = match distance_arg.cast_to(&DataType::Float64, None)? { + ColumnarValue::Scalar(ScalarValue::Float64(Some(d))) => d, + _ => { + return Err(DataFusionError::NotImplemented( + "Vectorized distance not supported for ST_DWithin".to_string(), + )); + } + }; + + Ok(dwithin_impl( + left, + &args.arg_fields[0], + right, + &args.arg_fields[1], + distance, + )?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns true if the geometries are within a given distance of one another. The distance is specified in units of the spatial reference system.", + "ST_DWithin(geomA, geomB, distance)", + ) + .with_argument("geomA", "geometry") + .with_argument("geomB", "geometry") + .with_argument("distance", "float8 - distance threshold") + .build() + })) + } +} + +fn dwithin_impl( + left: ColumnarValue, + left_field: &Field, + right: ColumnarValue, + right_field: &Field, + distance: f64, +) -> GeoDataFusionResult { + match (left, right) { + (ColumnarValue::Scalar(l), ColumnarValue::Scalar(r)) => { + let mut arrays = + ColumnarValue::values_to_arrays(&[ColumnarValue::Scalar(l), ColumnarValue::Scalar(r)])? + .into_iter(); + dwithin_impl( + ColumnarValue::Array(arrays.next().unwrap()), + left_field, + ColumnarValue::Array(arrays.next().unwrap()), + right_field, + distance, + ) + } + (ColumnarValue::Array(l_arr), ColumnarValue::Array(r_arr)) => { + let left_geo = from_arrow_array(&l_arr, left_field)?; + let right_geo = from_arrow_array(&r_arr, right_field)?; + let result = dwithin_arrays(&left_geo, &right_geo, distance)?; + Ok(ColumnarValue::Array(std::sync::Arc::new(result))) + } + (ColumnarValue::Scalar(l), ColumnarValue::Array(r_arr)) => { + let l_arrays = ColumnarValue::values_to_arrays(&[ColumnarValue::Scalar(l)])?; + let left_geo = from_arrow_array(&l_arrays[0], left_field)?; + let right_geo = from_arrow_array(&r_arr, right_field)?; + let result = dwithin_broadcast_left(&left_geo, &right_geo, distance)?; + Ok(ColumnarValue::Array(std::sync::Arc::new(result))) + } + (ColumnarValue::Array(l_arr), ColumnarValue::Scalar(r)) => { + let r_arrays = ColumnarValue::values_to_arrays(&[ColumnarValue::Scalar(r)])?; + let left_geo = from_arrow_array(&l_arr, left_field)?; + let right_geo = from_arrow_array(&r_arrays[0], right_field)?; + let result = dwithin_broadcast_right(&left_geo, &right_geo, distance)?; + Ok(ColumnarValue::Array(std::sync::Arc::new(result))) + } + } +} + +fn dwithin_arrays( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, + distance: f64, +) -> GeoArrowResult { + downcast_geoarrow_array!(left, _dwithin_left, right, distance) +} + +fn _dwithin_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, + distance: f64, +) -> GeoArrowResult { + downcast_geoarrow_array!(right, _dwithin_both, left, distance) +} + +fn _dwithin_both<'a, 'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &'a impl GeoArrowArrayAccessor<'a>, + distance: f64, +) -> GeoArrowResult { + let mut builder = BooleanBuilder::with_capacity(left.len()); + for (l, r) in left.iter().zip(right.iter()) { + match (l, r) { + (Some(lg), Some(rg)) => { + let left_geo = geometry_to_geo(&lg?)?; + let right_geo = geometry_to_geo(&rg?)?; + builder.append_value(left_geo.euclidean_distance(&right_geo) <= distance); + } + _ => builder.append_null(), + } + } + Ok(builder.finish()) +} + +fn dwithin_broadcast_left( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, + distance: f64, +) -> GeoArrowResult { + downcast_geoarrow_array!(left, _dwithin_bl_left, right, distance) +} + +fn _dwithin_bl_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, + distance: f64, +) -> GeoArrowResult { + let left_geom = left + .iter() + .next() + .unwrap() + .map(|g| geometry_to_geo(&g?)) + .transpose()?; + + downcast_geoarrow_array!(right, _dwithin_bl_right, &left_geom, distance) +} + +fn _dwithin_bl_right<'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left_geom: &Option, + distance: f64, +) -> GeoArrowResult { + let mut builder = BooleanBuilder::with_capacity(right.len()); + if let Some(lg) = left_geom { + for r in right.iter() { + if let Some(rg) = r { + let right_geo = geometry_to_geo(&rg?)?; + builder.append_value(lg.euclidean_distance(&right_geo) <= distance); + } else { + builder.append_null(); + } + } + } else { + for _ in 0..right.len() { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +fn dwithin_broadcast_right( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, + distance: f64, +) -> GeoArrowResult { + downcast_geoarrow_array!(right, _dwithin_br_right, left, distance) +} + +fn _dwithin_br_right<'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &dyn GeoArrowArray, + distance: f64, +) -> GeoArrowResult { + let right_geom = right + .iter() + .next() + .unwrap() + .map(|g| geometry_to_geo(&g?)) + .transpose()?; + + downcast_geoarrow_array!(left, _dwithin_br_left, &right_geom, distance) +} + +fn _dwithin_br_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right_geom: &Option, + distance: f64, +) -> GeoArrowResult { + let mut builder = BooleanBuilder::with_capacity(left.len()); + if let Some(rg) = right_geom { + for l in left.iter() { + if let Some(lg) = l { + let left_geo = geometry_to_geo(&lg?)?; + builder.append_value(left_geo.euclidean_distance(rg) <= distance); + } else { + builder.append_null(); + } + } + } else { + for _ in 0..left.len() { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +#[cfg(test)] +mod test { + use arrow_array::cast::AsArray; + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::native::io::GeomFromText; + + #[tokio::test] + async fn test_dwithin_true() { + let ctx = SessionContext::new(); + ctx.register_udf(DWithin::new().into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql("SELECT ST_DWithin(ST_GeomFromText('POINT(0 0)'), ST_GeomFromText('POINT(1 0)'), 1.5);") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + assert!(batch.column(0).as_boolean().value(0)); + } + + #[tokio::test] + async fn test_dwithin_false() { + let ctx = SessionContext::new(); + ctx.register_udf(DWithin::new().into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql("SELECT ST_DWithin(ST_GeomFromText('POINT(0 0)'), ST_GeomFromText('POINT(10 0)'), 1.5);") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + assert!(!batch.column(0).as_boolean().value(0)); + } +} diff --git a/rust/geodatafusion/src/udf/geo/relationships/mod.rs b/rust/geodatafusion/src/udf/geo/relationships/mod.rs index cea1af9..77c7806 100644 --- a/rust/geodatafusion/src/udf/geo/relationships/mod.rs +++ b/rust/geodatafusion/src/udf/geo/relationships/mod.rs @@ -1,5 +1,9 @@ +mod dwithin; +mod relate; mod topological; +pub use dwithin::DWithin; +pub use relate::{StRelate, StRelateMatch}; pub use topological::{ Contains, CoveredBy, Covers, Crosses, Disjoint, Equals, Intersects, Overlaps, Touches, Within, }; @@ -10,9 +14,12 @@ pub fn register(session_context: &datafusion::prelude::SessionContext) { session_context.register_udf(Covers::default().into()); session_context.register_udf(Crosses::default().into()); session_context.register_udf(Disjoint::default().into()); + session_context.register_udf(DWithin::default().into()); session_context.register_udf(Equals::default().into()); session_context.register_udf(Intersects::default().into()); session_context.register_udf(Overlaps::default().into()); + session_context.register_udf(StRelate::default().into()); + session_context.register_udf(StRelateMatch::default().into()); session_context.register_udf(Touches::default().into()); session_context.register_udf(Within::default().into()); } diff --git a/rust/geodatafusion/src/udf/geo/relationships/relate.rs b/rust/geodatafusion/src/udf/geo/relationships/relate.rs new file mode 100644 index 0000000..8931387 --- /dev/null +++ b/rust/geodatafusion/src/udf/geo/relationships/relate.rs @@ -0,0 +1,386 @@ +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_array::{Array, StringArray}; +use arrow_array::builder::StringBuilder; +use arrow_schema::{DataType, Field}; +use datafusion::error::Result; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; +use geo::Relate as GeoRelate; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; + +use crate::error::GeoDataFusionResult; + +/// ST_Relate — returns the DE-9IM intersection matrix string for two geometries. +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct StRelate { + signature: Signature, +} + +impl StRelate { + pub fn new() -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + } + } +} + +impl Default for StRelate { + fn default() -> Self { + Self::new() + } +} + +static RELATE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for StRelate { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_relate" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let mut arg_iter = args.args.into_iter(); + let left = arg_iter.next().unwrap(); + let right = arg_iter.next().unwrap(); + Ok(relate_string_impl(left, &args.arg_fields[0], right, &args.arg_fields[1])?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(RELATE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the DE-9IM (Dimensionally Extended 9-Intersection Model) matrix string representing the spatial relationship between two geometries.", + "ST_Relate(geomA, geomB)", + ) + .with_argument("geomA", "geometry") + .with_argument("geomB", "geometry") + .build() + })) + } +} + +fn relate_string_impl( + left: ColumnarValue, + left_field: &Field, + right: ColumnarValue, + right_field: &Field, +) -> GeoDataFusionResult { + match (left, right) { + (ColumnarValue::Scalar(l), ColumnarValue::Scalar(r)) => { + let mut arrays = + ColumnarValue::values_to_arrays(&[ColumnarValue::Scalar(l), ColumnarValue::Scalar(r)])? + .into_iter(); + relate_string_impl( + ColumnarValue::Array(arrays.next().unwrap()), + left_field, + ColumnarValue::Array(arrays.next().unwrap()), + right_field, + ) + } + (ColumnarValue::Array(l), ColumnarValue::Array(r)) => { + let left_geo = from_arrow_array(&l, left_field)?; + let right_geo = from_arrow_array(&r, right_field)?; + let result = relate_arrays(&left_geo, &right_geo)?; + Ok(ColumnarValue::Array(Arc::new(result))) + } + (ColumnarValue::Scalar(l), ColumnarValue::Array(r_arr)) => { + let l_arrays = ColumnarValue::values_to_arrays(&[ColumnarValue::Scalar(l)])?; + let left_geo = from_arrow_array(&l_arrays[0], left_field)?; + let right_geo = from_arrow_array(&r_arr, right_field)?; + let result = relate_broadcast_left(&left_geo, &right_geo)?; + Ok(ColumnarValue::Array(Arc::new(result))) + } + (ColumnarValue::Array(l_arr), ColumnarValue::Scalar(r)) => { + let r_arrays = ColumnarValue::values_to_arrays(&[ColumnarValue::Scalar(r)])?; + let left_geo = from_arrow_array(&l_arr, left_field)?; + let right_geo = from_arrow_array(&r_arrays[0], right_field)?; + let result = relate_broadcast_right(&left_geo, &right_geo)?; + Ok(ColumnarValue::Array(Arc::new(result))) + } + } +} + +fn relate_arrays( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(left, _relate_left, right) +} + +fn _relate_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(right, _relate_both, left) +} + +fn _relate_both<'a, 'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + let mut builder = StringBuilder::with_capacity(left.len(), left.len() * 9); + for (l, r) in left.iter().zip(right.iter()) { + match (l, r) { + (Some(lg), Some(rg)) => { + let left_geo = geometry_to_geo(&lg?)?; + let right_geo = geometry_to_geo(&rg?)?; + let matrix = left_geo.relate(&right_geo); + builder.append_value(intersection_matrix_to_string(&matrix)); + } + _ => builder.append_null(), + } + } + Ok(builder.finish()) +} + +/// ST_RelateMatch — tests if a DE-9IM matrix matches a given pattern. +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct StRelateMatch { + signature: Signature, +} + +impl StRelateMatch { + pub fn new() -> Self { + Self { + signature: Signature::exact( + vec![DataType::Utf8, DataType::Utf8], + Volatility::Immutable, + ), + } + } +} + +impl Default for StRelateMatch { + fn default() -> Self { + Self::new() + } +} + +static RELATE_MATCH_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for StRelateMatch { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_relatematch" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let matrices = arrays[0].as_any().downcast_ref::().unwrap(); + let patterns = arrays[1].as_any().downcast_ref::().unwrap(); + + let mut builder = arrow_array::builder::BooleanBuilder::with_capacity(matrices.len()); + + for i in 0..matrices.len() { + if matrices.is_null(i) || patterns.is_null(i) { + builder.append_null(); + } else { + let matrix_str = matrices.value(i); + let pattern = patterns.value(i); + let matches = de9im_matches(matrix_str, pattern); + builder.append_value(matches); + } + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(RELATE_MATCH_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Tests if a DE-9IM intersection matrix matches a given pattern string.", + "ST_RelateMatch(matrix, pattern)", + ) + .with_argument("matrix", "text - a DE-9IM matrix string (9 characters)") + .with_argument("pattern", "text - a DE-9IM pattern string") + .build() + })) + } +} + +fn relate_broadcast_left( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(left, _relate_bl_left, right) +} + +fn _relate_bl_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + let left_geom = left + .iter() + .next() + .unwrap() + .map(|g| geometry_to_geo(&g?)) + .transpose()?; + downcast_geoarrow_array!(right, _relate_bl_right, &left_geom) +} + +fn _relate_bl_right<'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left_geom: &Option, +) -> GeoArrowResult { + let mut builder = StringBuilder::with_capacity(right.len(), right.len() * 9); + if let Some(lg) = left_geom { + for r in right.iter() { + if let Some(rg) = r { + let right_geo = geometry_to_geo(&rg?)?; + let matrix = lg.relate(&right_geo); + builder.append_value(intersection_matrix_to_string(&matrix)); + } else { + builder.append_null(); + } + } + } else { + for _ in 0..right.len() { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +fn relate_broadcast_right( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(right, _relate_br_right, left) +} + +fn _relate_br_right<'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &dyn GeoArrowArray, +) -> GeoArrowResult { + let right_geom = right + .iter() + .next() + .unwrap() + .map(|g| geometry_to_geo(&g?)) + .transpose()?; + downcast_geoarrow_array!(left, _relate_br_left, &right_geom) +} + +fn _relate_br_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right_geom: &Option, +) -> GeoArrowResult { + let mut builder = StringBuilder::with_capacity(left.len(), left.len() * 9); + if let Some(rg) = right_geom { + for l in left.iter() { + if let Some(lg) = l { + let left_geo = geometry_to_geo(&lg?)?; + let matrix = left_geo.relate(rg); + builder.append_value(intersection_matrix_to_string(&matrix)); + } else { + builder.append_null(); + } + } + } else { + for _ in 0..left.len() { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +/// Convert an `IntersectionMatrix` to its 9-character DE-9IM string using the +/// public `get` API, avoiding any reliance on the `Debug` format. +fn intersection_matrix_to_string(matrix: &geo::relate::IntersectionMatrix) -> String { + use geo::coordinate_position::CoordPos; + use geo::dimensions::Dimensions; + let mut s = String::with_capacity(9); + for a in [CoordPos::Inside, CoordPos::OnBoundary, CoordPos::Outside] { + for b in [CoordPos::Inside, CoordPos::OnBoundary, CoordPos::Outside] { + s.push(match matrix.get(a, b) { + Dimensions::Empty => 'F', + Dimensions::ZeroDimensional => '0', + Dimensions::OneDimensional => '1', + Dimensions::TwoDimensional => '2', + }); + } + } + s +} + +/// Check if a DE-9IM matrix string matches a pattern. +fn de9im_matches(matrix: &str, pattern: &str) -> bool { + if matrix.len() != 9 || pattern.len() != 9 { + return false; + } + + matrix + .chars() + .zip(pattern.chars()) + .all(|(m, p)| match p { + '*' => true, + 'T' | 't' => m != 'F' && m != 'f', + 'F' | 'f' => m == 'F' || m == 'f', + '0' => m == '0', + '1' => m == '1', + '2' => m == '2', + _ => false, + }) +} + +#[cfg(test)] +mod test { + use arrow_array::cast::AsArray; + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::native::io::GeomFromText; + + #[tokio::test] + async fn test_relate() { + let ctx = SessionContext::new(); + ctx.register_udf(StRelate::new().into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql("SELECT ST_Relate(ST_GeomFromText('POINT(0 0)'), ST_GeomFromText('LINESTRING(0 0, 1 1)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + let val = batch.column(0).as_string::().value(0); + // Point on start of line: F0FFFF102 + assert_eq!(val.len(), 9); + } + + #[test] + fn test_de9im_matches() { + assert!(de9im_matches("212101212", "T*T***T**")); + assert!(!de9im_matches("FF*FF****", "T*T***T**")); + assert!(de9im_matches("FF*FF****", "FF*FF****")); + } +} diff --git a/rust/geodatafusion/src/udf/native/accessors/geometry_accessors.rs b/rust/geodatafusion/src/udf/native/accessors/geometry_accessors.rs new file mode 100644 index 0000000..822639d --- /dev/null +++ b/rust/geodatafusion/src/udf/native/accessors/geometry_accessors.rs @@ -0,0 +1,963 @@ +//! Additional geometry accessor functions: ST_IsEmpty, ST_IsClosed, ST_IsSimple, +//! ST_IsRing, ST_IsCollection, ST_Dimension, ST_NumGeometries, ST_HasZ, ST_HasM, +//! ST_NRings. + +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_array::builder::{BooleanBuilder, Int32Builder, UInt32Builder}; +use arrow_array::{ArrayRef, BooleanArray, Int32Array, UInt32Array}; +use arrow_schema::DataType; +use datafusion::error::Result; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, +}; +use geo_traits::*; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::Dimension; + +use crate::data_types::any_single_geometry_type_input; +use crate::error::GeoDataFusionResult; + +// ========================================================================= +// ST_IsEmpty +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct IsEmpty; + +impl Default for IsEmpty { + fn default() -> Self { + Self + } +} + +static IS_EMPTY_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for IsEmpty { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_isempty" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(is_empty_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(IS_EMPTY_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns true if the geometry is an empty geometry (has no points).", + "ST_IsEmpty(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn is_empty_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = is_empty_array(&geo_array)?; + Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)) +} + +fn is_empty_array(array: &dyn GeoArrowArray) -> GeoArrowResult { + downcast_geoarrow_array!(array, _is_empty_impl) +} + +fn _is_empty_impl<'a>(array: &'a impl GeoArrowArrayAccessor<'a>) -> GeoArrowResult { + let mut builder = BooleanBuilder::with_capacity(array.len()); + for item in array.iter() { + if let Some(geom) = item { + let g = geom?; + let empty = match g.as_type() { + GeometryType::Point(p) => p.coord().is_none(), + GeometryType::LineString(ls) => ls.num_coords() == 0, + GeometryType::Polygon(p) => { + p.exterior().map_or(true, |ext| ext.num_coords() == 0) + } + GeometryType::MultiPoint(mp) => mp.num_points() == 0, + GeometryType::MultiLineString(mls) => mls.num_line_strings() == 0, + GeometryType::MultiPolygon(mp) => mp.num_polygons() == 0, + GeometryType::GeometryCollection(gc) => gc.num_geometries() == 0, + _ => false, + }; + builder.append_value(empty); + } else { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +// ========================================================================= +// ST_IsClosed +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct IsClosed; + +impl Default for IsClosed { + fn default() -> Self { + Self + } +} + +static IS_CLOSED_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for IsClosed { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_isclosed" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(is_closed_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(IS_CLOSED_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns true if the geometry's start and end points are coincident.", + "ST_IsClosed(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn is_closed_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = is_closed_array(&geo_array)?; + Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)) +} + +fn is_closed_array(array: &dyn GeoArrowArray) -> GeoArrowResult { + downcast_geoarrow_array!(array, _is_closed_impl) +} + +fn _is_closed_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + use geoarrow_expr_geo::util::to_geo::geometry_to_geo; + let mut builder = BooleanBuilder::with_capacity(array.len()); + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let closed = is_geo_closed(&geo_geom); + builder.append_value(closed); + } else { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +fn is_geo_closed(geom: &geo::Geometry) -> bool { + match geom { + geo::Geometry::Point(_) => true, + geo::Geometry::LineString(ls) => is_linestring_closed(ls), + geo::Geometry::Polygon(_) => true, + geo::Geometry::MultiLineString(mls) => mls.0.iter().all(is_linestring_closed), + _ => true, + } +} + +fn is_linestring_closed(ls: &geo::LineString) -> bool { + if ls.0.len() < 2 { + return false; + } + let first = ls.0.first().unwrap(); + let last = ls.0.last().unwrap(); + first.x == last.x && first.y == last.y +} + +// ========================================================================= +// ST_IsSimple +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct IsSimple; + +impl Default for IsSimple { + fn default() -> Self { + Self + } +} + +static IS_SIMPLE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for IsSimple { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_issimple" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(is_simple_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(IS_SIMPLE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns true if the geometry has no anomalous geometric points.", + "ST_IsSimple(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn is_simple_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = is_simple_array(&geo_array)?; + Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)) +} + +fn is_simple_array(array: &dyn GeoArrowArray) -> GeoArrowResult { + downcast_geoarrow_array!(array, _is_simple_impl) +} + +fn _is_simple_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + use geoarrow_expr_geo::util::to_geo::geometry_to_geo; + let mut builder = BooleanBuilder::with_capacity(array.len()); + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let simple = is_geometry_simple(&geo_geom); + builder.append_value(simple); + } else { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +fn is_geometry_simple(geom: &geo::Geometry) -> bool { + match geom { + geo::Geometry::Point(_) => true, + geo::Geometry::MultiPoint(mp) => { + // Simple if no two points are equal + for i in 0..mp.0.len() { + for j in (i + 1)..mp.0.len() { + if mp.0[i] == mp.0[j] { + return false; + } + } + } + true + } + geo::Geometry::LineString(ls) => is_linestring_simple(ls), + geo::Geometry::MultiLineString(mls) => mls.0.iter().all(is_linestring_simple), + geo::Geometry::Polygon(_) | geo::Geometry::MultiPolygon(_) => true, + geo::Geometry::GeometryCollection(gc) => gc.0.iter().all(is_geometry_simple), + _ => true, + } +} + +fn is_linestring_simple(ls: &geo::LineString) -> bool { + use geo::line_intersection::{line_intersection, LineIntersection}; + + if ls.0.len() < 2 { + return true; + } + let lines: Vec = ls.lines().collect(); + for i in 0..lines.len() { + // Check against non-adjacent segments + for j in (i + 2)..lines.len() { + // Skip adjacent segments (i and i+1 share an endpoint) + // Also skip the pair of first and last segment if the linestring is closed + // (they share the start/end point by definition) + if i == 0 && j == lines.len() - 1 && is_linestring_closed(ls) { + continue; + } + if let Some(intersection) = line_intersection(lines[i], lines[j]) { + match intersection { + LineIntersection::SinglePoint { intersection, .. } => { + // Shared endpoint between non-adjacent segments is OK + // only if it's at the ends of both segments + let is_endpoint_i = + intersection == lines[i].start || intersection == lines[i].end; + let is_endpoint_j = + intersection == lines[j].start || intersection == lines[j].end; + if !(is_endpoint_i && is_endpoint_j) { + return false; + } + } + LineIntersection::Collinear { .. } => return false, + } + } + } + } + true +} + +// ========================================================================= +// ST_IsRing +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct IsRing; + +impl Default for IsRing { + fn default() -> Self { + Self + } +} + +static IS_RING_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for IsRing { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_isring" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(is_ring_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(IS_RING_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns true if the geometry is a closed and simple linestring (a ring).", + "ST_IsRing(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn is_ring_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = is_ring_array(&geo_array)?; + Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)) +} + +fn is_ring_array(array: &dyn GeoArrowArray) -> GeoArrowResult { + downcast_geoarrow_array!(array, _is_ring_impl) +} + +fn _is_ring_impl<'a>(array: &'a impl GeoArrowArrayAccessor<'a>) -> GeoArrowResult { + use geoarrow_expr_geo::util::to_geo::geometry_to_geo; + let mut builder = BooleanBuilder::with_capacity(array.len()); + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let ring = match &geo_geom { + geo::Geometry::LineString(ls) => { + ls.0.len() >= 4 && is_linestring_closed(ls) && is_linestring_simple(ls) + } + _ => false, + }; + builder.append_value(ring); + } else { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +// ========================================================================= +// ST_IsCollection +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct IsCollection; + +impl Default for IsCollection { + fn default() -> Self { + Self + } +} + +static IS_COLLECTION_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for IsCollection { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_iscollection" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(is_collection_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(IS_COLLECTION_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns true if the geometry is a collection type.", + "ST_IsCollection(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn is_collection_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = is_collection_array(&geo_array)?; + Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)) +} + +fn is_collection_array(array: &dyn GeoArrowArray) -> GeoArrowResult { + downcast_geoarrow_array!(array, _is_collection_impl) +} + +fn _is_collection_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + let mut builder = BooleanBuilder::with_capacity(array.len()); + for item in array.iter() { + if let Some(geom) = item { + let g = geom?; + let is_coll = matches!( + g.as_type(), + GeometryType::MultiPoint(_) + | GeometryType::MultiLineString(_) + | GeometryType::MultiPolygon(_) + | GeometryType::GeometryCollection(_) + ); + builder.append_value(is_coll); + } else { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +// ========================================================================= +// ST_Dimension +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct StDimension; + +impl Default for StDimension { + fn default() -> Self { + Self + } +} + +static DIMENSION_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for StDimension { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_dimension" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Int32) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(dimension_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DIMENSION_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the topological dimension of the geometry.", + "ST_Dimension(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn dimension_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = dimension_array(&geo_array)?; + Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)) +} + +fn dimension_array(array: &dyn GeoArrowArray) -> GeoArrowResult { + downcast_geoarrow_array!(array, _dimension_impl) +} + +fn _dimension_impl<'a>(array: &'a impl GeoArrowArrayAccessor<'a>) -> GeoArrowResult { + let mut builder = Int32Builder::with_capacity(array.len()); + for item in array.iter() { + if let Some(geom) = item { + let g = geom?; + let dim = match g.as_type() { + GeometryType::Point(_) | GeometryType::MultiPoint(_) => 0, + GeometryType::LineString(_) + | GeometryType::MultiLineString(_) + | GeometryType::Line(_) => 1, + GeometryType::Polygon(_) + | GeometryType::MultiPolygon(_) + | GeometryType::Triangle(_) + | GeometryType::Rect(_) => 2, + GeometryType::GeometryCollection(gc) => { + let mut max_dim = -1i32; + for g in gc.geometries() { + let d = match g.as_type() { + GeometryType::Point(_) | GeometryType::MultiPoint(_) => 0, + GeometryType::LineString(_) + | GeometryType::MultiLineString(_) => 1, + GeometryType::Polygon(_) + | GeometryType::MultiPolygon(_) => 2, + _ => 0, + }; + if d > max_dim { + max_dim = d; + } + } + max_dim + } + }; + builder.append_value(dim); + } else { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +// ========================================================================= +// ST_NumGeometries +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct NumGeometries; + +impl Default for NumGeometries { + fn default() -> Self { + Self + } +} + +static NUM_GEOMETRIES_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for NumGeometries { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_numgeometries" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::UInt32) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(num_geometries_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(NUM_GEOMETRIES_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the number of elements in a geometry collection.", + "ST_NumGeometries(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn num_geometries_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = num_geometries_array(&geo_array)?; + Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)) +} + +fn num_geometries_array(array: &dyn GeoArrowArray) -> GeoArrowResult { + downcast_geoarrow_array!(array, _num_geometries_impl) +} + +fn _num_geometries_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + let mut builder = UInt32Builder::with_capacity(array.len()); + for item in array.iter() { + if let Some(geom) = item { + let g = geom?; + let count = match g.as_type() { + GeometryType::Point(_) + | GeometryType::LineString(_) + | GeometryType::Polygon(_) => 1u32, + GeometryType::MultiPoint(mp) => mp.num_points() as u32, + GeometryType::MultiLineString(mls) => mls.num_line_strings() as u32, + GeometryType::MultiPolygon(mp) => mp.num_polygons() as u32, + GeometryType::GeometryCollection(gc) => gc.num_geometries() as u32, + _ => 1, + }; + builder.append_value(count); + } else { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +// ========================================================================= +// ST_NRings +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct NRings; + +impl Default for NRings { + fn default() -> Self { + Self + } +} + +static NRINGS_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for NRings { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_nrings" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::UInt32) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(nrings_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(NRINGS_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the total number of rings in a geometry.", + "ST_NRings(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +fn nrings_impl(args: ScalarFunctionArgs) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = nrings_array(&geo_array)?; + Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)) +} + +fn nrings_array(array: &dyn GeoArrowArray) -> GeoArrowResult { + downcast_geoarrow_array!(array, _nrings_impl) +} + +fn _nrings_impl<'a>(array: &'a impl GeoArrowArrayAccessor<'a>) -> GeoArrowResult { + let mut builder = UInt32Builder::with_capacity(array.len()); + for item in array.iter() { + if let Some(geom) = item { + let g = geom?; + let count = count_rings(&g); + builder.append_value(count); + } else { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +fn count_rings(geom: &impl GeometryTrait) -> u32 { + match geom.as_type() { + GeometryType::Polygon(p) => { + let exterior = if p.exterior().is_some() { 1u32 } else { 0 }; + exterior + p.num_interiors() as u32 + } + GeometryType::MultiPolygon(mp) => { + (0..mp.num_polygons()) + .filter_map(|i| mp.polygon(i)) + .map(|p| count_rings_polygon(&p)) + .sum() + } + GeometryType::GeometryCollection(gc) => gc.geometries().map(|g| count_rings(&g)).sum(), + _ => 0, + } +} + +fn count_rings_polygon(p: &impl PolygonTrait) -> u32 { + let exterior = if p.exterior().is_some() { 1u32 } else { 0 }; + exterior + p.num_interiors() as u32 +} + +// ========================================================================= +// ST_HasZ +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct HasZ; + +impl Default for HasZ { + fn default() -> Self { + Self + } +} + +static HASZ_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for HasZ { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_hasz" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0]) + .map_err(|e| datafusion::error::DataFusionError::Internal(e.to_string()))?; + let dim = geo_array.data_type().dimension(); + let has_z = matches!(dim, Some(Dimension::XYZ) | Some(Dimension::XYZM)); + let result = BooleanArray::from(vec![Some(has_z); arrays[0].len()]); + Ok(ColumnarValue::Array(Arc::new(result))) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(HASZ_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns true if the geometry has a Z coordinate dimension.", + "ST_HasZ(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +// ========================================================================= +// ST_HasM +// ========================================================================= +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct HasM; + +impl Default for HasM { + fn default() -> Self { + Self + } +} + +static HASM_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for HasM { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_hasm" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Boolean) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0]) + .map_err(|e| datafusion::error::DataFusionError::Internal(e.to_string()))?; + let dim = geo_array.data_type().dimension(); + let has_m = matches!(dim, Some(Dimension::XYM) | Some(Dimension::XYZM)); + let result = BooleanArray::from(vec![Some(has_m); arrays[0].len()]); + Ok(ColumnarValue::Array(Arc::new(result))) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(HASM_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns true if the geometry has an M (measure) coordinate dimension.", + "ST_HasM(geometry)", + ) + .with_argument("geom", "geometry") + .build() + })) + } +} + +#[cfg(test)] +mod test { + use arrow_array::cast::AsArray; + use arrow_array::types::{Int32Type, UInt32Type}; + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::native::io::GeomFromText; + + #[tokio::test] + async fn test_is_empty_point() { + let ctx = SessionContext::new(); + ctx.register_udf(IsEmpty.into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql("SELECT ST_IsEmpty(ST_GeomFromText('POINT(0 0)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + assert!(!batch.column(0).as_boolean().value(0)); + } + + #[tokio::test] + async fn test_is_closed_ring() { + let ctx = SessionContext::new(); + ctx.register_udf(IsClosed.into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql("SELECT ST_IsClosed(ST_GeomFromText('LINESTRING(0 0, 1 1, 1 0, 0 0)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + assert!(batch.column(0).as_boolean().value(0)); + } + + #[tokio::test] + async fn test_is_closed_open() { + let ctx = SessionContext::new(); + ctx.register_udf(IsClosed.into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql("SELECT ST_IsClosed(ST_GeomFromText('LINESTRING(0 0, 1 1, 1 0)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + assert!(!batch.column(0).as_boolean().value(0)); + } + + #[tokio::test] + async fn test_dimension() { + let ctx = SessionContext::new(); + ctx.register_udf(StDimension.into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql("SELECT ST_Dimension(ST_GeomFromText('POLYGON((0 0, 1 0, 1 1, 0 1, 0 0))'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + assert_eq!(batch.column(0).as_primitive::().value(0), 2); + } + + #[tokio::test] + async fn test_num_geometries_multi() { + let ctx = SessionContext::new(); + ctx.register_udf(NumGeometries.into()); + ctx.register_udf(GeomFromText::default().into()); + + let df = ctx + .sql("SELECT ST_NumGeometries(ST_GeomFromText('MULTIPOINT(0 0, 1 1, 2 2)'));") + .await + .unwrap(); + let batch = df.collect().await.unwrap().into_iter().next().unwrap(); + assert_eq!(batch.column(0).as_primitive::().value(0), 3); + } +} diff --git a/rust/geodatafusion/src/udf/native/accessors/mod.rs b/rust/geodatafusion/src/udf/native/accessors/mod.rs index 5a716ca..6532a63 100644 --- a/rust/geodatafusion/src/udf/native/accessors/mod.rs +++ b/rust/geodatafusion/src/udf/native/accessors/mod.rs @@ -1,4 +1,5 @@ mod coord_dim; +mod geometry_accessors; mod geometry_type; mod line_string; mod npoints; @@ -6,6 +7,10 @@ mod num_interior_rings; mod point; pub use coord_dim::{CoordDim, NDims}; +pub use geometry_accessors::{ + HasM, HasZ, IsClosed, IsCollection, IsEmpty, IsRing, IsSimple, NRings, NumGeometries, + StDimension, +}; pub use geometry_type::{GeometryType, ST_GeometryType}; pub use line_string::{EndPoint, StartPoint}; pub use npoints::NPoints; @@ -14,13 +19,23 @@ pub use point::{M, X, Y, Z}; pub fn register(session_context: &datafusion::prelude::SessionContext) { session_context.register_udf(CoordDim.into()); + session_context.register_udf(StDimension.into()); session_context.register_udf(NDims.into()); session_context.register_udf(GeometryType.into()); session_context.register_udf(ST_GeometryType.into()); session_context.register_udf(EndPoint::default().into()); - session_context.register_udf(StartPoint::default().into()); + session_context.register_udf(HasM.into()); + session_context.register_udf(HasZ.into()); + session_context.register_udf(IsClosed.into()); + session_context.register_udf(IsCollection.into()); + session_context.register_udf(IsEmpty.into()); + session_context.register_udf(IsRing.into()); + session_context.register_udf(IsSimple.into()); session_context.register_udf(NPoints.into()); + session_context.register_udf(NRings.into()); + session_context.register_udf(NumGeometries.into()); session_context.register_udf(NumInteriorRings.into()); + session_context.register_udf(StartPoint::default().into()); session_context.register_udf(M::default().into()); session_context.register_udf(X::default().into()); session_context.register_udf(Y::default().into()); diff --git a/rust/geodatafusion/src/udf/native/constructors/make.rs b/rust/geodatafusion/src/udf/native/constructors/make.rs new file mode 100644 index 0000000..54a5208 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/constructors/make.rs @@ -0,0 +1,584 @@ +//! Line, Polygon, and Envelope constructors + +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_array::cast::AsArray; +use arrow_array::types::Float64Type; +use arrow_array::Array; +use arrow_schema::{DataType, FieldRef}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, + TypeSignature, Volatility, +}; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::builder::GeometryBuilder; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, GeometryType, Metadata}; + +use crate::data_types::any_single_geometry_type_input; +use crate::error::GeoDataFusionResult; + +// --------------------------------------------------------------------------- +// ST_MakeLine +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct MakeLine { + signature: Signature, + coord_type: CoordType, +} + +impl MakeLine { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::any(2, Volatility::Immutable), + coord_type, + } + } +} + +impl Default for MakeLine { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static MAKE_LINE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for MakeLine { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_makeline" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + let metadata = + Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = GeometryType::new(metadata).with_coord_type(self.coord_type); + Ok(Arc::new(output_type.to_field("", true))) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(make_line_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(MAKE_LINE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Creates a LineString containing the points of two input geometries. Other geometries are decomposed into their component points.", + "ST_MakeLine(geom1, geom2)", + ) + .with_argument("geom1", "geometry") + .with_argument("geom2", "geometry") + .build() + })) + } +} + +fn make_line_impl( + args: ScalarFunctionArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let left_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let right_array = from_arrow_array(&arrays[1], &args.arg_fields[1])?; + let result = make_line_arrays(&left_array, &right_array, coord_type)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn make_line_arrays( + left: &dyn GeoArrowArray, + right: &dyn GeoArrowArray, + coord_type: CoordType, +) -> GeoArrowResult> { + downcast_geoarrow_array!(left, _make_line_left, right, coord_type) +} + +fn _make_line_left<'a>( + left: &'a impl GeoArrowArrayAccessor<'a>, + right: &dyn GeoArrowArray, + coord_type: CoordType, +) -> GeoArrowResult> { + downcast_geoarrow_array!(right, _make_line_both, left, coord_type) +} + +fn _make_line_both<'a, 'b>( + right: &'b impl GeoArrowArrayAccessor<'b>, + left: &'a impl GeoArrowArrayAccessor<'a>, + coord_type: CoordType, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()).with_coord_type(coord_type); + let mut builder = GeometryBuilder::new(geom_type); + + for (l, r) in left.iter().zip(right.iter()) { + match (l, r) { + (Some(left_geom), Some(right_geom)) => { + let left_geo = geometry_to_geo(&left_geom?)?; + let right_geo = geometry_to_geo(&right_geom?)?; + + let mut coords = collect_points(&left_geo); + coords.extend(collect_points(&right_geo)); + + if coords.len() >= 2 { + let line = geo::LineString::new(coords); + builder.push_geometry(Some(&geo::Geometry::LineString(line)))?; + } else { + builder.push_null(); + } + } + _ => builder.push_null(), + } + } + + Ok(Arc::new(builder.finish())) +} + +/// Extract all coordinate points from a geometry. +fn collect_points(geom: &geo::Geometry) -> Vec { + use geo::CoordsIter; + geom.coords_iter().collect() +} + +// --------------------------------------------------------------------------- +// ST_MakePolygon +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct MakePolygon { + coord_type: CoordType, +} + +impl MakePolygon { + pub fn new(coord_type: CoordType) -> Self { + Self { coord_type } + } +} + +impl Default for MakePolygon { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static MAKE_POLYGON_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for MakePolygon { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_makepolygon" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + let metadata = + Arc::new(Metadata::try_from(args.arg_fields[0].as_ref()).unwrap_or_default()); + let output_type = GeometryType::new(metadata).with_coord_type(self.coord_type); + Ok(Arc::new(output_type.to_field("", true))) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(make_polygon_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(MAKE_POLYGON_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Creates a Polygon from a closed LineString shell.", + "ST_MakePolygon(linestring)", + ) + .with_argument("shell", "A closed LineString geometry") + .build() + })) + } +} + +fn make_polygon_impl( + args: ScalarFunctionArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args)?; + let geo_array = from_arrow_array(&arrays[0], &args.arg_fields[0])?; + let result = make_polygon_array(&geo_array, coord_type)?; + Ok(ColumnarValue::Array(result.into_array_ref())) +} + +fn make_polygon_array( + array: &dyn GeoArrowArray, + coord_type: CoordType, +) -> GeoArrowResult> { + downcast_geoarrow_array!(array, _make_polygon_impl, coord_type) +} + +fn _make_polygon_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, + coord_type: CoordType, +) -> GeoArrowResult> { + let geom_type = GeometryType::new(Default::default()).with_coord_type(coord_type); + let mut builder = GeometryBuilder::new(geom_type); + + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + match geo_geom { + geo::Geometry::LineString(mut ls) => { + // Auto-close if not already closed (matches PostGIS behavior) + if ls.0.len() >= 2 && ls.0.first() != ls.0.last() { + ls.0.push(ls.0[0]); + } + let polygon = geo::Polygon::new(ls, vec![]); + builder.push_geometry(Some(&geo::Geometry::Polygon(polygon)))?; + } + _ => { + // For non-linestring inputs, push null + builder.push_null(); + } + } + } else { + builder.push_null(); + } + } + + Ok(Arc::new(builder.finish())) +} + +// --------------------------------------------------------------------------- +// ST_MakeEnvelope +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct MakeEnvelope { + signature: Signature, + coord_type: CoordType, +} + +impl MakeEnvelope { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![ + DataType::Float64, + DataType::Float64, + DataType::Float64, + DataType::Float64, + ]), + TypeSignature::Exact(vec![ + DataType::Float64, + DataType::Float64, + DataType::Float64, + DataType::Float64, + DataType::Int64, + ]), + ], + Volatility::Immutable, + ), + coord_type, + } + } +} + +impl Default for MakeEnvelope { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static MAKE_ENVELOPE_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for MakeEnvelope { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_makeenvelope" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result { + let mut geom_type = + GeometryType::new(Default::default()).with_coord_type(self.coord_type); + + // If a 5th argument (SRID) is provided as scalar, use it + if let Some(srid) = args.scalar_arguments.get(4) { + if let Some(datafusion::scalar::ScalarValue::Int64(srid_val)) = srid { + let crs = geoarrow_schema::Crs::from_authority_code(format!( + "EPSG:{}", + match srid_val { + Some(v) => v, + None => return Ok(Arc::new(geom_type.to_field("", true))), + } + )); + geom_type = geom_type + .with_metadata(Arc::new(Metadata::new(crs, Default::default()))); + } + } + + Ok(Arc::new(geom_type.to_field("", true))) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(make_envelope_impl(args, self.coord_type)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(MAKE_ENVELOPE_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Creates a rectangular Polygon from minimum and maximum coordinate values. The polygon is formed in the coordinate plane with corners at (xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), (xmin, ymin).", + "ST_MakeEnvelope(xmin, ymin, xmax, ymax) or ST_MakeEnvelope(xmin, ymin, xmax, ymax, srid)", + ) + .with_argument("xmin", "minimum x value") + .with_argument("ymin", "minimum y value") + .with_argument("xmax", "maximum x value") + .with_argument("ymax", "maximum y value") + .with_argument("srid", "optional integer SRID value") + .build() + })) + } +} + +fn make_envelope_impl( + args: ScalarFunctionArgs, + coord_type: CoordType, +) -> GeoDataFusionResult { + let arrays = ColumnarValue::values_to_arrays(&args.args[..4])?; + let xmin = arrays[0].as_primitive::(); + let ymin = arrays[1].as_primitive::(); + let xmax = arrays[2].as_primitive::(); + let ymax = arrays[3].as_primitive::(); + + let geom_type = GeometryType::new(Default::default()).with_coord_type(coord_type); + let mut builder = GeometryBuilder::new(geom_type); + + for i in 0..xmin.len() { + if xmin.is_null(i) || ymin.is_null(i) || xmax.is_null(i) || ymax.is_null(i) { + builder.push_null(); + } else { + let x1 = xmin.value(i); + let y1 = ymin.value(i); + let x2 = xmax.value(i); + let y2 = ymax.value(i); + + let polygon = geo::Polygon::new( + geo::LineString::from(vec![ + (x1, y1), + (x1, y2), + (x2, y2), + (x2, y1), + (x1, y1), + ]), + vec![], + ); + builder.push_geometry(Some(&geo::Geometry::Polygon(polygon)))?; + } + } + + Ok(ColumnarValue::Array(builder.finish().to_array_ref())) +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow_array::{ArrayRef, Float64Array, RecordBatch}; + use arrow_schema::{Field, Schema}; + use datafusion::prelude::SessionContext; + use geoarrow_schema::Crs; + + use super::*; + use crate::udf::native::io::{AsText, GeomFromText}; + + #[tokio::test] + async fn test_make_line() { + let ctx = SessionContext::new(); + + ctx.register_udf(MakeLine::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let sql_df = ctx + .sql(r#"SELECT ST_AsText(ST_MakeLine(ST_GeomFromText('POINT(0 0)'), ST_GeomFromText('POINT(1 1)')));"#) + .await + .unwrap(); + + let output_batches = sql_df.collect().await.unwrap(); + assert_eq!(output_batches.len(), 1); + let output_batch = &output_batches[0]; + + let str_arr = output_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let wkt = str_arr.value(0); + assert!( + wkt.contains("LINESTRING"), + "Expected LINESTRING in output, got: {}", + wkt + ); + } + + #[tokio::test] + async fn test_make_polygon() { + let ctx = SessionContext::new(); + + ctx.register_udf(MakePolygon::default().into()); + ctx.register_udf(GeomFromText::default().into()); + ctx.register_udf(AsText.into()); + + let sql_df = ctx + .sql(r#"SELECT ST_AsText(ST_MakePolygon(ST_GeomFromText('LINESTRING(0 0, 10 0, 10 10, 0 10, 0 0)')));"#) + .await + .unwrap(); + + let output_batches = sql_df.collect().await.unwrap(); + assert_eq!(output_batches.len(), 1); + let output_batch = &output_batches[0]; + + let str_arr = output_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let wkt = str_arr.value(0); + assert!( + wkt.contains("POLYGON"), + "Expected POLYGON in output, got: {}", + wkt + ); + } + + #[tokio::test] + async fn test_make_envelope() { + let ctx = SessionContext::new(); + + ctx.register_udf(MakeEnvelope::default().into()); + ctx.register_udf(AsText.into()); + + let sql_df = ctx + .sql(r#"SELECT ST_AsText(ST_MakeEnvelope(0.0, 0.0, 1.0, 1.0));"#) + .await + .unwrap(); + + let output_batches = sql_df.collect().await.unwrap(); + assert_eq!(output_batches.len(), 1); + let output_batch = &output_batches[0]; + + let str_arr = output_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let wkt = str_arr.value(0); + assert!( + wkt.contains("POLYGON"), + "Expected POLYGON in output, got: {}", + wkt + ); + } + + #[tokio::test] + async fn test_make_envelope_with_srid() { + let ctx = SessionContext::new(); + + ctx.register_udf(MakeEnvelope::default().into()); + + let sql_df = ctx + .sql(r#"SELECT ST_MakeEnvelope(0.0, 0.0, 1.0, 1.0, 4326);"#) + .await + .unwrap(); + + let output_batches = sql_df.collect().await.unwrap(); + assert_eq!(output_batches.len(), 1); + let output_batch = &output_batches[0]; + let output_schema = output_batch.schema(); + let output_field = output_schema.field(0); + + let geom_type = output_field.try_extension_type::().unwrap(); + assert_eq!( + geom_type.metadata().crs(), + &Crs::from_authority_code("EPSG:4326".to_string()) + ); + } + + #[tokio::test] + async fn test_make_envelope_from_table() { + let ctx = SessionContext::new(); + + ctx.register_udf(MakeEnvelope::default().into()); + ctx.register_udf(AsText.into()); + + let xmin: ArrayRef = Arc::new(Float64Array::from(vec![0.0])); + let ymin: ArrayRef = Arc::new(Float64Array::from(vec![0.0])); + let xmax: ArrayRef = Arc::new(Float64Array::from(vec![10.0])); + let ymax: ArrayRef = Arc::new(Float64Array::from(vec![10.0])); + + let schema = Schema::new([ + Arc::new(Field::new("xmin", DataType::Float64, true)), + Arc::new(Field::new("ymin", DataType::Float64, true)), + Arc::new(Field::new("xmax", DataType::Float64, true)), + Arc::new(Field::new("ymax", DataType::Float64, true)), + ]); + let batch = + RecordBatch::try_new(Arc::new(schema), vec![xmin, ymin, xmax, ymax]).unwrap(); + + ctx.register_batch("t", batch).unwrap(); + + let sql_df = ctx + .sql("SELECT ST_AsText(ST_MakeEnvelope(xmin, ymin, xmax, ymax)) FROM t;") + .await + .unwrap(); + + let output_batches = sql_df.collect().await.unwrap(); + assert_eq!(output_batches.len(), 1); + let output_batch = &output_batches[0]; + + let str_arr = output_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let wkt = str_arr.value(0); + assert!( + wkt.contains("POLYGON"), + "Expected POLYGON in output, got: {}", + wkt + ); + } +} diff --git a/rust/geodatafusion/src/udf/native/constructors/mod.rs b/rust/geodatafusion/src/udf/native/constructors/mod.rs index 18438bb..d153f22 100644 --- a/rust/geodatafusion/src/udf/native/constructors/mod.rs +++ b/rust/geodatafusion/src/udf/native/constructors/mod.rs @@ -1,5 +1,7 @@ +mod make; mod point; +pub use make::{MakeEnvelope, MakeLine, MakePolygon}; pub use point::{MakePoint, MakePointM, Point, PointM, PointZ, PointZM}; pub fn register(session_context: &datafusion::prelude::SessionContext) { @@ -9,4 +11,7 @@ pub fn register(session_context: &datafusion::prelude::SessionContext) { session_context.register_udf(PointM::default().into()); session_context.register_udf(PointZ::default().into()); session_context.register_udf(PointZM::default().into()); + session_context.register_udf(MakeLine::default().into()); + session_context.register_udf(MakePolygon::default().into()); + session_context.register_udf(MakeEnvelope::default().into()); } diff --git a/rust/geodatafusion/src/udf/native/io/ewkt.rs b/rust/geodatafusion/src/udf/native/io/ewkt.rs new file mode 100644 index 0000000..735e498 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/io/ewkt.rs @@ -0,0 +1,232 @@ +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_array::builder::StringBuilder; +use arrow_array::Array; +use arrow_schema::{DataType, Field}; +use datafusion::error::Result; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, +}; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::cast::to_wkt; +use geoarrow_array::GeoArrowArray; +use geoarrow_schema::{CrsType, Metadata}; + +use crate::data_types::any_single_geometry_type_input; +use crate::error::GeoDataFusionResult; + +// --------------------------------------------------------------------------- +// ST_AsEWKT +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct AsEWKT; + +impl AsEWKT { + pub fn new() -> Self { + Self {} + } + + fn invoke_impl(&self, args: ScalarFunctionArgs) -> GeoDataFusionResult { + let array = &ColumnarValue::values_to_arrays(&args.args)?[0]; + let field = &args.arg_fields[0]; + let geo_array = from_arrow_array(&array, field.as_ref())?; + + // Get WKT representation + let wkt_arr = to_wkt::(geo_array.as_ref())?; + + // Extract SRID from metadata if present + let metadata = Metadata::try_from(field.as_ref()).ok(); + let srid_prefix = metadata + .as_ref() + .and_then(|m| { + let crs = m.crs(); + if crs.crs_type() == Some(CrsType::AuthorityCode) { + if let Some(val) = crs.crs_value() { + if let Some(code_str) = val.as_str() { + if let Some(srid) = code_str.strip_prefix("EPSG:") { + return Some(format!("SRID={};", srid)); + } + } + } + } + None + }) + .unwrap_or_default(); + + // Build output with SRID prefix + let str_arr = wkt_arr.into_array_ref(); + let wkt_string_arr = str_arr + .as_any() + .downcast_ref::() + .unwrap(); + + let mut builder = + StringBuilder::with_capacity(wkt_string_arr.len(), wkt_string_arr.len() * 64); + for i in 0..wkt_string_arr.len() { + if wkt_string_arr.is_null(i) { + builder.append_null(); + } else { + let wkt_val = wkt_string_arr.value(i); + let ewkt = format!("{}{}", srid_prefix, wkt_val); + builder.append_value(&ewkt); + } + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } +} + +impl Default for AsEWKT { + fn default() -> Self { + Self::new() + } +} + +static AS_EWKT_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for AsEWKT { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_asewkt" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result> { + let input_field = &args.arg_fields[0]; + Ok(Field::new(input_field.name(), DataType::Utf8, input_field.is_nullable()).into()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(self.invoke_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(AS_EWKT_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the Extended Well-Known Text (EWKT) representation of the geometry with SRID metadata.", + "ST_AsEWKT(geometry)", + ) + .with_argument("g1", "geometry") + .build() + })) + } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use arrow_schema::Schema; + use datafusion::prelude::SessionContext; + use geoarrow_array::test::point; + use geoarrow_schema::{CoordType, Crs, Dimension, Metadata}; + + use super::*; + + #[tokio::test] + async fn test_as_ewkt_with_srid() { + let ctx = SessionContext::new(); + + let crs = Crs::from_authority_code("EPSG:4326".to_string()); + let metadata = Arc::new(Metadata::new(crs, Default::default())); + + let geo_arr = point::array(CoordType::Separated, Dimension::XY).with_metadata(metadata); + + let arr = geo_arr.to_array_ref(); + let field = geo_arr.data_type().to_field("geometry", true); + let schema = Schema::new([Arc::new(field)]); + let batch = + arrow_array::RecordBatch::try_new(Arc::new(schema), vec![arr]).unwrap(); + + ctx.register_batch("t", batch).unwrap(); + + ctx.register_udf(AsEWKT::new().into()); + + let sql_df = ctx + .sql("SELECT ST_AsEWKT(geometry) FROM t;") + .await + .unwrap(); + + let output_batches = sql_df.collect().await.unwrap(); + assert_eq!(output_batches.len(), 1); + let output_batch = &output_batches[0]; + + let str_arr = output_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert!(str_arr.len() > 0); + let first = str_arr.value(0); + assert!( + first.starts_with("SRID=4326;"), + "Expected EWKT to start with 'SRID=4326;', got: {}", + first + ); + assert!( + first.contains("POINT"), + "Expected EWKT to contain 'POINT', got: {}", + first + ); + } + + #[tokio::test] + async fn test_as_ewkt_without_srid() { + let ctx = SessionContext::new(); + + let geo_arr = point::array(CoordType::Separated, Dimension::XY); + + let arr = geo_arr.to_array_ref(); + let field = geo_arr.data_type().to_field("geometry", true); + let schema = Schema::new([Arc::new(field)]); + let batch = + arrow_array::RecordBatch::try_new(Arc::new(schema), vec![arr]).unwrap(); + + ctx.register_batch("t", batch).unwrap(); + + ctx.register_udf(AsEWKT::new().into()); + + let sql_df = ctx + .sql("SELECT ST_AsEWKT(geometry) FROM t;") + .await + .unwrap(); + + let output_batches = sql_df.collect().await.unwrap(); + assert_eq!(output_batches.len(), 1); + let output_batch = &output_batches[0]; + + let str_arr = output_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert!(str_arr.len() > 0); + let first = str_arr.value(0); + assert!( + !first.starts_with("SRID="), + "Expected no SRID prefix when no CRS is set, got: {}", + first + ); + assert!( + first.contains("POINT"), + "Expected EWKT to contain 'POINT', got: {}", + first + ); + } +} diff --git a/rust/geodatafusion/src/udf/native/io/geojson.rs b/rust/geodatafusion/src/udf/native/io/geojson.rs new file mode 100644 index 0000000..ba749f0 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/io/geojson.rs @@ -0,0 +1,379 @@ +use std::any::Any; +use std::str::FromStr; +use std::sync::{Arc, OnceLock}; + +use arrow_array::builder::StringBuilder; +use arrow_array::Array; +use arrow_schema::{DataType, Field}; +use datafusion::error::{DataFusionError, Result}; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, +}; +use geoarrow_array::array::from_arrow_array; +use geoarrow_array::builder::GeometryBuilder; +use geoarrow_array::{GeoArrowArray, GeoArrowArrayAccessor, downcast_geoarrow_array}; +use geoarrow_expr_geo::util::to_geo::geometry_to_geo; +use geoarrow_schema::error::GeoArrowResult; +use geoarrow_schema::{CoordType, GeometryType, Metadata}; + +use crate::data_types::any_single_geometry_type_input; +use crate::error::GeoDataFusionResult; + +// --------------------------------------------------------------------------- +// ST_AsGeoJSON +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct AsGeoJSON; + +impl AsGeoJSON { + pub fn new() -> Self { + Self {} + } + + fn invoke_impl(&self, args: ScalarFunctionArgs) -> GeoDataFusionResult { + let array = &ColumnarValue::values_to_arrays(&args.args)?[0]; + let field = &args.arg_fields[0]; + let geo_array = from_arrow_array(&array, field.as_ref())?; + let result = as_geojson_array(&geo_array)?; + Ok(ColumnarValue::Array(Arc::new(result))) + } +} + +impl Default for AsGeoJSON { + fn default() -> Self { + Self::new() + } +} + +static AS_GEOJSON_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for AsGeoJSON { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_asgeojson" + } + + fn signature(&self) -> &Signature { + any_single_geometry_type_input() + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result> { + let input_field = &args.arg_fields[0]; + Ok(Field::new(input_field.name(), DataType::Utf8, input_field.is_nullable()).into()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(self.invoke_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(AS_GEOJSON_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Returns the geometry as a GeoJSON element.", + "ST_AsGeoJSON(geometry)", + ) + .with_argument("g1", "geometry") + .build() + })) + } +} + +fn as_geojson_array( + array: &dyn GeoArrowArray, +) -> GeoArrowResult { + downcast_geoarrow_array!(array, _as_geojson_impl) +} + +fn _as_geojson_impl<'a>( + array: &'a impl GeoArrowArrayAccessor<'a>, +) -> GeoArrowResult { + let mut builder = StringBuilder::with_capacity(array.len(), array.len() * 64); + for item in array.iter() { + if let Some(geom) = item { + let geo_geom = geometry_to_geo(&geom?)?; + let geojson_value = geojson::Value::from(&geo_geom); + let geojson_geom = geojson::Geometry::new(geojson_value); + let json_string = serde_json::to_string(&geojson_geom) + .map_err(|e| geoarrow_schema::error::GeoArrowError::InvalidGeoArrow(e.to_string()))?; + builder.append_value(&json_string); + } else { + builder.append_null(); + } + } + Ok(builder.finish()) +} + +// --------------------------------------------------------------------------- +// ST_GeomFromGeoJSON +// --------------------------------------------------------------------------- + +#[derive(Debug, Eq, PartialEq, Hash)] +pub struct GeomFromGeoJSON { + signature: Signature, + coord_type: CoordType, +} + +impl GeomFromGeoJSON { + pub fn new(coord_type: CoordType) -> Self { + Self { + signature: Signature::uniform( + 1, + vec![DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View], + Volatility::Immutable, + ), + coord_type, + } + } + + fn invoke_impl(&self, args: ScalarFunctionArgs) -> GeoDataFusionResult { + let array = &ColumnarValue::values_to_arrays(&args.args)?[0]; + let field = &args.arg_fields[0]; + + let string_arr = array.as_ref(); + let len = string_arr.len(); + + let geom_type = GeometryType::new(Default::default()).with_coord_type(self.coord_type); + let mut builder = GeometryBuilder::new(geom_type); + + match field.data_type() { + DataType::Utf8 => { + let str_arr = string_arr + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..len { + if str_arr.is_null(i) { + builder.push_null(); + } else { + let geojson_str = str_arr.value(i); + let geo_geom = parse_geojson_to_geo(geojson_str)?; + builder.push_geometry(Some(&geo_geom))?; + } + } + } + DataType::LargeUtf8 => { + let str_arr = string_arr + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..len { + if str_arr.is_null(i) { + builder.push_null(); + } else { + let geojson_str = str_arr.value(i); + let geo_geom = parse_geojson_to_geo(geojson_str)?; + builder.push_geometry(Some(&geo_geom))?; + } + } + } + DataType::Utf8View => { + let str_arr = string_arr + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..len { + if str_arr.is_null(i) { + builder.push_null(); + } else { + let geojson_str = str_arr.value(i); + let geo_geom = parse_geojson_to_geo(geojson_str)?; + builder.push_geometry(Some(&geo_geom))?; + } + } + } + _ => { + return Err(DataFusionError::Internal(format!( + "Unexpected data type for ST_GeomFromGeoJSON: {:?}", + string_arr.data_type() + )) + .into()) + } + } + + Ok(ColumnarValue::Array(builder.finish().to_array_ref())) + } +} + +fn parse_geojson_to_geo(geojson_str: &str) -> GeoDataFusionResult { + use geojson::GeoJson; + + let geojson = GeoJson::from_str(geojson_str).map_err(|e| { + DataFusionError::Execution(format!("Failed to parse GeoJSON: {}", e)) + })?; + + let geometry: geojson::Geometry = match geojson { + GeoJson::Geometry(g) => g, + GeoJson::Feature(f) => f.geometry.ok_or_else(|| { + DataFusionError::Execution("GeoJSON Feature has no geometry".to_string()) + })?, + GeoJson::FeatureCollection(_) => { + return Err(DataFusionError::Execution( + "GeoJSON FeatureCollection is not supported; pass a single geometry".to_string(), + ) + .into()); + } + }; + + let geo_geom: geo::Geometry = + geo::Geometry::try_from(geometry).map_err(|e| { + DataFusionError::Execution(format!("Failed to convert GeoJSON to geo type: {}", e)) + })?; + + Ok(geo_geom) +} + +impl Default for GeomFromGeoJSON { + fn default() -> Self { + Self::new(Default::default()) + } +} + +static GEOM_FROM_GEOJSON_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for GeomFromGeoJSON { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_geomfromgeojson" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Err(DataFusionError::Internal("return_type".to_string())) + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result> { + let input_field = &args.arg_fields[0]; + let metadata = Arc::new(Metadata::try_from(input_field.as_ref()).unwrap_or_default()); + let geom_type = GeometryType::new(metadata).with_coord_type(self.coord_type); + Ok(geom_type + .to_field(input_field.name(), input_field.is_nullable()) + .into()) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + Ok(self.invoke_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(GEOM_FROM_GEOJSON_DOC.get_or_init(|| { + Documentation::builder( + DOC_SECTION_OTHER, + "Takes a GeoJSON representation of a geometry and creates a geometry object.", + "ST_GeomFromGeoJSON(text)", + ) + .with_argument("geojson", "GeoJSON string") + .build() + })) + } +} + +#[cfg(test)] +mod test { + use datafusion::prelude::SessionContext; + + use super::*; + use crate::udf::native::io::GeomFromText; + + #[tokio::test] + async fn test_as_geojson() { + let ctx = SessionContext::new(); + + ctx.register_udf(AsGeoJSON::new().into()); + ctx.register_udf(GeomFromText::default().into()); + + // Use ST_GeomFromText to create geometry via SQL to avoid empty-point issues + // with the test helper arrays from geoarrow_array. + let sql_df = ctx + .sql("SELECT ST_AsGeoJSON(ST_GeomFromText('POINT(30 10)'));") + .await + .unwrap(); + + let output_batches = sql_df.collect().await.unwrap(); + assert_eq!(output_batches.len(), 1); + let output_batch = &output_batches[0]; + + // Verify output is Utf8 + let output_schema = output_batch.schema(); + let output_field = output_schema.field(0); + assert_eq!(output_field.data_type(), &DataType::Utf8); + + // Verify the GeoJSON contains expected structure + let str_arr = output_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(str_arr.len(), 1); + let first = str_arr.value(0); + assert!(first.contains("\"type\"")); + assert!(first.contains("\"coordinates\"")); + assert!(first.contains("30")); + assert!(first.contains("10")); + } + + #[tokio::test] + async fn test_geom_from_geojson() { + let ctx = SessionContext::new(); + + ctx.register_udf(GeomFromGeoJSON::new(CoordType::Separated).into()); + ctx.register_udf(AsGeoJSON::new().into()); + + let sql_df = ctx + .sql(r#"SELECT ST_AsGeoJSON(ST_GeomFromGeoJSON('{"type":"Point","coordinates":[30,10]}'));"#) + .await + .unwrap(); + + let output_batches = sql_df.collect().await.unwrap(); + assert_eq!(output_batches.len(), 1); + let output_batch = &output_batches[0]; + let str_arr = output_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let result = str_arr.value(0); + assert!(result.contains("30")); + assert!(result.contains("10")); + } + + #[tokio::test] + async fn test_geom_from_geojson_polygon() { + let ctx = SessionContext::new(); + + ctx.register_udf(GeomFromGeoJSON::new(CoordType::Separated).into()); + ctx.register_udf(AsGeoJSON::new().into()); + + let sql_df = ctx + .sql(r#"SELECT ST_AsGeoJSON(ST_GeomFromGeoJSON('{"type":"Polygon","coordinates":[[[0,0],[10,0],[10,10],[0,10],[0,0]]]}'));"#) + .await + .unwrap(); + + let output_batches = sql_df.collect().await.unwrap(); + assert_eq!(output_batches.len(), 1); + let output_batch = &output_batches[0]; + let str_arr = output_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let result = str_arr.value(0); + assert!(result.contains("Polygon")); + } +} diff --git a/rust/geodatafusion/src/udf/native/io/mod.rs b/rust/geodatafusion/src/udf/native/io/mod.rs index 376bfb6..b0adc8b 100644 --- a/rust/geodatafusion/src/udf/native/io/mod.rs +++ b/rust/geodatafusion/src/udf/native/io/mod.rs @@ -1,8 +1,12 @@ //! Geometry Input and Output +mod ewkt; +mod geojson; mod wkb; mod wkt; +pub use ewkt::AsEWKT; +pub use geojson::{AsGeoJSON, GeomFromGeoJSON}; pub use wkb::{AsBinary, GeomFromWKB}; pub use wkt::{AsText, GeomFromText}; @@ -11,4 +15,7 @@ pub fn register(session_context: &datafusion::prelude::SessionContext) { session_context.register_udf(GeomFromWKB::default().into()); session_context.register_udf(AsText.into()); session_context.register_udf(GeomFromText::default().into()); + session_context.register_udf(AsGeoJSON.into()); + session_context.register_udf(GeomFromGeoJSON::default().into()); + session_context.register_udf(AsEWKT.into()); }