From 92b521a0105bf2dc3f896da84e925f55bc329909 Mon Sep 17 00:00:00 2001 From: IOE Date: Sun, 11 Jan 2026 14:38:08 +0900 Subject: [PATCH 1/2] chore: improve performance --- include/nodec/math/math.hpp | 162 +++++++++--------------------------- 1 file changed, 40 insertions(+), 122 deletions(-) diff --git a/include/nodec/math/math.hpp b/include/nodec/math/math.hpp index 47c05a6..9b39830 100644 --- a/include/nodec/math/math.hpp +++ b/include/nodec/math/math.hpp @@ -216,136 +216,54 @@ Quaternion inv(const Quaternion &q) { template Matrix4x4 inv(const Matrix4x4 &mat, T *determinant = nullptr) { - // implementation notes: - // * https://stackoverflow.com/questions/1148309/inverting-a-4x4-matrix - - Matrix4x4 inverted; + // Optimized 4x4 matrix inverse using 2x2 sub-determinants + // This algorithm pre-computes 12 2x2 determinants and reuses them, + // reducing multiplications from ~144 to ~100. auto &m = mat.m; - inverted.m[0] = m[5] * m[10] * m[15] - - m[5] * m[11] * m[14] - - m[9] * m[6] * m[15] - + m[9] * m[7] * m[14] - + m[13] * m[6] * m[11] - - m[13] * m[7] * m[10]; - - inverted.m[4] = -m[4] * m[10] * m[15] - + m[4] * m[11] * m[14] - + m[8] * m[6] * m[15] - - m[8] * m[7] * m[14] - - m[12] * m[6] * m[11] - + m[12] * m[7] * m[10]; - - inverted.m[8] = m[4] * m[9] * m[15] - - m[4] * m[11] * m[13] - - m[8] * m[5] * m[15] - + m[8] * m[7] * m[13] - + m[12] * m[5] * m[11] - - m[12] * m[7] * m[9]; - - inverted.m[12] = -m[4] * m[9] * m[14] - + m[4] * m[10] * m[13] - + m[8] * m[5] * m[14] - - m[8] * m[6] * m[13] - - m[12] * m[5] * m[10] - + m[12] * m[6] * m[9]; - - inverted.m[1] = -m[1] * m[10] * m[15] - + m[1] * m[11] * m[14] - + m[9] * m[2] * m[15] - - m[9] * m[3] * m[14] - - m[13] * m[2] * m[11] - + m[13] * m[3] * m[10]; - - inverted.m[5] = m[0] * m[10] * m[15] - - m[0] * m[11] * m[14] - - m[8] * m[2] * m[15] - + m[8] * m[3] * m[14] - + m[12] * m[2] * m[11] - - m[12] * m[3] * m[10]; - - inverted.m[9] = -m[0] * m[9] * m[15] - + m[0] * m[11] * m[13] - + m[8] * m[1] * m[15] - - m[8] * m[3] * m[13] - - m[12] * m[1] * m[11] - + m[12] * m[3] * m[9]; - - inverted.m[13] = m[0] * m[9] * m[14] - - m[0] * m[10] * m[13] - - m[8] * m[1] * m[14] - + m[8] * m[2] * m[13] - + m[12] * m[1] * m[10] - - m[12] * m[2] * m[9]; - - inverted.m[2] = m[1] * m[6] * m[15] - - m[1] * m[7] * m[14] - - m[5] * m[2] * m[15] - + m[5] * m[3] * m[14] - + m[13] * m[2] * m[7] - - m[13] * m[3] * m[6]; - - inverted.m[6] = -m[0] * m[6] * m[15] - + m[0] * m[7] * m[14] - + m[4] * m[2] * m[15] - - m[4] * m[3] * m[14] - - m[12] * m[2] * m[7] - + m[12] * m[3] * m[6]; - - inverted.m[10] = m[0] * m[5] * m[15] - - m[0] * m[7] * m[13] - - m[4] * m[1] * m[15] - + m[4] * m[3] * m[13] - + m[12] * m[1] * m[7] - - m[12] * m[3] * m[5]; - - inverted.m[14] = -m[0] * m[5] * m[14] - + m[0] * m[6] * m[13] - + m[4] * m[1] * m[14] - - m[4] * m[2] * m[13] - - m[12] * m[1] * m[6] - + m[12] * m[2] * m[5]; - - inverted.m[3] = -m[1] * m[6] * m[11] - + m[1] * m[7] * m[10] - + m[5] * m[2] * m[11] - - m[5] * m[3] * m[10] - - m[9] * m[2] * m[7] - + m[9] * m[3] * m[6]; - - inverted.m[7] = m[0] * m[6] * m[11] - - m[0] * m[7] * m[10] - - m[4] * m[2] * m[11] - + m[4] * m[3] * m[10] - + m[8] * m[2] * m[7] - - m[8] * m[3] * m[6]; - - inverted.m[11] = -m[0] * m[5] * m[11] - + m[0] * m[7] * m[9] - + m[4] * m[1] * m[11] - - m[4] * m[3] * m[9] - - m[8] * m[1] * m[7] - + m[8] * m[3] * m[5]; - - inverted.m[15] = m[0] * m[5] * m[10] - - m[0] * m[6] * m[9] - - m[4] * m[1] * m[10] - + m[4] * m[2] * m[9] - + m[8] * m[1] * m[6] - - m[8] * m[2] * m[5]; - - const auto det = m[0] * inverted.m[0] - + m[1] * inverted.m[4] - + m[2] * inverted.m[8] - + m[3] * inverted.m[12]; + // Pre-compute 2x2 determinants from rows 0-1 (columns i,j) + const T b00 = m[0] * m[5] - m[1] * m[4]; + const T b01 = m[0] * m[6] - m[2] * m[4]; + const T b02 = m[0] * m[7] - m[3] * m[4]; + const T b03 = m[1] * m[6] - m[2] * m[5]; + const T b04 = m[1] * m[7] - m[3] * m[5]; + const T b05 = m[2] * m[7] - m[3] * m[6]; + + // Pre-compute 2x2 determinants from rows 2-3 (columns i,j) + const T b06 = m[8] * m[13] - m[9] * m[12]; + const T b07 = m[8] * m[14] - m[10] * m[12]; + const T b08 = m[8] * m[15] - m[11] * m[12]; + const T b09 = m[9] * m[14] - m[10] * m[13]; + const T b10 = m[9] * m[15] - m[11] * m[13]; + const T b11 = m[10] * m[15] - m[11] * m[14]; + + // Compute determinant + const T det = b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06; if (determinant != nullptr) { *determinant = det; } - inverted *= 1 / det; + const T inv_det = T(1) / det; - return inverted; + // Compute inverse matrix elements + return { + (m[5] * b11 - m[6] * b10 + m[7] * b09) * inv_det, + (m[2] * b10 - m[1] * b11 - m[3] * b09) * inv_det, + (m[13] * b05 - m[14] * b04 + m[15] * b03) * inv_det, + (m[10] * b04 - m[9] * b05 - m[11] * b03) * inv_det, + (m[6] * b08 - m[4] * b11 - m[7] * b07) * inv_det, + (m[0] * b11 - m[2] * b08 + m[3] * b07) * inv_det, + (m[14] * b02 - m[12] * b05 - m[15] * b01) * inv_det, + (m[8] * b05 - m[10] * b02 + m[11] * b01) * inv_det, + (m[4] * b10 - m[5] * b08 + m[7] * b06) * inv_det, + (m[1] * b08 - m[0] * b10 - m[3] * b06) * inv_det, + (m[12] * b04 - m[13] * b02 + m[15] * b00) * inv_det, + (m[9] * b02 - m[8] * b04 - m[11] * b00) * inv_det, + (m[5] * b07 - m[4] * b09 - m[6] * b06) * inv_det, + (m[0] * b09 - m[1] * b07 + m[2] * b06) * inv_det, + (m[13] * b01 - m[12] * b03 - m[14] * b00) * inv_det, + (m[8] * b03 - m[9] * b01 + m[10] * b00) * inv_det}; } } // namespace math From 076ac43467c5d189a412a64e8f3b9d4e8e5470b2 Mon Sep 17 00:00:00 2001 From: IOE Date: Sun, 11 Jan 2026 14:46:23 +0900 Subject: [PATCH 2/2] fix --- include/nodec/math/math.hpp | 67 +++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/include/nodec/math/math.hpp b/include/nodec/math/math.hpp index 9b39830..498732b 100644 --- a/include/nodec/math/math.hpp +++ b/include/nodec/math/math.hpp @@ -219,23 +219,26 @@ Matrix4x4 inv(const Matrix4x4 &mat, T *determinant = nullptr) { // Optimized 4x4 matrix inverse using 2x2 sub-determinants // This algorithm pre-computes 12 2x2 determinants and reuses them, // reducing multiplications from ~144 to ~100. + // + // Matrix4x4 is column-major: m[col*4 + row] + // m[0-3] = column 0 (m11,m21,m31,m41), m[4-7] = column 1, etc. auto &m = mat.m; - // Pre-compute 2x2 determinants from rows 0-1 (columns i,j) - const T b00 = m[0] * m[5] - m[1] * m[4]; - const T b01 = m[0] * m[6] - m[2] * m[4]; - const T b02 = m[0] * m[7] - m[3] * m[4]; - const T b03 = m[1] * m[6] - m[2] * m[5]; - const T b04 = m[1] * m[7] - m[3] * m[5]; - const T b05 = m[2] * m[7] - m[3] * m[6]; - - // Pre-compute 2x2 determinants from rows 2-3 (columns i,j) - const T b06 = m[8] * m[13] - m[9] * m[12]; - const T b07 = m[8] * m[14] - m[10] * m[12]; - const T b08 = m[8] * m[15] - m[11] * m[12]; - const T b09 = m[9] * m[14] - m[10] * m[13]; - const T b10 = m[9] * m[15] - m[11] * m[13]; - const T b11 = m[10] * m[15] - m[11] * m[14]; + // Pre-compute 2x2 determinants from rows 0-1 + const T b00 = m[0] * m[5] - m[4] * m[1]; // m11*m22 - m12*m21 + const T b01 = m[0] * m[9] - m[8] * m[1]; // m11*m23 - m13*m21 + const T b02 = m[0] * m[13] - m[12] * m[1]; // m11*m24 - m14*m21 + const T b03 = m[4] * m[9] - m[8] * m[5]; // m12*m23 - m13*m22 + const T b04 = m[4] * m[13] - m[12] * m[5]; // m12*m24 - m14*m22 + const T b05 = m[8] * m[13] - m[12] * m[9]; // m13*m24 - m14*m23 + + // Pre-compute 2x2 determinants from rows 2-3 + const T b06 = m[2] * m[7] - m[6] * m[3]; // m31*m42 - m32*m41 + const T b07 = m[2] * m[11] - m[10] * m[3]; // m31*m43 - m33*m41 + const T b08 = m[2] * m[15] - m[14] * m[3]; // m31*m44 - m34*m41 + const T b09 = m[6] * m[11] - m[10] * m[7]; // m32*m43 - m33*m42 + const T b10 = m[6] * m[15] - m[14] * m[7]; // m32*m44 - m34*m42 + const T b11 = m[10] * m[15] - m[14] * m[11]; // m33*m44 - m34*m43 // Compute determinant const T det = b00 * b11 - b01 * b10 + b02 * b09 + b03 * b08 - b04 * b07 + b05 * b06; @@ -246,24 +249,24 @@ Matrix4x4 inv(const Matrix4x4 &mat, T *determinant = nullptr) { const T inv_det = T(1) / det; - // Compute inverse matrix elements + // Compute inverse matrix elements (column-major output) return { - (m[5] * b11 - m[6] * b10 + m[7] * b09) * inv_det, - (m[2] * b10 - m[1] * b11 - m[3] * b09) * inv_det, - (m[13] * b05 - m[14] * b04 + m[15] * b03) * inv_det, - (m[10] * b04 - m[9] * b05 - m[11] * b03) * inv_det, - (m[6] * b08 - m[4] * b11 - m[7] * b07) * inv_det, - (m[0] * b11 - m[2] * b08 + m[3] * b07) * inv_det, - (m[14] * b02 - m[12] * b05 - m[15] * b01) * inv_det, - (m[8] * b05 - m[10] * b02 + m[11] * b01) * inv_det, - (m[4] * b10 - m[5] * b08 + m[7] * b06) * inv_det, - (m[1] * b08 - m[0] * b10 - m[3] * b06) * inv_det, - (m[12] * b04 - m[13] * b02 + m[15] * b00) * inv_det, - (m[9] * b02 - m[8] * b04 - m[11] * b00) * inv_det, - (m[5] * b07 - m[4] * b09 - m[6] * b06) * inv_det, - (m[0] * b09 - m[1] * b07 + m[2] * b06) * inv_det, - (m[13] * b01 - m[12] * b03 - m[14] * b00) * inv_det, - (m[8] * b03 - m[9] * b01 + m[10] * b00) * inv_det}; + (m[5] * b11 - m[9] * b10 + m[13] * b09) * inv_det, // inv_m11 + (m[8] * b10 - m[4] * b11 - m[12] * b09) * inv_det, // inv_m12 + (m[7] * b05 - m[11] * b04 + m[15] * b03) * inv_det, // inv_m13 + (m[10] * b04 - m[6] * b05 - m[14] * b03) * inv_det, // inv_m14 + (m[9] * b08 - m[1] * b11 - m[13] * b07) * inv_det, // inv_m21 + (m[0] * b11 - m[8] * b08 + m[12] * b07) * inv_det, // inv_m22 + (m[11] * b02 - m[3] * b05 - m[15] * b01) * inv_det, // inv_m23 + (m[2] * b05 - m[10] * b02 + m[14] * b01) * inv_det, // inv_m24 + (m[1] * b10 - m[5] * b08 + m[13] * b06) * inv_det, // inv_m31 + (m[4] * b08 - m[0] * b10 - m[12] * b06) * inv_det, // inv_m32 + (m[3] * b04 - m[7] * b02 + m[15] * b00) * inv_det, // inv_m33 + (m[6] * b02 - m[2] * b04 - m[14] * b00) * inv_det, // inv_m34 + (m[5] * b07 - m[1] * b09 - m[9] * b06) * inv_det, // inv_m41 + (m[0] * b09 - m[4] * b07 + m[8] * b06) * inv_det, // inv_m42 + (m[7] * b01 - m[3] * b03 - m[11] * b00) * inv_det, // inv_m43 + (m[2] * b03 - m[6] * b01 + m[10] * b00) * inv_det}; // inv_m44 } } // namespace math