From c25f770cd6c6d012a8c0031c99875bbc4aeb2b46 Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Thu, 21 Aug 2025 15:38:20 -0500
Subject: [PATCH 1/5] fix: requirements

---
 .pre-commit-config.yaml | 8 +++-----
 requirements-ml.txt     | 6 ++----
 requirements-test.txt   | 2 +-
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 666cde4b..f0966a1b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -82,11 +82,9 @@ repos:

             # requirements-ml.txt
             scikit-learn>=0.23.2,
-            'keras>=2.4.3,<=3.4.0',
+            'keras>=3.11.0',
             rapidfuzz>=2.6.1,
-            "tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
-            "tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
-            "tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
+            tensorflow>=2.15.0,
             tqdm>=4.0.0,

             # requirements-reports.txt
@@ -101,7 +99,7 @@ repos:
             pytest-xdist>=2.1.0,
             pytest-forked>=1.3.0,
             toolz>=0.10.0,
-            'memray>=1.7.0,<1.12.0',
+            'memray>=1.18.0',
           ]
   # Check-manifest: ensures required non-Python files are included in MANIFEST.in
   # https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
diff --git a/requirements-ml.txt b/requirements-ml.txt
index 31f9ca63..b3005a5c 100644
--- a/requirements-ml.txt
+++ b/requirements-ml.txt
@@ -1,7 +1,5 @@
 scikit-learn>=0.23.2
-keras<=3.4.0
+keras>=3.11.0
 rapidfuzz>=2.6.1
-tensorflow>=2.16.0; sys.platform != 'darwin'
-tensorflow>=2.16.0; sys_platform == 'darwin' and platform_machine != 'arm64'
-tensorflow-macos>=2.16.0; sys_platform == 'darwin' and platform_machine == 'arm64'
+tensorflow>=2.16.0
 tqdm>=4.0.0
diff --git a/requirements-test.txt b/requirements-test.txt
index 725b2384..cf127b60 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -6,4 +6,4 @@ pytest-cov>=2.8.1
 pytest-xdist>=2.1.0
 pytest-forked>=1.3.0
 toolz>=0.10.0
-memray>=1.7.0,<1.12.0
+memray>=1.18.0

From dc1cd866182be6c0fa7b84d2ca5cc0e7ddce70e0 Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Thu, 21 Aug 2025 15:49:12 -0500
Subject: [PATCH 2/5] fix: reqs

---
 .pre-commit-config.yaml | 3 ++-
 requirements-ml.txt     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f0966a1b..9f7d445c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -82,7 +82,8 @@ repos:

             # requirements-ml.txt
             scikit-learn>=0.23.2,
-            'keras>=3.11.0',
+            "keras<=3.4.0; python_version <='3.9'",
+            "keras>=3.11.0; python_version > '3.9'",
             rapidfuzz>=2.6.1,
             tensorflow>=2.15.0,
             tqdm>=4.0.0,
diff --git a/requirements-ml.txt b/requirements-ml.txt
index b3005a5c..d7c6a282 100644
--- a/requirements-ml.txt
+++ b/requirements-ml.txt
@@ -1,5 +1,6 @@
 scikit-learn>=0.23.2
-keras>=3.11.0
+keras<=3.4.0; python_version <='3.9'
+keras>=3.11.0; python_version > '3.9'
 rapidfuzz>=2.6.1
 tensorflow>=2.16.0
 tqdm>=4.0.0
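
Note on PATCH 1/5 and 2/5: the keras pin is split with PEP 508 environment markers so that Python 3.9 environments keep keras<=3.4.0 while Python 3.10+ gets keras>=3.11.0, and the platform-specific tensorflow/tensorflow-macos lines collapse into a single tensorflow requirement. A small, hedged sketch of how such a marker evaluates, using the packaging library purely for illustration (it is not a dependency added by these patches):

    from packaging.markers import Marker

    # The same marker string used in requirements-ml.txt.
    marker = Marker("python_version > '3.9'")

    # True on Python 3.10 and newer, False on 3.9, so pip installs
    # whichever keras pin matches the running interpreter.
    print(marker.evaluate())
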
From 2378ed3fe7de732873e14fc779bf05c221eab269 Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Thu, 21 Aug 2025 16:35:39 -0500
Subject: [PATCH 3/5] fix: fit

---
 .pre-commit-config.yaml                            | 2 +-
 dataprofiler/labelers/character_level_cnn_model.py | 9 +++++----
 requirements-dev.txt                               | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9f7d445c..959c8475 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
   # Flake8: complexity and style checking
   # https://flake8.pycqa.org/en/latest/user/using-hooks.html
   - repo: https://github.com/pycqa/flake8
-    rev: 4.0.1
+    rev: 7.3.0
     hooks:
       - id: flake8
         additional_dependencies: [flake8-docstrings]
diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py
index 2cbb7051..601d8738 100644
--- a/dataprofiler/labelers/character_level_cnn_model.py
+++ b/dataprofiler/labelers/character_level_cnn_model.py
@@ -573,7 +573,7 @@ def _construct_model(self) -> None:

         # Compile the model
         softmax_output_layer_name = self._model.output_names[0]
-        losses = {softmax_output_layer_name: "categorical_crossentropy"}
+        losses = ["categorical_crossentropy", None, None]

         # use f1 score metric
         f1_score_training = labeler_utils.F1Score(
@@ -635,7 +635,7 @@ def _reconstruct_model(self) -> None:

         # Compile the model
         softmax_output_layer_name = self._model.output_names[0]
-        losses = {softmax_output_layer_name: "categorical_crossentropy"}
+        losses = ["categorical_crossentropy", None, None]

         # use f1 score metric
         f1_score_training = labeler_utils.F1Score(
@@ -699,13 +699,14 @@ def fit(
         f1_report: dict = {}

         self._model.reset_metrics()
-        softmax_output_layer_name = self._model.output_names[0]
+        # softmax_output_layer_name = self._model.output_names[0]

         start_time = time.time()
         batch_id = 0
         for x_train, y_train in train_data:
             model_results = self._model.train_on_batch(
-                x_train, {softmax_output_layer_name: y_train}
+                x_train,
+                y_train,
             )
             sys.stdout.flush()
             if verbose:
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 8c7c7868..163dae50 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,7 +1,7 @@
 check-manifest>=0.50
 black>=24.3.0
 isort==5.12.0
-pre-commit==2.19.0
+pre-commit==4.3.0
 tox==3.25.1
 tox-conda==0.10.2
 types-setuptools==67.7.0.1
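
Note on the labeler changes in PATCH 3/5: with Keras 3 the loss is no longer keyed by self._model.output_names[0]; instead one loss is listed per output positionally, with None for the auxiliary outputs (the three-entry list implies three model outputs), and the softmax targets are passed straight to train_on_batch. Below is a minimal, hedged sketch of that pattern with illustrative layer names and sizes (not DataProfiler's real architecture), assuming the keras>=3.11 / tensorflow>=2.16 environment pinned in PATCH 1/5 and 2/5:

    import keras
    import numpy as np

    # Toy stand-in for the labeler network: one trainable softmax head plus
    # two auxiliary outputs derived from it (argmax prediction, max confidence).
    inputs = keras.Input(shape=(16,), name="features")
    probs = keras.layers.Dense(4, activation="softmax", name="softmax_output")(inputs)
    argmax_out = keras.layers.Lambda(
        lambda t: keras.ops.argmax(t, axis=-1), name="argmax_output"
    )(probs)
    conf_out = keras.layers.Lambda(
        lambda t: keras.ops.max(t, axis=-1), name="confidence_output"
    )(probs)
    model = keras.Model(inputs, [probs, argmax_out, conf_out])

    # One loss per output, in order; None excludes the auxiliary outputs from
    # the training objective, mirroring losses = ["categorical_crossentropy", None, None].
    model.compile(optimizer="adam", loss=["categorical_crossentropy", None, None])

    x = np.random.rand(8, 16).astype("float32")
    y = keras.utils.to_categorical(np.random.randint(0, 4, size=8), num_classes=4)

    # Targets are supplied only for the loss-bearing softmax output, positionally,
    # mirroring the patched train_on_batch(x_train, y_train) call.
    model.train_on_batch(x, y)

The earlier dict form stopped working once Keras 3 changed how output names are exposed, which is presumably why the patch also comments out the output_names lookup in fit().
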
From 9cb25a08a194ea365821e58288a13e57f6b5f873 Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Fri, 22 Aug 2025 14:12:55 -0500
Subject: [PATCH 4/5] fix: remove global

---
 dataprofiler/plugins/decorators.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dataprofiler/plugins/decorators.py b/dataprofiler/plugins/decorators.py
index c781f430..3996578b 100644
--- a/dataprofiler/plugins/decorators.py
+++ b/dataprofiler/plugins/decorators.py
@@ -1,4 +1,5 @@
 """Contains function for generating plugins data."""
+
 from collections import defaultdict
 from typing import Any, DefaultDict, Dict

@@ -19,9 +20,9 @@ def __inner_factory_function(fn):
         Actual population of plugin_dict.

         :param fn: Plugin function
-        :return: function
+        :return: functions
         """
-        global plugins_dict
+        # global plugins_dict
         plugins_dict[typ][name] = fn
         return fn

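
Note on PATCH 4/5: the global statement is safe to drop because __inner_factory_function only mutates plugins_dict in place and never rebinds the module-level name; global is only required when assigning to the name itself. A short, self-contained illustration with hypothetical names (not DataProfiler code):

    from collections import defaultdict

    registry = defaultdict(dict)

    def register(typ, name):
        def _inner(fn):
            # Mutating the module-level dict needs no `global` declaration...
            registry[typ][name] = fn
            return fn
        return _inner

    def reset_registry():
        # ...whereas rebinding the module-level name would.
        global registry
        registry = defaultdict(dict)

    @register("report", "summary")
    def summarize(data):
        return len(data)

    print(registry["report"]["summary"]([1, 2, 3]))  # prints 3
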
From d7ccffca082ef85404e65c3312bbd88454b7254e Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Fri, 22 Aug 2025 14:37:56 -0500
Subject: [PATCH 5/5] fix: pre-commit issues

---
 dataprofiler/labelers/base_model.py      |  3 +-
 dataprofiler/labelers/data_processing.py | 68 ++++++++++++++++++------
 2 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/dataprofiler/labelers/base_model.py b/dataprofiler/labelers/base_model.py
index 032c2ea3..c5d7aef5 100644
--- a/dataprofiler/labelers/base_model.py
+++ b/dataprofiler/labelers/base_model.py
@@ -1,4 +1,5 @@
 """Contains abstract classes for labeling data."""
+
 from __future__ import annotations

 import abc
@@ -78,7 +79,7 @@ def __eq__(self, other: object) -> bool:
         :rtype: bool
         """
         if (
-            type(self) != type(other)
+            type(self) is not type(other)
             or not isinstance(other, BaseModel)
             or self._parameters != other._parameters
             or self._label_mapping != other._label_mapping
diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py
index d53980a3..3a35b00e 100644
--- a/dataprofiler/labelers/data_processing.py
+++ b/dataprofiler/labelers/data_processing.py
@@ -1,4 +1,5 @@
 """Contains pre-built processors for data labeling/processing."""
+
 from __future__ import annotations

 import abc
@@ -70,7 +71,7 @@ def __eq__(self, other: object) -> bool:
         :rtype: bool
         """
         if (
-            type(self) != type(other)
+            type(self) is not type(other)
             or not isinstance(other, BaseDataProcessor)
             or self._parameters != other._parameters
         ):
@@ -173,9 +174,11 @@ def process(
         labels: np.ndarray | None = None,
         label_mapping: dict[str, int] | None = None,
         batch_size: int = 32,
-    ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[
-        np.ndarray, np.ndarray
-    ] | np.ndarray:
+    ) -> (
+        Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]
+        | tuple[np.ndarray, np.ndarray]
+        | np.ndarray
+    ):
         """Preprocess data."""
         raise NotImplementedError()

@@ -377,7 +380,16 @@ def _find_nearest_sentence_break_before_ind(
         sentence: str,
         start_ind: int,
         min_ind: int = 0,
-        separators: tuple[str, ...] = (" ", "\n", ",", "\t", "\r", "\x00", "\x01", ";"),
+        separators: tuple[str, ...] = (
+            " ",
+            "\n",
+            ",",
+            "\t",
+            "\r",
+            "\x00",
+            "\x01",
+            ";",
+        ),
     ) -> int:
         """
         Find nearest separator before the start_ind and return the index.
@@ -531,7 +543,8 @@ def gen_none() -> Generator[None, None, None]:

             # pad the data until fits maximum length
             pad_len = max(
-                max_length - separate_ind + buffer_ind, max_length - sample_len
+                max_length - separate_ind + buffer_ind,
+                max_length - sample_len,
             )

             # Only add the buffer up until maximum length
@@ -891,7 +904,17 @@ def __init__(
         flatten_separator: str = " ",
         use_word_level_argmax: bool = False,
         output_format: str = "character_argmax",
-        separators: tuple[str, ...] = (" ", ",", ";", "'", '"', ":", "\n", "\t", "."),
+        separators: tuple[str, ...] = (
+            " ",
+            ",",
+            ";",
+            "'",
+            '"',
+            ":",
+            "\n",
+            "\t",
+            ".",
+        ),
         word_level_min_percent: float = 0.75,
     ) -> None:
         """
@@ -1185,7 +1208,11 @@ def convert_to_NER_format(
             if begin_idx != -1:
                 # Add last sample
                 sample_output.append(
-                    (begin_idx, curr_idx + 1, reverse_label_mapping[(int(curr_label))])
+                    (
+                        begin_idx,
+                        curr_idx + 1,
+                        reverse_label_mapping[(int(curr_label))],
+                    )
                 )
             # Add to total output list
             output_result.append(sample_output)
@@ -1194,7 +1221,10 @@ def convert_to_NER_format(

     @staticmethod
     def match_sentence_lengths(
-        data: np.ndarray, results: dict, flatten_separator: str, inplace: bool = True
+        data: np.ndarray,
+        results: dict,
+        flatten_separator: str,
+        inplace: bool = True,
     ) -> dict:
         """
         Convert results from model into same ragged data shapes as original data.
@@ -1516,7 +1546,10 @@ def process(
             np_unstruct_labels = None

         return super().process(
-            np.array(unstructured_data), np_unstruct_labels, label_mapping, batch_size
+            np.array(unstructured_data),
+            np_unstruct_labels,
+            label_mapping,
+            batch_size,
         )


@@ -1586,7 +1619,7 @@ def __eq__(self, other: object) -> bool:
         :rtype: bool
         """
         if (
-            type(self) != type(other)
+            type(self) is not type(other)
             or not isinstance(other, StructCharPostprocessor)
             or self._parameters["default_label"] != other._parameters["default_label"]
             or self._parameters["pad_label"] != other._parameters["pad_label"]
@@ -1662,7 +1695,10 @@ def help(cls) -> None:

     @staticmethod
     def match_sentence_lengths(
-        data: np.ndarray, results: dict, flatten_separator: str, inplace: bool = True
+        data: np.ndarray,
+        results: dict,
+        flatten_separator: str,
+        inplace: bool = True,
     ) -> dict:
         """
         Convert results from model into same ragged data shapes as original data.
@@ -1947,9 +1983,11 @@ def _validate_parameters(self, parameters: dict) -> None:
                 # being changed and is already set
                 aggregation_func = parameters.get(
                     "aggregation_func",
-                    self._parameters.get("aggregation_func")
-                    if hasattr(self, "_parameters")
-                    else None,
+                    (
+                        self._parameters.get("aggregation_func")
+                        if hasattr(self, "_parameters")
+                        else None
+                    ),
                 )
                 if value is None and aggregation_func == "priority":
                     errors.append(