From c25f770cd6c6d012a8c0031c99875bbc4aeb2b46 Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Thu, 21 Aug 2025 15:38:20 -0500
Subject: [PATCH 1/5] fix: requirements

---
 .pre-commit-config.yaml | 8 +++-----
 requirements-ml.txt     | 6 ++----
 requirements-test.txt   | 2 +-
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 666cde4b..f0966a1b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -82,11 +82,9 @@ repos:

             # requirements-ml.txt
             scikit-learn>=0.23.2,
-            'keras>=2.4.3,<=3.4.0',
+            'keras>=3.11.0',
             rapidfuzz>=2.6.1,
-            "tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
-            "tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
-            "tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
+            tensorflow>=2.15.0,
             tqdm>=4.0.0,

             # requirements-reports.txt
@@ -101,7 +99,7 @@ repos:
             pytest-xdist>=2.1.0,
             pytest-forked>=1.3.0,
             toolz>=0.10.0,
-            'memray>=1.7.0,<1.12.0',
+            'memray>=1.18.0',
           ]
   # Check-manifest: ensures required non-Python files are included in MANIFEST.in
   # https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
diff --git a/requirements-ml.txt b/requirements-ml.txt
index 31f9ca63..b3005a5c 100644
--- a/requirements-ml.txt
+++ b/requirements-ml.txt
@@ -1,7 +1,5 @@
 scikit-learn>=0.23.2
-keras<=3.4.0
+keras>=3.11.0
 rapidfuzz>=2.6.1
-tensorflow>=2.16.0; sys.platform != 'darwin'
-tensorflow>=2.16.0; sys_platform == 'darwin' and platform_machine != 'arm64'
-tensorflow-macos>=2.16.0; sys_platform == 'darwin' and platform_machine == 'arm64'
+tensorflow>=2.16.0
 tqdm>=4.0.0
diff --git a/requirements-test.txt b/requirements-test.txt
index 725b2384..cf127b60 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -6,4 +6,4 @@ pytest-cov>=2.8.1
 pytest-xdist>=2.1.0
 pytest-forked>=1.3.0
 toolz>=0.10.0
-memray>=1.7.0,<1.12.0
+memray>=1.18.0

From dc1cd866182be6c0fa7b84d2ca5cc0e7ddce70e0 Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Thu, 21 Aug 2025 15:49:12 -0500
Subject: [PATCH 2/5] fix: reqs

---
 .pre-commit-config.yaml | 3 ++-
 requirements-ml.txt     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f0966a1b..9f7d445c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -82,7 +82,8 @@ repos:

             # requirements-ml.txt
             scikit-learn>=0.23.2,
-            'keras>=3.11.0',
+            "keras<=3.4.0; python_version <='3.9'",
+            "keras>=3.11.0; python_version > '3.9'",
             rapidfuzz>=2.6.1,
             tensorflow>=2.15.0,
             tqdm>=4.0.0,
diff --git a/requirements-ml.txt b/requirements-ml.txt
index b3005a5c..d7c6a282 100644
--- a/requirements-ml.txt
+++ b/requirements-ml.txt
@@ -1,5 +1,6 @@
 scikit-learn>=0.23.2
-keras>=3.11.0
+keras<=3.4.0; python_version <='3.9'
+keras>=3.11.0; python_version > '3.9'
 rapidfuzz>=2.6.1
 tensorflow>=2.16.0
 tqdm>=4.0.0
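
Note on PATCH 1/5 and 2/5: the keras pin is split with PEP 508 environment markers so that Python 3.9 environments keep keras<=3.4.0 while Python 3.10+ gets keras>=3.11.0, and the platform-specific tensorflow/tensorflow-macos lines collapse into a single tensorflow requirement. A small, hedged sketch of how such a marker evaluates, using the packaging library purely for illustration (it is not a dependency added by these patches):

    from packaging.markers import Marker

    # The same marker string used in requirements-ml.txt.
    marker = Marker("python_version > '3.9'")

    # True on Python 3.10 and newer, False on 3.9, so pip installs
    # whichever keras pin matches the running interpreter.
    print(marker.evaluate())
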
From 2378ed3fe7de732873e14fc779bf05c221eab269 Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Thu, 21 Aug 2025 16:35:39 -0500
Subject: [PATCH 3/5] fix: fit

---
 .pre-commit-config.yaml                            | 2 +-
 dataprofiler/labelers/character_level_cnn_model.py | 9 +++++----
 requirements-dev.txt                               | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9f7d445c..959c8475 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
   # Flake8: complexity and style checking
   # https://flake8.pycqa.org/en/latest/user/using-hooks.html
   - repo: https://github.com/pycqa/flake8
-    rev: 4.0.1
+    rev: 7.3.0
     hooks:
       - id: flake8
         additional_dependencies: [flake8-docstrings]
diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py
index 2cbb7051..601d8738 100644
--- a/dataprofiler/labelers/character_level_cnn_model.py
+++ b/dataprofiler/labelers/character_level_cnn_model.py
@@ -573,7 +573,7 @@ def _construct_model(self) -> None:

         # Compile the model
         softmax_output_layer_name = self._model.output_names[0]
-        losses = {softmax_output_layer_name: "categorical_crossentropy"}
+        losses = ["categorical_crossentropy", None, None]

         # use f1 score metric
         f1_score_training = labeler_utils.F1Score(
@@ -635,7 +635,7 @@ def _reconstruct_model(self) -> None:

         # Compile the model
         softmax_output_layer_name = self._model.output_names[0]
-        losses = {softmax_output_layer_name: "categorical_crossentropy"}
+        losses = ["categorical_crossentropy", None, None]

         # use f1 score metric
         f1_score_training = labeler_utils.F1Score(
@@ -699,13 +699,14 @@ def fit(
         f1_report: dict = {}

         self._model.reset_metrics()
-        softmax_output_layer_name = self._model.output_names[0]
+        # softmax_output_layer_name = self._model.output_names[0]

         start_time = time.time()
         batch_id = 0
         for x_train, y_train in train_data:
             model_results = self._model.train_on_batch(
-                x_train, {softmax_output_layer_name: y_train}
+                x_train,
+                y_train,
             )
             sys.stdout.flush()
             if verbose:
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 8c7c7868..163dae50 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,7 +1,7 @@
 check-manifest>=0.50
 black>=24.3.0
 isort==5.12.0
-pre-commit==2.19.0
+pre-commit==4.3.0
 tox==3.25.1
 tox-conda==0.10.2
 types-setuptools==67.7.0.1
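
Note on the labeler changes in PATCH 3/5: with Keras 3 the loss is no longer keyed by self._model.output_names[0]; instead one loss is listed per output positionally, with None for the auxiliary outputs (the three-entry list implies three model outputs), and the softmax targets are passed straight to train_on_batch. Below is a minimal, hedged sketch of that pattern with illustrative layer names and sizes (not DataProfiler's real architecture), assuming the keras>=3.11 / tensorflow>=2.16 environment pinned in PATCH 1/5 and 2/5:

    import keras
    import numpy as np

    # Toy stand-in for the labeler network: one trainable softmax head plus
    # two auxiliary outputs derived from it (argmax prediction, max confidence).
    inputs = keras.Input(shape=(16,), name="features")
    probs = keras.layers.Dense(4, activation="softmax", name="softmax_output")(inputs)
    argmax_out = keras.layers.Lambda(
        lambda t: keras.ops.argmax(t, axis=-1), name="argmax_output"
    )(probs)
    conf_out = keras.layers.Lambda(
        lambda t: keras.ops.max(t, axis=-1), name="confidence_output"
    )(probs)
    model = keras.Model(inputs, [probs, argmax_out, conf_out])

    # One loss per output, in order; None excludes the auxiliary outputs from
    # the training objective, mirroring losses = ["categorical_crossentropy", None, None].
    model.compile(optimizer="adam", loss=["categorical_crossentropy", None, None])

    x = np.random.rand(8, 16).astype("float32")
    y = keras.utils.to_categorical(np.random.randint(0, 4, size=8), num_classes=4)

    # Targets are supplied only for the loss-bearing softmax output, positionally,
    # mirroring the patched train_on_batch(x_train, y_train) call.
    model.train_on_batch(x, y)

The earlier dict form stopped working once Keras 3 changed how output names are exposed, which is presumably why the patch also comments out the output_names lookup in fit().
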
From 9cb25a08a194ea365821e58288a13e57f6b5f873 Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Fri, 22 Aug 2025 14:12:55 -0500
Subject: [PATCH 4/5] fix: remove global

---
 dataprofiler/plugins/decorators.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dataprofiler/plugins/decorators.py b/dataprofiler/plugins/decorators.py
index c781f430..3996578b 100644
--- a/dataprofiler/plugins/decorators.py
+++ b/dataprofiler/plugins/decorators.py
@@ -1,4 +1,5 @@
 """Contains function for generating plugins data."""
+
 from collections import defaultdict
 from typing import Any, DefaultDict, Dict

@@ -19,9 +20,9 @@ def __inner_factory_function(fn):
         Actual population of plugin_dict.

         :param fn: Plugin function
-        :return: function
+        :return: functions
         """
-        global plugins_dict
+        # global plugins_dict
         plugins_dict[typ][name] = fn
         return fn

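
Note on PATCH 4/5: the global statement is safe to drop because __inner_factory_function only mutates plugins_dict in place and never rebinds the module-level name; global is only required when assigning to the name itself. A short, self-contained illustration with hypothetical names (not DataProfiler code):

    from collections import defaultdict

    registry = defaultdict(dict)

    def register(typ, name):
        def _inner(fn):
            # Mutating the module-level dict needs no `global` declaration...
            registry[typ][name] = fn
            return fn
        return _inner

    def reset_registry():
        # ...whereas rebinding the module-level name would.
        global registry
        registry = defaultdict(dict)

    @register("report", "summary")
    def summarize(data):
        return len(data)

    print(registry["report"]["summary"]([1, 2, 3]))  # prints 3
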
From d7ccffca082ef85404e65c3312bbd88454b7254e Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Fri, 22 Aug 2025 14:37:56 -0500
Subject: [PATCH 5/5] fix: pre-commit issues

---
 dataprofiler/labelers/base_model.py      |  3 +-
 dataprofiler/labelers/data_processing.py | 68 ++++++++++++++++++------
 2 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/dataprofiler/labelers/base_model.py b/dataprofiler/labelers/base_model.py
index 032c2ea3..c5d7aef5 100644
--- a/dataprofiler/labelers/base_model.py
+++ b/dataprofiler/labelers/base_model.py
@@ -1,4 +1,5 @@
 """Contains abstract classes for labeling data."""
+
 from __future__ import annotations

 import abc
@@ -78,7 +79,7 @@ def __eq__(self, other: object) -> bool:
         :rtype: bool
         """
         if (
-            type(self) != type(other)
+            type(self) is not type(other)
             or not isinstance(other, BaseModel)
             or self._parameters != other._parameters
             or self._label_mapping != other._label_mapping
diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py
index d53980a3..3a35b00e 100644
--- a/dataprofiler/labelers/data_processing.py
+++ b/dataprofiler/labelers/data_processing.py
@@ -1,4 +1,5 @@
 """Contains pre-built processors for data labeling/processing."""
+
 from __future__ import annotations

 import abc
@@ -70,7 +71,7 @@ def __eq__(self, other: object) -> bool:
         :rtype: bool
         """
         if (
-            type(self) != type(other)
+            type(self) is not type(other)
             or not isinstance(other, BaseDataProcessor)
             or self._parameters != other._parameters
         ):
@@ -173,9 +174,11 @@ def process(
         labels: np.ndarray | None = None,
         label_mapping: dict[str, int] | None = None,
         batch_size: int = 32,
-    ) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[
-        np.ndarray, np.ndarray
-    ] | np.ndarray:
+    ) -> (
+        Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]
+        | tuple[np.ndarray, np.ndarray]
+        | np.ndarray
+    ):
         """Preprocess data."""
         raise NotImplementedError()

@@ -377,7 +380,16 @@ def _find_nearest_sentence_break_before_ind(
         sentence: str,
         start_ind: int,
         min_ind: int = 0,
-        separators: tuple[str, ...] = (" ", "\n", ",", "\t", "\r", "\x00", "\x01", ";"),
+        separators: tuple[str, ...] = (
+            " ",
+            "\n",
+            ",",
+            "\t",
+            "\r",
+            "\x00",
+            "\x01",
+            ";",
+        ),
     ) -> int:
         """
         Find nearest separator before the start_ind and return the index.
@@ -531,7 +543,8 @@ def gen_none() -> Generator[None, None, None]:

             # pad the data until fits maximum length
             pad_len = max(
-                max_length - separate_ind + buffer_ind, max_length - sample_len
+                max_length - separate_ind + buffer_ind,
+                max_length - sample_len,
             )

             # Only add the buffer up until maximum length
@@ -891,7 +904,17 @@ def __init__(
         flatten_separator: str = " ",
         use_word_level_argmax: bool = False,
         output_format: str = "character_argmax",
-        separators: tuple[str, ...] = (" ", ",", ";", "'", '"', ":", "\n", "\t", "."),
+        separators: tuple[str, ...] = (
+            " ",
+            ",",
+            ";",
+            "'",
+            '"',
+            ":",
+            "\n",
+            "\t",
+            ".",
+        ),
         word_level_min_percent: float = 0.75,
     ) -> None:
         """
@@ -1185,7 +1208,11 @@ def convert_to_NER_format(
             if begin_idx != -1:
                 # Add last sample
                 sample_output.append(
-                    (begin_idx, curr_idx + 1, reverse_label_mapping[(int(curr_label))])
+                    (
+                        begin_idx,
+                        curr_idx + 1,
+                        reverse_label_mapping[(int(curr_label))],
+                    )
                 )
             # Add to total output list
             output_result.append(sample_output)
@@ -1194,7 +1221,10 @@ def convert_to_NER_format(

     @staticmethod
     def match_sentence_lengths(
-        data: np.ndarray, results: dict, flatten_separator: str, inplace: bool = True
+        data: np.ndarray,
+        results: dict,
+        flatten_separator: str,
+        inplace: bool = True,
     ) -> dict:
         """
         Convert results from model into same ragged data shapes as original data.
@@ -1516,7 +1546,10 @@ def process(
             np_unstruct_labels = None

         return super().process(
-            np.array(unstructured_data), np_unstruct_labels, label_mapping, batch_size
+            np.array(unstructured_data),
+            np_unstruct_labels,
+            label_mapping,
+            batch_size,
         )


@@ -1586,7 +1619,7 @@ def __eq__(self, other: object) -> bool:
         :rtype: bool
         """
         if (
-            type(self) != type(other)
+            type(self) is not type(other)
             or not isinstance(other, StructCharPostprocessor)
             or self._parameters["default_label"] != other._parameters["default_label"]
             or self._parameters["pad_label"] != other._parameters["pad_label"]
@@ -1662,7 +1695,10 @@ def help(cls) -> None:

     @staticmethod
     def match_sentence_lengths(
-        data: np.ndarray, results: dict, flatten_separator: str, inplace: bool = True
+        data: np.ndarray,
+        results: dict,
+        flatten_separator: str,
+        inplace: bool = True,
     ) -> dict:
         """
         Convert results from model into same ragged data shapes as original data.
@@ -1947,9 +1983,11 @@ def _validate_parameters(self, parameters: dict) -> None:
                 # being changed and is already set
                 aggregation_func = parameters.get(
                     "aggregation_func",
-                    self._parameters.get("aggregation_func")
-                    if hasattr(self, "_parameters")
-                    else None,
+                    (
+                        self._parameters.get("aggregation_func")
+                        if hasattr(self, "_parameters")
+                        else None
+                    ),
                 )
                 if value is None and aggregation_func == "priority":
                     errors.append(