Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ repos:
# Flake8: complexity and style checking
# https://flake8.pycqa.org/en/latest/user/using-hooks.html
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
rev: 7.3.0
hooks:
- id: flake8
additional_dependencies: [flake8-docstrings]
Expand Down Expand Up @@ -82,11 +82,10 @@ repos:

# requirements-ml.txt
scikit-learn>=0.23.2,
'keras>=2.4.3,<=3.4.0',
"keras<=3.4.0; python_version <='3.9'",
"keras>=3.11.0; python_version > '3.9'",
rapidfuzz>=2.6.1,
"tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
"tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
"tensorflow-macos>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine == 'arm64'",
tensorflow>=2.15.0,
tqdm>=4.0.0,

# requirements-reports.txt
Expand All @@ -101,7 +100,7 @@ repos:
pytest-xdist>=2.1.0,
pytest-forked>=1.3.0,
toolz>=0.10.0,
'memray>=1.7.0,<1.12.0',
'memray>=1.18.0',
]
# Check-manifest: ensures required non-Python files are included in MANIFEST.in
# https://github.com/mgedmin/check-manifest/blob/master/.pre-commit-hooks.yaml
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/labelers/base_model.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains abstract classes for labeling data."""

from __future__ import annotations

import abc
Expand Down Expand Up @@ -78,7 +79,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, BaseModel)
or self._parameters != other._parameters
or self._label_mapping != other._label_mapping
Expand Down
9 changes: 5 additions & 4 deletions dataprofiler/labelers/character_level_cnn_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ def _construct_model(self) -> None:

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}
losses = ["categorical_crossentropy", None, None]

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
Expand Down Expand Up @@ -635,7 +635,7 @@ def _reconstruct_model(self) -> None:

# Compile the model
softmax_output_layer_name = self._model.output_names[0]
losses = {softmax_output_layer_name: "categorical_crossentropy"}
losses = ["categorical_crossentropy", None, None]

# use f1 score metric
f1_score_training = labeler_utils.F1Score(
Expand Down Expand Up @@ -699,13 +699,14 @@ def fit(
f1_report: dict = {}

self._model.reset_metrics()
softmax_output_layer_name = self._model.output_names[0]
# softmax_output_layer_name = self._model.output_names[0]

start_time = time.time()
batch_id = 0
for x_train, y_train in train_data:
model_results = self._model.train_on_batch(
x_train, {softmax_output_layer_name: y_train}
x_train,
y_train,
)
sys.stdout.flush()
if verbose:
Expand Down
68 changes: 53 additions & 15 deletions dataprofiler/labelers/data_processing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains pre-built processors for data labeling/processing."""

from __future__ import annotations

import abc
Expand Down Expand Up @@ -70,7 +71,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, BaseDataProcessor)
or self._parameters != other._parameters
):
Expand Down Expand Up @@ -173,9 +174,11 @@ def process(
labels: np.ndarray | None = None,
label_mapping: dict[str, int] | None = None,
batch_size: int = 32,
) -> Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None] | tuple[
np.ndarray, np.ndarray
] | np.ndarray:
) -> (
Generator[tuple[np.ndarray, np.ndarray] | np.ndarray, None, None]
| tuple[np.ndarray, np.ndarray]
| np.ndarray
):
"""Preprocess data."""
raise NotImplementedError()

Expand Down Expand Up @@ -377,7 +380,16 @@ def _find_nearest_sentence_break_before_ind(
sentence: str,
start_ind: int,
min_ind: int = 0,
separators: tuple[str, ...] = (" ", "\n", ",", "\t", "\r", "\x00", "\x01", ";"),
separators: tuple[str, ...] = (
" ",
"\n",
",",
"\t",
"\r",
"\x00",
"\x01",
";",
),
) -> int:
"""
Find nearest separator before the start_ind and return the index.
Expand Down Expand Up @@ -531,7 +543,8 @@ def gen_none() -> Generator[None, None, None]:

# pad the data until fits maximum length
pad_len = max(
max_length - separate_ind + buffer_ind, max_length - sample_len
max_length - separate_ind + buffer_ind,
max_length - sample_len,
)

# Only add the buffer up until maximum length
Expand Down Expand Up @@ -891,7 +904,17 @@ def __init__(
flatten_separator: str = " ",
use_word_level_argmax: bool = False,
output_format: str = "character_argmax",
separators: tuple[str, ...] = (" ", ",", ";", "'", '"', ":", "\n", "\t", "."),
separators: tuple[str, ...] = (
" ",
",",
";",
"'",
'"',
":",
"\n",
"\t",
".",
),
word_level_min_percent: float = 0.75,
) -> None:
"""
Expand Down Expand Up @@ -1185,7 +1208,11 @@ def convert_to_NER_format(
if begin_idx != -1:
# Add last sample
sample_output.append(
(begin_idx, curr_idx + 1, reverse_label_mapping[(int(curr_label))])
(
begin_idx,
curr_idx + 1,
reverse_label_mapping[(int(curr_label))],
)
)
# Add to total output list
output_result.append(sample_output)
Expand All @@ -1194,7 +1221,10 @@ def convert_to_NER_format(

@staticmethod
def match_sentence_lengths(
data: np.ndarray, results: dict, flatten_separator: str, inplace: bool = True
data: np.ndarray,
results: dict,
flatten_separator: str,
inplace: bool = True,
) -> dict:
"""
Convert results from model into same ragged data shapes as original data.
Expand Down Expand Up @@ -1516,7 +1546,10 @@ def process(
np_unstruct_labels = None

return super().process(
np.array(unstructured_data), np_unstruct_labels, label_mapping, batch_size
np.array(unstructured_data),
np_unstruct_labels,
label_mapping,
batch_size,
)


Expand Down Expand Up @@ -1586,7 +1619,7 @@ def __eq__(self, other: object) -> bool:
:rtype: bool
"""
if (
type(self) != type(other)
type(self) is not type(other)
or not isinstance(other, StructCharPostprocessor)
or self._parameters["default_label"] != other._parameters["default_label"]
or self._parameters["pad_label"] != other._parameters["pad_label"]
Expand Down Expand Up @@ -1662,7 +1695,10 @@ def help(cls) -> None:

@staticmethod
def match_sentence_lengths(
data: np.ndarray, results: dict, flatten_separator: str, inplace: bool = True
data: np.ndarray,
results: dict,
flatten_separator: str,
inplace: bool = True,
) -> dict:
"""
Convert results from model into same ragged data shapes as original data.
Expand Down Expand Up @@ -1947,9 +1983,11 @@ def _validate_parameters(self, parameters: dict) -> None:
# being changed and is already set
aggregation_func = parameters.get(
"aggregation_func",
self._parameters.get("aggregation_func")
if hasattr(self, "_parameters")
else None,
(
self._parameters.get("aggregation_func")
if hasattr(self, "_parameters")
else None
),
)
if value is None and aggregation_func == "priority":
errors.append(
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/plugins/decorators.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains function for generating plugins data."""

from collections import defaultdict
from typing import Any, DefaultDict, Dict

Expand All @@ -19,9 +20,9 @@ def __inner_factory_function(fn):
Actual population of plugin_dict.

:param fn: Plugin function
:return: function
:return: functions
"""
global plugins_dict
# global plugins_dict
plugins_dict[typ][name] = fn
return fn

Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
check-manifest>=0.50
black>=24.3.0
isort==5.12.0
pre-commit==2.19.0
pre-commit==4.3.0
tox==3.25.1
tox-conda==0.10.2
types-setuptools==67.7.0.1
Expand Down
7 changes: 3 additions & 4 deletions requirements-ml.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
scikit-learn>=0.23.2
keras<=3.4.0
keras<=3.4.0; python_version <='3.9'
keras>=3.11.0; python_version > '3.9'
rapidfuzz>=2.6.1
tensorflow>=2.16.0; sys.platform != 'darwin'
tensorflow>=2.16.0; sys_platform == 'darwin' and platform_machine != 'arm64'
tensorflow-macos>=2.16.0; sys_platform == 'darwin' and platform_machine == 'arm64'
tensorflow>=2.16.0
tqdm>=4.0.0
2 changes: 1 addition & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ pytest-cov>=2.8.1
pytest-xdist>=2.1.0
pytest-forked>=1.3.0
toolz>=0.10.0
memray>=1.7.0,<1.12.0
memray>=1.18.0
Loading