Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions dataprofiler/labelers/base_data_labeler.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains abstract classes from which labeler classes will inherit."""

from __future__ import annotations

import json
Expand All @@ -9,15 +10,14 @@

import numpy as np
import pandas as pd
import pkg_resources

from dataprofiler._typing import DataArray

from .. import data_readers
from . import data_processing
from . import data_processing, utils
from .base_model import BaseModel

default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
default_labeler_dir = utils.find_resources_dir() / "labelers"


class BaseDataLabeler:
Expand Down Expand Up @@ -246,7 +246,8 @@ def set_params(self, params: dict) -> None:
self._postprocessor.set_params(**params["postprocessor"])

self.check_pipeline(
skip_postprocessor=self._postprocessor is None, error_on_mismatch=False
skip_postprocessor=self._postprocessor is None,
error_on_mismatch=False,
)

def add_label(self, label: str, same_as: str = None) -> None:
Expand Down Expand Up @@ -438,7 +439,9 @@ def get_parameter_overlap_mismatches(
messages.append(
"Preprocessor and postprocessor value for `{}` do not "
"match. {} != {}".format(
param, preprocessor_params[param], postprocessor_params[param]
param,
preprocessor_params[param],
postprocessor_params[param],
)
)
if messages:
Expand Down Expand Up @@ -490,7 +493,8 @@ def _load_parameters(dirpath: str, load_options: dict = None) -> dict[str, dict]
"The load_options preprocessor class does not "
"match the required DataLabeler preprocessor."
"\n {} != {}".format(
processor_class.__class__.__name__, param_processor_class
processor_class.__class__.__name__,
param_processor_class,
)
)
params["preprocessor"]["class"] = load_options.get("preprocessor_class")
Expand All @@ -505,7 +509,8 @@ def _load_parameters(dirpath: str, load_options: dict = None) -> dict[str, dict]
raise ValueError(
"The load_options postprocessor class does not match "
"the required DataLabeler postprocessor.\n {} != {}".format(
processor_class.__class__.__name__, param_processor_class
processor_class.__class__.__name__,
param_processor_class,
)
)
params["postprocessor"]["class"] = load_options.get("postprocessor_class")
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/labelers/data_labelers.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
"""Module to train and choose between structured and unstructured data labelers."""

from __future__ import annotations

import os

import pandas as pd
import pkg_resources

from .. import data_readers
from . import utils
from .base_data_labeler import BaseDataLabeler, TrainableDataLabeler
from .base_model import BaseModel
from .data_processing import BaseDataPostprocessor, BaseDataPreprocessor

default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
default_labeler_dir = utils.find_resources_dir() / "labelers"


def train_structured_labeler(
Expand Down
7 changes: 5 additions & 2 deletions dataprofiler/labelers/data_processing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Contains pre-built processors for data labeling/processing."""

from __future__ import annotations

import abc
Expand All @@ -15,9 +16,11 @@

import numpy as np
import numpy.typing as npt
import pkg_resources

default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
from . import utils

default_labeler_dir = utils.find_resources_dir() / "labelers"


Processor = TypeVar("Processor", bound="BaseDataProcessor")

Expand Down
23 changes: 23 additions & 0 deletions dataprofiler/labelers/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""Contains functions for checking for installations/dependencies."""

import sys
import sysconfig
import warnings
from pathlib import Path
from typing import Any, Callable, List


Expand Down Expand Up @@ -50,3 +53,23 @@ def new_f(*args: Any, **kwds: Any) -> Any:
return new_f

return check_module


def find_resources_dir() -> Path:
    """Return the path to the package ``resources`` directory for the labelers.

    Candidate locations are examined in this order:

    1. ``<data prefix>/resources`` -- where ``data_files`` are placed on
       install (the prefix comes from ``sysconfig.get_path("data")``).
    2. ``<ancestor>/resources`` for every ancestor directory of this module,
       nearest first (source checkouts, editable installs, tests).

    A candidate that actually contains a ``labelers`` sub-directory is
    preferred, so an unrelated directory that merely happens to be called
    ``resources`` cannot shadow the real one.  If no candidate contains
    ``labelers``, the first candidate that exists at all is returned.

    :return: path of the located ``resources`` directory
    :raises FileNotFoundError: if none of the candidate locations exist
    """
    installed = Path(sysconfig.get_path("data")) / "resources"
    here = Path(__file__).resolve()
    # ``here`` is a regular file, so only its ancestor directories can hold
    # a ``resources`` directory; the file path itself is not probed.
    candidates = [installed, *(parent / "resources" for parent in here.parents)]

    # Pass 1: prefer a directory that really holds the labeler resources.
    for candidate in candidates:
        if (candidate / "labelers").is_dir():
            return candidate

    # Pass 2: no candidate has ``labelers``; fall back to the first existing
    # ``resources`` directory so lookups that used to succeed still do.
    for candidate in candidates:
        if candidate.exists():
            return candidate

    raise FileNotFoundError("Could not locate resources (installed or source tree).")
9 changes: 5 additions & 4 deletions dataprofiler/tests/labelers/test_char_tf_load_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@

import numpy as np
import pandas as pd
import pkg_resources
import tensorflow as tf

from dataprofiler.labelers import utils as labeler_utils
from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel

_file_dir = os.path.dirname(os.path.abspath(__file__))
_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers")

default_labeler_dir = labeler_utils.find_resources_dir() / "labelers"

mock_model_parameters = {
"model_path": "project/example/path/fake_model.h5",
Expand Down Expand Up @@ -303,7 +302,9 @@ def test_param_validation(self, *mocks):
"fake_extra_param": "fails",
}
model = CharLoadTFModel(
self.model_path, label_mapping=self.label_mapping, parameters=parameters
self.model_path,
label_mapping=self.label_mapping,
parameters=parameters,
)
model._construct_model()
self.assertDictEqual(parameters, model._parameters)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@

import numpy as np
import pandas as pd
import pkg_resources
import tensorflow as tf

from dataprofiler.labelers import utils as labeler_utils
from dataprofiler.labelers.character_level_cnn_model import (
CharacterLevelCnnModel,
EncodingLayer,
)

_file_dir = os.path.dirname(os.path.abspath(__file__))
_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers"


mock_model_parameters = {
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/tests/labelers/test_column_name_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,14 @@
from unittest import mock

import numpy as np
import pkg_resources

import dataprofiler as dp
from dataprofiler.labelers import utils as labeler_utils
from dataprofiler.labelers.column_name_model import ColumnNameModel

_file_dir = os.path.dirname(os.path.abspath(__file__))
_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers"


mock_model_parameters = {
"true_positive_dict": [
Expand Down
7 changes: 3 additions & 4 deletions dataprofiler/tests/labelers/test_data_labelers.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,10 @@ def test_load_from_library(self, *mocks):

@mock.patch("tensorflow.keras.models.load_model")
def test_load_from_disk(self, *mocks):
import pkg_resources
from dataprofiler.labelers import utils as labeler_utils

default_labeler_dir = labeler_utils.find_resources_dir() / "labelers"

default_labeler_dir = pkg_resources.resource_filename(
"resources", "labelers/structured_model"
)
data_labeler = dp.DataLabeler.load_from_disk(default_labeler_dir)
self.assertIsInstance(data_labeler, BaseDataLabeler)

Expand Down
9 changes: 7 additions & 2 deletions dataprofiler/tests/labelers/test_data_processing.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
pass
import json
import os
import random
import re
import unittest
from io import StringIO

pass
from unittest import mock

import numpy as np
import pkg_resources

from dataprofiler.labelers import utils as labeler_utils
from dataprofiler.labelers.data_processing import (
BaseDataProcessor,
CharEncodedPreprocessor,
Expand Down Expand Up @@ -224,7 +227,9 @@ def test_load_from_library(self, mocked_load, *mocks):
BaseDataProcessor.load_from_library("default")

# assert called with proper load_processor dirpath
default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")

default_labeler_dir = labeler_utils.find_resources_dir() / "labelers"

mocked_load.assert_called_with(os.path.join(default_labeler_dir, "default"))

@mock.patch("builtins.open")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import unittest

import numpy as np
import pkg_resources

import dataprofiler as dp
from dataprofiler.labelers import utils as labeler_utils
from dataprofiler.labelers.column_name_model import ColumnNameModel
from dataprofiler.labelers.data_labelers import BaseDataLabeler
from dataprofiler.labelers.data_processing import (
ColumnNameModelPostprocessor,
DirectPassPreprocessor,
)

default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
default_labeler_dir = labeler_utils.find_resources_dir() / "labelers"


class TestColumnNameDataLabeler(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,25 @@
import unittest

import numpy as np
import pkg_resources

from dataprofiler.labelers import utils as labeler_utils
from dataprofiler.labelers.data_labelers import BaseDataLabeler

default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
default_labeler_dir = labeler_utils.find_resources_dir() / "labelers"


class TestRegexDataLabeler(unittest.TestCase):
@classmethod
def setUpClass(cls) -> None:
cls.data = np.array(
["123 Fake St.", "1/2/2020", "nice.", "4/3/22", "abc", "333-44-2341"]
[
"123 Fake St.",
"1/2/2020",
"nice.",
"4/3/22",
"abc",
"333-44-2341",
]
).reshape((-1,))
cls.data_labeler = BaseDataLabeler.load_from_disk(
os.path.join(default_labeler_dir, "regex_model")
Expand Down
7 changes: 4 additions & 3 deletions dataprofiler/tests/labelers/test_regex_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
from unittest import mock

import numpy as np
import pkg_resources

from dataprofiler.labelers import utils as labeler_utils
from dataprofiler.labelers.regex_model import RegexModel

_file_dir = os.path.dirname(os.path.abspath(__file__))
_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers"


mock_model_parameters = {
Expand Down Expand Up @@ -161,7 +161,8 @@ def test_param_validation(self):
for invalid_param_set in invalid_parameters:
with self.assertRaises(ValueError):
RegexModel(
label_mapping=self.label_mapping, parameters=invalid_param_set
label_mapping=self.label_mapping,
parameters=invalid_param_set,
)

@mock.patch("sys.stdout", new_callable=StringIO)
Expand Down
Loading