diff --git a/dataprofiler/labelers/base_data_labeler.py b/dataprofiler/labelers/base_data_labeler.py index 201f7899..871c050e 100644 --- a/dataprofiler/labelers/base_data_labeler.py +++ b/dataprofiler/labelers/base_data_labeler.py @@ -1,4 +1,5 @@ """Contains abstract classes from which labeler classes will inherit.""" + from __future__ import annotations import json @@ -9,15 +10,14 @@ import numpy as np import pandas as pd -import pkg_resources from dataprofiler._typing import DataArray from .. import data_readers -from . import data_processing +from . import data_processing, utils from .base_model import BaseModel -default_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +default_labeler_dir = utils.find_resources_dir() / "labelers" class BaseDataLabeler: @@ -246,7 +246,8 @@ def set_params(self, params: dict) -> None: self._postprocessor.set_params(**params["postprocessor"]) self.check_pipeline( - skip_postprocessor=self._postprocessor is None, error_on_mismatch=False + skip_postprocessor=self._postprocessor is None, + error_on_mismatch=False, ) def add_label(self, label: str, same_as: str = None) -> None: @@ -438,7 +439,9 @@ def get_parameter_overlap_mismatches( messages.append( "Preprocessor and postprocessor value for `{}` do not " "match. {} != {}".format( - param, preprocessor_params[param], postprocessor_params[param] + param, + preprocessor_params[param], + postprocessor_params[param], ) ) if messages: @@ -490,7 +493,8 @@ def _load_parameters(dirpath: str, load_options: dict = None) -> dict[str, dict] "The load_options preprocessor class does not " "match the required DataLabeler preprocessor." 
"\n {} != {}".format( - processor_class.__class__.__name__, param_processor_class + processor_class.__class__.__name__, + param_processor_class, ) ) params["preprocessor"]["class"] = load_options.get("preprocessor_class") @@ -505,7 +509,8 @@ def _load_parameters(dirpath: str, load_options: dict = None) -> dict[str, dict] raise ValueError( "The load_options postprocessor class does not match " "the required DataLabeler postprocessor.\n {} != {}".format( - processor_class.__class__.__name__, param_processor_class + processor_class.__class__.__name__, + param_processor_class, ) ) params["postprocessor"]["class"] = load_options.get("postprocessor_class") diff --git a/dataprofiler/labelers/data_labelers.py b/dataprofiler/labelers/data_labelers.py index a6d9932b..5d69fb1a 100644 --- a/dataprofiler/labelers/data_labelers.py +++ b/dataprofiler/labelers/data_labelers.py @@ -1,17 +1,18 @@ """Module to train and choose between structured and unstructured data labelers.""" + from __future__ import annotations import os import pandas as pd -import pkg_resources from .. import data_readers +from . import utils from .base_data_labeler import BaseDataLabeler, TrainableDataLabeler from .base_model import BaseModel from .data_processing import BaseDataPostprocessor, BaseDataPreprocessor -default_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +default_labeler_dir = utils.find_resources_dir() / "labelers" def train_structured_labeler( diff --git a/dataprofiler/labelers/data_processing.py b/dataprofiler/labelers/data_processing.py index d53980a3..c4517a0e 100644 --- a/dataprofiler/labelers/data_processing.py +++ b/dataprofiler/labelers/data_processing.py @@ -1,4 +1,5 @@ """Contains pre-built processors for data labeling/processing.""" + from __future__ import annotations import abc @@ -15,9 +16,11 @@ import numpy as np import numpy.typing as npt -import pkg_resources -default_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +from . 
import utils
+
+default_labeler_dir = utils.find_resources_dir() / "labelers"
+
 
 Processor = TypeVar("Processor", bound="BaseDataProcessor")
 
diff --git a/dataprofiler/labelers/utils.py b/dataprofiler/labelers/utils.py
index 2d587f7b..ad4e7fcc 100644
--- a/dataprofiler/labelers/utils.py
+++ b/dataprofiler/labelers/utils.py
@@ -1,6 +1,9 @@
 """Contains functions for checking for installations/dependencies."""
+
 import sys
+import sysconfig
 import warnings
+from pathlib import Path
 from typing import Any, Callable, List
 
 
@@ -50,3 +53,35 @@ def new_f(*args: Any, **kwds: Any) -> Any:
         return new_f
 
     return check_module
+
+
+def find_resources_dir() -> Path:
+    """Return the ``resources`` directory that holds the labeler assets.
+
+    Candidates are probed in order and the first one that contains a
+    ``labelers`` subdirectory is returned:
+
+    1. ``<data prefix>/resources``, where the prefix comes from
+       ``sysconfig.get_path("data")`` (resources installed as data files).
+    2. ``<ancestor>/resources`` for every ancestor directory of this
+       module (e.g. a source checkout or an editable install).
+
+    Requiring ``labelers`` keeps an unrelated ``resources`` directory in
+    the prefix or higher up the tree from being picked up by mistake.
+
+    :return: path to the ``resources`` directory
+    :rtype: pathlib.Path
+    :raises FileNotFoundError: if no candidate contains ``labelers``
+    """
+    search_roots = [
+        Path(sysconfig.get_path("data")),
+        # The module file itself cannot contain a directory, so the walk
+        # starts at its parent directory.
+        *Path(__file__).resolve().parents,
+    ]
+    for root in search_roots:
+        candidate = root / "resources"
+        if (candidate / "labelers").is_dir():
+            return candidate
+
+    raise FileNotFoundError("Could not locate resources (installed or source tree).")
diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py
index c6d70f74..61173e57 100644
--- a/dataprofiler/tests/labelers/test_char_tf_load_model.py
+++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py
@@ -6,14 +6,13 @@
 
 import numpy as np
 import pandas as pd
-import pkg_resources
 import tensorflow as tf
 
+from dataprofiler.labelers import utils as labeler_utils
 from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel
 
 _file_dir = os.path.dirname(os.path.abspath(__file__))
-_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
-
+_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers"
 
 mock_model_parameters = {
     "model_path": "project/example/path/fake_model.h5",
@@ -303,7 +302,9 @@ def test_param_validation(self, *mocks):
             "fake_extra_param": "fails",
         }
         model = CharLoadTFModel(
-            self.model_path, label_mapping=self.label_mapping, parameters=parameters
+            self.model_path,
+            label_mapping=self.label_mapping,
+            parameters=parameters,
         )
         model._construct_model()
         self.assertDictEqual(parameters, model._parameters)
diff --git a/dataprofiler/tests/labelers/test_character_level_cnn_model.py b/dataprofiler/tests/labelers/test_character_level_cnn_model.py
index e120a975..530dda2a 100644
--- a/dataprofiler/tests/labelers/test_character_level_cnn_model.py
+++ b/dataprofiler/tests/labelers/test_character_level_cnn_model.py
@@ -6,16 +6,16 @@
 
 import numpy as np
 import pandas as pd
-import pkg_resources
 import tensorflow as tf
 
+from dataprofiler.labelers import utils as labeler_utils
 from dataprofiler.labelers.character_level_cnn_model import (
CharacterLevelCnnModel,
     EncodingLayer,
 )
 
 _file_dir = os.path.dirname(os.path.abspath(__file__))
-_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
+_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers"
 
 
 mock_model_parameters = {
diff --git a/dataprofiler/tests/labelers/test_column_name_model.py b/dataprofiler/tests/labelers/test_column_name_model.py
index 58f90839..e3326a39 100644
--- a/dataprofiler/tests/labelers/test_column_name_model.py
+++ b/dataprofiler/tests/labelers/test_column_name_model.py
@@ -6,13 +6,14 @@
 from unittest import mock
 
 import numpy as np
-import pkg_resources
 
 import dataprofiler as dp
+from dataprofiler.labelers import utils as labeler_utils
 from dataprofiler.labelers.column_name_model import ColumnNameModel
 
 _file_dir = os.path.dirname(os.path.abspath(__file__))
-_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
+_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers"
+
 
 mock_model_parameters = {
     "true_positive_dict": [
diff --git a/dataprofiler/tests/labelers/test_data_labelers.py b/dataprofiler/tests/labelers/test_data_labelers.py
index bbde1c50..c52187be 100644
--- a/dataprofiler/tests/labelers/test_data_labelers.py
+++ b/dataprofiler/tests/labelers/test_data_labelers.py
@@ -149,11 +149,12 @@ def test_load_from_library(self, *mocks):
 
     @mock.patch("tensorflow.keras.models.load_model")
     def test_load_from_disk(self, *mocks):
-        import pkg_resources
+        from dataprofiler.labelers import utils as labeler_utils
 
-        default_labeler_dir = pkg_resources.resource_filename(
-            "resources", "labelers/structured_model"
-        )
+        # load_from_disk expects one model directory, not the labelers root
+        default_labeler_dir = (
+            labeler_utils.find_resources_dir() / "labelers" / "structured_model"
+        )
         data_labeler = dp.DataLabeler.load_from_disk(default_labeler_dir)
         self.assertIsInstance(data_labeler, BaseDataLabeler)
 
diff --git a/dataprofiler/tests/labelers/test_data_processing.py b/dataprofiler/tests/labelers/test_data_processing.py
index 00b4b088..9d386d8d 100644
--- a/dataprofiler/tests/labelers/test_data_processing.py
+++ b/dataprofiler/tests/labelers/test_data_processing.py
@@ -1,14 +1,14 @@
 import json
 import os
 import random
 import re
 import unittest
 from io import StringIO
 from unittest import mock
 
 import numpy as np
-import pkg_resources
 
+from dataprofiler.labelers import utils as labeler_utils
 from dataprofiler.labelers.data_processing import (
     BaseDataProcessor,
     CharEncodedPreprocessor,
@@ -224,7 +224,7 @@ def test_load_from_library(self, mocked_load, *mocks):
         BaseDataProcessor.load_from_library("default")
 
         # assert called with proper load_processor dirpath
-        default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
+        default_labeler_dir = labeler_utils.find_resources_dir() / "labelers"
         mocked_load.assert_called_with(os.path.join(default_labeler_dir, "default"))
 
     @mock.patch("builtins.open")
diff --git a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py
index bcc136ae..5d230745 100644
--- a/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py
+++ b/dataprofiler/tests/labelers/test_integration_column_name_data_labeler.py
@@ -1,9 +1,9 @@
 import unittest
 
 import numpy as np
-import pkg_resources
 
 import dataprofiler as dp
+from dataprofiler.labelers import utils as labeler_utils
 from dataprofiler.labelers.column_name_model import ColumnNameModel
 from dataprofiler.labelers.data_labelers import BaseDataLabeler
 from dataprofiler.labelers.data_processing import (
@@ -11,7 +11,7 @@
     DirectPassPreprocessor,
 )
 
-default_labeler_dir = pkg_resources.resource_filename("resources", "labelers")
+default_labeler_dir = labeler_utils.find_resources_dir() / "labelers"
 
 
 class TestColumnNameDataLabeler(unittest.TestCase):
diff --git a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py
index 1cb75372..7c729cca
100644 --- a/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py +++ b/dataprofiler/tests/labelers/test_integration_regex_data_labeler.py @@ -2,18 +2,25 @@ import unittest import numpy as np -import pkg_resources +from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.data_labelers import BaseDataLabeler -default_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +default_labeler_dir = labeler_utils.find_resources_dir() / "labelers" class TestRegexDataLabeler(unittest.TestCase): @classmethod def setUpClass(cls) -> None: cls.data = np.array( - ["123 Fake St.", "1/2/2020", "nice.", "4/3/22", "abc", "333-44-2341"] + [ + "123 Fake St.", + "1/2/2020", + "nice.", + "4/3/22", + "abc", + "333-44-2341", + ] ).reshape((-1,)) cls.data_labeler = BaseDataLabeler.load_from_disk( os.path.join(default_labeler_dir, "regex_model") diff --git a/dataprofiler/tests/labelers/test_regex_model.py b/dataprofiler/tests/labelers/test_regex_model.py index 6a279307..91a2dfff 100644 --- a/dataprofiler/tests/labelers/test_regex_model.py +++ b/dataprofiler/tests/labelers/test_regex_model.py @@ -5,12 +5,12 @@ from unittest import mock import numpy as np -import pkg_resources +from dataprofiler.labelers import utils as labeler_utils from dataprofiler.labelers.regex_model import RegexModel _file_dir = os.path.dirname(os.path.abspath(__file__)) -_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers") +_resource_labeler_dir = labeler_utils.find_resources_dir() / "labelers" mock_model_parameters = { @@ -161,7 +161,8 @@ def test_param_validation(self): for invalid_param_set in invalid_parameters: with self.assertRaises(ValueError): RegexModel( - label_mapping=self.label_mapping, parameters=invalid_param_set + label_mapping=self.label_mapping, + parameters=invalid_param_set, ) @mock.patch("sys.stdout", new_callable=StringIO)