From 4062dfd61e332b6bd07679850390b8d574ebe255 Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 20 Jan 2026 15:19:53 +0000 Subject: [PATCH 1/2] Bump the version to 4.2.2 --- pyproject.toml | 2 +- sphinx-docs/changelog.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 725717e..f5ca849 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "hlink" -version = "4.2.1" +version = "4.2.2" description = "Fast supervised pyspark record linkage software" readme = "README.md" requires-python = ">=3.10" diff --git a/sphinx-docs/changelog.md b/sphinx-docs/changelog.md index 427a9c7..b87f89b 100644 --- a/sphinx-docs/changelog.md +++ b/sphinx-docs/changelog.md @@ -3,7 +3,7 @@ The format of this changelog is based on [Keep A Changelog][keep-a-changelog]. Hlink adheres to semantic versioning as much as possible. -## Not Yet Released +## v4.2.2 (2026-01-20) ### Added From adaeef8f129cf23c9600a5477024780692b0cd7b Mon Sep 17 00:00:00 2001 From: rileyh Date: Tue, 20 Jan 2026 15:30:04 +0000 Subject: [PATCH 2/2] Reformat with black v26 --- examples/tutorial/tutorial.py | 6 ++---- hlink/linking/core/column_mapping.py | 1 - .../hh_matching/link_step_block_on_households.py | 1 - .../link_step_train_test_models.py | 16 +++++++--------- .../link_step_create_comparison_features.py | 6 ++---- hlink/linking/util.py | 1 - hlink/tests/core/column_mapping_test.py | 1 - hlink/tests/core/substitutions_test.py | 6 ++---- hlink/tests/core/transforms_test.py | 1 - 9 files changed, 13 insertions(+), 26 deletions(-) diff --git a/examples/tutorial/tutorial.py b/examples/tutorial/tutorial.py index d52fc2b..47415ac 100644 --- a/examples/tutorial/tutorial.py +++ b/examples/tutorial/tutorial.py @@ -9,8 +9,7 @@ def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description=""" + parser = argparse.ArgumentParser(description=""" This script links two very small example datasets that live in the data subdirectory. It reads in the tutorial_config.toml configuration file and runs hlink's preprocessing and matching steps to find some potential @@ -18,8 +17,7 @@ def parse_args() -> argparse.Namespace: For a detailed walkthrough of the tutorial, please see the README.md file in the same directory as this script. - """ - ) + """) parser.add_argument( "--clean", action="store_true", help="drop existing Spark tables on startup" diff --git a/hlink/linking/core/column_mapping.py b/hlink/linking/core/column_mapping.py index f9506ba..ba58f97 100755 --- a/hlink/linking/core/column_mapping.py +++ b/hlink/linking/core/column_mapping.py @@ -79,7 +79,6 @@ def transform_reverse(input_col: Column, transform: Mapping[str, Any], context: ) from pyspark.sql.types import LongType - ColumnMappingTransform: TypeAlias = Callable[ [Column, Mapping[str, Any], Mapping[str, Any]], Column ] diff --git a/hlink/linking/hh_matching/link_step_block_on_households.py b/hlink/linking/hh_matching/link_step_block_on_households.py index 738af50..f1a75b3 100644 --- a/hlink/linking/hh_matching/link_step_block_on_households.py +++ b/hlink/linking/hh_matching/link_step_block_on_households.py @@ -10,7 +10,6 @@ from hlink.linking.link_step import LinkStep from hlink.linking.util import set_job_description - logger = logging.getLogger(__name__) diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py index 26137d3..a3f9aa3 100644 --- a/hlink/linking/model_exploration/link_step_train_test_models.py +++ b/hlink/linking/model_exploration/link_step_train_test_models.py @@ -843,11 +843,11 @@ def _aggregate_per_threshold_results( mcc = [r.mcc for r in prediction_results if not math.isnan(r.mcc)] f_measure = [r.f_measure for r in prediction_results if not math.isnan(r.f_measure)] - (precision_mean, precision_sd) = _compute_mean_and_stdev(precision) - (recall_mean, recall_sd) = _compute_mean_and_stdev(recall) - (pr_auc_mean, pr_auc_sd) = _compute_mean_and_stdev(pr_auc) - (mcc_mean, mcc_sd) = _compute_mean_and_stdev(mcc) - (f_measure_mean, f_measure_sd) = _compute_mean_and_stdev(f_measure) + precision_mean, precision_sd = _compute_mean_and_stdev(precision) + recall_mean, recall_sd = _compute_mean_and_stdev(recall) + pr_auc_mean, pr_auc_sd = _compute_mean_and_stdev(pr_auc) + mcc_mean, mcc_sd = _compute_mean_and_stdev(mcc) + f_measure_mean, f_measure_sd = _compute_mean_and_stdev(f_measure) new_desc = pd.DataFrame( { @@ -962,8 +962,7 @@ def _handle_param_grid_attribute(training_settings: dict[str, Any]) -> dict[str, def _get_model_parameters(training_settings: dict[str, Any]) -> list[dict[str, Any]]: if "param_grid" in training_settings: print( - dedent( - """\ + dedent("""\ Deprecation Warning: training.param_grid is deprecated. Please use training.model_parameter_search instead by replacing @@ -971,8 +970,7 @@ def _get_model_parameters(training_settings: dict[str, Any]) -> list[dict[str, A `param_grid = True` with `model_parameter_search = {strategy = "grid"}` or `param_grid = False` with `model_parameter_search = {strategy = "explicit"}` - [deprecated_in_version=4.0.0]""" - ), + [deprecated_in_version=4.0.0]"""), file=sys.stderr, ) diff --git a/hlink/linking/training/link_step_create_comparison_features.py b/hlink/linking/training/link_step_create_comparison_features.py index 7e73575..b9727d1 100644 --- a/hlink/linking/training/link_step_create_comparison_features.py +++ b/hlink/linking/training/link_step_create_comparison_features.py @@ -42,8 +42,7 @@ def _create_training_features(self): dep_var = config[training_conf]["dependent_var"] if training_conf == "hh_training": hh_col = config[training_conf].get("hh_col", "serialp") - tdl = self.task.spark.sql( - f"""SELECT + tdl = self.task.spark.sql(f"""SELECT td.{id_col}_a, td.{id_col}_b, td.{dep_var}, @@ -57,8 +56,7 @@ def _create_training_features(self): left join prepped_df_b pdfb on pdfb.{id_col} = td.{id_col}_b - """ - ) + """) else: tdl = self.task.spark.table(f"{table_prefix}training_data").select( f"{id_col}_a", f"{id_col}_b", dep_var diff --git a/hlink/linking/util.py b/hlink/linking/util.py index f239be7..cd80d4d 100644 --- a/hlink/linking/util.py +++ b/hlink/linking/util.py @@ -1,7 +1,6 @@ from contextlib import contextmanager from math import ceil - MIN_PARTITIONS = 200 MAX_PARTITIONS = 10000 diff --git a/hlink/tests/core/column_mapping_test.py b/hlink/tests/core/column_mapping_test.py index 3653a57..a4f47cc 100644 --- a/hlink/tests/core/column_mapping_test.py +++ b/hlink/tests/core/column_mapping_test.py @@ -5,7 +5,6 @@ from hlink.linking.core.column_mapping import apply_transform, select_column_mapping - TEST_DF_1 = pd.DataFrame( { "id": [0, 1, 2, 3, 4, 5], diff --git a/hlink/tests/core/substitutions_test.py b/hlink/tests/core/substitutions_test.py index 043d70c..6c122ef 100644 --- a/hlink/tests/core/substitutions_test.py +++ b/hlink/tests/core/substitutions_test.py @@ -24,13 +24,11 @@ def test_load_substitutions(tmp_path: Path) -> None: def test_generate_substitutions(spark: SparkSession, tmp_path: Path) -> None: tmp_file = tmp_path / "substitutions.csv" - tmp_file.write_text( - """rose,rosie + tmp_file.write_text("""rose,rosie sophia,sophy sophia,sofia amanda,mandy - jane,jean""" - ) + jane,jean""") df = spark.createDataFrame( [("agnes", 2), ("mandy", 2), ("sophy", 2), ("rosie", 2), ("jean", 1)], diff --git a/hlink/tests/core/transforms_test.py b/hlink/tests/core/transforms_test.py index 08b8e82..141a119 100644 --- a/hlink/tests/core/transforms_test.py +++ b/hlink/tests/core/transforms_test.py @@ -5,7 +5,6 @@ from hlink.linking.core.transforms import apply_transform, generate_transforms from hlink.linking.link_task import LinkTask - ignore_apply_transform_dep_warning = pytest.mark.filterwarnings( r"ignore:\s*This is a deprecated alias for hlink.linking.core.column_mapping.apply_transform" )