Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions examples/tutorial/tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,15 @@


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="""
parser = argparse.ArgumentParser(description="""
This script links two very small example datasets that live in the data
subdirectory. It reads in the tutorial_config.toml configuration file
and runs hlink's preprocessing and matching steps to find some potential
matches between the two datasets.

For a detailed walkthrough of the tutorial, please see the README.md
file in the same directory as this script.
"""
)
""")

parser.add_argument(
"--clean", action="store_true", help="drop existing Spark tables on startup"
Expand Down
1 change: 0 additions & 1 deletion hlink/linking/core/column_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ def transform_reverse(input_col: Column, transform: Mapping[str, Any], context:
)
from pyspark.sql.types import LongType


ColumnMappingTransform: TypeAlias = Callable[
[Column, Mapping[str, Any], Mapping[str, Any]], Column
]
Expand Down
1 change: 0 additions & 1 deletion hlink/linking/hh_matching/link_step_block_on_households.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from hlink.linking.link_step import LinkStep
from hlink.linking.util import set_job_description


logger = logging.getLogger(__name__)


Expand Down
16 changes: 7 additions & 9 deletions hlink/linking/model_exploration/link_step_train_test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,11 +843,11 @@ def _aggregate_per_threshold_results(
mcc = [r.mcc for r in prediction_results if not math.isnan(r.mcc)]
f_measure = [r.f_measure for r in prediction_results if not math.isnan(r.f_measure)]

(precision_mean, precision_sd) = _compute_mean_and_stdev(precision)
(recall_mean, recall_sd) = _compute_mean_and_stdev(recall)
(pr_auc_mean, pr_auc_sd) = _compute_mean_and_stdev(pr_auc)
(mcc_mean, mcc_sd) = _compute_mean_and_stdev(mcc)
(f_measure_mean, f_measure_sd) = _compute_mean_and_stdev(f_measure)
precision_mean, precision_sd = _compute_mean_and_stdev(precision)
recall_mean, recall_sd = _compute_mean_and_stdev(recall)
pr_auc_mean, pr_auc_sd = _compute_mean_and_stdev(pr_auc)
mcc_mean, mcc_sd = _compute_mean_and_stdev(mcc)
f_measure_mean, f_measure_sd = _compute_mean_and_stdev(f_measure)

new_desc = pd.DataFrame(
{
Expand Down Expand Up @@ -962,17 +962,15 @@ def _handle_param_grid_attribute(training_settings: dict[str, Any]) -> dict[str,
def _get_model_parameters(training_settings: dict[str, Any]) -> list[dict[str, Any]]:
if "param_grid" in training_settings:
print(
dedent(
"""\
dedent("""\
Deprecation Warning: training.param_grid is deprecated.

Please use training.model_parameter_search instead by replacing

`param_grid = True` with `model_parameter_search = {strategy = "grid"}` or
`param_grid = False` with `model_parameter_search = {strategy = "explicit"}`

[deprecated_in_version=4.0.0]"""
),
[deprecated_in_version=4.0.0]"""),
file=sys.stderr,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,7 @@ def _create_training_features(self):
dep_var = config[training_conf]["dependent_var"]
if training_conf == "hh_training":
hh_col = config[training_conf].get("hh_col", "serialp")
tdl = self.task.spark.sql(
f"""SELECT
tdl = self.task.spark.sql(f"""SELECT
td.{id_col}_a,
td.{id_col}_b,
td.{dep_var},
Expand All @@ -57,8 +56,7 @@ def _create_training_features(self):
left join
prepped_df_b pdfb
on pdfb.{id_col} = td.{id_col}_b
"""
)
""")
else:
tdl = self.task.spark.table(f"{table_prefix}training_data").select(
f"{id_col}_a", f"{id_col}_b", dep_var
Expand Down
1 change: 0 additions & 1 deletion hlink/linking/util.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from contextlib import contextmanager
from math import ceil


MIN_PARTITIONS = 200
MAX_PARTITIONS = 10000

Expand Down
1 change: 0 additions & 1 deletion hlink/tests/core/column_mapping_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from hlink.linking.core.column_mapping import apply_transform, select_column_mapping


TEST_DF_1 = pd.DataFrame(
{
"id": [0, 1, 2, 3, 4, 5],
Expand Down
6 changes: 2 additions & 4 deletions hlink/tests/core/substitutions_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,11 @@ def test_load_substitutions(tmp_path: Path) -> None:

def test_generate_substitutions(spark: SparkSession, tmp_path: Path) -> None:
tmp_file = tmp_path / "substitutions.csv"
tmp_file.write_text(
"""rose,rosie
tmp_file.write_text("""rose,rosie
sophia,sophy
sophia,sofia
amanda,mandy
jane,jean"""
)
jane,jean""")

df = spark.createDataFrame(
[("agnes", 2), ("mandy", 2), ("sophy", 2), ("rosie", 2), ("jean", 1)],
Expand Down
1 change: 0 additions & 1 deletion hlink/tests/core/transforms_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from hlink.linking.core.transforms import apply_transform, generate_transforms
from hlink.linking.link_task import LinkTask


ignore_apply_transform_dep_warning = pytest.mark.filterwarnings(
r"ignore:\s*This is a deprecated alias for hlink.linking.core.column_mapping.apply_transform"
)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "hlink"
version = "4.2.1"
version = "4.2.2"
description = "Fast supervised pyspark record linkage software"
readme = "README.md"
requires-python = ">=3.10"
Expand Down
2 changes: 1 addition & 1 deletion sphinx-docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
The format of this changelog is based on [Keep A Changelog][keep-a-changelog].
Hlink adheres to semantic versioning as much as possible.

## Not Yet Released
## v4.2.2 (2026-01-20)

### Added

Expand Down