From eda3ae5c7f4726cf83518f973cdc12976a88674a Mon Sep 17 00:00:00 2001 From: hyi Date: Thu, 11 Dec 2025 22:35:54 -0500 Subject: [PATCH 1/2] minor changes to make the tool work with real data --- biasanalyzer/database.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/biasanalyzer/database.py b/biasanalyzer/database.py index 5ae62fb..44e75d1 100644 --- a/biasanalyzer/database.py +++ b/biasanalyzer/database.py @@ -99,7 +99,7 @@ def _create_cohort_definition_table(self): def _create_cohort_table(self): self.conn.execute(f""" CREATE TABLE IF NOT EXISTS {self.schema}.cohort ( - subject_id BIGINT, + subject_id VARCHAR NOT NULL, cohort_definition_id INTEGER, cohort_start_date DATE, cohort_end_date DATE, @@ -288,12 +288,14 @@ def get_cohort_concept_stats( ) concept_stats[concept_type] = self._execute_query(query) cs_df = pd.DataFrame(concept_stats[concept_type]) - # Combine concept_name and prevalence into a "details" column - cs_df["details"] = cs_df.apply( - lambda row: f"{row['concept_name']} (Code: {row['concept_code']}, " - f"Count: {row['count_in_cohort']}, Prevalence: {row['prevalence']:.3%})", - axis=1, - ) + + if not cs_df.empty: + # Combine concept_name and prevalence into a "details" column + cs_df["details"] = cs_df.apply( + lambda row: f"{row['concept_name']} (Code: {row['concept_code']}, " + f"Count: {row['count_in_cohort']}, Prevalence: {row['prevalence']:.3%})", + axis=1, + ) if print_concept_hierarchy: filtered_cs_df = cs_df[cs_df["ancestor_concept_id"] != cs_df["descendant_concept_id"]] From 4a296b418415f7936a993679e7d3c035a8d5c679 Mon Sep 17 00:00:00 2001 From: hyi Date: Thu, 11 Dec 2025 23:23:53 -0500 Subject: [PATCH 2/2] fixed tests --- biasanalyzer/cohort.py | 5 +++- tests/query_based/test_cohort_creation.py | 28 +++++++++---------- .../test_hierarchical_prevalence.py | 6 ++-- 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/biasanalyzer/cohort.py b/biasanalyzer/cohort.py index 4d5af20..979eac5 100644 --- a/biasanalyzer/cohort.py +++ b/biasanalyzer/cohort.py @@ -10,7 +10,7 @@ from biasanalyzer.concept import ConceptHierarchy from biasanalyzer.config import load_cohort_creation_config from biasanalyzer.database import BiasDatabase, OMOPCDMDatabase -from biasanalyzer.models import CohortDefinition +from biasanalyzer.models import CohortDefinition, DOMAIN_MAPPING from biasanalyzer.utils import clean_string, hellinger_distance, notify_users @@ -59,6 +59,9 @@ def get_concept_stats( """ Get cohort concept statistics such as concept prevalence """ + if concept_type not in DOMAIN_MAPPING: + raise ValueError(f'input concept_type {concept_type} is not a valid concept type to get concept stats') + cohort_stats = self.bias_db.get_cohort_concept_stats( self.cohort_id, self.query_builder, diff --git a/tests/query_based/test_cohort_creation.py b/tests/query_based/test_cohort_creation.py index 1bc3226..45c349c 100644 --- a/tests/query_based/test_cohort_creation.py +++ b/tests/query_based/test_cohort_creation.py @@ -86,10 +86,10 @@ def test_cohort_creation_baseline(caplog, test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 5) - assert_equal(patient_ids, {106, 108, 110, 111, 112}) + assert_equal(patient_ids, {'106', '108', '110', '111', '112'}) # select two patients to check for cohort_start_date and cohort_end_date automatically computed - patient_106 = next(item for item in cohort.data if item["subject_id"] == 106) - patient_108 = next(item for item in cohort.data if item["subject_id"] == 108) + patient_106 = next(item for item in cohort.data if item["subject_id"] == '106') + patient_108 = next(item for item in cohort.data if item["subject_id"] == '108') # Replace dates with actual values from your test data assert_equal( @@ -127,7 +127,7 @@ def test_cohort_creation_study(test_db): assert cohort.data is not None, "Cohort creation wrongly returned None data" patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 4) - assert_equal(patient_ids, {108, 110, 111, 112}) + assert_equal(patient_ids, {'108', '110', '111', '112'}) def test_cohort_creation_study2(caplog, test_db): @@ -155,7 +155,7 @@ def test_cohort_creation_study2(caplog, test_db): assert cohort.data is not None, "Cohort creation wrongly returned None data" patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 1) - assert_equal(patient_ids, {106}) + assert_equal(patient_ids, {'106'}) def test_cohort_creation_all(caplog, test_db): @@ -191,7 +191,7 @@ def test_cohort_creation_all(caplog, test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) print(f"patient_ids: {patient_ids}", flush=True) assert_equal(len(patient_ids), 2) - assert_equal(patient_ids, {108, 110}) + assert_equal(patient_ids, {'108', '110'}) def test_cohort_creation_multiple_temporary_groups_with_no_operator(test_db): @@ -214,7 +214,7 @@ def test_cohort_creation_multiple_temporary_groups_with_no_operator(test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) print(f"patient_ids: {patient_ids}", flush=True) assert_equal(len(patient_ids), 2) - assert_equal(patient_ids, {108, 110}) + assert_equal(patient_ids, {'108', '110'}) def test_cohort_creation_mixed_domains(test_db): @@ -242,7 +242,7 @@ def test_cohort_creation_mixed_domains(test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) print(f"patient_ids: {patient_ids}", flush=True) assert_equal(len(patient_ids), 3) - assert_equal(patient_ids, {1, 2, 6}) + assert_equal(patient_ids, {'1', '2', '6'}) start_dates = [item["cohort_start_date"] for item in cohort.data] assert_equal(len(start_dates), 3) assert_equal(start_dates, [datetime.date(2020, 6, 1), datetime.date(2020, 6, 1), datetime.date(2018, 1, 1)]) @@ -356,10 +356,10 @@ def test_cohort_creation_negative_instance(test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 6) # Female patients 1, 2, 3, 5 - assert_equal(patient_ids, {1, 2, 3, 5, 6, 7}) + assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'}) # Verify dates for a specific patient (e.g., patient 1 with last diabetes diagnosis) - patient_1 = next(item for item in cohort.data if item["subject_id"] == 1) + patient_1 = next(item for item in cohort.data if item["subject_id"] == '1') assert_equal( patient_1["cohort_start_date"], datetime.date(2020, 6, 1), @@ -392,10 +392,10 @@ def test_cohort_creation_offset(test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 6) # Female patients 1, 2, 3, 5 - assert_equal(patient_ids, {1, 2, 3, 5, 6, 7}) + assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'}) # Verify dates for a specific patient (e.g., patient 1 with offset) - patient_1 = next(item for item in cohort.data if item["subject_id"] == 1) + patient_1 = next(item for item in cohort.data if item["subject_id"] == '1') # Diabetes on 2020-06-01: -730 days = 2018-06-02, +180 days = 2020-11-28 assert_equal( patient_1["cohort_start_date"], @@ -435,10 +435,10 @@ def test_cohort_creation_negative_instance_offset(test_db): patient_ids = set([item["subject_id"] for item in cohort.data]) assert_equal(len(patient_ids), 6) - assert_equal(patient_ids, {1, 2, 3, 5, 6, 7}) + assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'}) # Verify dates for a specific patient (e.g., patient 1 with last diabetes and offset) - patient_1 = next(item for item in cohort.data if item["subject_id"] == 1) + patient_1 = next(item for item in cohort.data if item["subject_id"] == '1') # Last diabetes on 2020-06-01: +180 days = 2020-11-28 assert_equal( patient_1["cohort_start_date"], diff --git a/tests/query_based/test_hierarchical_prevalence.py b/tests/query_based/test_hierarchical_prevalence.py index 41ab068..4541883 100644 --- a/tests/query_based/test_hierarchical_prevalence.py +++ b/tests/query_based/test_hierarchical_prevalence.py @@ -1,4 +1,6 @@ import pytest +from numpy.ma.testutils import assert_equal + from biasanalyzer.concept import ConceptHierarchy @@ -25,8 +27,8 @@ def test_cohort_concept_hierarchical_prevalence(test_db, caplog): cohort.get_concept_stats(vocab="dummy_invalid_vocab") # test the cohort does not have procedure_occurrence related concepts - with pytest.raises(ValueError): - cohort.get_concept_stats(concept_type="procedure_occurrence") + cohort_stat, _ = cohort.get_concept_stats(concept_type="procedure_occurrence") + assert_equal(cohort_stat, {'procedure_occurrence': []}) concept_stats, _ = cohort.get_concept_stats(vocab="ICD10CM", print_concept_hierarchy=True) assert concept_stats is not None, "Failed to fetch concept stats"