Skip to content

Commit 5ea6fd2

Browse files
authored
Merge pull request #19 from VACLab/cohort-create-yaml-schema
Expanded cohort create yaml schema
2 parents e4bde9e + 5c77ec6 commit 5ea6fd2

File tree

9 files changed

+524
-25
lines changed

9 files changed

+524
-25
lines changed

biasanalyzer/cohort_query_builder.py

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -116,12 +116,38 @@ def render_event(event):
116116
if not domain or not domain["table"]:
117117
return ""
118118

119-
base_sql = f"SELECT person_id, event_start_date, event_end_date FROM ranked_events_{event['event_type']}"
120-
conditions = [f"concept_id = {event['event_concept_id']}"]
119+
# Handle event_instance, including negative values
120+
rank_table = f"ranked_asc_{event['event_type']}"
121121
if "event_instance" in event and event["event_instance"] is not None:
122-
conditions.append(f"event_instance >= {event['event_instance']}")
122+
event_instance = int(event["event_instance"])
123+
abs_instance = abs(event_instance)
124+
if event_instance < 0:
125+
rank_table = f"ranked_desc_{event['event_type']}"
126+
instance_condition = f" AND event_instance = {abs_instance}"
127+
else:
128+
instance_condition = ""
129+
# Handle offset for cohort window
130+
offset = event.get("offset", 0)
131+
if offset == 0:
132+
adjusted_start = "event_start_date"
133+
adjusted_end = "event_end_date"
134+
else:
135+
# Apply offset to start_date for negative, end_date for positive
136+
adjusted_start = f"event_start_date - INTERVAL '{abs(offset)} days'" if offset < 0 else "event_start_date"
137+
adjusted_end = f"event_end_date + INTERVAL '{offset} days'" if offset > 0 else "event_end_date"
138+
139+
base_sql = f"""
140+
SELECT
141+
person_id,
142+
event_start_date,
143+
event_end_date,
144+
{adjusted_start} AS adjusted_start,
145+
{adjusted_end} AS adjusted_end
146+
FROM {rank_table}
147+
WHERE concept_id = {event['event_concept_id']}{instance_condition}
148+
"""
123149

124-
return f"{base_sql} WHERE {' AND '.join(conditions)}"
150+
return base_sql
125151

126152

127153
@staticmethod
@@ -163,7 +189,7 @@ def render_event_group(event_group, alias_prefix="evt"):
163189
"""
164190
# Then, union all events for qualifying person_ids
165191
combined_sql = f"""
166-
SELECT person_id, event_start_date, event_end_date
192+
SELECT person_id, event_start_date, event_end_date, adjusted_start, adjusted_end
167193
FROM (
168194
{' UNION ALL '.join(f'({q})' for q in queries)}
169195
) AS all_events
@@ -174,13 +200,15 @@ def render_event_group(event_group, alias_prefix="evt"):
174200
return combined_sql
175201

176202
elif event_group["operator"] == "OR":
177-
return f"SELECT person_id, event_start_date, event_end_date FROM ({' UNION '.join(queries)}) AS {alias_prefix}_or"
203+
return (f"SELECT person_id, event_start_date, event_end_date, adjusted_start, adjusted_end "
204+
f"FROM ({' UNION '.join(queries)}) AS {alias_prefix}_or")
178205
elif event_group["operator"] == "NOT":
179206
not_query = queries[0]
180207
# Return a query that selects all persons from a base table (e.g., person),
181208
# excluding those in the NOT subquery, while allowing dates from other criteria
182209
return f"""
183-
SELECT p.person_id, NULL AS event_start_date, NULL AS event_end_date
210+
SELECT p.person_id, NULL AS event_start_date, NULL AS event_end_date,
211+
NULL AS adjusted_start, NULL AS adjusted_end,
184212
FROM person p
185213
WHERE p.person_id NOT IN (
186214
SELECT person_id FROM ({not_query}) AS {alias_prefix}_not
@@ -199,14 +227,14 @@ def render_event_group(event_group, alias_prefix="evt"):
199227
if timestamp_event_index < non_timestamp_event_index:
200228
# timestamp needs to happen before non-timestamp event
201229
return f"""
202-
SELECT person_id, event_start_date, event_end_date
230+
SELECT person_id, event_start_date, event_end_date, adjusted_start, adjusted_end
203231
FROM ({queries[0]}) AS {alias_prefix}_0
204232
WHERE event_start_date > DATE '{timestamp}'
205233
"""
206234
else:
207235
# non-timestamp event needs to happen before timestamp
208236
return f"""
209-
SELECT person_id, event_start_date, event_end_date
237+
SELECT person_id, event_start_date, event_end_date, adjusted_start, adjusted_end
210238
FROM ({queries[0]}) AS {alias_prefix}_0
211239
WHERE event_start_date < DATE '{timestamp}'
212240
"""
@@ -218,14 +246,16 @@ def render_event_group(event_group, alias_prefix="evt"):
218246

219247
# Ensure both events contribute dates with temporal order and interval
220248
return f"""
221-
SELECT {e1_alias}.person_id, {e1_alias}.event_start_date, {e1_alias}.event_end_date
249+
SELECT {e1_alias}.person_id, {e1_alias}.event_start_date, {e1_alias}.event_end_date,
250+
{e1_alias}.adjusted_start, {e1_alias}.adjusted_end
222251
FROM ({queries[0]}) AS {e1_alias}
223252
JOIN ({queries[1]}) AS {e2_alias}
224253
ON {e1_alias}.person_id = {e2_alias}.person_id
225254
AND {e1_alias}.event_start_date < {e2_alias}.event_start_date
226255
{interval_sql}
227256
UNION ALL
228-
SELECT {e2_alias}.person_id, {e2_alias}.event_start_date, {e2_alias}.event_end_date
257+
SELECT {e2_alias}.person_id, {e2_alias}.event_start_date, {e2_alias}.event_end_date,
258+
{e2_alias}.adjusted_start, {e2_alias}.adjusted_end
229259
FROM ({queries[1]}) AS {e2_alias}
230260
JOIN ({queries[0]}) AS {e1_alias}
231261
ON {e2_alias}.person_id = {e1_alias}.person_id
@@ -277,7 +307,8 @@ def temporal_event_filter(self, event_groups, alias='c'):
277307
# events:
278308
# - event_type: drug_exposure
279309
# event_concept_id: 67890
280-
return (f"SELECT person_id, event_start_date, event_end_date FROM "
310+
return (f"SELECT person_id, event_start_date, event_end_date, "
311+
f"adjusted_start, adjusted_end FROM "
281312
f"({' UNION ALL '.join(filters)}) AS combined_events")
282313

283314
# Single event group case with operator defined

biasanalyzer/database.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ def cohort_distribution_variables(self):
210210

211211
def get_cohort_distributions(self, cohort_definition_id: int, variable: str):
212212
"""
213-
Get age distribution statistics for a cohort from the cohort table.
213+
Get distribution statistics for a cohort from the cohort table.
214214
"""
215215
try:
216216
if self._create_omop_table('person'):

biasanalyzer/sql_templates/base.sql.j2

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ domain_qualifying_events AS (
66
filtered_cohort AS (
77
SELECT c.person_id,
88
{% if temporal_events %}
9-
MIN(c.event_start_date) AS cohort_start_date,
10-
MAX(c.event_end_date) AS cohort_end_date
9+
MIN(c.adjusted_start) AS cohort_start_date,
10+
MAX(c.adjusted_end) AS cohort_end_date
1111
{% else %}
1212
MIN(all_events.event_start_date) AS cohort_start_date,
1313
MAX(all_events.event_end_date) AS cohort_end_date

biasanalyzer/sql_templates/cohort_creation_query.sql.j2

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
{% block domain_events %}
33
{% if ranked_domains %}
44
WITH
5-
{% for domain_type, domain in ranked_domains.items() %}
5+
{% for event_type, domain in ranked_domains.items() %}
66
{% if domain.table %}
7-
ranked_events_{{ domain_type }} AS (
7+
ranked_asc_{{ event_type }} AS (
88
SELECT
99
person_id,
1010
{{ domain.concept_id }} AS concept_id,
@@ -16,6 +16,18 @@ ranked_events_{{ domain_type }} AS (
1616
) AS event_instance
1717
FROM {{ domain.table }}
1818
),
19+
ranked_desc_{{ event_type }} AS (
20+
SELECT
21+
person_id,
22+
{{ domain.concept_id }} AS concept_id,
23+
{{ domain.start_date }} AS event_start_date,
24+
{{ domain.end_date }} AS event_end_date,
25+
ROW_NUMBER() OVER (
26+
PARTITION BY person_id, {{ domain.concept_id }}
27+
ORDER BY {{ domain.start_date }} DESC
28+
) AS event_instance
29+
FROM {{ domain.table }}
30+
),
1931
{% endif %}
2032
{% endfor %}
2133
{% endif %}
@@ -25,7 +37,7 @@ ranked_events_{{ domain_type }} AS (
2537
{% if inclusion_criteria.temporal_events %}
2638
{{ temporal_event_filter(inclusion_criteria.temporal_events) }}
2739
{% else %}
28-
SELECT person_id
40+
SELECT person_id, NULL AS event_start_date, NULL AS event_end_date, NULL AS adjusted_start, NULL AS adjusted_end
2941
FROM person p
3042
{% endif %}
3143
{% endblock %}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
inclusion_criteria:
2+
demographics:
3+
gender: female
4+
min_birth_year: 1970
5+
max_birth_year: 2000
6+
temporal_events:
7+
- operator: AND
8+
events:
9+
- event_type: condition_occurrence
10+
event_concept_id: 201826 # Type 2 diabetes (valid OMOP ID)
11+
event_instance: -1 # Last occurrence
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
inclusion_criteria:
2+
demographics:
3+
gender: female
4+
min_birth_year: 1970
5+
max_birth_year: 2000
6+
temporal_events:
7+
- operator: AND
8+
events:
9+
- event_type: condition_occurrence
10+
event_concept_id: 201826 # Type 2 diabetes (valid OMOP ID)
11+
event_instance: -1 # Last occurrence
12+
offset: 180 # 180 days after
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
inclusion_criteria:
2+
demographics:
3+
gender: female
4+
min_birth_year: 1970
5+
max_birth_year: 2000
6+
temporal_events:
7+
- operator: AND
8+
events:
9+
- event_type: condition_occurrence
10+
event_concept_id: 201826 # Type 2 diabetes (valid OMOP ID)
11+
offset: 180 # 180 days after
12+
- event_type: condition_occurrence
13+
event_concept_id: 201826 # Type 2 diabetes (valid OMOP ID)
14+
offset: -730 # 2 years before

tests/conftest.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,10 @@ def test_db():
123123
(2, 8532, 0, 0, 1996), -- Female, qualifying, not excluded due to not having cardiac surgery
124124
(3, 8532, 0, 0, 1996), -- Female, has cardiac surgery
125125
(4, 8507, 0, 0, 1980), -- Male, wrong gender
126-
(5, 8532, 0, 0, 1980); -- Female, missing insulin
126+
(5, 8532, 0, 0, 1980), -- Female, missing insulin
127+
-- for offset and negative instance testing
128+
(6, 8532, 0, 0, 1985), -- Female, multiple diabetes records, last one too early
129+
(7, 8532, 0, 0, 1990); -- Female, diabetes record too recent
127130
""")
128131

129132
# Insert mock concepts as needed
@@ -141,7 +144,8 @@ def test_db():
141144
(5, 'Fever', '2012-04-01', '2020-04-01', 'R50.9', 'ICD10CM', 'Condition'),
142145
(37311061, 'COVID-19', '2012-04-01', '2020-04-01', '840539006', 'SNOMED', 'Condition'),
143146
(4041664, 'Difficulty breathing', '2012-04-01', '2020-04-01', '230145002', 'SNOMED', 'Condition'),
144-
(316139, 'Heart failure', '2012-04-01', '2020-04-01', '84114007', 'SNOMED', 'Condition');
147+
(316139, 'Heart failure', '2012-04-01', '2020-04-01', '84114007', 'SNOMED', 'Condition'),
148+
(201826, 'Type 2 diabetes mellitus', '2012-04-01', '2020-04-01', '44054006', 'SNOMED', 'Condition');
145149
""")
146150

147151
# Insert hierarchical relationships as needed
@@ -163,7 +167,8 @@ def test_db():
163167
(1, 3, 1), -- Diabetes -> Type 2
164168
(1, 4, 2), -- Diabetes -> Retinopathy
165169
(2, 4, 1), -- Type 1 -> Diabetes Retinopathy
166-
(3, 4, 1); -- Type 2 -> Diabetes Retinopathy
170+
(3, 4, 1), -- Type 2 -> Diabetes Retinopathy
171+
(201826, 201826, 0); -- Type 2 diabetes SNOMED
167172
""")
168173

169174
# Insert mock condition occurrences as needed
@@ -199,7 +204,11 @@ def test_db():
199204
(2, 201826, '2020-06-01', '2020-06-01'), -- Person 2: Diabetes
200205
(3, 201826, '2020-06-01', '2020-06-01'), -- Person 3: Diabetes
201206
(4, 201826, '2020-06-01', '2020-06-01'), -- Person 4: Diabetes
202-
(5, 201826, '2020-06-01', '2020-06-01'); -- Person 5: Diabetes
207+
(5, 201826, '2020-06-01', '2020-06-01'), -- Person 5: Diabetes
208+
-- for negative event instance and offset testing
209+
(6, 201826, '2017-01-01', '2017-01-01'), -- Person 6: Early diabetes record
210+
(6, 201826, '2018-01-01', '2018-01-01'), -- Person 6: Last diabetes record, still early
211+
(7, 201826, '2023-01-01', '2023-01-01'); -- Person 7: Recent diabetes record
203212
""")
204213

205214
# Insert mock visit data
@@ -220,7 +229,10 @@ def test_db():
220229
(2, 9, 9202, '2020-06-10', '2020-06-10'), -- Person 2: Outpatient
221230
(3, 10, 9202, '2020-06-10', '2020-06-10'), -- Person 3: Outpatient
222231
(4, 11, 9202, '2020-06-10', '2020-06-10'), -- Person 4: Outpatient
223-
(5, 12, 9202, '2020-06-10', '2020-06-10'); -- Person 5: Outpatient
232+
(5, 12, 9202, '2020-06-10', '2020-06-10'), -- Person 5: Outpatient
233+
-- New patients (no visits needed for exclusion testing)
234+
(6, 13, 9202, '2018-01-10', '2018-01-10'), -- Person 6: Outpatient
235+
(7, 14, 9202, '2023-01-10', '2023-01-10'); -- Person 7: Outpatient
224236
""")
225237

226238
# Insert mock procedure_occurrence data for mixed domain testing
@@ -234,7 +246,9 @@ def test_db():
234246
(3, 3, 4048609, '2020-06-20'), -- Person 3: Blood test
235247
(3, 4, 619339, '2020-06-25'), -- Person 3: Cardiac surgery (exclusion)
236248
(4, 5, 4048609, '2020-06-20'), -- Person 4: Blood test
237-
(5, 6, 4048609, '2020-06-20'); -- Person 5: Blood test
249+
(5, 6, 4048609, '2020-06-20'), -- Person 5: Blood test
250+
(6, 7, 4048609, '2018-01-15'), -- Person 6: Blood test
251+
(7, 8, 4048609, '2023-01-15'); -- Person 7: Blood test
238252
""")
239253

240254
# Insert mock procedure_occurrence data for mixed domain testing
@@ -246,7 +260,9 @@ def test_db():
246260
(1, 4285892, '2020-06-15', '2020-06-15'), -- Person 1: Insulin 14 days after
247261
(2, 4285892, '2020-06-15', '2020-06-15'), -- Person 2: Insulin
248262
(3, 4285892, '2020-06-15', '2020-06-15'), -- Person 3: Insulin
249-
(4, 4285892, '2020-06-15', '2020-06-15'); -- Person 4: Insulin
263+
(4, 4285892, '2020-06-15', '2020-06-15'), -- Person 4: Insulin
264+
(6, 4285892, '2018-01-20', '2018-01-20'), -- Person 6: Insulin
265+
(7, 4285892, '2023-01-20', '2023-01-20'); -- Person 7: Insulin
250266
-- Person 5: No insulin
251267
""")
252268

0 commit comments

Comments
 (0)