Skip to content

Commit a39d4f5

Browse files
authored
Improve memory usage and logging info by moving the encoding step (#113)
* Move encoding step to save memory during analysis * Version 2.2.0
1 parent 7efc8b5 commit a39d4f5

File tree

11 files changed

+246
-113
lines changed

11 files changed

+246
-113
lines changed

clarite/modules/analyze/association_study.py

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import click
44
import pandas as pd
5-
from pandas_genomics import GenotypeDtype
65

76
from clarite.modules.analyze import regression
87
from clarite.modules.analyze.regression import (
@@ -43,10 +42,6 @@ def association_study(
4342
This can be 'glm', 'weighted_glm', or 'r_survey' for built-in Regression types,
4443
or a custom subclass of Regression. If None, it is set to 'glm' if a survey design is not specified
4544
and 'weighted_glm' if it is.
46-
encoding: str, default "additive"
47-
Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'weighted'}
48-
edge_encoding_info: Optional pd.DataFrame, default None
49-
If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encodings.
5045
kwargs: Keyword arguments specific to the Regression being used
5146
5247
Returns
@@ -55,34 +50,6 @@ def association_study(
5550
Association Study results DataFrame with at least these columns: ['N', 'pvalue', 'error', 'warnings'].
5651
Indexed by the outcome variable and the variable being assessed in each regression
5752
"""
58-
# Copy data to avoid modifying the original, in case it is changed
59-
data = data.copy(deep=True)
60-
61-
# Encode any genotype data
62-
has_genotypes = False
63-
for dt in data.dtypes:
64-
if GenotypeDtype.is_dtype(dt):
65-
has_genotypes = True
66-
break
67-
if has_genotypes:
68-
if encoding == "additive":
69-
data = data.genomics.encode_additive()
70-
elif encoding == "dominant":
71-
data = data.genomics.encode_dominant()
72-
elif encoding == "recessive":
73-
data = data.genomics.encode_recessive()
74-
elif encoding == "codominant":
75-
data = data.genomics.encode_codominant()
76-
elif encoding == "edge":
77-
if edge_encoding_info is None:
78-
raise ValueError(
79-
"'edge_encoding_info' must be provided when using edge encoding"
80-
)
81-
else:
82-
data = data.genomics.encode_edge(edge_encoding_info)
83-
else:
84-
raise ValueError(f"Genotypes provided with unknown 'encoding': {encoding}")
85-
8653
# Ensure outcome, covariates, and regression variables are lists
8754
if isinstance(outcomes, str):
8855
outcomes = [

clarite/modules/analyze/interaction_study.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def interaction_study(
1616
edge_encoding_info: Optional[pd.DataFrame] = None,
1717
report_betas: bool = False,
1818
min_n: int = 200,
19-
process_num: Optional[int] = None
19+
process_num: Optional[int] = None,
2020
):
2121
"""Perform LRT tests comparing a model with interaction terms to one without.
2222
@@ -110,7 +110,7 @@ def interaction_study(
110110
min_n=min_n,
111111
interactions=interactions,
112112
report_betas=report_betas,
113-
process_num=process_num
113+
process_num=process_num,
114114
)
115115
print(regression)
116116

clarite/modules/analyze/regression/base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ def __init__(
3939
regression_variables: List[str],
4040
covariates: Optional[List[str]] = None,
4141
):
42+
# Copy the data to avoid changing the original. The copy will be modified in-place.
43+
data = data.copy()
4244
# Print a warning if there are any empty categories and remove them
4345
# This is done to distinguish from those that become missing during analysis (and could be an issue)
4446
empty_categories = _remove_empty_categories(data)
@@ -105,7 +107,7 @@ def _validate_regression_params(self, regression_variables):
105107
types = _get_dtypes(self.data)
106108
rv_types = {v: t for v, t in types.iteritems() if v in regression_variables}
107109
rv_count = 0
108-
for dtype in ["binary", "categorical", "continuous"]:
110+
for dtype in ["binary", "categorical", "continuous", "genotypes"]:
109111
self.regression_variables[dtype] = [
110112
v for v, t in rv_types.items() if t == dtype
111113
]

clarite/modules/analyze/regression/glm_regression.py

Lines changed: 77 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@
99
import patsy
1010
import scipy
1111
import statsmodels.api as sm
12+
from pandas_genomics import GenotypeDtype
1213
from scipy.stats import stats
1314

14-
from clarite.internal.utilities import _remove_empty_categories
15+
from clarite.internal.utilities import _remove_empty_categories, _get_dtypes
1516

1617
from .base import Regression
1718
from ..utils import fix_names, statsmodels_var_regex
@@ -59,10 +60,16 @@ class GLMRegression(Regression):
5960
False by default.
6061
If True, numeric data will be standardized using z-scores before regression.
6162
This will affect the beta values and standard error, but not the pvalues.
63+
encoding: str, default "additive"
64+
Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'edge'}
65+
edge_encoding_info: Optional pd.DataFrame, default None
66+
If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encodings.
6267
process_num: Optional[int]
6368
Number of processes to use when running the analysis, default is None (use the number of cores)
6469
"""
6570

71+
KNOWN_ENCODINGS = {"additive", "dominant", "recessive", "codominant", "edge"}
72+
6673
def __init__(
6774
self,
6875
data: pd.DataFrame,
@@ -72,6 +79,8 @@ def __init__(
7279
min_n: int = 200,
7380
report_categorical_betas: bool = False,
7481
standardize_data: bool = False,
82+
encoding: str = "additive",
83+
edge_encoding_info: Optional[pd.DataFrame] = None,
7584
process_num: Optional[int] = None,
7685
):
7786
# base class init
@@ -91,6 +100,15 @@ def __init__(
91100
if process_num is None:
92101
process_num = multiprocessing.cpu_count()
93102
self.process_num = process_num
103+
if encoding not in self.KNOWN_ENCODINGS:
104+
raise ValueError(f"Genotypes provided with unknown 'encoding': {encoding}")
105+
elif encoding == "edge" and edge_encoding_info is None:
106+
raise ValueError(
107+
"'edge_encoding_info' must be provided when using edge encoding"
108+
)
109+
else:
110+
self.encoding = encoding
111+
self.edge_encoding_info = edge_encoding_info
94112

95113
# Ensure the data output type is compatible
96114
# Set 'self.family' and 'self.use_t' which are dependent on the outcome dtype
@@ -316,6 +334,28 @@ def _run_categorical(
316334
"Diff_AIC": est.aic - est_restricted.aic,
317335
}
318336

337+
def _get_rv_specific_data(self, rv: str):
    """Return a copy of the columns needed to regress on ``rv``, encoding genotypes if present.

    The selected columns are ``rv``, the outcome variable, and the covariates.
    When any selected column has a genotype dtype, the subset is encoded
    according to ``self.encoding`` before being returned.
    """
    columns = [rv, self.outcome_variable] + self.covariates
    subset = self.data[columns].copy()

    # Nothing to encode unless at least one selected column holds genotypes
    if not any(GenotypeDtype.is_dtype(dt) for dt in subset.dtypes):
        return subset

    # Encodings that take no extra arguments map directly onto an accessor method
    for name in ("additive", "dominant", "recessive", "codominant"):
        if self.encoding == name:
            return getattr(subset.genomics, "encode_" + name)()

    # Edge encoding additionally needs the stored encoding information
    if self.encoding == "edge":
        return subset.genomics.encode_edge(self.edge_encoding_info)

    # Any other encoding value leaves the subset unencoded
    return subset
358+
319359
def run(self):
320360
"""Run a regression object, returning the results and logging any warnings/errors"""
321361
for rv_type, rv_list in self.regression_variables.items():
@@ -330,24 +370,37 @@ def run(self):
330370
)
331371
)
332372

333-
with multiprocessing.Pool(processes=self.process_num) as pool:
334-
run_result = pool.starmap(
335-
self._run_rv,
336-
zip(
337-
rv_list,
338-
repeat(rv_type),
339-
[
340-
self.data[[rv, self.outcome_variable] + self.covariates]
341-
for rv in rv_list
342-
],
343-
repeat(self.outcome_variable),
344-
repeat(self.covariates),
345-
repeat(self.min_n),
346-
repeat(self.family),
347-
repeat(self.use_t),
348-
repeat(self.report_categorical_betas),
349-
),
350-
)
373+
if self.process_num == 1:
374+
run_result = [
375+
self._run_rv(
376+
rv,
377+
rv_type,
378+
self._get_rv_specific_data(rv),
379+
self.outcome_variable,
380+
self.covariates,
381+
self.min_n,
382+
self.family,
383+
self.use_t,
384+
self.report_categorical_betas,
385+
)
386+
for rv in rv_list
387+
]
388+
else:
389+
with multiprocessing.Pool(processes=self.process_num) as pool:
390+
run_result = pool.starmap(
391+
self._run_rv,
392+
zip(
393+
rv_list,
394+
repeat(rv_type),
395+
[self._get_rv_specific_data(rv) for rv in rv_list],
396+
repeat(self.outcome_variable),
397+
repeat(self.covariates),
398+
repeat(self.min_n),
399+
repeat(self.family),
400+
repeat(self.use_t),
401+
repeat(self.report_categorical_betas),
402+
),
403+
)
351404

352405
for rv, rv_result in zip(rv_list, run_result):
353406
results, warnings, error = rv_result
@@ -424,6 +477,11 @@ def _run_rv(
424477
# Apply the complete_case_mask to the data to ensure categorical models use the same data in the LRT
425478
data = data.loc[complete_case_mask]
426479

480+
# Update rv_type to the encoded type if it is a genotype
481+
if rv_type == "genotypes":
482+
"""Need to update with encoded type"""
483+
rv_type = _get_dtypes(data[rv])[rv]
484+
427485
# Run Regression
428486
if rv_type == "continuous":
429487
result = cls.get_default_result_dict(rv)

clarite/modules/analyze/regression/interaction_regression.py

Lines changed: 76 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import patsy
99
import scipy
1010
import statsmodels.api as sm
11+
from pandas_genomics import GenotypeDtype
1112

1213
from clarite.internal.utilities import _remove_empty_categories
1314
from . import GLMRegression
@@ -45,6 +46,10 @@ class InteractionRegression(GLMRegression):
4546
False by default.
4647
If True, the results will contain one row for each interaction term and will include the beta value
4748
for that term. The number of terms increases with the number of categories in each interacting term.
49+
encoding: str, default "additive"
50+
Encoding method to use for any genotype data. One of {'additive', 'dominant', 'recessive', 'codominant', or 'edge'}
51+
edge_encoding_info: Optional pd.DataFrame, default None
52+
If edge encoding is used, this must be provided. See Pandas-Genomics documentation on edge encodings.
4853
process_num: Optional[int]
4954
Number of processes to use when running the analysis, default is None (use the number of cores)
5055
@@ -58,6 +63,8 @@ def __init__(
5863
min_n=200,
5964
interactions=None,
6065
report_betas=False,
66+
encoding: str = "additive",
67+
edge_encoding_info: Optional[pd.DataFrame] = None,
6168
process_num: Optional[int] = None,
6269
):
6370
# base class init
@@ -71,6 +78,8 @@ def __init__(
7178
outcome_variable=outcome_variable,
7279
covariates=covariates,
7380
regression_variables=regression_variables,
81+
encoding=encoding,
82+
edge_encoding_info=edge_encoding_info,
7483
min_n=min_n,
7584
)
7685

@@ -86,11 +95,10 @@ def __init__(
8695

8796
def _process_interactions(self, interactions):
8897
"""Validate the interactions parameter and save it as a list of string tuples"""
89-
regression_var_list = (
90-
self.regression_variables["binary"]
91-
+ self.regression_variables["categorical"]
92-
+ self.regression_variables["continuous"]
93-
)
98+
regression_var_list = []
99+
for var_list in self.regression_variables.values():
100+
regression_var_list.extend(var_list)
101+
94102
if len(regression_var_list) < 2:
95103
raise ValueError(
96104
f"Not enough valid variables for running interactions: {len(regression_var_list)} variables"
@@ -212,6 +220,35 @@ def _run_interaction_regression(
212220
# Did not converge - nothing to update
213221
yield dict()
214222

223+
def _get_interaction_specific_data(self, interaction: Tuple[str, str]):
    """Return a copy of the columns needed to test ``interaction``, encoding genotypes if present.

    The selected columns are the two interacting variables, the outcome
    variable, and the covariates. When any selected column has a genotype
    dtype, the subset is encoded according to ``self.encoding`` before being
    returned.
    """
    columns = list(interaction) + [self.outcome_variable] + self.covariates
    subset = self.data[columns].copy()

    # Nothing to encode unless at least one selected column holds genotypes
    if not any(GenotypeDtype.is_dtype(dt) for dt in subset.dtypes):
        return subset

    # Encodings that take no extra arguments map directly onto an accessor method
    for name in ("additive", "dominant", "recessive", "codominant"):
        if self.encoding == name:
            return getattr(subset.genomics, "encode_" + name)()

    # Edge encoding additionally needs the stored encoding information
    if self.encoding == "edge":
        return subset.genomics.encode_edge(self.edge_encoding_info)

    # Any other encoding value leaves the subset unencoded
    return subset
251+
215252
def run(self):
216253
"""Run a regression object, returning the results and logging any warnings/errors"""
217254
# Log how many interactions are being run using how many processes
@@ -221,29 +258,40 @@ def run(self):
221258
fg="green",
222259
)
223260
)
224-
with multiprocessing.Pool(processes=self.process_num) as pool:
225-
run_result = pool.starmap(
226-
self._run_interaction,
227-
zip(
228-
self.interactions,
229-
[
230-
self.data[
231-
list(interaction)
232-
+ [
233-
self.outcome_variable,
234-
]
235-
+ self.covariates
236-
]
237-
for interaction in self.interactions
238-
],
239-
repeat(self.outcome_variable),
240-
repeat(self.covariates),
241-
repeat(self.min_n),
242-
repeat(self.family),
243-
repeat(self.use_t),
244-
repeat(self.report_betas),
245-
),
246-
)
261+
262+
if self.process_num == 1:
263+
run_result = [
264+
self._run_interaction(
265+
interaction,
266+
self._get_interaction_specific_data(interaction),
267+
self.outcome_variable,
268+
self.covariates,
269+
self.min_n,
270+
self.family,
271+
self.use_t,
272+
self.report_betas,
273+
)
274+
for interaction in self.interactions
275+
]
276+
277+
else:
278+
with multiprocessing.Pool(processes=self.process_num) as pool:
279+
run_result = pool.starmap(
280+
self._run_interaction,
281+
zip(
282+
self.interactions,
283+
[
284+
self._get_interaction_specific_data(interaction)
285+
for interaction in self.interactions
286+
],
287+
repeat(self.outcome_variable),
288+
repeat(self.covariates),
289+
repeat(self.min_n),
290+
repeat(self.family),
291+
repeat(self.use_t),
292+
repeat(self.report_betas),
293+
),
294+
)
247295

248296
for interaction, interaction_result in zip(self.interactions, run_result):
249297
interaction_str = ":".join(interaction)

0 commit comments

Comments
 (0)