-
Notifications
You must be signed in to change notification settings - Fork 43
Bring threshold inside BaseReconstructor class #290
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
4907b2e
691a1c2
fdc0e1e
bb64353
bd10912
13f3e74
630640f
985f30f
84efe18
9efe813
0d3e628
eae49b8
815cb14
46fc46f
b95e7f8
2683d8c
dedaca3
6d516ca
ff6cc80
18f261c
46ac1bf
1d61eb5
a59359f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,4 +13,3 @@ Common utilities for use within ``netrd``. | |
| graph | ||
| read | ||
| standardize | ||
| threshold | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,40 +1,336 @@ | ||
| import networkx as nx | ||
| import numpy as np | ||
| import warnings | ||
| import scipy.sparse as sp | ||
| from scipy.sparse.csgraph import minimum_spanning_tree | ||
|
|
||
|
|
||
| class BaseReconstructor: | ||
| r"""Base class for graph reconstruction algorithms. | ||
| """Base class for graph reconstruction algorithms. | ||
|
|
||
| The basic usage of a graph reconstruction algorithm is as follows: | ||
|
|
||
| >>> reconstructor = ReconstructionAlgorithm() | ||
| >>> G = reconstructor.fit(TS, <some_params>) | ||
| >>> # or alternately, G = reconstructor.results['graph'] | ||
| >>> R = ReconstructionAlgorithm() | ||
| >>> G = R.fit(TS, <some_params>).to_graph() | ||
|
|
||
| However, this is probably not the desired behavior, because, depending on | ||
| the method used, this typically returns a weighted dense graph, possibly | ||
| with self-loops. If a sparse or unweighted graph is desired, use the | ||
| thresholding functionality, as in the example below: | ||
|
|
||
| Here, `TS` is an :math:`N \times L` numpy array consisting of :math:`L` | ||
| observations for each of :math:`N` sensors. This constrains the graphs | ||
| to have integer-valued nodes. | ||
| >>> R = ReconstructionAlgorithm() | ||
| >>> R = R.fit(TS, <some_params>) | ||
| >>> R = R.remove_self_loops().threshold('quantile', quantile=0.8) | ||
| >>> G = R.to_graph() | ||
|
|
||
| The ``results`` dict object, in addition to containing the graph | ||
| object, may also contain objects created as a side effect of | ||
| reconstructing the network, which may be useful for debugging or | ||
| considering goodness of fit. What is returned will vary between | ||
| reconstruction algorithms. | ||
| Note that these can all be combined into a single call using method | ||
| chaining. | ||
|
|
||
| All algorithms subclass `BaseReconstructor` and override the `fit()` | ||
| method; see the documentation for each subclass's `fit()` method for | ||
| documentation of the algorithm. | ||
|
|
||
| """ | ||
|
|
||
| def __init__(self): | ||
| """Constructor for reconstructor classes. These take no arguments and define | ||
| three attributes: | ||
|
|
||
| 1. `self._graph`: A representation of the reconstructed network as a | ||
| NetworkX graph (or subclass). | ||
|
|
||
| 2. `self._matrix`: A representation of the reconstructed network as a | ||
| (dense) NumPy array. | ||
|
|
||
| 3. `self.results`: A dictionary for storing any intermediate data | ||
| objects or diagnostics generated by a reconstruction method. | ||
|
|
||
| `self._graph` and `self._matrix` should not be accessed directly by | ||
| users; instead, use the `to_matrix()` and `to_graph()` methods. | ||
| """ | ||
|
|
||
| self._graph = None | ||
| self._matrix = None | ||
| self.results = {} | ||
|
|
||
| def fit(self, TS, **kwargs): | ||
| """Reconstruct a graph from time series TS. | ||
| def fit(self, TS): | ||
| """The key method of the class. This takes an :math:`N \times L` numpy array, | ||
| representing a time series of :math:`L` observations from :math:`N` | ||
| sensors (nodes), and reconstructs a network from it. | ||
|
|
||
| Any new reconstruction method should subclass from `BaseReconstructor` | ||
| and override `fit()`. This method should reconstruct the network as a | ||
| matrix of weights, call `self.update_matrix()`, and then `return self`. | ||
|
|
||
| """ | ||
| self._matrix = np.zeros((TS.shape[0], TS.shape[0])) | ||
| return self | ||
|
|
||
| def update_matrix(self, new_mat): | ||
| """ | ||
| Update the contents of `self._matrix`. This also empties out | ||
| `self._graph` to avoid inconsistent state between the graph and matrix | ||
| representations of the networks. | ||
| """ | ||
| self._matrix = new_mat.copy() | ||
| self._graph = None | ||
|
|
||
| def to_dense(self): | ||
| if self._matrix is None: | ||
| raise ValueError( | ||
| "Matrix representation is missing. Have you fit the data yet?" | ||
| ) | ||
| elif sp.issparse(self._matrix): | ||
| return self._matrix.toarray() | ||
| else: | ||
| return self._matrix | ||
|
|
||
| def to_sparse(self): | ||
| if self._matrix is None: | ||
| raise ValueError( | ||
| "Matrix representation is missing. Have you fit the data yet?" | ||
| ) | ||
| elif sp.issparse(self._matrix): | ||
| return self._matrix | ||
| else: | ||
| return sp.csr_matrix(self._matrix) | ||
|
|
||
| def to_matrix(self): | ||
| if self._matrix is not None: | ||
| return self._matrix | ||
| else: | ||
| raise ValueError( | ||
| "Matrix representation is missing. Have you fit the data yet?" | ||
| ) | ||
|
|
||
| def to_graph(self, create_using=None): | ||
| """Return the graph representation of the reconstructed network.""" | ||
| if self._graph is not None: | ||
| return self._graph | ||
| if self._matrix is not None: | ||
| A = self._matrix.copy() | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we need a copy? |
||
|
|
||
| if not sp.issparse(self._matrix): | ||
| from_array = nx.from_numpy_array | ||
| allclose = lambda A: np.allclose(A, A.T) | ||
| else: | ||
| from_array = nx.from_scipy_sparse_matrix | ||
| allclose = _sparse_check_symmetric | ||
|
|
||
| if create_using is None: | ||
| undirected = allclose(A) | ||
|
|
||
| if undirected: | ||
| create_using = nx.Graph() | ||
| else: | ||
| create_using = nx.DiGraph() | ||
|
|
||
| G = from_array(A, create_using=create_using) | ||
|
|
||
| self._graph = G | ||
| return self._graph | ||
|
|
||
| raise ValueError( | ||
| "Matrix and graph representations both missing. " | ||
| "Have you fit the data yet?" | ||
| ) | ||
|
|
||
| def threshold_in_range(self, c=None, **kwargs): | ||
| """Threshold by setting values not within a list of ranges to zero. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| TS (np.ndarray): Array consisting of $L$ observations from $N$ sensors. | ||
| cutoffs (list of tuples) | ||
| When thresholding, include only edges whose weights fall | ||
| within a given range or set of ranges. The lower value must come | ||
| first in each tuple. For example, to keep those values whose | ||
| absolute value is between :math:`0.5` and :math:`1`, pass | ||
| ``cutoffs=[(-1, -0.5), (0.5, 1)]``. | ||
| """ | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| if 'cutoffs' in kwargs and c is None: | ||
| cutoffs = kwargs['cutoffs'] | ||
| elif c is None: | ||
| warnings.warn( | ||
| "Setting 'cutoffs' argument is strongly encouraged. " | ||
| "Using cutoff range of (-1, 1).", | ||
| RuntimeWarning, | ||
| ) | ||
| cutoffs = [(-1, 1)] | ||
| else: | ||
| cutoffs = c | ||
|
|
||
| mat = s.to_dense().copy() | ||
| mask_function = np.vectorize( | ||
| lambda x: any([x >= cutoff[0] and x <= cutoff[1] for cutoff in cutoffs]) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use a generator instead of a list inside of
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, I don't think there will be much of a difference there so feel free to ignore
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My preference would be to rewrite the range-based threshold to just take a min and a max, since you can call it twice now.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should do either of the things suggested (generator or rewrite), but we should do one of them before this is merged |
||
| ) | ||
| mask = mask_function(mat) | ||
|
|
||
| Returns | ||
| ------- | ||
| G (nx.Graph): A reconstructed graph with $N$ nodes. | ||
| thresholded_mat = np.where(mask, mat, 0) | ||
| thresholded_mat = sp.csr_matrix(thresholded_mat) | ||
|
|
||
| s.update_matrix(thresholded_mat) | ||
| return s | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps the docstring should mention that this returns a copy?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. All of the copy logic is what I'm least confident about. My thought was to throw copies all over the place and then figure out which cases were unnecessary.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks like it needs revisiting. Is this the design we are going for? |
||
|
|
||
| def threshold_on_quantile(self, q=None, **kwargs): | ||
| """Threshold by setting values below a given quantile to zero. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| quantile (float) | ||
| The threshold above which to keep an element of the array, e.g., | ||
| set to zero elements below the 90th quantile of the array. | ||
|
|
||
| """ | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| if 'quantile' in kwargs and q is None: | ||
| quantile = kwargs['quantile'] | ||
| elif q is None: | ||
| warnings.warn( | ||
| "Setting 'quantile' argument is strongly recommended." | ||
| "Using target quantile of 0.9 for thresholding.", | ||
| RuntimeWarning, | ||
| ) | ||
| quantile = 0.9 | ||
| else: | ||
| quantile = q | ||
| mat = s.to_dense().copy() | ||
|
|
||
| if np.isclose(quantile, 0): | ||
| thresholded_mat = mat | ||
| else: | ||
| thresholded_mat = mat * (mat > np.percentile(mat, quantile * 100)) | ||
|
|
||
| s.update_matrix(sp.csr_matrix(thresholded_mat)) | ||
| return s | ||
|
|
||
| def threshold_on_degree(self, k=None, **kwargs): | ||
| """Threshold by iteratively setting the smallest entries in the weights | ||
| matrix to zero until the average degree falls below a given value. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| avg_k (float) | ||
| The average degree to target when thresholding the matrix. | ||
|
|
||
| """ | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
|
Comment on lines
+219
to
+220
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These two lines are repeated all over and they seem bug-prone as hell. Maybe refactor into a helper function?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is basically the code for SQLAlchemy's
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's time to decide whether we want to merge this as is or use |
||
| if 'avg_k' in kwargs and k is None: | ||
| avg_k = kwargs['avg_k'] | ||
| elif k is None: | ||
| warnings.warn( | ||
| "Setting 'avg_k' argument is strongly encouraged. Using average " | ||
| "degree of 1 for thresholding.", | ||
| RuntimeWarning, | ||
| ) | ||
| avg_k = 1 | ||
| else: | ||
| avg_k = k | ||
| mat = s._matrix.copy() | ||
|
|
||
| n = len(mat) | ||
| A = np.ones((n, n)) | ||
|
|
||
| if np.mean(np.sum(A, 1)) <= avg_k: | ||
| # degenerate case: use the whole matrix | ||
| thresholded_mat = mat | ||
| else: | ||
| for m in sorted(mat.flatten()): | ||
| A[mat == m] = 0 | ||
| if np.mean(np.sum(A, 1)) <= avg_k: | ||
| break | ||
| thresholded_mat = mat * (mat > m) | ||
|
|
||
| s.update_matrix(sp.csr_matrix(thresholded_mat)) | ||
| return s | ||
|
|
||
| def threshold(self, rule, **kwargs): | ||
| """A flexible interface to other thresholding functions. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| rule (str) | ||
| A string indicating which thresholding function to invoke. | ||
|
|
||
| kwargs (dict) | ||
| Named arguments to pass to the underlying threshold function. | ||
| """ | ||
| G = nx.Graph() # reconstruct the graph | ||
| self.results['graph'] = G # and store it in self.results | ||
| # self.results[..] = .. # also store other values if needed | ||
| return G | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| try: | ||
| if rule == 'degree': | ||
| return s.threshold_on_degree(**kwargs) | ||
| elif rule == 'range': | ||
| return s.threshold_in_range(**kwargs) | ||
| elif rule == 'quantile': | ||
| return s.threshold_on_quantile(**kwargs) | ||
| elif rule == 'custom': | ||
| mat = s.to_dense() | ||
| s.update_matrix(sp.csr_matrix(kwargs['custom_thresholder'](mat))) | ||
| return s | ||
|
|
||
| except KeyError: | ||
| raise ValueError("missing threshold parameter") | ||
|
|
||
| def _mst_sparse(self, mat): | ||
| return minimum_spanning_tree(mat).asformat(mat.format) | ||
|
|
||
| def _mst_dense(self, mat): | ||
| return minimum_spanning_tree(mat).asformat('csr') | ||
|
|
||
| def minimum_spanning_tree(self): | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| if sp.issparse(s._matrix): | ||
| MST = s._mst_sparse(s.to_dense()) | ||
| else: | ||
| MST = s._mst_dense(s.to_dense()) | ||
| s.update_matrix(MST) | ||
| return s | ||
|
|
||
| def _binarize_sparse(self, mat): | ||
| return np.abs(mat.sign()) | ||
|
|
||
| def _binarize_dense(self, mat): | ||
| return np.abs(np.sign(mat)) | ||
|
|
||
|
Comment on lines
+294
to
+299
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need both? |
||
| def binarize(self): | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| if sp.issparse(s._matrix): | ||
| mat = s._binarize_sparse(s._matrix) | ||
| else: | ||
| mat = s._binarize_dense(s._matrix) | ||
| s.update_matrix(mat) | ||
| return s | ||
|
|
||
| def _rsl_sparse(self, mat): | ||
| new_mat = mat.copy() | ||
| new_mat.setdiag(0) | ||
| return new_mat | ||
|
|
||
| def _rsl_dense(self, mat): | ||
| new_mat = mat.copy() | ||
| np.fill_diagonal(new_mat, 0) | ||
| return new_mat | ||
|
|
||
| def remove_self_loops(self): | ||
| s = self.__class__.__new__(self.__class__) | ||
| s.__dict__ = self.__dict__.copy() | ||
| if sp.issparse(s._matrix): | ||
| mat = s._rsl_sparse(s._matrix) | ||
| else: | ||
| mat = s._rsl_dense(s._matrix) | ||
| s.update_matrix(mat) | ||
| return s | ||
|
|
||
|
|
||
| def _sparse_check_symmetric(mat, tol=1e-8): | ||
| """np.allclose doesn't work on sparse matrices. This approximates the behavior | ||
| of np.allclose(mat, mat.T), where mat is a sparse matrix. | ||
|
|
||
| """ | ||
| return abs((mat - mat.T) > tol).nnz == 0 | ||
Uh oh!
There was an error while loading. Please reload this page.