diff --git a/doc/source/reconstruction.rst b/doc/source/reconstruction.rst index 8d7a37a3..2ecd725a 100644 --- a/doc/source/reconstruction.rst +++ b/doc/source/reconstruction.rst @@ -1,7 +1,7 @@ Reconstruction ============== -Algorithms to recosntruct a graph from time series data. +Algorithms to reconstruct a graph from time series data. Base class @@ -20,7 +20,6 @@ the same general usage as above. netrd.reconstruction.ConvergentCrossMapping netrd.reconstruction.CorrelationMatrix - netrd.reconstruction.CorrelationSpanningTree netrd.reconstruction.FreeEnergyMinimization netrd.reconstruction.GrangerCausality netrd.reconstruction.GraphicalLasso diff --git a/doc/source/threshold.rst b/doc/source/threshold.rst deleted file mode 100644 index ec682582..00000000 --- a/doc/source/threshold.rst +++ /dev/null @@ -1,3 +0,0 @@ -.. automodule:: netrd.utilities.threshold - :members: - :undoc-members: diff --git a/doc/source/utilities.rst b/doc/source/utilities.rst index a91c88bb..bc84ba23 100644 --- a/doc/source/utilities.rst +++ b/doc/source/utilities.rst @@ -13,4 +13,3 @@ Common utilities for use within ``netrd``. graph read standardize - threshold diff --git a/netrd/reconstruction/__init__.py b/netrd/reconstruction/__init__.py index 3ce579a9..3aa175d8 100644 --- a/netrd/reconstruction/__init__.py +++ b/netrd/reconstruction/__init__.py @@ -15,7 +15,6 @@ from .naive_transfer_entropy import NaiveTransferEntropy from .granger_causality import GrangerCausality from .optimal_causation_entropy import OptimalCausationEntropy -from .correlation_spanning_tree import CorrelationSpanningTree __all__ = [ 'RandomReconstructor', @@ -34,5 +33,4 @@ 'NaiveTransferEntropy', 'GrangerCausality', 'OptimalCausationEntropy', - 'CorrelationSpanningTree', ] diff --git a/netrd/reconstruction/base.py b/netrd/reconstruction/base.py index dc705598..0fbec247 100644 --- a/netrd/reconstruction/base.py +++ b/netrd/reconstruction/base.py @@ -1,40 +1,336 @@ +import networkx as nx +import numpy as np +import warnings +import scipy.sparse as sp +from scipy.sparse.csgraph import minimum_spanning_tree + + class BaseReconstructor: - r"""Base class for graph reconstruction algorithms. + """Base class for graph reconstruction algorithms. The basic usage of a graph reconstruction algorithm is as follows: - >>> reconstructor = ReconstructionAlgorithm() - >>> G = reconstructor.fit(TS, <arguments>) - >>> # or alternately, G = reconstructor.results['graph'] + >>> R = ReconstructionAlgorithm() + >>> G = R.fit(TS, <arguments>).to_graph() + + However, this is probably not the desired behavior, because, depending on + the method used, this typically returns a weighted dense graph, possibly + with self-loops. If a sparse or unweighted graph is desired, use the + thresholding functionality, as in the example below: - Here, `TS` is an :math:`N \times L` numpy array consisting of :math:`L` - observations for each of :math:`N` sensors. This constrains the graphs - to have integer-valued nodes. + >>> R = ReconstructionAlgorithm() + >>> R = R.fit(TS, <arguments>) + >>> R = R.remove_self_loops().threshold('quantile', quantile=0.8) + >>> G = R.to_graph() - The ``results`` dict object, in addition to containing the graph - object, may also contain objects created as a side effect of - reconstructing the network, which may be useful for debugging or - considering goodness of fit. What is returned will vary between - reconstruction algorithms. + Note that these can all be combined into a single call using method + chaining.
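+ For example, assuming ``CorrelationMatrix`` as a stand-in for any concrete subclass, the whole pipeline above collapses to one chained call: + + >>> G = CorrelationMatrix().fit(TS).remove_self_loops().threshold('quantile', quantile=0.8).to_graph()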
+ + All algorithms subclass `BaseReconstructor` and override the `fit()` + method; see each subclass's `fit()` docstring for documentation of + the algorithm. """ def __init__(self): + """Constructor for reconstructor classes. These take no arguments and define + three attributes: + + 1. `self._graph`: A representation of the reconstructed network as a + NetworkX graph (or subclass). + + 2. `self._matrix`: A representation of the reconstructed network as a + (dense) NumPy array. + + 3. `self.results`: A dictionary for storing any intermediate data + objects or diagnostics generated by a reconstruction method. + + `self._graph` and `self._matrix` should not be accessed directly by + users; instead, use the `to_matrix()` and `to_graph()` methods. + """ + + self._graph = None + self._matrix = None self.results = {} - def fit(self, TS, **kwargs): - """Reconstruct a graph from time series TS. + def fit(self, TS): + r"""The key method of the class. This takes an :math:`N \times L` numpy array, + representing a time series of :math:`L` observations from :math:`N` + sensors (nodes), and reconstructs a network from it. + + Any new reconstruction method should subclass from `BaseReconstructor` + and override `fit()`. This method should reconstruct the network as a + matrix of weights, call `self.update_matrix()`, and then `return self`. + + """ + self._matrix = np.zeros((TS.shape[0], TS.shape[0])) + return self + + def update_matrix(self, new_mat): + """ + Update the contents of `self._matrix`. This also empties out + `self._graph` to avoid inconsistent state between the graph and matrix + representations of the network. + """ + self._matrix = new_mat.copy() + self._graph = None + + def to_dense(self): + """Return the matrix representation of the network as a dense NumPy array.""" + if self._matrix is None: + raise ValueError( + "Matrix representation is missing. Have you fit the data yet?" + ) + elif sp.issparse(self._matrix): + return self._matrix.toarray() + else: + return self._matrix + + def to_sparse(self): + """Return the matrix representation of the network as a sparse SciPy matrix.""" + if self._matrix is None: + raise ValueError( + "Matrix representation is missing. Have you fit the data yet?" + ) + elif sp.issparse(self._matrix): + return self._matrix + else: + return sp.csr_matrix(self._matrix) + + def to_matrix(self): + """Return the matrix representation of the network in its current format.""" + if self._matrix is not None: + return self._matrix + else: + raise ValueError( + "Matrix representation is missing. Have you fit the data yet?" + ) + + def to_graph(self, create_using=None): + """Return the graph representation of the reconstructed network.""" + if self._graph is not None: + return self._graph + if self._matrix is not None: + A = self._matrix.copy() + + if not sp.issparse(self._matrix): + from_array = nx.from_numpy_array + allclose = lambda A: np.allclose(A, A.T) + else: + from_array = nx.from_scipy_sparse_matrix + allclose = _sparse_check_symmetric + + if create_using is None: + undirected = allclose(A) + + if undirected: + create_using = nx.Graph() + else: + create_using = nx.DiGraph() + + G = from_array(A, create_using=create_using) + + self._graph = G + return self._graph + + raise ValueError( + "Matrix and graph representations both missing. " + "Have you fit the data yet?" + ) + + def threshold_in_range(self, c=None, **kwargs): + """Threshold by setting values not within a list of ranges to zero. Parameters ---------- - TS (np.ndarray): Array consisting of $L$ observations from $N$ sensors. + cutoffs (list of tuples) + When thresholding, include only edges whose weights fall + within a given range or set of ranges. The lower value must come + first in each tuple.
For example, to keep those values whose + absolute value is between :math:`0.5` and :math:`1`, pass + ``cutoffs=[(-1, -0.5), (0.5, 1)]``. + """ + s = self.__class__.__new__(self.__class__) + s.__dict__ = self.__dict__.copy() + if 'cutoffs' in kwargs and c is None: + cutoffs = kwargs['cutoffs'] + elif c is None: + warnings.warn( + "Setting 'cutoffs' argument is strongly encouraged. " + "Using cutoff range of (-1, 1).", + RuntimeWarning, + ) + cutoffs = [(-1, 1)] + else: + cutoffs = c + + mat = s.to_dense().copy() + mask_function = np.vectorize( + lambda x: any([x >= cutoff[0] and x <= cutoff[1] for cutoff in cutoffs]) + ) + mask = mask_function(mat) - Returns - ------- - G (nx.Graph): A reconstructed graph with $N$ nodes. + thresholded_mat = np.where(mask, mat, 0) + thresholded_mat = sp.csr_matrix(thresholded_mat) + s.update_matrix(thresholded_mat) + return s + + def threshold_on_quantile(self, q=None, **kwargs): + """Threshold by setting values below a given quantile to zero. + + Parameters + ---------- + quantile (float) + The threshold above which to keep an element of the array, e.g., + set to zero elements below the 90th quantile of the array. + + """ + s = self.__class__.__new__(self.__class__) + s.__dict__ = self.__dict__.copy() + if 'quantile' in kwargs and q is None: + quantile = kwargs['quantile'] + elif q is None: + warnings.warn( + "Setting 'quantile' argument is strongly recommended. " + "Using target quantile of 0.9 for thresholding.", + RuntimeWarning, + ) + quantile = 0.9 + else: + quantile = q + mat = s.to_dense().copy() + + if np.isclose(quantile, 0): + thresholded_mat = mat + else: + thresholded_mat = mat * (mat > np.percentile(mat, quantile * 100)) + + s.update_matrix(sp.csr_matrix(thresholded_mat)) + return s + + def threshold_on_degree(self, k=None, **kwargs): + """Threshold by iteratively setting the smallest entries in the weights + matrix to zero until the average degree falls below a given value. + + Parameters + ---------- + avg_k (float) + The average degree to target when thresholding the matrix. + + """ + s = self.__class__.__new__(self.__class__) + s.__dict__ = self.__dict__.copy() + if 'avg_k' in kwargs and k is None: + avg_k = kwargs['avg_k'] + elif k is None: + warnings.warn( + "Setting 'avg_k' argument is strongly encouraged. Using average " + "degree of 1 for thresholding.", + RuntimeWarning, + ) + avg_k = 1 + else: + avg_k = k + mat = s.to_dense().copy() + + n = len(mat) + A = np.ones((n, n)) + + if np.mean(np.sum(A, 1)) <= avg_k: + # degenerate case: use the whole matrix + thresholded_mat = mat + else: + for m in sorted(mat.flatten()): + A[mat == m] = 0 + if np.mean(np.sum(A, 1)) <= avg_k: + break + thresholded_mat = mat * (mat > m) + + s.update_matrix(sp.csr_matrix(thresholded_mat)) + return s + + def threshold(self, rule, **kwargs): + """A flexible interface to other thresholding functions. + + Parameters + ---------- + rule (str) + A string indicating which thresholding function to invoke. + + kwargs (dict) + Named arguments to pass to the underlying threshold function. """ - G = nx.Graph() # reconstruct the graph - self.results['graph'] = G # and store it in self.results - # self.results[..] = ..
# also store other values if needed - return G + s = self.__class__.__new__(self.__class__) + s.__dict__ = self.__dict__.copy() + try: + if rule == 'degree': + return s.threshold_on_degree(**kwargs) + elif rule == 'range': + return s.threshold_in_range(**kwargs) + elif rule == 'quantile': + return s.threshold_on_quantile(**kwargs) + elif rule == 'custom': + mat = s.to_dense() + s.update_matrix(sp.csr_matrix(kwargs['custom_thresholder'](mat))) + return s + else: + raise ValueError("unrecognized thresholding rule {}".format(rule)) + + except KeyError: + raise ValueError("missing threshold parameter") + + def _mst_sparse(self, mat): + return minimum_spanning_tree(mat).asformat(mat.format) + + def _mst_dense(self, mat): + return minimum_spanning_tree(mat).asformat('csr') + + def minimum_spanning_tree(self): + """Replace the current network with its minimum spanning tree.""" + s = self.__class__.__new__(self.__class__) + s.__dict__ = self.__dict__.copy() + if sp.issparse(s._matrix): + MST = s._mst_sparse(s.to_sparse()) + else: + MST = s._mst_dense(s.to_dense()) + s.update_matrix(MST) + return s + + def _binarize_sparse(self, mat): + return np.abs(mat.sign()) + + def _binarize_dense(self, mat): + return np.abs(np.sign(mat)) + + def binarize(self): + """Replace every nonzero weight with 1, yielding an unweighted network.""" + s = self.__class__.__new__(self.__class__) + s.__dict__ = self.__dict__.copy() + if sp.issparse(s._matrix): + mat = s._binarize_sparse(s._matrix) + else: + mat = s._binarize_dense(s._matrix) + s.update_matrix(mat) + return s + + def _rsl_sparse(self, mat): + new_mat = mat.copy() + new_mat.setdiag(0) + return new_mat + + def _rsl_dense(self, mat): + new_mat = mat.copy() + np.fill_diagonal(new_mat, 0) + return new_mat + + def remove_self_loops(self): + """Remove self-loops by zeroing the diagonal of the matrix.""" + s = self.__class__.__new__(self.__class__) + s.__dict__ = self.__dict__.copy() + if sp.issparse(s._matrix): + mat = s._rsl_sparse(s._matrix) + else: + mat = s._rsl_dense(s._matrix) + s.update_matrix(mat) + return s + + +def _sparse_check_symmetric(mat, tol=1e-8): + """np.allclose doesn't work on sparse matrices. This approximates the behavior + of np.allclose(mat, mat.T), where mat is a sparse matrix. + + """ + return (abs(mat - mat.T) > tol).nnz == 0 diff --git a/netrd/reconstruction/convergent_cross_mapping.py b/netrd/reconstruction/convergent_cross_mapping.py index e9cef873..011e2ebe 100644 --- a/netrd/reconstruction/convergent_cross_mapping.py +++ b/netrd/reconstruction/convergent_cross_mapping.py @@ -14,17 +14,13 @@ import numpy as np from itertools import permutations from scipy.stats import pearsonr -import networkx as nx from sklearn.neighbors import NearestNeighbors -from ..utilities import create_graph, threshold class ConvergentCrossMapping(BaseReconstructor): """Infers dynamical causal relations.""" - def fit( - self, TS, tau=1, threshold_type='range', cutoffs=[(0.95, np.inf)], **kwargs - ): + def fit(self, TS, tau=1): r"""Infer causal relations by applying Takens' theorem of dynamical systems.
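+ Since ``fit()`` no longer thresholds internally, the previous default behavior (keeping edges whose weight exceeds 0.95) can be recovered after fitting; a sketch: + + >>> G = ConvergentCrossMapping().fit(TS).threshold_in_range(cutoffs=[(0.95, np.inf)]).to_graph()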
Convergent cross-mapping infers dynamical causal relation between @@ -136,18 +132,13 @@ def fit( # sort zero p-values in decreasing order and tell edges with zero p-value weights = np.full(pvalue.shape, np.inf) weights[pvalue > 0] = -np.log10(pvalue[pvalue > 0]) - A = threshold(weights, threshold_type, cutoffs=cutoffs, **kwargs) - G = create_graph(A, create_using=nx.DiGraph()) - # Save the graph object, matrices of correlation and p-values into the - # "results" field (dictionary) + self.update_matrix(weights) self.results['correlation_matrix'] = correlation self.results['pvalues_matrix'] = pvalue self.results['weights_matrix'] = weights - self.results['thresholded_matrix'] = A - self.results['graph'] = G - return G + return self def shadow_data_cloud(data, N, tau): diff --git a/netrd/reconstruction/correlation_matrix.py b/netrd/reconstruction/correlation_matrix.py index 3f2b81cf..8a98be3b 100644 --- a/netrd/reconstruction/correlation_matrix.py +++ b/netrd/reconstruction/correlation_matrix.py @@ -8,14 +8,12 @@ """ from .base import BaseReconstructor import numpy as np -import networkx as nx -from ..utilities import create_graph, threshold class CorrelationMatrix(BaseReconstructor): """Uses the correlation matrix.""" - def fit(self, TS, num_eigs=None, threshold_type='range', **kwargs): + def fit(self, TS, num_eigs=None): """Uses the correlation matrix. If ``num_eigs`` is `None`, perform the reconstruction using the @@ -85,13 +83,5 @@ def fit(self, TS, num_eigs=None, threshold_type='range', **kwargs): # store the appropriate source matrix self.results['weights_matrix'] = mat - - # threshold the correlation matrix - A = threshold(mat, threshold_type, **kwargs) - self.results['thresholded_matrix'] = A - - # construct the network - self.results['graph'] = create_graph(A) - G = self.results['graph'] - - return G + self.update_matrix(mat) + return self diff --git a/netrd/reconstruction/correlation_spanning_tree.py b/netrd/reconstruction/correlation_spanning_tree.py deleted file mode 100644 index 56176c8a..00000000 --- a/netrd/reconstruction/correlation_spanning_tree.py +++ /dev/null @@ -1,122 +0,0 @@ -""" -correlation_spanning_tree.py ----------------------------- - -Graph reconstruction algorithm based on Mantegna, R. N. (1999). Hierarchical structure in -financial markets. The European Physical Journal B-Condensed Matter and Complex Systems, -11(1), 193-197. DOI https://doi.org/10.1007/s100510050929 -https://link.springer.com/article/10.1007/s100510050929 - -author: Matteo Chinazzi -Submitted as part of the 2019 NetSI Collabathon. -""" - -from .base import BaseReconstructor -import numpy as np -import networkx as nx -from scipy.sparse.csgraph import minimum_spanning_tree - - -class CorrelationSpanningTree(BaseReconstructor): - """Minimum spanning tree connecting the sensors.""" - - def fit(self, TS, distance='root_inv', **kwargs): - r"""Create a minimum spanning tree connecting the sensors. - - The empirical correlation matrix is used to first compute a - distance matrix and then to create a minimum spanning tree - connecting all the sensors in the data. This method implements the - methodology described in [1]_ and applied in the context of creating - a graph connecting the stocks of a portfolio, generated by - looking at the correlations between the daily time series of stock - prices. - - The results dictionary also stores the distance matrix (computed - from the correlations) as `'distance_matrix'`.
- - Parameters - ---------- - - TS (np.ndarray) - :math:`N \times L` array consisting of :math:`L` observations - from :math:`N` sensors. - - distance (str) - 'inv_square' calculates distance as :math:`1-corr_{ij}^2` - as in [1]_. 'root_inv' calculates distance as - :math:`\sqrt{2 (1-corr_{ij})}` [2]_. - - Returns - ------- - - G (nx.Graph) - A reconstructed graph with :math:`N` nodes. - - Examples - -------- - .. code:: python - - import numpy as np - import networkx as nx - from matplotlib import pyplot as plt - from netrd.reconstruction import CorrelationSpanningTree - - N = 25 - T = 300 - M = np.random.normal(size=(N,T)) - - print('Create correlated time series') - market_mode = 0.4*np.random.normal(size=(1,T)) - M += market_mode - - sector_modes = {d: 0.5*np.random.normal(size=(1,T)) for d in range(5)} - for sector_mode, vals in sector_modes.items(): - M[sector_mode*5:(sector_mode+1)*5,:] += vals - - print('Link node colors to sectors') - colors = ['b','r','g','y','m'] - node_colors = [color for color in colors for __ in range(5)] - - print('Network reconstruction step') - cst_net = CorrelationSpanningTree() - G = cst_net.fit(M) - - print('Plot reconstructed spanning tree') - fig, ax = plt.subplots() - nx.draw(G, ax=ax, node_color=node_colors) - - - References - ---------- - - .. [1] Mantegna, R. N. (1999). Hierarchical structure in financial - markets. The European Physical Journal B-Condensed Matter - and Complex Systems, 11(1), 193-197. DOI - https://doi.org/10.1007/s100510050929 - https://link.springer.com/article/10.1007/s100510050929 - - .. [2] Bonanno, G., Caldarelli, G., Lillo, F. & Mantegna, - R. N. (2003) Topology of correlation-based minimal spanning - trees in real and model markets. Physical Review E 68. - - .. [3] Vandewalle, N., Brisbois, F. & Tordoir, X. (2001) Non-random - topology of stock markets. Quantitative Finance 1, 372–374. - - """ - N, L = TS.shape - - C = np.corrcoef(TS) # Empirical correlation matrix - - D = ( - np.sqrt(2 * (1 - C)) if distance == 'root_inv' else 1 - np.square(C) - ) # Distance matrix - - self.results['distance_matrix'] = D - - MST = minimum_spanning_tree(D) # Minimum Spanning Tree - - G = nx.from_scipy_sparse_matrix(MST) - - self.results['graph'] = G - - return G diff --git a/netrd/reconstruction/free_energy_minimization.py b/netrd/reconstruction/free_energy_minimization.py index d55f4c66..7a442c8a 100644 --- a/netrd/reconstruction/free_energy_minimization.py +++ b/netrd/reconstruction/free_energy_minimization.py @@ -8,16 +8,13 @@ """ from .base import BaseReconstructor import numpy as np -import networkx as nx -import scipy as sp from scipy import linalg -from ..utilities import create_graph, threshold class FreeEnergyMinimization(BaseReconstructor): """Applies free energy principle.""" - def fit(self, TS, threshold_type='degree', **kwargs): + def fit(self, TS): """Infer inter-node coupling weights by minimizing a free energy over the data structure. 
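+ + Thresholding is now a separate step; to approximate the old implicit default (average degree 1), a sketch: + + >>> G = FreeEnergyMinimization().fit(TS).threshold('degree', avg_k=1).to_graph()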
@@ -96,14 +93,7 @@ def fit(self, TS, threshold_type='degree', **kwargs): W[i0, :] = w[:] - # threshold the network - W_thresh = threshold(W, threshold_type, **kwargs) - - # construct the network - - self.results['graph'] = create_graph(W_thresh) + self.update_matrix(W) self.results['weights_matrix'] = W - self.results['thresholded_matrix'] = W_thresh - G = self.results['graph'] - return G + return self diff --git a/netrd/reconstruction/granger_causality.py b/netrd/reconstruction/granger_causality.py index 097222b8..8adb8444 100644 --- a/netrd/reconstruction/granger_causality.py +++ b/netrd/reconstruction/granger_causality.py @@ -18,13 +18,12 @@ from .base import BaseReconstructor from sklearn.linear_model import LinearRegression -from ..utilities import create_graph, threshold class GrangerCausality(BaseReconstructor): """Uses the Granger causality between nodes.""" - def fit(self, TS, lag=1, threshold_type="range", **kwargs): + def fit(self, TS, lag=1): r"""Reconstruct a network based on the Granger causality. To evaluate the effect of a time series :math:`j` over another, :math:`i`, it first evaluates the error :math:`e_1` given by an autoregressive model fit @@ -86,15 +85,9 @@ def fit(self, TS, lag=1, threshold_type="range", **kwargs): W[j, i] = np.log(std_i) - np.log(std_ij) self.results["weights_matrix"] = W - # threshold the network - W_thresh = threshold(W, threshold_type, **kwargs) - self.results["thresholded_matrix"] = W_thresh + self.update_matrix(W) - # construct the network - self.results["graph"] = create_graph(W_thresh) - G = self.results["graph"] - - return G + return self @staticmethod def split_data(TS, lag): diff --git a/netrd/reconstruction/graphical_lasso.py b/netrd/reconstruction/graphical_lasso.py index 3692bfa4..160a7b6e 100644 --- a/netrd/reconstruction/graphical_lasso.py +++ b/netrd/reconstruction/graphical_lasso.py @@ -16,7 +16,6 @@ import numpy as np from sklearn.covariance import graphical_lasso from .base import BaseReconstructor -from ..utilities import create_graph, threshold class GraphicalLasso(BaseReconstructor): @@ -90,13 +89,6 @@ def fit( self.results['weights_matrix'] = cov self.results['precision_matrix'] = prec - # threshold the network - self.results['thresholded_matrix'] = threshold( - self.results['weights_matrix'], threshold_type, **kwargs - ) + self.update_matrix(cov) - # construct the network - G = create_graph(self.results['thresholded_matrix']) - self.results['graph'] = G - - return G + return self diff --git a/netrd/reconstruction/marchenko_pastur.py b/netrd/reconstruction/marchenko_pastur.py index 1ecb0159..ef3e7de1 100644 --- a/netrd/reconstruction/marchenko_pastur.py +++ b/netrd/reconstruction/marchenko_pastur.py @@ -13,19 +13,13 @@ from .base import BaseReconstructor import numpy as np import networkx as nx -from ..utilities import create_graph, threshold class MarchenkoPastur(BaseReconstructor): """Uses Marchenko-Pastur law to remove noise.""" def fit( - self, - TS, - remove_largest=False, - metric_distance=False, - threshold_type='range', - **kwargs + self, TS, remove_largest=False, metric_distance=False, ): r"""Create a correlation-based graph using Marchenko-Pastur law to remove noise. 
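+ + A sketch of the updated usage (the cutoffs here are purely illustrative): + + >>> R = MarchenkoPastur().fit(TS, remove_largest=False) + >>> G = R.threshold('range', cutoffs=[(0, np.inf)]).to_graph()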
@@ -148,9 +142,9 @@ def fit( selected = (w < w_min) | (w > w_max) if selected.sum() == 0: - G = nx.empty_graph(n=N) - self.results['graph'] = G - return G + self.update_matrix(np.zeros((N, N))) + self.results['weights_matrix'] = np.zeros((N, N)) + return self if remove_largest: selected[-1] = False @@ -164,14 +158,6 @@ def fit( C_signal = np.sqrt(2 * (1 - C_signal)) self.results['weights_matrix'] = C_signal + self.update_matrix(C_signal) - # threshold signal matrix - - self.results['thresholded_matrix'] = threshold( - C_signal, threshold_type, **kwargs - ) - - G = create_graph(self.results['thresholded_matrix']) - - self.results['graph'] = G - return G + return self diff --git a/netrd/reconstruction/maximum_likelihood_estimation.py b/netrd/reconstruction/maximum_likelihood_estimation.py index 5b4438e7..79ecd548 100644 --- a/netrd/reconstruction/maximum_likelihood_estimation.py +++ b/netrd/reconstruction/maximum_likelihood_estimation.py @@ -8,8 +8,6 @@ """ from .base import BaseReconstructor import numpy as np -import networkx as nx -from ..utilities import create_graph, threshold class MaximumLikelihoodEstimation(BaseReconstructor): @@ -80,14 +78,6 @@ def fit(self, TS, rate=1.0, stop_criterion=True, threshold_type='degree', **kwar W[i0, :] = w - # threshold the network - W_thresh = threshold(W, threshold_type, **kwargs) + self.update_matrix(W) - # construct the network - - self.results['graph'] = create_graph(W_thresh) - self.results['weights_matrix'] = W - self.results['thresholded_matrix'] = W_thresh - G = self.results['graph'] - - return G + return self diff --git a/netrd/reconstruction/mean_field.py b/netrd/reconstruction/mean_field.py index 8d5a73aa..462ad08c 100644 --- a/netrd/reconstruction/mean_field.py +++ b/netrd/reconstruction/mean_field.py @@ -8,18 +8,13 @@ """ from .base import BaseReconstructor import numpy as np -import networkx as nx -import scipy as sp from scipy import linalg from scipy.integrate import quad from scipy.optimize import fsolve -from ..utilities import create_graph, threshold class MeanField(BaseReconstructor): - def fit( - self, TS, exact=True, stop_criterion=True, threshold_type='range', **kwargs - ): + def fit(self, TS, exact=True, stop_criterion=True): """Infer inter-node coupling weights using a mean field approximation. From the paper: "Exact mean field (eMF) is another mean field @@ -137,17 +132,10 @@ def integrand(H): else: W = np.dot(A_inv, B) - # threshold the network - W_thresh = threshold(W, threshold_type, **kwargs) - - # construct the network - - self.results['graph'] = create_graph(W_thresh) + self.update_matrix(W) self.results['weights_matrix'] = W - self.results['thresholded_matrix'] = W_thresh - G = self.results['graph'] - return G + return self def cross_cov(a, b): diff --git a/netrd/reconstruction/mutual_information_matrix.py b/netrd/reconstruction/mutual_information_matrix.py index 6a3b4001..d097dd82 100644 --- a/netrd/reconstruction/mutual_information_matrix.py +++ b/netrd/reconstruction/mutual_information_matrix.py @@ -14,14 +14,12 @@ from .base import BaseReconstructor import numpy as np -import networkx as nx -from ..utilities import create_graph, threshold class MutualInformationMatrix(BaseReconstructor): """Uses the mutual information between nodes.""" - def fit(self, TS, nbins=10, threshold_type='degree', **kwargs): + def fit(self, TS, nbins=10): """Calculates the mutual information between the probability distributions of the (binned) values of the time series of pairs of nodes. 
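+ + A sketch of the updated usage, with an illustrative target degree: + + >>> G = MutualInformationMatrix().fit(TS, nbins=10).threshold('degree', avg_k=4).to_graph()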
@@ -74,16 +72,8 @@ def fit(self, TS, nbins=10, threshold_type='degree', **kwargs): I = mutual_info_all_pairs(JointP, ProduP, N) self.results['weights_matrix'] = I - # the adjacency matrix is the binarized thresholded mutual information matrix - # tau=threshold_from_degree(deg,I) - # A = np.array(I>tau, dtype=int) - A = threshold(I, threshold_type, **kwargs) - self.results['thresholded_matrix'] = A - - G = create_graph(A) - self.results['graph'] = G - - return G + self.update_matrix(I) + return self def find_individual_probability_distribution(TS, rang, nbins): @@ -230,26 +220,3 @@ def mutual_info_all_pairs(JointP, ProduP, N): I[l, j] = I[j, l] # this method is symmetric return I - - -def threshold_from_degree(deg, M): - """ - Compute the required threshold (tau) in order to yield a reconstructed graph of mean degree deg. - Parameters - ---------- - deg (int): Target degree for which the appropriate threshold will be computed - M (np.ndarray): Pre-thresholded NxN array - Returns - ------ - tau (float): Required threshold for A=np.array(I>tau, dtype=int) diff --git a/netrd/reconstruction/naive_transfer_entropy.py b/netrd/reconstruction/naive_transfer_entropy.py --- a/netrd/reconstruction/naive_transfer_entropy.py +++ b/netrd/reconstruction/naive_transfer_entropy.py The resulting network is asymmetric, and each element @@ -98,16 +97,8 @@ TE[i, j] = np.mean(te_list) self.results['weights_matrix'] = TE - - # threshold the network - TE_thresh = threshold(TE, threshold_type, **kwargs) - self.results['thresholded_matrix'] = TE_thresh - - # construct the network - self.results['graph'] = create_graph(TE_thresh) - G = self.results['graph'] - - return G + self.update_matrix(TE) + return self def transfer_entropy(X, Y, delay): diff --git a/netrd/reconstruction/optimal_causation_entropy.py b/netrd/reconstruction/optimal_causation_entropy.py index 8afbafd7..c65da360 100644 --- a/netrd/reconstruction/optimal_causation_entropy.py +++ b/netrd/reconstruction/optimal_causation_entropy.py @@ -13,7 +13,6 @@ from netrd.utilities.entropy import categorized_data, conditional_entropy import networkx as nx import numpy as np -from ..utilities import create_graph class OptimalCausationEntropy(BaseReconstructor): @@ -112,12 +111,11 @@ def fit(self, TS, n_bins=40, atol=1e-6, **kwargs): # Build the reconstructed graph A = nx.to_numpy_array(nx.DiGraph(adjlist).reverse()) - G = create_graph(A, create_using=nx.DiGraph(), remove_self_loops=False) + self.update_matrix(A) self.results['adjacency_matrix'] = A - self.results['graph'] = G self.results['parents'] = adjlist - return G + return self def causal_superset(nodes_I, data, atol): diff --git a/netrd/reconstruction/ou_inference.py b/netrd/reconstruction/ou_inference.py index ea742b91..07b33205 100644 --- a/netrd/reconstruction/ou_inference.py +++ b/netrd/reconstruction/ou_inference.py @@ -14,16 +14,14 @@ """ from .base import BaseReconstructor -import networkx as nx import numpy as np -from scipy.linalg import eig, inv -from ..utilities import create_graph, threshold +from scipy.linalg import eig class OUInference(BaseReconstructor): """Assumes an Ornstein-Uhlenbeck generative model.""" - def fit(self, TS, threshold_type='range', **kwargs): + def fit(self, TS): """Infers the coupling coefficients assuming an Ornstein-Uhlenbeck process generative model.
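+ + A sketch of the updated calling convention: + + >>> R = OUInference().fit(TS) + >>> W = R.results['weights_matrix'] + >>> G = R.threshold_in_range(cutoffs=[(-np.inf, np.inf)]).to_graph()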
@@ -67,15 +65,8 @@ def fit(self, TS, threshold_type='range', **kwargs): self.results['weights_matrix'] = np.zeros([N, N]) self.results['weights_matrix'][index_pair] = weights - # threshold the network - W_thresh = threshold(self.results['weights_matrix'], threshold_type, **kwargs) - self.results['thresholded_matrix'] = W_thresh - - # construct the network - self.results['graph'] = create_graph(W_thresh) - G = self.results['graph'] - - return G + self.update_matrix(self.results['weights_matrix']) + return self def inverse_method(covariance, temperatures): diff --git a/netrd/reconstruction/partial_correlation_influence.py b/netrd/reconstruction/partial_correlation_influence.py index e25fbcf3..6f261d5f 100644 --- a/netrd/reconstruction/partial_correlation_influence.py +++ b/netrd/reconstruction/partial_correlation_influence.py @@ -20,13 +20,12 @@ from .base import BaseReconstructor import numpy as np from scipy import linalg -from ..utilities import create_graph, threshold class PartialCorrelationInfluence(BaseReconstructor): """Uses average effect from a sensor to all others.""" - def fit(self, TS, index=None, threshold_type='range', **kwargs): + def fit(self, TS, index=None): r"""Uses the average effect of a series :math:`Z` on the correlation between a series :math:`X` and all other series. @@ -85,6 +84,7 @@ def fit(self, TS, index=None, threshold_type='range', **kwargs): (2015). """ + data = TS.T N = data.shape[1] @@ -123,16 +123,8 @@ def fit(self, TS, index=None, threshold_type='range', **kwargs): self.results['weights_matrix'] = influence - # threshold the network - W_thresh = threshold(influence, threshold_type, **kwargs) - - # construct the network - self.results['graph'] = create_graph(W_thresh) - self.results['thresholded_matrix'] = W_thresh - - G = self.results['graph'] - - return G + self.update_matrix(influence) + return self def partial_corr(_vars, idx_vars): diff --git a/netrd/reconstruction/partial_correlation_matrix.py b/netrd/reconstruction/partial_correlation_matrix.py index 02b93ee3..587ab539 100644 --- a/netrd/reconstruction/partial_correlation_matrix.py +++ b/netrd/reconstruction/partial_correlation_matrix.py @@ -11,22 +11,14 @@ """ from .base import BaseReconstructor import numpy as np -import networkx as nx from scipy import stats, linalg -from ..utilities import create_graph, threshold class PartialCorrelationMatrix(BaseReconstructor): """Uses a regularized form of the precision matrix.""" def fit( - self, - TS, - index=None, - drop_index=True, - of_residuals=False, - threshold_type='range', - **kwargs + self, TS, index=None, drop_index=True, of_residuals=False, ): """Uses a regularized form of the precision matrix. 
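+ + A sketch mirroring the updated tests: + + >>> R = PartialCorrelationMatrix().fit(TS, of_residuals=True) + >>> G = R.threshold_in_range([(-np.inf, np.inf)]).to_graph()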
@@ -82,17 +74,9 @@ def fit( p_cor = partial_corr(p_cor, index=None) self.results['weights_matrix'] = p_cor + self.update_matrix(p_cor) - # threshold the network - W_thresh = threshold(p_cor, threshold_type, **kwargs) - - # construct the network - self.results['graph'] = create_graph(W_thresh) - self.results['thresholded_matrix'] = W_thresh - - G = self.results['graph'] - - return G + return self # This partial correlation function is adapted from Fabian Pedregosa-Izquierdo's diff --git a/netrd/reconstruction/random.py b/netrd/reconstruction/random.py index 14417938..613cf0f7 100644 --- a/netrd/reconstruction/random.py +++ b/netrd/reconstruction/random.py @@ -12,15 +12,13 @@ """ from .base import BaseReconstructor -import networkx as nx import numpy as np -from ..utilities import create_graph, threshold class RandomReconstructor(BaseReconstructor): """Returns a random graph (dummy class).""" - def fit(self, TS, threshold_type='range', **kwargs): + def fit(self, TS): """Return a random correlation matrix with a threshold. The results dictionary also stores the weight matrix as @@ -47,9 +45,7 @@ def fit(self, TS, threshold_type='range', **kwargs): """ N, L = TS.shape W = np.random.rand(N, N) - A = threshold(W, threshold_type, **kwargs) - G = create_graph(A) - self.results['graph'] = G + self.results['weights_matrix'] = W - self.results['thresholded_matrix'] = A - return G + self.update_matrix(W) + return self diff --git a/netrd/reconstruction/thouless_anderson_palmer.py b/netrd/reconstruction/thouless_anderson_palmer.py index d3d5c411..4f3f3512 100644 --- a/netrd/reconstruction/thouless_anderson_palmer.py +++ b/netrd/reconstruction/thouless_anderson_palmer.py @@ -9,16 +9,13 @@ """ from .base import BaseReconstructor import numpy as np -import networkx as nx -import scipy as sp from scipy import linalg -from ..utilities import create_graph, threshold class ThoulessAndersonPalmer(BaseReconstructor): """Uses Thouless-Anderson-Palmer mean field approximation.""" - def fit(self, TS, threshold_type='range', **kwargs): + def fit(self, TS): """Infer inter-node coupling weights using a Thouless-Anderson-Palmer mean field approximation. @@ -111,16 +108,8 @@ def fit(self, TS, threshold_type='range', **kwargs): # predict W: W = np.dot(A_TAP_inv, B) self.results['weights_matrix'] = W - - # threshold the network - W_thresh = threshold(W, threshold_type, **kwargs) - self.results['thresholded_matrix'] = W_thresh - - # construct the network - self.results['graph'] = create_graph(W_thresh) - G = self.results['graph'] - - return G + self.update_matrix(W) + return self def cross_cov(a, b): diff --git a/netrd/utilities/__init__.py b/netrd/utilities/__init__.py index 0b40a40b..4e52cf9a 100644 --- a/netrd/utilities/__init__.py +++ b/netrd/utilities/__init__.py @@ -5,7 +5,6 @@ Common utilities for use within ``netrd``. """ -from .threshold import threshold from .graph import * from .read import * from .cluster import * diff --git a/netrd/utilities/threshold.py b/netrd/utilities/threshold.py deleted file mode 100644 index 2959ec29..00000000 --- a/netrd/utilities/threshold.py +++ /dev/null @@ -1,187 +0,0 @@ -""" -threshold.py ------------- - -Utilities for thresholding matrices based on different criteria - -author: Stefan McCabe (stefanmccabe at gmail dot com) - -Submitted as part of the 2019 NetSI Collabathon. - -""" -import numpy as np -import warnings - - -def threshold_in_range(mat, **kwargs): - r"""Threshold by setting values not within a list of ranges to zero. 
- - Parameters - ---------- - mat (np.ndarray) - A numpy array. - - cutoffs (list of tuples) - When thresholding, include only edges whose correlations fall - within a given range or set of ranges. The lower value must come - first in each tuple. For example, to keep those values whose - absolute value is between :math:`0.5` and :math:`1`, pass - ``cutoffs=[(-1, -0.5), (0.5, 1)]``. - - Returns - ------- - thresholded_mat (np.ndarray) - the thresholded numpy array - - """ - if 'cutoffs' in kwargs: - cutoffs = kwargs['cutoffs'] - else: - warnings.warn( - "Setting 'cutoffs' argument is strongly encouraged. Using cutoff range of (-1, 1).", - RuntimeWarning, - ) - cutoffs = [(-1, 1)] - - mask_function = np.vectorize( - lambda x: any([x >= cutoff[0] and x <= cutoff[1] for cutoff in cutoffs]) - ) - mask = mask_function(mat) - - thresholded_mat = mat * mask - - if kwargs.get('binary', False): - thresholded_mat = np.abs(np.sign(thresholded_mat)) - - if kwargs.get('remove_self_loops', True): - np.fill_diagonal(thresholded_mat, 0) - - return thresholded_mat - - -def threshold_on_quantile(mat, **kwargs): - """Threshold by setting values below a given quantile to zero. - - Parameters - ---------- - - mat (np.ndarray) - A numpy array. - - quantile (float) - The threshold above which to keep an element of the array, e.g., - set to zero elements below the 90th quantile of the array. - - Returns - ------- - thresholded_mat - the thresholded numpy array - - """ - if 'quantile' in kwargs: - quantile = kwargs['quantile'] - else: - warnings.warn( - "Setting 'quantile' argument is strongly recommended. Using target quantile of 0.9 for thresholding.", - RuntimeWarning, - ) - quantile = 0.9 - - if kwargs.get('remove_self_loops', True): - np.fill_diagonal(mat, 0) - - if quantile != 0: - thresholded_mat = mat * (mat > np.percentile(mat, quantile * 100)) - else: - thresholded_mat = mat - - if kwargs.get('binary', False): - thresholded_mat = np.abs(np.sign(thresholded_mat)) - - return thresholded_mat - - -def threshold_on_degree(mat, **kwargs): - """Threshold by setting values below a given quantile to zero. - - Parameters - ---------- - - mat (np.ndarray) - A numpy array. - - avg_k (float) - The average degree to target when thresholding the matrix. - - Returns - ------- - thresholded_mat - the thresholded numpy array - - """ - - if 'avg_k' in kwargs: - avg_k = kwargs['avg_k'] - else: - warnings.warn( - "Setting 'avg_k' argument is strongly encouraged. Using average " - "degree of 1 for thresholding.", - RuntimeWarning, - ) - avg_k = 1 - - n = len(mat) - A = np.ones((n, n)) - - if kwargs.get('remove_self_loops', True): - np.fill_diagonal(A, 0) - np.fill_diagonal(mat, 0) - - if np.mean(np.sum(A, 1)) <= avg_k: - # degenerate case: threshold the whole matrix - thresholded_mat = mat - else: - for m in sorted(mat.flatten()): - A[mat == m] = 0 - if np.mean(np.sum(A, 1)) <= avg_k: - break - thresholded_mat = mat * (mat > m) - - if kwargs.get('binary', False): - thresholded_mat = np.abs(np.sign(thresholded_mat)) - - return thresholded_mat - - -def threshold(mat, rule, **kwargs): - """A flexible interface to other thresholding functions. - - Parameters - ---------- - - mat (np.ndarray) - A numpy array. - - rule (str) - A string indicating which thresholding function to invoke. - - kwargs (dict) - Named arguments to pass to the underlying threshold function. 
- - Returns - ------- - thresholded_mat - the thresholded numpy array - - """ - try: - if rule == 'degree': - return threshold_on_degree(mat, **kwargs) - elif rule == 'range': - return threshold_in_range(mat, **kwargs) - elif rule == 'quantile': - return threshold_on_quantile(mat, **kwargs) - elif rule == 'custom': - return kwargs['custom_thresholder'](mat) - except KeyError: - raise ValueError("missing threshold parameter") diff --git a/tests/test_reconstruction.py b/tests/test_reconstruction.py index a6b040fd..3c28ae2d 100644 --- a/tests/test_reconstruction.py +++ b/tests/test_reconstruction.py @@ -27,7 +27,7 @@ def test_graph_size(): continue if isinstance(obj, type) and BaseReconstructor in obj.__bases__: TS = np.random.random((size, 125)) - G = obj().fit(TS, threshold_type='range', cutoffs=[(-np.inf, np.inf)]) + G = obj().fit(TS).threshold_in_range([(-np.inf, np.inf)]).to_graph() assert G.order() == size @@ -39,9 +39,8 @@ def test_naive_transfer_entropy(): """ size = 25 TS = np.random.random((size, 100)) - G = reconstruction.NaiveTransferEntropy().fit( - TS, delay_max=2, threshold_type='range', cutoffs=[(-np.inf, np.inf)] - ) + R = reconstruction.NaiveTransferEntropy() + G = R.fit(TS).threshold_in_range([(-np.inf, np.inf)]).to_graph() assert G.order() == size @@ -52,9 +51,8 @@ def test_oce(): size = 25 TS = np.random.random((size, 50)) - G = reconstruction.OptimalCausationEntropy().fit( - TS, threshold_type='range', cutoffs=[(-np.inf, np.inf)] - ) + R = reconstruction.OptimalCausationEntropy() + G = R.fit(TS).threshold_in_range([(-np.inf, np.inf)]).to_graph() assert G.order() == size @@ -66,14 +64,20 @@ def test_convergent_cross_mapping(): """ filepath = '../data/two_species_coupled_time_series.dat' edgelist = {(1, 0), (0, 1)} - keys = ['graph', 'weights_matrix', 'pvalues_matrix'] + keys = ['weights_matrix', 'pvalues_matrix'] TS = np.loadtxt(filepath, delimiter=',') recon = ConvergentCrossMapping() - G = recon.fit(TS, threshold_type='range', cutoffs=[(-np.inf, np.inf)]) + G = ( + recon.fit(TS) + .remove_self_loops() + .threshold_in_range(cutoffs=[(-np.inf, np.inf)]) + .to_graph() + ) el = set(G.edges()) res = recon.results.keys() + assert G.is_directed() assert el == edgelist assert all(k in res for k in keys) @@ -91,10 +95,69 @@ def test_partial_correlation(): pass # this shouldn't be a valid parameterization else: TS = np.random.random((size, 50)) - G = reconstruction.PartialCorrelationMatrix().fit( - TS, index=index, cutoffs=[(-np.inf, np.inf)] - ) + R = reconstruction.PartialCorrelationMatrix() + R = R.fit(TS, index=index, of_residuals=resid) + G = R.threshold_in_range([(-np.inf, np.inf)]).to_graph() if index is None: assert G.order() == size else: assert G.order() == (size - 1) + + +def test_thresholds(): + """ + Test the threshold function by testing three underlying thresholding + methods: range, quantile, and degree. 
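+ + The fixture matrix is ``np.arange(1, 17).reshape((4, 4))``, so expected counts can be checked by hand: the degree rule with ``avg_k=k`` keeps the ``4 * k`` largest entries, and the quantile rule with ``quantile=n/16`` keeps ``16 - n`` entries.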
+ """ + + R = reconstruction.BaseReconstructor() + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + + for k in range(5): + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + thresholded_mat = R.threshold('degree', avg_k=k).to_dense() + assert (thresholded_mat != 0).sum() == 4 * k + + for n in range(17): + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + thresholded_mat = R.threshold('quantile', quantile=n / 16).to_dense() + assert (thresholded_mat != 0).sum() == 16 - n + + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + thresholded_mat = R.threshold('range', cutoffs=[(0, np.inf)]).to_dense() + assert (thresholded_mat >= 0).all() + + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + thresholded_mat = R.threshold('range', cutoffs=[(-np.inf, 0)]).to_dense() + assert (thresholded_mat <= 0).all() + + target_mat = np.array( + [[0, 0, 0, 0], [0, 0, 0, 0], [9, 10, 11, 12], [13, 14, 15, 16]] + ) + + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + assert np.array_equal( + R.threshold('range', cutoffs=[(9, 16)]).to_dense(), target_mat + ) + + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + assert np.array_equal(R.threshold('degree', avg_k=2).to_dense(), target_mat) + + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + assert np.array_equal(R.threshold('quantile', quantile=0.5).to_dense(), target_mat) + + target_mat = np.array([[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]]) + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + assert np.array_equal( + R.threshold('range', cutoffs=[(9, 16)]).binarize().to_dense(), target_mat, + ) + + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + assert np.array_equal( + R.threshold('degree', avg_k=2, binary=True).binarize().to_dense(), target_mat, + ) + + R._matrix = np.arange(1, 17, 1).reshape((4, 4)) + assert np.array_equal( + R.threshold('quantile', quantile=0.5).binarize().to_dense(), target_mat, + ) diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 5b2ab791..9839b628 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -9,68 +9,6 @@ import numpy as np from netrd.utilities.entropy import categorized_data from netrd.utilities.entropy import entropy_from_seq, joint_entropy, conditional_entropy -from netrd.utilities import threshold - - -def test_thresholds(): - """ - Test the threshold function by testing three underlying thresholding - methods: range, quantile, and degree. 
- """ - - mat = np.arange(1, 17, 1).reshape((4, 4)) - - for k in range(5): - thresholded_mat = threshold(mat, 'degree', avg_k=k, remove_self_loops=False) - assert (thresholded_mat != 0).sum() == 4 * k - - for n in range(17): - thresholded_mat = threshold( - mat, 'quantile', quantile=n / 16, remove_self_loops=False - ) - print(n) - assert (thresholded_mat != 0).sum() == 16 - n - - thresholded_mat = threshold( - mat, 'range', cutoffs=[(0, np.inf)], remove_self_loops=False - ) - assert (thresholded_mat >= 0).all() - - thresholded_mat = threshold( - mat, 'range', cutoffs=[(-np.inf, 0)], remove_self_loops=False - ) - assert (thresholded_mat <= 0).all() - - target_mat = np.array( - [[0, 0, 0, 0], [0, 0, 0, 0], [9, 10, 11, 12], [13, 14, 15, 16]] - ) - - assert np.array_equal( - threshold(mat, 'range', cutoffs=[(9, 16)], remove_self_loops=False), target_mat - ) - assert np.array_equal( - threshold(mat, 'degree', avg_k=2, remove_self_loops=False), target_mat - ) - assert np.array_equal( - threshold(mat, 'quantile', quantile=0.5, remove_self_loops=False), target_mat - ) - - target_mat = np.array([[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]]) - - assert np.array_equal( - threshold( - mat, 'range', cutoffs=[(9, 16)], binary=True, remove_self_loops=False - ), - target_mat, - ) - assert np.array_equal( - threshold(mat, 'degree', avg_k=2, binary=True, remove_self_loops=False), - target_mat, - ) - assert np.array_equal( - threshold(mat, 'quantile', quantile=0.5, binary=True, remove_self_loops=False), - target_mat, - ) def test_categorized_data():