Source code for DiverseSelector.diversity

# -*- coding: utf-8 -*-
# The DiverseSelector library provides a set of tools to select molecule
# subset with maximum molecular diversity.
#
# Copyright (C) 2022 The QC-Devs Community
#
# This file is part of DiverseSelector.
#
# DiverseSelector is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# DiverseSelector is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --

"""Molecule dataset diversity calculation module."""

import warnings
from typing import List

import numpy as np
from scipy.spatial.distance import euclidean

__all__ = [
    "compute_diversity",
    "entropy",
    "logdet",
    "shannon_entropy",
    "wdud",
    "hypersphere_overlap_of_subset",
    "gini_coefficient",
]


[docs]def compute_diversity( features: np.array, div_type: str = "hypersphere_overlap_of_subset", ) -> float: """Compute diversity metrics. Parameters ---------- features : np.ndarray Feature matrix. div_type : str, optional Method of calculation diversity for a given molecule set, which includes "entropy", "logdet", "shannon_entropy", "wdud", gini_coefficient" and "hypersphere_overlap_of_subset". Default is "hypersphere_overlap_of_subset". mols : List[rdkit.Chem.rdchem.Mol], optional List of RDKit molecule objects. This is only needed when using the "explicit_diversity_index" method. Default=None. Returns ------- float, computed diversity. """ func_dict = { "entropy": entropy, "logdet": logdet, "shannon_entropy": shannon_entropy, "wdud": wdud, "hypersphere_overlap_of_subset": hypersphere_overlap_of_subset, "gini_coefficient": gini_coefficient, } if div_type in func_dict: return func_dict[div_type](features) else: raise ValueError(f"Diversity type {div_type} not supported.")
[docs]def entropy(x: np.ndarray) -> float: r"""Compute entropy of matrix. The equation for entropy is .. math:: E = $-\frac{\sum{\frac{y_i}{N}\ln{\frac{y_i}{N}}}}{L\frac{\ln{2}}{2}}$ where N is the number of molecules in the set, L is the length of the fingerprint, and :math:y_i is a vector of the bitcounts of each feature in the fingerprints. Parameters ---------- x : ndarray Feature matrix. Returns ------- e : float Entropy of matrix. Notes ----- Feature matrices are converted to bits, so we lose any information associated with num in matrix. Weidlich, I. E., and Filippov, I. V. (2016) Using the Gini coefficient to measure the chemical diversity of small-molecule libraries. Journal of Computational Chemistry 37, 2091-2097. """ # convert input matrix to bit matrix y = np.empty(x.shape) for i in range(0, len(x)): for j in range(0, len(x[0])): if x[i, j] != 0: y[i, j] = 1 else: y[i, j] = 0 # initialize variables length = len(x[0]) n = len(x) top = 0 val = [] # count bits in fingerprint for i in range(0, length): val.append(sum(y[:, i])) ans = np.sort(val) # sum entropy calculation for each feature for i in range(0, length): if ans[i] == 0: raise ValueError if ans[i] != 0: top += ((ans[i]) / n) * (np.log(ans[i] / n)) e = -1 * (top / (length * 0.34657359027997264)) return e
[docs]def logdet(x: np.ndarray) -> float: r"""Computes the log determinant function. Input is an :math:S\times :math:n feature matrix with :math:S molecules and :math:n features. .. math: F_{logdet}\left(S\right) = \log{\det{\left(X[S]X[S]^T + I_{|S|} \right)}} Parameters ---------- x : ndarray(S, n) Subset feature matrix. Returns ------- f_logdet: float The volume of parallelotope spand by the matrix. Notes ----- Nakamura, T., Sakaue, S., Fujii, K., Harabuchi, Y., Maeda, S., and Iwata, S.. (2022) Selecting molecules with diverse structures and properties by maximizing submodular functions of descriptors learned with graph neural networks. Scientific Reports 12. """ mid = np.dot(x, np.transpose(x)) f_logdet = np.log10(np.linalg.det(mid + np.identity(len(x)))) return f_logdet
[docs]def shannon_entropy(x: np.ndarray) -> float: r"""Computes the shannon entropy of a matrix. The equation for Shannon entropy is .. math:: H(X) = \sum_{i=1}^{n}-P_i(X)\log{P_i(X)} where X is the feature matrix, n is the number of features, and :math:`P_i(X)` is the proportion of molecules that have feature :math:i in :math:X. Parameters ---------- x : ndarray Bit-string matrix. Returns ------- h_x: float The shannon entropy of the matrix. Notes ----- Leguy, J., Glavatskikh, M., Cauchy, T., and Benoit. (2021) Scalable estimator of the diversity for de novo molecular generation resulting in a more robust QM dataset (OD9) and a more efficient molecular optimization. Journal of Cheminformatics 13. """ num_feat = len(x[0, :]) num_mols = len(x[:, 0]) h_x = 0 for i in range(0, num_feat): # calculate feature proportion p_i = np.count_nonzero(x[:, i]) / num_mols # sum all non-zero terms if p_i == 0: raise ValueError( f"Feature {i} has value 0 for all molecules." "Remove extraneous feature from data set." ) h_x += (-1 * p_i) * np.log10(p_i) return h_x
[docs]def wdud(x: np.ndarray) -> float: r"""Computes the Wasserstein Distance to Uniform Distribution(WDUD). The equation for the Wasserstein Distance for a single feature to uniform distribution is .. math:: WDUD(x) = \int_{0}^{1} |U(x) - V(x)|dx where the feature is normalized to [0, 1], :math:`U(x)=x` is the cumulative distribution of the uniform distribution on [0, 1], and :math:`V(x) = \sum_{y <= x}1 / N` is the discrete distribution of the values of the feature in :math:`x`, where :math:`y` is the ith feature. This integral is calculated iteratively between :math:y_i and :math:y_{i+1}, using trapezoidal method. Parameters ---------- x : ndarray(N, K) Feature array of N molecules and K features. Returns ------- float: The mean of the WDUD of each feature over all molecules. Notes ----- Nakamura, T., Sakaue, S., Fujii, K., Harabuchi, Y., Maeda, S., and Iwata, S.. (2022) Selecting molecules with diverse structures and properties by maximizing submodular functions of descriptors learned with graph neural networks. Scientific Reports 12. """ if x.ndim != 2: raise ValueError(f"The number of dimensions {x.ndim} should be two.") # min_max normalization: num_features = len(x[0]) num_mols = len(x[:, 0]) # Find the maximum and minimum over each feature across all molecules. max_x = np.max(x, axis=0) min_x = np.min(x, axis=0) # Normalization of each feature to [0, 1] if np.any(np.abs(max_x - min_x) < 1e-30): raise ValueError( f"One of the features is redundant and causes normalization to fail." ) x_norm = (x - min_x) / (max_x - min_x) ans = [] # store the Wasserstein distance for each feature for i in range(0, num_features): wdu = 0.0 y = np.sort(x_norm[:, i]) # Round to the sixth decimal place and count number of unique elements # to construct an accurate cumulative discrete distribution func \sum_{x <= y_{i + 1}} 1/k y, counts = np.unique(np.round(x_norm[:, i], decimals=6), return_counts=True) p = 0 # Ignore 0 and because v_min= 0 for j in range(1, len(counts)): # integral from y_{i - 1} to y_{i} of |x - \sum_{x <= y_{i}} 1/k| dx yi1 = y[j - 1] yi = y[j] # Make a grid from yi1 to yi grid = np.linspace(yi1, yi, num=1000, endpoint=True) # Evaluate the integrand |x - \sum_{x <= y_{i + 1}} 1/k| p += counts[j - 1] integrand = np.abs(grid - p / num_mols) # Integrate using np.trapz wdu += np.trapz(y=integrand, x=grid) ans.append(wdu) return np.average(ans)
[docs]def hypersphere_overlap_of_subset(lib: np.ndarray, x: np.array) -> float: r"""Computes the overlap of subset with hyper-spheres around each point The edge penalty is also included, which disregards areas outside of the boundary of the full feature space/library. This is calculated as: .. math:: g(S) = \sum_{i < j}^k O(i, j) + \sum^k_m E(m), where :math:`i, j` is over the subset of molecules, :math:`O(i, j)` is the approximate overlap between hyper-spheres, :math:`k` is the number of features and :math:`E` is the edge penalty of a molecule. Parameters ---------- lib : ndarray Feature matrix of all molecules. x : ndarray Feature matrix of selected subset of molecules. Returns ------- g_s: float The total diversity volume of the matrix. Notes ----- Agrafiotis, D. K.. (1997) Stochastic Algorithms for Maximizing Molecular Diversity. Journal of Chemical Information and Computer Sciences 37, 841-851. """ d = len(x[0]) k = len(x[:, 0]) # Find the maximum and minimum over each feature across all molecules. max_x = np.max(lib, axis=0) min_x = np.min(lib, axis=0) # Normalization of each feature to [0, 1] if np.any(np.abs(max_x - min_x) < 1e-30): raise ValueError( f"One of the features is redundant and causes normalization to fail." ) x_norm = (x - min_x) / (max_x - min_x) # r_o = hypersphere radius r_o = d * np.sqrt(1 / k) if r_o > 0.5: warnings.warn( f"The number of molecules should be much larger" " than the number of features." ) g_s = 0 edge = 0 # lambda parameter controls edge penalty lam = (d - 1.0) / d # calculate overlap volume for i in range(0, (k - 1)): for j in range((i + 1), k): dist = np.linalg.norm(x_norm[i] - x_norm[j]) # Overlap penalty if dist <= (2 * r_o): with np.errstate(divide="ignore"): # min(100) ignores the inf case with divide by zero g_s += min(100, 2 * (r_o / dist) - 1) # Edge penalty: lambda (1 - \sum^d_j e_{ij} / (dr_0) edge_pen = 0.0 for j_dim in range(0, d): # calculate dist to closest boundary in jth coordinate, # with max value = 1, min value = 0 dist_max = np.abs(1 - x_norm[i, j_dim]) dist_min = x_norm[i, j_dim] dist = min(dist_min, dist_max) # truncate distance at r_o if dist > r_o: dist = r_o edge_pen += dist edge_pen /= d * r_o # print("Should be positive value only", (1.0 - edge_pen)) edge_pen = lam * (1.0 - edge_pen) edge += edge_pen g_s += edge return g_s
[docs]def gini_coefficient(a: np.ndarray): r""" Gini coefficient of bit-wise fingerprints of a database of molecules. Measures the chemical diversity of a database of molecules defined by the following formula: .. math:: G = \frac{2 \sum_{i=1}^L i ||y_i||_1 }{N \sum_{i=1}^L ||y_i||_1} - \frac{L+1}{L}, where :math:`y_i \in \{0, 1\}^N` is a vector of zero and ones of length the number of molecules :math:`N` of the `i`th feature, and :math:`L` is the feature length. Parameters ---------- a : ndarray(N, L) Molecule features in L bits with N molecules. Returns ------- float : Gini coefficient between zero and one, where closer to zero indicates more diversity. References ---------- .. [1] Weidlich, Iwona E., and Igor V. Filippov. "Using the gini coefficient to measure the chemical diversity of small‐molecule libraries." (2016): 2091-2097. """ # Check that `a` is a bit-wise fingerprint. if np.any(np.abs(np.sort(np.unique(a)) - np.array([0, 1])) > 1e-8): raise ValueError("Attribute `a` should have binary values.") if a.ndim != 2: raise ValueError( f"Attribute `a` should have dimension two rather than {a.ndim}." ) numb_features = a.shape[1] # Take the bit-count of each column/molecule. bit_count = np.sum(a, axis=0) # Sort the bit-count since Gini coefficients relies on cumulative distribution. bit_count = np.sort(bit_count) # Mean of denominator denominator = numb_features * np.sum(bit_count) numerator = np.sum(np.arange(1, numb_features + 1) * bit_count) return 2.0 * numerator / denominator - (numb_features + 1) / numb_features