# -*- coding: utf-8 -*-
# The DiverseSelector library provides a set of tools to select molecule
# subset with maximum molecular diversity.
#
# Copyright (C) 2022 The QC-Devs Community
#
# This file is part of DiverseSelector.
#
# DiverseSelector is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# DiverseSelector is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>
#
# --
"""Molecule dataset diversity calculation module."""
import warnings
from typing import List
import numpy as np
from scipy.spatial.distance import euclidean
__all__ = [
"compute_diversity",
"entropy",
"logdet",
"shannon_entropy",
"wdud",
"hypersphere_overlap_of_subset",
"gini_coefficient",
]
[docs]def compute_diversity(
features: np.array,
div_type: str = "hypersphere_overlap_of_subset",
) -> float:
"""Compute diversity metrics.
Parameters
----------
features : np.ndarray
Feature matrix.
div_type : str, optional
Method of calculation diversity for a given molecule set, which
includes "entropy", "logdet", "shannon_entropy", "wdud",
gini_coefficient" and "hypersphere_overlap_of_subset".
Default is "hypersphere_overlap_of_subset".
mols : List[rdkit.Chem.rdchem.Mol], optional
List of RDKit molecule objects. This is only needed when using the
"explicit_diversity_index" method. Default=None.
Returns
-------
float, computed diversity.
"""
func_dict = {
"entropy": entropy,
"logdet": logdet,
"shannon_entropy": shannon_entropy,
"wdud": wdud,
"hypersphere_overlap_of_subset": hypersphere_overlap_of_subset,
"gini_coefficient": gini_coefficient,
}
if div_type in func_dict:
return func_dict[div_type](features)
else:
raise ValueError(f"Diversity type {div_type} not supported.")
[docs]def entropy(x: np.ndarray) -> float:
r"""Compute entropy of matrix.
The equation for entropy is
.. math::
E = $-\frac{\sum{\frac{y_i}{N}\ln{\frac{y_i}{N}}}}{L\frac{\ln{2}}{2}}$
where N is the number of molecules in the set, L is the length of the fingerprint,
and :math:y_i is a vector of the bitcounts of each feature in the fingerprints.
Parameters
----------
x : ndarray
Feature matrix.
Returns
-------
e : float
Entropy of matrix.
Notes
-----
Feature matrices are converted to bits,
so we lose any information associated with num in matrix.
Weidlich, I. E., and Filippov, I. V. (2016)
Using the Gini coefficient to measure the chemical diversity of small-molecule libraries.
Journal of Computational Chemistry 37, 2091-2097.
"""
# convert input matrix to bit matrix
y = np.empty(x.shape)
for i in range(0, len(x)):
for j in range(0, len(x[0])):
if x[i, j] != 0:
y[i, j] = 1
else:
y[i, j] = 0
# initialize variables
length = len(x[0])
n = len(x)
top = 0
val = []
# count bits in fingerprint
for i in range(0, length):
val.append(sum(y[:, i]))
ans = np.sort(val)
# sum entropy calculation for each feature
for i in range(0, length):
if ans[i] == 0:
raise ValueError
if ans[i] != 0:
top += ((ans[i]) / n) * (np.log(ans[i] / n))
e = -1 * (top / (length * 0.34657359027997264))
return e
[docs]def logdet(x: np.ndarray) -> float:
r"""Computes the log determinant function.
Input is an :math:S\times :math:n feature matrix with
:math:S molecules and :math:n features.
.. math:
F_{logdet}\left(S\right) = \log{\det{\left(X[S]X[S]^T + I_{|S|} \right)}}
Parameters
----------
x : ndarray(S, n)
Subset feature matrix.
Returns
-------
f_logdet: float
The volume of parallelotope spand by the matrix.
Notes
-----
Nakamura, T., Sakaue, S., Fujii, K., Harabuchi, Y., Maeda, S., and Iwata, S.. (2022)
Selecting molecules with diverse structures and properties by maximizing
submodular functions of descriptors learned with graph neural networks.
Scientific Reports 12.
"""
mid = np.dot(x, np.transpose(x))
f_logdet = np.log10(np.linalg.det(mid + np.identity(len(x))))
return f_logdet
[docs]def shannon_entropy(x: np.ndarray) -> float:
r"""Computes the shannon entropy of a matrix.
The equation for Shannon entropy is
.. math::
H(X) = \sum_{i=1}^{n}-P_i(X)\log{P_i(X)}
where X is the feature matrix, n is the number of features, and :math:`P_i(X)` is the
proportion of molecules that have feature :math:i in :math:X.
Parameters
----------
x : ndarray
Bit-string matrix.
Returns
-------
h_x: float
The shannon entropy of the matrix.
Notes
-----
Leguy, J., Glavatskikh, M., Cauchy, T., and Benoit. (2021)
Scalable estimator of the diversity for de novo molecular generation resulting
in a more robust QM dataset (OD9) and a more efficient molecular optimization.
Journal of Cheminformatics 13.
"""
num_feat = len(x[0, :])
num_mols = len(x[:, 0])
h_x = 0
for i in range(0, num_feat):
# calculate feature proportion
p_i = np.count_nonzero(x[:, i]) / num_mols
# sum all non-zero terms
if p_i == 0:
raise ValueError(
f"Feature {i} has value 0 for all molecules."
"Remove extraneous feature from data set."
)
h_x += (-1 * p_i) * np.log10(p_i)
return h_x
[docs]def wdud(x: np.ndarray) -> float:
r"""Computes the Wasserstein Distance to Uniform Distribution(WDUD).
The equation for the Wasserstein Distance for a single feature to uniform distribution is
.. math::
WDUD(x) = \int_{0}^{1} |U(x) - V(x)|dx
where the feature is normalized to [0, 1], :math:`U(x)=x` is the cumulative distribution
of the uniform distribution on [0, 1], and :math:`V(x) = \sum_{y <= x}1 / N` is the discrete
distribution of the values of the feature in :math:`x`, where :math:`y` is the ith feature. This
integral is calculated iteratively between :math:y_i and :math:y_{i+1}, using trapezoidal method.
Parameters
----------
x : ndarray(N, K)
Feature array of N molecules and K features.
Returns
-------
float:
The mean of the WDUD of each feature over all molecules.
Notes
-----
Nakamura, T., Sakaue, S., Fujii, K., Harabuchi, Y., Maeda, S., and Iwata, S.. (2022)
Selecting molecules with diverse structures and properties by maximizing
submodular functions of descriptors learned with graph neural networks.
Scientific Reports 12.
"""
if x.ndim != 2:
raise ValueError(f"The number of dimensions {x.ndim} should be two.")
# min_max normalization:
num_features = len(x[0])
num_mols = len(x[:, 0])
# Find the maximum and minimum over each feature across all molecules.
max_x = np.max(x, axis=0)
min_x = np.min(x, axis=0)
# Normalization of each feature to [0, 1]
if np.any(np.abs(max_x - min_x) < 1e-30):
raise ValueError(
f"One of the features is redundant and causes normalization to fail."
)
x_norm = (x - min_x) / (max_x - min_x)
ans = [] # store the Wasserstein distance for each feature
for i in range(0, num_features):
wdu = 0.0
y = np.sort(x_norm[:, i])
# Round to the sixth decimal place and count number of unique elements
# to construct an accurate cumulative discrete distribution func \sum_{x <= y_{i + 1}} 1/k
y, counts = np.unique(np.round(x_norm[:, i], decimals=6), return_counts=True)
p = 0
# Ignore 0 and because v_min= 0
for j in range(1, len(counts)):
# integral from y_{i - 1} to y_{i} of |x - \sum_{x <= y_{i}} 1/k| dx
yi1 = y[j - 1]
yi = y[j]
# Make a grid from yi1 to yi
grid = np.linspace(yi1, yi, num=1000, endpoint=True)
# Evaluate the integrand |x - \sum_{x <= y_{i + 1}} 1/k|
p += counts[j - 1]
integrand = np.abs(grid - p / num_mols)
# Integrate using np.trapz
wdu += np.trapz(y=integrand, x=grid)
ans.append(wdu)
return np.average(ans)
[docs]def hypersphere_overlap_of_subset(lib: np.ndarray, x: np.array) -> float:
r"""Computes the overlap of subset with hyper-spheres around each point
The edge penalty is also included, which disregards areas
outside of the boundary of the full feature space/library.
This is calculated as:
.. math::
g(S) = \sum_{i < j}^k O(i, j) + \sum^k_m E(m),
where :math:`i, j` is over the subset of molecules,
:math:`O(i, j)` is the approximate overlap between hyper-spheres,
:math:`k` is the number of features and :math:`E`
is the edge penalty of a molecule.
Parameters
----------
lib : ndarray
Feature matrix of all molecules.
x : ndarray
Feature matrix of selected subset of molecules.
Returns
-------
g_s: float
The total diversity volume of the matrix.
Notes
-----
Agrafiotis, D. K.. (1997) Stochastic Algorithms for Maximizing Molecular Diversity.
Journal of Chemical Information and Computer Sciences 37, 841-851.
"""
d = len(x[0])
k = len(x[:, 0])
# Find the maximum and minimum over each feature across all molecules.
max_x = np.max(lib, axis=0)
min_x = np.min(lib, axis=0)
# Normalization of each feature to [0, 1]
if np.any(np.abs(max_x - min_x) < 1e-30):
raise ValueError(
f"One of the features is redundant and causes normalization to fail."
)
x_norm = (x - min_x) / (max_x - min_x)
# r_o = hypersphere radius
r_o = d * np.sqrt(1 / k)
if r_o > 0.5:
warnings.warn(
f"The number of molecules should be much larger"
" than the number of features."
)
g_s = 0
edge = 0
# lambda parameter controls edge penalty
lam = (d - 1.0) / d
# calculate overlap volume
for i in range(0, (k - 1)):
for j in range((i + 1), k):
dist = np.linalg.norm(x_norm[i] - x_norm[j])
# Overlap penalty
if dist <= (2 * r_o):
with np.errstate(divide="ignore"):
# min(100) ignores the inf case with divide by zero
g_s += min(100, 2 * (r_o / dist) - 1)
# Edge penalty: lambda (1 - \sum^d_j e_{ij} / (dr_0)
edge_pen = 0.0
for j_dim in range(0, d):
# calculate dist to closest boundary in jth coordinate,
# with max value = 1, min value = 0
dist_max = np.abs(1 - x_norm[i, j_dim])
dist_min = x_norm[i, j_dim]
dist = min(dist_min, dist_max)
# truncate distance at r_o
if dist > r_o:
dist = r_o
edge_pen += dist
edge_pen /= d * r_o
# print("Should be positive value only", (1.0 - edge_pen))
edge_pen = lam * (1.0 - edge_pen)
edge += edge_pen
g_s += edge
return g_s
[docs]def gini_coefficient(a: np.ndarray):
r"""
Gini coefficient of bit-wise fingerprints of a database of molecules.
Measures the chemical diversity of a database of molecules defined by
the following formula:
.. math::
G = \frac{2 \sum_{i=1}^L i ||y_i||_1 }{N \sum_{i=1}^L ||y_i||_1} - \frac{L+1}{L},
where :math:`y_i \in \{0, 1\}^N` is a vector of zero and ones of length the
number of molecules :math:`N` of the `i`th feature, and :math:`L` is the feature length.
Parameters
----------
a : ndarray(N, L)
Molecule features in L bits with N molecules.
Returns
-------
float :
Gini coefficient between zero and one, where closer to zero indicates more diversity.
References
----------
.. [1] Weidlich, Iwona E., and Igor V. Filippov. "Using the gini coefficient to measure the
chemical diversity of smallāmolecule libraries." (2016): 2091-2097.
"""
# Check that `a` is a bit-wise fingerprint.
if np.any(np.abs(np.sort(np.unique(a)) - np.array([0, 1])) > 1e-8):
raise ValueError("Attribute `a` should have binary values.")
if a.ndim != 2:
raise ValueError(
f"Attribute `a` should have dimension two rather than {a.ndim}."
)
numb_features = a.shape[1]
# Take the bit-count of each column/molecule.
bit_count = np.sum(a, axis=0)
# Sort the bit-count since Gini coefficients relies on cumulative distribution.
bit_count = np.sort(bit_count)
# Mean of denominator
denominator = numb_features * np.sum(bit_count)
numerator = np.sum(np.arange(1, numb_features + 1) * bit_count)
return 2.0 * numerator / denominator - (numb_features + 1) / numb_features