Module sortedness.misc.trustworthiness
This module is needed because the sklearn implementation calculates trustworthiness only for the entire dataset, not for each point separately.
Expand source code
# Copyright (c) 2023. Davi Pereira dos Santos
# This file is part of the sortedness project.
# Please respect the license - more about this in the section (*) below.
#
# sortedness is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# sortedness is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with sortedness. If not, see <http://www.gnu.org/licenses/>.
#
# (*) Removing authorship by any means, e.g. by distribution of derived
# works or verbatim, obfuscated, compiled or rewritten versions of any
# part of this work is illegal and it is unethical regarding the effort and
# time spent here.
#
"""
This module is needed because sklearn implementation calculates trustworthiness only for the entire dataset.
"""
from math import nan
import numpy as np
from numpy import eye, where, setdiff1d
from numpy.linalg import norm
from numpy.random import shuffle
from scipy.stats import rankdata
from sklearn.decomposition import PCA
def rank_by_distances(X, instance, method="average"):
    """Rank the rows of `X` by their Euclidean distance to `instance`; ranks start at 0."""
    distances = euclidean__n_vs_1(X, instance)
    return rankdata(distances, method=method) - 1


def euclidean__n_vs_1(X, instance):
    """Euclidean distance from each row of `X` to a single `instance`, as a column vector."""
    return norm(X - instance, axis=1, keepdims=True)
def continuity(X, X_, k=5, return_pvalues=False):
    """
    Compute the 'continuity' of each point separately.

    >>> import numpy as np
    >>> from functools import partial
    >>> from scipy.stats import spearmanr, weightedtau
    >>> mean = (1, 2)
    >>> cov = eye(2)
    >>> rng = np.random.default_rng(seed=0)
    >>> original = rng.multivariate_normal(mean, cov, size=12)
    >>> s = continuity(original, original)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=2).fit_transform(original)
    >>> s = continuity(original, projected)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=1).fit_transform(original)
    >>> s, pvalues = continuity(original, projected, return_pvalues=True)
    >>> min(s), max(s), s
    (0.8, 1.0, array([0.95, 0.8 , 0.95, 1. , 0.9 , 0.95, 0.95, 1. , 0.95, 1. , 0.85,
           0.9 ]))
    >>> pvalues
    array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

    Parameters
    ----------
    X
        matrix with one instance per row in a given space (often the original one)
    X_
        matrix with one instance per row in another given space (often the projected one)
    k
        number of nearest neighbors to consider
    return_pvalues
        add dummy p-values (NaNs) to the result

    Returns
    -------
    List of values, one for each instance
    """
    return trustworthiness(X_, X, k, return_pvalues)

def trustworthiness(X, X_, k=5, return_pvalues=False):
    """
    Compute the 'trustworthiness' of each point separately.

    >>> import numpy as np
    >>> from functools import partial
    >>> from scipy.stats import spearmanr, weightedtau
    >>> mean = (1, 2)
    >>> cov = eye(2)
    >>> rng = np.random.default_rng(seed=0)
    >>> original = rng.multivariate_normal(mean, cov, size=12)
    >>> s = trustworthiness(original, original)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=2).fit_transform(original)
    >>> s = trustworthiness(original, projected)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=1).fit_transform(original)
    >>> s, pvalues = trustworthiness(original, projected, return_pvalues=True)
    >>> min(s), max(s), s
    (0.75, 1.0, array([0.8 , 0.75, 0.9 , 1. , 0.85, 0.9 , 0.95, 1. , 0.95, 1. , 0.85,
           0.8 ]))
    >>> pvalues
    array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

    Parameters
    ----------
    X
        matrix with one instance per row in a given space (often the original one)
    X_
        matrix with one instance per row in another given space (often the projected one)
    k
        number of nearest neighbors to consider
    return_pvalues
        add dummy p-values (NaNs) to the result

    Returns
    -------
    List of values, one for each instance
    """
    result = []
    n = len(X)
    for a, b in zip(X, X_):
        # Neighborhood ranks in each space; rank 0 is the point itself.
        ra = rank_by_distances(X, a, "min")
        rb = rank_by_distances(X_, b, "min")
        a_neighbors = where(ra <= k)
        b_neighbors = where(rb <= k)
        # Points among the k nearest in the projected space (X_) but not in the original space (X).
        U = setdiff1d(b_neighbors, a_neighbors)
        r = 1 - 2 * sum(ra[U] - k) / k / (2 * n - 3 * k - 1)
        result.append(r)
    result = np.array(result)
    if return_pvalues:
        return result, np.array([nan for _ in result])
    return result
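The per-point scores above penalize the same rank violations that sklearn sums over the whole dataset, so their mean is expected to coincide with sklearn.manifold.trustworthiness when there are no distance ties. A minimal sanity-check sketch (an illustration under that assumption, not part of the module source):

import numpy as np
from numpy import eye
from sklearn.decomposition import PCA
from sklearn.manifold import trustworthiness as sklearn_trustworthiness
from sortedness.misc.trustworthiness import trustworthiness

rng = np.random.default_rng(seed=0)
original = rng.multivariate_normal((1, 2), eye(2), size=12)
projected = PCA(n_components=1).fit_transform(original)

per_point = trustworthiness(original, projected, k=5)                       # one score per instance
global_score = sklearn_trustworthiness(original, projected, n_neighbors=5)  # single score for the dataset
print(per_point.shape)                 # (12,)
print(per_point.mean(), global_score)  # expected to coincide when there are no distance ties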
Functions
def continuity(X, X_, k=5, return_pvalues=False)
Compute the 'continuity' of each point separately.
Parameters
X
- matrix with one instance per row in a given space (often the original one)
X_
- matrix with one instance per row in another given space (often the projected one)
k
- number of nearest neighbors to consider
return_pvalues
- add dummy p-values (NaNs) to the result
Returns
List of values, one for each instance
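Note that continuity simply delegates to trustworthiness with the two spaces swapped, so the 'continuity' of a projection is the 'trustworthiness' computed in the opposite direction. A minimal sketch of that identity (illustrative random data only, assuming the module is importable as sortedness.misc.trustworthiness):

import numpy as np
from numpy import eye
from sklearn.decomposition import PCA
from sortedness.misc.trustworthiness import continuity, trustworthiness

rng = np.random.default_rng(seed=0)
original = rng.multivariate_normal((1, 2), eye(2), size=12)
projected = PCA(n_components=1).fit_transform(original)

# Swapping the arguments turns one measure into the other.
assert np.allclose(continuity(original, projected), trustworthiness(projected, original))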
def euclidean__n_vs_1(X, instance)
Euclidean distance from each row of X to a single instance, returned as a column vector.
def rank_by_distances(X, instance, method='average')
Rank the rows of X by their Euclidean distance to instance; ranks start at 0 and ties are resolved according to method.
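A tiny illustration of these two helpers (hypothetical toy values, not taken from the project's own examples):

import numpy as np
from sortedness.misc.trustworthiness import euclidean__n_vs_1, rank_by_distances

X = np.array([[0.0], [1.0], [3.0]])
instance = np.array([0.0])

print(euclidean__n_vs_1(X, instance).ravel())  # [0. 1. 3.]  distances from each row to instance
print(rank_by_distances(X, instance))          # [0. 1. 2.]  ranks starting at 0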
def shuffle(x)
Modify a sequence in-place by shuffling its contents.
This function only shuffles the array along the first axis of a multi-dimensional array. The order of sub-arrays is changed but their contents remain the same.
Note
New code should use the shuffle method of a numpy.random.Generator instance instead; please see the NumPy random quick start guide.
Parameters
x : ndarray or MutableSequence
- The array, list or mutable sequence to be shuffled.
Returns
None
See Also
random.Generator.shuffle
- which should be used for new code.
Examples
>>> arr = np.arange(10)
>>> np.random.shuffle(arr)
>>> arr
[1 7 5 2 9 4 3 6 0 8] # random
Multi-dimensional arrays are only shuffled along the first axis:
>>> arr = np.arange(9).reshape((3, 3))
>>> np.random.shuffle(arr)
>>> arr
array([[3, 4, 5], # random
       [6, 7, 8],
       [0, 1, 2]])
def trustworthiness(X, X_, k=5, return_pvalues=False)
Compute the 'trustworthiness' of each point separately.
Parameters
X
- matrix with one instance per row in a given space (often the original one)
X_
- matrix with one instance per row in another given space (often the projected one)
k
- number of nearest neighbors to consider
return_pvalues
- add dummy p-values (NaNs) to the result
Returns
List of values, one for each instance