Module sortedness.misc.trustworthiness

This module is needed because the sklearn implementation computes trustworthiness only as a single score for the entire dataset, not for each point separately.

Expand source code
#  Copyright (c) 2023. Davi Pereira dos Santos
#  This file is part of the sortedness project.
#  Please respect the license - more about this in the section (*) below.
#
#  sortedness is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  sortedness is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with sortedness.  If not, see <http://www.gnu.org/licenses/>.
#
#  (*) Removing authorship by any means, e.g. by distribution of derived
#  works or verbatim, obfuscated, compiled or rewritten versions of any
#  part of this work is illegal and it is unethical regarding the effort and
#  time spent here.
#

"""
This module is needed because the sklearn implementation computes trustworthiness only as a single score for the entire dataset, not for each point separately.
"""
from math import nan

import numpy as np
from numpy import eye, where, setdiff1d
from numpy.linalg import norm
from numpy.random import shuffle
from scipy.stats import rankdata
from sklearn.decomposition import PCA


def rank_by_distances(X, instance, method="average"):
    """
    Rank the rows of `X` by their Euclidean distance to `instance`, starting at 0.

    Parameters
    ----------
    X
        matrix with an instance by row
    instance
        reference point the distances are measured to
    method
        tie-handling method, forwarded unchanged to `scipy.stats.rankdata`

    Returns
    -------
    Ranks as produced by `rankdata`, shifted down by one so the smallest rank is 0
    """
    # `rankdata` numbers ranks from 1; the shift makes the closest row rank 0.
    one_based = rankdata(euclidean__n_vs_1(X, instance), method=method)
    return one_based - 1


def euclidean__n_vs_1(X, instance):
    """
    Euclidean distance from every row of `X` to `instance`.

    The norm is reduced along axis 1 with `keepdims=True`, so the reduced axis
    is kept with length 1 (one distance per row, as a column, when `X` is 2-D).
    """
    offsets = X - instance
    return norm(offsets, keepdims=True, axis=1)


def continuity(X, X_, k=5, return_pvalues=False):
    """
    'continuity' of each point separately.

    Computed as `trustworthiness` with the roles of the two spaces exchanged.

    >>> import numpy as np
    >>> from functools import partial
    >>> from scipy.stats import spearmanr, weightedtau
    >>> mean = (1, 2)
    >>> cov = eye(2)
    >>> rng = np.random.default_rng(seed=0)
    >>> original = rng.multivariate_normal(mean, cov, size=12)
    >>> s = continuity(original, original)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=2).fit_transform(original)
    >>> s = continuity(original, projected)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=1).fit_transform(original)
    >>> s, pvalues = continuity(original, projected, return_pvalues=True)
    >>> min(s), max(s), s
    (0.8, 1.0, array([0.95, 0.8 , 0.95, 1.  , 0.9 , 0.95, 0.95, 1.  , 0.95, 1.  , 0.85,
           0.9 ]))
    >>> pvalues
    array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])


    Parameters
    ----------
    X
        matrix with an instance by row in a given space (often the original one)
    X_
        matrix with an instance by row in another given space (often the projected one)
    k
        neighborhood size, forwarded unchanged to `trustworthiness`
    return_pvalues
        Add dummy p-values to result (NaNs)

    Returns
    -------
    The result of `trustworthiness` for the swapped spaces: one value for each instance

    """
    # Swapping the two spaces is the whole difference between the two measures here.
    return trustworthiness(X=X_, X_=X, k=k, return_pvalues=return_pvalues)


def trustworthiness(X, X_, k=5, return_pvalues=False):
    """
    'trustworthiness' of each point separately.

    For each row, the points ranked within `k` of it in `X_` but not within `k`
    of it in `X` are collected, and the score is
    `1 - 2 * sum(rank_in_X - k) / k / (2 * n - 3 * k - 1)` over those points,
    with `n = len(X)`. Ranks are 0-based distance ranks in which ties share the
    lowest rank (`"min"`).

    >>> import numpy as np
    >>> from functools import partial
    >>> from scipy.stats import spearmanr, weightedtau
    >>> mean = (1, 2)
    >>> cov = eye(2)
    >>> rng = np.random.default_rng(seed=0)
    >>> original = rng.multivariate_normal(mean, cov, size=12)
    >>> s = trustworthiness(original, original)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=2).fit_transform(original)
    >>> s = trustworthiness(original, projected)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=1).fit_transform(original)
    >>> s, pvalues = trustworthiness(original, projected, return_pvalues=True)
    >>> min(s), max(s), s
    (0.75, 1.0, array([0.8 , 0.75, 0.9 , 1.  , 0.85, 0.9 , 0.95, 1.  , 0.95, 1.  , 0.85,
           0.8 ]))
    >>> pvalues
    array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])


    Parameters
    ----------
    X
        matrix with an instance by row in a given space (often the original one)
    X_
        matrix with an instance by row in another given space (often the projected one);
        must have as many rows as `X`
    k
        neighborhood size: a point counts as a neighbor when its 0-based rank is at most `k`
    return_pvalues
        Add dummy p-values to result (NaNs)

    Returns
    -------
    Array of values, one for each instance; when `return_pvalues` is set, a pair
    with that array and an equally sized array of NaNs

    Raises
    ------
    ValueError
        If `X` and `X_` do not have the same number of rows.

    """
    n = len(X)
    if len(X_) != n:
        # zip() below would otherwise silently truncate to the shorter matrix.
        raise ValueError(f"X and X_ must have the same number of rows: {n} != {len(X_)}")

    result = []
    for a, b in zip(X, X_):
        ra = rank_by_distances(X, a, "min")
        rb = rank_by_distances(X_, b, "min")
        # Intruders: inside the k-neighborhood in X_ but outside of it in X.
        intruders = np.flatnonzero((rb <= k) & ~(ra <= k))
        # Each intruder is penalized by how far beyond k its rank in X lies.
        penalty = (ra[intruders] - k).sum()
        # NOTE(review): sklearn only accepts this normalizer for k < n / 2;
        # no such check is made here - confirm the intended range of k.
        result.append(1 - 2 * penalty / k / (2 * n - 3 * k - 1))
    result = np.array(result)
    if return_pvalues:
        return result, np.full(len(result), nan)
    return result

Functions

def continuity(X, X_, k=5, return_pvalues=False)

'continuity' of each point separately.

>>> import numpy as np
>>> from functools import partial
>>> from scipy.stats import spearmanr, weightedtau
>>> mean = (1, 2)
>>> cov = eye(2)
>>> rng = np.random.default_rng(seed=0)
>>> original = rng.multivariate_normal(mean, cov, size=12)
>>> s = continuity(original, original)
>>> min(s), max(s), s
(1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
>>> projected = PCA(n_components=2).fit_transform(original)
>>> s = continuity(original, projected)
>>> min(s), max(s), s
(1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
>>> projected = PCA(n_components=1).fit_transform(original)
>>> s, pvalues = continuity(original, projected, return_pvalues=True)
>>> min(s), max(s), s
(0.8, 1.0, array([0.95, 0.8 , 0.95, 1.  , 0.9 , 0.95, 0.95, 1.  , 0.95, 1.  , 0.85,
       0.9 ]))
>>> pvalues
array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

Parameters

k
 
X
matrix with an instance by row in a given space (often the original one)
X_
matrix with an instance by row in another given space (often the projected one)
return_pvalues
Add dummy p-values to result (NaNs)

Returns

List of values, one for each instance
 
Expand source code
def continuity(X, X_, k=5, return_pvalues=False):
    """
    'continuity' of each point separately.

    Computed as `trustworthiness` with the roles of the two spaces exchanged.

    >>> import numpy as np
    >>> from functools import partial
    >>> from scipy.stats import spearmanr, weightedtau
    >>> mean = (1, 2)
    >>> cov = eye(2)
    >>> rng = np.random.default_rng(seed=0)
    >>> original = rng.multivariate_normal(mean, cov, size=12)
    >>> s = continuity(original, original)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=2).fit_transform(original)
    >>> s = continuity(original, projected)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=1).fit_transform(original)
    >>> s, pvalues = continuity(original, projected, return_pvalues=True)
    >>> min(s), max(s), s
    (0.8, 1.0, array([0.95, 0.8 , 0.95, 1.  , 0.9 , 0.95, 0.95, 1.  , 0.95, 1.  , 0.85,
           0.9 ]))
    >>> pvalues
    array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])


    Parameters
    ----------
    X
        matrix with an instance by row in a given space (often the original one)
    X_
        matrix with an instance by row in another given space (often the projected one)
    k
        neighborhood size, forwarded unchanged to `trustworthiness`
    return_pvalues
        Add dummy p-values to result (NaNs)

    Returns
    -------
    The result of `trustworthiness` for the swapped spaces: one value for each instance

    """
    # Swapping the two spaces is the whole difference between the two measures here.
    return trustworthiness(X=X_, X_=X, k=k, return_pvalues=return_pvalues)
def euclidean__n_vs_1(X, instance)
Expand source code
def euclidean__n_vs_1(X, instance):
    """
    Euclidean distance from every row of `X` to `instance`.

    The norm is reduced along axis 1 with `keepdims=True`, so the reduced axis
    is kept with length 1 (one distance per row, as a column, when `X` is 2-D).
    """
    offsets = X - instance
    return norm(offsets, keepdims=True, axis=1)
def rank_by_distances(X, instance, method='average')
Expand source code
def rank_by_distances(X, instance, method="average"):
    """
    Rank the rows of `X` by their Euclidean distance to `instance`, starting at 0.

    Parameters
    ----------
    X
        matrix with an instance by row
    instance
        reference point the distances are measured to
    method
        tie-handling method, forwarded unchanged to `scipy.stats.rankdata`

    Returns
    -------
    Ranks as produced by `rankdata`, shifted down by one so the smallest rank is 0
    """
    # `rankdata` numbers ranks from 1; the shift makes the closest row rank 0.
    one_based = rankdata(euclidean__n_vs_1(X, instance), method=method)
    return one_based - 1
def shuffle(x)

Modify a sequence in-place by shuffling its contents.

This function only shuffles the array along the first axis of a multi-dimensional array. The order of sub-arrays is changed but their contents remain the same.

Note

New code should use the `numpy.random.Generator.shuffle` method of a `numpy.random.Generator` instance instead; please see the NumPy random quick-start guide.

Parameters

x : ndarray or MutableSequence
The array, list or mutable sequence to be shuffled.

Returns

None
 

See Also

random.Generator.shuffle
which should be used for new code.

Examples

>>> arr = np.arange(10)
>>> np.random.shuffle(arr)
>>> arr
[1 7 5 2 9 4 3 6 0 8] # random

Multi-dimensional arrays are only shuffled along the first axis:

>>> arr = np.arange(9).reshape((3, 3))
>>> np.random.shuffle(arr)
>>> arr
array([[3, 4, 5], # random
       [6, 7, 8],
       [0, 1, 2]])
def trustworthiness(X, X_, k=5, return_pvalues=False)

'trustworthiness' of each point separately.

>>> import numpy as np
>>> from functools import partial
>>> from scipy.stats import spearmanr, weightedtau
>>> mean = (1, 2)
>>> cov = eye(2)
>>> rng = np.random.default_rng(seed=0)
>>> original = rng.multivariate_normal(mean, cov, size=12)
>>> s = trustworthiness(original, original)
>>> min(s), max(s), s
(1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
>>> projected = PCA(n_components=2).fit_transform(original)
>>> s = trustworthiness(original, projected)
>>> min(s), max(s), s
(1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
>>> projected = PCA(n_components=1).fit_transform(original)
>>> s, pvalues = trustworthiness(original, projected, return_pvalues=True)
>>> min(s), max(s), s
(0.75, 1.0, array([0.8 , 0.75, 0.9 , 1.  , 0.85, 0.9 , 0.95, 1.  , 0.95, 1.  , 0.85,
       0.8 ]))
>>> pvalues
array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])

Parameters

k
 
X
matrix with an instance by row in a given space (often the original one)
X_
matrix with an instance by row in another given space (often the projected one)
return_pvalues
Add dummy p-values to result (NaNs)

Returns

List of values, one for each instance
 
Expand source code
def trustworthiness(X, X_, k=5, return_pvalues=False):
    """
    'trustworthiness' of each point separately.

    For each row, the points ranked within `k` of it in `X_` but not within `k`
    of it in `X` are collected, and the score is
    `1 - 2 * sum(rank_in_X - k) / k / (2 * n - 3 * k - 1)` over those points,
    with `n = len(X)`. Ranks are 0-based distance ranks in which ties share the
    lowest rank (`"min"`).

    >>> import numpy as np
    >>> from functools import partial
    >>> from scipy.stats import spearmanr, weightedtau
    >>> mean = (1, 2)
    >>> cov = eye(2)
    >>> rng = np.random.default_rng(seed=0)
    >>> original = rng.multivariate_normal(mean, cov, size=12)
    >>> s = trustworthiness(original, original)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=2).fit_transform(original)
    >>> s = trustworthiness(original, projected)
    >>> min(s), max(s), s
    (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
    >>> projected = PCA(n_components=1).fit_transform(original)
    >>> s, pvalues = trustworthiness(original, projected, return_pvalues=True)
    >>> min(s), max(s), s
    (0.75, 1.0, array([0.8 , 0.75, 0.9 , 1.  , 0.85, 0.9 , 0.95, 1.  , 0.95, 1.  , 0.85,
           0.8 ]))
    >>> pvalues
    array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan])


    Parameters
    ----------
    X
        matrix with an instance by row in a given space (often the original one)
    X_
        matrix with an instance by row in another given space (often the projected one);
        must have as many rows as `X`
    k
        neighborhood size: a point counts as a neighbor when its 0-based rank is at most `k`
    return_pvalues
        Add dummy p-values to result (NaNs)

    Returns
    -------
    Array of values, one for each instance; when `return_pvalues` is set, a pair
    with that array and an equally sized array of NaNs

    Raises
    ------
    ValueError
        If `X` and `X_` do not have the same number of rows.

    """
    n = len(X)
    if len(X_) != n:
        # zip() below would otherwise silently truncate to the shorter matrix.
        raise ValueError(f"X and X_ must have the same number of rows: {n} != {len(X_)}")

    result = []
    for a, b in zip(X, X_):
        ra = rank_by_distances(X, a, "min")
        rb = rank_by_distances(X_, b, "min")
        # Intruders: inside the k-neighborhood in X_ but outside of it in X.
        intruders = np.flatnonzero((rb <= k) & ~(ra <= k))
        # Each intruder is penalized by how far beyond k its rank in X lies.
        penalty = (ra[intruders] - k).sum()
        # NOTE(review): sklearn only accepts this normalizer for k < n / 2;
        # no such check is made here - confirm the intended range of k.
        result.append(1 - 2 * penalty / k / (2 * n - 3 * k - 1))
    result = np.array(result)
    if return_pvalues:
        return result, np.full(len(result), nan)
    return result