Module sortedness.local
Source code
# Copyright (c) 2023. Davi Pereira dos Santos
# This file is part of the sortedness project.
# Please respect the license - more about this in the section (*) below.
#
# sortedness is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# sortedness is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with sortedness. If not, see <http://www.gnu.org/licenses/>.
#
# (*) Removing authorship by any means, e.g. by distribution of derived
# works or verbatim, obfuscated, compiled or rewritten versions of any
# part of this work is illegal and it is unethical regarding the effort and
# time spent here.
#
import gc
from functools import partial
from math import exp
import numpy as np
import pathos.multiprocessing as mp
from numpy import eye, mean, sqrt, ndarray, nan
from numpy.random import permutation
from scipy.spatial.distance import cdist, pdist, squareform
from scipy.stats import rankdata, kendalltau, weightedtau
from sortedness.misc.parallel import rank_alongrow, rank_alongcol
def common(S, S_, i, symmetric, f, isweightedtau, return_pvalues, pmap, kwargs):
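# Shared helper for `sortedness` and `rsortedness`: applies the agreement function `f` to the
# score vectors (row by row when `i` is None) and, when `symmetric=True` and `f` is not
# weightedtau, averages the two emphasis directions.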
def thread(a, b):
return f(a, b, **kwargs)
def oneway(scores_a, scores_b):
jobs = pmap((thread if kwargs else f), scores_a, scores_b)
result, pvalues = [], []
for tup in jobs:
corr, pvalue = tup if isinstance(tup, tuple) else (tup, nan)
result.append(corr)
pvalues.append(pvalue)
return np.array(result, dtype=float), np.array(pvalues, dtype=float)
handle_f = lambda tup: tup if isinstance(tup, tuple) else (tup, nan)
result, pvalues = oneway(S, S_) if i is None else handle_f(f(S, S_, **kwargs))
if not isweightedtau and symmetric:
result_, pvalues_ = oneway(S_, S) if i is None else handle_f(f(S_, S, **kwargs))
result = (result + result_) / 2
pvalues = (pvalues + pvalues_) / 2
if return_pvalues:
return np.array(list(zip(result, pvalues)))
return result
# todo: see if speed can benefit from:
# gen = pairwise_distances_chunked(X, metric='cosine', n_jobs=-1)
# Z = np.concatenate(list(gen), axis=0)
# Z_cond = Z[np.triu_indices(Z.shape[0], k=1)]
# https://stackoverflow.com/a/55940399/9681577
#
# import dask.dataframe as dd
# from dask.multiprocessing import get
# # o - is pandas DataFrame
# o['dist_center_from'] = dd.from_pandas(o, npartitions=8).map_partitions(lambda df: df.apply(lambda x: vincenty((x.fromlatitude, x.fromlongitude), center).km, axis=1)).compute(get=get)
def remove_diagonal(X):
n_points = len(X)
nI = ~eye(n_points, dtype=bool) # Mask to remove diagonal.
return X[nI].reshape(n_points, -1)
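# Illustration (hypothetical values): the zero self-distances are dropped and each row keeps
# its n-1 off-diagonal entries, e.g.,
# remove_diagonal(np.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]])) -> array([[1, 2], [1, 3], [2, 3]])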
weightedtau.isweightedtau = True
def sortedness(X, X_, i=None, symmetric=True, f=weightedtau, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs):
"""
Calculate the sortedness value (a stress-like, correlation-based measure that focuses on the ordering of points) for each point
Functions available as scipy correlation coefficients:
ρ-sortedness (Spearman),
𝜏-sortedness (Kendall's 𝜏),
w𝜏-sortedness (Sebastiano Vigna's weighted Kendall's 𝜏) ← default
Note:
Categorical or pathological data might present values lower than one, due to the presence of ties, even with a perfect projection.
Depending on the chosen correlation coefficient, ties are penalized, as they do not contribute to establishing any order.
Hint:
Swap two points A and B in `X_` to calculate the sortedness between A and B in the same space (i.e., originally, `X = X_`):
`X = [A, B, C, ..., Z]`
`X_ = [B, A, C, ..., Z]`
`sortedness(X, X_, i=0)`
Parameters
----------
X
matrix with an instance by row in a given space (often the original one)
X_
matrix with an instance by row in another given space (often the projected one)
i
None: calculate sortedness for all instances
`int`: index of the instance of interest
symmetric
True: Take the mean between extrusion and intrusion emphasis
Equivalent to `(sortedness(a, b, symmetric=False) + sortedness(b, a, symmetric=False)) / 2` at a slightly lower cost.
Might increase memory usage.
False: Weight by original distances (extrusion emphasis), not the projected distances.
f
Agreement function:
callable = scipy correlation function:
weightedtau (weighted Kendall’s τ is the default), kendalltau, spearmanr
Meaning of resulting values for correlation-based functions:
1.0: perfect projection (regarding order of examples)
0.0: random projection (enough distortion to have no information left when considering the overall ordering)
-1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering)
return_pvalues
For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr'
This makes more sense for Kendall's tau. [the weighted version might not yet have an established p-value calculation method]
The null hypothesis is that the projection is random, i.e., sortedness = 0.0.
parallel
None: Avoid high-memory parallelization
True: Full parallelism
False: No parallelism
parallel_kwargs
Any extra argument to be provided to pathos parallelization
parallel_n_trigger
Threshold to disable parallelization for small n values
kwargs
Arguments to be passed to the correlation measure
Returns
-------
ndarray containing a sortedness value per row, or a single float (p-values are included as a second column/value if requested)
>>> ll = [[i] for i in range(17)]
>>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:])))
>>> b.ravel()
array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
>>> r = sortedness(a, b)
>>> from statistics import median
>>> round(min(r), 12), round(max(r), 12), round(median(r),12)
(-1.0, 0.998638259786, 0.937548981983)
>>> rnd = np.random.default_rng(0)
>>> rnd.shuffle(ll)
>>> b = np.array(ll)
>>> b.ravel()
array([ 2, 10, 3, 11, 0, 4, 7, 5, 16, 12, 13, 6, 9, 14, 8, 1, 15])
>>> r = sortedness(a, b)
>>> r
array([ 0.24691868, -0.17456491, 0.19184376, -0.18193532, 0.07175694,
0.27992254, 0.04121859, 0.16249574, -0.03506842, 0.27856259,
0.40866965, -0.07617887, 0.12184064, 0.24762942, -0.05049511,
-0.46277399, 0.12193493])
>>> round(min(r), 12), round(max(r), 12)
(-0.462773990559, 0.408669653064)
>>> round(mean(r), 12)
0.070104521222
>>> import numpy as np
>>> from functools import partial
>>> from scipy.stats import spearmanr, weightedtau
>>> me = (1, 2)
>>> cov = eye(2)
>>> rng = np.random.default_rng(seed=0)
>>> original = rng.multivariate_normal(me, cov, size=12)
>>> from sklearn.decomposition import PCA
>>> projected2 = PCA(n_components=2).fit_transform(original)
>>> projected1 = PCA(n_components=1).fit_transform(original)
>>> np.random.seed(0)
>>> projectedrnd = permutation(original)
>>> s = sortedness(original, original)
>>> round(min(s), 12), round(max(s), 12), s
(1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
# Measure sortedness between two points in the same space.
>>> M = original.copy()
>>> M[0], M[1] = original[1], original[0]
>>> round(sortedness(M, original, 0), 12)
0.547929184934
>>> s = sortedness(original, projected2)
>>> round(min(s), 12), round(max(s), 12), s
(1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
>>> s = sortedness(original, projected1)
>>> round(min(s), 12), round(max(s), 12)
(0.393463224666, 0.944810120534)
>>> s = sortedness(original, projectedrnd)
>>> round(min(s), 12), round(max(s), 12)
(-0.648305479567, 0.397019507592)
>>> np.round(sortedness(original, original, f=kendalltau, return_pvalues=True), 12)
array([[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08],
[1.0000e+00, 5.0104e-08]])
>>> sortedness(original, projected2, f=kendalltau)
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
>>> sortedness(original, projected1, f=kendalltau)
array([0.56363636, 0.52727273, 0.81818182, 0.96363636, 0.70909091,
0.85454545, 0.74545455, 0.92727273, 0.85454545, 0.89090909,
0.6 , 0.74545455])
>>> sortedness(original, projectedrnd, f=kendalltau)
array([ 0.2 , -0.38181818, 0.23636364, -0.09090909, -0.05454545,
0.23636364, -0.09090909, 0.23636364, -0.63636364, -0.01818182,
-0.2 , -0.01818182])
>>> wf = partial(weightedtau, weigher=lambda x: 1 / (x**2 + 1))
>>> sortedness(original, original, f=wf, return_pvalues=True)
array([[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan],
[ 1., nan]])
>>> sortedness(original, projected2, f=wf)
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
>>> sortedness(original, projected1, f=wf)
array([0.89469168, 0.89269637, 0.92922928, 0.99721669, 0.86529591,
0.97806422, 0.94330979, 0.99357377, 0.87959707, 0.92182767,
0.87256459, 0.87747329])
>>> sortedness(original, projectedrnd, f=wf)
array([ 0.23771513, -0.2790059 , 0.3718005 , -0.16623167, 0.06179047,
0.40434396, -0.00130294, 0.46569739, -0.67581876, -0.23852189,
-0.39125007, 0.12131153])
>>> np.random.seed(14980)
>>> projectedrnd = permutation(original)
>>> sortedness(original, projectedrnd)
array([ 0.24432153, -0.19634576, -0.00238081, -0.4999116 , -0.01625951,
0.22478766, 0.07176118, -0.48092843, 0.19345964, -0.44895295,
-0.42044773, 0.06942218])
>>> sortedness(original, np.flipud(original))
array([-0.28741742, 0.36769361, 0.06926091, 0.02550202, 0.21424544,
-0.3244699 , -0.3244699 , 0.21424544, 0.02550202, 0.06926091,
0.36769361, -0.28741742])
>>> original = np.array([[0],[1],[2],[3],[4],[5],[6]])
>>> projected = np.array([[6],[5],[4],[3],[2],[1],[0]])
>>> sortedness(original, projected)
array([1., 1., 1., 1., 1., 1., 1.])
>>> projected = np.array([[0],[6],[5],[4],[3],[2],[1]])
>>> sortedness(original, projected)
array([-1. , 0.51956213, 0.81695345, 0.98180162, 0.98180162,
0.81695345, 0.51956213])
>>> round(sortedness(original, projected, 1), 12)
0.519562134793
>>> round(sortedness(original, projected, 1, symmetric=False), 12)
0.422638894922
>>> round(sortedness(projected, original, 1, symmetric=False), 12)
0.616485374665
>>> round(sortedness(original, projected, rank=True)[1], 12)
0.519562134793
>>> round(sortedness(original, projected, rank=False)[1], 12) # warning: will consider indexes as ranks!
0.074070734162
>>> round(sortedness([[1,2,3,3],[1,2,7,3],[3,4,7,8],[5,2,6,3],[3,5,4,8],[2,7,7,5]], [[7,1,2,3],[3,7,7,3],[5,4,5,6],[9,7,6,3],[2,3,5,1],[1,2,6,3]], 1), 12)
-1.0
>>> from scipy.stats import weightedtau
>>> weightedtau.isweightedtau = False # warning: will deactivate weightedtau's automatic negation of scores!
>>> round(sortedness(original, projected, 1, f=weightedtau, rank=None), 12)
0.275652884819
>>> weightedtau.isweightedtau = True
"""
isweightedtau = False
if hasattr(f, "isweightedtau") and f.isweightedtau:
isweightedtau = True
if not symmetric:
if "rank" in kwargs: # pragma: no cover
raise Exception(f"Cannot set `symmetric=False` and provide `rank` at the same time.")
kwargs["rank"] = None
if parallel_kwargs is None:
parallel_kwargs = {}
npoints = len(X)
if i is None:
tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
pmap = mp.ProcessingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
sqdist_X, sqdist_X_ = tmap(lambda M: cdist(M, M, metric='sqeuclidean'), [X, X_])
D = remove_diagonal(sqdist_X)
D_ = remove_diagonal(sqdist_X_)
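# scipy's weightedtau treats larger scores as more important, so distances are negated here
# to make nearer neighbours the most influential ones.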
scores_X, scores_X_ = (-D, -D_) if isweightedtau else (D, D_)
else:
pmap = None
if not isinstance(X, ndarray):
X, X_ = np.array(X), np.array(X_)
x, x_ = X[i], X_[i]
X = np.delete(X, i, axis=0)
X_ = np.delete(X_, i, axis=0)
d_ = np.sum((X_ - x_) ** 2, axis=1)
d = np.sum((X - x) ** 2, axis=1)
scores_X, scores_X_ = (-d, -d_) if isweightedtau else (d, d_)
return common(scores_X, scores_X_, i, symmetric, f, isweightedtau, return_pvalues, pmap, kwargs)
def pwsortedness(X, X_, i=None, symmetric=True, f=weightedtau, parallel=True, parallel_n_trigger=200, batches=10, debug=False, dist=None, cython=False, parallel_kwargs=None, **kwargs):
"""
Local pairwise sortedness (Λ𝜏w) based on Sebastiano Vigna's weighted Kendall's 𝜏
Importance rankings are calculated internally based on proximity of each pair to the point of interest.
# TODO?: add flag to break extremely rare cases of ties that persist after projection (implies a much slower algorithm)
This probably doesn't make any difference on the result, except on categorical, pathological, or toy datasets.
Values can be lower due to the presence of ties, but only when the projection isn't perfect for all points.
In the end, it might even be desirable to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to keep than a specific order.
Parameters
----------
X
Original dataset
X_
Projected points
i
None: calculate pwsortedness for all instances
`int`: index of the instance of interest
symmetric
True: Take the mean between extrusion and intrusion emphasis
Equivalent to `(pwsortedness(a, b) + pwsortedness(b, a)) / 2` at a slightly lower cost.
Might increase memory usage.
False: Weight by original distances (extrusion emphasis), not the projected distances.
f
Agreement function that accepts the parameter `rank`:
callable = weightedtau (weighted Kendall’s τ is the default) or other compatible correlation function
Meaning of resulting values for correlation-based functions:
1.0: perfect projection (regarding order of examples)
0.0: random projection (enough distortion to have no information left when considering the overall ordering)
-1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering)
parallel
None: Avoid high-memory parallelization
True: Full parallelism
False: No parallelism
parallel_n_trigger
Threshold to disable parallelization for small n values
batches
Parallel batch size
debug
Whether to print more info
dist
Provide distance matrices (D, D_) instead of points
X and X_ should be None
cython
Whether to:
(True) improve speed by ~2x; or,
(False) be more compatible/portable.
parallel_kwargs
Dict of extra arguments to be provided to pathos parallelization
kwargs
Any extra argument to be provided to `weightedtau`, e.g., a custom weighting function.
This only works for `cython=False`.
Returns
-------
Numpy vector or Python float
>>> import numpy as np
>>> from functools import partial
>>> from scipy.stats import spearmanr, weightedtau
>>> m = (1, 12)
>>> cov = eye(2)
>>> rng = np.random.default_rng(seed=0)
>>> original = rng.multivariate_normal(m, cov, size=12)
>>> from sklearn.decomposition import PCA
>>> projected2 = PCA(n_components=2).fit_transform(original)
>>> projected1 = PCA(n_components=1).fit_transform(original)
>>> np.random.seed(0)
>>> projectedrnd = permutation(original)
>>> r = pwsortedness(original, original)
>>> min(r), max(r), round(mean(r), 12)
(1.0, 1.0, 1.0)
>>> r = pwsortedness(original, projected2)
>>> min(r), round(mean(r), 12), max(r)
(1.0, 1.0, 1.0)
>>> r = pwsortedness(original, projected1)
>>> min(r), round(mean(r), 12), max(r)
(0.649315577592, 0.753429143832, 0.834601601062)
>>> r = pwsortedness(original, projected2[:, 1:])
>>> min(r), round(mean(r), 12), max(r)
(0.035312055682, 0.2002329034, 0.352491282966)
>>> r = pwsortedness(original, projectedrnd)
>>> min(r), round(mean(r), 12), max(r)
(-0.168611098044, -0.079882538998, 0.14442446342)
>>> round(pwsortedness(original, projected1)[1], 12)
0.649315577592
>>> round(pwsortedness(original, projected1, cython=True)[1], 12)
0.649315577592
>>> round(pwsortedness(original, projected1, i=1), 12)
0.649315577592
>>> round(pwsortedness(original, projected1, symmetric=False, cython=True)[1], 12)
0.730078995423
>>> round(pwsortedness(original, projected1, symmetric=False, i=1), 12)
0.730078995423
>>> np.round(pwsortedness(original, projected1, symmetric=False), 12)
array([0.75892647, 0.730079 , 0.83496865, 0.73161226, 0.75376525,
0.83301104, 0.76695755, 0.74759156, 0.81434161, 0.74067221,
0.74425225, 0.83731035])
>>> np.round(pwsortedness(original, projected1, f=weightedtau, symmetric=False), 12)
array([0.75892647, 0.730079 , 0.83496865, 0.73161226, 0.75376525,
0.83301104, 0.76695755, 0.74759156, 0.81434161, 0.74067221,
0.74425225, 0.83731035])
>>> np.round(pwsortedness(original, projected1, f=weightedtau, symmetric=False, weigher=hyperbolic), 12)
array([0.75892647, 0.730079 , 0.83496865, 0.73161226, 0.75376525,
0.83301104, 0.76695755, 0.74759156, 0.81434161, 0.74067221,
0.74425225, 0.83731035])
>>> np.round(pwsortedness(original, projected1, f=weightedtau, symmetric=False, weigher=gaussian), 12)
array([0.74141933, 0.71595198, 0.94457495, 0.72528033, 0.78637383,
0.92562531, 0.77600408, 0.74811014, 0.87241023, 0.8485321 ,
0.82264118, 0.95322218])
"""
if "rank" in kwargs: # pragma: no cover
raise Exception(f"Cannot provide `rank` as kwarg for pwsortedness. The pairwise distances ranking is calculated internally.")
isweightedtau = hasattr(f, "isweightedtau") and f.isweightedtau
if parallel_kwargs is None:
parallel_kwargs = {}
if cython and (kwargs or not isweightedtau): # pragma: no cover
raise Exception(f"Cannot provide custom `f` or `f` kwargs with `cython=True`")
npoints = len(X) if X is not None else len(dist[0]) # pragma: no cover
tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
pmap = mp.ProcessingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
thread = lambda M: pdist(M, metric="sqeuclidean")
scores_X, scores_X_ = tmap(thread, [X, X_]) if X is not None else (squareform(dist[0]), squareform(dist[1]))
if isweightedtau:
scores_X, scores_X_ = -scores_X, -scores_X_
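# makeM builds, for every unordered pair (i, j), the element-wise sum of rows i and j of E.
# With E holding (squared) distances to the point(s) of interest, each pair's sum measures how
# close that pair is to the point; these sums are later ranked to weight pairs by proximity.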
def makeM(E):
n = len(E)
m = (n ** 2 - n) // 2
M = np.zeros((m, E.shape[1]))
c = 0
for i in range(n - 1): # a bit slow, but only a fraction of wtau (~5%)
h = n - i - 1
d = c + h
M[c:d] = E[i] + E[i + 1:]
c = d
del E
gc.collect()
return M.T
if symmetric:
D, D_ = tmap(squareform, [-scores_X, -scores_X_]) if dist is None else (dist[0], dist[1])
else:
D = squareform(-scores_X) if dist is None else dist[0]
if i is None:
n = len(D)
if symmetric:
M, M_ = pmap(makeM, [D, D_])
R_ = rank_alongrow(M_, step=n // batches, parallel=parallel, **parallel_kwargs).T
del M_
else:
M = makeM(D)
R = rank_alongrow(M, step=n // batches, parallel=parallel, **parallel_kwargs).T
del M
gc.collect()
if cython:
from sortedness.wtau import parwtau
res = parwtau(scores_X, scores_X_, npoints, R, parallel=parallel, **parallel_kwargs)
del R
if not symmetric:
gc.collect()
return res
res_ = parwtau(scores_X, scores_X_, npoints, R_, parallel=parallel, **parallel_kwargs)
del R_
gc.collect()
return (res + res_) / 2
else:
def thread(r):
corr = f(scores_X, scores_X_, rank=r, **kwargs)[0]
return corr
gen = (R[:, i] for i in range(len(X)))
res = np.array(list(pmap(thread, gen)), dtype=float)
del R
if not symmetric:
gc.collect()
return res
gen = (R_[:, i] for i in range(len(X_)))
res_ = np.array(list(pmap(thread, gen)), dtype=float)
del R_
gc.collect()
return np.round((res + res_) / 2, 12)
if symmetric:
M, M_ = pmap(makeM, [D[:, i:i + 1], D_[:, i:i + 1]])
thread = lambda M: rankdata(M, axis=1, method="average")
r, r_ = [r[0].astype(int) - 1 for r in tmap(thread, [M, M_])]  # todo: astype(int) and method="average" do not play nicely together for ties!
s1 = f(scores_X, scores_X_, r, **kwargs)[0]
s2 = f(scores_X, scores_X_, r_, **kwargs)[0]
return round((s1 + s2) / 2, 12)
M = makeM(D[:, i:i + 1])
r = rankdata(M, axis=1, method="average")[0].astype(int) - 1
return round(f(scores_X, scores_X_, r, **kwargs)[0], 12)
def rsortedness(X, X_, i=None, symmetric=True, f=weightedtau, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs): # pragma: no cover
"""
Reciprocal sortedness: consider the neighborhood relation the other way around
Might be good to assess the effect of a projection on hubness, and also to serve as a loss function for a custom projection algorithm.
WARNING: this function is experimental, i.e., not as well tested as the others; it might need a better algorithm/formula as well.
# TODO?: add flag to break (not so rare) cases of ties that persist after projection (implies a much slower algorithm)
This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets
Values can be lower due to the presence of ties, but only when the projection isn't perfect for all points.
In the end, it might even be desirable to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to keep than a specific order.
Parameters
----------
X
Original dataset
X_
Projected points
i
None: calculate rsortedness for all instances
`int`: index of the instance of interest
symmetric
True: Take the mean between extrusion and intrusion emphasis
Equivalent to `(rsortedness(a, b) + rsortedness(b, a)) / 2` at a slightly lower cost.
Might increase memory usage.
False: Weight by original distances (extrusion emphasis), not the projected distances.
f
Agreement function:
callable = scipy correlation function:
weightedtau (weighted Kendall’s τ is the default), kendalltau, spearmanr
Meaning of resulting values for correlation-based functions:
1.0: perfect projection (regarding order of examples)
0.0: random projection (enough distortion to have no information left when considering the overall ordering)
-1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering)
return_pvalues
For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr'
This makes more sense for Kendall's tau. [the weighted version might not yet have an established p-value calculation method]
The null hypothesis is that the projection is random, i.e., sortedness = 0.0.
parallel
None: Avoid high-memory parallelization
True: Full parallelism
False: No parallelism
parallel_kwargs
Any extra argument to be provided to pathos parallelization
parallel_n_trigger
Threshold to disable parallelization for small n values
kwargs
Arguments to be passed to the correlation measure
Returns
-------
Numpy vector
>>> ll = [[i] for i in range(17)]
>>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:])))
>>> b.ravel()
array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1])
>>> r = rsortedness(a, b)
>>> round(min(r), 12), round(max(r), 12)
(-0.707870893072, 0.961986592073)
>>> rnd = np.random.default_rng(1)
>>> rnd.shuffle(ll)
>>> b = np.array(ll)
>>> b.ravel()
array([ 1, 10, 14, 15, 7, 12, 3, 4, 5, 8, 0, 9, 2, 16, 13, 11, 6])
>>> r = rsortedness(a, b)
>>> np.round(r, 12)
array([-0.38455603, -0.28634813, 0.23902905, 0.19345863, -0.43727482,
-0.3498781 , 0.29240532, 0.52016504, 0.51878015, 0.07744892,
-0.03664284, -0.17163371, -0.16346701, -0.07260407, -0.03677776,
0.00183332, -0.25692691])
>>> round(min(r), 12), round(max(r), 12)
(-0.437274823593, 0.520165040078)
>>> round(mean(r), 12)
-0.020764055466
>>> import numpy as np
>>> from functools import partial
>>> from scipy.stats import spearmanr, weightedtau
>>> me = (1, 2)
>>> cov = eye(2)
>>> rng = np.random.default_rng(seed=10)
>>> original = rng.multivariate_normal(me, cov, size=30)
>>> from sklearn.decomposition import PCA
>>> projected2 = PCA(n_components=2).fit_transform(original)
>>> projected1 = PCA(n_components=1).fit_transform(original)
>>> np.random.seed(10)
>>> projectedrnd = permutation(original)
>>> s = rsortedness(original, original)
>>> round(min(s), 12), round(max(s), 12)
(1.0, 1.0)
>>> s = rsortedness(original, projected2)
>>> round(min(s), 12), round(max(s), 12)
(1.0, 1.0)
>>> s = rsortedness(original, projected1)
>>> round(min(s), 12), round(max(s), 12)
(0.160980548632, 0.967351026423)
>>> s = rsortedness(original, projectedrnd)
>>> round(min(s), 12), round(max(s), 12)
(-0.406104443754, 0.427134084097)
>>> s = rsortedness(original, original, f=kendalltau, return_pvalues=True)
>>> np.round(s.min(axis=0), 12), np.round(s.max(axis=0), 12)
(array([1., 0.]), array([1.00e+00, 3.58e-10]))
>>> s = rsortedness(original, projected2, f=kendalltau)
>>> round(min(s), 12), round(max(s), 12)
(1.0, 1.0)
>>> s = rsortedness(original, projected1, f=kendalltau)
>>> round(min(s), 12), round(max(s), 12)
(0.045545155427, 0.920252656485)
>>> s = rsortedness(original, projectedrnd, f=kendalltau)
>>> round(min(s), 12), round(max(s), 12)
(-0.302063337022, 0.306710199121)
>>> wf = partial(weightedtau, weigher=lambda x: 1 / (x**2 + 1))
>>> s = rsortedness(original, original, f=wf, return_pvalues=True)
>>> np.round(s.min(axis=0), 12), np.round(s.max(axis=0), 12)
(array([ 1., nan]), array([ 1., nan]))
>>> s = rsortedness(original, projected2, f=wf)
>>> round(min(s), 12), round(max(s), 12)
(1.0, 1.0)
>>> s = rsortedness(original, projected1, f=wf)
>>> round(min(s), 12), round(max(s), 12)
(-0.119320940022, 0.914184310816)
>>> s = rsortedness(original, projectedrnd, f=wf)
>>> round(min(s), 12), round(max(s), 12)
(-0.418929560486, 0.710828808816)
>>> np.random.seed(14980)
>>> projectedrnd = permutation(original)
>>> s = rsortedness(original, projectedrnd)
>>> round(min(s), 12), round(max(s), 12)
(-0.415049518972, 0.465004321022)
>>> s = rsortedness(original, np.flipud(original))
>>> round(min(s), 12), round(max(s), 12)
(-0.258519523874, 0.454184518962)
>>> original = np.array([[0],[1],[2],[3],[4],[5],[6]])
>>> projected = np.array([[6],[5],[4],[3],[2],[1],[0]])
>>> s = rsortedness(original, projected)
>>> round(min(s), 12), round(max(s), 12)
(1.0, 1.0)
>>> projected = np.array([[0],[6],[5],[4],[3],[2],[1]])
>>> s = rsortedness(original, projected)
>>> round(min(s), 12), round(max(s), 12)
(-0.755847611802, 0.872258373962)
>>> round(rsortedness(original, projected, 1), 12)
0.544020048033
>>> round(rsortedness(original, projected, 1, symmetric=False), 12)
0.498125132865
>>> round(rsortedness(projected, original, 1, symmetric=False), 12)
0.589914963202
>>> round(rsortedness(original, projected, rank=True)[1], 12)
0.544020048033
>>> round(rsortedness(original, projected, rank=False)[1], 12) # warning: will consider indexes as ranks!
0.208406304729
>>> round(rsortedness([[1,2,3,3],[1,2,7,3],[3,4,7,8],[5,2,6,3],[3,5,4,8],[2,7,7,5]], [[7,1,2,3],[3,7,7,3],[5,4,5,6],[9,7,6,3],[2,3,5,1],[1,2,6,3]], 1), 12)
-0.294037071368
>>> from scipy.stats import weightedtau
>>> weightedtau.isweightedtau = False # warning: will deactivate weightedtau's automatic negation of scores!
>>> round(rsortedness(original, projected, 1, f=weightedtau, rank=None), 12)
0.483816220002
>>> weightedtau.isweightedtau = True
"""
isweightedtau = False
if hasattr(f, "isweightedtau") and f.isweightedtau:
isweightedtau = True
if not symmetric:
if "rank" in kwargs:
raise Exception(f"Cannot set `symmetric=False` and provide `rank` at the same time.")
kwargs["rank"] = None
if parallel_kwargs is None:
parallel_kwargs = {}
npoints = len(X)
tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
pmap = mp.ProcessingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map
D, D_ = tmap(lambda M: cdist(M, M, metric="sqeuclidean"), [X, X_])
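# Reciprocal view: ranks are taken along columns, so each score vector describes how a point
# ranks among the neighbors of every other point, rather than how its own neighbors rank around it.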
R, R_ = (rank_alongcol(M, parallel=parallel) for M in [D, D_])
scores_X, scores_X_ = tmap(lambda M: remove_diagonal(M), [R, R_])
if isweightedtau:
scores_X, scores_X_ = -scores_X, -scores_X_
if hasattr(f, "isparwtau"): # pragma: no cover
raise Exception("TODO: Pairtau implementation disagree with scipy weightedtau")
# return parwtau(scores_X, scores_X_, npoints, parallel=parallel, **kwargs)
if i is not None:
scores_X, scores_X_ = scores_X[i], scores_X_[i]
return common(scores_X, scores_X_, i, symmetric, f, isweightedtau, return_pvalues, pmap, kwargs)
def stress(X, X_, i=None, metric=True, parallel=True, parallel_n_trigger=10000, **parallel_kwargs):
"""
Kruskal's "Stress Formula 1", with distances normalized before comparison.
Distances: Euclidean.
Parameters
----------
X
matrix with an instance by row in a given space (often the original one)
X_
matrix with an instance by row in another given space (often the projected one)
i
None: calculate stress for all instances
`int`: index of the instance of interest
metric
Stress formula version: metric or nonmetric
parallel
Parallelize processing when `X.size` exceeds `parallel_n_trigger`. Might use more memory.
parallel_n_trigger
Threshold to disable parallelization for small datasets
parallel_kwargs
Any extra argument to be provided to pathos parallelization
Returns
-------
ndarray containing a stress value per row, or a single float when `i` is provided
>>> import numpy as np
>>> from functools import partial
>>> from scipy.stats import spearmanr, weightedtau
>>> mean = (1, 12)
>>> cov = eye(2)
>>> rng = np.random.default_rng(seed=0)
>>> original = rng.multivariate_normal(mean, cov, size=12)
>>> original
array([[ 1.12573022, 11.86789514],
[ 1.64042265, 12.10490012],
[ 0.46433063, 12.36159505],
[ 2.30400005, 12.94708096],
[ 0.29626476, 10.73457853],
[ 0.37672554, 12.04132598],
[-1.32503077, 11.78120834],
[-0.24591095, 11.26773265],
[ 0.45574102, 11.68369984],
[ 1.41163054, 13.04251337],
[ 0.87146534, 13.36646347],
[ 0.33480533, 12.35151007]])
>>> s = stress(original, original*5)
>>> round(min(s), 12), round(max(s), 12), s
(0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))
>>> from sklearn.decomposition import PCA
>>> projected = PCA(n_components=2).fit_transform(original)
>>> s = stress(original, projected)
>>> round(min(s), 12), round(max(s), 12), s
(0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))
>>> projected = PCA(n_components=1).fit_transform(original)
>>> s = stress(original, projected)
>>> round(min(s), 12), round(max(s), 12), s
(0.073383317103, 0.440609121637, array([0.26748441, 0.31603101, 0.24636389, 0.07338332, 0.34571508,
0.19548442, 0.1800883 , 0.16544039, 0.2282494 , 0.16405274,
0.44060912, 0.27058614]))
>>> stress(original, projected)
array([0.26748441, 0.31603101, 0.24636389, 0.07338332, 0.34571508,
0.19548442, 0.1800883 , 0.16544039, 0.2282494 , 0.16405274,
0.44060912, 0.27058614])
>>> stress(original, projected, metric=False)
array([0.36599664, 0.39465927, 0.27349092, 0.25096851, 0.31476019,
0.27612935, 0.3064739 , 0.26141414, 0.2635681 , 0.25811772,
0.36113025, 0.29740821])
>>> stress(original, projected, 1)
0.316031007598
>>> stress(original, projected, 1, metric=False)
0.39465927169
"""
tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and X.size > parallel_n_trigger else map
# TODO: parallelize cdist in slices?
if metric:
thread = (lambda M, m: cdist(M, M, metric=m)) if i is None else (lambda M, m: cdist(M[i:i + 1], M, metric=m))
D, Dsq_ = tmap(thread, [X, X_], ["Euclidean", "sqeuclidean"])
Dsq_ /= Dsq_.max(axis=1, keepdims=True)
D_ = sqrt(Dsq_)
else:
thread = (lambda M, m: rankdata(cdist(M, M, metric=m), method="average", axis=1) - 1) if i is None else (lambda M, m: rankdata(cdist(M[i:i + 1], M, metric=m), method="average", axis=1) - 1)
D, Dsq_ = tmap(thread, [X, X_], ["Euclidean", "sqeuclidean"])
Dsq_ /= Dsq_.max(axis=1, keepdims=True)
D_ = sqrt(Dsq_)
D /= D.max(axis=1, keepdims=True)
s = ((D - D_) ** 2).sum(axis=1) / 2
result = np.round(np.sqrt(s / (Dsq_.sum(axis=1) / 2)), 12)
return result if i is None else result.flat[0]
def hyperbolic(x):
"""
>>> import numpy as np
>>> np.round(list(map(hyperbolic, range(10))), 12).tolist()
[1.0, 0.5, 0.333333333333, 0.25, 0.2, 0.166666666667, 0.142857142857, 0.125, 0.111111111111, 0.1]
"""
return 1 / (1 + x)
def hyperbolic_np(x):
"""
>>> import numpy as np
>>> np.round(hyperbolic_np(list(range(10))), 12).tolist()
[1.0, 0.5, 0.333333333333, 0.25, 0.2, 0.166666666667, 0.142857142857, 0.125, 0.111111111111, 0.1]
"""
return np.divide(1, np.add(1, x))
def gaussian(x, ampl=1., sigma=1.):
"""
>>> import numpy as np
>>> np.round(list(map(gaussian, range(10))), 12).tolist()
[1.0, 0.606530659713, 0.135335283237, 0.011108996538, 0.000335462628, 3.726653e-06, 1.523e-08, 2.3e-11, 0.0, 0.0]
"""
return ampl * exp(- (x / sigma) ** 2 / 2)
def gaussian_np(x, ampl=1., sigma=1.):
"""
>>> import numpy as np
>>> np.round(gaussian_np(list(range(10))), 12).tolist()
[1.0, 0.606530659713, 0.135335283237, 0.011108996538, 0.000335462628, 3.726653e-06, 1.523e-08, 2.3e-11, 0.0, 0.0]
"""
return ampl * np.exp(- np.divide(x, sigma) ** 2 / 2)
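A minimal usage sketch (not part of the module source; the random dataset and the PCA projection below are merely illustrative) comparing the three main measures on the same data:

import numpy as np
from sklearn.decomposition import PCA
from sortedness.local import sortedness, pwsortedness, stress

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))               # original (3-dimensional) points
X_ = PCA(n_components=2).fit_transform(X)  # 2-dimensional projection

print(sortedness(X, X_).mean())    # per-point rank agreement (1.0 = perfect)
print(pwsortedness(X, X_).mean())  # pairwise variant
print(stress(X, X_).mean())        # Kruskal stress (0.0 = perfect)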
Functions
def common(S, S_, i, symmetric, f, isweightedtau, return_pvalues, pmap, kwargs)
Shared helper that applies the agreement function `f` to precomputed score vectors, optionally averaging the extrusion and intrusion emphases (see the source code above).
def gaussian(x, ampl=1.0, sigma=1.0)
Gaussian weighting function: `ampl * exp(-(x / sigma)² / 2)`.
def gaussian_np(x, ampl=1.0, sigma=1.0)
Vectorized (NumPy) version of `gaussian`.
def hyperbolic(x)
Hyperbolic weighting function: `1 / (1 + x)`.
def hyperbolic_np(x)
Vectorized (NumPy) version of `hyperbolic`.
def permutation(x)
Randomly permute a sequence, or return a permuted range (re-exported from `numpy.random`).
If `x` is an integer, randomly permute `np.arange(x)`; if `x` is an array, make a copy and shuffle the elements randomly. Multi-dimensional arrays are only shuffled along their first index. New code should prefer the `permutation` method of a `numpy.random.Generator` instance; see the NumPy documentation for details and examples.
def pwsortedness(X, X_, i=None, symmetric=True, f=<function weightedtau>, parallel=True, parallel_n_trigger=200, batches=10, debug=False, dist=None, cython=False, parallel_kwargs=None, **kwargs)
Local pairwise sortedness (Λ𝜏w) based on Sebastiano Vigna's weighted Kendall's 𝜏.
Importance rankings are calculated internally based on the proximity of each pair to the point of interest.
See the docstring in the source code above for the full parameter description and doctest examples.
def remove_diagonal(X)
Return `X` without its main diagonal, reshaped to `(n, n - 1)`.
def rsortedness(X, X_, i=None, symmetric=True, f=<function weightedtau>, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs)
-
Reciprocal sortedness: consider the neighborhood relation the other way around
Might be good to assess the effect of a projection on hubness, and also to serve as a loss function for a custom projection algorithm.
WARNING: this function is experimental, i.e., not as well tested as the others; it might need a better algorithm/fomula as well.
TODO?: add flag to break (not so rare) cases of ties that persist after projection (implies a much slower algorithm)
This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets Values can be lower due to the presence of ties, but only when the projection isn't prefect for all points. In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order.
Parameters
X
- Original dataset
X_
- Projected points
i
- None:
calculate rsortedness for all instances
int
: index of the instance of interest symmetric
- True: Take the mean between extrusion and intrusion emphasis
Equivalent to
(rsortedness(a, b) + rsortedness(b, a)) / 2
at a slightly lower cost. Might increase memory usage. False: Weight by original distances (extrusion emphasis), not the projected distances. f
- Agreement function: callable = scipy correlation function: weightedtau (weighted Kendall’s τ is the default), kendalltau, spearmanr Meaning of resulting values for correlation-based functions: 1.0: perfect projection (regarding order of examples) 0.0: random projection (enough distortion to have no information left when considering the overall ordering) -1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering)
return_pvalues
- For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr' This makes more sense for Kendall's tau. [the weighted version might not have yet a established pvalue calculation method at this moment] The null hypothesis is that the projection is random, i.e., sortedness = 0.0.
parallel
- None: Avoid high-memory parallelization True: Full parallelism False: No parallelism
parallel_kwargs
- Any extra argument to be provided to pathos parallelization
parallel_n_trigger
- Threshold to disable parallelization for small n values
kwargs
- Arguments to be passed to the correlation measure
Returns
Numpy vector
>>> ll = [[i] for i in range(17)] >>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:]))) >>> b.ravel() <code>array(\[ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1])</code> : >>> r = rsortedness(a, b) >>> round(min(r), 12), round(max(r), 12) (-0.707870893072, 0.961986592073)
>>> rnd = np.random.default_rng(1) >>> rnd.shuffle(ll) >>> b = np.array(ll) >>> b.ravel() <code>array(\[ 1, 10, 14, 15, 7, 12, 3, 4, 5, 8, 0, 9, 2, 16, 13, 11, 6])</code> : >>> r = rsortedness(a, b) >>> np.round(r, 12) `array([-0.38455603, -0.28634813, 0.23902905, 0.19345863, -0.43727482,` : -0.3498781 , 0.29240532, 0.52016504, 0.51878015, 0.07744892, -0.03664284, -0.17163371, -0.16346701, -0.07260407, -0.03677776, 0.00183332, -0.25692691]) >>> round(min(r), 12), round(max(r), 12) (-0.437274823593, 0.520165040078) >>> round(mean(r), 12) -0.020764055466
>>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> me = (1, 2) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=10) >>> original = rng.multivariate_normal(me, cov, size=30) >>> from sklearn.decomposition import PCA >>> projected2 = PCA(n_components=2).fit_transform(original) >>> projected1 = PCA(n_components=1).fit_transform(original) >>> np.random.seed(10) >>> projectedrnd = permutation(original)
>>> s = rsortedness(original, original) >>> round(min(s), 12), round(max(s), 12) (1.0, 1.0) >>> s = rsortedness(original, projected2) >>> round(min(s), 12), round(max(s), 12) (1.0, 1.0) >>> s = rsortedness(original, projected1) >>> round(min(s), 12), round(max(s), 12) (0.160980548632, 0.967351026423) >>> s = rsortedness(original, projectedrnd) >>> round(min(s), 12), round(max(s), 12) (-0.406104443754, 0.427134084097) >>> s = rsortedness(original, original, f=kendalltau, return_pvalues=True) >>> np.round(s.min(axis=0), 12), np.round(s.max(axis=0), 12) (array([1., 0.]), array([1.00e+00, 3.58e-10])) >>> s = rsortedness(original, projected2, f=kendalltau) >>> round(min(s), 12), round(max(s), 12) (1.0, 1.0) >>> s = rsortedness(original, projected1, f=kendalltau) >>> round(min(s), 12), round(max(s), 12) (0.045545155427, 0.920252656485) >>> s = rsortedness(original, projectedrnd, f=kendalltau) >>> round(min(s), 12), round(max(s), 12) (-0.302063337022, 0.306710199121) >>> wf = partial(weightedtau, weigher=lambda x: 1 / (x**2 + 1)) >>> s = rsortedness(original, original, f=wf, return_pvalues=True) >>> np.round(s.min(axis=0), 12), np.round(s.max(axis=0), 12) (array([ 1., nan]), array([ 1., nan])) >>> s = rsortedness(original, projected2, f=wf) >>> round(min(s), 12), round(max(s), 12) (1.0, 1.0) >>> s = rsortedness(original, projected1, f=wf) >>> round(min(s), 12), round(max(s), 12) (-0.119320940022, 0.914184310816) >>> s = rsortedness(original, projectedrnd, f=wf) >>> round(min(s), 12), round(max(s), 12) (-0.418929560486, 0.710828808816) >>> np.random.seed(14980) >>> projectedrnd = permutation(original) >>> s = rsortedness(original, projectedrnd) >>> round(min(s), 12), round(max(s), 12) (-0.415049518972, 0.465004321022) >>> s = rsortedness(original, np.flipud(original)) >>> round(min(s), 12), round(max(s), 12) (-0.258519523874, 0.454184518962) >>> original = np.array([[0],[1],[2],[3],[4],[5],[6]]) >>> projected = np.array([[6],[5],[4],[3],[2],[1],[0]]) >>> s = rsortedness(original, projected) >>> round(min(s), 12), round(max(s), 12) (1.0, 1.0) >>> projected = np.array([[0],[6],[5],[4],[3],[2],[1]]) >>> s = rsortedness(original, projected) >>> round(min(s), 12), round(max(s), 12) (-0.755847611802, 0.872258373962) >>> round(rsortedness(original, projected, 1), 12) 0.544020048033 >>> round(rsortedness(original, projected, 1, symmetric=False), 12) 0.498125132865 >>> round(rsortedness(projected, original, 1, symmetric=False), 12) 0.589914963202 >>> round(rsortedness(original, projected, rank=True)[1], 12) 0.544020048033 >>> round(rsortedness(original, projected, rank=False)[1], 12) # warning: will consider indexes as ranks! 0.208406304729 >>> round(rsortedness([[1,2,3,3],[1,2,7,3],[3,4,7,8],[5,2,6,3],[3,5,4,8],[2,7,7,5]], [[7,1,2,3],[3,7,7,3],[5,4,5,6],[9,7,6,3],[2,3,5,1],[1,2,6,3]], 1), 12) -0.294037071368 >>> from scipy.stats import weightedtau >>> weightedtau.isweightedtau = False # warning: will deactivate wau's auto-negativation of scores! >>> round(rsortedness(original, projected, 1, f=weightedtau, rank=None), 12) 0.483816220002 >>> weightedtau.isweightedtau = True
Expand source code
def rsortedness(X, X_, i=None, symmetric=True, f=weightedtau, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs): # pragma: no cover """ Reciprocal sortedness: consider the neighborhood relation the other way around Might be good to assess the effect of a projection on hubness, and also to serve as a loss function for a custom projection algorithm. WARNING: this function is experimental, i.e., not as well tested as the others; it might need a better algorithm/fomula as well. # TODO?: add flag to break (not so rare) cases of ties that persist after projection (implies a much slower algorithm) This probably doesn't make any difference on the result, except on categorical, pathological or toy datasets Values can be lower due to the presence of ties, but only when the projection isn't prefect for all points. In the end, it might be even desired to penalize ties, as they don't exactly contribute to a stronger ordering and are (probabilistically) easier to be kept than a specific order. Parameters ---------- X Original dataset X_ Projected points i None: calculate rsortedness for all instances `int`: index of the instance of interest symmetric True: Take the mean between extrusion and intrusion emphasis Equivalent to `(rsortedness(a, b) + rsortedness(b, a)) / 2` at a slightly lower cost. Might increase memory usage. False: Weight by original distances (extrusion emphasis), not the projected distances. f Agreement function: callable = scipy correlation function: weightedtau (weighted Kendall’s τ is the default), kendalltau, spearmanr Meaning of resulting values for correlation-based functions: 1.0: perfect projection (regarding order of examples) 0.0: random projection (enough distortion to have no information left when considering the overall ordering) -1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering) return_pvalues For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr' This makes more sense for Kendall's tau. [the weighted version might not have yet a established pvalue calculation method at this moment] The null hypothesis is that the projection is random, i.e., sortedness = 0.0. 
parallel None: Avoid high-memory parallelization True: Full parallelism False: No parallelism parallel_kwargs Any extra argument to be provided to pathos parallelization parallel_n_trigger Threshold to disable parallelization for small n values kwargs Arguments to be passed to the correlation measure Returns ------- Numpy vector >>> ll = [[i] for i in range(17)] >>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:]))) >>> b.ravel() array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]) >>> r = rsortedness(a, b) >>> round(min(r), 12), round(max(r), 12) (-0.707870893072, 0.961986592073) >>> rnd = np.random.default_rng(1) >>> rnd.shuffle(ll) >>> b = np.array(ll) >>> b.ravel() array([ 1, 10, 14, 15, 7, 12, 3, 4, 5, 8, 0, 9, 2, 16, 13, 11, 6]) >>> r = rsortedness(a, b) >>> np.round(r, 12) array([-0.38455603, -0.28634813, 0.23902905, 0.19345863, -0.43727482, -0.3498781 , 0.29240532, 0.52016504, 0.51878015, 0.07744892, -0.03664284, -0.17163371, -0.16346701, -0.07260407, -0.03677776, 0.00183332, -0.25692691]) >>> round(min(r), 12), round(max(r), 12) (-0.437274823593, 0.520165040078) >>> round(mean(r), 12) -0.020764055466 >>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> me = (1, 2) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=10) >>> original = rng.multivariate_normal(me, cov, size=30) >>> from sklearn.decomposition import PCA >>> projected2 = PCA(n_components=2).fit_transform(original) >>> projected1 = PCA(n_components=1).fit_transform(original) >>> np.random.seed(10) >>> projectedrnd = permutation(original) >>> s = rsortedness(original, original) >>> round(min(s), 12), round(max(s), 12) (1.0, 1.0) >>> s = rsortedness(original, projected2) >>> round(min(s), 12), round(max(s), 12) (1.0, 1.0) >>> s = rsortedness(original, projected1) >>> round(min(s), 12), round(max(s), 12) (0.160980548632, 0.967351026423) >>> s = rsortedness(original, projectedrnd) >>> round(min(s), 12), round(max(s), 12) (-0.406104443754, 0.427134084097) >>> s = rsortedness(original, original, f=kendalltau, return_pvalues=True) >>> np.round(s.min(axis=0), 12), np.round(s.max(axis=0), 12) (array([1., 0.]), array([1.00e+00, 3.58e-10])) >>> s = rsortedness(original, projected2, f=kendalltau) >>> round(min(s), 12), round(max(s), 12) (1.0, 1.0) >>> s = rsortedness(original, projected1, f=kendalltau) >>> round(min(s), 12), round(max(s), 12) (0.045545155427, 0.920252656485) >>> s = rsortedness(original, projectedrnd, f=kendalltau) >>> round(min(s), 12), round(max(s), 12) (-0.302063337022, 0.306710199121) >>> wf = partial(weightedtau, weigher=lambda x: 1 / (x**2 + 1)) >>> s = rsortedness(original, original, f=wf, return_pvalues=True) >>> np.round(s.min(axis=0), 12), np.round(s.max(axis=0), 12) (array([ 1., nan]), array([ 1., nan])) >>> s = rsortedness(original, projected2, f=wf) >>> round(min(s), 12), round(max(s), 12) (1.0, 1.0) >>> s = rsortedness(original, projected1, f=wf) >>> round(min(s), 12), round(max(s), 12) (-0.119320940022, 0.914184310816) >>> s = rsortedness(original, projectedrnd, f=wf) >>> round(min(s), 12), round(max(s), 12) (-0.418929560486, 0.710828808816) >>> np.random.seed(14980) >>> projectedrnd = permutation(original) >>> s = rsortedness(original, projectedrnd) >>> round(min(s), 12), round(max(s), 12) (-0.415049518972, 0.465004321022) >>> s = rsortedness(original, np.flipud(original)) >>> round(min(s), 12), round(max(s), 12) (-0.258519523874, 0.454184518962) >>> original = np.array([[0],[1],[2],[3],[4],[5],[6]]) >>> 
projected = np.array([[6],[5],[4],[3],[2],[1],[0]]) >>> s = rsortedness(original, projected) >>> round(min(s), 12), round(max(s), 12) (1.0, 1.0) >>> projected = np.array([[0],[6],[5],[4],[3],[2],[1]]) >>> s = rsortedness(original, projected) >>> round(min(s), 12), round(max(s), 12) (-0.755847611802, 0.872258373962) >>> round(rsortedness(original, projected, 1), 12) 0.544020048033 >>> round(rsortedness(original, projected, 1, symmetric=False), 12) 0.498125132865 >>> round(rsortedness(projected, original, 1, symmetric=False), 12) 0.589914963202 >>> round(rsortedness(original, projected, rank=True)[1], 12) 0.544020048033 >>> round(rsortedness(original, projected, rank=False)[1], 12) # warning: will consider indexes as ranks! 0.208406304729 >>> round(rsortedness([[1,2,3,3],[1,2,7,3],[3,4,7,8],[5,2,6,3],[3,5,4,8],[2,7,7,5]], [[7,1,2,3],[3,7,7,3],[5,4,5,6],[9,7,6,3],[2,3,5,1],[1,2,6,3]], 1), 12) -0.294037071368 >>> from scipy.stats import weightedtau >>> weightedtau.isweightedtau = False # warning: will deactivate wau's auto-negativation of scores! >>> round(rsortedness(original, projected, 1, f=weightedtau, rank=None), 12) 0.483816220002 >>> weightedtau.isweightedtau = True """ isweightedtau = False if hasattr(f, "isweightedtau") and f.isweightedtau: isweightedtau = True if not symmetric: if "rank" in kwargs: raise Exception(f"Cannot set `symmetric=False` and provide `rank` at the same time.") kwargs["rank"] = None if parallel_kwargs is None: parallel_kwargs = {} npoints = len(X) tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map pmap = mp.ProcessingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map D, D_ = tmap(lambda M: cdist(M, M, metric="sqeuclidean"), [X, X_]) R, R_ = (rank_alongcol(M, parallel=parallel) for M in [D, D_]) scores_X, scores_X_ = tmap(lambda M: remove_diagonal(M), [R, R_]) if isweightedtau: scores_X, scores_X_ = -scores_X, -scores_X_ if hasattr(f, "isparwtau"): # pragma: no cover raise Exception("TODO: Pairtau implementation disagree with scipy weightedtau") # return parwtau(scores_X, scores_X_, npoints, parallel=parallel, **kwargs) if i is not None: scores_X, scores_X_ = scores_X[i], scores_X_[i] return common(scores_X, scores_X_, i, symmetric, f, isweightedtau, return_pvalues, pmap, kwargs)
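Since the docstring above suggests `rsortedness` for assessing how a projection affects hubness, here is a hedged usage sketch (assuming `rsortedness` is importable from `sortedness.local` and scikit-learn is installed) comparing two PCA projections of the same data:

import numpy as np
from sklearn.decomposition import PCA
from sortedness.local import rsortedness

rng = np.random.default_rng(0)
original = rng.multivariate_normal((0, 0, 0), np.eye(3), size=40)
proj2d = PCA(n_components=2).fit_transform(original)
proj1d = PCA(n_components=1).fit_transform(original)

# One reciprocal-sortedness value per point; higher means the reciprocal ordering is better preserved.
r2 = rsortedness(original, proj2d)
r1 = rsortedness(original, proj1d)
print(round(float(np.mean(r2)), 3), round(float(np.mean(r1)), 3))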
def sortedness(X, X_, i=None, symmetric=True, f=weightedtau, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs)
-
Calculate the sortedness (stress-like correlation-based measure that focuses on ordering of points) value for each point Functions available as scipy correlation coefficients: ρ-sortedness (Spearman), 𝜏-sortedness (Kendall's 𝜏), w𝜏-sortedness (Sebastiano Vigna weighted Kendall's 𝜏) ← default
Note
Categorical or pathological data might present values lower than one, due to the presence of ties, even with a perfect projection. Depending on the chosen correlation coefficient, ties are penalized, as they do not contribute to establishing any order.
Hint
Swap two points A and B in X_ to calculate the sortedness between A and B within a single space (i.e., originally X = X_):
X = [A, B, C, ..., Z]
X_ = [B, A, C, ..., Z]
sortedness(X, X_, i=0)
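A minimal sketch of this hint (assuming `sortedness` is importable from `sortedness.local`; it mirrors the `M[0], M[1] = original[1], original[0]` doctest further below):

import numpy as np
from sortedness.local import sortedness

rng = np.random.default_rng(0)
X = rng.normal(size=(10, 2))        # one space; A is row 0, B is row 1
X_swapped = X.copy()
X_swapped[[0, 1]] = X[[1, 0]]       # swap A and B

# Sortedness "between" A and B inside the same space:
value = sortedness(X_swapped, X, i=0)
print(round(float(value), 12))      # a single float in [-1, 1]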
Parameters
X
- matrix with an instance by row in a given space (often the original one)
X_
- matrix with an instance by row in another given space (often the projected one)
i
- None: calculate sortedness for all instances
  int: index of the instance of interest
symmetric
- True: Take the mean between extrusion and intrusion emphasis.
  Equivalent to (sortedness(a, b, symmetric=False) + sortedness(b, a, symmetric=False)) / 2 at a slightly lower cost. Might increase memory usage.
  False: Weight by original distances (extrusion emphasis), not the projected distances.
f
- Agreement function: callable = scipy correlation function: weightedtau (weighted Kendall’s τ is the default), kendalltau, spearmanr.
  Meaning of resulting values for correlation-based functions:
  1.0: perfect projection (regarding order of examples)
  0.0: random projection (enough distortion to have no information left when considering the overall ordering)
  -1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering)
return_pvalues
- For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr'. This makes more sense for Kendall's tau; the weighted version might not yet have an established p-value calculation method. The null hypothesis is that the projection is random, i.e., sortedness = 0.0.
parallel
- None: avoid high-memory parallelization
  True: full parallelism
  False: no parallelism
parallel_kwargs
- Any extra arguments to be passed to pathos parallelization
parallel_n_trigger
- Threshold to disable parallelization for small n values
kwargs
- Extra arguments forwarded to the correlation measure f (see the sketch after this parameter list)
Returns
ndarray containing a sortedness value per row, or a single float when i is given (p-values are included as a second column/value if requested)
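The extra `kwargs` (and the choice of `f`) reach the agreement function directly; a compact, hedged sketch of this (assuming `sortedness` is importable from `sortedness.local`), mirroring the `partial(weightedtau, weigher=...)` and `kendalltau` doctests below:

import numpy as np
from functools import partial
from scipy.stats import kendalltau, weightedtau
from sortedness.local import sortedness

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
X_ = X[:, :2]                                   # crude "projection": drop one coordinate

# Keyword arguments are forwarded to f; binding a custom weigher with partial works the same way.
wf = partial(weightedtau, weigher=lambda r: 1 / (r ** 2 + 1))
per_point = sortedness(X, X_, f=wf)             # one value per row of X
corr_and_p = sortedness(X, X_, f=kendalltau, return_pvalues=True)
print(per_point.shape, corr_and_p.shape)        # (20,) and (20, 2)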
>>> ll = [[i] for i in range(17)] >>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:]))) >>> b.ravel() array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]) >>> r = sortedness(a, b) >>> from statistics import median >>> round(min(r), 12), round(max(r), 12), round(median(r),12) (-1.0, 0.998638259786, 0.937548981983)
>>> rnd = np.random.default_rng(0) >>> rnd.shuffle(ll) >>> b = np.array(ll) >>> b.ravel() array([ 2, 10, 3, 11, 0, 4, 7, 5, 16, 12, 13, 6, 9, 14, 8, 1, 15]) >>> r = sortedness(a, b) >>> r array([ 0.24691868, -0.17456491, 0.19184376, -0.18193532, 0.07175694, 0.27992254, 0.04121859, 0.16249574, -0.03506842, 0.27856259, 0.40866965, -0.07617887, 0.12184064, 0.24762942, -0.05049511, -0.46277399, 0.12193493]) >>> round(min(r), 12), round(max(r), 12) (-0.462773990559, 0.408669653064) >>> round(mean(r), 12) 0.070104521222
>>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> me = (1, 2) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=0) >>> original = rng.multivariate_normal(me, cov, size=12) >>> from sklearn.decomposition import PCA >>> projected2 = PCA(n_components=2).fit_transform(original) >>> projected1 = PCA(n_components=1).fit_transform(original) >>> np.random.seed(0) >>> projectedrnd = permutation(original)
>>> s = sortedness(original, original) >>> round(min(s), 12), round(max(s), 12), s (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))
Measure sortedness between two points in the same space.
>>> M = original.copy() >>> M[0], M[1] = original[1], original[0] >>> round(sortedness(M, original, 0), 12) 0.547929184934
>>> s = sortedness(original, projected2) >>> round(min(s), 12), round(max(s), 12), s (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])) >>> s = sortedness(original, projected1) >>> round(min(s), 12), round(max(s), 12) (0.393463224666, 0.944810120534) >>> s = sortedness(original, projectedrnd) >>> round(min(s), 12), round(max(s), 12) (-0.648305479567, 0.397019507592)
>>> np.round(sortedness(original, original, f=kendalltau, return_pvalues=True), 12) array([[1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08]]) >>> sortedness(original, projected2, f=kendalltau) array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]) >>> sortedness(original, projected1, f=kendalltau) array([0.56363636, 0.52727273, 0.81818182, 0.96363636, 0.70909091, 0.85454545, 0.74545455, 0.92727273, 0.85454545, 0.89090909, 0.6 , 0.74545455]) >>> sortedness(original, projectedrnd, f=kendalltau) array([ 0.2 , -0.38181818, 0.23636364, -0.09090909, -0.05454545, 0.23636364, -0.09090909, 0.23636364, -0.63636364, -0.01818182, -0.2 , -0.01818182])
>>> wf = partial(weightedtau, weigher=lambda x: 1 / (x**2 + 1)) >>> sortedness(original, original, f=wf, return_pvalues=True) array([[ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan]]) >>> sortedness(original, projected2, f=wf) array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]) >>> sortedness(original, projected1, f=wf) array([0.89469168, 0.89269637, 0.92922928, 0.99721669, 0.86529591, 0.97806422, 0.94330979, 0.99357377, 0.87959707, 0.92182767, 0.87256459, 0.87747329]) >>> sortedness(original, projectedrnd, f=wf) array([ 0.23771513, -0.2790059 , 0.3718005 , -0.16623167, 0.06179047, 0.40434396, -0.00130294, 0.46569739, -0.67581876, -0.23852189, -0.39125007, 0.12131153]) >>> np.random.seed(14980) >>> projectedrnd = permutation(original) >>> sortedness(original, projectedrnd) array([ 0.24432153, -0.19634576, -0.00238081, -0.4999116 , -0.01625951, 0.22478766, 0.07176118, -0.48092843, 0.19345964, -0.44895295, -0.42044773, 0.06942218]) >>> sortedness(original, np.flipud(original)) array([-0.28741742, 0.36769361, 0.06926091, 0.02550202, 0.21424544, -0.3244699 , -0.3244699 , 0.21424544, 0.02550202, 0.06926091, 0.36769361, -0.28741742]) >>> original = np.array([[0],[1],[2],[3],[4],[5],[6]]) >>> projected = np.array([[6],[5],[4],[3],[2],[1],[0]]) >>> sortedness(original, projected) array([1., 1., 1., 1., 1., 1., 1.]) >>> projected = np.array([[0],[6],[5],[4],[3],[2],[1]]) >>> sortedness(original, projected) array([-1. , 0.51956213, 0.81695345, 0.98180162, 0.98180162, 0.81695345, 0.51956213]) >>> round(sortedness(original, projected, 1), 12) 0.519562134793 >>> round(sortedness(original, projected, 1, symmetric=False), 12) 0.422638894922 >>> round(sortedness(projected, original, 1, symmetric=False), 12) 0.616485374665 >>> round(sortedness(original, projected, rank=True)[1], 12) 0.519562134793 >>> round(sortedness(original, projected, rank=False)[1], 12) # warning: will consider indexes as ranks! 0.074070734162 >>> round(sortedness([[1,2,3,3],[1,2,7,3],[3,4,7,8],[5,2,6,3],[3,5,4,8],[2,7,7,5]], [[7,1,2,3],[3,7,7,3],[5,4,5,6],[9,7,6,3],[2,3,5,1],[1,2,6,3]], 1), 12) -1.0 >>> from scipy.stats import weightedtau >>> weightedtau.isweightedtau = False # warning: will deactivate wau's auto-negativation of scores! >>> round(sortedness(original, projected, 1, f=weightedtau, rank=None), 12) 0.275652884819 >>> weightedtau.isweightedtau = True
Expand source code
def sortedness(X, X_, i=None, symmetric=True, f=weightedtau, return_pvalues=False, parallel=True, parallel_n_trigger=500, parallel_kwargs=None, **kwargs): """ Calculate the sortedness (stress-like correlation-based measure that focuses on ordering of points) value for each point Functions available as scipy correlation coefficients: ρ-sortedness (Spearman), 𝜏-sortedness (Kendall's 𝜏), w𝜏-sortedness (Sebastiano Vigna weighted Kendall's 𝜏) ← default Note: Categorical, or pathological data might present values lower than one due to the presence of ties even with a perfect projection. Depending on the chosen correlation coefficient, ties are penalized, as they do not contribute to establishing any order. Hint: Swap two points A and B at X_ to be able to calculate sortedness between A and B in the same space (i.e., originally, `X = X_`): `X = [A, B, C, ..., Z]` `X_ = [B, A, C, ..., Z]` `sortedness(X, X_, i=0)` Parameters ---------- X matrix with an instance by row in a given space (often the original one) X_ matrix with an instance by row in another given space (often the projected one) i None: calculate sortedness for all instances `int`: index of the instance of interest symmetric True: Take the mean between extrusion and intrusion emphasis Equivalent to `(sortedness(a, b, symmetric=False) + sortedness(b, a, symmetric=False)) / 2` at a slightly lower cost. Might increase memory usage. False: Weight by original distances (extrusion emphasis), not the projected distances. f Agreement function: callable = scipy correlation function: weightedtau (weighted Kendall’s τ is the default), kendalltau, spearmanr Meaning of resulting values for correlation-based functions: 1.0: perfect projection (regarding order of examples) 0.0: random projection (enough distortion to have no information left when considering the overall ordering) -1.0: worst possible projection (mostly theoretical; it represents the "opposite" of the original ordering) return_pvalues For scipy correlation functions, return a 2-column matrix 'corr, pvalue' instead of just 'corr' This makes more sense for Kendall's tau. [the weighted version might not have yet a established pvalue calculation method at this moment] The null hypothesis is that the projection is random, i.e., sortedness = 0.0. 
parallel None: Avoid high-memory parallelization True: Full parallelism False: No parallelism parallel_kwargs Any extra argument to be provided to pathos parallelization parallel_n_trigger Threshold to disable parallelization for small n values kwargs Arguments to be passed to the correlation measure Returns ------- ndarray containing a sortedness value per row, or a single float (include pvalues as a second value if requested) >>> ll = [[i] for i in range(17)] >>> a, b = np.array(ll), np.array(ll[0:1] + list(reversed(ll[1:]))) >>> b.ravel() array([ 0, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]) >>> r = sortedness(a, b) >>> from statistics import median >>> round(min(r), 12), round(max(r), 12), round(median(r),12) (-1.0, 0.998638259786, 0.937548981983) >>> rnd = np.random.default_rng(0) >>> rnd.shuffle(ll) >>> b = np.array(ll) >>> b.ravel() array([ 2, 10, 3, 11, 0, 4, 7, 5, 16, 12, 13, 6, 9, 14, 8, 1, 15]) >>> r = sortedness(a, b) >>> r array([ 0.24691868, -0.17456491, 0.19184376, -0.18193532, 0.07175694, 0.27992254, 0.04121859, 0.16249574, -0.03506842, 0.27856259, 0.40866965, -0.07617887, 0.12184064, 0.24762942, -0.05049511, -0.46277399, 0.12193493]) >>> round(min(r), 12), round(max(r), 12) (-0.462773990559, 0.408669653064) >>> round(mean(r), 12) 0.070104521222 >>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> me = (1, 2) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=0) >>> original = rng.multivariate_normal(me, cov, size=12) >>> from sklearn.decomposition import PCA >>> projected2 = PCA(n_components=2).fit_transform(original) >>> projected1 = PCA(n_components=1).fit_transform(original) >>> np.random.seed(0) >>> projectedrnd = permutation(original) >>> s = sortedness(original, original) >>> round(min(s), 12), round(max(s), 12), s (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])) # Measure sortedness between two points in the same space. 
>>> M = original.copy() >>> M[0], M[1] = original[1], original[0] >>> round(sortedness(M, original, 0), 12) 0.547929184934 >>> s = sortedness(original, projected2) >>> round(min(s), 12), round(max(s), 12), s (1.0, 1.0, array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])) >>> s = sortedness(original, projected1) >>> round(min(s), 12), round(max(s), 12) (0.393463224666, 0.944810120534) >>> s = sortedness(original, projectedrnd) >>> round(min(s), 12), round(max(s), 12) (-0.648305479567, 0.397019507592) >>> np.round(sortedness(original, original, f=kendalltau, return_pvalues=True), 12) array([[1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08], [1.0000e+00, 5.0104e-08]]) >>> sortedness(original, projected2, f=kendalltau) array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]) >>> sortedness(original, projected1, f=kendalltau) array([0.56363636, 0.52727273, 0.81818182, 0.96363636, 0.70909091, 0.85454545, 0.74545455, 0.92727273, 0.85454545, 0.89090909, 0.6 , 0.74545455]) >>> sortedness(original, projectedrnd, f=kendalltau) array([ 0.2 , -0.38181818, 0.23636364, -0.09090909, -0.05454545, 0.23636364, -0.09090909, 0.23636364, -0.63636364, -0.01818182, -0.2 , -0.01818182]) >>> wf = partial(weightedtau, weigher=lambda x: 1 / (x**2 + 1)) >>> sortedness(original, original, f=wf, return_pvalues=True) array([[ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan], [ 1., nan]]) >>> sortedness(original, projected2, f=wf) array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]) >>> sortedness(original, projected1, f=wf) array([0.89469168, 0.89269637, 0.92922928, 0.99721669, 0.86529591, 0.97806422, 0.94330979, 0.99357377, 0.87959707, 0.92182767, 0.87256459, 0.87747329]) >>> sortedness(original, projectedrnd, f=wf) array([ 0.23771513, -0.2790059 , 0.3718005 , -0.16623167, 0.06179047, 0.40434396, -0.00130294, 0.46569739, -0.67581876, -0.23852189, -0.39125007, 0.12131153]) >>> np.random.seed(14980) >>> projectedrnd = permutation(original) >>> sortedness(original, projectedrnd) array([ 0.24432153, -0.19634576, -0.00238081, -0.4999116 , -0.01625951, 0.22478766, 0.07176118, -0.48092843, 0.19345964, -0.44895295, -0.42044773, 0.06942218]) >>> sortedness(original, np.flipud(original)) array([-0.28741742, 0.36769361, 0.06926091, 0.02550202, 0.21424544, -0.3244699 , -0.3244699 , 0.21424544, 0.02550202, 0.06926091, 0.36769361, -0.28741742]) >>> original = np.array([[0],[1],[2],[3],[4],[5],[6]]) >>> projected = np.array([[6],[5],[4],[3],[2],[1],[0]]) >>> sortedness(original, projected) array([1., 1., 1., 1., 1., 1., 1.]) >>> projected = np.array([[0],[6],[5],[4],[3],[2],[1]]) >>> sortedness(original, projected) array([-1. , 0.51956213, 0.81695345, 0.98180162, 0.98180162, 0.81695345, 0.51956213]) >>> round(sortedness(original, projected, 1), 12) 0.519562134793 >>> round(sortedness(original, projected, 1, symmetric=False), 12) 0.422638894922 >>> round(sortedness(projected, original, 1, symmetric=False), 12) 0.616485374665 >>> round(sortedness(original, projected, rank=True)[1], 12) 0.519562134793 >>> round(sortedness(original, projected, rank=False)[1], 12) # warning: will consider indexes as ranks! 
0.074070734162 >>> round(sortedness([[1,2,3,3],[1,2,7,3],[3,4,7,8],[5,2,6,3],[3,5,4,8],[2,7,7,5]], [[7,1,2,3],[3,7,7,3],[5,4,5,6],[9,7,6,3],[2,3,5,1],[1,2,6,3]], 1), 12) -1.0 >>> from scipy.stats import weightedtau >>> weightedtau.isweightedtau = False # warning: will deactivate wau's auto-negativation of scores! >>> round(sortedness(original, projected, 1, f=weightedtau, rank=None), 12) 0.275652884819 >>> weightedtau.isweightedtau = True """ isweightedtau = False if hasattr(f, "isweightedtau") and f.isweightedtau: isweightedtau = True if not symmetric: if "rank" in kwargs: # pragma: no cover raise Exception(f"Cannot set `symmetric=False` and provide `rank` at the same time.") kwargs["rank"] = None if parallel_kwargs is None: parallel_kwargs = {} npoints = len(X) if i is None: tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map pmap = mp.ProcessingPool(**parallel_kwargs).imap if parallel and npoints > parallel_n_trigger else map sqdist_X, sqdist_X_ = tmap(lambda M: cdist(M, M, metric='sqeuclidean'), [X, X_]) D = remove_diagonal(sqdist_X) D_ = remove_diagonal(sqdist_X_) scores_X, scores_X_ = (-D, -D_) if isweightedtau else (D, D_) else: pmap = None if not isinstance(X, ndarray): X, X_ = np.array(X), np.array(X_) x, x_ = X[i], X_[i] X = np.delete(X, i, axis=0) X_ = np.delete(X_, i, axis=0) d_ = np.sum((X_ - x_) ** 2, axis=1) d = np.sum((X - x) ** 2, axis=1) scores_X, scores_X_ = (-d, -d_) if isweightedtau else (d, d_) return common(scores_X, scores_X_, i, symmetric, f, isweightedtau, return_pvalues, pmap, kwargs)
def stress(X, X_, i=None, metric=True, parallel=True, parallel_n_trigger=10000, **parallel_kwargs)
-
Kruskal's "Stress Formula 1" normalized before comparing distances. default: Euclidean
Parameters
X
- matrix with an instance by row in a given space (often the original one)
X_
- matrix with an instance by row in another given space (often the projected one)
i
- None: calculate stress for all instances
  int: index of the instance of interest
metric
- Stress formula version: metric or nonmetric
parallel
- Parallelize processing for large X (see parallel_n_trigger). Might use more memory.
parallel_kwargs
- Any extra arguments to be passed to pathos parallelization
parallel_n_trigger
- Threshold to disable parallelization for small n values
Returns
ndarray with one stress value per row, or a single float when i is given
>>> import numpy as np
>>> from functools import partial
>>> from scipy.stats import spearmanr, weightedtau
>>> mean = (1, 12)
>>> cov = eye(2)
>>> rng = np.random.default_rng(seed=0)
>>> original = rng.multivariate_normal(mean, cov, size=12)
>>> original
array([[ 1.12573022, 11.86789514],
       [ 1.64042265, 12.10490012],
       [ 0.46433063, 12.36159505],
       [ 2.30400005, 12.94708096],
       [ 0.29626476, 10.73457853],
       [ 0.37672554, 12.04132598],
       [-1.32503077, 11.78120834],
       [-0.24591095, 11.26773265],
       [ 0.45574102, 11.68369984],
       [ 1.41163054, 13.04251337],
       [ 0.87146534, 13.36646347],
       [ 0.33480533, 12.35151007]])
>>> s = stress(original, original*5)
>>> round(min(s), 12), round(max(s), 12), s
(0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))
>>> from sklearn.decomposition import PCA
>>> projected = PCA(n_components=2).fit_transform(original)
>>> s = stress(original, projected)
>>> round(min(s), 12), round(max(s), 12), s
(0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))
>>> projected = PCA(n_components=1).fit_transform(original)
>>> s = stress(original, projected)
>>> round(min(s), 12), round(max(s), 12), s
(0.073383317103, 0.440609121637, array([0.26748441, 0.31603101, 0.24636389, 0.07338332, 0.34571508,
       0.19548442, 0.1800883 , 0.16544039, 0.2282494 , 0.16405274,
       0.44060912, 0.27058614]))
>>> stress(original, projected)
array([0.26748441, 0.31603101, 0.24636389, 0.07338332, 0.34571508,
       0.19548442, 0.1800883 , 0.16544039, 0.2282494 , 0.16405274,
       0.44060912, 0.27058614])
>>> stress(original, projected, metric=False)
array([0.36599664, 0.39465927, 0.27349092, 0.25096851, 0.31476019,
       0.27612935, 0.3064739 , 0.26141414, 0.2635681 , 0.25811772,
       0.36113025, 0.29740821])
>>> stress(original, projected, 1)
0.316031007598
>>> stress(original, projected, 1, metric=False)
0.39465927169
Expand source code
def stress(X, X_, i=None, metric=True, parallel=True, parallel_n_trigger=10000, **parallel_kwargs): """ Kruskal's "Stress Formula 1" normalized before comparing distances. default: Euclidean Parameters ---------- X matrix with an instance by row in a given space (often the original one) X_ matrix with an instance by row in another given space (often the projected one) i None: calculate stress for all instances `int`: index of the instance of interest metric Stress formula version: metric or nonmetric parallel Parallelize processing when |X|>1000. Might use more memory. Returns ------- parallel_kwargs Any extra argument to be provided to pathos parallelization parallel_n_trigger Threshold to disable parallelization for small n values >>> import numpy as np >>> from functools import partial >>> from scipy.stats import spearmanr, weightedtau >>> mean = (1, 12) >>> cov = eye(2) >>> rng = np.random.default_rng(seed=0) >>> original = rng.multivariate_normal(mean, cov, size=12) >>> original array([[ 1.12573022, 11.86789514], [ 1.64042265, 12.10490012], [ 0.46433063, 12.36159505], [ 2.30400005, 12.94708096], [ 0.29626476, 10.73457853], [ 0.37672554, 12.04132598], [-1.32503077, 11.78120834], [-0.24591095, 11.26773265], [ 0.45574102, 11.68369984], [ 1.41163054, 13.04251337], [ 0.87146534, 13.36646347], [ 0.33480533, 12.35151007]]) >>> s = stress(original, original*5) >>> round(min(s), 12), round(max(s), 12), s (0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])) >>> from sklearn.decomposition import PCA >>> projected = PCA(n_components=2).fit_transform(original) >>> s = stress(original, projected) >>> round(min(s), 12), round(max(s), 12), s (0.0, 0.0, array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])) >>> projected = PCA(n_components=1).fit_transform(original) >>> s = stress(original, projected) >>> round(min(s), 12), round(max(s), 12), s (0.073383317103, 0.440609121637, array([0.26748441, 0.31603101, 0.24636389, 0.07338332, 0.34571508, 0.19548442, 0.1800883 , 0.16544039, 0.2282494 , 0.16405274, 0.44060912, 0.27058614])) >>> stress(original, projected) array([0.26748441, 0.31603101, 0.24636389, 0.07338332, 0.34571508, 0.19548442, 0.1800883 , 0.16544039, 0.2282494 , 0.16405274, 0.44060912, 0.27058614]) >>> stress(original, projected, metric=False) array([0.36599664, 0.39465927, 0.27349092, 0.25096851, 0.31476019, 0.27612935, 0.3064739 , 0.26141414, 0.2635681 , 0.25811772, 0.36113025, 0.29740821]) >>> stress(original, projected, 1) 0.316031007598 >>> stress(original, projected, 1, metric=False) 0.39465927169 """ tmap = mp.ThreadingPool(**parallel_kwargs).imap if parallel and X.size > parallel_n_trigger else map # TODO: parallelize cdist in slices? if metric: thread = (lambda M, m: cdist(M, M, metric=m)) if i is None else (lambda M, m: cdist(M[i:i + 1], M, metric=m)) D, Dsq_ = tmap(thread, [X, X_], ["Euclidean", "sqeuclidean"]) Dsq_ /= Dsq_.max(axis=1, keepdims=True) D_ = sqrt(Dsq_) else: thread = (lambda M, m: rankdata(cdist(M, M, metric=m), method="average", axis=1) - 1) if i is None else (lambda M, m: rankdata(cdist(M[i:i + 1], M, metric=m), method="average", axis=1) - 1) D, Dsq_ = tmap(thread, [X, X_], ["Euclidean", "sqeuclidean"]) Dsq_ /= Dsq_.max(axis=1, keepdims=True) D_ = sqrt(Dsq_) D /= D.max(axis=1, keepdims=True) s = ((D - D_) ** 2).sum(axis=1) / 2 result = np.round(np.sqrt(s / (Dsq_.sum(axis=1) / 2)), 12) return result if i is None else result.flat[0]