|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
|
|
Tools to compute persistence diagrams
|
|
|
|
|
|
|
|
Persistent homology from the ripser and gudhi libraries
|
|
|
|
Confidence sets from arxiv:1303.7117
|
|
|
|
"""
|
|
|
|
import numpy as np
|
|
|
|
from scipy.spatial.distance import directed_hausdorff
|
|
|
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
from tqdm import trange
|
|
|
|
|
|
|
|
import ripser
|
|
|
|
from persim import plot_diagrams
|
|
|
|
import gudhi
|
|
|
|
|
|
|
|
from .decorators import multi_input
|
|
|
|
|
|
|
|
|
|
|
|
def hausdorff(data1, data2, homdim, coeff):
    """
    Hausdorff metric between the persistence diagrams of two datasets

    Both diagrams are computed with ripser and only the diagram of
    dimension ``homdim`` is compared.

    Parameters
    ----------
    data1, data2 : array-like
        The two datasets, passed unchanged to ``ripser.ripser``
    homdim : int
        The dimension of the homology, also used as ``maxdim`` for ripser
    coeff : int prime
        The coefficient basis

    Returns
    -------
    distance : float
        The symmetric Hausdorff distance between the two diagrams
    """
    dgm1 = ripser.ripser(data1, maxdim=homdim, coeff=coeff)['dgms'][homdim]
    dgm2 = ripser.ripser(data2, maxdim=homdim, coeff=coeff)['dgms'][homdim]
    # directed_hausdorff is one-sided (d(A, B) != d(B, A) in general); the
    # Hausdorff metric is the larger of the two directed distances.
    forward = directed_hausdorff(dgm1, dgm2)[0]
    backward = directed_hausdorff(dgm2, dgm1)[0]
    return max(forward, backward)
|
|
|
|
|
|
|
|
@multi_input
def confidence(X, alpha=0.05, Nsubsamples=20, homdim=1, coeff=2):
    """
    Compute the confidence interval of the persistence diagram of a dataset

    Computation done by subsampling as in arxiv:1303.7117

    Parameters
    ----------
    X: dataframe(n_datapoints, n_features):
        Dataframe containing the data
    alpha : float between 0 and 1, optional, default 0.05
        1-alpha is the confidence
    Nsubsamples : int, optional, default 20
        The number of subsamples
    homdim : int, optional, default 1
        The dimension of the homology
    coeff : int prime, optional, default 2
        The coefficient basis

    Returns
    -------
    confidence : float
        ``2 * sqrt(2)`` times the empirical (1-alpha) quantile of the
        Hausdorff distances between the diagram of X and the diagrams of
        the resampled datasets

    Raises
    ------
    ValueError
        If alpha is outside [0, 1] or Nsubsamples is smaller than 1
    """
    if not 0 <= alpha <= 1:
        raise ValueError("alpha must lie between 0 and 1")
    if Nsubsamples < 1:
        raise ValueError("Nsubsamples must be at least 1")

    N = X.shape[0]
    distances = np.zeros(Nsubsamples)
    iterator = trange(0, Nsubsamples, position=0, leave=True)
    iterator.set_description("Computing confidence interval")
    for i in iterator:
        # Draw N rows with replacement and compare against the full data.
        subsample = X.iloc[np.random.choice(N, N, replace=True)]
        distances[i] = hausdorff(X, subsample, homdim, coeff)
    distances.sort()

    # A 1-alpha confidence level needs the upper tail: the smallest observed
    # distance that at most a fraction alpha of the draws exceed. The small
    # tolerance keeps float noise in (1 - alpha) * Nsubsamples from bumping
    # the rank up by one.
    rank = int(np.ceil((1 - alpha) * Nsubsamples - 1e-9))
    index = min(max(rank, 1), Nsubsamples) - 1
    return np.sqrt(2) * 2 * distances[index]
|
|
|
|
|
|
|
|
@multi_input
def persistence(X, homdim=1, coeff=2, threshold=float('inf'),
                show_largest_homology=0, distance_matrix=False, Nsubsamples=0,
                alpha=0.05, cycle=None, save_path=None):
    """
    Plot the persistence diagram of a dataset using ripser

    Optionally draws a confidence band, highlights one homology component,
    saves the figure and prints the largest homology components

    Parameters
    ----------
    X: dataframe(n_datapoints, n_features):
        Dataframe containing the data
    homdim : int, optional, default 1
        The dimension of the homology
    coeff : int prime, optional, default 2
        The coefficient basis
    threshold : float, optional, default infinity
        The maximum distance in the filtration
    show_largest_homology: int, optional, default 0
        Print this many of the largest homology components
    distance_matrix : bool, optional, default False
        When true X will be interpreted as a distance matrix
    Nsubsamples : int, optional, default 0
        The number of subsamples used in computing the confidence interval
        Does not compute the confidence interval when this is 0
    alpha : float between 0 and 1, optional, default 0.05
        1-alpha is the confidence
    cycle : int, optional, default None
        If given highlight the homology component in the plot corresponding to
        this cycle id (an index into the diagram of dimension homdim)
    save_path : str, optional, default None
        When given save the plot here; 'Z<coeff>', an optional
        '_confidence<1-alpha>' and '.png' are appended to this prefix

    Notes
    -----
    The confidence band is computed by ``confidence`` on X itself, without
    the ``distance_matrix`` flag, so it presumably only makes sense when X
    holds point data -- TODO confirm.
    """
    result = ripser.ripser(X, maxdim=homdim, coeff=coeff, do_cocycles=True,
                           distance_matrix=distance_matrix, thresh=threshold)
    diagrams = result['dgms']
    plot_diagrams(diagrams, show=False)

    with_confidence = Nsubsamples > 0
    if with_confidence:
        # Use the same coefficient field as the plotted diagram.
        conf = confidence(X, alpha, Nsubsamples, homdim, coeff)
        # Draw the band edge parallel to the diagonal, shifted up by conf.
        line_length = 10000
        plt.plot([0, line_length], [conf, line_length + conf], color='green',
                 linestyle='dashed', linewidth=2)

    if cycle is not None:
        # Index the diagram of the requested dimension, matching the
        # diagram used for the largest-component listing below.
        highlighted = diagrams[homdim]
        plt.scatter(highlighted[cycle, 0], highlighted[cycle, 1],
                    20, 'k', 'x')

    if save_path is not None:
        path = save_path + 'Z' + str(coeff)
        if with_confidence:
            path += '_confidence' + str(1-alpha)
        path += '.png'
        plt.savefig(path)
    plt.show()

    if show_largest_homology != 0:
        dgm = diagrams[homdim]
        # birth - death is minus the persistence, so an ascending argsort
        # lists the most persistent components first.
        largest_indices = np.argsort(dgm[:, 0] - dgm[:, 1])
        largest_components = dgm[largest_indices[:show_largest_homology]]
        print(f"Largest {homdim}-homology components:")
        print(largest_components)
|
|
|
|
|
|
|
|
@multi_input
def persistence_witness(X, number_of_landmarks=100, max_alpha_square=0.0,
                        homdim=1):
    """
    Plot the persistence diagram of a dataset using gudhi

    Uses a witness complex allowing it to be used on larger datasets

    Parameters
    ----------
    X: dataframe(n_datapoints, n_features):
        Dataframe containing the data
    number_of_landmarks : int, optional, default 100
        The number of landmarks in the witness complex
    max_alpha_square : double, optional, default 0.0
        Maximal squared relaxation parameter
    homdim : int, optional, default 1
        The dimension of the homology
    """
    print("Sampling landmarks...", end=" ")

    witnesses = X.to_numpy()
    landmarks = gudhi.pick_n_random_points(
        points=witnesses, nb_points=number_of_landmarks
    )
    print("done")

    # The reported value is the relaxation parameter handed to
    # create_simplex_tree, so label it with that parameter's name.
    print(
        "EuclideanStrongWitnessComplex with max_alpha_square="
        f"{max_alpha_square!r} - Number of landmarks={number_of_landmarks!r}"
    )
    witness_complex = gudhi.EuclideanStrongWitnessComplex(
        witnesses=witnesses, landmarks=landmarks
    )
    # Homology in dimension homdim needs simplices of dimension homdim + 1,
    # and persistence() ignores the top dimension of the complex by default,
    # so the complex is built one dimension higher than the requested
    # homology.
    simplex_tree = witness_complex.create_simplex_tree(
        max_alpha_square=max_alpha_square,
        limit_dimension=homdim + 1
    )
    print("Number of simplices=" + repr(simplex_tree.num_simplices()))

    diag = simplex_tree.persistence()
    print("betti_numbers()=")
    print(simplex_tree.betti_numbers())

    gudhi.plot_persistence_diagram(diag, band=0.0)
    plt.show()
|