Lev
2 years ago
18 changed files with 436 additions and 1229 deletions
@@ -0,0 +1,78 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "outputs": [],
   "source": [
    "from load import pkl_load"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "Can't get attribute 'simple_cell_model' on <module '__main__'>",
     "output_type": "error",
     "traceback": [
      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[0;31mAttributeError\u001B[0m Traceback (most recent call last)",
      "Input \u001B[0;32mIn [2]\u001B[0m, in \u001B[0;36m<cell line: 1>\u001B[0;34m()\u001B[0m\n\u001B[0;32m----> 1\u001B[0m model_center_fp \u001B[38;5;241m=\u001B[39m \u001B[43mpkl_load\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43msimple_cell_center_fp\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m\n",
      "File \u001B[0;32m~/dev/amgen/load.py:6\u001B[0m, in \u001B[0;36mpkl_load\u001B[0;34m(filename)\u001B[0m\n\u001B[1;32m 4\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mpkl_load\u001B[39m(filename):\n\u001B[1;32m 5\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(filename, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mrb\u001B[39m\u001B[38;5;124m'\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m f:\n\u001B[0;32m----> 6\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mpickle\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mload\u001B[49m\u001B[43m(\u001B[49m\u001B[43mf\u001B[49m\u001B[43m)\u001B[49m\n",
      "\u001B[0;31mAttributeError\u001B[0m: Can't get attribute 'simple_cell_model' on <module '__main__'>"
     ]
    }
   ],
   "source": [
    "model_center_fp = pkl_load('simple_cell_center_fp')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
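The AttributeError in the notebook above is standard pickle behavior: the pickle file only stores a reference such as __main__.simple_cell_model, so loading fails unless that attribute is visible on __main__ again. A minimal sketch of the usual workaround, assuming simple_cell_model is importable from some module (the module name model below is hypothetical):

    # Hypothetical fix sketch: restore the attribute pickle expects on __main__.
    import __main__
    from model import simple_cell_model  # wherever the class is actually defined
    __main__.simple_cell_model = simple_cell_model
    from load import pkl_load
    model_center_fp = pkl_load('simple_cell_center_fp')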
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-
"""
Use cohomology to decode datasets with circular parameters

Persistent homology from arxiv:1908.02518
Homological decoding from DOI:10.1007/s00454-011-9344-x and arxiv:1711.07205
"""
import math
import numpy as np
from scipy.optimize import least_squares
import pandas as pd

from tqdm import trange

import ripser

from persistence import persistence

EPSILON = 0.0000000000001


def shortest_cycle(graph, node2, node1):
    """
    Returns the shortest cycle going through an edge

    Used for computing weights in cohomological_parameterization

    Parameters
    ----------
    graph: ndarray (n_nodes, n_nodes)
        A matrix containing the weights of the edges in the graph
    node1: int
        The index of the first node of the edge
    node2: int
        The index of the second node of the edge

    Returns
    -------
    cycle: list of ints
        A list of indices representing the nodes of the cycle in order
    """
    N = graph.shape[0]
    distances = np.inf * np.ones(N)
    distances[node2] = 0
    prev_nodes = np.zeros(N)
    prev_nodes[:] = np.nan
    prev_nodes[node2] = node1
    # Bellman-Ford relaxation from node2, stopping once node1 is reached
    while math.isnan(prev_nodes[node1]):
        distances_buffer = distances.copy()  # snapshot of the previous sweep
        for j in range(N):
            possible_path_lengths = distances_buffer + graph[:, j]
            if np.min(possible_path_lengths) < distances[j]:
                prev_nodes[j] = np.argmin(possible_path_lengths)
                distances[j] = np.min(possible_path_lengths)
    prev_nodes = prev_nodes.astype(int)
    # Backtrack from node1 to node2, then close the cycle with node1
    cycle = [node1]
    while cycle[0] != node2:
        cycle.insert(0, prev_nodes[cycle[0]])
    cycle.insert(0, node1)
    return cycle


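# Hypothetical usage sketch (not part of the original file): with edge weights
# graph[i, j] for the edge i -> j and np.inf marking absent edges, the call
# below walks 0 -> 1 -> 2 -> 0 and returns [0, 1, 2, 0].
#
#     demo_graph = np.array([[np.inf, 1.0, np.inf],
#                            [np.inf, np.inf, 1.0],
#                            [1.0, np.inf, np.inf]])
#     shortest_cycle(demo_graph, node2=1, node1=0)

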
def cohomological_parameterization(X, cocycle_number=1, coeff=2, weighted=False):
    """
    Compute an angular parameterization on the data set corresponding to a
    given 1-cycle

    Parameters
    ----------
    X: dataframe(n_datapoints, n_features):
        Dataframe containing the data
    cocycle_number: int, optional, default 1
        An integer specifying the 1-cycle used
        The n-th most stable 1-cycle is used, where n = cocycle_number
    coeff: int prime, optional, default 2
        The coefficient basis in which we compute the cohomology
    weighted: bool, optional, default False
        When true use a weighted graph for smoother parameterization
        as proposed in arxiv:1711.07205

    Returns
    -------
    decoding: dataframe(n_datapoints)
        The parameterization of the dataset consisting of a number between
        0 and 1 for each datapoint, to be interpreted modulo 1
    """
    # Get the cocycle
    result = ripser.ripser(X, maxdim=1, coeff=coeff, do_cocycles=True)
    diagrams = result['dgms']
    cocycles = result['cocycles']
    D = result['dperm2all']
    dgm1 = diagrams[1]
    idx = np.argsort(dgm1[:, 1] - dgm1[:, 0])[-cocycle_number]
    cocycle = cocycles[1][idx]
    persistence(X, homdim=1, coeff=coeff, show_largest_homology=0,
                Nsubsamples=0, save_path=None, cycle=idx)
    thresh = dgm1[idx, 1] - EPSILON

    # Compute connectivity
    N = X.shape[0]
    connectivity = np.zeros([N, N])
    for i in range(N):
        for j in range(i):
            if D[i, j] <= thresh:
                connectivity[i, j] = 1
    cocycle_array = np.zeros([N, N])

    # Lift cocycle
    for i in range(cocycle.shape[0]):
        cocycle_array[cocycle[i, 0], cocycle[i, 1]] = (
            ((cocycle[i, 2] + coeff / 2) % coeff) - coeff / 2
        )

    # Weights
    if weighted:
        def real_cocycle(x):
            real_cocycle = (
                connectivity * (cocycle_array + np.subtract.outer(x, x))
            )
            return np.ravel(real_cocycle)

        # Compute graph
        x0 = np.zeros(N)
        res = least_squares(real_cocycle, x0)
        real_cocycle_array = res.fun
        real_cocycle_array = real_cocycle_array.reshape(N, N)
        real_cocycle_array = real_cocycle_array - np.transpose(real_cocycle_array)
        graph = np.array(real_cocycle_array > 0).astype(float)
        graph[graph == 0] = np.inf
        graph = (D + EPSILON) * graph  # Add epsilon to avoid NaNs

        # Compute weights
        cycle_counts = np.zeros([N, N])
        iterator = trange(0, N, position=0, leave=True)
        iterator.set_description("Computing weights for decoding")
        for i in iterator:
            for j in range(N):
                if graph[i, j] != np.inf:
                    cycle = shortest_cycle(graph, j, i)
                    for k in range(len(cycle) - 1):
                        cycle_counts[cycle[k], cycle[k + 1]] += 1

        weights = cycle_counts / (D + EPSILON) ** 2
        weights = np.sqrt(weights)
    else:
        weights = np.outer(np.ones(N), np.ones(N))

    def real_cocycle(x):
        real_cocycle = (
            weights * connectivity * (cocycle_array + np.subtract.outer(x, x))
        )
        return np.ravel(real_cocycle)

    # Smooth cocycle
    print("Decoding...", end=" ")
    x0 = np.zeros(N)
    res = least_squares(real_cocycle, x0)
    decoding = res.x
    decoding = np.mod(decoding, 1)
    print("done")

    decoding = pd.DataFrame(decoding, columns=["decoding"])
    decoding = decoding.set_index(X.index)
    return decoding


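# Hypothetical illustration (not part of the original file): for points on a
# noisy circle, cohomological_parameterization should recover the angle as a
# value in [0, 1), up to orientation and a global shift.
#
#     theta = np.random.uniform(0, 2 * np.pi, 200)
#     circle = pd.DataFrame({0: np.cos(theta), 1: np.sin(theta)})
#     decoding = cohomological_parameterization(circle)  # ~ theta/(2*pi) mod 1

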
def remove_feature(X, decoding, shift=0, cut_amplitude=1.0):
    """
    Removes a decoded feature from a dataset by making a cut at a fixed value
    of the decoding

    Parameters
    ----------
    X: dataframe(n_datapoints, n_features):
        Dataframe containing the data
    decoding : dataframe(n_datapoints)
        The decoded feature, assumed to be angular with periodicity 1
    shift : float between 0 and 1, optional, default 0
        The location of the cut
    cut_amplitude : float, optional, default 1
        Amplitude of the cut
    """
    cuts = np.zeros(X.shape)
    decoding = decoding.to_numpy()[:, 0]
    for i in range(X.shape[1]):
        effective_amplitude = cut_amplitude * (np.max(X[i]) - np.min(X[i]))
        cuts[:, i] = effective_amplitude * ((decoding - shift) % 1)
    reduced_data = X + cuts
    return reduced_data
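

# Minimal end-to-end sketch, not part of the original commit; it assumes ripser
# and persim are installed and only runs when this module is executed directly.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    angles = rng.uniform(0, 2 * np.pi, 150)
    # Noisy circle with integer column labels, since remove_feature indexes X[i]
    data = pd.DataFrame({
        0: np.cos(angles) + 0.05 * rng.standard_normal(150),
        1: np.sin(angles) + 0.05 * rng.standard_normal(150),
    })
    circular_coord = cohomological_parameterization(data, weighted=False)
    unrolled = remove_feature(data, circular_coord)
    print(unrolled.head())
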
@@ -0,0 +1,169 @@
# -*- coding: utf-8 -*-
"""
Tools to compute persistence diagrams

Persistent homology from the ripser and gudhi libraries
Confidence sets from arxiv:1303.7117
"""
import numpy as np
from scipy.spatial.distance import directed_hausdorff

import matplotlib.pyplot as plt

from tqdm import trange

import ripser
from persim import plot_diagrams
import gudhi

from decorators import multi_input


def hausdorff(data1, data2, homdim, coeff):
    """Directed Hausdorff distance between two persistence diagrams"""
    dgm1 = (ripser.ripser(data1, maxdim=homdim, coeff=coeff))['dgms']
    dgm2 = (ripser.ripser(data2, maxdim=homdim, coeff=coeff))['dgms']
    distance = directed_hausdorff(dgm1[homdim], dgm2[homdim])[0]
    return distance


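# Hypothetical usage sketch (not part of the original file): compare the
# 1-dimensional diagrams of two random point clouds.
#
#     cloud1 = np.random.rand(100, 2)
#     cloud2 = np.random.rand(100, 2)
#     hausdorff(cloud1, cloud2, homdim=1, coeff=2)

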
@multi_input
def confidence(X, alpha=0.05, Nsubsamples=20, homdim=1, coeff=2):
    """
    Compute the confidence interval of the persistence diagram of a dataset

    Computation done by subsampling as in arxiv:1303.7117

    Parameters
    ----------
    X: dataframe(n_datapoints, n_features):
        Dataframe containing the data
    alpha : float between 0 and 1, optional, default 0.05
        1-alpha is the confidence
    Nsubsamples : int, optional, default 20
        The number of subsamples
    homdim : int, optional, default 1
        The dimension of the homology
    coeff : int prime, optional, default 2
        The coefficient basis
    """
    N = X.shape[0]
    distances = np.zeros(Nsubsamples)
    iterator = trange(0, Nsubsamples, position=0, leave=True)
    iterator.set_description("Computing confidence interval")
    for i in iterator:
        subsample = X.iloc[np.random.choice(N, N, replace=True)]
        distances[i] = hausdorff(X, subsample, homdim, coeff)
    distances.sort()
    confidence = np.sqrt(2) * 2 * distances[int(alpha * Nsubsamples)]
    return confidence


@multi_input
def persistence(X, homdim=1, coeff=2, threshold=float('inf'),
                show_largest_homology=0, distance_matrix=False, Nsubsamples=0,
                alpha=0.05, cycle=None, save_path=None):
    """
    Plot the persistence diagram of a dataset using ripser

    Optionally prints the largest homology components

    Parameters
    ----------
    X: dataframe(n_datapoints, n_features):
        Dataframe containing the data
    homdim : int, optional, default 1
        The dimension of the homology
    coeff : int prime, optional, default 2
        The coefficient basis
    threshold : float, optional, default infinity
        The maximum distance in the filtration
    show_largest_homology: int, optional, default 0
        Print this many of the largest homology components
    distance_matrix : bool, optional, default False
        When true X will be interpreted as a distance matrix
    Nsubsamples : int, optional, default 0
        The number of subsamples used in computing the confidence interval
        Does not compute the confidence interval when this is 0
    alpha : float between 0 and 1, optional, default 0.05
        1-alpha is the confidence
    cycle : int, optional, default None
        If given highlight the homology component in the plot corresponding to
        this cycle id
    save_path : str, optional, default None
        When given save the plot here
    """
    result = ripser.ripser(X, maxdim=homdim, coeff=coeff, do_cocycles=True,
                           distance_matrix=distance_matrix, thresh=threshold)
    diagrams = result['dgms']
    plot_diagrams(diagrams, show=False)
    if Nsubsamples > 0:
        conf = confidence(X, alpha, Nsubsamples, homdim, 2)
        line_length = 10000
        plt.plot([0, line_length], [conf, line_length + conf], color='green',
                 linestyle='dashed', linewidth=2)
    if cycle is not None:
        dgm1 = diagrams[1]
        plt.scatter(dgm1[cycle, 0], dgm1[cycle, 1], 20, 'k', 'x')
    if save_path is not None:
        path = save_path + 'Z' + str(coeff)
        if Nsubsamples > 0:
            path += '_confidence' + str(1 - alpha)
        path += '.png'
        plt.savefig(path)
    plt.show()

    if show_largest_homology != 0:
        dgm = diagrams[homdim]
        largest_indices = np.argsort(dgm[:, 0] - dgm[:, 1])
        largest_components = dgm[largest_indices[:show_largest_homology]]
        print(f"Largest {homdim}-homology components:")
        print(largest_components)
    return


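# Hypothetical usage sketch (not part of the original file): diagram of a
# noisy circle with a bootstrap confidence band.
#
#     import pandas as pd
#     theta = np.random.uniform(0, 2 * np.pi, 300)
#     circle = pd.DataFrame({0: np.cos(theta), 1: np.sin(theta)})
#     persistence(circle, homdim=1, coeff=2, Nsubsamples=20)

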
@multi_input
def persistence_witness(X, number_of_landmarks=100, max_alpha_square=0.0,
                        homdim=1):
    """
    Plot the persistence diagram of a dataset using gudhi

    Uses a witness complex, allowing it to scale to larger datasets

    Parameters
    ----------
    X: dataframe(n_datapoints, n_features):
        Dataframe containing the data
    number_of_landmarks : int, optional, default 100
        The number of landmarks in the witness complex
    max_alpha_square : double, optional, default 0.0
        Maximal squared relaxation parameter
    homdim : int, optional, default 1
        The dimension of the homology
    """
    print("Sampling landmarks...", end=" ")

    witnesses = X.to_numpy()
    landmarks = gudhi.pick_n_random_points(
        points=witnesses, nb_points=number_of_landmarks
    )
    print("done")
    message = (
        "EuclideanStrongWitnessComplex with max_edge_length="
        + repr(max_alpha_square)
        + " - Number of landmarks="
        + repr(number_of_landmarks)
    )
    print(message)
    witness_complex = gudhi.EuclideanStrongWitnessComplex(
        witnesses=witnesses, landmarks=landmarks
    )
    simplex_tree = witness_complex.create_simplex_tree(
        max_alpha_square=max_alpha_square,
        limit_dimension=homdim
    )
    message = "Number of simplices=" + repr(simplex_tree.num_simplices())
    print(message)
    diag = simplex_tree.persistence()
    print("betti_numbers()=")
    print(simplex_tree.betti_numbers())
    gudhi.plot_persistence_diagram(diag, band=0.0)
    plt.show()
    return
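

# Minimal sketch, not part of the original commit; it assumes ripser, persim,
# gudhi, and pandas are installed and only runs when executed directly.
if __name__ == "__main__":
    import pandas as pd

    rng = np.random.default_rng(0)
    theta = rng.uniform(0, 2 * np.pi, 300)
    circle = pd.DataFrame({0: np.cos(theta), 1: np.sin(theta)})
    # Rips-based diagram with a bootstrap confidence band
    persistence(circle, homdim=1, coeff=2, Nsubsamples=20)
    # Witness-complex diagram on the same data
    persistence_witness(circle, number_of_landmarks=50, max_alpha_square=2.0,
                        homdim=1)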