For the sake of anyone who stumbles upon this in the future, I apologize – I had conceived of the problem and solution incorrectly. A proper implementation of finding the coordinates of pairwise best hits in a 2D matrix is:
def pairwise_best_hit(matrix):
boolean_max_matrix = (matrix.max(axis=1,keepdims=1) == matrix) & (matrix.max(axis=0,keepdims=1) == matrix)
return np.transpose(boolean_max_matrix.nonzero())
I'm attempting to find the "best hits" in a similarity matrix (i.e. an mxn matrix where index along each axis corresponds to the ith position in vector m and the jth position in vector n). The simplest way to explain this is finding the indices of the highest values in a matrix iteratively, excluding previously selected rows and columns. This results in min(m,n) indices chosen.
Here's my minimum reproducible example of my current implementation, using pandas:
import numpy as np
import pandas as pd
def pairwise_best_hit(sim):
xdim,ydim = np.meshgrid(np.arange(sim.shape[1]),np.arange(sim.shape[0]))
table = np.vstack((sim.ravel(),xdim.ravel(),ydim.ravel())).T
df = pd.DataFrame(table).rename(columns={0:'sim',1:'index2',2:'index1'}).sort_values('sim',ascending=False)
seq1_hits = []
seq2_hits = []
while len(df):
index1 = df.iloc[0]['index1']
index2 = df.iloc[0]['index2']
seq1_hits.append(index1)
seq2_hits.append(index2)
df = df[(df['index1']!=index1)&(df['index2']!=index2)]
return [seq1_hits,seq2_hits]
and for a matrix
sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])
pairwise_best_hit(sim)
returns
[[1, 2, 0], [1, 3, 2]]
Figured an edit would be the best way to respond to all comments simultaneously. Re: typical data size – m and n are anywhere from 250 to 1000 and the values in the matrix are floats.
Now, for the results on a matrix of my actual data, which is about 300x350. Slightly tweaking the answers from LastDuckStanding, Julien, and Axel Donath, we have:
def original(sim):
xdim,ydim = np.meshgrid(np.arange(sim.shape[1]),np.arange(sim.shape[0]))
table = np.vstack((sim.ravel(),xdim.ravel(),ydim.ravel())).T
df = pd.DataFrame(table).rename(columns={0:'sim',1:'index2',2:'index1'}).sort_values('sim',ascending=False)
output_list = []
while len(df):
index1 = df.iloc[0]['index1']
index2 = df.iloc[0]['index2']
score = df.iloc[0]['sim']
output_list.append((int(index1),int(index2),score))
df = df[(df['index1']!=index1)&(df['index2']!=index2)]
return output_list
def lastduckstanding(input_matrix):
mat = input_matrix.copy()
idxs = np.argsort(mat, None)
output_list = []
hit_is_set = set()
hit_js_set = set()
num_entries = min(mat.shape[0], mat.shape[1])
for idx in reversed(idxs):
i, j = divmod(idx, mat.shape[1])
if i in hit_is_set or j in hit_js_set:
continue
hit_is_set.add(i)
hit_js_set.add(j)
output_list.append((i,j,mat[i,j]))
if len(output_list) == num_entries:
break
return output_list
def julien(matrix: np.ndarray):
out = []
copy = matrix.copy()
for _ in range(min(copy.shape)):
ij = np.unravel_index(copy.argmax(), copy.shape)
indeces_plus_score = (ij[0],ij[1],copy[ij[0],ij[1]])
out.append(indeces_plus_score)
copy[ij[0], :] = -np.inf
copy[:, ij[1]] = -np.inf
return out
def axeldonath(arr, indices):
"""Find the maximum value in a 2D array recursively."""
if not np.any(np.isfinite(arr)):
return indices
idx_max = np.argmax(arr)
idxs = np.unravel_index(idx_max, arr.shape)
indeces_plus_score = (idxs[0],idxs[1],arr[idxs[0],idxs[1]])
arr[idxs[0], :] = -np.inf
arr[:, idxs[1]] = -np.inf
indices.append(indeces_plus_score)
return axeldonath(arr, indices)
def axeldonath_wrapper(similarity):
testsim = similarity.copy()
return axeldonath(testsim,[])
def timing_test(S,functionlist):
for function in functionlist:
testsim = S['matrix']
print(function)
%timeit function(testsim)
With the following timing results:
<function original at 0x7f405006d1c0>
287 ms ± 1.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
<function lastduckstanding at 0x7f405006d120>
30 ms ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
<function julien at 0x7f405006d260>
7.9 ms ± 30.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
<function axeldonath_wrapper at 0x7f405006d3a0>
16.9 ms ± 42.9 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Here's a candidate using numpy only, about 100x faster on your small example:
import numpy as np
import pandas as pd
import timeit
sim = np.array([[1,5,6,2],[7,10,3,4],[1,5,3,7]])
def mine(sim):
out = []
copy = sim.copy()
MIN = np.iinfo(copy.dtype).min # or -np.inf if using floats...
for _ in range(min(copy.shape)):
ij = np.unravel_index(copy.argmax(), copy.shape)
out.append(ij)
copy[ij[0]] = MIN
copy[:,ij[1]] = MIN
return np.transpose(out)
def yours(sim):
xdim,ydim = np.meshgrid(np.arange(sim.shape[1]),np.arange(sim.shape[0]))
table = np.vstack((sim.ravel(),xdim.ravel(),ydim.ravel())).T
df = pd.DataFrame(table).rename(columns={0:'sim',1:'index2',2:'index1'}).sort_values('sim',ascending=False)
seq1_hits = []
seq2_hits = []
while len(df):
index1 = df.iloc[0]['index1']
index2 = df.iloc[0]['index2']
seq1_hits.append(index1)
seq2_hits.append(index2)
df = df[(df['index1']!=index1)&(df['index2']!=index2)]
return [seq1_hits,seq2_hits]
assert np.all(mine(sim) == yours(sim))
%timeit yours(sim)
# 1.05 ms ± 6.78 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%timeit mine(sim)
# 8.18 µs ± 19.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
Comparison slowly degrades for larger arrays, but stays ahead (still 10x faster on 1000x1000 arrays):
sim = np.arange(10000)
np.random.shuffle(sim)
sim.shape = 100,100
%timeit yours(sim)
# 26.2 ms ± 64.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit mine(sim)
# 397 µs ± 534 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
sim = np.arange(1000000)
np.random.shuffle(sim)
sim.shape = 1000,1000
%timeit yours(sim)
# 2.45 s ± 18.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit mine(sim)
#203 ms ± 2.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Your solution looks like it is O(min(m, n) * mn), because every time you are going through the whole DataFrame and removing those entries that are in the same row and column and recreating a new DataFrame. it looks like Julien's solution has this complexity (except that it probably uses parallel processing and SIMD well) as well because it still goes through the whole array for the argmax.
To achieve O(mn * log(mn)) just don't modify your DataFrame, but iterate through the entries one by one from the highest value and discard it when it is already in the same row or column as the entries you extracted. You can use two sets (or array of flags) to store the previously chosen rows and columns respectively. I name this method sort_w_set.
I have also another method of O(min(m, n) * mn) worst-case time complexity but unknown average case complexity. It keeps track of the maximum of each unpicked row and where it is and only updates it when the column is picked. I've named this lazy_update. Unfortunately this solution is slightly slower according to my tests.
import itertools
import timeit
import numpy as np
import pandas as pd
def original(sim):
xdim, ydim = np.meshgrid(np.arange(sim.shape[1]), np.arange(sim.shape[0]))
table = np.vstack((sim.ravel(), xdim.ravel(), ydim.ravel())).T
df = pd.DataFrame(table).rename(columns={
0: 'sim',
1: 'index2',
2: 'index1'
}).sort_values('sim', ascending=False)
seq1_hits = []
seq2_hits = []
while len(df):
index1 = df.iloc[0]['index1']
index2 = df.iloc[0]['index2']
seq1_hits.append(index1)
seq2_hits.append(index2)
df = df[(df['index1'] != index1) & (df['index2'] != index2)]
return np.asanyarray([seq1_hits, seq2_hits])
def sort_w_set(mat):
idxs = np.argsort(mat, None)
hit_is = []
hit_js = []
hit_is_set = set()
hit_js_set = set()
num_entries = min(mat.shape[0], mat.shape[1])
for idx in reversed(idxs):
i, j = idx // mat.shape[1], idx % mat.shape[1]
if i in hit_is_set or j in hit_js_set:
continue
hit_is.append(i)
hit_is_set.add(i)
hit_js.append(j)
hit_js_set.add(j)
if len(hit_is) == num_entries:
break
return np.asanyarray([hit_is, hit_js])
def lazy_update(mat, neg_inf=float("-inf")):
m = mat.shape[0]
n = mat.shape[1]
max_js = np.argmax(mat, axis=1)
maxes = np.take_along_axis(mat, max_js[:, np.newaxis], axis=1)
is_j_hit = np.full(n, False)
hit_is = []
hit_js = []
for k in range(min(m, n)):
max_i = np.argmax(maxes)
max_j = max_js[max_i]
# print(max_i, max_j)
hit_is.append(max_i)
hit_js.append(max_j)
max_js[max_i] = -1
maxes[max_i] = neg_inf
is_j_hit[max_j] = True
for i in range(m):
if max_js[i] != -1:
if max_js[i] == max_j:
max_js[i] = new_max_j = np.argmax(
np.where(is_j_hit, neg_inf, mat[i]))
maxes[i] = mat[i, new_max_j]
return np.asanyarray([hit_is, hit_js])
def julien(sim: np.ndarray):
out = []
copy = sim.copy()
for _ in range(min(copy.shape)):
ij = np.unravel_index(copy.argmax(), copy.shape)
# print(ij)
out.append(ij)
copy[ij[0]] = -1.0
copy[:, ij[1]] = -1.0
return np.transpose(out)
def main():
mat = np.array([[1, 5, 6, 2], [7, 10, 3, 4], [1, 5, 3, 7]], dtype=float)
np_rand = np.random.default_rng()
max_time = 10
max_iters = 100
# for _ in itertools.count():
# try:
if True:
if True:
mat = np_rand.random([100, 200])
res = None
for f in [original, sort_w_set, lazy_update, julien]:
res2 = f(mat)
if res is None:
res = res2
else:
assert np.all(res2 == res)
# except:
# print([list(mat[i]) for i in range(mat.shape[0])])
# raise
for m in [100, 200, 300, 500, 1000]:
n = m + 50
print(f'{m=} {n=}')
mat = None
def setup_func():
nonlocal mat
mat = np.round(np_rand.random([m, n]), 3)
for f in [original, sort_w_set, lazy_update, julien]:
timing = timeit.Timer(lambda: f(mat), setup_func).repeat(10, 1)
print(f.__name__, np.median(timing))
if __name__ == "__main__":
main()
To achieve O(min(m, n) * log(mn))) you probably need to do something much more complicated (or maybe it's impossible).
Results:
m=100 n=150
original 0.05974135349993048
sort_w_set 0.002159014999961073
lazy_update 0.002811349000012342
julien 0.001682339000126376
m=200 n=250
original 0.14275815500013778
sort_w_set 0.005666553500077498
lazy_update 0.007546098000148049
julien 0.007552466000106506
m=300 n=350
original 0.223872012999891
sort_w_set 0.009630193000020881
lazy_update 0.013982388499925946
julien 0.016382625000005646
m=500 n=550
original 0.6410600660001364
sort_w_set 0.02666090999991866
lazy_update 0.0368077354999059
julien 0.05437794899989967
m=1000 n=1050
original 4.4048256545002005
sort_w_set 0.13431666199971914
lazy_update 0.17222096199975567
julien 0.4341630219998933
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With