I have data like this: [144 144 144 144 143 143 143 93 93 93 93 93 93 93 93 93 93], and I want to make data like this: [0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, ....]
I tried to use the function in the link below, I got this: [3 2 1 0 2 1 0 9 0 6 5 4 3 2 1 7 8]
How can I fix it?
def grp_range(a):
count = np.unique(a,return_counts=1)[1]
idx = count.cumsum()
id_arr = np.ones(idx[-1],dtype=int)
id_arr[0] = 0
id_arr[idx[:-1]] = -count[:-1]+1
out = id_arr.cumsum()[np.argsort(a).argsort()]
return out
How to use numpy to get the cumulative count by unique values in linear time?
To speed up bpfrd version you can use numba. On my machine ~40 times faster as bpfrd version and ~10 times faster to ouroboros1 versions!
from numba import jit
import numpy as np
@jit(nopython=True)
def numba_style(a):
prev = a[0]
idx = 0
c = [idx]
for i in range(1, len(a)):
if a[i] == prev:
idx += 1
else:
prev = a[i]
idx = 0
c.append(idx)
return np.array(c)
| Function | Timing (mean ± std. dev. of 3 runs, 3 loops each) |
|---|---|
| list_style | 818 µs ± 116 µs per loop |
| grp_range | 230 µs ± 81.3 µs per loop |
| cumcount | 170 µs ± 73.9 µs per loop |
| np_style_2 | 165 µs ± 86.2 µs per loop |
| np_style | 118 µs ± 79.3 µs per loop |
| numba_style | 19.2 µs ± 887 ns per loop |
Speed comparison also to ouroboros1 versions. Define functions:
def list_style(a):
prev = a[0]
idx = 0
c = [idx]
for i in range(1, len(a)):
if a[i] == prev:
idx += 1
else:
prev = a[i]
idx = 0
c.append(idx)
return c
def dfill(a):
n = a.size
b = np.concatenate([[0], np.where(a[:-1] != a[1:])[0] + 1, [n]])
return np.arange(n)[b[:-1]].repeat(np.diff(b))
def argunsort(s):
n = s.size
u = np.empty(n, dtype=np.int64)
u[s] = np.arange(n)
return u
def cumcount(a):
n = a.size
s = a.argsort(kind='mergesort')
i = argunsort(s)
b = a[s]
return (np.arange(n) - dfill(b))[i]
def grp_range(a):
count = np.unique(a,return_counts=1)[1]
idx = count.cumsum()
id_arr = np.ones(idx[-1],dtype=int)
id_arr[0] = 0
id_arr[idx[:-1]] = -count[:-1]+1
out = id_arr.cumsum()[np.argsort(a, kind='mergesort').argsort()] # adjustment here
return out
def np_style(a):
unique, index, counts = np.unique(a, return_counts=True, return_index=True)
arg_s = np.argsort(index)
return np.concatenate(list(map(np.arange,counts[arg_s])), axis=0)
def np_style_2(a):
aa = a+np.arange(len(a))
unique, index = np.unique(a, return_index=True)
for uni, ind in zip(unique, index):
aa[a==uni] -= aa[ind]
return aa
And the actual testing with a slightly longer array:
n = 4, 3, 9, 15, 24, 100, 2500, 555
t = 144, 143, 93, 85, 84, 100, 250, 555
a = np.concatenate([[t_i]*n_i for t_i, n_i in zip(t,n)])
%timeit -n 3 -r 3 grp_range(a)
%timeit -n 3 -r 3 np_style(a)
%timeit -n 3 -r 3 np_style_2(a)
%timeit -n 3 -r 3 cumcount(a)
%timeit -n 3 -r 3 list_style(a)
%timeit -n 3 -r 3 numba_style(a)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With