In an example dataframe with NaNs in the third column, let us compare the performance of fillna()
with boolean masking + setting the value:
import pandas as pd
import numpy as np
np.random.seed(100)
nrows = 10000000
nnan = 25000
df = pd.DataFrame(np.random.uniform(0,250000,size=(nrows,3)))
ind_row = np.random.randint(0,nrows,nnan)
df.loc[ind_row, 2] = np.nan
df1 = df.copy()
%timeit df1[2] = df1[2].fillna(999)
df1 = df.copy()
%timeit df1[2].fillna(999)
df1 = df.copy()
%timeit df1.loc[df1[2].isna(),2] = 999
I got the following example timings:
35.1 ms ± 369 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
36.4 ms ± 331 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
6.23 ms ± 65.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
and
35.9 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
37.3 ms ± 438 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
6.41 ms ± 38.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Note that these timings do to not seem to depend much on the ratio of NaNs to non-NaN values.
Why does manual boolean masking appear to be faster than fillna()
?
Update 1: The below plot shows the timings for different dataframe sizes generated with the code below and can be compared to @mozway's plot. However, for me there is a clear separation, with boolean masking consistently below fillna(inplace=True)
below fillna()
for N >= 10^5:
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import timeit
np.random.seed(100)
nrepeats = 5
nruns = 10
def time_fillna(nrows, output):
nnan = int(nrows/4)
df = pd.DataFrame(np.random.uniform(0,250000,size=(nrows,3)))
ind_row = np.random.randint(0,nrows,nnan)
df.loc[ind_row, 2] = np.nan
df1 = df.copy()
output['fillna_assign'].append(np.min(timeit.repeat('df1[2] = df1[2].fillna(999)', globals=locals(), repeat=nrepeats, number=nruns))/nruns)
df1 = df.copy()
output['fillna_only'].append(np.min(timeit.repeat('df1[2].fillna(999)', globals=locals(), repeat=nrepeats, number=nruns))/nruns)
df1 = df.copy()
output['fillna_inplace'].append(np.min(timeit.repeat('df1[2].fillna(999,inplace=True)', globals=locals(), repeat=nrepeats, number=nruns))/nruns)
df1 = df.copy()
output['bool_mask'].append(np.min(timeit.repeat('df1.loc[df1[2].isna(),2] = 999', globals=locals(), repeat=nrepeats, number=nruns))/nruns)
output = {method:[] for method in ['fillna_assign', 'fillna_only', 'fillna_inplace', 'bool_mask']}
nrows_all = np.logspace(3,8,11).astype(int)
for nrows in nrows_all:
print(f'Timing for nrows = {nrows}')
time_fillna(nrows, output)
plt.ion()
plt.plot(nrows_all, output['fillna_assign'], label='fillna_assign')
plt.plot(nrows_all, output['fillna_only'], label='fillna_only')
plt.plot(nrows_all, output['fillna_inplace'], label='fillna_inplace')
plt.plot(nrows_all, output['bool_mask'], label='bool_mask')
plt.loglog()
plt.xlabel('Number of rows')
plt.ylabel('Runtime [s]')
plt.legend()
plt.show()
plt.savefig('timing_fillna.png')
Update 2: Making sure that NaNs are present in repeated timing runs (see comments by @mozway), I get a different result: fillna(inplace=True)
and boolean masking comparable, fillna()
only without assignment as the fastest:
Code:
def time_fillna_revised(nrows, output):
nnan = int(nrows/4)
df = pd.DataFrame(np.random.uniform(0,250000,size=(nrows,3)))
ind_row = np.random.randint(0,nrows,nnan)
df.loc[ind_row, 2] = np.nan
df_insert = df.copy()
t_fillna_assign = []
t_fillna_inplace = []
t_fillna_bool = []
for i in range(nrepeats):
time_temp_assign = 0
time_temp_inplace = 0
time_temp_bool = 0
for j in range(nruns):
df1 = df.copy()
time_temp_assign += timeit.repeat('df_insert[2] = df1[2].fillna(999)', globals=locals(), repeat=1, number=1)[0]
df1 = df.copy()
time_temp_inplace += timeit.repeat('df1[2].fillna(999,inplace=True)', globals=locals(), repeat=1, number=1)[0]
df1 = df.copy()
time_temp_bool += timeit.repeat('df1.loc[df1[2].isna(),2] = 999', globals=locals(), repeat=1, number=1)[0]
t_fillna_assign.append(time_temp_assign)
t_fillna_inplace.append(time_temp_inplace)
t_fillna_bool.append(time_temp_bool)
output['fillna_assign'].append(np.min(t_fillna_assign)/nruns)
output['fillna_inplace'].append(np.min(t_fillna_inplace)/nruns)
output['bool_mask'].append(np.min(t_fillna_bool)/nruns)
df1 = df.copy()
output['fillna_only'].append(np.min(timeit.repeat('df1[2].fillna(999)', globals=locals(), repeat=nrepeats, number=nruns))/nruns)
Update 3: For NaN-ratio 1/4, (see above plot), the results may be more complicated, see next plot:
For NaN-ratio 1/400 (comment by @ken), the timings are:
Running @mozway's code:
"Why does manual boolean masking appear to be faster than fillna()?"
Could boolean indexing be faster in these conditions? Potentially yes.
First of all, fillna
can do much more than simply filling NaNs with a value, it can ffill
/bfill
, but also restrict filling to a number of consecutive NaNs.
On top of that, it also outputs a new Series.
All in all, this is potentially much more work than simply identifying the positions of the NaNs and replacing them in place.
It seems however that the difference is not always that clear. I tested a comparison for various sizes and 50% probability to get NaNs and got the following:
For a 1/400 probability to get NaNs, this gives:
In this case, boolean indexing is significantly faster than fillna
, but fillna
with inplace=True
is even faster.
I used a random sampling of 1 or NaN, copied to 3 different columns, then ran:
# fillna_only
df['col'].fillna(999)
# fillna_assign
df['col'] = df['col'].fillna(999)
# fillna_inplace
df['col2'].fillna(999, inplace=True)
# boolean_indexing
df.loc[df['col3'].isna(), 'col3'] = 999
The above graphs was computed with 10 replicates. It seems that the features such as the slight drop in timing for fillna_only
around 10⁶ are true. This would indicate internal optimizations or size-effects that could occur depending on the size of the input.
Code:
import numpy as np
import pandas as pd
import perfplot
import seaborn as sns
def init(N):
df = pd.DataFrame({'col': np.random.choice([1, np.nan], size=N)})
# or for a custom probability
# df = pd.DataFrame({'col': np.random.choice([1, np.nan], p=[399/400, 1/400], size=N)})
df['col2'] = df['col'].copy()
df['col3'] = df['col'].copy()
return df
def fillna_only(df):
return df['col'].fillna(999)
def fillna_inplace(df):
df['col2'].fillna(999, inplace=True)
def fillna_assign(df):
df['col'] = df['col'].fillna(999)
return df
def boolean_indexing(df):
df.loc[df['col3'].isna(), 'col3'] = 999
return df
out = []
for _ in range(10):
out.append(perfplot.bench(
setup=init,
kernels=[fillna_only, fillna_assign, fillna_inplace, boolean_indexing],
n_range=2**np.arange(28),
equality_check=None,
max_time=10
))
results = (pd.concat([pd.DataFrame(x.timings_s, columns=x.n_range, index=x.labels) for x in out])
.rename_axis(index='method', columns='len(df)').stack().reset_index(name='timing (s)')
)
ax = sns.lineplot(df, x='len(df)', y='timing (s)', hue='method')
ax.set_xscale('log')
ax.set_yscale('log')
Be aware that in your timing code, only the first replicate seems to have NaNs
df1 = df.copy()
timeit.repeat('m = df1[2].isna() ; print(m.sum()) ; df1.loc[m,2] = 999', globals=locals(), repeat=2, number=2)
2212512 <- lots of NaNs in the first replicate
0 <- now they're gone
0
0
[0.06422067689709365, 0.021104251965880394]
Your code:
Your code with nrepeats = 1 ; nruns = 1
:
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With