I would like to create a cross table that shows, in each cell, the percentages of rows over the total number of rows.
Inspired by this post I started with:
df = pl.DataFrame({"a": [2, 0, 1, 0, 0, 0], "b": [1, 1, 1, 0, 0, 1]})
crosstab = (
df.pivot(on="b", index="a", values="b", aggregate_function="len", sort_columns=True)
.fill_null(0)
.sort("a")
)
crosstab
and then inspired by polars' user guide I tried to convert values into percentages with:
def perc_cols(df):
tot = df.select(~cs.by_index(0)).to_numpy().sum()
for col in df.columns[1:]:
yield ((pl.col(col) / tot) * 100)
crosstab.select(cs.by_index(0), perc_cols(crosstab))
but I get an error:
TypeError: cannot create expression literal for value of type generator.
notice that both crosstab.select(cs.by_index(0)) and crosstab.select(perc_cols(crosstab)) works as expected.
When passing multiple args you must manually unpack generators.
crosstab.select(cs.by_index(0), *perc_cols(crosstab))
shape: (3, 3)
┌─────┬───────────┬───────────┐
│ a ┆ 0 ┆ 1 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 │
╞═════╪═══════════╪═══════════╡
│ 0 ┆ 33.333333 ┆ 33.333333 │
│ 1 ┆ 0.0 ┆ 16.666667 │
│ 2 ┆ 0.0 ┆ 16.666667 │
└─────┴───────────┴───────────┘
It should be possible to do natively in Polars though e.g. using .sum() and pl.sum_horizontal() to get the total.
crosstab.select(cs.exclude(cs.first()).sum())
# shape: (1, 2)
# ┌─────┬─────┐
# │ 0 ┆ 1 │
# │ --- ┆ --- │
# │ u32 ┆ u32 │
# ╞═════╪═════╡
# │ 2 ┆ 4 │
# └─────┴─────┘
crosstab.select(pl.sum_horizontal(cs.exclude(cs.first()).sum()))
# shape: (1, 1)
# ┌─────┐
# │ 0 │
# │ --- │
# │ u32 │
# ╞═════╡
# │ 6 │
# └─────┘
Which you can use directly in your calculation:
crosstab.with_columns(
cs.exclude(cs.first()) / pl.sum_horizontal(cs.exclude(cs.first()).sum()) * 100
)
shape: (3, 3)
┌─────┬───────────┬───────────┐
│ a ┆ 0 ┆ 1 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 │
╞═════╪═══════════╪═══════════╡
│ 0 ┆ 33.333333 ┆ 33.333333 │
│ 1 ┆ 0.0 ┆ 16.666667 │
│ 2 ┆ 0.0 ┆ 16.666667 │
└─────┴───────────┴───────────┘
Selectors/expressions can also be stored in variables if you wanted to split it up into steps.
def perc_cols(df):
other_cols = cs.exclude(cs.first())
total = pl.sum_horizontal(other_cols.sum())
return df.with_columns(other_cols / total * 100)
crosstab.pipe(perc_cols)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With