In pandas I can use the `from_dummies` method to reverse one-hot encoding. There doesn't seem to be a built-in method for this in polars. Here is a basic example:
# Example one-hot encoded input: each column is named "<prefix>_<value>"
# and holds 0/1 flags.
pl.DataFrame(
    {
        "col1_hi": [0, 0, 0, 1, 1],
        "col1_med": [0, 0, 1, 0, 0],
        "col1_lo": [1, 1, 0, 0, 0],
        "col2_yes": [1, 1, 0, 1, 0],
        "col2_no": [0, 0, 1, 0, 1],
    }
)
┌─────────┬──────────┬─────────┬──────────┬─────────┐
│ col1_hi ┆ col1_med ┆ col1_lo ┆ col2_yes ┆ col2_no │
│ ---     ┆ ---      ┆ ---     ┆ ---      ┆ ---     │
│ i64     ┆ i64      ┆ i64     ┆ i64      ┆ i64     │
╞═════════╪══════════╪═════════╪══════════╪═════════╡
│ 0       ┆ 0        ┆ 1       ┆ 1        ┆ 0       │
│ 0       ┆ 0        ┆ 1       ┆ 1        ┆ 0       │
│ 0       ┆ 1        ┆ 0       ┆ 0        ┆ 1       │
│ 1       ┆ 0        ┆ 0       ┆ 1        ┆ 0       │
│ 1       ┆ 0        ┆ 0       ┆ 0        ┆ 1       │
└─────────┴──────────┴─────────┴──────────┴─────────┘
Reversing the to_dummies operation should result in something like this:
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ str  │
╞══════╪══════╡
│ lo   ┆ yes  │
│ lo   ┆ yes  │
│ med  ┆ no   │
│ hi   ┆ yes  │
│ hi   ┆ no   │
└──────┴──────┘
My first thought was to use a pivot. How could I go about implementing this functionality?
You could use `pl.coalesce`:
# Reverse the one-hot encoding of `df` in two steps:
#   1. Replace every flag column with its value label (the text after the
#      last "_") where the flag is 1, and null elsewhere.
#   2. For each prefix (the text before the last "_"), coalesce that prefix's
#      columns so each row keeps its first non-null label.
(
    df
    .with_columns(
        pl.when(pl.col(col) == 1)
        .then(pl.lit(col).str.extract(r"([^_]+$)"))
        .alias(col)
        for col in df.columns
    )
    .select(
        # `[^_]+` (not `.+`) keeps the match to columns whose prefix is exactly
        # `prefix`; otherwise "col1" would also pick up e.g. "col1_extra_a".
        # NOTE: `prefix` is interpolated into the regex unescaped.
        pl.coalesce(pl.col(f"^{prefix}_[^_]+$")).alias(prefix)
        # dict.fromkeys de-duplicates the prefixes while keeping column order.
        for prefix in dict.fromkeys(
            col.rsplit("_", maxsplit=1)[0] for col in df.columns
        )
    )
)
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ str  │
╞══════╪══════╡
│ lo   ┆ yes  │
│ lo   ┆ yes  │
│ med  ┆ no   │
│ hi   ┆ yes  │
│ hi   ┆ no   │
└──────┴──────┘
Update: @Rodalm's approach is much neater:
def from_dummies(df, separator="_"):
    """Reverse a one-hot encoding produced with ``<name><separator><value>`` columns.

    Each column name is split on the LAST occurrence of ``separator`` into an
    output column name and a value label. For every output column, each row
    receives the label of the first source column (in frame order) whose
    entry equals 1, or null if none does.

    Args:
        df: Frame whose columns are all one-hot flag columns.
        separator: String separating the output column name from the label.

    Returns:
        A frame with one column per distinct name prefix.

    Raises:
        ValueError: If a column name does not contain ``separator``.
    """
    col_exprs = {}
    for col in df.columns:
        name, value = col.rsplit(separator, maxsplit=1)
        # Wrap the label in pl.lit: a bare string passed to `then` is parsed
        # as a column name by current polars, not as a literal.
        expr = pl.when(pl.col(col) == 1).then(pl.lit(value))  # null otherwise
        col_exprs.setdefault(name, []).append(expr)
    return df.select(
        pl.coalesce(exprs).alias(name)  # first non-null label per row
        for name, exprs in col_exprs.items()
    )
A similar approach to @jqurious's answer using pl.coalesce:
from collections import defaultdict
import polars as pl
# Sample one-hot encoded frame used to demonstrate from_dummies below.
df = pl.DataFrame(
    {
        "col1_hi": [0, 0, 0, 1, 1],
        "col1_med": [0, 0, 1, 0, 0],
        "col1_lo": [1, 1, 0, 0, 0],
        "col2_yes": [1, 1, 0, 1, 0],
        "col2_no": [0, 0, 1, 0, 1],
    }
)
def from_dummies(df, sep="_"):
    """Reverse a one-hot encoding by coalescing per-column label expressions.

    Each column name is split on the LAST occurrence of ``sep`` into an output
    column name and a value label, so output names may themselves contain
    ``sep``. For every output column, each row receives the label of the
    first source column (in frame order) whose entry equals 1, or null if
    none does.

    Args:
        df: Frame whose columns are all one-hot flag columns.
        sep: String separating the output column name from the label.

    Returns:
        A frame with one column per distinct name prefix.

    Raises:
        ValueError: If a column name does not contain ``sep``.
    """
    col_exprs = defaultdict(list)
    for col in df.columns:
        # rsplit with maxsplit=1 always yields exactly two parts when `sep`
        # is present; a plain split fails to unpack for names like "a_b_c".
        name, value = col.rsplit(sep, maxsplit=1)
        # Wrap the label in pl.lit: a bare string passed to `then` is parsed
        # as a column name by current polars, not as a literal.
        expr = pl.when(pl.col(col) == 1).then(pl.lit(value))  # null otherwise
        col_exprs[name].append(expr)
    return df.select(**{
        name: pl.coalesce(exprs)  # first non-null label per row
        for name, exprs in col_exprs.items()
    })
Or, generalizing @warwick12's approach by chaining multiple `when`/`then` expressions:
def from_dummies(df, sep="_"):
    """Reverse a one-hot encoding with one chained when/then per output column.

    Each column name is split on the LAST occurrence of ``sep`` into an output
    column name and a value label. All source columns sharing a name are
    folded into a single ``when(...).then(...).when(...).then(...)`` chain,
    so each row receives the label of the first source column (in frame
    order) whose entry equals 1, or null if none does.

    Args:
        df: Frame whose columns are all one-hot flag columns.
        sep: String separating the output column name from the label.

    Returns:
        A frame with one column per distinct name prefix.

    Raises:
        ValueError: If a column name does not contain ``sep``.
    """
    col_exprs = {}
    for col in df.columns:
        # rsplit with maxsplit=1 always yields exactly two parts when `sep`
        # is present; a plain split fails to unpack for names like "a_b_c".
        name, value = col.rsplit(sep, maxsplit=1)
        is_hot = pl.col(col) == 1
        # Wrap the label in pl.lit: a bare string passed to `then` is parsed
        # as a column name by current polars, not as a literal.
        label = pl.lit(value)
        if name in col_exprs:
            # Extend the existing chain with another branch.
            col_exprs[name] = col_exprs[name].when(is_hot).then(label)
        else:
            # First column seen for this name starts a new chain.
            col_exprs[name] = pl.when(is_hot).then(label)
    return df.select(**col_exprs)
Output:
>>> from_dummies(df)
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ str  │
╞══════╪══════╡
│ lo   ┆ yes  │
│ lo   ┆ yes  │
│ med  ┆ no   │
│ hi   ┆ yes  │
│ hi   ┆ no   │
└──────┴──────┘
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!
Donate Us With