Polars Convert Back From Dummies

Question

In pandas I can use the from_dummies method to reverse one-hot encoding. There doesn't seem to be a built-in equivalent in polars. Here is a basic example:

# One-hot encoded input: each column is named "<feature>_<category>" and holds 0/1 flags.
pl.DataFrame(
    {
        "col1_hi": [0, 0, 0, 1, 1],
        "col1_med": [0, 0, 1, 0, 0],
        "col1_lo": [1, 1, 0, 0, 0],
        "col2_yes": [1, 1, 0, 1, 0],
        "col2_no": [0, 0, 1, 0, 1],
    }
)

┌─────────┬──────────┬─────────┬──────────┬─────────┐
│ col1_hi ┆ col1_med ┆ col1_lo ┆ col2_yes ┆ col2_no │
│ ---     ┆ ---      ┆ ---     ┆ ---      ┆ ---     │
│ i64     ┆ i64      ┆ i64     ┆ i64      ┆ i64     │
╞═════════╪══════════╪═════════╪══════════╪═════════╡
│ 0       ┆ 0        ┆ 1       ┆ 1        ┆ 0       │
│ 0       ┆ 0        ┆ 1       ┆ 1        ┆ 0       │
│ 0       ┆ 1        ┆ 0       ┆ 0        ┆ 1       │
│ 1       ┆ 0        ┆ 0       ┆ 1        ┆ 0       │
│ 1       ┆ 0        ┆ 0       ┆ 0        ┆ 1       │
└─────────┴──────────┴─────────┴──────────┴─────────┘

Reversing the to_dummies operation should result in something like this:

┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ str  │
╞══════╪══════╡
│ lo   ┆ yes  │
│ lo   ┆ yes  │
│ med  ┆ no   │
│ hi   ┆ yes  │
│ hi   ┆ no   │
└──────┴──────┘

My first thought was to use a pivot. How could I go about implementing this functionality?

jqurious · Accepted Answer

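You could use pl.coalesce: first replace every 1 with the category taken from the column name (leaving null everywhere else), then coalesce the columns that share a prefix so that each row keeps its first non-null value.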

(
    df
    # Step 1: wherever a dummy column equals 1, replace the cell with the
    # column's suffix (the text after the last "_"); other cells become null.
    .with_columns(
        [
            pl.when(pl.col(name) == 1)
            .then(pl.lit(name).str.extract(r"([^_]+$)"))
            .alias(name)
            for name in df.columns
        ]
    )
    # Step 2: for every distinct prefix (in first-seen order), take the first
    # non-null value across the columns matching "<prefix>_...".
    .select(
        [
            pl.coalesce(pl.col(f"^{stem}_.+$")).alias(stem)
            for stem in dict.fromkeys(
                name.rsplit("_", maxsplit=1)[0] for name in df.columns
            )
        ]
    )
)

shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ str  │
╞══════╪══════╡
│ lo   ┆ yes  │
│ lo   ┆ yes  │
│ med  ┆ no   │
│ hi   ┆ yes  │
│ hi   ┆ no   │
└──────┴──────┘

Update: @Rodalm's approach is much neater:

def from_dummies(df, separator="_"):
    """Reverse a one-hot (dummy) encoding of a polars DataFrame.

    Every column name is split at the LAST occurrence of ``separator`` into
    ``<name><separator><value>``. Columns sharing the same ``name`` are merged
    into one string column holding, per row, the ``value`` of the first dummy
    column (in ``df.columns`` order) that equals 1. Rows where none of the
    dummy columns equals 1 become null.

    Args:
        df: DataFrame whose columns are all dummy columns.
        separator: String between the feature name and the category value.

    Returns:
        A DataFrame with one column per distinct feature name, in first-seen
        order.

    Raises:
        ValueError: If a column name does not contain ``separator``.
    """
    col_exprs = {}

    for col in df.columns:
        if separator not in col:
            raise ValueError(
                f"column {col!r} does not contain the separator {separator!r}"
            )
        name, value = col.rsplit(separator, maxsplit=1)
        # pl.lit is required: a bare str passed to .then() is parsed as a
        # column name by current polars, not as a string literal.
        expr = pl.when(pl.col(col) == 1).then(pl.lit(value))  # null otherwise
        col_exprs.setdefault(name, []).append(expr)

    return df.select(
        pl.coalesce(exprs)  # keep the first non-null expression value by row
        .alias(name)
        for name, exprs in col_exprs.items()
    )

Rodalm · Answer

A similar approach to @jqurious's answer using pl.coalesce:

from collections import defaultdict
import polars as pl

# Sample one-hot encoded frame: columns are named "<feature>_<category>".
df = pl.DataFrame(
    {
        "col1_hi": [0, 0, 0, 1, 1],
        "col1_med": [0, 0, 1, 0, 0],
        "col1_lo": [1, 1, 0, 0, 0],
        "col2_yes": [1, 1, 0, 1, 0],
        "col2_no": [0, 0, 1, 0, 1],
    }
)

def from_dummies(df, sep="_"):
    """Reverse a one-hot (dummy) encoding using ``pl.coalesce``.

    Every column name is split at the LAST occurrence of ``sep`` into
    ``<name><sep><value>``, so feature names that themselves contain ``sep``
    (e.g. ``"my_col_hi"``) are handled. Columns sharing a ``name`` are merged
    into one string column holding, per row, the ``value`` of the first dummy
    column (in ``df.columns`` order) that equals 1; rows with no 1 become null.

    Args:
        df: DataFrame whose columns are all dummy columns.
        sep: String between the feature name and the category value.

    Returns:
        A DataFrame with one column per distinct feature name.

    Raises:
        ValueError: If a column name does not contain ``sep``.
    """
    col_exprs = defaultdict(list)
    for col in df.columns:
        if sep not in col:
            raise ValueError(f"column {col!r} does not contain the separator {sep!r}")
        # rsplit with maxsplit=1 always yields exactly two parts; a plain
        # split(sep) fails to unpack when the name contains sep more than once.
        name, value = col.rsplit(sep, maxsplit=1)
        # pl.lit is required: a bare str passed to .then() is parsed as a
        # column name by current polars, not as a string literal.
        expr = pl.when(pl.col(col) == 1).then(pl.lit(value))  # null otherwise
        col_exprs[name].append(expr)

    res = df.select(**{
        name: pl.coalesce(exprs)  # keep the first non-null expression value by row
        for name, exprs in col_exprs.items()
    })
    return res

Alternatively, you can generalize @warwick12's approach by chaining multiple when/then calls:

def from_dummies(df, sep="_"):
    """Reverse a one-hot (dummy) encoding using one chained when/then per feature.

    Every column name is split at the LAST occurrence of ``sep`` into
    ``<name><sep><value>``. For each ``name`` a single
    ``when(...).then(...).when(...).then(...)`` chain is built over its dummy
    columns in ``df.columns`` order, producing the ``value`` of the first
    column that equals 1; rows with no 1 become null because the chain has no
    ``otherwise`` branch.

    Args:
        df: DataFrame whose columns are all dummy columns.
        sep: String between the feature name and the category value.

    Returns:
        A DataFrame with one column per distinct feature name.

    Raises:
        ValueError: If a column name does not contain ``sep``.
    """
    col_exprs = {}
    for col in df.columns:
        if sep not in col:
            raise ValueError(f"column {col!r} does not contain the separator {sep!r}")
        # rsplit with maxsplit=1 always yields exactly two parts; a plain
        # split(sep) fails to unpack when the name contains sep more than once.
        name, value = col.rsplit(sep, maxsplit=1)
        # Start a new chain for an unseen feature, otherwise extend the existing one.
        chain = col_exprs[name] if name in col_exprs else pl
        # pl.lit is required: a bare str passed to .then() is parsed as a
        # column name by current polars, not as a string literal.
        col_exprs[name] = chain.when(pl.col(col) == 1).then(pl.lit(value))

    return df.select(**col_exprs)

Output:

>>> from_dummies(df)

shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ str  │
╞══════╪══════╡
│ lo   ┆ yes  │
│ lo   ┆ yes  │
│ med  ┆ no   │
│ hi   ┆ yes  │
│ hi   ┆ no   │
└──────┴──────┘

Polars Convert Back From Dummies

Tags:

python

dataframe

python-polars

bkw1491

2 Answers

jqurious

Rodalm

Recent Activity

Donate For Us

Polars Convert Back From Dummies

Tags:

python

dataframe

python-polars

bkw1491

2 Answers

jqurious

Rodalm

Related questions

Recent Activity

Donate For Us