Suppose you have the following pandas DataFrame, df, with 4 columns and 10 rows. Here is an MWE (minimal working example):
# Ten sample records stored column-wise: two integer columns and two
# string columns.
data = {
    'age': [39, 50, 38, 53, 28, 37, 49, 52, 31, 42],
    'education-num': [13, 13, 9, 7, 13, 14, 5, 9, 14, 13],
    'workclass': [
        'State-gov', 'Self-emp-not-inc', 'Private', 'Private', 'Private',
        'Private', 'Private', 'Self-emp-not-inc', 'Private', 'Private',
    ],
    'income': [
        '<=50K', '<=50K', '<=50K', '<=50K', '<=50K',
        '<=50K', '<=50K', '>50K', '>50K', '>50K',
    ],
}
df = pd.DataFrame(data)
# Row-index labels of df that belong to each finished partition.
# pd.Index is used instead of the bare ``Int64Index`` name, which was never
# imported in this snippet and is no longer available in pandas 2.x.
finished_partitions = [
    pd.Index([2, 3, 6, 7], dtype='int64'),
    pd.Index([4, 5, 8], dtype='int64'),
    pd.Index([0, 1, 9], dtype='int64'),
]
# Columns that receive an aggregation function further down.
feature_columns = ['age', 'education-num', 'workclass']
def agg_categorical_column(series):
    """Collapse a categorical column into one comma-separated string.

    Args:
        series: Iterable of strings (e.g. a pandas Series of labels).

    Returns:
        A one-element list holding the distinct values joined by ``,``.
        The values are sorted so the result is reproducible; plain set
        iteration order for strings can differ between interpreter runs.
    """
    return [','.join(sorted(set(series)))]
def agg_numerical_column(series):
    """Summarise a numerical column by its mean.

    Args:
        series: Object exposing ``.mean()`` (e.g. a pandas Series).

    Returns:
        A one-element list containing ``series.mean()``.
    """
    return [series.mean()]
## my code:
def Convert(a):
    """Build a dict from a flat ``[key, value, key, value, ...]`` iterable.

    Args:
        a: Iterable whose items alternate between keys and values.

    Returns:
        dict mapping each even-position item to the item that follows it.
        A trailing item without a partner is dropped.
    """
    # zip() draws from the same iterator twice per step, which pairs each
    # item with its immediate successor.
    it = iter(a)
    return dict(zip(it, it))
# Map every feature column to its aggregation function: columns listed in
# `categorical` get agg_categorical_column, all others agg_numerical_column.
# Example result:
#   {'age': agg_numerical_column, 'education-num': agg_numerical_column,
#    'workclass': agg_categorical_column}
# NOTE(review): `categorical` is not defined in this snippet; it must already
# hold the names of the categorical columns (e.g. ['workclass', 'income']).
aggregations = {}
for column in feature_columns:
    if column in categorical:
        aggregations[column] = agg_categorical_column
    else:
        aggregations[column] = agg_numerical_column
# Aggregate a single partition: the first entry of finished_partitions.
partition = finished_partitions[0]
tmp = df.loc[partition] # rows of df whose index labels are in this partition
grouped_columns = tmp.agg(aggregations, squeeze=False) # this is the call that raises the ValueError in the traceback below
This generates the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.map_locations()
TypeError: unhashable type: 'list'
Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
File "pandas/_libs/hashtable_class_helper.pxi", line 4588, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'list'
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-98-980ed492ad0c> in <module>
1 partition = finished_partitions[0]
2 tmp = df.loc[partition] # Pandas DataFrame for partition, i, in partitions I
----> 3 grouped_columns = tmp.agg(aggregations, squeeze=False) # Pandas Series
/emr/notebook-env/lib/python3.7/site-packages/pandas/core/frame.py in aggregate(self, func, axis, *args, **kwargs)
7576 result = None
7577 try:
-> 7578 result, how = self._aggregate(func, axis, *args, **kwargs)
7579 except TypeError as err:
7580 exc = TypeError(
/emr/notebook-env/lib/python3.7/site-packages/pandas/core/frame.py in _aggregate(self, arg, axis, *args, **kwargs)
7607 result = result.T if result is not None else result
7608 return result, how
-> 7609 return aggregate(self, arg, *args, **kwargs)
7610
7611 agg = aggregate
/emr/notebook-env/lib/python3.7/site-packages/pandas/core/aggregation.py in aggregate(obj, arg, *args, **kwargs)
580 elif is_dict_like(arg):
581 arg = cast(AggFuncTypeDict, arg)
--> 582 return agg_dict_like(obj, arg, _axis), True
583 elif is_list_like(arg):
584 # we require a list, but not an 'str'
/emr/notebook-env/lib/python3.7/site-packages/pandas/core/aggregation.py in agg_dict_like(obj, arg, _axis)
784 # There is a mix of NDFrames and scalars
785 raise ValueError(
--> 786 "cannot perform both aggregation "
787 "and transformation operations "
788 "simultaneously"
ValueError: cannot perform both aggregation and transformation operations simultaneously
Update
@wwnde I'm not sure what the expected output should look like for this particular input data, since these functions come from a repo I'm trying to get working. It implements an algorithm called k-anonymizer, which is basically supposed to condense the 10 rows into fewer than 10 rows that look something like:
age: education-num: workclass income
28-31 7-14 ['State-gov', 'Private'] '<=50K'
42-52 13-14 ['Private'] '>50K'
...
Note the following troubleshooting results:
df['age'].agg(agg_numerical_column) # works
[41.9]
aggregations['age']
<function __main__.agg_numerical_column(series)>
df['age'].agg(aggregations['age'])
[41.9]
df.agg(aggregations, squeeze=False) # gives error above
I think this is a bug related to pandas issue #41768 (pandas/issues/41768).
I minimized the demo code to:
# A single string column is enough to reproduce the behaviour.
data = {'workclass': ['State-gov', 'Self-emp-not-inc', 'Private']}
df = pd.DataFrame(data)
def agg_categorical_column(series):
    """Join the distinct items of ``series`` with commas, tracing the input.

    The type and value of the argument are printed first, which reveals
    whether the caller passed a whole Series or a single element.

    Args:
        series: Iterable of strings; a plain ``str`` is iterated per character.

    Returns:
        A one-element list containing the comma-joined distinct items
        (in set iteration order).
    """
    print(f'Input object type: {type(series)}')
    print(f'Input object looks like:\n {series}')
    return [','.join(set(series))]
aggregations = {'workclass': agg_categorical_column}
workclass_frame = df[['workclass']]

# Passing the function itself: it receives the column as a whole Series.
res = workclass_frame.agg(aggregations['workclass'])
print('results is a series as expected.\n', res)
print('\n\n')

# Passing the dict: the function is applied to every element instead,
# as if agg were a map over the column.
res = workclass_frame.agg(aggregations)
print('results is a dataframe but with strange value:\n', res)
Outputs:
Input object type: <class 'pandas.core.series.Series'>
Input object looks like:
0 State-gov
1 Self-emp-not-inc
2 Private
Name: workclass, dtype: object
results is a series as expected.
workclass
0 Private,Self-emp-not-inc,State-gov
Input object type: <class 'str'>
Input object looks like:
State-gov
Input object type: <class 'str'>
Input object looks like:
Self-emp-not-inc
Input object type: <class 'str'>
Input object looks like:
Private
results is a dataframe but with strange value:
workclass
0 [g,v,-,t,S,e,o,a]
1 [l,-,p,f,t,S,e,i,n,o,c,m]
2 [v,P,t,r,e,i,a]
Alternative:
# Names of the string-valued columns. 'income' is listed because it is
# present in df; drop it from both df and this list if it is not needed.
categorical = ['workclass', 'income']
df = pd.DataFrame(data)
def agg_categorical_column(series):
    """Collapse a categorical column into one comma-separated string.

    Args:
        series: Iterable of strings (e.g. a pandas Series of labels).

    Returns:
        A one-element list holding the distinct values joined by ``,``.
        The values are sorted so the result is reproducible; plain set
        iteration order for strings can differ between interpreter runs.
    """
    return [','.join(sorted(set(series)))]
def agg_numerical_column(series):
    """Summarise a numerical column by its mean.

    Args:
        series: Object exposing ``.mean()`` (e.g. a pandas Series).

    Returns:
        A one-element list containing ``series.mean()``.
    """
    return [series.mean()]
# Partition the column names by kind, keeping their order in df.
cat_columns = [c for c in df.columns if c in categorical]
num_columns = [c for c in df.columns if c not in categorical]
cat_df = df[cat_columns]
num_df = df[num_columns]
# Aggregate each half with its own function, then put the two results
# side by side (numerical columns first).
pd.concat(
    [num_df.agg(agg_numerical_column), cat_df.agg(agg_categorical_column)],
    axis=1,
)
Outputs:
age education-num workclass income
0 41.9 11.0 Private,Self-emp-not-inc,State-gov <=50K,>50K
I had a similar issue and found an easy workaround; I'm sharing it in case it helps. Adding a dummy column with a single constant value and grouping by it before aggregating produces the expected result without raising the error:
#instead of
tmp = df.loc[partition] # rows of df whose index labels are in this partition
grouped_columns = tmp.agg(aggregations, squeeze=False) # the call from the question that raises the ValueError
#use
# Take the partition's rows and add a constant helper column in one step;
# assign() returns a new frame, so no column is written into a selection of df.
tmp = df.loc[partition].assign(dummy_id=1)
# Grouping by the constant column yields exactly one group.
# reset_index(drop=True) then discards the helper key; the earlier
# .drop('id', axis=1) raised KeyError because no column is named 'id'.
# `squeeze=False` is omitted: agg forwards extra keyword arguments to the
# aggregation functions, and those functions do not accept it.
grouped_columns = tmp.groupby('dummy_id').agg(aggregations).reset_index(drop=True) # one-row DataFrame
The result is (using the same aggregation functions as in the example, which return lists):
age education-num workclass income
0 [48] [7.5] [Self-emp-not-inc,Private] [>50K,<=50K]
If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!
Donate Us With