import pandas as pd
When there are no NA values, the result is correct.
!cat id1
1471341653427101696 1458379213265436885
pd.read_csv('id1',sep ='\t',header=None, na_values=['\\N'],dtype = 'Int64')
| 0 | 1 |
|---|---|
| 1471341653427101696 | 1458379213265436885 |
!cat id2
1471870967209926656 \N
1471341653427101696 1458379213265436885
1471458498691866624 1458379213265436889
When using Int64 and the column contains NA values, pandas returns the wrong numbers (the low-order digits of the large integers are lost):
df = pd.read_csv('id2',sep ='\t',header=None, na_values=['\\N'],dtype = 'Int64')
df
| 0 | 1 |
|---|---|
| 1471870967209926656 | <NA> |
| 1471341653427101696 | 1458379213265436672 |
| 1471458498691866624 | 1458379213265436672 |
When the file is read as str, the values are correct:
df = pd.read_csv('id2',sep ='\t',header=None, na_values=['\\N'],dtype = 'str')
df
| 0 | 1 |
|---|---|
| 1471870967209926656 | NaN |
| 1471341653427101696 | 1458379213265436885 |
| 1471458498691866624 | 1458379213265436889 |
df[1]
0 NaN
1 1458379213265436885
2 1458379213265436889
Name: 1, dtype: object
df.loc[[0],1].astype('Int64')
0 <NA>
Name: 1, dtype: Int64
df.loc[[1],1].astype('Int64')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/tmp/ipykernel_1971828/2578832362.py in <module>
----> 1 df.loc[[1],1].astype('Int64')
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
5813 else:
5814 # else, only a single dtype is given
-> 5815 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
5816 return self._constructor(new_data).__finalize__(self, method="astype")
5817
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
416
417 def astype(self: T, dtype, copy: bool = False, errors: str = "raise") -> T:
--> 418 return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
419
420 def convert(
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
325 applied = b.apply(f, **kwargs)
326 else:
--> 327 applied = getattr(b, f)(**kwargs)
328 except (TypeError, NotImplementedError):
329 if not ignore_failures:
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
589 values = self.values
590
--> 591 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
592
593 new_values = maybe_coerce_values(new_values)
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_array_safe(values, dtype, copy, errors)
1307
1308 try:
-> 1309 new_values = astype_array(values, dtype, copy=copy)
1310 except (ValueError, TypeError):
1311 # e.g. astype_nansafe can fail on object-dtype of strings
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_array(values, dtype, copy)
1255
1256 else:
-> 1257 values = astype_nansafe(values, dtype, copy=copy)
1258
1259 # in pandas we don't store numpy str dtypes, so convert to object
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
1103 # dispatch on extension dtype if needed
1104 if isinstance(dtype, ExtensionDtype):
-> 1105 return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)
1106
1107 elif not isinstance(dtype, np.dtype): # pragma: no cover
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/arrays/integer.py in _from_sequence(cls, scalars, dtype, copy)
321 cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
322 ) -> IntegerArray:
--> 323 values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy)
324 return IntegerArray(values, mask)
325
~/.conda/envs/work/lib/python3.9/site-packages/pandas/core/arrays/integer.py in coerce_to_array(values, dtype, mask, copy)
196 "mixed-integer-float",
197 ]:
--> 198 raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")
199
200 elif is_bool_dtype(values) and is_integer_dtype(dtype):
TypeError: object cannot be converted to an IntegerDtype
This was fixed in GH 50757, so it has worked correctly since pandas v2.0.0:
What’s new in 2.0.0 (April 3, 2023)
...
Fixed bug in Series constructor unnecessarily overflowing for nullable unsigned integer dtypes (GH 38798, GH 25880)
For older versions of pandas, the only way is to read the column as str, remove the NaN values, and then convert the remaining values to int.
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With