I have a column of object dtype in a Pandas DataFrame.
It contains the following data:
array(['9.4', '9.8', '10', '9.5', '10.5', '9.2', '9.9', '9.1', '9.3', '9',
'9.7', '10.1', '10.6', '9.6', '10.8', '10.3', '13.1', '10.2',
'10.9', '10.7', '12.9', '10.4', '13', '14', '11.5', '11.4', '12.4',
'11', '12.2', '12.8', '12.6', '12.5', '11.7', '11.3', '12.3', '12',
'11.9', '11.8', '8.7', '13.3', '11.2', '11.6', '11.1', '13.4',
'12.1', '8.4', '12.7', '14.9', '13.2', '13.6', '13.5',
'100.333.333.333.333', '9.55', '8.5', '110.666.666.666.667',
'956.666.666.666.667', '10.55', '8.8', '135.666.666.666.667',
'11.95', '9.95', '923.333.333.333.333', '9.25', '9.05', '10.75',
'8.6', '8.9', '13.9', '13.7', '8', '8.0', '14.2', '11.94',
'128.933.333.333.333', '114.666.666.666.667', '10.98',
'114.333.333.333.333', '105.333.333.333.333',
'953.333.333.333.333', '109.333.333.333.333',
'113.666.666.666.667', '113.333.333.333.333',
'973.333.333.333.333', '11.05', '9.75', '11.35', '11.45', '14.05',
'123.333.333.333.333', '12.75', '13.8', '12.15', '13.05',
'112.666.666.666.667', '105.666.666.666.667',
'117.333.333.333.333', '11.75', '10.65', '109.666.666.666.667',
'101.333.333.333.333', '10.15', '104.666.666.666.667',
'116.333.333.333.333', '12.25', '11.85', '11.65', '13.55',
'131.333.333.333.333', '120.666.666.666.667', '11.55',
'963.333.333.333.333', '12.05'], dtype=object)
I want to replace the values that contain more than one dot ('.') with 0 or something similar. I am not very familiar with regexes, but the idea is to use a regex for this problem instead of replacing each bad value one by one, like '953.333.333.333.333' here:
DF.replace({'column': '953.333.333.333.333'}, '0')
Thanks a lot!
Using numpy.where with Series.str.count and Series.gt:
import numpy as np

DF['column'] = np.where(DF['column'].str.count(r'\.').gt(1), 0, DF['column'])
[out]
array(['9.4', '9.8', '10', '9.5', '10.5', '9.2', '9.9', '9.1', '9.3', '9',
'9.7', '10.1', '10.6', '9.6', '10.8', '10.3', '13.1', '10.2',
'10.9', '10.7', '12.9', '10.4', '13', '14', '11.5', '11.4', '12.4',
'11', '12.2', '12.8', '12.6', '12.5', '11.7', '11.3', '12.3', '12',
'11.9', '11.8', '8.7', '13.3', '11.2', '11.6', '11.1', '13.4',
'12.1', '8.4', '12.7', '14.9', '13.2', '13.6', '13.5', 0, '9.55',
'8.5', 0, 0, '10.55', '8.8', 0, '11.95', '9.95', 0, '9.25', '9.05',
'10.75', '8.6', '8.9', '13.9', '13.7', '8', '8.0', '14.2', '11.94',
0, 0, '10.98', 0, 0, 0, 0, 0, 0, 0, '11.05', '9.75', '11.35',
'11.45', '14.05', 0, '12.75', '13.8', '12.15', '13.05', 0, 0, 0,
'11.75', '10.65', 0, 0, '10.15', 0, 0, '12.25', '11.85', '11.65',
'13.55', 0, 0, '11.55', 0, '12.05'], dtype=object)
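As a self-contained illustration of the approach above (the small DataFrame below is a made-up sample; the names DF and 'column' follow the question):

import numpy as np
import pandas as pd

# Made-up sample mixing well-formed values with multi-dot strings
DF = pd.DataFrame({'column': ['9.4', '10', '953.333.333.333.333', '8.0', '100.333.333.333.333']})

# Values containing more than one literal dot are replaced with 0
DF['column'] = np.where(DF['column'].str.count(r'\.').gt(1), 0, DF['column'])

print(DF['column'].tolist())
# ['9.4', '10', 0, '8.0', 0]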
This expression skips over the well-formed elements (quoted numbers with at most one dot) and captures only the extra dots inside the malformed ones:
'\d+\.\d+'|'\d+'|(\.)
The dots you want to act on end up in this capturing group:
(\.)
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility
import re
regex = r"'\d+\.\d+'|'\d+'|(\.)"
test_str = ("'9.4', '9.8', '10', '9.5', '10.5', '9.2', '9.9', '9.1', '9.3', '9',\n"
" '9.7', '10.1', '10.6', '9.6', '10.8', '10.3', '13.1', '10.2',\n"
" '10.9', '10.7', '12.9', '10.4', '13', '14', '11.5', '11.4', '12.4',\n"
" '11', '12.2', '12.8', '12.6', '12.5', '11.7', '11.3', '12.3', '12',\n"
" '11.9', '11.8', '8.7', '13.3', '11.2', '11.6', '11.1', '13.4',\n"
" '12.1', '8.4', '12.7', '14.9', '13.2', '13.6', '13.5',\n"
" '100.333.333.333.333', '9.55', '8.5', '110.666.666.666.667',\n"
" '956.666.666.666.667', '10.55', '8.8', '135.666.666.666.667',\n"
" '11.95', '9.95', '923.333.333.333.333', '9.25', '9.05', '10.75',\n"
" '8.6', '8.9', '13.9', '13.7', '8', '8.0', '14.2', '11.94',\n"
" '128.933.333.333.333', '114.666.666.666.667', '10.98',\n"
" '114.333.333.333.333', '105.333.333.333.333',\n"
" '953.333.333.333.333', '109.333.333.333.333',\n"
" '113.666.666.666.667', '113.333.333.333.333',\n"
" '973.333.333.333.333', '11.05', '9.75', '11.35', '11.45', '14.05',\n"
" '123.333.333.333.333', '12.75', '13.8', '12.15', '13.05',\n"
" '112.666.666.666.667', '105.666.666.666.667',\n"
" '117.333.333.333.333', '11.75', '10.65', '109.666.666.666.667',\n"
" '101.333.333.333.333', '10.15', '104.666.666.666.667',\n"
" '116.333.333.333.333', '12.25', '11.85', '11.65', '13.55',\n"
" '131.333.333.333.333', '120.666.666.666.667', '11.55',\n"
" '963.333.333.333.333', '12.05'")
matches = re.finditer(regex, test_str, re.MULTILINE)
for matchNum, match in enumerate(matches, start=1):
    print("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    for groupNum in range(0, len(match.groups())):
        groupNum = groupNum + 1
        print("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.
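If the goal is the replacement itself rather than just inspecting the matches, the same idea can be applied directly in pandas. A minimal sketch (the sample DataFrame is made up, and note this inserts the string '0' rather than the integer 0):

import pandas as pd

# Made-up sample; 'column' follows the question's naming
DF = pd.DataFrame({'column': ['9.4', '953.333.333.333.333', '11.95', '114.666.666.666.667']})

# A value with two or more dot-separated groups after the leading digits is malformed
DF['column'] = DF['column'].replace(r'^\d+(?:\.\d+){2,}$', '0', regex=True)

print(DF['column'].tolist())
# ['9.4', '0', '11.95', '0']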