분석 결과의 신뢰도를 높이기 위한 데이터 전처리 과정을 학습합니다.
import warnings
warnings.filterwarnings('ignore')  # silence library warnings for cleaner notebook output

# NOTE: numpy, pandas, and matplotlib were each imported twice in the original
# cells; the duplicates are harmless no-ops and have been consolidated here.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Remember the original setting so it can be restored at the end of the file.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 25
pd.options.display.max_columns = 20
pd.options.display.max_colwidth = 82

# Make the random examples reproducible and keep printed floats compact.
np.random.seed(12345)
np.set_printoptions(precision=4, suppress=True)

# Default figure size for all plots.
plt.rc("figure", figsize=(10, 6))

# Matplotlib Korean font configuration (for macOS).
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)
누락된 데이터 처리
결측치(NaN, None)를 확인하고 제거하거나 다른 값으로 채우는 방법을 알아봅니다.
float_data = pd.Series([1.2 , - 3.5 , np.nan, 0 ])
float_data
0 1.2
1 -3.5
2 NaN
3 0.0
dtype: float64
누락된 데이터 처리
결측치(Null, NA)를 식별하고 정제하는 다양한 기법을 배웁니다.
0 False
1 False
2 True
3 False
dtype: bool
string_data = pd.Series(["aardvark" , np.nan, None , "avocado" ])
string_data
string_data.isna()
float_data = pd.Series([1 , 2 , None ], dtype= 'float64' )
float_data
float_data.isna()
0 False
1 False
2 True
dtype: bool
data = pd.Series([1 , np.nan, 3.5 , np.nan, 7 ])
data.dropna()
0 1.0
2 3.5
4 7.0
dtype: float64
0 1.0
2 3.5
4 7.0
dtype: float64
data = pd.DataFrame([[1. , 6.5 , 3. ], [1. , np.nan, np.nan],
[np.nan, np.nan, np.nan], [np.nan, 6.5 , 3. ]])
data
data.dropna()
0
1.0
6.5
3.0
1
1.0
NaN
NaN
3
NaN
6.5
3.0
data[4 ] = np.nan
data
data.dropna(axis= "columns" , how= "all" )
0
1.0
6.5
3.0
1
1.0
NaN
NaN
2
NaN
NaN
NaN
3
NaN
6.5
3.0
df = pd.DataFrame(np.random.standard_normal((7 , 3 )))
df.iloc[:4 , 1 ] = np.nan
df.iloc[:2 , 2 ] = np.nan
df
df.dropna()
df.dropna(thresh= 2 )
2
0.092908
NaN
0.769023
3
1.246435
NaN
-1.296221
4
0.274992
0.228913
1.352917
5
0.886429
-2.001637
-0.371843
6
1.669025
-0.438570
-0.539741
0
-0.204708
0.000000
0.000000
1
-0.555730
0.000000
0.000000
2
0.092908
0.000000
0.769023
3
1.246435
0.000000
-1.296221
4
0.274992
0.228913
1.352917
5
0.886429
-2.001637
-0.371843
6
1.669025
-0.438570
-0.539741
df.fillna({1 : 0.5 , 2 : 0 })
0
-0.204708
0.500000
0.000000
1
-0.555730
0.500000
0.000000
2
0.092908
0.500000
0.769023
3
1.246435
0.500000
-1.296221
4
0.274992
0.228913
1.352917
5
0.886429
-2.001637
-0.371843
6
1.669025
-0.438570
-0.539741
# Forward-fill examples: build a frame with trailing NaN runs
# (column 1: rows 2-5 are NaN; column 2: rows 4-5 are NaN).
df = pd.DataFrame(np.random.standard_normal((6, 3)))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df
# fillna(method="ffill") is deprecated since pandas 2.1 (removed in 3.0);
# DataFrame.ffill is the supported spelling with identical behavior.
df.ffill()
df.ffill(limit=2)  # propagate each value forward at most 2 consecutive rows
0
0.476985
3.248944
-1.021228
1
-0.577087
0.124121
0.302614
2
0.523772
0.124121
1.343810
3
-0.713544
0.124121
-2.370232
4
-1.860761
NaN
-2.370232
5
-1.265934
NaN
-2.370232
data = pd.Series([1. , np.nan, 3.5 , np.nan, 7 ])
data.fillna(data.mean())
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000
dtype: float64
데이터 변형
중복 제거, 값 치환, 이름 변경 등 데이터의 구조를 바꾸는 작업을 수행합니다.
data = pd.DataFrame({"k1" : ["one" , "two" ] * 3 + ["two" ],
"k2" : [1 , 1 , 2 , 3 , 3 , 4 , 4 ]})
data
0
one
1
1
two
1
2
one
2
3
two
3
4
one
3
5
two
4
6
two
4
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
데이터 변형
중복 제거, 값 치환 등 데이터의 무결성을 높이는 작업을 수행합니다.
0
one
1
1
two
1
2
one
2
3
two
3
4
one
3
5
two
4
data["v1" ] = range (7 )
data
data.drop_duplicates(subset= ["k1" ])
data.drop_duplicates(["k1" , "k2" ], keep= "last" )
0
one
1
0
1
two
1
1
2
one
2
2
3
two
3
3
4
one
3
4
6
two
4
6
data = pd.DataFrame({"food" : ["bacon" , "pulled pork" , "bacon" ,
"pastrami" , "corned beef" , "bacon" ,
"pastrami" , "honey ham" , "nova lox" ],
"ounces" : [4 , 3 , 12 , 6 , 7.5 , 8 , 3 , 5 , 6 ]})
data
0
bacon
4.0
1
pulled pork
3.0
2
bacon
12.0
3
pastrami
6.0
4
corned beef
7.5
5
bacon
8.0
6
pastrami
3.0
7
honey ham
5.0
8
nova lox
6.0
meat_to_animal = {
"bacon" : "pig" ,
"pulled pork" : "pig" ,
"pastrami" : "cow" ,
"corned beef" : "cow" ,
"honey ham" : "pig" ,
"nova lox" : "salmon"
}
data["animal" ] = data["food" ].map (meat_to_animal)
data
0
bacon
4.0
pig
1
pulled pork
3.0
pig
2
bacon
12.0
pig
3
pastrami
6.0
cow
4
corned beef
7.5
cow
5
bacon
8.0
pig
6
pastrami
3.0
cow
7
honey ham
5.0
pig
8
nova lox
6.0
salmon
def get_animal(food):
    """Return the source animal for *food* via the module-level meat_to_animal table."""
    return meat_to_animal[food]
data["food" ].map (get_animal)
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object
data = pd.Series([1. , - 999. , 2. , - 999. , - 1000. , 3. ])
data
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
dtype: float64
data.replace(- 999 , np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
data.replace([- 999 , - 1000 ], np.nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
data.replace([- 999 , - 1000 ], [np.nan, 0 ])
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
data.replace({- 999 : np.nan, - 1000 : 0 })
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
data = pd.DataFrame(np.arange(12 ).reshape((3 , 4 )),
index= ["Ohio" , "Colorado" , "New York" ],
columns= ["one" , "two" , "three" , "four" ])
def transform(label):
    """Uppercase the first four characters of an index label."""
    prefix = label[:4]
    return prefix.upper()
data.index.map (transform)
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map (transform)
data
OHIO
0
1
2
3
COLO
4
5
6
7
NEW
8
9
10
11
data.rename(index= str .title, columns= str .upper)
Ohio
0
1
2
3
Colo
4
5
6
7
New
8
9
10
11
data.rename(index= {"OHIO" : "INDIANA" },
columns= {"three" : "peekaboo" })
INDIANA
0
1
2
3
COLO
4
5
6
7
NEW
8
9
10
11
ages = [20 , 22 , 25 , 27 , 21 , 23 , 37 , 31 , 61 , 45 , 41 , 32 ]
이산화와 개별화
cut과 qcut을 사용하여 연속형 데이터를 구간별로 나누는 방법을 배웁니다.
연속형 데이터를 구간별 범주 데이터로 변환하는 방법을 익힙니다.
bins = [18 , 25 , 35 , 60 , 100 ]
age_categories = pd.cut(ages, bins)
age_categories
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
age_categories.codes
age_categories.categories
age_categories.categories[0]
# pd.value_counts is deprecated since pandas 2.1; wrap the Categorical in a
# Series and use the Series.value_counts method instead.
pd.Series(age_categories).value_counts()
(18, 25] 5
(25, 35] 3
(35, 60] 3
(60, 100] 1
Name: count, dtype: int64
pd.cut(ages, bins, right= False )
[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]
group_names = ["Youth" , "YoungAdult" , "MiddleAged" , "Senior" ]
pd.cut(ages, bins, labels= group_names)
['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
data = np.random.uniform(size= 20 )
pd.cut(data, 4 , precision= 2 )
[(0.34, 0.55], (0.34, 0.55], (0.76, 0.97], (0.76, 0.97], (0.34, 0.55], ..., (0.34, 0.55], (0.34, 0.55], (0.55, 0.76], (0.34, 0.55], (0.12, 0.34]]
Length: 20
Categories (4, interval[float64, right]): [(0.12, 0.34] < (0.34, 0.55] < (0.55, 0.76] < (0.76, 0.97]]
# Cut 1,000 standard-normal draws into sample quartiles (250 values each).
data = np.random.standard_normal(1000)
quartiles = pd.qcut(data, 4, precision=2)
quartiles
# pd.value_counts is deprecated since pandas 2.1; count via a Series instead.
pd.Series(quartiles).value_counts()
(-2.96, -0.68] 250
(-0.68, -0.026] 250
(-0.026, 0.62] 250
(0.62, 3.93] 250
Name: count, dtype: int64
pd.qcut(data, [0 , 0.1 , 0.5 , 0.9 , 1. ]).value_counts()
(-2.9499999999999997, -1.187] 100
(-1.187, -0.0265] 400
(-0.0265, 1.286] 400
(1.286, 3.928] 100
Name: count, dtype: int64
data = pd.DataFrame(np.random.standard_normal((1000 , 4 )))
data.describe()
count
1000.000000
1000.000000
1000.000000
1000.000000
mean
0.049091
0.026112
-0.002544
-0.051827
std
0.996947
1.007458
0.995232
0.998311
min
-3.645860
-3.184377
-3.745356
-3.428254
25%
-0.599807
-0.612162
-0.687373
-0.747478
50%
0.047101
-0.013609
-0.022158
-0.088274
75%
0.756646
0.695298
0.699046
0.623331
max
2.653656
3.525865
2.735527
3.366626
col = data[2 ]
col[col.abs () > 3 ]
41 -3.399312
136 -3.745356
Name: 2, dtype: float64
data[(data.abs () > 3 ).any (axis= "columns" )]
41
0.457246
-0.025907
-3.399312
-0.974657
60
1.951312
3.260383
0.963301
1.201206
136
0.508391
-0.196713
-3.745356
-1.520113
235
-0.242459
-3.056990
1.918403
-0.578828
258
0.682841
0.326045
0.425384
-3.428254
322
1.179227
-3.184377
1.369891
-1.074833
544
-3.548824
1.553205
-2.186301
1.277104
635
-0.578093
0.193299
1.397822
3.366626
782
-0.207434
3.525865
0.283070
0.544635
803
-3.645860
0.255475
-0.549574
-1.907459
data[data.abs () > 3 ] = np.sign(data) * 3
data.describe()
count
1000.000000
1000.000000
1000.000000
1000.000000
mean
0.050286
0.025567
-0.001399
-0.051765
std
0.992920
1.004214
0.991414
0.995761
min
-3.000000
-3.000000
-3.000000
-3.000000
25%
-0.599807
-0.612162
-0.687373
-0.747478
50%
0.047101
-0.013609
-0.022158
-0.088274
75%
0.756646
0.695298
0.699046
0.623331
max
2.653656
3.000000
2.735527
3.000000
0
-1.0
1.0
-1.0
1.0
1
1.0
-1.0
1.0
-1.0
2
1.0
1.0
1.0
-1.0
3
-1.0
-1.0
1.0
-1.0
4
-1.0
1.0
-1.0
-1.0
df = pd.DataFrame(np.arange(5 * 7 ).reshape((5 , 7 )))
df
sampler = np.random.permutation(5 )
sampler
df.take(sampler)
df.iloc[sampler]
3
21
22
23
24
25
26
27
1
7
8
9
10
11
12
13
4
28
29
30
31
32
33
34
2
14
15
16
17
18
19
20
0
0
1
2
3
4
5
6
column_sampler = np.random.permutation(7 )
column_sampler
df.take(column_sampler, axis= "columns" )
0
4
6
3
2
1
0
5
1
11
13
10
9
8
7
12
2
18
20
17
16
15
14
19
3
25
27
24
23
22
21
26
4
32
34
31
30
29
28
33
2
14
15
16
17
18
19
20
4
28
29
30
31
32
33
34
0
0
1
2
3
4
5
6
choices = pd.Series([5 , 7 , - 1 , 6 , 4 ])
choices.sample(n= 10 , replace= True )
2 -1
0 5
3 6
1 7
4 4
0 5
4 4
0 5
4 4
4 4
dtype: int64
df = pd.DataFrame({"key" : ["b" , "b" , "a" , "c" , "a" , "b" ],
"data1" : range (6 )})
df
pd.get_dummies(df["key" ], dtype= float )
0
0.0
1.0
0.0
1
0.0
1.0
0.0
2
1.0
0.0
0.0
3
0.0
0.0
1.0
4
1.0
0.0
0.0
5
0.0
1.0
0.0
dummies = pd.get_dummies(df["key" ], prefix= "key" , dtype= float )
df_with_dummy = df[["data1" ]].join(dummies)
df_with_dummy
0
0
0.0
1.0
0.0
1
1
0.0
1.0
0.0
2
2
1.0
0.0
0.0
3
3
0.0
0.0
1.0
4
4
1.0
0.0
0.0
5
5
0.0
1.0
0.0
mnames = ["movie_id" , "title" , "genres" ]
movies = pd.read_table("datasets/movielens/movies.dat" , sep= "::" ,
header= None , names= mnames, engine= "python" )
movies[:10 ]
0
1
Toy Story (1995)
Animation|Children's|Comedy
1
2
Jumanji (1995)
Adventure|Children's|Fantasy
2
3
Grumpier Old Men (1995)
Comedy|Romance
3
4
Waiting to Exhale (1995)
Comedy|Drama
4
5
Father of the Bride Part II (1995)
Comedy
5
6
Heat (1995)
Action|Crime|Thriller
6
7
Sabrina (1995)
Comedy|Romance
7
8
Tom and Huck (1995)
Adventure|Children's
8
9
Sudden Death (1995)
Action
9
10
GoldenEye (1995)
Action|Adventure|Thriller
dummies = movies["genres" ].str .get_dummies("|" )
dummies.iloc[:10 , :6 ]
0
0
0
1
1
1
0
1
0
1
0
1
0
0
2
0
0
0
0
1
0
3
0
0
0
0
1
0
4
0
0
0
0
1
0
5
1
0
0
0
0
1
6
0
0
0
0
1
0
7
0
1
0
1
0
0
8
1
0
0
0
0
0
9
1
1
0
0
0
0
movies_windic = movies.join(dummies.add_prefix("Genre_" ))
movies_windic.iloc[0 ]
movie_id 1
title Toy Story (1995)
genres Animation|Children's|Comedy
Genre_Action 0
Genre_Adventure 0
Genre_Animation 1
Genre_Children's 1
Genre_Comedy 1
Genre_Crime 0
Genre_Documentary 0
Genre_Drama 0
Genre_Fantasy 0
Genre_Film-Noir 0
Genre_Horror 0
Genre_Musical 0
Genre_Mystery 0
Genre_Romance 0
Genre_Sci-Fi 0
Genre_Thriller 0
Genre_War 0
Genre_Western 0
Name: 0, dtype: object
np.random.seed(12345 ) # to make the example repeatable
values = np.random.uniform(size= 10 )
values
bins = [0 , 0.2 , 0.4 , 0.6 , 0.8 , 1 ]
pd.get_dummies(pd.cut(values, bins))
0
False
False
False
False
True
1
False
True
False
False
False
2
True
False
False
False
False
3
False
True
False
False
False
4
False
False
True
False
False
5
False
False
True
False
False
6
False
False
False
False
True
7
False
False
False
True
False
8
False
False
False
True
False
9
False
False
False
True
False
s = pd.Series([1 , 2 , 3 , None ])
s
s.dtype
s = pd.Series([1 , 2 , 3 , None ], dtype= pd.Int64Dtype())
s
s.isna()
s.dtype
s = pd.Series([1 , 2 , 3 , None ], dtype= "Int64" )
s = pd.Series(['one' , 'two' , None , 'three' ], dtype= pd.StringDtype())
s
0 one
1 two
2 <NA>
3 three
dtype: string
df = pd.DataFrame({"A" : [1 , 2 , None , 4 ],
"B" : ["one" , "two" , "three" , None ],
"C" : [False , None , False , True ]})
df
df["A" ] = df["A" ].astype("Int64" )
df["B" ] = df["B" ].astype("string" )
df["C" ] = df["C" ].astype("boolean" )
df
0
1
one
False
1
2
two
<NA>
2
<NA>
three
False
3
4
<NA>
True
문자열 조작
파이썬의 내장 문자열 메서드와 정규표현식을 사용하여 텍스트 데이터를 정제합니다.
val = "a,b, guido"
val.split("," )
pieces = [x.strip() for x in val.split("," )]
pieces
first, second, third = pieces
first + "::" + second + "::" + third
"guido" in val
val.index("," )
val.find(":" )
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[70] , line 1
----> 1 val . index ( " : " )
ValueError : substring not found
문자열 조작
텍스트 데이터를 정교하게 다루기 위한 정규표현식과 문자열 메서드를 학습합니다.
val.replace("," , "::" )
val.replace("," , "" )
import re
text = "foo bar \t baz \t qux"
re.split(r" \s + " , text)
['foo', 'bar', 'baz', 'qux']
regex = re.compile (r" \s + " )
regex.split(text)
['foo', 'bar', 'baz', 'qux']
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""
pattern = r" [A-Z0-9._%+-] + @ [A-Z0-9.-] + \. [A-Z] {2,4} "
# re.IGNORECASE makes the regex case insensitive
regex = re.compile (pattern, flags= re.IGNORECASE)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
m = regex.search(text)
m
text[m.start():m.end()]
print (regex.sub("REDACTED" , text))
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED
pattern = r" ( [A-Z0-9._%+-] + ) @ ( [A-Z0-9.-] + ) \. ( [A-Z] {2,4} ) "
regex = re.compile (pattern, flags= re.IGNORECASE)
m = regex.match("wesm@bright.net" )
m.groups()
('wesm', 'bright', 'net')
[('dave', 'google', 'com'),
('steve', 'gmail', 'com'),
('rob', 'gmail', 'com'),
('ryan', 'yahoo', 'com')]
print (regex.sub(r"Username: \1 , Domain: \2 , Suffix: \3 " , text))
Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com
data = {"Dave" : "dave@google.com" , "Steve" : "steve@gmail.com" ,
"Rob" : "rob@gmail.com" , "Wes" : np.nan}
data = pd.Series(data)
data
data.isna()
Dave False
Steve False
Rob False
Wes True
dtype: bool
data.str .contains("gmail" )
Dave False
Steve True
Rob True
Wes NaN
dtype: object
data_as_string_ext = data.astype('string' )
data_as_string_ext
data_as_string_ext.str .contains("gmail" )
Dave False
Steve True
Rob True
Wes <NA>
dtype: boolean
pattern = r" ( [A-Z0-9._%+-] + ) @ ( [A-Z0-9.-] + ) \. ( [A-Z] {2,4} ) "
data.str .findall(pattern, flags= re.IGNORECASE)
Dave [(dave, google, com)]
Steve [(steve, gmail, com)]
Rob [(rob, gmail, com)]
Wes NaN
dtype: object
matches = data.str .findall(pattern, flags= re.IGNORECASE).str [0 ]
matches
matches.str .get(1 )
Dave google
Steve gmail
Rob gmail
Wes NaN
dtype: object
Dave dave@
Steve steve
Rob rob@g
Wes NaN
dtype: object
data.str .extract(pattern, flags= re.IGNORECASE)
Dave
dave
google
com
Steve
steve
gmail
com
Rob
rob
gmail
com
Wes
NaN
NaN
NaN
values = pd.Series(['apple', 'orange', 'apple',
                    'apple'] * 2)
values
pd.unique(values)
# pd.value_counts is deprecated since pandas 2.1; the Series method is the
# supported spelling and produces the same counts.
values.value_counts()
apple 6
orange 2
Name: count, dtype: int64
values = pd.Series([0 , 1 , 0 , 0 ] * 2 )
dim = pd.Series(['apple' , 'orange' ])
values
dim
0 apple
1 orange
dtype: object
0 apple
1 orange
0 apple
0 apple
0 apple
1 orange
0 apple
0 apple
dtype: object
fruits = ['apple' , 'orange' , 'apple' , 'apple' ] * 2
N = len (fruits)
rng = np.random.default_rng(seed= 12345 )
df = pd.DataFrame({'fruit' : fruits,
'basket_id' : np.arange(N),
'count' : rng.integers(3 , 15 , size= N),
'weight' : rng.uniform(0 , 4 , size= N)},
columns= ['basket_id' , 'fruit' , 'count' , 'weight' ])
df
0
0
apple
11
1.564438
1
1
orange
5
1.331256
2
2
apple
12
2.393235
3
3
apple
6
0.746937
4
4
apple
5
2.691024
5
5
orange
12
3.767211
6
6
apple
10
0.992983
7
7
apple
11
3.795525
fruit_cat = df['fruit' ].astype('category' )
fruit_cat
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']
c = fruit_cat.array
type (c)
pandas.core.arrays.categorical.Categorical
array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)
dict (enumerate (c.categories))
{0: 'apple', 1: 'orange'}
df['fruit' ] = df['fruit' ].astype('category' )
df["fruit" ]
0 apple
1 orange
2 apple
3 apple
4 apple
5 orange
6 apple
7 apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']
my_categories = pd.Categorical(['foo' , 'bar' , 'baz' , 'foo' , 'bar' ])
my_categories
['foo', 'bar', 'baz', 'foo', 'bar']
Categories (3, object): ['bar', 'baz', 'foo']
categories = ['foo' , 'bar' , 'baz' ]
codes = [0 , 1 , 2 , 0 , 0 , 1 ]
my_cats_2 = pd.Categorical.from_codes(codes, categories)
my_cats_2
['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo', 'bar', 'baz']
ordered_cat = pd.Categorical.from_codes(codes, categories,
ordered= True )
ordered_cat
['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']
['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']
rng = np.random.default_rng(seed= 12345 )
draws = rng.standard_normal(1000 )
draws[:5 ]
array([-1.4238, 1.2637, -0.8707, -0.2592, -0.0753])
bins = pd.qcut(draws, 4 )
bins
[(-3.121, -0.675], (0.687, 3.211], (-3.121, -0.675], (-0.675, 0.0134], (-0.675, 0.0134], ..., (0.0134, 0.687], (0.0134, 0.687], (-0.675, 0.0134], (0.0134, 0.687], (-0.675, 0.0134]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.121, -0.675] < (-0.675, 0.0134] < (0.0134, 0.687] < (0.687, 3.211]]
bins = pd.qcut(draws, 4 , labels= ['Q1' , 'Q2' , 'Q3' , 'Q4' ])
bins
bins.codes[:10 ]
array([0, 3, 0, 1, 1, 0, 0, 2, 2, 0], dtype=int8)
bins = pd.Series(bins, name= 'quartile' )
results = (pd.Series(draws)
.groupby(bins)
.agg(['count' , 'min' , 'max' ])
.reset_index())
results
0
Q1
250
-3.119609
-0.678494
1
Q2
250
-0.673305
0.008009
2
Q3
250
0.018753
0.686183
3
Q4
250
0.688282
3.211418
0 Q1
1 Q2
2 Q3
3 Q4
Name: quartile, dtype: category
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
N = 10_000_000
labels = pd.Series(['foo' , 'bar' , 'baz' , 'qux' ] * (N // 4 ))
categories = labels.astype('category' )
labels.memory_usage(deep= True )
categories.memory_usage(deep= True )
% time _ = labels.astype('category' )
CPU times: user 136 ms, sys: 12.4 ms, total: 148 ms
Wall time: 148 ms
% timeit labels.value_counts()
% timeit categories.value_counts()
125 ms ± 151 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
15.9 ms ± 56.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
s = pd.Series(['a' , 'b' , 'c' , 'd' ] * 2 )
cat_s = s.astype('category' )
cat_s
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']
cat_s.cat.codes
cat_s.cat.categories
Index(['a', 'b', 'c', 'd'], dtype='object')
actual_categories = ['a' , 'b' , 'c' , 'd' , 'e' ]
cat_s2 = cat_s.cat.set_categories(actual_categories)
cat_s2
0 a
1 b
2 c
3 d
4 a
5 b
6 c
7 d
dtype: category
Categories (5, object): ['a', 'b', 'c', 'd', 'e']
cat_s.value_counts()
cat_s2.value_counts()
a 2
b 2
c 2
d 2
e 0
Name: count, dtype: int64
cat_s3 = cat_s[cat_s.isin(['a' , 'b' ])]
cat_s3
cat_s3.cat.remove_unused_categories()
0 a
1 b
4 a
5 b
dtype: category
Categories (2, object): ['a', 'b']
cat_s = pd.Series(['a' , 'b' , 'c' , 'd' ] * 2 , dtype= 'category' )
pd.get_dummies(cat_s, dtype= float )
0
1.0
0.0
0.0
0.0
1
0.0
1.0
0.0
0.0
2
0.0
0.0
1.0
0.0
3
0.0
0.0
0.0
1.0
4
1.0
0.0
0.0
0.0
5
0.0
1.0
0.0
0.0
6
0.0
0.0
1.0
0.0
7
0.0
0.0
0.0
1.0
pd.options.display.max_rows = PREVIOUS_MAX_ROWS