여러 데이터셋을 결합하여 분석을 위한 통합 데이터를 만드는 과정을 다룹니다.
import warnings
warnings.filterwarnings('ignore' )
import numpy as np
import pandas as pd
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
pd.options.display.max_columns = 20
np.random.seed(12345 )
import matplotlib.pyplot as plt
plt.rc("figure" , figsize= (10 , 6 ))
np.set_printoptions(precision= 4 , suppress= True )
import matplotlib.pyplot as plt
# Matplotlib 한글 폰트 설정 (macOS용)
plt.rc('font' , family= 'AppleGothic' )
plt.rc('axes' , unicode_minus= False )
계층적 색인
축 위에서 다중 색인을 사용하는 방법과 그 특성을 이해합니다.
계층적 색인
고차원 데이터를 다루기 위한 멀티 인덱싱 기능을 익힙니다.
data = pd.Series(np.random.uniform(size= 9 ),
index= [["a" , "a" , "a" , "b" , "b" , "c" , "c" , "d" , "d" ],
[1 , 2 , 3 , 1 , 3 , 1 , 2 , 2 , 3 ]])
data
a 1 0.929616
2 0.316376
3 0.183919
b 1 0.204560
3 0.567725
c 1 0.595545
2 0.964515
d 2 0.653177
3 0.748907
dtype: float64
MultiIndex([('a', 1),
('a', 2),
('a', 3),
('b', 1),
('b', 3),
('c', 1),
('c', 2),
('d', 2),
('d', 3)],
)
data["b" ]
data["b" :"c" ]
data.loc[["b" , "d" ]]
b 1 0.204560
3 0.567725
d 2 0.653177
3 0.748907
dtype: float64
a 0.316376
c 0.964515
d 0.653177
dtype: float64
a
0.929616
0.316376
0.183919
b
0.204560
NaN
0.567725
c
0.595545
0.964515
NaN
d
NaN
0.653177
0.748907
a 1 0.929616
2 0.316376
3 0.183919
b 1 0.204560
3 0.567725
c 1 0.595545
2 0.964515
d 2 0.653177
3 0.748907
dtype: float64
frame = pd.DataFrame(np.arange(12 ).reshape((4 , 3 )),
index= [["a" , "a" , "b" , "b" ], [1 , 2 , 1 , 2 ]],
columns= [["Ohio" , "Ohio" , "Colorado" ],
["Green" , "Red" , "Green" ]])
frame
Green
Red
Green
a
1
0
1
2
2
3
4
5
b
1
6
7
8
2
9
10
11
데이터 결합하기
공통 키를 기준으로 데이터를 병합(merge, join) 하거나 축을 따라 이어 붙이는(concat) 방법을 배웁니다.
frame.index.names = ["key1" , "key2" ]
frame.columns.names = ["state" , "color" ]
frame
color
Green
Red
Green
a
1
0
1
2
2
3
4
5
b
1
6
7
8
2
9
10
11
key1
key2
a
1
0
1
2
3
4
b
1
6
7
2
9
10
frame.swaplevel("key1" , "key2" )
color
Green
Red
Green
1
a
0
1
2
2
a
3
4
5
1
b
6
7
8
2
b
9
10
11
frame.sort_index(level= 1 )
frame.swaplevel(0 , 1 ).sort_index(level= 0 )
color
Green
Red
Green
1
a
0
1
2
b
6
7
8
2
a
3
4
5
b
9
10
11
frame.groupby(level= "key2" ).sum ()
frame.groupby(level= "color" , axis= "columns" ).sum ()
key1
key2
a
1
2
1
2
8
4
b
1
14
7
2
20
10
frame = pd.DataFrame({"a" : range (7 ), "b" : range (7 , 0 , - 1 ),
"c" : ["one" , "one" , "one" , "two" , "two" ,
"two" , "two" ],
"d" : [0 , 1 , 2 , 0 , 1 , 2 , 3 ]})
frame
0
0
7
one
0
1
1
6
one
1
2
2
5
one
2
3
3
4
two
0
4
4
3
two
1
5
5
2
two
2
6
6
1
two
3
frame2 = frame.set_index(["c" , "d" ])
frame2
c
d
one
0
0
7
1
1
6
2
2
5
two
0
3
4
1
4
3
2
5
2
3
6
1
frame.set_index(["c" , "d" ], drop= False )
c
d
one
0
0
7
one
0
1
1
6
one
1
2
2
5
one
2
two
0
3
4
two
0
1
4
3
two
1
2
5
2
two
2
3
6
1
two
3
0
one
0
0
7
1
one
1
1
6
2
one
2
2
5
3
two
0
3
4
4
two
1
4
3
5
two
2
5
2
6
two
3
6
1
df1 = pd.DataFrame({"key" : ["b" , "b" , "a" , "c" , "a" , "a" , "b" ],
"data1" : pd.Series(range (7 ), dtype= "Int64" )})
df2 = pd.DataFrame({"key" : ["a" , "b" , "d" ],
"data2" : pd.Series(range (3 ), dtype= "Int64" )})
df1
df2
데이터 병합 (Merge)
공통된 키를 기준으로 행을 결합하는 강력한 병합 기능을 학습합니다.
0
b
0
1
1
b
1
1
2
b
6
1
3
a
2
0
4
a
4
0
5
a
5
0
pd.merge(df1, df2, on= "key" )
0
b
0
1
1
b
1
1
2
b
6
1
3
a
2
0
4
a
4
0
5
a
5
0
df3 = pd.DataFrame({"lkey" : ["b" , "b" , "a" , "c" , "a" , "a" , "b" ],
"data1" : pd.Series(range (7 ), dtype= "Int64" )})
df4 = pd.DataFrame({"rkey" : ["a" , "b" , "d" ],
"data2" : pd.Series(range (3 ), dtype= "Int64" )})
pd.merge(df3, df4, left_on= "lkey" , right_on= "rkey" )
0
b
0
b
1
1
b
1
b
1
2
b
6
b
1
3
a
2
a
0
4
a
4
a
0
5
a
5
a
0
pd.merge(df1, df2, how= "outer" )
pd.merge(df3, df4, left_on= "lkey" , right_on= "rkey" , how= "outer" )
0
b
0
b
1
1
b
1
b
1
2
b
6
b
1
3
a
2
a
0
4
a
4
a
0
5
a
5
a
0
6
c
3
NaN
<NA>
7
NaN
<NA>
d
2
df1 = pd.DataFrame({"key" : ["b" , "b" , "a" , "c" , "a" , "b" ],
"data1" : pd.Series(range (6 ), dtype= "Int64" )})
df2 = pd.DataFrame({"key" : ["a" , "b" , "a" , "b" , "d" ],
"data2" : pd.Series(range (5 ), dtype= "Int64" )})
df1
df2
pd.merge(df1, df2, on= "key" , how= "left" )
0
b
0
1
1
b
0
3
2
b
1
1
3
b
1
3
4
a
2
0
5
a
2
2
6
c
3
<NA>
7
a
4
0
8
a
4
2
9
b
5
1
10
b
5
3
pd.merge(df1, df2, how= "inner" )
0
b
0
1
1
b
0
3
2
b
1
1
3
b
1
3
4
b
5
1
5
b
5
3
6
a
2
0
7
a
2
2
8
a
4
0
9
a
4
2
left = pd.DataFrame({"key1" : ["foo" , "foo" , "bar" ],
"key2" : ["one" , "two" , "one" ],
"lval" : pd.Series([1 , 2 , 3 ], dtype= 'Int64' )})
right = pd.DataFrame({"key1" : ["foo" , "foo" , "bar" , "bar" ],
"key2" : ["one" , "one" , "one" , "two" ],
"rval" : pd.Series([4 , 5 , 6 , 7 ], dtype= 'Int64' )})
pd.merge(left, right, on= ["key1" , "key2" ], how= "outer" )
0
foo
one
1
4
1
foo
one
1
5
2
foo
two
2
<NA>
3
bar
one
3
6
4
bar
two
<NA>
7
pd.merge(left, right, on= "key1" )
0
foo
one
1
one
4
1
foo
one
1
one
5
2
foo
two
2
one
4
3
foo
two
2
one
5
4
bar
one
3
one
6
5
bar
one
3
two
7
pd.merge(left, right, on= "key1" , suffixes= ("_left" , "_right" ))
0
foo
one
1
one
4
1
foo
one
1
one
5
2
foo
two
2
one
4
3
foo
two
2
one
5
4
bar
one
3
one
6
5
bar
one
3
two
7
left1 = pd.DataFrame({"key" : ["a" , "b" , "a" , "a" , "b" , "c" ],
"value" : pd.Series(range (6 ), dtype= "Int64" )})
right1 = pd.DataFrame({"group_val" : [3.5 , 7 ]}, index= ["a" , "b" ])
left1
right1
pd.merge(left1, right1, left_on= "key" , right_index= True )
0
a
0
3.5
2
a
2
3.5
3
a
3
3.5
1
b
1
7.0
4
b
4
7.0
pd.merge(left1, right1, left_on= "key" , right_index= True , how= "outer" )
0
a
0
3.5
2
a
2
3.5
3
a
3
3.5
1
b
1
7.0
4
b
4
7.0
5
c
5
NaN
lefth = pd.DataFrame({"key1" : ["Ohio" , "Ohio" , "Ohio" ,
"Nevada" , "Nevada" ],
"key2" : [2000 , 2001 , 2002 , 2001 , 2002 ],
"data" : pd.Series(range (5 ), dtype= "Int64" )})
righth_index = pd.MultiIndex.from_arrays(
[
["Nevada" , "Nevada" , "Ohio" , "Ohio" , "Ohio" , "Ohio" ],
[2001 , 2000 , 2000 , 2000 , 2001 , 2002 ]
]
)
righth = pd.DataFrame({"event1" : pd.Series([0 , 2 , 4 , 6 , 8 , 10 ], dtype= "Int64" ,
index= righth_index),
"event2" : pd.Series([1 , 3 , 5 , 7 , 9 , 11 ], dtype= "Int64" ,
index= righth_index)})
lefth
righth
Nevada
2001
0
1
2000
2
3
Ohio
2000
4
5
2000
6
7
2001
8
9
2002
10
11
pd.merge(lefth, righth, left_on= ["key1" , "key2" ], right_index= True )
pd.merge(lefth, righth, left_on= ["key1" , "key2" ],
right_index= True , how= "outer" )
0
Ohio
2000
0
4
5
0
Ohio
2000
0
6
7
1
Ohio
2001
1
8
9
2
Ohio
2002
2
10
11
3
Nevada
2001
3
0
1
4
Nevada
2002
4
<NA>
<NA>
4
Nevada
2000
<NA>
2
3
left2 = pd.DataFrame([[1. , 2. ], [3. , 4. ], [5. , 6. ]],
index= ["a" , "c" , "e" ],
columns= ["Ohio" , "Nevada" ]).astype("Int64" )
right2 = pd.DataFrame([[7. , 8. ], [9. , 10. ], [11. , 12. ], [13 , 14 ]],
index= ["b" , "c" , "d" , "e" ],
columns= ["Missouri" , "Alabama" ]).astype("Int64" )
left2
right2
pd.merge(left2, right2, how= "outer" , left_index= True , right_index= True )
a
1
2
<NA>
<NA>
b
<NA>
<NA>
7
8
c
3
4
9
10
d
<NA>
<NA>
11
12
e
5
6
13
14
left2.join(right2, how= "outer" )
a
1
2
<NA>
<NA>
b
<NA>
<NA>
7
8
c
3
4
9
10
d
<NA>
<NA>
11
12
e
5
6
13
14
left1.join(right1, on= "key" )
0
a
0
3.5
1
b
1
7.0
2
a
2
3.5
3
a
3
3.5
4
b
4
7.0
5
c
5
NaN
another = pd.DataFrame([[7. , 8. ], [9. , 10. ], [11. , 12. ], [16. , 17. ]],
index= ["a" , "c" , "e" , "f" ],
columns= ["New York" , "Oregon" ])
another
left2.join([right2, another])
left2.join([right2, another], how= "outer" )
a
1
2
<NA>
<NA>
7.0
8.0
c
3
4
9
10
9.0
10.0
e
5
6
13
14
11.0
12.0
b
<NA>
<NA>
7
8
NaN
NaN
d
<NA>
<NA>
11
12
NaN
NaN
f
<NA>
<NA>
<NA>
<NA>
16.0
17.0
arr = np.arange(12 ).reshape((3 , 4 ))
arr
np.concatenate([arr, arr], axis= 1 )
array([[ 0, 1, 2, 3, 0, 1, 2, 3],
[ 4, 5, 6, 7, 4, 5, 6, 7],
[ 8, 9, 10, 11, 8, 9, 10, 11]])
s1 = pd.Series([0 , 1 ], index= ["a" , "b" ], dtype= "Int64" )
s2 = pd.Series([2 , 3 , 4 ], index= ["c" , "d" , "e" ], dtype= "Int64" )
s3 = pd.Series([5 , 6 ], index= ["f" , "g" ], dtype= "Int64" )
데이터 이어 붙이기 (Concat)
축을 따라 여러 객체를 합치는 연산을 수행합니다.
s1
s2
s3
pd.concat([s1, s2, s3])
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: Int64
pd.concat([s1, s2, s3], axis= "columns" )
a
0
<NA>
<NA>
b
1
<NA>
<NA>
c
<NA>
2
<NA>
d
<NA>
3
<NA>
e
<NA>
4
<NA>
f
<NA>
<NA>
5
g
<NA>
<NA>
6
s4 = pd.concat([s1, s3])
s4
pd.concat([s1, s4], axis= "columns" )
pd.concat([s1, s4], axis= "columns" , join= "inner" )
result = pd.concat([s1, s1, s3], keys= ["one" , "two" , "three" ])
result
result.unstack()
one
0
1
<NA>
<NA>
two
0
1
<NA>
<NA>
three
<NA>
<NA>
5
6
pd.concat([s1, s2, s3], axis= "columns" , keys= ["one" , "two" , "three" ])
a
0
<NA>
<NA>
b
1
<NA>
<NA>
c
<NA>
2
<NA>
d
<NA>
3
<NA>
e
<NA>
4
<NA>
f
<NA>
<NA>
5
g
<NA>
<NA>
6
df1 = pd.DataFrame(np.arange(6 ).reshape(3 , 2 ), index= ["a" , "b" , "c" ],
columns= ["one" , "two" ])
df2 = pd.DataFrame(5 + np.arange(4 ).reshape(2 , 2 ), index= ["a" , "c" ],
columns= ["three" , "four" ])
df1
df2
pd.concat([df1, df2], axis= "columns" , keys= ["level1" , "level2" ])
one
two
three
four
a
0
1
5.0
6.0
b
2
3
NaN
NaN
c
4
5
7.0
8.0
pd.concat({"level1" : df1, "level2" : df2}, axis= "columns" )
one
two
three
four
a
0
1
5.0
6.0
b
2
3
NaN
NaN
c
4
5
7.0
8.0
pd.concat([df1, df2], axis= "columns" , keys= ["level1" , "level2" ],
names= ["upper" , "lower" ])
lower
one
two
three
four
a
0
1
5.0
6.0
b
2
3
NaN
NaN
c
4
5
7.0
8.0
df1 = pd.DataFrame(np.random.standard_normal((3 , 4 )),
columns= ["a" , "b" , "c" , "d" ])
df2 = pd.DataFrame(np.random.standard_normal((2 , 3 )),
columns= ["b" , "d" , "a" ])
df1
df2
0
1.059626
0.644448
-0.007799
1
-0.449204
2.448963
0.667226
pd.concat([df1, df2], ignore_index= True )
0
1.248804
0.774191
-0.319657
-0.624964
1
1.078814
0.544647
0.855588
1.343268
2
-0.267175
1.793095
-0.652929
-1.886837
3
-0.007799
1.059626
NaN
0.644448
4
0.667226
-0.449204
NaN
2.448963
a = pd.Series([np.nan, 2.5 , 0.0 , 3.5 , 4.5 , np.nan],
index= ["f" , "e" , "d" , "c" , "b" , "a" ])
b = pd.Series([0. , np.nan, 2. , np.nan, np.nan, 5. ],
index= ["a" , "b" , "c" , "d" , "e" , "f" ])
a
b
np.where(pd.isna(a), b, a)
array([0. , 2.5, 0. , 3.5, 4.5, 5. ])
a 0.0
b 4.5
c 3.5
d 0.0
e 2.5
f 5.0
dtype: float64
df1 = pd.DataFrame({"a" : [1. , np.nan, 5. , np.nan],
"b" : [np.nan, 2. , np.nan, 6. ],
"c" : range (2 , 18 , 4 )})
df2 = pd.DataFrame({"a" : [5. , 4. , np.nan, 3. , 7. ],
"b" : [np.nan, 3. , 4. , 6. , 8. ]})
df1
df2
df1.combine_first(df2)
0
1.0
NaN
2.0
1
4.0
2.0
6.0
2
5.0
4.0
10.0
3
3.0
6.0
14.0
4
7.0
8.0
NaN
data = pd.DataFrame(np.arange(6 ).reshape((2 , 3 )),
index= pd.Index(["Ohio" , "Colorado" ], name= "state" ),
columns= pd.Index(["one" , "two" , "three" ],
name= "number" ))
data
state
Ohio
0
1
2
Colorado
3
4
5
재형성과 피벗
데이터의 레이아웃을 바꾸는 stack, unstack, pivot 기능을 학습합니다.
재형성과 피벗
데이터의 레이아웃을 유연하게 조정하는 기법을 알아봅니다.
result = data.stack()
result
state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int64
state
Ohio
0
1
2
Colorado
3
4
5
result.unstack(level= 0 )
result.unstack(level= "state" )
number
one
0
3
two
1
4
three
2
5
s1 = pd.Series([0 , 1 , 2 , 3 ], index= ["a" , "b" , "c" , "d" ], dtype= "Int64" )
s2 = pd.Series([4 , 5 , 6 ], index= ["c" , "d" , "e" ], dtype= "Int64" )
data2 = pd.concat([s1, s2], keys= ["one" , "two" ])
data2
one a 0
b 1
c 2
d 3
two c 4
d 5
e 6
dtype: Int64
data2.unstack()
data2.unstack().stack()
data2.unstack().stack(dropna= False )
one a 0
b 1
c 2
d 3
e <NA>
two a <NA>
b <NA>
c 4
d 5
e 6
dtype: Int64
df = pd.DataFrame({"left" : result, "right" : result + 5 },
columns= pd.Index(["left" , "right" ], name= "side" ))
df
df.unstack(level= "state" )
state
Ohio
Colorado
Ohio
Colorado
one
0
3
5
8
two
1
4
6
9
three
2
5
7
10
df.unstack(level= "state" ).stack(level= "side" )
number
side
one
left
3
0
right
8
5
two
left
4
1
right
9
6
three
left
5
2
right
10
7
data = pd.read_csv("examples/macrodata.csv" )
data = data.loc[:, ["year" , "quarter" , "realgdp" , "infl" , "unemp" ]]
data.head()
0
1959
1
2710.349
0.00
5.8
1
1959
2
2778.801
2.34
5.1
2
1959
3
2775.488
2.74
5.3
3
1959
4
2785.204
0.27
5.6
4
1960
1
2847.699
2.31
5.2
periods = pd.PeriodIndex(year= data.pop("year" ),
quarter= data.pop("quarter" ),
name= "date" )
periods
data.index = periods.to_timestamp("D" )
data.head()
date
1959-01-01
2710.349
0.00
5.8
1959-04-01
2778.801
2.34
5.1
1959-07-01
2775.488
2.74
5.3
1959-10-01
2785.204
0.27
5.6
1960-01-01
2847.699
2.31
5.2
data = data.reindex(columns= ["realgdp" , "infl" , "unemp" ])
data.columns.name = "item"
data.head()
date
1959-01-01
2710.349
0.00
5.8
1959-04-01
2778.801
2.34
5.1
1959-07-01
2775.488
2.74
5.3
1959-10-01
2785.204
0.27
5.6
1960-01-01
2847.699
2.31
5.2
long_data = (data.stack()
.reset_index()
.rename(columns= {0 : "value" }))
0
1959-01-01
realgdp
2710.349
1
1959-01-01
infl
0.000
2
1959-01-01
unemp
5.800
3
1959-04-01
realgdp
2778.801
4
1959-04-01
infl
2.340
5
1959-04-01
unemp
5.100
6
1959-07-01
realgdp
2775.488
7
1959-07-01
infl
2.740
8
1959-07-01
unemp
5.300
9
1959-10-01
realgdp
2785.204
pivoted = long_data.pivot(index= "date" , columns= "item" ,
values= "value" )
pivoted.head()
date
1959-01-01
0.00
2710.349
5.8
1959-04-01
2.34
2778.801
5.1
1959-07-01
2.74
2775.488
5.3
1959-10-01
0.27
2785.204
5.6
1960-01-01
2.31
2847.699
5.2
long_data.index.name = None
long_data["value2" ] = np.random.standard_normal(len (long_data))
long_data[:10 ]
0
1959-01-01
realgdp
2710.349
0.802926
1
1959-01-01
infl
0.000
0.575721
2
1959-01-01
unemp
5.800
1.381918
3
1959-04-01
realgdp
2778.801
0.000992
4
1959-04-01
infl
2.340
-0.143492
5
1959-04-01
unemp
5.100
-0.206282
6
1959-07-01
realgdp
2775.488
-0.222392
7
1959-07-01
infl
2.740
-1.682403
8
1959-07-01
unemp
5.300
1.811659
9
1959-10-01
realgdp
2785.204
-0.351305
pivoted = long_data.pivot(index= "date" , columns= "item" )
pivoted.head()
pivoted["value" ].head()
date
1959-01-01
0.00
2710.349
5.8
1959-04-01
2.34
2778.801
5.1
1959-07-01
2.74
2775.488
5.3
1959-10-01
0.27
2785.204
5.6
1960-01-01
2.31
2847.699
5.2
unstacked = long_data.set_index(["date" , "item" ]).unstack(level= "item" )
unstacked.head()
item
infl
realgdp
unemp
infl
realgdp
unemp
1959-01-01
0.00
2710.349
5.8
0.575721
0.802926
1.381918
1959-04-01
2.34
2778.801
5.1
-0.143492
0.000992
-0.206282
1959-07-01
2.74
2775.488
5.3
-1.682403
-0.222392
1.811659
1959-10-01
0.27
2785.204
5.6
0.128317
-0.351305
-1.313554
1960-01-01
2.31
2847.699
5.2
-0.615939
0.498327
0.174072
df = pd.DataFrame({"key" : ["foo" , "bar" , "baz" ],
"A" : [1 , 2 , 3 ],
"B" : [4 , 5 , 6 ],
"C" : [7 , 8 , 9 ]})
df
0
foo
1
4
7
1
bar
2
5
8
2
baz
3
6
9
melted = pd.melt(df, id_vars= "key" )
melted
0
foo
A
1
1
bar
A
2
2
baz
A
3
3
foo
B
4
4
bar
B
5
5
baz
B
6
6
foo
C
7
7
bar
C
8
8
baz
C
9
reshaped = melted.pivot(index= "key" , columns= "variable" ,
values= "value" )
reshaped
key
bar
2
5
8
baz
3
6
9
foo
1
4
7
0
bar
2
5
8
1
baz
3
6
9
2
foo
1
4
7
pd.melt(df, id_vars= "key" , value_vars= ["A" , "B" ])
0
foo
A
1
1
bar
A
2
2
baz
A
3
3
foo
B
4
4
bar
B
5
5
baz
B
6
pd.melt(df, value_vars= ["A" , "B" , "C" ])
pd.melt(df, value_vars= ["key" , "A" , "B" ])
0
key
foo
1
key
bar
2
key
baz
3
A
1
4
A
2
5
A
3
6
B
4
7
B
5
8
B
6