7  8장: 데이터 준비: 조인, 병합, 변형

여러 데이터셋을 결합하여 분석을 위한 통합 데이터를 만드는 과정을 다룹니다.

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
pd.options.display.max_columns = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)
import matplotlib.pyplot as plt
# Matplotlib 한글 폰트 설정 (macOS용)
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

7.1 계층적 색인

축 위에서 다중 색인을 사용하는 방법과 그 특성을 이해합니다.

7.2 계층적 색인

고차원 데이터를 다루기 위한 멀티 인덱싱 기능을 익힙니다.

data = pd.Series(np.random.uniform(size=9),
                 index=[["a", "a", "a", "b", "b", "c", "c", "d", "d"],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data
a  1    0.929616
   2    0.316376
   3    0.183919
b  1    0.204560
   3    0.567725
c  1    0.595545
   2    0.964515
d  2    0.653177
   3    0.748907
dtype: float64
data.index
MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )
data["b"]
data["b":"c"]
data.loc[["b", "d"]]
b  1    0.204560
   3    0.567725
d  2    0.653177
   3    0.748907
dtype: float64
data.loc[:, 2]
a    0.316376
c    0.964515
d    0.653177
dtype: float64
data.unstack()
1 2 3
a 0.929616 0.316376 0.183919
b 0.204560 NaN 0.567725
c 0.595545 0.964515 NaN
d NaN 0.653177 0.748907
data.unstack().stack()
a  1    0.929616
   2    0.316376
   3    0.183919
b  1    0.204560
   3    0.567725
c  1    0.595545
   2    0.964515
d  2    0.653177
   3    0.748907
dtype: float64
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                     index=[["a", "a", "b", "b"], [1, 2, 1, 2]],
                     columns=[["Ohio", "Ohio", "Colorado"],
                              ["Green", "Red", "Green"]])
frame
Ohio Colorado
Green Red Green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11

7.3 데이터 결합하기

공통 키를 기준으로 데이터를 병합(merge, join) 하거나 축을 따라 이어 붙이는(concat) 방법을 배웁니다.

frame.index.names = ["key1", "key2"]
frame.columns.names = ["state", "color"]
frame
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
frame.index.nlevels
2
frame["Ohio"]
color Green Red
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10
frame.swaplevel("key1", "key2")
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
frame.sort_index(level=1)
frame.swaplevel(0, 1).sort_index(level=0)
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11
frame.groupby(level="key2").sum()
frame.groupby(level="color", axis="columns").sum()
color Green Red
key1 key2
a 1 2 1
2 8 4
b 1 14 7
2 20 10
frame = pd.DataFrame({"a": range(7), "b": range(7, 0, -1),
                      "c": ["one", "one", "one", "two", "two",
                            "two", "two"],
                      "d": [0, 1, 2, 0, 1, 2, 3]})
frame
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3
frame2 = frame.set_index(["c", "d"])
frame2
a b
c d
one 0 0 7
1 1 6
2 2 5
two 0 3 4
1 4 3
2 5 2
3 6 1
frame.set_index(["c", "d"], drop=False)
a b c d
c d
one 0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
two 0 3 4 two 0
1 4 3 two 1
2 5 2 two 2
3 6 1 two 3
frame2.reset_index()
c d a b
0 one 0 0 7
1 one 1 1 6
2 one 2 2 5
3 two 0 3 4
4 two 1 4 3
5 two 2 5 2
6 two 3 6 1
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "a", "b"],
                    "data1": pd.Series(range(7), dtype="Int64")})
df2 = pd.DataFrame({"key": ["a", "b", "d"],
                    "data2": pd.Series(range(3), dtype="Int64")})
df1
df2
key data2
0 a 0
1 b 1
2 d 2

7.4 데이터 병합 (Merge)

공통된 키를 기준으로 행을 결합하는 강력한 병합 기능을 학습합니다.

pd.merge(df1, df2)
key data1 data2
0 b 0 1
1 b 1 1
2 b 6 1
3 a 2 0
4 a 4 0
5 a 5 0
pd.merge(df1, df2, on="key")
key data1 data2
0 b 0 1
1 b 1 1
2 b 6 1
3 a 2 0
4 a 4 0
5 a 5 0
df3 = pd.DataFrame({"lkey": ["b", "b", "a", "c", "a", "a", "b"],
                    "data1": pd.Series(range(7), dtype="Int64")})
df4 = pd.DataFrame({"rkey": ["a", "b", "d"],
                    "data2": pd.Series(range(3), dtype="Int64")})
pd.merge(df3, df4, left_on="lkey", right_on="rkey")
lkey data1 rkey data2
0 b 0 b 1
1 b 1 b 1
2 b 6 b 1
3 a 2 a 0
4 a 4 a 0
5 a 5 a 0
pd.merge(df1, df2, how="outer")
pd.merge(df3, df4, left_on="lkey", right_on="rkey", how="outer")
lkey data1 rkey data2
0 b 0 b 1
1 b 1 b 1
2 b 6 b 1
3 a 2 a 0
4 a 4 a 0
5 a 5 a 0
6 c 3 NaN <NA>
7 NaN <NA> d 2
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
                    "data1": pd.Series(range(6), dtype="Int64")})
df2 = pd.DataFrame({"key": ["a", "b", "a", "b", "d"],
                    "data2": pd.Series(range(5), dtype="Int64")})
df1
df2
pd.merge(df1, df2, on="key", how="left")
key data1 data2
0 b 0 1
1 b 0 3
2 b 1 1
3 b 1 3
4 a 2 0
5 a 2 2
6 c 3 <NA>
7 a 4 0
8 a 4 2
9 b 5 1
10 b 5 3
pd.merge(df1, df2, how="inner")
key data1 data2
0 b 0 1
1 b 0 3
2 b 1 1
3 b 1 3
4 b 5 1
5 b 5 3
6 a 2 0
7 a 2 2
8 a 4 0
9 a 4 2
left = pd.DataFrame({"key1": ["foo", "foo", "bar"],
                     "key2": ["one", "two", "one"],
                     "lval": pd.Series([1, 2, 3], dtype='Int64')})
right = pd.DataFrame({"key1": ["foo", "foo", "bar", "bar"],
                      "key2": ["one", "one", "one", "two"],
                      "rval": pd.Series([4, 5, 6, 7], dtype='Int64')})
pd.merge(left, right, on=["key1", "key2"], how="outer")
key1 key2 lval rval
0 foo one 1 4
1 foo one 1 5
2 foo two 2 <NA>
3 bar one 3 6
4 bar two <NA> 7
pd.merge(left, right, on="key1")
key1 key2_x lval key2_y rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
pd.merge(left, right, on="key1", suffixes=("_left", "_right"))
key1 key2_left lval key2_right rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
left1 = pd.DataFrame({"key": ["a", "b", "a", "a", "b", "c"],
                      "value": pd.Series(range(6), dtype="Int64")})
right1 = pd.DataFrame({"group_val": [3.5, 7]}, index=["a", "b"])
left1
right1
pd.merge(left1, right1, left_on="key", right_index=True)
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
pd.merge(left1, right1, left_on="key", right_index=True, how="outer")
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
5 c 5 NaN
lefth = pd.DataFrame({"key1": ["Ohio", "Ohio", "Ohio",
                               "Nevada", "Nevada"],
                      "key2": [2000, 2001, 2002, 2001, 2002],
                      "data": pd.Series(range(5), dtype="Int64")})
righth_index = pd.MultiIndex.from_arrays(
    [
        ["Nevada", "Nevada", "Ohio", "Ohio", "Ohio", "Ohio"],
        [2001, 2000, 2000, 2000, 2001, 2002]
    ]
)
righth = pd.DataFrame({"event1": pd.Series([0, 2, 4, 6, 8, 10], dtype="Int64",
                                           index=righth_index),
                       "event2": pd.Series([1, 3, 5, 7, 9, 11], dtype="Int64",
                                           index=righth_index)})
lefth
righth
event1 event2
Nevada 2001 0 1
2000 2 3
Ohio 2000 4 5
2000 6 7
2001 8 9
2002 10 11
pd.merge(lefth, righth, left_on=["key1", "key2"], right_index=True)
pd.merge(lefth, righth, left_on=["key1", "key2"],
         right_index=True, how="outer")
key1 key2 data event1 event2
0 Ohio 2000 0 4 5
0 Ohio 2000 0 6 7
1 Ohio 2001 1 8 9
2 Ohio 2002 2 10 11
3 Nevada 2001 3 0 1
4 Nevada 2002 4 <NA> <NA>
4 Nevada 2000 <NA> 2 3
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=["a", "c", "e"],
                     columns=["Ohio", "Nevada"]).astype("Int64")
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=["b", "c", "d", "e"],
                      columns=["Missouri", "Alabama"]).astype("Int64")
left2
right2
pd.merge(left2, right2, how="outer", left_index=True, right_index=True)
Ohio Nevada Missouri Alabama
a 1 2 <NA> <NA>
b <NA> <NA> 7 8
c 3 4 9 10
d <NA> <NA> 11 12
e 5 6 13 14
left2.join(right2, how="outer")
Ohio Nevada Missouri Alabama
a 1 2 <NA> <NA>
b <NA> <NA> 7 8
c 3 4 9 10
d <NA> <NA> 11 12
e 5 6 13 14
left1.join(right1, on="key")
key value group_val
0 a 0 3.5
1 b 1 7.0
2 a 2 3.5
3 a 3 3.5
4 b 4 7.0
5 c 5 NaN
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=["a", "c", "e", "f"],
                       columns=["New York", "Oregon"])
another
left2.join([right2, another])
left2.join([right2, another], how="outer")
Ohio Nevada Missouri Alabama New York Oregon
a 1 2 <NA> <NA> 7.0 8.0
c 3 4 9 10 9.0 10.0
e 5 6 13 14 11.0 12.0
b <NA> <NA> 7 8 NaN NaN
d <NA> <NA> 11 12 NaN NaN
f <NA> <NA> <NA> <NA> 16.0 17.0
arr = np.arange(12).reshape((3, 4))
arr
np.concatenate([arr, arr], axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
s1 = pd.Series([0, 1], index=["a", "b"], dtype="Int64")
s2 = pd.Series([2, 3, 4], index=["c", "d", "e"], dtype="Int64")
s3 = pd.Series([5, 6], index=["f", "g"], dtype="Int64")

7.5 데이터 이어 붙이기 (Concat)

축을 따라 여러 객체를 합치는 연산을 수행합니다.

s1
s2
s3
pd.concat([s1, s2, s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: Int64
pd.concat([s1, s2, s3], axis="columns")
0 1 2
a 0 <NA> <NA>
b 1 <NA> <NA>
c <NA> 2 <NA>
d <NA> 3 <NA>
e <NA> 4 <NA>
f <NA> <NA> 5
g <NA> <NA> 6
s4 = pd.concat([s1, s3])
s4
pd.concat([s1, s4], axis="columns")
pd.concat([s1, s4], axis="columns", join="inner")
0 1
a 0 0
b 1 1
result = pd.concat([s1, s1, s3], keys=["one", "two", "three"])
result
result.unstack()
a b f g
one 0 1 <NA> <NA>
two 0 1 <NA> <NA>
three <NA> <NA> 5 6
pd.concat([s1, s2, s3], axis="columns", keys=["one", "two", "three"])
one two three
a 0 <NA> <NA>
b 1 <NA> <NA>
c <NA> 2 <NA>
d <NA> 3 <NA>
e <NA> 4 <NA>
f <NA> <NA> 5
g <NA> <NA> 6
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=["a", "b", "c"],
                   columns=["one", "two"])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=["a", "c"],
                   columns=["three", "four"])
df1
df2
pd.concat([df1, df2], axis="columns", keys=["level1", "level2"])
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
pd.concat({"level1": df1, "level2": df2}, axis="columns")
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
pd.concat([df1, df2], axis="columns", keys=["level1", "level2"],
          names=["upper", "lower"])
upper level1 level2
lower one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
df1 = pd.DataFrame(np.random.standard_normal((3, 4)),
                   columns=["a", "b", "c", "d"])
df2 = pd.DataFrame(np.random.standard_normal((2, 3)),
                   columns=["b", "d", "a"])
df1
df2
b d a
0 1.059626 0.644448 -0.007799
1 -0.449204 2.448963 0.667226
pd.concat([df1, df2], ignore_index=True)
a b c d
0 1.248804 0.774191 -0.319657 -0.624964
1 1.078814 0.544647 0.855588 1.343268
2 -0.267175 1.793095 -0.652929 -1.886837
3 -0.007799 1.059626 NaN 0.644448
4 0.667226 -0.449204 NaN 2.448963
a = pd.Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],
              index=["f", "e", "d", "c", "b", "a"])
b = pd.Series([0., np.nan, 2., np.nan, np.nan, 5.],
              index=["a", "b", "c", "d", "e", "f"])
a
b
np.where(pd.isna(a), b, a)
array([0. , 2.5, 0. , 3.5, 4.5, 5. ])
a.combine_first(b)
a    0.0
b    4.5
c    3.5
d    0.0
e    2.5
f    5.0
dtype: float64
df1 = pd.DataFrame({"a": [1., np.nan, 5., np.nan],
                    "b": [np.nan, 2., np.nan, 6.],
                    "c": range(2, 18, 4)})
df2 = pd.DataFrame({"a": [5., 4., np.nan, 3., 7.],
                    "b": [np.nan, 3., 4., 6., 8.]})
df1
df2
df1.combine_first(df2)
a b c
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 6.0 14.0
4 7.0 8.0 NaN
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(["Ohio", "Colorado"], name="state"),
                    columns=pd.Index(["one", "two", "three"],
                    name="number"))
data
number one two three
state
Ohio 0 1 2
Colorado 3 4 5

7.6 재형성과 피벗

데이터의 레이아웃을 바꾸는 stack, unstack, pivot 기능을 학습합니다.

7.7 재형성과 피벗

데이터의 레이아웃을 유연하게 조정하는 기법을 알아봅니다.

result = data.stack()
result
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64
result.unstack()
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
result.unstack(level=0)
result.unstack(level="state")
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
s1 = pd.Series([0, 1, 2, 3], index=["a", "b", "c", "d"], dtype="Int64")
s2 = pd.Series([4, 5, 6], index=["c", "d", "e"], dtype="Int64")
data2 = pd.concat([s1, s2], keys=["one", "two"])
data2
one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: Int64
data2.unstack()
data2.unstack().stack()
data2.unstack().stack(dropna=False)
one  a       0
     b       1
     c       2
     d       3
     e    <NA>
two  a    <NA>
     b    <NA>
     c       4
     d       5
     e       6
dtype: Int64
df = pd.DataFrame({"left": result, "right": result + 5},
                  columns=pd.Index(["left", "right"], name="side"))
df
df.unstack(level="state")
side left right
state Ohio Colorado Ohio Colorado
number
one 0 3 5 8
two 1 4 6 9
three 2 5 7 10
df.unstack(level="state").stack(level="side")
state Colorado Ohio
number side
one left 3 0
right 8 5
two left 4 1
right 9 6
three left 5 2
right 10 7
data = pd.read_csv("examples/macrodata.csv")
data = data.loc[:, ["year", "quarter", "realgdp", "infl", "unemp"]]
data.head()
year quarter realgdp infl unemp
0 1959 1 2710.349 0.00 5.8
1 1959 2 2778.801 2.34 5.1
2 1959 3 2775.488 2.74 5.3
3 1959 4 2785.204 0.27 5.6
4 1960 1 2847.699 2.31 5.2
periods = pd.PeriodIndex(year=data.pop("year"),
                         quarter=data.pop("quarter"),
                         name="date")
periods
data.index = periods.to_timestamp("D")
data.head()
realgdp infl unemp
date
1959-01-01 2710.349 0.00 5.8
1959-04-01 2778.801 2.34 5.1
1959-07-01 2775.488 2.74 5.3
1959-10-01 2785.204 0.27 5.6
1960-01-01 2847.699 2.31 5.2
data = data.reindex(columns=["realgdp", "infl", "unemp"])
data.columns.name = "item"
data.head()
item realgdp infl unemp
date
1959-01-01 2710.349 0.00 5.8
1959-04-01 2778.801 2.34 5.1
1959-07-01 2775.488 2.74 5.3
1959-10-01 2785.204 0.27 5.6
1960-01-01 2847.699 2.31 5.2
long_data = (data.stack()
             .reset_index()
             .rename(columns={0: "value"}))
long_data[:10]
date item value
0 1959-01-01 realgdp 2710.349
1 1959-01-01 infl 0.000
2 1959-01-01 unemp 5.800
3 1959-04-01 realgdp 2778.801
4 1959-04-01 infl 2.340
5 1959-04-01 unemp 5.100
6 1959-07-01 realgdp 2775.488
7 1959-07-01 infl 2.740
8 1959-07-01 unemp 5.300
9 1959-10-01 realgdp 2785.204
pivoted = long_data.pivot(index="date", columns="item",
                          values="value")
pivoted.head()
item infl realgdp unemp
date
1959-01-01 0.00 2710.349 5.8
1959-04-01 2.34 2778.801 5.1
1959-07-01 2.74 2775.488 5.3
1959-10-01 0.27 2785.204 5.6
1960-01-01 2.31 2847.699 5.2
long_data.index.name = None
long_data["value2"] = np.random.standard_normal(len(long_data))
long_data[:10]
date item value value2
0 1959-01-01 realgdp 2710.349 0.802926
1 1959-01-01 infl 0.000 0.575721
2 1959-01-01 unemp 5.800 1.381918
3 1959-04-01 realgdp 2778.801 0.000992
4 1959-04-01 infl 2.340 -0.143492
5 1959-04-01 unemp 5.100 -0.206282
6 1959-07-01 realgdp 2775.488 -0.222392
7 1959-07-01 infl 2.740 -1.682403
8 1959-07-01 unemp 5.300 1.811659
9 1959-10-01 realgdp 2785.204 -0.351305
pivoted = long_data.pivot(index="date", columns="item")
pivoted.head()
pivoted["value"].head()
item infl realgdp unemp
date
1959-01-01 0.00 2710.349 5.8
1959-04-01 2.34 2778.801 5.1
1959-07-01 2.74 2775.488 5.3
1959-10-01 0.27 2785.204 5.6
1960-01-01 2.31 2847.699 5.2
unstacked = long_data.set_index(["date", "item"]).unstack(level="item")
unstacked.head()
value value2
item infl realgdp unemp infl realgdp unemp
date
1959-01-01 0.00 2710.349 5.8 0.575721 0.802926 1.381918
1959-04-01 2.34 2778.801 5.1 -0.143492 0.000992 -0.206282
1959-07-01 2.74 2775.488 5.3 -1.682403 -0.222392 1.811659
1959-10-01 0.27 2785.204 5.6 0.128317 -0.351305 -1.313554
1960-01-01 2.31 2847.699 5.2 -0.615939 0.498327 0.174072
df = pd.DataFrame({"key": ["foo", "bar", "baz"],
                   "A": [1, 2, 3],
                   "B": [4, 5, 6],
                   "C": [7, 8, 9]})
df
key A B C
0 foo 1 4 7
1 bar 2 5 8
2 baz 3 6 9
melted = pd.melt(df, id_vars="key")
melted
key variable value
0 foo A 1
1 bar A 2
2 baz A 3
3 foo B 4
4 bar B 5
5 baz B 6
6 foo C 7
7 bar C 8
8 baz C 9
reshaped = melted.pivot(index="key", columns="variable",
                        values="value")
reshaped
variable A B C
key
bar 2 5 8
baz 3 6 9
foo 1 4 7
reshaped.reset_index()
variable key A B C
0 bar 2 5 8
1 baz 3 6 9
2 foo 1 4 7
pd.melt(df, id_vars="key", value_vars=["A", "B"])
key variable value
0 foo A 1
1 bar A 2
2 baz A 3
3 foo B 4
4 bar B 5
5 baz B 6
pd.melt(df, value_vars=["A", "B", "C"])
pd.melt(df, value_vars=["key", "A", "B"])
variable value
0 key foo
1 key bar
2 key baz
3 A 1
4 A 2
5 A 3
6 B 4
7 B 5
8 B 6