# Notebook preamble: silence warnings, import the numeric/plotting stack, and
# set display defaults used by the rest of the session.
import warnings
warnings.filterwarnings('ignore')  # hide every warning category
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))  # default figure size
# Remember the current pandas row limit before it is overridden below, so the
# original value can be put back later.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
# Print NumPy floats with 4 digits of precision and without scientific notation.
np.set_printoptions(precision=4, suppress=True)
# (repeats the matplotlib import above; harmless)
import matplotlib.pyplot as plt
# Matplotlib Korean font setup (for macOS)
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

13 부록 A: 고급 NumPy
NumPy의 더 깊은 이해를 위한 고급 주제들을 다룹니다.
rng = np.random.default_rng(seed=12345)

13.1 ndarray 객체의 구조
배열의 메모리 레이아웃과 스트라이드(stride) 개념을 이해합니다.
np.ones((10, 5)).shape(10, 5)
np.ones((3, 4, 5), dtype=np.float64).strides(160, 40, 8)
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)True
np.float64.mro()[numpy.float64,
numpy.floating,
numpy.inexact,
numpy.number,
numpy.generic,
float,
object]
np.issubdtype(ints.dtype, np.number)True
13.2 고급 배열 조작
Reshape, Concatenate, Split 등 더 세밀한 배열 제어 방법을 학습합니다.
arr = np.arange(8)
arr
arr.reshape((4, 2))array([[0, 1],
[2, 3],
[4, 5],
[6, 7]])
arr.reshape((4, 2)).reshape((2, 4))array([[0, 1, 2, 3],
[4, 5, 6, 7]])
arr = np.arange(15)
arr.reshape((5, -1))array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
other_arr = np.ones((3, 5))
other_arr.shape
arr.reshape(other_arr.shape)array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14]])
arr = np.arange(15).reshape((5, 3))
arr
arr.ravel()array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
arr.flatten()array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
arr = np.arange(12).reshape((3, 4))
arr
arr.ravel()
arr.ravel('F')array([ 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11])
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)
np.concatenate([arr1, arr2], axis=1)array([[ 1, 2, 3, 7, 8, 9],
[ 4, 5, 6, 10, 11, 12]])
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))array([[ 1, 2, 3, 7, 8, 9],
[ 4, 5, 6, 10, 11, 12]])
arr = rng.standard_normal((5, 2))
arr
first, second, third = np.split(arr, [1, 3])
first
second
thirdarray([[-1.3678, 0.6489],
[ 0.3611, -1.9529]])
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = rng.standard_normal((3, 2))
np.r_[arr1, arr2]
np.c_[np.r_[arr1, arr2], arr]array([[ 0. , 1. , 0. ],
[ 2. , 3. , 1. ],
[ 4. , 5. , 2. ],
[ 2.3474, 0.9685, 3. ],
[-0.7594, 0.9022, 4. ],
[-0.467 , -0.0607, 5. ]])
np.c_[1:6, -10:-5]array([[ 1, -10],
[ 2, -9],
[ 3, -8],
[ 4, -7],
[ 5, -6]])
arr = np.arange(3)
arr
arr.repeat(3)array([0, 0, 0, 1, 1, 1, 2, 2, 2])
arr.repeat([2, 3, 4])array([0, 0, 1, 1, 1, 2, 2, 2, 2])
arr = rng.standard_normal((2, 2))
arr
arr.repeat(2, axis=0)array([[ 0.7888, -1.2567],
[ 0.7888, -1.2567],
[ 0.5759, 1.399 ],
[ 0.5759, 1.399 ]])
arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)array([[ 0.7888, 0.7888, -1.2567, -1.2567, -1.2567],
[ 0.5759, 0.5759, 1.399 , 1.399 , 1.399 ]])
arr
np.tile(arr, 2)array([[ 0.7888, -1.2567, 0.7888, -1.2567],
[ 0.5759, 1.399 , 0.5759, 1.399 ]])
arr
np.tile(arr, (2, 1))
np.tile(arr, (3, 2))array([[ 0.7888, -1.2567, 0.7888, -1.2567],
[ 0.5759, 1.399 , 0.5759, 1.399 ],
[ 0.7888, -1.2567, 0.7888, -1.2567],
[ 0.5759, 1.399 , 0.5759, 1.399 ],
[ 0.7888, -1.2567, 0.7888, -1.2567],
[ 0.5759, 1.399 , 0.5759, 1.399 ]])
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]array([700, 100, 200, 600])
arr.take(inds)
arr.put(inds, 42)
arr
arr.put(inds, [40, 41, 42, 43])
arrarray([ 0, 41, 42, 300, 400, 500, 43, 40, 800, 900])
inds = [2, 0, 2, 1]
arr = rng.standard_normal((2, 4))
arr
arr.take(inds, axis=1)array([[ 0.9029, 1.3223, 0.9029, -0.2997],
[-1.3436, -0.1582, -1.3436, 0.4495]])
arr = np.arange(5)
arr
arr * 4array([ 0, 4, 8, 12, 16])
arr = rng.standard_normal((4, 3))
arr.mean(0)
demeaned = arr - arr.mean(0)
demeaned
demeaned.mean(0)array([ 0., -0., 0.])
arr
row_means = arr.mean(1)
row_means.shape
row_means.reshape((4, 1))
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)array([-0., 0., 0., 0.])
arr - arr.mean(1)--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[32], line 1 ----> 1 arr - arr.mean(1) ValueError: operands could not be broadcast together with shapes (4,3) (4,)
arr - arr.mean(1).reshape((4, 1))array([[ 0.018 , 0.9114, -0.9294],
[ 1.2752, -0.5124, -0.7628],
[-1.3727, 0.5811, 0.7915],
[-0.1155, -0.6854, 0.8009]])
arr = np.zeros((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape
arr_1d = rng.standard_normal(3)
arr_1d[:, np.newaxis]
arr_1d[np.newaxis, :]array([[ 0.3129, -0.1308, 1.27 ]])
arr = rng.standard_normal((3, 4, 5))
depth_means = arr.mean(2)
depth_means
depth_means.shape
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)array([[ 0., -0., 0., -0.],
[ 0., -0., -0., -0.],
[ 0., 0., 0., 0.]])
arr = np.zeros((4, 3))
arr[:] = 5
arrarray([[5., 5., 5.],
[5., 5., 5.],
[5., 5., 5.],
[5., 5., 5.]])
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr
arr[:2] = [[-1.37], [0.509]]
arrarray([[-1.37 , -1.37 , -1.37 ],
[ 0.509, 0.509, 0.509],
[ 0.44 , 0.44 , 0.44 ],
[ 1.6 , 1.6 , 1.6 ]])
arr = np.arange(10)
np.add.reduce(arr)
arr.sum()45
my_rng = np.random.default_rng(12346) # 재현성을 위해 시드 고정
arr = my_rng.standard_normal((5, 5))
arr
arr[::2].sort(1) # 몇몇 행을 정렬
arr[:, :-1] < arr[:, 1:]
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)array([ True, False, True, False, True])
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)array([[ 0, 1, 3, 6, 10],
[ 5, 11, 18, 26, 35],
[10, 21, 33, 46, 60]])
arr = np.arange(3).repeat([1, 2, 2])
arr
np.multiply.outer(arr, np.arange(5))array([[0, 0, 0, 0, 0],
[0, 1, 2, 3, 4],
[0, 1, 2, 3, 4],
[0, 2, 4, 6, 8],
[0, 2, 4, 6, 8]])
x, y = rng.standard_normal((3, 4)), rng.standard_normal(5)
result = np.subtract.outer(x, y)
result.shape(3, 4, 5)
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])array([10, 18, 17])
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, [0, 2, 4], axis=1)array([[ 0, 0, 0],
[ 1, 5, 4],
[ 2, 10, 8],
[ 3, 15, 12]])
def add_elements(x, y):
    """Return ``x + y``.

    A plain two-argument Python function; whatever ``+`` means for the
    operand types is what is returned.
    """
    return x + y
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))array([ 0., 2., 4., 6., 8., 10., 12., 14.])
arr = rng.standard_normal(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)553 μs ± 21.4 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.19 μs ± 33 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarrarray([(1.5 , 6), (3.1416, -2)], dtype=[('x', '<f8'), ('y', '<i4')])
sarr[0]
sarr[0]['y']6
sarr['x']array([1.5 , 3.1416])
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arrarray([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
dtype=[('x', '<i8', (3,)), ('y', '<i4')])
arr[0]['x']array([0, 0, 0])
arr['x']array([[0, 0, 0],
[0, 0, 0],
[0, 0, 0],
[0, 0, 0]])
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
data['x']['a']array([1., 3.])
arr = rng.standard_normal(6)
arr.sort()
arrarray([-1.1553, -0.9319, -0.5218, -0.4745, -0.1649, 0.03 ])
arr = rng.standard_normal((3, 5))
arr
arr[:, 0].sort() # 첫 번째 열의 값을 제자리에서 정렬
arrarray([[-1.1956, 0.4691, -0.3598, 1.0359, 0.2267],
[-0.7448, -0.5931, -1.055 , -0.0683, 0.458 ],
[-0.07 , 0.1462, -0.9944, 1.1436, 0.5026]])
arr = rng.standard_normal(5)
arr
np.sort(arr)
arrarray([ 0.8981, -1.1704, -0.2686, -0.796 , 1.4522])
arr = rng.standard_normal((3, 5))
arr
arr.sort(axis=1)
arrarray([[-0.6245, -0.2535, 0.3634, 1.1279, 2.1183],
[-1.2067, -0.6201, -0.2287, -0.1143, 1.6164],
[-2.1518, -1.3199, -1.0872, -0.6287, 0.083 ]])
arr[:, ::-1]array([[ 2.1183, 1.1279, 0.3634, -0.2535, -0.6245],
[ 1.6164, -0.1143, -0.2287, -0.6201, -1.2067],
[ 0.083 , -0.6287, -1.0872, -1.3199, -2.1518]])
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer
values[indexer]array([0, 1, 2, 3, 5])
arr = rng.standard_normal((3, 5))
arr[0] = values
arr
arr[:, arr[0].argsort()]array([[ 0. , 1. , 2. , 3. , 5. ],
[-2.1268, -1.391 , 0.4505, -0.4922, -0.7503],
[-1.0479, 0.9553, 0.5379, 0.2936, 0.8926]])
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))
sorter
list(zip(last_name[sorter], first_name[sorter]))[('Arnold', 'Jane'),
('Arnold', 'Steve'),
('Jones', 'Bill'),
('Jones', 'Bob'),
('Walters', 'Barbara')]
values = np.array(['2:first', '2:second', '1:first', '1:second',
'1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
values.take(indexer)array(['1:first', '1:second', '1:third', '2:first', '2:second'],
dtype='<U8')
rng = np.random.default_rng(12345)
arr = rng.standard_normal(20)
arr
np.partition(arr, 3)array([-1.9529, -1.4238, -1.3678, -1.2567, -0.8707, -0.7594, -0.7409,
-0.0607, 0.3611, -0.0753, -0.2592, -0.467 , 0.5759, 0.9022,
0.9685, 0.6489, 0.7888, 1.2637, 1.399 , 2.3474])
indices = np.argpartition(arr, 3)
indices
arr.take(indices)array([-1.9529, -1.4238, -1.3678, -1.2567, -0.8707, -0.7594, -0.7409,
-0.0607, 0.3611, -0.0753, -0.2592, -0.467 , 0.5759, 0.9022,
0.9685, 0.6489, 0.7888, 1.2637, 1.399 , 2.3474])
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)3
arr.searchsorted([0, 8, 11, 16])array([0, 3, 3, 5])
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
arr.searchsorted([0, 1], side='right')array([3, 7])
data = np.floor(rng.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
dataarray([ 815., 1598., 3401., 4651., 2664., 8157., 1932., 1294., 916.,
5985., 8547., 6016., 9319., 7247., 8605., 9293., 5461., 9376.,
4949., 2737., 4517., 6650., 3308., 9034., 2570., 3398., 2588.,
3554., 50., 6286., 2823., 680., 6168., 1763., 3043., 4408.,
1502., 2179., 4743., 4763., 2552., 2975., 2790., 2605., 4827.,
2119., 4956., 2462., 8384., 1801.])
labels = bins.searchsorted(data)
labelsarray([2, 3, 3, 3, 3, 4, 3, 3, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 4,
3, 4, 3, 3, 3, 3, 1, 4, 3, 2, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 4, 3])
pd.Series(data).groupby(labels).mean()1 50.000000
2 803.666667
3 3079.741935
4 7635.200000
dtype: float64
import numpy as np
def mean_distance(x, y):
    """Return the mean of the signed element-wise differences ``x[i] - y[i]``.

    The average is taken over the ``len(x)`` leading positions, so ``y`` must
    be indexable at every index that is valid for ``x``.

    Parameters
    ----------
    x : sequence supporting ``len`` and integer indexing
    y : sequence supporting integer indexing

    Returns
    -------
    The accumulated difference divided by ``len(x)``.

    Raises
    ------
    ZeroDivisionError
        If ``x`` is empty.
    """
    nx = len(x)
    result = 0.0
    # Index-based loop on purpose: iteration is driven by len(x) alone and the
    # differences are accumulated left to right, exactly as before.
    for i in range(nx):
        result += x[i] - y[i]
    # The original kept a separate counter that always ended equal to nx.
    return result / nx


# Create (mode='w+') a file-backed float64 array named 'mymmap'.
mmap = np.memmap('mymmap', dtype='float64', mode='w+',
                 shape=(10000, 10000))
mmapmemmap([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
# Slice out the first five rows and assign into that slice in place.
section = mmap[:5]
section[:] = rng.standard_normal((5, 10000))
# Write pending changes out to the backing file before dropping the map.
mmap.flush()
mmap
del mmap
# Re-open the same file. No mode is passed, so np.memmap's default ('r+')
# applies; dtype and shape are supplied again explicitly.
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmapmemmap([[-0.9074, -1.0954, 0.0071, ..., 0.2753, -1.1641, 0.8521],
[-0.0103, -0.0646, -1.0615, ..., -1.1003, 0.2505, 0.5832],
[ 0.4583, 1.2992, 1.7137, ..., 0.8691, -0.7889, -0.2431],
...,
[ 0. , 0. , 0. , ..., 0. , 0. , 0. ],
[ 0. , 0. , 0. , ..., 0. , 0. , 0. ],
[ 0. , 0. , 0. , ..., 0. , 0. , 0. ]])
%xdel mmap
!rm mymmaparr_c = np.ones((100, 10000), order='C')
arr_f = np.ones((100, 10000), order='F')
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguousTrue
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)115 μs ± 265 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
172 μs ± 805 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
arr_f.copy('C').flags C_CONTIGUOUS : True
F_CONTIGUOUS : False
OWNDATA : True
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False
arr_c[:50].flags.contiguous
arr_c[:, :50].flags C_CONTIGUOUS : False
F_CONTIGUOUS : False
OWNDATA : False
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False
%xdel arr_c
%xdel arr_fpd.options.display.max_rows = PREVIOUS_MAX_ROWS