6장: 데이터 로딩, 저장, 파일 형식

파일 시스템이나 데이터베이스로부터 데이터를 읽어오고 저장하는 기술을 배웁니다.

# Notebook-wide setup: warning suppression, reproducible randomness,
# display options, and matplotlib configuration.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fix the RNG seed so the randomized examples below are reproducible.
np.random.seed(12345)

# Readable notebook output: figure size, column width/count, float precision.
plt.rc("figure", figsize=(10, 6))
pd.options.display.max_colwidth = 75
pd.options.display.max_columns = 20
np.set_printoptions(precision=4, suppress=True)

# Matplotlib Korean font setup (macOS); keep the minus sign renderable
# with a non-Latin font.
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

6.1 텍스트 형식 데이터 읽기

CSV, TXT 등 텍스트 기반 데이터를 pandas로 불러오는 방법을 알아봅니다.

!cat examples/ex1.csv
a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

6.2 텍스트 형식 데이터 읽기

가장 일반적인 CSV 파일과 구분자로 나뉜 텍스트 데이터를 읽는 방법을 익힙니다.

# Basic comma-delimited read; the first file line becomes the column header.
df = pd.read_csv("examples/ex1.csv")
df
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
!cat examples/ex2.csv
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
# header=None auto-numbers the columns; names= supplies them explicitly.
pd.read_csv("examples/ex2.csv", header=None)
pd.read_csv("examples/ex2.csv", names=["a", "b", "c", "d", "message"])
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
# index_col promotes the named column to the DataFrame index.
names = ["a", "b", "c", "d", "message"]
pd.read_csv("examples/ex2.csv", names=names, index_col="message")
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
!cat examples/csv_mindex.csv
# A list of columns for index_col builds a hierarchical (MultiIndex) index.
parsed = pd.read_csv("examples/csv_mindex.csv",
                     index_col=["key1", "key2"])
parsed
key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16
value1 value2
key1 key2
one a 1 2
b 3 4
c 5 6
d 7 8
two a 9 10
b 11 12
c 13 14
d 15 16
!cat examples/ex3.txt
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491
# Whitespace-delimited file: a regex separator handles variable spacing.
# Use a raw string -- "\s" in a plain string literal is an invalid escape
# (SyntaxWarning on Python 3.12+).
result = pd.read_csv("examples/ex3.txt", sep=r"\s+")
result
A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
!cat examples/ex4.csv
# skiprows drops the comment lines (rows 0, 2, 3) before parsing.
pd.read_csv("examples/ex4.csv", skiprows=[0, 2, 3])
# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
!cat examples/ex5.csv
# ex5.csv contains both empty fields and the sentinel string "NA";
# both are parsed as NaN by default.
result = pd.read_csv("examples/ex5.csv")
result
something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
# Boolean mask of missing values.
pd.isna(result)
something a b c d message
0 False False False False False True
1 False False False True False False
2 False False False False False False
# na_values adds extra strings to treat as missing, on top of the defaults.
result = pd.read_csv("examples/ex5.csv", na_values=["NULL"])
result
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
# keep_default_na=False disables the built-in NA sentinel list entirely,
# so "NA" and empty fields come through as plain strings.
result2 = pd.read_csv("examples/ex5.csv", keep_default_na=False)
result2
result2.isna()
# ...then opt back in to only the literal string "NA" as a sentinel.
result3 = pd.read_csv("examples/ex5.csv", keep_default_na=False,
                      na_values=["NA"])
result3
result3.isna()
something a b c d message
0 False False False False False True
1 False False False False False False
2 False False False False False False
# Per-column NA sentinels: a dict maps column name -> list of NA strings.
sentinels = {"message": ["foo", "NA"], "something": ["two"]}
pd.read_csv("examples/ex5.csv", na_values=sentinels,
            keep_default_na=False)
something a b c d message
0 one 1 2 3 4 NaN
1 NaN 5 6 8 world
2 three 9 10 11 12 NaN
# Truncate large DataFrame reprs to 10 rows for the examples that follow.
pd.options.display.max_rows = 10

6.3 텍스트 파일 조금씩 읽기

큰 파일을 조각(chunk) 단위로 읽어서 처리하는 방법을 살펴봅니다.

# ex6.csv has 10,000 rows; the repr below is truncated by max_rows.
result = pd.read_csv("examples/ex6.csv")
result
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
... ... ... ... ... ...
9995 2.311896 -0.417070 -1.409599 -0.515821 L
9996 -0.479893 -0.650419 0.745152 -0.646038 E
9997 0.523331 0.787112 0.486066 1.093156 K
9998 -0.362559 0.598894 -1.843201 0.887292 G
9999 -0.096376 -1.012999 -0.657431 -0.573315 0

10000 rows × 5 columns

# nrows reads only the first N data rows of the file.
pd.read_csv("examples/ex6.csv", nrows=5)
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q

6.4 텍스트 파일 조금씩 읽기

메모리 효율을 위해 큰 파일을 청크(chunk) 단위로 나누어 읽는 방법을 학습합니다.

# chunksize returns a TextFileReader that yields DataFrames lazily.
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)
type(chunker)
pandas.io.parsers.readers.TextFileReader
# Re-create the reader (the previous one was consumed by inspection).
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)

# Aggregate value counts of the "key" column across all chunks without
# ever loading the full file into memory; fill_value=0 aligns keys that
# are absent from a given chunk.
tot = pd.Series([], dtype='int64')
for piece in chunker:
    tot = tot.add(piece["key"].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)
tot[:10]
key
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64
# Reload the small NA example for the writing examples that follow.
data = pd.read_csv("examples/ex5.csv")
data
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo

6.5 데이터를 텍스트 형식으로 기록하기

데이터프레임을 CSV 파일로 내보내는 방법을 학습합니다.

# to_csv writes the index as the first (unnamed) column by default.
data.to_csv("examples/out.csv")
!cat examples/out.csv
,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo

6.6 데이터를 텍스트 형식으로 기록하기

분석 결과를 다양한 텍스트 형식으로 내보내는 기법을 알아봅니다.

import sys
# Writing to sys.stdout just prints the text; sep= changes the delimiter.
data.to_csv(sys.stdout, sep="|")
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
# na_rep substitutes a sentinel string for missing values on output.
data.to_csv(sys.stdout, na_rep="NULL")
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
# Suppress both the row index and the header row.
data.to_csv(sys.stdout, index=False, header=False)
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
# Write only a subset of columns, in the given order.
data.to_csv(sys.stdout, index=False, columns=["a", "b", "c"])
a,b,c
1,2,3.0
5,6,
9,10,11.0
!cat examples/ex7.csv
"a","b","c"
"1","2","3"
"1","2","3"
import csv

# Iterate the raw CSV rows (each row is a list of strings). A with-block
# guarantees the file handle is closed even if iteration raises; the
# original open()/close() pair leaked the handle on error.
with open("examples/ex7.csv") as f:
    for line in csv.reader(f):
        print(line)
['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']
# Load every row of the small CSV, then pivot the row-oriented data
# into a dict mapping each header field to a tuple of column values.
with open("examples/ex7.csv") as f:
    reader = csv.reader(f)
    lines = [row for row in reader]
header = lines[0]
values = lines[1:]
data_dict = dict(zip(header, zip(*values)))
data_dict
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}
# A JSON document embedded as a Python string; note JSON null, which
# json.loads maps to Python None.
obj = """
{"name": "Wes",
 "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]},
              {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]
}
"""

6.7 JSON 데이터 다루기

웹 API 등에서 흔히 사용하는 JSON 형식 데이터를 파싱하고 변환하는 방법을 다룹니다.

import json
# json.loads parses the JSON string into nested dicts/lists.
result = json.loads(obj)
result
{'name': 'Wes',
 'cities_lived': ['Akron', 'Nashville', 'New York', 'San Francisco'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 34, 'hobbies': ['guitars', 'soccer']},
  {'name': 'Katie', 'age': 42, 'hobbies': ['diving', 'art']}]}
# json.dumps serializes a Python object back to a JSON string.
asjson = json.dumps(result)
asjson
'{"name": "Wes", "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"], "pet": null, "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]}, {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]}'
# A list of dicts converts directly to a DataFrame; columns= selects fields.
siblings = pd.DataFrame(result["siblings"], columns=["name", "age"])
siblings
name age
0 Scott 34
1 Katie 42
!cat examples/example.json
[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]

6.8 JSON 데이터

웹 환경에서 표준처럼 쓰이는 JSON 데이터를 pandas 객체로 변환합니다.

# read_json treats each object in a top-level JSON array as one row.
data = pd.read_json("examples/example.json")
data
a b c
0 1 2 3
1 4 5 6
2 7 8 9
# Default orient="columns"; orient="records" emits one object per row.
data.to_json(sys.stdout)
data.to_json(sys.stdout, orient="records")
{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]
# read_html parses every <table> on the page into a list of DataFrames.
tables = pd.read_html("examples/fdic_failed_bank_list.html")
len(tables)
failures = tables[0]
failures.head()
Bank Name City ST CERT Acquiring Institution Closing Date Updated Date
0 Allied Bank Mulberry AR 91 Today's Bank September 23, 2016 November 17, 2016
1 The Woodbury Banking Company Woodbury GA 11297 United Bank August 19, 2016 November 17, 2016
2 First CornerStone Bank King of Prussia PA 35312 First-Citizens Bank & Trust Company May 6, 2016 September 6, 2016
3 Trust Company Bank Memphis TN 9956 The Bank of Fayette County April 29, 2016 September 6, 2016
4 North Milwaukee State Bank Milwaukee WI 20364 First-Citizens Bank & Trust Company March 11, 2016 June 16, 2016
# Parse the closing dates, then count bank failures per calendar year.
close_timestamps = pd.to_datetime(failures["Closing Date"])
close_timestamps.dt.year.value_counts()
Closing Date
2010    157
2009    140
2011     92
2012     51
2008     25
       ... 
2004      4
2001      4
2007      3
2003      3
2000      2
Name: count, Length: 15, dtype: int64
from lxml import objectify

path = "datasets/mta_perf/Performance_MNR.xml"
with open(path) as f:
    parsed = objectify.parse(f)
root = parsed.getroot()
data = []

# Bookkeeping fields we don't want in the resulting DataFrame.
skip_fields = ["PARENT_SEQ", "INDICATOR_SEQ",
               "DESIRED_CHANGE", "DECIMAL_PLACES"]

# Each <INDICATOR> element becomes one row. Iterate the element directly
# instead of calling the deprecated getchildren() (removed from
# xml.etree.ElementTree and deprecated in lxml).
for elt in root.INDICATOR:
    el_data = {}
    for child in elt:
        if child.tag in skip_fields:
            continue
        # pyval converts the element's text to a native Python type.
        el_data[child.tag] = child.pyval
    data.append(el_data)
perf = pd.DataFrame(data)
perf.head()
AGENCY_NAME INDICATOR_NAME DESCRIPTION PERIOD_YEAR PERIOD_MONTH CATEGORY FREQUENCY INDICATOR_UNIT YTD_TARGET YTD_ACTUAL MONTHLY_TARGET MONTHLY_ACTUAL
0 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 1 Service Indicators M % 95.0 96.9 95.0 96.9
1 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 2 Service Indicators M % 95.0 96.0 95.0 95.0
2 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 3 Service Indicators M % 95.0 96.3 95.0 96.9
3 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 4 Service Indicators M % 95.0 96.8 95.0 98.3
4 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 5 Service Indicators M % 95.0 96.6 95.0 95.8
# pd.read_xml flattens the same document in one call (keeps all fields).
perf2 = pd.read_xml(path)
perf2.head()
INDICATOR_SEQ PARENT_SEQ AGENCY_NAME INDICATOR_NAME DESCRIPTION PERIOD_YEAR PERIOD_MONTH CATEGORY FREQUENCY DESIRED_CHANGE INDICATOR_UNIT DECIMAL_PLACES YTD_TARGET YTD_ACTUAL MONTHLY_TARGET MONTHLY_ACTUAL
0 28445 NaN Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 1 Service Indicators M U % 1 95.00 96.90 95.00 96.90
1 28445 NaN Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 2 Service Indicators M U % 1 95.00 96.00 95.00 95.00
2 28445 NaN Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 3 Service Indicators M U % 1 95.00 96.30 95.00 96.90
3 28445 NaN Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 4 Service Indicators M U % 1 95.00 96.80 95.00 98.30
4 28445 NaN Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 5 Service Indicators M U % 1 95.00 96.60 95.00 95.80
frame = pd.read_csv("examples/ex1.csv")
frame
# Pickle is Python-only and version-sensitive; use for short-term storage.
frame.to_pickle("examples/frame_pickle")
pd.read_pickle("examples/frame_pickle")
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
!rm examples/frame_pickle
# Parquet: efficient columnar binary format (needs pyarrow or fastparquet).
fec = pd.read_parquet('datasets/fec/fec.parquet')
# ExcelFile requires the optional openpyxl engine for .xlsx files.
xlsx = pd.ExcelFile("examples/ex1.xlsx")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version)
    141 try:
--> 142     module = importlib.import_module(name)
    143 except ImportError:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'openpyxl'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[51], line 1
----> 1 xlsx = pd.ExcelFile("examples/ex1.xlsx")

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_base.py:1513, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options)
   1510 self.engine = engine
   1511 self.storage_options = storage_options
-> 1513 self._reader = self._engines[engine](self._io, storage_options=storage_options)

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:548, in OpenpyxlReader.__init__(self, filepath_or_buffer, storage_options)
    533 @doc(storage_options=_shared_docs["storage_options"])
    534 def __init__(
    535     self,
    536     filepath_or_buffer: FilePath | ReadBuffer[bytes],
    537     storage_options: StorageOptions = None,
    538 ) -> None:
    539     """
    540     Reader using openpyxl engine.
    541 
   (...)    546     {storage_options}
    547     """
--> 548     import_optional_dependency("openpyxl")
    549     super().__init__(filepath_or_buffer, storage_options=storage_options)

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version)
    143 except ImportError:
    144     if errors == "raise":
--> 145         raise ImportError(msg)
    146     return None
    148 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
xlsx.sheet_names
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[52], line 1
----> 1 xlsx.sheet_names

NameError: name 'xlsx' is not defined
xlsx.parse(sheet_name="Sheet1")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[53], line 1
----> 1 xlsx.parse(sheet_name="Sheet1")

NameError: name 'xlsx' is not defined
xlsx.parse(sheet_name="Sheet1", index_col=0)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[54], line 1
----> 1 xlsx.parse(sheet_name="Sheet1", index_col=0)

NameError: name 'xlsx' is not defined
# read_excel can take a path directly (it creates an ExcelFile internally).
frame = pd.read_excel("examples/ex1.xlsx", sheet_name="Sheet1")
frame
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version)
    141 try:
--> 142     module = importlib.import_module(name)
    143 except ImportError:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'openpyxl'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[55], line 1
----> 1 frame = pd.read_excel("examples/ex1.xlsx", sheet_name="Sheet1")
      2 frame

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_base.py:478, in read_excel(io, sheet_name, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, date_format, thousands, decimal, comment, skipfooter, storage_options, dtype_backend)
    476 if not isinstance(io, ExcelFile):
    477     should_close = True
--> 478     io = ExcelFile(io, storage_options=storage_options, engine=engine)
    479 elif engine and engine != io.engine:
    480     raise ValueError(
    481         "Engine should not be specified when passing "
    482         "an ExcelFile - ExcelFile already has the engine set"
    483     )

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_base.py:1513, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options)
   1510 self.engine = engine
   1511 self.storage_options = storage_options
-> 1513 self._reader = self._engines[engine](self._io, storage_options=storage_options)

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:548, in OpenpyxlReader.__init__(self, filepath_or_buffer, storage_options)
    533 @doc(storage_options=_shared_docs["storage_options"])
    534 def __init__(
    535     self,
    536     filepath_or_buffer: FilePath | ReadBuffer[bytes],
    537     storage_options: StorageOptions = None,
    538 ) -> None:
    539     """
    540     Reader using openpyxl engine.
    541 
   (...)    546     {storage_options}
    547     """
--> 548     import_optional_dependency("openpyxl")
    549     super().__init__(filepath_or_buffer, storage_options=storage_options)

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version)
    143 except ImportError:
    144     if errors == "raise":
--> 145         raise ImportError(msg)
    146     return None
    148 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
# Use ExcelWriter as a context manager so the workbook is finalized and the
# file handle closed even on error, and pass sheet_name by keyword -- all
# to_excel arguments after the writer become keyword-only in pandas 3.0.
with pd.ExcelWriter("examples/ex2.xlsx") as writer:
    frame.to_excel(writer, sheet_name="Sheet1")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[56], line 1
----> 1 writer = pd.ExcelWriter("examples/ex2.xlsx")
      2 frame.to_excel(writer, "Sheet1")
      3 writer.close()

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:56, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)
     43 def __init__(
     44     self,
     45     path: FilePath | WriteExcelBuffer | ExcelWriter,
   (...)     54 ) -> None:
     55     # Use the openpyxl module as the Excel writer.
---> 56     from openpyxl.workbook import Workbook
     58     engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
     60     super().__init__(
     61         path,
     62         mode=mode,
   (...)     65         engine_kwargs=engine_kwargs,
     66     )

ModuleNotFoundError: No module named 'openpyxl'
# to_excel with a path creates and closes the writer automatically.
frame.to_excel("examples/ex2.xlsx")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[57], line 1
----> 1 frame.to_excel("examples/ex2.xlsx")

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/core/generic.py:2252, in NDFrame.to_excel(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, inf_rep, freeze_panes, storage_options)
   2239 from pandas.io.formats.excel import ExcelFormatter
   2241 formatter = ExcelFormatter(
   2242     df,
   2243     na_rep=na_rep,
   (...)   2250     inf_rep=inf_rep,
   2251 )
-> 2252 formatter.write(
   2253     excel_writer,
   2254     sheet_name=sheet_name,
   2255     startrow=startrow,
   2256     startcol=startcol,
   2257     freeze_panes=freeze_panes,
   2258     engine=engine,
   2259     storage_options=storage_options,
   2260 )

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/formats/excel.py:934, in ExcelFormatter.write(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options)
    930     need_save = False
    931 else:
    932     # error: Cannot instantiate abstract class 'ExcelWriter' with abstract
    933     # attributes 'engine', 'save', 'supported_extensions' and 'write_cells'
--> 934     writer = ExcelWriter(  # type: ignore[abstract]
    935         writer, engine=engine, storage_options=storage_options
    936     )
    937     need_save = True
    939 try:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:56, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)
     43 def __init__(
     44     self,
     45     path: FilePath | WriteExcelBuffer | ExcelWriter,
   (...)     54 ) -> None:
     55     # Use the openpyxl module as the Excel writer.
---> 56     from openpyxl.workbook import Workbook
     58     engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
     60     super().__init__(
     61         path,
     62         mode=mode,
   (...)     65         engine_kwargs=engine_kwargs,
     66     )

ModuleNotFoundError: No module named 'openpyxl'
!rm examples/ex2.xlsx
rm: examples/ex2.xlsx: No such file or directory
!rm -f examples/mydata.h5
frame = pd.DataFrame({"a": np.random.standard_normal(100)})
# Requires the optional PyTables ("tables") dependency.
store = pd.HDFStore("examples/mydata.h5")
# Dict-like assignment stores objects in the default "fixed" format.
store["obj1"] = frame
store["obj1_col"] = frame["a"]
store
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version)
    141 try:
--> 142     module = importlib.import_module(name)
    143 except ImportError:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'tables'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[60], line 2
      1 frame = pd.DataFrame({"a": np.random.standard_normal(100)})
----> 2 store = pd.HDFStore("examples/mydata.h5")
      3 store["obj1"] = frame
      4 store["obj1_col"] = frame["a"]

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/pytables.py:560, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
    557 if "format" in kwargs:
    558     raise ValueError("format is not a defined argument for HDFStore")
--> 560 tables = import_optional_dependency("tables")
    562 if complib is not None and complib not in tables.filters.all_complibs:
    563     raise ValueError(
    564         f"complib only supports {tables.filters.all_complibs} compression."
    565     )

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version)
    143 except ImportError:
    144     if errors == "raise":
--> 145         raise ImportError(msg)
    146     return None
    148 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pytables'.  Use pip or conda to install pytables.
store["obj1"]
# "table" format is slower to write but supports where-clause queries.
store.put("obj2", frame, format="table")
store.select("obj2", where=["index >= 10 and index <= 15"])
store.close()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[62], line 1
----> 1 store.put("obj2", frame, format="table")
      2 store.select("obj2", where=["index >= 10 and index <= 15"])
      3 store.close()

NameError: name 'store' is not defined
# Pass the HDF5 key by keyword: pandas 2.2 deprecates positional arguments
# after the path for to_hdf (keyword-only from 3.0). format="table"
# enables the where-clause query in read_hdf below.
frame.to_hdf("examples/mydata.h5", key="obj3", format="table")
pd.read_hdf("examples/mydata.h5", key="obj3", where=["index < 5"])
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version)
    141 try:
--> 142     module = importlib.import_module(name)
    143 except ImportError:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'tables'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[63], line 1
----> 1 frame.to_hdf("examples/mydata.h5", "obj3", format="table")
      2 pd.read_hdf("examples/mydata.h5", "obj3", where=["index < 5"])

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/core/generic.py:2682, in NDFrame.to_hdf(self, path_or_buf, key, mode, complevel, complib, append, format, index, min_itemsize, nan_rep, dropna, data_columns, errors, encoding)
   2678 from pandas.io import pytables
   2680 # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected
   2681 # "Union[DataFrame, Series]" [arg-type]
-> 2682 pytables.to_hdf(
   2683     path_or_buf,
   2684     key,
   2685     self,  # type: ignore[arg-type]
   2686     mode=mode,
   2687     complevel=complevel,
   2688     complib=complib,
   2689     append=append,
   2690     format=format,
   2691     index=index,
   2692     min_itemsize=min_itemsize,
   2693     nan_rep=nan_rep,
   2694     dropna=dropna,
   2695     data_columns=data_columns,
   2696     errors=errors,
   2697     encoding=encoding,
   2698 )

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/pytables.py:302, in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, format, index, min_itemsize, nan_rep, dropna, data_columns, errors, encoding)
    300 path_or_buf = stringify_path(path_or_buf)
    301 if isinstance(path_or_buf, str):
--> 302     with HDFStore(
    303         path_or_buf, mode=mode, complevel=complevel, complib=complib
    304     ) as store:
    305         f(store)
    306 else:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/pytables.py:560, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
    557 if "format" in kwargs:
    558     raise ValueError("format is not a defined argument for HDFStore")
--> 560 tables = import_optional_dependency("tables")
    562 if complib is not None and complib not in tables.filters.all_complibs:
    563     raise ValueError(
    564         f"complib only supports {tables.filters.all_complibs} compression."
    565     )

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version)
    143 except ImportError:
    144     if errors == "raise":
--> 145         raise ImportError(msg)
    146     return None
    148 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pytables'.  Use pip or conda to install pytables.
import os
# Clean up the demo HDF5 file.
os.remove("examples/mydata.h5")
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[64], line 2
      1 import os
----> 2 os.remove("examples/mydata.h5")

FileNotFoundError: [Errno 2] No such file or directory: 'examples/mydata.h5'
import requests
# Fetch the 30 most recent open issues for pandas via the public GitHub API.
url = "https://api.github.com/repos/pandas-dev/pandas/issues"
resp = requests.get(url)
# Raise an HTTPError for non-2xx responses instead of failing later.
resp.raise_for_status()
resp
<Response [200]>
# The response body is a JSON array of issue objects.
data = resp.json()
data[0]["title"]
'BUG: Assigning pd.NA to StringDtype column causes "Unknown error: Wrapping .. failed" after pd.concat with PyArrow'
# Keep only a few fields of interest from each issue dict.
issues = pd.DataFrame(data, columns=["number", "title",
                                     "labels", "state"])
issues
number title labels state
0 64320 BUG: Assigning pd.NA to StringDtype column causes "Unknown error: Wrapp... [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ==', 'url': 'https://api.g... open
1 64319 DOC: clarify .loc Series-assignment alignment behavior [] open
2 64318 BUG: HDFStore.select(where=...) silently returns empty DataFrame with d... [] open
3 64317 TST: troubleshoot flaky linux plotting tests [] open
4 64316 TST: remove strict=False xfail from plotting test [] open
... ... ... ... ...
25 64285 ENH: Implement `series.argsort(order=, stable=)` [] open
26 64283 feat: add Series.str.splitlines and Series.str.expandtabs [] open
27 64282 BUG: enforce 1-dimensional input in pandas.array [] open
28 64281 BUG: test_dt64arr_cmp_arraylike_invalid[tzlocal] fails on Windows [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ==', 'url': 'https://api.g... open
29 64280 BUG: pandas.array accepts multidimensional arguments [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ==', 'url': 'https://api.g... open

30 rows × 4 columns

import sqlite3

# DDL for a small demo table.
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""

# connect() creates mydata.sqlite in the working directory if needed.
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
con.commit()
data = [("Atlanta", "Georgia", 1.25, 6),
        ("Tallahassee", "Florida", 2.6, 3),
        ("Sacramento", "California", 1.7, 5)]
# Parameterized insert -- ? placeholders avoid manual string formatting.
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

con.executemany(stmt, data)
con.commit()
# fetchall returns the whole result set as a list of tuples.
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()
rows
[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]
# cursor.description holds column metadata; index 0 of each entry is the name.
cursor.description
pd.DataFrame(rows, columns=[x[0] for x in cursor.description])
a b c d
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
# A SQLAlchemy engine lets pandas read query results straight into a DataFrame.
import sqlalchemy as sqla
db = sqla.create_engine("sqlite:///mydata.sqlite")
pd.read_sql("SELECT * FROM test", db)
a b c d
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
!rm mydata.sqlite