6장: 데이터 로딩, 저장, 파일 형식

파일 시스템이나 데이터베이스로부터 데이터를 읽어오고 저장하는 기술을 배웁니다.

# Notebook-wide setup: warning suppression, reproducible randomness,
# display options, and matplotlib configuration.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Fix the RNG seed so the randomized examples below are reproducible.
np.random.seed(12345)

# Readable notebook output: figure size, column width/count, float precision.
plt.rc("figure", figsize=(10, 6))
pd.options.display.max_colwidth = 75
pd.options.display.max_columns = 20
np.set_printoptions(precision=4, suppress=True)

# Matplotlib Korean font setup (macOS); keep the minus sign renderable
# with a non-Latin font.
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

6.1 텍스트 형식 데이터 읽기

CSV, TXT 등 텍스트 기반 데이터를 pandas로 불러오는 방법을 알아봅니다.

!cat examples/ex1.csv
a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

6.2 텍스트 형식 데이터 읽기

가장 일반적인 CSV 파일과 구분자로 나뉜 텍스트 데이터를 읽는 방법을 익힙니다.

# Basic comma-delimited read; the first file line becomes the column header.
df = pd.read_csv("examples/ex1.csv")
df
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
!cat examples/ex2.csv
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
# header=None auto-numbers the columns; names= supplies them explicitly.
pd.read_csv("examples/ex2.csv", header=None)
pd.read_csv("examples/ex2.csv", names=["a", "b", "c", "d", "message"])
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
# index_col promotes the named column to the DataFrame index.
names = ["a", "b", "c", "d", "message"]
pd.read_csv("examples/ex2.csv", names=names, index_col="message")
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12
!cat examples/csv_mindex.csv
# A list of columns for index_col builds a hierarchical (MultiIndex) index.
parsed = pd.read_csv("examples/csv_mindex.csv",
                     index_col=["key1", "key2"])
parsed
key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16
value1 value2
key1 key2
one a 1 2
b 3 4
c 5 6
d 7 8
two a 9 10
b 11 12
c 13 14
d 15 16
!cat examples/ex3.txt
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491
# Whitespace-delimited file: a regex separator handles variable spacing.
# Use a raw string -- "\s" in a plain string literal is an invalid escape
# (SyntaxWarning on Python 3.12+).
result = pd.read_csv("examples/ex3.txt", sep=r"\s+")
result
A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
!cat examples/ex4.csv
# skiprows drops the comment lines (rows 0, 2, 3) before parsing.
pd.read_csv("examples/ex4.csv", skiprows=[0, 2, 3])
# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
!cat examples/ex5.csv
# ex5.csv contains both empty fields and the sentinel string "NA";
# both are parsed as NaN by default.
result = pd.read_csv("examples/ex5.csv")
result
something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
# Boolean mask of missing values.
pd.isna(result)
something a b c d message
0 False False False False False True
1 False False False True False False
2 False False False False False False
# na_values adds extra strings to treat as missing, on top of the defaults.
result = pd.read_csv("examples/ex5.csv", na_values=["NULL"])
result
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo
# keep_default_na=False disables the built-in NA sentinel list entirely,
# so "NA" and empty fields come through as plain strings.
result2 = pd.read_csv("examples/ex5.csv", keep_default_na=False)
result2
result2.isna()
# ...then opt back in to only the literal string "NA" as a sentinel.
result3 = pd.read_csv("examples/ex5.csv", keep_default_na=False,
                      na_values=["NA"])
result3
result3.isna()
something a b c d message
0 False False False False False True
1 False False False False False False
2 False False False False False False
# Per-column NA sentinels: a dict maps column name -> list of NA strings.
sentinels = {"message": ["foo", "NA"], "something": ["two"]}
pd.read_csv("examples/ex5.csv", na_values=sentinels,
            keep_default_na=False)
something a b c d message
0 one 1 2 3 4 NaN
1 NaN 5 6 8 world
2 three 9 10 11 12 NaN
# Truncate large DataFrame reprs to 10 rows for the examples that follow.
pd.options.display.max_rows = 10

6.3 텍스트 파일 조금씩 읽기

큰 파일을 조각(chunk) 단위로 읽어서 처리하는 방법을 살펴봅니다.

# ex6.csv has 10,000 rows; the repr below is truncated by max_rows.
result = pd.read_csv("examples/ex6.csv")
result
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
... ... ... ... ... ...
9995 2.311896 -0.417070 -1.409599 -0.515821 L
9996 -0.479893 -0.650419 0.745152 -0.646038 E
9997 0.523331 0.787112 0.486066 1.093156 K
9998 -0.362559 0.598894 -1.843201 0.887292 G
9999 -0.096376 -1.012999 -0.657431 -0.573315 0

10000 rows × 5 columns

# nrows reads only the first N data rows of the file.
pd.read_csv("examples/ex6.csv", nrows=5)
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q

6.4 텍스트 파일 조금씩 읽기

메모리 효율을 위해 큰 파일을 청크(chunk) 단위로 나누어 읽는 방법을 학습합니다.

# chunksize returns a TextFileReader that yields DataFrames lazily.
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)
type(chunker)
pandas.io.parsers.readers.TextFileReader
# Re-create the reader (the previous one was consumed by inspection).
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)

# Aggregate value counts of the "key" column across all chunks without
# ever loading the full file into memory; fill_value=0 aligns keys that
# are absent from a given chunk.
tot = pd.Series([], dtype='int64')
for piece in chunker:
    tot = tot.add(piece["key"].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)
tot[:10]
key
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64
# Reload the small NA example for the writing examples that follow.
data = pd.read_csv("examples/ex5.csv")
data
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo

6.5 데이터를 텍스트 형식으로 기록하기

데이터프레임을 CSV 파일로 내보내는 방법을 학습합니다.

# to_csv writes the index as the first (unnamed) column by default.
data.to_csv("examples/out.csv")
!cat examples/out.csv
,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo

6.6 데이터를 텍스트 형식으로 기록하기

분석 결과를 다양한 텍스트 형식으로 내보내는 기법을 알아봅니다.

import sys
# Writing to sys.stdout just prints the text; sep= changes the delimiter.
data.to_csv(sys.stdout, sep="|")
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
# na_rep substitutes a sentinel string for missing values on output.
data.to_csv(sys.stdout, na_rep="NULL")
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
# Suppress both the row index and the header row.
data.to_csv(sys.stdout, index=False, header=False)
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
# Write only a subset of columns, in the given order.
data.to_csv(sys.stdout, index=False, columns=["a", "b", "c"])
a,b,c
1,2,3.0
5,6,
9,10,11.0
!cat examples/ex7.csv
"a","b","c"
"1","2","3"
"1","2","3"
import csv

# Iterate the raw CSV rows (each row is a list of strings). A with-block
# guarantees the file handle is closed even if iteration raises; the
# original open()/close() pair leaked the handle on error.
with open("examples/ex7.csv") as f:
    for line in csv.reader(f):
        print(line)
['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']
# Load every row of the small CSV, then pivot the row-oriented data
# into a dict mapping each header field to a tuple of column values.
with open("examples/ex7.csv") as f:
    reader = csv.reader(f)
    lines = [row for row in reader]
header = lines[0]
values = lines[1:]
data_dict = dict(zip(header, zip(*values)))
data_dict
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}
# A JSON document embedded as a Python string; note JSON null, which
# json.loads maps to Python None.
obj = """
{"name": "Wes",
 "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]},
              {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]
}
"""

6.7 JSON 데이터 다루기

웹 API 등에서 흔히 사용하는 JSON 형식 데이터를 파싱하고 변환하는 방법을 다룹니다.

import json
# json.loads parses the JSON string into nested dicts/lists.
result = json.loads(obj)
result
{'name': 'Wes',
 'cities_lived': ['Akron', 'Nashville', 'New York', 'San Francisco'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 34, 'hobbies': ['guitars', 'soccer']},
  {'name': 'Katie', 'age': 42, 'hobbies': ['diving', 'art']}]}
# json.dumps serializes a Python object back to a JSON string.
asjson = json.dumps(result)
asjson
'{"name": "Wes", "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"], "pet": null, "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]}, {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]}'
# A list of dicts converts directly to a DataFrame; columns= selects fields.
siblings = pd.DataFrame(result["siblings"], columns=["name", "age"])
siblings
name age
0 Scott 34
1 Katie 42
!cat examples/example.json
[{"a": 1, "b": 2, "c": 3},
 {"a": 4, "b": 5, "c": 6},
 {"a": 7, "b": 8, "c": 9}]

6.8 JSON 데이터

웹 환경에서 표준처럼 쓰이는 JSON 데이터를 pandas 객체로 변환합니다.

# read_json treats each object in a top-level JSON array as one row.
data = pd.read_json("examples/example.json")
data
a b c
0 1 2 3
1 4 5 6
2 7 8 9
# Default orient="columns"; orient="records" emits one object per row.
data.to_json(sys.stdout)
data.to_json(sys.stdout, orient="records")
{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]
# read_html parses every <table> on the page into a list of DataFrames.
tables = pd.read_html("examples/fdic_failed_bank_list.html")
len(tables)
failures = tables[0]
failures.head()
Bank Name City ST CERT Acquiring Institution Closing Date Updated Date
0 Allied Bank Mulberry AR 91 Today's Bank September 23, 2016 November 17, 2016
1 The Woodbury Banking Company Woodbury GA 11297 United Bank August 19, 2016 November 17, 2016
2 First CornerStone Bank King of Prussia PA 35312 First-Citizens Bank & Trust Company May 6, 2016 September 6, 2016
3 Trust Company Bank Memphis TN 9956 The Bank of Fayette County April 29, 2016 September 6, 2016
4 North Milwaukee State Bank Milwaukee WI 20364 First-Citizens Bank & Trust Company March 11, 2016 June 16, 2016
# Parse the closing dates, then count bank failures per calendar year.
close_timestamps = pd.to_datetime(failures["Closing Date"])
close_timestamps.dt.year.value_counts()
Closing Date
2010    157
2009    140
2011     92
2012     51
2008     25
       ... 
2004      4
2001      4
2007      3
2003      3
2000      2
Name: count, Length: 15, dtype: int64
from lxml import objectify

path = "datasets/mta_perf/Performance_MNR.xml"
with open(path) as f:
    parsed = objectify.parse(f)
root = parsed.getroot()
data = []

# Bookkeeping fields we don't want in the resulting DataFrame.
skip_fields = ["PARENT_SEQ", "INDICATOR_SEQ",
               "DESIRED_CHANGE", "DECIMAL_PLACES"]

# Each <INDICATOR> element becomes one row. Iterate the element directly
# instead of calling the deprecated getchildren() (removed from
# xml.etree.ElementTree and deprecated in lxml).
for elt in root.INDICATOR:
    el_data = {}
    for child in elt:
        if child.tag in skip_fields:
            continue
        # pyval converts the element's text to a native Python type.
        el_data[child.tag] = child.pyval
    data.append(el_data)
perf = pd.DataFrame(data)
perf.head()
AGENCY_NAME INDICATOR_NAME DESCRIPTION PERIOD_YEAR PERIOD_MONTH CATEGORY FREQUENCY INDICATOR_UNIT YTD_TARGET YTD_ACTUAL MONTHLY_TARGET MONTHLY_ACTUAL
0 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 1 Service Indicators M % 95.0 96.9 95.0 96.9
1 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 2 Service Indicators M % 95.0 96.0 95.0 95.0
2 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 3 Service Indicators M % 95.0 96.3 95.0 96.9
3 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 4 Service Indicators M % 95.0 96.8 95.0 98.3
4 Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 5 Service Indicators M % 95.0 96.6 95.0 95.8
# pd.read_xml flattens the same document in one call (keeps all fields).
perf2 = pd.read_xml(path)
perf2.head()
INDICATOR_SEQ PARENT_SEQ AGENCY_NAME INDICATOR_NAME DESCRIPTION PERIOD_YEAR PERIOD_MONTH CATEGORY FREQUENCY DESIRED_CHANGE INDICATOR_UNIT DECIMAL_PLACES YTD_TARGET YTD_ACTUAL MONTHLY_TARGET MONTHLY_ACTUAL
0 28445 NaN Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 1 Service Indicators M U % 1 95.00 96.90 95.00 96.90
1 28445 NaN Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 2 Service Indicators M U % 1 95.00 96.00 95.00 95.00
2 28445 NaN Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 3 Service Indicators M U % 1 95.00 96.30 95.00 96.90
3 28445 NaN Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 4 Service Indicators M U % 1 95.00 96.80 95.00 98.30
4 28445 NaN Metro-North Railroad On-Time Performance (West of Hudson) Percent of commuter trains that arrive at their destinations within 5 m... 2008 5 Service Indicators M U % 1 95.00 96.60 95.00 95.80
frame = pd.read_csv("examples/ex1.csv")
frame
# Pickle is Python-only and version-sensitive; use for short-term storage.
frame.to_pickle("examples/frame_pickle")
pd.read_pickle("examples/frame_pickle")
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
!rm examples/frame_pickle
# Parquet: efficient columnar binary format (needs pyarrow or fastparquet).
fec = pd.read_parquet('datasets/fec/fec.parquet')
# ExcelFile requires the optional openpyxl engine for .xlsx files.
xlsx = pd.ExcelFile("examples/ex1.xlsx")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version)
    141 try:
--> 142     module = importlib.import_module(name)
    143 except ImportError:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'openpyxl'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[51], line 1
----> 1 xlsx = pd.ExcelFile("examples/ex1.xlsx")

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_base.py:1513, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options)
   1510 self.engine = engine
   1511 self.storage_options = storage_options
-> 1513 self._reader = self._engines[engine](self._io, storage_options=storage_options)

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:548, in OpenpyxlReader.__init__(self, filepath_or_buffer, storage_options)
    533 @doc(storage_options=_shared_docs["storage_options"])
    534 def __init__(
    535     self,
    536     filepath_or_buffer: FilePath | ReadBuffer[bytes],
    537     storage_options: StorageOptions = None,
    538 ) -> None:
    539     """
    540     Reader using openpyxl engine.
    541 
   (...)    546     {storage_options}
    547     """
--> 548     import_optional_dependency("openpyxl")
    549     super().__init__(filepath_or_buffer, storage_options=storage_options)

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version)
    143 except ImportError:
    144     if errors == "raise":
--> 145         raise ImportError(msg)
    146     return None
    148 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
xlsx.sheet_names
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[52], line 1
----> 1 xlsx.sheet_names

NameError: name 'xlsx' is not defined
xlsx.parse(sheet_name="Sheet1")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[53], line 1
----> 1 xlsx.parse(sheet_name="Sheet1")

NameError: name 'xlsx' is not defined
xlsx.parse(sheet_name="Sheet1", index_col=0)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[54], line 1
----> 1 xlsx.parse(sheet_name="Sheet1", index_col=0)

NameError: name 'xlsx' is not defined
# read_excel can take a path directly (it creates an ExcelFile internally).
frame = pd.read_excel("examples/ex1.xlsx", sheet_name="Sheet1")
frame
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version)
    141 try:
--> 142     module = importlib.import_module(name)
    143 except ImportError:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'openpyxl'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[55], line 1
----> 1 frame = pd.read_excel("examples/ex1.xlsx", sheet_name="Sheet1")
      2 frame

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_base.py:478, in read_excel(io, sheet_name, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, date_format, thousands, decimal, comment, skipfooter, storage_options, dtype_backend)
    476 if not isinstance(io, ExcelFile):
    477     should_close = True
--> 478     io = ExcelFile(io, storage_options=storage_options, engine=engine)
    479 elif engine and engine != io.engine:
    480     raise ValueError(
    481         "Engine should not be specified when passing "
    482         "an ExcelFile - ExcelFile already has the engine set"
    483     )

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_base.py:1513, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options)
   1510 self.engine = engine
   1511 self.storage_options = storage_options
-> 1513 self._reader = self._engines[engine](self._io, storage_options=storage_options)

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:548, in OpenpyxlReader.__init__(self, filepath_or_buffer, storage_options)
    533 @doc(storage_options=_shared_docs["storage_options"])
    534 def __init__(
    535     self,
    536     filepath_or_buffer: FilePath | ReadBuffer[bytes],
    537     storage_options: StorageOptions = None,
    538 ) -> None:
    539     """
    540     Reader using openpyxl engine.
    541 
   (...)    546     {storage_options}
    547     """
--> 548     import_optional_dependency("openpyxl")
    549     super().__init__(filepath_or_buffer, storage_options=storage_options)

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version)
    143 except ImportError:
    144     if errors == "raise":
--> 145         raise ImportError(msg)
    146     return None
    148 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.
# Use ExcelWriter as a context manager so the workbook is finalized and the
# file handle closed even on error, and pass sheet_name by keyword -- all
# to_excel arguments after the writer become keyword-only in pandas 3.0.
with pd.ExcelWriter("examples/ex2.xlsx") as writer:
    frame.to_excel(writer, sheet_name="Sheet1")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[56], line 1
----> 1 writer = pd.ExcelWriter("examples/ex2.xlsx")
      2 frame.to_excel(writer, "Sheet1")
      3 writer.close()

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:56, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)
     43 def __init__(
     44     self,
     45     path: FilePath | WriteExcelBuffer | ExcelWriter,
   (...)     54 ) -> None:
     55     # Use the openpyxl module as the Excel writer.
---> 56     from openpyxl.workbook import Workbook
     58     engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
     60     super().__init__(
     61         path,
     62         mode=mode,
   (...)     65         engine_kwargs=engine_kwargs,
     66     )

ModuleNotFoundError: No module named 'openpyxl'
# to_excel with a path creates and closes the writer automatically.
frame.to_excel("examples/ex2.xlsx")
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[57], line 1
----> 1 frame.to_excel("examples/ex2.xlsx")

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/core/generic.py:2252, in NDFrame.to_excel(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, inf_rep, freeze_panes, storage_options)
   2239 from pandas.io.formats.excel import ExcelFormatter
   2241 formatter = ExcelFormatter(
   2242     df,
   2243     na_rep=na_rep,
   (...)   2250     inf_rep=inf_rep,
   2251 )
-> 2252 formatter.write(
   2253     excel_writer,
   2254     sheet_name=sheet_name,
   2255     startrow=startrow,
   2256     startcol=startcol,
   2257     freeze_panes=freeze_panes,
   2258     engine=engine,
   2259     storage_options=storage_options,
   2260 )

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/formats/excel.py:934, in ExcelFormatter.write(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options)
    930     need_save = False
    931 else:
    932     # error: Cannot instantiate abstract class 'ExcelWriter' with abstract
    933     # attributes 'engine', 'save', 'supported_extensions' and 'write_cells'
--> 934     writer = ExcelWriter(  # type: ignore[abstract]
    935         writer, engine=engine, storage_options=storage_options
    936     )
    937     need_save = True
    939 try:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:56, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)
     43 def __init__(
     44     self,
     45     path: FilePath | WriteExcelBuffer | ExcelWriter,
   (...)     54 ) -> None:
     55     # Use the openpyxl module as the Excel writer.
---> 56     from openpyxl.workbook import Workbook
     58     engine_kwargs = combine_kwargs(engine_kwargs, kwargs)
     60     super().__init__(
     61         path,
     62         mode=mode,
   (...)     65         engine_kwargs=engine_kwargs,
     66     )

ModuleNotFoundError: No module named 'openpyxl'
!rm examples/ex2.xlsx
rm: examples/ex2.xlsx: No such file or directory
!rm -f examples/mydata.h5
frame = pd.DataFrame({"a": np.random.standard_normal(100)})
# Requires the optional PyTables ("tables") dependency.
store = pd.HDFStore("examples/mydata.h5")
# Dict-like assignment stores objects in the default "fixed" format.
store["obj1"] = frame
store["obj1_col"] = frame["a"]
store
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version)
    141 try:
--> 142     module = importlib.import_module(name)
    143 except ImportError:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'tables'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[60], line 2
      1 frame = pd.DataFrame({"a": np.random.standard_normal(100)})
----> 2 store = pd.HDFStore("examples/mydata.h5")
      3 store["obj1"] = frame
      4 store["obj1_col"] = frame["a"]

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/pytables.py:560, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
    557 if "format" in kwargs:
    558     raise ValueError("format is not a defined argument for HDFStore")
--> 560 tables = import_optional_dependency("tables")
    562 if complib is not None and complib not in tables.filters.all_complibs:
    563     raise ValueError(
    564         f"complib only supports {tables.filters.all_complibs} compression."
    565     )

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version)
    143 except ImportError:
    144     if errors == "raise":
--> 145         raise ImportError(msg)
    146     return None
    148 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pytables'.  Use pip or conda to install pytables.
store["obj1"]
# "table" format is slower to write but supports where-clause queries.
store.put("obj2", frame, format="table")
store.select("obj2", where=["index >= 10 and index <= 15"])
store.close()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[62], line 1
----> 1 store.put("obj2", frame, format="table")
      2 store.select("obj2", where=["index >= 10 and index <= 15"])
      3 store.close()

NameError: name 'store' is not defined
# Pass the HDF5 key by keyword: pandas 2.2 deprecates positional arguments
# after the path for to_hdf (keyword-only from 3.0). format="table"
# enables the where-clause query in read_hdf below.
frame.to_hdf("examples/mydata.h5", key="obj3", format="table")
pd.read_hdf("examples/mydata.h5", key="obj3", where=["index < 5"])
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version)
    141 try:
--> 142     module = importlib.import_module(name)
    143 except ImportError:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'tables'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[63], line 1
----> 1 frame.to_hdf("examples/mydata.h5", "obj3", format="table")
      2 pd.read_hdf("examples/mydata.h5", "obj3", where=["index < 5"])

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/core/generic.py:2682, in NDFrame.to_hdf(self, path_or_buf, key, mode, complevel, complib, append, format, index, min_itemsize, nan_rep, dropna, data_columns, errors, encoding)
   2678 from pandas.io import pytables
   2680 # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected
   2681 # "Union[DataFrame, Series]" [arg-type]
-> 2682 pytables.to_hdf(
   2683     path_or_buf,
   2684     key,
   2685     self,  # type: ignore[arg-type]
   2686     mode=mode,
   2687     complevel=complevel,
   2688     complib=complib,
   2689     append=append,
   2690     format=format,
   2691     index=index,
   2692     min_itemsize=min_itemsize,
   2693     nan_rep=nan_rep,
   2694     dropna=dropna,
   2695     data_columns=data_columns,
   2696     errors=errors,
   2697     encoding=encoding,
   2698 )

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/pytables.py:302, in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, format, index, min_itemsize, nan_rep, dropna, data_columns, errors, encoding)
    300 path_or_buf = stringify_path(path_or_buf)
    301 if isinstance(path_or_buf, str):
--> 302     with HDFStore(
    303         path_or_buf, mode=mode, complevel=complevel, complib=complib
    304     ) as store:
    305         f(store)
    306 else:

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/pytables.py:560, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs)
    557 if "format" in kwargs:
    558     raise ValueError("format is not a defined argument for HDFStore")
--> 560 tables = import_optional_dependency("tables")
    562 if complib is not None and complib not in tables.filters.all_complibs:
    563     raise ValueError(
    564         f"complib only supports {tables.filters.all_complibs} compression."
    565     )

File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version)
    143 except ImportError:
    144     if errors == "raise":
--> 145         raise ImportError(msg)
    146     return None
    148 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pytables'.  Use pip or conda to install pytables.
import os
# Clean up the demo HDF5 file.
os.remove("examples/mydata.h5")
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[64], line 2
      1 import os
----> 2 os.remove("examples/mydata.h5")

FileNotFoundError: [Errno 2] No such file or directory: 'examples/mydata.h5'
import requests
# Fetch the 30 most recent open issues for pandas via the public GitHub API.
url = "https://api.github.com/repos/pandas-dev/pandas/issues"
resp = requests.get(url)
# Raise an HTTPError for non-2xx responses instead of failing later.
resp.raise_for_status()
resp
<Response [200]>
# The response body is a JSON array of issue objects.
data = resp.json()
data[0]["title"]
'BUG: Assigning pd.NA to StringDtype column causes "Unknown error: Wrapping .. failed" after pd.concat with PyArrow'
# Keep only a few fields of interest from each issue dict.
issues = pd.DataFrame(data, columns=["number", "title",
                                     "labels", "state"])
issues
number title labels state
0 64320 BUG: Assigning pd.NA to StringDtype column causes "Unknown error: Wrapp... [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ==', 'url': 'https://api.g... open
1 64319 DOC: clarify .loc Series-assignment alignment behavior [] open
2 64318 BUG: HDFStore.select(where=...) silently returns empty DataFrame with d... [] open
3 64317 TST: troubleshoot flaky linux plotting tests [] open
4 64316 TST: remove strict=False xfail from plotting test [] open
... ... ... ... ...
25 64285 ENH: Implement `series.argsort(order=, stable=)` [] open
26 64283 feat: add Series.str.splitlines and Series.str.expandtabs [] open
27 64282 BUG: enforce 1-dimensional input in pandas.array [] open
28 64281 BUG: test_dt64arr_cmp_arraylike_invalid[tzlocal] fails on Windows [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ==', 'url': 'https://api.g... open
29 64280 BUG: pandas.array accepts multidimensional arguments [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ==', 'url': 'https://api.g... open

30 rows × 4 columns

import sqlite3

# DDL for a small demo table.
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
 c REAL,        d INTEGER
);"""

# connect() creates mydata.sqlite in the working directory if needed.
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
con.commit()
data = [("Atlanta", "Georgia", 1.25, 6),
        ("Tallahassee", "Florida", 2.6, 3),
        ("Sacramento", "California", 1.7, 5)]
# Parameterized insert -- ? placeholders avoid manual string formatting.
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"

con.executemany(stmt, data)
con.commit()
# fetchall returns the whole result set as a list of tuples.
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()
rows
[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]
# cursor.description holds column metadata; index 0 of each entry is the name.
cursor.description
pd.DataFrame(rows, columns=[x[0] for x in cursor.description])
a b c d
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
# A SQLAlchemy engine lets pandas read query results straight into a DataFrame.
import sqlalchemy as sqla
db = sqla.create_engine("sqlite:///mydata.sqlite")
pd.read_sql("SELECT * FROM test", db)
a b c d
0 Atlanta Georgia 1.25 6
1 Tallahassee Florida 2.60 3
2 Sacramento California 1.70 5
!rm mydata.sqlite