# Environment setup for the data-loading examples: silence warnings,
# seed NumPy for reproducible output, and configure pandas/NumPy display.
import warnings
warnings.filterwarnings('ignore')  # NOTE(review): blanket suppression — acceptable for book output, hides real warnings elsewhere
import numpy as np
import pandas as pd
np.random.seed(12345)  # reproducible random data across runs
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))  # default figure size for all plots
pd.options.display.max_colwidth = 75
pd.options.display.max_columns = 20
np.set_printoptions(precision=4, suppress=True)  # 4 decimals, no scientific notation
import matplotlib.pyplot as plt  # NOTE(review): duplicate of the import above — redundant but harmless
# Matplotlib Korean font setup (for macOS)
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

6장: 데이터 로딩, 저장, 파일 형식
파일 시스템이나 데이터베이스로부터 데이터를 읽어오고 저장하는 기술을 배웁니다.
5.1 텍스트 형식 데이터 읽기
CSV, TXT 등 텍스트 기반 데이터를 pandas로 불러오는 방법을 알아봅니다.
!cat examples/ex1.csva,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
5.2 텍스트 형식 데이터 읽기
가장 일반적인 CSV 파일과 구분자로 나뉜 텍스트 데이터를 읽는 방법을 익힙니다.
df = pd.read_csv("examples/ex1.csv")
df| a | b | c | d | message | |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | 4 | hello |
| 1 | 5 | 6 | 7 | 8 | world |
| 2 | 9 | 10 | 11 | 12 | foo |
!cat examples/ex2.csv1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
pd.read_csv("examples/ex2.csv", header=None)
pd.read_csv("examples/ex2.csv", names=["a", "b", "c", "d", "message"])| a | b | c | d | message | |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | 4 | hello |
| 1 | 5 | 6 | 7 | 8 | world |
| 2 | 9 | 10 | 11 | 12 | foo |
names = ["a", "b", "c", "d", "message"]
pd.read_csv("examples/ex2.csv", names=names, index_col="message")| a | b | c | d | |
|---|---|---|---|---|
| message | ||||
| hello | 1 | 2 | 3 | 4 |
| world | 5 | 6 | 7 | 8 |
| foo | 9 | 10 | 11 | 12 |
!cat examples/csv_mindex.csv
# Build a hierarchical (MultiIndex) row index from the two key columns.
parsed = pd.read_csv("examples/csv_mindex.csv",
                     index_col=["key1", "key2"])
parsedkey1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16
| value1 | value2 | ||
|---|---|---|---|
| key1 | key2 | ||
| one | a | 1 | 2 |
| b | 3 | 4 | |
| c | 5 | 6 | |
| d | 7 | 8 | |
| two | a | 9 | 10 |
| b | 11 | 12 | |
| c | 13 | 14 | |
| d | 15 | 16 |
!cat examples/ex3.txt A B C
aaa -0.264438 -1.026059 -0.619500
bbb 0.927272 0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382 1.100491
# Whitespace-delimited file: a regex separator triggers the Python parser.
# Raw string r"\s+" avoids the invalid-escape-sequence SyntaxWarning that
# "\s+" produces on Python 3.12+ (and a future SyntaxError).
result = pd.read_csv("examples/ex3.txt", sep=r"\s+")
result| A | B | C | |
|---|---|---|---|
| aaa | -0.264438 | -1.026059 | -0.619500 |
| bbb | 0.927272 | 0.302904 | -0.032399 |
| ccc | -0.264273 | -0.386314 | -0.217601 |
| ddd | -0.871858 | -0.348382 | 1.100491 |
!cat examples/ex4.csv
pd.read_csv("examples/ex4.csv", skiprows=[0, 2, 3])# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
| a | b | c | d | message | |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | 4 | hello |
| 1 | 5 | 6 | 7 | 8 | world |
| 2 | 9 | 10 | 11 | 12 | foo |
!cat examples/ex5.csv
result = pd.read_csv("examples/ex5.csv")
resultsomething,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo
| something | a | b | c | d | message | |
|---|---|---|---|---|---|---|
| 0 | one | 1 | 2 | 3.0 | 4 | NaN |
| 1 | two | 5 | 6 | NaN | 8 | world |
| 2 | three | 9 | 10 | 11.0 | 12 | foo |
pd.isna(result)| something | a | b | c | d | message | |
|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | True |
| 1 | False | False | False | True | False | False |
| 2 | False | False | False | False | False | False |
result = pd.read_csv("examples/ex5.csv", na_values=["NULL"])
result| something | a | b | c | d | message | |
|---|---|---|---|---|---|---|
| 0 | one | 1 | 2 | 3.0 | 4 | NaN |
| 1 | two | 5 | 6 | NaN | 8 | world |
| 2 | three | 9 | 10 | 11.0 | 12 | foo |
# keep_default_na=False disables pandas' built-in NA sentinel list, so
# "NA" and empty fields arrive as literal strings rather than NaN.
result2 = pd.read_csv("examples/ex5.csv", keep_default_na=False)
result2
result2.isna()
# Re-enable only the explicit "NA" marker as missing via na_values.
result3 = pd.read_csv("examples/ex5.csv", keep_default_na=False,
                      na_values=["NA"])
result3
result3.isna()| something | a | b | c | d | message | |
|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | True |
| 1 | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False |
sentinels = {"message": ["foo", "NA"], "something": ["two"]}
pd.read_csv("examples/ex5.csv", na_values=sentinels,
keep_default_na=False)| something | a | b | c | d | message | |
|---|---|---|---|---|---|---|
| 0 | one | 1 | 2 | 3 | 4 | NaN |
| 1 | NaN | 5 | 6 | 8 | world | |
| 2 | three | 9 | 10 | 11 | 12 | NaN |
pd.options.display.max_rows = 10

5.3 텍스트 파일 조금씩 읽기
큰 파일을 조각(chunk) 단위로 읽어서 처리하는 방법을 살펴봅니다.
result = pd.read_csv("examples/ex6.csv")
result| one | two | three | four | key | |
|---|---|---|---|---|---|
| 0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
| 1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
| 2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
| 3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
| 4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
| ... | ... | ... | ... | ... | ... |
| 9995 | 2.311896 | -0.417070 | -1.409599 | -0.515821 | L |
| 9996 | -0.479893 | -0.650419 | 0.745152 | -0.646038 | E |
| 9997 | 0.523331 | 0.787112 | 0.486066 | 1.093156 | K |
| 9998 | -0.362559 | 0.598894 | -1.843201 | 0.887292 | G |
| 9999 | -0.096376 | -1.012999 | -0.657431 | -0.573315 | 0 |
10000 rows × 5 columns
pd.read_csv("examples/ex6.csv", nrows=5)| one | two | three | four | key | |
|---|---|---|---|---|---|
| 0 | 0.467976 | -0.038649 | -0.295344 | -1.824726 | L |
| 1 | -0.358893 | 1.404453 | 0.704965 | -0.200638 | B |
| 2 | -0.501840 | 0.659254 | -0.421691 | -0.057688 | G |
| 3 | 0.204886 | 1.074134 | 1.388361 | -0.982404 | R |
| 4 | 0.354628 | -0.133116 | 0.283763 | -0.837063 | Q |
5.4 텍스트 파일 조금씩 읽기
메모리 효율을 위해 큰 파일을 청크(chunk) 단위로 나누어 읽는 방법을 학습합니다.
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)
type(chunker)pandas.io.parsers.readers.TextFileReader
# Re-create the reader (a TextFileReader is single-pass and the one above
# was only used to show its type), then aggregate "key" frequencies
# across 1000-row chunks without loading the whole file.
chunker = pd.read_csv("examples/ex6.csv", chunksize=1000)
tot = pd.Series([], dtype='int64')
for piece in chunker:
    # fill_value=0 counts keys absent from a given chunk as zero;
    # Series.add with fill_value promotes the accumulator to float64.
    tot = tot.add(piece["key"].value_counts(), fill_value=0)
tot = tot.sort_values(ascending=False)tot[:10]key
E 368.0
X 364.0
L 346.0
O 343.0
Q 340.0
M 338.0
J 337.0
F 335.0
K 334.0
H 330.0
dtype: float64
data = pd.read_csv("examples/ex5.csv")
data| something | a | b | c | d | message | |
|---|---|---|---|---|---|---|
| 0 | one | 1 | 2 | 3.0 | 4 | NaN |
| 1 | two | 5 | 6 | NaN | 8 | world |
| 2 | three | 9 | 10 | 11.0 | 12 | foo |
5.5 데이터를 텍스트 형식으로 기록하기
데이터프레임을 CSV 파일로 내보내는 방법을 학습합니다.
data.to_csv("examples/out.csv")
!cat examples/out.csv,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo
5.6 데이터를 텍스트 형식으로 기록하기
분석 결과를 다양한 텍스트 형식으로 내보내는 기법을 알아봅니다.
import sys
data.to_csv(sys.stdout, sep="|")|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
data.to_csv(sys.stdout, na_rep="NULL"),something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
data.to_csv(sys.stdout, index=False, header=False)one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
data.to_csv(sys.stdout, index=False, columns=["a", "b", "c"])a,b,c
1,2,3.0
5,6,
9,10,11.0
!cat examples/ex7.csv"a","b","c"
"1","2","3"
"1","2","3"
import csv
f = open("examples/ex7.csv")
reader = csv.reader(f)for line in reader:
print(line)
f.close()['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']
with open("examples/ex7.csv") as f:
lines = list(csv.reader(f))header, values = lines[0], lines[1:]data_dict = {h: v for h, v in zip(header, zip(*values))}
data_dict{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}
obj = """
{"name": "Wes",
"cities_lived": ["Akron", "Nashville", "New York", "San Francisco"],
"pet": null,
"siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]},
{"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]
}
"""5.7 JSON 데이터 다루기
웹 API 등에서 흔히 사용하는 JSON 형식 데이터를 파싱하고 변환하는 방법을 다룹니다.
import json
result = json.loads(obj)
result{'name': 'Wes',
'cities_lived': ['Akron', 'Nashville', 'New York', 'San Francisco'],
'pet': None,
'siblings': [{'name': 'Scott', 'age': 34, 'hobbies': ['guitars', 'soccer']},
{'name': 'Katie', 'age': 42, 'hobbies': ['diving', 'art']}]}
asjson = json.dumps(result)
asjson'{"name": "Wes", "cities_lived": ["Akron", "Nashville", "New York", "San Francisco"], "pet": null, "siblings": [{"name": "Scott", "age": 34, "hobbies": ["guitars", "soccer"]}, {"name": "Katie", "age": 42, "hobbies": ["diving", "art"]}]}'
siblings = pd.DataFrame(result["siblings"], columns=["name", "age"])
siblings| name | age | |
|---|---|---|
| 0 | Scott | 34 |
| 1 | Katie | 42 |
!cat examples/example.json[{"a": 1, "b": 2, "c": 3},
{"a": 4, "b": 5, "c": 6},
{"a": 7, "b": 8, "c": 9}]
5.8 JSON 데이터
웹 환경에서 표준처럼 쓰이는 JSON 데이터를 pandas 객체로 변환합니다.
data = pd.read_json("examples/example.json")
data| a | b | c | |
|---|---|---|---|
| 0 | 1 | 2 | 3 |
| 1 | 4 | 5 | 6 |
| 2 | 7 | 8 | 9 |
data.to_json(sys.stdout)
data.to_json(sys.stdout, orient="records"){"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]
tables = pd.read_html("examples/fdic_failed_bank_list.html")
len(tables)
failures = tables[0]
failures.head()| Bank Name | City | ST | CERT | Acquiring Institution | Closing Date | Updated Date | |
|---|---|---|---|---|---|---|---|
| 0 | Allied Bank | Mulberry | AR | 91 | Today's Bank | September 23, 2016 | November 17, 2016 |
| 1 | The Woodbury Banking Company | Woodbury | GA | 11297 | United Bank | August 19, 2016 | November 17, 2016 |
| 2 | First CornerStone Bank | King of Prussia | PA | 35312 | First-Citizens Bank & Trust Company | May 6, 2016 | September 6, 2016 |
| 3 | Trust Company Bank | Memphis | TN | 9956 | The Bank of Fayette County | April 29, 2016 | September 6, 2016 |
| 4 | North Milwaukee State Bank | Milwaukee | WI | 20364 | First-Citizens Bank & Trust Company | March 11, 2016 | June 16, 2016 |
close_timestamps = pd.to_datetime(failures["Closing Date"])
close_timestamps.dt.year.value_counts()Closing Date
2010 157
2009 140
2011 92
2012 51
2008 25
...
2004 4
2001 4
2007 3
2003 3
2000 2
Name: count, Length: 15, dtype: int64
from lxml import objectify
path = "datasets/mta_perf/Performance_MNR.xml"
with open(path) as f:
parsed = objectify.parse(f)
root = parsed.getroot()data = []
# Metadata tags to drop when flattening each <INDICATOR> XML element.
skip_fields = ["PARENT_SEQ", "INDICATOR_SEQ",
               "DESIRED_CHANGE", "DECIMAL_PLACES"]
# Flatten every <INDICATOR> record into a dict of {tag: value}.
for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        # lxml.objectify's .pyval converts element text to a native
        # Python type (int/float/str) instead of leaving it a string.
        el_data[child.tag] = child.pyval
data.append(el_data)perf = pd.DataFrame(data)
perf.head()| AGENCY_NAME | INDICATOR_NAME | DESCRIPTION | PERIOD_YEAR | PERIOD_MONTH | CATEGORY | FREQUENCY | INDICATOR_UNIT | YTD_TARGET | YTD_ACTUAL | MONTHLY_TARGET | MONTHLY_ACTUAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Metro-North Railroad | On-Time Performance (West of Hudson) | Percent of commuter trains that arrive at their destinations within 5 m... | 2008 | 1 | Service Indicators | M | % | 95.0 | 96.9 | 95.0 | 96.9 |
| 1 | Metro-North Railroad | On-Time Performance (West of Hudson) | Percent of commuter trains that arrive at their destinations within 5 m... | 2008 | 2 | Service Indicators | M | % | 95.0 | 96.0 | 95.0 | 95.0 |
| 2 | Metro-North Railroad | On-Time Performance (West of Hudson) | Percent of commuter trains that arrive at their destinations within 5 m... | 2008 | 3 | Service Indicators | M | % | 95.0 | 96.3 | 95.0 | 96.9 |
| 3 | Metro-North Railroad | On-Time Performance (West of Hudson) | Percent of commuter trains that arrive at their destinations within 5 m... | 2008 | 4 | Service Indicators | M | % | 95.0 | 96.8 | 95.0 | 98.3 |
| 4 | Metro-North Railroad | On-Time Performance (West of Hudson) | Percent of commuter trains that arrive at their destinations within 5 m... | 2008 | 5 | Service Indicators | M | % | 95.0 | 96.6 | 95.0 | 95.8 |
perf2 = pd.read_xml(path)
perf2.head()| INDICATOR_SEQ | PARENT_SEQ | AGENCY_NAME | INDICATOR_NAME | DESCRIPTION | PERIOD_YEAR | PERIOD_MONTH | CATEGORY | FREQUENCY | DESIRED_CHANGE | INDICATOR_UNIT | DECIMAL_PLACES | YTD_TARGET | YTD_ACTUAL | MONTHLY_TARGET | MONTHLY_ACTUAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 28445 | NaN | Metro-North Railroad | On-Time Performance (West of Hudson) | Percent of commuter trains that arrive at their destinations within 5 m... | 2008 | 1 | Service Indicators | M | U | % | 1 | 95.00 | 96.90 | 95.00 | 96.90 |
| 1 | 28445 | NaN | Metro-North Railroad | On-Time Performance (West of Hudson) | Percent of commuter trains that arrive at their destinations within 5 m... | 2008 | 2 | Service Indicators | M | U | % | 1 | 95.00 | 96.00 | 95.00 | 95.00 |
| 2 | 28445 | NaN | Metro-North Railroad | On-Time Performance (West of Hudson) | Percent of commuter trains that arrive at their destinations within 5 m... | 2008 | 3 | Service Indicators | M | U | % | 1 | 95.00 | 96.30 | 95.00 | 96.90 |
| 3 | 28445 | NaN | Metro-North Railroad | On-Time Performance (West of Hudson) | Percent of commuter trains that arrive at their destinations within 5 m... | 2008 | 4 | Service Indicators | M | U | % | 1 | 95.00 | 96.80 | 95.00 | 98.30 |
| 4 | 28445 | NaN | Metro-North Railroad | On-Time Performance (West of Hudson) | Percent of commuter trains that arrive at their destinations within 5 m... | 2008 | 5 | Service Indicators | M | U | % | 1 | 95.00 | 96.60 | 95.00 | 95.80 |
frame = pd.read_csv("examples/ex1.csv")
frame
frame.to_pickle("examples/frame_pickle")pd.read_pickle("examples/frame_pickle")| a | b | c | d | message | |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | 4 | hello |
| 1 | 5 | 6 | 7 | 8 | world |
| 2 | 9 | 10 | 11 | 12 | foo |
!rm examples/frame_picklefec = pd.read_parquet('datasets/fec/fec.parquet')xlsx = pd.ExcelFile("examples/ex1.xlsx")--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version) 141 try: --> 142 module = importlib.import_module(name) 143 except ImportError: File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package) 125 level += 1 --> 126 return _bootstrap._gcd_import(name[level:], package, level) File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level) File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_) File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_) ModuleNotFoundError: No module named 'openpyxl' During handling of the above exception, another exception occurred: ImportError Traceback (most recent call last) Cell In[51], line 1 ----> 1 xlsx = pd.ExcelFile("examples/ex1.xlsx") File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_base.py:1513, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options) 1510 self.engine = engine 1511 self.storage_options = storage_options -> 1513 self._reader = self._engines[engine](self._io, storage_options=storage_options) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:548, in OpenpyxlReader.__init__(self, filepath_or_buffer, storage_options) 533 @doc(storage_options=_shared_docs["storage_options"]) 534 def __init__( 535 self, 536 filepath_or_buffer: FilePath | ReadBuffer[bytes], 537 storage_options: StorageOptions = None, 538 ) -> None: 539 """ 540 Reader using openpyxl engine. 
541 (...) 546 {storage_options} 547 """ --> 548 import_optional_dependency("openpyxl") 549 super().__init__(filepath_or_buffer, storage_options=storage_options) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version) 143 except ImportError: 144 if errors == "raise": --> 145 raise ImportError(msg) 146 return None 148 # Handle submodules: if we have submodule, grab parent module from sys.modules ImportError: Missing optional dependency 'openpyxl'. Use pip or conda to install openpyxl.
xlsx.sheet_names--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[52], line 1 ----> 1 xlsx.sheet_names NameError: name 'xlsx' is not defined
xlsx.parse(sheet_name="Sheet1")--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[53], line 1 ----> 1 xlsx.parse(sheet_name="Sheet1") NameError: name 'xlsx' is not defined
xlsx.parse(sheet_name="Sheet1", index_col=0)--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[54], line 1 ----> 1 xlsx.parse(sheet_name="Sheet1", index_col=0) NameError: name 'xlsx' is not defined
frame = pd.read_excel("examples/ex1.xlsx", sheet_name="Sheet1")
frame--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version) 141 try: --> 142 module = importlib.import_module(name) 143 except ImportError: File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package) 125 level += 1 --> 126 return _bootstrap._gcd_import(name[level:], package, level) File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level) File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_) File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_) ModuleNotFoundError: No module named 'openpyxl' During handling of the above exception, another exception occurred: ImportError Traceback (most recent call last) Cell In[55], line 1 ----> 1 frame = pd.read_excel("examples/ex1.xlsx", sheet_name="Sheet1") 2 frame File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_base.py:478, in read_excel(io, sheet_name, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, date_format, thousands, decimal, comment, skipfooter, storage_options, dtype_backend) 476 if not isinstance(io, ExcelFile): 477 should_close = True --> 478 io = ExcelFile(io, storage_options=storage_options, engine=engine) 479 elif engine and engine != io.engine: 480 raise ValueError( 481 "Engine should not be specified when passing " 482 "an ExcelFile - ExcelFile already has the engine set" 483 ) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_base.py:1513, in 
ExcelFile.__init__(self, path_or_buffer, engine, storage_options) 1510 self.engine = engine 1511 self.storage_options = storage_options -> 1513 self._reader = self._engines[engine](self._io, storage_options=storage_options) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:548, in OpenpyxlReader.__init__(self, filepath_or_buffer, storage_options) 533 @doc(storage_options=_shared_docs["storage_options"]) 534 def __init__( 535 self, 536 filepath_or_buffer: FilePath | ReadBuffer[bytes], 537 storage_options: StorageOptions = None, 538 ) -> None: 539 """ 540 Reader using openpyxl engine. 541 (...) 546 {storage_options} 547 """ --> 548 import_optional_dependency("openpyxl") 549 super().__init__(filepath_or_buffer, storage_options=storage_options) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version) 143 except ImportError: 144 if errors == "raise": --> 145 raise ImportError(msg) 146 return None 148 # Handle submodules: if we have submodule, grab parent module from sys.modules ImportError: Missing optional dependency 'openpyxl'. Use pip or conda to install openpyxl.
writer = pd.ExcelWriter("examples/ex2.xlsx")
frame.to_excel(writer, "Sheet1")
writer.close()--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[56], line 1 ----> 1 writer = pd.ExcelWriter("examples/ex2.xlsx") 2 frame.to_excel(writer, "Sheet1") 3 writer.close() File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:56, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs) 43 def __init__( 44 self, 45 path: FilePath | WriteExcelBuffer | ExcelWriter, (...) 54 ) -> None: 55 # Use the openpyxl module as the Excel writer. ---> 56 from openpyxl.workbook import Workbook 58 engine_kwargs = combine_kwargs(engine_kwargs, kwargs) 60 super().__init__( 61 path, 62 mode=mode, (...) 65 engine_kwargs=engine_kwargs, 66 ) ModuleNotFoundError: No module named 'openpyxl'
frame.to_excel("examples/ex2.xlsx")--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[57], line 1 ----> 1 frame.to_excel("examples/ex2.xlsx") File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/core/generic.py:2252, in NDFrame.to_excel(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, inf_rep, freeze_panes, storage_options) 2239 from pandas.io.formats.excel import ExcelFormatter 2241 formatter = ExcelFormatter( 2242 df, 2243 na_rep=na_rep, (...) 2250 inf_rep=inf_rep, 2251 ) -> 2252 formatter.write( 2253 excel_writer, 2254 sheet_name=sheet_name, 2255 startrow=startrow, 2256 startcol=startcol, 2257 freeze_panes=freeze_panes, 2258 engine=engine, 2259 storage_options=storage_options, 2260 ) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/formats/excel.py:934, in ExcelFormatter.write(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options) 930 need_save = False 931 else: 932 # error: Cannot instantiate abstract class 'ExcelWriter' with abstract 933 # attributes 'engine', 'save', 'supported_extensions' and 'write_cells' --> 934 writer = ExcelWriter( # type: ignore[abstract] 935 writer, engine=engine, storage_options=storage_options 936 ) 937 need_save = True 939 try: File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/excel/_openpyxl.py:56, in OpenpyxlWriter.__init__(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs) 43 def __init__( 44 self, 45 path: FilePath | WriteExcelBuffer | ExcelWriter, (...) 54 ) -> None: 55 # Use the openpyxl module as the Excel writer. 
---> 56 from openpyxl.workbook import Workbook 58 engine_kwargs = combine_kwargs(engine_kwargs, kwargs) 60 super().__init__( 61 path, 62 mode=mode, (...) 65 engine_kwargs=engine_kwargs, 66 ) ModuleNotFoundError: No module named 'openpyxl'
!rm examples/ex2.xlsxrm: examples/ex2.xlsx: No such file or directory
!rm -f examples/mydata.h5frame = pd.DataFrame({"a": np.random.standard_normal(100)})
store = pd.HDFStore("examples/mydata.h5")
store["obj1"] = frame
store["obj1_col"] = frame["a"]
store--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version) 141 try: --> 142 module = importlib.import_module(name) 143 except ImportError: File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package) 125 level += 1 --> 126 return _bootstrap._gcd_import(name[level:], package, level) File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level) File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_) File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_) ModuleNotFoundError: No module named 'tables' During handling of the above exception, another exception occurred: ImportError Traceback (most recent call last) Cell In[60], line 2 1 frame = pd.DataFrame({"a": np.random.standard_normal(100)}) ----> 2 store = pd.HDFStore("examples/mydata.h5") 3 store["obj1"] = frame 4 store["obj1_col"] = frame["a"] File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/pytables.py:560, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs) 557 if "format" in kwargs: 558 raise ValueError("format is not a defined argument for HDFStore") --> 560 tables = import_optional_dependency("tables") 562 if complib is not None and complib not in tables.filters.all_complibs: 563 raise ValueError( 564 f"complib only supports {tables.filters.all_complibs} compression." 
565 ) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version) 143 except ImportError: 144 if errors == "raise": --> 145 raise ImportError(msg) 146 return None 148 # Handle submodules: if we have submodule, grab parent module from sys.modules ImportError: Missing optional dependency 'pytables'. Use pip or conda to install pytables.
store["obj1"]store.put("obj2", frame, format="table")
store.select("obj2", where=["index >= 10 and index <= 15"])
store.close()--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[62], line 1 ----> 1 store.put("obj2", frame, format="table") 2 store.select("obj2", where=["index >= 10 and index <= 15"]) 3 store.close() NameError: name 'store' is not defined
# Write the frame to HDF5 in the queryable "table" format.
# key= is passed by keyword: pandas 2.x deprecates positional arguments
# after the path (they become keyword-only in pandas 3.0).
frame.to_hdf("examples/mydata.h5", key="obj3", format="table")
pd.read_hdf("examples/mydata.h5", "obj3", where=["index < 5"])--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:142, in import_optional_dependency(name, extra, errors, min_version) 141 try: --> 142 module = importlib.import_module(name) 143 except ImportError: File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/importlib/__init__.py:126, in import_module(name, package) 125 level += 1 --> 126 return _bootstrap._gcd_import(name[level:], package, level) File <frozen importlib._bootstrap>:1204, in _gcd_import(name, package, level) File <frozen importlib._bootstrap>:1176, in _find_and_load(name, import_) File <frozen importlib._bootstrap>:1140, in _find_and_load_unlocked(name, import_) ModuleNotFoundError: No module named 'tables' During handling of the above exception, another exception occurred: ImportError Traceback (most recent call last) Cell In[63], line 1 ----> 1 frame.to_hdf("examples/mydata.h5", "obj3", format="table") 2 pd.read_hdf("examples/mydata.h5", "obj3", where=["index < 5"]) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/core/generic.py:2682, in NDFrame.to_hdf(self, path_or_buf, key, mode, complevel, complib, append, format, index, min_itemsize, nan_rep, dropna, data_columns, errors, encoding) 2678 from pandas.io import pytables 2680 # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected 2681 # "Union[DataFrame, Series]" [arg-type] -> 2682 pytables.to_hdf( 2683 path_or_buf, 2684 key, 2685 self, # type: ignore[arg-type] 2686 mode=mode, 2687 complevel=complevel, 2688 complib=complib, 2689 append=append, 2690 format=format, 2691 index=index, 2692 min_itemsize=min_itemsize, 2693 nan_rep=nan_rep, 2694 dropna=dropna, 2695 data_columns=data_columns, 2696 
errors=errors, 2697 encoding=encoding, 2698 ) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/pytables.py:302, in to_hdf(path_or_buf, key, value, mode, complevel, complib, append, format, index, min_itemsize, nan_rep, dropna, data_columns, errors, encoding) 300 path_or_buf = stringify_path(path_or_buf) 301 if isinstance(path_or_buf, str): --> 302 with HDFStore( 303 path_or_buf, mode=mode, complevel=complevel, complib=complib 304 ) as store: 305 f(store) 306 else: File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/io/pytables.py:560, in HDFStore.__init__(self, path, mode, complevel, complib, fletcher32, **kwargs) 557 if "format" in kwargs: 558 raise ValueError("format is not a defined argument for HDFStore") --> 560 tables = import_optional_dependency("tables") 562 if complib is not None and complib not in tables.filters.all_complibs: 563 raise ValueError( 564 f"complib only supports {tables.filters.all_complibs} compression." 565 ) File ~/Downloads/repo/Python-for-Data-Analysis/.pixi/envs/default/lib/python3.11/site-packages/pandas/compat/_optional.py:145, in import_optional_dependency(name, extra, errors, min_version) 143 except ImportError: 144 if errors == "raise": --> 145 raise ImportError(msg) 146 return None 148 # Handle submodules: if we have submodule, grab parent module from sys.modules ImportError: Missing optional dependency 'pytables'. Use pip or conda to install pytables.
import os
os.remove("examples/mydata.h5")--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[64], line 2 1 import os ----> 2 os.remove("examples/mydata.h5") FileNotFoundError: [Errno 2] No such file or directory: 'examples/mydata.h5'
import requests
# Fetch the 30 most recent open issues from the pandas GitHub repo.
url = "https://api.github.com/repos/pandas-dev/pandas/issues"
# Always pass a timeout: requests.get has none by default and can hang forever.
resp = requests.get(url, timeout=30)
resp.raise_for_status()  # raise HTTPError for 4xx/5xx responses instead of failing silently
resp<Response [200]>
data = resp.json()
data[0]["title"]'BUG: Assigning pd.NA to StringDtype column causes "Unknown error: Wrapping .. failed" after pd.concat with PyArrow'
issues = pd.DataFrame(data, columns=["number", "title",
"labels", "state"])
issues| number | title | labels | state | |
|---|---|---|---|---|
| 0 | 64320 | BUG: Assigning pd.NA to StringDtype column causes "Unknown error: Wrapp... | [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ==', 'url': 'https://api.g... | open |
| 1 | 64319 | DOC: clarify .loc Series-assignment alignment behavior | [] | open |
| 2 | 64318 | BUG: HDFStore.select(where=...) silently returns empty DataFrame with d... | [] | open |
| 3 | 64317 | TST: troubleshoot flaky linux plotting tests | [] | open |
| 4 | 64316 | TST: remove strict=False xfail from plotting test | [] | open |
| ... | ... | ... | ... | ... |
| 25 | 64285 | ENH: Implement `series.argsort(order=, stable=)` | [] | open |
| 26 | 64283 | feat: add Series.str.splitlines and Series.str.expandtabs | [] | open |
| 27 | 64282 | BUG: enforce 1-dimensional input in pandas.array | [] | open |
| 28 | 64281 | BUG: test_dt64arr_cmp_arraylike_invalid[tzlocal] fails on Windows | [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ==', 'url': 'https://api.g... | open |
| 29 | 64280 | BUG: pandas.array accepts multidimensional arguments | [{'id': 76811, 'node_id': 'MDU6TGFiZWw3NjgxMQ==', 'url': 'https://api.g... | open |
30 rows × 4 columns
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL, d INTEGER
);"""
# Open (creating if absent) the SQLite database file and run the DDL above.
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
con.commit()data = [("Atlanta", "Georgia", 1.25, 6),
("Tallahassee", "Florida", 2.6, 3),
("Sacramento", "California", 1.7, 5)]
# Parameterized INSERT: ? placeholders let the driver handle quoting
# (and avoid SQL injection); executemany inserts all rows in one call.
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()
rows[('Atlanta', 'Georgia', 1.25, 6),
('Tallahassee', 'Florida', 2.6, 3),
('Sacramento', 'California', 1.7, 5)]
cursor.description
pd.DataFrame(rows, columns=[x[0] for x in cursor.description])| a | b | c | d | |
|---|---|---|---|---|
| 0 | Atlanta | Georgia | 1.25 | 6 |
| 1 | Tallahassee | Florida | 2.60 | 3 |
| 2 | Sacramento | California | 1.70 | 5 |
import sqlalchemy as sqla
db = sqla.create_engine("sqlite:///mydata.sqlite")
pd.read_sql("SELECT * FROM test", db)| a | b | c | d | |
|---|---|---|---|---|
| 0 | Atlanta | Georgia | 1.25 | 6 |
| 1 | Tallahassee | Florida | 2.60 | 3 |
| 2 | Sacramento | California | 1.70 | 5 |
!rm mydata.sqlite