데이터셋 다운로드

sarcasm.json 데이터셋을 다운로드 받습니다.

import urllib
import urllib.request
import warnings

import pandas as pd
import torch

# Silence library warnings to keep notebook output readable.
warnings.filterwarnings('ignore')

SEED = 123

# Fetch the sarcasm-headlines dataset into the working directory.
dataset_url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
urllib.request.urlretrieve(dataset_url, 'sarcasm.json')

# Load the JSON records and normalize column names for the pipeline below:
# 'headline' -> 'sentence' (model input), 'is_sarcastic' -> 'label' (target).
column_map = {
    'headline': 'sentence',
    'is_sarcastic': 'label'
}
df = pd.read_json('sarcasm.json').rename(columns=column_map)
df
article_link sentence label
0 https://www.huffingtonpost.com/entry/versace-b... former versace store clerk sues over secret 'b... 0
1 https://www.huffingtonpost.com/entry/roseanne-... the 'roseanne' revival catches up to our thorn... 0
2 https://local.theonion.com/mom-starting-to-fea... mom starting to fear son's web series closest ... 1
3 https://politics.theonion.com/boehner-just-wan... boehner just wants wife to listen, not come up... 1
4 https://www.huffingtonpost.com/entry/jk-rowlin... j.k. rowling wishes snape happy birthday in th... 0
... ... ... ...
26704 https://www.huffingtonpost.com/entry/american-... american politics in moral free-fall 0
26705 https://www.huffingtonpost.com/entry/americas-... america's best 20 hikes 0
26706 https://www.huffingtonpost.com/entry/reparatio... reparations and obama 0
26707 https://www.huffingtonpost.com/entry/israeli-b... israeli ban targeting boycott supporters raise... 0
26708 https://www.huffingtonpost.com/entry/gourmet-g... gourmet gifts for the foodie 2014 0

26709 rows × 3 columns

데이터셋 분할

from sklearn.model_selection import train_test_split

# Split into train/test partitions (sklearn default: 75% / 25%),
# seeded so the split is reproducible across runs.
train, test = train_test_split(df, random_state=SEED)

# Preview the training partition.
train.head()
article_link sentence label
7917 https://www.theonion.com/disturbance-of-arafat... disturbance of arafat's grave casts horrible c... 1
23206 https://www.huffingtonpost.com/entry/15-photos... 15 photos of hot dudes supporting bernie sande... 0
4611 https://www.huffingtonpost.com/entry/illinois-... 6 things you need to know about the nation's s... 0
11937 https://local.theonion.com/really-ugly-shark-t... really ugly shark tired of being mistaken for ... 1
9334 https://local.theonion.com/friends-wife-encoun... friend's wife encountered twice a year 1
# Preview the held-out test partition.
test.head()
article_link sentence label
22288 https://www.huffingtonpost.com/entry/steve-wil... steve wilson on 'the making of gone with the w... 0
16228 https://local.theonion.com/standards-lowered-f... standards lowered for second search through fr... 1
4905 https://www.huffingtonpost.comhttp://www.thede... surgical tech in needle-swap scandal at swedis... 0
8947 https://www.huffingtonpost.com/entry/donald-tr... ferguson is not among the most dangerous place... 0
3706 https://politics.theonion.com/bill-clinton-res... bill clinton resting up to sit upright at next... 1

토큰화가 적용된 데이터셋

from transformers import AutoTokenizer
from torch.utils.data import Dataset


class TokenDataset(Dataset):
    """PyTorch Dataset that tokenizes sentences lazily, one row at a time.

    Expects a DataFrame with 'sentence' (str) and 'label' (int) columns.
    Each item is returned as a dict of fixed-length tensors so the default
    DataLoader collation can stack them into batches.
    """

    def __init__(self, dataframe, tokenizer_pretrained, max_length=None):
        """
        Args:
            dataframe: DataFrame with 'sentence' and 'label' columns.
            tokenizer_pretrained: Huggingface checkpoint name for the tokenizer.
            max_length: optional cap for padding/truncation. None (default)
                falls back to the tokenizer's model_max_length — identical to
                the previous behavior of this class.
        """
        self.data = dataframe
        # Build the Huggingface tokenizer from the checkpoint name.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_pretrained)
        self.max_length = max_length

    def __len__(self):
        # Number of rows in the backing DataFrame.
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        sentence = row['sentence']
        label = row['label']

        # Tokenize a single sentence; pad/truncate to a fixed length so that
        # batches stack without a custom collate_fn.
        tokens = self.tokenizer(
            sentence,
            return_tensors='pt',      # return PyTorch tensors
            truncation=True,          # cut sequences longer than max length
            padding='max_length',     # pad shorter sequences up to max length
            max_length=self.max_length,  # None -> tokenizer's model_max_length
            add_special_tokens=True,  # add [CLS]/[SEP]-style special tokens
        )

        # Tokenizer returns shape (1, L); squeeze the batch dim to (L,).
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'label': torch.tensor(label),
        }

데이터셋 인스턴스 생성

# distilbert-base-uncased 토크나이저 지정
# Tokenizer checkpoint shared by both splits.
tokenizer_pretrained = 'distilbert-base-uncased'

# Wrap each split in the lazily-tokenizing Dataset.
train_data = TokenDataset(train, tokenizer_pretrained)
test_data = TokenDataset(test, tokenizer_pretrained)

Model

import torch

# Select the compute device: the second GPU ('cuda:1') when CUDA is
# available, otherwise fall back to CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda:1' if has_cuda else 'cpu')
print(device)
cuda:1
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments


# Hyperparameters for fine-tuning.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints/results are written
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # training batch size per device
    per_device_eval_batch_size=64,   # evaluation batch size per device
    warmup_steps=500,                # LR-scheduler warmup steps
    weight_decay=0.01,               # weight-decay strength
    logging_dir='./logs',            # directory for logs
    logging_steps=200,               # log every N steps
)

# Checkpoint to fine-tune from.
model_pretrained = 'distilbert-base-uncased'

# Sequence-classification head with 2 labels (sarcastic vs. not),
# moved onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_pretrained, num_labels=2
).to(device)

# Trainer wires the model, hyperparameters, and datasets together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)

# Kick off fine-tuning.
trainer.train()
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']

- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

***** Running training *****

  Num examples = 20031

  Num Epochs = 3

  Instantaneous batch size per device = 16

  Total train batch size (w. parallel, distributed & accumulation) = 32

  Gradient Accumulation steps = 1

  Total optimization steps = 1878

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.

wandb: Currently logged in as: clee166. Use `wandb login --relogin` to force relogin
wandb version 0.13.7 is available! To upgrade, please run: $ pip install wandb --upgrade
Tracking run with wandb version 0.13.3
Run data is saved locally in /home/jupyter/07-pytorch/wandb/run-20230109_175133-3uq49qn3
Syncing run ./results to Weights & Biases (docs)
[1878/1878 07:18, Epoch 3/3]
Step Training Loss
200 0.512700
400 0.287100
600 0.267800
800 0.161400
1000 0.147000
1200 0.148200
1400 0.067900
1600 0.047200
1800 0.044900

Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)

TrainOutput(global_step=1878, training_loss=0.1813284194253631, metrics={'train_runtime': 445.8676, 'train_samples_per_second': 134.778, 'train_steps_per_second': 4.212, 'total_flos': 7960363387435008.0, 'train_loss': 0.1813284194253631, 'epoch': 3.0})
# Run inference over the held-out split with the fine-tuned trainer.
predictions = trainer.predict(test_data)
predictions
***** Running Prediction *****
  Num examples = 6678
  Batch size = 128
PredictionOutput(predictions=array([[ 2.9732733, -2.9471958],
       [-4.0222363,  3.6413522],
       [ 3.8347576, -3.318453 ],
       ...,
       [ 2.824299 , -2.4794154],
       [ 3.5981152, -3.2576218],
       [ 4.025952 , -3.6779523]], dtype=float32), label_ids=array([0, 1, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.3030776381492615, 'test_runtime': 13.6168, 'test_samples_per_second': 490.424, 'test_steps_per_second': 3.892})
# NOTE(review): label_ids holds the GROUND-TRUTH labels of the test split,
# not the model's predictions — the model's raw logits live in
# predictions.predictions.
predictions.label_ids
array([0, 1, 0, ..., 0, 0, 0])
# Evaluate accuracy.
# BUG FIX: the original compared test['label'] against predictions.label_ids,
# but label_ids ARE the ground-truth labels — so accuracy was trivially 1.0.
# The model's outputs are the raw logits in predictions.predictions; take the
# argmax over the class axis to obtain the predicted labels.
predicted_labels = predictions.predictions.argmax(axis=-1)
accuracy = (test['label'].values == predicted_labels).mean()
accuracy
1.0