데이터셋 다운로드

sarcasm.json 데이터셋을 다운로드 받습니다.

import urllib
import urllib.request
import warnings

import pandas as pd
import torch

# Silence library warnings to keep notebook output readable.
warnings.filterwarnings('ignore')

SEED = 123

# Fetch the sarcasm-headlines dataset into the working directory.
dataset_url = 'https://storage.googleapis.com/download.tensorflow.org/data/sarcasm.json'
urllib.request.urlretrieve(dataset_url, 'sarcasm.json')

# Load the JSON records and normalize column names for the pipeline below:
# 'headline' -> 'sentence' (model input), 'is_sarcastic' -> 'label' (target).
column_map = {
    'headline': 'sentence',
    'is_sarcastic': 'label'
}
df = pd.read_json('sarcasm.json').rename(columns=column_map)
df
article_link sentence label
0 https://www.huffingtonpost.com/entry/versace-b... former versace store clerk sues over secret 'b... 0
1 https://www.huffingtonpost.com/entry/roseanne-... the 'roseanne' revival catches up to our thorn... 0
2 https://local.theonion.com/mom-starting-to-fea... mom starting to fear son's web series closest ... 1
3 https://politics.theonion.com/boehner-just-wan... boehner just wants wife to listen, not come up... 1
4 https://www.huffingtonpost.com/entry/jk-rowlin... j.k. rowling wishes snape happy birthday in th... 0
... ... ... ...
26704 https://www.huffingtonpost.com/entry/american-... american politics in moral free-fall 0
26705 https://www.huffingtonpost.com/entry/americas-... america's best 20 hikes 0
26706 https://www.huffingtonpost.com/entry/reparatio... reparations and obama 0
26707 https://www.huffingtonpost.com/entry/israeli-b... israeli ban targeting boycott supporters raise... 0
26708 https://www.huffingtonpost.com/entry/gourmet-g... gourmet gifts for the foodie 2014 0

26709 rows × 3 columns

데이터셋 분할

from sklearn.model_selection import train_test_split

# Split into train/test partitions (sklearn default: 75% / 25%),
# seeded so the split is reproducible across runs.
train, test = train_test_split(df, random_state=SEED)

# Preview the training partition.
train.head()
article_link sentence label
7917 https://www.theonion.com/disturbance-of-arafat... disturbance of arafat's grave casts horrible c... 1
23206 https://www.huffingtonpost.com/entry/15-photos... 15 photos of hot dudes supporting bernie sande... 0
4611 https://www.huffingtonpost.com/entry/illinois-... 6 things you need to know about the nation's s... 0
11937 https://local.theonion.com/really-ugly-shark-t... really ugly shark tired of being mistaken for ... 1
9334 https://local.theonion.com/friends-wife-encoun... friend's wife encountered twice a year 1
# Preview the held-out test partition.
test.head()
article_link sentence label
22288 https://www.huffingtonpost.com/entry/steve-wil... steve wilson on 'the making of gone with the w... 0
16228 https://local.theonion.com/standards-lowered-f... standards lowered for second search through fr... 1
4905 https://www.huffingtonpost.comhttp://www.thede... surgical tech in needle-swap scandal at swedis... 0
8947 https://www.huffingtonpost.com/entry/donald-tr... ferguson is not among the most dangerous place... 0
3706 https://politics.theonion.com/bill-clinton-res... bill clinton resting up to sit upright at next... 1

토큰화가 적용된 데이터셋

from transformers import AutoTokenizer
from torch.utils.data import Dataset


class TokenDataset(Dataset):
    """PyTorch Dataset that tokenizes sentences lazily, one row at a time.

    Expects a DataFrame with 'sentence' (str) and 'label' (int) columns.
    Each item is returned as a dict of fixed-length tensors so the default
    DataLoader collation can stack them into batches.
    """

    def __init__(self, dataframe, tokenizer_pretrained, max_length=None):
        """
        Args:
            dataframe: DataFrame with 'sentence' and 'label' columns.
            tokenizer_pretrained: Huggingface checkpoint name for the tokenizer.
            max_length: optional cap for padding/truncation. None (default)
                falls back to the tokenizer's model_max_length — identical to
                the previous behavior of this class.
        """
        self.data = dataframe
        # Build the Huggingface tokenizer from the checkpoint name.
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_pretrained)
        self.max_length = max_length

    def __len__(self):
        # Number of rows in the backing DataFrame.
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        sentence = row['sentence']
        label = row['label']

        # Tokenize a single sentence; pad/truncate to a fixed length so that
        # batches stack without a custom collate_fn.
        tokens = self.tokenizer(
            sentence,
            return_tensors='pt',      # return PyTorch tensors
            truncation=True,          # cut sequences longer than max length
            padding='max_length',     # pad shorter sequences up to max length
            max_length=self.max_length,  # None -> tokenizer's model_max_length
            add_special_tokens=True,  # add [CLS]/[SEP]-style special tokens
        )

        # Tokenizer returns shape (1, L); squeeze the batch dim to (L,).
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'label': torch.tensor(label),
        }

데이터셋 인스턴스 생성

# distilbert-base-uncased 토크나이저 지정
# Tokenizer checkpoint shared by both splits.
tokenizer_pretrained = 'distilbert-base-uncased'

# Wrap each split in the lazily-tokenizing Dataset.
train_data = TokenDataset(train, tokenizer_pretrained)
test_data = TokenDataset(test, tokenizer_pretrained)

Model

import torch

# Select the compute device: the second GPU ('cuda:1') when CUDA is
# available, otherwise fall back to CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda:1' if has_cuda else 'cpu')
print(device)
cuda:1
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments


# Hyperparameters for fine-tuning.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints/results are written
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # training batch size per device
    per_device_eval_batch_size=64,   # evaluation batch size per device
    warmup_steps=500,                # LR-scheduler warmup steps
    weight_decay=0.01,               # weight-decay strength
    logging_dir='./logs',            # directory for logs
    logging_steps=200,               # log every N steps
)

# Checkpoint to fine-tune from.
model_pretrained = 'distilbert-base-uncased'

# Sequence-classification head with 2 labels (sarcastic vs. not),
# moved onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_pretrained, num_labels=2
).to(device)

# Trainer wires the model, hyperparameters, and datasets together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)

# Kick off fine-tuning.
trainer.train()
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']

- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).

- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

***** Running training *****

  Num examples = 20031

  Num Epochs = 3

  Instantaneous batch size per device = 16

  Total train batch size (w. parallel, distributed & accumulation) = 32

  Gradient Accumulation steps = 1

  Total optimization steps = 1878

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.

wandb: Currently logged in as: clee166. Use `wandb login --relogin` to force relogin
wandb version 0.13.7 is available! To upgrade, please run: $ pip install wandb --upgrade
Tracking run with wandb version 0.13.3
Run data is saved locally in /home/jupyter/07-pytorch/wandb/run-20230109_175133-3uq49qn3
Syncing run ./results to Weights & Biases (docs)
[1878/1878 07:18, Epoch 3/3]
Step Training Loss
200 0.512700
400 0.287100
600 0.267800
800 0.161400
1000 0.147000
1200 0.148200
1400 0.067900
1600 0.047200
1800 0.044900

Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)

TrainOutput(global_step=1878, training_loss=0.1813284194253631, metrics={'train_runtime': 445.8676, 'train_samples_per_second': 134.778, 'train_steps_per_second': 4.212, 'total_flos': 7960363387435008.0, 'train_loss': 0.1813284194253631, 'epoch': 3.0})
# Run inference over the held-out split with the fine-tuned trainer.
predictions = trainer.predict(test_data)
predictions
***** Running Prediction *****
  Num examples = 6678
  Batch size = 128
PredictionOutput(predictions=array([[ 2.9732733, -2.9471958],
       [-4.0222363,  3.6413522],
       [ 3.8347576, -3.318453 ],
       ...,
       [ 2.824299 , -2.4794154],
       [ 3.5981152, -3.2576218],
       [ 4.025952 , -3.6779523]], dtype=float32), label_ids=array([0, 1, 0, ..., 0, 0, 0]), metrics={'test_loss': 0.3030776381492615, 'test_runtime': 13.6168, 'test_samples_per_second': 490.424, 'test_steps_per_second': 3.892})
# NOTE(review): label_ids holds the GROUND-TRUTH labels of the test split,
# not the model's predictions — the model's raw logits live in
# predictions.predictions.
predictions.label_ids
array([0, 1, 0, ..., 0, 0, 0])
# Evaluate accuracy.
# BUG FIX: the original compared test['label'] against predictions.label_ids,
# but label_ids ARE the ground-truth labels — so accuracy was trivially 1.0.
# The model's outputs are the raw logits in predictions.predictions; take the
# argmax over the class axis to obtain the predicted labels.
predicted_labels = predictions.predictions.argmax(axis=-1)
accuracy = (test['label'].values == predicted_labels).mean()
accuracy
1.0