Chapter 9: 생성 모델 - 변분 오토인코더(VAE)
이 장에서는 훈련 데이터와 유사한 새로운 데이터를 생성할 수 있는 모델 중 하나인 변분 오토인코더(Variational Autoencoders, VAEs)를 소개합니다.
변분 오토인코더 (VAE)
VAE는 데이터의 밀도 높은 표현(Dense representation)을 학습하고 새로운 샘플을 효과적으로 생성할 수 있는 강력한 도구입니다.
import deepchem as dc
from deepchem.models.optimizers import ExponentialDecay
from deepchem.models.seqtoseq import AspuruGuzikAutoEncoder
import numpy as np
from rdkit import Chem
# Train a variational autoencoder to generate molecules. First load
# the training data.
# load_muv returns (task names, (train, valid, test) splits, transformers);
# only the training split's ids are used below.
tasks, datasets, transformers = dc.molnet.load_muv()
train_dataset, valid_dataset, test_dataset = datasets
# The dataset ids carry the SMILES string for each molecule
# (presumably — verify against the MolNet loader if reusing elsewhere).
train_smiles = train_dataset.ids
# Determine the vocabulary: every character that occurs in any training
# SMILES string, plus the longest string length we must support.
token_set = set()
for smiles in train_smiles:
    token_set |= set(smiles)
tokens = sorted(token_set)
max_length = max(len(smiles) for smiles in train_smiles)
# Build the model.
batch_size = 100
# One "epoch" of decay steps corresponds to a full pass over the data,
# so the learning rate shrinks by 5% per epoch.
batches_per_epoch = len(train_smiles) / batch_size
learning_rate = ExponentialDecay(0.001, 0.95, batches_per_epoch)
model = AspuruGuzikAutoEncoder(tokens,
                               max_length,
                               model_dir="vae",
                               batch_size=batch_size,
                               learning_rate=learning_rate)
# Train the model.
def generate_sequences(epochs, smiles=None):
    """Yield (input, target) pairs for sequence-to-sequence training.

    An autoencoder learns to reproduce its input, so every pair is
    ``(s, s)``. The whole sequence collection is replayed once per epoch.

    Args:
        epochs: number of complete passes over the sequences to yield.
        smiles: optional iterable of SMILES strings; when omitted, the
            module-level ``train_smiles`` training set is used, which
            preserves the original call signature.

    Yields:
        Tuples ``(s, s)`` for each string ``s``, ``epochs`` times over.
    """
    sequences = train_smiles if smiles is None else smiles
    for _ in range(epochs):
        for s in sequences:
            yield (s, s)
# Run training for 50 epochs' worth of (input, target) pairs; the
# generator re-yields every training SMILES string each epoch.
model.fit_sequences(generate_sequences(50))
# Generate some new molecules.
# Sample 1000 points from the 196-dimensional latent space and decode
# each one into a sequence of output tokens.
predictions = model.predict_from_embeddings(np.random.normal(size=(1000, 196)))
# Join each token sequence into a SMILES string and keep only the
# candidates that RDKit can parse into a valid molecule.
molecules = []
for pred in predictions:
    candidate = "".join(pred)
    if Chem.MolFromSmiles(candidate) is not None:
        molecules.append(candidate)
print("Generated molecules:")
for mol_smiles in molecules:
    print(mol_smiles)