구글 드라이브 마운트

Open In Colab

# Mount Google Drive so trained models and recorded videos persist across sessions.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)
# Mounted files live under /gdrive/My Drive/(folder name)
# 그림파일로 렌더링 하도록 패키지 설정
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet
# Import the required modules.
import tensorflow as tf
import gym
from IPython import display
import cv2
from pyvirtualdisplay import Display
from IPython import display  # NOTE(review): duplicate of the import above — harmless but removable
import matplotlib.pyplot as plt
from collections import deque
import numpy as np
import random
%matplotlib inline
# Start the virtual X display so env.render('rgb_array') works in the headless VM.
Display().start()
<pyvirtualdisplay.display.Display at 0x7f34a075e1d0>
# Create the CartPole environment.
env = gym.make("CartPole-v1")
env.render('rgb_array')
# 2 - number of possible actions (network output size)
action_num=env.action_space.n
# 4 - number of state variables (network input size)
state_num=env.observation_space.shape[0]
# Build the policy-gradient network: one shared 128-unit ReLU hidden layer
# feeding two heads — a softmax policy head `pi` (action probabilities) and a
# linear value head `val` (state-value estimate). Although labelled REINFORCE
# in the original, the value head is trained as a critic in the loop below,
# making this a one-step actor-critic setup.
# (Removed two commented-out extra hidden layers — dead code.)
i = tf.keras.Input(shape=(state_num,))
out = tf.keras.layers.Dense(128, activation='relu')(i)
pi = tf.keras.layers.Dense(action_num, activation='softmax', name='pi')(out)
val = tf.keras.layers.Dense(1, name='val')(out)
pg_model = tf.keras.Model(inputs=[i], outputs=[pi, val])
# Small learning rate with gradient-norm clipping to keep PG updates stable.
opt = tf.keras.optimizers.Adam(0.001, clipnorm=0.1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-df773df0268a> in <module>()
      1 # pg 모델 만들기 - REINFORCE 알고리즘(가장 기초)
----> 2 i=tf.keras.Input(shape=(state_num,))
      3 out=tf.keras.layers.Dense(128,activation='relu')(i)
      4 # out=tf.keras.layers.Dense(128,activation='relu')(out)
      5 # out=tf.keras.layers.Dense(128,activation='relu')(out)

NameError: name 'tf' is not defined
# Print the network architecture (layer shapes and parameter counts).
pg_model.summary()
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_2 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 128)          640         input_2[0][0]                    
__________________________________________________________________________________________________
pi (Dense)                      (None, 2)            258         dense_3[0][0]                    
__________________________________________________________________________________________________
val (Dense)                     (None, 1)            129         dense_3[0][0]                    
==================================================================================================
Total params: 1,027
Trainable params: 1,027
Non-trainable params: 0
__________________________________________________________________________________________________
# Train for this many episodes.
episode_count=1000

# Per-episode scores; reset every 20 episodes when the running mean is printed.
scores = []

# Discount factor for future rewards.
discount_rate=0.99

# Per-step gradients are accumulated and applied once every `batch_size` steps.
batch_size=16
count=0
grad_t=[]

for episode in range(episode_count):
    state = env.reset()
    # Reshape to (1, state_num) so the state feeds the model as a batch of one.
    state = np.reshape(state, [1, state_num])
    done = False
    total_reward = 0
    while not done:
        count=count+1
        # p = action probabilities (policy head), v = value estimate (unused here;
        # it is recomputed inside the tape below).
        p,v=pg_model.predict(state)
        # Sample from the policy distribution — exploration comes from sampling.
        action=np.random.choice(range(action_num),p=p[0])
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_num])

        
        variable = pg_model.trainable_variables
        
        with tf.GradientTape() as tape:
            p,v=pg_model(state)
            p=p[0]
            _,n_v=pg_model(next_state)
            # One-step TD target: r + gamma * V(s'); (1-done) zeroes the bootstrap
            # at episode end.
            td=reward + (1-done) * discount_rate * n_v[0]
            # TD error used as the advantage; stop_gradient keeps the actor loss
            # from backpropagating through the critic.
            tde=tf.stop_gradient(td-v[0])
            # Critic loss: squared error of V(s) against the gradient-stopped target.
            val_loss=tf.stop_gradient(td)-v[0]
            val_loss=tf.square(val_loss)
            # Combined loss: actor term -log pi(a|s) * advantage, plus critic term.
            loss = - tf.math.log(p[action]) * tde + val_loss
        grad=tape.gradient(loss,variable)
        grad_t.append(grad)
        
        # print(grad)
        
        if(count%batch_size==0):
            # Apply the element-wise mean of the accumulated per-step gradients.
            # NOTE(review): np.mean over a list of gradient lists relies on NumPy
            # object-array broadcasting over TF tensors — confirm it averages
            # per-variable as intended on the installed NumPy/TF versions.
            opt.apply_gradients(zip(np.mean(grad_t,axis=0),variable))
            grad_t=[]
            
       
        state = next_state
        total_reward += reward
    
    scores.append(total_reward)
    mean_score = np.mean(scores)
    
    print(episode+1,total_reward)
    # print(p[action])
       
    
    if (episode+1) % 20 == 0:
        print("Episode %d: Mean survival = %0.2lf in %d episodes" %(episode+1, mean_score, 20))
        scores = []

env.close() 
# Load a previously trained model from Drive for evaluation. Only weights and
# architecture were saved, so TF warns that the model is not compiled — fine
# here since we only call predict().
pg_model=tf.keras.models.load_model('/gdrive/My Drive/hjk_pg_reinforce_model.h5')
WARNING:tensorflow:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.
# Run one evaluation episode with the greedy policy and record it to an AVI file.
env = gym.make('CartPole-v1')
state = env.reset()
state = np.reshape(state, [1, state_num])
done = False
total_reward = 0
fcc = cv2.VideoWriter_fourcc(*'DIVX')
# NOTE(review): frame size (600, 400) is assumed to match env.render() output —
# frames of a different size are silently dropped by VideoWriter; confirm.
out = cv2.VideoWriter('/gdrive/My Drive/hjk_pg_reinforce.avi', fcc, 10.0, (600, 400))
while not done:
    # env.render('rgb_array') returns the frame in RGB channel order.
    frame = env.render('rgb_array')
    # Greedy action: argmax over the policy head (predict returns [pi, val]).
    action = np.argmax(pg_model.predict(state)[0])
    next_state, reward, done, _ = env.step(action)
    state = np.reshape(next_state, [1, state_num])
    total_reward += reward
    # BUG FIX: OpenCV's VideoWriter expects BGR frames; the original wrote RGB
    # directly, producing a video with red and blue channels swapped.
    out.write(cv2.cvtColor(np.uint8(frame), cv2.COLOR_RGB2BGR))
print(total_reward)
out.release()
cv2.destroyAllWindows()
500.0