# REINFORCE (Monte-Carlo policy gradient) training loop.
# NOTE(review): indentation in the original paste was lost; the nesting below
# is reconstructed from the control-flow semantics — confirm against the source.
# Relies on names defined earlier in the file: `env` (Gym-style environment),
# `pg_model` (Keras policy network), `opt` (optimizer), `tf`, `np`,
# `state_num` (observation size), `action_num` (number of discrete actions).

# Train for this many episodes.
episode_count = 1000
# Rolling window of episode scores (reset every 20 episodes below).
scores = []
# Per-episode transition buffer: (state, action, scaled_reward, next_state, done).
memory = []
# Discount factor for the return G.
discount_rate = 0.99

for episode in range(episode_count):
    state = env.reset()
    # Reshape to a batch of one so the model accepts it.
    state = np.reshape(state, [1, state_num])
    done = False
    total_reward = 0

    # --- Roll out one full episode, storing every transition. ---
    while not done:
        # Sample an action from the policy's output distribution.
        p = pg_model.predict(state)[0]
        action = np.random.choice(range(action_num), p=p)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_num])
        # Reward scaled by 1/100 to keep the loss magnitude small
        # (presumably for optimization stability — TODO confirm).
        memory.append((state, action, reward / 100., next_state, done))
        # Advance to the next state and keep rolling.
        state = next_state
        total_reward += reward

    # --- Episode finished: run the REINFORCE update. ---
    # Walk the episode backwards, accumulating the discounted return G,
    # and apply one gradient step per transition.
    G = 0
    # trainable_variables is loop-invariant; hoist it out of the update loop.
    variable = pg_model.trainable_variables
    for s, a, r, n_s, d in memory[::-1]:
        G = r + discount_rate * G
        with tf.GradientTape() as tape:
            # Probability the policy assigns to the action actually taken.
            p = pg_model(s)[0][a]
            # Policy-gradient loss: -log pi(a|s) * G (gradient ascent on return).
            loss = -tf.math.log(p) * G
        grad = tape.gradient(loss, variable)
        opt.apply_gradients(zip(grad, variable))
    # Clear the buffer for the next episode.
    memory = []

    scores.append(total_reward)
    # Checkpoint whenever an episode exceeds 450 reward.
    if total_reward > 450:
        pg_model.save('/gdrive/My Drive/hjk_pg_reinforce_model.h5')

    mean_score = np.mean(scores)
    print(episode + 1, total_reward)
    # Every 20 episodes: report the window mean, stop early if solved,
    # otherwise reset the window.
    if (episode + 1) % 20 == 0:
        print("Episode %d: Mean survival = %0.2lf in %d episodes" % (episode + 1, mean_score, 20))
        if mean_score >= 400:
            break
        scores = []

env.close()