Re:[오늘의 마지막 문제] 랜덤행동이 점점 줄어들게 하시오 !

import random
from collections import defaultdict

import numpy as np

from environment import Env


class SARSAgent:
    """Tabular SARSA agent with an epsilon-greedy policy whose exploration decays.

    The exploration rate is stored as ``epsilon = 1 / num``.  ``num`` starts at
    10 (one random action in ten) and is enlarged through ``decay_epsilon`` so
    that random actions become progressively rarer.
    """

    def __init__(self, actions):
        """Create an agent.

        Args:
            actions: Sequence of action indices the agent may choose from.
        """
        self.num = 10
        self.actions = actions
        self.step_size = 0.01  # learning rate applied to the TD error
        self.discount_factor = 0.9
        # While learning, one action in ``num`` is random.
        self.epsilon = 1 / self.num
        # Q-table keyed by str(state); every unseen state starts with four
        # zero-valued entries (assumes the environment has four actions --
        # TODO confirm against Env.n_actions).
        self.q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])

    def decay_epsilon(self, factor=2):
        """Reduce the exploration rate by multiplying ``num`` by ``factor``.

        ``epsilon`` is recomputed here because it is a plain attribute: changing
        ``num`` alone (as the original loop did) leaves ``epsilon`` untouched, so
        the amount of random behaviour would never shrink.

        Args:
            factor: Multiplier applied to ``num``; ``epsilon`` becomes
                ``1 / num`` afterwards.
        """
        self.num *= factor
        self.epsilon = 1 / self.num

    def learn(self, state, action, reward, next_state, next_action):
        """Update the Q-table from one <s, a, r, s', a'> sample (SARSA rule).

        Args:
            state: Current state (stringified to form the table key).
            action: Index of the action taken in ``state``.
            reward: Reward received for that action.
            next_state: State reached after the action.
            next_action: Index of the action chosen in ``next_state``.
        """
        state, next_state = str(state), str(next_state)
        current_q = self.q_table[state][action]
        next_state_q = self.q_table[next_state][next_action]
        td_error = reward + self.discount_factor * next_state_q - current_q
        self.q_table[state][action] = current_q + self.step_size * td_error

    def get_action(self, state):
        """Return an action for ``state`` following the epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value (ties broken at random).
        """
        if np.random.rand() < self.epsilon:
            # Explore: random action.
            return np.random.choice(self.actions)
        # Exploit: greedy action according to the Q-table.
        return arg_max(self.q_table[str(state)])


def arg_max(q_list):
    """Return the index of the largest value in ``q_list``.

    When several entries share the maximum, one of their indices is picked
    uniformly at random.
    """
    q_values = np.asarray(q_list)
    max_idx_list = np.argwhere(q_values == np.amax(q_values)).flatten().tolist()
    return random.choice(max_idx_list)


if __name__ == "__main__":
    env = Env()
    agent = SARSAgent(actions=list(range(env.n_actions)))
    success_count = 0  # steps that yielded reward == 100
    failure_count = 0  # steps that yielded reward == -100

    for episode in range(1000):
        # Reset the environment and pick the first action of the episode.
        state = env.reset()
        action = agent.get_action(state)

        while True:
            env.render()

            # Take the action; receive next state, reward and the done flag.
            next_state, reward, done = env.step(action)
            if reward == -100:
                failure_count += 1
            elif reward == 100:
                success_count += 1

            # Choose the next action in the next state, then learn from
            # the <s, a, r, s', a'> sample.
            next_action = agent.get_action(next_state)
            agent.learn(state, action, reward, next_state, next_action)

            state = next_state
            action = next_action

            # Show every Q-value on screen.
            env.print_value_all(agent.q_table)

            if done:
                print('episode:', episode + 1, '성공횟수 : ', success_count, '실패횟수 : ', failure_count)
                # Make random actions rarer after every episode.  This must
                # refresh epsilon as well as num, otherwise epsilon stays 0.1.
                agent.decay_epsilon()
                break