I'm a Python beginner. I found some reinforcement learning code online to learn from, and I keep hitting the same error in a lot of code like this:

Q_observation = self.Q_table[observation, :]
IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

Is this a problem with the code itself, or with how my environment is set up? Any pointers would be much appreciated.
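
Here is the smallest way I can reproduce the error outside the full program, in case that helps pin it down (the table shape and values are made up, just for the demo). Indexing the Q table with a plain int works, but indexing it with a float (or an array/tuple that an environment might hand back as the observation) raises exactly this IndexError, and casting to int makes it go away:

import numpy as np

Q_table = np.zeros((16, 4))     # made-up shape, just for the demo

obs = 3
print(Q_table[obs, :])          # fine: an integer index selects one row

obs = 3.0                       # an observation that comes back as a float...
# print(Q_table[obs, :])        # ...raises: IndexError: only integers, slices (`:`), ...

print(Q_table[int(obs), :])     # casting the index to int works again

The agent code from the tutorial is below.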
import numpy as np
import random

# The original snippet does not show the imports or the class line; "Agent" is a placeholder name.
class Agent:
    def __init__(self, obser_n, act_n, epsilong, gama, alfa):
        self.act_n = act_n                         # number of actions
        self.Q_table = np.zeros((obser_n, act_n))  # one row per observation, one column per action
        self.epsilong = epsilong                   # exploration rate (epsilon)
        self.gama = gama                           # discount factor (gamma)
        self.alfa = alfa                           # learning rate (alpha)
"""
根据当前 Observation选择最优 action,采用 ε-greedy 算法,即:ε 概率探索,否则选择Q值最大的策略
每个observation 用一个数字表示,例如:索引1表示observation(1,1)
"""
def actionChoose(self, observation):
Q_observation = self.Q_table[observation, :]
if random.uniform(0, 1) > (1 - self.epsilong):
return np.random.choice(self.act_n)
else:
return self.getMaxOfQtable(observation)
    # Return an action whose Q value is maximal for the current observation
    def getMaxOfQtable(self, observation):
        Q_observation = self.Q_table[observation, :]
        # several actions may be tied for the maximum; break the tie at random
        maxList = np.where(Q_observation == max(Q_observation))[0]
        return np.random.choice(maxList)
    # Learning step: update the Q table
    def learn(self, observation, action, reward, next_observation, next_action, is_done):
        if is_done:
            # terminal step: there is no next state, so the target is just the reward
            target_value = reward
            this_value = self.Q_table[observation][action]
            self.Q_table[observation][action] += self.alfa * (target_value - this_value)
        else:
            # compute the target value first (the max over the next row makes this a Q-learning style target)
            target_value = reward + self.gama * max(self.Q_table[next_observation, :])
            # current Q value
            this_value = self.Q_table[observation][action]
            # temporal-difference error
            diff = target_value - this_value
            # update the current Q value
            self.Q_table[observation][action] += self.alfa * diff
    def getQtable(self):
        return self.Q_table
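
And this is roughly how I drive the class; the environment itself isn't shown here, so the reward / next observation / done values are placeholders I typed in by hand, and the hyper-parameters are made up:

agent = Agent(obser_n=16, act_n=4, epsilong=0.1, gama=0.9, alfa=0.1)   # made-up hyper-parameters

observation = 0                                    # the row index into Q_table, must be a plain int
action = agent.actionChoose(observation)
# ... normally the environment is stepped with `action` here ...
reward, next_observation, is_done = 0.0, 1, False  # placeholder values standing in for the env step
next_action = agent.actionChoose(next_observation)
agent.learn(observation, action, reward, next_observation, next_action, is_done)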