Reinforcement Learning (RL) is a machine learning approach in which an agent learns, through interaction with an environment, how to choose actions that maximize cumulative reward. Below is a detailed explanation of the principles behind reinforcement learning, what it is used for, and how to build a reinforcement learning model:
Principles
Agent: the entity that makes decisions.
Environment: the external system the agent interacts with.
State (s): the specific situation of the environment at a given moment.
Action (a): a behavior the agent can take in a given state.
Reward (r): the feedback signal the environment returns after the agent takes an action, used to measure how good that action was.
Policy (π): the rule or function by which the agent selects actions; it can be deterministic or stochastic.
Value function (V): evaluates how good a state is, i.e., the expected cumulative reward obtainable from that state.
Q-function (Q): evaluates how good it is to take a given action in a given state, i.e., the expected cumulative reward after taking that action.
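The last two quantities have standard formal definitions. For reference, here is one common formulation (a sketch, assuming a discount factor $\gamma \in [0, 1)$ and rewards $r_{t+1}, r_{t+2}, \dots$ received after time $t$):

```latex
V^{\pi}(s)    = \mathbb{E}_{\pi}\left[ \sum_{k=0}^{\infty} \gamma^{k} r_{t+k+1} \,\middle|\, s_t = s \right]
Q^{\pi}(s, a) = \mathbb{E}_{\pi}\left[ \sum_{k=0}^{\infty} \gamma^{k} r_{t+k+1} \,\middle|\, s_t = s,\ a_t = a \right]
```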
The Q-learning algorithm
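Q-learning maintains a table of Q-values and refines it with the temporal-difference update below (learning rate $\alpha$, discount factor $\gamma$). The `learn` method in the code implements this same rule in the algebraically equivalent form $(1-\alpha)\,Q(s,a) + \alpha\,\bigl(r + \gamma \max_{a'} Q(s',a')\bigr)$:

```latex
Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]
```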
```python
import numpy as np


class GridWorld:
    def __init__(self, size=5):
        self.size = size
        self.state = 0  # start in the top-left corner
        self.end_state = size * size - 1  # goal in the bottom-right corner
        # Action space: up (0), right (1), down (2), left (3)
        self.action_space = [0, 1, 2, 3]
        print(f"Created a {size}x{size} grid world")
        print(f"Start: (0,0), Goal: ({size-1},{size-1})")

    def get_state_coords(self, state):
        """Convert a state index into (row, column) coordinates."""
        return state // self.size, state % self.size

    def reset(self):
        self.state = 0
        x, y = self.get_state_coords(self.state)
        print(f"\nEnvironment reset, agent at: ({x},{y})")
        return self.state

    def step(self, action):
        old_x, old_y = self.get_state_coords(self.state)
        x, y = old_x, old_y
        # Update the position according to the action
        if action == 0:    # up
            x = max(0, x - 1)
        elif action == 1:  # right
            y = min(self.size - 1, y + 1)
        elif action == 2:  # down
            x = min(self.size - 1, x + 1)
        elif action == 3:  # left
            y = max(0, y - 1)
        self.state = x * self.size + y
        # Reward is 1.0 for reaching the goal, -0.1 for every other step
        reward = 1.0 if self.state == self.end_state else -0.1
        done = self.state == self.end_state
        action_names = ['up', 'right', 'down', 'left']
        print(f"Action: {action_names[action]}, moved from ({old_x},{old_y}) to ({x},{y}), reward: {reward:.1f}")
        return self.state, reward, done


class QLearning:
    def __init__(self, state_size, action_size, learning_rate=0.1, gamma=0.9):
        self.q_table = np.zeros((state_size, action_size))
        self.lr = learning_rate  # learning rate
        self.gamma = gamma       # discount factor
        print("\nInitialized Q-learning agent:")
        print(f"Learning rate: {learning_rate}")
        print(f"Discount factor: {gamma}")
        print(f"Q-table size: {state_size}x{action_size}")

    def get_action(self, state, epsilon=0.1):
        # epsilon-greedy policy
        if np.random.random() < epsilon:
            action = np.random.choice(len(self.q_table[state]))
            print(f"Explore: random action {action}")
            return action
        action = np.argmax(self.q_table[state])
        print(f"Exploit: best action {action}")
        return action

    def learn(self, state, action, reward, next_state):
        # 1. Current Q-value of the state-action pair
        old_value = self.q_table[state, action]
        # 2. Largest Q-value in the next state
        next_max = np.max(self.q_table[next_state])
        # 3. Q-learning update rule
        new_value = (1 - self.lr) * old_value + self.lr * (reward + self.gamma * next_max)
        # 4. Write the updated value back into the Q-table
        self.q_table[state, action] = new_value


def print_episode_summary(episode, total_reward, steps):
    print(f"\nEpisode {episode} summary:")
    print(f"Total steps: {steps}")
    print(f"Total reward: {total_reward:.2f}")
    print("-" * 50)


# Training loop
env = GridWorld(size=5)
agent = QLearning(state_size=25, action_size=4)
episodes = 100  # kept small for demonstration purposes

for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False
    steps = 0
    print(f"\nStarting episode {episode + 1}")
    while not done:
        steps += 1
        action = agent.get_action(state, epsilon=0.1)
        next_state, reward, done = env.step(action)
        agent.learn(state, action, reward, next_state)
        state = next_state
        total_reward += reward
        if steps > 100:  # guard against endless wandering
            print("Too many steps, ending the episode early")
            break
    print_episode_summary(episode + 1, total_reward, steps)
    # Show the Q-table every 10 episodes
    if (episode + 1) % 10 == 0:
        print("\nQ-table:")
        print(agent.q_table)  # the full 25x4 table, one row per state
```
Code walkthrough
1. GridWorld class:
   - Creates a simple grid environment.
   - The agent can move up, down, left, and right.
   - Reaching the goal yields a positive reward; every other step incurs a small negative reward.
2. QLearning class:
   - Implements the Q-learning algorithm.
   - Maintains a Q-table that stores state-action values.
   - Uses an ε-greedy policy to select actions.
   - Updates Q-values via temporal-difference learning.
3. Training loop:
   - Runs multiple episodes to train the agent.
   - Each episode starts from the start state and continues until the goal is reached (or the step limit is hit). After training, the learned behavior can be read out of the Q-table, as sketched below.
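Once training has finished, the greedy policy can be read directly off the Q-table. The following is a minimal, illustrative sketch (it assumes the `env` and `agent` objects from the script above are still in scope; the arrow characters are an arbitrary choice):

```python
import numpy as np

# Print the greedy action for every cell as an arrow grid.
arrows = ['^', '>', 'v', '<']  # indices match the action encoding: up, right, down, left
for row in range(env.size):
    cells = []
    for col in range(env.size):
        state = row * env.size + col
        if state == env.end_state:
            cells.append('G')  # goal cell
        else:
            cells.append(arrows[int(np.argmax(agent.q_table[state]))])
    print(' '.join(cells))
```

For a well-trained table, most arrows should point right or down, i.e., toward the goal in the bottom-right corner.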
Output:

```text
Q-table:
[[-0.29766024 -0.27952562 -0.24557464 -0.28062151]
[-0.2124381 -0.21151503 -0.1930229 -0.21528052]
[-0.15677518 -0.15030181 -0.10340539 -0.17394976]
[-0.10466175 -0.10065021 -0.09897863 -0.10862233]
[-0.07430568 -0.06793465 -0.05423492 -0.06129447]
[-0.22295002 -0.11124957 -0.21446148 -0.22789866]
[-0.19177152 0.06200571 -0.15535018 -0.19476545]
[-0.12503019 -0.10603329 0.25057189 -0.12785624]
[-0.08993588 -0.05934414 0.07503771 -0.06348727]
[-0.03940399 -0.03940399 0.22452878 -0.03948338]
[-0.17292671 -0.15084247 -0.15448072 -0.15648468]
[-0.12302759 0.02442123 -0.10888807 -0.12450674]
[-0.07707295 0.4344819 -0.05944325 -0.0677957 ]
[-0.03414978 0.61352714 0.01632828 -0.03240244]
[-0.01567082 0.05200123 0.79880578 0.03590251]
[-0.11063855 -0.1078826 -0.11357292 -0.11286845]
[-0.07876345 -0.06386965 -0.06959083 -0.07230351]
[-0.0306487 0.11765511 -0.034561 -0.0306487 ]
[-0.01 0.60844074 -0.01 -0.01171 ]
[ 0.09009701 0.25152601 0.99991536 0.10631137]
[-0.08317677 -0.08653641 -0.08582936 -0.08564389]
[-0.04145702 -0.01794207 -0.04900995 -0.05261765]
[-0.0199 0.21003608 -0.019171 -0.0109 ]
[-0.01673687 0.6861894 0.0071 0. ]
[ 0. 0. 0. 0. ]]
```
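Two things are worth noting in this table. The last row (state 24, the goal) is all zeros: episodes end on arrival, so `learn` is never called with the goal as the current state. And Q-values grow as states get closer to the goal; for example, the row for state 19 assigns roughly 1.0 to the "down" action, which leads straight into the goal.

The script explores with a fixed ε = 0.1 throughout training. A common refinement, not part of the original code, is to decay ε across episodes so the agent explores heavily at first and mostly exploits later. A hedged sketch of such a schedule (the decay constants here are arbitrary, and `env`, `agent`, and `episodes` are reused from the training script above):

```python
# Hypothetical epsilon schedule: exponential decay from 1.0 down to a floor of 0.05.
epsilon_start, epsilon_min, decay = 1.0, 0.05, 0.98

for episode in range(episodes):
    epsilon = max(epsilon_min, epsilon_start * decay ** episode)
    state = env.reset()
    done = False
    steps = 0
    while not done and steps <= 100:  # same step cap as the original loop
        steps += 1
        action = agent.get_action(state, epsilon=epsilon)
        next_state, reward, done = env.step(action)
        agent.learn(state, action, reward, next_state)
        state = next_state
```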

