Reinforcement Learning (RL) is a machine learning approach in which an agent learns, through interaction with an environment, how to choose actions that maximize cumulative reward. Below is a detailed explanation of the principles behind reinforcement learning, what it is used for, and how to build a reinforcement learning model:
Principles
Agent: the entity that makes decisions.
Environment: the external system the agent interacts with.
State (s): the specific situation of the environment at a given moment.
Action (a): a behavior the agent can take in a given state.
Reward (r): the feedback signal the environment returns after the agent takes an action, used to measure how good that action was.
Policy (π): the rule or function by which the agent selects actions; it can be deterministic or stochastic.
Value function (V): evaluates how good a state is, i.e., the expected cumulative reward obtainable from that state.
Q-function (Q): evaluates how good it is to take a given action in a given state, i.e., the expected cumulative reward after taking that action.
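The last two quantities have standard formal definitions. For reference, here is one common formulation (a sketch, assuming a discount factor $\gamma \in [0, 1)$ and rewards $r_{t+1}, r_{t+2}, \dots$ received after time $t$):

```latex
V^{\pi}(s)    = \mathbb{E}_{\pi}\left[ \sum_{k=0}^{\infty} \gamma^{k} r_{t+k+1} \,\middle|\, s_t = s \right]
Q^{\pi}(s, a) = \mathbb{E}_{\pi}\left[ \sum_{k=0}^{\infty} \gamma^{k} r_{t+k+1} \,\middle|\, s_t = s,\ a_t = a \right]
```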
The Q-learning algorithm
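Q-learning maintains a table of Q-values and refines it with the temporal-difference update below (learning rate $\alpha$, discount factor $\gamma$). The `learn` method in the code implements this same rule in the algebraically equivalent form $(1-\alpha)\,Q(s,a) + \alpha\,\bigl(r + \gamma \max_{a'} Q(s',a')\bigr)$:

```latex
Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]
```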
```python
import numpy as np


class GridWorld:
    def __init__(self, size=5):
        self.size = size
        self.state = 0  # start in the top-left corner
        self.end_state = size * size - 1  # goal in the bottom-right corner
        # Action space: up (0), right (1), down (2), left (3)
        self.action_space = [0, 1, 2, 3]
        print(f"Created a {size}x{size} grid world")
        print(f"Start: (0,0), Goal: ({size-1},{size-1})")

    def get_state_coords(self, state):
        """Convert a state index into (row, column) coordinates."""
        return state // self.size, state % self.size

    def reset(self):
        self.state = 0
        x, y = self.get_state_coords(self.state)
        print(f"\nEnvironment reset, agent at: ({x},{y})")
        return self.state

    def step(self, action):
        old_x, old_y = self.get_state_coords(self.state)
        x, y = old_x, old_y
        # Update the position according to the action
        if action == 0:    # up
            x = max(0, x - 1)
        elif action == 1:  # right
            y = min(self.size - 1, y + 1)
        elif action == 2:  # down
            x = min(self.size - 1, x + 1)
        elif action == 3:  # left
            y = max(0, y - 1)
        self.state = x * self.size + y
        # Reward is 1.0 for reaching the goal, -0.1 for every other step
        reward = 1.0 if self.state == self.end_state else -0.1
        done = self.state == self.end_state
        action_names = ['up', 'right', 'down', 'left']
        print(f"Action: {action_names[action]}, moved from ({old_x},{old_y}) to ({x},{y}), reward: {reward:.1f}")
        return self.state, reward, done


class QLearning:
    def __init__(self, state_size, action_size, learning_rate=0.1, gamma=0.9):
        self.q_table = np.zeros((state_size, action_size))
        self.lr = learning_rate  # learning rate
        self.gamma = gamma       # discount factor
        print("\nInitialized Q-learning agent:")
        print(f"Learning rate: {learning_rate}")
        print(f"Discount factor: {gamma}")
        print(f"Q-table size: {state_size}x{action_size}")

    def get_action(self, state, epsilon=0.1):
        # epsilon-greedy policy
        if np.random.random() < epsilon:
            action = np.random.choice(len(self.q_table[state]))
            print(f"Explore: random action {action}")
            return action
        action = np.argmax(self.q_table[state])
        print(f"Exploit: best action {action}")
        return action

    def learn(self, state, action, reward, next_state):
        # 1. Current Q-value of the state-action pair
        old_value = self.q_table[state, action]
        # 2. Largest Q-value in the next state
        next_max = np.max(self.q_table[next_state])
        # 3. Q-learning update rule
        new_value = (1 - self.lr) * old_value + self.lr * (reward + self.gamma * next_max)
        # 4. Write the updated value back into the Q-table
        self.q_table[state, action] = new_value


def print_episode_summary(episode, total_reward, steps):
    print(f"\nEpisode {episode} summary:")
    print(f"Total steps: {steps}")
    print(f"Total reward: {total_reward:.2f}")
    print("-" * 50)


# Training loop
env = GridWorld(size=5)
agent = QLearning(state_size=25, action_size=4)
episodes = 100  # kept small for demonstration purposes

for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False
    steps = 0
    print(f"\nStarting episode {episode + 1}")
    while not done:
        steps += 1
        action = agent.get_action(state, epsilon=0.1)
        next_state, reward, done = env.step(action)
        agent.learn(state, action, reward, next_state)
        state = next_state
        total_reward += reward
        if steps > 100:  # guard against endless wandering
            print("Too many steps, ending the episode early")
            break
    print_episode_summary(episode + 1, total_reward, steps)
    # Show the Q-table every 10 episodes
    if (episode + 1) % 10 == 0:
        print("\nQ-table:")
        print(agent.q_table)  # the full 25x4 table, one row per state
```
Code walkthrough
1. GridWorld class:
   - Creates a simple grid environment.
   - The agent can move up, down, left, and right.
   - Reaching the goal yields a positive reward; every other step incurs a small negative reward.
2. QLearning class:
   - Implements the Q-learning algorithm.
   - Maintains a Q-table that stores state-action values.
   - Uses an ε-greedy policy to select actions.
   - Updates Q-values via temporal-difference learning.
3. Training loop:
   - Runs multiple episodes to train the agent.
   - Each episode starts from the start state and continues until the goal is reached (or the step limit is hit). After training, the learned behavior can be read out of the Q-table, as sketched below.
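Once training has finished, the greedy policy can be read directly off the Q-table. The following is a minimal, illustrative sketch (it assumes the `env` and `agent` objects from the script above are still in scope; the arrow characters are an arbitrary choice):

```python
import numpy as np

# Print the greedy action for every cell as an arrow grid.
arrows = ['^', '>', 'v', '<']  # indices match the action encoding: up, right, down, left
for row in range(env.size):
    cells = []
    for col in range(env.size):
        state = row * env.size + col
        if state == env.end_state:
            cells.append('G')  # goal cell
        else:
            cells.append(arrows[int(np.argmax(agent.q_table[state]))])
    print(' '.join(cells))
```

For a well-trained table, most arrows should point right or down, i.e., toward the goal in the bottom-right corner.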
Output:

```text
Q-table:
[[-0.29766024 -0.27952562 -0.24557464 -0.28062151]
[-0.2124381 -0.21151503 -0.1930229 -0.21528052]
[-0.15677518 -0.15030181 -0.10340539 -0.17394976]
[-0.10466175 -0.10065021 -0.09897863 -0.10862233]
[-0.07430568 -0.06793465 -0.05423492 -0.06129447]
[-0.22295002 -0.11124957 -0.21446148 -0.22789866]
[-0.19177152 0.06200571 -0.15535018 -0.19476545]
[-0.12503019 -0.10603329 0.25057189 -0.12785624]
[-0.08993588 -0.05934414 0.07503771 -0.06348727]
[-0.03940399 -0.03940399 0.22452878 -0.03948338]
[-0.17292671 -0.15084247 -0.15448072 -0.15648468]
[-0.12302759 0.02442123 -0.10888807 -0.12450674]
[-0.07707295 0.4344819 -0.05944325 -0.0677957 ]
[-0.03414978 0.61352714 0.01632828 -0.03240244]
[-0.01567082 0.05200123 0.79880578 0.03590251]
[-0.11063855 -0.1078826 -0.11357292 -0.11286845]
[-0.07876345 -0.06386965 -0.06959083 -0.07230351]
[-0.0306487 0.11765511 -0.034561 -0.0306487 ]
[-0.01 0.60844074 -0.01 -0.01171 ]
[ 0.09009701 0.25152601 0.99991536 0.10631137]
[-0.08317677 -0.08653641 -0.08582936 -0.08564389]
[-0.04145702 -0.01794207 -0.04900995 -0.05261765]
[-0.0199 0.21003608 -0.019171 -0.0109 ]
[-0.01673687 0.6861894 0.0071 0. ]
[ 0. 0. 0. 0. ]]
```
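Two things are worth noting in this table. The last row (state 24, the goal) is all zeros: episodes end on arrival, so `learn` is never called with the goal as the current state. And Q-values grow as states get closer to the goal; for example, the row for state 19 assigns roughly 1.0 to the "down" action, which leads straight into the goal.

The script explores with a fixed ε = 0.1 throughout training. A common refinement, not part of the original code, is to decay ε across episodes so the agent explores heavily at first and mostly exploits later. A hedged sketch of such a schedule (the decay constants here are arbitrary, and `env`, `agent`, and `episodes` are reused from the training script above):

```python
# Hypothetical epsilon schedule: exponential decay from 1.0 down to a floor of 0.05.
epsilon_start, epsilon_min, decay = 1.0, 0.05, 0.98

for episode in range(episodes):
    epsilon = max(epsilon_min, epsilon_start * decay ** episode)
    state = env.reset()
    done = False
    steps = 0
    while not done and steps <= 100:  # same step cap as the original loop
        steps += 1
        action = agent.get_action(state, epsilon=epsilon)
        next_state, reward, done = env.step(action)
        agent.learn(state, action, reward, next_state)
        state = next_state
```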

