diff --git a/source-code/Pseudocode/q-lambda/q-lambda.png b/source-code/Pseudocode/q-lambda/q-lambda.png
index a567c4b..9e39beb 100644
Binary files a/source-code/Pseudocode/q-lambda/q-lambda.png and b/source-code/Pseudocode/q-lambda/q-lambda.png differ
diff --git a/source-code/Pseudocode/q-lambda/q-lambda.tex b/source-code/Pseudocode/q-lambda/q-lambda.tex
index 07e4a25..1043580 100644
--- a/source-code/Pseudocode/q-lambda/q-lambda.tex
+++ b/source-code/Pseudocode/q-lambda/q-lambda.tex
@@ -55,8 +55,8 @@
         \Return $Q$
         \EndProcedure
         \end{algorithmic}
-    \caption{SARSA($\lambda$): Learn function $Q: \mathcal{X} \times \mathcal{A} \rightarrow \mathbb{R}$}
-    \label{alg:sarsa-lambda}
+    \caption{Q($\lambda$): Learn function $Q: \mathcal{X} \times \mathcal{A} \rightarrow \mathbb{R}$}
+    \label{alg:q-lambda}
     \end{algorithm}
 \end{preview}
-\end{document}
\ No newline at end of file
+\end{document}
diff --git a/source-code/Pseudocode/q-learning/q-learning.png b/source-code/Pseudocode/q-learning/q-learning.png
index 59a678c..6c78c9f 100644
Binary files a/source-code/Pseudocode/q-learning/q-learning.png and b/source-code/Pseudocode/q-learning/q-learning.png differ
diff --git a/source-code/Pseudocode/q-learning/q-learning.tex b/source-code/Pseudocode/q-learning/q-learning.tex
index 221a6fc..9af2792 100644
--- a/source-code/Pseudocode/q-learning/q-learning.tex
+++ b/source-code/Pseudocode/q-learning/q-learning.tex
@@ -27,13 +27,15 @@
     \Statex Discounting factor $\gamma \in [0, 1]$
     \Procedure{QLearning}{$\mathcal{X}$, $A$, $R$, $T$, $\alpha$, $\gamma$}
         \State Initialize $Q: \mathcal{X} \times \mathcal{A} \rightarrow \mathbb{R}$ arbitrarily
-        \State Start in state $s \in \mathcal{X}$
         \While{$Q$ is not converged}
-            \State Select $a \in \mathcal{A}$ by $Q$ and an exploration policy (e.g. $\varepsilon$-greedy)
-            \State $r \gets R(s, a)$
-            \State $s' \gets T(s, a)$ \Comment{Receive the new state}
-            \State $Q(s', a) \gets (1 - \alpha) \cdot Q(s, a) + \alpha \cdot (r + \gamma \cdot \max_{a'} Q(s', a'))$
-            \State $s \gets s'$
+            \State Start in state $s \in \mathcal{X}$
+            \While{$s$ is not terminal}
+                \State Select $a \in \mathcal{A}$ by $Q$ and an exploration policy (e.g. $\varepsilon$-greedy)
+                \State $r \gets R(s, a)$
+                \State $s' \gets T(s, a)$ \Comment{Receive the new state}
+                \State $Q(s, a) \gets (1 - \alpha) \cdot Q(s, a) + \alpha \cdot (r + \gamma \cdot \max_{a'} Q(s', a'))$
+                \State $s \gets s'$
+            \EndWhile
         \EndWhile
         \Return $Q$
         \EndProcedure
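
For reference, below is a minimal Python sketch of the loop structure introduced by the q-learning.tex change: an outer loop standing in for "while Q is not converged" (approximated here by a fixed episode budget) and an inner loop that runs one episode until a terminal state is reached, applying the update Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a')). The ChainWorld toy environment, its reset/step interface, and the hyperparameter values are illustrative assumptions and are not part of this repository.

import random
from collections import defaultdict


class ChainWorld:
    """Toy deterministic chain (assumed for illustration): states 0..4,
    actions 0 (left) / 1 (right); reaching state 4 ends the episode with reward 1."""

    n_states, n_actions, terminal = 5, 2, 4

    def reset(self):
        self.s = 0
        return self.s

    def step(self, a):
        self.s = max(0, self.s - 1) if a == 0 else min(self.terminal, self.s + 1)
        reward = 1.0 if self.s == self.terminal else 0.0
        return self.s, reward, self.s == self.terminal


def q_learning(env, episodes=500, alpha=0.1, gamma=0.9, epsilon=0.1):
    Q = defaultdict(float)                        # Q(s, a), initialized to 0
    for _ in range(episodes):                     # stand-in for "while Q is not converged"
        s = env.reset()                           # start in state s
        done = False
        while not done:                           # while s is not terminal
            # Select a by Q and an exploration policy (epsilon-greedy)
            if random.random() < epsilon:
                a = random.randrange(env.n_actions)
            else:
                a = max(range(env.n_actions), key=lambda a_: Q[(s, a_)])
            s_next, r, done = env.step(a)         # receive reward and the new state
            best_next = max(Q[(s_next, a_)] for a_ in range(env.n_actions))
            # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
            Q[(s, a)] = (1 - alpha) * Q[(s, a)] + alpha * (r + gamma * best_next)
            s = s_next
    return Q


if __name__ == "__main__":
    print(q_learning(ChainWorld()))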