diff --git a/source-code/Pseudocode/Policy-Iteration/Policy-Iteration.png b/source-code/Pseudocode/Policy-Iteration/Policy-Iteration.png
index 407d5d4..6140b90 100644
Binary files a/source-code/Pseudocode/Policy-Iteration/Policy-Iteration.png and b/source-code/Pseudocode/Policy-Iteration/Policy-Iteration.png differ
diff --git a/source-code/Pseudocode/Policy-Iteration/Policy-Iteration.tex b/source-code/Pseudocode/Policy-Iteration/Policy-Iteration.tex
index b72950a..61887b2 100644
--- a/source-code/Pseudocode/Policy-Iteration/Policy-Iteration.tex
+++ b/source-code/Pseudocode/Policy-Iteration/Policy-Iteration.tex
@@ -22,8 +22,9 @@
  \Statex Sates $\mathcal{X} = \{1, \dots, n_x\}$
  \Statex Actions $\mathcal{A} = \{1, \dots, n_a\},\qquad A: \mathcal{X} \Rightarrow \mathcal{A}$
  \Statex Cost function $g: \mathcal{X} \times \mathcal{A} \rightarrow \mathbb{R}$
- \Statex Transition probabilities $f$
- \Procedure{PolicyIteration}{$\mathcal{X}$, $A$, $g$, $f$}
+ \Statex Transition probabilities $f$, $F$
+ \Statex $\alpha \in (0, 1)$
+ \Procedure{PolicyIteration}{$\mathcal{X}$, $A$, $g$, $f$, $F$, $\alpha$}
  \State Initialize $\pi$ arbitrarily
  \While{$\pi$ is not converged}
  \State $J \gets$ solve system of linear equations $(I - \alpha \cdot F(\pi)) \cdot J = g(\pi)$
diff --git a/source-code/Pseudocode/Value-Iteration/Value-Iteration.png b/source-code/Pseudocode/Value-Iteration/Value-Iteration.png
index 594b3e4..edfdf2c 100644
Binary files a/source-code/Pseudocode/Value-Iteration/Value-Iteration.png and b/source-code/Pseudocode/Value-Iteration/Value-Iteration.png differ
diff --git a/source-code/Pseudocode/Value-Iteration/Value-Iteration.tex b/source-code/Pseudocode/Value-Iteration/Value-Iteration.tex
index 047232f..06f362e 100644
--- a/source-code/Pseudocode/Value-Iteration/Value-Iteration.tex
+++ b/source-code/Pseudocode/Value-Iteration/Value-Iteration.tex
@@ -22,7 +22,7 @@
  \Statex Sates $\mathcal{X} = \{1, \dots, n_x\}$
  \Statex Actions $\mathcal{A} = \{1, \dots, n_a\},\qquad A: \mathcal{X} \Rightarrow \mathcal{A}$
  \Statex Cost function $g: \mathcal{X} \times \mathcal{A} \rightarrow \mathbb{R}$
- \Statex Transition probabilities $f$
+ \Statex Transition probabilities $f_{xy}(a) = \mathbb{P}(y | x, a)$
  \Statex Discounting factor $\alpha \in (0, 1)$, typically $\alpha = 0.9$
  \Procedure{ValueIteration}{$\mathcal{X}$, $A$, $g$, $f$, $\alpha$}
  \State Initialize $J, J': \mathcal{X} \rightarrow \mathbb{R}_0^+$ arbitrarily
diff --git a/source-code/Pseudocode/dynamic-programming/dynamic-programming.png b/source-code/Pseudocode/dynamic-programming/dynamic-programming.png
index 1eabf9a..8ef50b3 100644
Binary files a/source-code/Pseudocode/dynamic-programming/dynamic-programming.png and b/source-code/Pseudocode/dynamic-programming/dynamic-programming.png differ
diff --git a/source-code/Pseudocode/dynamic-programming/dynamic-programming.tex b/source-code/Pseudocode/dynamic-programming/dynamic-programming.tex
index ec6c720..8794229 100644
--- a/source-code/Pseudocode/dynamic-programming/dynamic-programming.tex
+++ b/source-code/Pseudocode/dynamic-programming/dynamic-programming.tex
@@ -22,7 +22,7 @@
  \Statex Sates $\mathcal{X} = \{1, \dots, n_x\}$
  \Statex Actions $\mathcal{A} = \{1, \dots, n_a\},\qquad A: \mathcal{X} \Rightarrow \mathcal{A}$
  \Statex Cost function $g: \mathcal{X} \times \mathcal{A} \rightarrow \mathbb{R}$
- \Statex Horizon $N$
+ \Statex Horizon $N \in \mathbb{N}_{\geq 1}$
  \Statex Discounting factor $\alpha \in [0, 1]$
  \Procedure{DynamicProgramming}{$\mathcal{X}$, $A$, $g$, $N$, $\alpha$}
  \State $J_N(x) \gets g_N(x) \quad \forall x \in \mathcal{X}$
@@ -36,10 +36,11 @@
  \State $\pi_k(x) \gets \arg \min_a (Q_k(x, a))$
  \EndFor
  \EndFor
+ \Return $\pi_{0:N-1}$
  \EndProcedure
  \end{algorithmic}
  \caption{Dynamic Programming}
- \label{alg:dynamic-programming}
+ \label{alg:dynamic-programming: Learn a strategy}
 \end{algorithm}
 \end{preview}
 \end{document}
diff --git a/source-code/Pseudocode/label-correction/label-correction.png b/source-code/Pseudocode/label-correction/label-correction.png
index 75baad2..805324f 100644
Binary files a/source-code/Pseudocode/label-correction/label-correction.png and b/source-code/Pseudocode/label-correction/label-correction.png differ
diff --git a/source-code/Pseudocode/label-correction/label-correction.tex b/source-code/Pseudocode/label-correction/label-correction.tex
index dfb27db..1286dc5 100644
--- a/source-code/Pseudocode/label-correction/label-correction.tex
+++ b/source-code/Pseudocode/label-correction/label-correction.tex
@@ -43,14 +43,13 @@
  \State $u \gets d_v + g_{vt}$
  \EndIf
  \EndIf
- \If{$d_c + m_c < u$}
- \State $u \gets d_c + m_c$
- \EndIf
+ \State $u \gets \min (u, d_c + m_c)$
  \EndFor
  \EndWhile
+ \Return $u, t$
  \EndProcedure
  \end{algorithmic}
- \caption{Label correction algorithm}
+ \caption{Label correction algorithm: Find shortest path}
  \label{alg:label-correction-algorithm}
 \end{algorithm}
 \end{preview}
diff --git a/source-code/Pseudocode/q-learning/q-learning.png b/source-code/Pseudocode/q-learning/q-learning.png
index b01719d..35610b8 100644
Binary files a/source-code/Pseudocode/q-learning/q-learning.png and b/source-code/Pseudocode/q-learning/q-learning.png differ
diff --git a/source-code/Pseudocode/q-learning/q-learning.tex b/source-code/Pseudocode/q-learning/q-learning.tex
index 4b73158..bd69241 100644
--- a/source-code/Pseudocode/q-learning/q-learning.tex
+++ b/source-code/Pseudocode/q-learning/q-learning.tex
@@ -33,7 +33,7 @@
  \While{$s$ is not terminal}
  \State Calculate $\pi$ according to Q and exploration strategy (e.g. $\pi(x) \gets \argmax_{a} Q(x, a)$)
  \State $a \gets \pi(s)$
- \State $r \gets R(s, a)$
+ \State $r \gets R(s, a)$ \Comment{Receive the reward}
  \State $s' \gets T(s, a)$ \Comment{Receive the new state}
  \State $Q(s', a) \gets (1 - \alpha) \cdot Q(s, a) + \alpha \cdot (r + \gamma \cdot \max_{a'} Q(s', a'))$
  \State $s \gets s'$
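
Note (not part of the diff above): the following is a minimal NumPy sketch of the Policy-Iteration procedure as the updated pseudocode describes it, meant only to show how the added parameters F and alpha enter the policy-evaluation system (I - alpha * F(pi)) J = g(pi). It assumes g is an n_x-by-n_a matrix of stage costs, f[x, a, y] = P(y | x, a) holds the transition probabilities, and F(pi), g(pi) are the transition matrix and cost vector induced by a policy pi; all variable names and the random test MDP are illustrative assumptions, not taken from the repository.

import numpy as np


def policy_iteration(g, f, alpha=0.9):
    """Policy iteration for a discounted cost MDP (illustrative sketch).

    g     : (n_x, n_a) array of stage costs g(x, a)
    f     : (n_x, n_a, n_x) array with f[x, a, y] = P(y | x, a)
    alpha : discount factor in (0, 1)
    """
    n_x, _ = g.shape
    pi = np.zeros(n_x, dtype=int)            # initialize pi arbitrarily
    while True:
        # Policy evaluation: solve (I - alpha * F(pi)) J = g(pi),
        # where F(pi)[x, y] = f[x, pi(x), y] and g(pi)[x] = g[x, pi(x)].
        F_pi = f[np.arange(n_x), pi]
        g_pi = g[np.arange(n_x), pi]
        J = np.linalg.solve(np.eye(n_x) - alpha * F_pi, g_pi)
        # Policy improvement: greedy one-step lookahead on the costs,
        # Q[x, a] = g(x, a) + alpha * sum_y f[x, a, y] * J[y].
        Q = g + alpha * f @ J
        pi_new = Q.argmin(axis=1)
        if np.array_equal(pi_new, pi):       # pi is converged
            return pi, J
        pi = pi_new


# Tiny random MDP, only to exercise the sketch.
rng = np.random.default_rng(0)
n_x, n_a = 4, 2
g = rng.random((n_x, n_a))
f = rng.random((n_x, n_a, n_x))
f /= f.sum(axis=2, keepdims=True)            # normalize rows to valid probabilities
pi, J = policy_iteration(g, f, alpha=0.9)
print("policy:", pi)
print("costs: ", J)

The argmin over the one-step lookahead costs mirrors the policy-improvement step, and the loop stops once pi no longer changes, which corresponds to the "while pi is not converged" condition in the pseudocode.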