%!TEX root = main.tex

\appendix
\onecolumn
\section*{Overview}
\begin{table}[H]
\centering
\hspace*{-1cm}\begin{tabular}{lllll}
\toprule
Name & Function $\varphi(x)$ & Range of Values & $\varphi'(x)$ & Used by \\\midrule %
Sign function$^\dagger$ & $\begin{cases}+1 &\text{if } x \geq 0\\-1 &\text{if } x < 0\end{cases}$ & $\Set{-1,1}$ & $0$ & \cite{971754} \\
\parbox[t]{2.6cm}{Heaviside\\step function$^\dagger$} & $\begin{cases}+1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & $\Set{0, 1}$ & $0$ & \cite{mcculloch1943logical}\\
Logistic function & $\frac{1}{1+e^{-x}}$ & $[0, 1]$ & $\frac{e^x}{(e^x +1)^2}$ & \cite{duch1999survey} \\
Tanh & $\frac{e^x - e^{-x}}{e^x + e^{-x}} = \tanh(x)$ & $[-1, 1]$ & $\sech^2(x)$ & \cite{LeNet-5,Thoma:2014}\\
\gls{ReLU}$^\dagger$ & $\max(0, x)$ & $[0, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & \cite{AlexNet-2012}\\
\parbox[t]{2.6cm}{\gls{LReLU}$^\dagger$\footnotemark\\(\gls{PReLU})} & $\varphi(x) = \max(\alpha x, x)$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha &\text{if } x < 0\end{cases}$ & \cite{maas2013rectifier,he2015delving} \\
Softplus & $\log(e^x + 1)$ & $(0, +\infty)$ & $\frac{e^x}{e^x + 1}$ & \cite{dugas2001incorporating,glorot2011deep} \\
\gls{ELU} & $\begin{cases}x &\text{if } x > 0\\\alpha (e^x - 1) &\text{if } x \leq 0\end{cases}$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha e^x &\text{otherwise}\end{cases}$ & \cite{clevert2015fast} \\
Softmax$^\ddagger$ & $o(\mathbf{x})_j = \frac{e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & $[0, 1]^K$ & $o(\mathbf{x})_j \cdot \frac{\sum_{k=1}^K e^{x_k} - e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & \cite{AlexNet-2012,Thoma:2014}\\
Maxout$^\ddagger$ & $o(\mathbf{x}) = \max_{x \in \mathbf{x}} x$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x_i = \max \mathbf{x}\\0 &\text{otherwise}\end{cases}$ & \cite{goodfellow2013maxout} \\
\bottomrule
\end{tabular}
\caption[Activation functions]{Overview of activation functions. Functions
marked with $\dagger$ are not differentiable at 0 and functions
marked with $\ddagger$ operate on all elements of a layer
simultaneously. The hyperparameter $\alpha \in (0, 1)$ of Leaky
ReLU and ELU is typically set to $\alpha = 0.01$. Other activation
functions such as randomized leaky ReLUs exist~\cite{xu2015empirical},
but they are far less commonly used.\\
Some functions are smoothed versions of others: the logistic
function for the Heaviside step function, tanh for the sign
function, and softplus for ReLU.\\
Softmax is the standard activation function for the last layer of
a classification network as it produces a probability
distribution. See \Cref{fig:activation-functions-plot} for a plot
of some of them.}
\label{table:activation-functions-overview}
\end{table}
\footnotetext{$\alpha$ is a hyperparameter in leaky ReLU, but a learnable parameter in the parametric ReLU function.}
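
The derivatives listed above for the logistic function and for softplus can
also be written in terms of the functions themselves, which follows directly
from the chain and quotient rules:
\begin{align*}
\frac{\mathrm{d}}{\mathrm{d}x} \frac{1}{1+e^{-x}} &= \frac{e^x}{(e^x + 1)^2} = \frac{1}{1+e^{-x}} \left(1 - \frac{1}{1+e^{-x}}\right),\\
\frac{\mathrm{d}}{\mathrm{d}x} \log(e^x + 1) &= \frac{e^x}{e^x + 1} = \frac{1}{1+e^{-x}}.
\end{align*}
In particular, the derivative of softplus is the logistic function. Moreover,
$\frac{1}{\beta}\log(e^{\beta x} + 1) \to \max(0, x)$ for $\beta \to \infty$,
which makes precise in which sense softplus is a smoothed \gls{ReLU}.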

\section*{Evaluation Results}
\glsunset{LReLU}
\begin{table}[H]
\centering
\begin{tabular}{@{\extracolsep{4pt}}lcccccc@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} \\\cline{2-5}\cline{6-7}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Training set & Test set \\\midrule
Identity & \SI{66.25}{\percent} & $\boldsymbol{\sigma=0.77}$ &\SI{56.74}{\percent} & $\boldsymbol{\sigma=0.51}$ & \SI{68.77}{\percent} & \SI{58.78}{\percent}\\
Logistic & \SI{51.87}{\percent} & $\sigma=3.64$ &\SI{46.54}{\percent} & $\sigma=3.22$ & \SI{61.19}{\percent} & \SI{54.58}{\percent}\\
Logistic$^-$ & \SI{66.49}{\percent} & $\sigma=1.99$ &\SI{57.84}{\percent} & $\sigma=1.15$ & \SI{69.04}{\percent} & \SI{60.10}{\percent}\\
Softmax & \SI{75.22}{\percent} & $\sigma=2.41$ &\SI{59.49}{\percent} & $\sigma=1.25$ & \SI{78.87}{\percent} & \SI{63.06}{\percent}\\
Tanh & \SI{67.27}{\percent} & $\sigma=2.38$ &\SI{55.70}{\percent} & $\sigma=1.44$ & \SI{70.21}{\percent} & \SI{58.10}{\percent}\\
Softsign & \SI{66.43}{\percent} & $\sigma=1.74$ &\SI{55.75}{\percent} & $\sigma=0.93$ & \SI{69.78}{\percent} & \SI{58.40}{\percent}\\
\gls{ReLU} & \SI{78.62}{\percent} & $\sigma=2.15$ &\SI{62.18}{\percent} & $\sigma=0.99$ & \SI{81.81}{\percent} & \SI{64.57}{\percent}\\
\gls{ReLU}$^-$ & \SI{76.01}{\percent} & $\sigma=2.31$ &\SI{62.87}{\percent} & $\sigma=1.08$ & \SI{78.18}{\percent} & \SI{64.81}{\percent}\\
Softplus & \SI{66.75}{\percent} & $\sigma=2.45$ &\SI{56.68}{\percent} & $\sigma=1.32$ & \SI{71.27}{\percent} & \SI{60.26}{\percent}\\
S2ReLU & \SI{63.32}{\percent} & $\sigma=1.69$ &\SI{56.99}{\percent} & $\sigma=1.14$ & \SI{65.80}{\percent} & \SI{59.20}{\percent}\\
\gls{LReLU} & \SI{74.92}{\percent} & $\sigma=2.49$ &\SI{61.86}{\percent} & $\sigma=1.23$ & \SI{77.67}{\percent} & \SI{64.01}{\percent}\\
\gls{PReLU} & \textbf{\SI{80.01}{\percent}} & $\sigma=2.03$ &\SI{62.16}{\percent} & $\sigma=0.73$ & \textbf{\SI{83.50}{\percent}} & \textbf{\SI{64.79}{\percent}}\\
\gls{ELU} & \SI{76.64}{\percent} & $\sigma=1.48$ &\textbf{\SI{63.38}{\percent}} & $\sigma=0.55$ & \SI{78.30}{\percent} & \SI{64.70}{\percent}\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on CIFAR-100]{Training and
test accuracy of adjusted baseline models trained with different
activation functions on CIFAR-100. For \gls{LReLU}, $\alpha = 0.3$ was
chosen.}
\label{table:CIFAR-100-accuracies-activation-functions}
\end{table}

\begin{table}[H]
\centering
\setlength\tabcolsep{1.5pt}
\begin{tabular}{@{\extracolsep{4pt}}lcccccccr@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} & \multicolumn{2}{c}{Epochs}\\\cline{2-5}\cline{6-7}\cline{8-9}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Train & Test & Range & \multicolumn{1}{c}{Mean} \\\midrule
Identity & \SI{87.92}{\percent} & $\sigma=0.40$ & \SI{84.69}{\percent} & $\sigma=0.08$ & \SI{88.59}{\percent} & \SI{85.43}{\percent} & \hphantom{0}92 -- 140 & 114.5\\%TODO: Really?
Logistic & \SI{81.46}{\percent} & $\sigma=5.08$ & \SI{79.67}{\percent} & $\sigma=4.85$ & \SI{86.38}{\percent} & \SI{84.60}{\percent} & \hphantom{0}\textbf{58} -- \hphantom{0}\textbf{91} & \textbf{77.3}\\
Softmax & \SI{88.19}{\percent} & $\sigma=0.31$ & \SI{84.70}{\percent} & $\sigma=0.15$ & \SI{88.69}{\percent} & \SI{85.43}{\percent} & 124 -- 171& 145.8\\
Tanh & \SI{88.41}{\percent} & $\sigma=0.36$ & \SI{84.46}{\percent} & $\sigma=0.27$ & \SI{89.24}{\percent} & \SI{85.45}{\percent} & \hphantom{0}89 -- 123 & 108.7\\
Softsign & \SI{88.00}{\percent} & $\sigma=0.47$ & \SI{84.46}{\percent} & $\sigma=0.23$ & \SI{88.77}{\percent} & \SI{85.33}{\percent} & \hphantom{0}77 -- 119 & 104.1\\
\gls{ReLU} & \SI{88.93}{\percent} & $\sigma=0.46$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.21$ & \SI{89.35}{\percent} & \SI{85.95}{\percent} & \hphantom{0}96 -- 132 & 102.8\\
Softplus & \SI{88.42}{\percent} & $\boldsymbol{\sigma=0.29}$ & \SI{85.16}{\percent} & $\sigma=0.15$ & \SI{88.90}{\percent} & \SI{85.73}{\percent} & 108 -- 143 & 121.0\\
\gls{LReLU} & \SI{88.61}{\percent} & $\sigma=0.41$ & \SI{85.21}{\percent} & $\boldsymbol{\sigma=0.05}$ & \SI{89.07}{\percent} & \SI{85.83}{\percent} & \hphantom{0}87 -- 117 & 104.5\\
\gls{PReLU} & \textbf{\SI{89.62}{\percent}} & $\sigma=0.41$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.17$& \textbf{\SI{90.10}{\percent}} & \SI{86.01}{\percent} & \hphantom{0}85 -- 111 & 100.5\\
\gls{ELU} & \SI{89.49}{\percent} & $\sigma=0.42$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.10$ & \SI{89.94}{\percent} & \textbf{\SI{86.03}{\percent}} & \hphantom{0}73 -- 113 & 92.4\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on HASYv2]{Training and test
accuracy of adjusted baseline models trained with different activation
functions on HASYv2. For \gls{LReLU}, $\alpha = 0.3$ was chosen.}
\label{table:HASYv2-accuracies-activation-functions}
\end{table}

\begin{table}[H]
\centering
\setlength\tabcolsep{1.5pt}
\begin{tabular}{@{\extracolsep{4pt}}lcccccccr@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} & \multicolumn{2}{c}{Epochs}\\\cline{2-5}\cline{6-7}\cline{8-9}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Train & Test & Range & \multicolumn{1}{c}{Mean} \\\midrule
Identity & \SI{87.49}{\percent} & $\sigma=2.50$ & \SI{69.86}{\percent} & $\sigma=1.41$ & \SI{89.78}{\percent} & \SI{71.90}{\percent} & \hphantom{0}51 -- \hphantom{0}65 & 53.4\\
Logistic & \SI{45.32}{\percent} & $\sigma=14.88$& \SI{40.85}{\percent} & $\sigma=12.56$ & \SI{51.06}{\percent} & \SI{45.49}{\percent} & \hphantom{0}38 -- \hphantom{0}93 & 74.6\\
Softmax & \SI{87.90}{\percent} & $\sigma=3.58$ & \SI{67.91}{\percent} & $\sigma=2.32$ & \SI{91.51}{\percent} & \SI{70.96}{\percent} & 108 -- 150 & 127.5\\
Tanh & \SI{85.38}{\percent} & $\sigma=4.04$ & \SI{67.65}{\percent} & $\sigma=2.01$ & \SI{90.47}{\percent} & \SI{71.29}{\percent} & \hphantom{0}48 -- \hphantom{0}92 & 65.2\\
Softsign & \SI{88.57}{\percent} & $\sigma=4.00$ & \SI{69.32}{\percent} & $\sigma=1.68$ & \SI{93.04}{\percent} & \SI{72.40}{\percent} & \hphantom{0}55 -- 117 & 83.2\\
\gls{ReLU} & \SI{94.35}{\percent} & $\sigma=3.38$ & \SI{71.01}{\percent} & $\sigma=1.63$ & \SI{98.20}{\percent} & \SI{74.85}{\percent} & \hphantom{0}52 -- \hphantom{0}98 & 75.5\\
Softplus & \SI{83.03}{\percent} & $\sigma=2.07$ & \SI{68.28}{\percent} & $\sigma=1.74$ & \SI{93.04}{\percent} & \SI{75.99}{\percent} & \hphantom{0}56 -- \hphantom{0}89 & 68.9\\
\gls{LReLU} & \SI{93.83}{\percent} & $\sigma=3.89$ & \SI{74.66}{\percent} & $\sigma=2.11$ & \SI{97.56}{\percent} & \SI{78.08}{\percent} & \hphantom{0}52 -- 120 & 80.1\\
\gls{PReLU} & \SI{95.53}{\percent} & $\sigma=1.92$ & \SI{71.69}{\percent} & $\sigma=1.37$ & \SI{98.17}{\percent} & \SI{74.69}{\percent} & \hphantom{0}59 -- 101 & 78.8\\
\gls{ELU} & \SI{95.42}{\percent} & $\sigma=3.57$ & \SI{75.09}{\percent} & $\sigma=2.39$ & \SI{98.54}{\percent} & \SI{78.66}{\percent} & \hphantom{0}66 -- \hphantom{0}72 & 67.2\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on STL-10]{Training and test
accuracy of adjusted baseline models trained with different activation
functions on STL-10. For \gls{LReLU}, $\alpha = 0.3$ was chosen.}
\label{table:STL-10-accuracies-activation-functions}
\end{table}
\begin{figure}[ht]
\centering
\begin{tikzpicture}
\definecolor{color1}{HTML}{E66101}
\definecolor{color2}{HTML}{FDB863}
\definecolor{color3}{HTML}{B2ABD2}
\definecolor{color4}{HTML}{5E3C99}
\begin{axis}[
legend pos=north west,
legend cell align={left},
axis x line=middle,
axis y line=middle,
x tick label style={/pgf/number format/fixed,
/pgf/number format/fixed zerofill,
/pgf/number format/precision=1},
y tick label style={/pgf/number format/fixed,
/pgf/number format/fixed zerofill,
/pgf/number format/precision=1},
grid = major,
width=16cm,
height=8cm,
grid style={dashed, gray!30},
xmin=-2, % start the diagram at this x-coordinate
xmax= 2, % end the diagram at this x-coordinate
ymin=-1, % start the diagram at this y-coordinate
ymax= 2, % end the diagram at this y-coordinate
xlabel=x,
ylabel=y,
tick align=outside,
enlargelimits=false]
\addplot[domain=-2:2, color1, ultra thick,samples=500] {1/(1+exp(-x))};
\addplot[domain=-2:2, color2, ultra thick,samples=500] {tanh(x)};
\addplot[domain=-2:2, color4, ultra thick,samples=500] {max(0, x)};
\addplot[domain=-2:2, color4, ultra thick,samples=500, dashed] {ln(exp(x) + 1)};
\addplot[domain=-2:2, color3, ultra thick,samples=500, dotted] {min(max(0, x), exp(x) - 1)};
\addlegendentry{$\varphi_1(x)=\frac{1}{1+e^{-x}}$}
\addlegendentry{$\varphi_2(x)=\tanh(x)$}
\addlegendentry{$\varphi_3(x)=\max(0, x)$}
\addlegendentry{$\varphi_4(x)=\log(e^x + 1)$}
\addlegendentry{$\varphi_5(x)=\min(\max(0, x), e^x - 1)$}
\end{axis}
\end{tikzpicture}
\caption[Activation functions]{Activation functions plotted in $[-2, +2]$.
$\tanh$ and ELU can produce negative values. The image of
ELU, ReLU and Softplus is not bounded on the positive side, whereas
$\tanh$ and the logistic function are always below~1.}
\label{fig:activation-functions-plot}
\end{figure}
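
Note that for $\alpha = 1$, as plotted above, ELU is continuously
differentiable at~0: the left-hand derivative
$\lim_{x \to 0^-} \alpha e^x = \alpha = 1$ coincides with the right-hand
derivative~$1$, whereas \gls{LReLU} retains a kink at~0 for every
$\alpha \neq 1$.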

\glsreset{LReLU}
\twocolumn