%!TEX root = main.tex
\appendix
\onecolumn
\section*{Overview}
\begin{table}[H]
\centering
\hspace*{-1cm}\begin{tabular}{lllll}
\toprule
Name & Function $\varphi(x)$ & Range of Values & $\varphi'(x)$ & Used by \\\midrule %
Sign function$^\dagger$ & $\begin{cases}+1 &\text{if } x \geq 0\\-1 &\text{if } x < 0\end{cases}$ & $\Set{-1,1}$ & $0$ & \cite{971754} \\
\parbox[t]{2.6cm}{Heaviside\\step function$^\dagger$} & $\begin{cases}+1 &\text{if } x \geq 0\\0 &\text{if } x < 0\end{cases}$ & $\Set{0, 1}$ & $0$ & \cite{mcculloch1943logical}\\
Logistic function & $\frac{1}{1+e^{-x}}$ & $[0, 1]$ & $\frac{e^x}{(e^x +1)^2}$ & \cite{duch1999survey} \\
Tanh & $\frac{e^x - e^{-x}}{e^x + e^{-x}} = \tanh(x)$ & $[-1, 1]$ & $\sech^2(x)$ & \cite{LeNet-5,Thoma:2014}\\
\gls{ReLU}$^\dagger$ & $\max(0, x)$ & $[0, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & \cite{AlexNet-2012}\\
\parbox[t]{2.6cm}{\gls{LReLU}$^\dagger$\footnotemark\\(\gls{PReLU})} & $\varphi(x) = \max(\alpha x, x)$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha &\text{if } x < 0\end{cases}$ & \cite{maas2013rectifier,he2015delving} \\
Softplus & $\log(e^x + 1)$ & $(0, +\infty)$ & $\frac{e^x}{e^x + 1}$ & \cite{dugas2001incorporating,glorot2011deep} \\
\gls{ELU} & $\begin{cases}x &\text{if } x > 0\\\alpha (e^x - 1) &\text{if } x \leq 0\end{cases}$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha e^x &\text{otherwise}\end{cases}$ & \cite{clevert2015fast} \\
Softmax$^\ddagger$ & $o(\mathbf{x})_j = \frac{e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & $[0, 1]^K$ & $o(\mathbf{x})_j \cdot \frac{\sum_{k=1}^K e^{x_k} - e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & \cite{AlexNet-2012,Thoma:2014}\\
Maxout$^\ddagger$ & $o(\mathbf{x}) = \max_{x \in \mathbf{x}} x$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x_i = \max \mathbf{x}\\0 &\text{otherwise}\end{cases}$ & \cite{goodfellow2013maxout} \\
\bottomrule
\end{tabular}
\caption[Activation functions]{Overview of activation functions. Functions
marked with $\dagger$ are not differentiable at 0 and functions
marked with $\ddagger$ operate on all elements of a layer
simultaneously. The hyperparameter $\alpha \in (0, 1)$ of Leaky
ReLU is typically $\alpha = 0.01$; for ELU, $\alpha = 1$ is a
common choice. Other activation functions, such as randomized
leaky ReLUs, exist~\cite{xu2015empirical}, but are far less
commonly used.\\
Some functions are smoothed versions of others: the logistic
function for the Heaviside step function, tanh for the sign
function, and softplus for ReLU.\\
Softmax is the standard activation function for the last layer of
a classification network as it produces a probability
distribution. See \Cref{fig:activation-functions-plot} for a plot
of some of them.}
\label{table:activation-functions-overview}
\end{table}
\footnotetext{$\alpha$ is a hyperparameter in leaky ReLU, but a learnable parameter in the parametric ReLU function.}
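For reference, the entries of the derivative column in
\Cref{table:activation-functions-overview} follow from elementary
calculus. As a short check for the logistic function
$\varphi(x) = \frac{1}{1+e^{-x}}$ and for softplus:
% Assumes amsmath, which the cases environments in the table above already require.
\begin{align*}
    \frac{\mathrm{d}}{\mathrm{d}x} \frac{1}{1+e^{-x}}
        &= \frac{e^{-x}}{(1+e^{-x})^2}
         = \frac{e^x}{(e^x + 1)^2}
         = \varphi(x) \bigl(1 - \varphi(x)\bigr), \\
    \frac{\mathrm{d}}{\mathrm{d}x} \log(e^x + 1)
        &= \frac{e^x}{e^x + 1}
         = \frac{1}{1+e^{-x}}.
\end{align*}
The derivative of softplus is thus exactly the logistic function, just
as the derivative of ReLU is (apart from the point $x = 0$) the
Heaviside step function, which matches the smoothing relations stated
in the caption. Likewise, the softmax outputs satisfy
$\sum_{j=1}^{K} o(\mathbf{x})_j = \frac{\sum_{j=1}^{K} e^{x_j}}{\sum_{k=1}^{K} e^{x_k}} = 1$
with $o(\mathbf{x})_j \in (0, 1)$, so they form a probability
distribution, and the tabulated derivative simplifies to
$o(\mathbf{x})_j \bigl(1 - o(\mathbf{x})_j\bigr)$.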
\section*{Evaluation Results}
\glsunset{LReLU}
\begin{table}[H]
\centering
\begin{tabular}{@{\extracolsep{4pt}}lcccccc@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} \\\cline{2-5}\cline{6-7}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Training set & Test set \\\midrule
Identity & \SI{66.25}{\percent} & $\boldsymbol{\sigma=0.77}$ &\SI{56.74}{\percent} & $\boldsymbol{\sigma=0.51}$ & \SI{68.77}{\percent} & \SI{58.78}{\percent}\\
Logistic & \SI{51.87}{\percent} & $\sigma=3.64$ &\SI{46.54}{\percent} & $\sigma=3.22$ & \SI{61.19}{\percent} & \SI{54.58}{\percent}\\
Logistic$^-$ & \SI{66.49}{\percent} & $\sigma=1.99$ &\SI{57.84}{\percent} & $\sigma=1.15$ & \SI{69.04}{\percent} & \SI{60.10}{\percent}\\
Softmax & \SI{75.22}{\percent} & $\sigma=2.41$ &\SI{59.49}{\percent} & $\sigma=1.25$ & \SI{78.87}{\percent} & \SI{63.06}{\percent}\\
Tanh & \SI{67.27}{\percent} & $\sigma=2.38$ &\SI{55.70}{\percent} & $\sigma=1.44$ & \SI{70.21}{\percent} & \SI{58.10}{\percent}\\
Softsign & \SI{66.43}{\percent} & $\sigma=1.74$ &\SI{55.75}{\percent} & $\sigma=0.93$ & \SI{69.78}{\percent} & \SI{58.40}{\percent}\\
\gls{ReLU} & \SI{78.62}{\percent} & $\sigma=2.15$ &\SI{62.18}{\percent} & $\sigma=0.99$ & \SI{81.81}{\percent} & \SI{64.57}{\percent}\\
\gls{ReLU}$^-$ & \SI{76.01}{\percent} & $\sigma=2.31$ &\SI{62.87}{\percent} & $\sigma=1.08$ & \SI{78.18}{\percent} & \SI{64.81}{\percent}\\
Softplus & \SI{66.75}{\percent} & $\sigma=2.45$ &\SI{56.68}{\percent} & $\sigma=1.32$ & \SI{71.27}{\percent} & \SI{60.26}{\percent}\\
S2ReLU & \SI{63.32}{\percent} & $\sigma=1.69$ &\SI{56.99}{\percent} & $\sigma=1.14$ & \SI{65.80}{\percent} & \SI{59.20}{\percent}\\
\gls{LReLU} & \SI{74.92}{\percent} & $\sigma=2.49$ &\SI{61.86}{\percent} & $\sigma=1.23$ & \SI{77.67}{\percent} & \SI{64.01}{\percent}\\
\gls{PReLU} & \textbf{\SI{80.01}{\percent}} & $\sigma=2.03$ &\SI{62.16}{\percent} & $\sigma=0.73$ & \textbf{\SI{83.50}{\percent}} & \textbf{\SI{64.79}{\percent}}\\
\gls{ELU} & \SI{76.64}{\percent} & $\sigma=1.48$ &\textbf{\SI{63.38}{\percent}} & $\sigma=0.55$ & \SI{78.30}{\percent} & \SI{64.70}{\percent}\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on CIFAR-100]{Training and
test accuracy of adjusted baseline models trained with different
activation functions on CIFAR-100. For \gls{LReLU}, $\alpha = 0.3$ was
chosen.}
\label{table:CIFAR-100-accuracies-activation-functions}
\end{table}
\begin{table}[H]
\centering
\setlength\tabcolsep{1.5pt}
\begin{tabular}{@{\extracolsep{4pt}}lcccccccr@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} & \multicolumn{2}{c}{Epochs}\\\cline{2-5}\cline{6-7}\cline{8-9}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Train & Test & Range & \multicolumn{1}{c}{Mean} \\\midrule
Identity & \SI{87.92}{\percent} & $\sigma=0.40$ & \SI{84.69}{\percent} & $\sigma=0.08$ & \SI{88.59}{\percent} & \SI{85.43}{\percent} & \hphantom{0}92 -- 140 & 114.5\\%TODO: Really?
Logistic & \SI{81.46}{\percent} & $\sigma=5.08$ & \SI{79.67}{\percent} & $\sigma=4.85$ & \SI{86.38}{\percent} & \SI{84.60}{\percent} & \hphantom{0}\textbf{58} -- \hphantom{0}\textbf{91} & \textbf{77.3}\\
Softmax & \SI{88.19}{\percent} & $\sigma=0.31$ & \SI{84.70}{\percent} & $\sigma=0.15$ & \SI{88.69}{\percent} & \SI{85.43}{\percent} & 124 -- 171& 145.8\\
Tanh & \SI{88.41}{\percent} & $\sigma=0.36$ & \SI{84.46}{\percent} & $\sigma=0.27$ & \SI{89.24}{\percent} & \SI{85.45}{\percent} & \hphantom{0}89 -- 123 & 108.7\\
Softsign & \SI{88.00}{\percent} & $\sigma=0.47$ & \SI{84.46}{\percent} & $\sigma=0.23$ & \SI{88.77}{\percent} & \SI{85.33}{\percent} & \hphantom{0}77 -- 119 & 104.1\\
\gls{ReLU} & \SI{88.93}{\percent} & $\sigma=0.46$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.21$ & \SI{89.35}{\percent} & \SI{85.95}{\percent} & \hphantom{0}96 -- 132 & 102.8\\
Softplus & \SI{88.42}{\percent} & $\boldsymbol{\sigma=0.29}$ & \SI{85.16}{\percent} & $\sigma=0.15$ & \SI{88.90}{\percent} & \SI{85.73}{\percent} & 108 -- 143 & 121.0\\
\gls{LReLU} & \SI{88.61}{\percent} & $\sigma=0.41$ & \SI{85.21}{\percent} & $\boldsymbol{\sigma=0.05}$ & \SI{89.07}{\percent} & \SI{85.83}{\percent} & \hphantom{0}87 -- 117 & 104.5\\
\gls{PReLU} & \textbf{\SI{89.62}{\percent}} & $\sigma=0.41$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.17$& \textbf{\SI{90.10}{\percent}} & \SI{86.01}{\percent} & \hphantom{0}85 -- 111 & 100.5\\
\gls{ELU} & \SI{89.49}{\percent} & $\sigma=0.42$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.10$ & \SI{89.94}{\percent} & \textbf{\SI{86.03}{\percent}} & \hphantom{0}73 -- 113 & 92.4\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on HASYv2]{Training and
test accuracy as well as the range and mean of training epochs of
adjusted baseline models trained with different activation
functions on HASYv2. For \gls{LReLU}, $\alpha = 0.3$ was chosen.}
\label{table:HASYv2-accuracies-activation-functions}
\end{table}
\begin{table}[H]
\centering
\setlength\tabcolsep{1.5pt}
\begin{tabular}{@{\extracolsep{4pt}}lcccccccr@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} & \multicolumn{2}{c}{Epochs}\\\cline{2-5}\cline{6-7}\cline{8-9}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Train & Test & Range & \multicolumn{1}{c}{Mean} \\\midrule
Identity & \SI{87.49}{\percent} & $\sigma=2.50$ & \SI{69.86}{\percent} & $\sigma=1.41$ & \SI{89.78}{\percent} & \SI{71.90}{\percent} & \hphantom{0}51 -- \hphantom{0}65 & 53.4\\
Logistic & \SI{45.32}{\percent} & $\sigma=14.88$& \SI{40.85}{\percent} & $\sigma=12.56$ & \SI{51.06}{\percent} & \SI{45.49}{\percent} & \hphantom{0}38 -- \hphantom{0}93 & 74.6\\
Softmax & \SI{87.90}{\percent} & $\sigma=3.58$ & \SI{67.91}{\percent} & $\sigma=2.32$ & \SI{91.51}{\percent} & \SI{70.96}{\percent} & 108 -- 150 & 127.5\\
Tanh & \SI{85.38}{\percent} & $\sigma=4.04$ & \SI{67.65}{\percent} & $\sigma=2.01$ & \SI{90.47}{\percent} & \SI{71.29}{\percent} & \hphantom{0}48 -- \hphantom{0}92 & 65.2\\
Softsign & \SI{88.57}{\percent} & $\sigma=4.00$ & \SI{69.32}{\percent} & $\sigma=1.68$ & \SI{93.04}{\percent} & \SI{72.40}{\percent} & \hphantom{0}55 -- 117 & 83.2\\
\gls{ReLU} & \SI{94.35}{\percent} & $\sigma=3.38$ & \SI{71.01}{\percent} & $\sigma=1.63$ & \SI{98.20}{\percent} & \SI{74.85}{\percent} & \hphantom{0}52 -- \hphantom{0}98 & 75.5\\
Softplus & \SI{83.03}{\percent} & $\sigma=2.07$ & \SI{68.28}{\percent} & $\sigma=1.74$ & \SI{93.04}{\percent} & \SI{75.99}{\percent} & \hphantom{0}56 -- \hphantom{0}89 & 68.9\\
\gls{LReLU} & \SI{93.83}{\percent} & $\sigma=3.89$ & \SI{74.66}{\percent} & $\sigma=2.11$ & \SI{97.56}{\percent} & \SI{78.08}{\percent} & \hphantom{0}52 -- 120 & 80.1\\
\gls{PReLU} & \SI{95.53}{\percent} & $\sigma=1.92$ & \SI{71.69}{\percent} & $\sigma=1.37$ & \SI{98.17}{\percent} & \SI{74.69}{\percent} & \hphantom{0}59 -- 101 & 78.8\\
\gls{ELU} & \SI{95.42}{\percent} & $\sigma=3.57$ & \SI{75.09}{\percent} & $\sigma=2.39$ & \SI{98.54}{\percent} & \SI{78.66}{\percent} & \hphantom{0}66 -- \hphantom{0}72 & 67.2\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on STL-10]{Training and
test accuracy as well as the range and mean of training epochs of
adjusted baseline models trained with different activation
functions on STL-10. For \gls{LReLU}, $\alpha = 0.3$ was chosen.}
\label{table:STL-10-accuracies-activation-functions}
\end{table}
\begin{figure}[ht]
\centering
\begin{tikzpicture}
\definecolor{color1}{HTML}{E66101}
\definecolor{color2}{HTML}{FDB863}
\definecolor{color3}{HTML}{B2ABD2}
\definecolor{color4}{HTML}{5E3C99}
\begin{axis}[
legend pos=north west,
legend cell align={left},
axis x line=middle,
axis y line=middle,
x tick label style={/pgf/number format/fixed,
/pgf/number format/fixed zerofill,
/pgf/number format/precision=1},
y tick label style={/pgf/number format/fixed,
/pgf/number format/fixed zerofill,
/pgf/number format/precision=1},
grid = major,
width=16cm,
height=8cm,
grid style={dashed, gray!30},
xmin=-2, % start the diagram at this x-coordinate
xmax= 2, % end the diagram at this x-coordinate
ymin=-1, % start the diagram at this y-coordinate
ymax= 2, % end the diagram at this y-coordinate
xlabel=x,
ylabel=y,
tick align=outside,
enlargelimits=false]
\addplot[domain=-2:2, color1, ultra thick,samples=500] {1/(1+exp(-x))};
\addplot[domain=-2:2, color2, ultra thick,samples=500] {tanh(x)};
\addplot[domain=-2:2, color4, ultra thick,samples=500] {max(0, x)};
\addplot[domain=-2:2, color4, ultra thick,samples=500, dashed] {ln(exp(x) + 1)};
\addplot[domain=-2:2, color3, ultra thick,samples=500, dotted] {max(0, x) + min(0, exp(x) - 1)}; % ELU with alpha = 1
\addlegendentry{$\varphi_1(x)=\frac{1}{1+e^{-x}}$}
\addlegendentry{$\varphi_2(x)=\tanh(x)$}
\addlegendentry{$\varphi_3(x)=\max(0, x)$}
\addlegendentry{$\varphi_4(x)=\log(e^x + 1)$}
\addlegendentry{$\varphi_5(x)=\max(0, x) + \min(0, e^x - 1)$}
\end{axis}
\end{tikzpicture}
\caption[Activation functions]{Activation functions plotted on $[-2, +2]$.
$\tanh$ and ELU can produce negative values. The images of
ELU, ReLU and Softplus are not bounded from above, whereas
$\tanh$ and the logistic function always stay below~1.}
\label{fig:activation-functions-plot}
\end{figure}
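The ELU curve in \Cref{fig:activation-functions-plot} is drawn with
$\alpha = 1$. A short check with the definition from
\Cref{table:activation-functions-overview} shows why this choice joins
the two branches smoothly at the origin:
\begin{align*}
    \lim_{x \to 0^-} \varphi(x) &= \lim_{x \to 0^-} \alpha (e^x - 1) = 0
        = \lim_{x \to 0^+} x = \lim_{x \to 0^+} \varphi(x),\\
    \lim_{x \to 0^-} \varphi'(x) &= \alpha e^{0} = \alpha,
        \qquad
        \lim_{x \to 0^+} \varphi'(x) = 1,
\end{align*}
so ELU is continuous at~$0$ for every $\alpha$ and differentiable
at~$0$ exactly when $\alpha = 1$, the value used in the plot.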
\glsreset{LReLU}
\twocolumn