
publications/activation-functions: Add

Martin Thoma 2017-07-07 08:15:44 +02:00
parent fe5de214ef
commit 8faca5d36b
11 changed files with 17337 additions and 0 deletions

@@ -0,0 +1,2 @@
main.tex toplevelfile
nohypertex

File diff suppressed because it is too large.

@@ -0,0 +1,23 @@
DOKUMENT = main
make:
pdflatex $(DOKUMENT).tex -output-format=pdf # create references
makeglossaries $(DOKUMENT)
bibtex $(DOKUMENT)
pdflatex $(DOKUMENT).tex -output-format=pdf # resolve references
pdflatex $(DOKUMENT).tex -output-format=pdf # resolve references
# make clean
ebook:
latexml --dest=$(DOKUMENT).xml $(DOKUMENT).tex
latexmlpost -dest=$(DOKUMENT).html $(DOKUMENT).xml
ebook-convert $(DOKUMENT).html $(DOKUMENT).epub --language de --no-default-epub-cover
arxiv:
zip -r upload.zip . -x \*.git\* -x MAKEFILE -x *.zip -x *.pdf
clean:
rm -rf $(TARGET) *.class *.html *.aux *.out *.thm *.idx *.toc *.ilg *.glg *.glo *.gls *.ist *.xdy *.fdb_latexmk *.bak *.blg *.glsdefs *.acn *.acr *.alg *.nls *.nlo *.bak *.pyg *.lot *.lof *.xmpdata *.xmpi *.bbl
rm -rf _minted-booka4
rm -rf *.log # Analyze this for errors
# rm -rf *.bbl *.ind # Needed for arxiv
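# Usage note (summary of the targets above): `make` builds $(DOKUMENT).pdf with
# pdflatex, makeglossaries and bibtex, running pdflatex repeatedly to resolve
# references; `make ebook` converts the document to EPUB via LaTeXML;
# `make arxiv` zips the sources for upload; `make clean` removes auxiliary files.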

@@ -0,0 +1,7 @@
\begin{abstract}
This paper reviews the most common activation functions for convolutional
neural networks. They are evaluated on the TODO dataset and possible reasons
for the differences in their performance are given.
New state-of-the-art results are achieved for TODO.
\end{abstract}

@@ -0,0 +1,123 @@
%!TEX root = main.tex
\appendix
\onecolumn
\section*{Overview}
\begin{table}[H]
\centering
\hspace*{-1cm}\begin{tabular}{lllll}
\toprule
Name & Function $\varphi(x)$ & Range of Values & $\varphi'(x)$ \\\midrule % & Used by
Sign function$^\dagger$ & $\begin{cases}+1 &\text{if } x \geq 0\\-1 &\text{if } x < 0\end{cases}$ & $\Set{-1,1}$ & $0$ \\%& \cite{971754} \\
\parbox[t]{2.6cm}{Heaviside\\step function$^\dagger$} & $\begin{cases}+1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & $\Set{0, 1}$ & $0$ \\%& \cite{mcculloch1943logical}\\
Logistic function & $\frac{1}{1+e^{-x}}$ & $[0, 1]$ & $\frac{e^x}{(e^x +1)^2}$ \\%& \cite{duch1999survey} \\
Tanh & $\frac{e^x - e^{-x}}{e^x + e^{-x}} = \tanh(x)$ & $[-1, 1]$ & $\sech^2(x)$ \\%& \cite{LeNet-5,Thoma:2014}\\
\gls{ReLU}$^\dagger$ & $\max(0, x)$ & $[0, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ \\%& \cite{AlexNet-2012}\\
\parbox[t]{2.6cm}{\gls{LReLU}$^\dagger$\footnotemark\\(\gls{PReLU})} & $\varphi(x) = \max(\alpha x, x)$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha &\text{if } x < 0\end{cases}$ \\%& \cite{maas2013rectifier,he2015delving} \\
Softplus & $\log(e^x + 1)$ & $(0, +\infty)$ & $\frac{e^x}{e^x + 1}$ \\%& \cite{dugas2001incorporating,glorot2011deep} \\
\gls{ELU} & $\begin{cases}x &\text{if } x > 0\\\alpha (e^x - 1) &\text{if } x \leq 0\end{cases}$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha e^x &\text{otherwise}\end{cases}$ \\%& \cite{clevert2015fast} \\
Softmax$^\ddagger$ & $o(\mathbf{x})_j = \frac{e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & $[0, 1]^K$ & $o(\mathbf{x})_j \cdot \frac{\sum_{k=1}^K e^{x_k} - e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ \\%& \cite{AlexNet-2012,Thoma:2014}\\
Maxout$^\ddagger$ & $o(\mathbf{x}) = \max_{x \in \mathbf{x}} x$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x_i = \max \mathbf{x}\\0 &\text{otherwise}\end{cases}$ \\%& \cite{goodfellow2013maxout} \\
\bottomrule
\end{tabular}
\caption[Activation functions]{Overview of activation functions. Functions
marked with $\dagger$ are not differentiable at 0 and functions
marked with $\ddagger$ operate on all elements of a layer
simultaneously. The hyperparameter $\alpha \in (0, 1)$ of Leaky
ReLU and ELU is typically set to $\alpha = 0.01$. Other activation
functions like randomized leaky ReLUs exist~\cite{xu2015empirical},
but are far less commonly used.\\
Some functions are smoothed versions of others: the logistic
function of the Heaviside step function, tanh of the sign
function, and softplus of ReLU.\\
Softmax is the standard activation function for the last layer of
a classification network as it produces a probability
distribution. See \Cref{fig:activation-functions-plot} for a plot
of some of them.}
\label{table:activation-functions-overview}
\end{table}
\footnotetext{$\alpha$ is a hyperparameter in leaky ReLU, but a learnable parameter in the parametric ReLU function.}
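The derivative entries in \Cref{table:activation-functions-overview} can be checked directly. For the logistic function, for instance,
\[\frac{\diff}{\diff x} \frac{1}{1+e^{-x}} = \frac{e^{-x}}{(1+e^{-x})^2} = \frac{e^x}{(e^x + 1)^2},\]
which is the expression listed in the table; the tanh entry follows analogously from $\tanh'(x) = 1 - \tanh^2(x) = \sech^2(x)$.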
\section*{Evaluation Results}
\glsunset{LReLU}
\begin{table}[H]
\centering
\begin{tabular}{@{\extracolsep{4pt}}lcccccc@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} \\\cline{2-3}\cline{4-5}\cline{6-7}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Training set & Test set \\\midrule
Identity & \SI{66.25}{\percent} & $\boldsymbol{\sigma=0.77}$ &\SI{56.74}{\percent} & $\boldsymbol{\sigma=0.51}$ & \SI{68.77}{\percent} & \SI{58.78}{\percent}\\
Logistic & \SI{51.87}{\percent} & $\sigma=3.64$ &\SI{46.54}{\percent} & $\sigma=3.22$ & \SI{61.19}{\percent} & \SI{54.58}{\percent}\\
Logistic$^-$ & \SI{66.49}{\percent} & $\sigma=1.99$ &\SI{57.84}{\percent} & $\sigma=1.15$ & \SI{69.04}{\percent} & \SI{60.10}{\percent}\\
Softmax & \SI{75.22}{\percent} & $\sigma=2.41$ &\SI{59.49}{\percent} & $\sigma=1.25$ & \SI{78.87}{\percent} & \SI{63.06}{\percent}\\
Tanh & \SI{67.27}{\percent} & $\sigma=2.38$ &\SI{55.70}{\percent} & $\sigma=1.44$ & \SI{70.21}{\percent} & \SI{58.10}{\percent}\\
Softsign & \SI{66.43}{\percent} & $\sigma=1.74$ &\SI{55.75}{\percent} & $\sigma=0.93$ & \SI{69.78}{\percent} & \SI{58.40}{\percent}\\
\gls{ReLU} & \SI{78.62}{\percent} & $\sigma=2.15$ &\SI{62.18}{\percent} & $\sigma=0.99$ & \SI{81.81}{\percent} & \SI{64.57}{\percent}\\
\gls{ReLU}$^-$ & \SI{76.01}{\percent} & $\sigma=2.31$ &\SI{62.87}{\percent} & $\sigma=1.08$ & \SI{78.18}{\percent} & \SI{64.81}{\percent}\\
Softplus & \SI{66.75}{\percent} & $\sigma=2.45$ &\SI{56.68}{\percent} & $\sigma=1.32$ & \SI{71.27}{\percent} & \SI{60.26}{\percent}\\
S2ReLU & \SI{63.32}{\percent} & $\sigma=1.69$ &\SI{56.99}{\percent} & $\sigma=1.14$ & \SI{65.80}{\percent} & \SI{59.20}{\percent}\\
\gls{LReLU} & \SI{74.92}{\percent} & $\sigma=2.49$ &\SI{61.86}{\percent} & $\sigma=1.23$ & \SI{77.67}{\percent} & \SI{64.01}{\percent}\\
\gls{PReLU} & \textbf{\SI{80.01}{\percent}} & $\sigma=2.03$ &\SI{62.16}{\percent} & $\sigma=0.73$ & \textbf{\SI{83.50}{\percent}} & \textbf{\SI{64.79}{\percent}}\\
\gls{ELU} & \SI{76.64}{\percent} & $\sigma=1.48$ &\textbf{\SI{63.38}{\percent}} & $\sigma=0.55$ & \SI{78.30}{\percent} & \SI{64.70}{\percent}\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on CIFAR-100]{Training and
test accuracy of adjusted baseline models trained with different
activation functions on CIFAR-100. For LReLU, $\alpha = 0.3$ was
chosen.}
\label{table:CIFAR-100-accuracies-activation-functions}
\end{table}
\glsreset{LReLU}
\begin{table}[H]
\centering
\setlength\tabcolsep{1.5pt}
\begin{tabular}{@{\extracolsep{4pt}}lcccccccr@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} & \multicolumn{2}{c}{Epochs}\\\cline{2-5}\cline{6-7}\cline{8-9}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Train & Test & Range & \multicolumn{1}{c}{Mean} \\\midrule
Identity & \SI{87.92}{\percent} & $\sigma=0.40$ & \SI{84.69}{\percent} & $\sigma=0.08$ & \SI{88.59}{\percent} & \SI{85.43}{\percent} & \hphantom{0}92 -- 140 & 114.5\\%TODO: Really?
Logistic & \SI{81.46}{\percent} & $\sigma=5.08$ & \SI{79.67}{\percent} & $\sigma=4.85$ & \SI{86.38}{\percent} & \SI{84.60}{\percent} & \hphantom{0}\textbf{58} -- \hphantom{0}\textbf{91} & \textbf{77.3}\\
Softmax & \SI{88.19}{\percent} & $\sigma=0.31$ & \SI{84.70}{\percent} & $\sigma=0.15$ & \SI{88.69}{\percent} & \SI{85.43}{\percent} & 124 -- 171& 145.8\\
Tanh & \SI{88.41}{\percent} & $\sigma=0.36$ & \SI{84.46}{\percent} & $\sigma=0.27$ & \SI{89.24}{\percent} & \SI{85.45}{\percent} & \hphantom{0}89 -- 123 & 108.7\\
Softsign & \SI{88.00}{\percent} & $\sigma=0.47$ & \SI{84.46}{\percent} & $\sigma=0.23$ & \SI{88.77}{\percent} & \SI{85.33}{\percent} & \hphantom{0}77 -- 119 & 104.1\\
\gls{ReLU} & \SI{88.93}{\percent} & $\sigma=0.46$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.21$ & \SI{89.35}{\percent} & \SI{85.95}{\percent} & \hphantom{0}96 -- 132 & 102.8\\
Softplus & \SI{88.42}{\percent} & $\boldsymbol{\sigma=0.29}$ & \SI{85.16}{\percent} & $\sigma=0.15$ & \SI{88.90}{\percent} & \SI{85.73}{\percent} & 108 -- 143 & 121.0\\
\gls{LReLU} & \SI{88.61}{\percent} & $\sigma=0.41$ & \SI{85.21}{\percent} & $\boldsymbol{\sigma=0.05}$ & \SI{89.07}{\percent} & \SI{85.83}{\percent} & \hphantom{0}87 -- 117 & 104.5\\
\gls{PReLU} & \textbf{\SI{89.62}{\percent}} & $\sigma=0.41$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.17$& \textbf{\SI{90.10}{\percent}} & \SI{86.01}{\percent} & \hphantom{0}85 -- 111 & 100.5\\
\gls{ELU} & \SI{89.49}{\percent} & $\sigma=0.42$ & \textbf{\SI{85.35}{\percent}} & $\sigma=0.10$ & \SI{89.94}{\percent} & \textbf{\SI{86.03}{\percent}} & \hphantom{0}73 -- 113 & 92.4\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on HASYv2]{Training and test
accuracy of adjusted baseline models trained with different activation
functions on HASYv2. For LReLU, $\alpha = 0.3$ was chosen.}
\label{table:HASYv2-accuracies-activation-functions}
\end{table}
\begin{table}[H]
\centering
\setlength\tabcolsep{1.5pt}
\begin{tabular}{@{\extracolsep{4pt}}lcccccccr@{}}
\toprule
\multirow{2}{*}{Function} & \multicolumn{4}{c}{Single model} & \multicolumn{2}{c}{Ensemble of 10} & \multicolumn{2}{c}{Epochs}\\\cline{2-5}\cline{6-7}\cline{8-9}
& \multicolumn{2}{c}{Training set} &\multicolumn{2}{c}{Test set} & Train & Test & Range & \multicolumn{1}{c}{Mean} \\\midrule
Identity & \SI{87.49}{\percent} & $\sigma=2.50$ & \SI{69.86}{\percent} & $\sigma=1.41$ & \SI{89.78}{\percent} & \SI{71.90}{\percent} & \hphantom{0}51 -- \hphantom{0}65 & 53.4\\
Logistic & \SI{45.32}{\percent} & $\sigma=14.88$& \SI{40.85}{\percent} & $\sigma=12.56$ & \SI{51.06}{\percent} & \SI{45.49}{\percent} & \hphantom{0}38 -- \hphantom{0}93 & 74.6\\
Softmax & \SI{87.90}{\percent} & $\sigma=3.58$ & \SI{67.91}{\percent} & $\sigma=2.32$ & \SI{91.51}{\percent} & \SI{70.96}{\percent} & 108 -- 150 & 127.5\\
Tanh & \SI{85.38}{\percent} & $\sigma=4.04$ & \SI{67.65}{\percent} & $\sigma=2.01$ & \SI{90.47}{\percent} & \SI{71.29}{\percent} & 48 -- \hphantom{0}92 & 65.2\\
Softsign & \SI{88.57}{\percent} & $\sigma=4.00$ & \SI{69.32}{\percent} & $\sigma=1.68$ & \SI{93.04}{\percent} & \SI{72.40}{\percent} & 55 -- 117 & 83.2\\
\gls{ReLU} & \SI{94.35}{\percent} & $\sigma=3.38$ & \SI{71.01}{\percent} & $\sigma=1.63$ & \SI{98.20}{\percent} & \SI{74.85}{\percent} & 52 -- \hphantom{0}98 & 75.5\\
Softplus & \SI{83.03}{\percent} & $\sigma=2.07$ & \SI{68.28}{\percent} & $\sigma=1.74$ & \SI{93.04}{\percent} & \SI{75.99}{\percent} & 56 -- \hphantom{0}89 & 68.9\\
\gls{LReLU} & \SI{93.83}{\percent} & $\sigma=3.89$ & \SI{74.66}{\percent} & $\sigma=2.11$ & \SI{97.56}{\percent} & \SI{78.08}{\percent} & 52 -- 120 & 80.1\\
\gls{PReLU} & \SI{95.53}{\percent} & $\sigma=1.92$ & \SI{71.69}{\percent} & $\sigma=1.37$ & \SI{98.17}{\percent} & \SI{74.69}{\percent} & 59 -- 101 & 78.8\\
\gls{ELU} & \SI{95.42}{\percent} & $\sigma=3.57$ & \SI{75.09}{\percent} & $\sigma=2.39$ & \SI{98.54}{\percent} & \SI{78.66}{\percent} & 66 -- \hphantom{0}72 & 67.2\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on STL-10]{Training and test
accuracy of adjusted baseline models trained with different activation
functions on STL-10. For LReLU, $\alpha = 0.3$ was chosen.}
\label{table:STL-10-accuracies-activation-functions}
\end{table}
\twocolumn

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -0,0 +1,176 @@
%!TEX root = main.tex
\section{Introduction}
TODO\cite{Thoma:2014}
\section{Terminology}
TODO
\section{Activation Functions}
Nonlinear, differentiable activation functions are important for neural
networks to allow them to learn nonlinear decision boundaries. One of the
simplest and most widely used activation functions for \glspl{CNN} is
\gls{ReLU}~\cite{AlexNet-2012}, but others such as
\gls{ELU}~\cite{clevert2015fast}, \gls{PReLU}~\cite{he2015delving}, softplus~\cite{7280459}
and softsign~\cite{bergstra2009quadratic} have been proposed. The baseline uses
\gls{ELU}.
Activation functions differ in their range of values and their derivatives. The
definitions and other comparisons of eleven activation functions are given
in~\cref{table:activation-functions-overview}.
Theoretical explanations for why one activation function is preferable to
another in some scenarios are the following:
\begin{itemize}
\item \textbf{Vanishing Gradient}: Activation functions like tanh and the
logistic function saturate outside of the interval $[-5, 5]$. This means that
weight updates for preceding neurons are very small, which is a problem
especially for very deep or recurrent networks as described
in~\cite{bengio1994learning}; this effect is quantified just after this list.
Even if the neurons eventually learn, learning is slower~\cite{AlexNet-2012}.
\item \textbf{Dying ReLU}: The dying \gls{ReLU} problem is similar to the
vanishing gradient problem. The gradient of the \gls{ReLU} function
is~0 for all non-positive values. This means that if all elements of the
training set lead to a negative input for one neuron at any point during
training, this neuron no longer receives any updates and hence stops
participating in the learning process. This problem is
addressed in~\cite{maas2013rectifier}.
\item \textbf{Mean unit activation}: Some publications
like~\cite{clevert2015fast,BatchNormalization-2015} claim that mean
unit activations close to 0 are desirable because they speed up
learning by reducing the bias shift effect. This speedup is
supported by many experiments. Hence the possibility
of negative activations is desirable.
\end{itemize}
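To make the vanishing gradient issue concrete: the logistic function $\varphi(x) = \frac{1}{1 + e^{-x}}$ has the derivative
\[\varphi'(x) = \varphi(x) \left(1 - \varphi(x)\right) \leq \frac{1}{4},\]
and already at $x = 5$ it is $\varphi'(5) \approx 0.0066$. A gradient propagated through several saturated logistic units is hence damped by orders of magnitude.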
Those considerations are listed
in~\cref{table:properties-of-activation-functions} for 11~activation functions.
Besides the theoretical properties, empirical results are provided
in~\cref{table:CIFAR-100-accuracies-activation-functions,table:CIFAR-100-timing-activation-functions}.
The baseline network was adjusted so that every activation function except that
of the output layer was replaced by one of the 11~activation functions.
As expected, \gls{PReLU} and \gls{ELU} performed best. Unexpectedly, the
logistic function, tanh and softplus performed worse than the identity, and it
is unclear why the pure-softmax network performed so much better than the
logistic function.
One hypothesis for why the logistic function performs so badly is that it cannot
produce negative outputs. Hence the logistic$^-$ function was developed:
\[\text{logistic}^{-}(x) = \frac{1}{1+ e^{-x}} - 0.5\]
The logistic$^-$ function has the same derivative as the logistic function and
hence still suffers from the vanishing gradient problem.
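Indeed, the constant shift does not change the derivative:
\[\frac{\diff}{\diff x} \left(\frac{1}{1 + e^{-x}} - 0.5\right) = \frac{e^x}{(e^x + 1)^2} \leq \frac{1}{4}.\]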
The network with the logistic$^-$ function achieves a test accuracy which is
\SI{11.30}{\percent} (absolute) higher than the network with the logistic
function, but still \SI{5.54}{\percent} (absolute) lower than the one with \gls{ELU}.
Similarly, \gls{ReLU} was adjusted to allow negative outputs:
\[\text{ReLU}^{-}(x) = \max(-1, x) = \text{ReLU}(x+1) - 1\]
The results of \gls{ReLU}$^-$ are much worse on the training set, but similar
on the test set. This result indicates that the possibility of a hard zero, and
thus a sparse representation, is either not important or about as important as
the possibility to produce negative outputs. This
contradicts~\cite{glorot2011deep,srivastava2014understanding}.
A key difference between the logistic$^-$ function and \gls{ELU} is that
\gls{ELU} neither suffers from the vanishing gradient problem nor is its
range of values bounded. For this reason, the S2ReLU activation function was
developed:
\begin{align*}
\StwoReLU(x) &= \ReLU \left (\frac{x}{2} + 1 \right ) - \ReLU \left (-\frac{x}{2} + 1 \right)\\
&=
\begin{cases}\frac{x}{2} - 1 &\text{if } x \le -2\\
x &\text{if } -2 < x < 2\\
\frac{x}{2} + 1&\text{if } x \ge 2\end{cases}
\end{align*}
This function is similar to SReLUs as introduced in~\cite{jin2016deep}. The
difference is that S2ReLU does not introduce learnable parameters. The S2ReLU
was designed to be point symmetric, to be the identity close to zero, and to
have a smaller absolute value than the identity farther away from zero. It is
easy to compute and easy to implement.
Those results --- not only the absolute values, but also the relative
comparison --- might depend on the network architecture, the training
algorithm, the initialization and the dataset. Results for MNIST can be found
in~\cref{table:MNIST-accuracies-activation-functions} and for HASYv2
in~\cref{table:HASYv2-accuracies-activation-functions}. For both datasets, the
logistic function has a much shorter training time and a noticeably lower test
accuracy.
\begin{table}[H]
\centering
\begin{tabular}{lccc}
\toprule
\multirow{2}{*}{Function} & Vanishing & Negative Activation & Bounded \\
& Gradient & possible & activation \\\midrule
Identity & \cellcolor{green!25}No & \cellcolor{green!25} Yes & \cellcolor{green!25}No \\
Logistic & \cellcolor{red!25} Yes & \cellcolor{red!25} No & \cellcolor{red!25} Yes \\
Logistic$^-$ & \cellcolor{red!25} Yes & \cellcolor{green!25} Yes & \cellcolor{red!25} Yes \\
Softmax & \cellcolor{red!25} Yes & \cellcolor{green!25} Yes & \cellcolor{red!25} Yes \\
tanh & \cellcolor{red!25} Yes & \cellcolor{green!25} Yes & \cellcolor{red!25} Yes \\
Softsign & \cellcolor{red!25} Yes & \cellcolor{green!25}Yes & \cellcolor{red!25} Yes \\
ReLU & \cellcolor{yellow!25}Yes\footnotemark & \cellcolor{red!25} No & \cellcolor{yellow!25}Half-sided \\
Softplus & \cellcolor{green!25}No & \cellcolor{red!25} No & \cellcolor{yellow!25}Half-sided \\
S2ReLU & \cellcolor{green!25}No & \cellcolor{green!25}Yes & \cellcolor{green!25} No \\
LReLU/PReLU & \cellcolor{green!25}No & \cellcolor{green!25}Yes & \cellcolor{green!25} No \\
ELU & \cellcolor{green!25}No & \cellcolor{green!25}Yes & \cellcolor{green!25} No \\
\bottomrule
\end{tabular}
\caption[Activation function properties]{Properties of activation functions.}
\label{table:properties-of-activation-functions}
\end{table}
\footnotetext{The dying ReLU problem is similar to the vanishing gradient problem.}
\glsunset{LReLU}
\begin{table}[H]
\centering
\begin{tabular}{lccccc}
\toprule
\multirow{2}{*}{Function} & \multicolumn{2}{c}{Inference per} & Training & \multirow{2}{*}{Epochs} & Mean total \\\cline{2-3}
& 1 Image & 128 Images & time & & training time \\\midrule
Identity & \SI{8}{\milli\second} & \SI{42}{\milli\second} & \SI{31}{\second\per\epoch} & 108 -- \textbf{148} &\SI{3629}{\second} \\
Logistic & \SI{6}{\milli\second} & \textbf{\SI{31}{\milli\second}} & \SI{24}{\second\per\epoch} & \textbf{101} -- 167 &\textbf{\SI{2234}{\second}} \\
Logistic$^-$ & \SI{6}{\milli\second} & \textbf{\SI{31}{\milli\second}} & \textbf{\SI{22}{\second\per\epoch}} & 133 -- 255 &\SI{3421}{\second} \\
Softmax & \SI{7}{\milli\second} & \SI{37}{\milli\second} & \SI{33}{\second\per\epoch} & 127 -- 248 &\SI{5250}{\second} \\
Tanh & \SI{6}{\milli\second} & \textbf{\SI{31}{\milli\second}} & \SI{23}{\second\per\epoch} & 125 -- 211 &\SI{3141}{\second} \\
Softsign & \SI{6}{\milli\second} & \textbf{\SI{31}{\milli\second}} & \SI{23}{\second\per\epoch} & 122 -- 205 &\SI{3505}{\second} \\
\gls{ReLU} & \SI{6}{\milli\second} & \textbf{\SI{31}{\milli\second}} & \SI{23}{\second\per\epoch} & 118 -- 192 &\SI{3449}{\second} \\
Softplus & \SI{6}{\milli\second} & \textbf{\SI{31}{\milli\second}} & \SI{24}{\second\per\epoch} & \textbf{101} -- 165 &\SI{2718}{\second} \\
S2ReLU & \textbf{\SI{5}{\milli\second}} & \SI{32}{\milli\second} & \SI{26}{\second\per\epoch} & 108 -- 209 &\SI{3231}{\second} \\
\gls{LReLU} & \SI{7}{\milli\second} & \SI{34}{\milli\second} & \SI{25}{\second\per\epoch} & 109 -- 198 &\SI{3388}{\second} \\
\gls{PReLU} & \SI{7}{\milli\second} & \SI{34}{\milli\second} & \SI{28}{\second\per\epoch} & 131 -- 215 &\SI{3970}{\second} \\
\gls{ELU} & \SI{6}{\milli\second} & \textbf{\SI{31}{\milli\second}} & \SI{23}{\second\per\epoch} & 146 -- 232 &\SI{3692}{\second} \\
\bottomrule
\end{tabular}
\caption[Activation function timing results on CIFAR-100]{Training time and
inference time of adjusted baseline models trained with different
activation functions on GTX~970 \glspl{GPU} on CIFAR-100. It was
expected that the identity would be the fastest function; the fact that
it is not is likely an implementation-specific issue of Keras~2.0.4 or
TensorFlow~1.1.0.}
\label{table:CIFAR-100-timing-activation-functions}
\end{table}
\begin{table}[H]
\centering
\begin{tabular}{lccccc}
\toprule
\multirow{2}{*}{Function} & \multicolumn{2}{c}{Single model} & Ensemble & \multicolumn{2}{c}{Epochs}\\\cline{2-3}\cline{5-6}
& Accuracy & std & Accuracy & Range & Mean \\\midrule
Identity & \SI{99.45}{\percent} & $\sigma=0.09$ & \SI{99.63}{\percent} & 55 -- \hphantom{0}77 & 62.2\\%TODO: Really?
Logistic & \SI{97.27}{\percent} & $\sigma=2.10$ & \SI{99.48}{\percent} & \textbf{37} -- \hphantom{0}76 & \textbf{54.5}\\
Softmax & \SI{99.60}{\percent} & $\boldsymbol{\sigma=0.03}$& \SI{99.63}{\percent} & 44 -- \hphantom{0}73 & 55.6\\
Tanh & \SI{99.40}{\percent} & $\sigma=0.09$ & \SI{99.57}{\percent} & 56 -- \hphantom{0}80 & 67.6\\
Softsign & \SI{99.40}{\percent} & $\sigma=0.08$ & \SI{99.57}{\percent} & 72 -- 101 & 84.0\\
\gls{ReLU} & \textbf{\SI{99.62}{\percent}} & $\sigma=0.04$ & \textbf{\SI{99.73}{\percent}} & 51 -- \hphantom{0}94 & 71.7\\
Softplus & \SI{99.52}{\percent} & $\sigma=0.05$ & \SI{99.62}{\percent} & 62 -- \hphantom{0}\textbf{70} & 68.9\\
\gls{PReLU} & \SI{99.57}{\percent} & $\sigma=0.07$ & \textbf{\SI{99.73}{\percent}} & 44 -- \hphantom{0}89 & 71.2\\
\gls{ELU} & \SI{99.53}{\percent} & $\sigma=0.06$ & \SI{99.58}{\percent} & 45 -- 111 & 72.5\\
\bottomrule
\end{tabular}
\caption[Activation function evaluation results on MNIST]{Test accuracy of
adjusted baseline models trained with different activation
functions on MNIST.}
\label{table:MNIST-accuracies-activation-functions}
\end{table}
\glsreset{LReLU}

Binary file not shown (new image, 10 KiB).

@@ -0,0 +1,43 @@
%!TEX root = booka4.tex
%Term definitions
\newacronym{ANN}{ANN}{artificial neural network}
\newacronym{ANOVA}{ANOVA}{analysis of variance}
\newacronym{ASO}{ASO}{Automatic Structure Optimization}
\newacronym{CMO}{CMO}{Confusion Matrix Ordering}
\newacronym{CE}{CE}{cross entropy}
\newacronym{CUDA}{CUDA}{Compute Unified Device Architecture}
\newacronym{CNN}{CNN}{Convolutional Neural Network}
\newacronym{CSR}{CSR}{cursive script recognition}
\newacronym{CFM}{CFM}{classification figure of merit}
\newacronym{DTW}{DTW}{dynamic time warping}
\newacronym{ELU}{ELU}{Exponential Linear Unit}
\newacronym{ES}{ES}{early stopping}
\newacronym{FLOP}{FLOP}{floating point operation}
\newacronym{FC}{FC}{Fully Connected}
\newacronym{GA}{GA}{genetic algorithm}
\newacronym{GPU}{GPU}{graphics processing unit}
\newacronym{GAN}{GAN}{Generative Adversarial Network}
\newacronym{GMM}{GMM}{Gaussian mixture model}
\newacronym{GTW}{GTW}{greedy time warping}
\newacronym{HMM}{HMM}{hidden Markov model}
\newacronym{HWR}{HWR}{handwriting recognition}
\newacronym{HWRT}{HWRT}{handwriting recognition toolkit}
\newacronym{HSV}{HSV}{hue, saturation, value}
\newacronym{LReLU}{LReLU}{leaky rectified linear unit}
\newacronym{LDA}{LDA}{linear discriminant analysis}
\newacronym{LCN}{LCN}{Local Contrast Normalization}
\newacronym{MLP}{MLP}{multilayer perceptron}
\newacronym{MSE}{MSE}{mean squared error}
\newacronym{NAG}{NAG}{Nesterov Accelerated Gradient}
\newacronym{NEAT}{NEAT}{NeuroEvolution of Augmenting Topologies}
\newacronym{OBD}{OBD}{Optimal Brain Damage}
\newacronym{OOV}{OOV}{out of vocabulary}
\newacronym{PCA}{PCA}{principal component analysis}
\newacronym{PyPI}{PyPI}{Python Package Index}
\newacronym{PReLU}{PReLU}{parametric rectified linear unit}
\newacronym{SGD}{SGD}{stochastic gradient descent}
\newacronym{TDNN}{TDNN}{time delay neural network}
\newacronym{SVM}{SVM}{support vector machine}
\newacronym{SLP}{SLP}{supervised layer-wise pretraining}
\newacronym{ReLU}{ReLU}{rectified linear unit}
\newacronym{ZCA}{ZCA}{Zero Components Analysis}

@@ -0,0 +1,99 @@
\documentclass[technote,a4paper,leqno]{IEEEtran}
\pdfoutput=1
\usepackage[utf8]{inputenc} % this is needed for umlauts
\usepackage[USenglish]{babel} % this is needed for umlauts
\usepackage[T1]{fontenc} % this is needed for correct output of umlauts in pdf
\usepackage{amsmath,amssymb}
\usepackage[table]{xcolor}
\usepackage[absolute,overlay]{textpos}
\usepackage{tikz}
\usepackage{csquotes}
\usepackage[binary-units,group-separator={,}]{siunitx}
\sisetup{per-mode=fraction,
binary-units=true,
group-separator = {\,},
range-phrase=-,
detect-weight=true,
detect-family=true}
\DeclareSIUnit\pixel{px}
\DeclareSIUnit\epoch{epoch}
\DeclareSIUnit\float{float}
\DeclareSIUnit\floats{floats}
\usepackage{caption} % nicer captions
\usepackage{url}
\usepackage{breakurl}
\usepackage[raiselinks=true,
bookmarks=true,
bookmarksopenlevel=1,
bookmarksopen=true,
bookmarksnumbered=true,
breaklinks,
hyperindex=true,
plainpages=false,
pdfpagelabels=true,
pdfborder={0 0 0.5}]{hyperref}
\def\UrlBreaks{\do\/\do-}
\usepackage{xspace}
\newcommand*\elide{\textup{[\,\dots]}\xspace}
\usepackage[nameinlink, noabbrev,capitalise]{cleveref}
\title{A review of activation functions for convolutional neural networks}
\author{%
\IEEEauthorblockN{Martin Thoma}\\
\IEEEauthorblockA{E-Mail: info@martin-thoma.de} % ORCID: http://orcid.org/0000-0002-6517-1690
}
\hypersetup{
pdfauthor = {Martin Thoma},
pdfkeywords = {activation functions, review},
pdfsubject = {activation functions},
pdftitle = {A review of activation functions for convolutional neural networks},
}
\usepackage[inline]{enumitem}
\usepackage{longtable}
\usepackage{booktabs} % \toprule
\usepackage{braket} % needed for \Set
\usepackage{algorithm,algpseudocode}
\usepackage[xindy,toc,section=chapter,numberedsection=autolabel]{glossaries}
% Make document nicer
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\sech}{sech}
\DeclareMathOperator*{\conv}{conv}
\DeclareMathOperator*{\ReLU}{ReLU}
\DeclareMathOperator*{\StwoReLU}{S2ReLU}
\DeclareMathOperator*{\logistic}{logistic}
\newcommand*\diff{\mathop{}\!\mathrm{d}}
\usepackage{tensor}
\usepackage{parskip}
\usepackage{multirow}
\usepackage{microtype}
\loadglsentries[main]{glossary}
\makeglossaries
% % Variables
% \newcommand{\dbTotalClasses}{369}
% \newcommand{\dbTotalInstances}{\num{168233}}
% \newcommand{\dbName}{HASY}
% \newcommand{\dbNameVersion}{HASYv2}
% \newcommand{\dbSizeMB}{34.6}
% \newcommand{\dbDownloadURL}{\url{https://doi.org/10.5281/zenodo.259444}}
% \newcommand{\dbMDfivesum}{fddf23f36e24b5236f6b3a0880c778e3}
% Start
\begin{document}
\maketitle
\input{abstract}
\input{content}
\bibliographystyle{IEEEtranSA}
\bibliography{bibliography}
\input{appendix}
\end{document}