diff --git a/documents/cv-curriculum-vitae/cv-curriculum-vitae.tex b/documents/cv-curriculum-vitae/cv-curriculum-vitae.tex
index a2525fe..4dd9e22 100644
--- a/documents/cv-curriculum-vitae/cv-curriculum-vitae.tex
+++ b/documents/cv-curriculum-vitae/cv-curriculum-vitae.tex
@@ -163,22 +163,6 @@ Coding conventions and basic OOP was part of the course. All of my German presen and a big, but algorithmically not challenging project. To be honest, I only fixed some Java bugs.}\\
-%----------------------------------------------------------------------------------------
-% WORK EXPERIENCE -2-
-
-{\raggedleft\textsc{2011}\par}
-
-{\raggedright\large Student research assistant at \textsc{ Institute of Toxicology and Genetics}, KIT\\
-\textit{participating in a university research project}\\[5pt]}
-
-\normalsize{In summer 2011 I worked for over a month for a
-research project at KIT. I have written bash scripts for file
-conversions, fixed some bugs and re-written a slow Mathematica script
-in a much faster Python version. But it quickly turned out that
-this project had a lot of C++ source which was rarely commented or
-documented. I realized, that I wouldn't have time for this project
-after beginning my studies at university.}\\
-
%----------------------------------------------------------------------------------------
% WORK EXPERIENCE -4-
@@ -208,7 +192,7 @@ after beginning my studies at university.}\\
\colorbox{shade}{\textcolor{text1}{
\begin{tabular}{c|p{7cm}}
-\raisebox{-4pt}{\textifsymbol{18}} & Parkstraße 17, 76131 Karlsruhe \\ % Address
+\raisebox{-4pt}{\textifsymbol{18}} & Alte Allee 107, 81245 Munich \\ % Address
\raisebox{-3pt}{\Mobilefone} & +49 $($1636$)$ 28 04 91 \\ % Phone number
\raisebox{-1pt}{\Letter} & \href{mailto:info@martin-thoma.de}{info@martin-thoma.de} \\ % Email address
\Keyboard & \href{http://martin-thoma.com}{martin-thoma.com} \\ % Website
@@ -331,6 +315,22 @@ Good Knowledge & \textsc{Python}\\ \\
%----------------------------------------------------------------------------------------
\section{Work Experience}
+%----------------------------------------------------------------------------------------
+% WORK EXPERIENCE -2-
+
+{\raggedleft\textsc{2011}\par}
+
+{\raggedright\large Student research assistant at \textsc{ Institute of Toxicology and Genetics}, KIT\\
+\textit{participating in a university research project}\\[5pt]}
+
+\normalsize{In summer 2011 I worked for over a month on a
+research project at KIT. I wrote bash scripts for file
+conversions, fixed some bugs and rewrote a slow Mathematica script
+as a much faster Python version. But it quickly turned out that
+this project had a lot of C++ source code which was rarely commented or
+documented.
I realized that I wouldn't have time for this project
+after beginning my studies at university.}\\
+
%----------------------------------------------------------------------------------------
% WORK EXPERIENCE -3-
diff --git a/documents/math-minimal-distance-to-cubic-function/math-minimal-distance-to-cubic-function.pdf b/documents/math-minimal-distance-to-cubic-function/math-minimal-distance-to-cubic-function.pdf
index 80554b6..cdf075d 100644
Binary files a/documents/math-minimal-distance-to-cubic-function/math-minimal-distance-to-cubic-function.pdf and b/documents/math-minimal-distance-to-cubic-function/math-minimal-distance-to-cubic-function.pdf differ
diff --git a/publications/activation-functions/abstract.tex b/publications/activation-functions/abstract.tex
index 85d6f1e..6ee756f 100644
--- a/publications/activation-functions/abstract.tex
+++ b/publications/activation-functions/abstract.tex
@@ -1,7 +1,8 @@
\begin{abstract}
This paper reviews the most common activation functions for convolution neural
-networks. They are evaluated on TODO dataset and possible reasons for the
-differences in their performance are given.
+networks. They are evaluated on the Asirra, GTSRB, HASYv2, STL-10, CIFAR-10,
+CIFAR-100 and MNIST datasets. Possible reasons for the differences in their
+performance are given.
-New state of the art results are achieved for TODO.
+New state-of-the-art results are achieved for Asirra, GTSRB, HASYv2 and STL-10.
\end{abstract}
diff --git a/publications/activation-functions/appendix.tex b/publications/activation-functions/appendix.tex
index 4b27cd7..f51a024 100644
--- a/publications/activation-functions/appendix.tex
+++ b/publications/activation-functions/appendix.tex
@@ -7,17 +7,17 @@
\centering
\hspace*{-1cm}\begin{tabular}{lllll}
\toprule
- Name & Function $\varphi(x)$ & Range of Values & $\varphi'(x)$ \\\midrule % & Used by
- Sign function$^\dagger$ & $\begin{cases}+1 &\text{if } x \geq 0\\-1 &\text{if } x < 0\end{cases}$ & $\Set{-1,1}$ & $0$ \\%& \cite{971754} \\
- \parbox[t]{2.6cm}{Heaviside\\step function$^\dagger$} & $\begin{cases}+1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & $\Set{0, 1}$ & $0$ \\%& \cite{mcculloch1943logical}\\
- Logistic function & $\frac{1}{1+e^{-x}}$ & $[0, 1]$ & $\frac{e^x}{(e^x +1)^2}$ \\%& \cite{duch1999survey} \\
- Tanh & $\frac{e^x - e^{-x}}{e^x + e^{-x}} = \tanh(x)$ & $[-1, 1]$ & $\sech^2(x)$ \\%& \cite{LeNet-5,Thoma:2014}\\
- \gls{ReLU}$^\dagger$ & $\max(0, x)$ & $[0, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ \\%& \cite{AlexNet-2012}\\
- \parbox[t]{2.6cm}{\gls{LReLU}$^\dagger$\footnotemark\\(\gls{PReLU})} & $\varphi(x) = \max(\alpha x, x)$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha &\text{if } x < 0\end{cases}$ \\%& \cite{maas2013rectifier,he2015delving} \\
- Softplus & $\log(e^x + 1)$ & $(0, +\infty)$ & $\frac{e^x}{e^x + 1}$ \\%& \cite{dugas2001incorporating,glorot2011deep} \\
- \gls{ELU} & $\begin{cases}x &\text{if } x > 0\\\alpha (e^x - 1) &\text{if } x \leq 0\end{cases}$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha e^x &\text{otherwise}\end{cases}$ \\%& \cite{clevert2015fast} \\
- Softmax$^\ddagger$ & $o(\mathbf{x})_j = \frac{e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & $[0, 1]^K$ & $o(\mathbf{x})_j \cdot \frac{\sum_{k=1}^K e^{x_k} - e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ \\%& \cite{AlexNet-2012,Thoma:2014}\\
- Maxout$^\ddagger$ & $o(\mathbf{x}) = \max_{x \in \mathbf{x}} x$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x_i = \max \mathbf{x}\\0
&\text{otherwise}\end{cases}$ \\%& \cite{goodfellow2013maxout} \\ + Name & Function $\varphi(x)$ & Range of Values & $\varphi'(x)$ & Used by \\\midrule % + Sign function$^\dagger$ & $\begin{cases}+1 &\text{if } x \geq 0\\-1 &\text{if } x < 0\end{cases}$ & $\Set{-1,1}$ & $0$ & \cite{971754} \\ + \parbox[t]{2.6cm}{Heaviside\\step function$^\dagger$} & $\begin{cases}+1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & $\Set{0, 1}$ & $0$ & \cite{mcculloch1943logical}\\ + Logistic function & $\frac{1}{1+e^{-x}}$ & $[0, 1]$ & $\frac{e^x}{(e^x +1)^2}$ & \cite{duch1999survey} \\ + Tanh & $\frac{e^x - e^{-x}}{e^x + e^{-x}} = \tanh(x)$ & $[-1, 1]$ & $\sech^2(x)$ & \cite{LeNet-5,Thoma:2014}\\ + \gls{ReLU}$^\dagger$ & $\max(0, x)$ & $[0, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & \cite{AlexNet-2012}\\ + \parbox[t]{2.6cm}{\gls{LReLU}$^\dagger$\footnotemark\\(\gls{PReLU})} & $\varphi(x) = \max(\alpha x, x)$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha &\text{if } x < 0\end{cases}$ & \cite{maas2013rectifier,he2015delving} \\ + Softplus & $\log(e^x + 1)$ & $(0, +\infty)$ & $\frac{e^x}{e^x + 1}$ & \cite{dugas2001incorporating,glorot2011deep} \\ + \gls{ELU} & $\begin{cases}x &\text{if } x > 0\\\alpha (e^x - 1) &\text{if } x \leq 0\end{cases}$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha e^x &\text{otherwise}\end{cases}$ & \cite{clevert2015fast} \\ + Softmax$^\ddagger$ & $o(\mathbf{x})_j = \frac{e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & $[0, 1]^K$ & $o(\mathbf{x})_j \cdot \frac{\sum_{k=1}^K e^{x_k} - e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & \cite{AlexNet-2012,Thoma:2014}\\ + Maxout$^\ddagger$ & $o(\mathbf{x}) = \max_{x \in \mathbf{x}} x$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x_i = \max \mathbf{x}\\0 &\text{otherwise}\end{cases}$ & \cite{goodfellow2013maxout} \\ \bottomrule \end{tabular} \caption[Activation functions]{Overview of activation functions. Functions @@ -63,13 +63,11 @@ \end{tabular} \caption[Activation function evaluation results on CIFAR-100]{Training and test accuracy of adjusted baseline models trained with different - activation functions on CIFAR-100. For LReLU, $\alpha = 0.3$ was + activation functions on CIFAR-100. For \gls{LReLU}, $\alpha = 0.3$ was chosen.} \label{table:CIFAR-100-accuracies-activation-functions} \end{table} -\glsreset{LReLU} - \begin{table}[H] \centering \setlength\tabcolsep{1.5pt} @@ -91,7 +89,7 @@ \end{tabular} \caption[Activation function evaluation results on HASYv2]{Test accuracy of adjusted baseline models trained with different activation - functions on HASYv2. For LReLU, $\alpha = 0.3$ was chosen.} + functions on HASYv2. For \gls{LReLU}, $\alpha = 0.3$ was chosen.} \label{table:HASYv2-accuracies-activation-functions} \end{table} @@ -116,8 +114,93 @@ \end{tabular} \caption[Activation function evaluation results on STL-10]{Test accuracy of adjusted baseline models trained with different activation - functions on STL-10. For LReLU, $\alpha = 0.3$ was chosen.} + functions on STL-10. 
For \gls{LReLU}, $\alpha = 0.3$ was chosen.}
\label{table:STL-10-accuracies-activation-functions}
\end{table}
+\begin{table}[H]
+ \centering
+ \hspace*{-1cm}\begin{tabular}{lllll}
+ \toprule
+ Name & Function $\varphi(x)$ & Range of Values & $\varphi'(x)$ \\\midrule % & Used by
+ Sign function$^\dagger$ & $\begin{cases}+1 &\text{if } x \geq 0\\-1 &\text{if } x < 0\end{cases}$ & $\Set{-1,1}$ & $0$ \\%& \cite{971754} \\
+ \parbox[t]{2.6cm}{Heaviside\\step function$^\dagger$} & $\begin{cases}+1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ & $\Set{0, 1}$ & $0$ \\%& \cite{mcculloch1943logical}\\
+ Logistic function & $\frac{1}{1+e^{-x}}$ & $[0, 1]$ & $\frac{e^x}{(e^x +1)^2}$ \\%& \cite{duch1999survey} \\
+ Tanh & $\frac{e^x - e^{-x}}{e^x + e^{-x}} = \tanh(x)$ & $[-1, 1]$ & $\sech^2(x)$ \\%& \cite{LeNet-5,Thoma:2014}\\
+ \gls{ReLU}$^\dagger$ & $\max(0, x)$ & $[0, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\0 &\text{if } x < 0\end{cases}$ \\%& \cite{AlexNet-2012}\\
+ \parbox[t]{2.6cm}{\gls{LReLU}$^\dagger$\footnotemark\\(\gls{PReLU})} & $\varphi(x) = \max(\alpha x, x)$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha &\text{if } x < 0\end{cases}$ \\%& \cite{maas2013rectifier,he2015delving} \\
+ Softplus & $\log(e^x + 1)$ & $(0, +\infty)$ & $\frac{e^x}{e^x + 1}$ \\%& \cite{dugas2001incorporating,glorot2011deep} \\
+ \gls{ELU} & $\begin{cases}x &\text{if } x > 0\\\alpha (e^x - 1) &\text{if } x \leq 0\end{cases}$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x > 0\\\alpha e^x &\text{otherwise}\end{cases}$ \\%& \cite{clevert2015fast} \\
+ Softmax$^\ddagger$ & $o(\mathbf{x})_j = \frac{e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ & $[0, 1]^K$ & $o(\mathbf{x})_j \cdot \frac{\sum_{k=1}^K e^{x_k} - e^{x_j}}{\sum_{k=1}^K e^{x_k}}$ \\%& \cite{AlexNet-2012,Thoma:2014}\\
+ Maxout$^\ddagger$ & $o(\mathbf{x}) = \max_{x \in \mathbf{x}} x$ & $(-\infty, +\infty)$ & $\begin{cases}1 &\text{if } x_i = \max \mathbf{x}\\0 &\text{otherwise}\end{cases}$ \\%& \cite{goodfellow2013maxout} \\
+ \bottomrule
+ \end{tabular}
+ \caption[Activation functions]{Overview of activation functions. Functions
+ marked with $\dagger$ are not differentiable at 0 and functions
+ marked with $\ddagger$ operate on all elements of a layer
+ simultaneously. The hyperparameters $\alpha \in (0, 1)$ of Leaky
+ ReLU and ELU are typically $\alpha = 0.01$. Other activation
+ functions like randomized leaky ReLUs exist~\cite{xu2015empirical},
+ but are far less commonly used.\\
+ Some functions are smoothed versions of others, like the logistic
+ function for the Heaviside step function, tanh for the sign
+ function, and softplus for ReLU.\\
+ Softmax is the standard activation function for the last layer of
+ a classification network as it produces a probability
+ distribution.
See \Cref{fig:activation-functions-plot} for a plot
+ of some of them.}
+ \label{table:activation-functions-overview}
+\end{table}
+\footnotetext{$\alpha$ is a hyperparameter in leaky ReLU, but a learnable parameter in the parametric ReLU function.}
+
+\begin{figure}[ht]
+ \centering
+ \begin{tikzpicture}
+ \definecolor{color1}{HTML}{E66101}
+ \definecolor{color2}{HTML}{FDB863}
+ \definecolor{color3}{HTML}{B2ABD2}
+ \definecolor{color4}{HTML}{5E3C99}
+ \begin{axis}[
+ legend pos=north west,
+ legend cell align={left},
+ axis x line=middle,
+ axis y line=middle,
+ x tick label style={/pgf/number format/fixed,
+ /pgf/number format/fixed zerofill,
+ /pgf/number format/precision=1},
+ y tick label style={/pgf/number format/fixed,
+ /pgf/number format/fixed zerofill,
+ /pgf/number format/precision=1},
+ grid = major,
+ width=16cm,
+ height=8cm,
+ grid style={dashed, gray!30},
+ xmin=-2, % start the diagram at this x-coordinate
+ xmax= 2, % end the diagram at this x-coordinate
+ ymin=-1, % start the diagram at this y-coordinate
+ ymax= 2, % end the diagram at this y-coordinate
+ xlabel=x,
+ ylabel=y,
+ tick align=outside,
+ enlargelimits=false]
+ \addplot[domain=-2:2, color1, ultra thick,samples=500] {1/(1+exp(-x))};
+ \addplot[domain=-2:2, color2, ultra thick,samples=500] {tanh(x)};
+ \addplot[domain=-2:2, color4, ultra thick,samples=500] {max(0, x)};
+ \addplot[domain=-2:2, color4, ultra thick,samples=500, dashed] {ln(exp(x) + 1)};
+ \addplot[domain=-2:2, color3, ultra thick,samples=500, dotted] {max(0, x) + min(0, exp(x) - 1)};
+ \addlegendentry{$\varphi_1(x)=\frac{1}{1+e^{-x}}$}
+ \addlegendentry{$\varphi_2(x)=\tanh(x)$}
+ \addlegendentry{$\varphi_3(x)=\max(0, x)$}
+ \addlegendentry{$\varphi_4(x)=\log(e^x + 1)$}
+ \addlegendentry{$\varphi_5(x)=\max(0, x) + \min(0, e^x - 1)$}
+ \end{axis}
+ \end{tikzpicture}
+ \caption[Activation functions]{Activation functions plotted in $[-2, +2]$.
+ $\tanh$ and ELU are able to produce negative numbers. The image of
+ ELU, ReLU and Softplus is not bounded on the positive side, whereas
+ $\tanh$ and the logistic function are always below~1.}
+ \label{fig:activation-functions-plot}
+\end{figure}
+
+\glsreset{LReLU}
\twocolumn
\ No newline at end of file
diff --git a/publications/activation-functions/content.tex b/publications/activation-functions/content.tex
index cf34b82..059ae29 100644
--- a/publications/activation-functions/content.tex
+++ b/publications/activation-functions/content.tex
@@ -1,24 +1,42 @@
%!TEX root = main.tex
\section{Introduction}
-TODO\cite{Thoma:2014}
+Artificial neural networks have dozens of hyperparameters which influence
+their behaviour during training and evaluation. One of these hyperparameters
+is the choice of activation function. While in principle every neuron could have a
+different activation function, in practice networks typically use only two:
+the softmax function for the output layer, in order to obtain a
+probability distribution over the possible classes, and one activation function
+for all other neurons.
-\section{Terminology}
-TODO
+Activation functions should have the following properties:
+\begin{itemize}
+ \item \textbf{Non-linearity}: A linear activation function in a simple
+ feed-forward network leads to an overall linear function. This means that
+ no matter how many layers the network uses, there is an equivalent network
+ with only the input and the output layer. Please note that \glspl{CNN} are
+ different, as pooling operations such as max pooling are themselves non-linear.
+ \item \textbf{Differentiability}: Activation functions need to be
+ differentiable in order to apply gradient descent. It is
+ not necessary that they are differentiable at every point. In practice,
+ the gradient at non-differentiable points can simply be set to zero
+ to prevent weight updates at those points.
+ \item \textbf{Non-zero gradient}: The sign function is not suitable for
+ gradient-descent-based optimizers as its gradient is zero at all
+ points where it is differentiable. An activation function should have
+ infinitely many points with a non-zero gradient.
+\end{itemize}
-
-\section{Activation Functions}
-Nonlinear, differentiable activation functions are important for neural
-networks to allow them to learn nonlinear decision boundaries. One of the
-simplest and most widely used activation functions for \glspl{CNN} is
-\gls{ReLU}~\cite{AlexNet-2012}, but others such as
+One of the simplest and most widely used activation functions for \glspl{CNN}
+is \gls{ReLU}~\cite{AlexNet-2012}, but others such as
\gls{ELU}~\cite{clevert2015fast}, \gls{PReLU}~\cite{he2015delving}, softplus~\cite{7280459}
-and softsign~\cite{bergstra2009quadratic} have been proposed. The baseline uses
-\gls{ELU}.
+and softsign~\cite{bergstra2009quadratic} have been proposed.
Activation functions differ in the range of values and the derivative. The definitions and other comparisons of eleven activation functions are given in~\cref{table:activation-functions-overview}.
+
+\section{Important Differences of Proposed Activation Functions}
Theoretical explanations why one activation function is preferable to another in some scenarios are the following: \begin{itemize}
@@ -96,6 +114,7 @@ in~\cref{table:HASYv2-accuracies-activation-functions}. For both datasets, the logistic function has a much shorter training time and a noticeably lower test accuracy.
+\glsunset{LReLU}
\begin{table}[H]
\centering
\begin{tabular}{lccc}
@@ -111,7 +130,7 @@ accuracy.
ReLU & \cellcolor{yellow!25}Yes\footnotemark & \cellcolor{red!25} No & \cellcolor{yellow!25}Half-sided \\
Softplus & \cellcolor{green!25}No & \cellcolor{red!25} No & \cellcolor{yellow!25}Half-sided \\
S2ReLU & \cellcolor{green!25}No & \cellcolor{green!25}Yes & \cellcolor{green!25} No \\
- LReLU/PReLU & \cellcolor{green!25}No & \cellcolor{green!25}Yes & \cellcolor{green!25} No \\
+ \gls{LReLU}/PReLU & \cellcolor{green!25}No & \cellcolor{green!25}Yes & \cellcolor{green!25} No \\
ELU & \cellcolor{green!25}No & \cellcolor{green!25}Yes & \cellcolor{green!25} No \\
\bottomrule
\end{tabular}
@@ -120,8 +139,6 @@ accuracy.
\end{table}
\footnotetext{The dying ReLU problem is similar to the vanishing gradient problem.}
-\glsunset{LReLU}
-
\begin{table}[H]
\centering
\begin{tabular}{lccclllll}
@@ -173,4 +190,5 @@ accuracy.
functions on MNIST.} \label{table:MNIST-accuracies-activation-functions} \end{table} -\glsreset{LReLU} \ No newline at end of file +\glsreset{LReLU} + diff --git a/publications/activation-functions/main.tex b/publications/activation-functions/main.tex index 70f0c22..2ef6f6d 100644 --- a/publications/activation-functions/main.tex +++ b/publications/activation-functions/main.tex @@ -7,7 +7,15 @@ \usepackage{amsmath,amssymb} \usepackage[table]{xcolor} \usepackage[absolute,overlay]{textpos} +\usepackage{pgfplots} +\pgfplotsset{compat=1.13} \usepackage{tikz} +\usetikzlibrary{arrows.meta} +\usetikzlibrary{decorations.pathreplacing} +\usetikzlibrary{positioning} +\usetikzlibrary{decorations.text} +\usetikzlibrary{decorations.pathmorphing} +\usetikzlibrary{shapes.multipart, calc} \usepackage{csquotes} \usepackage[binary-units,group-separator={,}]{siunitx} \sisetup{per-mode=fraction, @@ -59,7 +67,7 @@ \usepackage{braket} % needed for \Set \usepackage{algorithm,algpseudocode} -\usepackage[xindy,toc,section=chapter,numberedsection=autolabel]{glossaries} +\usepackage[xindy,toc,section=section]{glossaries} % Make document nicer \DeclareMathOperator*{\argmin}{arg\,min} @@ -93,6 +101,7 @@ \input{content} \bibliographystyle{IEEEtranSA} \bibliography{bibliography} +\printglossaries% \input{appendix}
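For readers who want to sanity-check the formulas collected in the activation-function overview (table:activation-functions-overview), the short NumPy sketch below implements the point-wise activation functions and softmax. It is illustrative only, not part of the patch and not the paper's experiment code; the function names and the default alpha = 0.01 (the typical value named in the table caption) are this note's own choices.

import numpy as np

def logistic(x):
    return 1.0 / (1.0 + np.exp(-x))

def relu(x):
    return np.maximum(0.0, x)

def lrelu(x, alpha=0.01):
    # Leaky ReLU: alpha is a fixed hyperparameter (in PReLU it is learned instead).
    return np.maximum(alpha * x, x)

def softplus(x):
    return np.log1p(np.exp(x))

def elu(x, alpha=0.01):
    # x for x > 0, alpha * (e^x - 1) otherwise, matching the overview table.
    return np.where(x > 0, x, alpha * np.expm1(x))

def softmax(x):
    # Operates on a whole layer; shifting by the maximum avoids overflow.
    e = np.exp(x - np.max(x))
    return e / e.sum()

x = np.linspace(-2.0, 2.0, 5)   # [-2, -1, 0, 1, 2]; tanh is np.tanh directly
print(relu(x))                  # [0. 0. 0. 1. 2.]
print(softmax(x).sum())         # approximately 1.0

The same decomposition for ELU with alpha = 1, max(0, x) + min(0, exp(x) - 1), is what the appendix figure uses to draw the curve, since pgfmath handles min/max more readily than a conditional.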