2
0
Fork 0
mirror of https://github.com/MartinThoma/LaTeX-examples.git synced 2025-04-19 11:38:05 +02:00

documents/write-math-ba-paper: updated content

This commit is contained in:
Martin Thoma 2014-12-24 15:13:41 +01:00
parent cea22c65c0
commit 7daf28a251
14 changed files with 8430 additions and 140 deletions

View file

@ -1,21 +1,3 @@
## TODO
### Preprocessing
* Scale-and-shift
* linear interpolation
* connect strokes
* Douglas-Peucker
### Features
* coordinates
* ink
* stroke count
* aspect ratio
### Training
* learning rate
* momentum
* Supervised layer-wise pretraining
* Check abstract!
## Spell checking
* Spell checking `aspell --lang=en --mode=tex check write-math-ba-paper.tex`
* Spell checking with `http://www.reverso.net/spell-checker`

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,35 @@
SOURCE = errors-by-epoch-pretraining
DELAY = 80
DENSITY = 300
WIDTH = 512
make:
pdflatex $(SOURCE).tex -output-format=pdf
make clean
clean:
rm -rf $(TARGET) *.class *.html *.log *.aux *.data *.gnuplot
gif:
pdfcrop $(SOURCE).pdf
convert -verbose -delay $(DELAY) -loop 0 -density $(DENSITY) $(SOURCE)-crop.pdf $(SOURCE).gif
make clean
png:
make
make svg
inkscape $(SOURCE).svg -w $(WIDTH) --export-png=$(SOURCE).png
transparentGif:
convert $(SOURCE).pdf -transparent white result.gif
make clean
svg:
make
#inkscape $(SOURCE).pdf --export-plain-svg=$(SOURCE).svg
pdf2svg $(SOURCE).pdf $(SOURCE).svg
# Necessary, as pdf2svg does not always create valid svgs:
inkscape $(SOURCE).svg --export-plain-svg=$(SOURCE).svg
rsvg-convert -a -w $(WIDTH) -f svg $(SOURCE).svg -o $(SOURCE)2.svg
inkscape $(SOURCE)2.svg --export-plain-svg=$(SOURCE).svg
rm $(SOURCE)2.svg

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,31 @@
\begin{tikzpicture}
\begin{axis}[
axis x line=middle,
axis y line=middle,
enlarge y limits=true,
xmin=0,
% xmax=1000,
ymin=0.18, ymax=0.4,
minor ytick={0, 0.01, ..., 1},
% width=15cm, height=8cm, % size of the image
grid = both,
minor grid style={dashed, gray!30},
major grid style={gray!40},,
%grid style={dashed, gray!30},
ylabel=error,
xlabel=epoch,
legend cell align=left,
legend style={
at={(0.5,-0.1)},
anchor=north,
legend columns=2
}
]
\addplot[mark=x,green] table [each nth point=20,x=epoch, y=testerror, col sep=comma] {baseline-1.csv};
\addplot[mark=x,orange] table [each nth point=20,x=epoch, y=testerror, col sep=comma] {baseline-2.csv};
\addplot[mark=x,red] table [each nth point=20,x=epoch, y=testerror, col sep=comma] {baseline-2-pretraining.csv};
\legend{{1 hidden layer},
{2 hidden layers},
{2 hidden layers with pretraining}}
\end{axis}
\end{tikzpicture}

View file

@ -1184,7 +1184,7 @@
}
@InProceedings{Morita02,
Title = {An HMM-MLP hybrid system to recognize handwritten dates},
Title = {An {HMM}-{MLP} hybrid system to recognize handwritten dates},
Author = {Morita, M. and Oliveira, L.S. and Sabourin, R. and Bortolozzi, F. and Suen, C.Y.},
Booktitle = {IJCNN '02. Proceedings of the 2002 International Joint Conference on Neural Networks, 2002.},
Year = {2002},
@ -1585,7 +1585,7 @@
}
@InProceedings{Visvalingam1990,
Title = {The Douglas-Peucker Algorithm for Line Simplification: Re-evaluation through Visualization},
Title = {The {D}ouglas-{P}eucker Algorithm for Line Simplification: Re-evaluation through Visualization},
Author = {Visvalingam, Mahes and Whyatt, J Duncan},
Booktitle = {Computer Graphics Forum},
Year = {1990},
@ -1675,7 +1675,7 @@
}
@Misc{deeplearningweights,
Title = {Going from logistic regression to MLP},
Title = {Going from logistic regression to {MLP}},
Owner = {Martin Thoma},
Timestamp = {2014.10.30},

Binary file not shown.

View file

@ -2,8 +2,12 @@
\usepackage{amssymb, amsmath} % needed for math
\usepackage{hyperref} % links im text
\usepackage{parskip}
\usepackage[pdftex,final]{graphicx}
\usepackage{csquotes}
\usepackage{braket}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{pgfplots}
\usepackage[noadjust]{cite}
\usepackage[nameinlink,noabbrev]{cleveref} % has to be after hyperref, ntheorem, amsthm
\usepackage[binary-units]{siunitx}
@ -19,7 +23,7 @@
\hypersetup{
pdfauthor = {Martin Thoma},
pdfkeywords = {Mathematics,Symbols,recognition},
pdftitle = {On-line Recognition of Handwritten Mathematical Symbols}
pdftitle = {On-line Recognition of Handwritten Mathematical Symbols}
}
\include{variables}
@ -39,44 +43,43 @@ preprocessing steps, one data multiplication algorithm, five features and five
variants for multilayer Perceptron training were evaluated using $\num{166898}$
recordings which were collected with two crowdsourcing projects. The evaluation
results of these 21~experiments were used to create an optimized recognizer
which has a TOP1 error of less than $\SI{17.5}{\percent}$ and a TOP3 error of
which has a TOP-1 error of less than $\SI{17.5}{\percent}$ and a TOP-3 error of
$\SI{4.0}{\percent}$. This is an improvement of $\SI{18.5}{\percent}$ for the
TOP1 error and $\SI{29.7}{\percent}$ for the TOP3 error compared to the
TOP-1 error and $\SI{29.7}{\percent}$ for the TOP-3 error compared to the
baseline system.
\end{abstract}
\section{Introduction}
On-line recognition makes use of the pen trajectory. This means the data is
given as groups of sequences of tuples $(x, y, t) \in \mathbb{R}^3$, where
each group represents a stroke, $(x, y)$ is the position of the pen on a canvas
and $t$ is the time. One handwritten symbol in the described format is called
a \textit{recording}. Recordings can be classified by making use of
this data. One classification approach assigns a probability to each class
given the data. The classifier can be evaluated by using recordings which
were classified by humans and were not used by to train the classifier. The
set of those recordings is called \textit{testset}. Then
the TOP-$n$ error is defined as the fraction of the symbols where the correct
class was not within the top $n$ classes of the highest probability.
given as groups of sequences of tuples $(x, y, t) \in \mathbb{R}^3$, where each
group represents a stroke, $(x, y)$ is the position of the pen on a canvas and
$t$ is the time. One handwritten symbol in the described format is called a
\textit{recording}. One approach to classify recordings into symbol classes
assigns a probability to each class given the data. The classifier can be
evaluated by using recordings which were classified by humans and were not used
to train the classifier. The set of those recordings is called \textit{test
set}. The TOP-$n$ error is defined as the fraction of the symbols where
the correct class was not within the top $n$ classes of the highest
probability.
Various systems for mathematical symbol recognition with on-line data have been
described so far~\cite{Kosmala98,Mouchere2013}, but most of them have neither
published their source code nor their data which makes it impossible to re-run
experiments to compare different systems. This is unfortunate as the choice of
symbols is cruicial for the TOP-$n$ error. For example, the symbols $o$, $O$,
$\circ$ and $0$ are very similar and systems which know all those classes will
certainly have a higher TOP-$n$ error than systems which only accept one of
them.
symbols is crucial for the TOP-$n$ error and all systems used different symbol
sets. For example, the symbols $o$, $O$, $\circ$ and $0$ are very similar and
systems which know all those classes will certainly have a higher TOP-$n$ error
than systems which only accept one of them.
Daniel Kirsch describes in~\cite{Kirsch} a system which uses time warping to
classify on-line handwritten symbols and claimes to achieve a TOP3 error of
less than $\SI{10}{\percent}$ for a set of $\num{100}$~symbols. He also
published his data, which was collected by a crowd-sourcing approach via
\url{http://detexify.kirelabs.org}, on
\url{https://github.com/kirel/detexify-data}. Those recordings as well as
some recordings which were collected by a similar approach via
\url{http://write-math.com} were used to train and evaluated different
classifiers. A complete description of all involved software, data,
presentations and experiments is listed in~\cite{Thoma:2014}.
Daniel Kirsch describes in~\cite{Kirsch} a system called Detexify which uses
time warping to classify on-line handwritten symbols and claims to achieve a
TOP-3 error of less than $\SI{10}{\percent}$ for a set of $\num{100}$~symbols.
He also published his data on \url{https://github.com/kirel/detexify-data},
which was collected by a crowd-sourcing approach via
\url{http://detexify.kirelabs.org}. Those recordings as well as some recordings
which were collected by a similar approach via \url{http://write-math.com} were
used to train and evaluated different classifiers. A complete description of
all involved software, data and experiments is given in~\cite{Thoma:2014}.
\section{Steps in Handwriting Recognition}
The following steps are used in all classifiers which are described in the
@ -84,18 +87,18 @@ following:
\begin{enumerate}
\item \textbf{Preprocessing}: Recorded data is never perfect. Devices have
errors and people make mistakes while using devices. To tackle
these problems there are preprocessing algorithms to clean the data.
The preprocessing algorithms can also remove unnecessary variations of
the data that do not help classify but hide what is important.
Having slightly different sizes of the same symbol is an example of such a
variation. Nine preprocessing algorithms that clean or normalize
recordings are explained in
errors and people make mistakes while using devices. To tackle these
problems there are preprocessing algorithms to clean the data. The
preprocessing algorithms can also remove unnecessary variations of
the data that do not help in the classification process, but hide
what is important. Having slightly different sizes of the same symbol
is an example of such a variation. Four preprocessing algorithms that
clean or normalize recordings are explained in
\cref{sec:preprocessing}.
\item \textbf{Data multiplication}: Learning algorithms need lots of data
to learn internal parameters. If there is not enough data available,
domain knowledge can be considered to create new artificial data from
the original data. In the domain of on-line handwriting recognition
the original data. In the domain of on-line handwriting recognition,
data can be multiplied by adding rotated variants.
\item \textbf{Segmentation}: The task of formula recognition can eventually
be reduced to the task of symbol recognition combined with symbol
@ -104,12 +107,11 @@ following:
recognition, this step will not be further discussed.
\item \textbf{Feature computation}: A feature is high-level information
derived from the raw data after preprocessing. Some systems like
Detexify, which was presented in~\cite{Kirsch}, simply take the
result of the preprocessing step, but many compute new features. This
might have the advantage that less training data is needed since the
developer can use knowledge about handwriting to compute highly
discriminative features. Various features are explained in
\cref{sec:features}.
Detexify simply take the result of the preprocessing step, but many
compute new features. This might have the advantage that less
training data is needed since the developer can use knowledge about
handwriting to compute highly discriminative features. Various
features are explained in \cref{sec:features}.
\item \textbf{Feature enhancement}: Applying PCA, LDA, or
feature standardization might change the features in ways that could
improve the performance of learning algorithms.
@ -126,11 +128,10 @@ two parts:
topologies, preprocessing queues, and features in \Cref{ch:Evaluation}.
\end{enumerate}
Two fundamentally different systems for classification of time series data were
evaluated. One uses greedy time warping, which has a very easy, fast learning
algorithm which only stores some of the seen training examples. The other one is
based on neural networks, taking longer to train, but is much faster in
recognition and also leads to better recognition results.
The classification learning task can be solved with \glspl{MLP} if the number
of input features is the same for every recording. There are many ways how to
adjust \glspl{MLP} and how to adjust their training. Some of them are
described in~\cref{sec:mlp-training}.
\section{Algorithms}
\subsection{Preprocessing}\label{sec:preprocessing}
@ -140,14 +141,14 @@ and feature extraction easier, more effective or faster. It does so by resolving
errors in the input data, reducing duplicate information and removing irrelevant
information.
The preprocessing algorithms fall in two groups: Normalization and noise
Preprocessing algorithms fall in two groups: Normalization and noise
reduction algorithms.
The most important normalization algorithm in single-symbol recognition is
\textit{scale-and-shift}. It scales the recording so that
A very important normalization algorithm in single-symbol recognition is
\textit{scale-and-shift}~\cite{Thoma:2014}. It scales the recording so that
its bounding box fits into a unit square. As the aspect ratio of a recording
is almost never 1:1, only one dimension will fit exactly in the unit square.
Then there are multiple ways how to shift the recording. For this paper, it was
There are multiple ways how to shift the recording. For this paper, it was
chosen to shift the bigger dimension to fit into the $[0,1] \times [0,1]$ unit
square whereas the smaller dimension is centered in the $[-1,1] \times [-1,1]$
square.
@ -157,62 +158,88 @@ on the pen trajectory are generated asynchronously and with different
time-resolutions depending on the used hardware and software, it is desirable
to resample the recordings to have points spread equally in time for every
recording. This was done with linear interpolation of the $(x,t)$ and $(y,t)$
sequences and getting a fixed number of equally spaced samples.
sequences and getting a fixed number of equally spaced points per stroke.
\textit{Connect strokes} is a noise reduction algorithm. It happens sometimes
that the hardware detects that the user lifted the pen where he certainly
didn't do so. This can be detected by measuring the distance between the end of
one stroke and the beginning of the next stroke. If this distance is below a
threshold, then the strokes are connected.
that the hardware detects that the user lifted the pen where the user certainly
didn't do so. This can be detected by measuring the euclidean distance between
the end of one stroke and the beginning of the next stroke. If this distance is
below a threshold, then the strokes are connected.
Due to a limited resolution of the recording device and due to erratic
handwriting, the pen trajectory might not be smooth. One way to smooth is
calculating a weighted average and replacing points by the weighted average of
their coordinate and their neighbors coordinates. Another way to do smoothing
would be to reduce the number of points with the Douglas-Peucker algorithm to
the most relevant ones and then interpolate those points. The Douglas-Peucker
stroke simplification algorithm is usually used in cartography to simplify the
shape of roads. The Douglas-Peucker algorithm works recursively to find a
subset of control points of a stroke that is simpler and still similar to the
original shape. The algorithm adds the first and the last point $p_1$ and $p_n$
of a stroke to the simplified set of points $S$. Then it searches the control
point $p_i$ in between that has maximum distance from the \gls{line} $p_1 p_n$.
If this distance is above a threshold $\varepsilon$, the point $p_i$ is added
to $S$. Then the algorithm gets applied to $p_1 p_i$ and $p_i p_n$ recursively.
Pseudocode of this algorithm is on \cpageref{alg:douglas-peucker}. It is
described as \enquote{Algorithm 1} in~\cite{Visvalingam1990} with a different
notation.
the most relevant ones and then interpolate the stroke between those points.
The Douglas-Peucker stroke simplification algorithm is usually used in
cartography to simplify the shape of roads. It works recursively to find a
subset of points of a stroke that is simpler and still similar to the original
shape. The algorithm adds the first and the last point $p_1$ and $p_n$ of a
stroke to the simplified set of points $S$. Then it searches the point $p_i$ in
between that has maximum distance from the line $p_1 p_n$. If this
distance is above a threshold $\varepsilon$, the point $p_i$ is added to $S$.
Then the algorithm gets applied to $p_1 p_i$ and $p_i p_n$ recursively. It is
described as \enquote{Algorithm 1} in~\cite{Visvalingam1990}.
\subsection{Features}\label{sec:features}
Features can be global, that means calculated for the complete recording or
complete strokes. Other features are calculated for single points on the
pen trajectory and are called \textit{local}.
Features can be \textit{global}, that means calculated for the complete
recording or complete strokes. Other features are calculated for single points
on the pen trajectory and are called \textit{local}.
Global features are the \textit{number of strokes} in a recording, the
\textit{aspect ratio} of the bounding box of a recordings bounding box or the
\textit{aspect ratio} of a recordings bounding box or the
\textit{ink} being used for a recording. The ink feature gets calculated by
measuring the length of all strokes combined. The re-curvature, which was
introduced in~\cite{Huang06}, is defined as
\[\text{re-curvature}(stroke) := \frac{\text{height}(stroke)}{\text{length}(stroke)}\]
and a stroke-global feature.
The most important local feature is the coordinate of the point itself.
Speed, curvature and a local small-resolution bitmap around the point, which
was introduced by Manke et al. in~\cite{Manke94} are other local features.
The simplest local feature is the coordinate of the point itself. Speed,
curvature and a local small-resolution bitmap around the point, which was
introduced by Manke, Finke and Waibel in~\cite{Manke94}, are other local
features.
\subsection{Multilayer Perceptrons}\label{sec:mlp-training}
\Glspl{MLP} are explained in detail in~\cite{Mitchell97}. They can have
different numbers of hidden layers, the number of neurons per layer and the
activation functions can be varied. The learning algorithm is parameterized by
the learning rate $\eta$, the momentum $\alpha$ and the number of epochs. The
learning of \glspl{MLP} can be executed in various different ways, for example
with layer-wise supversided pretraining which means if a three layer \gls{MLP}
of the topology $160:500:500:500:369$ should get trained, at first a \gls{MLP}
with one hidden layer ($160:500:369$) is trained. Then the output layer is
discarded, a new hidden layer and a new output layer is added and it is trained
again. Then we have a $160:500:500:369$ \gls{MLP}. The output layer is
discarded again, a new hidden layer is added and a new output layer is added
and the training is executed again.
the learning rate $\eta \in (0, \infty)$, the momentum $\alpha \in [0, \infty)$
and the number of epochs.
The topology of \glspl{MLP} will be denoted in the following by separating
the number of neurons per layer with colons. For example, the notation $160{:}500{:}500{:}500{:}369$
means that the input layer gets 160~features, there are three hidden layers
with 500~neurons per layer and one output layer with 369~neurons.
\glspl{MLP} training can be executed in
various different ways, for example with \gls{SLP}.
In case of a \gls{MLP} with the topology $160{:}500{:}500{:}500{:}369$,
\gls{SLP} works as follows: At first a \gls{MLP} with one hidden layer ($160{:}500{:}369$)
is trained. Then the output layer is discarded, a new hidden layer and a new
output layer is added and it is trained again, resulting in a $160{:}500{:}500{:}369$
\gls{MLP}. The output layer is discarded again, a new hidden layer is added and
a new output layer is added and the training is executed again.
Denoising auto-encoders are another way of pretraining. An
\textit{auto-encoder} is a neural network that is trained to restore its input.
This means the number of input neurons is equal to the number of output
neurons. The weights define an \textit{encoding} of the input that allows
restoring the input. As the neural network finds the encoding by itself, it is
called auto-encoder. If the hidden layer is smaller than the input layer, it
can be used for dimensionality reduction~\cite{Hinton1989}. If only one hidden
layer with linear activation functions is used, then the hidden layer contains
the principal components after training~\cite{Duda2001}.
Denoising auto-encoders are a variant introduced in~\cite{Vincent2008} that
is more robust to partial corruption of the input features. It is trained to
get robust by adding noise to the input features.
There are multiple ways how noise can be added. Gaussian noise and
randomly masking elements with zero are two possibilities. \cite{Deeplearning-Denoising-AE}
describes how such a denoising auto-encoder with masking noise can be
implemented. The \texttt{corruption} is the probability of a feature being
masked.
\section{Evaluation}\label{ch:Evaluation}
In order to evaluate the effect of different preprocessing algorithms, features
@ -233,14 +260,232 @@ could have a severe influence on the effect of new features or preprocessing
steps. Each hidden layer in all evaluated systems has $500$ neurons.
Each \gls{MLP} was trained with a learning rate of $\eta = 0.1$ and a momentum
of $\alpha = 0.1$. The activation function of every neuron is
of $\alpha = 0.1$. The activation function of every neuron in a hidden layer is
the sigmoid function $\text{sig}(x) := \frac{1}{1+e^{-x}}$. The neurons in the
output layer use the softmax function. For every experiment, exactly one part
of the baseline systems was changed.
\subsection{Random Weight Initialization}
The neural networks in all experiments got initialized with a small random
weight
\[w_{i,j} \sim U(-4 \cdot \sqrt{\frac{6}{n_l + n_{l+1}}}, 4 \cdot \sqrt{\frac{6}{n_l + n_{l+1}}})\]
where $w_{i,j}$ is the weight between the neurons $i$ and $j$, $l$ is the layer
of neuron $i$, and $n_i$ is the number of neurons in layer $i$. This random
initialization was suggested on
\cite{deeplearningweights} and is done to break symmetry.
This might lead to different error rates for the same systems just because the
initialization was different.
In order to get an impression of the magnitude of the influence on the different
topologies and error rates the baseline models were trained 5 times with
random initializations.
\Cref{table:baseline-systems-random-initializations-summary}
shows a summary of the results. The more hidden layers are used, the more do
the results vary between different random weight initializations.
\begin{table}[h]
\centering
\begin{tabular}{crrr|rrr} %chktex 44
\toprule
\multirow{3}{*}{System} & \multicolumn{6}{c}{Classification error}\\
\cmidrule(l){2-7}
& \multicolumn{3}{c}{TOP-1} & \multicolumn{3}{c}{TOP-3}\\
& min & max & range & min & max & range\\\midrule
$B_1$ & $\SI{23.08}{\percent}$ & $\SI{23.44}{\percent}$ & $\SI{0.36}{\percent}$ & $\SI{6.67}{\percent}$ & $\SI{6.80}{\percent}$ & $\SI{0.13}{\percent}$ \\
$B_2$ & \underline{$\SI{21.45}{\percent}$} & \underline{$\SI{21.83}{\percent}$}& $\SI{0.38}{\percent}$ & $\SI{5.68}{\percent}$ & \underline{$\SI{5.75}{\percent}$} & $\SI{0.07}{\percent}$\\
$B_3$ & $\SI{21.54}{\percent}$ & $\SI{22.28}{\percent}$ & $\SI{0.74}{\percent}$ & \underline{$\SI{5.50}{\percent}$} & $\SI{5.82}{\percent}$ & $\SI{0.32}{\percent}$\\
$B_4$ & $\SI{23.19}{\percent}$ & $\SI{24.84}{\percent}$ & $\SI{1.65}{\percent}$ & $\SI{5.98}{\percent}$ & $\SI{6.44}{\percent}$ & $\SI{0.46}{\percent}$\\
\bottomrule
\end{tabular}
\caption{The systems $B_1$ -- $B_4$ were randomly initialized, trained
and evaluated 5~times to estimate the influence of random weight
initialization.}
\label{table:baseline-systems-random-initializations-summary}
\end{table}
\subsection{Connect strokes}
In order to solve the problem of interrupted strokes, pairs of strokes
can be connected with stroke connect algorithm. The idea is that for
a pair of consecutively drawn strokes $s_{i}, s_{i+1}$ the last point $s_i$ is
close to the first point of $s_{i+1}$ if a stroke was accidentally split
into two strokes.
$\SI{59}{\percent}$ of all stroke pair distances in the collected data are
between $\SI{30}{\pixel}$ and $\SI{150}{\pixel}$. Hence the stroke connect
algorithm was tried with $\SI{5}{\pixel}$, $\SI{10}{\pixel}$ and
$\SI{20}{\pixel}$.
All models TOP-3 error improved with a threshold of $\theta = \SI{10}{\pixel}$
by at least $\SI{0.17}{\percent}$, except $B_4$ which improved only by
$\SI{0.01}{\percent}$ which could be a result of random weight initialization.
\subsection{Douglas-Peucker Smoothing}
The Douglas-Peucker algorithm can be used to find
points that are more relevant for the overall shape of a recording. After that,
an interpolation can be done. If the interpolation is a cubic spline
interpolation, this makes the recording smooth.
The Douglas-Peucker algorithm was applied with a threshold of $\varepsilon =
0.05$, $\varepsilon = 0.1$ and $\varepsilon = 0.2$ after scaling and shifting,
but before resampling. The interpolation in the resampling step was done
linearly and with cubic splines in two experiments. The recording was scaled
and shifted again after the interpolation because the bounding box might have
changed.
The result of the application of the Douglas-Peucker smoothing with $\varepsilon
> 0.05$ was a high rise of the TOP-1 and TOP-3 error for all models $B_i$.
This means that the simplification process removes some relevant information and
does not --- as it was expected --- remove only noise. For $\varepsilon = 0.05$
with linear interpolation some models TOP-1 error improved, but the
changes were small. It could be an effect of random weight initialization.
However, cubic spline interpolation made all systems perform more than
$\SI{1.7}{\percent}$ worse for TOP-1 and TOP-3 error.
The lower the value of $\varepsilon$, the less does the recording change after
this preprocessing step. As it was applied after scaling the recording such that
the biggest dimension of the recording (width or height) is $1$, a value of
$\varepsilon = 0.05$ means that a point has to move at least $\SI{5}{\percent}$
of the biggest dimension.
\subsection{Global Features}
Single global features were added one at a time to the baseline systems. Those
features were re-curvature $\text{re-curvature}(stroke) = \frac{\text{height}(stroke)}{\text{length}(stroke)}$
as described in \cite{Huang06}, the ink feature which is the summed length
of all strokes, the stroke count, the aspect ratio and the stroke center points
for the first four strokes. The stroke center point feature improved the system
$B_1$ by $\SI{0.27}{\percent}$ for the TOP-3 error and system $B_3$ for the
TOP-1 error by $\SI{0.74}{\percent}$, but all other systems and error measures
either got worse or did not improve much.
The other global features did improve the systems $B_1 -- B_3$, but not $B_4$.
The highest improvement was achieved with the re-curvature feature. It
improved the systems $B_1 -- B_4$ by more than $\SI{0.6}{\percent}$ TOP-1 error.
\subsection{Data Multiplication}
Data multiplication can be used to make the model invariant to transformations.
However, this idea seems not to work well in the domain of on-line handwritten
mathematical symbols. It was tried to triple the data by adding a rotated
version that is rotated 3 degrees to the left and another one that is rotated
3 degrees to the right around the center of mass. This data multiplication
made all classifiers for most error measures perform worse by more than
$\SI{2}{\percent}$ for the TOP-1 error.
\subsection{Pretraining}\label{subsec:pretraining-evaluation}
Pretraining is a technique used to improve the training of \glspl{MLP} with
multiple hidden layers.
\Cref{fig:training-and-test-error-for-different-topologies-pretraining} shows
the evolution of the TOP-1 error over 1000~epochs with supervised
layer-wise pretraining and without pretraining. It clearly shows that this
kind of pretraining improves the classification performance by $\SI{1.6}{\percent}$
for the TOP-1 error and $\SI{1.0}{\percent}$ for the TOP-3 error.
\begin{figure}[htb]
\centering
\input{figures/errors-by-epoch-pretraining/errors-by-epoch-pretraining.tex}
\caption{Training- and test error by number of trained epochs for different
topologies with \gls{SLP}. The plot shows
that all pretrained systems performed much better than the systems
without pretraining. All plotted systems did not improve
with more epochs of training.}
\label{fig:training-and-test-error-for-different-topologies-pretraining}
\end{figure}
Pretraining with denoising auto-encoder lead to the much worse results listed in
\cref{table:pretraining-denoising-auto-encoder}. The first layer used a $\tanh$
activation function. Every layer was trained for $1000$ epochs and the
\gls{MSE} loss function. A learning-rate of $\eta = 0.001$, a corruption of
$0.3$ and a $L_2$ regularization of $\lambda = 10^{-4}$ were chosen. This
pretraining setup made all systems with all error measures perform much worse.
\begin{table}[tb]
\centering
\begin{tabular}{lrrrr}
\toprule
\multirow{2}{*}{System} & \multicolumn{4}{c}{Classification error}\\
\cmidrule(l){2-5}
& TOP-1 & change & TOP-3 & change \\\midrule
$B_{1,p}$ & $\SI{23.75}{\percent}$ & $\SI{+0.41}{\percent}$ & $\SI{7.19}{\percent}$ & $\SI{+0.39}{\percent}$\\
$B_{2,p}$ & \underline{$\SI{22.76}{\percent}$} & $\SI{+1.25}{\percent}$ & $\SI{6.38}{\percent}$ & $\SI{+0.63}{\percent}$\\
$B_{3,p}$ & $\SI{23.10}{\percent}$ & $\SI{+1.17}{\percent}$ & \underline{$\SI{6.14}{\percent}$} & $\SI{+0.40}{\percent}$\\
$B_{4,p}$ & $\SI{25.59}{\percent}$ & $\SI{+1.71}{\percent}$ & $\SI{6.99}{\percent}$ & $\SI{+0.87}{\percent}$\\
\bottomrule
\end{tabular}
\caption{Systems with denoising auto-encoder pretraining compared to pure
gradient descent. The pretrained systems clearly performed worse.}
\label{table:pretraining-denoising-auto-encoder}
\end{table}
\subsection{Optimized Recognizer}
All preprocessing steps and features that were useful were combined to
create a recognizer that should perform best.
All models were much better than everything that was tried before. The results
of this experiment show that single-symbol recognition with
\totalClassesAnalyzed{} classes and usual touch devices and the mouse can be
done with a TOP1 error rate of $\SI{18.56}{\percent}$ and a TOP3 error of
$\SI{4.11}{\percent}$. This was
achieved by a \gls{MLP} with a $167{:}500{:}500{:}\totalClassesAnalyzed{}$ topology.
It used an algorithm to connect strokes of which the ends were less than
$\SI{10}{\pixel}$ away, scaled each recording to a unit square and shifted this
unit square to $(0,0)$. After that, a linear resampling step was applied to the
first 4 strokes to resample them to 20 points each. All other strokes were
discarded.
The 167 features were
\begin{itemize}
\item the first 4 strokes with 20 points per stroke resulting in 160
features,
\item the re-curvature for the first 4 strokes,
\item the ink,
\item the number of strokes and
\item the aspect ratio
\end{itemize}
\Gls{SLP} was applied with $\num{1000}$ epochs per layer, a
learning rate of $\eta=0.1$ and a momentum of $\alpha=0.1$. After that, the
complete model was trained again for $1000$ epochs with standard mini-batch
gradient descent.
After the models $B_{1,c}$ -- $B_{4,c}$ were trained the first $1000$ epochs,
they were trained again for $1000$ epochs with a learning rate of $\eta = 0.05$.
\Cref{table:complex-recognizer-systems-evaluation} shows that
this improved the classifiers again.
\begin{table}[htb]
\centering
\begin{tabular}{lrrrr}
\toprule
\multirow{2}{*}{System} & \multicolumn{4}{c}{Classification error}\\
\cmidrule(l){2-5}
& TOP1 & change & TOP3 & change\\\midrule
$B_{1,c}$ & $\SI{20.96}{\percent}$ & $\SI{-2.38}{\percent}$ & $\SI{5.24}{\percent}$ & $\SI{-1.56}{\percent}$\\
$B_{2,c}$ & $\SI{18.26}{\percent}$ & $\SI{-3.25}{\percent}$ & $\SI{4.07}{\percent}$ & $\SI{-1.68}{\percent}$\\
$B_{3,c}$ & \underline{$\SI{18.19}{\percent}$} & $\SI{-3.74}{\percent}$ & \underline{$\SI{4.06}{\percent}$} & $\SI{-1.68}{\percent}$\\
$B_{4,c}$ & $\SI{18.57}{\percent}$ & $\SI{-5.31}{\percent}$ & $\SI{4.25}{\percent}$ & $\SI{-1.87}{\percent}$\\\midrule
$B_{1,c}'$ & $\SI{19.33}{\percent}$ & $\SI{-1.63}{\percent}$ & $\SI{4.78}{\percent}$ & $\SI{-0.46}{\percent}$ \\
$B_{2,c}'$ & \underline{$\SI{17.52}{\percent}$} & $\SI{-0.74}{\percent}$ & \underline{$\SI{4.04}{\percent}$} & $\SI{-0.03}{\percent}$\\
$B_{3,c}'$ & $\SI{17.65}{\percent}$ & $\SI{-0.54}{\percent}$ & $\SI{4.07}{\percent}$ & $\SI{+0.01}{\percent}$\\
$B_{4,c}'$ & $\SI{17.82}{\percent}$ & $\SI{-0.75}{\percent}$ & $\SI{4.26}{\percent}$ & $\SI{+0.01}{\percent}$\\
\bottomrule
\end{tabular}
\caption{Error rates of the optimized recognizer systems. The systems
$B_{i,c}'$ were trained another $1000$ epochs with a learning rate
of $\eta=0.05$. The value of the column \enquote{change} of the
systems $B_{i,c}'$ is relative to $B_{i,c}$.}
\label{table:complex-recognizer-systems-evaluation}
\end{table}
%TODO: Evaluation randomnes
%TODO:
\section{Conclusion}
The aim of this bachelor's thesis was to build a recognition system that
can recognize many mathematical symbols with low error rates as well as to
Four baseline recognition systems were adjusted in many experiments and their
recognition capabilities were compared in order to build a recognition system
that can recognize 396 mathematical symbols with low error rates as well as to
evaluate which preprocessing steps and features help to improve the recognition
rate.
@ -254,25 +499,14 @@ smartphones, others were created with the mouse.
\Glspl{MLP} were used for the classification task. Four baseline systems with
different numbers of hidden layers were used, as the number of hidden layer
influences the capabilities and problems of \glspl{MLP}. Furthermore, an error
measure MER was defined, which takes the top three \glspl{hypothesis} of the classifier,
merges symbols such as \verb+\sum+ ($\sum$) and \verb+\Sigma+ ($\Sigma$) to
equivalence classes, and then calculates the error.
influences the capabilities and problems of \glspl{MLP}.
All baseline systems used the same preprocessing queue. The recordings were
scaled to fit into a unit square, shifted to $(0,0)$, resampled with linear
interpolation so that every stroke had exactly 20~points which are spread
equidistant in time. The 80~($x,y$) coordinates of the first 4~strokes were used
to get exactly $160$ input features for every recording. The baseline systems
$B_2$ has a MER error of $\SI{5.67}{\percent}$.
Three variations of the scale and shift algorithm, wild point filtering, stroke
connect, weighted average smoothing, and Douglas-Peucker smoothing were
evaluated. The evaluation showed that the scale and shift algorithm is extremely
important and the connect strokes algorithm improves the classification. All
other preprocessing algorithms either diminished the classification performance
or had less influence on it than the random initialization of the \glspl{MLP}
weights.
to get exactly $160$ input features for every recording. The baseline system
$B_2$ has a TOP-3 error of $\SI{5.75}{\percent}$.
Adding two slightly rotated variants for each recording and hence tripling the
training set made the systems $B_3$ and $B_4$ perform much worse, but improved
@ -282,9 +516,7 @@ The global features re-curvature, ink, stoke count and aspect ratio improved the
systems $B_1$--$B_3$, whereas the stroke center point feature made $B_2$ perform
worse.
The learning rate and the momentum were evaluated. A learning rate of $\eta=0.1$
and a momentum of $\alpha=0.9$ gave the best results. Newbob training lead to
much worse recognition rates. Denoising auto-encoders were evaluated as one way
Denoising auto-encoders were evaluated as one way
to use pretraining, but by this the error rate increased notably. However,
supervised layer-wise pretraining improved the performance decidedly.
@ -292,21 +524,23 @@ The stroke connect algorithm was added to the preprocessing steps of the
baseline system as well as the re-curvature feature, the ink feature, the number
of strokes and the aspect ratio. The training setup of the baseline system was
changed to supervised layer-wise pretraining and the resulting model was trained
with a lower learning rate again. This optimized recognizer $B_{2,c}'$ had a MER
error of $\SI{3.96}{\percent}$. This means that the MER error dropped by over
$\SI{30}{\percent}$ in comparison to the baseline system $B_2$.
with a lower learning rate again. This optimized recognizer $B_{2,c}'$ had a TOP-3
error of $\SI{4.04}{\percent}$. This means that the TOP-3 error dropped by over
$\SI{1.7}{\percent}$ in comparison to the baseline system $B_2$.
A MER error of $\SI{3.96}{\percent}$ makes the system usable for symbol lookup.
A TOP-3 error of $\SI{4.04}{\percent}$ makes the system usable for symbol lookup.
It could also be used as a starting point for the development of a
multiple-symbol classifier.
The aim of this bachelor's thesis was to develop a symbol recognition system
which is easy to use, fast and has high recognition rates as well as evaluating
ideas for single symbol classifiers. Some of those goals were reached. The
recognition system $B_{2,c}'$ evaluates new recordings in a fraction of a second
and has acceptable recognition rates. Many variations algorithms were evaluated.
However, there are still many more algorithms which could be evaluated and, at
the time of this work, the best classifier $B_{2,c}'$ is not publicly available.
The aim of this work was to develop a symbol recognition system which is easy
to use, fast and has high recognition rates as well as evaluating ideas for
single symbol classifiers. Some of those goals were reached. The recognition
system $B_{2,c}'$ evaluates new recordings in a fraction of a second and has
acceptable recognition rates. Many algorithms were evaluated.
However, there are still many other algorithms which could be evaluated and, at
the time of this work, the best classifier $B_{2,c}'$ is only available
through the Python package \texttt{hwrt}. It is planned to add an web version
of that classifier online.
\bibliographystyle{IEEEtranSA}
\bibliography{write-math-ba-paper}