mirror of
https://github.com/MartinThoma/LaTeX-examples.git
synced 2025-04-19 11:38:05 +02:00
moved paper to private repository
This commit is contained in:
parent
4804ad91d5
commit
2a80dd2c0a
22 changed files with 2 additions and 10495 deletions
|
@ -1,12 +0,0 @@
|
|||
DOKUMENT = write-math-ba-paper
|
||||
make:
|
||||
pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # aux-files for makeindex / makeglossaries
|
||||
makeglossaries $(DOKUMENT)
|
||||
bibtex $(DOKUMENT)
|
||||
pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include index
|
||||
pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include symbol table
|
||||
pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include symbol table
|
||||
make clean
|
||||
|
||||
clean:
|
||||
rm -rf $(TARGET) *.class *.html *.log *.aux *.out *.thm *.idx *.toc *.ind *.ilg figures/torus.tex *.glg *.glo *.gls *.ist *.xdy *.fdb_latexmk *.bak *.blg *.bbl *.glsdefs *.acn *.acr *.alg *.nls *.nlo *.bak *.pyg *.lot *.lof
|
|
@ -1,8 +1,2 @@
|
|||
[Download compiled PDF](https://github.com/MartinThoma/LaTeX-examples/blob/master/documents/write-math-ba-paper/write-math-ba-paper.pdf)
|
||||
|
||||
Paper for [ICDAR 2015](http://2015.icdar.org/).
|
||||
|
||||
## Spell checking
|
||||
* Spell checking `aspell --lang=en --mode=tex check write-math-ba-paper.tex`
|
||||
* Spell checking with `http://www.reverso.net/spell-checker`
|
||||
* https://github.com/devd/Academic-Writing-Check
|
||||
This example is now in a private repository. If you want to get access, please
|
||||
ask info@martin-thoma.de
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,35 +0,0 @@
|
|||
SOURCE = errors-by-epoch-pretraining
|
||||
DELAY = 80
|
||||
DENSITY = 300
|
||||
WIDTH = 512
|
||||
|
||||
make:
|
||||
pdflatex $(SOURCE).tex -output-format=pdf
|
||||
make clean
|
||||
|
||||
clean:
|
||||
rm -rf $(TARGET) *.class *.html *.log *.aux *.data *.gnuplot
|
||||
|
||||
gif:
|
||||
pdfcrop $(SOURCE).pdf
|
||||
convert -verbose -delay $(DELAY) -loop 0 -density $(DENSITY) $(SOURCE)-crop.pdf $(SOURCE).gif
|
||||
make clean
|
||||
|
||||
png:
|
||||
make
|
||||
make svg
|
||||
inkscape $(SOURCE).svg -w $(WIDTH) --export-png=$(SOURCE).png
|
||||
|
||||
transparentGif:
|
||||
convert $(SOURCE).pdf -transparent white result.gif
|
||||
make clean
|
||||
|
||||
svg:
|
||||
make
|
||||
#inkscape $(SOURCE).pdf --export-plain-svg=$(SOURCE).svg
|
||||
pdf2svg $(SOURCE).pdf $(SOURCE).svg
|
||||
# Necessary, as pdf2svg does not always create valid svgs:
|
||||
inkscape $(SOURCE).svg --export-plain-svg=$(SOURCE).svg
|
||||
rsvg-convert -a -w $(WIDTH) -f svg $(SOURCE).svg -o $(SOURCE)2.svg
|
||||
inkscape $(SOURCE)2.svg --export-plain-svg=$(SOURCE).svg
|
||||
rm $(SOURCE)2.svg
|
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -1,31 +0,0 @@
|
|||
\begin{tikzpicture}
|
||||
\begin{axis}[
|
||||
axis x line=middle,
|
||||
axis y line=middle,
|
||||
enlarge y limits=true,
|
||||
xmin=0,
|
||||
% xmax=1000,
|
||||
ymin=0.18, ymax=0.4,
|
||||
minor ytick={0, 0.01, ..., 1},
|
||||
% width=15cm, height=8cm, % size of the image
|
||||
grid = both,
|
||||
minor grid style={dashed, gray!30},
|
||||
major grid style={gray!40},,
|
||||
%grid style={dashed, gray!30},
|
||||
ylabel=error,
|
||||
xlabel=epoch,
|
||||
legend cell align=left,
|
||||
legend style={
|
||||
at={(0.5,-0.1)},
|
||||
anchor=north,
|
||||
legend columns=2
|
||||
}
|
||||
]
|
||||
\addplot[mark=x,green] table [each nth point=20,x=epoch, y=testerror, col sep=comma] {baseline-1.csv};
|
||||
\addplot[mark=x,orange] table [each nth point=20,x=epoch, y=testerror, col sep=comma] {baseline-2.csv};
|
||||
\addplot[mark=x,red] table [each nth point=20,x=epoch, y=testerror, col sep=comma] {baseline-2-pretraining.csv};
|
||||
\legend{{1 hidden layer},
|
||||
{2 hidden layers},
|
||||
{2 hidden layers with pretraining}}
|
||||
\end{axis}
|
||||
\end{tikzpicture}
|
|
@ -1,73 +0,0 @@
|
|||
%!TEX root = thesis.tex
|
||||
%Term definitions
|
||||
\newacronym{ANN}{ANN}{artificial neural network}
|
||||
\newacronym{CSR}{CSR}{cursive script recognition}
|
||||
\newacronym{DTW}{DTW}{dynamic time warping}
|
||||
\newacronym{GTW}{GTW}{greedy time warping}
|
||||
\newacronym{HMM}{HMM}{hidden Markov model}
|
||||
\newacronym{HWR}{HWR}{handwriting recognition}
|
||||
\newacronym{HWRT}{HWRT}{handwriting recognition toolkit}
|
||||
\newacronym{MLP}{MLP}{multilayer perceptron}
|
||||
\newacronym{MSE}{MSE}{mean squared error}
|
||||
\newacronym{OOV}{OOV}{out of vocabulary}
|
||||
\newacronym{TDNN}{TDNN}{time delay neural network}
|
||||
\newacronym{PCA}{PCA}{principal component analysis}
|
||||
\newacronym{LDA}{LDA}{linear discriminant analysis}
|
||||
\newacronym{CROHME}{CROHME}{Competition on Recognition of Online Handwritten Mathematical Expressions}
|
||||
\newacronym{GMM}{GMM}{Gaussian mixture model}
|
||||
\newacronym{SVM}{SVM}{support vector machine}
|
||||
\newacronym{PyPI}{PyPI}{Python Package Index}
|
||||
\newacronym{CFM}{CFM}{classification figure of merit}
|
||||
\newacronym{CE}{CE}{cross entropy}
|
||||
\newacronym{GPU}{GPU}{graphics processing unit}
|
||||
\newacronym{CUDA}{CUDA}{Compute Unified Device Architecture}
|
||||
\newacronym{SLP}{SLP}{supervised layer-wise pretraining}
|
||||
|
||||
% Term definitions
|
||||
\newglossaryentry{Detexify}{name={Detexify}, description={A system used for
|
||||
on-line handwritten symbol recognition which is described in \cite{Kirsch}}}
|
||||
|
||||
\newglossaryentry{epoch}{name={epoch}, description={During iterative training of a neural network, an \textit{epoch} is a single pass through the entire training set, followed by testing of the verification set.\cite{Concise12}}}
|
||||
|
||||
\newglossaryentry{hypothesis}{
|
||||
name={hypothesis},
|
||||
description={The recognition result which a classifier returns is called a hypothesis. In other words, it is the \enquote{guess} of a classifier},
|
||||
plural=hypotheses
|
||||
}
|
||||
|
||||
\newglossaryentry{reference}{
|
||||
name={reference},
|
||||
description={Labeled data is used to evaluate classifiers. Those labels are called references},
|
||||
}
|
||||
|
||||
\newglossaryentry{YAML}{name={YAML}, description={YAML is a human-readable data format that can be used for configuration files}}
|
||||
\newglossaryentry{MER}{name={MER}, description={An error measure which combines symbols to equivalence classes. It was introduced on \cpageref{merged-error-introduction}}}
|
||||
|
||||
\newglossaryentry{JSON}{name={JSON}, description={JSON, short for JavaScript Object Notation, is a language-independent data format that can be used to transmit data between a server and a client in web applications}}
|
||||
|
||||
\newglossaryentry{hyperparamter}{name={hyperparameter}, description={A
|
||||
\textit{hyperparameter} is a parameter of a neural net, that cannot be learned,
|
||||
but has to be chosen}, symbol={\ensuremath{\theta}}}
|
||||
|
||||
\newglossaryentry{learning rate}{name={learning rate}, description={A factor $0 \leq \eta \in \mdr$ that affects how fast new weights are learned. $\eta=0$ means that no new data is learned}, symbol={\ensuremath{\eta}}} % Andrew Ng: \alpha
|
||||
|
||||
\newglossaryentry{learning rate decay}{name={learning rate decay}, description={The learning rate decay $0 < \alpha \leq 1$ is used to adjust the learning rate. After each epoch the learning rate $\eta$ is updated to $\eta \gets \eta \times \alpha$}, symbol={\ensuremath{\eta}}}
|
||||
|
||||
\newglossaryentry{preactivation}{name={preactivation}, description={The preactivation of a neuron is the weighted sum of its input, before the activation function is applied}}
|
||||
|
||||
\newglossaryentry{stroke}{name={stroke}, description={The path the pen took from
|
||||
the point where the pen was put down to the point where the pen was lifted first}}
|
||||
|
||||
\newglossaryentry{line}{name={line}, description={Geometric object that is infinitely long
|
||||
and defined by two points.}}
|
||||
|
||||
\newglossaryentry{line segment}{name={line segment}, description={Geometric object that has finite length
|
||||
and is defined by two points.}}
|
||||
|
||||
\newglossaryentry{symbol}{name={symbol}, description={An atomic semantic entity. A more detailed description can be found in \cref{sec:what-is-a-symbol}}}
|
||||
|
||||
\newglossaryentry{weight}{name={weight}, description={A
|
||||
\textit{weight} is a parameter of a neural net, that can be learned}, symbol={\ensuremath{\weight}}}
|
||||
|
||||
\newglossaryentry{control point}{name={control point}, description={A
|
||||
\textit{control point} is a point recorded by the input device.}}
|
|
@ -1,12 +0,0 @@
|
|||
\newcommand{\totalCollectedRecordings}{166898} % ACTUALITY
|
||||
\newcommand{\detexifyCollectedRecordings}{153423}
|
||||
\newcommand{\trainingsetsize}{134804}
|
||||
\newcommand{\validtionsetsize}{15161}
|
||||
\newcommand{\testsetsize}{17012}
|
||||
\newcommand{\totalClasses}{1111}
|
||||
\newcommand{\totalClassesAnalyzed}{369}
|
||||
\newcommand{\totalClassesAboveFifty}{680}
|
||||
\newcommand{\totalClassesNotAnalyzedBelowFifty}{431}
|
||||
\newcommand{\detexifyPercentage}{$\SI{91.93}{\percent}$}
|
||||
\newcommand{\recordingsWithDots}{$\SI{2.77}{\percent}$} % excluding i,j, ...
|
||||
\newcommand{\recordingsWithDotsSizechange}{$\SI{0.85}{\percent}$} % excluding i,j, ...
|
Binary file not shown.
Before Width: | Height: | Size: 696 KiB |
Binary file not shown.
Before Width: | Height: | Size: 669 KiB |
Binary file not shown.
Before Width: | Height: | Size: 659 KiB |
Binary file not shown.
Before Width: | Height: | Size: 629 KiB |
Binary file not shown.
Before Width: | Height: | Size: 630 KiB |
File diff suppressed because it is too large
Load diff
Binary file not shown.
|
@ -1,586 +0,0 @@
|
|||
\documentclass[9pt,technote]{IEEEtran}
|
||||
\usepackage{amssymb, amsmath} % needed for math
|
||||
\usepackage{hyperref} % links im text
|
||||
\usepackage{parskip}
|
||||
\usepackage[pdftex,final]{graphicx}
|
||||
\usepackage{csquotes}
|
||||
\usepackage{braket}
|
||||
\usepackage{booktabs}
|
||||
\usepackage{multirow}
|
||||
\usepackage{pgfplots}
|
||||
\usepackage{ wasysym }
|
||||
\usepackage[noadjust]{cite}
|
||||
\usepackage[nameinlink,noabbrev]{cleveref} % has to be after hyperref, ntheorem, amsthm
|
||||
\usepackage[binary-units]{siunitx}
|
||||
\sisetup{per-mode=fraction,binary-units=true}
|
||||
\DeclareSIUnit\pixel{px}
|
||||
\usepackage{glossaries}
|
||||
\loadglsentries[main]{glossary}
|
||||
\makeglossaries
|
||||
|
||||
\title{On-line Recognition of Handwritten Mathematical Symbols}
|
||||
\author{Martin Thoma, Kevin Kilgour, Sebastian St{\"u}ker and Alexander Waibel}
|
||||
|
||||
\hypersetup{
|
||||
pdfauthor = {Martin Thoma, Kevin Kilgour, Sebastian St{\"u}ker and Alexander Waibel},
|
||||
pdfkeywords = {Mathematics,Symbols,recognition},
|
||||
pdftitle = {On-line Recognition of Handwritten Mathematical Symbols}
|
||||
}
|
||||
\include{variables}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
% Begin document %
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
\begin{document}
|
||||
\maketitle
|
||||
\begin{abstract}
|
||||
|
||||
The automatic recognition of single handwritten symbols has three main
|
||||
applications. The first application is to support users who know what a symbol
|
||||
looks like, but not what its name is such as $\saturn$. The second application
|
||||
is providing the necessary commands for professional publishing in books or on
|
||||
websites, e.g. in the form of \LaTeX{} commands, as MathML, or as code points. The
|
||||
third application of single symbol classifiers is as a building block
|
||||
for formula recognition.
|
||||
|
||||
This paper presents a system
|
||||
which uses the pen trajectory to classify handwritten symbols. Five
|
||||
preprocessing steps, one data multiplication algorithm, five features and five
|
||||
variants for multilayer Perceptron training were evaluated using $\num{166898}$
|
||||
recordings which were collected with two crowdsourcing projects. The evaluation
|
||||
results of these 21~experiments were used to create an optimized recognizer
|
||||
which has a TOP-1 error of less than $\SI{17.5}{\percent}$ and a TOP-3 error of
|
||||
$\SI{4.0}{\percent}$. This is a relative improvement of $\SI{18.5}{\percent}$
|
||||
for the TOP-1 error and $\SI{29.7}{\percent}$ for the TOP-3 error compared to
|
||||
the baseline system.
|
||||
\end{abstract}
|
||||
|
||||
\section{Introduction}
|
||||
On-line recognition makes use of the pen trajectory. This means the data is
|
||||
given as groups of sequences of tuples $(x, y, t) \in \mathbb{R}^3$, where each
|
||||
group represents a stroke, $(x, y)$ is the position of the pen on a canvas and
|
||||
$t$ is the time.
|
||||
|
||||
On-line data was used to classify handwritten natural language text in many
|
||||
different variants. For example, the NPen++ system classified cursive
|
||||
handwriting into English words by using hidden Markov models and neural
|
||||
networks\cite{Manke1995}.
|
||||
|
||||
% One handwritten symbol in the described format is called a
|
||||
% \textit{recording}. One approach to classify recordings into symbol classes
|
||||
% assigns a probability to each class given the data. The classifier can be
|
||||
% evaluated by using recordings which were classified by humans and were not used
|
||||
% to train the classifier. The set of those recordings is called \textit{test
|
||||
% set}. The TOP-$n$ error is defined as the fraction of the symbols where
|
||||
% the correct class was not within the top $n$ classes of the highest
|
||||
% probability.
|
||||
|
||||
Several systems for mathematical symbol recognition with on-line data have been
|
||||
described so far~\cite{Kosmala98,Mouchere2013}, but no standard test set
|
||||
existed to compare the results of different classifiers. The used symbols
|
||||
differed in all papers. This is unfortunate as the choice of symbols is crucial
|
||||
for the TOP-$n$ error. For example, the symbols $o$, $O$, $\circ$ and $0$ are
|
||||
very similar and systems which know all those classes will certainly have a
|
||||
higher TOP-$n$ error than systems which only accept one of them. But not only
|
||||
the classes differed, also the used data to train and test had to be collected
|
||||
by each author again.
|
||||
|
||||
Daniel Kirsch describes in~\cite{Kirsch} a system called Detexify which uses
|
||||
time warping to classify on-line handwritten symbols and reports a TOP-3 error
|
||||
of less than $\SI{10}{\percent}$ for a set of $\num{100}$~symbols. He did also
|
||||
recently publish his data on \url{https://github.com/kirel/detexify-data},
|
||||
which was collected by a crowdsourcing approach via
|
||||
\url{http://detexify.kirelabs.org}. Those recordings as well as some recordings
|
||||
which were collected by a similar approach via \url{http://write-math.com} were
|
||||
used to train and evaluate different classifiers. A complete description of
|
||||
all involved software, data and experiments is given in~\cite{Thoma:2014}.
|
||||
|
||||
In this paper we present a baseline system for the classification of on-line
|
||||
handwriting into $369$ classes of which some are very similar, and an optimized
|
||||
classifier which has a $\SI{29.7}{\percent}$ relative improvement of the TOP-3
|
||||
error. This was achieved by using better features and layer-wise supervised
|
||||
pretraining. The absolute improvements compared to the baseline of those
|
||||
changes will also be shown.
|
||||
|
||||
|
||||
\section{Steps in Handwriting Recognition}
|
||||
The following steps are used for symbol classification:\nobreak
|
||||
\begin{enumerate}
|
||||
\item \textbf{Preprocessing}: Recorded data is never perfect. Devices have
|
||||
errors and people make mistakes while using the devices. To tackle
|
||||
these problems there are preprocessing algorithms to clean the data.
|
||||
The preprocessing algorithms can also remove unnecessary variations
|
||||
of the data that do not help in the classification process, but hide
|
||||
what is important. Having slightly different sizes of the same symbol
|
||||
is an example of such a variation. Four preprocessing algorithms that
|
||||
clean or normalize recordings are explained in
|
||||
\cref{sec:preprocessing}.
|
||||
\item \textbf{Data multiplication}: Learning algorithms need lots of data
|
||||
to learn internal parameters. If there is not enough data available,
|
||||
domain knowledge can be considered to create new artificial data from
|
||||
the original data. In the domain of on-line handwriting recognition,
|
||||
data can be multiplied by adding rotated variants.
|
||||
\item \textbf{Segmentation}: The task of formula recognition can eventually
|
||||
be reduced to the task of symbol recognition combined with symbol
|
||||
placement. Before symbol recognition can be done, the formula has
|
||||
to be segmented. As this paper is only about single-symbol
|
||||
recognition, this step will not be further discussed.
|
||||
\item \textbf{Feature computation}: A feature is high-level information
|
||||
derived from the raw data after preprocessing. Some systems like
|
||||
Detexify take the result of the preprocessing step, but many compute
|
||||
new features. Those features could be designed by a human engineer or
|
||||
learned. Non-raw data features can have the advantage that less
|
||||
training data is needed since the developer can use knowledge about
|
||||
handwriting to compute highly discriminative features. Various
|
||||
features are explained in \cref{sec:features}.
|
||||
\item \textbf{Feature enhancement}: Applying PCA, LDA, or
|
||||
feature standardization might change the features in ways that could
|
||||
improve the performance of learning algorithms.
|
||||
\end{enumerate}
|
||||
|
||||
After these steps, we are faced with a classification learning task which
|
||||
consists of two parts:
|
||||
\begin{enumerate}
|
||||
\item \textbf{Learning} parameters for a given classifier.
|
||||
\item \textbf{Classifying} new recordings, sometimes called
|
||||
\textit{evaluation}. This should not be confused with the evaluation
|
||||
of the classification performance which is done for multiple
|
||||
topologies, preprocessing queues, and features in
|
||||
\Cref{ch:Evaluation}.
|
||||
\end{enumerate}
|
||||
|
||||
The classification learning task can be solved with \glspl{MLP} if the number
|
||||
of input features is the same for every recording. There are many ways to
|
||||
adjust \glspl{MLP} and how to adjust their training. Some of them are
|
||||
described in~\cref{sec:mlp-training}.
|
||||
|
||||
|
||||
\section{Data and Implementation}
|
||||
The combined data of Detexify and \href{http://write-math.com}{write-math.com}
|
||||
can be downloaded via \href{http://write-math.com/data}{write-math.com/data} as
|
||||
a compressed tar archive. It contains a list of $369$ symbols which are used in
|
||||
mathematical context. Each symbol has at least $50$ labeled examples, but most
|
||||
symbols have more than $200$ labeled examples and some have more than $2000$.
|
||||
In total, more than $\num{160000}$ labeled recordings were collected.
|
||||
|
||||
Preprocessing and feature computation algorithms were implemented and are
|
||||
publicly available as open-source software in the Python package \texttt{hwrt}
|
||||
and \gls{MLP} algorithms are available in the Python package
|
||||
\texttt{nntoolkit}.
|
||||
|
||||
|
||||
\section{Algorithms}
|
||||
\subsection{Preprocessing}\label{sec:preprocessing}
|
||||
Preprocessing in symbol recognition is done to improve the quality and
|
||||
expressive power of the data. It should make follow-up tasks like segmentation
|
||||
and feature extraction easier, more effective or faster. It does so by resolving
|
||||
errors in the input data, reducing duplicate information and removing irrelevant
|
||||
information.
|
||||
|
||||
Preprocessing algorithms fall into two groups: Normalization and noise
|
||||
reduction algorithms.
|
||||
|
||||
A very important normalization algorithm in single-symbol recognition is
|
||||
\textit{scale-and-shift}~\cite{Thoma:2014}. It scales the recording so that
|
||||
its bounding box fits into a unit square. As the aspect ratio of a recording
|
||||
is almost never 1:1, only one dimension will fit exactly in the unit square.
|
||||
There are multiple ways to shift the recording. For this paper, it was
|
||||
chosen to shift the bigger dimension to fit into the $[0,1] \times [0,1]$ unit
|
||||
square whereas the smaller dimension is centered in the $[-1,1] \times [-1,1]$
|
||||
square.
|
||||
|
||||
Another normalization preprocessing algorithm is resampling. As the data points
|
||||
on the pen trajectory are generated asynchronously and with different
|
||||
time-resolutions depending on the used hardware and software, it is desirable
|
||||
to resample the recordings to have points spread equally in time for every
|
||||
recording. This was done by linear interpolation of the $(x,t)$ and $(y,t)$
|
||||
sequences and getting a fixed number of equally spaced points per stroke.
|
||||
|
||||
\textit{Connect strokes} is a noise reduction algorithm. It happens sometimes
|
||||
that the hardware detects that the user lifted the pen where the user certainly
|
||||
didn't do so. This can be detected by measuring the Euclidean distance between
|
||||
the end of one stroke and the beginning of the next stroke. If this distance is
|
||||
below a threshold, then the strokes are connected.
|
||||
|
||||
Due to a limited resolution of the recording device and due to erratic
|
||||
handwriting, the pen trajectory might not be smooth. One way to smooth is
|
||||
calculating a weighted average and replacing points by the weighted average of
|
||||
their coordinate and their neighbors' coordinates. Another way to do smoothing
|
||||
would be to reduce the number of points with the Douglas-Peucker algorithm to
|
||||
the most relevant ones and then interpolate the stroke between those points.
|
||||
The Douglas-Peucker stroke simplification algorithm is usually used in
|
||||
cartography to simplify the shape of roads. It works recursively to find a
|
||||
subset of points of a stroke that is simpler and still similar to the original
|
||||
shape. The algorithm adds the first and the last point $p_1$ and $p_n$ of a
|
||||
stroke to the simplified set of points $S$. Then it searches the point $p_i$ in
|
||||
between that has maximum distance from the line $p_1 p_n$. If this
|
||||
distance is above a threshold $\varepsilon$, the point $p_i$ is added to $S$.
|
||||
Then the algorithm gets applied to $p_1 p_i$ and $p_i p_n$ recursively. It is
|
||||
described as \enquote{Algorithm 1} in~\cite{Visvalingam1990}.
|
||||
|
||||
\subsection{Features}\label{sec:features}
|
||||
Features can be \textit{global}, that means calculated for the complete
|
||||
recording or complete strokes. Other features are calculated for single points
|
||||
on the pen trajectory and are called \textit{local}.
|
||||
|
||||
Global features are the \textit{number of strokes} in a recording, the
|
||||
\textit{aspect ratio} of a recording's bounding box or the
|
||||
\textit{ink} being used for a recording. The ink feature gets calculated by
|
||||
measuring the length of all strokes combined. The re-curvature, which was
|
||||
introduced in~\cite{Huang06}, is defined as
|
||||
\[\text{re-curvature}(stroke) := \frac{\text{height}(stroke)}{\text{length}(stroke)}\]
|
||||
and is a stroke-global feature.
|
||||
|
||||
The simplest local feature is the coordinate of the point itself. Speed,
|
||||
curvature and a local small-resolution bitmap around the point, which was
|
||||
introduced by Manke, Finke and Waibel in~\cite{Manke1995}, are other local
|
||||
features.
|
||||
|
||||
\subsection{Multilayer Perceptrons}\label{sec:mlp-training}
|
||||
\Glspl{MLP} are explained in detail in~\cite{Mitchell97}. They can have
|
||||
different numbers of hidden layers, the number of neurons per layer and the
|
||||
activation functions can be varied. The learning algorithm is parameterized by
|
||||
the learning rate $\eta \in (0, \infty)$, the momentum $\alpha \in [0, \infty)$
|
||||
and the number of epochs.
|
||||
|
||||
The topology of \glspl{MLP} will be denoted in the following by separating the
|
||||
number of neurons per layer with colons. For example, the notation
|
||||
$160{:}500{:}500{:}500{:}369$ means that the input layer gets 160~features,
|
||||
there are three hidden layers with 500~neurons per layer and one output layer
|
||||
with 369~neurons.
|
||||
|
||||
\Gls{MLP} training can be executed in various different ways, for example
|
||||
with \gls{SLP}. In case of a \gls{MLP} with the topology
|
||||
$160{:}500{:}500{:}500{:}369$, \gls{SLP} works as follows: At first a \gls{MLP}
|
||||
with one hidden layer ($160{:}500{:}369$) is trained. Then the output layer is
|
||||
discarded, a new hidden layer and a new output layer are added and it is trained
|
||||
again, resulting in a $160{:}500{:}500{:}369$ \gls{MLP}. The output layer is
|
||||
discarded again, a new hidden layer is added and a new output layer is added
|
||||
and the training is executed again.
|
||||
|
||||
Denoising auto-encoders are another way of pretraining. An
|
||||
\textit{auto-encoder} is a neural network that is trained to restore its input.
|
||||
This means the number of input neurons is equal to the number of output
|
||||
neurons. The weights define an \textit{encoding} of the input that allows
|
||||
restoring the input. As the neural network finds the encoding by itself, it is
|
||||
called auto-encoder. If the hidden layer is smaller than the input layer, it
|
||||
can be used for dimensionality reduction~\cite{Hinton1989}. If only one hidden
|
||||
layer with linear activation functions is used, then the hidden layer contains
|
||||
the principal components after training~\cite{Duda2001}.
|
||||
|
||||
Denoising auto-encoders are a variant introduced in~\cite{Vincent2008} that
|
||||
is more robust to partial corruption of the input features. It is trained to
|
||||
get robust by adding noise to the input features.
|
||||
|
||||
There are multiple ways in which noise can be added. Gaussian noise and
|
||||
randomly masking elements with zero are two possibilities. \cite{Deeplearning-Denoising-AE}
|
||||
describes how such a denoising auto-encoder with masking noise can be
|
||||
implemented. The \texttt{corruption} is the probability of a feature being
|
||||
masked.
|
||||
|
||||
\section{Evaluation}\label{ch:Evaluation}
|
||||
In order to evaluate the effect of different preprocessing algorithms, features
|
||||
and adjustments in the \gls{MLP} training and topology, the following baseline
|
||||
system was used:
|
||||
|
||||
Scale the recording to fit into a unit square while keeping the aspect ratio,
|
||||
shift it into $[-1,1] \times [-1,1]$ as described in \cref{sec:preprocessing},
|
||||
resample it with linear interpolation to get 20~points per stroke, spaced
|
||||
evenly in time. Take the first 4~strokes with 20~points per stroke and
|
||||
2~coordinates per point as features, resulting in 160~features which is equal
|
||||
to the number of input neurons. If a recording has fewer than 4~strokes, the
|
||||
remaining features were filled with zeroes.
|
||||
|
||||
All experiments were evaluated with four baseline systems $B_i$, $i \in \Set{1,
|
||||
2, 3, 4}$, where $i$ is the number of hidden layers as different topologies
|
||||
could have a severe influence on the effect of new features or preprocessing
|
||||
steps. Each hidden layer in all evaluated systems has $500$ neurons.
|
||||
|
||||
Each \gls{MLP} was trained with a learning rate of $\eta = 0.1$ and a momentum
|
||||
of $\alpha = 0.1$. The activation function of every neuron in a hidden layer is
|
||||
the sigmoid function $\text{sig}(x) := \frac{1}{1+e^{-x}}$. The neurons in the
|
||||
output layer use the softmax function. For every experiment, exactly one part
|
||||
of the baseline systems was changed.
|
||||
|
||||
\subsection{Random Weight Initialization}
|
||||
The neural networks in all experiments got initialized with a small random
|
||||
weight
|
||||
|
||||
\[w_{i,j} \sim U(-4 \cdot \sqrt{\frac{6}{n_l + n_{l+1}}}, 4 \cdot \sqrt{\frac{6}{n_l + n_{l+1}}})\]
|
||||
|
||||
where $w_{i,j}$ is the weight between the neurons $i$ and $j$, $l$ is the layer
|
||||
of neuron $i$, and $n_l$ is the number of neurons in layer $l$. This random
|
||||
initialization was suggested on
|
||||
\cite{deeplearningweights} and is done to break symmetry.
|
||||
|
||||
This might lead to different error rates for the same systems just because the
|
||||
initialization was different.
|
||||
|
||||
In order to get an impression of the magnitude of the influence on the different
|
||||
topologies and error rates the baseline models were trained 5 times with
|
||||
random initializations.
|
||||
\Cref{table:baseline-systems-random-initializations-summary}
|
||||
shows a summary of the results. The more hidden layers are used, the more
|
||||
the results vary between different random weight initializations.
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\begin{tabular}{crrr|rrr} %chktex 44
|
||||
\toprule
|
||||
\multirow{3}{*}{System} & \multicolumn{6}{c}{Classification error}\\
|
||||
\cmidrule(l){2-7}
|
||||
& \multicolumn{3}{c}{TOP-1} & \multicolumn{3}{c}{TOP-3}\\
|
||||
& min & max & range & min & max & range\\\midrule
|
||||
$B_1$ & $\SI{23.08}{\percent}$ & $\SI{23.44}{\percent}$ & $\SI{0.36}{\percent}$ & $\SI{6.67}{\percent}$ & $\SI{6.80}{\percent}$ & $\SI{0.13}{\percent}$ \\
|
||||
$B_2$ & \underline{$\SI{21.45}{\percent}$} & \underline{$\SI{21.83}{\percent}$}& $\SI{0.38}{\percent}$ & $\SI{5.68}{\percent}$ & \underline{$\SI{5.75}{\percent}$} & $\SI{0.07}{\percent}$\\
|
||||
$B_3$ & $\SI{21.54}{\percent}$ & $\SI{22.28}{\percent}$ & $\SI{0.74}{\percent}$ & \underline{$\SI{5.50}{\percent}$} & $\SI{5.82}{\percent}$ & $\SI{0.32}{\percent}$\\
|
||||
$B_4$ & $\SI{23.19}{\percent}$ & $\SI{24.84}{\percent}$ & $\SI{1.65}{\percent}$ & $\SI{5.98}{\percent}$ & $\SI{6.44}{\percent}$ & $\SI{0.46}{\percent}$\\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{The systems $B_1$ -- $B_4$ were randomly initialized, trained
|
||||
and evaluated 5~times to estimate the influence of random weight
|
||||
initialization.}
|
||||
\label{table:baseline-systems-random-initializations-summary}
|
||||
\end{table}
|
||||
|
||||
\subsection{Connect strokes}
|
||||
In order to solve the problem of interrupted strokes, pairs of strokes
|
||||
can be connected with the stroke connect algorithm. The idea is that for
|
||||
a pair of consecutively drawn strokes $s_{i}, s_{i+1}$ the last point of $s_i$ is
|
||||
close to the first point of $s_{i+1}$ if a stroke was accidentally split
|
||||
into two strokes.
|
||||
|
||||
$\SI{59}{\percent}$ of all stroke pair distances in the collected data are
|
||||
between $\SI{30}{\pixel}$ and $\SI{150}{\pixel}$. Hence the stroke connect
|
||||
algorithm was tried with $\SI{5}{\pixel}$, $\SI{10}{\pixel}$ and
|
||||
$\SI{20}{\pixel}$.
|
||||
All models' TOP-3 error improved with a threshold of $\theta = \SI{10}{\pixel}$
|
||||
by at least $\SI{0.17}{\percent}$, except $B_4$ which improved only by
|
||||
$\SI{0.01}{\percent}$ which could be a result of random weight initialization.
|
||||
|
||||
\subsection{Douglas-Peucker Smoothing}
|
||||
The Douglas-Peucker algorithm can be used to find
|
||||
points that are more relevant for the overall shape of a recording. After that,
|
||||
an interpolation can be done. If the interpolation is a cubic spline
|
||||
interpolation, this makes the recording smooth.
|
||||
|
||||
The Douglas-Peucker algorithm was applied with a threshold of $\varepsilon =
|
||||
0.05$, $\varepsilon = 0.1$ and $\varepsilon = 0.2$ after scaling and shifting,
|
||||
but before resampling. The interpolation in the resampling step was done
|
||||
linearly and with cubic splines in two experiments. The recording was scaled
|
||||
and shifted again after the interpolation because the bounding box might have
|
||||
changed.
|
||||
|
||||
The result of the application of the Douglas-Peucker smoothing with $\varepsilon
|
||||
> 0.05$ was a high rise of the TOP-1 and TOP-3 error for all models $B_i$.
|
||||
This means that the simplification process removes some relevant information and
|
||||
does not --- as it was expected --- remove only noise. For $\varepsilon = 0.05$
|
||||
with linear interpolation the TOP-1 error of some models improved, but the
|
||||
changes were small. It could be an effect of random weight initialization.
|
||||
However, cubic spline interpolation made all systems perform more than
|
||||
$\SI{1.7}{\percent}$ worse for TOP-1 and TOP-3 error.
|
||||
|
||||
The lower the value of $\varepsilon$, the less the recording changes after
|
||||
this preprocessing step. As it was applied after scaling the recording such that
|
||||
the biggest dimension of the recording (width or height) is $1$, a value of
|
||||
$\varepsilon = 0.05$ means that a point has to move at least $\SI{5}{\percent}$
|
||||
of the biggest dimension.
|
||||
|
||||
\subsection{Global Features}
|
||||
Single global features were added one at a time to the baseline systems. Those
|
||||
features were re-curvature $\text{re-curvature}(stroke) = \frac{\text{height}(stroke)}{\text{length}(stroke)}$
|
||||
as described in \cite{Huang06}, the ink feature which is the summed length
|
||||
of all strokes, the stroke count, the aspect ratio and the stroke center points
|
||||
for the first four strokes. The stroke center point feature improved the system
|
||||
$B_1$ by $\SI{0.27}{\percent}$ for the TOP-3 error and system $B_3$ for the
|
||||
TOP-1 error by $\SI{0.74}{\percent}$, but all other systems and error measures
|
||||
either got worse or did not improve much.
|
||||
|
||||
The other global features did improve the systems $B_1$ -- $B_3$, but not $B_4$.
|
||||
The highest improvement was achieved with the re-curvature feature. It
|
||||
improved the systems $B_1$ -- $B_4$ by more than $\SI{0.6}{\percent}$ TOP-1 error.
|
||||
|
||||
|
||||
\subsection{Data Multiplication}
|
||||
Data multiplication can be used to make the model invariant to transformations.
|
||||
However, this idea seems not to work well in the domain of on-line handwritten
|
||||
mathematical symbols. It was tried to triple the data by adding a rotated
|
||||
version that is rotated 3 degrees to the left and another one that is rotated
|
||||
3 degrees to the right around the center of mass. This data multiplication
|
||||
made all classifiers for most error measures perform worse by more than
|
||||
$\SI{2}{\percent}$ for the TOP-1 error.
|
||||
|
||||
\subsection{Pretraining}\label{subsec:pretraining-evaluation}
|
||||
Pretraining is a technique used to improve the training of \glspl{MLP} with
|
||||
multiple hidden layers.
|
||||
|
||||
\Cref{fig:training-and-test-error-for-different-topologies-pretraining} shows
|
||||
the evolution of the TOP-1 error over 1000~epochs with supervised
|
||||
layer-wise pretraining and without pretraining. It clearly shows that this
|
||||
kind of pretraining improves the classification performance by $\SI{1.6}{\percent}$
|
||||
for the TOP-1 error and $\SI{1.0}{\percent}$ for the TOP-3 error.
|
||||
|
||||
\begin{figure}[htb]
|
||||
\centering
|
||||
\input{figures/errors-by-epoch-pretraining/errors-by-epoch-pretraining.tex}
|
||||
\caption{Training and test error by number of trained epochs for different
|
||||
topologies with \gls{SLP}. The plot shows
|
||||
that all pretrained systems performed much better than the systems
|
||||
without pretraining. All plotted systems did not improve
|
||||
with more epochs of training.}
|
||||
\label{fig:training-and-test-error-for-different-topologies-pretraining}
|
||||
\end{figure}
|
||||
|
||||
Pretraining with denoising auto-encoders led to the much worse results listed in
|
||||
\cref{table:pretraining-denoising-auto-encoder}. The first layer used a $\tanh$
|
||||
activation function. Every layer was trained for $1000$ epochs with the
|
||||
\gls{MSE} loss function. A learning-rate of $\eta = 0.001$, a corruption of
|
||||
$0.3$ and an $L_2$ regularization of $\lambda = 10^{-4}$ were chosen. This
|
||||
pretraining setup made all systems with all error measures perform much worse.
|
||||
|
||||
\begin{table}[tb]
|
||||
\centering
|
||||
\begin{tabular}{lrrrr}
|
||||
\toprule
|
||||
\multirow{2}{*}{System} & \multicolumn{4}{c}{Classification error}\\
|
||||
\cmidrule(l){2-5}
|
||||
& TOP-1 & change & TOP-3 & change \\\midrule
|
||||
$B_{1,p}$ & $\SI{23.75}{\percent}$ & $\SI{+0.41}{\percent}$ & $\SI{7.19}{\percent}$ & $\SI{+0.39}{\percent}$\\
|
||||
$B_{2,p}$ & \underline{$\SI{22.76}{\percent}$} & $\SI{+1.25}{\percent}$ & $\SI{6.38}{\percent}$ & $\SI{+0.63}{\percent}$\\
|
||||
$B_{3,p}$ & $\SI{23.10}{\percent}$ & $\SI{+1.17}{\percent}$ & \underline{$\SI{6.14}{\percent}$} & $\SI{+0.40}{\percent}$\\
|
||||
$B_{4,p}$ & $\SI{25.59}{\percent}$ & $\SI{+1.71}{\percent}$ & $\SI{6.99}{\percent}$ & $\SI{+0.87}{\percent}$\\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Systems with denoising auto-encoder pretraining compared to pure
|
||||
gradient descent. The pretrained systems clearly performed worse.}
|
||||
\label{table:pretraining-denoising-auto-encoder}
|
||||
\end{table}
|
||||
|
||||
\subsection{Optimized Recognizer}
|
||||
All preprocessing steps and features that were useful were combined to
|
||||
create a recognizer that should perform best.
|
||||
|
||||
All models were much better than everything that was tried before. The results
|
||||
of this experiment show that single-symbol recognition with
|
||||
\totalClassesAnalyzed{} classes and usual touch devices and the mouse can be
|
||||
done with a TOP-1 error rate of $\SI{18.56}{\percent}$ and a TOP-3 error of
|
||||
$\SI{4.11}{\percent}$. This was
|
||||
achieved by a \gls{MLP} with a $167{:}500{:}500{:}\totalClassesAnalyzed{}$ topology.
|
||||
|
||||
It used an algorithm to connect strokes of which the ends were less than
|
||||
$\SI{10}{\pixel}$ away, scaled each recording to a unit square and shifted this
|
||||
unit square to $(0,0)$. After that, a linear resampling step was applied to the
|
||||
first 4 strokes to resample them to 20 points each. All other strokes were
|
||||
discarded.
|
||||
|
||||
The 167 features were
|
||||
|
||||
\begin{itemize}
|
||||
\item the first 4 strokes with 20 points per stroke resulting in 160
|
||||
features,
|
||||
\item the re-curvature for the first 4 strokes,
|
||||
\item the ink,
|
||||
\item the number of strokes and
|
||||
\item the aspect ratio
|
||||
\end{itemize}
|
||||
|
||||
\Gls{SLP} was applied with $\num{1000}$ epochs per layer, a
|
||||
learning rate of $\eta=0.1$ and a momentum of $\alpha=0.1$. After that, the
|
||||
complete model was trained again for $1000$ epochs with standard mini-batch
|
||||
gradient descent.
|
||||
|
||||
After the models $B_{1,c}$ -- $B_{4,c}$ were trained for the first $1000$ epochs,
|
||||
they were trained again for $1000$ epochs with a learning rate of $\eta = 0.05$.
|
||||
\Cref{table:complex-recognizer-systems-evaluation} shows that
|
||||
this improved the classifiers again.
|
||||
|
||||
\begin{table}[htb]
|
||||
\centering
|
||||
\begin{tabular}{lrrrr}
|
||||
\toprule
|
||||
\multirow{2}{*}{System} & \multicolumn{4}{c}{Classification error}\\
|
||||
\cmidrule(l){2-5}
|
||||
& TOP-1 & change & TOP-3 & change\\\midrule
|
||||
$B_{1,c}$ & $\SI{20.96}{\percent}$ & $\SI{-2.38}{\percent}$ & $\SI{5.24}{\percent}$ & $\SI{-1.56}{\percent}$\\
|
||||
$B_{2,c}$ & $\SI{18.26}{\percent}$ & $\SI{-3.25}{\percent}$ & $\SI{4.07}{\percent}$ & $\SI{-1.68}{\percent}$\\
|
||||
$B_{3,c}$ & \underline{$\SI{18.19}{\percent}$} & $\SI{-3.74}{\percent}$ & \underline{$\SI{4.06}{\percent}$} & $\SI{-1.68}{\percent}$\\
|
||||
$B_{4,c}$ & $\SI{18.57}{\percent}$ & $\SI{-5.31}{\percent}$ & $\SI{4.25}{\percent}$ & $\SI{-1.87}{\percent}$\\\midrule
|
||||
$B_{1,c}'$ & $\SI{19.33}{\percent}$ & $\SI{-1.63}{\percent}$ & $\SI{4.78}{\percent}$ & $\SI{-0.46}{\percent}$ \\
|
||||
$B_{2,c}'$ & \underline{$\SI{17.52}{\percent}$} & $\SI{-0.74}{\percent}$ & \underline{$\SI{4.04}{\percent}$} & $\SI{-0.03}{\percent}$\\
|
||||
$B_{3,c}'$ & $\SI{17.65}{\percent}$ & $\SI{-0.54}{\percent}$ & $\SI{4.07}{\percent}$ & $\SI{+0.01}{\percent}$\\
|
||||
$B_{4,c}'$ & $\SI{17.82}{\percent}$ & $\SI{-0.75}{\percent}$ & $\SI{4.26}{\percent}$ & $\SI{+0.01}{\percent}$\\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Error rates of the optimized recognizer systems. The systems
|
||||
$B_{i,c}'$ were trained another $1000$ epochs with a learning rate
|
||||
of $\eta=0.05$. The value of the column \enquote{change} of the
|
||||
systems $B_{i,c}'$ is relative to $B_{i,c}$.}
|
||||
\label{table:complex-recognizer-systems-evaluation}
|
||||
\end{table}
|
||||
|
||||
|
||||
\section{Discussion}
|
||||
Four baseline recognition systems were adjusted in many experiments and their
|
||||
recognition capabilities were compared in order to build a recognition system
|
||||
that can recognize 396 mathematical symbols with low error rates as well as to
|
||||
evaluate which preprocessing steps and features help to improve the recognition
|
||||
rate.
|
||||
|
||||
All recognition systems were trained and evaluated with
|
||||
$\num{\totalCollectedRecordings{}}$ recordings for \totalClassesAnalyzed{}
|
||||
symbols. These recordings were collected by two crowdsourcing projects
|
||||
(\href{http://detexify.kirelabs.org/classify.html}{Detexify} and
|
||||
\href{http://write-math.com}{write-math.com}) and created with various devices. While
|
||||
some recordings were created with standard touch devices such as tablets and
|
||||
smartphones, others were created with the mouse.
|
||||
|
||||
\Glspl{MLP} were used for the classification task. Four baseline systems with
|
||||
different numbers of hidden layers were used, as the number of hidden layers
|
||||
influences the capabilities and problems of \glspl{MLP}.
|
||||
|
||||
All baseline systems used the same preprocessing queue. The recordings were
|
||||
scaled to fit into a unit square, shifted to $(0,0)$ and resampled with linear
|
||||
interpolation so that every stroke had exactly 20~points which are spread
|
||||
equidistant in time. The 80~($x,y$) coordinates of the first 4~strokes were used
|
||||
to get exactly $160$ input features for every recording. The baseline system
|
||||
$B_2$ has a TOP-3 error of $\SI{5.75}{\percent}$.
|
||||
|
||||
Adding two slightly rotated variants for each recording and hence tripling the
|
||||
training set made the systems $B_3$ and $B_4$ perform much worse, but improved
|
||||
the performance of the smaller systems.
|
||||
|
||||
The global features re-curvature, ink, stroke count and aspect ratio improved the
|
||||
systems $B_1$--$B_3$, whereas the stroke center point feature made $B_2$ perform
|
||||
worse.
|
||||
|
||||
Denoising auto-encoders were evaluated as one way
|
||||
to use pretraining, but by this the error rate increased notably. However,
|
||||
supervised layer-wise pretraining improved the performance decidedly.
|
||||
|
||||
The stroke connect algorithm was added to the preprocessing steps of the
|
||||
baseline system as well as the re-curvature feature, the ink feature, the number
|
||||
of strokes and the aspect ratio. The training setup of the baseline system was
|
||||
changed to supervised layer-wise pretraining and the resulting model was trained
|
||||
with a lower learning rate again. This optimized recognizer $B_{2,c}'$ had a TOP-3
|
||||
error of $\SI{4.04}{\percent}$. This means that the TOP-3 error dropped by over
|
||||
$\SI{1.7}{\percent}$ in comparison to the baseline system $B_2$.
|
||||
|
||||
A TOP-3 error of $\SI{4.04}{\percent}$ makes the system usable for symbol lookup.
|
||||
It could also be used as a starting point for the development of a
|
||||
multiple-symbol classifier.
|
||||
|
||||
The aim of this work was to develop a symbol recognition system which is easy
|
||||
to use, fast and has high recognition rates as well as to evaluate ideas for
|
||||
single symbol classifiers. Some of those goals were reached. The recognition
|
||||
system $B_{2,c}'$ evaluates new recordings in a fraction of a second and has
|
||||
acceptable recognition rates.
|
||||
|
||||
% Many algorithms were evaluated.
|
||||
% However, there are still many other algorithms which could be evaluated and, at
|
||||
% the time of this work, the best classifier $B_{2,c}'$ is only available
|
||||
% through the Python package \texttt{hwrt}. It is planned to add a web version
|
||||
% of that classifier online.
|
||||
|
||||
\bibliographystyle{IEEEtranSA}
|
||||
\bibliography{write-math-ba-paper}
|
||||
\end{document}
|
Loading…
Add table
Add a link
Reference in a new issue