mirror of
https://github.com/MartinThoma/LaTeX-examples.git
synced 2025-04-26 06:48:04 +02:00
documents/write-math-ba-paper: Initial commit
This commit is contained in:
parent 6ffe05846e
commit cea22c65c0
6 changed files with 2142 additions and 0 deletions
12
documents/write-math-ba-paper/Makefile
Normal file
@@ -0,0 +1,12 @@
DOKUMENT = write-math-ba-paper
make:
	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # aux-files for makeindex / makeglossaries
	makeglossaries $(DOKUMENT)
	bibtex $(DOKUMENT)
	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include index
	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include symbol table
	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include symbol table
	make clean

clean:
	rm -rf $(TARGET) *.class *.html *.log *.aux *.out *.thm *.idx *.toc *.ind *.ilg figures/torus.tex *.glg *.glo *.gls *.ist *.xdy *.fdb_latexmk *.bak *.blg *.bbl *.glsdefs *.acn *.acr *.alg *.nls *.nlo *.bak *.pyg *.lot *.lof
21
documents/write-math-ba-paper/README.md
Normal file
@@ -0,0 +1,21 @@
## TODO

### Preprocessing
* Scale-and-shift
* linear interpolation
* connect strokes
* Douglas-Peucker

### Features
* coordinates
* ink
* stroke count
* aspect ratio

### Training
* learning rate
* momentum
* Supervised layer-wise pretraining


* Check abstract!
73
documents/write-math-ba-paper/glossary.tex
Normal file
@@ -0,0 +1,73 @@
%!TEX root = thesis.tex
% Term definitions
\newacronym{ANN}{ANN}{artificial neural network}
\newacronym{CSR}{CSR}{cursive script recognition}
\newacronym{DTW}{DTW}{dynamic time warping}
\newacronym{GTW}{GTW}{greedy time warping}
\newacronym{HMM}{HMM}{hidden Markov model}
\newacronym{HWR}{HWR}{handwriting recognition}
\newacronym{HWRT}{HWRT}{handwriting recognition toolkit}
\newacronym{MLP}{MLP}{multilayer perceptron}
\newacronym{MSE}{MSE}{mean squared error}
\newacronym{OOV}{OOV}{out of vocabulary}
\newacronym{TDNN}{TDNN}{time delay neural network}
\newacronym{PCA}{PCA}{principal component analysis}
\newacronym{LDA}{LDA}{linear discriminant analysis}
\newacronym{CROHME}{CROHME}{Competition on Recognition of Online Handwritten Mathematical Expressions}
\newacronym{GMM}{GMM}{Gaussian mixture model}
\newacronym{SVM}{SVM}{support vector machine}
\newacronym{PyPI}{PyPI}{Python Package Index}
\newacronym{CFM}{CFM}{classification figure of merit}
\newacronym{CE}{CE}{cross entropy}
\newacronym{GPU}{GPU}{graphics processing unit}
\newacronym{CUDA}{CUDA}{Compute Unified Device Architecture}
\newacronym{SLP}{SLP}{supervised layer-wise pretraining}

% Term definitions
\newglossaryentry{Detexify}{name={Detexify}, description={A system used for
on-line handwritten symbol recognition which is described in \cite{Kirsch}}}

\newglossaryentry{epoch}{name={epoch}, description={During iterative training of a neural network, an \textit{epoch} is a single pass through the entire training set, followed by testing of the verification set~\cite{Concise12}}}

\newglossaryentry{hypothesis}{
    name={hypothesis},
    description={The recognition result which a classifier returns is called a hypothesis. In other words, it is the \enquote{guess} of a classifier},
    plural=hypotheses
}

\newglossaryentry{reference}{
    name={reference},
    description={Labeled data is used to evaluate classifiers. Those labels are called references}
}

\newglossaryentry{YAML}{name={YAML}, description={YAML is a human-readable data format that can be used for configuration files}}
\newglossaryentry{MER}{name={MER}, description={An error measure which combines symbols into equivalence classes. It was introduced on \cpageref{merged-error-introduction}}}

\newglossaryentry{JSON}{name={JSON}, description={JSON, short for JavaScript Object Notation, is a language-independent data format that can be used to transmit data between a server and a client in web applications}}

\newglossaryentry{hyperparamter}{name={hyperparameter}, description={A
\textit{hyperparameter} is a parameter of a neural net that cannot be learned,
but has to be chosen}, symbol={\ensuremath{\theta}}}

\newglossaryentry{learning rate}{name={learning rate}, description={A factor $0 \leq \eta \in \mdr$ that affects how fast new weights are learned. $\eta=0$ means that no new data is learned}, symbol={\ensuremath{\eta}}} % Andrew Ng: \alpha

\newglossaryentry{learning rate decay}{name={learning rate decay}, description={The learning rate decay $0 < \alpha \leq 1$ is used to adjust the learning rate. After each epoch the learning rate $\eta$ is updated to $\eta \gets \eta \times \alpha$}, symbol={\ensuremath{\alpha}}}

\newglossaryentry{preactivation}{name={preactivation}, description={The preactivation of a neuron is the weighted sum of its input, before the activation function is applied}}

\newglossaryentry{stroke}{name={stroke}, description={The path the pen took from
the point where the pen was put down to the point where the pen was first lifted}}

\newglossaryentry{line}{name={line}, description={Geometric object that is infinitely long
and is defined by two points}}

\newglossaryentry{line segment}{name={line segment}, description={Geometric object that has finite length
and is defined by two points}}

\newglossaryentry{symbol}{name={symbol}, description={An atomic semantic entity. A more detailed description can be found in \cref{sec:what-is-a-symbol}}}

\newglossaryentry{weight}{name={weight}, description={A
\textit{weight} is a parameter of a neural net that can be learned}, symbol={\ensuremath{\weight}}}

\newglossaryentry{control point}{name={control point}, description={A
\textit{control point} is a point recorded by the input device}}
12
documents/write-math-ba-paper/variables.tex
Normal file
@@ -0,0 +1,12 @@
\newcommand{\totalCollectedRecordings}{166898} % ACTUALITY
\newcommand{\detexifyCollectedRecordings}{153423}
\newcommand{\trainingsetsize}{134804}
\newcommand{\validtionsetsize}{15161}
\newcommand{\testsetsize}{17012}
\newcommand{\totalClasses}{1111}
\newcommand{\totalClassesAnalyzed}{369}
\newcommand{\totalClassesAboveFifty}{680}
\newcommand{\totalClassesNotAnalyzedBelowFifty}{431}
\newcommand{\detexifyPercentage}{$\SI{91.93}{\percent}$}
\newcommand{\recordingsWithDots}{$\SI{2.77}{\percent}$} % excluding i,j, ...
\newcommand{\recordingsWithDotsSizechange}{$\SI{0.85}{\percent}$} % excluding i,j, ...
1711
documents/write-math-ba-paper/write-math-ba-paper.bib
Normal file
File diff suppressed because it is too large
313
documents/write-math-ba-paper/write-math-ba-paper.tex
Normal file
@@ -0,0 +1,313 @@
\documentclass[9pt,technote]{IEEEtran}
\usepackage{amssymb, amsmath} % needed for math
\usepackage{hyperref}         % links in the text
\usepackage{parskip}
\usepackage{csquotes}
\usepackage{braket}
\usepackage[noadjust]{cite}
\usepackage[nameinlink,noabbrev]{cleveref} % has to be after hyperref, ntheorem, amsthm
\usepackage[binary-units]{siunitx}
\sisetup{per-mode=fraction,binary-units=true}
\DeclareSIUnit\pixel{px}
\usepackage{glossaries}
\loadglsentries[main]{glossary}
\makeglossaries

\title{On-line Recognition of Handwritten Mathematical Symbols}
\author{Martin Thoma}

\hypersetup{
    pdfauthor   = {Martin Thoma},
    pdfkeywords = {Mathematics,Symbols,recognition},
    pdftitle    = {On-line Recognition of Handwritten Mathematical Symbols}
}
\include{variables}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Begin document                                                    %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\maketitle
\begin{abstract}
Writing mathematical formulas with \LaTeX{} is easy as soon as one is used to
commands like \verb+\alpha+ and \verb+\propto+. However, for people who have
never used \LaTeX{} or who don't know the English name of a command, it can
be difficult to find the right one. Hence the automatic recognition of
handwritten mathematical symbols is desirable. This paper presents a system
which uses the pen trajectory to classify handwritten symbols. Five
preprocessing steps, one data multiplication algorithm, five features and five
variants of multilayer perceptron training were evaluated using $\num{166898}$
recordings which were collected with two crowdsourcing projects. The evaluation
results of these 21~experiments were used to create an optimized recognizer
which has a TOP1 error of less than $\SI{17.5}{\percent}$ and a TOP3 error of
$\SI{4.0}{\percent}$. This is an improvement of $\SI{18.5}{\percent}$ for the
TOP1 error and $\SI{29.7}{\percent}$ for the TOP3 error compared to the
baseline system.
\end{abstract}

\section{Introduction}
On-line recognition makes use of the pen trajectory. This means the data is
given as groups of sequences of tuples $(x, y, t) \in \mathbb{R}^3$, where
each group represents a stroke, $(x, y)$ is the position of the pen on a canvas
and $t$ is the time. One handwritten symbol in the described format is called
a \textit{recording}. Recordings can be classified by making use of this data.
One classification approach assigns a probability to each class given the data.
The classifier can be evaluated using recordings which were classified by
humans and were not used to train the classifier. The set of those recordings
is called the \textit{test set}. The TOP-$n$ error is then defined as the
fraction of the recordings for which the correct class was not among the $n$
classes with the highest predicted probability.
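A minimal sketch of how such a TOP-$n$ error could be computed from per-class probabilities; the data layout and variable names are assumptions for illustration, not the paper's actual implementation:

```python
import numpy as np

def top_n_error(probabilities, labels, n=3):
    """Fraction of recordings whose correct class is not among the
    n classes with the highest predicted probability.

    probabilities: array of shape (num_recordings, num_classes)
    labels:        array of shape (num_recordings,) with correct class indices
    """
    # Indices of the n most probable classes per recording.
    top_n = np.argsort(probabilities, axis=1)[:, -n:]
    hits = np.any(top_n == labels[:, None], axis=1)
    return 1.0 - hits.mean()

# Example: 2 recordings, 4 classes
probs = np.array([[0.1, 0.2, 0.6, 0.1],
                  [0.7, 0.1, 0.1, 0.1]])
print(top_n_error(probs, np.array([2, 3]), n=1))  # 0.5
```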

Various systems for mathematical symbol recognition with on-line data have been
described so far~\cite{Kosmala98,Mouchere2013}, but most of them have published
neither their source code nor their data, which makes it impossible to re-run
experiments and compare different systems. This is unfortunate as the choice of
symbols is crucial for the TOP-$n$ error. For example, the symbols $o$, $O$,
$\circ$ and $0$ are very similar, and systems which know all of those classes
will certainly have a higher TOP-$n$ error than systems which only accept one
of them.

Daniel Kirsch describes in~\cite{Kirsch} a system which uses time warping to
classify on-line handwritten symbols and claims to achieve a TOP3 error of
less than $\SI{10}{\percent}$ for a set of $\num{100}$~symbols. He also
published his data, which was collected by a crowdsourcing approach via
\url{http://detexify.kirelabs.org}, on
\url{https://github.com/kirel/detexify-data}. Those recordings, as well as
some recordings which were collected by a similar approach via
\url{http://write-math.com}, were used to train and evaluate different
classifiers. A complete description of all involved software, data,
presentations and experiments is given in~\cite{Thoma:2014}.

\section{Steps in Handwriting Recognition}
The following steps are used in all classifiers described in this paper:

\begin{enumerate}
    \item \textbf{Preprocessing}: Recorded data is never perfect. Devices have
          errors and people make mistakes while using the devices. To tackle
          these problems there are preprocessing algorithms to clean the data.
          The preprocessing algorithms can also remove unnecessary variations
          of the data that do not help classification but hide what is
          important. Slightly different sizes of the same symbol are an example
          of such a variation. Nine preprocessing algorithms that clean or
          normalize recordings are explained in \cref{sec:preprocessing}.
    \item \textbf{Data multiplication}: Learning algorithms need lots of data
          to learn internal parameters. If there is not enough data available,
          domain knowledge can be used to create new artificial data from the
          original data. In the domain of on-line handwriting recognition, data
          can be multiplied by adding rotated variants (see the sketch after
          this list).
    \item \textbf{Segmentation}: The task of formula recognition can eventually
          be reduced to the task of symbol recognition combined with symbol
          placement. Before symbol recognition can be done, the formula has
          to be segmented. As this paper is only about single-symbol
          recognition, this step will not be discussed further.
    \item \textbf{Feature computation}: A feature is high-level information
          derived from the raw data after preprocessing. Some systems like
          Detexify, which was presented in~\cite{Kirsch}, simply take the
          result of the preprocessing step, but many compute new features. This
          can have the advantage that less training data is needed, since the
          developer can use knowledge about handwriting to compute highly
          discriminative features. Various features are explained in
          \cref{sec:features}.
    \item \textbf{Feature enhancement}: Applying PCA, LDA, or feature
          standardization might change the features in ways that improve the
          performance of learning algorithms.
\end{enumerate}
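A minimal sketch of the data multiplication step mentioned above; rotating each recording by a few degrees around its center is an assumption about how the rotated variants could be generated, not the exact procedure used in the paper:

```python
import math

def rotate_recording(recording, degrees):
    """Rotate every point of a recording around the recording's center.

    recording: list of strokes; each stroke is a list of (x, y, t) tuples.
    Returns a new recording with rotated (x, y) and unchanged t.
    """
    points = [p for stroke in recording for p in stroke]
    cx = sum(p[0] for p in points) / len(points)
    cy = sum(p[1] for p in points) / len(points)
    phi = math.radians(degrees)
    rotated = []
    for stroke in recording:
        new_stroke = []
        for x, y, t in stroke:
            dx, dy = x - cx, y - cy
            new_stroke.append((cx + dx * math.cos(phi) - dy * math.sin(phi),
                               cy + dx * math.sin(phi) + dy * math.cos(phi),
                               t))
        rotated.append(new_stroke)
    return rotated

def multiply_data(recording, angles=(-3, 3)):
    """Return the original recording plus slightly rotated variants."""
    return [recording] + [rotate_recording(recording, a) for a in angles]
```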

After these steps, we are faced with a classification learning task which
consists of two parts:
\begin{enumerate}
    \item \textbf{Learning} parameters for a given classifier. This process is
          also called \textit{training}.
    \item \textbf{Classifying} new recordings, sometimes called
          \textit{evaluation}. This should not be confused with the evaluation
          of the classification performance, which is done for multiple
          topologies, preprocessing queues, and features in
          \Cref{ch:Evaluation}.
\end{enumerate}

Two fundamentally different systems for the classification of time series data
were evaluated. One uses greedy time warping, which has a very simple and fast
learning algorithm that only stores some of the seen training examples. The
other one is based on neural networks, which take longer to train, but are much
faster in recognition and also lead to better recognition results.

\section{Algorithms}
\subsection{Preprocessing}\label{sec:preprocessing}
Preprocessing in symbol recognition is done to improve the quality and
expressive power of the data. It should make follow-up tasks like segmentation
and feature extraction easier, more effective or faster. It does so by
resolving errors in the input data, reducing duplicate information and removing
irrelevant information.

The preprocessing algorithms fall into two groups: normalization and noise
reduction algorithms.

The most important normalization algorithm in single-symbol recognition is
\textit{scale-and-shift}. It scales the recording so that its bounding box fits
into a unit square. As the aspect ratio of a recording is almost never 1:1,
only one dimension will fit the unit square exactly. There are then multiple
ways to shift the recording. For this paper, the larger dimension is shifted to
fit into the $[0,1] \times [0,1]$ unit square, whereas the smaller dimension is
centered in the $[-1,1] \times [-1,1]$ square.
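A minimal sketch of such a scale-and-shift step, assuming recordings are lists of strokes of $(x, y, t)$ tuples; the centering convention is simplified compared to the description above:

```python
def scale_and_shift(recording):
    """Scale a recording so its bounding box fits into a unit square
    (keeping the aspect ratio) and shift it.

    The larger dimension is mapped to [0, 1]; the smaller dimension is
    centered around 0 (a simplification of the convention described above).
    recording: list of strokes; each stroke is a list of (x, y, t) tuples.
    """
    xs = [x for stroke in recording for x, _, _ in stroke]
    ys = [y for stroke in recording for _, y, _ in stroke]
    width, height = max(xs) - min(xs), max(ys) - min(ys)
    factor = 1.0 / max(width, height, 1e-9)  # avoid division by zero for dots

    def transform(x, y):
        x, y = (x - min(xs)) * factor, (y - min(ys)) * factor
        if width >= height:
            return x, y - (height * factor) / 2.0  # center the smaller (y) dimension
        return x - (width * factor) / 2.0, y       # center the smaller (x) dimension

    return [[(*transform(x, y), t) for x, y, t in stroke] for stroke in recording]
```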

Another normalization preprocessing algorithm is resampling. As the data points
on the pen trajectory are generated asynchronously and with different time
resolutions depending on the hardware and software used, it is desirable to
resample the recordings to have points spread equally in time for every
recording. This was done with linear interpolation of the $(x,t)$ and $(y,t)$
sequences, from which a fixed number of equally spaced samples is taken.
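A minimal sketch of this resampling step using numpy's linear interpolation; resampling each stroke on its own and the choice of \texttt{numpy.interp} are assumptions:

```python
import numpy as np

def resample_stroke(stroke, num_points=20):
    """Resample one stroke to num_points points, spaced equally in time,
    by linear interpolation of the (x, t) and (y, t) sequences.

    stroke: list of (x, y, t) tuples, sorted by t.
    """
    xs, ys, ts = (np.array(v, dtype=float) for v in zip(*stroke))
    t_new = np.linspace(ts[0], ts[-1], num_points)
    x_new = np.interp(t_new, ts, xs)
    y_new = np.interp(t_new, ts, ys)
    return list(zip(x_new, y_new, t_new))
```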

\textit{Connect strokes} is a noise reduction algorithm. It sometimes happens
that the hardware reports that the user lifted the pen although the user
certainly did not do so. This can be detected by measuring the distance between
the end of one stroke and the beginning of the next stroke. If this distance is
below a threshold, the strokes are connected.
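A minimal sketch of the connect-strokes idea; the threshold value is an assumption:

```python
import math

def connect_strokes(recording, threshold=0.05):
    """Merge consecutive strokes whose endpoints are closer than threshold.

    recording: list of strokes; each stroke is a list of (x, y, t) tuples.
    The threshold is relative to the unit-square coordinates produced by
    scale-and-shift.
    """
    if not recording:
        return recording
    connected = [list(recording[0])]
    for stroke in recording[1:]:
        x1, y1, _ = connected[-1][-1]   # end of the previous stroke
        x2, y2, _ = stroke[0]           # start of the current stroke
        if math.hypot(x2 - x1, y2 - y1) < threshold:
            connected[-1].extend(stroke)  # treat it as one pen-down movement
        else:
            connected.append(list(stroke))
    return connected
```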

Due to the limited resolution of the recording device and due to erratic
handwriting, the pen trajectory might not be smooth. One way to smooth it is to
replace every point by a weighted average of its own coordinates and the
coordinates of its neighbors. Another way to do smoothing is to reduce the
number of points with the Douglas-Peucker algorithm to the most relevant ones
and then interpolate between those points. The Douglas-Peucker stroke
simplification algorithm is usually used in cartography to simplify the shape
of roads. It works recursively to find a subset of the control points of a
stroke that is simpler and still similar to the original shape. The algorithm
adds the first and the last point, $p_1$ and $p_n$, of a stroke to the
simplified set of points $S$. Then it searches the control point $p_i$ in
between that has the maximum distance from the \gls{line} $p_1 p_n$. If this
distance is above a threshold $\varepsilon$, the point $p_i$ is added to $S$.
Then the algorithm is applied recursively to $p_1 p_i$ and $p_i p_n$.
Pseudocode of this algorithm is given on \cpageref{alg:douglas-peucker}. It is
described as \enquote{Algorithm 1} in~\cite{Visvalingam1990} with a different
notation.
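A minimal recursive sketch of the Douglas-Peucker simplification described above, operating on (x, y) points (the time component is dropped for brevity):

```python
import math

def point_line_distance(p, a, b):
    """Distance of point p from the line through a and b."""
    (px, py), (ax, ay), (bx, by) = p, a, b
    if (ax, ay) == (bx, by):
        return math.hypot(px - ax, py - ay)
    # Area of the parallelogram divided by the base length.
    return abs((bx - ax) * (ay - py) - (ax - px) * (by - ay)) / math.hypot(bx - ax, by - ay)

def douglas_peucker(points, epsilon):
    """Return a simplified subset of points that stays within epsilon
    of the original polyline."""
    if len(points) < 3:
        return list(points)
    # Find the point with the maximum distance from the line p_1 p_n.
    index, d_max = 0, 0.0
    for i in range(1, len(points) - 1):
        d = point_line_distance(points[i], points[0], points[-1])
        if d > d_max:
            index, d_max = i, d
    if d_max > epsilon:
        # Keep that point and recurse on both halves.
        left = douglas_peucker(points[:index + 1], epsilon)
        right = douglas_peucker(points[index:], epsilon)
        return left[:-1] + right
    return [points[0], points[-1]]
```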

\subsection{Features}\label{sec:features}
Features can be \textit{global}, that is, calculated for the complete recording
or for complete strokes. Other features are calculated for single points on the
pen trajectory and are called \textit{local}.

Global features are the \textit{number of strokes} in a recording, the
\textit{aspect ratio} of a recording's bounding box or the \textit{ink} used
for a recording. The ink feature is calculated by measuring the combined length
of all strokes. The re-curvature, which was introduced in~\cite{Huang06}, is
defined as
\[\text{re-curvature}(stroke) := \frac{\text{height}(stroke)}{\text{length}(stroke)}\]
and is a stroke-global feature.
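A minimal sketch of how the ink and re-curvature features could be computed; treating the stroke length as the sum of distances between consecutive points is an assumption:

```python
import math

def stroke_length(stroke):
    """Sum of the distances between consecutive points of one stroke."""
    return sum(math.hypot(x2 - x1, y2 - y1)
               for (x1, y1, _), (x2, y2, _) in zip(stroke, stroke[1:]))

def ink(recording):
    """Global feature: combined length of all strokes."""
    return sum(stroke_length(stroke) for stroke in recording)

def re_curvature(stroke):
    """Stroke-global feature: height of the stroke divided by its length."""
    ys = [y for _, y, _ in stroke]
    height = max(ys) - min(ys)
    length = stroke_length(stroke)
    return height / length if length > 0 else 0.0
```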

The most important local feature is the coordinate of the point itself.
Speed, curvature and a local small-resolution bitmap around the point, which
was introduced by Manke et al. in~\cite{Manke94}, are other local features.

\subsection{Multilayer Perceptrons}\label{sec:mlp-training}
\Glspl{MLP} are explained in detail in~\cite{Mitchell97}. They can have
different numbers of hidden layers, and the number of neurons per layer as well
as the activation functions can be varied. The learning algorithm is
parameterized by the learning rate $\eta$, the momentum $\alpha$ and the number
of epochs. The training of \glspl{MLP} can be executed in various ways, for
example with supervised layer-wise pretraining: if an \gls{MLP} with three
hidden layers and the topology $160:500:500:500:369$ should be trained, at
first an \gls{MLP} with one hidden layer ($160:500:369$) is trained. Then the
output layer is discarded, a new hidden layer and a new output layer are added,
and the network is trained again, yielding a $160:500:500:369$ \gls{MLP}. The
output layer is discarded once more, another hidden layer and a new output
layer are added, and the training is executed again.
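A minimal sketch of this supervised layer-wise pretraining scheme, written with Keras purely for illustration; the activation functions, epoch count and optimizer settings below are assumptions, not the paper's actual setup:

```python
from tensorflow import keras

def layerwise_pretrain(x, y, n_in=160, hidden=(500, 500, 500), n_out=369,
                       epochs=5, eta=0.1, momentum=0.1):
    """Train hidden layers one at a time; after each round the output layer
    is discarded and a new hidden layer plus a fresh output layer are added."""
    model = keras.Sequential()
    model.add(keras.Input(shape=(n_in,)))
    for i, width in enumerate(hidden):
        if i > 0:
            model.pop()  # discard the previous output layer
        model.add(keras.layers.Dense(width, activation="sigmoid"))
        model.add(keras.layers.Dense(n_out, activation="softmax"))
        model.compile(optimizer=keras.optimizers.SGD(learning_rate=eta,
                                                     momentum=momentum),
                      loss="sparse_categorical_crossentropy")
        model.fit(x, y, epochs=epochs, verbose=0)
    return model
```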

\section{Evaluation}\label{ch:Evaluation}
In order to evaluate the effect of different preprocessing algorithms, features
and adjustments in the \gls{MLP} training and topology, the following baseline
system was used:

Scale the recording to fit into a unit square while keeping the aspect ratio,
shift it into $[-1,1] \times [-1,1]$ as described in \cref{sec:preprocessing},
and resample it with linear interpolation to get 20~points per stroke, spaced
evenly in time. Take the first 4~strokes with 20~points per stroke and
2~coordinates per point as features, resulting in 160~features, which is equal
to the number of input neurons. If a recording has fewer than 4~strokes, the
remaining features are filled with zeroes.
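A minimal sketch of how this 160-dimensional feature vector could be assembled from a preprocessed recording; it reuses the hypothetical resample_stroke helper from the resampling sketch above:

```python
def baseline_features(recording, max_strokes=4, points_per_stroke=20):
    """Concatenate the (x, y) coordinates of the first 4 strokes
    (20 resampled points each) and pad missing strokes with zeroes."""
    features = []
    for i in range(max_strokes):
        if i < len(recording):
            stroke = resample_stroke(recording[i], points_per_stroke)
            for x, y, _ in stroke:
                features.extend([x, y])
        else:
            features.extend([0.0] * (2 * points_per_stroke))
    return features  # length: 4 * 20 * 2 = 160
```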

All experiments were evaluated with four baseline systems $B_i$, $i \in \Set{1,
2, 3, 4}$, where $i$ is the number of hidden layers, as different topologies
could have a severe influence on the effect of new features or preprocessing
steps. Each hidden layer in all evaluated systems has $500$ neurons.

Each \gls{MLP} was trained with a learning rate of $\eta = 0.1$ and a momentum
of $\alpha = 0.1$. The activation function of every neuron is

%TODO: Evaluation randomness
%TODO:

\section{Conclusion}
The aim of this bachelor's thesis was to build a recognition system that
can recognize many mathematical symbols with low error rates as well as to
evaluate which preprocessing steps and features help to improve the recognition
rate.

All recognition systems were trained and evaluated with
$\num{\totalCollectedRecordings{}}$ recordings for \totalClassesAnalyzed{}
symbols. These recordings were collected by two crowdsourcing projects
(\href{http://detexify.kirelabs.org/classify.html}{Detexify} and
\href{http://write-math.com}{write-math.com}) and created with various devices.
While some recordings were created with standard touch devices such as tablets
and smartphones, others were created with the mouse.

\Glspl{MLP} were used for the classification task. Four baseline systems with
different numbers of hidden layers were used, as the number of hidden layers
influences the capabilities and problems of \glspl{MLP}. Furthermore, an error
measure MER was defined, which takes the top three \glspl{hypothesis} of the
classifier, merges symbols such as \verb+\sum+ ($\sum$) and \verb+\Sigma+
($\Sigma$) into equivalence classes, and then calculates the error.
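A minimal sketch of how such a merged error (MER) could be computed; the equivalence-class mapping and data layout are assumptions for illustration:

```python
# Hypothetical mapping of symbols to equivalence classes; anything not listed
# is treated as its own class.
EQUIVALENCES = {r"\sum": "sum-like", r"\Sigma": "sum-like"}

def merge(symbol):
    return EQUIVALENCES.get(symbol, symbol)

def mer_error(top3_hypotheses, references):
    """Fraction of recordings whose reference is not among the top three
    hypotheses after merging equivalent symbols.

    top3_hypotheses: list of lists with the three most probable symbols.
    references:      list of the correct symbols.
    """
    errors = 0
    for hyps, ref in zip(top3_hypotheses, references):
        if merge(ref) not in {merge(h) for h in hyps}:
            errors += 1
    return errors / len(references)
```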

All baseline systems used the same preprocessing queue. The recordings were
scaled to fit into a unit square, shifted to $(0,0)$, and resampled with linear
interpolation so that every stroke had exactly 20~points which are spread
equidistantly in time. The 80~($x,y$) coordinates of the first 4~strokes were
used to get exactly $160$ input features for every recording. The baseline
system $B_2$ has a MER error of $\SI{5.67}{\percent}$.

Three variations of the scale-and-shift algorithm, wild point filtering, stroke
connect, weighted average smoothing, and Douglas-Peucker smoothing were
evaluated. The evaluation showed that the scale-and-shift algorithm is
extremely important and that the connect strokes algorithm improves the
classification. All other preprocessing algorithms either diminished the
classification performance or had less influence on it than the random
initialization of the \glspl{MLP} weights.

Adding two slightly rotated variants for each recording and hence tripling the
training set made the systems $B_3$ and $B_4$ perform much worse, but improved
the performance of the smaller systems.

The global features re-curvature, ink, stroke count and aspect ratio improved
the systems $B_1$--$B_3$, whereas the stroke center point feature made $B_2$
perform worse.

The learning rate and the momentum were evaluated. A learning rate of
$\eta=0.1$ and a momentum of $\alpha=0.9$ gave the best results. Newbob
training led to much worse recognition rates. Denoising auto-encoders were
evaluated as one way to use pretraining, but this increased the error rate
notably. However, supervised layer-wise pretraining improved the performance
decidedly.

The stroke connect algorithm was added to the preprocessing steps of the
baseline system, as well as the re-curvature feature, the ink feature, the
number of strokes and the aspect ratio. The training setup of the baseline
system was changed to supervised layer-wise pretraining and the resulting model
was trained again with a lower learning rate. This optimized recognizer
$B_{2,c}'$ had a MER error of $\SI{3.96}{\percent}$. This means that the MER
error dropped by over $\SI{30}{\percent}$ in comparison to the baseline system
$B_2$.

A MER error of $\SI{3.96}{\percent}$ makes the system usable for symbol lookup.
It could also be used as a starting point for the development of a
multiple-symbol classifier.

The aim of this bachelor's thesis was to develop a symbol recognition system
which is easy to use, fast and has high recognition rates, as well as to
evaluate ideas for single-symbol classifiers. Some of those goals were reached.
The recognition system $B_{2,c}'$ evaluates new recordings in a fraction of a
second and has acceptable recognition rates. Many variations of the algorithms
were evaluated. However, there are still many more algorithms which could be
evaluated and, at the time of this work, the best classifier $B_{2,c}'$ is not
publicly available.

\bibliographystyle{IEEEtranSA}
\bibliography{write-math-ba-paper}
\end{document}