mirror of
https://github.com/MartinThoma/LaTeX-examples.git
synced 2025-04-26 06:48:04 +02:00
documents/write-math-ba-paper: Initial commit
This commit is contained in:
parent 6ffe05846e
commit cea22c65c0
6 changed files with 2142 additions and 0 deletions
12
documents/write-math-ba-paper/Makefile
Normal file
@@ -0,0 +1,12 @@
DOKUMENT = write-math-ba-paper
make:
	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # aux-files for makeindex / makeglossaries
	makeglossaries $(DOKUMENT)
	bibtex $(DOKUMENT)
	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include index
	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include symbol table
	pdflatex -shell-escape $(DOKUMENT).tex -interaction=batchmode -output-format=pdf # include symbol table
	make clean

clean:
	rm -rf $(TARGET) *.class *.html *.log *.aux *.out *.thm *.idx *.toc *.ind *.ilg figures/torus.tex *.glg *.glo *.gls *.ist *.xdy *.fdb_latexmk *.bak *.blg *.bbl *.glsdefs *.acn *.acr *.alg *.nls *.nlo *.bak *.pyg *.lot *.lof
21
documents/write-math-ba-paper/README.md
Normal file
@@ -0,0 +1,21 @@
## TODO

### Preprocessing
* Scale-and-shift
* linear interpolation
* connect strokes
* Douglas-Peucker

### Features
* coordinates
* ink
* stroke count
* aspect ratio

### Training
* learning rate
* momentum
* Supervised layer-wise pretraining


* Check abstract!
73
documents/write-math-ba-paper/glossary.tex
Normal file
@@ -0,0 +1,73 @@
%!TEX root = thesis.tex
% Term definitions
\newacronym{ANN}{ANN}{artificial neural network}
\newacronym{CSR}{CSR}{cursive script recognition}
\newacronym{DTW}{DTW}{dynamic time warping}
\newacronym{GTW}{GTW}{greedy time warping}
\newacronym{HMM}{HMM}{hidden Markov model}
\newacronym{HWR}{HWR}{handwriting recognition}
\newacronym{HWRT}{HWRT}{handwriting recognition toolkit}
\newacronym{MLP}{MLP}{multilayer perceptron}
\newacronym{MSE}{MSE}{mean squared error}
\newacronym{OOV}{OOV}{out of vocabulary}
\newacronym{TDNN}{TDNN}{time delay neural network}
\newacronym{PCA}{PCA}{principal component analysis}
\newacronym{LDA}{LDA}{linear discriminant analysis}
\newacronym{CROHME}{CROHME}{Competition on Recognition of Online Handwritten Mathematical Expressions}
\newacronym{GMM}{GMM}{Gaussian mixture model}
\newacronym{SVM}{SVM}{support vector machine}
\newacronym{PyPI}{PyPI}{Python Package Index}
\newacronym{CFM}{CFM}{classification figure of merit}
\newacronym{CE}{CE}{cross entropy}
\newacronym{GPU}{GPU}{graphics processing unit}
\newacronym{CUDA}{CUDA}{Compute Unified Device Architecture}
\newacronym{SLP}{SLP}{supervised layer-wise pretraining}

% Term definitions
\newglossaryentry{Detexify}{name={Detexify}, description={A system used for
on-line handwritten symbol recognition which is described in \cite{Kirsch}}}

\newglossaryentry{epoch}{name={epoch}, description={During iterative training of a neural network, an \textit{epoch} is a single pass through the entire training set, followed by testing of the verification set~\cite{Concise12}}}

\newglossaryentry{hypothesis}{
    name={hypothesis},
    description={The recognition result which a classifier returns is called a hypothesis. In other words, it is the \enquote{guess} of a classifier},
    plural=hypotheses
}

\newglossaryentry{reference}{
    name={reference},
    description={Labeled data is used to evaluate classifiers. Those labels are called references}
}

\newglossaryentry{YAML}{name={YAML}, description={YAML is a human-readable data format that can be used for configuration files}}
\newglossaryentry{MER}{name={MER}, description={An error measure which combines symbols into equivalence classes. It was introduced on \cpageref{merged-error-introduction}}}

\newglossaryentry{JSON}{name={JSON}, description={JSON, short for JavaScript Object Notation, is a language-independent data format that can be used to transmit data between a server and a client in web applications}}

\newglossaryentry{hyperparamter}{name={hyperparameter}, description={A
\textit{hyperparameter} is a parameter of a neural net that cannot be learned,
but has to be chosen}, symbol={\ensuremath{\theta}}}

\newglossaryentry{learning rate}{name={learning rate}, description={A factor $0 \leq \eta \in \mdr$ that affects how fast new weights are learned. $\eta=0$ means that no new data is learned}, symbol={\ensuremath{\eta}}} % Andrew Ng: \alpha

\newglossaryentry{learning rate decay}{name={learning rate decay}, description={The learning rate decay $0 < \alpha \leq 1$ is used to adjust the learning rate. After each epoch the learning rate $\eta$ is updated to $\eta \gets \eta \times \alpha$}, symbol={\ensuremath{\alpha}}}

\newglossaryentry{preactivation}{name={preactivation}, description={The preactivation of a neuron is the weighted sum of its input, before the activation function is applied}}

\newglossaryentry{stroke}{name={stroke}, description={The path the pen took from
the point where the pen was put down to the point where the pen was first lifted}}

\newglossaryentry{line}{name={line}, description={Geometric object that is infinitely long
and is defined by two points}}

\newglossaryentry{line segment}{name={line segment}, description={Geometric object that has finite length
and is defined by two points}}

\newglossaryentry{symbol}{name={symbol}, description={An atomic semantic entity. A more detailed description can be found in \cref{sec:what-is-a-symbol}}}

\newglossaryentry{weight}{name={weight}, description={A
\textit{weight} is a parameter of a neural net that can be learned}, symbol={\ensuremath{\weight}}}

\newglossaryentry{control point}{name={control point}, description={A
\textit{control point} is a point recorded by the input device}}
12
documents/write-math-ba-paper/variables.tex
Normal file
@@ -0,0 +1,12 @@
\newcommand{\totalCollectedRecordings}{166898} % ACTUALITY
\newcommand{\detexifyCollectedRecordings}{153423}
\newcommand{\trainingsetsize}{134804}
\newcommand{\validtionsetsize}{15161}
\newcommand{\testsetsize}{17012}
\newcommand{\totalClasses}{1111}
\newcommand{\totalClassesAnalyzed}{369}
\newcommand{\totalClassesAboveFifty}{680}
\newcommand{\totalClassesNotAnalyzedBelowFifty}{431}
\newcommand{\detexifyPercentage}{$\SI{91.93}{\percent}$}
\newcommand{\recordingsWithDots}{$\SI{2.77}{\percent}$} % excluding i,j, ...
\newcommand{\recordingsWithDotsSizechange}{$\SI{0.85}{\percent}$} % excluding i,j, ...
1711
documents/write-math-ba-paper/write-math-ba-paper.bib
Normal file
File diff suppressed because it is too large
313
documents/write-math-ba-paper/write-math-ba-paper.tex
Normal file
@@ -0,0 +1,313 @@
\documentclass[9pt,technote]{IEEEtran}
\usepackage{amssymb, amsmath} % needed for math
\usepackage{hyperref}         % links in the text
\usepackage{parskip}
\usepackage{csquotes}
\usepackage{braket}
\usepackage[noadjust]{cite}
\usepackage[nameinlink,noabbrev]{cleveref} % has to be after hyperref, ntheorem, amsthm
\usepackage[binary-units]{siunitx}
\sisetup{per-mode=fraction,binary-units=true}
\DeclareSIUnit\pixel{px}
\usepackage{glossaries}
\loadglsentries[main]{glossary}
\makeglossaries

\title{On-line Recognition of Handwritten Mathematical Symbols}
\author{Martin Thoma}

\hypersetup{
    pdfauthor   = {Martin Thoma},
    pdfkeywords = {Mathematics,Symbols,recognition},
    pdftitle    = {On-line Recognition of Handwritten Mathematical Symbols}
}
\include{variables}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Begin document                                                    %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}
\maketitle
\begin{abstract}
Writing mathematical formulas with \LaTeX{} is easy as soon as one is used to
commands like \verb+\alpha+ and \verb+\propto+. However, for people who have
never used \LaTeX{} or who don't know the English name of a command, it can
be difficult to find the right one. Hence the automatic recognition of
handwritten mathematical symbols is desirable. This paper presents a system
which uses the pen trajectory to classify handwritten symbols. Five
preprocessing steps, one data multiplication algorithm, five features and five
variants of multilayer perceptron training were evaluated using $\num{166898}$
recordings which were collected with two crowdsourcing projects. The evaluation
results of these 21~experiments were used to create an optimized recognizer
which has a TOP1 error of less than $\SI{17.5}{\percent}$ and a TOP3 error of
$\SI{4.0}{\percent}$. This is an improvement of $\SI{18.5}{\percent}$ for the
TOP1 error and $\SI{29.7}{\percent}$ for the TOP3 error compared to the
baseline system.
\end{abstract}

\section{Introduction}
On-line recognition makes use of the pen trajectory. This means the data is
given as groups of sequences of tuples $(x, y, t) \in \mathbb{R}^3$, where
each group represents a stroke, $(x, y)$ is the position of the pen on a canvas
and $t$ is the time. One handwritten symbol in the described format is called
a \textit{recording}. Recordings can be classified by making use of this data.
One classification approach assigns a probability to each class given the data.
The classifier can be evaluated using recordings which were classified by
humans and were not used to train the classifier. The set of those recordings
is called the \textit{test set}. The TOP-$n$ error is then defined as the
fraction of the recordings for which the correct class was not among the $n$
classes with the highest predicted probability.
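A minimal sketch of how such a TOP-$n$ error could be computed from per-class probabilities; the data layout and variable names are assumptions for illustration, not the paper's actual implementation:

```python
import numpy as np

def top_n_error(probabilities, labels, n=3):
    """Fraction of recordings whose correct class is not among the
    n classes with the highest predicted probability.

    probabilities: array of shape (num_recordings, num_classes)
    labels:        array of shape (num_recordings,) with correct class indices
    """
    # Indices of the n most probable classes per recording.
    top_n = np.argsort(probabilities, axis=1)[:, -n:]
    hits = np.any(top_n == labels[:, None], axis=1)
    return 1.0 - hits.mean()

# Example: 2 recordings, 4 classes
probs = np.array([[0.1, 0.2, 0.6, 0.1],
                  [0.7, 0.1, 0.1, 0.1]])
print(top_n_error(probs, np.array([2, 3]), n=1))  # 0.5
```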

Various systems for mathematical symbol recognition with on-line data have been
described so far~\cite{Kosmala98,Mouchere2013}, but most of them have published
neither their source code nor their data, which makes it impossible to re-run
experiments and compare different systems. This is unfortunate as the choice of
symbols is crucial for the TOP-$n$ error. For example, the symbols $o$, $O$,
$\circ$ and $0$ are very similar, and systems which know all of those classes
will certainly have a higher TOP-$n$ error than systems which only accept one
of them.

Daniel Kirsch describes in~\cite{Kirsch} a system which uses time warping to
classify on-line handwritten symbols and claims to achieve a TOP3 error of
less than $\SI{10}{\percent}$ for a set of $\num{100}$~symbols. He also
published his data, which was collected by a crowdsourcing approach via
\url{http://detexify.kirelabs.org}, on
\url{https://github.com/kirel/detexify-data}. Those recordings, as well as
some recordings which were collected by a similar approach via
\url{http://write-math.com}, were used to train and evaluate different
classifiers. A complete description of all involved software, data,
presentations and experiments is given in~\cite{Thoma:2014}.

\section{Steps in Handwriting Recognition}
The following steps are used in all classifiers described in this paper:

\begin{enumerate}
    \item \textbf{Preprocessing}: Recorded data is never perfect. Devices have
          errors and people make mistakes while using the devices. To tackle
          these problems there are preprocessing algorithms to clean the data.
          The preprocessing algorithms can also remove unnecessary variations
          of the data that do not help classification but hide what is
          important. Slightly different sizes of the same symbol are an example
          of such a variation. Nine preprocessing algorithms that clean or
          normalize recordings are explained in \cref{sec:preprocessing}.
    \item \textbf{Data multiplication}: Learning algorithms need lots of data
          to learn internal parameters. If there is not enough data available,
          domain knowledge can be used to create new artificial data from the
          original data. In the domain of on-line handwriting recognition, data
          can be multiplied by adding rotated variants (see the sketch after
          this list).
    \item \textbf{Segmentation}: The task of formula recognition can eventually
          be reduced to the task of symbol recognition combined with symbol
          placement. Before symbol recognition can be done, the formula has
          to be segmented. As this paper is only about single-symbol
          recognition, this step will not be discussed further.
    \item \textbf{Feature computation}: A feature is high-level information
          derived from the raw data after preprocessing. Some systems like
          Detexify, which was presented in~\cite{Kirsch}, simply take the
          result of the preprocessing step, but many compute new features. This
          can have the advantage that less training data is needed, since the
          developer can use knowledge about handwriting to compute highly
          discriminative features. Various features are explained in
          \cref{sec:features}.
    \item \textbf{Feature enhancement}: Applying PCA, LDA, or feature
          standardization might change the features in ways that improve the
          performance of learning algorithms.
\end{enumerate}
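A minimal sketch of the data multiplication step mentioned above; rotating each recording by a few degrees around its center is an assumption about how the rotated variants could be generated, not the exact procedure used in the paper:

```python
import math

def rotate_recording(recording, degrees):
    """Rotate every point of a recording around the recording's center.

    recording: list of strokes; each stroke is a list of (x, y, t) tuples.
    Returns a new recording with rotated (x, y) and unchanged t.
    """
    points = [p for stroke in recording for p in stroke]
    cx = sum(p[0] for p in points) / len(points)
    cy = sum(p[1] for p in points) / len(points)
    phi = math.radians(degrees)
    rotated = []
    for stroke in recording:
        new_stroke = []
        for x, y, t in stroke:
            dx, dy = x - cx, y - cy
            new_stroke.append((cx + dx * math.cos(phi) - dy * math.sin(phi),
                               cy + dx * math.sin(phi) + dy * math.cos(phi),
                               t))
        rotated.append(new_stroke)
    return rotated

def multiply_data(recording, angles=(-3, 3)):
    """Return the original recording plus slightly rotated variants."""
    return [recording] + [rotate_recording(recording, a) for a in angles]
```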

After these steps, we are faced with a classification learning task which
consists of two parts:
\begin{enumerate}
    \item \textbf{Learning} parameters for a given classifier. This process is
          also called \textit{training}.
    \item \textbf{Classifying} new recordings, sometimes called
          \textit{evaluation}. This should not be confused with the evaluation
          of the classification performance, which is done for multiple
          topologies, preprocessing queues, and features in
          \Cref{ch:Evaluation}.
\end{enumerate}

Two fundamentally different systems for the classification of time series data
were evaluated. One uses greedy time warping, which has a very simple and fast
learning algorithm that only stores some of the seen training examples. The
other one is based on neural networks, which take longer to train, but are much
faster in recognition and also lead to better recognition results.

\section{Algorithms}
\subsection{Preprocessing}\label{sec:preprocessing}
Preprocessing in symbol recognition is done to improve the quality and
expressive power of the data. It should make follow-up tasks like segmentation
and feature extraction easier, more effective or faster. It does so by
resolving errors in the input data, reducing duplicate information and removing
irrelevant information.

The preprocessing algorithms fall into two groups: normalization and noise
reduction algorithms.

The most important normalization algorithm in single-symbol recognition is
\textit{scale-and-shift}. It scales the recording so that its bounding box fits
into a unit square. As the aspect ratio of a recording is almost never 1:1,
only one dimension will fit the unit square exactly. There are then multiple
ways to shift the recording. For this paper, the larger dimension is shifted to
fit into the $[0,1] \times [0,1]$ unit square, whereas the smaller dimension is
centered in the $[-1,1] \times [-1,1]$ square.
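A minimal sketch of such a scale-and-shift step, assuming recordings are lists of strokes of $(x, y, t)$ tuples; the centering convention is simplified compared to the description above:

```python
def scale_and_shift(recording):
    """Scale a recording so its bounding box fits into a unit square
    (keeping the aspect ratio) and shift it.

    The larger dimension is mapped to [0, 1]; the smaller dimension is
    centered around 0 (a simplification of the convention described above).
    recording: list of strokes; each stroke is a list of (x, y, t) tuples.
    """
    xs = [x for stroke in recording for x, _, _ in stroke]
    ys = [y for stroke in recording for _, y, _ in stroke]
    width, height = max(xs) - min(xs), max(ys) - min(ys)
    factor = 1.0 / max(width, height, 1e-9)  # avoid division by zero for dots

    def transform(x, y):
        x, y = (x - min(xs)) * factor, (y - min(ys)) * factor
        if width >= height:
            return x, y - (height * factor) / 2.0  # center the smaller (y) dimension
        return x - (width * factor) / 2.0, y       # center the smaller (x) dimension

    return [[(*transform(x, y), t) for x, y, t in stroke] for stroke in recording]
```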

Another normalization preprocessing algorithm is resampling. As the data points
on the pen trajectory are generated asynchronously and with different time
resolutions depending on the hardware and software used, it is desirable to
resample the recordings to have points spread equally in time for every
recording. This was done with linear interpolation of the $(x,t)$ and $(y,t)$
sequences, from which a fixed number of equally spaced samples is taken.
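A minimal sketch of this resampling step using numpy's linear interpolation; resampling each stroke on its own and the choice of \texttt{numpy.interp} are assumptions:

```python
import numpy as np

def resample_stroke(stroke, num_points=20):
    """Resample one stroke to num_points points, spaced equally in time,
    by linear interpolation of the (x, t) and (y, t) sequences.

    stroke: list of (x, y, t) tuples, sorted by t.
    """
    xs, ys, ts = (np.array(v, dtype=float) for v in zip(*stroke))
    t_new = np.linspace(ts[0], ts[-1], num_points)
    x_new = np.interp(t_new, ts, xs)
    y_new = np.interp(t_new, ts, ys)
    return list(zip(x_new, y_new, t_new))
```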

\textit{Connect strokes} is a noise reduction algorithm. It sometimes happens
that the hardware reports that the user lifted the pen although the user
certainly did not do so. This can be detected by measuring the distance between
the end of one stroke and the beginning of the next stroke. If this distance is
below a threshold, the strokes are connected.
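A minimal sketch of the connect-strokes idea; the threshold value is an assumption:

```python
import math

def connect_strokes(recording, threshold=0.05):
    """Merge consecutive strokes whose endpoints are closer than threshold.

    recording: list of strokes; each stroke is a list of (x, y, t) tuples.
    The threshold is relative to the unit-square coordinates produced by
    scale-and-shift.
    """
    if not recording:
        return recording
    connected = [list(recording[0])]
    for stroke in recording[1:]:
        x1, y1, _ = connected[-1][-1]   # end of the previous stroke
        x2, y2, _ = stroke[0]           # start of the current stroke
        if math.hypot(x2 - x1, y2 - y1) < threshold:
            connected[-1].extend(stroke)  # treat it as one pen-down movement
        else:
            connected.append(list(stroke))
    return connected
```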

Due to the limited resolution of the recording device and due to erratic
handwriting, the pen trajectory might not be smooth. One way to smooth it is to
replace every point by a weighted average of its own coordinates and the
coordinates of its neighbors. Another way to do smoothing is to reduce the
number of points with the Douglas-Peucker algorithm to the most relevant ones
and then interpolate between those points. The Douglas-Peucker stroke
simplification algorithm is usually used in cartography to simplify the shape
of roads. It works recursively to find a subset of the control points of a
stroke that is simpler and still similar to the original shape. The algorithm
adds the first and the last point, $p_1$ and $p_n$, of a stroke to the
simplified set of points $S$. Then it searches the control point $p_i$ in
between that has the maximum distance from the \gls{line} $p_1 p_n$. If this
distance is above a threshold $\varepsilon$, the point $p_i$ is added to $S$.
Then the algorithm is applied recursively to $p_1 p_i$ and $p_i p_n$.
Pseudocode of this algorithm is given on \cpageref{alg:douglas-peucker}. It is
described as \enquote{Algorithm 1} in~\cite{Visvalingam1990} with a different
notation.
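A minimal recursive sketch of the Douglas-Peucker simplification described above, operating on (x, y) points (the time component is dropped for brevity):

```python
import math

def point_line_distance(p, a, b):
    """Distance of point p from the line through a and b."""
    (px, py), (ax, ay), (bx, by) = p, a, b
    if (ax, ay) == (bx, by):
        return math.hypot(px - ax, py - ay)
    # Area of the parallelogram divided by the base length.
    return abs((bx - ax) * (ay - py) - (ax - px) * (by - ay)) / math.hypot(bx - ax, by - ay)

def douglas_peucker(points, epsilon):
    """Return a simplified subset of points that stays within epsilon
    of the original polyline."""
    if len(points) < 3:
        return list(points)
    # Find the point with the maximum distance from the line p_1 p_n.
    index, d_max = 0, 0.0
    for i in range(1, len(points) - 1):
        d = point_line_distance(points[i], points[0], points[-1])
        if d > d_max:
            index, d_max = i, d
    if d_max > epsilon:
        # Keep that point and recurse on both halves.
        left = douglas_peucker(points[:index + 1], epsilon)
        right = douglas_peucker(points[index:], epsilon)
        return left[:-1] + right
    return [points[0], points[-1]]
```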

\subsection{Features}\label{sec:features}
Features can be \textit{global}, that is, calculated for the complete recording
or for complete strokes. Other features are calculated for single points on the
pen trajectory and are called \textit{local}.

Global features are the \textit{number of strokes} in a recording, the
\textit{aspect ratio} of a recording's bounding box or the \textit{ink} used
for a recording. The ink feature is calculated by measuring the combined length
of all strokes. The re-curvature, which was introduced in~\cite{Huang06}, is
defined as
\[\text{re-curvature}(stroke) := \frac{\text{height}(stroke)}{\text{length}(stroke)}\]
and is a stroke-global feature.
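A minimal sketch of how the ink and re-curvature features could be computed; treating the stroke length as the sum of distances between consecutive points is an assumption:

```python
import math

def stroke_length(stroke):
    """Sum of the distances between consecutive points of one stroke."""
    return sum(math.hypot(x2 - x1, y2 - y1)
               for (x1, y1, _), (x2, y2, _) in zip(stroke, stroke[1:]))

def ink(recording):
    """Global feature: combined length of all strokes."""
    return sum(stroke_length(stroke) for stroke in recording)

def re_curvature(stroke):
    """Stroke-global feature: height of the stroke divided by its length."""
    ys = [y for _, y, _ in stroke]
    height = max(ys) - min(ys)
    length = stroke_length(stroke)
    return height / length if length > 0 else 0.0
```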

The most important local feature is the coordinate of the point itself.
Speed, curvature and a local small-resolution bitmap around the point, which
was introduced by Manke et al. in~\cite{Manke94}, are other local features.

\subsection{Multilayer Perceptrons}\label{sec:mlp-training}
\Glspl{MLP} are explained in detail in~\cite{Mitchell97}. They can have
different numbers of hidden layers, and the number of neurons per layer as well
as the activation functions can be varied. The learning algorithm is
parameterized by the learning rate $\eta$, the momentum $\alpha$ and the number
of epochs. The training of \glspl{MLP} can be executed in various ways, for
example with supervised layer-wise pretraining: if an \gls{MLP} with three
hidden layers and the topology $160:500:500:500:369$ should be trained, at
first an \gls{MLP} with one hidden layer ($160:500:369$) is trained. Then the
output layer is discarded, a new hidden layer and a new output layer are added,
and the network is trained again, yielding a $160:500:500:369$ \gls{MLP}. The
output layer is discarded once more, another hidden layer and a new output
layer are added, and the training is executed again.
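A minimal sketch of this supervised layer-wise pretraining scheme, written with Keras purely for illustration; the activation functions, epoch count and optimizer settings below are assumptions, not the paper's actual setup:

```python
from tensorflow import keras

def layerwise_pretrain(x, y, n_in=160, hidden=(500, 500, 500), n_out=369,
                       epochs=5, eta=0.1, momentum=0.1):
    """Train hidden layers one at a time; after each round the output layer
    is discarded and a new hidden layer plus a fresh output layer are added."""
    model = keras.Sequential()
    model.add(keras.Input(shape=(n_in,)))
    for i, width in enumerate(hidden):
        if i > 0:
            model.pop()  # discard the previous output layer
        model.add(keras.layers.Dense(width, activation="sigmoid"))
        model.add(keras.layers.Dense(n_out, activation="softmax"))
        model.compile(optimizer=keras.optimizers.SGD(learning_rate=eta,
                                                     momentum=momentum),
                      loss="sparse_categorical_crossentropy")
        model.fit(x, y, epochs=epochs, verbose=0)
    return model
```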

\section{Evaluation}\label{ch:Evaluation}
In order to evaluate the effect of different preprocessing algorithms, features
and adjustments in the \gls{MLP} training and topology, the following baseline
system was used:

Scale the recording to fit into a unit square while keeping the aspect ratio,
shift it into $[-1,1] \times [-1,1]$ as described in \cref{sec:preprocessing},
and resample it with linear interpolation to get 20~points per stroke, spaced
evenly in time. Take the first 4~strokes with 20~points per stroke and
2~coordinates per point as features, resulting in 160~features, which is equal
to the number of input neurons. If a recording has fewer than 4~strokes, the
remaining features are filled with zeroes.
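A minimal sketch of how this 160-dimensional feature vector could be assembled from a preprocessed recording; it reuses the hypothetical resample_stroke helper from the resampling sketch above:

```python
def baseline_features(recording, max_strokes=4, points_per_stroke=20):
    """Concatenate the (x, y) coordinates of the first 4 strokes
    (20 resampled points each) and pad missing strokes with zeroes."""
    features = []
    for i in range(max_strokes):
        if i < len(recording):
            stroke = resample_stroke(recording[i], points_per_stroke)
            for x, y, _ in stroke:
                features.extend([x, y])
        else:
            features.extend([0.0] * (2 * points_per_stroke))
    return features  # length: 4 * 20 * 2 = 160
```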

All experiments were evaluated with four baseline systems $B_i$, $i \in \Set{1,
2, 3, 4}$, where $i$ is the number of hidden layers, as different topologies
could have a severe influence on the effect of new features or preprocessing
steps. Each hidden layer in all evaluated systems has $500$ neurons.

Each \gls{MLP} was trained with a learning rate of $\eta = 0.1$ and a momentum
of $\alpha = 0.1$. The activation function of every neuron is

%TODO: Evaluation randomness
%TODO:

\section{Conclusion}
The aim of this bachelor's thesis was to build a recognition system that
can recognize many mathematical symbols with low error rates as well as to
evaluate which preprocessing steps and features help to improve the recognition
rate.

All recognition systems were trained and evaluated with
$\num{\totalCollectedRecordings{}}$ recordings for \totalClassesAnalyzed{}
symbols. These recordings were collected by two crowdsourcing projects
(\href{http://detexify.kirelabs.org/classify.html}{Detexify} and
\href{http://write-math.com}{write-math.com}) and created with various devices.
While some recordings were created with standard touch devices such as tablets
and smartphones, others were created with the mouse.

\Glspl{MLP} were used for the classification task. Four baseline systems with
different numbers of hidden layers were used, as the number of hidden layers
influences the capabilities and problems of \glspl{MLP}. Furthermore, an error
measure MER was defined, which takes the top three \glspl{hypothesis} of the
classifier, merges symbols such as \verb+\sum+ ($\sum$) and \verb+\Sigma+
($\Sigma$) into equivalence classes, and then calculates the error.
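A minimal sketch of how such a merged error (MER) could be computed; the equivalence-class mapping and data layout are assumptions for illustration:

```python
# Hypothetical mapping of symbols to equivalence classes; anything not listed
# is treated as its own class.
EQUIVALENCES = {r"\sum": "sum-like", r"\Sigma": "sum-like"}

def merge(symbol):
    return EQUIVALENCES.get(symbol, symbol)

def mer_error(top3_hypotheses, references):
    """Fraction of recordings whose reference is not among the top three
    hypotheses after merging equivalent symbols.

    top3_hypotheses: list of lists with the three most probable symbols.
    references:      list of the correct symbols.
    """
    errors = 0
    for hyps, ref in zip(top3_hypotheses, references):
        if merge(ref) not in {merge(h) for h in hyps}:
            errors += 1
    return errors / len(references)
```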

All baseline systems used the same preprocessing queue. The recordings were
scaled to fit into a unit square, shifted to $(0,0)$, and resampled with linear
interpolation so that every stroke had exactly 20~points which are spread
equidistantly in time. The 80~($x,y$) coordinates of the first 4~strokes were
used to get exactly $160$ input features for every recording. The baseline
system $B_2$ has a MER error of $\SI{5.67}{\percent}$.

Three variations of the scale-and-shift algorithm, wild point filtering, stroke
connect, weighted average smoothing, and Douglas-Peucker smoothing were
evaluated. The evaluation showed that the scale-and-shift algorithm is
extremely important and that the connect strokes algorithm improves the
classification. All other preprocessing algorithms either diminished the
classification performance or had less influence on it than the random
initialization of the \glspl{MLP} weights.

Adding two slightly rotated variants for each recording and hence tripling the
training set made the systems $B_3$ and $B_4$ perform much worse, but improved
the performance of the smaller systems.

The global features re-curvature, ink, stroke count and aspect ratio improved
the systems $B_1$--$B_3$, whereas the stroke center point feature made $B_2$
perform worse.

The learning rate and the momentum were evaluated. A learning rate of
$\eta=0.1$ and a momentum of $\alpha=0.9$ gave the best results. Newbob
training led to much worse recognition rates. Denoising auto-encoders were
evaluated as one way to use pretraining, but this increased the error rate
notably. However, supervised layer-wise pretraining improved the performance
decidedly.

The stroke connect algorithm was added to the preprocessing steps of the
baseline system, as well as the re-curvature feature, the ink feature, the
number of strokes and the aspect ratio. The training setup of the baseline
system was changed to supervised layer-wise pretraining and the resulting model
was trained again with a lower learning rate. This optimized recognizer
$B_{2,c}'$ had a MER error of $\SI{3.96}{\percent}$. This means that the MER
error dropped by over $\SI{30}{\percent}$ in comparison to the baseline system
$B_2$.

A MER error of $\SI{3.96}{\percent}$ makes the system usable for symbol lookup.
It could also be used as a starting point for the development of a
multiple-symbol classifier.

The aim of this bachelor's thesis was to develop a symbol recognition system
which is easy to use, fast and has high recognition rates, as well as to
evaluate ideas for single-symbol classifiers. Some of those goals were reached.
The recognition system $B_{2,c}'$ evaluates new recordings in a fraction of a
second and has acceptable recognition rates. Many variations of the algorithms
were evaluated. However, there are still many more algorithms which could be
evaluated and, at the time of this work, the best classifier $B_{2,c}'$ is not
publicly available.

\bibliographystyle{IEEEtranSA}
\bibliography{write-math-ba-paper}
\end{document}