BibTeX

@comment{SLTC 2020 conference paper on OCR error detection.
Acronyms are braced so that sentence-casing styles keep them upper-case.
The author name is stored as UTF-8; a classic 8-bit BibTeX toolchain needs a
LaTeX accent escape for the e-acute instead.}
@inproceedings{dannells-virk-2020-error-297714,
	author       = {Dannélls, Dana and Virk, Shafqat},
	title        = {{OCR} Error Detection on Historical Text Using Uni-Feature and Multi-Feature Based Machine Learning Models},
	booktitle    = {Swedish Language Technology Conference ({SLTC}), 25--27 November 2020, University of Gothenburg},
	year         = {2020},
	abstract     = {Detecting errors that are caused by Optical Character Recognition (OCR) systems is a challenging task that has received much attention over the years. Recent work has explored machine learning methods using hand-crafted feature engineering, which, in addition to the difficulty in identifying the best feature combinations, is often very time and resources expensive. This raises the question: Do we always need many features to achieve better results? This is an open-ended question and its answer might depend on the task at hand. For OCR error detection, we experimented and found that interestingly a uni-feature based system conquered multi-feature based systems on a Swedish data set achieving state-of-the-art results, and performed equally well on an English dataset. We also experimented to find which machine learning algorithm is more suitable for the task at hand by comparing the performance of five well-known machine learning algorithms, namely Logistic regression, Decision Trees, Bernoulli Naive Bayes, Naive Bayes, and Support Vector Machines.},
}
Sidansvarig: sb-webb