Skip to main content


	title        = {Poor man's OCR post-correction: Unsupervised recognition of variant spelling applied to a multilingual document collection},
	abstract     = {© 2017 Copyright held by the owner/author(s). The accuracy of Optical Character Recognition (OCR) sets the limit for the success of subsequent applications used in a text analyzing pipeline. Recent models of OCR post-processing significantly improve the quality of OCR-generated text but require engineering work or resources such as human-labeled data or a dictionary to perform with such accuracy on novel datasets. In the present paper we introduce a technique for OCR post-processing that runs off-the-shelf with no resources or parameter tuning required. In essence, words which are similar in form that are also distributionally more similar than expected at random are deemed OCR-variants. As such it can be applied to any language or genre (as long as the orthography segments the language at the word-level). The algorithm is illustrated and evaluated using a multilingual document collection and a benchmark English dataset.},
	booktitle    = {DATeCH2017, Proceedings of the 2nd International Conference on Digital Access to Textual Cultural Heritage, Göttingen, Germany — June 01--02, 2017},
	author       = {Hammarström, Harald and Virk, Shafqat and Forsberg, Markus},
	year         = {2017},
	publisher    = {Association for Computing Machinery (ACM)},
	address      = {New York},
	ISBN         = {978-1-4503-5265-9},