Skip to main content
Språkbanken Text is a part of Språkbanken.

BibTeX

@inProceedings{dannells-etal-2020-evaluation-296165,
	title        = {Evaluation of a Two-OCR Engine Method: First Results on Digitized Swedish Newspapers Spanning over nearly 200 Years},
	abstract     = {In this paper we present a two-OCR engine method that was developed at Kungliga biblioteket (KB), the National Library of Sweden, for improving the correctness of the OCR for mass digitization of Swedish newspapers. We report the first quantitative evaluation results on a material spanning over nearly 200 years. In this first evaluation phase we experimented with word lists for different time periods. Although there was no significant overall improvement of the OCR results, the evaluation shows that some combinations of word lists are successful for certain periods and should therefore be explored further. },
	booktitle    = { CLARIN Annual Conference 2020, (Virtual Event), 5-7 October, 2020. Book of Abstracts},
	author       = {Dannélls, Dana and Björk, Lars and Dirdal, Ove and Johansson, Torsten},
	year         = {2020},
}

@inProceedings{dannells-simon-2020-supervised-289944,
	title        = {Supervised OCR Post-Correction of Historical Swedish Texts: What Role Does the OCR System Play?},
	abstract     = {Current approaches for post-correction of OCR errors offer solutions that are tailored to a specific OCR system. This can be problematic if the post-correction method was trained on a specific OCR
system but have to be applied on the result of another system. Whereas OCR post-correction of historical text has received much attention lately, the question of what role does the OCR system play for the post-correction method has not been addressed. In this study we explore a dataset of
400 documents of historical Swedish text which has been OCR processed by three state-of-the-art OCR systems: Abbyy Finereader, Tesseract and Ocropus. We examine the OCR results of each system and present a supervised machine learning post-correction method that tries to approach
the challenges exhibited by each system. We study the performance of our method by using three evaluation tools: PrimA, Språkbanken evaluation tool and Frontiers Toolkit. Based on the evaluation analysis we discuss the impact each of the OCR systems has on the results of the post-
correction method. We report on quantitative and qualitative results showing varying degrees of OCR post-processing complexity that are important to consider when developing an OCR post-correction method.},
	booktitle    = {Proceedings of the Digital Humanities in the Nordic Countries, 5th Conference, Riga, Latvia, October 21-23, 2020},
	editor       = {Sanita Reinsone and Inguna Skadiņa and Anda Baklāne and Jānis Daugavietis},
	author       = {Dannélls, Dana and Simon, Persson},
	year         = {2020},
	publisher    = {CEUR-WS},
}