Skip to main content


	title        = {Evaluation and refinement of an enhanced {OCR} process for mass digitisation},
	abstract     = {Great expectations are placed on the capacity of heritage institutions to make their collections available in digital format. Data-driven research is becoming a key concept within the humanities and social sciences. Kungliga biblioteket’s (National Library of Sweden, KB) collections of digitised newspaper can thus be regarded as unique cultural data sets with information that rarely is conveyed in other media types. The digital format makes it possible to explore these resources in ways not feasible while in printed form. As texts are no longer only read but also subjected to computer based analysis the demand on the correct rendering of the original text increases. OCR technologies for converting images to machine-readable text play a fundamental part in making these resources available, but the effectiveness vary with the type of document being processed. This is evident in relation to the digitisation of newspapers where factors relating to their production, layout and paper quality often impair the OCR production. In order to improve the machine readable text, especially in relation to the digitisation of newspapers, KB initiated the development of an OCR-module where key parameters can be adjusted according to the characteristics of the material being processed. The purpose of this paper is to present the project goals and methods.},
	booktitle    = {Proceedings of the Digital Humanities in the Nordic Countries 4th Conference (DHN 2019), Copenhagen, Denmark, March 5-8, 2019. Edited by: Costanza Navarretta, Manex Agirrezabal, Bente Maegaard},
	author       = {Dannélls, Dana and Johansson, Torsten and Björk, Lars},
	year         = {2019},
	publisher    = {University of Copenhagen, Faculty of Humanities},
	address      = {Copenhagen},