@comment{
  Conference paper giving an overview of the KubHist newspaper corpus.
  Volume editors are recorded in the editor field; the title braces
  protect KubHist from style-driven lowercasing.
  NOTE(review): this entry and the following one appear to cite the same
  proceedings volume but record different publisher/address values
  and differently worded booktitles -- confirm against the published volume.
}
@inproceedings{adesam-etal-2019-exploring-279948,
  author    = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina},
  editor    = {Navarretta, Costanza and Agirrezabal, Manex and Maegaard, Bente},
  title     = {Exploring the Quality of the Digital Historical Newspaper Archive {KubHist}},
  booktitle = {Proceedings of the 4th Conference of The Association Digital Humanities in the Nordic Countries ({DHN}), Copenhagen, Denmark, March 5--8, 2019},
  year      = {2019},
  publisher = {CEUR Workshop Proceedings},
  address   = {Aachen},
  abstract  = {The KubHist Corpus is a massive corpus of Swedish historical newspapers, digitized by the Royal Swedish library, and available through the Språkbanken corpus infrastructure Korp. This paper contains a first overview of the KubHist corpus, exploring some of the difficulties with the data, such as OCR errors and spelling variation, and discussing possible paths for improving the quality and the searchability.},
}

@comment{
  Conference paper on an adjustable OCR module for newspaper digitisation.
  The volume editors were previously embedded in booktitle as free text
  and are now stored in the editor field; trailing periods were removed
  from title and booktitle so styles do not emit doubled punctuation.
}
@inproceedings{dannells-etal-2019-evaluation-278761,
  author    = {Dannélls, Dana and Johansson, Torsten and Björk, Lars},
  editor    = {Navarretta, Costanza and Agirrezabal, Manex and Maegaard, Bente},
  title     = {Evaluation and refinement of an enhanced {OCR} process for mass digitisation},
  booktitle = {Proceedings of the Digital Humanities in the Nordic Countries 4th Conference ({DHN} 2019), Copenhagen, Denmark, March 5--8, 2019},
  year      = {2019},
  publisher = {University of Copenhagen, Faculty of Humanities},
  address   = {Copenhagen},
  abstract  = {Great expectations are placed on the capacity of heritage institutions to make their collections available in digital format. Datadriven research is becoming a key concept within the humanities and social sciences. Kungliga biblioteket’s (National Library of Sweden, KB) collections of digitised newspaper can thus be regarded as unique cultural data sets with information that rarely is conveyed in other media types. The digital format makes it possible to explore these resources in ways not feasible while in printed form. As texts are no longer only read but also subjected to computer based analysis the demand on the correct rendering of the original text increases. OCR technologies for converting images to machine-readable text play a fundamental part in making these resources available, but the effectiveness vary with the type of document being processed. This is evident in relation to the digitisation of newspapers where factors relating to their production, layout and paper quality often impair the OCR production. In order to improve the machine readable text, especially in relation to the digitisation of newspapers, KB initiated the development of an OCR-module where key parameters can be adjusted according to the characteristics of the material being processed. The purpose of this paper is to present the project goals and methods.},
}