% Conference papers on OCR for Swedish newspapers and historical text.
% The file is UTF-8 encoded. Percent signs inside field values are escaped
% because they would otherwise start a TeX comment once the field is typeset.
% Acronyms and proper nouns in titles are brace-protected against recasing.

% Two-OCR engine method developed at the National Library of Sweden (KB).
@inproceedings{dannells-etal-2021-engine-305700,
  author    = {Dannélls, Dana and Björk, Lars and Dirdal, Ove and Johansson, Torsten},
  title     = {A Two-{OCR} Engine Method for Digitized {Swedish} Newspapers},
  booktitle = {Selected Papers from the CLARIN Annual Conference 2020, Linköping Electronic Conference Proceedings 180},
  year      = {2021},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7929-609-4},
  abstract  = {In this paper we present a two-OCR engine method that was developed at Kungliga biblioteket (KB), the National Library of Sweden, for improving the correctness of the OCR for mass digitization of Swedish newspapers. To evaluate the method a reference material spanning the years 1818–2018 was prepared and manually transcribed. A quantitative evaluation was then performed against the material. In this first evaluation we experimented with word lists for different time periods. The results show that even though there was no significant overall improvement of the OCR results, some combinations of word lists are successful for certain periods and should therefore be explored further.},
}

% Deep hybrid CNN–LSTM models evaluated with the Calamari OCR engine.
@inproceedings{skelbye-dannells-2021-processing-306957,
  author    = {Skelbye, Molly and Dannélls, Dana},
  title     = {{OCR} Processing of {Swedish} Historical Newspapers Using Deep Hybrid {CNN–LSTM} Networks},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
  editor    = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
  year      = {2021},
  publisher = {INCOMA},
  address   = {Shoumen, Bulgaria},
  isbn      = {978-954-452-072-4},
  abstract  = {Deep CNN–LSTM hybrid neural networks have proven to improve the accuracy of Optical Character Recognition (OCR) models for different languages. In this paper we examine to what extent these networks improve the OCR accuracy rates on Swedish historical newspapers. By experimenting with the open source OCR engine Calamari, we are able to show that mixed deep CNN–LSTM hybrid models outperform previous models on the task of character recognition of Swedish historical newspapers spanning 1818–1848. We achieved an average character accuracy rate (CAR) of 97.43\% which is a new state-of-the-art result on 19th century Swedish newspaper text. Our data, code and models are released under CC BY licence.},
}

% Post-OCR error detection using character n-gram statistics as the only feature.
@inproceedings{dannells-virk-2021-supervised-310123,
  author    = {Dannélls, Dana and Virk, Shafqat},
  title     = {A Supervised Machine Learning Approach for Post-{OCR} Error Detection for Historical Text},
  booktitle = {Linköping Electronic Press Workshop and Conference Collection. Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25–27 November, 2020},
  year      = {2021},
  publisher = {Linköping Electronic Press},
  address   = {Linköping},
  abstract  = {Training machine learning models with high accuracy requires careful feature engineering, which involves finding the best feature combinations and extracting their values from the data. The task becomes extremely laborious for specific problems such as post Optical Character Recognition (OCR) error detection because of the diversity of errors in the data. In this paper we present a machine learning approach which exploits character n-gram statistics as the only feature for the OCR error detection task. Our method achieves a significant improvement over the baseline reaching state-of-the-art results of 91\% and 89\% F1 measure on English and Swedish datasets respectively. We report various experiments to select the appropriate machine learning algorithm and to compare our approach to previously reported traditional approaches.},
}