@comment{
  The five entries below are 2020 conference papers with Dana Dannélls as
  first or second author. The file is UTF-8 encoded. Names are stored as
  Last, First. Acronyms and proper nouns in titles are brace-protected so
  that sentence-casing styles keep their capitalisation. Abstracts are kept
  on a single line each.
}

@inproceedings{dannells-broden-2020-building-297061,
  author    = {Dannélls, Dana and Brodén, Daniel},
  title     = {Building a Language Technology Infrastructure for Digital Humanities: Challenges, Opportunities and Progress},
  editor    = {Krauwer, Steven and Fišer, Darja},
  booktitle = {Proceedings of the Twin Talks 2 and 3 Workshops at DHN 2020 and DH 2020 Ottawa Canada and Riga Latvia, July 23 and October 20, 2020},
  year      = {2020},
  publisher = {CEUR-WS.org},
  abstract  = {Språkbanken Text, a research unit at the University of Gothenburg, forms part of the National Language Bank of Sweden and is the main coordinating node of Swe-Clarin, the Swedish national CLARIN node. During the past years, Språkbanken Text has been actively engaged in a number of humanities and social sciences related research projects. This engagement has primarily concerned the development of new resources, methods and tools to accurately process large amounts of digitized material, in addition to interfaces for visualizing the materials, making them easily accessible for further analysis. The activities within Swe-Clarin have been essential for the progress and the success of this work. In this paper we present what was required from Språkbanken Text in order to meet the expectations of researchers from the humanities and social sciences. We discuss some of the challenges this work involves and describe the opportunities this field brings with it and how these opportunities could help to progress the work of Språkbanken Text toward building a language technology infrastructure that supports interdisciplinary research.},
}

@comment{
  The citation key of the next entry is kept unchanged so that existing
  citations keep resolving. Its "simon" component does not match the
  co-author's surname, which is Persson.
}

@inproceedings{dannells-simon-2020-supervised-289944,
  author    = {Dannélls, Dana and Persson, Simon},
  title     = {Supervised {OCR} Post-Correction of Historical {Swedish} Texts: What Role Does the {OCR} System Play?},
  editor    = {Reinsone, Sanita and Skadiņa, Inguna and Baklāne, Anda and Daugavietis, Jānis},
  booktitle = {Proceedings of the Digital Humanities in the Nordic Countries, 5th Conference, Riga, Latvia, October 21-23, 2020},
  year      = {2020},
  publisher = {CEUR-WS.org},
  abstract  = {Current approaches for post-correction of OCR errors offer solutions that are tailored to a specific OCR system. This can be problematic if the post-correction method was trained on a specific OCR system but have to be applied on the result of another system. Whereas OCR post-correction of historical text has received much attention lately, the question of what role does the OCR system play for the post-correction method has not been addressed. In this study we explore a dataset of 400 documents of historical Swedish text which has been OCR processed by three state-of-the-art OCR systems: Abbyy Finereader, Tesseract and Ocropus. We examine the OCR results of each system and present a supervised machine learning post-correction method that tries to approach the challenges exhibited by each system. We study the performance of our method by using three evaluation tools: PrimA, Språkbanken evaluation tool and Frontiers Toolkit. Based on the evaluation analysis we discuss the impact each of the OCR systems has on the results of the post-correction method. We report on quantitative and qualitative results showing varying degrees of OCR post-processing complexity that are important to consider when developing an OCR post-correction method.},
}

@inproceedings{dannells-virk-2020-error-297714,
  author    = {Dannélls, Dana and Virk, Shafqat},
  title     = {{OCR} Error Detection on Historical Text Using Uni-Feature and Multi-Feature Based Machine Learning Models},
  booktitle = {Swedish Language Technology Conference (SLTC), 25-27 November 2020, University of Gothenburg},
  year      = {2020},
  abstract  = {Detecting errors that are caused by Optical Character Recognition (OCR) systems is a challenging task that has received much attention over the years. Recent work has explored machine learning methods using hand-crafted feature engineering, which, in addition to the difficulty in identifying the best feature combinations, is often very time and resources expensive. This raises the question: Do we always need many features to achieve better results? This is an open-ended question and its answer might depend on the task at hand. For OCR error detection, we experimented and found that interestingly a uni-feature based system conquered multi-feature based systems on a Swedish data set achieving state-of-the-art results, and performed equally well on an English dataset. We also experimented to find which machine learning algorithm is more suitable for the task at hand by comparing the performance of five well-known machine learning algorithms, namely Logistic regression, Decision Trees, Bernoulli Naive Bayes, Naive Bayes, and Support Vector Machines.},
}

@inproceedings{dannells-etal-2020-evaluation-296165,
  author    = {Dannélls, Dana and Björk, Lars and Dirdal, Ove and Johansson, Torsten},
  title     = {Evaluation of a Two-{OCR} Engine Method: First Results on Digitized {Swedish} Newspapers Spanning over nearly 200 Years},
  booktitle = {CLARIN Annual Conference 2020, (Virtual Event), 5-7 October, 2020. Book of Abstracts},
  year      = {2020},
  abstract  = {In this paper we present a two-OCR engine method that was developed at Kungliga biblioteket (KB), the National Library of Sweden, for improving the correctness of the OCR for mass digitization of Swedish newspapers. We report the first quantitative evaluation results on a material spanning over nearly 200 years. In this first evaluation phase we experimented with word lists for different time periods. Although there was no significant overall improvement of the OCR results, the evaluation shows that some combinations of word lists are successful for certain periods and should therefore be explored further.},
}

@inproceedings{waldispuhl-etal-2020-material-293332,
  author    = {Waldispühl, Michelle and Dannélls, Dana and Borin, Lars},
  title     = {Material Philology Meets Digital Onomastic Lexicography: The {NordiCon} Database of Medieval {Nordic} Personal Names in {Continental} Sources},
  editor    = {Calzolari, Nicoletta and others},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference, Marseille, 11–16 May 2020},
  year      = {2020},
  publisher = {European Language Resources Association},
  address   = {Marseille},
  isbn      = {979-10-95546-34-4},
  abstract  = {We present NordiCon, a database containing medieval Nordic personal names attested in Continental sources. The database combines formally interpreted and richly interlinked onomastic data with digitized versions of the medieval manuscripts from which the data originate and information on the tokens' context. The structure of NordiCon is inspired by other online historical given name dictionaries. It takes up challenges reported on in previous works, such as how to cover material properties of a name token and how to define lemmatization principles, and elaborates on possible solutions. The lemmatization principles for NordiCon are further developed in order to facilitate the connection to other name dictionaries and corpuses, and the integration of the database into Språkbanken Text, an infrastructure containing modern and historical written data.},
}