% Bibliography entries, publication year 2021.
% Conventions used in the entries below:
%   - personal names are stored as "Last, First" and joined with "and";
%   - volume editors live in the editor field, never inside booktitle;
%   - page ranges use a double hyphen and ISBNs are hyphenated;
%   - proper nouns and acronyms in titles are brace-protected;
%   - LaTeX specials in abstracts (percent sign, ampersand) are backslash-escaped.

@incollection{borin-etal-2021-introduction-310200,
  author    = {Borin, Lars and Dannélls, Dana and Friberg Heppin, Karin},
  title     = {Introduction: {Swedish} {FrameNet++}},
  booktitle = {The {Swedish} {FrameNet++}. Harmonization, integration, method development and practical language technology applications},
  editor    = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  year      = {2021},
  publisher = {John Benjamins Publishing Company},
  address   = {Amsterdam / Philadelphia},
  pages     = {3--36},
  isbn      = {978-90-272-5848-9},
  abstract  = {The Swedish FrameNet++ was designed to be several things. As a digital artifact, it is an integrated panchronic lexical macroresource, primarily for Swedish, but including several other languages, intended as a basic infrastructural component in Swedish language technology research and for developing natural language processing applications. As an activity, it is a long-term R\&D initiative, initially aimed at bringing about this macroresource, and now at maintaining and extending it, at promoting its use in language technology research and application development, as well as ensuring that the results of this research and development in their turn are incorporated in the macroresource. As a product of research, it reflects both computational and linguistic approaches to lexicology, lexical semantics, and lexical typology.},
}

@book{dannells-etal-2021-swedish-310036,
  editor    = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  title     = {The {Swedish} {FrameNet++}. Harmonization, integration, method development and practical language technology applications},
  year      = {2021},
  publisher = {John Benjamins Publishing Company},
  address   = {Amsterdam / Philadelphia},
  isbn      = {978-90-272-0990-0},
  abstract  = {Large computational lexicons are central NLP resources. Swedish FrameNet++ aims to be a versatile full-scale lexical resource for NLP containing many kinds of linguistic information. Although focused on Swedish, this ongoing effort, which includes building a new Swedish framenet and recycling existing lexicons, has offered valuable insights into general aspects of lexical-resource building for NLP, which are discussed in this book: computational and linguistic problems of lexical semantics and lexical typology, the nature of lexical items (words and multiword expressions), achieving interoperability among heterogeneous lexical content, NLP methods for extending and interlinking existing lexicons, and deploying the new resource in practical NLP applications. This book is targeted at everyone with an interest in lexicography, computational lexicography, lexical typology, lexical semantics, linguistics, computational linguistics and related fields. We believe it should be of particular interest to those who are or have been involved in language resource creation, development and evaluation.},
}

@inproceedings{virk-etal-2021-data-306964,
  author    = {Virk, Shafqat and Dannélls, Dana and Borin, Lars and Forsberg, Markus},
  title     = {A Data-Driven Semi-Automatic Framenet Development Methodology},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
  editor    = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
  year      = {2021},
  publisher = {INCOMA},
  address   = {Shoumen, Bulgaria},
  isbn      = {978-954-452-072-4},
  abstract  = {FrameNet is a lexical semantic resource based on the linguistic theory of frame semantics. A number of framenet development strategies have been reported previously and all of them involve exploration of corpora and a fair amount of manual work. Despite previous efforts, there does not exist a well-thought-out automatic/semi-automatic methodology for frame construction. In this paper we propose a data-driven methodology for identification and semi-automatic construction of frames. As a proof of concept, we report on our initial attempts to build a wider-scale framenet for the legal domain (LawFN) using the proposed methodology. The constructed frames are stored in a lexical database and together with the annotated example sentences they have been made available through a web interface.},
}

@inproceedings{skelbye-dannells-2021-processing-306957,
  author    = {Skelbye, Molly and Dannélls, Dana},
  title     = {{OCR} Processing of {Swedish} Historical Newspapers Using Deep Hybrid {CNN–LSTM} Networks},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
  editor    = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
  year      = {2021},
  publisher = {INCOMA},
  address   = {Shoumen, Bulgaria},
  isbn      = {978-954-452-072-4},
  abstract  = {Deep CNN–LSTM hybrid neural networks have proven to improve the accuracy of Optical Character Recognition (OCR) models for different languages. In this paper we examine to what extent these networks improve the OCR accuracy rates on Swedish historical newspapers. By experimenting with the open source OCR engine Calamari, we are able to show that mixed deep CNN–LSTM hybrid models outperform previous models on the task of character recognition of Swedish historical newspapers spanning 1818–1848. We achieved an average character accuracy rate (CAR) of 97.43\% which is a new state–of–the–art result on 19th century Swedish newspaper text. Our data, code and models are released under CC BY licence.},
}

@inproceedings{virk-etal-2021-novel-306962,
  author    = {Virk, Shafqat and Dannélls, Dana and Muhammad, Azam Sheikh},
  title     = {A Novel Machine Learning Based Approach for {Post-OCR} Error Detection},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
  editor    = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
  year      = {2021},
  publisher = {INCOMA},
  address   = {Shoumen, Bulgaria},
  isbn      = {978-954-452-072-4},
  abstract  = {Post processing is the most conventional approach for correcting errors that are caused by Optical Character Recognition (OCR) systems. Two steps are usually taken to correct OCR errors: detection and corrections. For the first task, supervised machine learning methods have shown state-of-the-art performances. Previously proposed approaches have focused most prominently on combining lexical, contextual and statistical features for detecting errors. In this study, we report a novel system to error detection which is based merely on the n-gram counts of a candidate token. In addition to being simple and computationally less expensive, our proposed system beats previous systems reported in the ICDAR2019 competition on OCR-error detection with notable margins. We achieved state-of-the-art F1-scores for eight out of the ten involved European languages. The maximum improvement is for Spanish which improved from 0.69 to 0.90, and the minimum for Polish from 0.82 to 0.84.},
}

@incollection{dannells-etal-2021-swedish-310041,
  author    = {Dannélls, Dana and Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Toporowska Gronostaj, Maria},
  title     = {{Swedish} {FrameNet}},
  booktitle = {The {Swedish} {FrameNet++}. Harmonization, integration, method development and practical language technology applications},
  editor    = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  year      = {2021},
  publisher = {John Benjamins Publishing Company},
  address   = {Amsterdam / Philadelphia},
  pages     = {37--66},
  isbn      = {978-90-272-5848-9},
  abstract  = {This chapter describes the development of Swedish FrameNet. A new framenet project often follows one of two methodological approaches: (1) extension, through translation of a different-language – often English – framenet into the target language, and (2) merging, where the resource is built from scratch in the target language. Both approaches have their pros and cons, which have been extensively discussed in the literature. Swedish FrameNet is mainly developed through the extension approach, although balanced with the merging approach. Drawing on the two approaches simultaneously, we describe how integrated language resources and tools have been exploited to create and develop Swedish FrameNet: how it was constructed, what it contains, and the basic assumptions underlying the annotation of its contents.},
}

@incollection{dannells-gruzitis-2021-computational-310047,
  author    = {Dannélls, Dana and Grūzītis, Normunds},
  title     = {Computational representation of {FrameNet} for multilingual natural language generation},
  booktitle = {The {Swedish} {FrameNet++}. Harmonization, integration, method development and practical language technology applications},
  editor    = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  year      = {2021},
  publisher = {John Benjamins Publishing Company},
  address   = {Amsterdam / Philadelphia},
  pages     = {281--301},
  isbn      = {978-90-272-5848-9},
  abstract  = {Multilingual natural language generation, the process of producing written or spoken utterances in parallel languages from either structured or unstructured representations requires large amounts of syntactic and semantic information to generate an expression that is tailored to the target audience. This information is offered by FrameNet-like resources, which have been developed for a number of languages. In this chapter, we present a computational FrameNet grammar resource for multilingual natural language generation. We compare between English and Swedish framenets to illustrate how these can be unified under a shared computational representation using Grammatical Framework. We demonstrate how the grammar was exploited in two practical multilingual natural language generation applications to facilitate tourist communication and empower museum users with coherent artwork descriptions.},
}

@inproceedings{dannells-virk-2021-supervised-310123,
  author    = {Dannélls, Dana and Virk, Shafqat},
  title     = {A Supervised Machine Learning Approach for {Post-OCR} Error Detection for Historical Text},
  booktitle = {Linköping Electronic Press Workshop and Conference Collection. Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25-27 November, 2020},
  year      = {2021},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  abstract  = {Training machine learning models with high accuracy requires careful feature engineering, which involves finding the best feature combinations and extracting their values from the data. The task becomes extremely laborious for specific problems such as post Optical Character Recognition (OCR) error detection because of the diversity of errors in the data. In this paper we present a machine learning approach which exploits character n-gram statistics as the only feature for the OCR error detection task. Our method achieves a significant improvement over the baseline reaching state-of-the-art results of 91\% and 89\% F1 measure on English and Swedish datasets respectively. We report various experiments to select the appropriate machine learning algorithm and to compare our approach to previously reported traditional approaches.},
}

@inproceedings{hansson-etal-2021-swedish-305126,
  author    = {Hansson, Saga and Mavromatakis, Konstantinos and Adesam, Yvonne and Bouma, Gerlof and Dannélls, Dana},
  title     = {The {Swedish} {Winogender} Dataset},
  booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31 - June 2, 2021, Reykjavik, Iceland (online)},
  year      = {2021},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7929-614-8},
  abstract  = {We introduce the SweWinogender test set, a diagnostic dataset to measure gender bias in coreference resolution. It is modelled after the English Winogender benchmark, and is released with reference statistics on the distribution of men and women between occupations and the association between gender and occupation in modern corpus material. The paper discusses the design and creation of the dataset, and presents a small investigation of the supplementary statistics.},
}

@inproceedings{dannells-etal-2021-engine-305700,
  author    = {Dannélls, Dana and Björk, Lars and Dirdal, Ove and Johansson, Torsten},
  title     = {A {Two-OCR} Engine Method for Digitized {Swedish} Newspapers},
  booktitle = {Selected Papers from the CLARIN Annual Conference 2020, Linköping Electronic Conference Proceedings 180},
  year      = {2021},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7929-609-4},
  abstract  = {In this paper we present a two-OCR engine method that was developed at Kungliga biblioteket (KB), the National Library of Sweden, for improving the correctness of the OCR for mass digitization of Swedish newspapers. To evaluate the method a reference material spanning the years 1818–2018 was prepared and manually transcribed. A quantitative evaluation was then performed against the material. In this first evaluation we experimented with word lists for different time periods. The results show that even though there was no significant overall improvement of the OCR results, some combinations of word lists are successful for certain periods and should therefore be explored further.},
}