% Introductory chapter of the volume that also has its own book entry in this
% file (same ISBN); publisher and booktitle casing follow that book entry.
@incollection{borin-etal-2024-introduction-343467,
  author    = {Borin, Lars and Hammarlin, Mia Marie and Kokkinakis, Dimitrios and Miegel, Fredrik},
  title     = {Introduction: Vaccine Hesitancy and the COVID-19 Crisis in the Nordic Countries},
  booktitle = {Vaccine Hesitancy in the Nordic Countries: Trust and Distrust During the COVID-19 Pandemic},
  publisher = {Taylor and Francis},
  year      = {2024},
  pages     = {1--17},
  ISBN      = {9781040011614},
  abstract  = {Already in 2019, WHO singled out the increase of vaccine hesitancy as one of the ten most important and urgent threats to global health. Little did people know then of the heated vaccine discussions waiting around the corner, spurred by the COVID-19 pandemic that set some countries on something reminiscent of a war footing. The mass vaccinations against the coronavirus in the early 2020s were seen by many as a blessing that promised a return to normalcy after lockdowns and other social restrictions. But some citizens actively resisted vaccination, claiming that the vaccines were not safe and questioning the public authorities’ trustworthiness. At the same time, the Nordic region is regarded as a world leader when it comes to societal trust. This tension between the high-trust Nordic societies and the distrust in the COVID-19 vaccines among a minority is in focus in this volume. It also gives insights into the political tensions between these neighbouring nations, and the public discourses taking place in the region during intense phases of the pandemic. The book explores three interrelated research themes: Nordic societal trust under stress; COVID-19 in Nordic public discourses; and the growing chorus on the margin.},
}

@inproceedings{borin-holmer-2024-tradita-333774,
  title = {Tradita innovare, innovata tradere. 
The Gothenburg approach to computational lexicography},
  author    = {Borin, Lars and Holmer, Louise},
  booktitle = {Proceedings of the Huminfra Conference, 10-11 January, 2024, Gothenburg, Sweden},
  editor    = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna},
  series    = {Linköping Electronic Conference Proceedings},
  number    = {205},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  year      = {2024},
  pages     = {41--50},
  ISBN      = {978-91-8075-512-2},
  abstract  = {Swedish computational lexicography has a long history at the University of Gothenburg, both in its primary role as a central aspect of the scientific study of vocabulary and also as an infrastructural component for conducting research based on language data. Starting in the 1960s, the Språkdata research group pioneered corpus-supported lexicography for Swedish, forming the basis for successive editions of the two main descriptive dictionaries of contemporary Swedish, SAOL and SO. Language technological lexical resources for Swedish have been developed by the research unit/research infrastructure Språkbanken Text since the turn of the millennium, most recently in the framework of the Swedish FrameNet++ initiative. After two decades of separation, these two largely mutually independently developed strands of computational lexicography have now joined forces under the umbrella of Språkbanken’s lexical research infrastructure to advance the field technically, methodologically, and scientifically.},
}
% Editors moved out of booktitle into the editor field. Series number and page
% range for the entry above come from the reference list quoted in the abstract
% of landqvist-holmer-2024-finlandismer-342756 in this file.

@article{cousse-etal-2024-auxiliaries-343090,
  title = {Auxiliaries in Old Dutch. A diachronic parallel corpus exploration},
  abstract = {This study explores the use of auxiliaries in the oldest text available for Old Dutch, the Wachtendonck Psalter, dating from the 10th century. Our aim is to understand why there are so few different auxiliaries in this text in comparison to other texts in Old Dutch. 
We tackle this question by taking a historical comparative perspective, using methodological insights and techniques from corpus-based contrastive linguistics and typology. More specifically, we build a diachronic parallel corpus of psalm translations and compare the contexts in which auxiliaries and inflectional alternatives are used in these parallel texts by means of multidimensional scaling. Our historical comparative method results in five proximity maps which allow us to explore and compare the inventory of verb constructions of the Wachtendonck Psalter both retrospectively, with its source text in Latin, and prospectively, with later translations in Dutch. Our analysis examines the role of grammaticalization as well as the specific nature of the text as an interlinear translation as possible motivations for the presence and absence of auxiliaries in the Wachtendonck Psalter.}, journal = {Journal of Historical Linguistics}, author = {Coussé, Evie and Bouma, Gerlof and van der Sijs, Nicoline}, year = {2024}, } @inProceedings{ahlfeldt-matsson-2024-digarv-334595, title = {The DIGARV Platform: A collaborative platform for working with cultural heritage data and research data}, abstract = {This article covers an easy-to-use research tool for collaborative work. The tool has been adapted for structured data and high-resolution images within four research projects at GRIDH. The platform is especially designed for working with temporal and spatial data. Furthermore, the platform gives researchers access to a relational database system through input forms and access to external cultural heritage data including high-resolution images. This way the platform also aims to utilize external data published as Linked Open Data (LOD) and, at the same time, prepare its own research data for publishing as LOD. 
Because of the spatial and temporal nature of the data, it is visualized in time and space through maps and timelines to give overview and context during the data management phase.},
  author    = {Åhlfeldt, Johan and Matsson, Arild},
  booktitle = {Proceedings of the Huminfra Conference, 10-11 January, 2024, Gothenburg, Sweden},
  editor    = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna},
  series    = {Linköping Electronic Conference Proceedings},
  number    = {205},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  year      = {2024},
  ISBN      = {978-91-8075-512-2},
}
% Series name and number for the entry above come from the reference list quoted
% in the abstract of landqvist-holmer-2024-finlandismer-342756 in this file.

@incollection{tiedemann-etal-2024-multiword-343530,
  title = {Multiword expressions in Swedish as a second language: Taxonomy, annotation, and initial results},
  abstract = {This chapter introduces part of the Swedish L2 profiles, a new resource for Swedish as a second language. Multiword expressions (MWEs) in this resource are based on knowledge-based automatic annotation of MWEs, which we show works quite well for Swedish. In contrast, manual annotation of the compositionality of each MWE proved difficult, probably due to different interpretations of "compositionality" by the two annotators. We show that experts and non-experts can rank MWEs very similarly according to relative receptive difficulty, with particularly high agreement for the easiest items. A qualitative comparison of the proficiency levels associated with the MWEs based on coursebook occurrences and the results from crowdsourcing and direct ranking indicate that MWEs which appear in few books of the same level are more likely to be difficult to associate with an appropriate level based on coursebook corpus data. Furthermore, results show that compositionality and/or transparency might influence the relative ranking. 
Finally, there is a clear increase in MWE lemmas at higher proficiency levels at the group level, and at the highest level receptive and productive data include the same percentage of MWEs.},
  author    = {Lindström Tiedemann, Therese and Alfter, David and Ali Mohammed, Yousuf and Piipponen, Daniela and Silén, Beatrice and Volodina, Elena},
  booktitle = {Multiword Expressions in Lexical Resources: Linguistic, Lexicographic, and Computational Perspectives},
  year      = {2024},
  pages     = {309--348},
  ISBN      = {9783961104703},
}
% First author above uses the compound surname, as in
% volodina-etal-2024-proceedings-336386; the citation key is kept unchanged.

@inproceedings{morger-2024-swediagnostics-341148,
  author    = {Morger, Felix},
  title     = {SweDiagnostics: A Diagnostics Natural Language Inference Dataset for Swedish},
  booktitle = {17th Workshop on Building and Using Comparable Corpora, BUCC 2024 at LREC-COLING 2024 - Proceedings},
  year      = {2024},
  ISBN      = {9782493814319},
  abstract  = {This paper presents SweDiagnostics, a natural language inference dataset for Swedish based on the GLUE Diagnostic dataset. It is the largest, manually corrected NLI dataset in Swedish to date and can be used to evaluate models on NLI in Swedish as well as estimate English-Swedish language transfer capabilities. We present the dataset, the methodology used for translation, compare existing implementations and discuss limitations of the dataset, in particular those related to translationese.},
}

@article{landqvist-2024-finlandssvenska-335636,
  title = {Finlandssvenska översättare i Svenskt översättarlexikon},
  abstract = {Svenskt översättarlexikon innehåller artiklar om sverigesvenska och finlandssvenska översättare. Vilka översättare i lexikonet kan sägas vara finlandssvenskar? Och finns det några finlandssvenska översättare som inte ingår i lexikonet – men som borde göra det? 
}, journal = {Språkbruk}, author = {Landqvist, Hans}, year = {2024}, volume = {2024}, number = {2024-03-07}, } @inProceedings{humlesjo-etal-2024-queerlit-334589, title = {Queerlit – a bibliography of Swedish fiction with LGBTQI topics}, abstract = {This paper summarizes the project Queerlit: Metadata and Searchability for LGBTQ+ Literary Heritage 2020-2023 and discusses some challenges in the development of this resource. The Queerlit project consist of four parts: 1. Creating a bibliography of Swedish fiction with LGBTQI themes 2. Creating a Swedish thesaurus (QLIT), adapted from the of the linked open data thesaurus Homosaurus 3. Assigning all material in the bibliography with subject headings from QLIT. 4. A web user interface for searching the material All four parts are integrated with the Swedish union catalog, Libris, making the results of the project available for all under a CC0 license. QLIT is the first external thesaurus integrated in the linked open data framework used in the technical platform of Libris, XL. The bibliography spans from rune stones from the 7th century to recently published fiction. When applying subject headings for the material both general aspects of the work and specific LGBTQI topics are described, making this the most comprehensive retrospective indexing project of Swedish literature to date. 
The underlying knowledge organization is made a prominent method of interacting with the search interface, which is empirically designed around the needs of various user groups.},
  author    = {Humlesjö, Siska and Bergenmar, Jenny and Matsson, Arild},
  booktitle = {Proceedings of the Huminfra Conference, 10-11 January, 2024, Gothenburg, Sweden},
  editor    = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna},
  series    = {Linköping Electronic Conference Proceedings},
  number    = {205},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  year      = {2024},
  ISBN      = {978-91-8075-512-2},
}
% Editors moved out of booktitle into the editor field; booktitle now matches
% ahlfeldt-matsson-2024-digarv-334595 (same proceedings, same ISBN).

@article{skoldberg-wenner-2024-varfor-341783,
  author   = {Sköldberg, Emma and Wenner, Lena},
  title    = {Varför Paris men inte Prag? Om namn i SAOL 14},
  journal  = {Nordic Journal of Socio-Onomastics},
  year     = {2024},
  volume   = {4},
  number   = {2},
  pages    = {131--166},
  abstract = {This article reports on a study of the inclusion of names in the latest edition of The Swedish Academy Glossary (SAOL 14). We begin by presenting the principles for including names in Swedish monolingual dictionaries in general. We then discuss the names included in SAOL 14, based on those from previous editions, emails from dictionary users regarding the names, and the results of an online survey on the topic. Finally, we address the question of how the editors could approach this subset of headwords in the next edition of the glossary},
}

@article{skoldberg-landqvist-2024-sorry-343321,
  title = {Sorry, shit and wow: a case study of the handling of interjections in three Nordic monolingual dictionaries},
  abstract = {This paper discusses a qualitative and, to some extent, comparative metalexicographical case study on interjections, with an English origin, in three Nordic monolingual dictionaries. 
In short, the study answers the following research questions: (1) How are three well-established interjections handled in The Contemporary Dictionary of the Swedish Academy (SO) compared to corresponding entries in a Danish and a Norwegian dictionary and how can the SO descriptions be developed?; (2) How can three less established interjections be analyzed and described in an updated version of the SO? The point of departure for answering the research questions is information types that are common in dictionary entries. Furthermore, the use of interjections in corpora and text collections for Swedish are crucial for the investigation. The study shows that interjections as a category imply several challenges for lexicographers. Finally, some suggestions are presented concerning the way in which the description of interjections in the SO may be developed},
  author  = {Sköldberg, Emma and Landqvist, Hans},
  journal = {Lexicographica. International Annual for Lexicography / Revue Internationale de Lexicographie / Internationales Jahrbuch für Lexikographie},
  year    = {2024},
  volume  = {40},
  number  = {1},
  pages   = {29--57},
}
% Trailing blank removed from the journal name above so that styles do not
% print a stray space before the punctuation that follows it.

@inproceedings{holmer-etal-2024-time-341975,
  title = {Time to Say Goodbye Revisited – On the Exclusion of Headwords from the Swedish Academy Glossary (SAOL)},
  abstract = {In the revision process of dictionaries, adding new headwords or new senses to already existing headwords is what typically receives the most attention. In this article, we bring into focus the intriguing dilemma of exclusion of headwords from the Swedish Academy Glossary (SAOL), which is still published in print versions. In the e-dictionary-era, removing headwords may seem questionable. SAOL is, however, a contemporary dictionary which aims to reflect present-day Swedish. In order to keep the lemma list up to date, new headwords are added and obsolete words are removed. 
The editors of SAOL have practised lemma exclusion in connection with the revisions of new editions for almost 150 years. In this paper, we present SAOL and argue that lemma exclusion is crucial to SAOL’s aim and target group. We also present our most recent corpus material, methods and tools included in this process.},
  author    = {Holmer, Louise and Lillieström, Ann and Sköldberg, Emma and Uppström, Jonatan},
  booktitle = {Despot, Kristina Štrkalj, Ostroški, Ana \& Ivana, Anić (eds.). Lexicography and Semantics, Proceedings of the XXI EURALEX International Congress, 8–12 October 2024, Cavtat, Croatia},
  publisher = {Institut za hrvatski jezik},
  address   = {Zagreb},
  year      = {2024},
  ISBN      = {978-953-7967-77-2},
}
% The ampersand in the booktitle above is escaped for LaTeX and the ISBN now
% uses plain ASCII hyphens.
% NOTE(review): the editor list embedded in that booktitle looks scrambled;
% verify it against the proceedings before moving it into an editor field.

@article{landqvist-rogstrom-2024-genreutveckling-341609,
  title = {Genreutveckling i vetenskaplig prosa. Clas Bjerkanders entomologiska rön i Kungliga Vetenskapsakademiens Handlingar 1775–1795},
  abstract = {The purpose of this article is to analyze the early (natural) scientific genre development in Sweden during the 18th century, focusing on the Royal Swedish Academy of Sciences, and its Transactions, as a discursive community. The Transactions are considered an early example of what we now refer to as sakprosa but have never been thoroughly analyzed regarding its genre specific characteristics. Bjerkander’s 26 findings on entomology are chosen as material for this study, grounded in genre analysis (Swales 1990, 2004; Bhatia 2004). The results reveal that Bjerkander’s entomological findings evolve into three subgenres. 
From a genre development perspective, Bjerkander’s texts primarily reflect an establishing stage (Gunnarsson 2011), but they anticipate a specialized stage in three instances: Bjerkander’s use of the so-called CARS model, his methodological awareness, and his use of references to previous research.},
  author  = {Landqvist, Hans and Rogström, Lena},
  journal = {Folkmålsstudier},
  year    = {2024},
  volume  = {62},
  pages   = {89--125},
}
% Page range above uses the BibTeX double hyphen instead of a literal en dash.

% The editor of the volume below was embedded in booktitle; it now has its own field.
@inproceedings{sander-etal-2024-durel-341867,
  author    = {Sander, Pauline and Hengchen, Simon and Zhao, Wei and Ma, Xiaocheng and Sköldberg, Emma and Virk, Shafqat and Schlechtweg, Dominik},
  title     = {The DURel Annotation Tool},
  booktitle = {Book of Abstracts of the Workshop Large Language Models and Lexicography, 8 October 2024 Cavtat, Croatia},
  editor    = {Krek, Simon},
  year      = {2024},
}

@inproceedings{landqvist-2024-forutsattningar-337141,
  author    = {Landqvist, Hans},
  title     = {Förutsättningar, upplägg och utvärderingar. Utmaningar för och möjligheter med två utbildningar i terminologi vid Göteborgs universitet},
  booktitle = {Att undervisa i terminologi – utmaningar och möjligheter. Konferens 23–24 maj i Stockholm},
  year      = {2024},
}

@techreport{morger-2024-when-342179,
  title = {When Sparv met Superlim. . . A Sparv Plugin for Natural Language Understanding Analysis of Swedish},
  abstract = {This technical report introduces Sparv-Superlim, a Sparv plugin for natural language understanding analysis of Swedish. It uses the reference models trained on the Superlim multi-task benchmark to add additional analyses to the Sparv Pipeline. I show how to install and configure the tool as well as apply it to analyze Swedish political manifestos to see if the predictions the plugin does align with known political positions of Swedish parties. 
These use cases shows that the reference models vary in their applicability to predict correct sentiments on novel data and illustrates the importance of integrating reference models trained on a multi-task benchmark like Superlim to evaluate the ecological validity of the benchmark.},
  author = {Morger, Felix},
  year   = {2024},
}
% NOTE(review): the techreport entry above has no institution field, which the
% techreport type requires; add it once the issuing body is confirmed.

% Editors moved out of booktitle into the editor field. The former publisher
% value is the series name; series number and page range come from the
% reference list quoted in the abstract of
% landqvist-holmer-2024-finlandismer-342756 in this file.
@inproceedings{holmer-etal-2024-saol-333679,
  author    = {Holmer, Louise and Lillieström, Ann and Sköldberg, Emma and Uppström, Jonatan},
  title     = {SAOL och svensk språkvetenskaplig infrastruktur – nu och i framtiden},
  booktitle = {Proceedings of the Huminfra Conference, 10-11 January, 2024, Gothenburg, Sweden},
  editor    = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna},
  series    = {Linköping Electronic Conference Proceedings},
  number    = {205},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  year      = {2024},
  pages     = {68--75},
  ISBN      = {978-91-8075-512-2},
  abstract  = {Svenska Akademiens ordlista (SAOL 14, 2015) spelar en viktig roll inom svensk språkvetenskaplig infrastruktur, något som framkommer i denna artikel. Vidare presenteras preliminära resultat av en undersökning av hur frekventa uppslagsorden i SAOL egentligen är i olika delkorpusar med modern allmänspråklig svenska. För att ordlistan även fortsättningsvis ska kunna användas inom svensk ordforskning, vid språkstudier m.m., men också bli mer central inom språkteknologiska sammanhang, är det avgörande att SAOL:s uppslagsord vilar på vetenskaplig grund, moderna språkteknologiska metoder och uppdaterade korpusmaterial. Fokus i artikeln ligger på de uppslagsord som inte finns belagda i korpusmaterialet, och som därmed kan tänkas mönstras ut inför den kommande femtonde upplagan.},
}

@inproceedings{skoldberg-2024-andra-337378,
  title = {Andra upplagan av Svensk ordbok (SO) – Förutsättningar, teoretiska överväganden, insatser och mottagande},
  booktitle = {Svenskans beskrivning 38: Förhandlingar vid trettioåttonde sammankomsten. Örebro 4–6 maj 2022, Del I. 
Redigerad av Denny Jansson, Ida Melander, Gustav Westberg \& Daroon Yassin Falk},
  author = {Sköldberg, Emma},
  year   = {2024},
  ISBN   = {978-91-87789-89-2},
}
% The ampersand in the booktitle above is escaped so the field typesets in LaTeX.

% The two publisher cities below were joined by a bullet glyph; they are now
% joined with "and".
@inproceedings{kokkinakis-hammarlin-2024-cluster-338476,
  author    = {Kokkinakis, Dimitrios and Hammarlin, Mia-Marie},
  title     = {Cluster-Based BERTopic Modeling on Swedish COVID-19 Vaccine Posts},
  booktitle = {The 34th Medical Informatics Europe Conference},
  publisher = {IOS Press},
  address   = {Amsterdam and Washington, DC},
  year      = {2024},
  abstract  = {This paper explores the prevalent themes across multiple threads on the popular Swedish discussion forum Flashback. Among its diverse array of topics, the forum actively engages users in addressing and debating questions pertaining to COVID-19 vaccines and vaccination. Through distinguishing between positive and negative perspectives within posts across 14 relevant thread discussions, we employ BERTopic, a modular topic modeling framework, which utilizes pre-trained language models and applies clustering techniques to identify prevailing topics. This enables us to conduct a nuanced exploration of overarching themes, offering valuable insights into the multifaceted nature of the discussions regarding COVID-19 vaccines and vaccination in Sweden.},
}

% NOTE(review): the citation key below contains double-quote characters. It is
% kept unchanged so existing citations keep resolving, but some tools may
% reject it; confirm before renaming.
@inproceedings{landqvist-etal-2024-"appendicit"-337164,
  title = {Hur kan "appendicit", "blodförgiftning" och "hyperaktivitetssyndrom" behandlas? Medicinens fackområde i Svensk ordbok utgiven av Svenska Akademien},
  booktitle = {Svenskans beskrivning 38. Förhandlingar vid trettioåttonde sammankomsten Örebro 4–6 maj 2022. Del II. 
Redigerad av Denny Jansson, Ida Melander, Gustav Westberg \& Daroon Yassin Falk},
  author    = {Landqvist, Hans and Sköldberg, Emma and Holmer, Louise},
  publisher = {Örebro universitet},
  address   = {Örebro},
  year      = {2024},
  ISBN      = {978-91-87789-90-8},
}
% The ampersand in the booktitle above is escaped so the field typesets in LaTeX.

@inproceedings{munozsanchez-2024-when-341073,
  author    = {Muñoz Sánchez, Ricardo},
  title     = {When Hieroglyphs Meet Technology: A Linguistic Journey through Ancient Egypt Using Natural Language Processing},
  booktitle = {3rd Workshop on Language Technologies for Historical and Ancient Languages, LT4HALA 2024 at LREC-COLING 2024 - Workshop Proceedings, 25 May, 2024 Torino, Italia},
  publisher = {ELRA Language Resources Association},
  year      = {2024},
  ISBN      = {9782493814463},
  abstract  = {Knowing our past can help us better understand our future. The explosive development of NLP in these past few decades has allowed us to study ancient languages and cultures in ways that we couldn’t have done in the past. However, not all languages have received the same level of attention. Despite its popularity in pop culture, the languages spoken in Ancient Egypt have been somewhat overlooked in terms of NLP research. In this survey paper we give an overview of how NLP has been used to study different variations of the Ancient Egyptian languages. This not only includes Old, Middle, and Late Egyptian but also Demotic and Coptic. We begin by giving a short introduction to these languages and their writing systems, before talking about the corpora and lexical resources that are available digitally. We then show the different NLP tasks that have been tackled for different variations of Ancient Egyptian, as well as the approaches that have been used. We hope that our work can stoke interest in the study of these languages within the NLP community.},
}

@article{landqvist-skoldberg-2024-interjektioner-336473,
  title = {Interjektioner som lexikografisk utmaning. 
En fallstudie av interjektioner med engelskt ursprung utifrån Svensk ordbok utgiven av Svenska Akademien}, abstract = {In this article, a qualitative and, to some extent, comparative metalexicographic case study is reported. The study will answer two research questions: (1) How are the interjections "sorry", "shit" and "wow" described in The Contemporary Dictionary of the Swedish Academy (SO) compared to the corresponding dictionary articles in The Danish Dictionary (DDO) and the Norwegian Academy’s Dictionary (NAOB) and how can the SO descriptions be developed?; (2) How can the interjections "yes", "nice/najs" and "woho/wohoo" be analyzed and then described in new dictionary articles in an updated version of SO? The point of departure for answering both RQs is a number of information categories that are common in dictionary articles. Furthermore, the use of the current interjections in contemporary corpora and text collections for Swedish are crucial for the investigation. The results of the study show that interjections as a category implies several challenges for lexicographers regarding information about their spelling, pronunciation, and inflection, meaning, language examples, usage comments as well as information about their establishment, origin, and kinship. Finally, some suggestions are presented for how the description of interjections in the dictionary can be developed.}, journal = {ASLA:s skriftserie/ASLA Studies in Applied Linguistics}, author = {Landqvist, Hans and Sköldberg, Emma}, year = {2024}, volume = {31}, pages = {26--55}, } @inProceedings{masciolini-etal-2024-synthetic-338288, title = {Synthetic-Error Augmented Parsing of Swedish as a Second Language: Experiments with Word Order}, abstract = {Ungrammatical text poses significant challenges for off-the-shelf dependency parsers. In this paper, we explore the effectiveness of using synthetic data to improve performance on essays written by learners of Swedish as a second language. 
Due to their relevance and ease of annotation, we restrict our initial experiments to word order errors. To do that, we build a corrupted version of the standard Swedish Universal Dependencies (UD) treebank Talbanken, mimicking the error patterns and frequency distributions observed in the Swedish Learner Language (SweLL) corpus. We then use the MaChAmp (Massive Choice, Ample tasks) toolkit to train an array of BERT-based dependency parsers, fine-tuning on different combinations of original and corrupted data. We evaluate the resulting models not only on their respective test sets but also, most importantly, on a smaller collection of sentence-correction pairs derived from SweLL. Results show small but significant performance improvements on the target domain, with minimal decline on normative data.}, booktitle = {Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024, May 25, 2024, Torino, Italia}, author = {Masciolini, Arianna and Francis, Emilie and Szawerna, Maria Irena}, year = {2024}, publisher = {ELRA and ICCL}, address = {Torino, Italy}, ISBN = {978-2-493814-20-3}, } @inProceedings{schlechtweg-etal-2024-durel-336715, title = {The DURel Annotation Tool: Human and Computational Measurement of Semantic Proximity, Sense Clusters and Semantic Change}, abstract = {We present the DURel tool implementing the annotation of semantic proximity between word uses into an online, open source interface. The tool supports standardized human annotation as well as computational annotation, building on recent advances with Word-in-Context models. Annotator judgments are clustered with automatic graph clustering techniques and visualized for analysis. This allows to measure word senses with simple and intuitive micro-task judgments between use pairs, requiring minimal preparation efforts. 
The tool offers additional functionalities to compare the agreement between annotators to guarantee the inter-subjectivity of the obtained judgments and to calculate summary statistics over the annotated data giving insights into sense frequency distributions, semantic variation or changes of senses over time.},
  author    = {Schlechtweg, Dominik and Virk, Shafqat and Sander, Pauline and Sköldberg, Emma and Theuer Linke, Lukas and Zhang, Tuo and Tahmasebi, Nina and Schulte im Walde, Sabine},
  booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, March 17-22, 2024, St. Julian’s, Malta},
  publisher = {Association for Computational Linguistics},
  year      = {2024},
  ISBN      = {979-8-89176-091-2},
}
% The booktitle above no longer ends in a period and blank (styles add their own
% punctuation), and the venue is spelled as in szawerna-2024-stanza-336413.

@inproceedings{szawerna-2024-stanza-336413,
  title = {Can Stanza be Used for Part-of-Speech Tagging Historical Polish?},
  abstract = {The goal of this paper is to evaluate the performance of Stanza, a part-of-speech (POS) tagger developed for modern Polish, on historical text to assess its possible use for automating the annotation of other historical texts. While the issue of the reliability of utilizing POS taggers on historical data has been previously discussed, most of the research focuses on languages whose grammar differs from Polish, meaning that their results need not be fully applicable in this case. The evaluation of Stanza is conducted on two sets of 10286 and 3270 manually annotated tokens from a piece of historical Polish writing (1899), and the errors are analyzed qualitatively and quantitatively. 
The results show a good performance of the tagger, especially when it comes to Universal Part-of-Speech (UPOS) tags, which is promising for utilizing the tagger for automatic annotation in larger projects, and pinpoint some common features of misclassified tokens.}, booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop, March 21-22, 2024, St. Julian’s, Malta}, author = {Szawerna, Maria Irena}, year = {2024}, publisher = {Association for Computational Linguistics}, ISBN = {979-8-89176-090-5}, } @inProceedings{holdt-etal-2024-towards-341134, title = {Towards an Ideal Tool for Learner Error Annotation}, abstract = {Annotation and analysis of corrections in learner corpora have always presented technical challenges, mainly on account of the fact that until now there has not been any standard tool available, and that original and corrected versions of texts have been mostly stored together rather than treated as individual texts. In this paper, we present CJVT Svala 1.0, the Slovene version of the SVALA tool, which was originally used for the annotation of Swedish learner language. The localisation into Slovene resulted in the development of several new features in SVALA such as the support for multiple annotation systems, localisation into other languages, and the support for more complex annotation systems. Adopting the parallel aligned approach to text visualisation and annotation, as well as storing the data, combined with the tool supporting this, i.e. 
SVALA, are proposed as new standards in Learner Corpus Research.}, booktitle = {2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation, LREC-COLING 2024 - Main Conference Proceedings}, author = {Holdt, Špela Arhar and Erjavec, Tomaž and Kosem, Iztok and Volodina, Elena}, year = {2024}, ISBN = {9782493814104}, } @inProceedings{landqvist-holmer-2024-finlandismer-342756, title = {Finlandismer i SAOL 14: markeringssätt och informationstyper}, abstract = {Svenska Akademiens ordlista (SAOL) är den inofficiella normen för stavning och böjning av orden som ingår i aktuell upplaga av SAOL (Borin & Holmer 2024:42). Den första upplagan publicerades 1874, den fjortonde och senaste upplagan, SAOL14, utgavs 2015 och SAOL15 är planerad att utkomma 2025 (Holmer et al. 2024:68). Alltsedan SAOL11 (1986) ingår det i ordlistan ett antal uppslagsord och ordbetydelser vilka enbart eller främst används i finlandssvenskan och ofta kallas ”finlandismer” (af Hällström-Reijonen 2015:104–111; se t.ex. af Hällström-Reijonen 2015:99–100 om begreppen ’finlandssvenska’ och ’finlandism’). Antalet finlandismer i SAOL14 uppges vara mellan 240 och 260 (SAOL14:XVII; af Hällström-Reijonen 2015:105). Vägledande för urvalet har varit frekvens och aktualitet, geografisk spridning i svensktalande områden i Finland samt acceptabilitet. Den sistnämnda principen innebär att ”man inte ska ta med sådana finlandismer i SAOL som språkvårdarna vid Institutet för de inhemska språken avråder från i andra sammanhang […]” (af Hällström-Reijonen 2015:108–109). Urvalet av finlandismer i SAOL14 har alltså gjorts efter angivna principer. Men hur får användare av SAOL14 veta (1) vilka uppslagsord och ordbetydelser som bedöms vara finlandismer? (2) varför orden och betydelserna klassificeras som finlandismer? 
För att besvara den första forskningsfrågan görs sökningar i Svenska Akademiens lexikala databas (Salex), som utvecklas inom Språkbanken Text, och olika sätt att markera ”finlandism-status” kartläggs (se Holmer et al. 2024:69 om Salex; jfr Svensén 2004:374–397 om markeringssystem i ordböcker). Utgångspunkten för att besvara den andra forskningsfrågan är Bo Svenséns kategorisering av informationstyper som kan användas i ordböcker (Svensén 2004:9–12). Med hänsyn till informationstyper som faktiskt används i SAOL14 kan ”finlandism-status” i ordlistan, i alla fall potentiellt, avse fyra huvudkategorier: (a) formell information (stavning, uttal, morfologi, då både ordböjning och ordbildning), (b) syntagmatisk information (ordklasstillhörighet, konstruktionssätt, kollokationer, idiom), (c) semantisk information (betydelse) och/eller (d) pragmatisk information (förekomst, bruklighet). Referenser Borin, L. & Holmer, L. (2024). Tradita innovare, innovata tradere. The Gothenburg approach to computational lexicography. I: Proceedings of the Huminfra Conference (HiC 2024), s. 41–50. Red. Volodina, E., Bouma, G., Forsberg, M., Kokkinakis, D., Alfter, D., Fridlund, M., Horn, C., Ahrenberg, L. & Blåder, A. Linköping Electronic Conference Proceedings 205. Tillgänglig: https://ecp.ep.liu.se/hic Holmer, L., Lillieström, A., Sköldberg, E. & Uppström, J. (2024). SAOL och svensk språkvetenskaplig infrastruktur – nu och i framtiden. I: Proceedings of the Huminfra Conference (HiC 2024), s. 68–75. Red. Volodina, E., Bouma, G., Forsberg, M., Kokkinakis, D., Alfter, D., Fridlund, M., Horn, C., Ahrenberg, L. & Blåder, A. Linköping Electronic Conference Proceedings 205. Tillgänglig: https://ecp.ep.liu.se/hic af Hällström-Reijonen, C. (2015). Finlandssvenska i SAOL och andra ordböcker. LexicoNordica 22, s. 99–115. Tillgänglig: https://tidsskrift.dk/index.php/lexn/issue/archive SAOL14 = Svenska Akademiens ordlista över svenska språket (2015). 14 uppl. Stockholm: Norstedts i distribution. 
Tillgänglig: https://www.gu.se/svenska-spraket/saol-svenska-akademiens-ordlista Svensén, B. (2004) [1987]. Handbok i lexikografi. Ordböcker och ordboksarbete i teori och praktik. 2 omarbetade och utökade uppl. Stockholm: Norstedts Akademiska Förlag.},
  booktitle = {Svenskan i Finland 21},
  author = {Landqvist, Hans and Holmer, Louise},
  year = {2024},
}

% Whole proceedings volume: typed as proceedings, with the listed people
% recorded as volume editors (NOTE(review): editor role assumed -- confirm).
@proceedings{volodina-etal-2024-proceedings-336386,
  title = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, Malta},
  editor = {Volodina, Elena and Alfter, David and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Szawerna, Maria Irena and Vu, Xuan-Son},
  year = {2024},
  publisher = {Association for Computational Linguistics},
  address = {Stroudsburg, PA},
  ISBN = {979-8-89176-085-1},
}

@book{borin-etal-2024-vaccine-341185,
  title = {Vaccine Hesitancy in the Nordic Countries: Trust and Distrust During the COVID-19 Pandemic},
  abstract = {Bringing together studies from across the Nordic region, this book examines the challenges brought by the COVID-19 pandemic, with a particular focus on vaccine hesitancy. Shedding light on the political tensions that emerged as a result of the pandemic and the debates that ensued both within and between the Nordic nations, it investigates the vociferous discussions surrounding the COVID-19 vaccines and their presumed negative side effects through the lens of trust; trust in and between the neighbouring countries, in healthcare systems, fellow citizens, and experts; in public authorities, politicians, researchers, journalists, and pharmaceutical companies. The first volume to explore vaccine hesitancy in the Scandinavian context, this ground-breaking volume offers fresh perspectives on vaccine scepticism not as a form of ignorance or lack of knowledge, but as a manifestation of a more fundamental lack of faith in modern government and science.
As such, it will appeal to scholars of sociology, politics, anthropology, media studies, communication and cultural studies with interests in public health, popular and political discourse and questions of public trust.},
  editor = {Borin, Lars and Hammarlin, Mia-Marie and Kokkinakis, Dimitrios and Miegel, Fredrik},
  year = {2024},
  publisher = {Taylor and Francis},
  ISBN = {9781040011614},
}

@inProceedings{munozsanchez-etal-2024-name-339981,
  title = {Name Biases in Automated Essay Assessment},
  abstract = {Artificial intelligence is being deployed in high-stakes situations, such as automated grading of second language essays in proficiency assessment. While they can improve the opportunities students have (education, work opportunities, etc.), such systems often display human-like biases. Aldrin (2017) notes that human graders have a slight bias based on names appearing in essay texts. We aim to identify whether the same pattern holds in automated systems. In this study we aim to answer the following research questions: 1) Does changing given names inside a second language learner essay affect the way the text is graded? 2) How much does this differ between feature-based machine learning and deep learning? For this, we use a de-anonymized (i.e. original) version of the Swell-pilot corpus of second language Swedish learner essays (Volodina 2016), which consists of 502 essays annotated with CEFR levels as our source data. First, we compile four lists of given names inspired by those of Aldrin (2017): traditional Swedish names; modern Swedish names of Anglo-American origin; Finnish names (due to the close sociocultural links between both countries); and names of Arabic origin (the most prominent group of learners in the corpus). Second, we create a diagnostic dataset to identify biases in the classification task. We select SweLL-pilot essays in which a given name appears only once.
Then, we generate an essay version for each name on the lists by substituting the name in the original text with one from the list. Third, we fine-tune a BERT (Devlin et al. 2019) model on the original SweLL-pilot data to predict the CEFR level of a given essay and compare it to an existing feature-based model (Pilan 2016). Finally, we test the two models and compare the equality of opportunity between the different given name groups on the diagnostic dataset.},
  booktitle = {The 28th International Congress of Onomastic Sciences (ICOS 28), 19-23 August, 2024, Helsinki, Finland},
  author = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Lindström Tiedemann, Therese and Szawerna, Maria Irena and Volodina, Elena},
  year = {2024},
}

@article{landqvist-2024-"gratis-344586,
  title = {”Gratis broddar till dig som är 65 år och bor i Göteborg”: En fallstudie av legitimeringsstrategier i svenska kommuners webbtexter riktade till seniorer},
  abstract = {Offers of free ice cleats to senior citizens is a rather new phenomenon in Sweden. Therefore, Swedish municipalities should strive to linguistically legitimize such offers to convince senior citizens to use ice cleats during the winter season. The research questions for the reported study are (1) Which main strategies for legitimation appear in the municipalities’ texts? (2) Which subtypes to different main strategies appear in the municipalities’ texts? (3) What general picture of the municipalities’ approach to senior citizens can the identified strategies be said to convey? A corpus of 23 texts on municipal websites, published in 2022 or 2023, is analysed, using van Leeuwen's model for legitimation analysis (2008) as a theoretical-methodical basis. The results of the study show that ice cleat offers are legitimized lexicogrammatically mainly through the two main legitimation strategies Authorization and Rationalization.
The main strategy Moral Evaluation, with a focus on public health and the health and well-being of senior citizens, is also used. No instances of the main strategy Mythopoesis are identified. Several subtypes to the three main strategies Authorization, Rationalization and Moral Evaluation are identified in the corpus. Some of the strategies for legitimation used can be deemed as an expression of an equal relationship between municipalities and senior citizens, while others can be said to express an unequal relationship.},
  journal = {VAKKI Publications: Diversity in Communication},
  author = {Landqvist, Hans},
  year = {2024},
  volume = {16},
  pages = {67--89},
}

@inProceedings{munozsanchez-etal-2024-harnessing-342122,
  title = {Harnessing GPT to Study Second Language Learner Essays: Can We Use Perplexity to Determine Linguistic Competence?},
  abstract = {Generative language models have been used to study a wide variety of phenomena in NLP. This allows us to better understand the linguistic capabilities of those models and to better analyse the texts that we are working with. However, these studies have mainly focused on text generated by L1 speakers of English. In this paper we study whether linguistic competence of L2 learners of Swedish (through their performance on essay tasks) correlates with the perplexity of a decoder-only model (GPT-SW3). We run two sets of experiments, doing both quantitative and qualitative analyses for each of them. In the first one, we analyse the perplexities of the essays and compare them with the CEFR level of the essays, both from an essay-wide level and from a token level. In our second experiment, we compare the perplexity of an L2 learner essay with a normalised version of it. We find that the perplexity of essays tends to be lower for higher CEFR levels and that normalised essays have a lower perplexity than the original versions.
Moreover, we find that different factors can lead to spikes in perplexity, not all of them being related to L2 learner language.},
  booktitle = {Proceedings of the 19th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2024), June 20, 2024, Mexico City, Mexico},
  author = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Volodina, Elena},
  year = {2024},
  publisher = {Association for Computational Linguistics},
  address = {Mexico City, Mexico},
  ISBN = {979-8-89176-100-1},
}

@article{landqvist-etal-2024-termer-339976,
  title = {Termer för grundämnena H, N och O: resultat av nordiskt terminologisamarbete från 1950-tal till 2020-tal},
  abstract = {Den som vill bedriva ett framgångsrikt terminologiarbete måste ha samarbetspartners. År 1981 formulerade Christer Laurén detta faktum som att ”[b]åde ingenjören och språkmannen behövs i fackspråklig språkvård”, medan Henrik Nilsson fyrtio år senare konstaterade att ”[t]erminologer behöver experter för att kunna genomföra ett terminologiarbete av god kvalitet” (Laurén 1981, s. 9; Nilsson 2021, s. 77). Samarbetet kan ske inom ett land eller mellan personer och institutioner i flera länder (Bucher 2016a; Bucher 2017; Nilsson 2021). Den här artikeln handlar om terminologi(sam)arbete i Norden.},
  journal = {Sprog i Norden 2024/Språk i Norden 2024. Tema: Nordterm 23 Terminologi i samhällets tjänst (Red. Kirsten Lindø Dolberg-Møller)},
  author = {Landqvist, Hans and Nissilä, Niina and Sjöberg, Sannina},
  year = {2024},
  pages = {137--152},
}

@article{blensenius-2024-aligning-343305,
  title = {Aligning grammatical information in linguistic resources published by the same authority. The case of participles in Swedish Academy dictionaries and grammar},
  abstract = {This article discusses strategies involved in aligning word classes (parts of speech), particularly participles, in two dictionaries and one grammar for Swedish, all issued by the same publishing house, which at the same time is a language authority in Sweden.
The dictionaries are Svenska Akademiens ordlista (‘The Swedish Academy glossary’), abbreviated as SAOL, and Svensk ordbok utgiven av Svenska Akademien (‘The Contemporary Dictionary of the Swedish Academy’), abbreviated as SO. The grammar in question is Svenska Akademiens grammatik (‘The Swedish Academy grammar’), abbreviated as SAG. I will discuss whether it is possible or desirable to harmonize word classes in dictionaries and grammars from the same publisher, in this case from the Swedish Academy.},
  journal = {Lexicographica : Internationales Jahrbuch für Lexikographie},
  author = {Blensenius, Kristian},
  year = {2024},
  volume = {40},
  number = {1},
  pages = {81--94},
}

@inProceedings{lofgren-dannells-2024-post-336065,
  title = {Post-OCR Correction of Digitized Swedish Newspapers with ByT5},
  abstract = {Many collections of digitized newspapers suffer from poor OCR quality, which impacts readability, information retrieval, and analysis of the material. Errors in OCR output can be reduced by applying machine translation models to translate it into a corrected version. Although transformer models show promising results in post-OCR correction and related tasks in other languages, they have not yet been explored for correcting OCR errors in Swedish texts. This paper presents a post-OCR correction model for Swedish 19th to 21th century newspapers based on the pre-trained transformer model ByT5. Three versions of the model were trained on different mixes of training data.
The best model, which achieved a 36\% reduction in CER, is made freely available and will be integrated into the automatic processing pipeline of Språkbanken Text, a Swedish language technology infrastructure containing modern and historical written data.},
  booktitle = {Proceedings of the 8th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2024), March 22, 2024, Malta},
  author = {Löfgren, Viktoria and Dannélls, Dana},
  year = {2024},
  publisher = {Association for Computational Linguistics},
  address = {East Stroudsburg, PA, USA},
  ISBN = {979-8-89176-069-1},
}

@inProceedings{lyngfelt-etal-2024-flersprakig-338191,
  title = {Flerspråkig konstruktikografi med hjälp av språkneutrala jämförelsebegrepp},
  booktitle = {Svenskans beskrivning. Förhandlingar vid trettioåttonde sammankomsten, del 1, Örebro 4–6 maj 2022},
  author = {Lyngfelt, Benjamin and Andréasson, Maia and Blensenius, Kristian and Bäckström, Linnéa and Höder, Steffen and Ljunglöf, Peter and Uppström, Jonatan},
  year = {2024},
  publisher = {Örebro universitet},
  address = {Örebro},
  ISBN = {978-91-87789-89-2},
}

@inProceedings{dannells-etal-2024-transformer-338708,
  title = {Transformer-based Swedish Semantic Role Labeling through Transfer Learning},
  abstract = {Semantic Role Labeling (SRL) is a task in natural language understanding where the goal is to extract semantic roles for a given sentence. English SRL has achieved state-of-the-art performance using Transformer techniques and supervised learning. However, this technique is not a viable choice for smaller languages like Swedish due to the limited amount of training data. In this paper, we present the first effort in building a Transformer-based SRL system for Swedish by exploring multilingual and cross-lingual transfer learning methods and leveraging the Swedish FrameNet resource.
We demonstrate that multilingual transfer learning outperforms two different cross-lingual transfer models. We also found some differences between frames in FrameNet that can either hinder or enhance the model’s performance. The resulting end-to-end model is freely available and will be made accessible through Språkbanken Text’s research infrastructure.},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), 20-25 May, 2024, Torino, Italia},
  author = {Dannélls, Dana and Johansson, Richard and Buhr, Lucy Yang},
  year = {2024},
  publisher = {ELRA and ICCL},
  address = {Turin, Italy},
  ISBN = {978-2-493814-10-4},
}

@inProceedings{skoldberg-etal-2024-revealing-341866,
  title = {Revealing Semantic Variation in Swedish Using Computational Models of Semantic Proximity–Results From Lexicographical Experiments},
  abstract = {The paper reports a pilot study on the detection of lexical semantic variation in modern Swedish. The starting point of the study is the meaning descriptions of around 65,000 headwords in ’The Contemporary Dictionary of the Swedish Academy’ (SO, 2021) covering approximately 100,000 different senses. In our work, we aim to explore the potential of the latest computational methods to discover outdated definitions in SO and update them. For this, we make use of the DURel tool (Schlechtweg et al., 2018, 2024) which relies on state-of-the-art language models for the automatic semantic analysis of word usages. The work resulted in drawing lexicographers’ attention to both main senses and subsenses that should be added to the dictionary. It has also demonstrated that certain meaning descriptions in SO are too general and should be split in accordance with the current principles for the semantic descriptions in the dictionary.},
  booktitle = {Lexicography and Semantics. Proceedings of the XXI EURALEX International Congress 8–12 October 2024 Cavtat, Croatia (eds.
Kristina Š. Despot, Ana Ostroški Anić \& Ivana Brač)},
  author = {Sköldberg, Emma and Virk, Shafqat and Sander, Pauline and Hengchen, Simon and Schlechtweg, Dominik},
  year = {2024},
  publisher = {Institut za hrvatski jezik},
  ISBN = {978-953-7967-77-2},
}

@inProceedings{lorenzi-etal-2024-mocca-338189,
  title = {MoCCA: A Model of Comparative Concepts for Aligning Constructicons},
  abstract = {This paper presents MoCCA, a Model of Comparative Concepts for Aligning Constructicons under development by a consortium of research groups building Constructicons of different languages including Brazilian Portuguese, English, German and Swedish. The Constructicons will be aligned by using comparative concepts (CCs) providing language-neutral definitions of linguistic properties. The CCs are drawn from typological research on grammatical categories and constructions, and from FrameNet frames, organized in a conceptual network. Language-specific constructions are linked to the CCs in accordance with general principles. MoCCA is organized into files of two types: a largely static CC Database file and multiple Linking files containing relations between constructions in a Constructicon and the CCs. Tools are planned to facilitate visualization of the CC network and linking of constructions to the CCs.
All files and guidelines will be versioned, and a mechanism is set up to report cases where a language-specific construction cannot be easily linked to existing CCs.},
  booktitle = {Proceedings of the 20th Joint ACL - ISO Workshop on Interoperable Semantic Annotation @LREC-COLING-2024, 20 May, 2024, Torino, Italia},
  author = {Lorenzi, Arthur and Ljunglöf, Peter and Lyngfelt, Benjamin and Torrent, Tiago Timponi and Croft, William and Ziem, Alexander and Böbel, Nina and Bäckström, Linnéa and Uhrig, Peter and Matos, Ely},
  year = {2024},
  publisher = {ELRA},
  ISBN = {978-2-493814-32-6},
}

@inProceedings{kokkinakis-etal-2024-analyzing-342781,
  title = {Analyzing Segregation Discourse in Sweden: Technological Methods and Empirical Data},
  abstract = {This paper outlines some of the empirical resources and language technology tools to be used in the project “Language(s) of segregation: Interdisciplinary perspectives on spatial, social, and symbolic division in cities.” The aim of this project is to examine the construction of segregation discourse in Sweden, its implementation as urban policy, and its impact and experience in everyday life. By integrating perspectives from linguistics, public administration, and urban ethnography, this study analyzes various forms of segregation — such as educational and residential — using large corpora to identify patterns and address related disparities. Key resources include political discourse, social media, and press coverage, while language technology tools like word vectors, network and sentiment analysis, and topic modeling will be employed.
Preliminary findings provide early insights into the complex dynamics of segregation across different contexts.},
  booktitle = {Tenth Swedish Language Technology Conference (SLTC)},
  author = {Kokkinakis, Dimitrios and Wojahn, Daniel and Järlehed, Johan},
  year = {2024},
}

@inProceedings{ljunglof-etal-2024-binary-342402,
  title = {Binary indexes for optimising corpus queries},
  abstract = {To be able to search for patterns in annotated text corpora is crucial for many different research disciplines. However, searching for complex patterns in large corpora can take long time – sometimes several minutes or even hours. We investigate how inverted indexes can be used for efficient searching in large annotated corpora, and in particular binary indexes. We show how corpus queries are translated into lookups in unary and binary inverted indexes, and give efficient strategies for combining the results using efficient set operations. In addition we discuss how to make use of binary indexes for more complex query types.},
  booktitle = {Proceedings of the 20th Conference on Natural Language Processing (KONVENS 2024), September 10-13, 2024, Vienna, Austria},
  author = {Ljunglöf, Peter and Smallbone, Nicholas and Thoresson, Mijo and Salomonsson, Victor},
  year = {2024},
  publisher = {Association for Computational Linguistics},
  ISBN = {9798331304843},
}

@incollection{hammarlin-etal-2024-fearing-336154,
  title = {Fearing mRNA - A mixed methods study of vaccine rumours},
  abstract = {There are well-spread ideas among vaccine-critical individuals around the world that “new” vaccines might be more dangerous to health than other, “traditional” vaccines, which can lead to vaccine hesitancy; the “delay in acceptance or refusal of vaccination despite availability of vaccination services”. For example, a recurring remark made in social media is that mRNA technology resembles a chip that alters the human DNA, which might permanently and irreparably damage the immune system.
These ideas sometimes take the shape of rumours and conspiracy theories. Drawing on rumour theories and social cognitive perspectives, the aim of this chapter is to account for the purpose and the spreading of medical rumours that encircle mRNA COVID-19 vaccines. Our research questions are: How are rumours concerning mRNA expressed and established? In terms of trust and distrust, what function do the rumours have?},
  booktitle = {Vaccine Hesitancy in the Nordic Countries: Trust and Distrust During the COVID-19 Pandemic},
  editor = {Borin, Lars and Hammarlin, Mia-Marie and Kokkinakis, Dimitrios and Miegel, Fredrik},
  author = {Hammarlin, Mia-Marie and Kokkinakis, Dimitrios and Miegel, Fredrik and Stoencheva, Jullietta},
  year = {2024},
  publisher = {Routledge - Taylor \& Francis Group},
  address = {New York},
  ISBN = {978-1-032-30599-8},
  pages = {157--184},
}

@inProceedings{kokkinakis-2024-from-336089,
  title = {From Zipf distribution to Universal Dependencies - Interactive Notebooks for Swedish Text Analysis},
  abstract = {Notebook-based environments are powerful (web-based) interactive development resources for conducting exploratory (textual) data analysis (EDA). These environments allow the embedding of code (code snippets in ‛code cells’) which can be easily executed with the results immediately presented into the user’s window.
This paper introduces some basic exploratory tools and techniques using JupyterLab notebooks, applied to Swedish using a subcorpus that address various topics related to the COVID-19 pandemic published during January-December 2021},
  booktitle = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden},
  author = {Kokkinakis, Dimitrios},
  year = {2024},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-8075-512-2},
}

@inProceedings{albertin-kokkinakis-2024-defining-342782,
  title = {Defining Cohesion Features in the Study of Discourse Properties in Cognitive Impairment},
  abstract = {The analysis of discourse and pragmatics, which deteriorate alongside other linguistic levels in cognitive decline, can enhance our understanding of dementia-related language patterns and contribute to the improvement of automated diagnostic tools. This study focuses on discourse cohesion, specifically investigating three linguistic phenomena: reference, lexical repetition, and connectives. Six features related to these categories were defined and automatically extracted from an Italian corpus of semi-spontaneous speech, collected from patients with early dementia, MCI subjects, and healthy controls. Some of these features proved significant in distinguishing among the three groups. Additional quantitative analysis revealed notable differences in the use of these elements, suggesting a potential link between their degradation and cognitive decline.},
  booktitle = {Tenth Swedish Language Technology Conference (SLTC)},
  author = {Albertin, Giorgia and Kokkinakis, Dimitrios},
  year = {2024},
  pages = {4},
}

@inProceedings{bouma-etal-2024-konsten-333683,
  title = {Konsten att bedriva svensk ordforskning utan att kränka upphovsrätten},
  abstract = {Vi beskriver KB-labb och Språkbanken Texts samarbete för att underlätta ordforskning på de upphovsrätts-skyddade korpusar som finns i Kungliga bibliotekets samlingar.
Satsningen har hittills lett till två öppna datasamlingar, Kubord 1 och 2, som ger tillgång till ordstatistik och ordsamförekomststatistik. Vi beskriver även Kubord-fastText, en samling vektormodeller som är baserade på samma korpusar, som är under utveckling},
  booktitle = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden},
  editor = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna},
  author = {Bouma, Gerlof and Forsberg, Markus and Sikora, Justyna and Sköldberg, Emma},
  year = {2024},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-8075-512-2},
}

@inProceedings{belmonte-etal-2024-automatic-336253,
  title = {Automatic Detection of Rhythmic Features in Pathological Speech of MCI and Dementia Patients},
  abstract = {The presence of linguistic alterations represents one of the prodromal signs of cognitive decline associated with dementia. In recent years, a growing body of work has been devoted to the development of algorithms for the automatic linguistic analysis of both oral and written texts, with diagnostic purposes. The extraction of Digital Linguistic Biomarkers from patients' verbal productions can indeed provide a rapid, ecological, and cost-effective system for large-scale screening of the pathology. This article contributes to the ongoing research in the field by exploring a traditionally less studied aspect of language in dementia, namely the rhythmic characteristics of speech. In particular, the paper focuses on the automatic detection of rhythmic features in Italian connected speech. A landmark-based system was developed and evaluated to segment the speech flow into vocalic and consonantal intervals and to calculate several rhythmic metrics.
Additionally, the reliability of these metrics in identifying MCI and dementia patients was tested.},
  booktitle = {RaPID-5: Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments},
  author = {Belmonte, Marica and Gagliardi, Gloria and Kokkinakis, Dimitrios and Tamburini, Fabio},
  year = {2024},
  publisher = {European Language Resources Association (ELRA)},
  ISBN = {978-2-493814-11-1},
}

@article{forsberg-holmer-2024-datatillgang-343814,
  title = {Datatillgång, metodutveckling och lexikografiskt arbete vid Språkbanken Text},
  journal = {LexicoNordica},
  author = {Forsberg, Markus and Holmer, Louise},
  year = {2024},
  volume = {31},
  pages = {61--79},
}

@inProceedings{broden-etal-2024-samforfattande-335726,
  title = {Samförfattande som datadriven tvärvetenskap: Pragmatiska lärdomar från SweTerror-projektet},
  abstract = {Terrorism i svensk politik (SweTerror) är ett storskaligt tvärvetenskapligt forskningsprojekt med forskare från såväl human- och samhällsvetenskaperna som datavetenskaperna. Samtidigt använder och utvecklar SweTerror nationell forskningsinfrastruktur för riksdagsdata. Detta paper beskriver användningen av samförfattande som en datadriven tvärvetenskaplig praktik för att integrera olika vetenskapliga perspektiv och skapa samsyn i projektforskningen. Vi tar fasta på betydelsen av valet att koncentrera samarbetsformen kring konferenspapers inom specifikt digital humaniora och diskuterar erfarenheten av att samskrivande försvagar vetenskapligt revirtänkande, liksom ett iterativt förhållningssätt till forskningsdata kopplade till forskningsinfrastrukturer under uppbyggnad.
Avslutningsvis betonar vi datadrivet samförfattande som en pragmatisk praktik för att stärka kollaborativt samarbete och kunskapsbryggor inom en tvärvetenskaplig forskargrupp.},
  booktitle = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden},
  author = {Brodén, Daniel and Fridlund, Mats and Olsson, Leif-Jöran and Ängsal, Magnus Pettersson and Öhberg, Patrik},
  year = {2024},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-8075-512-2},
}

% Whole proceedings volume: the people listed are the volume editors,
% matching the "Eds." lists carried in the HiC 2024 paper entries.
@proceedings{volodina-etal-2024-proceedings-335190,
  title = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden},
  editor = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna},
  year = {2024},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-8075-512-2},
}

% Volume editors moved out of the booktitle into the editor field.
@incollection{pfaff-bouma-2024-npegl-335993,
  title = {The NPEGL noun phrase database: design and construction},
  booktitle = {Noun phrases in early Germanic languages},
  editor = {Bech, Kristin and Pfaff, Alexander},
  author = {Pfaff, Alexander and Bouma, Gerlof},
  year = {2024},
  publisher = {Language Science Press},
  address = {Berlin},
  ISBN = {978-3-96110-467-3},
  pages = {1--32},
}

@inProceedings{angsal-etal-2024-terrorism-337182,
  title = {Terrorism som tolkningsram: en diskurssemantisk studie av svensk riksdagsdebatt 1993–2018},
  booktitle = {Svenskans beskrivning},
  author = {Ängsal, Magnus Pettersson and Brodén, Daniel and Fridlund, Mats and Olsson, Leif-Jöran and Öhberg, Patrik},
  year = {2024},
  publisher = {Örebro universitet},
  address = {Örebro},
  ISBN = {978-91-87789-91-5},
}

@inProceedings{szawerna-etal-2024-pseudonymization-338089,
  title = {Pseudonymization Categories across Domain Boundaries},
  abstract = {Linguistic data, a component critical not only for research in a variety of fields but also for the
development of various Natural Language Processing (NLP) applications, can contain personal information. As a result, its accessibility is limited, both from a legal and an ethical standpoint. One of the solutions is the pseudonymization of the data. Key stages of this process include the identification of sensitive elements and the generation of suitable surrogates in a way that the data is still useful for the intended task. Within this paper, we conduct an analysis of tagsets that have previously been utilized in anonymization and pseudonymization. We also investigate what kinds of Personally Identifiable Information (PII) appear in various domains. These reveal that none of the analyzed tagsets account for all of the PII types present cross-domain at the level of detailedness seemingly required for pseudonymization. We advocate for a universal system of tags for categorizing PIIs leading up to their replacement. Such categorization could facilitate the generation of grammatically, semantically, and sociolinguistically appropriate surrogates for the kinds of information that are considered sensitive in a given domain, resulting in a system that would enable dynamic pseudonymization while keeping the texts readable and useful for future research in various fields.},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), 20-25 May, 2024, Torino, Italia},
  author = {Szawerna, Maria Irena and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Vu, Xuan-Son and Volodina, Elena},
  year = {2024},
  publisher = {ELRA and ICCL},
  ISBN = {978-2-493814-10-4},
}

@article{volodina-etal-2024-swedish-340630,
  title = {Swedish word family resource},
  abstract = {The article introduces a novel lexical resource for Swedish based on word family principles.
The development of the Swedish Word Family (SweWF) resource is set into the context of linguistic complexity in second language acquisition. The SweWF is particularly appropriate for that, given that it contains lexical items used in second language corpora, namely, in a corpus of coursebook texts, and in a corpus of learner essays. The main focus of the article is on the construction of the resource with its user interface and on its applicability for research, although it also opens vast possibilities for practical applications for language learning, testing and assessment. We demonstrate the value of the resource through several case studies.},
  journal = {ITL - International Journal of Applied Linguistics},
  author = {Volodina, Elena and Ali Mohammed, Yousuf and Lindström Tiedemann, Therese},
  year = {2024},
}

@inProceedings{munozsanchez-etal-2024-names-336384,
  title = {Did the Names I Used within My Essay Affect My Score? Diagnosing Name Biases in Automated Essay Scoring},
  abstract = {Automated essay scoring (AES) of second-language learner essays is a high-stakes task as it can affect the job and educational opportunities a student may have access to. Thus, it becomes imperative to make sure that the essays are graded based on the students’ language proficiency as opposed to other reasons, such as personal names used in the text of the essay. Moreover, most of the research data for AES tends to contain personal identifiable information. Because of that, pseudonymization becomes an important tool to make sure that this data can be freely shared. Thus, our systems should not grade students based on which given names were used in the text of the essay, both for fairness and for privacy reasons. In this paper we explore how given names affect the CEFR level classification of essays of second language learners of Swedish.
We use essays containing just one personal name and substitute it for names from lists of given names from four different ethnic origins, namely Swedish, Finnish, Anglo-American, and Arabic. We find that changing the names within the essays has no apparent effect on the classification task, regardless of whether a feature-based or a transformer-based model is used.}, booktitle = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, St. Julian’s, Malta}, author = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Szawerna, Maria Irena and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2024}, publisher = {Association for Computational Linguistics}, ISBN = {979-8-89176-085-1}, } @article{berdicevskis-etal-2024-drop-326112, title = {To drop or not to drop? Predicting the omission of the infinitival marker in a Swedish future construction}, abstract = {We investigate the optional omission of the infinitival marker in a Swedish future tense construction. During the last two decades the frequency of omission has been rapidly increasing, and this process has received considerable attention in the literature. We test whether the knowledge which has been accumulated can yield accurate predictions of language variation and change. We extracted all occurrences of the construction from a very large collection of corpora. The dataset was automatically annotated with language-internal predictors which have previously been shown or hypothesized to affect the variation. We trained several models in order to make two kinds of predictions: whether the marker will be omitted in a specific utterance and how large the proportion of omissions will be for a given time period. For most of the approaches we tried, we were not able to achieve a better-than-baseline performance.
The only exception was predicting the proportion of omissions using autoregressive integrated moving average models for one-step-ahead forecast, and in this case time was the only predictor that mattered. Our data suggest that most of the language-internal predictors do have some effect on the variation, but the effect is not strong enough to yield reliable predictions.}, journal = {Corpus Linguistics and Linguistic Theory}, author = {Berdicevskis, Aleksandrs and Coussé, Evie and Koplenig, Alexander and Adesam, Yvonne}, year = {2024}, volume = {20}, number = {1}, pages = {219--261}, } @inProceedings{munozsanchez-etal-2024-jingle-342259, title = {Jingle BERT, Jingle BERT, Frozen All the Way: Freezing Layers to Identify CEFR Levels of Second Language Learners Using BERT}, abstract = {In this paper, we investigate the question of how much domain adaptation is needed for the task of automatic essay assessment by freezing layers in BERT models. We test our methodology on three different graded language corpora (English, French and Swedish) and find that partially fine-tuning base models improves performance over fully fine-tuning base models, although the number of layers to freeze differs by language. We also look at the effect of freezing layers on different grades in the corpora and find that different layers are important for different grade levels. Finally, our results represent a new state-of-the-art in automatic essay classification for the three languages under investigation.}, booktitle = {Proceedings of the 13th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2024)}, author = {Muñoz Sánchez, Ricardo and Alfter, David and Dobnik, Simon and Szawerna, Maria Irena and Volodina, Elena}, year = {2024}, publisher = {Linköping Electronic Conference Proceedings}, ISBN = {978-91-8075-774-4}, } @inProceedings{adesam-etal-2024-sprakforandring-337166, title = {Språkförändring på bar gärning.
En mikrodiakron korpusstudie av pågående förändringar i stavning, lexikon och grammatik}, booktitle = {Svenskans beskrivning 38: Förhandlingar vid trettioåttonde sammankomsten. Örebro 4–6 maj 2022, Del I}, author = {Adesam, Yvonne and Berdicevskis, Aleksandrs and Coussé, Evie}, year = {2024}, publisher = {Örebro universitet}, address = {Örebro}, ISBN = {978-91-87789-89-2}, pages = {234--251}, } @inProceedings{szawerna-etal-2024-detecting-336385, title = {Detecting Personal Identifiable Information in Swedish Learner Essays}, abstract = {Linguistic data can — and often does — contain PII (Personal Identifiable Information). Both from a legal and ethical standpoint, the sharing of such data is not permissible. According to the GDPR, pseudonymization, i.e. the replacement of sensitive information with surrogates, is an acceptable strategy for privacy preservation. While research has been conducted on the detection and replacement of sensitive data in Swedish medical data using Large Language Models (LLMs), it is unclear whether these models handle PII in less structured and more thematically varied texts equally well. In this paper, we present and discuss the performance of an LLM-based PII-detection system for Swedish learner essays.}, booktitle = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, St. Julian’s, Malta}, author = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2024}, publisher = {Association for Computational Linguistics}, ISBN = {979-8-89176-085-1}, } @inProceedings{masciolini-2024-bootstrapping-338425, title = {Bootstrapping the Annotation of UD Learner Treebanks}, abstract = {Learner data comes in a variety of formats, making corpora difficult to compare with each other. Universal Dependencies (UD) has therefore been proposed as a replacement for the various ad-hoc annotation schemes. 
Nowadays, the time-consuming task of building a UD treebank often starts with a round of automatic annotation. The performance of the currently available tools trained on standard language, however, tends to decline substantially upon application to learner text. Grammatical errors play a major role, but a significant performance gap has been observed even between standard test sets and normalized learner essays. In this paper, we investigate how to best bootstrap the annotation of UD learner corpora. In particular, we want to establish whether Target Hypotheses (THs), i.e. grammar-corrected learner sentences, are suitable training data for fine-tuning a parser aimed for original (ungrammatical) L2 material. We perform experiments using English and Italian data from two of the already available UD learner corpora. Our results show manually annotated THs to be highly beneficial and suggest that even automatically parsed sentences of this kind might be helpful, if available in sufficiently large amounts.}, booktitle = {Proceedings of the 17th Workshop on Building and Using Comparable Corpora (BUCC) @ LREC-COLING 2024, 20 May, 2024, Torino, Italia}, author = {Masciolini, Arianna}, year = {2024}, publisher = {ELRA and ICCL}, ISBN = {978-2-493814-31-9}, } @article{lindahl-borin-2024-annotation-333043, title = {Annotation for computational argumentation analysis: Issues and perspectives}, abstract = {Argumentation has long been studied in a number of disciplines, including several branches of linguistics. In recent years, computational processing of argumentation has been added to the list, reflecting a general interest from the field of natural language processing (NLP) in building natural language understanding systems for increasingly intricate language phenomena. Computational argumentation analysis – referred to as argumentation mining in the NLP literature – requires large amounts of real-world text with manually analyzed argumentation.
This process is known as annotation in the NLP literature and such annotated datasets are used both as “gold standards” for assessing the quality of NLP applications and as training data for the machine learning algorithms underlying most state of the art approaches to NLP. Argumentation annotation turns out to be complex, both because argumentation can be complex in itself and because it does not come across as a unitary phenomenon in the literature. In this survey we review how argumentation has been studied in other fields, how it has been annotated in NLP and what has been achieved so far. We conclude with describing some important current and future issues to be resolved.}, journal = {Language and Linguistics Compass}, author = {Lindahl, Anna and Borin, Lars}, year = {2024}, volume = {18}, number = {1}, } @inProceedings{masciolini-toth-2024-stund-335974, title = {STUnD: ett Sökverktyg för Tvåspråkiga Universal Dependencies-trädbanker}, abstract = {Föreliggande artikel introducerar STUND, ett Sökverktyg för Tvåspråkiga Universal Dependencies-trädbanker som möjliggör parallella syntaktiska sökningar. Vi demonstrerar dess praktiska tillämpning i en fallstudie på tempusformen presens perfekt i svenska och engelska. Resultaten visar att presens perfekt används i ungefär lika stor utsträckning i båda språken, men att det förekommer viss variation som verkar bero på språkspecifika konventioner och översättningsstrategier.}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), Gothenburg, 10–11 January 2024}, author = {Masciolini, Arianna and Tóth, Márton András}, year = {2024}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @article{holmer-2024-derivatives-343310, title = {Derivatives in Swedish dictionaries.
The case of deverbal nouns in -ande}, abstract = {This article deals with known challenges as well as new ones associated with the lexicographical solutions regarding derivatives in dictionaries. Five of Sweden’s major monolingual dictionaries are being examined with the aim of describing and comparing their derivatives, with special focus on deverbal nouns with the suffix -ande. The research combines morphology, lexicography and metalexicography, aiming at presenting and discussing some of the key areas of lemma inclusion and word formation principles in Swedish monolingual, contemporary dictionaries.}, journal = {Lexicographica. International Annual for Lexicography / Revue Internationale de Lexicographie / Internationales Jahrbuch für Lexikographie}, author = {Holmer, Louise}, year = {2024}, volume = {40}, number = {1}, pages = {59--79}, } @inProceedings{lange-2024-setting-341511, title = {Setting up a Research Data Repository Based on Invenio RDM: An Experience Report}, booktitle = {DHNB 2024: Digital Humanities in the Nordic and Baltic Countries 8th Conference, Reykjavík, Iceland, 27–31 May 2024.}, author = {Lange, Herbert}, year = {2024}, publisher = {University of Oslo library}, address = {Oslo, Norway}, } @inProceedings{holmer-2024-svenska-342760, title = {Så kan svenska ordböcker användas i undervisning}, abstract = {Svenska ordböcker – finns sådana fortfarande? Och hur skulle de kunna användas mer aktivt i undervisningen? Med utgångspunkt i dessa frågor visas och diskuteras i föredraget hur allmänt tillgängliga ordboksresurser är uppbyggda och hur de kan användas aktivt i undervisningen.}, booktitle = {Ämnets dag 2024, Svenska}, author = {Holmer, Louise}, year = {2024}, address = {Göteborg}, } @inProceedings{francis-2024-variation-342620, title = {Variation between Credible and Non-Credible News Across Topics}, abstract = {‘Fake News’ continues to undermine trust in modern journalism and politics.
Despite continued efforts to study fake news, results have been conflicting. Previous attempts to analyse and combat fake news have largely focused on distinguishing fake news from truth, or differentiating between its various sub-types (such as propaganda, satire, misinformation, etc.). This paper conducts a linguistic and stylistic analysis of fake news, focusing on variation between various news topics. It builds on related work identifying features from discourse and linguistics in deception detection by analysing five distinct news topics: Economy, Entertainment, Health, Science, and Sports. The results emphasize that linguistic features vary between credible and deceptive news in each domain and highlight the importance of adapting classification tasks to accommodate variety-based stylistic and linguistic differences in order to achieve better real-world performance.}, booktitle = {The First International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security}, author = {Francis, Emilie}, year = {2024}, publisher = {NLPAICS’2024}, address = {Lancaster, U.K.}, pages = {86--96}, } @inProceedings{lindahl-2024-disagreement-341074, title = {Disagreement in Argumentation Annotation}, abstract = {Disagreement, perspective or error? There is a growing discussion against the idea of a unified ground truth in annotated data, as well as the usefulness of such a ground truth and resulting gold standard. In data perspectivism, this issue is exemplified with tasks such as hate speech or sentiment classification in which annotators’ different perspectives are important to include. In this paper we turn to argumentation, a related field which has had less focus from this point of view. Argumentation is difficult to annotate for several reasons, from the more practical parts of deciding where the argumentation begins and ends to questions of how argumentation is defined and what it consists of.
Learning more about disagreement is therefore important in order to improve argument annotation and to better utilize argument annotated data. Because of this, we examine disagreement in two corpora annotated with argumentation both manually and computationally. We find that disagreement is often not because of annotation errors or mistakes but due to the possibility of multiple possible interpretations. More specifically, these interpretations can be over boundaries, label or existence of argumentation. These results emphasize the need for more thorough analysis of disagreement in data, outside of the more common inter-annotator agreement measures.}, booktitle = {3rd Workshop on Perspectivist Approaches to NLP, NLPerspectives 2024 at LREC-COLING 2024 - Workshop Proceedings}, author = {Lindahl, Anna}, year = {2024}, ISBN = {9782493814234}, }