Hoppa till huvudinnehåll

BibTeX

@inProceedings{Adesam-Yvonne2021-304973,
	title        = {Part-of-speech tagging of Swedish texts in the neural era},
	booktitle    = {Proceedings of the 23rd Nordic Conference on Computational Linguistics, NoDaLiDa, May 31–2 June, 2021, Reykjavik, Iceland (online)},
	author       = {Adesam, Yvonne and Berdicevskis, Aleksandrs},
	editor       = {Dobnik, Simon and Øvrelid, Lilja},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
}

@article{Ehret-Katharina2021-304914,
	title        = {Meaning and Measures: Interpreting and Evaluating Complexity Metrics},
	journal      = {Frontiers in Communication},
	author       = {Ehret, Katharina and Blumenthal-Dramé, Alice and Bentz, Christian and Berdicevskis, Aleksandrs},
	year         = {2021},
	volume       = {6},
}

@article{Basirat-Ali2021-302492,
  author   = {Basirat, Ali and Allassonnière-Tang, Marc and Berdicevskis, Aleksandrs},
  title    = {An empirical study on the contribution of formal and semantic features to the grammatical gender of nouns},
  journal  = {Linguistics Vanguard},
  year     = {2021},
  volume   = {7},
  number   = {1},
  abstract = {This study conducts an experimental evaluation of two hypotheses about the contributions of formal and semantic features to the grammatical gender assignment of nouns. One of the hypotheses (Corbett and Fraser 2000) claims that semantic features dominate formal ones. The other hypothesis, formulated within the optimal gender assignment theory (Rice 2006), states that form and semantics contribute equally. Both hypotheses claim that the combination of formal and semantic features yields the most accurate gender identification. In this paper, we operationalize and test these hypotheses by trying to predict grammatical gender using only character-based embeddings (that capture only formal features), only context-based embeddings (that capture only semantic features) and the combination of both. We performed the experiment using data from three languages with different gender systems (French, German and Russian). Formal features are a significantly better predictor of gender than semantic ones, and the difference in prediction accuracy is very large. Overall, formal features are also significantly better than the combination of form and semantics, but the difference is very small and the results for this comparison are not entirely consistent across languages.},
}

@book{Berdicevskis-Aleksandrs2021-311612,
  author    = {Berdicevskis, Aleksandrs and Piperski, Alexander},
  title     = {Tri skljanki popoludni i drugie zadachi po lingvistike},
  year      = {2021},
  publisher = {Alpina Non-Fiction},
  address   = {Moskva},
  isbn      = {978-5-00139-130-2},
}

@inProceedings{Berdicevskis-Aleksandrs2021-311655,
	title        = {Successes and failures of Menzerath’s law at the syntactic level},
	booktitle    = {Proceedings of the Second Workshop on Quantitative Syntax (Quasy, SyntaxFest 2021), 21–25 March, 2022, Sofia, Bulgaria},
	author       = {Berdicevskis, Aleksandrs},
	editor       = {Čech, Radek and Chen, Xinying},
	year         = {2021},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {978-1-955917-15-5},
}

@inProceedings{Volodina-Elena2021-311725,
	title        = {DaLAJ - a dataset for linguistic acceptability judgments for Swedish},
	abstract     = {We present DaLAJ 1.0, a Dataset for Linguistic Acceptability Judgments for Swedish, comprising 9 596 sentences in its first version. DaLAJ is based on the SweLL second language learner data (Volodina et al., 2019), consisting of essays at different levels of proficiency. To make sure the dataset can be freely available despite the GDPR regulations, we have sentence-scrambled learner essays and removed part of the metadata about learners, keeping for each sentence only information about the mother tongue and the level of the course where the essay has been written. We use the normalized version of learner language as the basis for DaLAJ sentences, and keep only one error per sentence. We repeat the same sentence for each individual correction tag used in the sentence. For DaLAJ 1.0 four error categories of 35 available in SweLL are used, all connected to lexical or word-building choices. The dataset is included in the SwedishGlue benchmark. Below, we describe the format of the dataset, our insights and motivation for the chosen approach to data sharing.},
	booktitle    = {Proceedings of the 10th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2021), Online},
	author       = {Volodina, Elena and Ali Mohammed, Yousuf and Klezl, Julia},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-625-4},
}

@inProceedings{Volodina-Elena2021-311724,
	title        = {CoDeRooMor: A new dataset for non-inflectional morphology studies of Swedish},
	abstract     = {The paper introduces a new resource, CoDeRooMor, for studying the morphology of modern Swedish word formation. The approximately 16.000 lexical items in the resource have been manually segmented into word-formation morphemes, and labeled for their categories, such as prefixes, suffixes, roots, etc. Word-formation mechanisms, such as derivation and compounding have been associated with each item on the list. The article describes the selection of items for manual annotation and the principles of annotation, reports on the reliability of the manual annotation, and presents tools, resources and some first statistics. Given the “gold” nature of the resource, it is possible to use it for empirical studies as well as to develop linguistically-aware algorithms for morpheme segmentation and labeling (cf statistical subword approach). The resource is freely available through Språkbanken-Text.},
	booktitle    = {23rd Nordic Conference on Computational Linguistics (NoDaLiDa) Proceedings, May 31–2 June, 2021, Reykjavik, Iceland Online},
	author       = {Volodina, Elena and Ali Mohammed, Yousuf and Lindström Tiedemann, Therese},
	editor       = {Dobnik, Simon and Øvrelid, Lilja},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
}

@inProceedings{Edlund-Jens2022-311480,
	title        = {A Multimodal Digital Humanities Study of Terrorism in Swedish Politics: An Interdisciplinary Mixed Methods Project on the Configuration of Terrorism in Parliamentary Debates, Legislation, and Policy Networks 1968–2018},
	abstract     = {This paper presents the design of one of Sweden’s largest digital humanities projects, SweTerror, that through an interdisciplinary multi-modal methodological approach develops an extensive speech-to-text digital HSS resource. SweTerror makes a major contribution to the study of terrorism in Sweden through a comprehensive mixed methods study of the political discourse on terrorism since the late 1960s. Drawing on artificial intelligence in the form of state-of-the-art language and speech technology, it systematically analyses all forms of relevant parliamentary utterances. It explores and curates an exhaustive but understudied multi-modal collection of primary sources of central relevance to Swedish democracy: the audio recordings of the Swedish Parliament’s debates. The project studies the framing of terrorism both as policy discourse and enacted politics, examining semantic and emotive components of the parliamentary discourse on terrorism as well as major actors and social networks involved. It covers political responses to a range of terrorism-related issues as well as factors influencing policy-makers’ engagement, including political affiliations and gender. SweTerror also develops an online research portal, featuring the complete research material and searchable audio made readily accessible for further exploration. Long-term, the project establishes a model for combining extraction technologies (speech recognition and analysis) for audiovisual parliamentary data with text mining and HSS interpretive methods and the portal is designed to serve as a prototype for other similar projects.},
	booktitle    = {Intelligent Systems and Applications. Proceedings of the 2021 Intelligent Systems Conference, September 2–3, 2021},
	author       = {Edlund, Jens and Brodén, Daniel and Fridlund, Mats and Lindhé, Cecilia and Olsson, Leif-Jöran and Ängsal, Magnus Pettersson and Öhberg, Patrik},
	editor       = {Arai, K.},
	year         = {2022},
	publisher    = {Springer},
	address      = {Cham},
	ISBN         = {978-3-030-82195-1},
}

@inProceedings{Virk-Shafqat2021-306964,
	title        = {A Data-Driven Semi-Automatic Framenet Development Methodology},
	abstract     = {FrameNet is a lexical semantic resource based on the linguistic theory of frame semantics. A number of framenet development strategies have been reported previously and all of them involve exploration of corpora and a fair amount of manual work. Despite previous efforts, there does not exist a well-thought-out automatic/semi-automatic methodology for frame construction. In this paper we propose a data-driven methodology for identification and semi-automatic construction of frames. As a proof of concept, we report on our initial attempts to build a wider-scale framenet for the legal domain (LawFN) using the proposed methodology. The constructed frames are stored in a lexical database and together with the annotated example sentences they have been made available through a web interface.},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
	author       = {Virk, Shafqat and Dannélls, Dana and Borin, Lars and Forsberg, Markus},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	ISBN         = {978-954-452-072-4},
}

@incollection{Dannélls-Dana2021-310041,
	title        = {Swedish FrameNet},
	abstract     = {This chapter describes the development of Swedish FrameNet. A new framenet project often follows one of two methodological approaches: (1) extension, through translation of a different-language – often English – framenet into the target language, and (2) merging, where the resource is built from scratch in the target language. Both approaches have their pros and cons, which have been extensively discussed in the literature. Swedish FrameNet is mainly developed through the extension approach, although balanced with the merging approach. Drawing on the two approaches simultaneously, we describe how integrated language resources and tools have been exploited to create and develop Swedish FrameNet: how it was constructed, what it contains, and the basic assumptions underlying the annotation of its contents.},
	booktitle    = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
	author       = {Dannélls, Dana and Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Toporowska Gronostaj, Maria},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam / Philadelphia},
	ISBN         = {978-90-272-5848-9},
	pages        = {37--66},
}

@incollection{Lindén-Krister2021-311386,
	title        = {A multilingual net of lexical resources},
	booktitle    = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
	author       = {Lindén, Krister and Niemi, Jyrki and Borin, Lars and Forsberg, Markus and Pedersen, Bolette S. and Nimb, Sanni and Orav, Heili and Kahusk, Neeme and Vider, Kadri},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	year         = {2021},
	publisher    = {John Benjamins},
	address      = {Amsterdam},
	ISBN         = {9789027209900},
	pages        = {123--137},
}

@incollection{Borin-Lars2021-311385,
	title        = {Swedish FrameNet++ – lexical samsara},
	booktitle    = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
	author       = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart and Zechner, Niklas},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	year         = {2021},
	publisher    = {John Benjamins},
	address      = {Amsterdam},
	ISBN         = {9789027209900},
	pages        = {69--95},
}

@misc{Ljunglöf-Peter2021-306645,
	title        = {Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25-27 November 2020},
	abstract     = {Selected extended papers from the Eighth Swedish Language Technology Conference (SLTC-2020) which was held between 25-27 November 2020 in Gothenburg and online.},
	author       = {Ljunglöf, Peter and Dobnik, Simon and Johansson, Richard},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-031-3},
}

@incollection{Lange-Herbert2021-305146,
  author    = {Lange, Herbert and Ljunglöf, Peter},
  title     = {Learning Domain-Specific Grammars from a Small Number of Examples},
  booktitle = {Natural Language Processing in Artificial Intelligence—NLPinAI 2020},
  year      = {2021},
  publisher = {Springer International Publishing},
  isbn      = {978-3-030-63787-3},
  abstract  = {In this chapter we investigate the problem of grammar learning from a perspective that diverges from previous approaches. These prevailing approaches to learning grammars usually attempt to infer a grammar directly from example corpora without any additional information. This either requires a large training set or suffers from bad accuracy. We instead view learning grammars as a problem of grammar restriction or subgrammar extraction. We start from a large-scale grammar (called a resource grammar) and a small number of example sentences, and find a subgrammar that still covers all the examples. To accomplish this, we formulate the problem as a constraint satisfaction problem, and use a constraint solver to find the optimal grammar. We created experiments with English, Finnish, German, Swedish, and Spanish, which show that 10–20 examples are often sufficient to learn an interesting grammar for a specific application. We also present two extensions to this basic method: we include negative examples and allow rules to be merged. The resulting grammars can more precisely cover specific linguistic phenomena. Our method, together with the extensions, can be used to provide a grammar learning system for specific applications. This system is easy-to-use, human-centric, and can be used by non-syntacticians. Based on this grammar learning method, we can build applications for computer-assisted language learning and interlingual communication, which rely heavily on the knowledge of language and domain experts who often lack the competence to develop required grammars themselves.},
}

@inProceedings{Hansson-Saga2021-305126,
	title        = {The Swedish Winogender Dataset},
	abstract     = {We introduce the SweWinogender test set, a diagnostic dataset to measure gender bias in coreference resolution. It is modelled after the English Winogender benchmark, and is released with reference statistics on the distribution of men and women between occupations and the association between gender and occupation in modern corpus material. The paper discusses the design and creation of the dataset, and presents a small investigation of the supplementary statistics.},
	booktitle    = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31 - June 2, 2021, Reykjavik, Iceland (online)},
	author       = {Hansson, Saga and Mavromatakis, Konstantinos and Adesam, Yvonne and Bouma, Gerlof and Dannélls, Dana},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
}

@inProceedings{Dannélls-Dana2021-305700,
	title        = {A Two-OCR Engine Method for Digitized Swedish Newspapers},
	abstract     = {In this paper we present a two-OCR engine method that was developed at Kungliga biblioteket (KB), the National Library of Sweden, for improving the correctness of the OCR for mass digitization of Swedish newspapers. To evaluate the method a reference material spanning the years 1818–2018 was prepared and manually transcribed. A quantitative evaluation was then performed against the material. In this first evaluation we experimented with word lists for different time periods. The results show that even though there was no significant overall improvement of the OCR results, some combinations of word lists are successful for certain periods and should therefore be explored further.},
	booktitle    = {Selected Papers from the CLARIN Annual Conference 2020, Linköping Electronic Conference Proceedings 180},
	author       = {Dannélls, Dana and Björk, Lars and Dirdal, Ove and Johansson, Torsten},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-609-4},
}

@inProceedings{Virk-Shafqat2021-306962,
	title        = {A Novel Machine Learning Based Approach for Post-OCR Error Detection},
	abstract     = {Post processing is the most conventional approach for correcting errors that are caused by Optical Character Recognition (OCR) systems. Two steps are usually taken to correct OCR errors: detection and corrections. For the first task, supervised machine learning methods have shown state-of-the-art performances. Previously proposed approaches have focused most prominently on combining lexical, contextual and statistical features for detecting errors. In this study, we report a novel system to error detection which is based merely on the n-gram counts of a candidate token. In addition to being simple and computationally less expensive, our proposed system beats previous systems reported in the ICDAR2019 competition on OCR-error detection with notable margins. We achieved state-of-the-art F1-scores for eight out of the ten involved European languages. The maximum improvement is for Spanish which improved from 0.69 to 0.90, and the minimum for Polish from 0.82 to 0.84.},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
	author       = {Virk, Shafqat and Dannélls, Dana and Muhammad, Azam Sheikh},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	ISBN         = {978-954-452-072-4},
}

@inProceedings{Skelbye-Molly2021-306957,
	title        = {OCR Processing of Swedish Historical Newspapers Using Deep Hybrid CNN–LSTM Networks},
	abstract     = {Deep CNN–LSTM hybrid neural networks have proven to improve the accuracy of Optical Character Recognition (OCR) models for different languages. In this paper we examine to what extent these networks improve the OCR accuracy rates on Swedish historical newspapers. By experimenting with the open source OCR engine Calamari, we are able to show that mixed deep CNN–LSTM hybrid models outperform previous models on the task of character recognition of Swedish historical newspapers spanning 1818–1848. We achieved an average character accuracy rate (CAR) of 97.43% which is a new state–of–the–art result on 19th century Swedish newspaper text. Our data, code and models are released under CC BY licence.},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
	author       = {Skelbye, Molly and Dannélls, Dana},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	ISBN         = {978-954-452-072-4},
}

@incollection{Dannélls-Dana2021-310047,
	title        = {Computational representation of FrameNet for multilingual natural language generation},
	abstract     = {Multilingual natural language generation, the process of producing written or spoken utterances in parallel languages from either structured or unstructured representations requires large amounts of syntactic and semantic information to generate an expression that is tailored to the target audience. This information is offered by FrameNet-like resources, which have been developed for a number of languages. In this chapter, we present a computational FrameNet grammar resource for multilingual natural language generation. We compare between English and Swedish framenets to illustrate how these can be unified under a shared computational representation using Grammatical Framework. We demonstrate how the grammar was exploited in two practical multilingual natural language generation applications to facilitate tourist communication and empower museum users with coherent artwork descriptions.},
	booktitle    = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
	author       = {Dannélls, Dana and Grūzītis, Normunds},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam / Philadelphia},
	ISBN         = {978-90-272-5848-9},
	pages        = {281--301},
}

@book{Dannélls-Dana2021-310036,
	title        = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
	abstract     = {Large computational lexicons are central NLP resources. Swedish FrameNet++ aims to be a versatile full-scale lexical resource for NLP containing many kinds of linguistic information. Although focused on Swedish, this ongoing effort, which includes building a new Swedish framenet and recycling existing lexicons, has offered valuable insights into general aspects of lexical-resource building for NLP, which are discussed in this book: computational and linguistic problems of lexical semantics and lexical typology, the nature of lexical items (words and multiword expressions), achieving interoperability among heterogeneous lexical content, NLP methods for extending and interlinking existing lexicons, and deploying the new resource in practical NLP applications. This book is targeted at everyone with an interest in lexicography, computational lexicography, lexical typology, lexical semantics, linguistics, computational linguistics and related fields. We believe it should be of particular interest to those who are or have been involved in language resource creation, development and evaluation.},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam, Philadelphia},
	ISBN         = {9789027209900},
}

@inProceedings{Dannélls-Dana2021-310123,
	title        = {A Supervised Machine Learning Approach for Post-OCR Error Detection for Historical Text},
	abstract     = {Training machine learning models with high accuracy requires careful feature engineering, which involves finding the best feature combinations and extracting their values from the data. The task becomes extremely laborious for specific problems such as post Optical Character Recognition (OCR) error detection because of the diversity of errors in the data. In this paper we present a machine learning approach which exploits character n-gram statistics as the only feature for the OCR error detection task. Our method achieves a significant improvement over the baseline reaching state-of-the-art results of 91% and 89% F1 measure on English and Swedish datasets respectively. We report various experiments to select the appropriate machine learning algorithm and to compare our approach to previously reported traditional approaches.},
	booktitle    = {Linköping Electronic Press Workshop and Conference Collection. Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25-27 November, 2020},
	author       = {Dannélls, Dana and Virk, Shafqat},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
}

@incollection{Borin-Lars2021-310200,
	title        = {Introduction: Swedish FrameNet++},
	abstract     = {The Swedish FrameNet++ was designed to be several things. As a digital artifact, it is an integrated panchronic lexical macroresource, primarily for Swedish, but including several other languages, intended as a basic infrastructural component in Swedish language technology research and for developing natural language processing applications. As an activity, it is a long-term R&D initiative, initially aimed at bringing about this macroresource, and now at maintaining and extending it, at promoting its use in language technology research and application development, as well as ensuring that the results of this research and development in their turn are incorporated in the macroresource. As a product of research, it reflects both computational and linguistic approaches to lexicology, lexical semantics, and lexical typology.},
	booktitle    = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
	author       = {Borin, Lars and Dannélls, Dana and Friberg Heppin, Karin},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam / Philadelphia},
	ISBN         = {978-90-272-5848-9},
	pages        = {3--36},
}

@inproceedings{Hammarlin-Mia-Marie2021-307227,
  author    = {Hammarlin, Mia-Marie and Miegel, Fredrik and Borin, Lars and Kokkinakis, Dimitrios and Jaakonaho, Anna},
  title     = {Vaccine hesitancy – trust and distrust in medical expertise and authorities},
  booktitle = {8th European Communication Conference (ECREA)},
  year      = {2021},
  abstract  = {The increase of vaccine hesitancy is singled out by WHO as one of the ten most important and urgent threats to global health (https://www.who.int/emergencies/ten-threats-to-global-health-in-2019). Diseases like measles are returning in different parts of Europe, partly as a result of the activities of the anti-vaccination movement. The herd immunity in most Western countries is high but even a small decrease in vaccination would have immediate negative effects for the population. Sweden offers a perfect site for future anti-vaccination studies due to its high vaccination covering. A decline in the numbers of children vaccinated has had immediate effects. For example, the incident rate in the country of pertussis rose from 700 cases to 3,200 cases per 100,000 children in 4 years due to a rather small decrease in vaccinations. This constitutes a strong argument for the civic importance of the case.

The aim of this presentation is to introduce a new 4-year research project (2020–2023), independently financed by the Bank of Sweden Foundation (Riksbankens jubileumsfond), with the goal to investigate the role and importance of rumouring for the vaccination skepticism growing on the internet, and how it can be understood as an expression of civic engagement in the present digital times entailing crucial transformations for everyday civic culture. Theoretically, the project builds upon, and develop, media researcher Dahlgren’s work on civic culture and Kitta’s studies of the anti-vaccination movement. The overarching research question is: How have the everyday practice and experience of, and the conditions for, rumours been shaped and reshaped in the digital age, and what do these processes mean for civic engagement and participation? The project will offer an understanding of how everyday interaction on the internet has a powerful impact on the spreading of false information, which in the long run may challenge democracy. On a more concrete level the project will answer the following questions in relation to the case of vaccine skepticism: How are rumours about alleged risks and dangers of vaccination propagated and established on the internet? Are there specific patterns and correlations connecting topics, assumptions, myths, argumentation schemes, popularity and time? What do everyday practices, on- and offline, of rumouring mean for its adherents’ civic engagement in the anti-vaccination movement? Which are the civic implications of the spreading and circulation of vaccination hostile rumours on individual citizens and society at large?},
}

@book{Tahmasebi-Nina2021-306968,
  author    = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
  title     = {Computational approaches to semantic change},
  year      = {2021},
  publisher = {Language Science Press},
  address   = {Berlin},
  isbn      = {978-3-98554-008-2},
  abstract  = {Semantic change — how the meanings of words change over time — has preoccupied scholars since well before modern linguistics emerged in the late 19th and early 20th century, ushering in a new methodological turn in the study of language change. Compared to changes in sound and grammar, semantic change is the least  understood. Ever since, the study of semantic change has progressed steadily, accumulating a vast store of knowledge for over a century, encompassing many languages and language families.

Historical linguists also early on realized the potential of computers as research tools, with papers at the very first international conferences in computational linguistics in the 1960s. Such computational studies still tended to be small-scale, method-oriented, and qualitative. However, recent years have witnessed a sea-change in this regard. Big-data empirical quantitative investigations are now coming to the forefront, enabled by enormous advances in storage capability and processing power. Diachronic corpora have grown beyond imagination, defying exploration by traditional manual qualitative methods, and language technology has become increasingly data-driven and semantics-oriented. These developments present a golden opportunity for the empirical study of semantic change over both long and short time spans.

A major challenge presently is to integrate the hard-earned  knowledge and expertise of traditional historical linguistics with  cutting-edge methodology explored primarily in computational linguistics.

The idea for the present volume came out of a concrete response to this challenge.  The 1st International Workshop on Computational Approaches to Historical Language Change (LChange'19), at ACL 2019, brought together scholars from both fields.

This volume offers a survey of this exciting new direction in the study of semantic change, a discussion of the many remaining challenges that we face in pursuing it, and considerably updated and extended versions of a selection of the contributions to the LChange'19 workshop, addressing both more theoretical problems —  e.g., discovery of "laws of semantic change" — and practical applications, such as information retrieval in longitudinal text archives.},
}

@incollection{Jatowt-Adam2021-307061,
	title        = {Computational approaches to lexical semantic change: Visualization systems and novel applications},
	abstract     = {The purpose of this chapter is to survey visualization and user interface solutions for understanding lexical semantic change as well as to survey a number of applications of techniques developed in computational analysis of lexical semantic change. We first overview approaches aiming to develop systems that support understanding semantic change in an interactive and visual way. It is generally accepted that computational techniques developed for analyzing and uncovering semantic change are beneficial to linguists, historians, sociologists, and practitioners in numerous related fields, especially within the humanities. However, quite a few non-professional users are equally interested in the histories of words. Developing interactive, visual, engaging, and easy-to-understand systems can help them to acquire relevant knowledge.

Second, we believe that other fields could benefit from the research outcomes of computational approaches to lexical semantic change. In general, properly representing the meaning of terms used in the past should be important for a range of natural language processing, information retrieval and other tasks that operate on old texts. In the latter part of the chapter, we then focus on current and potential applications related to computer and information science with the underlying question: “How can modeling semantic change benefit wider downstream applications in these disciplines?”},
	booktitle    = {Computational approaches to semantic change},
	editor       = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
	author       = {Jatowt, Adam and Tahmasebi, Nina and Borin, Lars},
	year         = {2021},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-96110-312-6},
	pages        = {311--339},
}

@incollection{Tahmasebi-Nina2021-307058,
	title        = {Survey of computational approaches to lexical semantic change detection},
	abstract     = {Our languages are in constant flux driven by external factors such as cultural, societal and technological changes, as well as by only partially understood internal motivations. Words acquire new meanings and lose old senses, new words are coined or borrowed from other languages and obsolete words slide into obscurity. Understanding the characteristics of shifts in the meaning and in the use of words
is useful for those who work with the content of historical texts, the interested general public, but also in and of itself.

The findings from automatic lexical semantic change detection and the models of diachronic conceptual change are also currently being incorporated in approaches for measuring document across-time similarity, information retrieval from long-term document archives, the design of OCR algorithms, and so on. In recent years we have seen a surge in interest in the academic community in computational methods and tools supporting inquiry into diachronic conceptual change and lexical replacement. This article provides a comprehensive survey of recent computational
techniques to tackle both.},
	booktitle    = {Computational approaches to semantic change},
	editor       = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
	author       = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam},
	year         = {2021},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-96110-312-6},
	pages        = {1--91},
}

@article{Borin-Lars2021-309082,
	title        = {A bird’s-eye view on {South Asian} languages through {LSI}: Areal or genetic relationships?},
	abstract     = {We present initial exploratory work on illuminating the long-standing question of areal versus genealogical connections in South Asia using computational data visualization tools. With respect to genealogy, we focus on the subclassification of Indo-Aryan, the most ubiquitous language family of South Asia. The intent here is methodological: we explore computational methods for visualizing large datasets of linguistic features, in our case 63 features from 200 languages representing four language families of South Asia, coming out of a digitized version of Grierson’s Linguistic Survey of India. To this dataset we apply phylogenetic software originally developed in the context of computational biology for clustering the languages and displaying the clusters in the form of networks. We further explore multiple correspondence analysis as a way of illustrating how linguistic feature bundles correlate with extrinsically defined groupings of languages (genealogical and geographical). Finally, map visualization of combinations of linguistic features and language genealogy is suggested as an aid in distinguishing genealogical and areal features. On the whole, our results are in line with the conclusions of earlier studies: Areality and genealogy are strongly intertwined in South Asia, the traditional lower-level subclassification of Indo-Aryan is largely upheld, and there is a clearly discernible areal east–west divide cutting across language families.},
	journal      = {Journal of South Asian Languages and Linguistics},
	author       = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard},
	year         = {2021},
	volume       = {7},
	number       = {2},
	pages        = {151--185},
}

@incollection{Adesam-Yvonne2021-310933,
	title        = {A lexical resource for computational historical linguistics},
	abstract     = {In this chapter we present the diachronic dimension of Swedish FrameNet++. We describe the historical lexical resources currently available for Swedish, linked to the Contemporary Swedish lexicon Saldo. We present a case study of how interlinking the dictionaries simultaneously allows us to study lexical change. We also present a method of linking text words to lexicon entries, facilitating interactive exploration of historical texts. Diachronical language resources present both a high-variation challenge from a wider language technology perspective, and an interesting object of linguistic study. While a number of improvements of the parts of the diachronic lexical macroresource are still needed, this resource is invaluable for analysing and accessing historical texts, as well as for both synchronic historical and diachronic lexical studies.},
	booktitle    = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	author       = {Adesam, Yvonne and Andersson, Peter and Borin, Lars and Bouma, Gerlof},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam / Philadelphia},
	ISBN         = {978-90-272-5848-9},
	pages        = {98--121},
}

@incollection{Borin-Lars2021-311387,
	title        = {{Swedish} {FrameNet++} and comparative linguistics},
	booktitle    = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	author       = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard},
	year         = {2021},
	publisher    = {John Benjamins},
	address      = {Amsterdam},
	ISBN         = {9789027209900},
	pages        = {139--165},
}

@incollection{Borin-Lars2021-311388,
	title        = {Multiword expressions – a tough typological nut for {Swedish} {FrameNet++}},
	booktitle    = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	author       = {Borin, Lars},
	year         = {2021},
	publisher    = {John Benjamins},
	address      = {Amsterdam},
	ISBN         = {9789027209900},
	pages        = {221--259},
}

@book{Alfter-David2021-311727,
	title        = {Proceedings of the 10th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2021)},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language
Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural
Language Processing and Speech Technologies in CALL systems and exploring the theoretical and
methodological issues arising in this connection. The latter includes, among others, the integration of
insights from Second Language Acquisition (SLA) research, and the promotion of “Computational
SLA” through setting up Second Language research infrastructures.},
	author       = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Graën, Johannes and Borin, Lars},
	year         = {2021},
	series       = {Linköping Electronic Conference Proceedings},
	number       = {177},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-625-4},
}

@article{Zanetti-Arianna2021-311723,
	title        = {Automatic Generation of Exercises for Second Language Learning from Parallel Corpus Data},
	abstract     = {Creating language learning exercises is a time-consuming task and made-up sample sentences frequently lack authenticity. Authentic samples can be obtained from corpora, but it is necessary to identify material that is suitable for language learners. Parallel corpora of written text consist of translated material. Comparing the text in one language with its translation into another (known) language makes the structure accessible to the learner. However, the correspondence of words between the two languages is more important. By carefully selecting well-suited parallel sentences, a learner can explore the target language in a guided way. We present an approach to generate a novel type of language learning exercise from a large parallel corpus based on movie subtitles. The size of the corpus allows for defining selective criteria, favoring precision over recall. It is a non-trivial task to give reliable feedback to automatically generated exercises. ICALL literature often deals with fill-in-the-blanks exercises or multiple-choice questions, which allow for very limited answer options. Our proposed exercise is a special case of sentence reconstruction on bilingual sentence pairs. It combines two elements which have proven to be effective for language learning: a gamified approach, to awaken the students’ competitive desire, and the identification of syntactic structures and vocabulary use, to improve language sensitivity. This article presents the methods used to select example pairs and to implement a prototype.},
	journal      = {International Journal of TESOL Studies},
	author       = {Zanetti, Arianna and Volodina, Elena and Graën, Johannes},
	year         = {2021},
	volume       = {3},
	number       = {2},
	pages        = {55--71},
}

@article{Alfter-David2021-311721,
	title        = {Crowdsourcing Relative Rankings of Multi-Word Expressions: Experts versus Non-Experts},
	abstract     = {In this study we investigate to which degree experts and non-experts agree on questions of difficulty in a crowdsourcing experiment. We ask non-experts (second language learners of Swedish) and two groups of experts (teachers of Swedish as a second/foreign language and CEFR experts) to rank multi-word expressions in a crowdsourcing experiment. We find that the resulting rankings by all the three tested groups correlate to a very high degree, which suggests that judgments produced in a comparative setting are not influenced by professional insights into Swedish as a second language.},
	journal      = {Northern European Journal of Language Technology (NEJLT)},
	author       = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2021},
	volume       = {7},
	number       = {1},
}

@techreport{Megyesi-Beáta2021-311730,
	title        = {{SweLL} pseudonymization guidelines},
	abstract     = {The current document is a part of the SweLL guidelines series consisting of four parts which aim to
report how we have worked on the material and which decisions we have made. Guidelines are
available for each step in the manual annotation process, including:
• Transcription guidelines
• Pseudonymization guidelines
• Normalization guidelines
• Correction annotation guidelines
We specifically described all processes in English to make sure our principles and experience can
be of help to people working on other learner infrastructure projects independent of the language.},
	author       = {Megyesi, Beáta and Rudebeck, Lisa and Volodina, Elena},
	year         = {2021},
	institution  = {Institutionen för svenska språket, Göteborgs universitet},
	address      = {Göteborg},
	ISSN         = {1401-5919},
}

@techreport{Volodina-Elena2021-311729,
	title        = {{SweLL} transcription guidelines, {L2} essays},
	abstract     = {The current document is a part of the SweLL guidelines series consisting of four parts which aim to
report how we have worked on the material and which decisions we have made. Guidelines are
available for each step in the manual annotation process, including:
• Transcription guidelines
• Pseudonymization guidelines
• Normalization guidelines
• Correction annotation guidelines
We specifically described all processes in English to make sure our principles and experience can
be of help to people working on other learner infrastructure projects independent of the language.},
	author       = {Volodina, Elena and Megyesi, Beáta},
	year         = {2021},
	institution  = {Institutionen för svenska språket, Göteborgs universitet},
	address      = {Göteborg},
}

@incollection{Prentice-Julia2021-310517,
	title        = {Language learning and teaching with {Swedish} {FrameNet++}: Two examples},
	booktitle    = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	author       = {Prentice, Julia and Håkansson, Camilla and Lindström Tiedemann, Therese and Pilán, Ildikó and Volodina, Elena},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam, Philadelphia},
	ISBN         = {9789027258489},
	pages        = {304--329},
}

@article{Hengchen-Simon2021-301262,
	title        = {A Collection of {Swedish} Diachronic Word Embedding Models Trained on Historical Newspaper Data},
	abstract     = {This paper describes the creation of several word embedding models based on a large collection of diachronic Swedish newspaper material available through Språkbanken Text, the Swedish language bank. This data was produced in the context of Språkbanken Text’s continued mission to collaborate with humanities and natural language processing (NLP) researchers and to provide freely available language resources, for the development of state-of-the-art NLP methods and tools.},
	journal      = {Journal of Open Humanities Data},
	author       = {Hengchen, Simon and Tahmasebi, Nina},
	year         = {2021},
	volume       = {7},
	number       = {2},
	pages        = {1--7},
}

@inProceedings{Hengchen-Simon2021-305157,
	title        = {{SuperSim}: a test set for word similarity and relatedness in {Swedish}},
	abstract     = {Language models are notoriously difficult to evaluate.
We release SuperSim, a large-scale similarity and relatedness test set for Swedish built with expert human judgments. The test set is composed of 1,360 word-pairs independently judged for both relatedness and similarity by five annotators. We evaluate three different models (Word2Vec, fastText, and GloVe) trained on two separate Swedish datasets, namely the Swedish Gigaword corpus and a Swedish Wikipedia dump, to provide a baseline for future comparison.
We release the fully annotated test set, code, baseline models, and data.},
	booktitle    = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31-June 2 2021, Reykjavik, Iceland (online)},
	author       = {Hengchen, Simon and Tahmasebi, Nina},
	year         = {2021},
	series       = {Linköping Electronic Conference Proceedings},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
}

@incollection{Hengchen-Simon2021-306972,
	title        = {Challenges for computational lexical semantic change},
	abstract     = {The computational study of lexical semantic change (LSC) has taken off in the past few years and we are seeing increasing interest in the field, from both computational sciences and linguistics. Most of the research so far has focused on methods for modelling and detecting semantic change using large diachronic textual data, with the majority of the approaches employing neural embeddings. While methods that offer easy modelling of diachronic text are one of the main reasons for the spiking interest in LSC, neural models leave many aspects of the problem unsolved. The field has several open and complex challenges. In this chapter, we aim to describe the most important of these challenges and outline future directions.},
	booktitle    = {Computational approaches to semantic change},
	editor       = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
	author       = {Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik and Dubossarsky, Haim},
	year         = {2021},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-98554-008-2},
	pages        = {341--372},
}

@article{Antonsson-Malin2021-301490,
	title        = {Using a Discourse Task to Explore Semantic Ability in Persons With Cognitive Impairment},
	abstract     = {This paper uses a discourse task to explore aspects of semantic production in persons with various degree of cognitive impairment and healthy controls. The purpose of the study was to test if an in-depth semantic analysis of a cognitive-linguistic challenging discourse task could differentiate persons with a cognitive decline from those with a stable cognitive impairment. Both quantitative measures of semantic ability, using tests of oral lexical retrieval, and qualitative analysis of a narrative were used to detect semantic difficulties. Besides group comparisons a classification experiment was performed to investigate if the discourse features could be used to improve classification of the participants who had a stable cognitive impairment from those who had cognitively declined. In sum, both types of assessment methods captured difficulties between the groups, but tests of oral lexical retrieval most successfully differentiated between the cognitively stable and the cognitively declined group. Discourse features improved classification accuracy and the best combination of features discriminated between participants with a stable cognitive impairment and those who had cognitively declined with an area under the curve (AUC) of 0.93.},
	journal      = {Frontiers in Aging Neuroscience},
	author       = {Antonsson, Malin and Lundholm Fors, Kristina and Eckerström, Marie and Kokkinakis, Dimitrios},
	year         = {2021},
	volume       = {12},
}

@inProceedings{Virk-Shafqat2021-306966,
	title        = {A Deep Learning System for Automatic Extraction of Typological Linguistic Information from Descriptive Grammars},
	abstract     = {Linguistic typology is an area of linguistics concerned with analysis of and comparison between natural languages of the world based on their certain linguistic features. For that purpose, historically, the area has relied on manual extraction of linguistic feature values from textural descriptions of languages. This makes it a laborious and time expensive task and is also bound by human brain capacity. In this study, we present a deep learning system for the task of automatic extraction of linguistic features from textual descriptions of natural languages. First, textual descriptions are manually annotated with special structures called semantic frames. Those annotations are learned by a recurrent neural network, which is then used to annotate un-annotated text. Finally, the annotations are converted to linguistic feature values using a separate rule based module. Word embeddings, learned from general purpose text, are used as a major source of knowledge by the recurrent neural network.  We compare the proposed deep learning system to a previously reported machine learning based system for the same task, and the deep learning system wins in terms of F1 scores with a fair margin. Such a system is expected to be a useful contribution for the automatic curation of typological databases, which otherwise are manually developed.},
	booktitle    = {Proceedings of Recent Advances in Natural Language Processing, Sep 1–3, 2021},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	author       = {Virk, Shafqat and Foster, Daniel and Sheikh Muhammad, Azam and Saleem, Raheela},
	year         = {2021},
	publisher    = {Association for Computational Linguistics (ACL)},
	ISBN         = {978-954-452-072-4},
}

@inProceedings{Kokkinakis-Dimitrios2021-307200,
	title        = {Insights on a {Swedish} {Covid-19} corpus},
	abstract     = {The COVID-19 pandemic has had a serious impact on people all over the world, from mental and physical health to economic downturn to education and social relationships, while political decisions in many countries have had a profound impact on the lives of all people regardless of age. Many of these effects can be studied with statistical and qualitative data such as collected questionnaires and sickness absence rates. But large-scale studies require expertise in multiple domains and from many points of view. SpråkbankenText continuously collects text from various sources. In order to fill the gap in the lack of an available Swedish COVID-19-related dataset, we started to build a Swedish COVID-19 corpus (sv-COVID-19). Various tools for e.g. lexical, semantic or pragmatic/discourse analyses can be then applied in order to answer relevant questions on e.g. how people, on a larger scale than what can be obtained through qualitative studies, experienced their everyday life through the different phases of COVID-19 crisis, or how political decisions and their consequences are described and discussed.},
	booktitle    = {CLARIN Annual Conference (Virtual Event). 27 – 29 September 2021},
	editor       = {Monachini, Monica and Eskevich, Maria},
	author       = {Kokkinakis, Dimitrios},
	year         = {2021},
	pages        = {31--34},
}

@misc{Gagliardi-Gloria2021-307124,
	title        = {Editorial: Digital Linguistic Biomarkers: Beyond Paper and Pencil Tests},
	abstract     = {Over the last decades, a growing body of linguistic studies have been devoted to the clinical domain (Perkins 2011), while the amount of experimental linguistic research focusing on neuroscience and mental health has increased exponentially during the last few years.
Considering that many of the factors underlying cognitive and neuropsychiatric disorders may yield to late symptoms that are hard to foresee, it is often difficult to predict the existence of a presence or risk of a disease, as well as the disease’s trajectory. In this context, interdisciplinary approaches gain increasing popularity, and the analysis of complex behaviour – such as speech and language – emerges as a natural candidate to identify and analyse the extent to which a given neuropathology can impact the cognitive system at the very early stages. In this context, the development of cognitive evaluation and intervention tools focusing on linguistic biomarkers becomes a critical scientific arena both in and outside the clinic and laboratory (see Petrizzo & Popolo, 2020).

Recent international research has demonstrated that automated collected and analysed quantitative linguistic features, easily extractable from a patient’s verbal productions, can be very useful in separating people with various cognitive or mental impairment from healthy subjects, even at a very early stage (see Bedi et al., 2015), and even to predict the outcomes of clinical interventions (see Carrillo et al., 2018). In this line, machine learning-based language technology methods and tools based on artificial intelligence are particularly promising to address this task (Locke et al. 2021; Sigman et al., 2021). Indeed, subtle language disruptions can be employed as digital linguistic biomarkers, namely objective, quantifiable behavioural data that can be collected and measured by means of digital devices, allowing for a low-cost pathology detection, classification and monitoring. Compared to classical pen-and-paper neuropsychological tests, the use of these instruments shows many advantages – such as its non-intrusive and time-effective application – providing not only offline, but also online measures that serve as a proxy for cognitive processing and its underlying mechanisms.

The aim of the Research Topic Digital Linguistic Biomarkers: Beyond Paper and Pencil Tests is to provide a state-of-the-art overview of this multidisciplinary and constantly evolving area of research, bringing together contributions from different quarters of the cognitive sciences. The collection comprises one systematic review, six original research papers, and one opinion paper. The articles are based on empirical and theoretical research from several disciplines (i.e., linguistics, psychology, Artificial Intelligence), and they tackle a range of developmental and acquired disorders. Most probably, dementia assessment has been one of the most rapidly evolving domain of Natural Language Processing (NLP) application for medical science (Petti, Baker & Korhonen 2020), but this approach is spreading rapidly through the community, with encouraging results on both developmental and acquired pathologies, as shown in the current article collection (i.e., autism, developmental language disorder, attention-deficit hyperactivity disorder, Alzheimer’s disease and mild cognitive impairment, or Parkinson’s disease). Furthermore, this Research Topic covers a variety of test languages showing the degree of internationalization of the research on the analysis verbal productions (i.e., English, Italian, German, and Japanese).},
	author       = {Gagliardi, Gloria and Kokkinakis, Dimitrios and Dunabeitia, Jon Andoni},
	year         = {2021},
	volume       = {12},
	pages        = {752238},
}

@incollection{Johansson-Richard2021-310775,
	title        = {Semantic Role Labeling},
	booktitle    = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	author       = {Johansson, Richard and Friberg Heppin, Karin and Kokkinakis, Dimitrios},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam / Philadelphia},
	ISBN         = {978-90-272-5848-9},
	pages        = {264--280},
}

@book{Alfter-David2021-304548,
	title        = {Exploring natural language processing for single-word and multi-word lexical complexity from a second language learner perspective},
	abstract     = {In this thesis, we investigate how natural language processing (NLP) tools and techniques can be applied to vocabulary aimed at second language learners of Swedish in order to classify vocabulary items into different proficiency levels suitable for learners of different levels.

In the first part, we use feature-engineering to represent words as vectors and feed these vectors into machine learning algorithms in order to (1) learn CEFR labels from the input data and (2) predict the CEFR level of unseen words.
Our experiments corroborate the finding that feature-based classification models using 'traditional' machine learning still outperform deep learning architectures in the task of deciding how complex a word is.

In the second part, we use crowdsourcing as a technique to generate ranked lists of multi-word expressions using both experts and non-experts (i.e. language learners). Our experiment shows that non-expert and expert rankings are highly correlated, suggesting that non-expert intuition can be seen as on-par with expert knowledge, at least in the chosen experimental configuration.

The main practical output of this research comes in two forms: prototypes and resources. We have implemented various prototype applications for (1) the automatic prediction of words based on the feature-engineering machine learning method, (2) language learning applications using graded word lists, and (3) an annotation tool for the manual annotation of expressions across a variety of linguistic factors.},
	author       = {Alfter, David},
	year         = {2021},
	publisher    = {Göteborgs universitet},
	ISBN         = {978-91-87850-79-0},
}

@book{Sköldberg-Emma2021-305730,
	author       = {Sköldberg, Emma and Blensenius, Kristian and Hannesdottir, Anna Helga and Holmer, Louise and Landqvist, Hans and Martens, Monica and Petersson, Stellan and Hult, Ann-Kristin and Ribeck, Judy Carola and Wenner, Lena and Wang, Petrus},
	title        = {Svensk ordbok utgiven av Svenska Akademien som app 2021 (för iOS)},
	year         = {2021},
	publisher    = {Svenska Akademien},
	address      = {Stockholm},
	abstract     = {Andra upplagan av Svensk ordbok utgiven av Svenska Akademien har utarbetats av en forskargrupp vid Institutionen för svenska språket, Göteborgs universitet. Den är utgiven av Svenska Akademien och publicerad i ordboksportalen svenska.se. Appen är utgiven av Svenska Akademien och utvecklad av Petrus Wang i samarbete med Institutionen för svenska språket, Göteborgs universitet.},
}

@book{Sköldberg-Emma2021-305729,
	title        = {Svensk ordbok utgiven av Svenska Akademien som app 2021 (för Android)},
	abstract     = {Andra upplagan av Svensk ordbok utgiven av Svenska Akademien har utarbetats av en forskargrupp vid Institutionen för svenska språket, Göteborgs universitet. Den är utgiven av Svenska Akademien och publicerad i ordboksportalen svenska.se. Appen är utgiven av Svenska Akademien och utvecklad av Petrus Wang i samarbete med Institutionen för svenska språket, Göteborgs universitet.},
	author       = {Sköldberg, Emma and Blensenius, Kristian and Hannesdottir, Anna Helga and Holmer, Louise and Landqvist, Hans and Martens, Monica and Petersson, Stellan and Hult, Ann-Kristin and Ribeck, Judy Carola and Wenner, Lena and Wang, Petrus},
	year         = {2021},
	publisher    = {Svenska Akademien},
	address      = {Stockholm},
}

@book{Sköldberg-Emma2021-305242,
	author       = {Sköldberg, Emma and Blensenius, Kristian and Hannesdottir, Anna Helga and Holmer, Louise and Landqvist, Hans and Martens, Monica and Petersson, Stellan and Hult, Ann-Kristin and Ribeck, Judy Carola and Wenner, Lena and Wang, Petrus and Bäckerud, Erik},
	title        = {Svensk ordbok utgiven av Svenska Akademien, andra upplagan},
	year         = {2021},
	publisher    = {Svenska Akademien},
	address      = {Stockholm},
	abstract     = {Andra upplagan av Svensk ordbok utgiven av Svenska Akademien har utarbetats av en forskargrupp vid Institutionen för svenska språket, Göteborgs universitet. Den är utgiven av Svenska Akademien och publicerad i ordboksportalen svenska.se},
}

@incollection{Bouma-Gerlof2021-311029,
	title        = {Hulpwerkwoorden stapelen – toen en nu},
	booktitle    = {Wat gebeurt er in het Nederlands? : over taal, frequentie en variatie},
	editor       = {van der Sijs, Nicoline and Fonteyn, Lauren and van der Meulen, Marten},
	author       = {Bouma, Gerlof and Coussé, Evie},
	year         = {2021},
	publisher    = {Sterck \& de Vreese},
	address      = {Gorredijk},
	ISBN         = {9789056158033},
	pages        = {36--40},
}

@misc{Romanello-Matteo2021-304990,
	author       = {Romanello, Matteo and Hengchen, Simon},
	title        = {Detecting Text Reuse with Passim},
	year         = {2021},
	volume       = {10},
	abstract     = {In this lesson you will learn about text reuse detection – the automatic identification of reused passages in texts – and why you might want to use it in your research. Through a detailed installation guide and two case studies, this lesson will teach you the ropes of Passim, an open source and scalable tool for text reuse detection.},
}

@inproceedings{Marjanen-Jani2021-304736,
	author       = {Marjanen, Jani and Zosa, Elaine and Hengchen, Simon and Pivovarova, Lidia and Tolonen, Mikko},
	title        = {Topic Modelling Discourse Dynamics in Historical Newspapers},
	booktitle    = {CEUR Workshop Proceedings. Post-Proceedings of the 5th Conference Digital Humanities in the Nordic Countries (DHN 2020), Riga, Latvia, October 21-23, 2020},
	year         = {2021},
	publisher    = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik},
	address      = {Aachen},
	abstract     = {This paper addresses methodological issues in diachronic data analysis for historical research. We apply two families of topic models (LDA and DTM) on a relatively large set of historical newspapers, with the aim of capturing and understanding discourse dynamics. Our case study focuses on newspapers and periodicals published in Finland between 1854 and 1917, but our method can easily be transposed to any diachronic data. Our main contributions are a) a combined sampling, training and inference procedure for applying topic models to huge and imbalanced diachronic text collections; b) a discussion on the differences between two topic models for this type of data; c) quantifying topic prominence for a period and thus a generalization of document-wise topic assignment to a discourse level; and d) a discussion of the role of humanistic interpretation with regard to analysing discourse dynamics through topic models.},
}

@inproceedings{Duong-Quan2021-305156,
	author       = {Duong, Quan and Hämäläinen, Mika and Hengchen, Simon},
	title        = {An Unsupervised method for OCR Post-Correction and Spelling Normalisation for Finnish},
	booktitle    = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31–2 June, 2021, Reykjavik, Iceland (online)},
	year         = {2021},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
	abstract     = {Historical corpora are known to contain errors introduced by OCR (optical character recognition) methods used in the digitization process, often said to be degrading the performance of NLP systems. Correcting these errors manually is a time-consuming process and a great part of the automatic approaches have been relying on rules or supervised machine learning. We build on previous work on fully automatic unsupervised extraction of parallel data to train a character-based sequence-to-sequence NMT (neural machine translation) model to conduct OCR error correction designed for English, and adapt it to Finnish by proposing solutions that take the rich morphology of the language into account. Our new method shows increased performance while remaining fully unsupervised, with the added benefit of spelling normalisation. The source code and models are available on GitHub and Zenodo.},
}

@inproceedings{Hengchen-Simon2021-305550,
	author       = {Hengchen, Simon and Viloria, Kate and Indukaev, Andrey},
	title        = {SBX-HY at RuShiftEval 2021: Доверяй, но проверяй},
	booktitle    = {Computational Linguistics and Intellectual Technologies: Proceedings of the International Conference “Dialogue 2021,” Moscow, June 16–19, 2021},
	year         = {2021},
	abstract     = {Research in computational lexical semantic change, due to the inherent nature of language change, has been notoriously difficult to evaluate. This led to the creation of many new exciting models that cannot be easily compared. In this system paper, we describe our submissions at RuShiftEval 2021 – one of the few recently shared tasks that enable researchers, through a standard evaluation set and control conditions, to systematically compare models and gain insights from previous work. We show that despite top results in similar tasks on other languages, Temporal Referencing does not seem to perform as well on Russian.},
}

@incollection{Perrone-Valerio2021-306974,
	author       = {Perrone, Valerio and Hengchen, Simon and Palma, Marco and Vatri, Alessandro and Smith, Jim Q. and McGillivray, Barbara},
	title        = {Lexical semantic change for Ancient Greek and Latin},
	booktitle    = {Computational approaches to semantic change},
	year         = {2021},
	pages        = {287--310},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-98554-008-2},
	abstract     = {Change and its precondition, variation, are inherent in languages. Over time, new words enter the lexicon, others become obsolete, and existing words acquire new senses. Associating a word with its correct meaning in its historical context is a central challenge in diachronic research. Historical corpora of classical languages, such as Ancient Greek and Latin, typically come with rich metadata, and existing models are limited by their inability to exploit contextual information beyond the document timestamp. While embedding-based methods feature among the current state of the art systems, they are lacking in their interpretative power. In contrast, Bayesian models provide explicit and interpretable representations of semantic change phenomena. In this chapter we build on GASC, a recent computational approach to semantic change based on a dynamic Bayesian mixture model. In this model, the evolution of word senses over time is based not only on distributional information of lexical nature, but also on text genres. We provide a systematic comparison of dynamic Bayesian mixture models for semantic change with state-of-the-art embedding-based models. On top of providing a full description of meaning change over time, we show that Bayesian mixture models are highly competitive approaches to detect binary semantic change in both Ancient Greek and Latin.},
}

@misc{Romanello-Matteo2021-307547,
	author       = {Romanello, Matteo and Hengchen, Simon},
	title        = {Détecter la réutilisation de texte avec Passim},
	year         = {2021},
	volume       = {3},
	publisher    = {The Programming Historian en français},
	abstract     = {Dans cette leçon, vous serez initié à la détection automatique de la réutilisation des textes avec la bibliothèque Passim. Vous apprendrez comment installer et exécuter Passim et ses dépendances, comment préparer vos textes en tant que fichiers d’entrée adaptés à l’utilisation de Passim et, enfin, comment traiter la sortie générée par Passim pour effectuer des analyses de base.},
}

@article{Hengchen-Simon2021-309329,
	author       = {Hengchen, Simon and Ros, Ruben and Marjanen, Jani and Tolonen, Mikko},
	title        = {A data-driven approach to studying changing vocabularies in historical newspaper collections},
	journal      = {Digital Scholarship in the Humanities},
	year         = {2021},
	volume       = {36},
	number       = {Supplement 2},
	pages        = {109--126},
	abstract     = {Nation and nationhood are among the most frequently studied concepts in the field of intellectual history. At the same time, the word ‘nation’ and its historical usage are very vague. The aim in this article was to develop a data-driven method using dependency parsing and neural word embeddings to clarify some of the vagueness in the evolution of this concept. To this end, we propose the following two-step method. First, using linguistic processing, we create a large set of words pertaining to the topic of nation. Second, we train diachronic word embeddings and use them to quantify the strength of the semantic similarity between these words and thereby create meaningful clusters, which are then aligned diachronically. To illustrate the robustness of the study across languages, time spans, as well as large datasets, we apply it to the entirety of five historical newspaper archives in Dutch, Swedish, Finnish, and English. To our knowledge, thus far there have been no large-scale comparative studies of this kind that purport to grasp long-term developments in as many as four different languages in a data-driven way. A particular strength of the method we describe in this article is that, by design, it is not limited to the study of nationhood, but rather expands beyond it to other research questions and is reusable in different contexts.},
}

@inproceedings{Bäckerud-Erik2021-299384,
	author       = {Bäckerud, Erik and Nilsson, Pär and Sköldberg, Emma},
	title        = {Så används Svenska Akademiens ordböcker på nätet. Implicit och explicit feedback från användarna},
	booktitle    = {Nordiska studier i lexikografi 15. Red. av. C. Sandström, U.-M. Forsberg, C. af Hällström-Reijonen, M. Lehtonen \& K. Ruppel},
	year         = {2021},
	ISBN         = {978-952-7359-03-7},
	abstract     = {This study presents and analyses search strings and user data for different Swedish lexicographical websites. The underlying empirical material was sourced from two relatively new websites, www.saob.se and the joint dictionary portal www.svenska.se, which collects and provides a single point of entry to three Swedish monolingual dictionaries financed by the Swedish Academy. Statistics are presented on the most common search strings, when and where the searches take place, and what devices and digital platforms that are commonly used while visiting the sites. In addition, the study addresses a number of questions and other forms of feedback received from dictionary users. Furthermore, the study provides suggestions and examples of how the collected data can be utilised in upcoming updates and revisions of the dictionaries.},
}

@inproceedings{Blensenius-Kristian2021-306723,
	author       = {Blensenius, Kristian and Sköldberg, Emma and Bäckerud, Erik},
	title        = {Finding gaps in semantic descriptions. Visualisation of the cross-reference network in a Swedish monolingual dictionary},
	booktitle    = {Electronic lexicography in the 21st century. Proceedings of the eLex 2021 conference. 5–7 July 2021, virtual. Brno (Eds.: Kosem, I., Cukr, M., Jakubíček, M., Kallas, J., Krek, S. \& Tiberius, C.)},
	year         = {2021},
	publisher    = {Lexical Computing CZ s.r.o},
	address      = {Brno},
	abstract     = {Providing lexical information in dictionary entries by cross-referencing between semantically related headwords is very important, both from a reception-oriented and a production-oriented perspective. This study presents a survey of cross-references in a comprehensive monolingual dictionary of Swedish. It discusses cross-referencing in dictionaries in general as well as in the Swedish dictionary, focusing on the following four types of paradigmatic cross-references: SEE, COMPARE, SYNONYM, and OPPOSITE. By using data-visualisation software, the semantic network in the dictionary is overviewed in a new way. Furthermore, errors, gaps as well as other areas of improvement in the dictionary related to cross-referencing are discovered. Moreover, the relationships between the existing cross-references, how they are introduced in the dictionary and the dictionary's intended target groups are addressed. The study also reveals that the traditional lexicographic policies of the dictionary need to be adjusted to take advantage of the transition from paper to electronic publication},
}

@incollection{Petersson-Stellan2021-307114,
	author       = {Petersson, Stellan and Sköldberg, Emma},
	title        = {Semantic change in Swedish – from a lexicographic perspective},
	booktitle    = {Computational approaches to semantic change. Eds.: Nina Tahmasebi, Lars Borin, Adam Jatowt, Yang Xu \& Simon Hengchen},
	year         = {2021},
	pages        = {149--167},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-98554-008-2},
	abstract     = {In this chapter, we examine semantic change in the general vocabulary of present-day Swedish and its lexicographic description. We discuss the question of whether automatic and semi-automatic methods of computational linguistics are relevant to lexicography and conclude that such methods can facilitate, formalize, and sharpen lexicographic investigations of semantic change.},
}

@article{Blensenius-Kristian2021-309798,
	author       = {Blensenius, Kristian and Holmer, Louise and Sköldberg, Emma},
	title        = {SAOL 14 som rättesnöre - diskussion kring den senaste upplagan},
	journal      = {LexicoNordica},
	year         = {2021},
	volume       = {28},
	pages        = {39--58},
	abstract     = {The article discusses recommendations concerning orthography, morphology, semantics, etc. provided in the most recent edition of the Swedish Academy Glossary (2015). These issues are discussed particularly in relation to the glossary users and to the other language resources of the Swedish Academy, which are now easily compared online. Points for improvement are identified.},
}

@incollection{Blensenius-Kristian2021-308020,
	author       = {Blensenius, Kristian},
	title        = {Aspekter som sällan uppmärksammas},
	booktitle    = {Nyanser av grammatik : gränser, mångfald, fördjupning / Johan Brandtler, Mikael Kalm (red.)},
	year         = {2021},
	pages        = {263--274},
	publisher    = {Studentlitteratur},
	address      = {Lund},
	ISBN         = {9789144136233},
}

@inproceedings{Landqvist-Hans2021-304467,
	author       = {Landqvist, Hans},
	title        = {”finlandssvenska” + ”betydelsefulla” + ”översättare till svenska språket” = ? Upplysningar och urval i Svenskt översättarlexikon},
	booktitle    = {Sektionsföredrag vid Svenskan i Finland 19, 6–7 maj 2021, Åbo Akademi i Vasa},
	year         = {2021},
}

@article{Pilke-Nina2021-305858,
	author       = {Pilke, Nina and Nissilä, Niina and Landqvist, Hans},
	title        = {Organising Terminology Work in Sweden from the 1940s onwards – Participatory Expert Roles in Networks},
	journal      = {Terminology as a Societal Resource. Possibilities and Responsibilities in a Changing World. Special Issue of Terminology International Journal of Theoretical and Applied Issues in Specialized Communication},
	year         = {2021},
	volume       = {27},
	number       = {1},
	pages        = {80--109},
	abstract     = {The present study deals with organised terminology work in Sweden from the 1940s to the late 2010s. Using archive material, we describe how practical terminology work was carried out in Sweden during the period 1941–2018/2019, when the Swedish Centre for Technical Terminology/the Swedish Centre for Terminology (TNC) was the central actor. Thereafter, we discuss models for building a new infrastructure for terminology work after the closure of the TNC in 2018/2019. This discussion is based on interviews and analyses of articles and current reports. The study shows that multifaceted contacts with experts, academia, industry and society have played an essential role for terminology work in Sweden since the 1930s. In the current situation (2019), the activities are being reorganised and responsibility for terminology work is distributed between several actors. A new main actor is the government agency known as the Institute of Language and Folklore (Isof). Finally, we discuss future visions for terminology work in Sweden.},
}

@misc{Pilke-Nina2021-305859,
	author       = {Pilke, Nina and Nissilä, Niina and Landqvist, Hans},
	title        = {Terminology as a Societal Resource. Possibilities and Responsibilities in a Changing World.},
	year         = {2021},
	volume       = {27},
	number       = {1},
	pages        = {3--9},
}

@inproceedings{Landqvist-Hans2021-305861,
	author       = {Landqvist, Hans},
	title        = {(In)equality? A Case Study of Male and Female Translators in Svenskt översättarlexikon},
	booktitle    = {KäTu2021 Kääntämisen ja tulkkauksen tutkimuksen symposiumi Ohjelma ja abstraktit, 20–22.5.2021, Helsingin yliopisto},
	year         = {2021},
}

@misc{Pilke-Nina2021-305860,
	author       = {Pilke, Nina and Nissilä, Niina and Landqvist, Hans},
	title        = {Terminology as a Societal Resource. Possibilities and Responsibilities in a Changing World},
	year         = {2021},
	volume       = {27},
	number       = {1},
	pages        = {177},
}

@inproceedings{Landqvist-Hans2021-310091,
	author       = {Landqvist, Hans and Nissilä, Niina and Pilke, Nina},
	title        = {Vem vill ta hand om termerna? Terminologiskt arbete som språklig och samhällelig infrastruktur då, nu och sedan},
	booktitle    = {Terminologifrämjandets höst-term-in 2021},
	year         = {2021},
	abstract     = {Presentation av arbetet inom det pågående projektet Termer i tid – tidens termer, https://sites.uwasa.fi/term/},
}

@article{Landqvist-Hans2021-311187,
	author       = {Landqvist, Hans and Pilke, Nina},
	title        = {Intresse och engagemang: Kungliga Tekniska högskolans insatser i ett svenskt terminologiskt nätverk 1941–1983},
	journal      = {Folkmålsstudier Meddelanden från Föreningen för nordisk filologi},
	year         = {2021},
	volume       = {59},
	pages        = {103--133},
	abstract     = {In this paper, we study how KTH Royal Institute of Technology (Kungliga Tekniska högskolan – KTH) has participated in and influenced terminology work coordinated by the national terminology centre, the Swedish Centre for Technical Terminology – the TNC, in Sweden during the period 1941–1983. The aim of this paper is to shed light on the development of Swedish (technical) terminology based on networking and experts’ efforts. Based on archive material, we analyze who have been the active KTH experts, in what ways they were involved in the development processes and what effects their efforts had on the term recommendations given by the TNC. The archive material consists of written documents relating to the work process developed by John Wennerberg, who led the TNC between 1941 and 1957. The process was carried out in the form of 373 formal survey letters representing 17 subject fields, with both the TNC and external parties participating. Our results show that the 31 identified KTH experts play a visible role in the processes by 480 received survey letters within 14 subject fields. The response rate, 80 percent, reveal the experts’ involvement in the process and their high esteem of TNC’s work. The analysis of the comprehensive survey letter R198 shows that Wennerberg has considered the experts’ answers regarding terms (selection, linguistic form, acceptance/discourage) and definitions when he has published TNC’s recommendations. Our study shows that networking and experts representing the educational sector and furthermore universities have been an inseparable part of the development of Swedish (technical) terminology during several decades when the national terminology centre in Sweden began to operate and the working methods were established.},
}

@inproceedings{Nissilä-Niina2021-311592,
	author       = {Nissilä, Niina and Heittola, Sanna and Pilke, Nina and Landqvist, Hans},
	title        = {”Av intresse för saken dristar jag mig att till diskussion framlägga ett par spörsmål” – Kaksi suomalaista akateemista uranuurtajaa terminologiaverkoston kirjeenvaihdossa},
	booktitle    = {Workplace Communication IV (VAKKI Publications 13.) Eds H. Katajamäki, M. Enell-Nilsson, H. Kauppinen-Räisänen, L. Kääntä \& H. Salovaara},
	year         = {2021},
	publisher    = {University of Vaasa},
	address      = {Vaasa},
	ISBN         = {978-952-69732-0-3},
	abstract     = {The Swedish Tekniska Nomenklaturcentralen TNC (2000–2018 Terminologicentrum TNC) has been Sweden's national center for special languages and terminology work for more than 75 years. Since its founding in 1941, the TNC has been active not only in Sweden, but also in establishing and maintaining international contacts. The article describes the contacts between actors in the Swedish and Finnish terminology field, looking in particular at the contacts between the TNC and actors in the Finnish higher education sector between the 1940s and the 1990s. The method utilized is close reading and content analysis. The research material used is the collection of foreign correspondence in the TNC's document archive, and in particular the section stored in connection with the code Ufin, i.e. sections concerning communication between the TNC and Finnish actors. The article describes the topics covered in the communication, the objectives and consequences of the communication and the results achieved. The article focuses especially on the contacts between the TNC and two active actors in Finland, professor Jarl Salin at the Åbo Akademi University and professor Christer Laurén at the University of Vaasa. In the analysis, the Ufin themes of the letters were categorized in four main categories: publications, communication, information and language issues. In professor Jarl Salins letters, the most common theme were language issues, whereas Professor Christer Laurén contacted TNC especially in connection with publications and publishing.},
}

@inproceedings{Goldfarb-Tarrant-Seraphina2021-312616,
	author       = {Goldfarb-Tarrant, Seraphina and Marchant, Rebecca and Muñoz Sánchez, Ricardo and Pandya, Mugdha and Lopez, Adam},
	title        = {Intrinsic Bias Metrics Do Not Correlate with Application Bias},
	booktitle    = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), August 2021, Online},
	year         = {2021},
	publisher    = {Association for Computational Linguistics},
	ISBN         = {978-1-954085-52-7},
	abstract     = {Natural Language Processing (NLP) systems learn harmful societal biases that cause them to amplify inequality as they are deployed in more and more situations. To guide efforts at debiasing these systems, the NLP community relies on a variety of metrics that quantify bias in models. Some of these metrics are intrinsic, measuring bias in word embedding spaces, and some are extrinsic, measuring bias in downstream tasks that the word embeddings enable. Do these intrinsic and extrinsic metrics correlate with each other? We compare intrinsic and extrinsic metrics across hundreds of trained models covering different tasks and experimental conditions. Our results show no reliable correlation between these metrics that holds in all scenarios across tasks and languages. We urge researchers working on debiasing to focus on extrinsic measures of bias, and to make using these measures more feasible via creation of new challenge sets and annotated test data. To aid this effort, we release code, a new intrinsic metric, and an annotated test set focused on gender bias in hate speech.},
}