@inproceedings{alfter-etal-2020-expert-300074,
  title = {Expert judgments versus crowdsourcing in ordering multi-word expressions},
  abstract = {In this study we investigate to which degree experts and non-experts agree on questions of linguistic complexity in a crowdsourcing experiment. We ask non-experts (second language learners of Swedish) and two groups of experts (teachers of Swedish as a second/foreign language and CEFR experts) to rank multi-word expressions in a crowdsourcing experiment. We find that the resulting rankings by all the three tested groups correlate to a very high degree, which suggests that judgments produced in a comparative setting are not influenced by professional insights into Swedish as a second language.},
  booktitle = {Proceedings of the Swedish Language Technology Conference (SLTC), 25–27 November 2020, (Online)},
  author = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena},
  year = {2020},
}

@inproceedings{berdicevskis-2020-older-290636,
  title = {Older English Words Are More Polysemous},
  booktitle = {The Evolution of Language: Proceedings of the 13th International Conference (EvoLang13)},
  author = {Berdicevskis, Aleksandrs},
  year = {2020},
  publisher = {The Evolution of Language Conferences},
  address = {Nijmegen},
  pages = {14--21},
}

@inproceedings{kokkinakis-lundholmfors-2020-digital-295582,
  title = {Digital Neuropsychological Tests and Biomarkers: Resources for NLP and AI Exploration in the Neuropsychological Domain},
  abstract = {Non-invasive, time and cost-effective, easy-to-measure techniques for the early diagnosis or monitoring the progression of brain and mental disorders are at the forefront of recent research in this field. Natural Language Processing and Artificial Intelligence can play an important role in supporting and enhancing data driven approaches to improve the accuracy of prediction and classification. However, large datasets of e.g. recorded speech in the domain of cognitive health are limited. To improve the performance of existing models we need to train them on larger datasets, which could raise the accuracy of clinical diagnosis, and contribute to the detection of early signs at scale. In this paper, we outline our ongoing work to collect such data from a large population in order to support and conduct future research for modelling speech and language features in a cross-disciplinary manner. The final goal is to explore and combine linguistic with multimodal biomarkers from the same population and compare hybrid models that could increase the predictive accuracy of the algorithms that operate on them.},
  booktitle = {CLARIN Annual Conference 2020 in Virtual Form},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina},
  year = {2020},
}

@inproceedings{berdicevskis-etal-2020-subjects-297403,
  title = {Subjects tend to be coded only once: Corpus-based and grammar-based evidence for an efficiency-driven trade-off},
  booktitle = {Proceedings of the 19th International Workshop on Treebanks and Linguistic Theories, TLT 2020, 27–28 October 2020, Düsseldorf, Germany},
  author = {Berdicevskis, Aleksandrs and Schmidtke-Bode, Karsten and Seržant, Ilja},
  year = {2020},
  publisher = {Association for Computational Linguistics},
  address = {Stroudsburg, PA},
  isbn = {978-1-952148-01-9},
  pages = {79--92},
}

@inproceedings{berdicevskis-piperski-2020-corpus-298524,
  title = {Corpus evidence for word order freezing in Russian and German},
  booktitle = {Proceedings of the Fourth Workshop on Universal Dependencies (UDW 2020), December 13, 2020, Barcelona, Spain (Online)},
  editor = {de Marneffe, Marie-Catherine and de Lhoneux, Miryam and Nivre, Joakim and Schuster, Sebastian},
  author = {Berdicevskis, Aleksandrs and Piperski, Alexander},
  year = {2020},
  publisher = {Association for Computational Linguistics},
  isbn = {978-1-952148-48-4},
  pages = {26--33},
}

@article{themistocleous-etal-2020-voice-295469,
  title = {Voice quality and speech fluency distinguish individuals with Mild Cognitive Impairment from Healthy Controls},
  abstract = {Mild Cognitive Impairment (MCI) is a syndrome characterized by cognitive decline greater than expected for an individual's age and education level. This study aims to determine whether voice quality and speech fluency distinguish patients with MCI from healthy individuals to improve diagnosis of patients with MCI. We analyzed recordings of the Cookie Theft picture description task produced by 26 patients with MCI and 29 healthy controls from Sweden and calculated measures of voice quality and speech fluency. The results show that patients with MCI differ significantly from HC with respect to acoustic aspects of voice quality, namely H1-A3, cepstral peak prominence, center of gravity, and shimmer; and speech fluency, namely articulation rate and averaged speaking time. The method proposed along with the obtainability of connected speech productions can enable quick and easy analysis of speech fluency and voice quality, providing accessible and objective diagnostic markers of patients with MCI.},
  journal = {PLOS ONE},
  author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios},
  year = {2020},
  volume = {15},
  number = {7},
  pages = {e0236009},
}

@inproceedings{virk-etal-2020-dream-295338,
  title = {The DReaM Corpus: A Multilingual Annotated Corpus of Grammars for the World’s Languages},
  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), Marseille, 11–16 May 2020},
  editor = {Calzolari, Nicoletta and Béchet, Frédéric and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, Hélène and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  author = {Virk, Shafqat and Hammarström, Harald and Forsberg, Markus and Wichmann, Søren},
  year = {2020},
  publisher = {European Language Resources Association},
  address = {Paris},
  isbn = {979-10-95546-34-4},
}

@misc{berdicevskis-2020-pizzas-297688,
  title = {Pizzas and vermouth},
  author = {Berdicevskis, Aleksandrs},
  year = {2020},
  publisher = {Faber \& Faber},
  isbn = {9781783352203},
  pages = {150--151},
}

@inproceedings{lange-ljunglof-2020-learning-291243,
  title = {Learning Domain-specific Grammars from a Small Number of Examples},
  abstract = {In this paper we investigate the problem of grammar inference from a different perspective. The common approach is to try to infer a grammar directly from example sentences, which either requires a large training set or suffers from bad accuracy. We instead view it as a problem of grammar restriction or sub-grammar extraction. We start from a large-scale resource grammar and a small number of examples, and find a sub-grammar that still covers all the examples. To do this we formulate the problem as a constraint satisfaction problem, and use an existing constraint solver to find the optimal grammar. We have made experiments with English, Finnish, German, Swedish and Spanish, which show that 10–20 examples are often sufficient to learn an interesting domain grammar. Possible applications include computer-assisted language learning, domain-specific dialogue systems, computer games, Q/A-systems, and others.},
  booktitle = {12th International Conference on Agents and Artificial Intelligence - Volume 1: NLPinAI},
  author = {Lange, Herbert and Ljunglöf, Peter},
  year = {2020},
  publisher = {SciTePress},
  isbn = {978-989-758-395-7},
}

@misc{schlechtweg-etal-2020-post-295466,
  title = {Post-Evaluation Data for SemEval-2020 Task 1: Unsupervised Lexical Semantic Change Detection},
  abstract = {This data collection contains the post-evaluation data for SemEval-2020 Task 1: Unsupervised Lexical Semantic Change Detection: (1) the starting kit to download data, and examples for competing in the CodaLab challenge including baselines; (2) the true binary change scores of the targets for Subtask 1, and their true graded change scores for Subtask 2 (test\_data\_truth/); (3) the scoring program used to score submissions against the true test data in the evaluation and post-evaluation phase (scoring\_program/); and (4) the results of the evaluation phase including, for example, analysis plots (plots/) displaying the results:},
  author = {Schlechtweg, Dominik and McGillivray, Barbara and Hengchen, Simon and Dubossarsky, Haim and Tahmasebi, Nina},
  year = {2020},
  publisher = {Zenodo},
}

@inproceedings{schlechtweg-etal-2020-semeval-295463,
  title = {SemEval-2020 Task 1: Unsupervised Lexical Semantic Change Detection},
  abstract = {Lexical Semantic Change detection, i.e., the task of identifying words that change meaning over time, is a very active research area, with applications in NLP, lexicography, and linguistics. Evaluation is currently the most pressing problem in Lexical Semantic Change detection, as no gold standards are available to the community, which hinders progress. We present the results of the first shared task that addresses this gap by providing researchers with an evaluation framework and manually annotated, high-quality datasets for English, German, Latin, and Swedish. 33 teams submitted 186 systems, which were evaluated on two subtasks.},
  booktitle = {Proceedings of the Fourteenth Workshop on Semantic Evaluation (SemEval2020), Barcelona, Spain (Online), December 12, 2020.},
  author = {Schlechtweg, Dominik and McGillivray, Barbara and Hengchen, Simon and Dubossarsky, Haim and Tahmasebi, Nina},
  year = {2020},
  publisher = {Association for Computational Linguistics},
}

@inproceedings{wichmann-virk-2020-towards-298431,
  title = {Towards a data-driven network of linguistic terms},
  abstract = {Starting from close to 20,000 text documents from the literature of language descriptions, from documents either born digitally or scanned and OCR’d, we extract keywords and pass them through a pruning pipeline where mainly keywords that can be considered as belonging to linguistic terminology survive. Subsequently we quantify relations among those terms using Normalized Pointwise Mutual Information (NPMI) and use the resulting measures, in conjunction with the Google PageRank (GPR), to build networks of linguistic terms. Two uses of the work are envisaged: (1) developing a search machine adapted to the large DReaM corpus of linguistic descriptive literature and (2) getting insights into how a data-driven ontology of linguistic terminology might be built.},
  booktitle = {Swedish Language Technology Conference (SLTC)},
  author = {Wichmann, Søren and Virk, Shafqat},
  year = {2020},
}

@misc{tahmasebi-etal-2020-swedish-295465,
  title = {Swedish Test Data for SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection},
  abstract = {This data collection contains the Swedish test data for SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection. It consists of a Swedish text corpus pair (corpus1/, corpus2/) and 31 lemmas which have been annotated for their lexical semantic change between the two corpora (targets.txt). We sample from the KubHist2 corpus, digitized by the National Library of Sweden, and available through the Språkbanken corpus infrastructure Korp (Borin et al., 2012). The full corpus is available through a CC BY (attribution) license. Each word for which the lemmatizer in the Korp pipeline has found a lemma is replaced with the lemma. In cases where the lemmatizer cannot find a lemma, we leave the word as is (i.e., unlemmatized, no lower-casing). KubHist contains very frequent OCR errors, especially for the older data. More detail about the properties and quality of the Kubhist corpus can be found in (Adesam et al., 2019).},
  author = {Tahmasebi, Nina and Hengchen, Simon and Schlechtweg, Dominik and McGillivray, Barbara and Dubossarsky, Haim},
  year = {2020},
}

@inproceedings{virk-etal-2020-from-295339,
  title = {From Linguistic Descriptions to Language Profiles},
  abstract = {Language catalogues and typological databases are two important types of resources containing different types of knowledge about the world’s natural languages. The former provide metadata such as number of speakers, location (in prose descriptions and/or GPS coordinates), language code, literacy, etc., while the latter contain information about a set of structural and functional attributes of languages. Given that both types of resources are developed and later maintained manually, there are practical limits as to the number of languages and the number of features that can be surveyed. We introduce the concept of a language profile, which is intended to be a structured representation of various types of knowledge about a natural language extracted semi-automatically from descriptive documents and stored at a central location. It has three major parts: (1) an introductory; (2) an attributive; and (3) a reference part, each containing different types of knowledge about a given natural language. As a case study, we develop and present a language profile of an example language. At this stage, a language profile is an independent entity, but in the future it is envisioned to become part of a network of language profiles connected to each other via various types of relations. Such a representation is expected to be suitable both for humans and machines to read and process for further deeper linguistic analyses and/or comparisons.},
  booktitle = {Proceedings of the 7th Workshop on Linked Data in Linguistics (LDL-2020). Language Resources and Evaluation Conference (LREC 2020), Marseille, 11–16 May 2020},
  editor = {Ionov, Maxim and McCrae, John P. and Chiarcos, Christian and Declerck, Thierry and Bosque-Gil, Julia and Gracia, Jorge},
  author = {Virk, Shafqat and Hammarström, Harald and Borin, Lars and Forsberg, Markus and Wichmann, Søren},
  year = {2020},
  publisher = {European Language Resources Association},
  address = {Paris},
  isbn = {979-10-95546-36-8},
}

@article{arharholdt-etal-2020-language-300072,
  title = {Language teachers and crowdsourcing: Insights from a cross-European survey},
  abstract = {The paper presents a cross-European survey on teachers and crowdsourcing. The survey examines how familiar language teachers are with the concept of crowdsourcing and addresses their attitude towards including crowdsourcing into language teaching activities. The survey was administrated via an online questionnaire and collected volunteers’ data on: (a) teachers’ experience with organizing crowdsourcing activities for students/pupils, (b) the development of crowdsourced resources and materials as well as (c) teachers’ motivation for participating in or employing crowdsourcing activities. The questionnaire was disseminated in over 30 European countries. The final sample comprises 1129 language teachers aged 20 to 65, mostly working at institutions of tertiary education. The data indicates that many participants are not familiar with the concept of crowdsourcing resulting in a low rate of crowdsourcing activities in the classroom. However, a high percentage of responding teachers is potentially willing to crowdsource teaching materials for the language(s) they teach. They are particularly willing to collaborate with other teachers in the creation of interactive digital learning materials, and to select, edit, and share language examples for exercises or tests. Since the inclusion of crowdsourcing activities in language teaching is still in its initial stage, steps for further research are highlighted.},
  journal = {Rasprave: Časopis Instituta za hrvatski jezik i jezikoslovlje},
  author = {Arhar Holdt, Špela and Zviel-Girshin, Rina and Gajek, Elżbieta and Durán-Muñoz, Isabel and Bago, Petra and Fort, Karën and Hatipoglu, Ciler and Kasperavičienė, Ramunė and Koeva, Svetla and Lazić Konjik, Ivana and Miloshevska, Lina and Ordulj, Antonia and Rodosthenous, Christos and Volodina, Elena and Weber, Tassja and Zanasi, Lorenzo},
  year = {2020},
  volume = {46},
  number = {1},
  pages = {1--28},
}

@incollection{berdicevskis-semenuks-2020-different-296274,
  title = {Different trajectories of morphological overspecification and irregularity under imperfect language learning},
  booktitle = {The Complexities of Morphology},
  editor = {Arkadiev, Peter and Gardani, Francesco},
  author = {Berdicevskis, Aleksandrs and Semenuks, Arturs},
  year = {2020},
  publisher = {Oxford University Press},
  address = {Oxford},
  isbn = {9780198861287},
  pages = {283--305},
}

@inproceedings{themistocleous-etal-2020-improving-305222,
  title = {Improving the Diagnosis of Mild Cognitive Impairment in elderly individuals using a multifactorial automatic analysis of voice quality and prosody},
  url = {http://demo.spraakdata.gu.se/svedk/pbl/AEC-30-Paper.JPG},
  booktitle = {30th Alzheimer Europe Conference \#30AEC -- virtual conference},
  author = {Themistocleous, Charalambos and Eckerström, Marie and Lundholm Fors, Kristina and Kokkinakis, Dimitrios},
  year = {2020},
}

@inproceedings{bamutura-etal-2020-towards-296511,
  title = {Towards Computational Resource Grammars for Runyankore and Rukiga},
  abstract = {In this paper, we present computational resource grammars of Runyankore and Rukiga (R\&R) languages. Runyankore and Rukiga are two under-resourced Bantu Languages spoken by about 6 million people indigenous to South Western Uganda, East Africa. We used Grammatical Framework (GF), a multilingual grammar formalism and a special-purpose functional programming language to formalise the descriptive grammar of these languages. To the best of our knowledge, these computational resource grammars are the first attempt to the creation of language resources for R\&R. In Future Work, we plan to use these grammars to bootstrap the generation of other linguistic resources such as multilingual corpora that make use of data-driven approaches to natural language processing feasible. In the meantime, they can be used to build Computer-Assisted Language Learning (CALL) applications for these languages among others.},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference},
  author = {Bamutura, David and Ljunglöf, Peter and Nabende, Peter},
  year = {2020},
  publisher = {European Language Resources Association},
}

@inproceedings{fridlund-etal-2020-trawling-299694,
  title = {Trawling the Gulf of Bothnia of News: A Big Data Analysis of the Emergence of Terrorism in Swedish and Finnish Newspapers, 1780–1926},
  abstract = {This study combines history domain knowledge and language technology expertise to evaluate and expand on research claims regarding the historical meanings associated with terrorism in Swedish and Finnish contexts. Using a cross-border comparative approach and large newspaper corpora made available by the CLARIN research infrastructure, we explore overlapping national discourses on terrorism, the concept’s historical diversity and its relations to different national contexts. We are particularly interested in testing the hypothesis that substate terrorism’s modern meaning was not yet established in the 19th century but primarily restricted to Russian terrorism. We conclude that our comparative study finds both uniquely national and shared meanings of terrorism and that our study strengthen the hypothesis. In extension, the study also serves as an exploration of the potentials of cross-disciplinary evaluative studies based on extensive corpora and of cross-border comparative approaches to Swedish and Finnish newspaper corpora.},
  booktitle = {CLARIN Annual Conference Proceedings 2020, 05–07 October 2020, Virtual Edition},
  editor = {Navarretta, Costanza and Eskevich, Maria},
  author = {Fridlund, Mats and Olsson, Leif-Jöran and Brodén, Daniel and Borin, Lars},
  year = {2020},
  publisher = {CLARIN},
}

@article{roberts-etal-2020-chield-292421,
  title = {CHIELD: the causal hypotheses in evolutionary linguistics database},
  journal = {Journal of Language Evolution},
  author = {Roberts, Sean and Killin, Anton and Deb, Angarika and Sheard, Catherine and Greenhill, Simon and Sinnemäki, Kaius and Segovia-Martin, José and Nölle, Jonas and Berdicevskis, Aleksandrs and Humphreys-Balkwill, Archie and Little, Hannah and Opie, Cristopher and Jacques, Guillaume and Bromham, Lindell and Tinits, Peeter and Ross, Robert and Lee, Sean and Gasser, Emily and Calladine, Jasmine and Spike, Matthew and Mann, Stephen and Shcherbakova, Olena and Singer, Ruth and Zhang, Shuya and Benítez-Burraco, Antonio and Kliesch, Christian and Thomas-Colquhoun, Ewan and Skirgård, Hedvig and Tamariz, Monica and Passmore, Sam and Pellard, Thomas and Jordan, Fiona},
  year = {2020},
  volume = {5},
  number = {2},
  pages = {101--120},
}
@article{kokkinakis-lundholmfors-2020-manga-294522,
  title = {Hur många djur du kommer på kan avslöja hur din hjärna mår},
  journal = {Språkbruk},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina},
  year = {2020},
  volume = {2},
  pages = {48--51},
}

@inproceedings{zechner-borin-2020-towards-296900,
  title = {Towards a Swedish Roget-Style Thesaurus for NLP},
  abstract = {Bring’s thesaurus (Bring) is a Swedish counterpart of Roget, and its digitized version could make a valuable language resource for use in many and diverse natural language processing (NLP) applications. From the literature we know that Roget-style thesauruses and wordnets have complementary strengths in this context, so both kinds of lexical-semantic resource are good to have. However, Bring was published in 1930, and its lexical items are in the form of lemma–POS pairings. In order to be useful in our NLP systems, polysemous lexical items need to be disambiguated, and a large amount of modern vocabulary must be added in the proper places in Bring. The work presented here describes experiments aiming at automating these two tasks, at least in part, where we use the structure of an existing Swedish semantic lexicon – Saldo – both for disambiguation of ambiguous Bring entries and for addition of new entries to Bring.},
  keywords = {lexicon, word sense disambiguation, topic detection},
  booktitle = {Proceedings of the 2020 Globalex Workshop on Linked Lexicography. Language Resources and Evaluation Conference (LREC 2020), Marseille, 11–16 May 2020},
  author = {Zechner, Niklas and Borin, Lars},
  year = {2020},
  publisher = {European Language Resources Association},
  address = {Paris},
  isbn = {979-10-95546-46-7},
}

@techreport{adesam-etal-2020-swedishglue-299130,
  title = {SwedishGLUE – Towards a Swedish Test Set for Evaluating Natural Language Understanding Models},
  author = {Adesam, Yvonne and Berdicevskis, Aleksandrs and Morger, Felix},
  year = {2020},
  institution = {University of Gothenburg},
}

@article{zechner-2020-derivatives-303708,
  title = {Derivatives of regular expressions with cuts},
  abstract = {Derivatives of regular expressions are an operation which for a given expression produces an expression for what remains after a specific symbol has been read. This can be used as a step in the process of transforming an expression into a finite string automaton. Cuts are an extension of the ordinary regular expressions; the cut operator is essentially a concatenation without backtracking, formalising a behaviour found in many programming languages. Just as for concatenation, we can also define an iterated cut operator. We show and derive expressions for the derivatives of regular expressions with cuts and iterated cuts. © Institut für Informatik · Justus-Liebig-Universität Giessen.},
  journal = {Journal of Automata, Languages and Combinatorics},
  author = {Zechner, Niklas},
  year = {2020},
  volume = {25},
  number = {4},
  pages = {349--355},
}

@inproceedings{johansson-adesam-2020-training-293365,
  title = {Training a Swedish Constituency Parser on Six Incompatible Treebanks},
  abstract = {We investigate a transition-based parser that uses Eukalyptus, a function-tagged constituent treebank for Swedish which includes discontinuous constituents. In addition, we show that the accuracy of this parser can be improved by using a multitask learning architecture that makes it possible to train the parser on additional treebanks that use other annotation models.},
  booktitle = {Proceedings of the 12th International Conference on Language Resources and Evaluation (LREC 2020)},
  author = {Johansson, Richard and Adesam, Yvonne},
  year = {2020},
  publisher = {European Language Resources Association},
}

@inproceedings{berdicevskis-eckhoff-2020-diachronic-293349,
  title = {A Diachronic Treebank of Russian Spanning More Than a Thousand Years},
  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), May 11-16, 2020, Marseille, France / ed. Nicoletta Calzolari (Conference chair).},
  author = {Berdicevskis, Aleksandrs and Eckhoff, Hanne},
  year = {2020},
  publisher = {European Language Resources Association},
  address = {Paris},
  isbn = {979-10-95546-34-4},
  pages = {5251--5256},
}

@misc{kokkinakis-etal-2020-proceedings-305214,
  title = {Proceedings of the LREC 2020. Workshop on: Resources and Processing of Linguistic, Para-linguistic and Extra-linguistic Data from People with Various Forms of Cognitive/Psychiatric/Developmental Impairments (RaPID-3), May 11-16, 2020, Marseille, France},
  abstract = {RaPID-3 aims to be an interdisciplinary forum for researchers to share information, findings, methods, models and experience on the collection and processing of data produced by people with various forms of mental, cognitive, neuropsychiatric, or neurodegenerative impairments, such as aphasia, dementia, autism, bipolar disorder, Parkinson’s disease or schizophrenia. Particularly, the workshop’s focus is on creation, processing and application of data resources from individuals at various stages of these impairments and with varying degrees of severity. Creation of resources includes e.g. annotation, description, analysis and interpretation of linguistic, paralinguistic and extra-linguistic data (such as spontaneous spoken language, transcripts, eyetracking measurements, wearable and sensor data, etc). Processing is done to identify, extract, correlate, evaluate and disseminate various linguistic or multimodal phenotypes and measurements, which then can be applied to aid diagnosis, monitor the progression or predict individuals at risk. A central aim is to facilitate the study of the relationships among various levels of linguistic, paralinguistic and extra-linguistic observations (e.g., acoustic measures; phonological, syntactic and semantic features; eye tracking measurements; sensors, signs and multimodal signals). Submission of papers are invited in all of the aforementioned areas, particularly emphasizing multidisciplinary aspects of processing such data and the interplay between clinical/nursing/medical sciences, language technology, computational linguistics, natural language processing (NLP) and computer science. The workshop will act as a stimulus for the discussion of several ongoing research questions driving current and future research by bringing together researchers from various research communities.},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Themistocleous, Charalambos and Antonsson, Malin and Eckerström, Marie},
  year = {2020},
  publisher = {European Language Resources Association},
  address = {Paris},
  isbn = {979-10-95546-45-0},
}

@inproceedings{dannells-broden-2020-building-297061,
  title = {Building a Language Technology Infrastructure for Digital Humanities: Challenges, Opportunities and Progress},
  abstract = {Språkbanken Text, a research unit at the University of Gothenburg, forms part of the National Language Bank of Sweden and is the main coordinating node of Swe-Clarin, the Swedish national CLARIN node. During the past years, Språkbanken Text has been actively engaged in a number of humanities and social sciences related research projects. This engagement has primarily concerned the development of new resources, methods and tools to accurately process large amounts of digitized material, in addition to interfaces for visualizing the materials, making them easily accessible for further analysis. The activities within Swe-Clarin have been essential for the progress and the success of this work. In this paper we present what was required from Språkbanken Text in order to meet the expectations of researchers from the humanities and social sciences. We discuss some of the challenges this work involves and describe the opportunities this field brings with it and how these opportunities could help to progress the work of Språkbanken Text toward building a language technology infrastructure that supports interdisciplinary research.},
  booktitle = {Proceedings of the Twin Talks 2 and 3 Workshops at DHN 2020 and DH 2020 Ottawa Canada and Riga Latvia, July 23 and October 20, 2020},
  editor = {Krauwer, Steven and Fišer, Darja},
  author = {Dannélls, Dana and Brodén, Daniel},
  year = {2020},
  publisher = {CEUR-WS.org},
}

@inproceedings{themistocleous-etal-2020-automatic-305224,
  title = {Automatic analysis of voice quality and prosody in patients with Mild Cognitive Impairment},
  url = {http://demo.spraakdata.gu.se/svedk/pbl/SNL2020.pdf},
  booktitle = {The 12th Annual Society for the Neurobiology of Language Meeting (SNL) -- virtual conference},
  author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios},
  year = {2020},
}

@inproceedings{rouces-etal-2020-creating-290695,
  title = {Creating an Annotated Corpus for Aspect-Based Sentiment Analysis in Swedish},
  abstract = {Aspect-Based Sentiment Analysis constitutes a more fine-grained alternative to traditional sentiment analysis at sentence level. In addition to a sentiment value denoting how positive or negative a particular opinion or sentiment expression is, it identifies additional aspects or 'slots' that characterize the opinion. Some typical aspects are target and source, i.e. who holds the opinion and about which entity or aspect is the opinion. We present a large Swedish corpus annotated for Aspect-Based Sentiment Analysis. Each sentiment expression is annotated as a tuple that contains the following fields: one among 5 possible sentiment values, the target, the source, and whether the sentiment expressed is ironic. In addition, the linguistic element that conveys the sentiment is identified too. Sentiment for a particular topic is also annotated at title, paragraph and document level. The documents are articles obtained from two Swedish media (Svenska Dagbladet and Aftonbladet) and one online forum (Flashback), totalling around 4000 documents. The corpus is freely available and we plan to use it for training and testing an Aspect-Based Sentiment Analysis system.},
  booktitle = {Proceedings of the 5th conference in Digital Humanities in the Nordic Countries, Riga, Latvia, October 21-23, 2020.},
  author = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina},
  year = {2020},
  publisher = {CEUR Workshop Proceedings},
}

@inproceedings{frossard-etal-2020-dataset-293923,
  title = {Dataset for Temporal Analysis of English-French Cognates},
  abstract = {Languages change over time and, thanks to the abundance of digital corpora, their evolutionary analysis using computational techniques has recently gained much research attention. In this paper, we focus on creating a dataset to support investigating the similarity in evolution between different languages. We look in particular into the similarities and differences between the use of corresponding words across time in English and French, two languages from different linguistic families yet with shared syntax and close contact. For this we select a set of cognates in both languages and study their frequency changes and correlations over time. We propose a new dataset for computational approaches of synchronized diachronic investigation of language pairs, and subsequently show novel findings stemming from the cognate-focused diachronic comparison of the two chosen languages. To the best of our knowledge, the present study is the first in the literature to use computational approaches and large data to make a cross-language diachronic analysis.},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference},
  author = {Frossard, Esteban and Coustaty, Mickael and Doucet, Antoine and Jatowt, Adam and Hengchen, Simon},
  year = {2020},
  publisher = {European Language Resources Association},
  address = {Paris},
  isbn = {979-10-95546-34-4},
}

@inproceedings{volodina-etal-2020-towards-300069,
  title = {Towards Privacy by Design in Learner Corpora Research: A Case of On-the-fly Pseudonymization of Swedish Learner Essays},
  abstract = {This article reports on an ongoing project aiming at automatization of pseudonymization of learner essays. The process includes three steps: identification of personal information in an unstructured text, labeling for a category, and pseudonymization. We experiment with rule-based methods for detection of 15 categories out of the suggested 19 (Megyesi et al., 2018) that we deem important and/or doable with automatic approaches. For the detection and labeling steps, we use resources covering personal names, geographic names, company and university names and others. For the pseudonymization step, we replace the item using another item of the same type from the above-mentioned resources. Evaluation of the detection and labeling steps are made on a set of manually anonymized essays. The results are promising and show that 89\% of the personal information can be successfully identified in learner data, and annotated correctly with an inter-annotator agreement of 86\% measured as Fleiss kappa and Krippendorff's alpha.},
  booktitle = {Proceedings of the 28th International Conference on Computational Linguistics (COLING), December 8-13, 2020, Barcelona, Spain (Online)},
  author = {Volodina, Elena and Ali Mohammed, Yousuf and Derbring, Sandra and Matsson, Arild and Megyesi, Beata},
  year = {2020},
  publisher = {International Committee on Computational Linguistics},
  isbn = {978-1-952148-27-9},
}

@misc{mcgillivray-etal-2020-challenges-295208, title = {The challenges and prospects of the intersection of humanities and data science: A White Paper from The Alan Turing Institute}, abstract = {Since their beginnings, the digital humanities have engaged in an energetic debate about their scope, defining features, and relationship to the wider humanities, and have established themselves as a community of practice (Schreibman et al., 2004; Terras, 2010; Terras, 2013; Terras et al., 2013; Gold and Klein, 2016; The Digital Humanities Manifesto 2.0). The computational focus has characterised the field from its initial explorations (Hockey, 2004; Vanhoutte, 2013; Nyhan and Flinn, 2016) and the shift from the label ‘Humanities Computing’ to ‘Digital Humanities’ was a catalyst for change. In the history of the field, recurring cycles and productive tensions have arisen from the interfolding of computational methodologies and approaches with hermeneutic and critical modes of analysis (see McCarty, 2005; Rockwell and Sinclair, 2016; Jones, 2016). This document postulates that we are currently witnessing another one of these junctures, one that is calling for a critical involvement with data science. In many ways, we are seeing earlier methods blending into, or being extended by data science. 
Digitisation workflows are being augmented with automatic information extraction, data analysis, automated transcription of handwritten documents, and visualisation of transcribed content. Techniques developed for history, literary studies, and linguistics are being scaled towards larger datasets and more complex problems raising the bar of interpretability and questioning the validity of data collection and analysis methods. On the other hand, the field of data science has recently started to engage with non-STEM (Science, Technology, Engineering, and Mathematics) disciplines, by offering new data-driven modelling frameworks for addressing long-standing research questions (Kitchin, 2014; Lazer et al., 2009) and proposing so-called ‘human-centred approaches’ to data science, focussed on the interpretability of machine learning models and a more active role for human input in algorithms (See Chen et al., 2016). Moreover, in the current historical context we are witnessing an increased awareness of the questions of diversity and inclusion in research and academia, and we are seeing the creation of a strong movement aimed at addressing such issues globally. 
We believe that this paper can play a role in reinforcing a positive message in this respect.}, author = {McGillivray, Barbara and Alex, Beatrice and Ames, Sarah and Armstrong, Guyda and Beavan, David and Ciula, Arianna and Colavizza, Giovanni and Cummings, James and De Roure, David and Farquhar, Adam and Hengchen, Simon and Lang, Anouk and Loxley, James and Goudarouli, Eirini and Nanni, Federico and Nini, Andrea and Nyhan, Julianne and Osborne, Nicola and Poibeau, Thierry and Ridge, Mia and Ranade, Sonia and Smithies, James and Terras, Melissa and Vlachidis, Andreas and Willcox, Pip}, year = {2020}, }

% Workshop paper. The page range lives only in the pages field; it was
% previously repeated inside booktitle, which made styles print it twice.
@inProceedings{berdicevskis-2020-foreigner-297766,
  title     = {Foreigner-directed speech is simpler than native-directed: Evidence from social media},
  booktitle = {Proceedings of the Fourth Workshop on Natural Language Processing and Computational Social Science, NLP+CSS 2020, November 20, 2020, Online},
  author    = {Berdicevskis, Aleksandrs},
  year      = {2020},
  publisher = {Association for Computational Linguistics},
  ISBN      = {978-1-952148-80-4},
  pages     = {163--172},
}

% Conference paper. As above, the page range is kept out of booktitle.
@inProceedings{veeman-etal-2020-cross-297782,
  title     = {Cross-lingual Embeddings Reveal Universal and Lineage-Specific Patterns in Grammatical Gender Assignment},
  booktitle = {Proceedings of the 24th Conference on Computational Natural Language Learning, Online, November 19-20, 2020},
  author    = {Veeman, Hartger and Allassonnière-Tang, Marc and Berdicevskis, Aleksandrs and Basirat, Ali},
  year      = {2020},
  publisher = {Association for Computational Linguistics},
  ISBN      = {978-1-952148-63-7},
  pages     = {265--275},
}

% Chapter in a collection; title and booktitle are stored in Latin transliteration.
@incollection{berdicevskis-2020-kogda-296607,
  title     = {Kogda morfologija bessil'na},
  booktitle = {VAProsy jazykoznanija: megasbornik nanostatej},
  author    = {Berdicevskis, Aleksandrs},
  year      = {2020},
  publisher = {Buki-Vedi},
  address   = {Moskva},
  ISBN      = {978-5-4465-2882-0},
  pages     = {56--60},
}

% Conference paper. The acronym in the title is brace-protected against
% sentence-casing styles.
@inProceedings{themistocleous-etal-2020-automated-305223,
  title     = {Automated speech analysis improves {MCI} diagnosis},
  abstract  = {Mild Cognitive Impairment (MCI) is a condition characterized by cognitive decline greater than expected for an individual's age and education level. In this study, we are investigating whether acoustic properties of speech production can improve the classification of individuals with MCI from healthy controls augmenting the Mini Mental State Examination, a traditional screening tool, with automatically extracted acoustic information. We found that just one acoustic feature, can improve the AUC score (measuring a trade-off between sensitivity and specificity) from 0.77 to 0.89 in a boosting classification task. These preliminary results suggest that computerized language analysis can improve the accuracy of traditional screening tools},
  booktitle = {Proceedings of the 11th Experimental Linguistics Conference (ExLing)},
  author    = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios},
  year      = {2020},
}

% NOTE(review): the question mark in the next citation key presumably replaces
% a lost non-ASCII letter of the author's surname. The key is left unchanged so
% that existing citations keep resolving; confirm before renaming.
@inProceedings{r?dveneide-2020-anforanden-302449, title = {Anföranden: Annotated and Augmented Parliamentary Debates from Sweden}, abstract = {The Swedish parliamentary debates have been available since 2010 through the parliament’s open data web site Riksdagens öppna data. 
While fairly comprehensive, the structure of the data can be hard to understand and its content is somewhat noisy for use as a quality language resource. In order to make it easier to use and process – in particular for language technology research, but also for political science and other fields with an interest in parliamentary data – we have published a large selection of the debates in a cleaned and structured format, annotated with linguistic information and augmented with semantic links. Especially prevalent in the parliament’s data were end-line hyphenations – something that tokenisers generally are not equipped for – and a lot of the effort went into resolving these. In this paper, we provide detailed descriptions of the structure and contents of the resource, and explain how it differs from the parliament’s own version.}, booktitle = {Proceedings of the LREC 2020 Workshop on Creating, Using and Linking of Parliamentary Corpora with Other Types of Political Discourse, 11–16 May 2020}, author = {Rødven-Eide, Stian}, year = {2020}, publisher = {European Language Resources Association}, address = {Marseille, France}, ISBN = {979-10-95546-47-4}, }

% Whole proceedings volume: typed as proceedings, with the people responsible
% for the volume recorded as editors rather than authors. The citation key is
% unchanged.
% NOTE(review): the editor role is inferred from the title and abstract, which
% describe a collection of selected workshop papers; confirm against the
% publisher record.
@proceedings{alfter-etal-2020-proceedings-300071,
  title     = {Proceedings of the 9th Workshop on Natural Language Processing for Computer Assisted Language Learning 2020},
  abstract  = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures. This collection presents four selected papers describing use of Language Technology for language learning.},
  editor    = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Lange, Herbert and Borin, Lars},
  year      = {2020},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  ISBN      = {978-91-7929-732-9},
}

@article{broden-2020-acknowledging-294579, title = {Acknowledging Ambivalence in Teaching about Art and Aesthetics}, abstract = {In this article Daniel Broden explores the ambivalence in teaching about art and aesthetics in the humanities. By comparing and contrasting Gert J. J. Biesta's educational theory and Jacques Ranciere's writing on aesthetics, he hopes to bring some of the particularities of aesthetic experiences into focus and to discuss a tension in educational situations that concern students' interpretation of aesthetic texts: how the teacher, on the one hand, will serve as a representative for a formal system of education - or what Ranciere calls a system of inequality - and, on the other hand, should respect the autonomy of the aesthetic experience. Broden argues, however, that more interesting than the ambivalence itself is the question of how we can acknowledge this tension in productive ways. Thus, his aim here is to show how the teacher can contribute to the verification of an interpretive approach to art, with Ranciere's axiom of equality in mind. Drawing on Biesta's writings, Broden also highlights how the teacher can provide students with possibilities to pursue a subject-ness and how the risks involved call for a deconstructive approach to the enactment of teacher power. 
The article concludes by suggesting that we would do better not to view the ambivalence in focus as a problem, but instead to see it as something that calls for continuous engagement and critical reflection.}, journal = {Educational Theory}, author = {Brodén, Daniel}, year = {2020}, volume = {70}, number = {1}, pages = {31--42}, }

% Workshop paper. The booktitle carries no trailing period (the style supplies
% punctuation), and the language name in the title is brace-protected.
@inProceedings{lindahl-2020-annotating-302453,
  title     = {Annotating argumentation in {Swedish} social media},
  booktitle = {Proceedings of the 7th Workshop on Argument Mining, Barcelona, Spain (Online), December 13, 2020},
  author    = {Lindahl, Anna},
  year      = {2020},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  ISBN      = {978-1-952148-44-6},
}

% Conference paper. The corpus name in the title is brace-protected so its
% mixed-case spelling survives sentence-casing styles.
@inProceedings{bouma-etal-2020-edges-298473,
  title     = {The {EDGeS Diachronic Bible Corpus}},
  abstract  = {We present the EDGeS Diachronic Bible Corpus: a diachronically and synchronically parallel corpus of Bible translations in Dutch, English, German and Swedish, with texts from the 14th century until today. It is compiled in the context of an intended longitudinal and contrastive study of complex verb constructions in Germanic. The paper discusses the corpus design principles, its selection of 36 Bibles, and the information and metadata encoded for the corpus texts. The EDGeS corpus will be available in two forms: the whole corpus will be accessible for researchers behind a login in the well-known OPUS search infrastructure, and the open subpart of the corpus will be available for download.},
  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), May 11-16, 2020, Marseille, France},
  author    = {Bouma, Gerlof and Coussé, Evie and Dijkstra, Trude and van der Sijs, Nicoline},
  year      = {2020},
  publisher = {European Language Resources Association (ELRA)},
  ISBN      = {979-10-95546-34-4},
}

@inProceedings{dannells-virk-2020-error-297714, title = {OCR Error Detection on Historical Text Using Uni-Feature and Multi-Feature Based Machine Learning Models}, abstract = {Detecting errors that are caused by Optical Character Recognition (OCR) systems is a challenging task that has received much attention over the years. Recent work has explored machine learning methods using hand-crafted feature engineering, which, in addition to the difficulty in identifying the best feature combinations, is often very time and resources expensive. This raises the question: Do we always need many features to achieve better results? This is an open-ended question and its answer might depend on the task at hand. For OCR error detection, we experimented and found that interestingly a uni-feature based system conquered multi-feature based systems on a Swedish data set achieving state-of-the art results, and performed equally well on an English dataset. We also experimented to find which machine learning algorithm is more suitable for the task at hand by comparing the performance of five well-known machine learning algorithms, namely Logistic regression, Decision Trees, Bernoulli Naive Bayes, Naive Bayes, and Support Vector Machines. 
}, booktitle = {Swedish Language Technology Conference (SLTC), 25-27 November 2020, University of Gothenburg}, author = {Dannélls, Dana and Virk, Shafqat}, year = {2020}, }

% Conference abstract. Stray whitespace inside field values has been trimmed,
% and the acronym and language name in the title are brace-protected.
@inProceedings{dannells-etal-2020-evaluation-296165,
  title     = {Evaluation of a {Two-OCR} Engine Method: First Results on Digitized {Swedish} Newspapers Spanning over nearly 200 Years},
  abstract  = {In this paper we present a two-OCR engine method that was developed at Kungliga biblioteket (KB), the National Library of Sweden, for improving the correctness of the OCR for mass digitization of Swedish newspapers. We report the first quantitative evaluation results on a material spanning over nearly 200 years. In this first evaluation phase we experimented with word lists for different time periods. Although there was no significant overall improvement of the OCR results, the evaluation shows that some combinations of word lists are successful for certain periods and should therefore be explored further.},
  booktitle = {CLARIN Annual Conference 2020, (Virtual Event), 5-7 October, 2020. Book of Abstracts},
  author    = {Dannélls, Dana and Björk, Lars and Dirdal, Ove and Johansson, Torsten},
  year      = {2020},
}

% Conference paper. The second author is stored as family name Persson, given
% name Simon (the two parts were previously swapped). The citation key still
% reflects the old ordering and is kept so existing citations resolve.
% Editors are written in the unambiguous family-name-first form.
@inProceedings{dannells-simon-2020-supervised-289944,
  title     = {Supervised {OCR} Post-Correction of Historical {Swedish} Texts: What Role Does the {OCR} System Play?},
  abstract  = {Current approaches for post-correction of OCR errors offer solutions that are tailored to a specific OCR system. This can be problematic if the post-correction method was trained on a specific OCR system but have to be applied on the result of another system. Whereas OCR post-correction of historical text has received much attention lately, the question of what role does the OCR system play for the post-correction method has not been addressed. In this study we explore a dataset of 400 documents of historical Swedish text which has been OCR processed by three state-of-the-art OCR systems: Abbyy Finereader, Tesseract and Ocropus. We examine the OCR results of each system and present a supervised machine learning post-correction method that tries to approach the challenges exhibited by each system. We study the performance of our method by using three evaluation tools: PrimA, Språkbanken evaluation tool and Frontiers Toolkit. Based on the evaluation analysis we discuss the impact each of the OCR systems has on the results of the post-correction method. We report on quantitative and qualitative results showing varying degrees of OCR post-processing complexity that are important to consider when developing an OCR post-correction method.},
  booktitle = {Proceedings of the Digital Humanities in the Nordic Countries, 5th Conference, Riga, Latvia, October 21-23, 2020},
  editor    = {Reinsone, Sanita and Skadiņa, Inguna and Baklāne, Anda and Daugavietis, Jānis},
  author    = {Dannélls, Dana and Persson, Simon},
  year      = {2020},
  publisher = {CEUR-WS},
}

% Conference paper. The editor statement that was embedded in booktitle is held
% in the editor field, with the literal token "and others" standing for the
% remaining editors. A mis-encoded character sequence in the abstract has been
% restored to the intended letter.
@inProceedings{waldispuhl-etal-2020-material-293332,
  title     = {Material Philology Meets Digital Onomastic Lexicography: The {NordiCon} Database of Medieval {Nordic} Personal Names in Continental Sources},
  abstract  = {We present NordiCon, a database containing medieval Nordic personal names attested in Continental sources. The database combines formally interpreted and richly interlinked onomastic data with digitized versions of the medieval manuscripts from which the data originate and information on the tokens' context. The structure of NordiCon is inspired by other online historical given name dictionaries. It takes up challenges reported on in previous works, such as how to cover material properties of a name token and how to define lemmatization principles, and elaborates on possible solutions. The lemmatization principles for NordiCon are further developed in order to facilitate the connection to other name dictionaries and corpuses, and the integration of the database into Språkbanken Text, an infrastructure containing modern and historical written data.},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference, Marseille, 11–16 May 2020},
  editor    = {Calzolari, Nicoletta and others},
  author    = {Waldispühl, Michelle and Dannélls, Dana and Borin, Lars},
  year      = {2020},
  publisher = {European Language Resources Association},
  address   = {Marseille},
  ISBN      = {979-10-95546-34-4},
}