% Publication list, UTF-8 encoded.
% Layout: one entry per record, one field per line, lowercase entry types and field names.
% Page ranges and date ranges are written with a double hyphen.
% Whole proceedings volumes are typed as proceedings and carry their editors in the editor field.
% Acronyms and proper nouns in titles are braced so that sentence-casing styles keep their capitals.

@inproceedings{szawerna-etal-2025-devils-348547,
  author    = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Vu, Xuan-Son and Volodina, Elena},
  title     = {The Devil’s in the Details: the Detailedness of Classes Influences Personal Information Detection and Labeling},
  booktitle = {Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025), March 3--4, 2025, Tallinn, Estonia},
  editor    = {Johansson, Richard and Stymne, Sara},
  year      = {2025},
  publisher = {University of Tartu Library},
  address   = {Tartu, Estonia},
  isbn      = {978-9908-53-109-0},
  pages     = {697--708},
  abstract  = {In this paper, we experiment with the effect of different levels of detailedness or granularity—understood as i) the number of classes, and ii) the classes’ semantic depth in the sense of hypernym and hyponym relations — of the annotation of Personally Identifiable Information (PII) on automatic detection and labeling of such information. We fine-tune a Swedish BERT model on a corpus of Swedish learner essays annotated with a total of six PII tagsets at varying levels of granularity. We also investigate whether the presence of grammatical and lexical correction annotation in the tokens and class prevalence have an effect on predictions. We observe that the fewer total categories there are, the better the overall results are, but having a more diverse annotation facilitates fewer misclassifications for tokens containing correction annotation. We also note that the classes’ internal diversity has an effect on labeling. We conclude from the results that while labeling based on the detailed annotation is difficult because of the number of classes, it is likely that models trained on such annotation rely more on the semantic content captured by contextual word embeddings rather than just the form of the tokens, making them more robust against nonstandard language.},
}

@proceedings{munozsanchez-etal-2025-proceedings-348545,
  editor    = {Muñoz Sánchez, Ricardo and Alfter, David and Volodina, Elena and Kallas, Jelena},
  title     = {Proceedings of the 14th Workshop on Natural Language Processing for Computer Assisted Language Learning ({NLP4CALL} 2025)},
  year      = {2025},
  publisher = {University of Tartu Library},
  address   = {Tartu, Estonia},
  isbn      = {978-9908-53-112-0},
  abstract  = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on integrating Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research and the promotion of “Computational SLA” through setting up Second Language research infrastructures. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has inspired the name for this area of research — Intelligent CALL, ICALL for short. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. Therefore, this workshop invites a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and those where SLA theories (pedagogical practices or empirical data) are modeled using ICALL tools. The NLP4CALL workshop series is aimed at bringing together competences from these areas for sharing experiences and brainstorming around the future of the field.},
}

@inproceedings{masciolini-etal-2025-multigec-348546,
  author    = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert},
  title     = {The {MultiGEC-2025} Shared Task on Multilingual Grammatical Error Correction at {NLP4CALL}},
  booktitle = {Proceedings of the 14th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2025)},
  year      = {2025},
  publisher = {University of Tartu Library},
  address   = {Tartu, Estonia},
  isbn      = {978-9908-53-112-0},
  abstract  = {This paper reports on MultiGEC-2025, the first shared task in text-level Multilingual Grammatical Error Correction. The shared task features twelve European languages (Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian) and is organized into two tracks, one for systems producing minimally corrected texts, thus preserving as much as possible of the original language use, and one dedicated to systems that prioritize fluency and idiomaticity. We introduce the task setup, data, evaluation metrics and baseline; present results obtained by the submitted systems and discuss key takeaways and ideas for future work.},
}

@inproceedings{szawerna-etal-2024-pseudonymization-338089,
  author    = {Szawerna, Maria Irena and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Vu, Xuan-Son and Volodina, Elena},
  title     = {Pseudonymization Categories across Domain Boundaries},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), 20--25 May, 2024, Torino, Italia},
  year      = {2024},
  publisher = {ELRA and ICCL},
  isbn      = {978-2-493814-10-4},
  pages     = {13303--13314},
  abstract  = {Linguistic data, a component critical not only for research in a variety of fields but also for the development of various Natural Language Processing (NLP) applications, can contain personal information. As a result, its accessibility is limited, both from a legal and an ethical standpoint. One of the solutions is the pseudonymization of the data. Key stages of this process include the identification of sensitive elements and the generation of suitable surrogates in a way that the data is still useful for the intended task. Within this paper, we conduct an analysis of tagsets that have previously been utilized in anonymization and pseudonymization. We also investigate what kinds of Personally Identifiable Information (PII) appear in various domains. These reveal that none of the analyzed tagsets account for all of the PII types present cross-domain at the level of detailedness seemingly required for pseudonymization. We advocate for a universal system of tags for categorizing PIIs leading up to their replacement. Such categorization could facilitate the generation of grammatically, semantically, and sociolinguistically appropriate surrogates for the kinds of information that are considered sensitive in a given domain, resulting in a system that would enable dynamic pseudonymization while keeping the texts readable and useful for future research in various fields.},
}

@techreport{masciolini-etal-2025-overview-347102,
  author      = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert and Allkivi, Kais and Arhar Holdt, Špela and Auziņa, Ilze and Darģis, Roberts and Drakonaki, Elena and Frey, Jennifer-Carmen and Glišic, Isidora and Kikilintza, Pinelopi and Nicolas, Lionel and Romanyshyn, Mariana and Rosen, Alexandr and Rozovskaya, Alla and Suluste, Kristjan and Syvokon, Oleksiy and Tantos, Alexandros and Touriki, Despoina-Ourania and Tsiotskas, Konstantinos and Tsourilla, Eleni and Varsamopoulos, Vassilis and Wisniewski, Katrin and Žagar, Aleš and Zesch, Torsten},
  title       = {An overview of Grammatical Error Correction for the twelve {MultiGEC-2025} languages},
  institution = {University of Gothenburg},
  address     = {Gothenburg, Sweden},
  year        = {2025},
  abstract    = {This overview is complementary to the comprehensive dataset description article for MultiGEC – a dataset for Multilingual Grammatical Error Correction including data for twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian. It is well-known that in the field of Natural Language Processing (NLP) most publications tend to focus on the English language. While this is due to historical reasons (ease of publication, greater outreach, increased number of citations, etc.), it does leave other languages at a disadvantage across multiple tasks. The MultiGEC dataset was created as an attempt to counteract this effect. This report provides a historical overview of the evolution of GEC for each of the twelve languages in this dataset and provides a context for the work on the dataset and the related MultiGEC-2025 shared task.},
}

@inproceedings{goldfarb-tarrant-etal-2021-intrinsic-312616,
  author    = {Goldfarb-Tarrant, Seraphina and Marchant, Rebecca and Muñoz Sánchez, Ricardo and Pandya, Mugdha and Lopez, Adam},
  title     = {Intrinsic Bias Metrics Do Not Correlate with Application Bias},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), 1--6 August 2021, Online},
  year      = {2021},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-954085-52-7},
  abstract  = {Natural Language Processing (NLP) systems learn harmful societal biases that cause them to amplify inequality as they are deployed in more and more situations. To guide efforts at debiasing these systems, the NLP community relies on a variety of metrics that quantify bias in models. Some of these metrics are intrinsic, measuring bias in word embedding spaces, and some are extrinsic, measuring bias in downstream tasks that the word embeddings enable. Do these intrinsic and extrinsic metrics correlate with each other? We compare intrinsic and extrinsic metrics across hundreds of trained models covering different tasks and experimental conditions. Our results show no reliable correlation between these metrics that holds in all scenarios across tasks and languages. We urge researchers working on debiasing to focus on extrinsic measures of bias, and to make using these measures more feasible via creation of new challenge sets and annotated test data. To aid this effort, we release code, a new intrinsic metric, and an annotated test set focused on gender bias in hate speech.},
}

@inproceedings{munozsanchez-etal-2022-first-320225,
  author    = {Muñoz Sánchez, Ricardo and Johansson, Eric and Tayefeh, Shakila and Kad, Shreyash},
  title     = {A First Attempt at Unreliable News Detection in {Swedish}},
  booktitle = {Proceedings of the Second International Workshop on Resources and Techniques for User Information in Abusive Language Analysis, Marseille, 20--25 June, 2022},
  editor    = {Monti, Johanna and Basile, Valerio and Di Buono, Maria Pia and Manna, Raffaele and Pascucci, Antonio and Tonelli, Sara},
  year      = {2022},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Paris},
  isbn      = {979-10-95546-99-3},
  pages     = {1--7},
  abstract  = {Throughout the COVID-19 pandemic, a parallel infodemic has also been going on such that the information has been spreading faster than the virus itself. During this time, every individual needs to access accurate news in order to take corresponding protective measures, regardless of their country of origin or the language they speak, as misinformation can cause significant loss to not only individuals but also society. In this paper we train several machine learning models (ranging from traditional machine learning to deep learning) to try to determine whether news articles come from either a reliable or an unreliable source, using just the body of the article. Moreover, we use a previously introduced corpus of news in Swedish related to the COVID-19 pandemic for the classification task. Given that our dataset is both unbalanced and small, we use subsampling and easy data augmentation (EDA) to try to solve these issues. In the end, we realize that, due to the small size of our dataset, using traditional machine learning along with data augmentation yields results that rival those of transformer models such as BERT.},
}

@inproceedings{szawerna-etal-2024-swedish-346227,
  author    = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Volodina, Elena},
  title     = {{Swedish} Learner Essays Revisited: Further Insights into Detecting Personal Information},
  booktitle = {The Tenth Swedish Language Technology Conference (SLTC), 27--29 November, 2024, Linköping, Sweden},
  year      = {2024},
  abstract  = {Personally Identifiable Information (PII) is pervasive in linguistic data, making open sharing thereof complicated from both the legal and ethical perspective. Simply redacting out the PIIs or replacing them with pseudonyms presupposes a detection step, where the personal information is identified. In this study, we expand the existing research on PII detection in unstructured data (learner essays) in Swedish, testing more Large Language Models (LLMs) on a larger amount of data. We compare three different LLMs, two Swedish (KB-BERT and AI Sweden’s RoBERTa) and one multilingual (M-BERT). We found that KB-BERT tends to be better than the other models but that there is some overlap in their performance.},
}

@inproceedings{munozsanchez-etal-2024-name-339981,
  author    = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Lindström Tiedemann, Therese and Szawerna, Maria Irena and Volodina, Elena},
  title     = {Name Biases in Automated Essay Assessment},
  booktitle = {The 28th International Congress of Onomastic Sciences (ICOS 28), 19--23 August, 2024, Helsinki, Finland},
  year      = {2024},
  abstract  = {Artificial intelligence is being deployed in high-stakes situations, such as automated grading of second language essays in proficiency assessment. While they can improve the opportunities students have (education, work opportunities, etc.), such systems often display human-like biases. Aldrin (2017) notes that human graders have a slight bias based on names appearing in essay texts. We aim to identify whether the same pattern holds in automated systems. In this study we aim to answer the following research questions: 1) Does changing given names inside a second language learner essay affect the way the text is graded? 2) How much does this differ between feature-based machine learning and deep learning? For this, we use a de-anonymized (i.e. original) version of the SweLL-pilot corpus of second language Swedish learner essays (Volodina 2016), which consists of 502 essays annotated with CEFR levels as our source data. First, we compile four lists of given names inspired by those of Aldrin (2017): traditional Swedish names; modern Swedish names of Anglo-American origin; Finnish names (due to the close sociocultural links between both countries); and names of Arabic origin (the most prominent group of learners in the corpus). Second, we create a diagnostic dataset to identify biases in the classification task. We select SweLL-pilot essays in which a given name appears only once. Then, we generate an essay version for each name on the lists by substituting the name in the original text with one from the list. Third, we fine-tune a BERT (Devlin et al. 2019) model on the original SweLL-pilot data to predict the CEFR level of a given essay and compare it to an existing feature-based model (Pilan 2016). Finally, we test the two models and compare the equality of opportunity between the different given name groups on the diagnostic dataset.},
}

@inproceedings{szawerna-etal-2024-detecting-336385,
  author    = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Lindström Tiedemann, Therese and Volodina, Elena},
  title     = {Detecting Personal Identifiable Information in {Swedish} Learner Essays},
  booktitle = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, St. Julian’s, Malta},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  isbn      = {979-8-89176-085-1},
  abstract  = {Linguistic data can — and often does — contain PII (Personal Identifiable Information). Both from a legal and ethical standpoint, the sharing of such data is not permissible. According to the GDPR, pseudonymization, i.e. the replacement of sensitive information with surrogates, is an acceptable strategy for privacy preservation. While research has been conducted on the detection and replacement of sensitive data in Swedish medical data using Large Language Models (LLMs), it is unclear whether these models handle PII in less structured and more thematically varied texts equally well. In this paper, we present and discuss the performance of an LLM-based PII-detection system for Swedish learner essays.},
}

@inproceedings{munozsanchez-etal-2024-jingle-342259,
  author    = {Muñoz Sánchez, Ricardo and Alfter, David and Dobnik, Simon and Szawerna, Maria Irena and Volodina, Elena},
  title     = {Jingle {BERT}, Jingle {BERT}, Frozen All the Way: Freezing Layers to Identify {CEFR} Levels of Second Language Learners Using {BERT}},
  booktitle = {Proceedings of the 13th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2024)},
  year      = {2024},
  publisher = {Linköping Electronic Conference Proceedings},
  isbn      = {978-91-8075-774-4},
  abstract  = {In this paper, we investigate the question of how much domain adaptation is needed for the task of automatic essay assessment by freezing layers in BERT models. We test our methodology on three different graded language corpora (English, French and Swedish) and find that partially fine-tuning base models improves performance over fully fine-tuning base models, although the number of layers to freeze differs by language. We also look at the effect of freezing layers on different grades in the corpora and find that different layers are important for different grade levels. Finally, our results represent a new state-of-the-art in automatic essay classification for the three languages under investigation.},
}

@inproceedings{munozsanchez-etal-2024-names-336384,
  author    = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Szawerna, Maria Irena and Lindström Tiedemann, Therese and Volodina, Elena},
  title     = {Did the Names {I} Used within My Essay Affect My Score? Diagnosing Name Biases in Automated Essay Scoring},
  booktitle = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, St. Julian’s, Malta},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  isbn      = {979-8-89176-085-1},
  abstract  = {Automated essay scoring (AES) of second-language learner essays is a high-stakes task as it can affect the job and educational opportunities a student may have access to. Thus, it becomes imperative to make sure that the essays are graded based on the students’ language proficiency as opposed to other reasons, such as personal names used in the text of the essay. Moreover, most of the research data for AES tends to contain personal identifiable information. Because of that, pseudonymization becomes an important tool to make sure that this data can be freely shared. Thus, our systems should not grade students based on which given names were used in the text of the essay, both for fairness and for privacy reasons. In this paper we explore how given names affect the CEFR level classification of essays of second language learners of Swedish. We use essays containing just one personal name and substitute it for names from lists of given names from four different ethnic origins, namely Swedish, Finnish, Anglo-American, and Arabic. We find that changing the names within the essays has no apparent effect on the classification task, regardless of whether a feature-based or a transformer-based model is used.},
}

@inproceedings{munozsanchez-2024-when-341073,
  author    = {Muñoz Sánchez, Ricardo},
  title     = {When Hieroglyphs Meet Technology: A Linguistic Journey through {Ancient} {Egypt} Using Natural Language Processing},
  booktitle = {3rd Workshop on Language Technologies for Historical and Ancient Languages, LT4HALA 2024 at LREC-COLING 2024 - Workshop Proceedings, 25 May, 2024, Torino, Italia},
  year      = {2024},
  publisher = {ELRA Language Resources Association},
  isbn      = {978-2-493814-46-3},
  abstract  = {Knowing our past can help us better understand our future. The explosive development of NLP in these past few decades has allowed us to study ancient languages and cultures in ways that we couldn’t have done in the past. However, not all languages have received the same level of attention. Despite its popularity in pop culture, the languages spoken in Ancient Egypt have been somewhat overlooked in terms of NLP research. In this survey paper we give an overview of how NLP has been used to study different variations of the Ancient Egyptian languages. This not only includes Old, Middle, and Late Egyptian but also Demotic and Coptic. We begin by giving a short introduction to these languages and their writing systems, before talking about the corpora and lexical resources that are available digitally. We then show the different NLP tasks that have been tackled for different variations of Ancient Egyptian, as well as the approaches that have been used. We hope that our work can stoke interest in the study of these languages within the NLP community.},
}

@inproceedings{munozsanchez-etal-2024-harnessing-342122,
  author    = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Volodina, Elena},
  title     = {Harnessing {GPT} to Study Second Language Learner Essays: Can We Use Perplexity to Determine Linguistic Competence?},
  booktitle = {Proceedings of the 19th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2024), June 20, 2024, Mexico City, Mexico},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  address   = {Mexico City, Mexico},
  isbn      = {979-8-89176-100-1},
  abstract  = {Generative language models have been used to study a wide variety of phenomena in NLP. This allows us to better understand the linguistic capabilities of those models and to better analyse the texts that we are working with. However, these studies have mainly focused on text generated by L1 speakers of English. In this paper we study whether linguistic competence of L2 learners of Swedish (through their performance on essay tasks) correlates with the perplexity of a decoder-only model (GPT-SW3). We run two sets of experiments, doing both quantitative and qualitative analyses for each of them. In the first one, we analyse the perplexities of the essays and compare them with the CEFR level of the essays, both from an essay-wide level and from a token level. In our second experiment, we compare the perplexity of an L2 learner essay with a normalised version of it. We find that the perplexity of essays tends to be lower for higher CEFR levels and that normalised essays have a lower perplexity than the original versions. Moreover, we find that different factors can lead to spikes in perplexity, not all of them being related to L2 learner language.},
}

@proceedings{volodina-etal-2024-proceedings-336386,
  editor    = {Volodina, Elena and Alfter, David and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Szawerna, Maria Irena and Vu, Xuan-Son},
  title     = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization ({CALD-pseudo} 2024), March 21, 2024, {Malta}},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {979-8-89176-085-1},
}

@inproceedings{kokkinakis-etal-2023-investigating-325628,
  author    = {Kokkinakis, Dimitrios and Muñoz Sánchez, Ricardo and Bruinsma, Sebastianus C. J. and Hammarlin, Mia-Marie},
  title     = {Investigating the Effects of {MWE} Identification in Structural Topic Modelling},
  booktitle = {The 19th Workshop on Multiword Expressions (MWE 2023)},
  year      = {2023},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-959429-59-3},
  abstract  = {Multiword expressions (MWEs) are common word combinations which exhibit idiosyncrasies in various linguistic levels. For various downstream natural language processing applications and tasks, the identification and discovery of MWEs has been proven to be potentially practical and useful, but still challenging to codify. In this paper we investigate various, relevant to MWE, resources and tools for Swedish, and, within a specific application scenario, we apply structural topic modelling to investigate whether there are any interpretative advantages of identifying MWEs.},
}

@inproceedings{kokkinakis-etal-2023-scaling-326698,
  author    = {Kokkinakis, Dimitrios and Muñoz Sánchez, Ricardo and Hammarlin, Mia-Marie},
  title     = {Scaling-up the Resources for a Freely Available {Swedish} {VADER} ({svVADER})},
  booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)},
  year      = {2023},
  abstract  = {With widespread commercial applications in various domains, sentiment analysis has become a success story for Natural Language Processing (NLP). Still, although sentiment analysis has rapidly progressed during the last years, mainly due to the application of modern AI technologies, many approaches apply knowledge-based strategies, such as lexicon-based, to the task. This is particularly true for analyzing short social media content, e.g., tweets. Moreover, lexicon-based sentiment analysis approaches are usually preferred over learning-based methods when training data is unavailable or insufficient. Therefore, our main goal is to scale-up and apply a lexicon-based approach which can be used as a baseline to Swedish sentiment analysis. All scaled-up resources are made available, while the performance of this enhanced tool is evaluated on two short datasets, achieving adequate results.},
}