BibTeX

@incollection{tiedemann-etal-2025-practical-356504,
	title        = {A practical guide to the Swedish L2 lexical profile},
	abstract     = {Vocabulary is a fundamental aspect of any language since without words you cannot communicate, nor learn other aspects of a language, such as grammar or pronunciation. The Swedish L2 profile offers many ways in which researchers can explore the vocabulary which learners can produce and are expected to understand at different proficiency levels. It also provides a foundation for innovative ways of teaching Swedish, for instance, through Computer Assisted Language Learning (CALL) and Data Driven Learning (DDL).
In this chapter we show how the lexical part of SweL2P can be used to explore the vocabulary growth of language learners both receptively and productively in a step-by-step overview. Starting from a bird’s eye view of vocabulary in course books and learner essays, we show how to zoom in on some specific aspects of vocabulary, choosing adjectives as an example. We use SweL2P to show how adjectives occur in course books and how they appear in learners’ texts – comparing the lexis in both, but also showing the potential to explore the way learners acquire vocabulary more broadly. Finally, we present how results in SweL2P can be easily compared to other Swedish corpora.},
	booktitle    = {Huminfra handbook: Empowering digital and experimental humanities},
	author       = {Tiedemann, Therese Lindström and Alfter, David and Volodina, Elena},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	ISBN         = {978-9908-53-612-5},
	pages        = {355--403},
}

@inProceedings{marvinimperial-etal-2025-universalcefr-356501,
	title        = {UniversalCEFR: Enabling Open Multilingual Research on Language Proficiency Assessment},
	abstract     = {We introduce UniversalCEFR, a large-scale multilingual multidimensional dataset of texts annotated according to the CEFR (Common European Framework of Reference) scale in 13 languages. To enable open research in both automated readability and language proficiency assessment, UniversalCEFR comprises 505,807 CEFR-labeled texts curated from educational and learner-oriented resources, standardized into a unified data format to support consistent processing, analysis, and modeling across tasks and languages. To demonstrate its utility, we conduct benchmark experiments using three modelling paradigms: a) linguistic feature-based classification, b) fine-tuning pre-trained LLMs, and c) descriptor-based prompting of instruction-tuned LLMs. Our results further support using linguistic features and fine-tuning pretrained models in multilingual CEFR level assessment. Overall, UniversalCEFR aims to establish best practices in data distribution in language proficiency research by standardising dataset formats and promoting their accessibility to the global research community.},
	booktitle    = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
	author       = {Imperial, Joseph Marvin and Barayan, Abdullah and Stodden, Regina and Wilkens, Rodrigo and Muñoz Sánchez, Ricardo and Gao, Lingyun and Torgbi, Melissa and Knight, Dawn and Forey, Gail and Jablonkai, Reka R. and Kochmar, Ekaterina and Reynolds, Robert Joshua and Ribeiro, Eugénio and Saggion, Horacio and Volodina, Elena and Vajjala, Sowmya and François, Thomas and Alva-Manchego, Fernando and Tayyar Madabushi, Harish},
	year         = {2025},
	publisher    = {Association for Computational Linguistics},
	pages        = {9714--9766},
}

@inProceedings{szawerna-etal-2025-annotating-355296,
	title        = {Annotating Personal Information in Swedish Texts with SPARV},
	abstract     = {Many kinds of language data run the risk of containing personal information (PI). sparv-sbx-pi-detection is a plugin for the text annotation pipeline SPARV, which enables personal information annotation at the same time as other kinds of annotation. (Im)Personal Data is a visualisation tool which allows researchers to estimate the extent of detected personal information in a sample of their data and how much of it would be manipulated if PI were to be removed or replaced.},
	booktitle    = {CLARIN Annual Conference 2025, 30 September – 2 October 2025, Vienna, Austria},
	author       = {Szawerna, Maria Irena and Alfter, David and Volodina, Elena},
	year         = {2025},
}

@incollection{volodina-etal-2025-swell-355697,
	title        = {SweLL with pride: How to put a learner corpus to good use},
	abstract     = {Second language (L2) learner corpora are collections of language samples that demonstrate learners’ abilities to perform some learning tasks, e.g. an ability to write essays, answer reading comprehension questions, or talk on a given topic. Such corpora are necessary both for empirically based research within Second Language Acquisition (SLA) and for the development of methods for automatic processing of such data. L2 corpora are notoriously difficult to collect, and their value depends to a great degree on the representativeness and balance of the sampled data, the type of associated metadata and the reliability of manual annotations.
In this chapter we thoroughly describe the SweLL-gold corpus of L2 Swedish, its annotation, statistics and metadata, and showcase the main types of its use: (1) in research on SLA, through detailed instructions on how to perform corpus searches given the SweLL-specific annotation, combined with guidelines for the usage of SVALA, a tool for correction annotation; and (2) in NLP research on problems such as grammatical error correction, through guidelines on how to use the different available file formats that the SweLL-gold corpus is released in. Both cases are further supported by case studies and, where available, relevant scripts ready for reuse by researchers.},
	booktitle    = {Huminfra handbook: Empowering digital and experimental humanities},
	author       = {Volodina, Elena and Masciolini, Arianna and Megyesi, Beáta and Prentice, Julia and Rudebeck, Lisa and Sundberg, Gunlög and Wirén, Mats},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	ISBN         = {978-9908-53-612-5},
}

@book{bouma-etal-2025-huminfra-356211,
	title        = {Huminfra handbook: Empowering digital and experimental humanities},
	editor       = {Bouma, Gerlof and Dannélls, Dana and Kokkinakis, Dimitrios and Volodina, Elena},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu},
	ISBN         = {978-9908-53-612-5},
}

@inProceedings{szawerna-etal-2025-annotating-355832,
	title        = {Annotating Personal Information in Swedish Texts with SPARV},
	booktitle    = {Proceedings of the First Workshop on Natural Language Processing and Language Models for Digital Humanities},
	author       = {Szawerna, Maria Irena and Alfter, David and Volodina, Elena},
	year         = {2025},
	pages        = {155--163},
}

@inProceedings{masciolini-etal-2025-annotating-352761,
	title        = {Annotating Second Language in Universal Dependencies: a Review of Current Practices and Directions for Harmonized Guidelines},
	abstract     = {Universal Dependencies (UD) is gaining popularity as an annotation standard for second language (L2) material. Grammatical errors and other interlanguage phenomena, however, pose significant challenges that official guidelines only address in part. In this paper, we give an overview of current annotation practices and provide some suggestions for harmonizing guidelines for learner corpora.},
	booktitle    = {Proceedings of the Eighth Workshop on Universal Dependencies (UDW, SyntaxFest 2025)},
	author       = {Masciolini, Arianna and Berdicevskis, Aleksandrs and Szawerna, Maria Irena and Volodina, Elena},
	year         = {2025},
	publisher    = {Association for Computational Linguistics},
	address      = {Ljubljana, Slovenia},
	ISBN         = {979-8-89176-292-3},
	pages        = {153--163},
}

@inProceedings{volodina-etal-2025-towards-355295,
	title        = {Towards shared standards for pseudonymization of research data},
	abstract     = {Pseudonymization has attracted a lot of attention recently due to legislation (e.g. the GDPR), the European Guidelines on Pseudonymization, the increased need for high-quality ethical data for the training of large language models, as well as the desire to be able to share data with other researchers. This article introduces key concepts in pseudonymization, summarizes the halfway findings of the intradisciplinary research environment Mormor Karl, and proposes ways to unify and standardize the field of pseudonymization.},
	booktitle    = {Proceedings of the Huminfra Conference (HiC 2025), Stockholm, 12-13 November 2025},
	author       = {Volodina, Elena and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Szawerna, Maria Irena and Södergård, Lisa and Vu, Xuan-Son},
	year         = {2025},
}

@article{masciolini-etal-2025-towards-349074,
	title        = {Towards better language representation in Natural Language Processing},
	abstract     = {This paper introduces MultiGEC, a dataset for multilingual Grammatical Error Correction (GEC) in twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian. MultiGEC distinguishes itself from previous GEC datasets in that it covers several underrepresented languages, which we argue should be included in resources used to train models for Natural Language Processing tasks which, as GEC itself, have implications for Learner Corpus Research and Second Language Acquisition. Aside from multilingualism, the novelty of the MultiGEC dataset is that it consists of full texts - typically learner essays - rather than individual sentences, making it possible to train systems that take a broader context into account. The dataset was built for MultiGEC-2025, the first shared task in multilingual text-level GEC, but it remains accessible after its competitive phase, serving as a resource to train new error correction systems and perform cross-lingual GEC studies.},
	journal      = {International Journal of Learner Corpus Research},
	author       = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert and Allkivi, Kais and Arhar Holdt, Špela and Auziņa, Ilze and Darģis, Roberts and Drakonaki, Elena and Frey, Jennifer-Carmen and Glišic, Isidora and Kikilintza, Pinelopi and Nicolas, Lionel and Romanyshyn, Mariana and Rosen, Alexandr and Rozovskaya, Alla and Suluste, Kristjan and Syvokon, Oleksiy and Tantos, Alexandros and Touriki, Despoina-Ourania and Tsiotskas, Konstantinos and Tsourilla, Eleni and Varsamopoulos, Vassilis and Wisniewski, Katrin and Žagar, Aleš and Zesch, Torsten},
	year         = {2025},
	volume       = {11},
	number       = {2},
	pages        = {309--335},
}

@techreport{masciolini-etal-2025-overview-347102,
	title        = {An overview of Grammatical Error Correction for the twelve MultiGEC-2025 languages},
	abstract     = {This overview is complementary to the comprehensive dataset description article for MultiGEC – a dataset for Multilingual Grammatical Error Correction including data for twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian.
It is well-known that in the field of Natural Language Processing (NLP) most publications tend to focus on the English language. While this is due to historical reasons (ease of publication, greater outreach, increased number of citations, etc.), it does leave other languages at a disadvantage across multiple tasks. The MultiGEC dataset was created as an attempt to counteract this effect. This report provides a historical overview of the evolution of GEC for each of the twelve languages in this dataset and provides a context for the work on the dataset and the related MultiGEC-2025 shared task.},
	author       = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert and Allkivi, Kais and Arhar Holdt, Špela and Auziņa, Ilze and Darģis, Roberts and Drakonaki, Elena and Frey, Jennifer-Carmen and Glišic, Isidora and Kikilintza, Pinelopi and Nicolas, Lionel and Romanyshyn, Mariana and Rosen, Alexandr and Rozovskaya, Alla and Suluste, Kristjan and Syvokon, Oleksiy and Tantos, Alexandros and Touriki, Despoina-Ourania and Tsiotskas, Konstantinos and Tsourilla, Eleni and Varsamopoulos, Vassilis and Wisniewski, Katrin and Žagar, Aleš and Zesch, Torsten},
	year         = {2025},
	publisher    = {University of Gothenburg},
	address      = {Gothenburg, Sweden},
}

@inProceedings{masciolini-etal-2025-multigec-348546,
	title        = {The MultiGEC-2025 Shared Task on Multilingual Grammatical Error Correction at NLP4CALL},
	abstract     = {This paper reports on MultiGEC-2025, the first shared task in text-level Multilingual Grammatical Error Correction. The shared task features twelve European languages (Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian) and is organized into two tracks, one for systems producing minimally corrected texts, thus preserving as much as possible of the original language use, and one dedicated to systems that prioritize fluency and idiomaticity. We introduce the task setup, data, evaluation metrics and baseline; present results obtained by the submitted systems and discuss key takeaways and ideas for future work.},
	booktitle    = {Proceedings of the 14th Workshop on Natural Language Processing for Computer Assisted Language Learning, March, 2025, Tartu, Estonia},
	editor       = {Muñoz Sánchez, Ricardo and Alfter, David and Volodina, Elena and Kallas, Jelena},
	author       = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	ISBN         = {978-9908-53-112-0},
	pages        = {1--33},
}

@misc{munozsanchez-etal-2025-proceedings-348545,
	title        = {Proceedings of the 14th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2025)},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on integrating Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research and the promotion of “Computational SLA” through setting up Second Language research infrastructures.
The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has inspired the name for this area of research: Intelligent CALL, ICALL for short. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. Therefore, this workshop invites a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and those where SLA theories (pedagogical practices or empirical data) are modeled using ICALL tools. The NLP4CALL workshop series is aimed at bringing together competences from these areas for sharing experiences and brainstorming around the future of the field.},
	author       = {Muñoz Sánchez, Ricardo and Alfter, David and Volodina, Elena and Kallas, Jelena},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	ISBN         = {978-9908-53-112-0},
}

@inProceedings{szawerna-etal-2025-devils-348547,
	title        = {The Devil’s in the Details: the Detailedness of Classes Influences Personal Information Detection and Labeling},
	abstract     = {In this paper, we experiment with the effect of different levels of detailedness or granularity – understood as i) the number of classes, and ii) the classes’ semantic depth in the sense of hypernym and hyponym relations – of the annotation of Personally Identifiable Information (PII) on automatic detection and labeling of such information. We fine-tune a Swedish BERT model on a corpus of Swedish learner essays annotated with a total of six PII tagsets at varying levels of granularity. We also investigate whether the presence of grammatical and lexical correction annotation in the tokens and class prevalence have an effect on predictions. We observe that the fewer total categories there are, the better the overall results are, but a more diverse annotation leads to fewer misclassifications for tokens containing correction annotation. We also note that the classes’ internal diversity has an effect on labeling. We conclude from the results that while labeling based on the detailed annotation is difficult because of the number of classes, it is likely that models trained on such annotation rely more on the semantic content captured by contextual word embeddings than on just the form of the tokens, making them more robust against nonstandard language.},
	booktitle    = {Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025), March 3–4, 2025, Tallinn, Estonia / Richard Johansson and Sara Stymne (eds.)},
	author       = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Volodina, Elena},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	ISBN         = {978-9908-53-109-0},
	pages        = {697--708},
}