Skip to main content

BibTeX

% Journal article introducing the MultiGEC dataset for multilingual,
% text-level Grammatical Error Correction. Author names are stored with
% their diacritics as UTF-8.
@article{masciolini-etal-2025-towards-349074,
	author       = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert and Allkivi, Kais and Arhar Holdt, Špela and Auziņa, Ilze and Darģis, Roberts and Drakonaki, Elena and Frey, Jennifer-Carmen and Glišic, Isidora and Kikilintza, Pinelopi and Nicolas, Lionel and Romanyshyn, Mariana and Rosen, Alexandr and Rozovskaya, Alla and Suluste, Kristjan and Syvokon, Oleksiy and Tantos, Alexandros and Touriki, Despoina-Ourania and Tsiotskas, Konstantinos and Tsourilla, Eleni and Varsamopoulos, Vassilis and Wisniewski, Katrin and Žagar, Aleš and Zesch, Torsten},
	title        = {Towards better language representation in {Natural Language Processing}},
	journal      = {International Journal of Learner Corpus Research},
	year         = {2025},
	volume       = {11},
	number       = {2},
	pages        = {309--335},
	abstract     = {This paper introduces MultiGEC, a dataset for multilingual Grammatical Error Correction (GEC) in twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian. MultiGEC distinguishes itself from previous GEC datasets in that it covers several underrepresented languages, which we argue should be included in resources used to train models for Natural Language Processing tasks which, as GEC itself, have implications for Learner Corpus Research and Second Language Acquisition. Aside from multilingualism, the novelty of the MultiGEC dataset is that it consists of full texts - typically learner essays - rather than individual sentences, making it possible to train systems that take a broader context into account. The dataset was built for MultiGEC-2025, the first shared task in multilingual text-level GEC, but it remains accessible after its competitive phase, serving as a resource to train new error correction systems and perform cross-lingual GEC studies.},
}

% Workshop paper reviewing how second-language (L2) material is annotated
% in Universal Dependencies. Proper nouns and acronyms are brace-protected
% so sentence-casing styles keep their capitals.
@inproceedings{masciolini-etal-2025-annotating-352761,
	author       = {Masciolini, Arianna and Berdicevskis, Aleksandrs and Szawerna, Maria Irena and Volodina, Elena},
	title        = {Annotating Second Language in {Universal Dependencies}: a Review of Current Practices and Directions for Harmonized Guidelines},
	booktitle    = {Proceedings of the Eighth Workshop on {Universal Dependencies} ({UDW}, {SyntaxFest} 2025)},
	year         = {2025},
	publisher    = {Association for Computational Linguistics},
	address      = {Ljubljana, Slovenia},
	isbn         = {979-8-89176-292-3},
	abstract     = {Universal Dependencies (UD) is gaining popularity as an annotation standard for second language (L2) material. Grammatical errors and other interlanguage phenomena, however, pose significant challenges that official guidelines only address in part. In this paper, we give an overview of current annotation practices and provide some suggestions for harmonizing guidelines for learner corpora.},
}

% Technical report giving a per-language historical overview of GEC for the
% twelve MultiGEC-2025 languages. The issuing body is recorded in
% `institution`, the field this entry type requires.
@techreport{masciolini-etal-2025-overview-347102,
	author       = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert and Allkivi, Kais and Arhar Holdt, Špela and Auziņa, Ilze and Darģis, Roberts and Drakonaki, Elena and Frey, Jennifer-Carmen and Glišic, Isidora and Kikilintza, Pinelopi and Nicolas, Lionel and Romanyshyn, Mariana and Rosen, Alexandr and Rozovskaya, Alla and Suluste, Kristjan and Syvokon, Oleksiy and Tantos, Alexandros and Touriki, Despoina-Ourania and Tsiotskas, Konstantinos and Tsourilla, Eleni and Varsamopoulos, Vassilis and Wisniewski, Katrin and Žagar, Aleš and Zesch, Torsten},
	title        = {An overview of {Grammatical Error Correction} for the twelve {MultiGEC-2025} languages},
	institution  = {University of Gothenburg},
	address      = {Gothenburg, Sweden},
	year         = {2025},
	abstract     = {This overview is complementary to the comprehensive dataset description article for MultiGEC – a dataset for Multilingual Grammatical Error Correction including data for twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian.
It is well-known that in the field of Natural Language Processing (NLP) most publications tend to focus on the English language. While this is due to historical reasons (ease of publication, greater outreach, increased number of citations, etc.), it does leave other languages at a disadvantage across multiple tasks. The MultiGEC dataset was created as an attempt to counteract this effect. This report provides a historical overview of the evolution of GEC for each of the twelve languages in this dataset and provides a context for the work on the dataset and the related MultiGEC-2025 shared task.},
}

% Shared-task report for MultiGEC-2025, published in the NLP4CALL 2025
% proceedings. Editors use the "Last, First" form so the compound surname
% "Muñoz Sánchez" is parsed as one surname.
% NOTE(review): the booktitle gives the venue as Tartu; confirm against the
% proceedings front matter.
@inproceedings{masciolini-etal-2025-multigec-348546,
	author       = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert},
	title        = {The {MultiGEC-2025} Shared Task on Multilingual Grammatical Error Correction at {NLP4CALL}},
	booktitle    = {Proceedings of the 14th Workshop on Natural Language Processing for Computer Assisted Language Learning, March, 2025, Tartu, Estonia},
	editor       = {Muñoz Sánchez, Ricardo and Alfter, David and Volodina, Elena and Kallas, Jelena},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	isbn         = {978-9908-53-112-0},
	pages        = {1--33},
	abstract     = {This paper reports on MultiGEC-2025, the first shared task in text-level Multilingual Grammatical Error Correction. The shared task features twelve European languages (Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian) and is organized into two tracks, one for systems producing minimally corrected texts, thus preserving as much as possible of the original language use, and one dedicated to systems that prioritize fluency and idiomaticity. We introduce the task setup, data, evaluation metrics and baseline; present results obtained by the submitted systems and discuss key takeaways and ideas for future work.},
}

% Whole proceedings volume of the NLP4CALL 2025 workshop. The four people
% listed are recorded as editors of the volume; the abstract is the
% workshop description with line-wrap hyphenation removed.
@proceedings{munozsanchez-etal-2025-proceedings-348545,
	editor       = {Muñoz Sánchez, Ricardo and Alfter, David and Volodina, Elena and Kallas, Jelena},
	title        = {Proceedings of the 14th Workshop on Natural Language Processing for Computer Assisted Language Learning ({NLP4CALL} 2025)},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	isbn         = {978-9908-53-112-0},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on integrating Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research and the promotion of “Computational SLA” through setting up Second Language research infrastructures.
The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has inspired the name for this area of research — Intelligent CALL, ICALL for short. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. Therefore, this workshop invites a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and those where SLA theories (pedagogical practices or empirical data) and modeled using ICALL tools. The NLP4CALL workshop series is aimed at bringing together competences from these areas for sharing experiences and brainstorming around the future of the field.},
}

% Conference paper on how the granularity of PII tagsets affects automatic
% detection and labeling in Swedish learner essays. The volume editors are
% stored in `editor`, and the page range uses the BibTeX double hyphen.
@inproceedings{szawerna-etal-2025-devils-348547,
	author       = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Volodina, Elena},
	title        = {The Devil’s in the Details: the Detailedness of Classes Influences Personal Information Detection and Labeling},
	booktitle    = {Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies ({NoDaLiDa}/{Baltic-HLT} 2025), March 3--4, 2025, Tallinn, Estonia},
	editor       = {Johansson, Richard and Stymne, Sara},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	isbn         = {978-9908-53-109-0},
	pages        = {697--708},
	abstract     = {In this paper, we experiment with the effect of different levels of detailedness or granularity—understood as i) the number of classes, and ii) the classes’ semantic depth in the sense of hypernym and hyponym relations — of the annotation of Personally Identifiable Information (PII) on automatic detection and labeling of such information. We fine-tune a Swedish BERT model on a corpus of Swedish learner essays annotated with a total of six PII tagsets at varying levels of granularity. We also investigate whether the presence of grammatical and lexical correction annotation in the tokens and class prevalence have an effect on predictions. We observe that the fewer total categories there are, the better the overall results are, but having a more diverse annotation facilitates fewer misclassifications for tokens containing correction annotation. We also note that the classes’ internal diversity has an effect on labeling. We conclude from the results that while labeling based on the detailed annotation is difficult because of the number of classes, it is likely that models trained on such annotation rely more on the semantic content captured by contextual word embeddings rather than just the form of the tokens, making them more robust against nonstandard language.},
}