% BibTeX export of publication records.

% Chapter in the Huminfra handbook. Braced title words keep their capitals under sentence-casing styles.
% ISBN uses the 978-9908-53 publisher grouping that this file already shows for University of Tartu Library.
@incollection{masciolini-etal-2025-exploring-356750,
	title        = {Exploring parallel corpora with {STUnD}: A Search Tool for {Universal Dependencies}},
	booktitle    = {Huminfra handbook: Empowering digital and experimental humanities},
	author       = {Masciolini, Arianna and Lange, Herbert and Tóth, Márton András},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	ISBN         = {978-9908-53-612-5},
	pages        = {455--503},
}

% Chapter in the Huminfra handbook. The corpus name in the title is braced so styles cannot lowercase it.
% NOTE(review): no page range is recorded for this chapter; add one when known.
@incollection{volodina-etal-2025-swell-355697,
	title        = {{SweLL} with pride: How to put a learner corpus to good use},
	abstract     = {Second language (L2) learner corpora are collections of language samples that demonstrate learners’ abilities to perform some learning tasks, e.g. an ability to write essays, answer to reading comprehension questions, or talk on a given topic. Such corpora are necessary for both empirical-based research within Second Language Acquisition (SLA), and for development of methods for automatic processing of such data. L2 corpora are notoriously difficult to collect, and their value depends to a greater degree on the representativeness and balance of the sampled data, type of associated metadata and reliability of manual annotations.
In this chapter we thoroughly describe the SweLL-gold corpus of L2 Swedish, its annotation, statistics and metadata, and showcase main types of its use, such as (1) in research on SLA through detailed instructions on how to perform corpus searches given SweLL-specific annotation, combined with guidelines for SVALA usage, a tool for correction annotation; and (2) in NLP research on problems such as grammatical error correction through guidelines on how to use the different available file formats that the SweLL-gold corpus is released in. Both cases are further supported by case studies and, where available, relevant scripts ready for reuse by researchers.},
	booktitle    = {Huminfra handbook: Empowering digital and experimental humanities},
	author       = {Volodina, Elena and Masciolini, Arianna and Megyesi, Beáta and Prentice, Julia and Rudebeck, Lisa and Sundberg, Gunlög and Wirén, Mats},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	ISBN         = {978-9908-53-612-5},
}

% Workshop paper. The framework name in the title is braced so sentence-casing styles keep its capitals.
% NOTE(review): address holds a city and country; confirm whether it is the publisher location or the workshop venue.
@inproceedings{masciolini-etal-2025-annotating-352761,
	title        = {Annotating Second Language in {Universal Dependencies}: a Review of Current Practices and Directions for Harmonized Guidelines},
	abstract     = {Universal Dependencies (UD) is gaining popularity as an annotation standard for second language (L2) material. Grammatical errors and other interlanguage phenomena, however, pose significant challenges that official guidelines only address in part. In this paper, we give an overview of current annotation practices and provide some suggestions for harmonizing guidelines for learner corpora.},
	booktitle    = {Proceedings of the Eighth Workshop on Universal Dependencies (UDW, SyntaxFest 2025)},
	author       = {Masciolini, Arianna and Berdicevskis, Aleksandrs and Szawerna, Maria Irena and Volodina, Elena},
	year         = {2025},
	publisher    = {Association for Computational Linguistics},
	address      = {Ljubljana, Slovenia},
	ISBN         = {979-8-89176-292-3},
	pages        = {153--163},
}

% Journal article. The journal name is stored in normal capitalisation; styles decide any presentation casing.
% Author names carry their diacritics, written directly as UTF-8 like the rest of this file.
% NOTE(review): the abstract describes the MultiGEC dataset, so the title may be missing a subtitle; confirm against the journal.
@article{masciolini-etal-2025-towards-349074,
	title        = {Towards better language representation in {Natural Language Processing}},
	abstract     = {This paper introduces MultiGEC, a dataset for multilingual Grammatical Error Correction (GEC) in twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian. MultiGEC distinguishes itself from previous GEC datasets in that it covers several underrepresented languages, which we argue should be included in resources used to train models for Natural Language Processing tasks which, as GEC itself, have implications for Learner Corpus Research and Second Language Acquisition. Aside from multilingualism, the novelty of the MultiGEC dataset is that it consists of full texts - typically learner essays - rather than individual sentences, making it possible to train systems that take a broader context into account. The dataset was built for MultiGEC-2025, the first shared task in multilingual text-level GEC, but it remains accessible after its competitive phase, serving as a resource to train new error correction systems and perform cross-lingual GEC studies.},
	journal      = {International Journal of Learner Corpus Research},
	author       = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert and Allkivi, Kais and Arhar Holdt, Špela and Auziņa, Ilze and Darģis, Roberts and Drakonaki, Elena and Frey, Jennifer-Carmen and Glišić, Isidora and Kikilintza, Pinelopi and Nicolas, Lionel and Romanyshyn, Mariana and Rosen, Alexandr and Rozovskaya, Alla and Suluste, Kristjan and Syvokon, Oleksiy and Tantos, Alexandros and Touriki, Despoina-Ourania and Tsiotskas, Konstantinos and Tsourilla, Eleni and Varsamopoulos, Vassilis and Wisniewski, Katrin and Žagar, Aleš and Zesch, Torsten},
	year         = {2025},
	volume       = {11},
	number       = {2},
	pages        = {309--335},
}

% Technical report. The issuing body is given in the institution field, which this entry type requires.
% Braced title words keep their capitals under sentence-casing styles.
@techreport{masciolini-etal-2025-overview-347102,
	title        = {An overview of {Grammatical Error Correction} for the twelve {MultiGEC-2025} languages},
	abstract     = {This overview is complementary to the comprehensive dataset description article for MultiGEC – a dataset for Multilingual Grammatical Error Correction including data for twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian.
It is well-known that in the field of Natural Language Processing (NLP) most publications tend to focus on the English language. While this is due to historical reasons (ease of publication, greater outreach, increased number of citations, etc.), it does leave other languages at a disadvantage across multiple tasks. The MultiGEC dataset was created as an attempt to counteract this effect. This report provides a historical overview of the evolution of GEC for each of the twelve languages in this dataset and provides a context for the work on the dataset and the related MultiGEC-2025 shared task.},
	author       = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert and Allkivi, Kais and Arhar Holdt, Špela and Auziņa, Ilze and Darģis, Roberts and Drakonaki, Elena and Frey, Jennifer-Carmen and Glišić, Isidora and Kikilintza, Pinelopi and Nicolas, Lionel and Romanyshyn, Mariana and Rosen, Alexandr and Rozovskaya, Alla and Suluste, Kristjan and Syvokon, Oleksiy and Tantos, Alexandros and Touriki, Despoina-Ourania and Tsiotskas, Konstantinos and Tsourilla, Eleni and Varsamopoulos, Vassilis and Wisniewski, Katrin and Žagar, Aleš and Zesch, Torsten},
	year         = {2025},
	institution  = {University of Gothenburg},
	address      = {Gothenburg, Sweden},
}

% Shared-task overview paper. Editors use the "Last, First" form so the two-word surname Muñoz Sánchez parses as one surname.
% Address is city and country; acronyms in the title are braced against style recasing.
@inproceedings{masciolini-etal-2025-multigec-348546,
	title        = {The {MultiGEC-2025} Shared Task on Multilingual Grammatical Error Correction at {NLP4CALL}},
	abstract     = {This paper reports on MultiGEC-2025, the first shared task in text-level Multilingual Grammatical Error Correction. The shared task features twelve European languages (Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian) and is organized into two tracks, one for systems producing minimally corrected texts, thus preserving as much as possible of the original language use, and one dedicated to systems that prioritize fluency and idiomaticity. We introduce the task setup, data, evaluation metrics and baseline; present results obtained by the submitted systems and discuss key takeaways and ideas for future work.},
	booktitle    = {Proceedings of the 14th Workshop on Natural Language Processing for Computer Assisted Language Learning, March, 2025, Tartu, Estonia},
	editor       = {Muñoz Sánchez, Ricardo and Alfter, David and Volodina, Elena and Kallas, Jelena},
	author       = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	ISBN         = {978-9908-53-112-0},
	pages        = {1--33},
}