% Journal article introducing MultiGEC, a multilingual, text-level
% Grammatical Error Correction dataset built for the MultiGEC-2025 shared task.
% NOTE(review): volume, number, pages and doi are not recorded in this entry;
% add them once the final publication details are confirmed.
% NOTE(review): several author names appear ASCII-folded (no diacritics) while
% "Muñoz Sánchez" keeps them; verify spellings against the published paper.
@article{masciolini-etal-2025-towards-349074,
  author   = {Masciolini, Arianna
              and Caines, Andrew
              and De Clercq, Orphee
              and Kruijsbergen, Joni
              and Kurfali, Murathan
              and Muñoz Sánchez, Ricardo
              and Volodina, Elena
              and Ostling, Robert
              and Allkivi, Kais
              and Arhar Holdt, Spela
              and Auzina, Ilze
              and Dargis, Roberts
              and Drakonaki, Elena
              and Frey, Jennifer-Carmen
              and Glisic, Isidora
              and Kikilintza, Pinelopi
              and Nicolas, Lionel
              and Romanyshyn, Mariana
              and Rosen, Alexandr
              and Rozovskaya, Alla
              and Suluste, Kristjan
              and Syvokon, Oleksiy
              and Tantos, Alexandros
              and Touriki, Despoina-Ourania
              and Tsiotskas, Konstantinos
              and Tsourilla, Eleni
              and Varsamopoulos, Vassilis
              and Wisniewski, Katrin
              and Zagar, Ales
              and Zesch, Torsten},
  title    = {Towards better language representation in {Natural} {Language} {Processing}},
  journal  = {International Journal of Learner Corpus Research},
  year     = {2025},
  abstract = {This paper introduces MultiGEC, a dataset for multilingual Grammatical Error Correction (GEC) in twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian. MultiGEC distinguishes itself from previous GEC datasets in that it covers several underrepresented languages, which we argue should be included in resources used to train models for Natural Language Processing tasks which, as GEC itself, have implications for Learner Corpus Research and Second Language Acquisition. Aside from multilingualism, the novelty of the MultiGEC dataset is that it consists of full texts - typically learner essays - rather than individual sentences, making it possible to train systems that take a broader context into account. The dataset was built for MultiGEC-2025, the first shared task in multilingual text-level GEC, but it remains accessible after its competitive phase, serving as a resource to train new error correction systems and perform cross-lingual GEC studies.},
}