@inProceedings{masciolini-etal-2025-annotating-352761,
  title = {Annotating Second Language in Universal Dependencies: a Review of Current Practices and Directions for Harmonized Guidelines},
  abstract = {Universal Dependencies (UD) is gaining popularity as an annotation standard for second language (L2) material. Grammatical errors and other interlanguage phenomena, however, pose significant challenges that official guidelines only address in part. In this paper, we give an overview of current annotation practices and provide some suggestions for harmonizing guidelines for learner corpora.},
  booktitle = {Proceedings of the Eighth Workshop on Universal Dependencies (UDW, SyntaxFest 2025)},
  author = {Masciolini, Arianna and Berdicevskis, Aleksandrs and Szawerna, Maria Irena and Volodina, Elena},
  year = {2025},
  publisher = {Association for Computational Linguistics},
  address = {Ljubljana, Slovenia},
  ISBN = {979-8-89176-292-3},
}

@techreport{masciolini-etal-2025-overview-347102,
  title = {An overview of Grammatical Error Correction for the twelve MultiGEC-2025 languages},
  abstract = {This overview is complementary to the comprehensive dataset description article for MultiGEC – a dataset for Multilingual Grammatical Error Correction including data for twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian. It is well-known that in the field of Natural Language Processing (NLP) most publications tend to focus on the English language. While this is due to historical reasons (ease of publication, greater outreach, increased number of citations, etc.), it does leave other languages at a disadvantage across multiple tasks. The MultiGEC dataset was created as an attempt to counteract this effect. This report provides a historical overview of the evolution of GEC for each of the twelve languages in this dataset and provides a context for the work on the dataset and the related MultiGEC-2025 shared task.},
  author = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert and Allkivi, Kais and Arhar Holdt, Špela and Auziņa, Ilze and Darģis, Roberts and Drakonaki, Elena and Frey, Jennifer-Carmen and Glišić, Isidora and Kikilintza, Pinelopi and Nicolas, Lionel and Romanyshyn, Mariana and Rosen, Alexandr and Rozovskaya, Alla and Suluste, Kristjan and Syvokon, Oleksiy and Tantos, Alexandros and Touriki, Despoina-Ourania and Tsiotskas, Konstantinos and Tsourilla, Eleni and Varsamopoulos, Vassilis and Wisniewski, Katrin and Žagar, Aleš and Zesch, Torsten},
  year = {2025},
  publisher = {University of Gothenburg},
  address = {Gothenburg, Sweden},
}

@inProceedings{masciolini-etal-2024-synthetic-338288,
  title = {Synthetic-Error Augmented Parsing of Swedish as a Second Language: Experiments with Word Order},
  abstract = {Ungrammatical text poses significant challenges for off-the-shelf dependency parsers. In this paper, we explore the effectiveness of using synthetic data to improve performance on essays written by learners of Swedish as a second language. Due to their relevance and ease of annotation, we restrict our initial experiments to word order errors. To do that, we build a corrupted version of the standard Swedish Universal Dependencies (UD) treebank Talbanken, mimicking the error patterns and frequency distributions observed in the Swedish Learner Language (SweLL) corpus.
  We then use the MaChAmp (Massive Choice, Ample tasks) toolkit to train an array of BERT-based dependency parsers, fine-tuning on different combinations of original and corrupted data. We evaluate the resulting models not only on their respective test sets but also, most importantly, on a smaller collection of sentence-correction pairs derived from SweLL. Results show small but significant performance improvements on the target domain, with minimal decline on normative data.},
  booktitle = {Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024, May 25, 2024, Torino, Italia},
  author = {Masciolini, Arianna and Francis, Emilie and Szawerna, Maria Irena},
  year = {2024},
  publisher = {ELRA and ICCL},
  address = {Torino, Italy},
  ISBN = {978-2-493814-20-3},
}

@inProceedings{masciolini-etal-2025-multigec-348546,
  title = {The MultiGEC-2025 Shared Task on Multilingual Grammatical Error Correction at NLP4CALL},
  abstract = {This paper reports on MultiGEC-2025, the first shared task in text-level Multilingual Grammatical Error Correction. The shared task features twelve European languages (Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian) and is organized into two tracks: one for systems producing minimally corrected texts, thus preserving as much as possible of the original language use, and one dedicated to systems that prioritize fluency and idiomaticity. We introduce the task setup, data, evaluation metrics and baseline; present the results obtained by the submitted systems; and discuss key takeaways and ideas for future work.},
  booktitle = {Proceedings of the 14th Workshop on Natural Language Processing for Computer Assisted Language Learning, March 2025, Tartu, Estonia},
  editor = {Ricardo Muñoz Sánchez and David Alfter and Elena Volodina and Jelena Kallas},
  author = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert},
  year = {2025},
  publisher = {University of Tartu Library},
  address = {Tartu, Estonia},
  ISBN = {978-9908-53-112-0},
  pages = {1--33},
}

@article{masciolini-etal-2025-towards-349074,
  title = {Towards better language representation in Natural Language Processing},
  abstract = {This paper introduces MultiGEC, a dataset for multilingual Grammatical Error Correction (GEC) in twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian. MultiGEC distinguishes itself from previous GEC datasets in that it covers several underrepresented languages, which we argue should be included in resources used to train models for Natural Language Processing tasks which, like GEC itself, have implications for Learner Corpus Research and Second Language Acquisition. Aside from multilingualism, the novelty of the MultiGEC dataset is that it consists of full texts (typically learner essays) rather than individual sentences, making it possible to train systems that take a broader context into account.
  The dataset was built for MultiGEC-2025, the first shared task in multilingual text-level GEC, but it remains accessible after its competitive phase, serving as a resource to train new error correction systems and to perform cross-lingual GEC studies.},
  journal = {International Journal of Learner Corpus Research},
  author = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert and Allkivi, Kais and Arhar Holdt, Špela and Auziņa, Ilze and Darģis, Roberts and Drakonaki, Elena and Frey, Jennifer-Carmen and Glišić, Isidora and Kikilintza, Pinelopi and Nicolas, Lionel and Romanyshyn, Mariana and Rosen, Alexandr and Rozovskaya, Alla and Suluste, Kristjan and Syvokon, Oleksiy and Tantos, Alexandros and Touriki, Despoina-Ourania and Tsiotskas, Konstantinos and Tsourilla, Eleni and Varsamopoulos, Vassilis and Wisniewski, Katrin and Žagar, Aleš and Zesch, Torsten},
  year = {2025},
}

@inProceedings{masciolini-2024-bootstrapping-338425,
  title = {Bootstrapping the Annotation of UD Learner Treebanks},
  abstract = {Learner data comes in a variety of formats, making corpora difficult to compare with each other. Universal Dependencies (UD) has therefore been proposed as a replacement for the various ad-hoc annotation schemes. Nowadays, the time-consuming task of building a UD treebank often starts with a round of automatic annotation. The performance of the currently available tools trained on standard language, however, tends to decline substantially upon application to learner text. Grammatical errors play a major role, but a significant performance gap has been observed even between standard test sets and normalized learner essays. In this paper, we investigate how to best bootstrap the annotation of UD learner corpora. In particular, we want to establish whether Target Hypotheses (THs), i.e. grammar-corrected learner sentences, are suitable training data for fine-tuning a parser aimed at original (ungrammatical) L2 material. We perform experiments using English and Italian data from two of the already available UD learner corpora. Our results show manually annotated THs to be highly beneficial and suggest that even automatically parsed sentences of this kind might be helpful, if available in sufficiently large amounts.},
  booktitle = {Proceedings of the 17th Workshop on Building and Using Comparable Corpora (BUCC) @ LREC-COLING 2024, 20 May, 2024, Torino, Italia},
  author = {Masciolini, Arianna},
  year = {2024},
  publisher = {ELRA},
  ISBN = {978-2-493814-31-9},
}

@inProceedings{masciolini-toth-2024-stund-335974,
  title = {STUnD: ett Sökverktyg för Tvåspråkiga Universal Dependencies-trädbanker},
  abstract = {This article introduces STUnD, a search tool for bilingual Universal Dependencies treebanks that enables parallel syntactic queries. We demonstrate its practical use in a case study of the present perfect tense in Swedish and English. The results show that the present perfect is used to roughly the same extent in both languages, but that there is some variation, which appears to stem from language-specific conventions and translation strategies.},
  booktitle = {Proceedings of the Huminfra Conference (HiC 2024), Gothenburg, 10–11 January 2024},
  author = {Masciolini, Arianna and Tóth, Márton András},
  year = {2024},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-8075-512-2},
}

@inProceedings{masciolini-etal-2023-towards-329384,
  title = {Towards automatically extracting morphosyntactical error patterns from L1-L2 parallel dependency treebanks},
  abstract = {L1-L2 parallel dependency treebanks are UD-annotated corpora of learner sentences paired with correction hypotheses. Automatic morphosyntactical annotation has the potential to remove the need for explicit manual error tagging and improve interoperability, but makes it more challenging to locate grammatical errors in the resulting datasets. We therefore propose a novel method for automatically extracting morphosyntactical error patterns and perform a preliminary bilingual evaluation of its first implementation through a similar example retrieval task. The resulting pipeline is also available as a prototype CALL application.},
  booktitle = {Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023), July 13, 2023, Toronto, Canada},
  author = {Masciolini, Arianna and Volodina, Elena and Dannélls, Dana},
  year = {2023},
  publisher = {Association for Computational Linguistics},
  address = {Stroudsburg, PA},
  ISBN = {978-1-959429-80-7},
}

@inProceedings{masciolini-2023-query-329383,
  title = {A query engine for L1-L2 parallel dependency treebanks},
  abstract = {L1-L2 parallel dependency treebanks are learner corpora with interoperability as their main design goal. They consist of sentences produced by learners of a second language (L2) paired with native-like (L1) correction hypotheses. Rather than being explicitly labelled for errors, these are annotated following the Universal Dependencies standard. This implies relying on tree queries for error retrieval. Work in this direction is, however, limited. We present a query engine for L1-L2 treebanks and evaluate it on two corpora, one manually validated and one automatically parsed.},
  booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa), May 22-24, 2023, Tórshavn, Faroe Islands / Editors: Tanel Alumäe and Mark Fishel},
  author = {Masciolini, Arianna},
  year = {2023},
  publisher = {University of Tartu Library},
  address = {Tartu, Estonia},
  ISBN = {978-99-1621-999-7},
}

@inProceedings{masciolini-ranta-2021-grammar-324794,
  title = {Grammar-based concept alignment for domain-specific Machine Translation},
  abstract = {Grammar-based domain-specific MT systems are a common use case for CNLs. High-quality translation lexica are a crucial part of such systems, but require time-consuming work and significant linguistic knowledge to build. With parallel example sentences available, statistical alignment tools can help automate part of the process, but they are not suitable for small datasets and do not always perform well with complex multiword expressions. In addition, the correspondences between word forms obtained in this way cannot be used directly. Addressing these problems, we propose a grammar-based approach to this task and put it to the test in a simple translation pipeline.},
  booktitle = {Proceedings of the Seventh International Workshop on Controlled Natural Language (CNL 2020/21)},
  author = {Masciolini, Arianna and Ranta, Aarne},
  year = {2021},
}