% Conference papers by Arianna Masciolini and co-authors, 2021-2024.
% Conventions: one field per line, lowercase entry types and field names,
% UTF-8 text, and braces around title words whose capitalisation must survive
% sentence-casing bibliography styles (acronyms, language names).

% BUCC 2024 workshop paper.
% NOTE(review): the other LREC-COLING 2024 entry below lists publisher
% "ELRA and ICCL" together with an address, while this one has only "ELRA"
% and no address -- confirm against the printed proceedings.
@inproceedings{masciolini-2024-bootstrapping-338425,
  author    = {Masciolini, Arianna},
  title     = {Bootstrapping the Annotation of {UD} Learner Treebanks},
  booktitle = {Proceedings of the 17th Workshop on Building and Using Comparable Corpora (BUCC) @ LREC-COLING 2024, 20 May, 2024, Torino, Italia},
  year      = {2024},
  publisher = {ELRA},
  isbn      = {978-2-493814-31-9},
  abstract  = {Learner data comes in a variety of formats, making corpora difficult to compare with each other. Universal Dependencies (UD) has therefore been proposed as a replacement for the various ad-hoc annotation schemes. Nowadays, the time-consuming task of building a UD treebank often starts with a round of automatic annotation. The performance of the currently available tools trained on standard language, however, tends to decline substantially upon application to learner text. Grammatical errors play a major role, but a significant performance gap has been observed even between standard test sets and normalized learner essays. In this paper, we investigate how to best bootstrap the annotation of UD learner corpora. In particular, we want to establish whether Target Hypotheses (THs), i.e. grammar-corrected learner sentences, are suitable training data for fine-tuning a parser aimed for original (ungrammatical) L2 material. We perform experiments using English and Italian data from two of the already available UD learner corpora. Our results show manually annotated THs to be highly beneficial and suggest that even automatically parsed sentences of this kind might be helpful, if available in sufficiently large amounts.},
}

% MWE-UD 2024 workshop paper.
@inproceedings{masciolini-etal-2024-synthetic-338288,
  author    = {Masciolini, Arianna and Francis, Emilie and Szawerna, Maria Irena},
  title     = {Synthetic-Error Augmented Parsing of {Swedish} as a Second Language: Experiments with Word Order},
  booktitle = {Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024, May 25, 2024, Torino, Italia},
  year      = {2024},
  publisher = {ELRA and ICCL},
  address   = {Torino, Italy},
  isbn      = {978-2-493814-20-3},
  abstract  = {Ungrammatical text poses significant challenges for off-the-shelf dependency parsers. In this paper, we explore the effectiveness of using synthetic data to improve performance on essays written by learners of Swedish as a second language. Due to their relevance and ease of annotation, we restrict our initial experiments to word order errors. To do that, we build a corrupted version of the standard Swedish Universal Dependencies (UD) treebank Talbanken, mimicking the error patterns and frequency distributions observed in the Swedish Learner Language (SweLL) corpus. We then use the MaChAmp (Massive Choice, Ample tasks) toolkit to train an array of BERT-based dependency parsers, fine-tuning on different combinations of original and corrupted data. We evaluate the resulting models not only on their respective test sets but also, most importantly, on a smaller collection of sentence-correction pairs derived from SweLL. Results show small but significant performance improvements on the target domain, with minimal decline on normative data.},
}

% Huminfra Conference 2024 paper (in Swedish). The capitals in the title
% spell out the tool name, so each capitalised word is brace-protected.
@inproceedings{masciolini-toth-2024-stund-335974,
  author    = {Masciolini, Arianna and Tóth, Márton András},
  title     = {{STUnD}: ett {Sökverktyg} för {Tvåspråkiga} {Universal} {Dependencies-trädbanker}},
  booktitle = {Proceedings of the Huminfra Conference (HiC 2024), Gothenburg, 10–11 January 2024},
  year      = {2024},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-8075-512-2},
  abstract  = {Föreliggande artikel introducerar STUND, ett Sökverktyg för Tvåspråkiga Universal Dependencies-trädbanker som möjliggör parallella syntaktiska sökningar. Vi demonstrerar dess praktiska tillämpning i en fallstudie på tempusformen presens perfekt i svenska och engelska. Resultaten visar att presens perfekt används i ungefär lika stor utsträckning i båda språken, men att det förekommer viss variation som verkar bero på språkspecifika konventioner och översättningsstrategier.},
}

% BEA 2023 workshop paper.
@inproceedings{masciolini-etal-2023-towards-329384,
  author    = {Masciolini, Arianna and Volodina, Elena and Dannélls, Dana},
  title     = {Towards automatically extracting morphosyntactical error patterns from {L1-L2} parallel dependency treebanks},
  booktitle = {Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023), July 13, 2023, Toronto, Canada},
  year      = {2023},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-959429-80-7},
  abstract  = {L1-L2 parallel dependency treebanks are UD-annotated corpora of learner sentences paired with correction hypotheses. Automatic morphosyntactical annotation has the potential to remove the need for explicit manual error tagging and improve interoperability, but makes it more challenging to locate grammatical errors in the resulting datasets. We therefore propose a novel method for automatically extracting morphosyntactical error patterns and perform a preliminary bilingual evaluation of its first implementation through a similar example retrieval task. The resulting pipeline is also available as a prototype CALL application.},
}

% NoDaLiDa 2023 paper. The volume editors were previously appended to the
% booktitle text; they now live in the editor field so styles can format them.
@inproceedings{masciolini-2023-query-329383,
  author    = {Masciolini, Arianna},
  title     = {A query engine for {L1-L2} parallel dependency treebanks},
  booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa), May 22--24, 2023, Tórshavn, Faroe Islands},
  editor    = {Alumäe, Tanel and Fishel, Mark},
  year      = {2023},
  publisher = {University of Tartu Library},
  address   = {Tartu, Estonia},
  isbn      = {978-99-1621-999-7},
  abstract  = {L1-L2 parallel dependency treebanks are learner corpora with interoperability as their main design goal. They consist of sentences produced by learners of a second language (L2) paired with native-like (L1) correction hypotheses. Rather than explicitly labelled for errors, these are annotated following the Universal Dependencies standard. This implies relying on tree queries for error retrieval. Work in this direction is, however, limited. We present a query engine for L1-L2 treebanks and evaluate it on two corpora, one manually validated and one automatically parsed.},
}

% CNL 2020/21 workshop paper. No publisher, address or ISBN is recorded.
@inproceedings{masciolini-ranta-2021-grammar-324794,
  author    = {Masciolini, Arianna and Ranta, Aarne},
  title     = {Grammar-based concept alignment for domain-specific Machine Translation},
  booktitle = {Proceedings of the Seventh International Workshop on Controlled Natural Language (CNL 2020/21)},
  year      = {2021},
  abstract  = {Grammar-based domain-specific MT systems are a common use case for CNLs. High-quality translation lexica are a crucial part of such systems, but involve time consuming work and significant linguistic knowledge. With parallel example sentences available, statistical alignment tools can help automate part of the process, but they are not suitable for small datasets and do not always perform well with complex multiword expressions. In addition, the correspondences between word forms obtained in this way cannot be used directly. Addressing these problems, we propose a grammar-based approach to this task and put it to test in a simple translation pipeline.},
}