% Second-language learner corpus publications from 2018 (seven conference papers).
% Conventions used in this file:
%   - UTF-8 characters are written directly in names and abstracts.
%   - Person names use the unambiguous "Last, First" form, separated by " and ".
%   - Titles carry no trailing punctuation; acronyms and proper nouns are braced
%     so that sentence-casing styles keep their capitalisation.
%   - "address" holds the publisher's city; the conference venue stays in "booktitle".

% Shared proceedings title for the three CLARIN-2018 papers, kept in one place
% so that all of them render identically.
@string{clarin2018proc = {Proceedings of CLARIN-2018 conference, 8--10 October 2018, Pisa, Italy}}

@inproceedings{rosen-etal-2018-error-275363,
  author    = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  title     = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  booktitle = clarin2018proc,
  year      = {2018},
  abstract  = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
}

@inproceedings{megyesi-etal-2018-learner-275359,
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-173-9},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
}

@inproceedings{volodina-etal-2018-interoperability-275365,
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  title     = {Interoperability of Second Language Resources and Tools},
  booktitle = clarin2018proc,
  year      = {2018},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
}

@inproceedings{alfter-etal-2018-from-275364,
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  title     = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  booktitle = clarin2018proc,
  year      = {2018},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
}

% NOTE(review): the booktitle below dates SLTC 2018 to October, while the
% NLP4CALL entry in this file dates its SLTC workshop to 7th November 2018.
% One of the two months is presumably wrong -- confirm against the proceedings.
@inproceedings{volodina-etal-2018-annotation-275361,
  author    = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  title     = {Annotation of learner corpora: first {SweLL} insights},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  year      = {2018},
  abstract  = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
}

@inproceedings{pilan-volodina-2018-exploring-275366,
  author    = {Pilán, Ildikó and Volodina, Elena},
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-61-2},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
}

@inproceedings{wiren-etal-2018-svala-285624,
  author    = {Wirén, Mats and Matsson, Arild and Rosén, Dan and Volodina, Elena},
  title     = {{SVALA}: Annotation of Second-Language Learner Text Based on Mostly Automatic Alignment of Parallel Corpora},
  booktitle = {Selected papers from the CLARIN Annual Conference 2018, Pisa, 8--10 October 2018},
  editor    = {Skadina, Inguna and Eskevich, Maria},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-034-3},
  abstract  = {Annotation of second-language learner text is a cumbersome manual task which in turn requires interpretation to postulate the intended meaning of the learner’s language. This paper describes SVALA, a tool which separates the logical steps in this process while providing rich visual support for each of them. The first step is to pseudonymize the learner text to fulfil the legal and ethical requirements for a distributable learner corpus. The second step is to correct the text, which is carried out in the simplest possible way by text editing. During the editing, SVALA automatically maintains a parallel corpus with alignments between words in the learner source text and corrected text, while the annotator may repair inconsistent word alignments. Finally, the actual labelling of the corrections (the postulated errors) is performed. We describe the objectives, design and workflow of SVALA, and our plans for further development.},
}