% Bibliography: SweLL / SVALA / Lärka publications on Swedish learner corpora (2016-2019).
%
% Conventions used in this file:
%   - One field per line; entry types and field names in lowercase.
%   - Names are stored as "Last, First" and separated by " and ".
%   - Acronyms and proper nouns in titles are brace-protected so that
%     sentence-casing styles do not lowercase them.
%   - Titles and booktitles carry no trailing period; the style adds punctuation.
%   - "address" holds the publisher's city, not the institution or the venue.
%   - Values contain non-ASCII characters (e.g. é, ö, å), so the file should be
%     processed with a UTF-8-aware backend.
%   - Text outside entries (like this header) is ignored by BibTeX.

% ---------------------------------------------------------------- 2019 ----

@article{volodina-etal-2019-swell-285609,
  author   = {Volodina, Elena and Granstedt, Lena and Matsson, Arild and Megyesi, Beáta and Pilán, Ildikó and Prentice, Julia and Rosén, Dan and Rudebeck, Lisa and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  title    = {The {SweLL} Language Learner Corpus: From Design to Annotation},
  journal  = {Northern European Journal of Language Technology},
  year     = {2019},
  volume   = {6},
  pages    = {67--104},
  abstract = {The article presents a new language learner corpus for Swedish, SweLL, and the methodology from collection and pseudonymisation to protect personal information of learners to annotation adapted to second language learning. The main aim is to deliver a well-annotated corpus of essays written by second language learners of Swedish and make it available for research through a browsable environment. To that end, a new annotation tool and a new project management tool have been implemented, – both with the main purpose to ensure reliability and quality of the final corpus. In the article we discuss reasoning behind metadata selection, principles of gold corpus compilation and argue for separation of normalization from correction annotation.},
}

@inproceedings{volodina-etal-2019-svala-285617,
  author    = {Volodina, Elena and Matsson, Arild and Rosén, Dan and Wirén, Mats},
  title     = {{SVALA}: an Annotation Tool for Learner Corpora generating parallel texts},
  booktitle = {Learner Corpus Research conference (LCR-2019), Warsaw, 12-14 September 2019, Book of abstracts},
  year      = {2019},
  abstract  = {Learner corpora are actively used for research on Language Acquisition and in Learner Corpus Research (LCR). The data is, however, very expensive to collect and manually annotate, and includes steps like anonymization, normalization, error annotation, linguistic annotation. In the past, projects often re-used tools from a number of different projects for the above steps. As a result, various input and output formats between the tools needed to be converted, which increased the complexity of the task.
In the present project, we are developing a tool that handles all of the above-mentioned steps in one environment maintaining a stable interpretable format between the steps. A distinguishing feature of the tool is that users work in a usual environment (plain text) while the tool visualizes all performed edits via a graph that links an original learner text with an edited one, token by token.},
}

% NOTE(review): this entry shares its ISBN with wiren-etal-2018-svala-285624,
% so it is assumed to be part of the same proceedings volume; the series name
% that was previously stored in "booktitle" now lives in "series". Confirm
% against the published volume.
@inproceedings{alfter-etal-2019-larka-281344,
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  title     = {Lärka: From Language Learning Platform to Infrastructure for Research on Language Learning},
  booktitle = {Selected papers from the CLARIN Annual Conference 2018, Pisa, 8-10 October 2018},
  series    = {Linköping Electronic Conference Proceedings},
  year      = {2019},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-034-3},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN. Lärka has recently received a new responsive user interface adapted to different devices with different screen sizes. Moreover, the system has also been augmented with new functionalities. These recent additions aim at improving the usability and the usefulness of the platform for pedagogical purposes. The most important development, though, is the adaptation of the platform to serve as a component in an e-infrastructure supporting research on language learning and multilingualism.
Thanks to Lärka’s service-oriented architecture, most functionalities are also available as web services which can be easily re-used by other applications.},
}

% ---------------------------------------------------------------- 2018 ----

@inproceedings{wiren-etal-2018-svala-285624,
  author    = {Wirén, Mats and Matsson, Arild and Rosén, Dan and Volodina, Elena},
  title     = {{SVALA}: Annotation of Second-Language Learner Text Based on Mostly Automatic Alignment of Parallel Corpora},
  booktitle = {Selected papers from the CLARIN Annual Conference 2018, Pisa, 8-10 October 2018},
  editor    = {Skadina, Inguna and Eskevich, Maria},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-034-3},
  abstract  = {Annotation of second-language learner text is a cumbersome manual task which in turn requires interpretation to postulate the intended meaning of the learner’s language. This paper describes SVALA, a tool which separates the logical steps in this process while providing rich visual support for each of them. The first step is to pseudonymize the learner text to fulfil the legal and ethical requirements for a distributable learner corpus. The second step is to correct the text, which is carried out in the simplest possible way by text editing. During the editing, SVALA automatically maintains a parallel corpus with alignments between words in the learner source text and corrected text, while the annotator may repair inconsistent word alignments. Finally, the actual labelling of the corrections (the postulated errors) is performed. We describe the objectives, design and workflow of SVALA, and our plans for further development.},
}

@inproceedings{rosen-etal-2018-error-275363,
  author    = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  title     = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  year      = {2018},
  abstract  = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
}

@inproceedings{volodina-etal-2018-annotation-275361,
  author    = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  title     = {Annotation of learner corpora: first {SweLL} insights},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  year      = {2018},
  abstract  = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
}

@inproceedings{pilan-volodina-2018-exploring-275366,
  author    = {Pilán, Ildikó and Volodina, Elena},
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-61-2},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
}

@inproceedings{megyesi-etal-2018-learner-275359,
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-173-9},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other.
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
}

@inproceedings{volodina-etal-2018-interoperability-275365,
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  title     = {Interoperability of Second Language Resources and Tools},
  booktitle = {Proceedings of CLARIN-2018 conference},
  year      = {2018},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
}

@inproceedings{alfter-etal-2018-from-275364,
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  title     = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
  year      = {2018},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics.
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
}

% ---------------------------------------------------------------- 2016 ----

@inproceedings{volodina-etal-2016-friend-248093,
  author    = {Volodina, Elena and Megyesi, Beáta and Wirén, Mats and Granstedt, Lena and Prentice, Julia and Reichenberg, Monica and Sundberg, Gunlög},
  title     = {A Friend in Need? {Research} agenda for electronic Second Language infrastructure},
  booktitle = {Proceedings of the Swedish Language Technology Conference},
  year      = {2016},
  publisher = {Umeå Universitet},
  abstract  = {In this article, we describe the research and societal needs as well as ongoing efforts to shape Swedish as a Second Language (L2) infrastructure. Our aim is to develop an electronic research infrastructure that would stimulate empiric research into learners’ language development by preparing data and developing language technology methods and algorithms that can successfully deal with deviations in the learner language.},
}