% Learner-corpus publications from 2019: two conference contributions and one journal article.
% Text outside entries is ignored by BibTeX, so these lines act as comments.

% Paper in an edited volume of selected papers from the fourth
% Learner Corpus Research Conference (series number 5).
@inproceedings{stemle-etal-2019-working-319453,
  author    = {Stemle, Egon and Boyd, Adriane and Janssen, Maarten and Mikelić Preradović, Nives and Rosen, Alexandr and Rosén, Dan and Volodina, Elena},
  title     = {Working together towards an ideal infrastructure for language learner corpora},
  booktitle = {Widening the Scope of Learner Corpus Research. Selected papers from the fourth Learner Corpus Research Conference},
  editor    = {Abel, Andrea and Glaznieks, Aivars and Lyding, Verena and Nicolas, Lionel},
  series    = {Corpora and Language in Use – Proceedings},
  volume    = {5},
  year      = {2019},
  publisher = {PUL, Presses Universitaires de Louvain},
  address   = {Louvain-la-Neuve},
  isbn      = {978-2-87558-868-5},
  abstract  = {In this article we provide an overview of first-hand experiences and vantage points for best practices from projects in seven European countries dedicated to learner corpus research (LCR) and the creation of language learner corpora. The corpora and tools involved in LCR are becoming more and more important, as are careful preparation and easy retrieval and reusability of corpora and tools. But the lack of commonly agreed solutions for many aspects of LCR, interoperability between learner corpora and the exchange of data from different learner corpus projects remains a challenge. We show how concepts like metadata, anonymization, error taxonomies and linguistic annotations as well as tools, toolchains and data formats can be individually challenging and how the challenges can be solved.},
}

% Journal article describing the SweLL learner corpus.
@article{volodina-etal-2019-swell-285609,
  author   = {Volodina, Elena and Granstedt, Lena and Matsson, Arild and Megyesi, Beáta and Pilán, Ildikó and Prentice, Julia and Rosén, Dan and Rudebeck, Lisa and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  title    = {The {SweLL} Language Learner Corpus: From Design to Annotation},
  journal  = {Northern European Journal of Language Technology},
  volume   = {6},
  pages    = {67--104},
  year     = {2019},
  abstract = {The article presents a new language learner corpus for Swedish, SweLL, and the methodology from collection and pseudonymisation to protect personal information of learners to annotation adapted to second language learning.
The main aim is to deliver a well-annotated corpus of essays written by second language learners of Swedish and make it available for research through a browsable environment. To that end, a new annotation tool and a new project management tool have been implemented, – both with the main purpose to ensure reliability and quality of the final corpus. In the article we discuss reasoning behind metadata selection, principles of gold corpus compilation and argue for separation of normalization from correction annotation.},
}

% Conference abstract (book of abstracts, LCR-2019) on the SVALA annotation tool.
@inproceedings{volodina-etal-2019-svala-285617,
  author    = {Volodina, Elena and Matsson, Arild and Rosén, Dan and Wirén, Mats},
  title     = {{SVALA}: an Annotation Tool for Learner Corpora generating parallel texts},
  booktitle = {Learner Corpus Research conference (LCR-2019), Warsaw, 12-14 September 2019, Book of abstracts},
  year      = {2019},
  abstract  = {Learner corpora are actively used for research on Language Acquisition and in Learner Corpus Research (LCR). The data is, however, very expensive to collect and manually annotate, and includes steps like anonymization, normalization, error annotation, linguistic annotation. In the past, projects often re-used tools from a number of different projects for the above steps. As a result, various input and output formats between the tools needed to be converted, which increased the complexity of the task. In the present project, we are developing a tool that handles all of the above-mentioned steps in one environment maintaining a stable interpretable format between the steps.
A distinguishing feature of the tool is that users work in a usual environment (plain text) while the tool visualizes all performed edits via a graph that links an original learner text with an edited one, token by token.},
}