@inproceedings{francois-etal-2016-svalex-248142,
  author    = {François, Thomas and Volodina, Elena and Pilán, Ildikó and Tack, Anaïs},
  title     = {{SVALex}: a {CEFR}-graded lexical resource for {Swedish} foreign and second language learners.},
  abstract  = {The paper introduces SVALex, a lexical resource primarily aimed at learners and teachers of Swedish as a foreign and second language that describes the distribution of 15,681 words and expressions across the Common European Framework of Reference (CEFR). The resource is based on a corpus of coursebook texts, and thus describes receptive vocabulary learners are exposed to during reading activities, as opposed to productive vocabulary they use when speaking or writing. The paper describes the methodology applied to create the list and to estimate the frequency distribution. It also discusses some characteristics of the resulting resource and compares it to other lexical resources for Swedish. An interesting feature of this resource is the possibility to separate the wheat from the chaff, identifying the core vocabulary at each level, i.e. vocabulary shared by several coursebook writers at each level, from peripheral vocabulary which is used by the minority of the coursebook writers.},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016), May 23-28, 2016, Portorož, Slovenia},
  year      = {2016},
  publisher = {European Language Resources Association},
  address   = {Paris},
  isbn      = {978-2-9517408-9-1},
}

@inproceedings{pilan-etal-2016-predicting-247240,
  author    = {Pilán, Ildikó and Volodina, Elena and Zesch, Torsten},
  title     = {Predicting proficiency levels in learner writings by transferring a linguistic complexity model from expert-written coursebooks},
  abstract  = {The lack of a sufficient amount of data tailored for a task is a well-recognized problem for many statistical NLP methods. In this paper, we explore whether data sparsity can be successfully tackled when classifying language proficiency levels in the domain of learner-written output texts. We aim at overcoming data sparsity by incorporating knowledge in the trained model from another domain consisting of input texts written by teaching professionals for learners. We compare different domain adaptation techniques and find that a weighted combination of the two types of data performs best, which can even rival systems based on considerably larger amounts of in-domain data. Moreover, we show that normalizing errors in learners’ texts can substantially improve classification when in-domain data with annotated proficiency levels is not available.},
  booktitle = {Proceedings of the 26th International Conference on Computational Linguistics ({COLING}), December 13-16, 2016, Osaka},
  year      = {2016},
  isbn      = {978-4-87974-702-0},
}

@inproceedings{volodina-etal-2016-swell-248141,
  author    = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Llozhi, Lorena and Lundkvist, Peter and Sundberg, Gunlög and Sandell, Monica},
  title     = {{SweLL} on the rise: {Swedish} Learner Language corpus for {European Reference Level} studies.},
  abstract  = {We present a new resource for Swedish, SweLL, a corpus of Swedish Learner essays linked to learners’ performance according to the Common European Framework of Reference (CEFR). SweLL consists of three subcorpora – SpIn, SW1203 and Tisus, collected from three different educational establishments. The common metadata for all subcorpora includes age, gender, native languages, time of residence in Sweden, type of written task. Depending on the subcorpus, learner texts may contain additional information, such as text genres, topics, grades. Five of the six CEFR levels are represented in the corpus: A1, A2, B1, B2 and C1 comprising in total 339 essays. C2 level is not included since courses at C2 level are not offered. The work flow consists of collection of essays and permits, essay digitization and registration, meta-data annotation, automatic linguistic annotation. Inter-rater agreement is presented on the basis of SW1203 subcorpus. The work on SweLL is still ongoing with more than 100 essays waiting in the pipeline. This article both describes the resource and the “how-to” behind the compilation of SweLL.},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016), May 23-28, 2016, Portorož, Slovenia},
  year      = {2016},
  publisher = {European Language Resources Association},
  address   = {Paris},
  isbn      = {978-2-9517408-9-1},
}

@inproceedings{daudaravicius-etal-2016-report-248143,
  author    = {Daudaravicius, Vidas and Banchs, Rafael E. and Volodina, Elena and Napoles, Courtney},
  title     = {A report on the {Automatic Evaluation of Scientific Writing} Shared Task.},
  abstract  = {The Automated Evaluation of Scientific Writing, or AESW, is the task of identifying sentences in need of correction to ensure their appropriateness in a scientific prose. The data set comes from a professional editing company, VTeX, with two aligned versions of the same text – before and after editing – and covers a variety of textual infelicities that proofreaders have edited. While previous shared tasks focused solely on grammatical errors (Dale and Kilgarriff, 2011; Dale et al., 2012; Ng et al., 2013; Ng et al., 2014), this time edits cover other types of linguistic misfits as well, including those that almost certainly could be interpreted as style issues and similar “matters of opinion”. The latter arise because of different language editing traditions, experience, and the absence of uniform agreement on what “good” scientific language should look like. Initiating this task, we expected the participating teams to help identify the characteristics of “good” scientific language, and help create a consensus of which language improvements are acceptable (or necessary). Six participating teams took on the challenge.},
  booktitle = {Workshop on Innovative Use of {NLP} for Building Educational Applications, June 16, 2016, San Diego, CA, USA},
  year      = {2016},
  isbn      = {978-1-941643-83-9},
}

@article{pilan-etal-2016-readable-226565,
  author    = {Pilán, Ildikó and Vajjala, Sowmya and Volodina, Elena},
  title     = {A readable read: Automatic Assessment of Language Learning Materials based on Linguistic Complexity.},
  abstract  = {Corpora and web texts can become a rich language learning resource if we have a means of assessing whether they are linguistically appropriate for learners at a given proficiency level. In this paper, we aim at addressing this issue by presenting the first approach for predicting linguistic complexity for Swedish second language learning material on a 5-point scale. After showing that the traditional Swedish readability measure, Läsbarhetsindex (LIX), is not suitable for this task, we propose a supervised machine learning model, based on a range of linguistic features, that can reliably classify texts according to their difficulty level. Our model obtained an accuracy of 81.3\% and an F-score of 0.8, which is comparable to the state of the art in English and is considerably higher than previously reported results for other languages. We further studied the utility of our features with single sentences instead of full texts since sentences are a common linguistic unit in language learning exercises. We trained a separate model on sentence-level data with five classes, which yielded 63.4\% accuracy. Although this is lower than the document level performance, we achieved an adjacent accuracy of 92\%. Furthermore, we found that using a combination of different features, compared to using lexical features alone, resulted in 7\% improvement in classification accuracy at the sentence level, whereas at the document level, lexical features were more dominant. Our models are intended for use in a freely accessible web-based language learning platform for the automatic generation of exercises, and they will be available also in the form of web-services.},
  journal   = {Computational Linguistics and Applications},
  year      = {2016},
  volume    = {7},
  number    = {1},
  pages     = {143--159},
}

@misc{volodina-etal-2016-preface-248087,
  author    = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
  title     = {Preface. Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, {Umeå}, 16th November 2016},
  abstract  = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) \& NLP for Language Acquisition (LA) – shorthand NLP4CALL\&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL\&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
  year      = {2016},
  number    = {130},
  pages     = {i--viii},
}

@inproceedings{volodina-etal-2016-classification-246346,
  author    = {Volodina, Elena and Pilán, Ildikó and Alfter, David},
  title     = {Classification of {Swedish} learner essays by {CEFR} levels},
  abstract  = {The paper describes initial efforts on creating a system for the automatic assessment of Swedish second language (L2) learner essays from two points of view: holistic evaluation of the reached level according to the Common European Framework of Reference (CEFR), and the lexical analysis of texts for receptive and productive vocabulary per CEFR level. We describe the data and resources that our experiments were based on, provide a short introduction to the algorithm for essay classification and experiment results, present the user interface we developed for testing new essays and outline future work.},
  booktitle = {Proceedings of {EuroCALL} 2016. 24-27th August 2016, Cyprus.},
  year      = {2016},
  publisher = {Research-publishing.net},
  isbn      = {978-1-908416-44-5},
}

@inproceedings{alfter-volodina-2016-modeling-246347,
  author    = {Alfter, David and Volodina, Elena},
  title     = {Modeling Individual Learner Knowledge in a Computer Assisted Language Learning System},
  booktitle = {Proceedings of the Sixth Swedish Language Technology Conference. Umeå University, 17-18 November, 2016},
  year      = {2016},
}

@misc{volodina-etal-2016-proceedings-248081,
  author    = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
  title     = {Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, {Umeå}, 16th November 2016},
  abstract  = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) \& NLP for Language Acquisition (LA) – shorthand NLP4CALL\&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL\&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
  year      = {2016},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-633-8},
}

@inproceedings{volodina-etal-2016-swellex-248090,
  author    = {Volodina, Elena and Pilán, Ildikó and Llozhi, Lorena and Degryse, Baptiste and François, Thomas},
  title     = {{SweLLex}: second language learners' productive vocabulary.},
  abstract  = {This paper presents a new lexical resource for learners of Swedish as a second language, SweLLex, and a know-how behind its creation. We concentrate on L2 learners’ productive vocabulary, i.e. words that they are actively able to produce, rather than the lexica they comprehend (receptive vocabulary). The proposed list covers productive vocabulary used by L2 learners in their essays. Each lexical item on the list is connected to its frequency distribution over the six levels of proficiency defined by the Common European Framework of Reference (CEFR) (Council of Europe, 2001). To make this list a more reliable resource, we experiment with normalizing L2 word-level errors by replacing them with their correct equivalents. SweLLex has been tested in a prototype system for automatic CEFR level classification of essays as well as in a visualization tool aimed at exploring L2 vocabulary contrasting receptive and productive vocabulary usage at different levels of language proficiency.},
  booktitle = {Linköping Electronic Conference Proceedings. Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, {Umeå}, 16th November 2016},
  year      = {2016},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-633-8},
}

@inproceedings{volodina-pilan-2016-svalex-248116,
  author    = {Volodina, Elena and Pilán, Ildikó},
  title     = {{SVALex}: en andraspråksordlista graderad enligt {CEFR} nivåer.},
  booktitle = {Svenskans Beskrivning 35, Göteborg 2016},
  year      = {2016},
}

@inproceedings{volodina-etal-2016-swell-248145,
  author    = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Lundkvist, Peter and Sundberg, Gunlög and Llozhi, Lorena and Sandell, Monica},
  title     = {{SweLL} – en korpus med {L2} uppsatser för {CEFR} studier.},
  booktitle = {Svenskans Beskrivning 35, Göteborg 2016},
  year      = {2016},
}

@inproceedings{pilan-volodina-2016-classification-248099,
  author    = {Pilán, Ildikó and Volodina, Elena},
  title     = {Classification of Language Proficiency Levels in {Swedish} Learners' Texts},
  abstract  = {We evaluate a system for the automatic classification of texts written by learners of Swedish as a second language into levels of language proficiency. Since the amount of available annotated learner essay data for our target language is rather small, we explore also the potentials of domain adaptation for this task. The additional domain consists of coursebook texts written by experts for learners. We find that already with a smaller amount of in-domain Swedish learner essay data it is possible to obtain results that compare well to state-of-the-art systems for other languages, with domain adaptation methods yielding a slight improvement.},
  booktitle = {The Sixth Swedish Language Technology Conference ({SLTC}), Umeå University, 17-18 November, 2016},
  year      = {2016},
}

@inproceedings{lindstromtiedemann-volodina-2016-larka-248119,
  author    = {Lindström Tiedemann, Therese and Volodina, Elena},
  title     = {{Lärka} som didaktiskt verktyg. Undersökning om studenternas metaspråkliga kunskap.},
  booktitle = {Svenskans Beskrivning 35, 11-13 maj 2016, Göteborg},
  year      = {2016},
}

@inproceedings{pilan-etal-2016-coursebook-246349,
  author    = {Pilán, Ildikó and Alfter, David and Volodina, Elena},
  title     = {Coursebook texts as a helping hand for classifying linguistic complexity in language learners' writings},
  abstract  = {We bring together knowledge from two different types of language learning data, texts learners read and texts they write, to improve linguistic complexity classification in the latter. Linguistic complexity in the foreign and second language learning context can be expressed in terms of proficiency levels. We show that incorporating features capturing lexical complexity information from reading passages can boost significantly the machine learning based classification of learner-written texts into proficiency levels. With an F1 score of .8 our system rivals state-of-the-art results reported for other languages for this task. Finally, we present a freely available web-based tool for proficiency level classification and lexical complexity visualization for both learner writings and reading texts.},
  booktitle = {Proceedings of the workshop on Computational Linguistics for Linguistic Complexity},
  year      = {2016},
  isbn      = {978-4-87974-709-9},
}

@article{lindstromtiedemann-etal-2016-larka-248112,
  author    = {Lindström Tiedemann, Therese and Volodina, Elena and Jansson, Håkan},
  title     = {{Lärka}: ett verktyg för träning av språkterminologi och grammatik},
  abstract  = {Lärka is a corpus-based tool, which allows students to practise and learn grammar based on authentic material. In this study we present how this has been used at four universities. We also use our logs to try to assess the students’ metalinguistic awareness in relation to international studies, and discuss how these logs can be used in the future.},
  journal   = {LexicoNordica},
  year      = {2016},
  volume    = {23},
  pages     = {161--181},
}

@inproceedings{volodina-etal-2016-friend-248093,
  author    = {Volodina, Elena and Megyesi, Beata and Wirén, Mats and Granstedt, Lena and Prentice, Julia and Reichenberg, Monica and Sundberg, Gunlög},
  title     = {A Friend in Need? Research agenda for electronic Second Language infrastructure.},
  abstract  = {In this article, we describe the research and societal needs as well as ongoing efforts to shape Swedish as a Second Language (L2) infrastructure. Our aim is to develop an electronic research infrastructure that would stimulate empiric research into learners’ language development by preparing data and developing language technology methods and algorithms that can successfully deal with deviations in the learner language.},
  booktitle = {Proceedings of the Swedish Language Technology Conference},
  year      = {2016},
  publisher = {Umeå Universitet},
}

@inproceedings{alfter-etal-2016-from-246345,
  author    = {Alfter, David and Bizzoni, Yuri and Agebjörn, Anders and Volodina, Elena and Pilán, Ildikó},
  title     = {From Distributions to Labels: A Lexical Proficiency Analysis using Learner Corpora},
  abstract  = {In this work we look at how information from second language learner essay corpora can be used for the evaluation of unseen learner essays. Using a corpus of learner essays which have been graded by well-trained human assessors using the CEFR scale, we extract a list of word distributions over CEFR levels. For the analysis of unseen essays, we want to map each word to a so-called target CEFR level using this word list. However, the task of mapping from a distribution to a single label is not trivial. We are also investigating how we can evaluate the mapping from distribution to label. We show that the distributional profile of words from the essays, informed with the essays’ levels, consistently overlaps with our frequency-based method, in the sense that words holding the same level of proficiency as predicted by our mapping tend to cluster together in a semantic space. In the absence of a gold standard, this information can be useful to see how often a word is associated with the same level in two different models. Also, in this case we have a similarity measure that can show which words are more central to a given level and which words are more peripheral.},
  booktitle = {Linköping Electronic Conference Proceedings},
  year      = {2016},
  publisher = {Linköping University Electronic Press},
  isbn      = {978-91-7685-633-8},
}