@inproceedings{Pilán-Ildikó2016-243715,
  title     = {Detecting Context Dependence in Exercise Item Candidates Selected from Corpora},
  abstract  = {We explore the factors influencing the dependence of single sentences on their larger textual context in order to automatically identify candidate sentences for language learning exercises from corpora which are presentable in isolation. An in-depth investigation of this question has not been previously carried out. Understanding this aspect can contribute to a more efficient selection of candidate sentences which, besides reducing the time required for item writing, can also ensure a higher degree of variability and authenticity. We present a set of relevant aspects collected based on the qualitative analysis of a smaller set of context-dependent corpus example sentences. Furthermore, we implemented a rule-based algorithm using these criteria which achieved an average precision of 0.76 for the identification of different issues related to context dependence. The method has also been evaluated empirically where 80% of the sentences in which our system did not detect context-dependent elements were also considered context-independent by human raters.},
  booktitle = {Proceedings of the 11th Workshop on Innovative Use of NLP for Building Educational Applications, June 12 to June 17, 2016, San Diego, USA},
  author    = {Pilán, Ildikó},
  year      = {2016},
}

@misc{Pilán-Ildikó2016-247241,
  title         = {Coursebook texts as a helping hand for classifying linguistic complexity in language learners' writings},
  author        = {Pilán, Ildikó and Alfter, David and Volodina, Elena},
  year          = {2016},
  isbn          = {978-4-87974-709-9},
  internal-note = {Apparent duplicate of Pilán-Ildikó2016-246349 (same title, authors, ISBN); consider removing one of the two.},
}

@inproceedings{Pilán-Ildikó2016-247240,
  title     = {Predicting proficiency levels in learner writings by transferring a linguistic complexity model from expert-written coursebooks},
  abstract  = {The lack of a sufficient amount of data tailored for a task is a well-recognized problem for many statistical NLP methods. In this paper, we explore whether data sparsity can be successfully tackled when classifying language proficiency levels in the domain of learner-written output texts. We aim at overcoming data sparsity by incorporating knowledge in the trained model from another domain consisting of input texts written by teaching professionals for learners. We compare different domain adaptation techniques and find that a weighted combination of the two types of data performs best, which can even rival systems based on considerably larger amounts of in-domain data. Moreover, we show that normalizing errors in learners’ texts can substantially improve classification when in-domain data with annotated proficiency levels is not available.},
  booktitle = {Proceedings of the 26th International Conference on Computational Linguistics (COLING), December 13-16, 2016, Osaka},
  author    = {Pilán, Ildikó and Volodina, Elena and Zesch, Torsten},
  year      = {2016},
  isbn      = {978-4-87974-702-0},
}

@inproceedings{Pilán-Ildikó2016-246349,
  title     = {Coursebook texts as a helping hand for classifying linguistic complexity in language learners' writings},
  abstract  = {We bring together knowledge from two different types of language learning data, texts learners read and texts they write, to improve linguistic complexity classification in the latter. Linguistic complexity in the foreign and second language learning context can be expressed in terms of proficiency levels. We show that incorporating features capturing lexical complexity information from reading passages can boost significantly the machine learning based classification of learner-written texts into proficiency levels. With an F1 score of .8 our system rivals state-of-the-art results reported for other languages for this task. Finally, we present a freely available web-based tool for proficiency level classification and lexical complexity visualization for both learner writings and reading texts.},
  booktitle = {Proceedings of the workshop on Computational Linguistics for Linguistic Complexity},
  author    = {Pilán, Ildikó and Alfter, David and Volodina, Elena},
  year      = {2016},
  isbn      = {978-4-87974-709-9},
}

@inproceedings{Volodina-Elena2016-246346,
  title     = {Classification of {Swedish} learner essays by {CEFR} levels},
  abstract  = {The paper describes initial efforts on creating a system for the automatic assessment of Swedish second language (L2) learner essays from two points of view: holistic evaluation of the reached level according to the Common European Framework of Reference (CEFR), and the lexical analysis of texts for receptive and productive vocabulary per CEFR level. We describe the data and resources that our experiments were based on, provide a short introduction to the algorithm for essay classification and experiment results, present the user interface we developed for testing new essays and outline future work.},
  booktitle = {Proceedings of EuroCALL 2016. 24-27th August 2016, Cyprus.},
  author    = {Volodina, Elena and Pilán, Ildikó and Alfter, David},
  year      = {2016},
  publisher = {Research-publishing.net},
  isbn      = {978-1-908416-44-5},
}

@inproceedings{Alfter-David2016-246345,
  title     = {From Distributions to Labels: A Lexical Proficiency Analysis using Learner Corpora},
  abstract  = {In this work we look at how information from second language learner essay corpora can be used for the evaluation of unseen learner essays. Using a corpus of learner essays which have been graded by well-trained human assessors using the CEFR scale, we extract a list of word distributions over CEFR levels. For the analysis of unseen essays, we want to map each word to a so-called target CEFR level using this word list. However, the task of mapping from a distribution to a single label is not trivial. We are also investigating how we can evaluate the mapping from distribution to label. We show that the distributional profile of words from the essays, informed with the essays’ levels, consistently overlaps with our frequency-based method, in the sense that words holding the same level of proficiency as predicted by our mapping tend to cluster together in a semantic space. In the absence of a gold standard, this information can be useful to see how often a word is associated with the same level in two different models. Also, in this case we have a similarity measure that can show which words are more central to a given level and which words are more peripheral.},
  booktitle = {Linköping Electronic Conference Proceedings},
  author    = {Alfter, David and Bizzoni, Yuri and Agebjörn, Anders and Volodina, Elena and Pilán, Ildikó},
  year      = {2016},
  publisher = {Linköping University Electronic Press},
  isbn      = {978-91-7685-633-8},
}

@misc{Volodina-Elena2016-248087,
  title    = {Preface. Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, Umeå, 16th November 2016},
  abstract = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) & NLP for Language Acquisition (LA) – shorthand NLP4CALL&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
  author   = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
  year     = {2016},
  number   = {130},
  pages    = {i--viii},
}

@misc{Volodina-Elena2016-248081,
  title     = {Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, Umeå, 16th November 2016},
  abstract  = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) & NLP for Language Acquisition (LA) – shorthand NLP4CALL&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
  author    = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
  year      = {2016},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-633-8},
}

@inproceedings{Pilán-Ildikó2016-248099,
  title     = {Classification of Language Proficiency Levels in {Swedish} Learners' Texts},
  abstract  = {We evaluate a system for the automatic classification of texts written by learners of Swedish as a second language into levels of language proficiency. Since the amount of available annotated learner essay data for our target language is rather small, we explore also the potentials of domain adaptation for this task. The additional domain consists of coursebook texts written by experts for learners. We find that already with a smaller amount of in-domain Swedish learner essay data it is possible to obtain results that compare well to state-of-the-art systems for other languages, with domain adaptation methods yielding a slight improvement.},
  booktitle = {The Sixth Swedish Language Technology Conference (SLTC), Umeå University, 17-18 November, 2016},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2016},
}

@inproceedings{Volodina-Elena2016-248090,
  title     = {{SweLLex}: second language learners' productive vocabulary.},
  abstract  = {This paper presents a new lexical resource for learners of Swedish as a second language, SweLLex, and a know-how behind its creation. We concentrate on L2 learners’ productive vocabulary, i.e. words that they are actively able to produce, rather than the lexica they comprehend (receptive vocabulary). The proposed list covers productive vocabulary used by L2 learners in their essays. Each lexical item on the list is connected to its frequency distribution over the six levels of proficiency defined by the Common European Framework of Reference (CEFR) (Council of Europe, 2001). To make this list a more reliable resource, we experiment with normalizing L2 word-level errors by replacing them with their correct equivalents. SweLLex has been tested in a prototype system for automatic CEFR level classification of essays as well as in a visualization tool aimed at exploring L2 vocabulary contrasting receptive and productive vocabulary usage at different levels of language proficiency.},
  booktitle = {Linköping Electronic Conference Proceedings. Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016},
  author    = {Volodina, Elena and Pilán, Ildikó and Llozhi, Lorena and Degryse, Baptiste and François, Thomas},
  year      = {2016},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-633-8},
}

@inproceedings{Volodina-Elena2016-248116,
  title     = {{SVALex}: en andraspråksordlista graderad enligt {CEFR} nivåer.},
  booktitle = {Svenskans Beskrivning 35, Göteborg 2016},
  author    = {Volodina, Elena and Pilán, Ildikó},
  year      = {2016},
}

@inproceedings{Volodina-Elena2016-248145,
  title     = {{SweLL} – en korpus med {L2} uppsatser för {CEFR} studier.},
  booktitle = {Svenskans Beskrivning 35, Göteborg 2016},
  author    = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Lundkvist, Peter and Sundberg, Gunlög and Llozhi, Lorena and Sandell, Monica},
  year      = {2016},
}

@inproceedings{François-Thomas2016-248142,
  title     = {{SVALex}: a {CEFR}-graded lexical resource for {Swedish} foreign and second language learners.},
  abstract  = {The paper introduces SVALex, a lexical resource primarily aimed at learners and teachers of Swedish as a foreign and second language that describes the distribution of 15,681 words and expressions across the Common European Framework of Reference (CEFR). The resource is based on a corpus of coursebook texts, and thus describes receptive vocabulary learners are exposed to during reading activities, as opposed to productive vocabulary they use when speaking or writing. The paper describes the methodology applied to create the list and to estimate the frequency distribution. It also discusses some characteristics of the resulting resource and compares it to other lexical resources for Swedish. An interesting feature of this resource is the possibility to separate the wheat from the chaff, identifying the core vocabulary at each level, i.e. vocabulary shared by several coursebook writers at each level, from peripheral vocabulary which is used by the minority of the coursebook writers.},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23-28, 2016 Portorož, Slovenia},
  author    = {François, Thomas and Volodina, Elena and Pilán, Ildikó and Tack, Anaïs},
  year      = {2016},
  publisher = {European Language Resources Association},
  address   = {Paris},
  isbn      = {978-2-9517408-9-1},
}

@inproceedings{Volodina-Elena2016-248141,
  title     = {{SweLL} on the rise: {Swedish} Learner Language corpus for {European} Reference Level studies.},
  abstract  = {We present a new resource for Swedish, SweLL, a corpus of Swedish Learner essays linked to learners’ performance according to the Common European Framework of Reference (CEFR). SweLL consists of three subcorpora – SpIn, SW1203 and Tisus, collected from three different educational establishments. The common metadata for all subcorpora includes age, gender, native languages, time of residence in Sweden, type of written task. Depending on the subcorpus, learner texts may contain additional information, such as text genres, topics, grades. Five of the six CEFR levels are represented in the corpus: A1, A2, B1, B2 and C1 comprising in total 339 essays. C2 level is not included since courses at C2 level are not offered. The work flow consists of collection of essays and permits, essay digitization and registration, meta-data annotation, automatic linguistic annotation. Inter-rater agreement is presented on the basis of SW1203 subcorpus. The work on SweLL is still ongoing with more than 100 essays waiting in the pipeline. This article both describes the resource and the “how-to” behind the compilation of SweLL.},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23-28, 2016, Portorož, Slovenia},
  author    = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Llozhi, Lorena and Lundkvist, Peter and Sundberg, Gunlög and Sandell, Monica},
  year      = {2016},
  publisher = {European Language Resources Association},
  address   = {Paris},
  isbn      = {978-2-9517408-9-1},
}