@inProceedings{Borin-Lars2017-261157, title = {Swe-Clarin: Language resources and technology for Digital Humanities}, abstract = {CLARIN is a European Research Infrastructure Consortium (ERIC), which aims at (a) making extensive language-based materials available as primary research data to the humanities and social sciences (HSS); and (b) offering state-of-the-art language technology (LT) as an e-research tool for this purpose, positioning CLARIN centrally in what is often referred to as the digital humanities (DH). The Swedish CLARIN node Swe-Clarin was established in 2015 with funding from the Swedish Research Council. In this paper, we describe the composition and activities of Swe-Clarin, aiming at meeting the requirements of all HSS and other researchers whose research involves using text and speech as primary research data, and spreading the awareness of what Swe-Clarin can offer these research communities. We focus on one of the central means for doing this: pilot projects conducted in collaboration between HSS researchers and Swe-Clarin, together formulating a research question, the addressing of which requires working with large language-based materials. Four such pilot projects are described in more detail, illustrating research on rhetorical history, second-language acquisition, literature, and political science. A common thread to these projects is an aspiration to meet the challenge of conducting research on the basis of very large amounts of textual data in a consistent way without losing sight of the individual cases making up the mass of data, i.e., to be able to move between Moretti’s “distant” and “close reading” modes. While the pilot projects clearly make substantial contributions to DH, they also reveal some needs for more development, and in particular a need for document-level access to the text materials. 
As a consequence of this, work has now been initiated in Swe-Clarin to meet this need, so that Swe-Clarin together with HSS scholars investigating intricate research questions can take on the methodological challenges of big-data language-based digital humanities.}, booktitle = {Digital Humanities 2016. Extended Papers of the International Symposium on Digital Humanities (DH 2016) Växjö, Sweden, November, 7-8, 2016. Edited by Koraljka Golub, Marcello Milrad. Vol-2021}, author = {Borin, Lars and Tahmasebi, Nina and Volodina, Elena and Ekman, Stefan and Jordan, Caspar and Viklund, Jon and Megyesi, Beáta and Näsman, Jesper and Palmér, Anne and Wirén, Mats and Björkenstam, Kristina and Grigonyte, Gintare and Gustafson Capková, Sofia and Kosiński, Tomasz}, year = {2017}, publisher = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen.}, address = {Aachen}, } @misc{Pilán-Ildikó2016-247241, title = "Coursebook texts as a helping hand for classifying linguistic complexity in language learners' writings", author = "Pilán, Ildikó and Alfter, David and Volodina, Elena", year = "2016", isbn = "978-4-87974-709-9", } @inProceedings{Pilán-Ildikó2016-247240, title = {Predicting proficiency levels in learner writings by transferring a linguistic complexity model from expert-written coursebooks}, abstract = {The lack of a sufficient amount of data tailored for a task is a well-recognized problem for many statistical NLP methods. In this paper, we explore whether data sparsity can be successfully tackled when classifying language proficiency levels in the domain of learner-written output texts. We aim at overcoming data sparsity by incorporating knowledge in the trained model from another domain consisting of input texts written by teaching professionals for learners. We compare different domain adaptation techniques and find that a weighted combination of the two types of data performs best, which can even rival systems based on considerably larger amounts of in-domain data. 
Moreover, we show that normalizing errors in learners’ texts can substantially improve classification when in-domain data with annotated proficiency levels is not available.}, booktitle = {Proceedings of the 26th International Conference on Computational Linguistics (COLING), December 13-16, 2016, Osaka}, author = {Pilán, Ildikó and Volodina, Elena and Zesch, Torsten}, year = {2016}, ISBN = {978-4-87974-702-0}, } @inProceedings{Pilán-Ildikó2016-246349, title = {Coursebook texts as a helping hand for classifying linguistic complexity in language learners' writings}, abstract = {We bring together knowledge from two different types of language learning data, texts learners read and texts they write, to improve linguistic complexity classification in the latter. Linguistic complexity in the foreign and second language learning context can be expressed in terms of proficiency levels. We show that incorporating features capturing lexical complexity information from reading passages can boost significantly the machine learning based classification of learner-written texts into proficiency levels. With an F1 score of .8 our system rivals state-of-the-art results reported for other languages for this task. Finally, we present a freely available web-based tool for proficiency level classification and lexical complexity visualization for both learner writings and reading texts. }, booktitle = {Proceedings of the workshop on Computational Linguistics for Linguistic Complexity}, author = {Pilán, Ildikó and Alfter, David and Volodina, Elena}, year = {2016}, ISBN = {978-4-87974-709-9}, } @inProceedings{Alfter-David2016-246347, title = {Modeling Individual Learner Knowledge in a Computer Assisted Language Learning System}, booktitle = {Proceedings of the Sixth Swedish Language Technology Conference. 
Umeå University, 17-18 November, 2016}, author = {Alfter, David and Volodina, Elena}, year = {2016}, } @inProceedings{Volodina-Elena2016-246346, title = {Classification of Swedish learner essays by CEFR levels}, abstract = {The paper describes initial efforts on creating a system for the automatic assessment of Swedish second language (L2) learner essays from two points of view: holistic evaluation of the reached level according to the Common European Framework of Reference (CEFR), and the lexical analysis of texts for receptive and productive vocabulary per CEFR level. We describe the data and resources that our experiments were based on, provide a short introduction to the algorithm for essay classification and experiment results, present the user interface we developed for testing new essays and outline future work. }, booktitle = {Proceedings of EuroCALL 2016. 24-27th August 2016, Cyprus.}, author = {Volodina, Elena and Pilán, Ildikó and Alfter, David}, year = {2016}, publisher = {Research-publishing.net}, ISBN = { 978-1-908416-44-5}, } @inProceedings{Alfter-David2016-246345, title = {From Distributions to Labels: A Lexical Proficiency Analysis using Learner Corpora}, abstract = {In this work we look at how information from second language learner essay corpora can be used for the evaluation of unseen learner essays. Using a corpus of learner essays which have been graded by well-trained human assessors using the CEFR scale, we extract a list of word distributions over CEFR levels. For the analysis of unseen essays, we want to map each word to a so-called target CEFR level using this word list. However, the task of mapping from a distribution to a single label is not trivial. We are also investigating how we can evaluate the mapping from distribution to label. 
We show that the distributional profile of words from the essays, informed with the essays’ levels, consistently overlaps with our frequency-based method, in the sense that words holding the same level of proficiency as predicted by our mapping tend to cluster together in a semantic space. In the absence of a gold standard, this information can be useful to see how often a word is associated with the same level in two different models. Also, in this case we have a similarity measure that can show which words are more central to a given level and which words are more peripheral. }, booktitle = {Linköping Electronic Conference Proceedings}, author = {Alfter, David and Bizzoni, Yuri and Agebjörn, Anders and Volodina, Elena and Pilán, Ildikó}, year = {2016}, publisher = {Linköping University Electronic Press}, ISBN = {978-91-7685-633-8}, } @misc{Volodina-Elena2016-248087, title = {Preface. Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016}, abstract = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) & NLP for Language Acquisition (LA) – shorthand NLP4CALL&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. 
This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition. }, author = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars}, year = {2016}, number = {130}, pages = { i–viii}, } @misc{Volodina-Elena2016-248081, title = {Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016}, abstract = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) & NLP for Language Acquisition (LA) – shorthand NLP4CALL&LA – is an effort to provide a debate space and collaboration between two closely related areas. 
Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. 
The joint workshop series on NLP4CALL&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.}, author = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars}, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-633-8}, } @inProceedings{Pilán-Ildikó2016-248099, title = {Classification of Language Proficiency Levels in Swedish Learners' Texts}, abstract = {We evaluate a system for the automatic classification of texts written by learners of Swedish as a second language into levels of language proficiency. Since the amount of available annotated learner essay data for our target language is rather small, we explore also the potentials of domain adaptation for this task. The additional domain consists of coursebook texts written by experts for learners. We find that already with a smaller amount of in-domain Swedish learner essay data it is possible to obtain results that compare well to state-of-the-art systems for other languages, with domain adaptation methods yielding a slight improvement.}, booktitle = {The Sixth Swedish Language Technology Conference (SLTC), Umeå University, 17-18 November, 2016}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2016}, } @inProceedings{Volodina-Elena2016-248093, title = {A Friend in Need? Research agenda for electronic Second Language infrastructure.}, abstract = {In this article, we describe the research and societal needs as well as ongoing efforts to shape Swedish as a Second Language (L2) infrastructure. 
Our aim is to develop an electronic research infrastructure that would stimulate empiric research into learners’ language development by preparing data and developing language technology methods and algorithms that can successfully deal with deviations in the learner language.}, booktitle = {Proceedings of the Swedish Language Technology Conference}, author = {Volodina, Elena and Megyesi, Beata and Wirén, Mats and Granstedt, Lena and Prentice, Julia and Reichenberg, Monica and Sundberg, Gunlög}, year = {2016}, publisher = {Umeå Universitet}, } @inProceedings{Volodina-Elena2016-248090, title = {SweLLex: second language learners' productive vocabulary.}, abstract = {This paper presents a new lexical resource for learners of Swedish as a second language, SweLLex, and a know-how behind its creation. We concentrate on L2 learners’ productive vocabulary, i.e. words that they are actively able to produce, rather than the lexica they comprehend (receptive vocabulary). The proposed list covers productive vocabulary used by L2 learners in their essays. Each lexical item on the list is connected to its frequency distribution over the six levels of proficiency defined by the Common European Framework of Reference (CEFR) (Council of Europe, 2001). To make this list a more reliable resource, we experiment with normalizing L2 word-level errors by replacing them with their correct equivalents. SweLLex has been tested in a prototype system for automatic CEFR level classification of essays as well as in a visualization tool aimed at exploring L2 vocabulary contrasting receptive and productive vocabulary usage at different levels of language proficiency.}, booktitle = {Linköping Electronic Conference Proceedings. 
Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016}, author = {Volodina, Elena and Pilán, Ildikó and Llozhi, Lorena and Degryse, Baptiste and François, Thomas}, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-633-8}, } @inProceedings{LindströmTiedemann-Therese2016-248119, title = {Lärka som didaktiskt verktyg. Undersökning om studenternas metaspråkliga kunskap.}, booktitle = {Svenskans Beskrivning 35, 11-13 maj 2016, Göteborg}, author = {Lindström Tiedemann, Therese and Volodina, Elena}, year = {2016}, } @inProceedings{Volodina-Elena2016-248116, title = {SVALex: en andraspråksordlista graderad enligt CEFR nivåer.}, booktitle = {Svenskans Beskrivning 35, Göteborg 2016}, author = {Volodina, Elena and Pilán, Ildikó}, year = {2016}, } @article{LindströmTiedemann-Therese2016-248112, title = {Lärka: ett verktyg för träning av språkterminologi och grammatik}, abstract = {Lärka is a corpus-based tool, which allows students to practise and learn grammar based on authentic material. In this study we present how this has been used at four universities. 
We also use our logs to try to assess the students' metalinguistic awareness in relation to international studies, and discuss how these logs can be used in the future.}, author = {Lindström Tiedemann, Therese and Volodina, Elena and Jansson, Håkan}, year = {2016}, volume = {23}, pages = {161--181}, } @inProceedings{Volodina-Elena2016-248145, title = {SweLL – en korpus med L2 uppsatser för CEFR studier.}, booktitle = {Svenskans Beskrivning 35, Göteborg 2016}, author = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Lundkvist, Peter and Sundberg, Gunlög and Llozhi, Lorena and Sandell, Monica}, year = {2016}, } @inProceedings{Daudaravicius-Vidas2016-248143, title = {A report on the Automatic Evaluation of Scientific Writing Shared Task.}, abstract = {The Automated Evaluation of Scientific Writing, or AESW, is the task of identifying sentences in need of correction to ensure their appropriateness in a scientific prose. The data set comes from a professional editing company, VTeX, with two aligned versions of the same text – before and after editing – and covers a variety of textual infelicities that proofreaders have edited. While previous shared tasks focused solely on grammatical errors (Dale and Kilgarriff, 2011; Dale et al., 2012; Ng et al., 2013; Ng et al., 2014), this time edits cover other types of linguistic misfits as well, including those that almost certainly could be interpreted as style issues and similar “matters of opinion”. The latter arise because of different language editing traditions, experience, and the absence of uniform agreement on what “good” scientific language should look like. Initiating this task, we expected the participating teams to help identify the characteristics of “good” scientific language, and help create a consensus of which language improvements are acceptable (or necessary). 
Six participating teams took on the challenge.}, booktitle = {Workshop on Innovative Use of NLP for Building Educational Applications, June 16, 2016, San Diego, CA, USA}, author = {Daudaravicius, Vidas and Banchs, Rafael E. and Volodina, Elena and Napoles, Courtney}, year = {2016}, ISBN = {978-1-941643-83-9}, } @inProceedings{François-Thomas2016-248142, title = {SVALex: a CEFR-graded lexical resource for Swedish foreign and second language learners.}, abstract = {The paper introduces SVALex, a lexical resource primarily aimed at learners and teachers of Swedish as a foreign and second language that describes the distribution of 15,681 words and expressions across the Common European Framework of Reference (CEFR). The resource is based on a corpus of coursebook texts, and thus describes receptive vocabulary learners are exposed to during reading activities, as opposed to productive vocabulary they use when speaking or writing. The paper describes the methodology applied to create the list and to estimate the frequency distribution. It also discusses some characteristics of the resulting resource and compares it to other lexical resources for Swedish. An interesting feature of this resource is the possibility to separate the wheat from the chaff, identifying the core vocabulary at each level, i.e. 
vocabulary shared by several coursebook writers at each level, from peripheral vocabulary which is used by the minority of the coursebook writers.}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23-28, 2016 Portorož, Slovenia}, author = {François, Thomas and Volodina, Elena and Pilán, Ildikó and Tack, Anaïs}, year = {2016}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {978-2-9517408-9-1}, } @inProceedings{Volodina-Elena2016-248141, title = {SweLL on the rise: Swedish Learner Language corpus for European Reference Level studies.}, abstract = {We present a new resource for Swedish, SweLL, a corpus of Swedish Learner essays linked to learners’ performance according to the Common European Framework of Reference (CEFR). SweLL consists of three subcorpora – SpIn, SW1203 and Tisus, collected from three different educational establishments. The common metadata for all subcorpora includes age, gender, native languages, time of residence in Sweden, type of written task. Depending on the subcorpus, learner texts may contain additional information, such as text genres, topics, grades. Five of the six CEFR levels are represented in the corpus: A1, A2, B1, B2 and C1 comprising in total 339 essays. C2 level is not included since courses at C2 level are not offered. The work flow consists of collection of essays and permits, essay digitization and registration, meta-data annotation, automatic linguistic annotation. Inter-rater agreement is presented on the basis of SW1203 subcorpus. The work on SweLL is still ongoing with more than 100 essays waiting in the pipeline. 
This article both describes the resource and the “how-to” behind the compilation of SweLL.}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23-28, 2016, Portorož, Slovenia}, author = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Llozhi, Lorena and Lundkvist, Peter and Sundberg, Gunlög and Sandell, Monica}, year = {2016}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {978-2-9517408-9-1}, } @article{Ildikó-Pilán2016-260382, title = {Candidate sentence selection for language learning exercises: From a comprehensive framework to an empirical evaluation}, abstract = {We present a framework and its implementation relying on Natural Language Processing methods, which aims at the identification of exercise item candidates from corpora. The hybrid system combining heuristics and machine learning methods includes a number of relevant selection criteria. We focus on two fundamental aspects: linguistic complexity and the dependence of the extracted sentences on their original context. Previous work on exercise generation addressed these two criteria only to a limited extent, and a refined overall candidate sentence selection framework appears also to be lacking. In addition to a detailed description of the system, we present the results of an empirical evaluation conducted with language teachers and learners which indicate the usefulness of the system for educational purposes. 
We have integrated our system into a freely available online learning platform.}, author = {Pilán, Ildikó and Volodina, Elena and Borin, Lars}, year = {2016}, volume = {57}, number = {3}, pages = {67--91}, } @book{Volodina-Elena2015-226574, title = {Proceedings of the 4th workshop on NLP for computer assisted language learning at Nodalida 2015, Vilnius, 11th May, 2015}, author = {Volodina, Elena and Borin, Lars and Pilán, Ildikó}, year = {2015}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-7519-036-5}, } @article{Pilán-Ildikó2015-226565, title = {A readable read: Automatic Assessment of Language Learning Materials based on Linguistic Complexity.}, abstract = {Corpora and web texts can become a rich language learning resource if we have a means of assessing whether they are linguistically appropriate for learners at a given proficiency level. In this paper, we aim at addressing this issue by presenting the first approach for predicting linguistic complexity for Swedish second language learning material on a 5-point scale. After showing that the traditional Swedish readability measure, Läsbarhetsindex (LIX), is not suitable for this task, we propose a supervised machine learning model, based on a range of linguistic features, that can reliably classify texts according to their difficulty level. Our model obtained an accuracy of 81.3% and an F-score of 0.8, which is comparable to the state of the art in English and is considerably higher than previously reported results for other languages. We further studied the utility of our features with single sentences instead of full texts since sentences are a common linguistic unit in language learning exercises. We trained a separate model on sentence-level data with five classes, which yielded 63.4% accuracy. Although this is lower than the document level performance, we achieved an adjacent accuracy of 92%. 
Furthermore, we found that using a combination of different features, compared to using lexical features alone, resulted in 7% improvement in classification accuracy at the sentence level, whereas at the document level, lexical features were more dominant. Our models are intended for use in a freely accessible web-based language learning platform for the automatic generation of exercises, and they will be available also in the form of web-services.}, author = {Pilán, Ildikó and Vajjala, Sowmya and Volodina, Elena}, year = {2015}, volume = {Epub ahead of print}, } @inProceedings{Volodina-Elena2015-226543, title = {Lark Trills for Language Drills: Text-to-speech technology for language learners.}, abstract = {This paper reports on the development and the initial evaluation of a dictation&spelling prototype exercise for second language (L2) learners of Swedish based on text-to-speech (TTS) technology. Implemented on an already existing Intelligent Computer-Assisted Language Learning (ICALL) platform, the exercise has not only served as a test case for TTS in L2 environment, but has also shown a potential to train listening and orthographic skills, as well as has become a way of collecting learner-specific spelling errors into a database. Exercise generation re-uses well-annotated corpora, lexical resources, and text-to-speech technology with an accompanying talking head. }, booktitle = {Proceedings of the Ninth Workshop on Innovative Use of NLP for Building Educational Applications, June 4, 2015, Denver, Colorado, USA}, author = {Volodina, Elena and Pijetlovic, Dijana}, year = {2015}, ISBN = {978-1-941643-35-8}, pages = {107--117}, } @article{Kilgariff-Adam2014-188541, title = {Corpus-Based Vocabulary lists for Language Learners for Nine Languages.}, abstract = {We present the KELLY project and its work on developing monolingual and bilingual word lists for language learning, using corpus methods, for nine languages and thirty-six language pairs. 
We describe the method and discuss the many challenges encountered. We have loaded the data into an online database to make it accessible for anyone to explore and we present our own first explorations of it. The focus of the paper is thus twofold, covering pedagogical and methodological aspects of the lists’ construction, and linguistic aspects of the by-product of the project, the KELLY database. }, author = {Kilgarriff, Adam and Charalabopoulou, Frieda and Gavrilidou, Maria and Bondi Johannessen, Janne and Khalil, Saussan and Johansson Kokkinakis, Sofie and Lew, Robert and Sharoff, Serge and Vadlapudi, R. and Volodina, Elena}, year = {2014}, volume = {48}, number = {1}, pages = {121--163}, } @inProceedings{Pilán-Ildikó2014-200967, title = {Reusing Swedish FrameNet for training semantic roles}, abstract = {In this article we present the first experiences of reusing the Swedish FrameNet (SweFN) as a resource for training semantic roles. We give an account of the procedure we used to adapt SweFN to the needs of students of Linguistics in the form of an automatically generated exercise. During this adaptation, the mapping of the fine-grained distinction of roles from SweFN into learner-friendlier coarse-grained roles presented a major challenge. Besides discussing the details of this mapping, we describe the resulting multiple-choice exercise and its graphical user interface. The exercise was made available through Lärka, an online platform for students of Linguistics and learners of Swedish as a second language. We outline also aspects underlying the selection of the incorrect answer options which include semantic as well as frequency-based criteria. Finally, we present our own observations and initial user feedback about the applicability of such a resource in the pedagogical domain. Students' answers indicated an overall positive experience, the majority found the exercise useful for learning semantic roles. 
}, booktitle = {Proceedings of LREC 2014, May 26-31, 2014, Reykjavik, Iceland}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2014}, ISBN = {978-2-9517408-8-4}, pages = {1359--1363}, } @inProceedings{Pilán-Ildikó2014-10, title = "Rule-based and machine learning approaches for second language sentence-level readability.", booktitle = "Proceedings of the ACL 2014 9th Workshop on Innovative Use of NLP for Building Educational Applications, Baltimore, June 22-27 2014", author = "Pilán, Ildikó and Volodina, Elena and Johansson, Richard", year = "2014", isbn = "978-1-941643-03-7", url = "http://www.aclweb.org/anthology/W/W14/W14-1821.pdf", pages = "174--184", } @inProceedings{Volodina-Elena2014-201885, title = {A flexible language learning platform based on language resources and web services. }, abstract = {We present Lärka, the language learning platform of Språkbanken (the Swedish Language Bank). It consists of an exercise generator which reuses resources available through Språkbanken: mainly Korp, the corpus infrastructure, and Karp, the lexical infrastructure. Through Lärka we reach new user groups – students and teachers of Linguistics as well as second language learners and their teachers – and this way bring Språkbanken's resources in a relevant format to them. Lärka can therefore be viewed as a case of a real-life language resource evaluation with end users. In this article we describe Lärka's architecture, its user interface, and the five exercise types that have been released for users so far. The first user evaluation following in-class usage with students of linguistics, speech therapy and teacher candidates are presented. 
The outline of future work concludes the paper.}, booktitle = {Proceedings of LREC 26-31 May 2014, Reykjavik, Iceland }, author = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Lindström Tiedemann, Therese}, year = {2014}, ISBN = {978-2-9517408-8-4}, pages = {3973--3978}, } @book{Volodina-Elena2014-206135, title = {Proceedings of the third workshop on NLP for computer-assisted language learning at SLTC 2014, Uppsala University}, abstract = {The workshop series on NLP for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The papers in the proceedings volume from the third NLP4CALL workshop cover three main topic areas: resources for development of ICALL applications (e.g., learner corpora and coursebook corpora), tools and algorithms for the analysis of learner language (e.g., focusing on collocations, reading tasks, cloze items, pronunciation, spelling, level classification of learner production), and the generation of learning materials (e.g., exercise generators).}, author = {Volodina, Elena and Borin, Lars and Pilán, Ildikó}, year = {2014}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-7519-175-1}, } @inProceedings{Volodina-Elena2014-206141, title = {Evaluating students' metalinguistic knowledge with Lärka.}, booktitle = {Proceedings of the 5th Swedish Language Technology Conference, Uppsala University 13-14 November 2014}, author = {Volodina, Elena and Lindström Tiedemann, Therese}, year = {2014}, } @inProceedings{Pilán-Ildikó2014-210940, title = {Rule-based and machine learning approaches for second language sentence-level readability}, abstract = {We present approaches for the identification of sentences understandable by second language learners of Swedish, which can be used in automatically generated 
exercises based on corpora. In this work we merged methods and knowledge from machine learning-based readability research, from rule-based studies of Good Dictionary Examples and from second language learning syllabuses. The proposed selection methods have also been implemented as a module in a free web-based language learning platform. Users can use different parameters and linguistic filters to personalize their sentence search with or without a machine learning component assessing readability. The sentences selected have already found practical use as multiple-choice exercise items within the same platform. Out of a number of deep linguistic indicators explored, we found mainly lexical-morphological and semantic features informative for second language sentence-level readability. We obtained a readability classification accuracy result of 71%, which approaches the performance of other models used in similar tasks. Furthermore, during an empirical evaluation with teachers and students, about seven out of ten sentences selected were considered understandable, the rule-based approach slightly outperforming the method incorporating the machine learning model.}, booktitle = {Proceedings of the Ninth Workshop on Innovative Use of NLP for Building Educational Applications, June 26, 2014 Baltimore, Maryland, USA}, author = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard}, year = {2014}, ISBN = {978-1-941643-03-7}, pages = {174----184}, } @inProceedings{Volodina-Elena2014-206132, title = {You get what you annotate: a pedagogically annotated corpus of coursebooks for Swedish as a Second Language.}, abstract = {We present the COCTAILL corpus, containing over 700.000 tokens of Swedish texts from 12 coursebooks aimed at second/foreign language (L2) learning. Each text in the corpus is labelled with a proficiency level according to the CEFR proficiency scale. 
Genres, topics, associated activities, vocabulary lists and other types of information are annotated in the coursebooks to facilitate Second Language Acquisition (SLA)-aware studies and experiments aimed at Intelligent Computer-Assisted Language Learning (ICALL). Linguistic annotation in the form of parts-of-speech (POS; e.g. nouns, verbs), base forms (lemmas) and syntactic relations (e.g. subject, object) has been also added to the corpus. In the article we describe our annotation scheme and the editor we have developed for the content mark-up of the coursebooks, including the taxonomy of pedagogical activities and linguistic skills. Inter-annotator agreement has been computed and reported on a subset of the corpus. Surprisingly, we have not found any other examples of pedagogically marked-up corpora based on L2 coursebooks to draw on existing experiences. Hence, our work may be viewed as “groping in the darkness” and eventually a starting point for others. The paper also presents our first quantitative exploration of the corpus where we focus on textually and pedagogically annotated features of the coursebooks to exemplify what types of studies can be performed using the presented annotation scheme. We explore trends shown in use of topics and genres over proficiency levels and compare pedagogical focus of exercises across levels. The final section of the paper summarises the potential this corpus holds for research within SLA and various ICALL tasks. }, booktitle = {NEALT Proceedings Series}, author = {Volodina, Elena and Pilán, Ildikó and Eide, Stian Rødven and Heidarsson, Hannes}, year = {2014}, volume = {22}, ISBN = {978-91-7519-175-1}, pages = {128--144}, } @inProceedings{Pilán-Ildikó2013-188465, title = {Automatic Selection of Suitable Sentences for Language Learning Exercises}, abstract = {In this study we investigated second and foreign language (L2) sentence readability, an area little explored so far in the case of several languages, including Swedish. 
The outcome of our research consists of two methods for sentence selection from native language corpora based on Natural Language Processing (NLP) and machine learning (ML) techniques. The two approaches have been made available online within Lärka, an Intelligent CALL (ICALL) platform offering activities for language learners and students of linguistics. Such an automatic selection of suitable sentences can be valuable for L2 teachers during the creation of new teaching materials, for L2 students who look for additional self-study exercises as well as for lexicographers in search of example sentences to illustrate the meaning of a vocabulary item. Members from all these potential user groups evaluated our methods and found the majority of the sentences selected suitable for L2 learning purposes.}, booktitle = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, 11th to 14th September 2013 Évora, Portugal, Proceedings.}, author = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard}, year = {2013}, ISBN = {978-1-908416-12-4}, pages = {218--225}, } @inProceedings{Pijetlovic-Dijana2013-188543, title = {Developing a Swedish spelling game on an ICALL platform}, abstract = {In this project we developed web services on the ICALL platform Lärka for automatic generation of Swedish spelling exercises using Text-To-Speech (TTS) technology which allows L2 learners to train their spelling and listening individually performance based levels. The embedded avatar pronounces a random item of the desired level, which the user has to spell. Furthermore, the users have the possibility to train their own words for different linguistic levels. A result tracker containing a total and correct answer score keeps track of the language learner’s performance. In order to analyse typical spelling errors and provide better feedback, misspellings are collected in a database. 
The usability of the spelling exercises, concerning the different linguistic levels and the quality of speech, has been evaluated through a questionnaire with 10 participants.}, booktitle = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, Évora, Portugal, Proceedings.}, author = {Pijetlovic, Dijana and Volodina, Elena}, year = {2013}, ISBN = {978-1-908416-12-4}, } @inProceedings{Volodina-Elena2013-188550, title = {Compiling a corpus of CEFR-related texts.}, abstract = {This paper reports on initial efforts to compile a corpus of course book texts used for teaching CEFR-based courses of Swedish to adult immigrants. The research agenda behind compiling such a corpus comprises the study of normative “input” texts that can reveal a number of facts about what is being taught in terms of explicit grammar, receptive vocabulary, text and sentence readability; as well as build insights into linguistic characteristics of normative texts which can help anticipate learner performance in terms of active vocabulary, grammatical competence, etc. in classroom and testing settings. The CEFR “can-do” statements are known to offer flexibility in interpreting them for different languages and target groups. However, they are nonspecific and therefore it is difficult to associate different kinds of competences and levels of accuracy learners need in order to perform the communicative tasks with the different CEFR levels. To address this problem a systematic study needs to be performed for each individual anguage, both for “input” normative texts and “output” learner-produced texts. In this project we take the first step to collect and study normative texts for Swedish. The article describes the process of corpus compilation, annotation scheme of CEFR- relevant parameters, and methods proposed for text analysis, namely statistic and empiric methods, as well as techniques coming from computational linguistics/machine learning. 
}, booktitle = {Proceedings of the Language Testing and CEFR conference, Antwerpen, Belgium, May 27-29, 2013}, author = {Volodina, Elena and Johansson Kokkinakis, Sofie}, year = {2013}, } @inProceedings{Volodina-Elena2013-188549, title = {Towards a gold standard for Swedish CEFR-based ICALL}, abstract = {In qualitative projects on ICALL (Intelligent Computer-Assisted Language Learning), research and development always go hand in hand: development both depends upon the research results and dictates the research agenda. Likewise, in the development of the Swedish ICALL platform Lärka, the practical issues of development have dictated its research agenda. With NLP approaches, sooner or later, the necessity for reliable training data becomes unavoidable. At the moment Lärka's research agenda cannot be addressed without access to reliable training data, so-called “gold standard”. This paper gives an overview of the current state of the Swedish ICALL platform development and related research agenda, and describes the first attempts to collect the reference corpus (“gold standard”) coming from course books used in CEFR-based language teaching.}, booktitle = {Proceedings of the Second Workshop on NLP for Computer-Assisted Language Learning. NEALT Proceedings Series 17. Nodalida 2013, Oslo, Norway. 
}, author = {Volodina, Elena and Pijetlovic, Dijana and Pilán, Ildikó and Johansson Kokkinakis, Sofie}, year = {2013}, ISBN = {978-91-7519-588-9}, } @book{Volodina-Elena2013-188675, title = {Proceedings of the second workshop on NLP for computer-assisted language learning at NODALIDA 2013 May 22-24, 2013, Oslo, Norway}, author = {Volodina, Elena and Borin, Lars and Loftsson, Hrafn}, year = {2013}, publisher = {Linköping University Press}, adress = {Linköping, Sweden}, ISBN = {978-91-7519-588-9}, } @inProceedings{Volodina-Elena2012-154723, title = {Introducing Swedish Kelly-list, a new free e-resource for Swedish}, abstract = {Frequency lists and/or lexicons contain information about the words and their statistics. They tend to find their “readers” among linguists, lexicographers, language teachers. Making them available in electronic format helps to expand the target group to cover language engineers, computer programmers and other specialists working in such areas as information retrieval, spam filtering, text readability analysis, test generation, etc. This article describes a new freely available electronic frequency list of modern Swedish that was created in the EU project KELLY. We describe the state of affairs for Swedish frequency lexicons; provide a short description of the KELLY project; mention the corpus the list has been derived from. Further, we dwell on the type of information the list contains, describe shortly the steps for list generation; provide information on the coverage and some other statistics over the items in the list. Finally, some practical information on the license for the Swedish Kelly-list distribution is given; potential application areas are suggested; and future plans for its expansion are mentioned. We hope that with some publicity we can help this list find its users. 
}, booktitle = {LREC 2012 Proceedings}, author = {Volodina, Elena and Johansson Kokkinakis, Sofie}, year = {2012}, volume = {2012}, } @inProceedings{Volodina-Elena2012-165936, title = {Waste not, want not: Towards a system architecture for ICALL based on NLP component re-use}, booktitle = {Proceedings of the SLTC 2012 workshop on NLP for CALL, Lund, 25th October, 2012}, author = {Volodina, Elena and Borin, Lars and Loftsson, Hrafn and Arnbjörnsdóttir, Birna and Leifsson, Guðmundur Örn}, year = {2012}, pages = {47--58}, } @inProceedings{Volodina-Elena2012-165961, title = {Semi-automatic selection of best corpus examples for Swedish: Initial algorithm evaluation.}, abstract = {The study presented here describes the results of the initial evaluation of two sorting approaches to automatic ranking of corpus examples for Swedish. Representatives from two potential target user groups have been asked to rate top three hits per approach for sixty search items from the point of view of the needs of their professional target groups, namely second/foreign language (L2) teachers and lexicographers. This evaluation has shown, on the one hand, which of the two approaches to example rating (called in the text below algorithms #1 and #2) performs better in terms of finding better examples for each target user group; and on the other hand, which features evaluators associate with good examples. It has also facilitated statistic analysis of the “good” versus “bad” examples with reference to the measurable features, such as sentence length, word length, lexical frequency profiles, PoS constitution, dependency structure, etc. with a potential to find out new reliable classifiers.}, booktitle = {Proceedings of the SLTC 2012 workshop on NLP for CALL, Lund, 25th October, 2012. 
}, author = {Volodina, Elena and Johansson, Richard and Johansson Kokkinakis, Sofie}, year = {2012}, number = {080}, pages = {59--70}, } @techreport{Volodina-Elena2012-165964, title = {Swedish Kelly: Technical Report.}, author = {Volodina, Elena and Johansson Kokkinakis, Sofie}, year = {2012}, publisher = {University of Gothenburg}, adress = {Göteborg}, } @inProceedings{Volodina-Elena2012-168523, title = {Developing an Open-Source Web-Based Exercise Generator for Swedish}, abstract = {This paper reports on the ongoing international project System architecture for ICALL and the progress made by the Swedish partner. The Swedish team is developing a web-based exercise generator reusing available annotated corpora and lexical resources. Apart from the technical issues like implementation of the user interface and the underlying processing machinery, a number of interesting pedagogical questions need to be solved, e.g., adapting learner-oriented exercises to proficiency levels; selecting authentic examples of an appropriate difficulty level; automatically ranking corpus examples by their quality; providing feedback to the learner, and selecting vocabulary for training domain-specific, academic or general-purpose vocabulary. In this paper we describe what has been done so far, mention the exercise types that can be generated at the moment as well as describe the tasks left for the future. }, booktitle = {CALL: Using, Learning, Knowing. EuroCALL Conference, Gothenburg, Sweden, 22-25 August 2012, Proceedings. Eds. Linda Bradley and Sylvie Thouësny. 
Research-publishing.net, Dublin, Ireland}, author = {Volodina, Elena and Borin, Lars}, year = {2012}, volume = {2012}, ISBN = {978-1-908416-03-2}, } @inProceedings{Volodina-Elena2012-168516, title = {Towards a system architecture for ICALL}, abstract = {In this paper, we present an on-going project whose overall aim is to develop open-source system architecture for supporting ICALL systems that will facilitate re-use of existing NLP tools and resources on a plug-and-play basis. We introduce the project, describe the approaches adopted by the two language teams, and present two applications being developed using the proposed architecture.}, booktitle = {In G. Biswas et al. (eds), Proceedings of the 20th International Conference on Computers in Education. Singapore: Asia-Pacific Society for Computers in Education}, author = {Volodina, Elena and Hrafn, Loftsson and Arnbjörnsdóttir, Birna and Borin, Lars and Leifsson, Guðmundur Örn}, year = {2012}, volume = {2012}, ISBN = {978-981-07-4649-0}, } @inProceedings{Charalabopoulou-Frieda2012-168525, title = {Building Corpus-Informed Word Lists for L2 Vocabulary Learning in Nine Languages}, abstract = {Lexical competence constitutes a crucial aspect in L2 learning, since building a rich repository of words is considered indispensable for successful communication. CALL practitioners have experimented with various kinds of computer-mediated glosses to facilitate L2 vocabulary building in the context of incidental vocabulary learning. Intentional learning, on the other hand, is generally underestimated, since it is considered out of fashion and not in line with the communicative L2 learning paradigm. 
Yet, work is still being done in this area and substantial body of research indicates that the usefulness of incidental vocabulary learning does not exclude the use of dedicated vocabulary study and that by using aids explicitly geared to building vocabularies (such as word lists and word cards) L2 learners exhibit good retention rates and faster learning gains. Intentional vocabulary study should, therefore, have its place in the instructional and learning context. Regardless of the approach, incidental or intentional, the crucial question with respect to vocabulary teaching/learning remains: which and how many words should we teach/learn at different language levels? An attempt to answer the above question was made within the framework of the EU-funded project titled “KELLY” (Keywords for Language Learning for Young and Adults Alike) presented here. The project aimed at building corpus-informed vocabulary lists for L2 learners ranging from A1 to C2 levels for nine languages: Arabic, Chinese, English, Greek, Italian, Norwegian, Polish, Russian and Swedish. }, booktitle = {CALL: Using, Learning, Knowing. EuroCALL Conference, Gothenburg, Sweden, 22-25 August 2012, Proceedings. Eds. Linda Bradley and Sylvie Thouësny. 
Research-publishing.net, Dublin, Ireland}, author = {Charalabopoulou, Frieda and Gavrilidou, Maria and Johansson Kokkinakis, Sofie and Volodina, Elena}, year = {2012}, volume = {2012}, ISBN = {978-1-908416-03-2}, } @book{Borin-Lars2012-188679, title = {Proceedings of the SLTC 2012 workshop on NLP for CALL}, author = {Borin, Lars and Volodina, Elena}, year = {2012}, publisher = {LiU Electronic Press}, adress = {Linköping}, } @article{JohanssonKokkinakis-Sofie2011-148533, title = {Corpus-based approaches for the creation of a frequency based vocabulary list in the EU project KELLY – issues on reliability, validity and coverage}, abstract = {At present there are relatively few vocabulary lists for Swedish describing modern vocabulary as well as being adapted to language learners’ needs. In Europe including Sweden there exist approaches to unify ways of working consistently with language learning, one example worth naming in this respect is the Common European Framework of Reference (CEFR) which provides guidelines for systematic approach to language teaching and assessment of language proficiency. This article describes EU project Kelly (KEywords for Language Learning for Young and adults alike, 2009-2012), the main objective of which was to create vocabulary lists for nine languages (Swedish, English, Norwegian, Greek, Italian, Polish, Arabic, Chinese and Russian) and adapt them to CEFR levels. We describe the process of compiling and validating the Swedish Kelly-list, dwell on benefits and limitations of using a corpus based approach in this project; as well as mention the impact of the methodological approach for compiling vocabulary lists for specific purposes. 
}, author = {Johansson Kokkinakis, Sofie and Volodina, Elena}, year = {2011}, volume = {2011}, } @book{Volodina-Elena2010-127225, title = {Corpora in Language Classroom: Reusing Stockholm Umeå Corpus in a vocabulary exercise generator}, abstract = {Authentic examples as teaching material are not easy to obtain. Corpora are able to solve this problem, as has been witnessed before. Most experiments with corpora in language classroom describe concordances. However, there are numerous other ways of bringing corpora into language education, as shown in this research. A selective learner-oriented exercise generator has been implemented on the basis of Stockholm Umeå Corpus (SUC). SUC texts have been tested for readability and levels were assigned. This generator assists in automatic selection of authentic examples of appropriate learner levels as well as in construction of wordbank-, multiple choice items and c-tests for a specified proficiency level, frequency band and word class. In Vocabulary Size Test potential words are generated on the basis of existing morphemes and SUC-based frequency lists. Interesting practical and theoretical questions connected with reusage of corpora in an exercise generator are described in this book. The research might inspire computational linguists, language teachers and everyone interested in Computer-Assisted Language Learning and Corpus Linguistics to test similar techniques in their practices. }, author = {Volodina, Elena}, year = {2010}, publisher = {Lambert Academic Publishing}, adress = {Saarbrücken}, ISBN = {978-3-8433-5256-7}, }