Skip to main content
Språkbanken Text is a part of Språkbanken.

BibTeX

% Journal article. The ampersand in the abstract is escaped because it is a
% LaTeX special character; the country name is braced to survive sentence casing.
@article{skoldberg-etal-2019-state-279701,
	author       = {Sköldberg, Emma and Holmer, Louise and Volodina, Elena and Pilán, Ildikó},
	title        = {State-of-the-art on monolingual lexicography for {Sweden}},
	journal      = {Slovenščina 2.0: Empirical, Applied and Interdisciplinary Research},
	year         = {2019},
	volume       = {7},
	number       = {1},
	pages        = {13--24},
	abstract     = {The minireview describes the state-of-the-art of Swedish monolingual lexicography. The main actors in the field, both commercial and non-commercial, are mentioned alongside with the description of lexicographic products that have been offered by them to the lexicon users. The minireview makes it clear that there is an obvious tendency among the Swedish dictionary users to abandon paper-based dictionaries and switch over to online portals and apps, which influences the practices adopted by commercial publishing houses, such as Norstedts, Bonniers, Natur \& Kultur. Among the leading non-commercial players, the Swedish Academy, the Swedish Language Bank, Institute for Language and Folklore are named. Swedish monolingual lexicography offers, however, dictionaries produced not only by experts but also by non-experts (i.e. using the efforts of the crowd).},
}

% Conference paper (LREC 2016). The title carries no trailing period because
% bibliography styles add their own punctuation; acronyms and proper nouns are braced.
% NOTE(review): no page range is recorded for this paper.
@inproceedings{francois-etal-2016-svalex-248142,
	author       = {François, Thomas and Volodina, Elena and Pilán, Ildikó and Tack, Anaïs},
	title        = {{SVALex}: a {CEFR}-graded lexical resource for {Swedish} foreign and second language learners},
	booktitle    = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016), May 23--28, 2016, Portorož, Slovenia},
	year         = {2016},
	publisher    = {European Language Resources Association},
	address      = {Paris},
	isbn         = {978-2-9517408-9-1},
	abstract     = {The paper introduces SVALex, a lexical resource primarily aimed at learners and teachers of Swedish as a foreign and second language that describes the distribution of 15,681 words and expressions across the Common European Framework of Reference (CEFR). The resource is based on a corpus of coursebook texts, and thus describes receptive vocabulary learners are exposed to during reading activities, as opposed to productive vocabulary they use when speaking or writing. The paper describes the methodology applied to create the list and to estimate the frequency distribution. It also discusses some characteristics of the resulting resource and compares it to other lexical resources for Swedish. An interesting feature of this resource is the possibility to separate the wheat from the chaff, identifying the core vocabulary at each level, i.e. vocabulary shared by several coursebook writers at each level, from peripheral vocabulary which is used by the minority of the coursebook writers.},
}

% Workshop paper (BEA 11, 2016). The percent sign in the abstract is escaped:
% unescaped, LaTeX would treat it as a comment start and drop the rest of the line.
% NOTE(review): publisher and page range are not recorded for this paper.
@inproceedings{pilan-2016-detecting-243715,
	author       = {Pilán, Ildikó},
	title        = {Detecting Context Dependence in Exercise Item Candidates Selected from Corpora},
	booktitle    = {Proceedings of the 11th Workshop on Innovative Use of {NLP} for Building Educational Applications, June 12 to June 17, 2016, San Diego, USA},
	year         = {2016},
	abstract     = {We explore the factors influencing the dependence of single sentences on their larger textual context in order to automatically identify candidate sentences for language learning exercises from corpora which are presentable in isolation. An in-depth investigation of this question has not been previously carried out. Understanding this aspect can contribute to a more efficient selection of candidate sentences which, besides reducing the time required for item writing, can also ensure a higher degree of variability and authenticity. We present a set of relevant aspects collected based on the qualitative analysis of a smaller set of context-dependent corpus example sentences. Furthermore, we implemented a rule-based algorithm using these criteria which achieved an average precision of 0.76 for the identification of different issues related to context dependence. The method has also been evaluated empirically where 80\% of the sentences in which our system did not detect context-dependent elements were also considered context-independent by human raters.},
}

% Conference paper (COLING 2016). The abstract has been re-flowed to remove the
% hard line wraps and justified spacing left over from PDF extraction.
% NOTE(review): publisher and page range are not recorded for this paper.
@inproceedings{pilan-etal-2016-predicting-247240,
	author       = {Pilán, Ildikó and Volodina, Elena and Zesch, Torsten},
	title        = {Predicting proficiency levels in learner writings by transferring a linguistic complexity model from expert-written coursebooks},
	booktitle    = {Proceedings of the 26th International Conference on Computational Linguistics ({COLING}), December 13--16, 2016, Osaka},
	year         = {2016},
	isbn         = {978-4-87974-702-0},
	abstract     = {The lack of a sufficient amount of data tailored for a task is a well-recognized problem for many statistical NLP methods. In this paper, we explore whether data sparsity can be successfully tackled when classifying language proficiency levels in the domain of learner-written output texts. We aim at overcoming data sparsity by incorporating knowledge in the trained model from another domain consisting of input texts written by teaching professionals for learners. We compare different domain adaptation techniques and find that a weighted combination of the two types of data performs best, which can even rival systems based on considerably larger amounts of in-domain data. Moreover, we show that normalizing errors in learners’ texts can substantially improve classification when in-domain data with annotated proficiency levels is not available.},
}

% Conference paper (LREC 2016) describing the SweLL learner corpus.
% Trailing period dropped from the title; proper nouns braced against sentence casing.
% NOTE(review): no page range is recorded for this paper.
@inproceedings{volodina-etal-2016-swell-248141,
	author       = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Llozhi, Lorena and Lundkvist, Peter and Sundberg, Gunlög and Sandell, Monica},
	title        = {{SweLL} on the rise: {Swedish Learner Language} corpus for {European Reference Level} studies},
	booktitle    = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC} 2016), May 23--28, 2016, Portorož, Slovenia},
	year         = {2016},
	publisher    = {European Language Resources Association},
	address      = {Paris},
	isbn         = {978-2-9517408-9-1},
	abstract     = {We present a new resource for Swedish, SweLL, a corpus of Swedish Learner essays linked to learners’ performance according to the Common European Framework of Reference (CEFR). SweLL consists of three subcorpora – SpIn, SW1203 and Tisus, collected from three different educational establishments. The common metadata for all subcorpora includes age, gender, native languages, time of residence in Sweden, type of written task. Depending on the subcorpus, learner texts may contain additional information, such as text genres, topics, grades. Five of the six CEFR levels are represented in the corpus: A1, A2, B1, B2 and C1 comprising in total 339 essays. C2 level is not included since courses at C2 level are not offered. The work flow consists of collection of essays and permits, essay digitization and registration, meta-data annotation, automatic linguistic annotation. Inter-rater agreement is presented on the basis of SW1203 subcorpus. The work on SweLL is still ongoing with more than 100 essays waiting in the pipeline. This article both describes the resource and the “how-to” behind the compilation of SweLL.},
}

% Journal article reporting the European survey of dictionary use.
% The last author is a collective contributor, so it is double-braced to stop
% BibTeX from parsing it as a person with surname "Partners".
% The page range uses the ASCII double hyphen expected by bibliography styles.
@article{kosem-etal-2019-image-275354,
	author       = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {{Local Partners}}},
	title        = {The image of the monolingual dictionary across {Europe}. {Results} of the {European} survey of dictionary use and culture},
	journal      = {International Journal of Lexicography},
	year         = {2019},
	volume       = {32},
	number       = {1},
	pages        = {92--114},
	abstract     = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. The present report covers the general section.},
}

% Whole proceedings volume (NLP4CALL 2020). Typed as a proceedings entry so that
% publisher, address and ISBN are rendered; the listed people are recorded as editors.
% NOTE(review): confirm that the five names are the volume editors.
@proceedings{alfter-etal-2020-proceedings-300071,
	editor       = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Lange, Herbert and Borin, Lars},
	title        = {Proceedings of the 9th Workshop on Natural Language Processing for Computer Assisted Language Learning 2020},
	year         = {2020},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	isbn         = {978-91-7929-732-9},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.
This collection presents four selected papers describing use of Language Technology for language learning.},
}

% Workshop paper (ISCA SLaTE 2017) presenting the Lärka platform.
@inproceedings{pilan-etal-2017-larka-289884,
	author       = {Pilán, Ildikó and Alfter, David and Volodina, Elena},
	year         = {2017},
	title        = {Lärka: an online platform where language learning meets natural language processing},
	booktitle    = {7th ISCA Workshop on Speech and Language Technology in Education, 25-26 August 2017, Stockholm, Sweden},
}

% Whole proceedings volume (NLP4CALL 2019). Typed as a proceedings entry with the
% listed people recorded as editors; venue punctuation in the title repaired.
% NOTE(review): confirm that the five names are the volume editors.
@proceedings{alfter-etal-2019-proceedings-285613,
	editor       = {Alfter, David and Volodina, Elena and Borin, Lars and Pilán, Ildikó and Lange, Herbert},
	title        = {Proceedings of the 8th Workshop on Natural Language Processing for Computer Assisted Language Learning ({NLP4CALL} 2019), September 30, Turku, Finland},
	year         = {2019},
	publisher    = {Linköping University Electronic Press, Linköpings universitet},
	address      = {Linköping},
	isbn         = {978-91-7929-998-9},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promote development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other.

The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools.

The NLP4CALL workshop series is aimed at bringing together competences from these areas for sharing experiences and brainstorming around the future of the field.},
}

% Conference paper on Lärka as a research infrastructure component.
% NOTE(review): the booktitle looks like a series name rather than a proceedings
% title; confirm the actual proceedings title and move this value to a series field.
% NOTE(review): no page range is recorded for this paper.
@inproceedings{alfter-etal-2019-larka-281344,
	author       = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
	title        = {{Lärka}: From Language Learning Platform to Infrastructure for Research on Language Learning},
	booktitle    = {Linköping Electronic Conference Proceedings},
	year         = {2019},
	publisher    = {Linköping University Press},
	address      = {Linköping},
	isbn         = {978-91-7685-034-3},
	abstract     = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN. Lärka has recently received a new responsive user interface adapted to different devices with different screen sizes. Moreover, the system has also been augmented with new functionalities. These recent additions aim at improving the usability and the usefulness of the platform for pedagogical purposes. The most important development, though, is the adaptation of the platform to serve as a component in an e-infrastructure supporting research on language learning and multilingualism. Thanks to Lärka’s service-oriented architecture, most functionalities are also available as web services which can be easily re-used by other applications.},
}

% Workshop paper (LaTeCH-CLfL at COLING 2018). Trailing periods removed from
% title and booktitle; publisher name corrected to match the spelling used
% elsewhere in this file; line-break hyphenation artefacts in the abstract repaired.
% NOTE(review): no page range is recorded for this paper.
@inproceedings{pilan-volodina-2018-exploring-275366,
	author       = {Pilán, Ildikó and Volodina, Elena},
	title        = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
	booktitle    = {Proceedings of the Second Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, {COLING}, Santa Fe, New Mexico, USA, August 25, 2018},
	year         = {2018},
	publisher    = {Association for Computational Linguistics},
	isbn         = {978-1-948087-61-2},
	abstract     = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
}

% Workshop paper (Linguistic Complexity and NLP at COLING 2018). Trailing periods
% removed from title and booktitle; publisher name corrected to match the spelling
% used elsewhere in this file; hyphenation artefact in the abstract repaired.
% NOTE(review): no page range is recorded for this paper.
@inproceedings{pilan-volodina-2018-investigating-275367,
	author       = {Pilán, Ildikó and Volodina, Elena},
	title        = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
	booktitle    = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, {COLING}, Santa Fe, New Mexico, USA, August 25, 2018},
	year         = {2018},
	publisher    = {Association for Computational Linguistics},
	isbn         = {978-1-948087-62-9},
	abstract     = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
}

% Whole proceedings volume (NLP4CALL 2018). Typed as a proceedings entry with the
% listed people recorded as editors. The address field now holds the publisher's
% city, matching the other Linköping University Electronic Press entries in this file.
% NOTE(review): confirm that the four names are the volume editors.
@proceedings{pilan-etal-2018-proceedings-275358,
	editor       = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
	title        = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018), {SLTC}, Stockholm, 7th November 2018},
	year         = {2018},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	isbn         = {978-91-7685-173-9},
	abstract     = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other.

The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
}

% Shared-task system paper (BEA 13, 2018). The stray space inside the team name
% is removed and the name is braced so styles cannot recase it.
% NOTE(review): no page range is recorded for this paper.
@inproceedings{alfter-pilan-2018-complex-276407,
	author       = {Alfter, David and Pilán, Ildikó},
	title        = {{SB@GU} at the {Complex Word Identification} 2018 {Shared Task}},
	booktitle    = {Proceedings of the Thirteenth Workshop on Innovative Use of {NLP} for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
	year         = {2018},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA, USA},
	isbn         = {978-1-948087-11-7},
}

% Conference paper (CLARIN 2018). Hyphenation artefact in the abstract repaired.
% NOTE(review): publisher and page range are not recorded for this paper.
@inproceedings{alfter-etal-2018-from-275364,
	author       = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
	title        = {From Language Learning Platform to Infrastructure for Research on Language Learning},
	booktitle    = {Proceedings of {CLARIN}-2018 conference, Pisa, Italy},
	year         = {2018},
	abstract     = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
}

% Journal article on readability assessment for Swedish L2 material.
% Percent signs in the abstract are escaped (LaTeX comment character) and the
% trailing period is dropped from the title.
% NOTE(review): the journal name may be abbreviated here; confirm the full journal title.
@article{pilan-etal-2016-readable-226565,
	author       = {Pilán, Ildikó and Vajjala, Sowmya and Volodina, Elena},
	title        = {A readable read: Automatic Assessment of Language Learning Materials based on Linguistic Complexity},
	journal      = {Computational Linguistics and Applications},
	year         = {2016},
	volume       = {7},
	number       = {1},
	pages        = {143--159},
	abstract     = {Corpora and web texts can become a rich language learning resource if we have a means of assessing whether they are linguistically appropriate for learners at a given proficiency level. In this paper, we aim at addressing this issue by presenting the first approach for predicting linguistic complexity for Swedish second language learning material on a 5-point scale. After showing that the traditional Swedish readability measure, Läsbarhetsindex (LIX), is not suitable for this task, we propose a supervised machine learning model, based on a range of linguistic features, that can reliably classify texts according to their difficulty level. Our model obtained an accuracy of 81.3\% and an F-score of 0.8, which is comparable to the state of the art in English and is considerably higher than previously reported results for other languages. We further studied the utility of our features with single sentences instead of full texts since sentences are a common linguistic unit in language learning exercises. We trained a separate model on sentence-level data with five classes, which yielded 63.4\% accuracy. Although this is lower than the document level performance, we achieved an adjacent accuracy of 92\%. Furthermore, we found that using a combination of different features, compared to using lexical features alone, resulted in 7\% improvement in classification accuracy at the sentence level, whereas at the document level, lexical features were more dominant. Our models are intended for use in a freely accessible web-based language learning platform for the automatic generation of exercises, and they will be available also in the form of web-services.},
}

% Journal article (TAL special issue) on selecting exercise sentences from corpora.
@article{pilan-etal-2017-candidate-260382,
	author       = {Pilán, Ildikó and Volodina, Elena and Borin, Lars},
	year         = {2017},
	title        = {Candidate sentence selection for language learning exercises: From a comprehensive framework to an empirical evaluation},
	journal      = {Revue Traitement Automatique des Langues. Special issue on NLP for Learning and Teaching},
	volume       = {57},
	number       = {3},
	pages        = {67--91},
	abstract     = {We present a framework and its implementation relying on Natural Language Processing methods, which aims at the identification of exercise item candidates from corpora. The hybrid system combining heuristics and machine learning methods includes a number of relevant selection criteria. We focus on two fundamental aspects: linguistic complexity and the dependence of the extracted sentences on their original context. Previous work on exercise generation addressed these two criteria only to a limited extent, and a refined overall candidate sentence selection framework appears also to be lacking. In addition to a detailed description of the system, we present the results of an empirical evaluation conducted with language teachers and learners which indicate the usefulness of the system for educational purposes. We have integrated our system into a freely available online learning platform.},
}

% Preface (front matter) of the joint NLP4CALL / NLP4LA 2017 proceedings.
% Recorded as a contribution to the proceedings so that volume and pages are
% rendered; standard styles ignore those fields on a misc entry. The former
% combined title is split into title (Preface) and booktitle (the proceedings).
% Author spelling aligned with the 2016 preface entry in this file.
@inproceedings{volodina-etal-2017-preface-262846,
	author       = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Grigonytė, Gintarė and Nilsson Björkenstam, Kristina},
	title        = {Preface},
	booktitle    = {Proceedings of the Joint 6th Workshop on {NLP} for Computer Assisted Language Learning and 2nd Workshop on {NLP} for Research on Language Acquisition at {NoDaLiDa} 2017, Gothenburg, 22nd May 2017},
	year         = {2017},
	volume       = {30},
	pages        = {i--vi},
	abstract     = {For the second year in a row we brought two related themes of NLP for Computer-Assisted Language Learning and NLP for Language Acquisition together. The goal of organizing joint workshops is to provide a meeting place for researchers working on language learning issues including both empirical and experimental studies and NLP-based applications. The resulting volume covers a variety of topics from the two fields and - hopefully - showcases the challenges and achievements in the field.
The seven papers in this volume cover native language identification in learner writings, using syntactic complexity development in language learner language to identify reading comprehension texts of appropriate level, exploring the potential of parallel corpora to predict mother-language specific problem areas for learners of another language, tools for learning languages - both well-resourced ones such as English as well as endangered or under-resourced ones such as Yakut and Võro, as well as exploring the potential of automatically identifying and correcting word-level errors in Swedish learner writing.},
}

% Conference paper (Svenskans beskrivning 35) describing SVALex, in Swedish.
% The volume editors, previously embedded in the booktitle after "Redigerad av",
% are moved to the editor field. Percent and ampersand are escaped as LaTeX
% special characters. Last author's given name aligned with the LREC 2016 entry.
% NOTE(review): surname split "Adamsson Eryd, Henrietta" is assumed; confirm.
% NOTE(review): no page range is recorded for this paper.
@inproceedings{volodina-etal-2017-svalex-262848,
	author       = {Volodina, Elena and Borin, Lars and Pilán, Ildikó and François, Thomas and Tack, Anaïs},
	title        = {{SVALex}. {En} andraspråksordlista med {CEFR-nivåer}},
	booktitle    = {Svenskans beskrivning. 35, Förhandlingar vid trettiofemte sammankomsten: Göteborg 11--13 maj 2016},
	editor       = {Sköldberg, Emma and Andréasson, Maia and Adamsson Eryd, Henrietta and Lindahl, Filippa and Lindström, Sven and Prentice, Julia and Sandberg, Malin},
	year         = {2017},
	publisher    = {Göteborgs universitet},
	address      = {Göteborg},
	isbn         = {978-91-87850-64-6},
	abstract     = {När man planerar att utveckla en språkkurs i ett andra- eller främmandespråk (L2) ställs man inför utmaningen att definiera vilket ordförråd inlärarna behöver tillägna sig. Forskning inom andraspråksinlärning tyder på att läsaren behöver kunna 95–98 \% av löporden i en text för att förstå den (Laufer \& Ravenhorst-Kalovski 2010). Sådana studier är användbara för att uppskatta storleken på det ordförråd som behövs för att tillägna sig innehållet i en text, men de ger ingen närmare metodologisk vägledning för den som vill utveckla nivåstrukturerade läromedel eller kurser för andraspråksundervisning. Speciellt tydligt är detta inom CALL, Computer-Assisted Language Learning, där läromaterial (t.ex. övningar) genereras automatiskt, och behöver elektroniska resurser som kunskapskälla.

Man kan istället angripa problemet från andra hållet. Om man har en samling nivåklassificerade texter för andraspråksinlärare kan man utifrån dem bygga ordlistor där varje ord är placerat på en färdighetsskala. Om man känner till den förutsatta färdighetsnivån hos läsaren, kan man helt enkelt anta att den textnivå där ett ord dyker upp första gången också anger ordets svårighetsgrad. SVALex är ett lexikon som har byggts enligt den principen. Resursen ska kunna användas av inlärare och lärare i svenska som andraspråk, men även av lexikografer, av kursutvecklare och provkonstruktörer samt av dem som likt oss själva ägnar sig åt utveckling av språkteknologibaserade datorstöd för språkinlärning och språktestning.

SVALex utgör en vidareutveckling i förhållande till tidigare lexikonresurser för svenska som andraspråk (se avsnitt 2), genom att den konsekvent relaterar de 15 681 lexikoningångarna till en vida använd färdighetsskala för andra- och främmandespråksinlärning, Europarådets gemensamma europeiska referensram för språk (Common European Framework of Reference, i fortsättningen refererad till som CEFR) (Council of Europe 2001; Skolverket 2009).

Nivåklassningen av lexikonenheterna i SVALex görs på basis av deras distribution i COCTAILL, en korpus innehållande lärobokstexter i svenska som andraspråk, där lärare har placerat in varje text i någon av CEFR-nivåerna (Volodina et al. 2014).},
}

% Whole proceedings volume (joint NLP4CALL / NLP4LA workshop, NoDaLiDa 2017).
% Typed as a proceedings entry with the listed people recorded as editors;
% leading space removed from the ISBN; editor spelling aligned with the 2016
% preface entry in this file.
% NOTE(review): confirm that the five names are the volume editors.
@proceedings{volodina-etal-2017-proceedings-262838,
	editor       = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Grigonytė, Gintarė and Nilsson Björkenstam, Kristina},
	title        = {Proceedings of the Joint 6th Workshop on {NLP} for Computer Assisted Language Learning and 2nd Workshop on {NLP} for Research on Language Acquisition at {NoDaLiDa} 2017, Gothenburg, 22nd May 2017},
	year         = {2017},
	publisher    = {Linköping University Press},
	address      = {Linköping, Sweden},
	isbn         = {978-91-7685-502-7},
	abstract     = {For the second year in a row we have brought the two related themes of NLP for Computer-Assisted Language Learning and NLP for Language Acquisition together under one umbrella. The goal of organizing these joint workshops is to provide a meeting place for researchers working on language learning issues including both empirical and experimental studies and NLP-based applications.},
}

% Preface (front matter) of the joint NLP4CALL / NLP4LA 2016 proceedings.
% Recorded as a contribution to the proceedings so that number and pages are
% rendered; standard styles ignore those fields on a misc entry. The former
% combined title is split into title (Preface) and booktitle (the proceedings).
% Page range uses the ASCII double hyphen; ampersands in the abstract are escaped.
@inproceedings{volodina-etal-2016-preface-248087,
	author       = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
	title        = {Preface},
	booktitle    = {Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, Umeå, 16th November 2016},
	year         = {2016},
	number       = {130},
	pages        = {i--viii},
	abstract     = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) \& NLP for Language Acquisition (LA) – shorthand NLP4CALL\&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below.

The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics.

The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition.

The joint workshop series on NLP4CALL\&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
}

@inProceedings{volodina-etal-2016-classification-246346,
	title        = {Classification of {Swedish} learner essays by {CEFR} levels},
	abstract     = {The paper describes initial efforts on creating a system for the automatic assessment of Swedish second language (L2) learner essays from two points of view: holistic evaluation of the reached level according to the Common European Framework of Reference (CEFR), and the lexical analysis of texts for receptive and productive vocabulary per CEFR level. We describe the data and resources that our experiments were based on, provide a short introduction to the algorithm for essay classification and experiment results, present the user interface we developed for testing new essays and outline future work.},
	booktitle    = {Proceedings of EuroCALL 2016. 24-27th August 2016, Cyprus},
	author       = {Volodina, Elena and Pilán, Ildikó and Alfter, David},
	year         = {2016},
	publisher    = {Research-publishing.net},
	ISBN         = {978-1-908416-44-5},
}

@proceedings{volodina-etal-2016-proceedings-248081,
	title        = {Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, {Umeå}, 16th {November} 2016},
	abstract     = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) \& NLP for Language Acquisition (LA) – shorthand NLP4CALL\&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below.

The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics.

The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition.

The joint workshop series on NLP4CALL\&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
	editor       = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
	year         = {2016},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-633-8},
}

@inProceedings{volodina-etal-2016-swellex-248090,
	title        = {{SweLLex}: second language learners' productive vocabulary},
	abstract     = {This paper presents a new lexical resource for learners of Swedish as a second language, SweLLex, and a know-how behind its creation. We concentrate on L2 learners’ productive vocabulary, i.e. words that they are actively able to produce, rather than the lexica they comprehend (receptive vocabulary). The proposed list covers productive vocabulary used by L2 learners in their essays. Each lexical item on the list is connected to its frequency distribution over the six levels of proficiency defined by the Common European Framework of Reference (CEFR) (Council of Europe, 2001). To make this list a more reliable resource, we experiment with normalizing L2 word-level errors by replacing them with their correct equivalents. SweLLex has been tested in a prototype system for automatic CEFR level classification of essays as well as in a visualization tool aimed at exploring L2 vocabulary contrasting receptive and productive vocabulary usage at different levels of language proficiency.},
	booktitle    = {Linköping Electronic Conference Proceedings. Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016},
	author       = {Volodina, Elena and Pilán, Ildikó and Llozhi, Lorena and Degryse, Baptiste and François, Thomas},
	year         = {2016},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-633-8},
}

@inProceedings{volodina-pilan-2016-svalex-248116,
	title        = {{SVALex}: en andraspråksordlista graderad enligt {CEFR} nivåer},
	booktitle    = {Svenskans Beskrivning 35, Göteborg 2016},
	author       = {Volodina, Elena and Pilán, Ildikó},
	year         = {2016},
}

@inProceedings{volodina-etal-2016-swell-248145,
	title        = {{SweLL} – en korpus med {L2} uppsatser för {CEFR} studier},
	booktitle    = {Svenskans Beskrivning 35, Göteborg 2016},
	author       = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Lundkvist, Peter and Sundberg, Gunlög and Llozhi, Lorena and Sandell, Monica},
	year         = {2016},
}

@inProceedings{pilan-volodina-2016-classification-248099,
	title        = {Classification of Language Proficiency Levels in {Swedish} Learners' Texts},
	abstract     = {We evaluate a system for the automatic classification of texts written by learners of Swedish as a second language into levels of language proficiency. Since the amount of available annotated learner essay data for our target language is rather small, we explore also the potentials of domain adaptation for this task. The additional domain consists of coursebook texts written by experts for learners. We find that already with a smaller amount of in-domain Swedish learner essay data it is possible to obtain results that compare well to state-of-the-art systems for other languages, with domain adaptation methods yielding a slight improvement.},
	booktitle    = {The Sixth Swedish Language Technology Conference (SLTC), Umeå University, 17-18 November, 2016},
	author       = {Pilán, Ildikó and Volodina, Elena},
	year         = {2016},
}

@inproceedings{pilan-etal-2016-coursebook-246349,
  author = {Pilán, Ildikó and Alfter, David and Volodina, Elena},
  title = {Coursebook texts as a helping hand for classifying linguistic complexity in language learners' writings},
  booktitle = {Proceedings of the workshop on Computational Linguistics for Linguistic Complexity},
  year = {2016},
  isbn = {978-4-87974-709-9},
  abstract = {We bring together knowledge from two different types of language learning data, texts learners read and texts they write, to improve linguistic complexity classification in the latter. Linguistic complexity in the foreign and second language learning context can be expressed in terms of proficiency levels.  We show that incorporating features capturing lexical complexity information from reading passages can boost significantly the machine learning based classification of learner-written texts into proficiency levels.  With an F1 score of .8 our system rivals state-of-the-art results reported for other languages for this task.  Finally, we present a freely available web-based tool for proficiency level classification and lexical complexity visualization for both learner writings and reading texts. },
}

@inproceedings{alfter-etal-2016-from-246345,
  author = {Alfter, David and Bizzoni, Yuri and Agebjörn, Anders and Volodina, Elena and Pilán, Ildikó},
  title = {From Distributions to Labels: A Lexical Proficiency Analysis using Learner Corpora},
  booktitle = {Linköping Electronic Conference Proceedings},
  year = {2016},
  publisher = {Linköping University Electronic Press},
  isbn = {978-91-7685-633-8},
  abstract = {In this work we look at how information from second language learner essay corpora can be used for the evaluation of unseen learner essays. Using a corpus of learner essays which have been graded by well-trained human assessors using the CEFR scale, we extract a list of word distributions over CEFR levels. For the analysis of unseen essays, we want to map each word to a so-called target CEFR level using this word list. However, the task of mapping from a distribution to a single label is not trivial. We are also investigating how we can evaluate the mapping from distribution to label. We show that the distributional profile of words from the essays, informed with the essays’ levels, consistently overlaps with our frequency-based method, in the sense that words holding the same level of proficiency as predicted by our mapping tend to cluster together in a semantic space. In the absence of a gold standard, this information can be useful to see how often a word is associated with the same level in two different models. Also, in this case we have a similarity measure that can show which words are more central to a given level and which words are more peripheral.
},
}

@inProceedings{volodina-etal-2014-what-206132,
	title        = {You get what you annotate: a pedagogically annotated corpus of coursebooks for {Swedish} as a Second Language},
	abstract     = {We present the COCTAILL corpus, containing over 700.000 tokens of Swedish texts from 12 coursebooks aimed at second/foreign language (L2) learning. Each text in the corpus is labelled with a proficiency level according to the CEFR proficiency scale. Genres, topics, associated activities, vocabulary lists and other types of information are annotated in the coursebooks to facilitate Second Language Acquisition (SLA)-aware studies and experiments aimed at Intelligent Computer-Assisted Language Learning (ICALL). Linguistic annotation in the form of parts-of-speech (POS; e.g. nouns, verbs), base forms (lemmas) and syntactic relations (e.g. subject, object) has been also added to the corpus.
In the article we describe our annotation scheme and the editor we have developed for the content mark-up of the coursebooks, including the taxonomy of pedagogical activities and linguistic skills. Inter-annotator agreement has been computed and reported on a subset of the corpus.
Surprisingly, we have not found any other examples of pedagogically marked-up corpora based on L2 coursebooks to draw on existing experiences. Hence, our work may be viewed as “groping in the darkness” and eventually a starting point for others.
The paper also presents our first quantitative exploration of the corpus where we focus on textually and pedagogically annotated features of the coursebooks to exemplify what types of studies can be performed using the presented annotation scheme. We explore trends shown in use of topics and genres over proficiency levels and compare pedagogical focus of exercises across levels.
The final section of the paper summarises the potential this corpus holds for research within SLA and various ICALL tasks.},
	booktitle    = {NEALT Proceedings Series},
	author       = {Volodina, Elena and Pilán, Ildikó and Rødven-Eide, Stian and Heidarsson, Hannes},
	year         = {2014},
	volume       = {22},
	ISBN         = {978-91-7519-175-1},
	pages        = {128--144},
}

@proceedings{volodina-etal-2015-proceedings-226574,
	title        = {Proceedings of the 4th workshop on {NLP} for computer assisted language learning at {Nodalida} 2015, {Vilnius}, 11th {May}, 2015},
	editor       = {Volodina, Elena and Borin, Lars and Pilán, Ildikó},
	year         = {2015},
	publisher    = {Linköping University Press},
	address      = {Linköping},
	ISBN         = {978-91-7519-036-5},
}

@inProceedings{pilan-2015-helping-227313,
	title        = {Helping {Swedish} words come to their senses: word-sense disambiguation based on sense associations from the {SALDO} lexicon},
	abstract     = {This paper describes a knowledge-based approach to word-sense disambiguation using a lexical-semantic resource, SALDO. This hierarchically organized lexicon defining senses in terms of other related senses has not been previously explored for this purpose. The proposed method is based on maximizing the overlap between associated word senses of nouns and verbs co-occurring within a sentence. The results of a small-scale experiment using this method are also reported. Overall, the approach proved more efficient for nouns, since not only was the accuracy score higher for this category (56\%) than for verbs (46\%), but for nouns in 22\% more of the cases was a sense overlap found. As a result of an in-depth analysis of the predictions, we identified a number of ways the system could be modified or extended for an improved performance.},
	booktitle    = {Proceedings of the 20th Nordic Conference of Computational Linguistics (NODALIDA 2015). May 11–13, 2015, Vilnius, Lithuania},
	editor       = {Megyesi, Beáta},
	author       = {Pilán, Ildikó},
	year         = {2015},
	number       = {109},
	ISBN         = {9789175190983},
	pages        = {275--279},
}

@inProceedings{pilan-etal-2014-rule-210940,
	title        = {Rule-based and machine learning approaches for second language sentence-level readability},
	abstract     = {We present approaches for the identification of sentences understandable by second language learners of Swedish, which can be used in automatically generated exercises based on corpora. In this work we merged methods and knowledge from machine learning-based readability research, from rule-based studies of Good Dictionary Examples and from second language learning syllabuses. The proposed selection methods have also been implemented as a module in a free web-based language learning platform. Users can use different parameters and linguistic filters to personalize their sentence search with or without a machine learning component assessing readability. The sentences selected have already found practical use as multiple-choice exercise items within the same platform. Out of a number of deep linguistic indicators explored, we found mainly lexical-morphological and semantic features informative for second language sentence-level readability. We obtained a readability classification accuracy result of 71\%, which approaches the performance of other models used in similar tasks. Furthermore, during an empirical evaluation with teachers and students, about seven out of ten sentences selected were considered understandable, the rule-based approach slightly outperforming the method incorporating the machine learning model.},
	booktitle    = {Proceedings of the Ninth Workshop on Innovative Use of NLP for Building Educational Applications, June 26, 2014 Baltimore, Maryland, USA},
	author       = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard},
	year         = {2014},
	ISBN         = {978-1-941643-03-7},
	pages        = {174--184},
}

@proceedings{volodina-etal-2014-proceedings-206135,
	title        = {Proceedings of the third workshop on {NLP} for computer-assisted language learning at {SLTC} 2014, {Uppsala University}},
	abstract     = {The workshop series on NLP for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The papers in the proceedings volume from the third NLP4CALL workshop cover three main topic areas: resources for development of ICALL applications (e.g., learner corpora and coursebook corpora), tools and algorithms for the analysis of learner language (e.g., focusing on collocations, reading tasks, cloze items, pronunciation, spelling, level classification of learner production), and the generation of learning materials (e.g., exercise generators).},
	editor       = {Volodina, Elena and Borin, Lars and Pilán, Ildikó},
	year         = {2014},
	publisher    = {Linköping University Press},
	address      = {Linköping},
	ISBN         = {978-91-7519-175-1},
}

@inProceedings{volodina-etal-2014-flexible-201885,
	title        = {A flexible language learning platform based on language resources and web services},
	abstract     = {We present Lärka, the language learning platform of Språkbanken (the Swedish Language Bank). It consists of an exercise generator which reuses resources available through Språkbanken: mainly Korp, the corpus infrastructure, and Karp, the lexical infrastructure.
Through Lärka we reach new user groups – students and teachers of Linguistics as well as second language learners and their teachers – and this way bring Språkbanken's resources in a relevant format to them.
Lärka can therefore be viewed as a case of a real-life language resource evaluation with end users. In this article we describe Lärka's architecture, its user interface, and the five exercise types that have been released for users so far. The first user evaluation following in-class usage with students of linguistics, speech therapy and teacher candidates are presented. The outline of future work concludes the paper.},
	booktitle    = {Proceedings of LREC 26-31 May 2014, Reykjavik, Iceland},
	author       = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Tiedemann, Therese Lindström},
	year         = {2014},
	ISBN         = {978-2-9517408-8-4},
	pages        = {3973--3978},
}

@inProceedings{pilan-volodina-2014-reusing-200967,
	title        = {Reusing {Swedish} {FrameNet} for training semantic roles},
	abstract     = {In this article we present the first experiences of reusing the Swedish FrameNet (SweFN) as a resource for training semantic roles. We give an account of the procedure we used to adapt SweFN to the needs of students of Linguistics in the form of an automatically generated exercise. During this adaptation, the mapping of the fine-grained distinction of roles from SweFN into learner-friendlier coarse-grained roles presented a major challenge. Besides discussing the details of this mapping, we describe the resulting multiple-choice exercise and its graphical user interface. The exercise was made available through Lärka, an online platform for students of Linguistics and learners of Swedish as a second language. We outline also aspects underlying the selection of the incorrect answer options which include semantic as well as frequency-based criteria. Finally, we present our own observations and initial user feedback about the applicability of such a resource in the pedagogical domain. Students' answers indicated an overall positive experience, the majority found the exercise useful for learning semantic roles.},
	booktitle    = {Proceedings of LREC 2014, May 26-31, 2014, Reykjavik, Iceland},
	author       = {Pilán, Ildikó and Volodina, Elena},
	year         = {2014},
	ISBN         = {978-2-9517408-8-4},
	pages        = {1359--1363},
}

@inProceedings{pilan-etal-2013-automatic-188465,
	title        = {Automatic Selection of Suitable Sentences for Language Learning Exercises},
	abstract     = {In this study we investigated second and foreign language (L2) sentence readability, an area little explored so far in the case of several languages, including Swedish. The outcome of our research consists of two methods for sentence selection from native language corpora based on Natural Language Processing (NLP) and machine learning (ML) techniques. The two approaches have been made available online within Lärka, an Intelligent CALL (ICALL) platform offering activities for language learners and students of linguistics. Such an automatic selection of suitable sentences can be valuable for L2 teachers during the creation of new teaching materials, for L2 students who look for additional self-study exercises as well as for lexicographers in search of example sentences to illustrate the meaning of a vocabulary item. Members from all these potential user groups evaluated our methods and found the majority of the sentences selected suitable for L2 learning purposes.},
	booktitle    = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, 11th to 14th September 2013 Évora, Portugal, Proceedings},
	author       = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard},
	year         = {2013},
	ISBN         = {978-1-908416-12-4},
	pages        = {218--225},
}

@inProceedings{volodina-etal-2013-towards-188549,
	title        = {Towards a gold standard for {Swedish} {CEFR}-based {ICALL}},
	abstract     = {In qualitative projects on ICALL (Intelligent Computer-Assisted Language Learning), research and development always go hand in hand: development both depends upon the research results and dictates the research agenda. Likewise, in the development of the Swedish ICALL platform Lärka, the practical issues of development have dictated its research agenda. With NLP approaches, sooner or later, the necessity for reliable training data becomes unavoidable. At the moment Lärka's research agenda cannot be addressed without access to reliable training data, so-called “gold standard”. This paper gives an overview of the current state of the Swedish ICALL platform development and related research agenda, and describes the first attempts to collect the reference corpus (“gold standard”) coming from course books used in CEFR-based language teaching.},
	booktitle    = {Proceedings of the Second Workshop on NLP for Computer-Assisted Language Learning. NEALT Proceedings Series 17. Nodalida 2013, Oslo, Norway},
	author       = {Volodina, Elena and Pijetlovic, Dijana and Pilán, Ildikó and Johansson Kokkinakis, Sofie},
	year         = {2013},
	ISBN         = {978-91-7519-588-9},
}