Skip to main content
Språkbanken Text is a part of Språkbanken.

BibTeX

@incollection{tiedemann-etal-2024-multiword-343530,
	title        = {Multiword expressions in {Swedish} as a second language: Taxonomy, annotation, and initial results},
	abstract     = {This chapter introduces part of the Swedish L2 profiles, a new resource for Swedish as a second language. Multiword expressions (MWEs) in this resource are based on knowledge-based automatic annotation of MWEs, which we show works quite well for Swedish. In contrast, manual annotation of the compositionality of each MWE proved difficult, probably due to different interpretations of "compositionality" by the two annotators. We show that experts and non-experts can rank MWEs very similarly according to relative receptive difficulty, with particularly high agreement for the easiest items. A qualitative comparison of the proficiency levels associated with the MWEs based on coursebook occurrences and the results from crowdsourcing and direct ranking indicate that MWEs which appear in few books of the same level are more likely to be difficult to associate with an appropriate level based on coursebook corpus data. Furthermore, results show that compositionality and/or transparency might influence the relative ranking. Finally, there is a clear increase in MWE lemmas at higher proficiency levels at the group level, and at the highest level receptive and productive data include the same percentage of MWEs.},
	booktitle    = {Multiword Expressions in Lexical Resources: Linguistic, Lexicographic, and Computational Perspectives},
	author       = {Lindström Tiedemann, Therese and Alfter, David and Ali Mohammed, Yousuf and Piipponen, Daniela and Silén, Beatrice and Volodina, Elena},
	year         = {2024},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-96110-470-3},
	pages        = {309--348},
}

@inProceedings{volodina-etal-2023-dalaj-326817,
	title        = {{DaLAJ-GED} – a dataset for {Grammatical Error Detection} tasks on {Swedish}},
	booktitle    = {Proceedings of the 12th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2023)},
	editor       = {Alfter, David and Volodina, Elena and François, Thomas and Jönsson, Arne and Rennes, Evelina},
	author       = {Volodina, Elena and Ali Mohammed, Yousuf and Berdicevskis, Aleksandrs and Bouma, Gerlof and Öhman, Joey},
	year         = {2023},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping},
	ISBN         = {978-91-8075-250-3},
	pages        = {94--101},
}

@inProceedings{berdicevskis-etal-2023-superlim-331445,
	title        = {{Superlim}: A {Swedish} Language Understanding Evaluation Benchmark},
	booktitle    = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, December 6--10, 2023, Singapore},
	editor       = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
	author       = {Berdicevskis, Aleksandrs and Bouma, Gerlof and Kurtz, Robin and Morger, Felix and Öhman, Joey and Adesam, Yvonne and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Isbister, Tim and Lindahl, Anna and Malmsten, Martin and Rekathati, Faton and Sahlgren, Magnus and Volodina, Elena and Börjeson, Love and Hengchen, Simon and Tahmasebi, Nina},
	year         = {2023},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {979-8-89176-060-8},
	pages        = {8137--8153},
}

@inProceedings{szawerna-etal-2024-detecting-336385,
	title        = {Detecting Personal Identifiable Information in {Swedish} Learner Essays},
	abstract     = {Linguistic data can — and often does — contain PII (Personal Identifiable Information). Both from a legal and ethical standpoint, the sharing of such data is not permissible. According to the GDPR, pseudonymization, i.e. the replacement of sensitive information with surrogates, is an acceptable strategy for privacy preservation. While research has been conducted on the detection and replacement of sensitive data in Swedish medical data using Large Language Models (LLMs), it is unclear whether these models handle PII in less structured and more thematically varied texts equally well. In this paper, we present and discuss the performance of an LLM-based PII-detection system for Swedish learner essays.},
	booktitle    = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, St. Julian’s, Malta},
	author       = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	ISBN         = {979-8-89176-085-1},
}

@inProceedings{munozsanchez-etal-2024-jingle-342259,
	title        = {Jingle {BERT}, Jingle {BERT}, Frozen All the Way: Freezing Layers to Identify {CEFR} Levels of Second Language Learners Using {BERT}},
	abstract     = {In this paper, we investigate the question of how much domain adaptation is needed for the task of automatic essay assessment by freezing layers in BERT models. We test our methodology on three different graded language corpora (English, French and Swedish) and find that partially fine-tuning base models improves performance over fully fine-tuning base models, although the number of layers to freeze differs by language. We also look at the effect of freezing layers on different grades in the corpora and find that different layers are important for different grade levels. Finally, our results represent a new state-of-the-art in automatic essay classification for the three languages under investigation.},
	booktitle    = {Proceedings of the 13th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2024)},
	author       = {Muñoz Sánchez, Ricardo and Alfter, David and Dobnik, Simon and Szawerna, Maria Irena and Volodina, Elena},
	year         = {2024},
	publisher    = {Linköping Electronic Conference Proceedings},
	ISBN         = {978-91-8075-774-4},
}

@inProceedings{munozsanchez-etal-2024-names-336384,
	title        = {Did the Names {I} Used within My Essay Affect My Score? Diagnosing Name Biases in Automated Essay Scoring},
	abstract     = {Automated essay scoring (AES) of second-language learner essays is a high-stakes task as it can affect the job and educational opportunities a student may have access to. Thus, it becomes imperative to make sure that the essays are graded based on the students’ language proficiency as opposed to other reasons, such as personal names used in the text of the essay. Moreover, most of the research data for AES tends to contain personal identifiable information. Because of that, pseudonymization becomes an important tool to make sure that this data can be freely shared. Thus, our systems should not grade students based on which given names were used in the text of the essay, both for fairness and for privacy reasons. In this paper we explore how given names affect the CEFR level classification of essays of second language learners of Swedish. We use essays containing just one personal name and substitute it for names from lists of given names from four different ethnic origins, namely Swedish, Finnish, Anglo-American, and Arabic. We find that changing the names within the essays has no apparent effect on the classification task, regardless of whether a feature-based or a transformer-based model is used.},
	booktitle    = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, Malta},
	author       = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Szawerna, Maria Irena and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	ISBN         = {979-8-89176-085-1},
}

@inProceedings{szawerna-etal-2024-pseudonymization-338089,
	title        = {Pseudonymization Categories across Domain Boundaries},
	abstract     = {Linguistic data, a component critical not only for research in a variety of fields but also for the development of various Natural Language Processing (NLP) applications, can contain personal information. As a result, its accessibility is limited, both from a legal and an ethical standpoint. One of the solutions is the pseudonymization of the data. Key stages of this process include the identification of sensitive elements and the generation of suitable surrogates in a way that the data is still useful for the intended task. Within this paper, we conduct an analysis of tagsets that have previously been utilized in anonymization and pseudonymization. We also investigate what kinds of Personally Identifiable Information (PII) appear in various domains. These reveal that none of the analyzed tagsets account for all of the PII types present cross-domain at the level of detailedness seemingly required for pseudonymization. We advocate for a universal system of tags for categorizing PIIs leading up to their replacement. Such categorization could facilitate the generation of grammatically, semantically, and sociolinguistically appropriate surrogates for the kinds of information that are considered sensitive in a given domain, resulting in a system that would enable dynamic pseudonymization while keeping the texts readable and useful for future research in various fields.},
	booktitle    = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), 20--25 May, 2024, Torino, Italia},
	author       = {Szawerna, Maria Irena and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Vu, Xuan-Son and Volodina, Elena},
	year         = {2024},
	publisher    = {ELRA and ICCL},
	ISBN         = {978-2-493814-10-4},
}

@inProceedings{munozsanchez-etal-2024-harnessing-342122,
	title        = {Harnessing {GPT} to Study Second Language Learner Essays: Can We Use Perplexity to Determine Linguistic Competence?},
	abstract     = {Generative language models have been used to study a wide variety of phenomena in NLP. This allows us to better understand the linguistic capabilities of those models and to better analyse the texts that we are working with. However, these studies have mainly focused on text generated by L1 speakers of English. In this paper we study whether linguistic competence of L2 learners of Swedish (through their performance on essay tasks) correlates with the perplexity of a decoder-only model (GPT-SW3). We run two sets of experiments, doing both quantitative and qualitative analyses for each of them. In the first one, we analyse the perplexities of the essays and compare them with the CEFR level of the essays, both from an essay-wide level and from a token level. In our second experiment, we compare the perplexity of an L2 learner essay with a normalised version of it. We find that the perplexity of essays tends to be lower for higher CEFR levels and that normalised essays have a lower perplexity than the original versions. Moreover, we find that different factors can lead to spikes in perplexity, not all of them being related to L2 learner language.},
	booktitle    = {Proceedings of the 19th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2024), June 20, 2024, Mexico City, Mexico},
	author       = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Volodina, Elena},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	address      = {Mexico City, Mexico},
	ISBN         = {979-8-89176-100-1},
}

@inProceedings{munozsanchez-etal-2024-name-339981,
	title        = {Name Biases in Automated Essay Assessment},
	abstract     = {Artificial intelligence is being deployed in high-stakes situations, such as automated grading of second language essays in proficiency assessment. While they can improve the opportunities students have (education, work opportunities, etc.), such systems often display human-like biases. Aldrin (2017) notes that human graders have a slight bias based on names appearing in essay texts. We aim to identify whether the same pattern holds in automated systems.
In this study we aim to answer the following research questions:
1) Does changing given names inside a second language learner essay affect the way the text is graded?
2) How much does this differ between feature-based machine learning and deep learning?

For this, we use a de-anonymized (i.e. original) version of the SweLL-pilot corpus of second language Swedish learner essays (Volodina 2016), which consists of 502 essays annotated with CEFR levels as our source data.
First, we compile four lists of given names inspired by those of Aldrin (2017): traditional Swedish names; modern Swedish names of Anglo-American origin; Finnish names (due to the close sociocultural links between both countries); and names of Arabic origin (the most prominent group of learners in the corpus).
Second, we create a diagnostic dataset to identify biases in the classification task. We select SweLL-pilot essays in which a given name appears only once. Then, we generate an essay version for each name on the lists by substituting the name in the original text with one from the list.
Third, we fine-tune a BERT (Devlin et al. 2019) model on the original SweLL-pilot data to predict the CEFR level of a given essay and compare it to an existing feature-based model (Pilan 2016).

Finally, we test the two models and compare the equality of opportunity between the different given name groups on the diagnostic dataset.},
	booktitle    = {The 28th International Congress of Onomastic Sciences (ICOS 28), 19--23 August, 2024, Helsinki, Finland},
	author       = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Lindström Tiedemann, Therese and Szawerna, Maria Irena and Volodina, Elena},
	year         = {2024},
}

@proceedings{volodina-etal-2024-proceedings-336386,
	title        = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, Malta},
	editor       = {Volodina, Elena and Alfter, David and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Szawerna, Maria Irena and Vu, Xuan-Son},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {979-8-89176-085-1},
}

@article{volodina-etal-2024-swedish-340630,
	title        = {{Swedish} word family resource},
	abstract     = {The article introduces a novel lexical resource for Swedish based on word family principles. The development of the Swedish Word Family (SweWF) resource is set into the context of linguistic complexity in second language acquisition. The SweWF is particularly appropriate for that, given that it contains lexical items used in second language corpora, namely, in a corpus of coursebook texts, and in a corpus of learner essays. The main focus of the article is on the construction of the resource with its user interface and on its applicability for research, although it also opens vast possibilities for practical applications for language learning, testing and assessment. We demonstrate the value of the resource through several case studies.},
	journal      = {ITL - International Journal of Applied Linguistics},
	author       = {Volodina, Elena and Ali Mohammed, Yousuf and Lindström Tiedemann, Therese},
	year         = {2024},
}

@inProceedings{holdt-etal-2024-towards-341134,
	title        = {Towards an Ideal Tool for Learner Error Annotation},
	abstract     = {Annotation and analysis of corrections in learner corpora have always presented technical challenges, mainly on account of the fact that until now there has not been any standard tool available, and that original and corrected versions of texts have been mostly stored together rather than treated as individual texts. In this paper, we present CJVT Svala 1.0, the Slovene version of the SVALA tool, which was originally used for the annotation of Swedish learner language. The localisation into Slovene resulted in the development of several new features in SVALA such as the support for multiple annotation systems, localisation into other languages, and the support for more complex annotation systems. Adopting the parallel aligned approach to text visualisation and annotation, as well as storing the data, combined with the tool supporting this, i.e. SVALA, are proposed as new standards in Learner Corpus Research.},
	booktitle    = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
	author       = {Arhar Holdt, Špela and Erjavec, Tomaž and Kosem, Iztok and Volodina, Elena},
	year         = {2024},
	ISBN         = {978-2-493814-10-4},
}

@inProceedings{klezl-etal-2022-exploring-321958,
	title        = {Exploring Linguistic Acceptability in {Swedish} Learners’ Language},
	abstract     = {We present our initial experiments on binary classification of sentences into linguistically correct versus incorrect ones in Swedish using the DaLAJ dataset (Volodina et al., 2021a). The nature of the task is bordering on linguistic acceptability judgments, on the one hand, and on grammatical error detection task, on the other. The experiments include models trained with different input features and on different variations of the training, validation, and test splits. We also analyze the results focusing on different error types and errors made on different proficiency levels. Apart from insights into which features and approaches work well for this task, we present first benchmark results on this dataset. The implementation is based on a bidirectional LSTM network and pre-trained FastText embeddings, BERT embeddings, own word and character embeddings, as well as part-of-speech tags and dependency labels as input features. The best model used BERT embeddings and a training and validation set enriched with additional correct sentences. It reached an accuracy of 73\% on one of three test sets used in the evaluation. These promising results illustrate that the data and format of DaLAJ make a valuable new resource for research in acceptability judgements in Swedish.},
	booktitle    = {Proceedings of the 11th Workshop on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL 2022)},
	author       = {Klezl, Julia and Ali Mohammed, Yousuf and Volodina, Elena},
	year         = {2022},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-459-5},
}

@inProceedings{casademontmoner-volodina-2022-swedish-321955,
	title        = {{Swedish} {MuClaGED}: A new dataset for {Grammatical Error Detection} in {Swedish}},
	abstract     = {This paper introduces the Swedish MuClaGED dataset, a new dataset specifically built for the task of Multi-Class Grammatical Error Detection (GED). The dataset has been produced as a part of the multilingual Computational SLA shared task initiative. In this paper we elaborate on the generation process and the design choices made to obtain Swedish MuClaGED. We also show initial baseline results for the performance on the dataset in a task of Grammatical Error Detection and Classification on the sentence level, which have been obtained through (Bi)LSTM ((Bidirectional) Long-Short Term Memory) methods.},
	booktitle    = {Proceedings of the 11th Workshop on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL 2022)},
	author       = {Casademont Moner, Judit and Volodina, Elena},
	year         = {2022},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-459-5},
}

@proceedings{alfter-etal-2023-proceedings-331649,
	title        = {Proceedings of the 12th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2023)},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.},
	editor       = {Alfter, David and Volodina, Elena and François, Thomas and Jönsson, Arne and Rennes, Evelina},
	year         = {2023},
	publisher    = {Linköping University Press},
	address      = {Linköping},
	ISBN         = {978-91-8075-250-3},
}

@article{volodina-etal-2022-crowdsourcing-336551,
	title        = {Crowdsourcing ratings for single lexical items: a core vocabulary perspective},
	abstract     = {In this study, we investigate theoretical and practical issues connected to differentiating between core and peripheral vocabulary at different levels of linguistic proficiency using statistical approaches combined with crowdsourcing. We also investigate whether crowdsourcing second language learners’ rankings can be used for assigning levels to unseen vocabulary. The study is performed on Swedish single-word items.
The four hypotheses we examine are: (1) there is core vocabulary for each proficiency level, but this is only true until CEFR level B2 (upper-intermediate); (2) core vocabulary shows more systematicity in its behavior and usage, whereas peripheral items have more idiosyncratic behavior; (3) given that we have truly core items (aka anchor items) for each level, we can place any new unseen item in relation to the identified core items by using a series of comparative judgment tasks, this way assigning a “target” level for a previously unseen item; and (4) non-experts will perform on par with experts in a comparative judgment setting. The hypotheses have been largely confirmed: In relation to (1) and (2), our results show that there seems to be some systematicity in core vocabulary for early to mid-levels (A1-B1) while we find less systematicity for higher levels (B2-C1). In relation to (3), we suggest crowdsourcing word rankings using comparative judgment with known anchor words as a method to assign a “target” level to unseen words. With regard to (4), we confirm the previous findings that non-experts, in our case language learners, can be effectively used for the linguistic annotation tasks in a comparative judgment setting.},
	journal      = {Slovenščina 2.0: Empirical, Applied and Interdisciplinary Research},
	author       = {Volodina, Elena and Alfter, David and Lindström Tiedemann, Therese},
	year         = {2022},
	volume       = {10},
	number       = {2},
	pages        = {5--61},
}

@inProceedings{volodina-etal-2023-multiged-331652,
	title        = {{MultiGED-2023} shared task at {NLP4CALL}: Multilingual Grammatical Error Detection},
	abstract     = {This paper reports on the NLP4CALL shared task on Multilingual Grammatical Error Detection (MultiGED-2023), which included five languages: Czech, English, German, Italian and Swedish. It is the first shared task organized by the Computational SLA working group, whose aim is to promote less represented languages in the fields of Grammatical Error Detection and Correction, and other related fields. The MultiGED datasets have been produced based on second language (L2) learner corpora for each particular language. In this paper we introduce the task as a whole, elaborate on the dataset generation process and the design choices made to obtain MultiGED datasets, provide details of the evaluation metrics and CodaLab setup. We further briefly describe the systems used by participants and report the results.},
	booktitle    = {Proceedings of the 12th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2023)},
	author       = {Volodina, Elena and Bryant, Christopher and Caines, Andrew and De Clercq, Orphée and Frey, Jennifer-Carmen and Ershova, Elizaveta and Rosen, Alexandr and Vinogradova, Olga},
	year         = {2023},
	publisher    = {Linköping University Press},
}

@proceedings{volodina-etal-2024-proceedings-335190,
	title        = {Proceedings of the Huminfra Conference (HiC 2024), 10--11 January, 2024, Gothenburg, Sweden},
	editor       = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna},
	year         = {2024},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-8075-512-2},
}

@article{skoldberg-etal-2019-state-279701,
	title        = {State-of-the-art on monolingual lexicography for {Sweden}},
	abstract     = {The minireview describes the state-of-the-art of Swedish monolingual lexicography. The main actors in the field, both commercial and non-commercial, are mentioned alongside with the description of lexicographic products that have been offered by them to the lexicon users. The minireview makes it clear that there is an obvious tendency among the Swedish dictionary users to abandon paper-based dictionaries and switch over to online portals and apps, which influences the practices adopted by commercial publishing houses, such as Norstedts, Bonniers, Natur \& Kultur. Among the leading non-commercial players, the Swedish Academy, the Swedish Language Bank, Institute for Language and Folklore are named. Swedish monolingual lexicography offers, however, dictionaries produced not only by experts but also by non-experts (i.e. using the efforts of the crowd).},
	journal      = {Slovenščina 2.0: Empirical, Applied and Interdisciplinary Research},
	author       = {Sköldberg, Emma and Holmer, Louise and Volodina, Elena and Pilán, Ildikó},
	year         = {2019},
	volume       = {7},
	number       = {1},
	pages        = {13--24},
}

@incollection{alimohammed-etal-2022-annotation-321989,
	title        = {Annotation Management Tool: A Requirement for Corpus Construction},
	abstract     = {We present an annotation management tool, SweLL portal, that has been developed for the purposes of the SweLL infrastructure project for building a learner corpus of Swedish (Volodina et al., 2019). The SweLL portal has been used for supervised access to the database, data versioning, import and export of data and metadata, statistical overview, administration of annotation tasks, monitoring of annotation tasks and reliability controls. The development of the portal was driven by visions of longitudinal sustainable data storage and was partially shaped by situational needs reported by portal users, including project managers, researchers, and annotators.},
	booktitle    = {Selected Papers from the CLARIN Annual Conference 2021, Virtual Event, 2021, 27–29 September},
	editor       = {Monachini, Monica and Eskevich, Maria},
	author       = {Ali Mohammed, Yousuf and Matsson, Arild and Volodina, Elena},
	year         = {2022},
	publisher    = {Linköping Electronic Conference},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-444-1},
	pages        = {101--108},
}

@incollection{volodina-etal-2022-reliability-321988,
	title        = {Reliability of Automatic Linguistic Annotation: Native vs Non-native Texts},
	abstract     = {We present the results of a manual evaluation of the performance of automatic linguistic annotation on three different datasets: (1) texts written by native speakers, (2) essays written by second language (L2) learners of Swedish in the original form and (3) the normalized versions of learner-written essays. The focus of the evaluation is on lemmatization, POS-tagging, word sense disambiguation, multi-word detection and dependency annotation. Two annotators manually went through the automatic annotation on a subset of the datasets and marked up all deviations based on their expert judgments and the guidelines provided. We report Inter-Annotator Agreement between the two annotators and accuracy for the linguistic annotation quality for the three datasets, by levels and linguistic features.},
	booktitle    = {Selected Papers from the CLARIN Annual Conference 2021, Virtual Event, 2021, 27–29 September},
	editor       = {Monachini, Monica and Eskevich, Maria},
	author       = {Volodina, Elena and Alfter, David and Lindström Tiedemann, Therese and Lauriala, Maisa and Piipponen, Daniela},
	year         = {2022},
	publisher    = {Linköping Electronic Conference},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-444-1},
	pages        = {151--167},
}

@proceedings{alfter-etal-2022-proceedings-321964,
	title        = {Proceedings of the 11th Workshop on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL 2022)},
	abstract     = {The volume contains articles reviewed and presented at NLP4CALL workshop. The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.},
	editor       = {Alfter, David and Volodina, Elena and François, Thomas and Desmet, Piet and Cornillie, Frederik and Jönsson, Arne and Rennes, Evelina},
	year         = {2022},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-460-1},
}

@inProceedings{francois-etal-2016-svalex-248142,
	title        = {{SVALex}: a {CEFR}-graded lexical resource for {Swedish} foreign and second language learners},
	abstract     = {The paper introduces SVALex, a lexical resource primarily aimed at learners and teachers of Swedish as a foreign and second language that describes the distribution of 15,681 words and expressions across the Common European Framework of Reference (CEFR). The resource is based on a corpus of coursebook texts, and thus describes receptive vocabulary learners are exposed to during reading activities, as opposed to productive vocabulary they use when speaking or writing. The paper describes the methodology applied to create the list and to estimate the frequency distribution. It also discusses some characteristics of the resulting resource and compares it to other lexical resources for Swedish. An interesting feature of this resource is the possibility to separate the wheat from the chaff, identifying the core vocabulary at each level, i.e. vocabulary shared by several coursebook writers at each level, from peripheral vocabulary which is used by the minority of the coursebook writers.},
	booktitle    = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23--28, 2016, Portorož, Slovenia},
	author       = {François, Thomas and Volodina, Elena and Pilán, Ildikó and Tack, Anaïs},
	year         = {2016},
	publisher    = {European Language Resources Association},
	address      = {Paris},
	ISBN         = {978-2-9517408-9-1},
}

@inProceedings{pilan-etal-2016-predicting-247240,
	title        = {Predicting proficiency levels in learner writings by transferring a linguistic complexity model from expert-written coursebooks},
	abstract     = {The lack of a sufficient amount of data tailored for a task is a well-recognized problem for many statistical NLP methods. In this paper, we explore whether data sparsity can be successfully tackled when classifying language proficiency levels in the domain of learner-written output texts. We aim at overcoming data sparsity by incorporating knowledge in the trained model from another domain consisting of input texts written by teaching professionals for learners. We compare different domain adaptation techniques and find that a weighted combination of the two types of data performs best, which can even rival systems based on considerably larger amounts of in-domain data. Moreover, we show that normalizing errors in learners’ texts can substantially improve classification when in-domain data with annotated proficiency levels is not available.},
	booktitle    = {Proceedings of the 26th International Conference on Computational Linguistics (COLING), December 13--16, 2016, Osaka},
	author       = {Pilán, Ildikó and Volodina, Elena and Zesch, Torsten},
	year         = {2016},
	ISBN         = {978-4-87974-702-0},
}

@inProceedings{volodina-etal-2016-swell-248141,
	title        = {SweLL on the rise: Swedish Learner Language corpus for European Reference Level studies},
	abstract     = {We present a new resource for Swedish, SweLL, a corpus of Swedish Learner essays linked to learners’ performance according to the Common European Framework of Reference (CEFR). SweLL consists of three subcorpora – SpIn, SW1203 and Tisus, collected from three different educational establishments. The common metadata for all subcorpora includes age, gender, native languages, time of residence in Sweden, type of written task. Depending on the subcorpus, learner texts may contain additional information, such as text genres, topics, grades. Five of the six CEFR levels are represented in the corpus: A1, A2, B1, B2 and C1 comprising in total 339 essays. C2 level is not included since courses at C2 level are not offered. The work flow consists of collection of essays and permits, essay digitization and registration, meta-data annotation, automatic linguistic annotation. Inter-rater agreement is presented on the basis of SW1203 subcorpus. The work on SweLL is still ongoing with more than 100 essays waiting in the pipeline. This article both describes the resource and the “how-to” behind the compilation of SweLL.},
	booktitle    = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23-28, 2016, Portorož, Slovenia},
	author       = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Llozhi, Lorena and Lundkvist, Peter and Sundberg, Gunlög and Sandell, Monica},
	year         = {2016},
	publisher    = {European Language Resources Association},
	address      = {Paris},
	ISBN         = {978-2-9517408-9-1},
}

@inProceedings{daudaravicius-etal-2016-report-248143,
	title        = {A report on the Automatic Evaluation of Scientific Writing Shared Task},
	abstract     = {The Automated Evaluation of Scientific Writing, or AESW, is the task of identifying sentences in need of correction to ensure their appropriateness in a scientific prose. The data set comes from a professional editing company, VTeX, with two aligned versions of the same text – before and after editing – and covers a variety of textual infelicities that proofreaders have edited. While previous shared tasks focused solely on grammatical errors (Dale and Kilgarriff, 2011; Dale et al., 2012; Ng et al., 2013; Ng et al., 2014), this time edits cover other types of linguistic misfits as well, including those that almost certainly could be interpreted as style issues and similar “matters of opinion”. The latter arise because of different language editing traditions, experience, and the absence of uniform agreement on what “good” scientific language should look like. Initiating this task, we expected the participating teams to help identify the characteristics of “good” scientific language, and help create a consensus of which language improvements are acceptable (or necessary). Six participating teams took on the challenge.},
	booktitle    = {Workshop on Innovative Use of NLP for Building Educational Applications, June 16, 2016, San Diego, CA, USA},
	author       = {Daudaravicius, Vidas and Banchs, Rafael E. and Volodina, Elena and Napoles, Courtney},
	year         = {2016},
	ISBN         = {978-1-941643-83-9},
}

@article{arharholdt-etal-2020-language-300072,
	title        = {Language teachers and crowdsourcing: Insights from a cross-European survey},
	abstract     = {The paper presents a cross-European survey on teachers and crowdsourcing. The survey examines how familiar language teachers are with the concept of crowdsourcing and addresses their attitude towards including crowdsourcing into language teaching activities. The survey was administrated via an online questionnaire and collected volunteers’ data on: (a) teachers’ experience with organizing crowdsourcing activities for students/pupils, (b) the development of crowdsourced resources and materials as well as (c) teachers’ motivation for participating in or employing crowdsourcing activities. The questionnaire was disseminated in over 30 European countries. The final sample comprises 1129 language teachers aged 20 to 65, mostly working at institutions of tertiary education. The data indicates that many participants are not familiar with the concept of crowdsourcing resulting in a low rate of crowdsourcing activities in the classroom. However, a high percentage of responding teachers is potentially willing to crowdsource teaching materials for the language(s) they teach. They are particularly willing to collaborate with other teachers in the creation of interactive digital learning materials, and to select, edit, and share language examples for exercises or tests. Since the inclusion of crowdsourcing activities in language teaching is still in its initial stage, steps for further research are highlighted.},
	journal      = {Rasprave: Časopis Instituta za hrvatski jezik i jezikoslovlje},
	author       = {Arhar Holdt, Špela and Zviel-Girshin, Rina and Gajek, Elżbieta and Durán-Muñoz, Isabel and Bago, Petra and Fort, Karën and Hatipoglu, Ciler and Kasperavičienė, Ramunė and Koeva, Svetla and Lazić Konjik, Ivana and Miloshevska, Lina and Ordulj, Antonia and Rodosthenous, Christos and Volodina, Elena and Weber, Tassja and Zanasi, Lorenzo},
	year         = {2020},
	volume       = {46},
	number       = {1},
	pages        = {1--28},
}

@inproceedings{masciolini-etal-2023-towards-329384,
	author       = {Masciolini, Arianna and Volodina, Elena and Dannélls, Dana},
	title        = {Towards automatically extracting morphosyntactical error patterns from L1-L2 parallel dependency treebanks},
	booktitle    = {Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023), July 13, 2023, Toronto, Canada},
	year         = {2023},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	isbn         = {978-1-959429-80-7},
	abstract     = {L1-L2 parallel dependency treebanks are UD-annotated corpora of learner sentences paired with correction hypotheses. Automatic morphosyntactical annotation has the potential to remove the need for explicit manual error tagging and improve interoperability, but makes it more challenging to locate grammatical errors in the resulting datasets. We therefore propose a novel method for automatically extracting morphosyntactical error patterns and perform a preliminary bilingual evaluation of its first implementation through a similar example retrieval task. The resulting pipeline is also available as a prototype CALL application.},
}

@inProceedings{lindstromtiedemann-etal-2022-cefr-321899,
	title        = {CEFR-nivåer och svenska flerordsuttryck},
	abstract     = {När vi lär oss ett nytt språk ska vi inte bara lära oss enstaka ord och hur vi använder dessa, utan vi måste också lära oss vilka ordkombinationer som är ”fasta uttryck” till betydelsen (t.ex. hälsa på någon) eller till formen (t.ex. lättare sagt än gjort) eller båda delarna (t.ex. huller om buller). Enligt en del studier kan dessa uttryck utgöra så mycket som 50 \% av vokabulären i ett språk som förstaspråk (L1) eller ännu mer (Jackendoff 1997; Erman 2007, 28). Men det är möjligt att de är vanligare i vardagligt språk och talspråk (Prentice \& Sköldberg 2013). Flerordsenheter kan vara problematiska för andraspråkstalare (Nesselhauf 2003, 223) till och med på avancerad nivå (jfr Pawley \& Syder 1983; Wray \& Perkins 2000; Nesselhauf 2003; Prentice 2010). Samtidigt är de en helt nödvändig del av språket (Nesselhauf 2003, 223) och kan utmärka andraspråkstalarna som icke-modersmålstalare (Pawley \& Syder 1983; Wray 2002). Flerordsuttryck är alltså en värdefull del av andraspråkskompetensen (se även Paquot 2019) och något som är viktigt att studera hur vi på bästa sätt introducerar för L2-talaren och om de kan kopplas till nivåer i bedömning.
I den här studien presenterar vi resultat kring förståelsen av flerordsuttryck i svenska som andraspråk i relation till färdighetsnivåerna enligt Gemensam Europeisk Referensram för Språk (GERS eller CEFR, Common European Framework of Reference) (COE 2001; 2018; Skolverket 2009; Utbildningsstyrelsen 2018) genom crowdsourcing experiment.},
	booktitle    = {Svenskan i Finland 19 : föredrag vid den nittonde sammankomsten för beskrivningen av svenskan i Finland, Vasa den 6-7 maj 2021},
	editor       = {Björklund, Siv and Haagensen, Bodil and Nordman, Marianne and Westerlund, Anders},
	author       = {Lindström Tiedemann, Therese and Alfter, David and Volodina, Elena},
	year         = {2022},
	publisher    = {Svensk-Österbottniska Samfundet},
	address      = {Vasa},
	ISBN         = {978-952-69650-5-5},
}

@inProceedings{volodina-etal-2023-grandma-328176,
	title        = {Grandma Karl is 27 years old – research agenda for pseudonymization of research data},
	abstract     = {Accessibility of research data is critical for advances in many research fields, but textual data often cannot be shared due to the personal and sensitive information which it contains, e.g. names or political opinions. General Data Protection Regulation (GDPR) suggests pseudonymization as a solution to secure open access to research data, but we need to learn more about pseudonymization as an approach before adopting it for manipulation of research data. This paper outlines a research agenda within pseudonymization, namely need of studies into the effects of pseudonymization on unstructured data in relation to e.g. readability and language assessment, as well as the effectiveness of pseudonymization as a way of protecting writer identity, while also exploring different ways of developing context-sensitive algorithms for detection, labelling and replacement of personal information in unstructured data. The recently granted project on pseudonymization ‘Grandma Karl is 27 years old’ addresses exactly those challenges.},
	booktitle    = {2023 IEEE Ninth International Conference on Big Data Computing Service and Applications (BigDataService), Athens, Greece, 2023},
	author       = {Volodina, Elena and Dobnik, Simon and Lindström Tiedemann, Therese and Vu, Xuan-Son},
	year         = {2023},
	publisher    = {IEEE Computer Society},
	address      = {Los Alamitos},
	ISBN         = {979-8-3503-3379-4},
}

@article{kosem-etal-2019-image-275354,
	title        = {The image of the monolingual dictionary across Europe. Results of the European survey of dictionary use and culture},
	abstract     = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. The present report covers the general section.},
	journal      = {International Journal of Lexicography},
	author       = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
	year         = {2019},
	volume       = {32},
	number       = {1},
	pages        = {92--114},
}

@inProceedings{volodina-etal-2022-swedish-321985,
	title        = {Swedish L2 profile - a tool for exploring L2 data},
	abstract     = {Learner corpus researchers, NLP researchers, as well as Digital Humanities and Social Sciences in general, rely on access to various data sets for empirical analysis, statistical insights, and/or for model building. However, interpretation of data is a non-trivial task and there is a need for data visualization tools. One such attempt is the Swedish L2 profile (SweL2P) – an ongoing project setting up the first digital tool allowing users to explore written Swedish learner language from a linguistic point of view.},
	booktitle    = {Learner Corpus Research conference, 22-24 September, Padua, Italy},
	author       = {Volodina, Elena and Lindström Tiedemann, Therese and Ali Mohammed, Yousuf},
	year         = {2022},
	address      = {Università degli Studi di Padova, Padua, Italy},
}

@techreport{megyesi-etal-2021-swell-311730,
	title        = {SweLL pseudonymization guidelines},
	abstract     = {The current document is a part of the SweLL guidelines series consisting of four parts which aim to
report how we have worked on the material and which decisions we have made. Guidelines are
available for each step in the manual annotation process, including:
• Transcription guidelines
• Pseudonymization guidelines
• Normalization guidelines
• Correction annotation guidelines
We specifically described all processes in English to make sure our principles and experience can
be of help to people working on other learner infrastructure projects independent of the language.},
	author       = {Megyesi, Beáta and Rudebeck, Lisa and Volodina, Elena},
	year         = {2021},
	institution  = {Institutionen för svenska språket, Göteborgs universitet},
	address      = {Göteborg},
	ISSN         = {1401-5919},
}

@inproceedings{casademontmoner-volodina-2022-generation-321987,
	author       = {Casademont Moner, Judit and Volodina, Elena},
	title        = {Generation of Synthetic Error Data of Verb Order Errors for Swedish},
	booktitle    = {NAACL workshop on Innovative Use of NLP for Building Educational Applications, July 15, 2022, Seattle, Washington},
	year         = {2022},
	publisher    = {Association for Computational Linguistics},
	address      = {Seattle, Washington},
	isbn         = {978-1-955917-83-4},
	abstract     = {We report on our work-in-progress to generate a synthetic error dataset for Swedish by replicating errors observed in the authentic error annotated dataset. We analyze a small subset of authentic errors, capture regular patterns based on parts of speech, and design a set of rules to corrupt new data. We explore the approach and identify its capabilities, advantages and limitations as a way to enrich the existing collection of error-annotated data. This work focuses on word order errors, specifically those involving the placement of finite verbs in a sentence.},
}

@incollection{volodina-etal-2022-lyxig-321974,
	title        = {Lyxig språklig födelsedagspresent from the Swedish Word Family},
	abstract     = {Morphology and lexical resources are known to be two of Lars Borin’s biggest research passions.
We have, therefore, prepared a short description of a new kind of a lexical resource for Swedish,
the Swedish Word Family. The resource is compiled based on learner corpora, and contains lexical
items manually analyzed for derivational morphology.},
	booktitle    = {Live and Learn- Festschrift in honor of Lars Borin},
	author       = {Volodina, Elena and Ali Mohammed, Yousuf and Lindström Tiedemann, Therese},
	year         = {2022},
	publisher    = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet},
	address      = {Göteborg},
	ISBN         = {978-91-87850-83-7},
}

@incollection{volodina-alfter-2022-icall-321984,
	title        = {ICALL: Research versus reality check},
	abstract     = {Intelligent Computer-Assisted Language Learning has been one of Lars Borin’s research interests.
The work on the Lärka language learning platform has started under his coordination. We see it
our mission to make the platform live and prosperous, and through it to stimulate research into
Swedish as a second language. Below, we name some weaknesses we have identified in Lärka
while working with a course of beginner Swedish and outline our plans for tackling those.},
	booktitle    = {Live and Learn- Festschrift in honor of Lars Borin},
	author       = {Volodina, Elena and Alfter, David},
	year         = {2022},
	publisher    = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet},
	address      = {Göteborg},
	ISBN         = {978-91-87850-83-7},
	pages        = {145--152},
}

@book{volodina-etal-2022-live-320415,
	title        = {Live and Learn- Festschrift in honor of Lars Borin},
	abstract     = {This Festschrift has been compiled to honor Professor Lars Borin on his 65th anniversary. It consists of 30 articles which reflect a fraction of Lars’ scholarly interests within computational linguistics and related fields. They come from his friends and colleagues around the world and deal with topics that have been – in one way or another – inspired by his work. A common theme for the articles is the never-ending need to learn, which is alluded to in the title of the volume, Live and Learn.},
	editor       = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat},
	year         = {2022},
	publisher    = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet},
	address      = {Göteborg},
	ISBN         = {978-91-87850-83-7},
}

@inProceedings{stemle-etal-2019-working-319453,
	title        = {Working together towards an ideal infrastructure for language learner corpora},
	abstract     = {In this article we provide an overview of first-hand experiences and vantage points for best practices from projects in seven European countries dedicated to learner corpus research (LCR) and the creation of language learner corpora. The corpora and tools involved in LCR are becoming more and more important, as are careful preparation and easy retrieval and reusability of corpora and tools. But the lack of commonly agreed solutions for many aspects of LCR, interoperability between learner corpora and the exchange of data from different learner corpus projects remains a challenge. We show how concepts like metadata, anonymization, error taxonomies and linguistic annotations as well as tools, toolchains and data formats can be individually challenging and how the challenges can be solved.},
	booktitle    = {Widening the Scope of Learner Corpus Research. Selected papers from the fourth Learner Corpus Research Conference},
	editor       = {Abel, Andrea and Glaznieks, Aivars and Lyding, Verena and Nicolas, Lionel},
	series       = {Corpora and Language in Use – Proceedings},
	volume       = {5},
	author       = {Stemle, Egon and Boyd, Adriane and Janssen, Maarten and Mikelić Preradović, Nives and Rosen, Alexandr and Rosén, Dan and Volodina, Elena},
	year         = {2019},
	publisher    = {PUL, Presses Universitaires de Louvain},
	address      = {Louvain-la-Neuve},
	ISBN         = {978-2-87558-868-5},
}

@article{alfter-etal-2021-crowdsourcing-311721,
	author       = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena},
	title        = {Crowdsourcing Relative Rankings of Multi-Word Expressions: Experts versus Non-Experts},
	journal      = {Northern European Journal of Language Technology (NEJLT)},
	year         = {2021},
	volume       = {7},
	number       = {1},
	abstract     = {In this study we investigate to which degree experts and non-experts agree on questions of difficulty in a crowdsourcing experiment. We ask non-experts (second language learners of Swedish) and two groups of experts (teachers of Swedish as a second/foreign language and CEFR experts) to rank multi-word expressions in a crowdsourcing experiment. We find that the resulting rankings by all the three tested groups correlate to a very high degree, which suggests that judgments produced in a comparative setting are not influenced by professional insights into Swedish as a second language.},
}

@techreport{volodina-megyesi-2021-swell-311729,
	title        = {SweLL transcription guidelines, L2 essays},
	abstract     = {The current document is a part of the SweLL guidelines series consisting of four parts which aim to
report how we have worked on the material and which decisions we have made. Guidelines are
available for each step in the manual annotation process, including:
• Transcription guidelines
• Pseudonymization guidelines
• Normalization guidelines
• Correction annotation guidelines
We specifically described all processes in English to make sure our principles and experience can
be of help to people working on other learner infrastructure projects independent of the language.},
	author       = {Volodina, Elena and Megyesi, Beáta},
	year         = {2021},
	institution  = {Institutionen för svenska språket, Göteborgs universitet},
	address      = {Göteborg},
}

@inproceedings{volodina-etal-2021-dalaj-311725,
	author       = {Volodina, Elena and Ali Mohammed, Yousuf and Klezl, Julia},
	title        = {DaLAJ - a dataset for linguistic acceptability judgments for Swedish},
	booktitle    = {Proceedings of the 10th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2021), Online},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	isbn         = {978-91-7929-625-4},
	abstract     = {We present DaLAJ 1.0, a Dataset for Linguistic Acceptability Judgments for Swedish, comprising 9 596 sentences in its first version. DaLAJ is based on the SweLL second language learner data (Volodina et al., 2019), consisting of essays at different levels of proficiency. To make sure the dataset can be freely available despite the GDPR regulations, we have sentence-scrambled learner essays and removed part of the metadata about learners, keeping for each sentence only information about the mother tongue and the level of the course where the essay has been written. We use the normalized version of learner language as the basis for DaLAJ sentences, and keep only one error per sentence. We repeat the same sentence for each individual correction tag used in the sentence. For DaLAJ 1.0 four error categories of 35 available in SweLL are used, all connected to lexical or word-building choices. The dataset is included in the SwedishGlue benchmark. Below, we describe the format of the dataset, our insights and motivation for the chosen approach to data sharing.},
}

@inProceedings{volodina-etal-2021-coderoomor-311724,
	title        = {CoDeRooMor: A new dataset for non-inflectional morphology studies of Swedish},
	abstract     = {The paper introduces a new resource, CoDeRooMor, for studying the morphology of modern Swedish word formation. The approximately 16.000 lexical items in the resource have been manually segmented into word-formation morphemes, and labeled for their categories, such as prefixes, suffixes, roots, etc. Word-formation mechanisms, such as derivation and compounding have been associated with each item on the list. The article describes the selection of items for manual annotation and the principles of annotation, reports on the reliability of the manual annotation, and presents tools, resources and some first statistics. Given the “gold” nature of the resource, it is possible to use it for empirical studies as well as to develop linguistically-aware algorithms for morpheme segmentation and labeling (cf statistical subword approach). The resource is freely available through Språkbanken-Text.},
	booktitle    = {23rd Nordic Conference on Computational Linguistics (NoDaLiDa) Proceedings, May 31–2 June, 2021, Reykjavik, Iceland Online},
	editor       = {Dobnik, Simon and Øvrelid, Lilja},
	author       = {Volodina, Elena and Ali Mohammed, Yousuf and Lindström Tiedemann, Therese},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
}

@article{zanetti-etal-2021-automatic-311723,
	title        = {Automatic Generation of Exercises for Second Language Learning from Parallel Corpus Data},
	abstract     = {Creating language learning exercises is a time-consuming task and made-up sample sentences frequently lack authenticity. Authentic samples can be obtained from corpora, but it is necessary to identify material that is suitable for language learners. Parallel corpora of written text consist of translated material. Comparing the text in one language with its translation into another (known) language makes the structure accessible to the learner. However, the correspondence of words between the two languages is more important. By carefully selecting well-suited parallel sentences, a learner can explore the target language in a guided way. We present an approach to generate a novel type of language learning exercise from a large parallel corpus based on movie subtitles. The size of the corpus allows for defining selective criteria, favoring precision over recall. It is a non-trivial task to give reliable feedback to automatically generated exercises. ICALL literature often deals with fill-in-the-blanks exercises or multiple-choice questions, which allow for very limited answer options. Our proposed exercise is a special case of sentence reconstruction on bilingual sentence pairs. It combines two elements which have proven to be effective for language learning: a gamified approach, to awaken the students’ competitive desire, and the identification of syntactic structures and vocabulary use, to improve language sensitivity. This article presents the methods used to select example pairs and to implement a prototype.},
	journal      = {International Journal of TESOL Studies},
	author       = {Zanetti, Arianna and Volodina, Elena and Graën, Johannes},
	year         = {2021},
	volume       = {3},
	number       = {2},
	pages        = {55--71},
}

@proceedings{alfter-etal-2021-proceedings-311727,
	title        = {Proceedings of the 10th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2021)},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language
Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural
Language Processing and Speech Technologies in CALL systems and exploring the theoretical and
methodological issues arising in this connection. The latter includes, among others, the integration of
insights from Second Language Acquisition (SLA) research, and the promotion of “Computational
SLA” through setting up Second Language research infrastructures.},
	editor       = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Graën, Johannes and Borin, Lars},
	year         = {2021},
	series       = {Linköping Electronic Conference Proceedings},
	number       = {177},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-625-4},
}

@incollection{prentice-etal-2021-language-310517,
	title        = {Language learning and teaching with Swedish FrameNet++: Two examples},
	booktitle    = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	author       = {Prentice, Julia and Håkansson, Camilla and Lindström Tiedemann, Therese and Pilán, Ildikó and Volodina, Elena},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam, Philadelphia},
	ISBN         = {9789027258489},
	pages        = {304--329},
}

@inProceedings{alfter-etal-2020-expert-300074,
	title        = {Expert judgments versus crowdsourcing in ordering multi-word expressions},
	abstract     = {In this study we investigate to which degree experts and non-experts agree on questions of linguistic complexity in a crowdsourcing experiment. We ask non-experts (second language learners of Swedish) and two groups of experts (teachers of Swedish as a second/foreign language and CEFR experts) to rank multi-word expressions in a crowdsourcing experiment. We find that the resulting rankings by all the three tested groups correlate to a very high degree, which suggests that judgments produced in a comparative setting are not influenced by professional insights into Swedish as a second language.},
	booktitle    = {Proceedings of the Swedish Language Technology Conference (SLTC), 25–27 November 2020, (Online)},
	author       = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2020},
}

@proceedings{alfter-etal-2020-proceedings-300071,
	title        = {Proceedings of the 9th Workshop on Natural Language Processing for Computer Assisted Language Learning 2020},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.
This collection presents four selected papers describing use of Language Technology for language learning.},
	editor       = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Lange, Herbert and Borin, Lars},
	year         = {2020},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-732-9},
}

% COLING 2020 paper on on-the-fly pseudonymization of learner essays.
% Percent signs in the abstract are escaped so the field survives being
% written into a .bbl file; the proper noun in the title is brace-protected.
@inProceedings{volodina-etal-2020-towards-300069,
	title        = {Towards Privacy by Design in Learner Corpora Research: A Case of On-the-fly Pseudonymization of {Swedish} Learner Essays},
	abstract     = {This article reports on an ongoing project aiming at automatization of pseudonymization of learner essays. The process includes three steps: identification of personal information in an unstructured text, labeling for a category, and pseudonymization. We experiment with rule-based methods for detection of 15 categories out of the suggested 19 (Megyesi et al., 2018) that we deem important and/or doable with automatic approaches. For the detection and labeling steps, we use resources covering personal names, geographic names, company and university names and others. For the pseudonymization step, we replace the item using another item of the same type from the above-mentioned resources. Evaluation of the detection and labeling steps are made on a set of manually anonymized essays. The results are promising and show that 89\% of the personal information can be successfully identified in learner data, and annotated correctly with an inter-annotator agreement of 86\% measured as Fleiss kappa and Krippendorff's alpha.},
	booktitle    = {Proceedings of the 28th International Conference on Computational Linguistics (COLING), December 8--13, 2020, Barcelona, Spain (Online)},
	author       = {Volodina, Elena and Ali Mohammed, Yousuf and Derbring, Sandra and Matsson, Arild and Megyesi, Beáta},
	year         = {2020},
	publisher    = {International Committee on Computational Linguistics},
	ISBN         = {978-1-952148-27-9},
}

% Journal article presenting the SweLL learner corpus.
% The mixed-case corpus name is brace-protected against style recasing.
@article{volodina-etal-2019-swell-285609,
	title        = {The {SweLL} Language Learner Corpus: From Design to Annotation},
	abstract     = {The article presents a new language learner corpus for Swedish, SweLL, and the methodology from collection and pseudonymisation to protect personal information of learners to annotation adapted to second language learning. The main aim is to deliver a well-annotated corpus of essays written by second language learners of Swedish and make it available for research through a browsable environment. To that end, a new annotation tool and a new project management tool have been implemented, – both with the main purpose to ensure reliability and quality of the final corpus. In the article we discuss reasoning behind metadata selection, principles of gold corpus compilation and argue for separation of normalization from correction annotation.},
	journal      = {Northern European Journal of Language Technology},
	author       = {Volodina, Elena and Granstedt, Lena and Matsson, Arild and Megyesi, Beáta and Pilán, Ildikó and Prentice, Julia and Rosén, Dan and Rudebeck, Lisa and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
	year         = {2019},
	volume       = {6},
	pages        = {67--104},
}

% Workshop paper on the Lärka platform (ISCA workshop on speech and language
% technology in education, Stockholm, 2017).
@inproceedings{pilan-etal-2017-larka-289884,
  author    = {Pilán, Ildikó and Alfter, David and Volodina, Elena},
  title     = {Lärka: an online platform where language learning meets natural language processing},
  booktitle = {7th ISCA Workshop on Speech and Language Technology in Education, 25-26 August 2017, Stockholm, Sweden},
  year      = {2017},
}

% LCR-2019 abstract on the SVALA annotation tool.
% The abstract is cleaned of PDF justification artifacts (runs of spaces,
% spaced-out hyphens); the tool name is brace-protected in the title.
@inProceedings{volodina-etal-2019-svala-285617,
	title        = {{SVALA}: an Annotation Tool for Learner Corpora generating parallel texts},
	abstract     = {Learner corpora are actively used for research on Language Acquisition and in Learner Corpus Research (LCR). The data is, however, very expensive to collect and manually annotate, and includes steps like anonymization, normalization, error annotation, linguistic annotation. In the past, projects often re-used tools from a number of different projects for the above steps. As a result, various input and output formats between the tools needed to be converted, which increased the complexity of the task. In the present project, we are developing a tool that handles all of the above-mentioned steps in one environment maintaining a stable interpretable format between the steps. A distinguishing feature of the tool is that users work in a usual environment (plain text) while the tool visualizes all performed edits via a graph that links an original learner text with an edited one, token by token.},
	booktitle    = {Learner Corpus Research conference (LCR-2019), Warsaw, 12--14 September 2019, Book of abstracts},
	author       = {Volodina, Elena and Matsson, Arild and Rosén, Dan and Wirén, Mats},
	year         = {2019},
}

% Proceedings volume of the 8th NLP4CALL workshop (2019), typed as a whole
% proceedings volume; the publisher is reduced to the single press name.
% NOTE(review): the listed people are assumed to be the volume editors - confirm.
@proceedings{alfter-etal-2019-proceedings-285613,
	title        = {Proceedings of the 8th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2019), September 30, Turku, Finland},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promote development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other.

The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools.

The NLP4CALL workshop series is aimed at bringing together competences from these areas for sharing experiences and brainstorming around the future of the field.},
	editor       = {Alfter, David and Volodina, Elena and Borin, Lars and Pilán, Ildikó and Lange, Herbert},
	year         = {2019},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-998-9},
}

% EUROCALL 2019 abstract on sense-based graded word lists.
% The stray leading space in booktitle is removed and dashes are typeset properly.
@inProceedings{alfter-volodina-2019-from-285728,
	title        = {From river to bank: The importance of sense-based graded word lists},
	booktitle    = {EUROCALL 2019 -- CALL and Complexity, Book of Abstracts, Louvain-la-Neuve, Belgium, 28--31 August 2019},
	author       = {Alfter, David and Volodina, Elena},
	year         = {2019},
}

% Paper on the SVALA tool in the selected papers of the CLARIN Annual
% Conference 2018. Address now holds the publisher's city, the publisher is a
% single press name, and editors use the unambiguous "Last, First" form.
@inProceedings{wiren-etal-2018-svala-285624,
	title        = {{SVALA}: Annotation of Second-Language Learner Text Based on Mostly Automatic Alignment of Parallel Corpora},
	abstract     = {Annotation of second-language learner text is a cumbersome manual task which in turn requires interpretation to postulate the intended meaning of the learner’s language. This paper describes SVALA, a tool which separates the logical steps in this process while providing rich visual support for each of them. The first step is to pseudonymize the learner text to fulfil the legal and ethical requirements for a distributable learner corpus. The second step is to correct the text, which is carried out in the simplest possible way by text editing. During the editing, SVALA automatically maintains a parallel corpus with alignments between words in the learner source text and corrected text, while the annotator may repair inconsistent word alignments. Finally, the actual labelling of the corrections (the postulated errors) is performed. We describe the objectives, design and workflow of SVALA, and our plans for further development.},
	booktitle    = {Selected papers from the CLARIN Annual Conference 2018, Pisa, 8--10 October 2018},
	editor       = {Skadina, Inguna and Eskevich, Maria},
	author       = {Wirén, Mats and Matsson, Arild and Rosén, Dan and Volodina, Elena},
	year         = {2018},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-034-3},
}

% NoDaLiDa 2019 paper on the LEGATO annotation tool.
% Editors, series name and series number are moved out of booktitle into
% their own fields; the trailing period is dropped from the title.
% NOTE(review): "NEAL" is kept as found in the export; it may stand for a
% separate series label - confirm against the published volume.
@inProceedings{alfter-etal-2019-legato-285625,
	title        = {{LEGATO}: A flexible lexicographic annotation tool},
	abstract     = {This article is a report from an ongoing project aiming at analyzing lexical and grammatical competences of Swedish as a Second language (L2). To facilitate lexical analysis, we need access to metalinguistic information about relevant vocabulary that L2 learners can use and understand. The focus of the current article is on the lexical annotation of the vocabulary scope for a range of lexicographical aspects, such as morphological analysis, valency, types of multi-word units, etc. We perform parts of the analysis automatically, and other parts manually. The rationale behind this is that where there is no possibility to add information automatically, manual effort needs to be added. To facilitate the latter, a tool LEGATO has been designed, implemented and currently put to active testing.},
	booktitle    = {NEAL Proceedings of the 22nd Nordic Conference on Computational Linguistics (NoDaLiDa), September 30--October 2, Turku, Finland},
	editor       = {Hartman, Mareike and Plank, Barbara},
	series       = {Linköping Electronic Conference Proceedings},
	number       = {167},
	author       = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2019},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-995-8},
}

% Paper on Lärka as a research infrastructure component.
% The export gave only the series name as booktitle; it is moved to the
% series field. The booktitle is taken from the other entry in this file
% that carries the same ISBN (978-91-7685-034-3).
% NOTE(review): confirm the host volume and the year (that sibling entry
% says 2018) against the published volume.
@inProceedings{alfter-etal-2019-larka-281344,
	title        = {Lärka: From Language Learning Platform to Infrastructure for Research on Language Learning},
	abstract     = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN. Lärka has recently received a new responsive user interface adapted to different devices with different screen sizes. Moreover, the system has also been augmented with new functionalities. These recent additions aim at improving the usability and the usefulness of the platform for pedagogical purposes. The most important development, though, is the adaptation of the platform to serve as a component in an e-infrastructure supporting research on language learning and multilingualism. Thanks to Lärka’s service-oriented architecture, most functionalities are also available as web services which can be easily re-used by other applications.},
	booktitle    = {Selected papers from the CLARIN Annual Conference 2018, Pisa, 8--10 October 2018},
	series       = {Linköping Electronic Conference Proceedings},
	author       = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2019},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-034-3},
}

% CLARIN-2018 conference abstract on error coding of learner texts.
% The trailing ". " is removed from the title (styles add their own
% punctuation) and the booktitle spacing and date range are normalised.
@inProceedings{rosen-etal-2018-error-275363,
	title        = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
	abstract     = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
	booktitle    = {Proceedings of CLARIN-2018 conference, 8--10 October 2018, Pisa, Italy},
	author       = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
	year         = {2018},
}

% SLTC 2018 short paper on first annotation experiences in the SweLL project.
% The title loses its trailing period and protects the mixed-case project name.
@inProceedings{volodina-etal-2018-annotation-275361,
	title        = {Annotation of learner corpora: first {SweLL} insights},
	abstract     = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
	booktitle    = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
	author       = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
	year         = {2018},
}

% SLTC 2018 pilot study on lexical complexity of multi-word expressions.
% The trailing period is removed from the title so styles do not print a
% doubled full stop.
@inProceedings{alfter-volodina-2018-whole-275362,
	title        = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions},
	abstract     = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
	booktitle    = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
	author       = {Alfter, David and Volodina, Elena},
	year         = {2018},
}

% COLING 2018 workshop paper on unsupervised correction of learner errors.
% Fixes the publisher name, removes trailing periods from title and
% booktitle, and rejoins words split by PDF line-break hyphenation.
@inProceedings{pilan-volodina-2018-exploring-275366,
	title        = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
	abstract     = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
	booktitle    = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
	author       = {Pilán, Ildikó and Volodina, Elena},
	year         = {2018},
	publisher    = {Association for Computational Linguistics},
	ISBN         = {978-1-948087-61-2},
}

% COLING 2018 workshop paper on linguistic complexity features across datasets.
% Fixes the publisher name, removes trailing periods from title and
% booktitle, and rejoins a word split by PDF line-break hyphenation.
@inProceedings{pilan-volodina-2018-investigating-275367,
	title        = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
	abstract     = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
	booktitle    = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
	author       = {Pilán, Ildikó and Volodina, Elena},
	year         = {2018},
	publisher    = {Association for Computational Linguistics},
	ISBN         = {978-1-948087-62-9},
}

% Workshop paper (Innovative Use of NLP for Building Educational
% Applications, 2018) on single-word lexical complexity prediction.
% Fixes the publisher name and strips the trailing period and whitespace.
@inProceedings{alfter-volodina-2018-towards-275368,
	title        = {Towards Single Word Lexical Complexity Prediction},
	abstract     = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
	booktitle    = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
	author       = {Alfter, David and Volodina, Elena},
	year         = {2018},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {978-1-948087-11-7},
}

% NLP4CALL 2018 paper on learner corpus anonymization under GDPR.
% Address now holds the publisher's city; editors use "Last, First" form;
% the acronym and proper noun in the title are brace-protected.
@inProceedings{megyesi-etal-2018-learner-275359,
	title        = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
	abstract     = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
	booktitle    = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
	editor       = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
	author       = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
	year         = {2018},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-173-9},
}

% Proceedings volume of the 7th NLP4CALL workshop (2018), typed as a whole
% proceedings volume. The same four people appear as editors of this volume
% in the paper entry that shares its ISBN, so they go in the editor field.
@proceedings{pilan-etal-2018-proceedings-275358,
	title        = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018},
	abstract     = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other.

The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
	editor       = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
	year         = {2018},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-173-9},
}

% CLARIN-2018 conference contribution on interoperability of second language
% resources and tools.
@inproceedings{volodina-etal-2018-interoperability-275365,
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  title     = {Interoperability of Second Language Resources and Tools},
  booktitle = {Proceedings of CLARIN-2018 conference},
  year      = {2018},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
}

% CLARIN-2018 conference abstract on Lärka as research infrastructure.
% A word split by PDF line-break hyphenation ("corpus- based") is rejoined.
@inProceedings{alfter-etal-2018-from-275364,
	title        = {From Language Learning Platform to Infrastructure for Research on Language Learning},
	abstract     = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
	booktitle    = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
	author       = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2018},
}

% Journal article on automatic assessment of language learning materials.
% Percent signs in the abstract are escaped so the field survives being
% written into a .bbl file; the trailing period is dropped from the title
% and a missing space after a sentence end is restored.
@article{pilan-etal-2016-readable-226565,
	title        = {A readable read: Automatic Assessment of Language Learning Materials based on Linguistic Complexity},
	abstract     = {Corpora and web texts can become a rich language learning resource if we have a means of assessing whether they are linguistically appropriate for learners at a given proficiency level. In this paper, we aim at addressing this issue by presenting the first approach for predicting linguistic complexity for Swedish second language learning material on a 5-point scale. After showing that the traditional Swedish readability measure, Läsbarhetsindex (LIX), is not suitable for this task, we propose a supervised machine learning model, based on a range of linguistic features, that can reliably classify texts according to their difficulty level. Our model obtained an accuracy of 81.3\% and an F-score of 0.8, which is comparable to the state of the art in English and is considerably higher than previously reported results for other languages. We further studied the utility of our features with single sentences instead of full texts since sentences are a common linguistic unit in language learning exercises. We trained a separate model on sentence-level data with five classes, which yielded 63.4\% accuracy. Although this is lower than the document level performance, we achieved an adjacent accuracy of 92\%. Furthermore, we found that using a combination of different features, compared to using lexical features alone, resulted in 7\% improvement in classification accuracy at the sentence level, whereas at the document level, lexical features were more dominant. Our models are intended for use in a freely accessible web-based language learning platform for the automatic generation of exercises, and they will be available also in the form of web-services.},
	journal      = {Computational Linguistics and Applications},
	author       = {Pilán, Ildikó and Vajjala, Sowmya and Volodina, Elena},
	year         = {2016},
	volume       = {7},
	number       = {1},
	pages        = {143--159},
}

% Extended paper from the Digital Humanities 2016 symposium describing Swe-Clarin.
% Editors and the volume number are moved out of booktitle into their own fields.
% NOTE(review): the series name is inferred from the "Vol-2021" numbering and
% the publisher imprint - confirm. The editor surname "Milra" is kept as
% exported and may be truncated - confirm.
@inProceedings{borin-etal-2017-clarin-261157,
	title        = {{Swe-Clarin}: Language resources and technology for {Digital Humanities}},
	abstract     = {CLARIN is a European Research Infrastructure Consortium (ERIC), which aims at (a) making extensive language-based materials available as primary research data to the humanities and social sciences (HSS); and (b) offering state-of-the-art language technology (LT) as an e-research tool for this purpose, positioning CLARIN centrally in what is often referred to as the digital humanities (DH). The Swedish CLARIN node Swe-Clarin was established in 2015 with funding from the Swedish Research Council.

In this paper, we describe the composition and activities of Swe-Clarin, aiming at meeting the requirements of all HSS and other researchers whose research involves using text and speech as primary research data, and spreading the awareness of what Swe-Clarin can offer these research communities. We focus on one of the central means for doing this: pilot projects conducted in collaboration between HSS researchers and Swe-Clarin, together formulating a research question, the addressing of which requires working with large language-based materials. Four such pilot projects are described in more detail, illustrating research on rhetorical history, second-language acquisition, literature, and political science. A common thread to these projects is an aspiration to meet the challenge of conducting research on the basis of very large amounts of textual data in a consistent way without losing sight of the individual cases making up the mass of data, i.e., to be able to move between Moretti’s “distant” and “close reading” modes.

While the pilot projects clearly make substantial contributions to DH, they also reveal some needs for more development, and in particular a need for document-level access to the text materials. As a consequence of this, work has now been initiated in Swe-Clarin to meet this need, so that Swe-Clarin together with HSS scholars investigating intricate research questions can take on the methodological challenges of big-data language-based digital humanities.},
	booktitle    = {Digital Humanities 2016. Extended Papers of the International Symposium on Digital Humanities (DH 2016), Växjö, Sweden, November 7--8, 2016},
	editor       = {Golub, Koraljka and Milra, Marcelo},
	series       = {CEUR Workshop Proceedings},
	volume       = {2021},
	author       = {Borin, Lars and Tahmasebi, Nina and Volodina, Elena and Ekman, Stefan and Jordan, Caspar and Viklund, Jon and Megyesi, Beáta and Näsman, Jesper and Palmér, Anne and Wirén, Mats and Björkenstam, Kristina and Grigonyte, Gintare and Gustafson Capková, Sofia and Kosiński, Tomasz},
	year         = {2017},
	publisher    = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen},
	address      = {Aachen},
}

% Journal article on candidate sentence selection for language learning exercises.
% The special-issue title is moved from the journal field into note so the
% journal field holds only the journal name.
@article{pilan-etal-2017-candidate-260382,
	title        = {Candidate sentence selection for language learning exercises: From a comprehensive framework to an empirical evaluation},
	abstract     = {We present a framework and its implementation relying on Natural Language Processing methods, which aims at the identification of exercise item candidates from corpora. The hybrid system combining heuristics and machine learning methods includes a number of relevant selection criteria. We focus on two fundamental aspects: linguistic complexity and the dependence of the extracted sentences on their original context. Previous work on exercise generation addressed these two criteria only to a limited extent, and a refined overall candidate sentence selection framework appears also to be lacking. In addition to a detailed description of the system, we present the results of an empirical evaluation conducted with language teachers and learners which indicate the usefulness of the system for educational purposes. We have integrated our system into a freely available online learning platform.},
	journal      = {Revue Traitement Automatique des Langues},
	author       = {Pilán, Ildikó and Volodina, Elena and Borin, Lars},
	year         = {2017},
	volume       = {57},
	number       = {3},
	pages        = {67--91},
	note         = {Special issue on NLP for Learning and Teaching},
}

% Preface to the joint NLP4CALL / NLP4LA workshop proceedings at NoDaLiDa 2017.
% Typed as a contribution to a proceedings volume so that volume and pages
% are not dropped by standard styles.
% NOTE(review): assumes "Preface." was the item title and the remainder of
% the exported title names the host volume; the series that volume 30
% belongs to is not given in the export - confirm both.
@inProceedings{volodina-etal-2017-preface-262846,
	title        = {Preface},
	abstract     = {For the second year in a row we brought two related themes of NLP for Computer-Assisted Language Learning and NLP for Language Acquisition together. The goal of organizing joint workshops is to provide a meeting place for researchers working on language learning issues including both empirical and experimental studies and NLP-based applications. The resulting volume covers a variety of topics from the two fields and - hopefully - showcases the challenges and achievements in the field.
The seven papers in this volume cover native language identification in learner writings, using syntactic complexity development in language learner language to identify reading comprehension texts of appropriate level, exploring the potential of parallel corpora to predict mother-language specific problem areas for learners of another language, tools for learning languages - both well-resourced ones such as English as well as endangered or under-resourced ones such as Yakut and Võro, as well as exploring the potential of automatically identifying and correcting word-level errors in Swedish learner writing.},
	booktitle    = {Proceedings of the Joint 6th Workshop on NLP for Computer Assisted Language Learning and 2nd Workshop on NLP for Research on Language Acquisition at NoDaLiDa 2017, Gothenburg, 22nd May 2017},
	author       = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Grigonyte, Gintare and Nilsson Björkenstam, Kristina},
	year         = {2017},
	volume       = {30},
	pages        = {i--vi},
}

% Conference paper in Svenskans beskrivning 35 describing the SVALex word list.
% LaTeX specials (ampersand, percent sign) are escaped so booktitle and abstract typeset safely.
@inProceedings{volodina-etal-2017-svalex-262848,
	title        = {{SVALex}. En andraspråksordlista med {CEFR}-nivåer},
	abstract     = {När man planerar att utveckla en språkkurs i ett andra- eller främmandespråk (L2) ställs man inför utmaningen att definiera vilket ordförråd inlärarna behöver tillägna sig. Forskning inom andraspråksinlärning tyder på att läsaren behöver kunna 95–98 \% av löporden i en text för att förstå den (Laufer \& Ravenhorst-Kalovski 2010). Sådana studier är användbara för att uppskatta storleken på det ordförråd som behövs för att tillägna sig innehållet i en text, men de ger ingen närmare metodologisk vägledning för den som vill utveckla nivåstrukturerade läromedel eller kurser för andraspråksundervisning. Speciellt tydligt är detta inom CALL, Computer-Assisted Language Learning, där läromaterial (t.ex. övningar) genereras automatiskt, och behöver elektroniska resurser som kunskapskälla.

Man kan istället angripa problemet från andra hållet. Om man har en samling nivåklassificerade texter för andraspråksinlärare kan man utifrån dem bygga ordlistor där varje ord är placerat på en färdighetsskala. Om man känner till den förutsatta färdighetsnivån hos läsaren, kan man helt enkelt anta att den textnivå där ett ord dyker upp första gången också anger ordets svårighetsgrad. SVALex är ett lexikon som har byggts enligt den principen. Resursen ska kunna användas av inlärare och lärare i svenska som andraspråk, men även av lexikografer, av kursutvecklare och provkonstruktörer samt av dem som likt oss själva ägnar sig åt utveckling av språkteknologibaserade datorstöd för språkinlärning och språktestning.

SVALex utgör en vidareutveckling i förhållande till tidigare lexikonresurser för svenska som andraspråk (se avsnitt 2), genom att den konsekvent relaterar de 15 681 lexikoningångarna till en vida använd färdighetsskala för andra- och främmandespråksinlärning, Europarådets gemensamma europeiska referensram för språk (Common European Framework of Reference, i fortsättningen refererad till som CEFR) (Council of Europe 2001; Skolverket 2009).

Nivåklassningen av lexikonenheterna i SVALex görs på basis av deras distribution i COCTAILL, en korpus innehållande lärobokstexter i svenska som andraspråk, där lärare har placerat in varje text i någon av CEFR-nivåerna (Volodina et al. 2014).},
	booktitle    = {Svenskans beskrivning. 35, Förhandlingar vid trettiofemte sammankomsten : Göteborg 11–13 maj 2016 / Redigerad av Emma Sköldberg, Maia Andréasson, Henrietta Adamsson Eryd, Filippa Lindahl, Sven Lindström, Julia Prentice \& Malin Sandberg},
	author       = {Volodina, Elena and Borin, Lars and Pilán, Ildikó and François, Thomas and Tack, Annaïs},
	year         = {2017},
	publisher    = {Göteborgs universitet},
	address      = {Göteborg},
	ISBN         = {978-91-87850-64-6},
}

% Whole proceedings volume: typed as proceedings so publisher and address are used,
% with the volume's compilers recorded in the editor field.
@proceedings{volodina-etal-2017-proceedings-262838,
	title        = {Proceedings of the Joint 6th Workshop on {NLP} for Computer Assisted Language Learning and 2nd Workshop on {NLP} for Research on Language Acquisition at {NoDaLiDa} 2017, {Gothenburg}, 22nd {May} 2017},
	abstract     = {For the second year in a row we have brought the two related themes of NLP for Computer-Assisted Language Learning and NLP for Language Acquisition together under one umbrella. The goal of organizing these joint workshops is to provide a meeting place for researchers working on language learning issues including both empirical and experimental studies and NLP-based applications.},
	editor       = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Grigonyte, Gintare and Nilsson Björkenstam, Kristina},
	year         = {2017},
	publisher    = {Linköping University Press},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7685-502-7},
}

% Preface (front matter, pp. i--viii) of the joint NLP4CALL and LA 2016 workshop proceedings.
% The page range uses the BibTeX double hyphen; ampersands in the abstract are escaped.
@misc{volodina-etal-2016-preface-248087,
	title        = {Preface. Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, {Umeå}, 16th {November} 2016},
	abstract     = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) \& NLP for Language Acquisition (LA) – shorthand NLP4CALL\&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below.

The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics.

The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition.

The joint workshop series on NLP4CALL\&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
	author       = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
	year         = {2016},
	number       = {130},
	pages        = {i--viii},
}

% EuroCALL 2016 paper on CEFR-level classification of Swedish learner essays.
% ISBN is stored without surrounding whitespace; copy/paste whitespace runs in the abstract are collapsed.
@inProceedings{volodina-etal-2016-classification-246346,
	title        = {Classification of {Swedish} learner essays by {CEFR} levels},
	abstract     = {The paper describes initial efforts on creating a system for the automatic assessment of Swedish second language (L2) learner essays from two points of view: holistic evaluation of the reached level according to the Common European Framework of Reference (CEFR), and the lexical analysis of texts for receptive and productive vocabulary per CEFR level. We describe the data and resources that our experiments were based on, provide a short introduction to the algorithm for essay classification and experiment results, present the user interface we developed for testing new essays and outline future work.},
	booktitle    = {Proceedings of EuroCALL 2016. 24-27th August 2016, Cyprus.},
	author       = {Volodina, Elena and Pilán, Ildikó and Alfter, David},
	year         = {2016},
	publisher    = {Research-publishing.net},
	ISBN         = {978-1-908416-44-5},
}

@inproceedings{alfter-volodina-2016-modeling-246347,
  author    = {Alfter, David and Volodina, Elena},
  title     = {Modeling Individual Learner Knowledge in a Computer Assisted Language Learning System},
  booktitle = {Proceedings of the Sixth Swedish Language Technology Conference. Umeå University, 17-18 November, 2016},
  year      = {2016},
}

% Whole proceedings volume: typed as proceedings so publisher and address are used,
% with the volume's compilers recorded in the editor field. Ampersands in the abstract are escaped.
@proceedings{volodina-etal-2016-proceedings-248081,
	title        = {Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, {Umeå}, 16th {November} 2016},
	abstract     = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) \& NLP for Language Acquisition (LA) – shorthand NLP4CALL\&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below.

The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics.

The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition.

The joint workshop series on NLP4CALL\&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
	editor       = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
	year         = {2016},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-633-8},
}

% Workshop paper introducing SweLLex, a productive-vocabulary list for learners of Swedish.
% The citation inside the abstract is closed with a parenthesis so the field's braces stay balanced.
@inProceedings{volodina-etal-2016-swellex-248090,
	title        = {{SweLLex}: second language learners' productive vocabulary.},
	abstract     = {This paper presents a new lexical resource for learners of Swedish as a second language, SweLLex, and a know-how behind its creation. We concentrate on L2 learners’ productive vocabulary, i.e. words that they are actively able to produce, rather than the lexica they comprehend (receptive vocabulary). The proposed list covers productive vocabulary used by L2 learners in their essays. Each lexical item on the list is connected to its frequency distribution over the six levels of proficiency defined by the Common European Framework of Reference (CEFR) (Council of Europe, 2001). To make this list a more reliable resource, we experiment with normalizing L2 word-level errors by replacing them with their correct equivalents. SweLLex has been tested in a prototype system for automatic CEFR level classification of essays as well as in a visualization tool aimed at exploring L2 vocabulary contrasting receptive and productive vocabulary usage at different levels of language proficiency.},
	booktitle    = {Linköping Electronic Conference Proceedings. Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016},
	author       = {Volodina, Elena and Pilán, Ildikó and Llozhi, Lorena and Degryse, Baptiste and François, Thomas},
	year         = {2016},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-633-8},
}

@inproceedings{volodina-pilan-2016-svalex-248116,
  author    = {Volodina, Elena and Pilán, Ildikó},
  title     = {SVALex: en andraspråksordlista graderad enligt CEFR nivåer.},
  booktitle = {Svenskans Beskrivning 35, Göteborg 2016},
  year      = {2016},
}

@inproceedings{volodina-etal-2016-swell-248145,
  author    = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Lundkvist, Peter and Sundberg, Gunlög and Llozhi, Lorena and Sandell, Monica},
  title     = {SweLL – en korpus med L2 uppsatser för CEFR studier.},
  booktitle = {Svenskans Beskrivning 35, Göteborg 2016},
  year      = {2016},
}

@inproceedings{pilan-volodina-2016-classification-248099,
  author    = {Pilán, Ildikó and Volodina, Elena},
  title     = {Classification of Language Proficiency Levels in Swedish Learners' Texts},
  booktitle = {The Sixth Swedish Language Technology Conference (SLTC), Umeå University, 17-18 November, 2016},
  year      = {2016},
  abstract  = {We evaluate a system for the automatic classification of texts written by learners of Swedish as a second language into levels of language proficiency.   Since the amount of available annotated learner essay data for our target language is rather small, we explore also the potentials of domain adaptation for this task.  The additional domain consists of coursebook texts written by experts for learners.  We find that already with a smaller amount of in-domain Swedish learner essay data it is possible to obtain results that compare well to state-of-the-art systems for other languages, with domain adaptation methods yielding a slight improvement.},
}

@inproceedings{lindstromtiedemann-volodina-2016-larka-248119,
  author    = {Lindström Tiedemann, Therese and Volodina, Elena},
  title     = {Lärka som didaktiskt verktyg. Undersökning om studenternas metaspråkliga kunskap.},
  booktitle = {Svenskans Beskrivning 35, 11-13  maj 2016, Göteborg},
  year      = {2016},
}

@inproceedings{pilan-etal-2016-coursebook-246349,
  author    = {Pilán, Ildikó and Alfter, David and Volodina, Elena},
  title     = {Coursebook texts as a helping hand for classifying linguistic complexity in language learners' writings},
  booktitle = {Proceedings of the workshop on Computational Linguistics for Linguistic Complexity},
  year      = {2016},
  isbn      = {978-4-87974-709-9},
  abstract  = {We bring together knowledge from two different types of language learning data, texts learners read and texts they write, to improve linguistic complexity classification in the latter. Linguistic complexity in the foreign and second language learning context can be expressed in terms of proficiency levels.  We show that incorporating features capturing lexical complexity information from reading passages can boost significantly the machine learning based classification of learner-written texts into proficiency levels.  With an F1 score of .8 our system rivals state-of-the-art results reported for other languages for this task.  Finally, we present a freely available web-based tool for proficiency level classification and lexical complexity visualization for both learner writings and reading texts. },
}

@article{lindstromtiedemann-etal-2016-larka-248112,
  author   = {Lindström Tiedemann, Therese and Volodina, Elena and Jansson, Håkan},
  title    = {Lärka: ett verktyg för träning av språkterminologi och grammatik},
  journal  = {LexicoNordica},
  year     = {2016},
  volume   = {23},
  pages    = {161--181},
  abstract = {Lärka is a corpus-based tool, which allows students to practise and learn grammar based on authentic material. In this study we present how this has been used at four universities. We also use our logs to try to assess the students metalinguistic awareness in relation to international studies, and discuss how these logs can be used in the future.},
}

@inproceedings{volodina-etal-2016-friend-248093,
  author    = {Volodina, Elena and Megyesi, Beata and Wirén, Mats and Granstedt, Lena and Prentice, Julia and Reichenberg, Monica and Sundberg, Gunlög},
  title     = {A Friend in Need? Research agenda for electronic Second Language infrastructure.},
  booktitle = {Proceedings of the Swedish Language Technology Conference},
  year      = {2016},
  publisher = {Umeå Universitet},
  abstract  = {In this article, we describe the research and societal needs as well as ongoing efforts to shape Swedish as a Second Language (L2) infrastructure.  Our aim is to develop an electronic research infrastructure that would stimulate empiric research into learners’ language development by preparing data and developing language technology methods and algorithms that can successfully deal with deviations in the learner language.},
}

@inproceedings{alfter-etal-2016-from-246345,
  author    = {Alfter, David and Bizzoni, Yuri and Agebjörn, Anders and Volodina, Elena and Pilán, Ildikó},
  title     = {From Distributions to Labels: A Lexical Proficiency Analysis using Learner Corpora},
  booktitle = {Linköping Electronic Conference Proceedings},
  year      = {2016},
  publisher = {Linköping University Electronic Press},
  isbn      = {978-91-7685-633-8},
  abstract  = {In this work we look at how information from second language learner essay corpora can be used for the evaluation of unseen learner essays. Using a corpus of learner essays which have been graded by well-trained human assessors using the CEFR scale, we extract a list of word distributions over CEFR levels. For the analysis of unseen essays, we want to map each word to a so-called target CEFR level using this word list. However, the task of mapping from a distribution to a single label is not trivial. We are also investigating how we can evaluate the mapping from distribution to label. We show that the distributional profile of words from the essays, informed with the essays’ levels, consistently overlaps with our frequency-based method, in the sense that words holding the same level of proficiency as predicted by our mapping tend to cluster together in a semantic space. In the absence of a gold standard, this information can be useful to see how often a word is associated with the same level in two different models. Also, in this case we have a similarity measure that can show which words are more central to a given level and which words are more peripheral.
},
}

@inproceedings{volodina-etal-2014-what-206132,
  author    = {Volodina, Elena and Pilán, Ildikó and Rødven-Eide, Stian and Heidarsson, Hannes},
  title     = {You get what you annotate: a pedagogically annotated corpus of coursebooks for Swedish as a Second Language.},
  booktitle = {NEALT Proceedings Series},
  year      = {2014},
  volume    = {22},
  pages     = {128--144},
  isbn      = {978-91-7519-175-1},
  abstract  = {We present the COCTAILL corpus, containing over 700.000 tokens of Swedish texts from 12 coursebooks aimed at second/foreign  language (L2) learning. Each text in the corpus is labelled with a proficiency level according to the CEFR proficiency scale. Genres, topics, associated activities, vocabulary lists and other types of information are annotated in the coursebooks to facilitate Second Language Acquisition (SLA)-aware studies and experiments aimed at
Intelligent Computer-Assisted Language Learning (ICALL). Linguistic annotation in the form of parts-of-speech (POS; e.g. nouns, verbs), base forms (lemmas) and syntactic relations (e.g. subject, object) has been also added to the corpus.
In the article we describe our annotation scheme and the editor we have developed for the content mark-up of the coursebooks, including the taxonomy of pedagogical activities and linguistic skills. Inter-annotator agreement has been computed and reported
on a subset of the corpus.
Surprisingly, we have not found any other examples of pedagogically marked-up corpora based on L2 coursebooks to draw on existing experiences. Hence, our work may be viewed as “groping in the darkness” and eventually a starting point for others.
The paper also presents our first quantitative exploration of the corpus where we focus on
textually and pedagogically annotated features of the coursebooks to exemplify what types of studies can be performed using the presented annotation scheme. We explore trends shown in use of topics and genres over proficiency levels and compare pedagogical
focus of exercises across levels.
The final section of the paper summarises the potential this corpus holds for research within SLA and various ICALL tasks. },
}

% Whole proceedings volume (NLP4CALL 2015); uses the standard proceedings entry type,
% which reads the editor, publisher and address fields.
@proceedings{volodina-etal-2015-proceedings-226574,
	title        = {Proceedings of the 4th workshop on {NLP} for computer assisted language learning at {Nodalida} 2015, {Vilnius}, 11th {May}, 2015},
	editor       = {Volodina, Elena and Borin, Lars and Pilán, Ildikó},
	year         = {2015},
	publisher    = {Linköping University Press},
	address      = {Linköping},
	ISBN         = {978-91-7519-036-5},
}

% Workshop paper on a text-to-speech based dictation and spelling exercise for learners of Swedish.
% The ampersand in the abstract is escaped and copy/paste whitespace runs are collapsed.
@inProceedings{volodina-pijetlovic-2015-lark-226543,
	title        = {Lark Trills for Language Drills: Text-to-speech technology for language learners.},
	abstract     = {This paper reports on the development and the initial evaluation of a dictation\&spelling prototype exercise for second language (L2) learners of Swedish based on text-to-speech (TTS) technology. Implemented on an already existing Intelligent Computer-Assisted Language Learning (ICALL) platform, the exercise has not only served as a test case for TTS in L2 environment, but has also shown a potential to train listening and orthographic skills, as well as has become a way of collecting learner-specific spelling errors into a database. Exercise generation re-uses well-annotated corpora, lexical resources, and text-to-speech technology with an accompanying talking head.},
	booktitle    = {Proceedings of the Ninth Workshop on Innovative Use of NLP for Building Educational Applications, June 4, 2015, Denver, Colorado, USA},
	author       = {Volodina, Elena and Pijetlovic, Dijana},
	year         = {2015},
	ISBN         = {978-1-941643-35-8},
	pages        = {107--117},
}

% Journal article on the KELLY word lists. The citation key is kept as exported so existing
% citations still resolve; the first author's surname is spelled Kilgarriff in the author field.
@article{kilgariff-etal-2014-corpus-188541,
	title        = {Corpus-Based Vocabulary lists for Language Learners for Nine Languages.},
	abstract     = {We present the KELLY project and its work on developing monolingual and bilingual word lists for language learning, using corpus methods, for nine languages and thirty-six language pairs. We describe the method and discuss the many challenges encountered. We have loaded the data into an online database to make it accessible for anyone to explore and we present our own first explorations of it. The focus of the paper is thus twofold, covering pedagogical and methodological aspects of the lists’ construction, and linguistic aspects of the by-product of the project, the KELLY database.},
	journal      = {Language Resources and Evaluation},
	author       = {Kilgarriff, Adam and Charalabopoulou, Frieda and Gavrilidou, Maria and Bondi Johannessen, Janne and Khalil, Saussan and Johansson Kokkinakis, Sofie and Lew, Robert and Sharoff, Serge and Vadlapudi, R. and Volodina, Elena},
	year         = {2014},
	volume       = {48},
	number       = {1},
	pages        = {121--163},
}

% Workshop paper on sentence-level readability for second language learners of Swedish.
% The page range uses the BibTeX double hyphen; the percent sign in the abstract is escaped.
@inProceedings{pilan-etal-2014-rule-210940,
	title        = {Rule-based and machine learning approaches for second language sentence-level readability},
	abstract     = {We present approaches for the identification
of sentences understandable by second
language learners of Swedish, which
can be used in automatically generated exercises based on corpora. In this work we
merged methods and knowledge from machine
learning-based readability research,
from rule-based studies of Good Dictionary
Examples and from second language
learning syllabuses. The proposed selection
methods have also been implemented
as a module in a free web-based language
learning platform. Users can use
different parameters and linguistic filters
to personalize their sentence search with
or without a machine learning component
assessing readability. The sentences selected
have already found practical use as
multiple-choice exercise items within the
same platform. Out of a number of deep
linguistic indicators explored, we found
mainly lexical-morphological and semantic
features informative for second language
sentence-level readability. We obtained
a readability classification accuracy
result of 71\%, which approaches the performance of other models used in similar
tasks. Furthermore, during an empirical
evaluation with teachers and students,
about seven out of ten sentences selected
were considered understandable, the rule-based approach slightly outperforming the
method incorporating the machine learning
model.},
	booktitle    = {Proceedings of the Ninth Workshop on Innovative Use of NLP for Building Educational Applications, June 26, 2014 Baltimore, Maryland, USA},
	author       = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard},
	year         = {2014},
	ISBN         = {978-1-941643-03-7},
	pages        = {174--184},
}

% SLTC 2014 paper on evaluating metalinguistic knowledge with the Lärka platform.
% The second author's surname is spelled as in the other entries of this file (Lindström Tiedemann).
@inProceedings{volodina-lindstromtiedemann-2014-evaluating-206141,
	title        = {Evaluating students' metalinguistic knowledge with {Lärka}.},
	booktitle    = {Proceedings of the 5th Swedish Language Technology Conference, Uppsala University 13-14 November 2014},
	author       = {Volodina, Elena and Lindström Tiedemann, Therese},
	year         = {2014},
}

% Whole proceedings volume (third NLP4CALL workshop); uses the standard proceedings entry type,
% which reads the editor, publisher and address fields.
@proceedings{volodina-etal-2014-proceedings-206135,
	title        = {Proceedings of the third workshop on {NLP} for computer-assisted language learning at {SLTC} 2014, {Uppsala University}},
	abstract     = {The workshop series on NLP for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The papers in the proceedings volume from the third NLP4CALL workshop cover three main topic areas: resources for development of ICALL applications (e.g., learner corpora and coursebook corpora), tools and algorithms for the analysis of learner language (e.g., focusing on collocations, reading tasks, cloze items, pronunciation, spelling, level classification of learner production), and the generation of learning materials (e.g., exercise generators).},
	editor       = {Volodina, Elena and Borin, Lars and Pilán, Ildikó},
	year         = {2014},
	publisher    = {Linköping University Press},
	address      = {Linköping},
	ISBN         = {978-91-7519-175-1},
}

@inproceedings{volodina-etal-2014-flexible-201885,
  author    = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Tiedemann, Therese Lindström},
  title     = {A flexible language learning platform based on language resources and web services. },
  booktitle = {Proceedings of LREC 26-31 May 2014, Reykjavik, Iceland },
  year      = {2014},
  pages     = {3973--3978},
  isbn      = {978-2-9517408-8-4},
  abstract  = {We present Lärka, the language learning platform of Språkbanken (the Swedish Language Bank). It consists of an exercise generator which reuses resources available through Språkbanken: mainly Korp, the corpus infrastructure, and Karp, the lexical infrastructure.
Through Lärka we reach new user groups – students and teachers of Linguistics as well as second language learners and their teachers
– and this way bring Språkbanken's resources in a relevant format to them.
Lärka can therefore be viewed as a case of a real-life language resource evaluation with end users. In this article we describe Lärka's architecture, its user interface, and the five exercise types that have been released for users so far. The first user evaluation following in-class usage with students of linguistics, speech therapy and teacher candidates are presented. The outline of future work concludes the paper.},
}

% LREC 2014 paper on reusing the Swedish FrameNet for a semantic-role exercise.
% ISBN is stored without surrounding whitespace; proper nouns in the title are case-protected.
@inProceedings{pilan-volodina-2014-reusing-200967,
	title        = {Reusing {Swedish} {FrameNet} for training semantic roles},
	abstract     = {In this article we present the first experiences of reusing the Swedish FrameNet (SweFN) as a resource for training semantic roles. We
give an account of the procedure we used to adapt SweFN to the needs of students of Linguistics in the form of an automatically
generated exercise. During this adaptation, the mapping of the fine-grained distinction of roles from SweFN into learner-friendlier
coarse-grained roles presented a major challenge. Besides discussing the details of this mapping, we describe the resulting multiple-choice exercise and its graphical user interface. The exercise was made available through Lärka, an online platform for students of Linguistics and learners of Swedish as a second language. We outline also aspects underlying the selection of the incorrect answer
options which include semantic as well as frequency-based criteria. Finally, we present our own observations and initial user feedback
about the applicability of such a resource in the pedagogical domain. Students' answers indicated an overall positive experience, the
majority found the exercise useful for learning semantic roles.},
	booktitle    = {Proceedings of LREC 2014, May 26-31, 2014, Reykjavik, Iceland},
	author       = {Pilán, Ildikó and Volodina, Elena},
	year         = {2014},
	ISBN         = {978-2-9517408-8-4},
	pages        = {1359--1363},
}

@inproceedings{pilan-etal-2013-automatic-188465,
  author    = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard},
  title     = {Automatic Selection of Suitable Sentences  for Language Learning Exercises},
  booktitle = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, 11th  to 14th September 2013 Évora, Portugal, Proceedings.},
  year      = {2013},
  pages     = {218--225},
  isbn      = {978-1-908416-12-4},
  abstract  = {In this study we investigated second and foreign language (L2) sentence 
readability, an area little explored so far in the case of several languages, including 
Swedish. The outcome of our research consists of two methods for sentence selection 
from native language corpora based on Natural Language Processing (NLP) and 
machine learning (ML) techniques. The two approaches have been made available 
online within Lärka, an Intelligent CALL (ICALL) platform offering activities 
for language learners and students of linguistics. Such an automatic selection 
of suitable sentences can be valuable for L2 teachers during the creation of new 
teaching materials, for L2 students who look for additional self-study exercises as 
well as for lexicographers in search of example sentences to illustrate the meaning 
of a vocabulary item. Members from all these potential user groups evaluated our 
methods and found the majority of the sentences selected suitable for L2 learning 
purposes.},
}

% Whole proceedings volume (SLTC 2012 workshop on NLP for CALL); uses the standard proceedings
% entry type, which reads the editor, publisher and address fields.
@proceedings{borin-volodina-2012-proceedings-188679,
	title        = {Proceedings of the {SLTC} 2012 workshop on {NLP} for {CALL}},
	editor       = {Borin, Lars and Volodina, Elena},
	year         = {2012},
	publisher    = {LiU Electronic Press},
	address      = {Linköping},
}

% Whole proceedings volume (second NLP4CALL workshop); uses the standard proceedings entry type,
% which reads the editor, publisher and address fields.
@proceedings{volodina-etal-2013-proceedings-188675,
	title        = {Proceedings of the second workshop on {NLP} for computer-assisted language learning at {NODALIDA} 2013 {May} 22--24, 2013, {Oslo}, {Norway}},
	editor       = {Volodina, Elena and Borin, Lars and Loftsson, Hrafn},
	year         = {2013},
	publisher    = {Linköping University Press},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7519-588-9},
}

% Series name and number were split out of the booktitle string into the series/volume fields.
@inProceedings{volodina-etal-2013-towards-188549,
	title        = {Towards a gold standard for {Swedish} {CEFR-based} {ICALL}},
	abstract     = {In qualitative projects on ICALL (Intelligent Computer-Assisted Language Learning), research and development always go hand in hand: development both depends upon the research results and dictates the research agenda. Likewise, in the development of the Swedish ICALL platform Lärka, the practical issues of development have dictated its research agenda. With NLP approaches, sooner or later, the necessity for reliable training data becomes unavoidable. At the moment Lärka's research agenda cannot be addressed without access to reliable training data, so-called “gold standard”. This paper gives an overview of the current state of the Swedish ICALL platform development and related research agenda, and describes the first attempts to collect the reference corpus (“gold standard”) coming from course books used in CEFR-based language teaching.},
	booktitle    = {Proceedings of the Second Workshop on NLP for Computer-Assisted Language Learning. Nodalida 2013, Oslo, Norway},
	series       = {NEALT Proceedings Series},
	volume       = {17},
	author       = {Volodina, Elena and Pijetlovic, Dijana and Pilán, Ildikó and Johansson Kokkinakis, Sofie},
	year         = {2013},
	ISBN         = {978-91-7519-588-9},
}

@inProceedings{volodina-johanssonkokkinakis-2013-compiling-188550,
	title        = {Compiling a corpus of {CEFR-related} texts},
	abstract     = {This paper reports on initial efforts to compile a corpus of course book texts used for teaching CEFR-based courses of Swedish to adult immigrants. The research agenda behind compiling such a corpus comprises the study of normative “input” texts that can reveal a number of facts about what is being taught in terms of explicit grammar, receptive vocabulary, text and sentence readability; as well as build insights into linguistic characteristics of normative texts which can help anticipate learner performance in terms of active vocabulary, grammatical competence, etc. in classroom and testing settings. The CEFR “can-do” statements are known to offer flexibility in interpreting them for different languages and target groups. However, they are nonspecific and therefore it is difficult to associate different kinds of competences and levels of accuracy learners need in order to perform the communicative tasks with the different CEFR levels. To address this problem a systematic study needs to be performed for each individual language, both for “input” normative texts and “output” learner-produced texts. In this project we take the first step to collect and study normative texts for Swedish. The article describes the process of corpus compilation, annotation scheme of CEFR-relevant parameters, and methods proposed for text analysis, namely statistic and empiric methods, as well as techniques coming from computational linguistics/machine learning.},
	booktitle    = {Proceedings of the Language Testing and CEFR conference, Antwerpen, Belgium, May 27--29, 2013},
	author       = {Volodina, Elena and Johansson Kokkinakis, Sofie},
	year         = {2013},
}

@inProceedings{pijetlovic-volodina-2013-developing-188543,
	title        = {Developing a {Swedish} spelling game on an {ICALL} platform},
	abstract     = {In this project we developed web services on the ICALL platform Lärka for automatic generation of Swedish spelling exercises using Text-To-Speech (TTS) technology which allows L2 learners to train their spelling and listening individually performance based levels. The embedded avatar pronounces a random item of the desired level, which the user has to spell. Furthermore, the users have the possibility to train their own words for different linguistic levels. A result tracker containing a total and correct answer score keeps track of the language learner’s performance. In order to analyse typical spelling errors and provide better feedback, misspellings are collected in a database. The usability of the spelling exercises, concerning the different linguistic levels and the quality of speech, has been evaluated through a questionnaire with 10 participants.},
	booktitle    = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, Évora, Portugal, Proceedings},
	author       = {Pijetlovic, Dijana and Volodina, Elena},
	year         = {2013},
	ISBN         = {978-1-908416-12-4},
}

% Second author is Hrafn Loftsson (family name Loftsson). Editor, publisher and place were split out of the booktitle string.
@inProceedings{volodina-etal-2012-towards-168516,
	title        = {Towards a system architecture for {ICALL}},
	abstract     = {In this paper, we present an on-going project whose overall aim is to develop open-source system architecture for supporting ICALL systems that will facilitate re-use of existing NLP tools and resources on a plug-and-play basis. We introduce the project, describe the approaches adopted by the two language teams, and present two applications being developed using the proposed architecture.},
	booktitle    = {Proceedings of the 20th International Conference on Computers in Education},
	editor       = {Biswas, G. and others},
	author       = {Volodina, Elena and Loftsson, Hrafn and Arnbjörnsdóttir, Birna and Borin, Lars and Leifsson, Guðmundur Örn},
	year         = {2012},
	volume       = {2012},
	publisher    = {Asia-Pacific Society for Computers in Education},
	address      = {Singapore},
	ISBN         = {978-981-07-4649-0},
}

% Editors, publisher and place were split out of the booktitle string into their own fields.
@inProceedings{volodina-borin-2012-developing-168523,
	title        = {Developing an Open-Source Web-Based Exercise Generator for {Swedish}},
	abstract     = {This paper reports on the ongoing international project System architecture for ICALL and the progress made by the Swedish partner. The Swedish team is developing a web-based exercise generator reusing available annotated corpora and lexical resources. Apart from the technical issues like implementation of the user interface and the underlying processing machinery, a number of interesting pedagogical questions need to be solved, e.g., adapting learner-oriented exercises to proficiency levels; selecting authentic examples of an appropriate difficulty level; automatically ranking corpus examples by their quality; providing feedback to the learner, and selecting vocabulary for training domain-specific, academic or general-purpose vocabulary. In this paper we describe what has been done so far, mention the exercise types that can be generated at the moment as well as describe the tasks left for the future.},
	booktitle    = {CALL: Using, Learning, Knowing. EuroCALL Conference, Gothenburg, Sweden, 22--25 August 2012, Proceedings},
	editor       = {Bradley, Linda and Thouësny, Sylvie},
	author       = {Volodina, Elena and Borin, Lars},
	year         = {2012},
	volume       = {2012},
	publisher    = {Research-publishing.net},
	address      = {Dublin, Ireland},
	ISBN         = {978-1-908416-03-2},
}

% Editors, publisher and place were split out of the booktitle string into their own fields.
@inProceedings{charalabopoulou-etal-2012-building-168525,
	title        = {Building Corpus-Informed Word Lists for {L2} Vocabulary Learning in Nine Languages},
	abstract     = {Lexical competence constitutes a crucial aspect in L2 learning, since building a rich repository of words is considered indispensable for successful communication. CALL practitioners have experimented with various kinds of computer-mediated glosses to facilitate L2 vocabulary building in the context of incidental vocabulary learning. Intentional learning, on the other hand, is generally underestimated, since it is considered out of fashion and not in line with the communicative L2 learning paradigm. Yet, work is still being done in this area and substantial body of research indicates that the usefulness of incidental vocabulary learning does not exclude the use of dedicated vocabulary study and that by using aids explicitly geared to building vocabularies (such as word lists and word cards) L2 learners exhibit good retention rates and faster learning gains. Intentional vocabulary study should, therefore, have its place in the instructional and learning context. Regardless of the approach, incidental or intentional, the crucial question with respect to vocabulary teaching/learning remains: which and how many words should we teach/learn at different language levels? An attempt to answer the above question was made within the framework of the EU-funded project titled “KELLY” (Keywords for Language Learning for Young and Adults Alike) presented here. The project aimed at building corpus-informed vocabulary lists for L2 learners ranging from A1 to C2 levels for nine languages: Arabic, Chinese, English, Greek, Italian, Norwegian, Polish, Russian and Swedish.},
	booktitle    = {CALL: Using, Learning, Knowing. EuroCALL Conference, Gothenburg, Sweden, 22--25 August 2012, Proceedings},
	editor       = {Bradley, Linda and Thouësny, Sylvie},
	author       = {Charalabopoulou, Frieda and Gavrilidou, Maria and Johansson Kokkinakis, Sofie and Volodina, Elena},
	year         = {2012},
	volume       = {2012},
	publisher    = {Research-publishing.net},
	address      = {Dublin, Ireland},
	ISBN         = {978-1-908416-03-2},
}

% Conference contribution (eLex 2011): the venue string was in the journal field, so it is typed as a proceedings paper with the venue in booktitle.
@inProceedings{johanssonkokkinakis-volodina-2011-corpus-148533,
	title        = {Corpus-based approaches for the creation of a frequency based vocabulary list in the {EU} project {KELLY} – issues on reliability, validity and coverage},
	abstract     = {At present there are relatively few vocabulary lists for Swedish describing modern vocabulary as well as being adapted to language learners’ needs. In Europe including Sweden there exist approaches to unify ways of working consistently with language learning, one example worth naming in this respect is the Common European Framework of Reference (CEFR) which provides guidelines for systematic approach to language teaching and assessment of language proficiency. This article describes EU project Kelly (KEywords for Language Learning for Young and adults alike, 2009-2012), the main objective of which was to create vocabulary lists for nine languages (Swedish, English, Norwegian, Greek, Italian, Polish, Arabic, Chinese and Russian) and adapt them to CEFR levels. We describe the process of compiling and validating the Swedish Kelly-list, dwell on benefits and limitations of using a corpus based approach in this project; as well as mention the impact of the methodological approach for compiling vocabulary lists for specific purposes.},
	booktitle    = {eLex, 10--12 November 2011, Slovenia},
	author       = {Johansson Kokkinakis, Sofie and Volodina, Elena},
	year         = {2011},
	volume       = {2011},
}

@inProceedings{volodina-johanssonkokkinakis-2012-introducing-154723,
	title        = {Introducing {Swedish} {Kelly-list}, a new free e-resource for {Swedish}},
	abstract     = {Frequency lists and/or lexicons contain information about the words and their statistics. They tend to find their “readers” among linguists, lexicographers, language teachers. Making them available in electronic format helps to expand the target group to cover language engineers, computer programmers and other specialists working in such areas as information retrieval, spam filtering, text readability analysis, test generation, etc. This article describes a new freely available electronic frequency list of modern Swedish that was created in the EU project KELLY. We describe the state of affairs for Swedish frequency lexicons; provide a short description of the KELLY project; mention the corpus the list has been derived from. Further, we dwell on the type of information the list contains, describe shortly the steps for list generation; provide information on the coverage and some other statistics over the items in the list. Finally, some practical information on the license for the Swedish Kelly-list distribution is given; potential application areas are suggested; and future plans for its expansion are mentioned. We hope that with some publicity we can help this list find its users.},
	booktitle    = {{LREC} 2012 Proceedings},
	author       = {Volodina, Elena and Johansson Kokkinakis, Sofie},
	year         = {2012},
	volume       = {2012},
}

@inProceedings{volodina-etal-2012-waste-165936,
	title        = {Waste not, want not: Towards a system architecture for {ICALL} based on {NLP} component re-use},
	booktitle    = {Proceedings of the {SLTC} 2012 workshop on {NLP} for {CALL}, Lund, 25th October, 2012},
	author       = {Volodina, Elena and Borin, Lars and Loftsson, Hrafn and Arnbjörnsdóttir, Birna and Leifsson, Guðmundur Örn},
	year         = {2012},
	pages        = {47--58},
}

% Technical reports require an "institution" field; the issuing university was previously stored as "publisher".
@techreport{volodina-johanssonkokkinakis-2012-swedish-165964,
	title        = {{Swedish} {Kelly}: Technical Report},
	author       = {Volodina, Elena and Johansson Kokkinakis, Sofie},
	year         = {2012},
	institution  = {University of Gothenburg},
	address      = {Göteborg},
}

% The hash signs in the abstract are escaped so the field survives being passed through LaTeX (for example via a biber-generated bbl file).
@inProceedings{volodina-etal-2012-semi-165961,
	title        = {Semi-automatic selection of best corpus examples for {Swedish}: Initial algorithm evaluation},
	abstract     = {The study presented here describes the results of the initial evaluation of two sorting approaches to automatic ranking of corpus examples for Swedish. Representatives from two potential target user groups have been asked to rate top three hits per approach for sixty search items from the point of view of the needs of their professional target groups, namely second/foreign language (L2) teachers and lexicographers. This evaluation has shown, on the one hand, which of the two approaches to example rating (called in the text below algorithms \#1 and \#2) performs better in terms of finding better examples for each target user group; and on the other hand, which features evaluators associate with good examples. It has also facilitated statistic analysis of the “good” versus “bad” examples with reference to the measurable features, such as sentence length, word length, lexical frequency profiles, PoS constitution, dependency structure, etc. with a potential to find out new reliable classifiers.},
	booktitle    = {Proceedings of the {SLTC} 2012 workshop on {NLP} for {CALL}, Lund, 25th October, 2012},
	author       = {Volodina, Elena and Johansson, Richard and Johansson Kokkinakis, Sofie},
	year         = {2012},
	number       = {080},
	pages        = {59--70},
}

@book{volodina-2010-corpora-127225,
	title        = {Corpora in Language Classroom: Reusing {Stockholm Umeå Corpus} in a vocabulary exercise generator},
	abstract     = {Authentic examples as teaching material are not easy to obtain. Corpora are able to solve this problem, as has been witnessed before. Most experiments with corpora in language classroom describe concordances. However, there are numerous other ways of bringing corpora into language education, as shown in this research. A selective learner-oriented exercise generator has been implemented on the basis of Stockholm Umeå Corpus (SUC). SUC texts have been tested for readability and levels were assigned. This generator assists in automatic selection of authentic examples of appropriate learner levels as well as in construction of wordbank-, multiple choice items and c-tests for a specified proficiency level, frequency band and word class. In Vocabulary Size Test potential words are generated on the basis of existing morphemes and SUC-based frequency lists. Interesting practical and theoretical questions connected with reusage of corpora in an exercise generator are described in this book. The research might inspire computational linguists, language teachers and everyone interested in Computer-Assisted Language Learning and Corpus Linguistics to test similar techniques in their practices.},
	author       = {Volodina, Elena},
	year         = {2010},
	publisher    = {Lambert Academic Publishing},
	address      = {Saarbrücken},
	ISBN         = {978-3-8433-5256-7},
}