% ---------------------------------------------------------------------------
% 2018 publications (language-learning / learner-corpus work).
%
% Maintenance notes:
%  * Each citation key is defined once in this section. Earlier revisions
%    repeated the same eleven entries verbatim, which makes BibTeX/Biber
%    report "repeated entry".
%  * Citation keys are kept exactly as exported (including non-ASCII
%    characters) because documents cite them; this requires a UTF-8 aware
%    toolchain.
%  * Recurring publisher and venue names are defined once as string macros.
% ---------------------------------------------------------------------------

@string{pubACL         = {Association for Computational Linguistics}}
@string{pubLiUEP       = {Linköping University Electronic Press}}
@string{procCLARIN2018 = {Proceedings of {CLARIN}-2018 conference, 8--10 October 2018, Pisa, Italy}}
% NOTE(review): the NLP4CALL workshop "at SLTC" below is dated 7th November
% 2018, while this venue string says October 7--9 -- confirm the SLTC dates.
@string{procSLTC2018   = {Proceedings of {SLTC} 2018, Stockholm, October 7--9, 2018}}

@inproceedings{Alfter-David2018-275368,
  author    = {Alfter, David and Volodina, Elena},
  title     = {Towards Single Word Lexical Complexity Prediction},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of {NLP} for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  year      = {2018},
  publisher = pubACL,
  address   = {Stroudsburg, PA},
  isbn      = {978-1-948087-11-7},
  abstract  = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
}

@inproceedings{Pilán-Ildikó2018-275367,
  author    = {Pilán, Ildikó and Volodina, Elena},
  title     = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, {COLING}, Santa Fe, New Mexico, USA, August 25, 2018},
  year      = {2018},
  publisher = pubACL,
  isbn      = {978-1-948087-62-9},
  abstract  = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
}

@inproceedings{Pilán-Ildikó2018-275366,
  author    = {Pilán, Ildikó and Volodina, Elena},
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  booktitle = {Proceedings of the Second Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, {COLING}, Santa Fe, New Mexico, USA, August 25, 2018},
  year      = {2018},
  publisher = pubACL,
  isbn      = {978-1-948087-61-2},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
}

@inproceedings{Volodina-Elena2018-275365,
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  title     = {Interoperability of Second Language Resources and Tools},
  booktitle = procCLARIN2018,
  year      = {2018},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
}

@inproceedings{Alfter-David2018-275364,
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  title     = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  booktitle = procCLARIN2018,
  year      = {2018},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
}

@inproceedings{Rosén-Dan2018-275363,
  author    = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  title     = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  booktitle = procCLARIN2018,
  year      = {2018},
  abstract  = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
}

@inproceedings{Alfter-David2018-275362,
  author    = {Alfter, David and Volodina, Elena},
  title     = {Is the whole greater than the sum of its parts? {A} corpus-based pilot study of the lexical complexity in multi-word expressions},
  booktitle = procSLTC2018,
  year      = {2018},
  abstract  = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
}

@inproceedings{Volodina-Elena2018-275361,
  author    = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  title     = {Annotation of learner corpora: first {SweLL} insights},
  booktitle = procSLTC2018,
  year      = {2018},
  abstract  = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
}

% The volume editors used to be appended to the booktitle ("/ edited by ...");
% they now live in the editor field so that styles can format them.
@inproceedings{Megyesi-Beata2018-275359,
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  booktitle = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018) at {SLTC}, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = pubLiUEP,
  address   = {Linköping},
  isbn      = {978-91-7685-173-9},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
}

% Whole proceedings volume: typed as proceedings, with the four people who
% were previously listed as authors recorded as editors (the sibling paper
% entry above names the same four as the volume editors).
@proceedings{Pilán-Ildikó2018-275358,
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  title     = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018), {SLTC}, {Stockholm}, 7th November 2018},
  year      = {2018},
  publisher = pubLiUEP,
  address   = {Linköping},
  isbn      = {978-91-7685-173-9},
  abstract  = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
}

% NOTE(review): the export had no journal field and stored the article id
% "ecy022" in the volume field. The journal name and DOI below are inferred
% from that id (it looks like an Oxford University Press advance-article id
% for the International Journal of Lexicography) -- confirm before relying on
% them. Year and pages are kept as exported and presumably describe the
% advance-access version; update to the final volume/issue/pages if desired.
% "Local Partners" is a collective author and is braced so that it is not
% parsed as a person's family and given name.
@article{Kosem-Iztok2018-275354,
  author   = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  title    = {The image of the monolingual dictionary across {Europe}. {Results} of the {European} survey of dictionary use and culture},
  journal  = {International Journal of Lexicography},
  year     = {2018},
  pages    = {1--23},
  doi      = {10.1093/ijl/ecy022},
  abstract = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. The present report covers the general section.},
}

% NOTE(review): the entry that starts below continues beyond the end of this
% section, so it is left exactly as found. It re-uses the key of the
% proceedings entry above and should be removed together with the rest of
% its text once the remainder of the file is reviewed.
@misc{Pilán-Ildikó2018-275358, title = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018 }, abstract = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. 
As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.}, author = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars}, year = {2018}, publisher = {Linköping University Electronic Press}, adress = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @article{Kosem-Iztok2018-275354, title = {The image of the monolingual dictionary across Europe. Results of the European survey of dictionary use and culture}, abstract = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. 
The present report covers the general section}, author = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer , Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and Partners , Local}, year = {2018}, volume = {ecy022}, pages = {1--23}, } @inProceedings{Alfter-David2018-275368, title = {Towards Single Word Lexical Complexity Prediction.}, abstract = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.}, booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018}, author = {Alfter, David and Volodina, Elena}, year = {2018}, publisher = {Association of Computational Linguistics}, adress = {Stroudsburg, PA }, ISBN = {978-1-948087-11-7}, } @inProceedings{Pilán-Ildikó2018-275367, title = {Investigating the importance of linguistic complexity features across different datasets related to language learning.}, abstract = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. 
texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consid- eration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions. }, booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018.}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2018}, publisher = {Association of Computational Linguistics }, ISBN = {978-1-948087-62-9}, } @inProceedings{Pilán-Ildikó2018-275366, title = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors.}, abstract = {The presence of misspellings and other errors or non-standard word forms poses a consider- able challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We in- vestigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. 
We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors. }, booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018.}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2018}, publisher = {Association of Computation Linguistics }, ISBN = {978-1-948087-61-2}, } @inProceedings{Volodina-Elena2018-275365, title = {Interoperability of Second Language Resources and Tools}, abstract = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.}, booktitle = {Proceedings of CLARIN-2018 conference}, author = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad }, year = {2018}, } @inProceedings{Alfter-David2018-275364, title = {From Language Learning Platform to Infrastructure for Research on Language Learning}, abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus- based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. 
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.}, booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy}, author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2018}, } @inProceedings{Rosén-Dan2018-275363, title = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora. }, abstract = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.}, booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy}, author = {Rosén, Dan and Wirén, Mats and Volodina, Elena}, year = {2018}, } @inProceedings{Alfter-David2018-275362, title = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions.}, abstract = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. 
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.}, booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018}, author = {Alfter, David and Volodina, Elena}, year = {2018}, } @inProceedings{Volodina-Elena2018-275361, title = {Annotation of learner corpora: first SweLL insights.}, abstract = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation. }, booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018}, author = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats}, year = {2018}, } @inProceedings{Megyesi-Beata2018-275359, title = {Learner Corpus Anonymization in the Age of GDPR: Insights from the Creation of a Learner Corpus of Swedish}, abstract = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. 
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.}, booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018 / edited by Ildikó Pilán, Elena Volodina, David Alfter and Lars Borin}, author = {Megyesi, Beata and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén , Mats and Volodina, Elena}, year = {2018}, publisher = {Linköping University Electronic Press}, adress = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @misc{Pilán-Ildikó2018-275358, title = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018 }, abstract = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. 
As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.}, author = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars}, year = {2018}, publisher = {Linköping University Electronic Press}, adress = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @article{Kosem-Iztok2018-275354, title = {The image of the monolingual dictionary across Europe. Results of the European survey of dictionary use and culture}, abstract = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. 
The present report covers the general section}, author = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer , Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and Partners , Local}, year = {2018}, volume = {ecy022}, pages = {1--23}, } @inProceedings{Alfter-David2018-275368, title = {Towards Single Word Lexical Complexity Prediction.}, abstract = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.}, booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018}, author = {Alfter, David and Volodina, Elena}, year = {2018}, publisher = {Association of Computational Linguistics}, adress = {Stroudsburg, PA }, ISBN = {978-1-948087-11-7}, } @inProceedings{Pilán-Ildikó2018-275367, title = {Investigating the importance of linguistic complexity features across different datasets related to language learning.}, abstract = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. 
texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consid- eration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions. }, booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018.}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2018}, publisher = {Association of Computational Linguistics }, ISBN = {978-1-948087-62-9}, } @inProceedings{Pilán-Ildikó2018-275366, title = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors.}, abstract = {The presence of misspellings and other errors or non-standard word forms poses a consider- able challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We in- vestigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. 
We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors. }, booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018.}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2018}, publisher = {Association of Computation Linguistics }, ISBN = {978-1-948087-61-2}, } @inProceedings{Volodina-Elena2018-275365, title = {Interoperability of Second Language Resources and Tools}, abstract = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.}, booktitle = {Proceedings of CLARIN-2018 conference}, author = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad }, year = {2018}, } @inProceedings{Alfter-David2018-275364, title = {From Language Learning Platform to Infrastructure for Research on Language Learning}, abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus- based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. 
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.}, booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy}, author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2018}, } @inProceedings{Rosén-Dan2018-275363, title = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora. }, abstract = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.}, booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy}, author = {Rosén, Dan and Wirén, Mats and Volodina, Elena}, year = {2018}, } @inProceedings{Alfter-David2018-275362, title = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions.}, abstract = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. 
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.}, booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018}, author = {Alfter, David and Volodina, Elena}, year = {2018}, } @inProceedings{Volodina-Elena2018-275361, title = {Annotation of learner corpora: first SweLL insights.}, abstract = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation. }, booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018}, author = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats}, year = {2018}, } @inProceedings{Megyesi-Beata2018-275359, title = {Learner Corpus Anonymization in the Age of GDPR: Insights from the Creation of a Learner Corpus of Swedish}, abstract = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. 
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.}, booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018 / edited by Ildikó Pilán, Elena Volodina, David Alfter and Lars Borin}, author = {Megyesi, Beata and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén , Mats and Volodina, Elena}, year = {2018}, publisher = {Linköping University Electronic Press}, adress = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @misc{Pilán-Ildikó2018-275358, title = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018 }, abstract = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. 
As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.}, author = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars}, year = {2018}, publisher = {Linköping University Electronic Press}, adress = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @article{Kosem-Iztok2018-275354, title = {The image of the monolingual dictionary across Europe. Results of the European survey of dictionary use and culture}, abstract = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. 
The present report covers the general section}, author = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer , Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and Partners , Local}, year = {2018}, volume = {ecy022}, pages = {1--23}, } @inProceedings{Alfter-David2018-275368, title = {Towards Single Word Lexical Complexity Prediction.}, abstract = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.}, booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018}, author = {Alfter, David and Volodina, Elena}, year = {2018}, publisher = {Association of Computational Linguistics}, adress = {Stroudsburg, PA }, ISBN = {978-1-948087-11-7}, } @inProceedings{Pilán-Ildikó2018-275367, title = {Investigating the importance of linguistic complexity features across different datasets related to language learning.}, abstract = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. 
texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions. },
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018.},
  author = {Pilán, Ildikó and Volodina, Elena},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  ISBN = {978-1-948087-62-9},
}

% Pilán and Volodina (2018), unsupervised correction of learner errors. NOTE(review): duplicate key in this file.
@inProceedings{Pilán-Ildikó2018-275366,
  title = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors.},
  abstract = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. 
We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors. },
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018.},
  author = {Pilán, Ildikó and Volodina, Elena},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  ISBN = {978-1-948087-61-2},
}

% Volodina et al. (2018), interoperability of second language resources. NOTE(review): duplicate key in this file.
@inProceedings{Volodina-Elena2018-275365,
  title = {Interoperability of Second Language Resources and Tools},
  abstract = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference},
  author = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad },
  year = {2018},
}

% Alfter et al. (2018), Lärka as research infrastructure. NOTE(review): duplicate key in this file.
@inProceedings{Alfter-David2018-275364,
  title = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. 
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
  author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year = {2018},
}

% Rosén, Wirén and Volodina (2018), error coding via parallel-corpus alignment. NOTE(review): duplicate key in this file.
@inProceedings{Rosén-Dan2018-275363,
  title = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora. },
  abstract = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year = {2018},
}

% Alfter and Volodina (2018), lexical complexity of multi-word expressions. NOTE(review): duplicate key in this file.
@inProceedings{Alfter-David2018-275362,
  title = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions.},
  abstract = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. 
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  author = {Alfter, David and Volodina, Elena},
  year = {2018},
}

% Volodina et al. (2018), first insights from SweLL annotation. NOTE(review): duplicate key in this file.
@inProceedings{Volodina-Elena2018-275361,
  title = {Annotation of learner corpora: first SweLL insights.},
  abstract = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation. },
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  author = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year = {2018},
}

% Megyesi et al. (2018), learner corpus anonymization under GDPR. NOTE(review): duplicate key in this file.
@inProceedings{Megyesi-Beata2018-275359,
  title = {Learner Corpus Anonymization in the Age of GDPR: Insights from the Creation of a Learner Corpus of Swedish},
  abstract = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. 
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018 / edited by Ildikó Pilán, Elena Volodina, David Alfter and Lars Borin},
  author = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year = {2018},
  publisher = {Linköping University Electronic Press},
  address = {Linköpings universitet},
  ISBN = {978-91-7685-173-9},
}

% NLP4CALL 2018 workshop proceedings (whole volume). NOTE(review): duplicate key in this file.
@misc{Pilán-Ildikó2018-275358,
  title = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018 },
  abstract = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. 
As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  author = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year = {2018},
  publisher = {Linköping University Electronic Press},
  address = {Linköpings universitet},
  ISBN = {978-91-7685-173-9},
}

% Kosem et al. (2018), European survey of monolingual dictionary use. NOTE(review): duplicate key in this file.
@article{Kosem-Iztok2018-275354,
  title = {The image of the monolingual dictionary across Europe. Results of the European survey of dictionary use and culture},
  abstract = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. 
The present report covers the general section},
  author = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  year = {2018},
  volume = {ecy022},
  pages = {1--23},
}
% NOTE(review): the article entry above has volume "ecy022", which looks like an article id rather than a volume number, and no journal field; confirm.

% Alfter and Volodina (2018), single-word lexical complexity prediction. NOTE(review): duplicate key in this file.
@inProceedings{Alfter-David2018-275368,
  title = {Towards Single Word Lexical Complexity Prediction.},
  abstract = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author = {Alfter, David and Volodina, Elena},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  address = {Stroudsburg, PA},
  ISBN = {978-1-948087-11-7},
}

% Pilán and Volodina (2018), linguistic complexity features across datasets. NOTE(review): duplicate key in this file.
@inProceedings{Pilán-Ildikó2018-275367,
  title = {Investigating the importance of linguistic complexity features across different datasets related to language learning.},
  abstract = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. 
texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions. },
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018.},
  author = {Pilán, Ildikó and Volodina, Elena},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  ISBN = {978-1-948087-62-9},
}

% Alfter and Volodina (2018), single-word lexical complexity prediction. NOTE(review): duplicate key in this file.
@inProceedings{Alfter-David2018-275368,
  title = {Towards Single Word Lexical Complexity Prediction.},
  abstract = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. 
In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author = {Alfter, David and Volodina, Elena},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  address = {Stroudsburg, PA},
  ISBN = {978-1-948087-11-7},
}

% Pilán and Volodina (2018), unsupervised correction of learner errors. NOTE(review): duplicate key in this file.
@inProceedings{Pilán-Ildikó2018-275366,
  title = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors.},
  abstract = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors. 
},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018.},
  author = {Pilán, Ildikó and Volodina, Elena},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  ISBN = {978-1-948087-61-2},
}

% Pilán and Volodina (2018), linguistic complexity features across datasets. NOTE(review): duplicate key in this file.
@inProceedings{Pilán-Ildikó2018-275367,
  title = {Investigating the importance of linguistic complexity features across different datasets related to language learning.},
  abstract = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions. },
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018.},
  author = {Pilán, Ildikó and Volodina, Elena},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  ISBN = {978-1-948087-62-9},
}

% Volodina et al. (2018), interoperability of second language resources. NOTE(review): duplicate key in this file.
@inProceedings{Volodina-Elena2018-275365,
  title = {Interoperability of Second Language Resources and Tools},
  abstract = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. 
In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference},
  author = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad },
  year = {2018},
}

% Pilán and Volodina (2018), unsupervised correction of learner errors. NOTE(review): duplicate key in this file.
@inProceedings{Pilán-Ildikó2018-275366,
  title = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors.},
  abstract = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors. 
},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018.},
  author = {Pilán, Ildikó and Volodina, Elena},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  ISBN = {978-1-948087-61-2},
}

% Alfter et al. (2018), Lärka as research infrastructure. NOTE(review): duplicate key in this file.
@inProceedings{Alfter-David2018-275364,
  title = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
  author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year = {2018},
}

% Volodina et al. (2018), interoperability of second language resources. NOTE(review): duplicate key in this file.
@inProceedings{Volodina-Elena2018-275365,
  title = {Interoperability of Second Language Resources and Tools},
  abstract = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. 
A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference},
  author = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad },
  year = {2018},
}

% Rosén, Wirén and Volodina (2018), error coding via parallel-corpus alignment. NOTE(review): duplicate key in this file.
@inProceedings{Rosén-Dan2018-275363,
  title = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora. },
  abstract = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year = {2018},
}

% Alfter et al. (2018), Lärka as research infrastructure. NOTE(review): duplicate key in this file.
@inProceedings{Alfter-David2018-275364,
  title = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. 
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
  author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year = {2018},
}

% Alfter and Volodina (2018), lexical complexity of multi-word expressions. NOTE(review): duplicate key in this file.
@inProceedings{Alfter-David2018-275362,
  title = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions.},
  abstract = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  author = {Alfter, David and Volodina, Elena},
  year = {2018},
}

% Rosén, Wirén and Volodina (2018), error coding via parallel-corpus alignment. NOTE(review): duplicate key in this file.
@inProceedings{Rosén-Dan2018-275363,
  title = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora. },
  abstract = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. 
Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year = {2018},
}

% Volodina et al. (2018), first insights from SweLL annotation. NOTE(review): duplicate key in this file.
@inProceedings{Volodina-Elena2018-275361,
  title = {Annotation of learner corpora: first SweLL insights.},
  abstract = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation. },
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  author = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year = {2018},
}

% Alfter and Volodina (2018), lexical complexity of multi-word expressions. NOTE(review): duplicate key in this file.
@inProceedings{Alfter-David2018-275362,
  title = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions.},
  abstract = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  author = {Alfter, David and Volodina, Elena},
  year = {2018},
}

% Volodina et al. (2018), first insights from SweLL annotation. NOTE(review): duplicate key in this file.
@inProceedings{Volodina-Elena2018-275361,
  title = {Annotation of learner corpora: first SweLL insights.},
  abstract = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. 
Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation. },
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  author = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year = {2018},
}

% Megyesi et al. (2018), learner corpus anonymization under GDPR. NOTE(review): duplicate key in this file.
@inProceedings{Megyesi-Beata2018-275359,
  title = {Learner Corpus Anonymization in the Age of GDPR: Insights from the Creation of a Learner Corpus of Swedish},
  abstract = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. 
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018 / edited by Ildikó Pilán, Elena Volodina, David Alfter and Lars Borin},
  author = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year = {2018},
  publisher = {Linköping University Electronic Press},
  address = {Linköpings universitet},
  ISBN = {978-91-7685-173-9},
}

% NLP4CALL 2018 workshop proceedings (whole volume). NOTE(review): duplicate key in this file.
@misc{Pilán-Ildikó2018-275358,
  title = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018 },
  abstract = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. 
As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  author = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year = {2018},
  publisher = {Linköping University Electronic Press},
  address = {Linköpings universitet},
  ISBN = {978-91-7685-173-9},
}

% Megyesi et al. (2018), learner corpus anonymization under GDPR. NOTE(review): duplicate key in this file.
@inProceedings{Megyesi-Beata2018-275359,
  title = {Learner Corpus Anonymization in the Age of GDPR: Insights from the Creation of a Learner Corpus of Swedish},
  abstract = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. 
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018 / edited by Ildikó Pilán, Elena Volodina, David Alfter and Lars Borin},
  author = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year = {2018},
  publisher = {Linköping University Electronic Press},
  address = {Linköpings universitet},
  ISBN = {978-91-7685-173-9},
}

% Kosem et al. (2018), European survey of monolingual dictionary use. NOTE(review): duplicate key in this file.
@article{Kosem-Iztok2018-275354,
  title = {The image of the monolingual dictionary across Europe. Results of the European survey of dictionary use and culture},
  abstract = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. 
The present report covers the general section},
  author    = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  journal   = {International Journal of Lexicography},
  year      = {2018},
  volume    = {ecy022},
  pages     = {1--23},
  doi       = {10.1093/ijl/ecy022},
}

% NOTE(review): the record below describes a whole proceedings volume whose
% "authors" look like its editors; a proceedings type with an editor field may
% fit better. Type and fields are left as exported; confirm before changing.
@misc{Pilán-Ildikó2018-275358,
  title     = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018), {SLTC}, {Stockholm}, 7th {November} 2018},
  abstract  = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  author    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköpings universitet},
  isbn      = {978-91-7685-173-9},
}

@article{Kosem-Iztok2018-275354,
  title     = {The image of the monolingual dictionary across {Europe}. {Results} of the {European} survey of dictionary use and culture},
  abstract  = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level.
The present report covers the general section},
  author    = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  journal   = {International Journal of Lexicography},
  year      = {2018},
  volume    = {ecy022},
  pages     = {1--23},
  doi       = {10.1093/ijl/ecy022},
}

@inproceedings{Alfter-David2018-275368,
  title     = {Towards Single Word Lexical Complexity Prediction},
  abstract  = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-948087-11-7},
}

@inproceedings{Pilán-Ildikó2018-275367,
  title     = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
  abstract  = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-62-9},
}

@inproceedings{Pilán-Ildikó2018-275366,
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative.
We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-61-2},
}

@inproceedings{Volodina-Elena2018-275365,
  title     = {Interoperability of Second Language Resources and Tools},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  year      = {2018},
}

@inproceedings{Alfter-David2018-275364,
  title     = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Rosén-Dan2018-275363,
  title     = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract  = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author    = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Alfter-David2018-275362,
  title     = {Is the whole greater than the sum of its parts? {A} corpus-based pilot study of the lexical complexity in multi-word expressions},
  abstract  = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known.
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Volodina-Elena2018-275361,
  title     = {Annotation of learner corpora: first {SweLL} insights},
  abstract  = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  author    = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year      = {2018},
}

@inproceedings{Megyesi-Beata2018-275359,
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköpings universitet},
  isbn      = {978-91-7685-173-9},
}

% NOTE(review): the record below describes a whole proceedings volume; a
% proceedings type may fit better than @misc. Type left as exported; confirm
% before changing.
@misc{Pilán-Ildikó2018-275358,
  title     = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018), {SLTC}, {Stockholm}, 7th {November} 2018},
  abstract  = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL.
As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  author    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköpings universitet},
  isbn      = {978-91-7685-173-9},
}

@article{Kosem-Iztok2018-275354,
  title     = {The image of the monolingual dictionary across {Europe}. {Results} of the {European} survey of dictionary use and culture},
  abstract  = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. The present report covers the general section},
  author    = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  journal   = {International Journal of Lexicography},
  year      = {2018},
  volume    = {ecy022},
  pages     = {1--23},
  doi       = {10.1093/ijl/ecy022},
}

@inproceedings{Alfter-David2018-275368,
  title     = {Towards Single Word Lexical Complexity Prediction},
  abstract  = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-948087-11-7},
}

@inproceedings{Pilán-Ildikó2018-275367,
  title     = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
  abstract  = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs.
texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-62-9},
}

@inproceedings{Pilán-Ildikó2018-275366,
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-61-2},
}

@inproceedings{Volodina-Elena2018-275365,
  title     = {Interoperability of Second Language Resources and Tools},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  year      = {2018},
}

@inproceedings{Alfter-David2018-275364,
  title     = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics.
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Rosén-Dan2018-275363,
  title     = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract  = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author    = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Alfter-David2018-275362,
  title     = {Is the whole greater than the sum of its parts? {A} corpus-based pilot study of the lexical complexity in multi-word expressions},
  abstract  = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Volodina-Elena2018-275361,
  title     = {Annotation of learner corpora: first {SweLL} insights},
  abstract  = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018},
  author    = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year      = {2018},
}

@inproceedings{Megyesi-Beata2018-275359,
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other.
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköpings universitet},
  isbn      = {978-91-7685-173-9},
}

% NOTE(review): the record below describes a whole proceedings volume whose
% "authors" look like its editors; a proceedings type with an editor field may
% fit better. Type and fields are left as exported; confirm before changing.
@misc{Pilán-Ildikó2018-275358,
  title     = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018), {SLTC}, {Stockholm}, 7th {November} 2018},
  abstract  = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  author    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköpings universitet},
  isbn      = {978-91-7685-173-9},
}

@article{Kosem-Iztok2018-275354,
  title     = {The image of the monolingual dictionary across {Europe}. {Results} of the {European} survey of dictionary use and culture},
  abstract  = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level.
The present report covers the general section},
  author    = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  journal   = {International Journal of Lexicography},
  year      = {2018},
  volume    = {ecy022},
  pages     = {1--23},
  doi       = {10.1093/ijl/ecy022},
}

@inproceedings{Alfter-David2018-275368,
  title     = {Towards Single Word Lexical Complexity Prediction},
  abstract  = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-948087-11-7},
}

@inproceedings{Pilán-Ildikó2018-275367,
  title     = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
  abstract  = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-62-9},
}

@inproceedings{Pilán-Ildikó2018-275366,
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative.
We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors. }, booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018.}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2018}, publisher = {Association of Computation Linguistics }, ISBN = {978-1-948087-61-2}, } @inProceedings{Volodina-Elena2018-275365, title = {Interoperability of Second Language Resources and Tools}, abstract = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.}, booktitle = {Proceedings of CLARIN-2018 conference}, author = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad }, year = {2018}, } @inProceedings{Alfter-David2018-275364, title = {From Language Learning Platform to Infrastructure for Research on Language Learning}, abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus- based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. 
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year      = {2018},
}

% Rosén, Wirén & Volodina (2018): error coding via aligned parallel corpora.
@inproceedings{Rosén-Dan2018-275363,
  title     = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract  = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8--10 October 2018, Pisa, Italy},
  author    = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year      = {2018},
}

% Alfter & Volodina (2018): lexical complexity of multi-word expressions.
@inproceedings{Alfter-David2018-275362,
  title     = {Is the whole greater than the sum of its parts? {A} corpus-based pilot study of the lexical complexity in multi-word expressions},
  abstract  = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known.
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
}

% Volodina et al. (2018): SweLL learner-corpus annotation.
@inproceedings{Volodina-Elena2018-275361,
  title     = {Annotation of learner corpora: first {SweLL} insights},
  abstract  = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  author    = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year      = {2018},
}

% Megyesi et al. (2018): learner-corpus anonymization under GDPR.
@inproceedings{Megyesi-Beata2018-275359,
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other.
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-173-9},
}

% NLP4CALL 2018 proceedings volume; the listed people are its editors.
@proceedings{Pilán-Ildikó2018-275358,
  title     = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018), {SLTC}, {Stockholm}, 7th {November} 2018},
  abstract  = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-173-9},
}

% Kosem et al. (2018): European survey of dictionary use.
@article{Kosem-Iztok2018-275354,
  title     = {The image of the monolingual dictionary across {Europe}. {Results} of the {European} survey of dictionary use and culture},
  abstract  = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level.
The present report covers the general section},
  journal   = {International Journal of Lexicography},
  author    = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  year      = {2018},
  doi       = {10.1093/ijl/ecy022},
  pages     = {1--23},
}

% Alfter & Volodina (2018): CEFR-level prediction for single words.
@inproceedings{Alfter-David2018-275368,
  title     = {Towards Single Word Lexical Complexity Prediction},
  abstract  = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-948087-11-7},
}

% Pilán & Volodina (2018): linguistic complexity features across datasets.
@inproceedings{Pilán-Ildikó2018-275367,
  title     = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
  abstract  = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs.
texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-62-9},
}

% Pilán & Volodina (2018): unsupervised correction of learner errors.
@inproceedings{Pilán-Ildikó2018-275366,
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-61-2},
}

% Volodina et al. (2018): interoperability of second-language resources.
@inproceedings{Volodina-Elena2018-275365,
  title     = {Interoperability of Second Language Resources and Tools},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference},
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  year      = {2018},
}

% Alfter et al. (2018): Lärka as research infrastructure.
@inproceedings{Alfter-David2018-275364,
  title     = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics.
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year      = {2018},
}

% Rosén, Wirén & Volodina (2018): error coding via aligned parallel corpora.
@inproceedings{Rosén-Dan2018-275363,
  title     = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract  = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8--10 October 2018, Pisa, Italy},
  author    = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year      = {2018},
}

% Alfter & Volodina (2018): lexical complexity of multi-word expressions.
@inproceedings{Alfter-David2018-275362,
  title     = {Is the whole greater than the sum of its parts? {A} corpus-based pilot study of the lexical complexity in multi-word expressions},
  abstract  = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known.
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
}

% Volodina et al. (2018): SweLL learner-corpus annotation.
@inproceedings{Volodina-Elena2018-275361,
  title     = {Annotation of learner corpora: first {SweLL} insights},
  abstract  = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  author    = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year      = {2018},
}

% Megyesi et al. (2018): learner-corpus anonymization under GDPR.
@inproceedings{Megyesi-Beata2018-275359,
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other.
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-173-9},
}

% NLP4CALL 2018 proceedings volume; the listed people are its editors.
@proceedings{Pilán-Ildikó2018-275358,
  title     = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018), {SLTC}, {Stockholm}, 7th {November} 2018},
  abstract  = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-173-9},
}

% Kosem et al. (2018): European survey of dictionary use.
@article{Kosem-Iztok2018-275354,
  title     = {The image of the monolingual dictionary across {Europe}. {Results} of the {European} survey of dictionary use and culture},
  abstract  = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level.
The present report covers the general section},
  journal   = {International Journal of Lexicography},
  author    = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  year      = {2018},
  doi       = {10.1093/ijl/ecy022},
  pages     = {1--23},
}

% Alfter & Volodina (2018): CEFR-level prediction for single words.
@inproceedings{Alfter-David2018-275368,
  title     = {Towards Single Word Lexical Complexity Prediction},
  abstract  = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-948087-11-7},
}

% Pilán & Volodina (2018): linguistic complexity features across datasets.
@inproceedings{Pilán-Ildikó2018-275367,
  title     = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
  abstract  = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs.
texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-62-9},
}

% Pilán & Volodina (2018): unsupervised correction of learner errors.
@inproceedings{Pilán-Ildikó2018-275366,
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-61-2},
}

% Volodina et al. (2018): interoperability of second-language resources.
@inproceedings{Volodina-Elena2018-275365,
  title     = {Interoperability of Second Language Resources and Tools},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference},
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  year      = {2018},
}

% Alfter et al. (2018): Lärka as research infrastructure.
@inproceedings{Alfter-David2018-275364,
  title     = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics.
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year      = {2018},
}

% Rosén, Wirén & Volodina (2018): error coding via aligned parallel corpora.
@inproceedings{Rosén-Dan2018-275363,
  title     = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract  = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8--10 October 2018, Pisa, Italy},
  author    = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year      = {2018},
}

% Alfter & Volodina (2018): lexical complexity of multi-word expressions.
@inproceedings{Alfter-David2018-275362,
  title     = {Is the whole greater than the sum of its parts? {A} corpus-based pilot study of the lexical complexity in multi-word expressions},
  abstract  = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known.
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
}

% Volodina et al. (2018): SweLL learner-corpus annotation.
@inproceedings{Volodina-Elena2018-275361,
  title     = {Annotation of learner corpora: first {SweLL} insights},
  abstract  = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  author    = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year      = {2018},
}

% Megyesi et al. (2018): learner-corpus anonymization under GDPR.
@inproceedings{Megyesi-Beata2018-275359,
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other.
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-173-9},
}

% NLP4CALL 2018 proceedings volume; the listed people are its editors.
@proceedings{Pilán-Ildikó2018-275358,
  title     = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018), {SLTC}, {Stockholm}, 7th {November} 2018},
  abstract  = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-173-9},
}

% Alfter & Volodina (2018): CEFR-level prediction for single words.
@inproceedings{Alfter-David2018-275368,
  title     = {Towards Single Word Lexical Complexity Prediction},
  abstract  = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words.
In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-948087-11-7},
}

@article{Kosem-Iztok2018-275354,
  title     = {The image of the monolingual dictionary across {Europe}. {Results} of the {European} survey of dictionary use and culture},
  abstract  = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level.
The present report covers the general section},
  author    = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  journal   = {International Journal of Lexicography},
  year      = {2018},
  doi       = {10.1093/ijl/ecy022},
  pages     = {1--23},
}

@inproceedings{Pilán-Ildikó2018-275367,
  title     = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
  abstract  = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.
},
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-62-9},
}

@inproceedings{Pilán-Ildikó2018-275366,
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.
},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-61-2},
}

@inproceedings{Volodina-Elena2018-275365,
  title     = {Interoperability of Second Language Resources and Tools},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference},
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  year      = {2018},
}

@inproceedings{Alfter-David2018-275364,
  title     = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics.
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Rosén-Dan2018-275363,
  title     = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract  = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8--10 October 2018, Pisa, Italy},
  author    = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Alfter-David2018-275362,
  title     = {Is the whole greater than the sum of its parts? {A} corpus-based pilot study of the lexical complexity in multi-word expressions},
  abstract  = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known.
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Volodina-Elena2018-275361,
  title     = {Annotation of learner corpora: first {SweLL} insights},
  abstract  = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  author    = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year      = {2018},
}

@inproceedings{Megyesi-Beata2018-275359,
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other.
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköpings universitet},
  isbn      = {978-91-7685-173-9},
}

@proceedings{Pilán-Ildikó2018-275358,
  title     = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018), {SLTC}, {Stockholm}, 7th {November} 2018},
  abstract  = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköpings universitet},
  isbn      = {978-91-7685-173-9},
}

@article{Kosem-Iztok2018-275354,
  title     = {The image of the monolingual dictionary across {Europe}. {Results} of the {European} survey of dictionary use and culture},
  abstract  = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level.
The present report covers the general section},
  author    = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  journal   = {International Journal of Lexicography},
  year      = {2018},
  doi       = {10.1093/ijl/ecy022},
  pages     = {1--23},
}

@inproceedings{Alfter-David2018-275368,
  title     = {Towards Single Word Lexical Complexity Prediction},
  abstract  = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-948087-11-7},
}

@inproceedings{Pilán-Ildikó2018-275367,
  title     = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
  abstract  = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs.
texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-62-9},
}

@inproceedings{Pilán-Ildikó2018-275366,
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative.
We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-61-2},
}

@inproceedings{Volodina-Elena2018-275365,
  title     = {Interoperability of Second Language Resources and Tools},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference},
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  year      = {2018},
}

@inproceedings{Alfter-David2018-275364,
  title     = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics.
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
  author    = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Rosén-Dan2018-275363,
  title     = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract  = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8--10 October 2018, Pisa, Italy},
  author    = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Alfter-David2018-275362,
  title     = {Is the whole greater than the sum of its parts? {A} corpus-based pilot study of the lexical complexity in multi-word expressions},
  abstract  = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known.
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
}

@inproceedings{Volodina-Elena2018-275361,
  title     = {Annotation of learner corpora: first {SweLL} insights},
  abstract  = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, October 7--9, 2018},
  author    = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year      = {2018},
}

@inproceedings{Megyesi-Beata2018-275359,
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other.
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköpings universitet},
  isbn      = {978-91-7685-173-9},
}

@proceedings{Pilán-Ildikó2018-275358,
  title     = {Proceedings of the 7th Workshop on {NLP} for Computer Assisted Language Learning ({NLP4CALL} 2018), {SLTC}, {Stockholm}, 7th {November} 2018},
  abstract  = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköpings universitet},
  isbn      = {978-91-7685-173-9},
}

@article{Kosem-Iztok2018-275354,
  title     = {The image of the monolingual dictionary across {Europe}. {Results} of the {European} survey of dictionary use and culture},
  abstract  = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level.
The present report covers the general section},
  author    = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and {Local Partners}},
  journal   = {International Journal of Lexicography},
  year      = {2018},
  doi       = {10.1093/ijl/ecy022},
  pages     = {1--23},
}

@inproceedings{Alfter-David2018-275368,
  title     = {Towards Single Word Lexical Complexity Prediction},
  abstract  = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-948087-11-7},
}

@inproceedings{Pilán-Ildikó2018-275367,
  title     = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
  abstract  = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs.
texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-62-9},
}

@inproceedings{Pilán-Ildikó2018-275366,
  title     = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  abstract  = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative.
We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-61-2},
}

@inproceedings{Volodina-Elena2018-275365,
  title     = {Interoperability of Second Language Resources and Tools},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference},
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  year      = {2018},
}

@inproceedings{Alfter-David2018-275364,
  title     = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract  = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics.
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.}, booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy}, author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2018}, } @inProceedings{Rosén-Dan2018-275363, title = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora. }, abstract = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.}, booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy}, author = {Rosén, Dan and Wirén, Mats and Volodina, Elena}, year = {2018}, } @inProceedings{Alfter-David2018-275362, title = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions.}, abstract = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. 
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.}, booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018}, author = {Alfter, David and Volodina, Elena}, year = {2018}, } @inProceedings{Volodina-Elena2018-275361, title = {Annotation of learner corpora: first SweLL insights.}, abstract = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation. }, booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018}, author = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats}, year = {2018}, } @inProceedings{Megyesi-Beata2018-275359, title = {Learner Corpus Anonymization in the Age of GDPR: Insights from the Creation of a Learner Corpus of Swedish}, abstract = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. 
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.}, booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018 / edited by Ildikó Pilán, Elena Volodina, David Alfter and Lars Borin}, author = {Megyesi, Beata and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén , Mats and Volodina, Elena}, year = {2018}, publisher = {Linköping University Electronic Press}, adress = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @misc{Pilán-Ildikó2018-275358, title = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018 }, abstract = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. 
As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.}, author = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars}, year = {2018}, publisher = {Linköping University Electronic Press}, adress = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @article{Kosem-Iztok2018-275354, title = {The image of the monolingual dictionary across Europe. Results of the European survey of dictionary use and culture}, abstract = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. 
The present report covers the general section}, author = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer , Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and Partners , Local}, year = {2018}, volume = {ecy022}, pages = {1--23}, } @inProceedings{Alfter-David2018-275368, title = {Towards Single Word Lexical Complexity Prediction.}, abstract = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.}, booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018}, author = {Alfter, David and Volodina, Elena}, year = {2018}, publisher = {Association of Computational Linguistics}, adress = {Stroudsburg, PA }, ISBN = {978-1-948087-11-7}, } @inProceedings{Pilán-Ildikó2018-275367, title = {Investigating the importance of linguistic complexity features across different datasets related to language learning.}, abstract = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. 
texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consid- eration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions. }, booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018.}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2018}, publisher = {Association of Computational Linguistics }, ISBN = {978-1-948087-62-9}, } @inProceedings{Pilán-Ildikó2018-275366, title = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors.}, abstract = {The presence of misspellings and other errors or non-standard word forms poses a consider- able challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We in- vestigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. 
We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors. }, booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018.}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2018}, publisher = {Association of Computation Linguistics }, ISBN = {978-1-948087-61-2}, } @inProceedings{Volodina-Elena2018-275365, title = {Interoperability of Second Language Resources and Tools}, abstract = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.}, booktitle = {Proceedings of CLARIN-2018 conference}, author = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad }, year = {2018}, } @inProceedings{Alfter-David2018-275364, title = {From Language Learning Platform to Infrastructure for Research on Language Learning}, abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus- based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. 
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.}, booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy}, author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2018}, } @inProceedings{Rosén-Dan2018-275363, title = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora. }, abstract = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.}, booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy}, author = {Rosén, Dan and Wirén, Mats and Volodina, Elena}, year = {2018}, } @inProceedings{Alfter-David2018-275362, title = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions.}, abstract = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. 
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.}, booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018}, author = {Alfter, David and Volodina, Elena}, year = {2018}, } @inProceedings{Volodina-Elena2018-275361, title = {Annotation of learner corpora: first SweLL insights.}, abstract = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation. }, booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018}, author = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats}, year = {2018}, } @inProceedings{Megyesi-Beata2018-275359, title = {Learner Corpus Anonymization in the Age of GDPR: Insights from the Creation of a Learner Corpus of Swedish}, abstract = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. 
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.}, booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018 / edited by Ildikó Pilán, Elena Volodina, David Alfter and Lars Borin}, author = {Megyesi, Beata and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén , Mats and Volodina, Elena}, year = {2018}, publisher = {Linköping University Electronic Press}, adress = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @misc{Pilán-Ildikó2018-275358, title = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018 }, abstract = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. 
As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.}, author = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars}, year = {2018}, publisher = {Linköping University Electronic Press}, adress = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @article{Kosem-Iztok2018-275354, title = {The image of the monolingual dictionary across Europe. Results of the European survey of dictionary use and culture}, abstract = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. 
The present report covers the general section}, author = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer , Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and Partners , Local}, year = {2018}, volume = {ecy022}, pages = {1--23}, }