% Bibliography entries, one field per line. Citation keys are unchanged.
% Editors live in the editor field, series/number in their own fields,
% and page ranges use a double hyphen so styles can parse them.

% Book chapter in an edited volume; the volume editors were previously
% embedded in booktitle and are now listed in the editor field.
@incollection{graen-volk-2021-binomial-310124,
  author    = {Graën, Johannes and Volk, Martin},
  title     = {Binomial adverbs in {Germanic} and {Romance} Languages – A corpus-based study},
  booktitle = {Corpora in Translation and Contrastive Research in the Digital Age – Recent advances and explorations},
  editor    = {Lavid-López, Julia and Maíz-Arévalo, Carmen and Zamorano-Mansilla, Juan Rafael},
  year      = {2021},
  publisher = {John Benjamins},
  pages     = {326--342},
  isbn      = {9789027209184},
  abstract  = {As a special type of multiword expressions, binomials are a frequent phenomenon in many languages. We focus on binomial adverbs that are coordinations of two adverbial constituents. Their syntactic contribution to a sentence is adverbial as well and their semantic contribution is idiomatic. They have many uses, such as to intensify (first and foremost), express tendency (more and more), frequency (over and over [again]), vagueness (more or less), determination (sooner or later) etc. In this work, we describe our approaches to identify binomial adverbs in a large multiparallel corpus. Alongside the well-known measure of reversibility, we also calculate measures of statistical association and look for single-word translation equivalents in other languages. Combining these features facilitates the identification of binomial adverbs.},
}

% Journal article.
@article{zanetti-etal-2021-automatic-311723,
  author   = {Zanetti, Arianna and Volodina, Elena and Graën, Johannes},
  title    = {Automatic Generation of Exercises for Second Language Learning from Parallel Corpus Data},
  journal  = {International Journal of TESOL Studies},
  year     = {2021},
  volume   = {3},
  number   = {2},
  pages    = {55--71},
  abstract = {Creating language learning exercises is a time-consuming task and made-up sample sentences frequently lack authenticity. Authentic samples can be obtained from corpora, but it is necessary to identify material that is suitable for language learners. Parallel corpora of written text consist of translated material. Comparing the text in one language with its translation into another (known) language makes the structure accessible to the learner. However, the correspondence of words between the two languages is more important. By carefully selecting well-suited parallel sentences, a learner can explore the target language in a guided way. We present an approach to generate a novel type of language learning exercise from a large parallel corpus based on movie subtitles. The size of the corpus allows for defining selective criteria, favoring precision over recall. It is a non-trivial task to give reliable feedback to automatically generated exercises. ICALL literature often deals with fill-in-the-blanks exercises or multiple-choice questions, which allow for very limited answer options. Our proposed exercise is a special case of sentence reconstruction on bilingual sentence pairs. It combines two elements which have proven to be effective for language learning: a gamified approach, to awaken the students’ competitive desire, and the identification of syntactic structures and vocabulary use, to improve language sensitivity. This article presents the methods used to select example pairs and to implement a prototype.},
}

% Whole proceedings volume (was the non-standard type "edited_book").
% The series name and number were previously stored in publisher.
% NOTE(review): publisher is taken from the sibling entry of the same
% series in this file; confirm against the volume's imprint.
@proceedings{alfter-etal-2021-proceedings-311727,
  editor    = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Graën, Johannes and Borin, Lars},
  title     = {Proceedings of the 10th Workshop on Natural Language Processing for Computer Assisted Language Learning ({NLP4CALL} 2021)},
  year      = {2021},
  series    = {Linköping Electronic Conference Proceedings},
  number    = {177},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping, Sweden},
  isbn      = {978-91-7929-625-4},
  abstract  = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.},
}

% Conference paper. Series, number and editors were previously embedded
% in booktitle; address now names the publisher's city.
@inproceedings{alfter-graen-2019-interconnecting-285731,
  author    = {Alfter, David and Graën, Johannes},
  title     = {Interconnecting lexical resources and word alignment: How do learners get on with particle verbs?},
  booktitle = {{NEAL} Proceedings of the 22nd Nordic Conference on Computational Linguistics ({NoDaLiDa}), September 30--October 2, Turku, Finland},
  editor    = {Hartman, Mareike and Plank, Barbara},
  year      = {2019},
  series    = {Linköping Electronic Conference Proceedings},
  number    = {167},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping, Sweden},
  isbn      = {978-91-7929-995-8},
  abstract  = {In this paper, we present a prototype for an online exercise aimed at learners of English and Swedish that serves multiple purposes. The exercise allows learners of the aforementioned languages to train their knowledge of particle verbs receiving clues from the exercise application. The user themselves decide which clue to receive and pay in virtual currency for each, which provides us with valuable information about the utility of the clues that we provide as well as the learners willingness to trade virtual currency versus accuracy of their choice. As resources, we use list with annotated levels from the proficiency scale defined by the Common European Framework of Reference (CEFR) and a multilingual corpus with syntactic dependency relations and word annotation for all language pairs. From the latter resource, we extract translation equivalents for particle verb construction together with a list of parallel corpus examples that can be used as clues in the exercise.},
}

% Workshop paper; the proceedings editors were previously embedded in
% booktitle and are now listed in the editor field.
@inproceedings{graen-etal-2019-modelling-284429,
  author    = {Graën, Johannes and Kew, Tannon and Shaitarova, Anastassia and Volk, Martin},
  title     = {Modelling large parallel corpora: The {Zurich Parallel Corpus Collection}},
  booktitle = {Proceedings of the Workshop on Challenges in the Management of Large Corpora ({CMLC-7}) 2019. Cardiff, 22nd July 2019},
  editor    = {Bański, Piotr and Barbaresi, Adrien and Biber, Hanno and Breiteneder, Evelyn and Clematide, Simon and Kupietz, Marc and Lüngen, Harald and Iliadi, Caroline},
  year      = {2019},
  publisher = {Leibniz-Institut für Deutsche Sprache},
  address   = {Mannheim},
  abstract  = {Text corpora come in many different shapes and sizes and carry heterogeneous annotations, depending on their purpose and design. The true benefit of corpora is rooted in their annotation and the method by which this data is encoded is an important factor in their interoperability. We have accumulated a large collection of multilingual and parallel corpora and encoded it in a unified format which is compatible with a broad range of NLP tools and corpus linguistic applications. In this paper, we present our corpus collection and describe a data model and the extensions to the popular CoNLL-U format that enable us to encode it.},
}