Språkbanken Text är en avdelning inom Språkbanken.
BibTeX

% Note: this paper is also recorded under the key periti-etal-2024-automatically-343018; both records are kept so that existing citations of either key still resolve.
@inproceedings{periti-etal-2024-automatically-343719,
	title        = {Automatically Generated Definitions and their utility for Modeling Word Meaning},
	abstract     = {Modeling lexical semantics is a challenging task, often suffering from interpretability pitfalls. In this paper, we delve into the generation of dictionary-like sense definitions and explore their utility for modeling word meaning. We fine-tuned two Llama models and include an existing T5-based model in our evaluation. Firstly, we evaluate the quality of the generated definitions on existing English benchmarks, setting new state-of-the-art results for the Definition Generation task. Next, we explore the use of definitions generated by our models as intermediate representations subsequently encoded as sentence embeddings. We evaluate this approach on lexical semantics tasks such as the Word-in-Context, Word Sense Induction, and Lexical Semantic Change, setting new state-of-the-art results in all three tasks when compared to unsupervised baselines.},
	booktitle    = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
	author       = {Periti, Francesco and Alfter, David and Tahmasebi, Nina},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	pages        = {14008--14026},
}

% Note: this paper is also recorded under the key schlechtweg-etal-2024-more-343019; both records are kept so that existing citations of either key still resolve.
@inproceedings{schlechtweg-etal-2024-more-343718,
	title        = {More {DWUGs}: Extending and Evaluating Word Usage Graph Datasets in Multiple Languages},
	abstract     = {Word Usage Graphs (WUGs) represent human semantic proximity judgments for pairs of word uses in a weighted graph, which can be clustered to infer word sense clusters from simple pairwise word use judgments, avoiding the need for word sense definitions. SemEval-2020 Task 1 provided the first and to date largest manually annotated, diachronic WUG dataset. In this paper, we check the robustness and correctness of the annotations by continuing the SemEval annotation algorithm for two more rounds and comparing against an established annotation paradigm. Further, we test the reproducibility by resampling a new, smaller set of word uses from the SemEval source corpora and annotating them. Our work contributes to a better understanding of the problems and opportunities of the WUG annotation paradigm and points to future improvements.},
	booktitle    = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
	author       = {Schlechtweg, Dominik and Cassotti, Pierluigi and Noble, Bill and Alfter, David and {Schulte im Walde}, Sabine and Tahmasebi, Nina},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	pages        = {14379--14393},
}

% Note: the citation key keeps its original "tiedemann" prefix for compatibility; the first author's surname is Lindström Tiedemann.
@incollection{tiedemann-etal-2024-multiword-343530,
	title        = {Multiword expressions in {Swedish} as a second language: Taxonomy, annotation, and initial results},
	abstract     = {This chapter introduces part of the Swedish L2 profiles, a new resource for Swedish as a second language. Multiword expressions (MWEs) in this resource are based on knowledge-based automatic annotation of MWEs, which we show works quite well for Swedish. In contrast, manual annotation of the compositionality of each MWE proved difficult, probably due to different interpretations of "compositionality" by the two annotators. We show that experts and non-experts can rank MWEs very similarly according to relative receptive difficulty, with particularly high agreement for the easiest items. A qualitative comparison of the proficiency levels associated with the MWEs based on coursebook occurrences and the results from crowdsourcing and direct ranking indicate that MWEs which appear in few books of the same level are more likely to be difficult to associate with an appropriate level based on coursebook corpus data. Furthermore, results show that compositionality and/or transparency might influence the relative ranking. Finally, there is a clear increase in MWE lemmas at higher proficiency levels at the group level, and at the highest level receptive and productive data include the same percentage of MWEs.},
	booktitle    = {Multiword Expressions in Lexical Resources: Linguistic, Lexicographic, and Computational Perspectives},
	author       = {Lindström Tiedemann, Therese and Alfter, David and Ali Mohammed, Yousuf and Piipponen, Daniela and Silén, Beatrice and Volodina, Elena},
	year         = {2024},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {9783961104703},
	pages        = {309--348},
}

% Note: this paper is also recorded under the key schlechtweg-etal-2024-more-343718; both records are kept so that existing citations of either key still resolve.
@inproceedings{schlechtweg-etal-2024-more-343019,
	title        = {More {DWUGs}: Extending and Evaluating Word Usage Graph Datasets in Multiple Languages},
	abstract     = {Word Usage Graphs (WUGs) represent human semantic proximity judgments for pairs of word uses in a weighted graph, which can be clustered to infer word sense clusters from simple pairwise word use judgments, avoiding the need for word sense definitions. SemEval-2020 Task 1 provided the first and to date largest manually annotated, diachronic WUG dataset. In this paper, we check the robustness and correctness of the annotations by continuing the SemEval annotation algorithm for two more rounds and comparing against an established annotation paradigm. Further, we test the reproducibility by resampling a new, smaller set of word uses from the SemEval source corpora and annotating them. Our work contributes to a better understanding of the problems and opportunities of the WUG annotation paradigm and points to future improvements.},
	booktitle    = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
	author       = {Schlechtweg, Dominik and Cassotti, Pierluigi and Noble, Bill and Alfter, David and {Schulte im Walde}, Sabine and Tahmasebi, Nina},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	address      = {Miami, Florida, USA},
	pages        = {14379--14393},
}

% Note: this paper is also recorded under the key periti-etal-2024-automatically-343719; both records are kept so that existing citations of either key still resolve.
@inproceedings{periti-etal-2024-automatically-343018,
	title        = {Automatically Generated Definitions and their utility for Modeling Word Meaning},
	abstract     = {Modeling lexical semantics is a challenging task, often suffering from interpretability pitfalls. In this paper, we delve into the generation of dictionary-like sense definitions and explore their utility for modeling word meaning. We fine-tuned two Llama models and include an existing T5-based model in our evaluation. Firstly, we evaluate the quality of the generated definitions on existing English benchmarks, setting new state-of-the-art results for the Definition Generation task. Next, we explore the use of definitions generated by our models as intermediate representations subsequently encoded as sentence embeddings. We evaluate this approach on lexical semantics tasks such as the Word-in-Context, Word Sense Induction, and Lexical Semantic Change, setting new state-of-the-art results in all three tasks when compared to unsupervised baselines.},
	booktitle    = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
	author       = {Periti, Francesco and Alfter, David and Tahmasebi, Nina},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	address      = {Miami, Florida, USA},
	pages        = {14008--14026},
}

@inproceedings{munozsanchez-etal-2024-jingle-342259,
	title        = {Jingle {BERT}, Jingle {BERT}, Frozen All the Way: Freezing Layers to Identify {CEFR} Levels of Second Language Learners Using {BERT}},
	abstract     = {In this paper, we investigate the question of how much domain adaptation is needed for the task of automatic essay assessment by freezing layers in BERT models. We test our methodology on three different graded language corpora (English, French and Swedish) and find that partially fine-tuning base models improves performance over fully fine-tuning base models, although the number of layers to freeze differs by language. We also look at the effect of freezing layers on different grades in the corpora and find that different layers are important for different grade levels. Finally, our results represent a new state-of-the-art in automatic essay classification for the three languages under investigation.},
	booktitle    = {Proceedings of the 13th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2024)},
	author       = {Muñoz Sánchez, Ricardo and Alfter, David and Dobnik, Simon and Szawerna, Maria Irena and Volodina, Elena},
	year         = {2024},
	publisher    = {Linköping Electronic Conference Proceedings},
	ISBN         = {978-91-8075-774-4},
}

@inproceedings{alfter-2024-complexity-341312,
	author       = {Alfter, David},
	title        = {Complexity and Indecision: A Proof-of-Concept Exploration of Lexical Complexity and Lexical Semantic Change},
	booktitle    = {Proceedings of the 5th Workshop on Computational Approaches to Historical Language Change, August 15, 2024, Bangkok, Thailand},
	publisher    = {Association for Computational Linguistics},
	year         = 2024,
	isbn         = {979-8-89176-138-4},
}

@proceedings{volodina-etal-2024-proceedings-336386,
	title        = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, Malta},
	editor       = {Volodina, Elena and Alfter, David and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Szawerna, Maria Irena and Vu, Xuan-Son},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {979-8-89176-085-1},
}

% Note: the citation key keeps its original "2004" for compatibility with existing citations; the workshop was held in 2024, as the abstract states.
@proceedings{tahmasebi-etal-2004-proceedings-339992,
	title        = {Proceedings of the 5th Workshop on Computational Approaches to Historical Language Change},
	abstract     = {Welcome to the 5th International Workshop on Computational Approaches to Historical Language Change (LChange’24) co-located with ACL 2024. LChange is held on August 15th, 2024, as a hybrid event with participation possible both virtually and on-site in Thailand.},
	editor       = {Tahmasebi, Nina and Montariol, Syrielle and Kutuzov, Andrey and Alfter, David and Cassotti, Pierluigi and Huebscher, Netta},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
}

@proceedings{alfter-etal-2023-proceedings-331649,
	title        = {Proceedings of the 12th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2023)},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.},
	editor       = {Alfter, David and Volodina, Elena and François, Thomas and Jönsson, Arne and Rennes, Evelina},
	year         = {2023},
	publisher    = {Linköping University Press},
	address      = {Linköping},
	ISBN         = {978-91-8075-250-3},
}

@inproceedings{wilkens-etal-2023-tcfle-337441,
	title        = {{TCFLE-8}: a Corpus of Learner Written Productions for {French} as a Foreign Language and its Application to Automated Essay Scoring},
	abstract     = {Automated Essay Scoring (AES) aims to automatically assess the quality of essays. Automation enables large-scale assessment, improvements in consistency, reliability, and standardization. Those characteristics are of particular relevance in the context of language certification exams. However, a major bottleneck in the development of AES systems is the availability of corpora, which, unfortunately, are scarce, especially for languages other than English. In this paper, we aim to foster the development of AES for French by providing the TCFLE-8 corpus, a corpus of 6.5k essays collected in the context of the Test de Connaissance du Français (TCF - French Knowledge Test) certification exam. We report the strict quality procedure that led to the scoring of each essay by at least two raters according to the levels of the Common European Framework of Reference for Languages (CEFR) and to the creation of a balanced corpus. In addition, we describe how linguistic properties of the essays relate to the learners' proficiency in TCFLE-8. We also advance the state-of-the-art performance for the AES task in French by experimenting with two strong baselines (i.e., RoBERTa and feature-based). Finally, we discuss the challenges of AES using TCFLE-8.},
	booktitle    = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
	author       = {Wilkens, Rodrigo and Pintard, Alice and Alfter, David and Folny, Vincent and François, Thomas},
	year         = {2023},
	ISBN         = {9798891760608},
}

@article{volodina-etal-2022-crowdsourcing-336551,
	title        = {Crowdsourcing ratings for single lexical items: a core vocabulary perspective},
	abstract     = {In this study, we investigate theoretical and practical issues connected to differentiating between core and peripheral vocabulary at different levels of linguistic proficiency using statistical approaches combined with crowdsourcing. We also investigate whether crowdsourcing second language learners’ rankings can be used for assigning levels to unseen vocabulary. The study is performed on Swedish single-word items.
The four hypotheses we examine are: (1) there is core vocabulary for each proficiency level, but this is only true until CEFR level B2 (upper-intermediate); (2) core vocabulary shows more systematicity in its behavior and usage, whereas peripheral items have more idiosyncratic behavior; (3) given that we have truly core items (aka anchor items) for each level, we can place any new unseen item in relation to the identified core items by using a series of comparative judgment tasks, this way assigning a “target” level for a previously unseen item; and (4) non-experts will perform on par with experts in a comparative judgment setting. The hypotheses have been largely confirmed: In relation to (1) and (2), our results show that there seems to be some systematicity in core vocabulary for early to mid-levels (A1-B1) while we find less systematicity for higher levels (B2-C1). In relation to (3), we suggest crowdsourcing word rankings using comparative judgment with known anchor words as a method to assign a “target” level to unseen words. With regard to (4), we confirm the previous findings that non-experts, in our case language learners, can be effectively used for the linguistic annotation tasks in a comparative judgment setting.},
	journal      = {Slovenščina 2.0: Empirical, Applied and Interdisciplinary Research},
	author       = {Volodina, Elena and Alfter, David and Lindström Tiedemann, Therese},
	year         = {2022},
	volume       = {10},
	number       = {2},
	pages        = {5--61},
}

@proceedings{volodina-etal-2024-proceedings-335190,
	title        = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden},
	editor       = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna},
	year         = {2024},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-8075-512-2},
}

@inproceedings{fridlund-etal-2024-humanistic-335724,
	title        = {Humanistic {AI}: Towards a new field of interdisciplinary expertise and research},
	abstract     = {The Gothenburg Research Infrastructure in Digital Humanities (GRIDH) have participated in projects within various humanities fields that utilise as well as develop research tools and infrastructural resources that incorporate applications of ‘artificial intelligence’ (AI). These applications can include natural language processing, machine learning, computer vision, large language models, image recognition algorithms, classification, clustering, and deep learning. This paper advances the term ‘humanistic AI’ to describe an emergent form of interdisciplinary practice that uses and develops AI-based research applications to answer humanities research questions together with its entangled humanistic reflection. We coin this term to make implicit and visible the epistemological and material particularities of its practice and the new forms of knowledge its affordances make possible. The paper presents GRIDH projects within ‘humanistic AI’ together with its developed AI resources and applications.},
	booktitle    = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden},
	author       = {Fridlund, Mats and Alfter, David and Brodén, Daniel and Green, Ashley and Karimi, Aram and Lindhé, Cecilia},
	year         = {2024},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-8075-512-2},
}

@incollection{volodina-etal-2022-reliability-321988,
	title        = {Reliability of Automatic Linguistic Annotation: Native vs Non-native Texts},
	abstract     = {We present the results of a manual evaluation of the performance of automatic linguistic annotation on three different datasets: (1) texts written by native speakers, (2) essays written by second language (L2) learners of Swedish in the original form and (3) the normalized versions of learner-written essays. The focus of the evaluation is on lemmatization, POS-tagging, word sense disambiguation, multi-word detection and dependency annotation. Two annotators manually went through the automatic annotation on a subset of the datasets and marked up all deviations based on their expert judgments and the guidelines provided. We report Inter-Annotator Agreement between the two annotators and accuracy for the linguistic annotation quality for the three datasets, by levels and linguistic features.},
	booktitle    = {Selected Papers from the CLARIN Annual Conference 2021, Virtual Event, 2021, 27–29 September},
	editor       = {Monachini, Monica and Eskevich, Maria},
	author       = {Volodina, Elena and Alfter, David and Lindström Tiedemann, Therese and Lauriala, Maisa and Piipponen, Daniela},
	year         = {2022},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-444-1},
	pages        = {151--167},
}

@proceedings{alfter-etal-2022-proceedings-321964,
	title        = {Proceedings of the 11th Workshop on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL 2022)},
	abstract     = {The volume contains articles reviewed and presented at NLP4CALL workshop. The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.},
	editor       = {Alfter, David and Volodina, Elena and François, Thomas and Desmet, Piet and Cornillie, Frederik and Jönsson, Arne and Rennes, Evelina},
	year         = {2022},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-460-1},
}

@inproceedings{alfter-2016-learning-241664,
	title        = {Learning the Learner: User Modeling in Intelligent Computer Assisted Language Learning Systems},
	booktitle    = {UMAP 2016 Extended Proceedings, Halifax, Canada, July 13--16, 2016},
	editor       = {Cena, Federica and Desmarais, Michel and Dicheva, Darina and Zhang, Jie},
	series       = {CEUR Workshop Proceedings},
	volume       = {1618},
	author       = {Alfter, David},
	year         = {2016},
}

@inproceedings{alfter-bizzoni-2016-hybrid-246348,
	title        = {Hybrid Language Segmentation for Historical Documents},
	booktitle    = {Proceedings CLiC-it 2016 and EVALITA 2016, Napoli, Italy, December 5--7, 2016},
	editor       = {Basile, Pierpaolo and Corazza, Anna and Cutugno, Franco and Montemagni, Simonetta and Nissim, Malvina and Patti, Viviana and Semeraro, Giovanni and Sprugnoli, Rachele},
	author       = {Alfter, David and Bizzoni, Yuri},
	year         = {2016},
}

@proceedings{tahmasebi-etal-2023-proceedings-331093,
	title        = {Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change, LChange'23, December 6th, 2023, Singapore},
	abstract     = {Welcome to the 4th International Workshop on Computational Approaches to Historical Language Change (LChange’23) co-located with EMNLP 2023. LChange is held on December 6th, 2023, as a hybrid event with participation possible both virtually and on-site in Singapore.

Characterizing the time-varying nature of language will have broad implications and applications in multiple fields including linguistics, artificial intelligence, digital humanities, computational cognitive and social sciences. In this workshop, we bring together the world’s pioneers and experts in computational approaches to historical language change with a focus on digital text corpora. In doing so, this workshop carries out the triple goals of disseminating state-of-the-art research on diachronic modeling of language change, fostering cross-disciplinary collaborations, and exploring the fundamental theoretical and methodological challenges in this growing niche of computational linguistic research.},
	editor       = {Tahmasebi, Nina and Montariol, Syrielle and Dubossarsky, Haim and Kutuzov, Andrey and Hengchen, Simon and Alfter, David and Periti, Francesco and Cassotti, Pierluigi},
	year         = {2023},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {979-8-89176-043-1},
}

@inproceedings{lindstromtiedemann-etal-2022-cefr-321899,
	title        = {{CEFR}-nivåer och svenska flerordsuttryck},
	abstract     = {När vi lär oss ett nytt språk ska vi inte bara lära oss enstaka ord och hur vi använder dessa, utan vi måste också lära oss vilka ordkombinationer som är ”fasta uttryck” till betydelsen (t.ex. hälsa på någon) eller till formen (t.ex. lättare sagt än gjort) eller båda delarna (t.ex. huller om buller). Enligt en del studier kan dessa uttryck utgöra så mycket som 50 \% av vokabulären i ett språk som förstaspråk (L1) eller ännu mer (Jackendoff 1997; Erman 2007, 28). Men det är möjligt att de är vanligare i vardagligt språk och talspråk (Prentice \& Sköldberg 2013). Flerordsenheter kan vara problematiska för andraspråkstalare (Nesselhauf 2003, 223) till och med på avancerad nivå (jfr Pawley \& Syder 1983; Wray \& Perkins 2000; Nesselhauf 2003; Prentice 2010). Samtidigt är de en helt nödvändig del av språket (Nesselhauf 2003, 223) och kan utmärka andraspråkstalarna som icke-modersmålstalare (Pawley \& Syder 1983; Wray 2002). Flerordsuttryck är alltså en värdefull del av andraspråkskompetensen (se även Paquot 2019) och något som är viktigt att studera hur vi på bästa sätt introducerar för L2-talaren och om de kan kopplas till nivåer i bedömning.
I den här studien presenterar vi resultat kring förståelsen av flerordsuttryck i svenska som andraspråk i relation till färdighetsnivåerna enligt Gemensam Europeisk Referensram för Språk (GERS eller CEFR, Common European Framework of Reference) (COE 2001; 2018; Skolverket 2009; Utbildningsstyrelsen 2018) genom crowdsourcing experiment.},
	booktitle    = {Svenskan i Finland 19: föredrag vid den nittonde sammankomsten för beskrivningen av svenskan i Finland, Vasa den 6--7 maj 2021},
	editor       = {Björklund, Siv and Haagensen, Bodil and Nordman, Marianne and Westerlund, Anders},
	author       = {Lindström Tiedemann, Therese and Alfter, David and Volodina, Elena},
	year         = {2022},
	publisher    = {Svensk-Österbottniska Samfundet},
	address      = {Vasa},
	ISBN         = {978-952-69650-5-5},
}

@incollection{volodina-alfter-2022-icall-321984,
	title        = {{ICALL}: Research versus reality check},
	abstract     = {Intelligent Computer-Assisted Language Learning has been one of Lars Borin’s research interests. The work on the Lärka language learning platform has started under his coordination. We see it our mission to make the platform live and prosperous, and through it to stimulate research into Swedish as a second language. Below, we name some weaknesses we have identified in Lärka while working with a course of beginner Swedish and outline our plans for tackling those.},
	booktitle    = {Live and Learn -- Festschrift in honor of Lars Borin},
	author       = {Volodina, Elena and Alfter, David},
	year         = {2022},
	publisher    = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet},
	address      = {Göteborg},
	ISBN         = {978-91-87850-83-7},
	pages        = {145--152},
}

@article{alfter-etal-2021-crowdsourcing-311721,
	author       = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena},
	title        = {Crowdsourcing Relative Rankings of Multi-Word Expressions: Experts versus Non-Experts},
	journal      = {Northern European Journal of Language Technology (NEJLT)},
	volume       = 7,
	number       = 1,
	year         = 2021,
	abstract     = {In this study we investigate to which degree experts and non-experts agree on questions of difficulty in a crowdsourcing experiment. We ask non-experts (second language learners of Swedish) and two groups of experts (teachers of Swedish as a second/foreign language and CEFR experts) to rank multi-word expressions in a crowdsourcing experiment. We find that the resulting rankings by all the three tested groups correlate to a very high degree, which suggests that judgments produced in a comparative setting are not influenced by professional insights into Swedish as a second language.},
}

@proceedings{alfter-etal-2021-proceedings-311727,
	title        = {Proceedings of the 10th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2021)},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.},
	editor       = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Graën, Johannes and Borin, Lars},
	year         = {2021},
	series       = {Linköping Electronic Conference Proceedings},
	number       = {177},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping, Sweden},
	ISBN         = {978-91-7929-625-4},
}

@book{alfter-2021-exploring-304548,
	title        = {Exploring natural language processing for single-word and multi-word lexical complexity from a second language learner perspective},
	abstract     = {In this thesis, we investigate how natural language processing (NLP) tools and techniques can be applied to vocabulary aimed at second language learners of Swedish in order to classify vocabulary items into different proficiency levels suitable for learners of different levels.

In the first part, we use feature-engineering to represent words as vectors and feed these vectors into machine learning algorithms in order to (1) learn CEFR labels from the input data and (2) predict the CEFR level of unseen words.
Our experiments corroborate the finding that feature-based classification models using 'traditional' machine learning still outperform deep learning architectures in the task of deciding how complex a word is.

In the second part, we use crowdsourcing as a technique to generate ranked lists of multi-word expressions using both experts and non-experts (i.e. language learners). Our experiment shows that non-expert and expert rankings are highly correlated, suggesting that non-expert intuition can be seen as on-par with expert knowledge, at least in the chosen experimental configuration.

The main practical output of this research comes in two forms: prototypes and resources. We have implemented various prototype applications for (1) the automatic prediction of words based on the feature-engineering machine learning method, (2) language learning applications using graded word lists, and (3) an annotation tool for the manual annotation of expressions across a variety of linguistic factors.},
	author       = {Alfter, David},
	year         = {2021},
	publisher    = {Göteborgs universitet},
	ISBN         = {978-91-87850-79-0},
}

@inproceedings{alfter-etal-2020-expert-300074,
	title        = {Expert judgments versus crowdsourcing in ordering multi-word expressions},
	abstract     = {In this study we investigate to which degree experts and non-experts agree on questions of linguistic complexity in a crowdsourcing experiment. We ask non-experts (second language learners of Swedish) and two groups of experts (teachers of Swedish as a second/foreign language and CEFR experts) to rank multi-word expressions in a crowdsourcing experiment. We find that the resulting rankings by all the three tested groups correlate to a very high degree, which suggests that judgments produced in a comparative setting are not influenced by professional insights into Swedish as a second language.},
	booktitle    = {Proceedings of the Swedish Language Technology Conference (SLTC), 25–27 November 2020, (Online)},
	author       = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2020},
}

@proceedings{alfter-etal-2020-proceedings-300071,
	title        = {Proceedings of the 9th Workshop on Natural Language Processing for Computer Assisted Language Learning 2020},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.
This collection presents four selected papers describing use of Language Technology for language learning.},
	editor       = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Lange, Herbert and Borin, Lars},
	year         = {2020},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-732-9},
}

@inproceedings{pilan-etal-2017-larka-289884,
	title        = {{Lärka}: an online platform where language learning meets natural language processing},
	booktitle    = {7th ISCA Workshop on Speech and Language Technology in Education, 25--26 August 2017, Stockholm, Sweden},
	author       = {Pilán, Ildikó and Alfter, David and Volodina, Elena},
	year         = {2017},
}

@proceedings{alfter-etal-2019-proceedings-285613,
	title        = {Proceedings of the 8th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2019), September 30, Turku, Finland},
	abstract     = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promote development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other.

The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools.

The NLP4CALL workshop series is aimed at bringing together competences from these areas for sharing experiences and brainstorming around the future of the field.},
	editor       = {Alfter, David and Volodina, Elena and Borin, Lars and Pilán, Ildikó and Lange, Herbert},
	year         = {2019},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-998-9},
}

@inProceedings{alfter-volodina-2019-from-285728,
	title        = {From river to bank: The importance of sense-based graded word lists},
	booktitle    = {EUROCALL 2019 - CALL and Complexity, Book of Abstracts, Louvain-la-Neuve, Belgium, 28–31 August 2019},
	author       = {Alfter, David and Volodina, Elena},
	year         = {2019},
}

@inProceedings{alfter-etal-2019-legato-285625,
	title        = {{LEGATO}: A flexible lexicographic annotation tool},
	abstract     = {This article is a report from an ongoing project aiming at analyzing lexical and grammatical competences of Swedish as a Second language (L2). To facilitate lexical analysis, we need access to metalinguistic information about relevant vocabulary that L2 learners can use and understand. The focus of the current article is on the lexical annotation of the vocabulary scope for a range of lexicographical aspects, such as morphological analysis, valency, types of multi-word units, etc. We perform parts of the analysis automatically, and other parts manually. The rationale behind this is that where there is no possibility to add information automatically, manual effort needs to be added. To facilitate the latter, a tool LEGATO has been designed, implemented and currently put to active testing.},
	booktitle    = {NEAL Proceedings of the 22nd Nordic Conference on Computational Linguistics (NoDaLiDa), September 30 – October 2, Turku, Finland},
	editor       = {Hartmann, Mareike and Plank, Barbara},
	series       = {Linköping Electronic Conference Proceedings},
	number       = {167},
	author       = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2019},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-995-8},
}

@inProceedings{alfter-graen-2019-interconnecting-285731,
	title        = {Interconnecting lexical resources and word alignment: How do learners get on with particle verbs?},
	abstract     = {In this paper, we present a prototype for an online exercise aimed at learners of English and Swedish that serves multiple purposes. The exercise allows learners of the aforementioned languages to train their knowledge of particle verbs receiving clues from the exercise application. The user themselves decide which clue to receive and pay in virtual currency for each, which provides us with valuable information about the utility of the clues that we provide as well as the learners willingness to trade virtual currency versus accuracy of their choice. As resources, we use list with annotated levels from the proficiency scale defined by the Common European Framework of Reference (CEFR) and a multilingual corpus with syntactic dependency relations and word annotation for all language pairs. From the latter resource, we extract translation equivalents for particle verb construction together with a list of parallel corpus examples that can be used as clues in the exercise.},
	booktitle    = {NEAL Proceedings of the 22nd Nordic Conference on Computational Linguistics (NoDaLiDa), September 30 – October 2, Turku, Finland},
	editor       = {Hartmann, Mareike and Plank, Barbara},
	series       = {Linköping Electronic Conference Proceedings},
	number       = {167},
	author       = {Alfter, David and Graën, Johannes},
	year         = {2019},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-995-8},
}

@inProceedings{alfter-etal-2019-larka-281344,
	title        = {Lärka: From Language Learning Platform to Infrastructure for Research on Language Learning},
	abstract     = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN. Lärka has recently received a new responsive user interface adapted to different devices with different screen sizes. Moreover, the system has also been augmented with new functionalities. These recent additions aim at improving the usability and the usefulness of the platform for pedagogical purposes. The most important development, though, is the adaptation of the platform to serve as a component in an e-infrastructure supporting research on language learning and multilingualism. Thanks to Lärka’s service-oriented architecture, most functionalities are also available as web services which can be easily re-used by other applications.},
	booktitle    = {Linköping Electronic Conference Proceedings},
	author       = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2019},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-034-3},
}

@article{agebjorn-alfter-2019-review-281196,
	title        = {Review of Advanced Proficiency and Exceptional Ability in Second Languages},
	journal      = {Linguist List},
	author       = {Agebjörn, Anders and Alfter, David},
	year         = {2019},
	number       = {Jan 16},
}

@inProceedings{alfter-volodina-2018-whole-275362,
	title        = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions},
	abstract     = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
	booktitle    = {Proceedings of SLTC 2018, Stockholm, October 7–9, 2018},
	author       = {Alfter, David and Volodina, Elena},
	year         = {2018},
}

@inProceedings{alfter-volodina-2018-towards-275368,
	title        = {Towards Single Word Lexical Complexity Prediction},
	abstract     = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
	booktitle    = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
	author       = {Alfter, David and Volodina, Elena},
	year         = {2018},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA, USA},
	ISBN         = {978-1-948087-11-7},
}

@proceedings{pilan-etal-2018-proceedings-275358,
	title        = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018},
	abstract     = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other.

The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
	editor       = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
	year         = {2018},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-173-9},
}

@inProceedings{alfter-pilan-2018-complex-276407,
	title        = {{SB@GU} at the Complex Word Identification 2018 Shared Task},
	booktitle    = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
	author       = {Alfter, David and Pilán, Ildikó},
	year         = {2018},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA, USA},
	ISBN         = {978-1-948087-11-7},
}

@inProceedings{alfter-etal-2018-from-275364,
	title        = {From Language Learning Platform to Infrastructure for Research on Language Learning},
	abstract     = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
	booktitle    = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
	author       = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2018},
}

@article{alfter-agebjorn-2017-review-253359,
	title        = {Review of Developing, Modelling and Assessing Second Languages},
	journal      = {Linguist List},
	author       = {Alfter, David and Agebjörn, Anders},
	year         = {2017},
}

@inProceedings{volodina-etal-2016-classification-246346,
	title        = {Classification of {Swedish} learner essays by {CEFR} levels},
	abstract     = {The paper describes initial efforts on creating a system for the automatic assessment of Swedish second language (L2) learner essays from two points of view: holistic evaluation of the reached level according to the Common European Framework of Reference (CEFR), and the lexical analysis of texts for receptive and productive vocabulary per CEFR level. We describe the data and resources that our experiments were based on, provide a short introduction to the algorithm for essay classification and experiment results, present the user interface we developed for testing new essays and outline future work.},
	booktitle    = {Proceedings of EuroCALL 2016, 24–27 August 2016, Cyprus},
	author       = {Volodina, Elena and Pilán, Ildikó and Alfter, David},
	year         = {2016},
	publisher    = {Research-publishing.net},
	ISBN         = {978-1-908416-44-5},
}

@inproceedings{alfter-volodina-2016-modeling-246347,
	author       = {Alfter, David and Volodina, Elena},
	title        = {Modeling Individual Learner Knowledge in a Computer Assisted Language Learning System},
	booktitle    = {Proceedings of the Sixth Swedish Language Technology Conference. Umeå University, 17-18 November, 2016},
	year         = {2016},
}

@inProceedings{pilan-etal-2016-coursebook-246349,
	title        = {Coursebook texts as a helping hand for classifying linguistic complexity in language learners' writings},
	abstract     = {We bring together knowledge from two different types of language learning data, texts learners read and texts they write, to improve linguistic complexity classification in the latter. Linguistic complexity in the foreign and second language learning context can be expressed in terms of proficiency levels. We show that incorporating features capturing lexical complexity information from reading passages can boost significantly the machine learning based classification of learner-written texts into proficiency levels. With an F1 score of .8 our system rivals state-of-the-art results reported for other languages for this task. Finally, we present a freely available web-based tool for proficiency level classification and lexical complexity visualization for both learner writings and reading texts.},
	booktitle    = {Proceedings of the Workshop on Computational Linguistics for Linguistic Complexity},
	author       = {Pilán, Ildikó and Alfter, David and Volodina, Elena},
	year         = {2016},
	ISBN         = {978-4-87974-709-9},
}

@inProceedings{alfter-etal-2016-from-246345,
	title        = {From Distributions to Labels: A Lexical Proficiency Analysis using Learner Corpora},
	abstract     = {In this work we look at how information from second language learner essay corpora can be used for the evaluation of unseen learner essays. Using a corpus of learner essays which have been graded by well-trained human assessors using the CEFR scale, we extract a list of word distributions over CEFR levels. For the analysis of unseen essays, we want to map each word to a so-called target CEFR level using this word list. However, the task of mapping from a distribution to a single label is not trivial. We are also investigating how we can evaluate the mapping from distribution to label. We show that the distributional profile of words from the essays, informed with the essays’ levels, consistently overlaps with our frequency-based method, in the sense that words holding the same level of proficiency as predicted by our mapping tend to cluster together in a semantic space. In the absence of a gold standard, this information can be useful to see how often a word is associated with the same level in two different models. Also, in this case we have a similarity measure that can show which words are more central to a given level and which words are more peripheral.},
	booktitle    = {Linköping Electronic Conference Proceedings},
	author       = {Alfter, David and Bizzoni, Yuri and Agebjörn, Anders and Volodina, Elena and Pilán, Ildikó},
	year         = {2016},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-633-8},
}
Sidansvarig: sb-webb