% Publication list, 2024. One entry per work, one field per line.
% Conventions: lowercase entry types and field names, brace-delimited values,
% page ranges written with a double hyphen, acronyms and proper nouns in
% titles protected with braces so sentence-casing styles keep their capitals.
% Citation keys are left exactly as they were so existing citations resolve.

% Same work as key periti-etal-2024-automatically-343018 further down
% (different numeric id suffix); both keys are kept for backward compatibility.
@inproceedings{periti-etal-2024-automatically-343719,
  author    = {Periti, Francesco and Alfter, David and Tahmasebi, Nina},
  title     = {Automatically Generated Definitions and their utility for Modeling Word Meaning},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  pages     = {14008--14026},
  abstract  = {Modeling lexical semantics is a challenging task, often suffering from interpretability pitfalls. In this paper, we delve into the generation of dictionary-like sense definitions and explore their utility for modeling word meaning. We fine-tuned two Llama models and include an existing T5-based model in our evaluation. Firstly, we evaluate the quality of the generated definitions on existing English benchmarks, setting new state-of-the-art results for the Definition Generation task. Next, we explore the use of definitions generated by our models as intermediate representations subsequently encoded as sentence embeddings. We evaluate this approach on lexical semantics tasks such as the Word-in-Context, Word Sense Induction, and Lexical Semantic Change, setting new state-of-the-art results in all three tasks when compared to unsupervised baselines.},
}

% Same work as key schlechtweg-etal-2024-more-343019 further down
% (different numeric id suffix); both keys are kept for backward compatibility.
@inproceedings{schlechtweg-etal-2024-more-343718,
  author    = {Schlechtweg, Dominik and Cassotti, Pierluigi and Noble, Bill and Alfter, David and Schulte Im Walde, Sabine and Tahmasebi, Nina},
  title     = {More {DWUGs}: Extending and Evaluating Word Usage Graph Datasets in Multiple Languages},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  pages     = {14379--14393},
  abstract  = {Word Usage Graphs (WUGs) represent human semantic proximity judgments for pairs of word uses in a weighted graph, which can be clustered to infer word sense clusters from simple pairwise word use judgments, avoiding the need for word sense definitions. SemEval-2020 Task 1 provided the first and to date largest manually annotated, diachronic WUG dataset. In this paper, we check the robustness and correctness of the annotations by continuing the SemEval annotation algorithm for two more rounds and comparing against an established annotation paradigm. Further, we test the reproducibility by resampling a new, smaller set of word uses from the SemEval source corpora and annotating them. Our work contributes to a better understanding of the problems and opportunities of the WUG annotation paradigm and points to future improvements.},
}

% Book chapter. The first author is written with the compound surname
% "Lindström Tiedemann" to agree with the CALD-pseudo proceedings entry in
% this file, which lists the same person that way.
% NOTE(review): the incollection type requires a publisher field and none is
% recorded here; add it once confirmed against the book.
@incollection{tiedemann-etal-2024-multiword-343530,
  author    = {Lindström Tiedemann, Therese and Alfter, David and Ali Mohammed, Yousuf and Piipponen, Daniela and Silén, Beatrice and Volodina, Elena},
  title     = {Multiword expressions in {Swedish} as a second language: Taxonomy, annotation, and initial results},
  booktitle = {Multiword Expressions in Lexical Resources: Linguistic, Lexicographic, and Computational Perspectives},
  year      = {2024},
  pages     = {309--348},
  isbn      = {9783961104703},
  abstract  = {This chapter introduces part of the Swedish L2 profiles, a new resource for Swedish as a second language. Multiword expressions (MWEs) in this resource are based on knowledge-based automatic annotation of MWEs, which we show works quite well for Swedish. In contrast, manual annotation of the compositionality of each MWE proved difficult, probably due to different interpretations of "compositionality" by the two annotators. We show that experts and non-experts can rank MWEs very similarly according to relative receptive difficulty, with particularly high agreement for the easiest items. A qualitative comparison of the proficiency levels associated with the MWEs based on coursebook occurrences and the results from crowdsourcing and direct ranking indicate that MWEs which appear in few books of the same level are more likely to be difficult to associate with an appropriate level based on coursebook corpus data. Furthermore, results show that compositionality and/or transparency might influence the relative ranking. Finally, there is a clear increase in MWE lemmas at higher proficiency levels at the group level, and at the highest level receptive and productive data include the same percentage of MWEs.},
}

% Duplicate of key schlechtweg-etal-2024-more-343718 above (same work);
% kept so that citations using this key continue to resolve.
@inproceedings{schlechtweg-etal-2024-more-343019,
  author    = {Schlechtweg, Dominik and Cassotti, Pierluigi and Noble, Bill and Alfter, David and Schulte Im Walde, Sabine and Tahmasebi, Nina},
  title     = {More {DWUGs}: Extending and Evaluating Word Usage Graph Datasets in Multiple Languages},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  address   = {Miami, Florida, USA},
  pages     = {14379--14393},
  abstract  = {Word Usage Graphs (WUGs) represent human semantic proximity judgments for pairs of word uses in a weighted graph, which can be clustered to infer word sense clusters from simple pairwise word use judgments, avoiding the need for word sense definitions. SemEval-2020 Task 1 provided the first and to date largest manually annotated, diachronic WUG dataset. In this paper, we check the robustness and correctness of the annotations by continuing the SemEval annotation algorithm for two more rounds and comparing against an established annotation paradigm. Further, we test the reproducibility by resampling a new, smaller set of word uses from the SemEval source corpora and annotating them. Our work contributes to a better understanding of the problems and opportunities of the WUG annotation paradigm and points to future improvements.},
}

% Duplicate of key periti-etal-2024-automatically-343719 above (same work);
% kept so that citations using this key continue to resolve.
@inproceedings{periti-etal-2024-automatically-343018,
  author    = {Periti, Francesco and Alfter, David and Tahmasebi, Nina},
  title     = {Automatically Generated Definitions and their utility for Modeling Word Meaning},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  address   = {Miami, Florida, USA},
  pages     = {14008--14026},
  abstract  = {Modeling lexical semantics is a challenging task, often suffering from interpretability pitfalls. In this paper, we delve into the generation of dictionary-like sense definitions and explore their utility for modeling word meaning. We fine-tuned two Llama models and include an existing T5-based model in our evaluation. Firstly, we evaluate the quality of the generated definitions on existing English benchmarks, setting new state-of-the-art results for the Definition Generation task. Next, we explore the use of definitions generated by our models as intermediate representations subsequently encoded as sentence embeddings. We evaluate this approach on lexical semantics tasks such as the Word-in-Context, Word Sense Induction, and Lexical Semantic Change, setting new state-of-the-art results in all three tasks when compared to unsupervised baselines.},
}

@inproceedings{munozsanchez-etal-2024-jingle-342259,
  author    = {Muñoz Sánchez, Ricardo and Alfter, David and Dobnik, Simon and Szawerna, Maria Irena and Volodina, Elena},
  title     = {Jingle {BERT}, Jingle {BERT}, Frozen All the Way: Freezing Layers to Identify {CEFR} Levels of Second Language Learners Using {BERT}},
  booktitle = {Proceedings of the 13th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2024)},
  year      = {2024},
  publisher = {Linköping Electronic Conference Proceedings},
  isbn      = {978-91-8075-774-4},
  abstract  = {In this paper, we investigate the question of how much domain adaptation is needed for the task of automatic essay assessment by freezing layers in BERT models. We test our methodology on three different graded language corpora (English, French and Swedish) and find that partially fine-tuning base models improves performance over fully fine-tuning base models, although the number of layers to freeze differs by language. We also look at the effect of freezing layers on different grades in the corpora and find that different layers are important for different grade levels. Finally, our results represent a new state-of-the-art in automatic essay classification for the three languages under investigation.},
}

@inproceedings{alfter-2024-complexity-341312,
  author    = {Alfter, David},
  title     = {Complexity and Indecision: A Proof-of-Concept Exploration of Lexical Complexity and Lexical Semantic Change},
  booktitle = {Proceedings of the 5th Workshop on Computational Approaches to Historical Language Change, August 15, 2024, Bangkok, Thailand},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  isbn      = {979-8-89176-138-4},
}

% Whole proceedings volume: typed as proceedings, with the listed people
% recorded as editors (presumably the volume editors -- confirm).
@proceedings{volodina-etal-2024-proceedings-336386,
  editor    = {Volodina, Elena and Alfter, David and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Szawerna, Maria Irena and Vu, Xuan-Son},
  title     = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, Malta},
  year      = {2024},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {979-8-89176-085-1},
}

% Whole proceedings volume: typed as proceedings, with the listed people
% recorded as editors (presumably the volume editors -- confirm).
@proceedings{volodina-etal-2024-proceedings-335190,
  editor    = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna},
  title     = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden},
  year      = {2024},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-8075-512-2},
}

% NOTE(review): the given name "Ashely" below may be a misspelling of
% "Ashley" -- confirm against the published paper before changing it.
@inproceedings{fridlund-etal-2024-humanistic-335724,
  author    = {Fridlund, Mats and Alfter, David and Brodén, Daniel and Green, Ashely and Karimi, Aram and Lindhé, Cecilia},
  title     = {Humanistic {AI}: Towards a new field of interdisciplinary expertise and research},
  booktitle = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden},
  year      = {2024},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-8075-512-2},
  abstract  = {The Gothenburg Research Infrastructure in Digital Humanities (GRIDH) have participated in projects within various humanities fields that utilise as well as develop research tools and infrastructural resources that incorporate applications of ‘artificial intelligence’ (AI). These applications can include natural language processing, machine learning, computer vision, large language models, image recognition algorithms, classification, clustering, and deep learning. This paper advances the term ‘humanistic AI’ to describe an emergent form of interdisciplinary practice that uses and develops AI-based research applications to answer humanities research questions together with its entangled humanistic reflection. We coin this term to make implicit and visible the epistemological and material particularities of its practice and the new forms of knowledge its affordances make possible. The paper presents GRIDH projects within ‘humanistic AI’ together with its developed AI resources and applications.},
}