% Bibliography records for three 2024 works by Felix Morger.
% Layout: one field per line; order is author, title, venue, year, identifiers, abstract.
% Abstracts are stored verbatim.

% NOTE(review): classic BibTeX expects a publisher field for book entries; none is
% recorded here -- TODO add once confirmed. The abstract calls this work a compilation
% thesis, so a thesis entry type with a school field may be the more specific choice;
% confirm before changing the type.
@book{morger-2024-minds-346391,
  author   = {Morger, Felix},
  title    = {In the minds of stochastic parrots: Benchmarking, evaluating and interpreting large language models},
  year     = {2024},
  isbn     = {978-91-8069-944-0},
  abstract = {The arrival of large language models (LLMs) in recent years has changed the landscape of natural language processing (NLP). Their impressive performance on popular benchmarks, ability to solve a range of different tasks and their human-like linguistic interactional abilities, have prompted a debate into whether these are just "stochastic parrots" who are cleverly repeating what humans say without understanding its meaning or whether they are acquiring essential language capabilities, which would be an important stepping stone towards artificial general intelligence. To tackle this question, developing analysis methods to measure and understand the language capabilities of LLMs has become a defining challenge. These include developing benchmarks to reliably measure their performance as well and interpretability methods to gauge their inner-workings. This is especially relevant at a time when these models already are having a considerable impact on our society. An increasing amount users are affected by the technology and calls are made for transparent, regulated and thorough evaluation of AI. In these efforts, it is important to estimate the possibilities and limitations of these analysis methods since they will play an important role in holding technologies in AI accountable. In this compilation thesis, I expound on the components and processes involved in analyzing LLMs. The articles included in this compilation thesis use different approaches for analyzing LLMs, from introducing a multi-task benchmark Superlim for Swedish NLU to investigating LLMs' ability to predict language variation. To this effort I explore what the possibilities and limitations are of popular analysis methods and what implications these have for developing LLMs.
I argue that integrating explanatory approaches from empirical linguistic research is important to understand the role of both the data and the linguistic features used when analyzing LLMs. Doing so does not only help guide the development of LLMs, but also bring insights into linguistics.},
}

% NOTE(review): classic BibTeX expects an institution field for techreport entries;
% none is recorded here -- TODO add once confirmed.
% The ellipsis in the title is written as a braced macro so styles typeset it properly;
% product and language names are braced so sentence-casing styles keep their capitals.
@techreport{morger-2024-when-342179,
  author   = {Morger, Felix},
  title    = {When {Sparv} met {Superlim}{\ldots} A {Sparv} Plugin for Natural Language Understanding Analysis of {Swedish}},
  year     = {2024},
  abstract = {This technical report introduces Sparv-Superlim, a Sparv plugin for natural language understanding analysis of Swedish. It uses the reference models trained on the Superlim multi-task benchmark to add additional analyses to the Sparv Pipeline. I show how to install and configure the tool as well as apply it to analyze Swedish political manifestos to see if the predictions the plugin does align with known political positions of Swedish parties. These use cases shows that the reference models vary in their applicability to predict correct sentiments on novel data and illustrates the importance of integrating reference models trained on a multi-task benchmark like Superlim to evaluate the ecological validity of the benchmark.},
}

% Dataset name, language name and venue acronyms are braced to survive style recasing.
@inproceedings{morger-2024-swediagnostics-341148,
  author    = {Morger, Felix},
  title     = {{SweDiagnostics}: A Diagnostics Natural Language Inference Dataset for {Swedish}},
  booktitle = {17th Workshop on Building and Using Comparable Corpora, {BUCC} 2024 at {LREC-COLING} 2024 - Proceedings},
  year      = {2024},
  isbn      = {9782493814319},
  abstract  = {This paper presents SweDiagnostics, a natural language inference dataset for Swedish based on the GLUE Diagnostic dataset. It is the largest, manually corrected NLI dataset in Swedish to date and can be used to evaluate models on NLI in Swedish as well as estimate English-Swedish language transfer capabilities.
We present the dataset, the methodology used for translation, compare existing implementations and discuss limitations of the dataset, in particular those related to translationese.},
}