Skip to main content
Språkbanken Text is a part of Språkbanken.

BibTeX

@inproceedings{hagstrom-etal-2023-effect-331015,
	title        = {The Effect of Scaling, Retrieval Augmentation and Form on the Factual Consistency of Language Models},
	abstract     = {Large Language Models (LLMs) make natural interfaces to factual knowledge, but their usefulness is limited by their tendency to deliver inconsistent answers to semantically equivalent questions. For example, a model might supply the answer “Edinburgh” to “Anne Redpath passed away in X.” and “London” to “Anne Redpath’s life ended in X.” In this work, we identify potential causes of inconsistency and evaluate the effectiveness of two mitigation strategies: up-scaling and augmenting the LM with a passage retrieval database. Our results on the LLaMA and Atlas models show that both strategies reduce inconsistency but that retrieval augmentation is considerably more efficient. We further consider and disentangle the consistency contributions of different components of Atlas. For all LMs evaluated we find that syntactical form and task artifacts impact consistency. Taken together, our results provide a better understanding of the factors affecting the factual consistency of language models.},
	booktitle    = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
	pages        = {5457--5476},
	venue        = {Singapore},
	author       = {Hagstr{\"o}m, Lovisa and Saynova, Denitsa and Norlund, Tobias and Johansson, Moa and Johansson, Richard},
	year         = {2023},
	publisher    = {Association for Computational Linguistics},
}

@inproceedings{doostmohammadi-etal-2023-surface-327186,
	title        = {Surface-Based Retrieval Reduces Perplexity of Retrieval-Augmented Language Models},
	abstract     = {Augmenting language models with a retrieval mechanism has been shown to significantly improve their performance while keeping the number of parameters low. Retrieval-augmented models commonly rely on a semantic retrieval mechanism based on the similarity between dense representations of the query chunk and potential neighbors. In this paper, we study the state-of-the-art Retro model and observe that its performance gain is better explained by surface-level similarities, such as token overlap. Inspired by this, we replace the semantic retrieval in Retro with a surface-level method based on BM25, obtaining a significant reduction in perplexity. As full BM25 retrieval can be computationally costly for large datasets, we also apply it in a re-ranking scenario, gaining part of the perplexity reduction with minimal computational overhead.},
	booktitle    = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
	pages        = {521--529},
	venue        = {Toronto, Canada},
	author       = {Doostmohammadi, Ehsan and Norlund, Tobias and Kuhlmann, Marco and Johansson, Richard},
	year         = {2023},
	publisher    = {Association for Computational Linguistics},
}

@inproceedings{norlund-etal-2023-generalization-326357,
	title        = {On the Generalization Ability of Retrieval-Enhanced Transformers},
	abstract     = {Recent work on the Retrieval-Enhanced Transformer (RETRO) model has shown impressive results: off-loading memory from trainable weights to a retrieval database can significantly improve language modeling and match the performance of non-retrieval models that are an order of magnitude larger in size. It has been suggested that at least some of this performance gain is due to non-trivial generalization based on both model weights and retrieval. In this paper, we try to better understand the relative contributions of these two components. We find that the performance gains from retrieval to a very large extent originate from overlapping tokens between the database and the test data, suggesting less of non-trivial generalization than previously assumed. More generally, our results point to the challenges of evaluating the generalization of retrieval-augmented language models such as RETRO, as even limited token overlap may significantly decrease test-time loss. We release our code and model at https://github.com/TobiasNorlund/retro},
	booktitle    = {Findings of the Association for Computational Linguistics: {EACL} 2023},
	pages        = {1485--1493},
	venue        = {Dubrovnik, Croatia},
	author       = {Norlund, Tobias and Doostmohammadi, Ehsan and Johansson, Richard and Kuhlmann, Marco},
	year         = {2023},
	publisher    = {Association for Computational Linguistics},
}

@inproceedings{farahani-johansson-2023-empirical-326359,
	title        = {An Empirical Study of Multitask Learning to Improve Open Domain Dialogue Systems},
	abstract     = {Autoregressive models used to generate responses in open-domain dialogue systems often struggle to take long-term context into account and to maintain consistency over a dialogue. Previous research in open-domain dialogue generation has shown that the use of auxiliary tasks can introduce inductive biases that encourage the model to improve these qualities. However, most previous research has focused on encoder-only or encoder/decoder models, while the use of auxiliary tasks in encoder-only autoregressive models is under-explored. This paper describes an investigation where four different auxiliary tasks are added to small and medium-sized GPT-2 models fine-tuned on the PersonaChat and DailyDialog datasets. The results show that the introduction of the new auxiliary tasks leads to small but consistent improvement in evaluations of the investigated models.},
	booktitle    = {Proceedings of the 24th Nordic Conference on Computational Linguistics ({NoDaLiDa})},
	pages        = {347--357},
	venue        = {T{\'o}rshavn, Faroe Islands},
	author       = {Farahani, Mehrdad and Johansson, Richard},
	year         = {2023},
	publisher    = {University of Tartu Library},
}

@inproceedings{saynova-etal-2023-class-326358,
	title        = {Class Explanations: the Role of Domain-Specific Content and Stop Words},
	abstract     = {We address two understudied areas related to explainability for neural text models. First, class explanations. What features are descriptive across a class, rather than explaining single input instances? Second, the type of features that are used for providing explanations. Does the explanation involve the statistical pattern of word usage or the presence of domain-specific content words? Here, we present a method to extract both class explanations and strategies to differentiate between two types of explanations – domain-specific signals or statistical variations in frequencies of common words. We demonstrate our method using a case study in which we analyse transcripts of political debates in the Swedish Riksdag.},
	booktitle    = {Proceedings of the 24th Nordic Conference on Computational Linguistics ({NoDaLiDa})},
	pages        = {103--112},
	venue        = {T{\'o}rshavn, Faroe Islands},
	author       = {Saynova, Denitsa and Bruinsma, Bastiaan and Johansson, Moa and Johansson, Richard},
	year         = {2023},
	publisher    = {University of Tartu Library},
}