% Publications (2022). Conventions used in this file:
%   - one field per line, lowercase entry types, UTF-8 text;
%   - page ranges live in the pages field and use a double hyphen;
%   - for conference papers the conference location is kept at the end of
%     booktitle, while address is reserved for the publisher's city;
%   - editors of a collection go in the editor field, not in booktitle;
%   - a literal percent sign in a text field is written with a backslash.

% TODO(review): several given names below are only available as initials,
% and the article number / DOI is not recorded; complete from the publisher
% record when available.
@article{hagberg-etal-2022-semi-314455,
  author   = {Hagberg, Eva and Hagerman, David and Johansson, Richard and Hosseini, N. and Liu, J. and Björnsson, E. and Alvén, Jennifer and Hjelmgren, Ola},
  title    = {Semi-supervised learning with natural language processing for right ventricle classification in echocardiography—a scalable approach},
  journal  = {Computers in Biology and Medicine},
  year     = {2022},
  volume   = {143},
  abstract = {We created a deep learning model, trained on text classified by natural language processing (NLP), to assess right ventricular (RV) size and function from echocardiographic images. We included 12,684 examinations with corresponding written reports for text classification. After manual annotation of 1489 reports, we trained an NLP model to classify the remaining 10,651 reports. A view classifier was developed to select the 4-chamber or RV-focused view from an echocardiographic examination (n = 539). The final models were two image classification models trained on the predicted labels from the combined manual annotation and NLP models and the corresponding echocardiographic view to assess RV function (training set n = 11,008) and size (training set n = 9951). The text classifier identified impaired RV function with 99\% sensitivity and 98\% specificity and RV enlargement with 98\% sensitivity and 98\% specificity. The view classification model identified the 4-chamber view with 92\% accuracy and the RV-focused view with 73\% accuracy. The image classification models identified impaired RV function with 93\% sensitivity and 72\% specificity and an enlarged RV with 80\% sensitivity and 85\% specificity; agreement with the written reports was substantial (both κ = 0.65). Our findings show that models for automatic image assessment can be trained to classify RV size and function by using model-annotated data from written echocardiography reports. This pipeline for auto-annotation of the echocardiographic images, using a NLP model with medical reports as input, can be used to train an image-assessment model without manual annotation of images and enables fast and inexpensive expansion of the training dataset when needed.},
}

@inproceedings{hagstrom-johansson-2022-what-316251,
  author    = {Hagström, Lovisa and Johansson, Richard},
  title     = {What do Models Learn From Training on More Than Text? {Measuring} Visual Commonsense Knowledge},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop, Dublin, Ireland},
  year      = {2022},
  pages     = {252--261},
  publisher = {Association for Computational Linguistics},
  abstract  = {There are limitations in learning language from text alone. Therefore, recent focus has been on developing multimodal models. However, few benchmarks exist that can measure what language models learn about language from multimodal training. We hypothesize that training on a visual modality should improve on the visual commonsense knowledge in language models. Therefore, we introduce two evaluation tasks for measuring visual commonsense knowledge in language models (code publicly available at: github.com/lovhag/measure-visual-commonsense-knowledge) and use them to evaluate different multimodal models and unimodal baselines. Primarily, we find that the visual commonsense knowledge is not significantly different between the multimodal models and unimodal baseline models trained on visual text data.},
}

@inproceedings{daoud-etal-2022-conceptualizing-317410,
  author    = {Daoud, Adel and Jerzak, Connor T. and Johansson, Richard},
  title     = {Conceptualizing Treatment Leakage in Text-based Causal Inference},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Seattle, United States},
  year      = {2022},
  pages     = {5638--5645},
  publisher = {Association for Computational Linguistics},
  abstract  = {Causal inference methods that control for text-based confounders are becoming increasingly important in the social sciences and other disciplines where text is readily available. However, these methods rely on a critical assumption that there is no treatment leakage: that is, the text only contains information about the confounder and no information about treatment assignment. When this assumption does not hold, methods that control for text to adjust for confounders face the problem of post-treatment (collider) bias. However, the assumption that there is no treatment leakage may be unrealistic in real-world situations involving text, as human language is rich and flexible. Language appearing in a public policy document or health records may refer to the future and the past simultaneously, and thereby reveal information about the treatment assignment. In this article, we define the treatment-leakage problem, and discuss the identification as well as the estimation challenges it raises. Second, we delineate the conditions under which leakage can be addressed by removing the treatment-related signal from the text in a pre-processing step we define as text distillation. Lastly, using simulation, we show how treatment leakage introduces a bias in estimates of the average treatment effect (ATE) and how text distillation can mitigate this bias.},
}

@inproceedings{hagstrom-johansson-2022-small-318477,
  author    = {Hagström, Lovisa and Johansson, Richard},
  title     = {Can We Use Small Models to Investigate Multimodal Fusion Methods?},
  booktitle = {Proceedings of the 2022 {CLASP} Conference on (Dis)embodiment, Gothenburg, Sweden},
  year      = {2022},
  pages     = {45--50},
  publisher = {Association for Computational Linguistics},
  abstract  = {Many successful methods for fusing language with information from the visual modality have recently been proposed and the topic of multimodal training is ever evolving. However, it is still largely not known what makes different vision-and-language models successful. Investigations into this are made difficult by the large sizes of the models used, requiring large training datasets and causing long train and compute times. Therefore, we propose the idea of studying multimodal fusion methods in a smaller setting with small models and datasets. In this setting, we can experiment with different approaches for fusing multimodal information with language in a controlled fashion, while allowing for fast experimentation. We illustrate this idea with the math arithmetics sandbox. This is a setting in which we fuse language with information from the math modality and strive to replicate some fusion methods from the vision-and-language domain. We find that some results for fusion methods from the larger domain translate to the math arithmetics sandbox, indicating a promising future avenue for multimodal model prototyping.},
}

@inproceedings{hagstrom-johansson-2022-adapt-319269,
  author    = {Hagström, Lovisa and Johansson, Richard},
  title     = {How to Adapt Pre-trained Vision-and-Language Models to a Text-only Input?},
  booktitle = {Proceedings of the 29th International Conference on Computational Linguistics, Gyeongju, Republic of Korea},
  year      = {2022},
  pages     = {5582--5596},
  publisher = {International Committee on Computational Linguistics},
  abstract  = {Current language models have been criticised for learning language from text alone without connection between words and their meaning. Consequently, multimodal training has been proposed as a way for creating models with better language understanding by providing the lacking connection. We focus on pre-trained multimodal vision-and-language (VL) models for which there already are some results on their language understanding capabilities. An unresolved issue with evaluating the linguistic skills of these models, however, is that there is no established method for adapting them to text-only input without out-of-distribution uncertainty. To find the best approach, we investigate and compare seven possible methods for adapting three different pre-trained VL models to text-only input. Our evaluations on both GLUE and Visual Property Norms (VPN) show that care should be put into adapting VL models to zero-shot text-only tasks, while the models are less sensitive to how we adapt them to non-zero-shot tasks. We also find that the adaptation methods perform differently for different models and that unimodal model counterparts perform on par with the VL models regardless of adaptation, indicating that current VL models do not necessarily gain better language understanding from their multimodal training.},
}

@inproceedings{raj-etal-2022-cross-323886,
  author    = {Raj, Ria and Andréasson, Kajsa and Norlund, Tobias and Johansson, Richard and Lagerberg, Aron},
  title     = {Cross-modal Transfer Between Vision and Language for Protest Detection},
  booktitle = {Proceedings of the 5th Workshop on Challenges and Applications of Automated Extraction of Socio-political Events from Text ({CASE}), Abu Dhabi},
  year      = {2022},
  pages     = {56--60},
  publisher = {Association for Computational Linguistics},
  abstract  = {Most of today’s systems for socio-political event detection are text-based, while an increasing amount of information published on the web is multi-modal. We seek to bridge this gap by proposing a method that utilizes existing annotated unimodal data to perform event detection in another data modality, zero-shot. Specifically, we focus on protest detection in text and images, and show that a pretrained vision-and-language alignment model (CLIP) can be leveraged towards this end. In particular, our results suggest that annotated protest text data can act supplementarily for detecting protests in images, but significant transfer is demonstrated in the opposite direction as well.},
}

@inproceedings{malik-johansson-2022-controlling-323885,
  author    = {Malik, Manuj and Johansson, Richard},
  title     = {Controlling for Stereotypes in Multimodal Language Model Evaluation},
  booktitle = {Proceedings of the Fifth {BlackboxNLP} Workshop on Analyzing and Interpreting Neural Networks for {NLP}, Abu Dhabi},
  year      = {2022},
  pages     = {263--271},
  publisher = {Association for Computational Linguistics},
  abstract  = {We propose a methodology and design two benchmark sets for measuring to what extent language-and-vision language models use the visual signal in the presence or absence of stereotypes. The first benchmark is designed to test for stereotypical colors of common objects, while the second benchmark considers gender stereotypes. The key idea is to compare predictions when the image conforms to the stereotype to predictions when it does not. Our results show that there is significant variation among multimodal models: the recent Transformer-based FLAVA seems to be more sensitive to the choice of image and less affected by stereotypes than older CNN-based models such as VisualBERT and LXMERT. This effect is more discernible in this type of controlled setting than in traditional evaluations where we do not know whether the model relied on the stereotype or the visual signal.},
}

% NOTE(review): the publisher value below reads like a report-series name;
% confirm whether it belongs in a series field with the university as publisher.
@incollection{johansson-2022-coveting-326360,
  author    = {Johansson, Richard},
  title     = {Coveting Your Neighbor's Wife: Using Lexical Neighborhoods in Substitution-based Word Sense Disambiguation},
  booktitle = {LIVE and LEARN – Festschrift in honor of Lars Borin},
  editor    = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat},
  year      = {2022},
  pages     = {61--66},
  publisher = {GU-ISS Forskningsrapporter från Institutionen för svenska, flerspråkighet och språkteknologi},
  address   = {Göteborg},
  abstract  = {We explore a simple approach to word sense disambiguation for the case where a graph-structured lexicon of word sense identifiers is available, but no definitions or annotated training examples. The key idea is to consider the neighborhood in a lexical graph to generate a set of potential substitutes of the target word, which can then be compared to a set of substitutes suggested by a language model for a given context. We applied the proposed method to the SALDO lexicon for Swedish and used a BERT model to propose contextual substitutes. The system was evaluated on sense-annotated corpora, and despite its simplicity we see a strong improvement over previously proposed models for unsupervised SALDO-based word sense disambiguation.},
}