BibTeX

@article{hagberg-etal-2022-semi-314455,
	title        = {Semi-supervised learning with natural language processing for right ventricle classification in echocardiography—a scalable approach},
	abstract     = {We created a deep learning model, trained on text classified by natural language processing (NLP), to assess right ventricular (RV) size and function from echocardiographic images. We included 12,684 examinations with corresponding written reports for text classification. After manual annotation of 1489 reports, we trained an NLP model to classify the remaining 10,651 reports. A view classifier was developed to select the 4-chamber or RV-focused view from an echocardiographic examination (n = 539). The final models were two image classification models trained on the predicted labels from the combined manual annotation and NLP models and the corresponding echocardiographic view to assess RV function (training set n = 11,008) and size (training set n = 9951). The text classifier identified impaired RV function with 99% sensitivity and 98% specificity and RV enlargement with 98% sensitivity and 98% specificity. The view classification model identified the 4-chamber view with 92% accuracy and the RV-focused view with 73% accuracy. The image classification models identified impaired RV function with 93% sensitivity and 72% specificity and an enlarged RV with 80% sensitivity and 85% specificity; agreement with the written reports was substantial (both κ = 0.65). Our findings show that models for automatic image assessment can be trained to classify RV size and function by using model-annotated data from written echocardiography reports. This pipeline for auto-annotation of the echocardiographic images, using an NLP model with medical reports as input, can be used to train an image-assessment model without manual annotation of images and enables fast and inexpensive expansion of the training dataset when needed.},
	journal      = {Computers in Biology and Medicine},
	author       = {Hagberg, Eva and Hagerman, David and Johansson, Richard and Hosseini, N. and Liu, J. and Björnsson, E. and Alvén, Jennifer and Hjelmgren, Ola},
	year         = {2022},
	volume       = {143},
}

@inProceedings{hagstrom-johansson-2022-what-316251,
	title        = {What do Models Learn From Training on More Than Text? Measuring Visual Commonsense Knowledge},
	abstract     = {There are limitations in learning language from text alone. Therefore, recent focus has been on developing multimodal models. However, few benchmarks exist that can measure what language models learn about language from multimodal training. We hypothesize that training on a visual modality should improve on the visual commonsense knowledge in language models. Therefore, we introduce two evaluation tasks for measuring visual commonsense knowledge in language models (code publicly available at: github.com/lovhag/measure-visual-commonsense-knowledge) and use them to evaluate different multimodal models and unimodal baselines. Primarily, we find that the visual commonsense knowledge is not significantly different between the multimodal models and unimodal baseline models trained on visual text data.},
	booktitle    = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop, pp. 252–261, Dublin, Ireland},
	author       = {Hagström, Lovisa and Johansson, Richard},
	year         = {2022},
	publisher    = {Association for Computational Linguistics},
}

@inProceedings{daoud-etal-2022-conceptualizing-317410,
	title        = {Conceptualizing Treatment Leakage in Text-based Causal Inference},
	abstract     = {Causal inference methods that control for text-based confounders are becoming increasingly important in the social sciences and other disciplines where text is readily available. However, these methods rely on a critical assumption that there is no treatment leakage: that is, the text only contains information about the confounder and no information about treatment assignment. When this assumption does not hold, methods that control for text to adjust for confounders face the problem of post-treatment (collider) bias. However, the assumption that there is no treatment leakage may be unrealistic in real-world situations involving text, as human language is rich and flexible. Language appearing in a public policy document or health records may refer to the future and the past simultaneously, and thereby reveal information about the treatment assignment. In this article, we first define the treatment-leakage problem and discuss the identification and estimation challenges it raises. Second, we delineate the conditions under which leakage can be addressed by removing the treatment-related signal from the text in a pre-processing step we define as text distillation. Lastly, using simulation, we show how treatment leakage introduces a bias in estimates of the average treatment effect (ATE) and how text distillation can mitigate this bias.},
	booktitle    = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 5638–5645, Seattle, United States},
	author       = {Daoud, Adel and Jerzak, Connor T. and Johansson, Richard},
	year         = {2022},
	publisher    = {Association for Computational Linguistics},
}

@inProceedings{hagstrom-johansson-2022-small-318477,
	title        = {Can We Use Small Models to Investigate Multimodal Fusion Methods?},
	abstract     = {Many successful methods for fusing language with information from the visual modality have recently been proposed and the topic of multimodal training is ever evolving. However, it is still largely unknown what makes different vision-and-language models successful. Investigations into this are made difficult by the large sizes of the models used, requiring large training datasets and causing long training and compute times. Therefore, we propose the idea of studying multimodal fusion methods in a smaller setting with small models and datasets. In this setting, we can experiment with different approaches for fusing multimodal information with language in a controlled fashion, while allowing for fast experimentation. We illustrate this idea with the math arithmetics sandbox. This is a setting in which we fuse language with information from the math modality and strive to replicate some fusion methods from the vision-and-language domain. We find that some results for fusion methods from the larger domain translate to the math arithmetics sandbox, indicating a promising future avenue for multimodal model prototyping.},
	booktitle    = {Proceedings of the 2022 CLASP Conference on (Dis)embodiment, pp. 45–50, Gothenburg, Sweden},
	author       = {Hagström, Lovisa and Johansson, Richard},
	year         = {2022},
	publisher    = {Association for Computational Linguistics},
}

@inProceedings{hagstrom-johansson-2022-adapt-319269,
	title        = {How to Adapt Pre-trained Vision-and-Language Models to a Text-only Input?},
	abstract     = {Current language models have been criticised for learning language from text alone without connection between words and their meaning. Consequently, multimodal training has been proposed as a way for creating models with better language understanding by providing the lacking connection. We focus on pre-trained multimodal vision-and-language (VL) models for which there already are some results on their language understanding capabilities. An unresolved issue with evaluating the linguistic skills of these models, however, is that there is no established method for adapting them to text-only input without out-of-distribution uncertainty. To find the best approach, we investigate and compare seven possible methods for adapting three different pre-trained VL models to text-only input. Our evaluations on both GLUE and Visual Property Norms (VPN) show that care should be put into adapting VL models to zero-shot text-only tasks, while the models are less sensitive to how we adapt them to non-zero-shot tasks. We also find that the adaptation methods perform differently for different models and that unimodal model counterparts perform on par with the VL models regardless of adaptation, indicating that current VL models do not necessarily gain better language understanding from their multimodal training.},
	booktitle    = {Proceedings of the 29th International Conference on Computational Linguistics, pp. 5582–5596, Gyeongju, Republic of Korea},
	author       = {Hagström, Lovisa and Johansson, Richard},
	year         = {2022},
	publisher    = {International Committee on Computational Linguistics},
}