% Conference paper from the HiC 2025 Huminfra Conference proceedings.
% Accents, quotation marks and dashes are written as LaTeX escapes so the
% entry is usable with classic 8-bit BibTeX as well as with Biber; braces in
% the title protect words that must keep their capitalisation under
% sentence-casing styles.
@inproceedings{broden-etal-2025-distant-356669,
  author    = {Brod{\'e}n, Daniel and Samuelsson, Lina and Alfter, David and Malmstedt, Johan},
  title     = {A Distant Technology? {Experiments} with a Generative Model for Retouching Noisy Newspaper {OCR}},
  booktitle = {{HiC} 2025: Huminfra Conference 12--13 November, 2025 Stockholm, Sweden},
  year      = {2025},
  pages     = {1--7},
  abstract  = {This paper explores the use of generative language models to enhance digitized historical newspaper
               text. While large language models offer new means of addressing noisy OCR, their opaque,
               probabilistic processes raise epistemological concerns. Within the project The Order of Criticism
               Revisited, which integrates literary and computational approaches to Swedish criticism, we tested
               GPT-4o to ``retouch'' OCR data from the National Library of Sweden using zero-shot prompting.
               Comparisons with flawed OCR outputs and manually transcribed texts show that the model
               produced more legible versions, often closer to the originals than the raw OCR. This indicates
               potential for improving the quality of digitized sources and enabling more robust large-scale
               analysis. At the same time, drawing on the notions of artificial communication and distant
               technology, we argue that such models extend analytical capacity while creating perceptual and
               methodological distance. Their outputs, better seen as probabilistic ``retouching'' than correction or
               reconstruction, weaken the indexical link to original sources.},
}