% Workshop paper in the RESOURCEFUL 2026 proceedings. The volume editors are
% stored in the editor field (not in booktitle); address is the publisher's
% city, while the conference date and place are kept in booktitle.
@inproceedings{johansson-etal-2026-exploring-363172,
  author    = {Johansson, Martin and Waginder, Selma and Dann{\'e}lls, Dana},
  title     = {Exploring the similarities and differences between {VLM}-driven and traditional {OCR} for Historical {Swedish} Data},
  booktitle = {Proceedings of The Fourth Workshop on the Role of Resources in the Age of Large Language Models ({RESOURCEFUL} 2026), May 11, 2026, Palma de Mallorca, Spain},
  editor    = {Morger, Felix and Ilinykh, Nikolai and Scalvini, Barbara and Dobnik, Simon and Dann{\'e}lls, Dana},
  year      = {2026},
  publisher = {ELRA Language Resources Association (ELRA)},
  address   = {Paris},
  pages     = {193--199},
  isbn      = {978-2-493814-94-4},
  abstract  = {Recent Swedish OCR efforts rely primarily on traditional OCR methods, including deep CNN--LSTM hybrid neural networks and transformer-based models. Some approaches have also demonstrated the applicability of VLM-driven OCR to historical material. However, to date, no studies have examined in depth the performance of VLM-based OCR on historical Swedish sources. In this paper, we ask: How do transformers and VLMs differ in character- and word-level recognition performance across typefaces, and what qualitative differences can be observed in their error patterns? We show that fine-tuned versions of the Alibaba Cloud Qwen3-VL-8B-Instruct and Qwen3-VL-2B-Instruct, combined with a simple repetition-trimming step, outperform conventional OCR systems. Remaining errors are primarily attributable to challenges associated with the Blackletter typeface and formatting issues, such as missing or extra line breaks, characters, and spaces. Even when characters are correctly recognized, formatting inconsistencies can substantially increase transcription error rates.},
}