% Workshop paper in the RESOURCEFUL-2025 proceedings. The workshop venue
% (Tallinn) is recorded in booktitle; address is the publisher's city (Tartu).
% NOTE(review): editor surname splits (in particular the compound surname
% "Arhar Holdt") are inferred from the name order -- confirm against the
% proceedings front matter.
@inproceedings{ilinykh-szawerna-2025-i-349004,
  author    = {Ilinykh, Nikolai and Szawerna, Maria Irena},
  title     = {``{I} Need More Context and an {English} Translation'': Analysing How {LLMs} Identify Personal Information in {Komi}, {Polish}, and {English}},
  booktitle = {Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains ({RESOURCEFUL-2025}), March 2, 2025, Tallinn, Estonia},
  editor    = {Arhar Holdt, {\v{S}}pela and Ilinykh, Nikolai and Scalvini, Barbara and Bruton, Micaella and Debess, Iben Nyholm and Tudor, Crina Madalina},
  year      = {2025},
  pages     = {165--178},
  publisher = {University of Tartu Library},
  address   = {Tartu, Estonia},
  isbn      = {978-9908-53-121-2},
  abstract  = {In this paper we present a pilot study and a qualitative analysis of the errors made by three large language models (LLMs) prompted to identify personal information (PI) in texts written in languages with varying resource availability: Komi (extremely low), Polish (medium), and English (high). Our analysis shows that LLMs perform better in detection of PI when provided with JSON-eliciting prompts. We also conjecture that the rich morphology and inflectionality of languages like Komi and Polish might affect the models' performance. The small-scale parallel dataset of text that we introduce here can be used as a starting point in developing benchmarks for evaluation of PI detection with longer textual contexts and LLMs.},
}