BibTeX

@comment{
  Workshop paper from RESOURCEFUL-2025. The citation key is kept as exported
  so existing \cite commands keep working. Editors, event date and venue were
  moved out of booktitle into their own fields; eventdate and venue are
  biblatex fields and are silently ignored by classic BibTeX styles.
}
@inproceedings{ilinykh-szawerna-2025-i-349004,
	author       = {Ilinykh, Nikolai and Szawerna, Maria Irena},
	title        = {``{I} Need More Context and an {English} Translation'': Analysing How {LLMs} Identify Personal Information in {Komi}, {Polish}, and {English}},
	booktitle    = {Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains ({RESOURCEFUL}-2025)},
	editor       = {Arhar Holdt, {\v{S}}pela and Ilinykh, Nikolai and Scalvini, Barbara and Bruton, Micaella and Debess, Iben Nyholm and Tudor, Crina Madalina},
	eventdate    = {2025-03-02},
	venue        = {Tallinn, Estonia},
	year         = {2025},
	pages        = {165--178},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	isbn         = {978-9908-53-121-2},
	abstract     = {In this paper we present a pilot study and a qualitative analysis of the errors made by three large language models (LLMs) prompted to identify personal information (PI) in texts written in languages with varying resource availability: Komi (extremely low), Polish (medium), and English (high). Our analysis shows that LLMs perform better in detection of PI when provided with JSON-eliciting prompts. We also conjecture that the rich morphology and inflectionality of languages like Komi and Polish might affect the models' performance. The small-scale parallel dataset of text that we introduce here can be used as a starting point in developing benchmarks for evaluation of PI detection with longer textual contexts and LLMs.},
}
Sidansvarig: sb-webb