Hoppa till huvudinnehåll

BibTeX

% Workshop paper in the joint LEGAL2026 / CALD-pseudo 2026 proceedings (held with LREC 2026).
% Publisher is spelled out in full so that it matches the other ELRA entry in this file.
@inproceedings{szawerna-dobnik-2026-birds-362843,
	author       = {Szawerna, Maria Irena and Dobnik, Simon},
	title        = {Birds of a Feather: Do Embedding Representations of Personal Information Flock Together?},
	booktitle    = {Proceedings of the Joint Workshop on Legal and Ethical Issues in Human Language Technologies and Computational Approaches to Language Data Pseudonymization, Anonymization, De-identification, and Data Privacy ({LEGAL2026} and {CALD-pseudo} 2026) @ {LREC} 2026},
	year         = {2026},
	publisher    = {European Language Resources Association (ELRA)},
	pages        = {62--72},
	isbn         = {978-2-493814-86-9},
	abstract     = {Personally identifiable information (PII or PI) can appear in a wide variety of linguistic data, posing both ethical and legal challenges for conducting research and developing applications involving such texts. In this paper, we investigate the alignment between automatic clustering of FastText and Transformer embedding representations of personal information spans sourced from essays written by adult learners of Swedish as a second language and the general and detailed personal information labels assigned to these spans by expert annotators. Our goals are to assess the extent of overlap between the semantic categories and evaluate the semantic coherence of the human-assigned classes, which may have implications for de-identification procedures. We observe that while contextual embeddings, especially ones from a specialized word-in-context model, produce relatively good clustering results, they only partly map to the human understanding of how to classify personal information.},
}

% Proceedings volume as a whole (typed as proceedings rather than misc).
% NOTE(review): the people below were originally listed under `author`; they are
% recorded as `editor` here because standard styles ignore `author` on a
% proceedings entry -- confirm that they are the volume editors.
@proceedings{siegert-etal-2026-proceedings-362841,
	editor       = {Siegert, Ingo and Szawerna, Maria Irena and Choukri, Khalid and Dobnik, Simon and Kamocki, Paweł and Lindström Tiedemann, Therese and Lison, Pierre and Muñoz Sánchez, Ricardo and Pilán, Ildikó and Södergård, Lisa and Talmoudi, Kossay and Volodina, Elena and Vu, Xuan-Son},
	title        = {Proceedings of the Joint Workshop on Legal and Ethical Issues in Human Language Technologies and Computational Approaches to Language Data Pseudonymization, Anonymization, De-identification, and Data Privacy ({LEGAL2026} and {CALD-pseudo} 2026) @ {LREC} 2026},
	year         = {2026},
	publisher    = {European Language Resources Association (ELRA)},
	isbn         = {978-2-493814-86-9},
}

% Paper in the LREC 2026 conference proceedings.
% Language names in the title are brace-protected so that sentence-casing
% bibliography styles do not lower-case them.
@inproceedings{szawerna-suchardt-2026-fill-362803,
	author       = {Szawerna, Maria Irena and Suchardt, Jacob Lee},
	title        = {Fill-in-the-Blanks: Automatic Generation and Evaluation of Language Models' Pseudonyms for {English} and {Swedish} Texts},
	booktitle    = {Proceedings of the Fifteenth Language Resources and Evaluation Conference ({LREC} 2026)},
	year         = {2026},
	publisher    = {European Language Resources Association (ELRA)},
	pages        = {1155--1169},
	isbn         = {978-2-493814-49-4},
	abstract     = {While considerable effort has gone into developing solutions for detecting Personally Identifiable Information (PII) in linguistic data, less research has gone into automating the generation of appropriate pseudonyms and developing evaluation methods, both relevant for the creation of privacy-friendly language resources. We conduct pilot experiments using Masked and Generative Large Language Models to generate predictions for redacted PII-spans in a cloze-like fashion for English legal texts and parallel news articles in Swedish and English. Furthermore, we explore metrics for automatic evaluation of the generated pseudonyms in the legal data, and investigate the effect of part-of-speech constraints on performance. For the parallel, multilingual data, we contribute our manual PII-annotation and conduct a fine-grained error analysis across two of our pseudonym generation methods and a baseline. Our results illustrate the complexity of pseudonym evaluation and the particular challenge of automatic, at-scale evaluation as well as the models’ tendency to predict prototypical and even stereotypical answers.},
}