Hoppa till huvudinnehåll

BibTeX

% Workshop paper in the joint LEGAL2026 / CALD-pseudo 2026 proceedings (LREC 2026).
% Field order: authors, title, venue, publication data, identifiers, abstract.
@inproceedings{szawerna-dobnik-2026-birds-362843,
  author    = {Szawerna, Maria Irena and Dobnik, Simon},
  title     = {Birds of a Feather: Do Embedding Representations of Personal Information Flock Together?},
  booktitle = {Proceedings of the Joint Workshop on Legal and Ethical Issues in Human Language Technologies and Computational Approaches to Language Data Pseudonymization, Anonymization, De-identification, and Data Privacy (LEGAL2026 and CALD-pseudo 2026) @ LREC 2026},
  publisher = {ELRA},
  year      = {2026},
  pages     = {62--72},
  isbn      = {978-2-493814-86-9},
  abstract  = {Personally identifiable information (PII or PI) can appear in a wide variety of linguistic data, posing both ethical and legal challenges for conducting research and developing applications involving such texts. In this paper, we investigate the alignment between automatic clustering of FastText and Transformer embedding representations of personal information spans sourced from essays written by adult learners of Swedish as a second language and the general and detailed personal information labels assigned to these spans by expert annotators. Our goals are to assess the extent of overlap between the semantic categories and evaluate the semantic coherence of the human-assigned classes, which may have implications for de-identification procedures. We observe that while contextual embeddings, especially ones from a specialized word-in-context model, produce relatively good clustering results, they only partly map to the human understanding of how to classify personal information.},
}