@comment{BibTeX export}

@inProceedings{lindstromtiedemann-etal-2025-mormor-358255,
	title        = {Om mormor {Karl} sägs vara 27 år gammal, vad säger det om skribenten? {En} presentation om att identifiera och ersätta identifierande element i språkvetenskapliga forskningsdata},
	abstract     = {Utmaningarna med etik och dataskydd är gemensamma för många vetenskaper. Inom projektet Mormor Karl undersöker vi frågor relaterade till automatiskt stöd för pseudonymisering av forskningsdata (på svenska) men även effekterna av pseudonymisering med tanke på materialets användbarhet för forskning och påverkan på möjlig språkteknologisk användning som t.ex. automatisk bedömning. Vi hävdar att det behövs en diskussion inom just språkvetenskapen för att belysa de frågor som är centrala för vår disciplin. Språkvetare analyserar språk i relation till allt från regional och social variation, språkliga normer och normenlighet, maktutövning via språkliga medel, till språkinlärning och språkdidaktiska frågor. Är det möjligt att hitta en enda gemensam lösning för att förhindra identifiering? Hur detaljerat behöver vi beskriva hur vi behandlat våra data i fråga om anonymisering eller pseudonymisering för att veta vad som kan undersökas språkvetenskapligt i datan och således hur tillförlitliga resultaten i en studie är? Kan vi garantera att vi inte samtidigt riskerar att avslöja personerna som dolts? I vår presentation vill vi diskutera och illustrera varför pseudonymisering i allra högsta grad är en fråga som berör språkvetenskapen.},
	booktitle    = {Abstractsamling Svenskans beskrivning 40 (Svebe40), Workshop om Pseudonymisering inom språkvetenskap},
	author       = {Lindström Tiedemann, Therese and Södergård, Lisa and Volodina, Elena and Dobnik, Simon and Szawerna, Maria Irena and Muñoz Sánchez, Ricardo and Vu, Xuan-Son},
	year         = {2025},
	address      = {Stockholm University, Stockholm, Sweden},
}

@inProceedings{szawerna-etal-2025-annotating-355296,
	title        = {Annotating Personal Information in {Swedish} Texts with {SPARV}},
	abstract     = {Many kinds of language data run the risk of containing personal information (PI). sparv-sbx-pi-detection is a plugin for the text annotation pipeline SPARV, which enables personal information annotation at the same time as other kinds of annotation. (Im)Personal Data is a visualisation tool which allows researchers to estimate the extent of detected personal information in a sample of their data and how much of it would be manipulated if PI were to be removed or replaced.},
	booktitle    = {CLARIN Annual Conference 2025, 30 September -- 2 October, 2025, Vienna, Austria},
	author       = {Szawerna, Maria Irena and Alfter, David and Volodina, Elena},
	year         = {2025},
}

@inProceedings{szawerna-etal-2025-annotating-355832,
	title        = {Annotating Personal Information in {Swedish} Texts with {SPARV}},
	booktitle    = {Proceedings of the First Workshop on Natural Language Processing and Language Models for Digital Humanities},
	author       = {Szawerna, Maria Irena and Alfter, David and Volodina, Elena},
	year         = {2025},
	pages        = {155--163},
}

@inProceedings{masciolini-etal-2025-annotating-352761,
	title        = {Annotating Second Language in {Universal Dependencies}: a Review of Current Practices and Directions for Harmonized Guidelines},
	abstract     = {Universal Dependencies (UD) is gaining popularity as an annotation standard for second language (L2) material. Grammatical errors and other interlanguage phenomena, however, pose significant challenges that official guidelines only address in part. In this paper, we give an overview of current annotation practices and provide some suggestions for harmonizing guidelines for learner corpora.},
	booktitle    = {Proceedings of the Eighth Workshop on Universal Dependencies (UDW, SyntaxFest 2025)},
	author       = {Masciolini, Arianna and Berdicevskis, Aleksandrs and Szawerna, Maria Irena and Volodina, Elena},
	year         = {2025},
	publisher    = {Association for Computational Linguistics},
	address      = {Ljubljana, Slovenia},
	ISBN         = {979-8-89176-292-3},
	pages        = {153--163},
}

@inproceedings{volodina-etal-2025-towards-355295,
  author    = {Volodina, Elena and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Szawerna, Maria Irena and Södergård, Lisa and Vu, Xuan-Son},
  title     = {Towards shared standards for pseudonymization of research data},
  booktitle = {Proceedings of the Huminfra Conference (HiC 2025), Stockholm, 12-13 November 2025},
  year      = {2025},
  abstract  = {Pseudonymization has attracted a lot of attention recently due to legislation (e.g. the GDPR), the European
Guidelines on Pseudonymization, the increased need for high-quality ethical data for the training of large
language models as well as the desire to be able to share data with other researchers. This article introduces key concepts in pseudonymization, summarizes the half-way findings in the intradisciplinary research environment Mormor Karl, and proposes ways to unify and standardize the field of pseudonymization.},
}

@inProceedings{ilinykh-szawerna-2025-i-349004,
	title        = {“{I} Need More Context and an {English} Translation”: Analysing How {LLMs} identify Personal Information in {Komi}, {Polish}, and {English}},
	abstract     = {In this paper we present a pilot study and a qualitative analysis of the errors made by three large language models (LLMs) prompted to identify personal information (PI) in texts written in languages with varying resource availability: Komi (extremely low), Polish (medium), and English (high). Our analysis shows that LLMs perform better in detection of PI when provided with JSON-eliciting prompts. We also conjecture that the rich morphology and inflectionality of languages like Komi and Polish might affect the models’ performance. The small-scale parallel dataset of text that we introduce here can be used as a starting point in developing benchmarks for evaluation of PI detection with longer textual contexts and LLMs.},
	booktitle    = {Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025), March 2, 2025, Tallinn, Estonia},
	editor       = {Arhar Holdt, Špela and Ilinykh, Nikolai and Scalvini, Barbara and Bruton, Micaella and Debess, Iben Nyholm and Tudor, Crina Madalina},
	author       = {Ilinykh, Nikolai and Szawerna, Maria Irena},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	ISBN         = {978-9908-53-121-2},
	pages        = {165--178},
}

@inProceedings{szawerna-etal-2025-devils-348547,
	title        = {The Devil’s in the Details: the Detailedness of Classes Influences Personal Information Detection and Labeling},
	abstract     = {In this paper, we experiment with the effect of different levels of detailedness or granularity—understood as i) the number of classes, and ii) the classes’ semantic depth in the sense of hypernym and hyponym relations — of the annotation of Personally Identifiable Information (PII) on automatic detection and labeling of such information. We fine-tune a Swedish BERT model on a corpus of Swedish learner essays annotated with a total of six PII tagsets at varying levels of granularity. We also investigate whether the presence of grammatical and lexical correction annotation in the tokens and class prevalence have an effect on predictions. We observe that the fewer total categories there are, the better the overall results are, but having a more diverse annotation facilitates fewer misclassifications for tokens containing correction annotation. We also note that the classes’ internal diversity has an effect on labeling. We conclude from the results that while labeling based on the detailed annotation is difficult because of the number of classes, it is likely that models trained on such annotation rely more on the semantic content captured by contextual word embeddings rather than just the form of the tokens, making them more robust against nonstandard language.},
	booktitle    = {Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025), March 3–4, 2025, Tallinn, Estonia},
	editor       = {Johansson, Richard and Stymne, Sara},
	author       = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Volodina, Elena},
	year         = {2025},
	publisher    = {University of Tartu Library},
	address      = {Tartu, Estonia},
	ISBN         = {978-9908-53-109-0},
	pages        = {697--708},
}