Hoppa till huvudinnehåll
Språkbanken Text är en avdelning inom Språkbanken.

BibTeX

% Workshop paper (CALD-pseudo 2024) on LLM-based PII detection in Swedish learner essays.
% Braces in the title protect the proper noun from style-driven lowercasing.
@inProceedings{szawerna-etal-2024-detecting-336385,
	title        = {Detecting Personal Identifiable Information in {Swedish} Learner Essays},
	abstract     = {Linguistic data can — and often does — contain PII (Personal Identifiable Information). Both from a legal and ethical standpoint, the sharing of such data is not permissible. According to the GDPR, pseudonymization, i.e. the replacement of sensitive information with surrogates, is an acceptable strategy for privacy preservation. While research has been conducted on the detection and replacement of sensitive data in Swedish medical data using Large Language Models (LLMs), it is unclear whether these models handle PII in less structured and more thematically varied texts equally well. In this paper, we present and discuss the performance of an LLM-based PII-detection system for Swedish learner essays.},
	booktitle    = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, St. Julian’s, Malta},
	author       = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	ISBN         = {979-8-89176-085-1},
}

% Workshop paper (NLP4CALL 2024) on freezing BERT layers for CEFR-level essay classification.
% Braces in the title protect acronyms from style-driven lowercasing.
% NOTE(review): the publisher value reads like a proceedings series name rather than a publisher — confirm.
@inProceedings{munozsanchez-etal-2024-jingle-342259,
	title        = {Jingle {BERT}, Jingle {BERT}, Frozen All the Way: Freezing Layers to Identify {CEFR} Levels of Second Language Learners Using {BERT}},
	abstract     = {In this paper, we investigate the question of how much domain adaptation is needed for the task of automatic essay assessment by freezing layers in BERT models. We test our methodology on three different graded language corpora (English, French and Swedish) and find that partially fine-tuning base models improves performance over fully fine-tuning base models, although the number of layers to freeze differs by language. We also look at the effect of freezing layers on different grades in the corpora and find that different layers are important for different grade levels. Finally, our results represent a new state-of-the-art in automatic essay classification for the three languages under investigation.},
	booktitle    = {Proceedings of the 13th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2024)},
	author       = {Muñoz Sánchez, Ricardo and Alfter, David and Dobnik, Simon and Szawerna, Maria Irena and Volodina, Elena},
	year         = {2024},
	publisher    = {Linköping Electronic Conference Proceedings},
	ISBN         = {978-91-8075-774-4},
}

% Student Research Workshop paper (EACL 2024) evaluating the Stanza POS tagger on historical Polish.
% Braces in the title protect the tool name and the language name from style-driven lowercasing.
@inProceedings{szawerna-2024-stanza-336413,
	title        = {Can {Stanza} be Used for Part-of-Speech Tagging Historical {Polish}?},
	abstract     = {The goal of this paper is to evaluate the performance of Stanza, a part-of-speech (POS) tagger developed for modern Polish, on historical text to assess its possible use for automating the annotation of other historical texts. While the issue of the reliability of utilizing POS taggers on historical data has been previously discussed, most of the research focuses on languages whose grammar differs from Polish, meaning that their results need not be fully applicable in this case. The evaluation of Stanza is conducted on two sets of 10286 and 3270 manually annotated tokens from a piece of historical Polish writing (1899), and the errors are analyzed qualitatively and quantitatively. The results show a good performance of the tagger, especially when it comes to Universal Part-of-Speech (UPOS) tags, which is promising for utilizing the tagger for automatic annotation in larger projects, and pinpoint some common features of misclassified tokens.},
	booktitle    = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop, March 21--22, 2024, St. Julian’s, Malta},
	author       = {Szawerna, Maria Irena},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	ISBN         = {979-8-89176-090-5},
}

% Workshop paper (CALD-pseudo 2024) on whether given names affect automated CEFR-level essay classification.
% Braces in the title keep the pronoun and the sentence-initial word after the question mark capitalised
% under sentence-casing styles. The booktitle is spelled identically to the other CALD-pseudo 2024 paper in this file.
@inProceedings{munozsanchez-etal-2024-names-336384,
	title        = {Did the Names {I} Used within My Essay Affect My Score? {Diagnosing} Name Biases in Automated Essay Scoring},
	abstract     = {Automated essay scoring (AES) of second-language learner essays is a high-stakes task as it can affect the job and educational opportunities a student may have access to. Thus, it becomes imperative to make sure that the essays are graded based on the students’ language proficiency as opposed to other reasons, such as personal names used in the text of the essay. Moreover, most of the research data for AES tends to contain personal identifiable information. Because of that, pseudonymization becomes an important tool to make sure that this data can be freely shared. Thus, our systems should not grade students based on which given names were used in the text of the essay, both for fairness and for privacy reasons. In this paper we explore how given names affect the CEFR level classification of essays of second language learners of Swedish. We use essays containing just one personal name and substitute it for names from lists of given names from four different ethnic origins, namely Swedish, Finnish, Anglo-American, and Arabic. We find that changing the names within the essays has no apparent effect on the classification task, regardless of whether a feature-based or a transformer-based model is used.},
	booktitle    = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, St. Julian’s, Malta},
	author       = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Szawerna, Maria Irena and Lindström Tiedemann, Therese and Volodina, Elena},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	ISBN         = {979-8-89176-085-1},
}

% Conference paper (LREC-COLING 2024) analysing PII tagsets for pseudonymization across domains.
% The booktitle gives the conference name once, followed by the dates and the venue.
@inProceedings{szawerna-etal-2024-pseudonymization-338089,
	title        = {Pseudonymization Categories across Domain Boundaries},
	abstract     = {Linguistic data, a component critical not only for research in a variety of fields but also for the development of various Natural Language Processing (NLP) applications, can contain personal information. As a result, its accessibility is limited, both from a legal and an ethical standpoint. One of the solutions is the pseudonymization of the data. Key stages of this process include the identification of sensitive elements and the generation of suitable surrogates in a way that the data is still useful for the intended task. Within this paper, we conduct an analysis of tagsets that have previously been utilized in anonymization and pseudonymization. We also investigate what kinds of Personally Identifiable Information (PII) appear in various domains. These reveal that none of the analyzed tagsets account for all of the PII types present cross-domain at the level of detailedness seemingly required for pseudonymization. We advocate for a universal system of tags for categorizing PIIs leading up to their replacement. Such categorization could facilitate the generation of grammatically, semantically, and sociolinguistically appropriate surrogates for the kinds of information that are considered sensitive in a given domain, resulting in a system that would enable dynamic pseudonymization while keeping the texts readable and useful for future research in various fields.},
	booktitle    = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), 20--25 May, 2024, Torino, Italia},
	author       = {Szawerna, Maria Irena and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Vu, Xuan-Son and Volodina, Elena},
	year         = {2024},
	publisher    = {ELRA and ICCL},
	ISBN         = {978-2-493814-10-4},
}

% Conference abstract (ICOS 28) on name biases in automated essay assessment.
% The abstract keeps its original paragraph and list layout; no publisher or ISBN is given for this entry.
@inProceedings{munozsanchez-etal-2024-name-339981,
	title        = {Name Biases in Automated Essay Assessment},
	abstract     = {Artificial intelligence is being deployed in high-stakes situations, such as automated grading of second language essays in proficiency assessment. While they can improve the opportunities students have (education, work opportunities, etc.), such systems often display human-like biases. Aldrin (2017) notes that human graders have a slight bias based on names appearing in essay texts. We aim to identify whether the same pattern holds in automated systems.
In this study we aim to answer the following research questions:
1) Does changing given names inside a second language learner essay affect the way the text is graded?
2) How much does this differ between feature-based machine learning and deep learning?

For this, we use a de-anonymized (i.e. original) version of the SweLL-pilot corpus of second language Swedish learner essays (Volodina 2016), which consists of 502 essays annotated with CEFR levels as our source data.
First, we compile four lists of given names inspired by those of Aldrin (2017): traditional Swedish names; modern Swedish names of Anglo-American origin; Finnish names (due to the close sociocultural links between both countries); and names of Arabic origin (the most prominent group of learners in the corpus).
Second, we create a diagnostic dataset to identify biases in the classification task. We select SweLL-pilot essays in which a given name appears only once. Then, we generate an essay version for each name on the lists by substituting the name in the original text with one from the list.
Third, we fine-tune a BERT (Devlin et al. 2019) model on the original SweLL-pilot data to predict the CEFR level of a given essay and compare it to an existing feature-based model (Pilan 2016).

Finally, we test the two models and compare the equality of opportunity between the different given name groups on the diagnostic dataset.},
	booktitle    = {The 28th International Congress of Onomastic Sciences (ICOS 28), 19--23 August, 2024, Helsinki, Finland},
	author       = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Lindström Tiedemann, Therese and Szawerna, Maria Irena and Volodina, Elena},
	year         = {2024},
}

% Whole proceedings volume of the CALD-pseudo 2024 workshop.
% Typed as a proceedings entry so styles do not sentence-case the title; the listed people are
% recorded as editors of the volume. The citation key is unchanged.
% The title is spelled identically to the booktitle used by the CALD-pseudo 2024 papers in this file.
@proceedings{volodina-etal-2024-proceedings-336386,
	title        = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, St. Julian’s, Malta},
	editor       = {Volodina, Elena and Alfter, David and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Szawerna, Maria Irena and Vu, Xuan-Son},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {979-8-89176-085-1},
}

% Workshop paper (MWE-UD at LREC-COLING 2024) on synthetic word-order errors for parsing learner Swedish.
% Braces in the title protect the language name from style-driven lowercasing.
% NOTE(review): the address field appears to repeat the conference venue rather than the publisher's location — confirm.
@inProceedings{masciolini-etal-2024-synthetic-338288,
	title        = {Synthetic-Error Augmented Parsing of {Swedish} as a Second Language: Experiments with Word Order},
	abstract     = {Ungrammatical text poses significant challenges for off-the-shelf dependency parsers. In this paper, we explore the effectiveness of using synthetic data to improve performance on essays written by learners of Swedish as a second language. Due to their relevance and ease of annotation, we restrict our initial experiments to word order errors. To do that, we build a corrupted version of the standard Swedish Universal Dependencies (UD) treebank Talbanken, mimicking the error patterns and frequency distributions observed in the Swedish Learner Language (SweLL) corpus. We then use the MaChAmp (Massive Choice, Ample tasks) toolkit to train an array of BERT-based dependency parsers, fine-tuning on different combinations of original and corrupted data. We evaluate the resulting models not only on their respective test sets but also, most importantly, on a smaller collection of sentence-correction pairs derived from SweLL. Results show small but significant performance improvements on the target domain, with minimal decline on normative data.},
	booktitle    = {Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024, May 25, 2024, Torino, Italia},
	author       = {Masciolini, Arianna and Francis, Emilie and Szawerna, Maria Irena},
	year         = {2024},
	publisher    = {ELRA and ICCL},
	address      = {Torino, Italy},
	ISBN         = {978-2-493814-20-3},
}