Hoppa till huvudinnehåll
Språkbanken Text är en avdelning inom Språkbanken.

BibTeX

% Workshop paper (NLP4CALL 2022): binary acceptability classification of Swedish learner sentences on the DaLAJ dataset.
@inproceedings{klezl-etal-2022-exploring-321958,
	author       = {Klezl, Julia and Ali Mohammed, Yousuf and Volodina, Elena},
	title        = {Exploring Linguistic Acceptability in {Swedish} Learners’ Language},
	booktitle    = {Proceedings of the 11th Workshop on Natural Language Processing for Computer-Assisted Language Learning ({NLP4CALL} 2022)},
	year         = {2022},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping, Sweden},
	isbn         = {978-91-7929-459-5},
	abstract     = {We present our initial experiments on binary classification of sentences into linguistically correct versus incorrect ones in Swedish using the DaLAJ dataset (Volodina et al., 2021a). The nature of the task is bordering on linguistic acceptability judgments, on the one hand, and on grammatical error detection task, on the other. The experiments include models trained with different input features and on different variations of the training, validation, and test splits. We also analyze the results focusing on different error types and errors made on different proficiency levels. Apart from insights into which features and approaches work well for this task, we present first benchmark results on this dataset. The implementation is based on a bidirectional LSTM network and pre-trained FastText embeddings, BERT embeddings, own word and character embeddings, as well as part-of-speech tags and dependency labels as input features. The best model used BERT embeddings and a training and validation set enriched with additional correct sentences. It reached an accuracy of 73\% on one of three test sets used in the evaluation. These promising results illustrate that the data and format of DaLAJ make a valuable new resource for research in acceptability judgements in Swedish.},
}

% Workshop paper (NLP4CALL 2022): the Swedish MuClaGED dataset for multi-class grammatical error detection.
@inproceedings{casademontmoner-volodina-2022-swedish-321955,
	author       = {Casademont Moner, Judit and Volodina, Elena},
	title        = {Swedish {MuClaGED}: A new dataset for Grammatical Error Detection in {Swedish}},
	booktitle    = {Proceedings of the 11th Workshop on Natural Language Processing for Computer-Assisted Language Learning ({NLP4CALL} 2022)},
	year         = {2022},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping, Sweden},
	isbn         = {978-91-7929-459-5},
	abstract     = {This paper introduces the Swedish MuClaGED dataset, a new dataset specifically built for the task of Multi-Class Grammatical Error Detection (GED). The dataset has been produced as a part of the multilingual Computational SLA shared task initiative. In this paper we elaborate on the generation process and the design choices made to obtain Swedish MuClaGED. We also show initial baseline results for the performance on the dataset in a task of Grammatical Error Detection and Classification on the sentence level, which have been obtained through (Bi)LSTM ((Bidirectional) Long-Short Term Memory) methods.},
}

% Journal article: crowdsourced ratings of Swedish single-word items, core versus peripheral vocabulary.
@article{volodina-etal-2022-crowdsourcing-336551,
	author       = {Volodina, Elena and Alfter, David and Lindström Tiedemann, Therese},
	title        = {Crowdsourcing ratings for single lexical items: a core vocabulary perspective},
	journal      = {Slovenščina 2.0: Empirical, Applied and Interdisciplinary Research},
	year         = {2022},
	volume       = {10},
	number       = {2},
	pages        = {5--61},
	abstract     = {In this study, we investigate theoretical and practical issues connected to differentiating between core and peripheral vocabulary at different levels of linguistic proficiency using statistical approaches combined with crowdsourcing. We also investigate whether crowdsourcing second language learners’ rankings can be used for assigning levels to unseen vocabulary. The study is performed on Swedish single-word items. The four hypotheses we examine are: (1) there is core vocabulary for each proficiency level, but this is only true until CEFR level B2 (upper-intermediate); (2) core vocabulary shows more systematicity in its behavior and usage, whereas peripheral items have more idiosyncratic behavior; (3) given that we have truly core items (aka anchor items) for each level, we can place any new unseen item in relation to the identified core items by using a series of comparative judgment tasks, this way assigning a “target” level for a previously unseen item; and (4) non-experts will perform on par with experts in a comparative judgment setting. The hypotheses have been largely confirmed: In relation to (1) and (2), our results show that there seems to be some systematicity in core vocabulary for early to mid-levels (A1-B1) while we find less systematicity for higher levels (B2-C1). In relation to (3), we suggest crowdsourcing word rankings using comparative judgment with known anchor words as a method to assign a “target” level to unseen words. With regard to (4), we confirm the previous findings that non-experts, in our case language learners, can be effectively used for the linguistic annotation tasks in a comparative judgment setting.},
}

% Chapter in the CLARIN Annual Conference 2021 selected-papers volume: the SweLL portal annotation management tool.
% The volume editors are held in the editor field rather than inside booktitle.
@incollection{alimohammed-etal-2022-annotation-321989,
	author       = {Ali Mohammed, Yousuf and Matsson, Arild and Volodina, Elena},
	title        = {Annotation Management Tool: A Requirement for Corpus Construction},
	booktitle    = {Selected Papers from the {CLARIN} Annual Conference 2021, Virtual Event, 2021, 27–29 September},
	editor       = {Monachini, Monica and Eskevich, Maria},
	year         = {2022},
	publisher    = {Linköping Electronic Conference},
	address      = {Linköping, Sweden},
	pages        = {101--108},
	isbn         = {978-91-7929-444-1},
	abstract     = {We present an annotation management tool, SweLL portal, that has been developed for the purposes of the SweLL infrastructure project for building a learner corpus of Swedish (Volodina et al., 2019). The SweLL portal has been used for supervised access to the database, data versioning, import and export of data and metadata, statistical overview, administration of annotation tasks, monitoring of annotation tasks and reliability controls. The development of the portal was driven by visions of longitudinal sustainable data storage and was partially shaped by situational needs reported by portal users, including project managers, researchers, and annotators.},
}

% Chapter in the CLARIN Annual Conference 2021 selected-papers volume: manual evaluation of automatic annotation on native and learner texts.
@incollection{volodina-etal-2022-reliability-321988,
	author       = {Volodina, Elena and Alfter, David and Lindström Tiedemann, Therese and Lauriala, Maisa and Piipponen, Daniala},
	title        = {Reliability of Automatic Linguistic Annotation: Native vs Non-native Texts},
	booktitle    = {Selected Papers from the {CLARIN} Annual Conference 2021, Virtual Event, 2021, 27–29 September},
	editor       = {Monachini, Monica and Eskevich, Maria},
	year         = {2022},
	publisher    = {Linköping Electronic Conference},
	address      = {Linköping, Sweden},
	pages        = {151--167},
	isbn         = {978-91-7929-444-1},
	abstract     = {We present the results of a manual evaluation of the performance of automatic linguistic annotation on three different datasets: (1) texts written by native speakers, (2) essays written by second language (L2) learners of Swedish in the original form and (3) the normalized versions of learner-written essays. The focus of the evaluation is on lemmatization, POS-tagging, word sense disambiguation, multi-word detection and dependency annotation. Two annotators manually went through the automatic annotation on a subset of the datasets and marked up all deviations based on their expert judgments and the guidelines provided. We report Inter-Annotator Agreement between the two annotators and accuracy for the linguistic annotation quality for the three datasets, by levels and linguistic features.},
}

% Whole proceedings volume of the NLP4CALL 2022 workshop; the listed people are recorded as the volume's editors.
@proceedings{alfter-etal-2022-proceedings-321964,
	editor       = {Alfter, David and Volodina, Elena and François, Thomas and Desmet, Piet and Cornillie, Frederik and Jönsson, Arne and Rennes, Evelina},
	title        = {Proceedings of the 11th Workshop on Natural Language Processing for Computer-Assisted Language Learning ({NLP4CALL} 2022)},
	year         = {2022},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping, Sweden},
	isbn         = {978-91-7929-460-1},
	abstract     = {The volume contains articles reviewed and presented at NLP4CALL workshop. The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.},
}

% Conference paper (Svenskan i Finland 19, in Swedish): CEFR levels and Swedish multi-word expressions.
% The volume editors are held in the editor field rather than inside booktitle.
@inproceedings{lindstromtiedemann-etal-2022-cefr-321899,
	author       = {Lindström Tiedemann, Therese and Alfter, David and Volodina, Elena},
	title        = {{CEFR}-nivåer och svenska flerordsuttryck},
	booktitle    = {Svenskan i Finland 19: föredrag vid den nittonde sammankomsten för beskrivningen av svenskan i Finland, Vasa den 6–7 maj 2021},
	editor       = {Björklund, Siv and Haagensen, Bodil and Nordman, Marianne and Westerlund, Anders},
	year         = {2022},
	publisher    = {Svensk-Österbottniska Samfundet},
	address      = {Vasa},
	isbn         = {978-952-69650-5-5},
	abstract     = {När vi lär oss ett nytt språk ska vi inte bara lära oss enstaka ord och hur vi använder dessa, utan vi måste också lära oss vilka ordkombinationer som är ”fasta uttryck” till betydelsen (t.ex. hälsa på någon) eller till formen (t.ex. lättare sagt än gjort) eller båda delarna (t.ex. huller om buller). Enligt en del studier kan dessa uttryck utgöra så mycket som 50 \% av vokabulären i ett språk som förstaspråk (L1) eller ännu mer (Jackendoff 1997; Erman 2007, 28). Men det är möjligt att de är vanligare i vardagligt språk och talspråk (Prentice \& Sköldberg 2013). Flerordsenheter kan vara problematiska för andraspråkstalare (Nesselhauf 2003, 223) till och med på avancerad nivå (jfr Pawley \& Syder 1983; Wray \& Perkins 2000; Nesselhauf 2003; Prentice 2010). Samtidigt är de en helt nödvändig del av språket (Nesselhauf 2003, 223) och kan utmärka andraspråkstalarna som icke-modersmålstalare (Pawley \& Syder 1983; Wray 2002). Flerordsuttryck är alltså en värdefull del av andraspråkskompetensen (se även Paquot 2019) och något som är viktigt att studera hur vi på bästa sätt introducerar för L2-talaren och om de kan kopplas till nivåer i bedömning. I den här studien presenterar vi resultat kring förståelsen av flerordsuttryck i svenska som andraspråk i relation till färdighetsnivåerna enligt Gemensam Europeisk Referensram för Språk (GERS eller CEFR, Common European Framework of Reference) (COE 2001; 2018; Skolverket 2009; Utbildningsstyrelsen 2018) genom crowdsourcing experiment.},
}

% Conference presentation (Learner Corpus Research 2022, Padua): the Swedish L2 profile exploration tool.
% The hosting university is held in organization; address carries only the place.
@inproceedings{volodina-etal-2022-swedish-321985,
	author       = {Volodina, Elena and Lindström Tiedemann, Therese and Ali Mohammed, Yousuf},
	title        = {Swedish {L2} profile – a tool for exploring {L2} data},
	booktitle    = {Learner Corpus Research conference, 22–24 September, Padua, Italy},
	year         = {2022},
	organization = {Università degli Studi di Padova},
	address      = {Padua, Italy},
	abstract     = {Learner corpus researchers, NLP researchers, as well as Digital Humanities and Social Sciences in general, rely on access to various data sets for empirical analysis, statistical insights, and/or for model building. However, interpretation of data is a non-trivial task and there is a need for data visualization tools. One such attempt is the Swedish L2 profile (SweL2P) – an ongoing project setting up the first digital tool allowing users to explore written Swedish learner language from a linguistic point of view.},
}

% Workshop paper (NAACL 2022 workshop on educational applications): rule-based synthetic verb-order errors for Swedish.
@inproceedings{casademontmoner-volodina-2022-generation-321987,
	author       = {Casademont Moner, Judit and Volodina, Elena},
	title        = {Generation of Synthetic Error Data of Verb Order Errors for {Swedish}},
	booktitle    = {{NAACL} workshop on Innovative Use of {NLP} for Building Educational Applications, July 15, 2022, Seattle, Washington},
	year         = {2022},
	publisher    = {Association for Computational Linguistics},
	address      = {Seattle, Washington},
	isbn         = {978-1-955917-83-4},
	abstract     = {We report on our work-in-progress to generate a synthetic error dataset for Swedish by replicating errors observed in the authentic error annotated dataset. We analyze a small subset of authentic errors, capture regular patterns based on parts of speech, and design a set of rules to corrupt new data. We explore the approach and identify its capabilities, advantages and limitations as a way to enrich the existing collection of error-annotated data. This work focuses on word order errors, specifically those involving the placement of finite verbs in a sentence.},
}

% Chapter in the Lars Borin Festschrift (ISBN 978-91-87850-83-7): the Swedish Word Family lexical resource.
% Editors, publisher and address follow this file's book-level entry for the same ISBN.
@incollection{volodina-etal-2022-lyxig-321974,
	author       = {Volodina, Elena and Ali Mohammed, Yousuf and Lindström Tiedemann, Therese},
	title        = {Lyxig språklig födelsedagspresent from the {Swedish Word Family}},
	booktitle    = {Live and Learn – Festschrift in honor of {Lars Borin}},
	editor       = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat},
	year         = {2022},
	publisher    = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet},
	address      = {Göteborg},
	isbn         = {978-91-87850-83-7},
	abstract     = {Morphology and lexical resources are known to be two of Lars Borin’s biggest research passions. We have, therefore, prepared a short description of a new kind of a lexical resource for Swedish, the Swedish Word Family. The resource is compiled based on learner corpora, and contains lexical items manually analyzed for derivational morphology.},
}

% Chapter in the Lars Borin Festschrift (ISBN 978-91-87850-83-7): weaknesses found in the Lärka platform and plans to address them.
% Editors follow this file's book-level entry for the same ISBN.
@incollection{volodina-alfter-2022-icall-321984,
	author       = {Volodina, Elena and Alfter, David},
	title        = {{ICALL}: Research versus reality check},
	booktitle    = {Live and Learn – Festschrift in honor of {Lars Borin}},
	editor       = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat},
	year         = {2022},
	publisher    = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet},
	address      = {Göteborg},
	pages        = {145--152},
	isbn         = {978-91-87850-83-7},
	abstract     = {Intelligent Computer-Assisted Language Learning has been one of Lars Borin’s research interests. The work on the Lärka language learning platform has started under his coordination. We see it our mission to make the platform live and prosperous, and through it to stimulate research into Swedish as a second language. Below, we name some weaknesses we have identified in Lärka while working with a course of beginner Swedish and outline our plans for tackling those.},
}

% Edited volume: Festschrift for Lars Borin. Typed as a book with an editor field, since edited_book is not a standard entry type.
@book{volodina-etal-2022-live-320415,
	editor       = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat},
	title        = {Live and Learn – Festschrift in honor of {Lars Borin}},
	year         = {2022},
	publisher    = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet},
	address      = {Göteborg},
	isbn         = {978-91-87850-83-7},
	abstract     = {This Festschrift has been compiled to honor Professor Lars Borin on his 65th anniversary. It consists of 30 articles which reflect a fraction of Lars’ scholarly interests within computational linguistics and related fields. They come from his friends and colleagues around the world and deal with topics that have been – in one way or another – inspired by his work. A common theme for the articles is the never-ending need to learn, which is alluded to in the title of the volume, Live and Learn.},
}