% The issuing body of a technical report belongs in `institution`; it was stored in `publisher`.
@techreport{kokkinakis-johanssonkokkinakis-1998-cascaded-56209,
  title = {A Cascaded Finite-State Parser for Syntactic Analysis of Swedish},
  author = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie},
  year = {1998},
  institution = {Svenska språket},
  address = {Göteborg},
}

% `volume` carries the bare number (no "No" prefix); the quoted subtitle uses TeX quote
% ligatures instead of straight double quotes.
@article{johanssonkokkinakis-kokkinakis-1999-beskrivning-55910,
  title = {Beskrivning av några problem vid automatisk analys av text},
  journal = {Från dataskärm och forskarpärm, ``Språkliga studier tillägnade Birgitta Ernby'', MISS, Göteborgs universitet},
  author = {Johansson Kokkinakis, Sofie and Kokkinakis, Dimitrios},
  year = {1999},
  volume = {25},
  pages = {88--95},
}

% Conference paper: the venue was stored in `journal` of an article entry; it is now `booktitle`.
@inproceedings{kokkinakis-johanssonkokkinakis-1999-cascaded-56216,
  title = {A Cascaded Finite-State Parser for Syntactic Analysis of Swedish},
  booktitle = {European Chapter of the Association of Computational Linguistics (EACL)},
  author = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie},
  year = {1999},
}

% Symposium proceedings paper: the venue was stored in `journal` of an article entry; it is now `booktitle`.
@inproceedings{kokkinakis-johanssonkokkinakis-1999-automatisk-56218,
  title = {Automatisk betydelseidentifiering på cykelnivå m.h.a. GLDB},
  booktitle = {Proceedings från NFL Symposiet (Nordisk Förening i Lexikografi) och Nordiska studier i Lexikografi},
  author = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie},
  year = {1999},
}

% NOTE(review): no `institution` is recorded for this report; supply it when known.
@techreport{kokkinakis-johanssonkokkinakis-1999-sense-56213,
  title = {Sense Tagging at the Cycle-Level Using GLDB},
  author = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie},
  year = {1999},
}

@misc{ellison-wilhelmsson-2001-implementation-249271,
  title = {En implementation för domänoberoende textkategorisering},
  author = {Ellison, Magnus and Wilhelmsson, Kenneth},
  year = {2001},
  publisher = {Datavetenskapligt program, Datalingvistikprogrammet},
  address = {Göteborg},
}

% The identifier is a LIBRIS catalogue id, not an ISBN, so it is kept in `note` rather than `isbn`.
% NOTE(review): no `publisher` is recorded for this book; supply it when known.
@book{kokkinakis-2001-framework-125224,
  title = {A Framework for the Acquisition of Lexical Knowledge; Description and Application},
  author = {Kokkinakis, Dimitrios},
  year = {2001},
  address = {Göteborg},
  note = {LIBRIS-ID: 8245865},
}

@inproceedings{lindh-2002-preliminary-47286,
  title = {Preliminary Observations on Discontinuities in Two TTS Concatenation Systems},
  booktitle = {Proceedings of Fonetik 2002, TMH-QPSR, KTH, Stockholm},
  author = {Lindh, Jonas},
  year = {2002},
  volume = {44(1)},
  pages = {113--116},
}

@inproceedings{kokkinakis-2004-reducing-33928,
  title = {Reducing the Effect of Name Explosion},
  abstract = {The problem of new vocabulary is particularly frustrating once one begins to work with large corpora of real texts. The identification of unknown proper nouns, chains of non-proper nouns and even common words that function as names (i.e. named entities) in unrestricted text, and their subsequent classification into some sort of semantic type is a challenging and difficult problem in Natural Language Processing (NLP).
Systems that perform Information Extraction, Information Retrieval, Question-Answering, Topic Detection, Text Mining, Machine Translation and annotation for the Semantic Web have highlighted the need for the automatic recognition of such entities, since their constant introduction in any domain, however narrow, is very common and needs special attention. Proper names are usually not listed in defining or other common types of dictionaries, they may appear in many alias forms and abbreviated variations, which makes their listing infeasible. This paper deals with some extensions to the traditional named entity recognition approaches. It puts emphasis on more name classes and their further subclassification into finer sets. An operative system that can be tested and evaluated on-line implements the ideas described in this paper.},
  booktitle = {Proceedings of the LREC Workshop: Beyond Named Entity Recognition, Semantic labelling for NLP tasks. Fourth Language Resources and Evaluation Conference (LREC)},
  author = {Kokkinakis, Dimitrios},
  year = {2004},
}

@inproceedings{kokkinakis-etal-2004-intelligent-33932,
  title = {Intelligent Building of Language Resources for HLT Applications},
  booktitle = {Proceedings of the LREC Workshop: Amazing Utility of Parallel and Comparable Corpora. Fourth Language Resources and Evaluation Conference (LREC)},
  author = {Kokkinakis, Dimitrios and Samiotou, Anna and Kranias, Lambros},
  year = {2004},
}

% The volume editors were embedded in `booktitle` (with a bare ampersand, which LaTeX rejects
% outside tables); they now live in `editor`.
@incollection{borin-prutz-2004-wine-33945,
  title = {New wine in old skins? A corpus investigation of L1 syntactic transfer in learner language},
  editor = {Aston, G. and Bernardini, S. and Stewart, D.},
  booktitle = {Corpora and language learners},
  author = {Borin, Lars and Prütz, Klas},
  year = {2004},
  publisher = {John Benjamins},
  address = {Amsterdam},
  isbn = {90-272-2288-6},
  pages = {67--87},
}

@inproceedings{lindh-2004-acoustic-47302,
  title = {Acoustic and Perceptual Analysis of Discontinuities in Two TTS Concatenation Systems},
  booktitle = {Proceedings of the XVIIth Swedish Phonetics Conference, Department of Linguistics, Stockholm University},
  author = {Lindh, Jonas},
  year = {2004},
}

% Straight double quotes replaced by TeX quote ligatures.
@inproceedings{lindh-2004-handling-47298,
  title = {Handling the ``Voiceprint'' Issue},
  booktitle = {Proceedings of the XVIIth Swedish Phonetics Conference},
  author = {Lindh, Jonas},
  year = {2004},
}

% The volume editor was embedded in `booktitle`; moved to `editor`.
@incollection{borin-2004-language-33976,
  title = {Language technology resources for less prevalent languages: Will the Münchhausen Model work?},
  editor = {Holmboe, H.},
  booktitle = {Nordisk sprogteknologi 2003. Nordic language technology. Årbog for Nordisk Sprogteknologisk Forskningsprogram 2000-2004},
  author = {Borin, Lars},
  year = {2004},
  publisher = {Museum Tusculanums Forlag},
  address = {København},
  isbn = {87-7289-997-2},
  pages = {71--82},
}

@inproceedings{lindh-2004-preliminary-47289,
  title = {Preliminary Observations on Speaker Identification in a Closed Set Using Graphic Representations of LTAS},
  booktitle = {Annual conference of IAFPA, Helsinki 2004},
  author = {Lindh, Jonas},
  year = {2004},
}

% The volume editor was embedded in `booktitle`; moved to `editor`.
@incollection{borin-saxena-2004-grammar-33944,
  title = {Grammar, incorporated},
  editor = {Henrichsen, P. J.},
  booktitle = {CALL for the Nordic languages},
  author = {Borin, Lars and Saxena, Anju},
  year = {2004},
  publisher = {Samfundslitteratur},
  address = {Frederiksberg},
  isbn = {87-593-1176-2},
  pages = {125--145},
}

@article{toporowskagronostaj-2005-elektroniska-34035,
  title = {Elektroniska ordböcker i Sverige: nutid och framtid},
  abstract = {The article Swedish electronic dictionaries: the present and the future aims at a survey of the state of art concerning electronic dictionaries in Sweden.
The focus is on CD-ROM dictionaries, mainly defining ones, and their functionalities of technical and conceptual character. It is assumed here that the content, structure and functionalities of the electronic dictionaries of today can make a relevant contribution to designing the truly electronic dictionaries of the future. Some proposals for extending the content of the next generation dictionaries are suggested. The question of using a more sophisticated structural and functional access to the information is also discussed.},
  journal = {LexicoNordica},
  author = {Toporowska Gronostaj, Maria},
  year = {2005},
  volume = {12},
  pages = {87--107},
}

@article{sandfordpedersen-etal-2005-sprogteknologiske-35156,
  title = {Sprogteknologiske ordbaser for nordiske sprog - rapport fra et forskning-netværk},
  journal = {Nordiske studiar i leksikografi},
  author = {Sandford Pedersen, Bolette and Fjeld, Ruth V. and Toporowska Gronostaj, Maria},
  year = {2005},
  volume = {7},
}

@inproceedings{lindh-2005-visual-47310,
  title = {Visual Acoustic vs. Aural Perceptual Speaker Identification in a Closed Set of Disguised Voices},
  booktitle = {Annual conference of IAFPA, in Marrakech 2005},
  author = {Lindh, Jonas},
  year = {2005},
}

% Journal paper (volume/number/pages in WSEAS Transactions): the entry type was inproceedings
% with the journal name in `booktitle`; it is now an article with `journal`.
@article{kokkinakis-2005-identification-33934,
  title = {Identification of Named Entities and Medical Terminology in Swedish Patient Records},
  abstract = {An anonymisation or de-identification system can provide a broad spectrum of services related to the growing demands for better forms of dissemination of information about individuals found in electronic patient records. The range of these services includes: health care statistics and sharing clinical information across institutions; validation and monitoring of new diagnostic tests; release of individual data by protecting identities or hints that can identify individuals, and appropriate mechanisms to provide only the information necessary to the professional who has the need to know. This paper describes our first experiments intended for automatic anonymisation of Swedish electronic patient records using a generic system for Named Entity Recognition. There are eight main types of entities that the system recognizes: person, location, organisation, event, object, work & art, time and measure. To this set, two new modules have been recently developed. One is dedicated to animacy recognition, a module based on a number of clues (such as key words utilized in the persons module grammar and verbs requiring animate subject), and another one designated to identify and annotate medical terminology. The latter module annotates names of drugs and chemical substances, diseases, symptoms, organisms and anatomical terms. A detailed evaluation of the system, on authentic patient records, is given both for the named, medical and animate entities.},
  journal = {WSEAS Transactions on BIOLOGY and BIOMEDICINE},
  author = {Kokkinakis, Dimitrios},
  year = {2005},
  volume = {2},
  number = {3},
  pages = {312--317},
}

@inproceedings{baud-etal-2005-interchanging-33867,
  title = {Interchanging lexical information for a multilingual dictionary},
  booktitle = {AMIA 2005 Proceedings},
  author = {Baud, Robert and Nyström, Mikael and Borin, Lars and Evans, Roger and Schulz, Stefan and Zweigenbaum, Pierre},
  year = {2005},
  pages = {31--35},
}

@article{borin-2005-mannen-33865,
  title = {Mannen är faderns mormor: Svenskt associationslexikon reinkarnerat},
  journal = {LexicoNordica},
  author = {Borin, Lars},
  year = {2005},
  volume = {12},
  pages = {39--54},
}

@inproceedings{lindh-2005-visual-47308,
  title = {Visual Acoustic vs.
Aural Perceptual Speaker Identification in a Closed Set of Disguised Voices},
  booktitle = {Proceedings of the XVIIIth Swedish Phonetics Conference, Department of Linguistics, Göteborg University},
  editor = {Lindh, Jonas and Eriksson, Anders},
  author = {Lindh, Jonas},
  year = {2005},
}

% The proceedings editors were appended to `booktitle` with a bare ampersand (a LaTeX error
% outside tables); they now live in `editor`. The entry just above received the same change.
@inproceedings{lindh-2005-model-47305,
  title = {A Model-Based Experiment Towards an Emotional Synthesis},
  booktitle = {Proceedings of the XVIIIth Swedish Phonetics Conference, Department of Linguistics, Göteborg University},
  editor = {Lindh, Jonas and Eriksson, Anders},
  author = {Lindh, Jonas},
  year = {2005},
}

% First author spelled with diacritics, as in the entry marko-etal-2006-towards-40540 of this file.
% NOTE(review): `volume` repeats the year and `booktitle` looks like a conference name without
% "Proceedings of" - confirm against the publication.
@inproceedings{marko-etal-2006-cross-34049,
  title = {Cross-Lingual Alignment of Medical Lexicons},
  abstract = {We present an approach for the creation of a multilingual medical dictionary for the biomedical domain. In a first step, available monolingual lexical resources are compiled into a common interchange format. Secondly, according to a linking format decided by the authors, the cross-lingual mappings of lexical entries are added. We show how these mappings can be generated using a morpho-semantic term normalization engine, which captures intra- as well as interlingual synonymy relationships on the level of subwords.},
  booktitle = {Language Resources and Evaluation},
  author = {Markó, Kornél and Baud, Robert and Zweigenbaum, Pierre and Merkel, Magnus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios and Schulz, Stefan},
  year = {2006},
  volume = {2006},
  pages = {5--8},
}

@inproceedings{kokkinakis-2006-developing-33925,
  title = {Developing Resources for Swedish Bio-Medical Text Mining},
  abstract = {Collection and annotation of corpora in specialized fields, such as medicine, and particularly for lesser-spoken languages, than for instance English, is an important enterprise for the continuous development and growth of language technology research, for resource development and for the implementation of practical applications for these languages. In this paper, we describe our ongoing efforts to build a large Swedish medical corpus, the MEDLEX Corpus, how we combine generic named entity and terminology recognition for the detailed annotation of the corpus, and how these annotations are further utilized by an annotations-aware cascaded finite-state parser.},
  booktitle = {Proceedings of the 2nd International Symposium on Semantic Mining in Biomedicine (SMBM)},
  author = {Kokkinakis, Dimitrios},
  year = {2006},
}

@inproceedings{kokkinakis-2006-collection-33937,
  title = {Collection, Encoding and Linguistic Processing of a Swedish Medical Corpus - The MEDLEX Experience},
  abstract = {Corpora annotated with structural and linguistic characteristics play a major role in nearly every area of language processing. During recent years a number of corpora and large data sets became known and available to research even in specialized fields such as medicine, but still however, targeted predominantly for the English language. This paper provides a description of the collection, encoding and linguistic processing of an ever growing Swedish medical corpus, the MEDLEX Corpus. MEDLEX consists of a variety of text-documents related to various medical text genres. The MEDLEX Corpus has been structurally annotated using the Corpus Encoding Standard for XML (XCES), lemmatized and automatically annotated with part-of-speech and semantic information (extended named entities and the Medical Subject Headings, MeSH, terminology). The results from the processing stages (part-of-speech, entities and terminology) have been merged into a single representation format and syntactically analysed using a cascaded finite state parser. Finally, the parsers results are converted into a tree structure that follows the TIGER-XML coding scheme, resulting a suitable for further exploration and fairly large Treebank of Swedish medical texts.
},
  booktitle = {Proceedings of the 5th Language Resources and Evaluation Conference (LREC)},
  author = {Kokkinakis, Dimitrios},
  year = {2006},
}

@inproceedings{olsson-2006-does-116086,
  title = {Does the ITG platform eXist?},
  abstract = {An overview of the ITG platform and its backend eXist (http://exist-db.org).},
  booktitle = {XML Prague 2006 (http://xmlprague.cz), 17–18 juni 2006, Prag},
  author = {Olsson, Leif-Jöran},
  year = {2006},
}

% Conference paper: the proceedings title was stored in `journal` of an article entry; it is
% now `booktitle`.
% NOTE(review): same title, author and venue as kokkinakis-2006-towards-34033 directly below;
% both keys are kept because either may be cited - consider merging.
@inproceedings{kokkinakis-2006-towards-45197,
  title = {Towards a Swedish Medical Treebank},
  abstract = {In this paper, we present our current activities towards the compilation and the multi-layered annotation of a domain-dependent corpus for Swedish in the area of medicine. The focus of the paper is based on the description of the constituent structure and functionally oriented annotation of the corpus. Moreover, the annotation scheme adopted, which incorporates three main layers of linguistic processing, lexical analysis, shallow semantic analysis and syntactic processing, will be exemplified. For the syntactic analysis we use a cascaded finite-state parser, aware of the shallow semantic annotations produced. The result of this analysis, including syntactic parsing and shallow semantic analysis, is transformed into the TIGER-XML interchange format. Our goal is to produce a large, rich in annotations, medical treebank suitable for both corpus-based grammar learning systems, for semantic relation extraction and for linguistic exploration of theoretical nature.},
  booktitle = {Proceedings of the 5th Conference on Treebanks and Linguistic Theories},
  author = {Kokkinakis, Dimitrios},
  year = {2006},
}

@inproceedings{kokkinakis-2006-towards-34033,
  title = {Towards a Swedish Medical Treebank},
  booktitle = {5th Conference on Treebanks and Linguistic Theories},
  author = {Kokkinakis, Dimitrios},
  year = {2006},
}

% The volume editors were embedded in `booktitle` with a bare ampersand (a LaTeX error outside
% tables); they now live in `editor`.
@incollection{borin-2006-supporting-33863,
  title = {Supporting lesser-known languages: The promise of language technology},
  editor = {Saxena, A. and Borin, L.},
  booktitle = {Lesser-known languages of South Asia. Status and policies, case studies and applications of information technology},
  author = {Borin, Lars},
  year = {2006},
  publisher = {Mouton de Gruyter},
  address = {Berlin},
  isbn = {3-11-018976-3},
  pages = {317--337},
}

@inproceedings{skoldberg-toporowskagronostaj-2006-swedish-33612,
  title = {Swedish Appellativized Forenames in Compounds - A Lexicographic Approach},
  booktitle = {Proceedings XII Euralex International Congress. Torino, Italia, September 6-9, 2006},
  author = {Sköldberg, Emma and Toporowska Gronostaj, Maria},
  year = {2006},
  volume = {2},
  isbn = {88-7694-918-6},
  pages = {1193--1199},
}

@incollection{borin-2006-sparv-44950,
  title = {Sparv i tranedansen eller fisken i vattnet? Språkteknologi och språklärande},
  booktitle = {Från vision till praktik: Språkutbildning och informationsteknik},
  author = {Borin, Lars},
  year = {2006},
  publisher = {NSHU - Myndigheten för nätverk och samarbete inom högre utbildning},
  address = {Härnösand},
  isbn = {978-91-975425-8-6},
  pages = {25--49},
}

@incollection{kokkinakis-etal-2006-bygga-56225,
  title = {Att bygga en språkbro mellan allmänhet och vårdpersonal - språket i texter om hjärt-kärlsjukdomar},
  booktitle = {Humanistdag-boken},
  author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria and Johansson Kokkinakis, Sofie},
  year = {2006},
  publisher = {Göteborgs universitet},
  address = {Göteborg},
}

@inproceedings{kokkinakis-dannells-2006-recognizing-33936,
  title = {Recognizing Acronyms and their Definitions in Swedish Medical Texts},
  abstract = {This paper addresses the task of recognizing acronym-definition pairs in Swedish (medical) texts as well as the compilation of a freely available sample of such manually annotated pairs. A material suitable not only for supervised learning experiments, but also as a testbed for the evaluation of the quality of future acronym-definition recognition systems.
There are a number of approaches to the identification described in the literature, particularly within the biomedical domain, but none of those addresses the variation and complexity exhibited in a language other than English. This is realized by the fact that we can have a mixture of two languages in the same document and/or sentence, i.e. Swedish and English; that Swedish is a compound language that significantly deteriorates the performance of previous approaches (without adaptations) and, most importantly, the fact that there is a large variation of possible acronym-definition permutations realized in the analysed corpora, a variation that is usually ignored in previous studies.},
  booktitle = {Proceedings of the 5th Language Resources and Evaluation Conference (LREC)},
  author = {Kokkinakis, Dimitrios and Dannélls, Dana},
  year = {2006},
}

% Ampersands in `booktitle` are escaped; a bare one is a LaTeX error outside tables.
@inproceedings{kokkinakis-toporowskagronostaj-2006-language-33938,
  title = {Lay Language versus Professional Language within the Cardiovascular Subdomain - a Contrastive Study},
  abstract = {This paper reports on a corpus-based, contrastive study of Swedish medical language. It is focused on the vocabulary used in two types of medical textual material: professional portals and web-based consumer sites within the domain of cardiovascular disorders. Linguistic, statistical and quantitatively based readability studies are considered in order to find the typical language-dependent and, possibly, language independent characteristics of the material examined and suggest concrete measures that might bridge the gap in medical vocabulary as used by laypersons/consumers and professionals.},
  booktitle = {Proceedings of the 2006 WSEAS Int. Conf. on Cellular \& Molecular Biology, Biophysics \& Bioengineering},
  author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria},
  year = {2006},
}

@article{nimb-etal-2006-leksikalisk-35157,
  title = {Leksikalisk beskrivelse af adverbiers semantik i norsk, svensk og dansk - LEXADV},
  journal = {Skrifter / Nordisk forening for leksikografi},
  author = {Nimb, Sanni and Fjeld, Ruth V. and Toporowska Gronostaj, Maria},
  year = {2006},
  volume = {9},
  pages = {301--314},
}

@inproceedings{marko-etal-2006-towards-40540,
  title = {Towards a multilingual medical lexicon},
  booktitle = {Proceedings of the American Medical Informatics Association Symposium (AMIA '06)},
  author = {Markó, Kornél and Baud, Robert and Zweigenbaum, Pierre and Borin, Lars and Merkel, Magnus and Schulz, Stefan},
  year = {2006},
  pages = {534--538},
}

@inproceedings{lindh-2006-preliminary-47314,
  title = {Preliminary Descriptive F0-statistics for Young Male Speakers},
  booktitle = {Papers from FONETIK 2006, Working Papers, 52, Department of Linguistics and Phonetics, Lund University},
  author = {Lindh, Jonas},
  year = {2006},
  volume = {52},
  pages = {89--92},
}

% The volume editor ("red" = Swedish for editor) was embedded in `booktitle`; moved to `editor`.
@incollection{borin-2006-gar-33864,
  title = {Vi som går köksvägen: Språkteknologer och korpuslingvister i Litteraturbanken},
  editor = {Börjesson, M.},
  booktitle = {Fältanteckningar: Utbildnings- och kultursociologiska texter tillägnade Donald Broady},
  author = {Borin, Lars},
  year = {2006},
  publisher = {Forskningsgruppen för utbildnings- och kultursociologi (ILU), Uppsala universitet},
  address = {Uppsala},
  isbn = {91-631-8807-4},
  pages = {399--404},
}

@article{kokkinakis-toporowskagronostaj-2006-comparing-34032,
  title = {Comparing Lay and Professional Language in Cardiovascular Disorders Corpora},
  abstract = {This paper reports on a corpus-based, contrastive study of Swedish medical language. It is focused on the vocabulary used in two types of medical textual material: professional portals and web-based consumer sites within the domain of cardiovascular disorders.
Linguistic, statistical and quantitatively based readability studies are considered in order to find the typical language-dependent and, possibly, language independent characteristics of the material examined and suggest concrete measures that might bridge the gap in medical vocabulary as used by laypersons/consumers and professionals.},
  journal = {WSEAS Transactions on BIOLOGY and BIOMEDICINE},
  author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria},
  year = {2006},
  volume = {3},
  number = {6},
  pages = {429--437},
}

% The editors were appended to `booktitle` as free text; moved to `editor`.
@inproceedings{lindh-2006-preliminary-47318,
  title = {Preliminary F0 Statistics and Forensic Phonetics},
  booktitle = {Annual conference of IAFPA, Department of Linguistics, Göteborg University, 2006},
  editor = {Lindh, Jonas and Eriksson, Anders},
  author = {Lindh, Jonas},
  year = {2006},
}

@inproceedings{lindh-2006-statistics-99168,
  title = {F0 Statistics, Robustness and Measures - Implications for Forensic Speaker Identification},
  booktitle = {Proceedings of The Swedish Language Technology Conference 2006},
  author = {Lindh, Jonas},
  year = {2006},
}

@inproceedings{lindh-2006-case-47316,
  title = {A Case Study of /r/ in the Västgöta Dialect},
  booktitle = {Papers from FONETIK 2006, Working Papers, 52, Department of Linguistics and Phonetics, Lund University},
  author = {Lindh, Jonas},
  year = {2006},
  volume = {52},
  pages = {85--88},
}

% Technical report: the issuing body is now in `institution` (was `publisher`). The bare
% underscore in the title (a LaTeX math-mode error) is replaced by a hyphen, and author names
% are normalised ("David" capitalised; "Markó, Kornél" with diacritics, as spelled in
% marko-etal-2006-towards-40540).
@techreport{ahlfelt-etal-2006-literature-34047,
  title = {Literature Review on Patient-Friendly Documentation Systems},
  author = {Åhlfelt, Hans and Borin, Lars and Daumke, Philipp and Grabar, Natalia and Hallett, Catalina and Hardcastle, David and Kokkinakis, Dimitrios and Mancini, Clara and Markó, Kornél and Merkel, Magnus and Pietsch, Christian and Power, Richard and Scott, Donia and Silvervarg, Annika and Toporowska Gronostaj, Maria and Williams, Sandra and Willis, Alistair},
  year = {2006},
  institution = {Göteborg University},
  address = {Göteborg},
}

@inproceedings{borin-olsson-2006-plattformen-116093,
  title = {ITG-plattformen som korpusverktyg},
  abstract = {En genomgång och handfast presentation om hur ITG-plattformen kan användas som korpusverktyg.},
  booktitle = {Fjärde svenska lingvistikkonferensen (Sling 2006), 27–28 april 2006, Stockholm},
  author = {Borin, Lars and Olsson, Leif-Jöran},
  year = {2006},
}

% Entry type was the non-standard "edited_book", which styles do not define; a book entry
% with `editor` instead of `author` is the standard form for an edited volume.
@book{saxena-borin-2006-lesser-33862,
  title = {Lesser-known languages of South Asia. Status and policies, case studies and applications of information technology},
  editor = {Saxena, Anju and Borin, Lars},
  year = {2006},
  publisher = {Mouton de Gruyter},
  address = {Berlin},
  isbn = {3-11-018976-3},
}

@inproceedings{lindh-2007-voxalys-47320,
  title = {Voxalys -- a Pedagogical Praat Plugin for Voice Analysis},
  booktitle = {Proceedings of Fonetik 2007, TMH-QPSR, KTH, Stockholm},
  author = {Lindh, Jonas},
  year = {2007},
  volume = {50},
  pages = {73--77},
}

@inproceedings{borin-etal-2007-medical-44951,
  title = {Medical frames as target and tool},
  booktitle = {FRAME 2007: Building Frame Semantics resources for Scandinavian and Baltic languages. (Nodalida 2007 workshop proceedings)},
  author = {Borin, Lars and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  year = {2007},
  isbn = {978-91-976939-0-5},
  pages = {11--18},
}

% Technical report: the issuing body is now in `institution` (was `publisher`); "David" capitalised.
@techreport{borin-etal-2007-empowering-53590,
  title = {Empowering the patient with language technology},
  author = {Borin, Lars and Grabar, Natalia and Hallett, Catalina and Hardcastle, David and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios and Williams, Sandra and Willis, Alistair},
  year = {2007},
  institution = {Göteborg University},
  address = {Göteborg},
}

@inproceedings{lindh-eriksson-2007-robustness-47321,
  title = {Robustness of Long Time Measures of Fundamental Frequency},
  abstract = {In many speech technology based applications as well as in forensic phonetics it is desirable to obtain reliable estimates of a speaker’s fundamental frequency.
We would like the measures to be accurate and reliable enough in order to be used meaningfully as a parameter in speaker identification or verification. Under optimal conditions such as when high quality studio recordings and normal speech styles are used this is often possible. In real life applications such conditions are the exception rather than the rule. The study presented here reports the result from an investigation where different measures were tested on speech material that varied with respect to speaking style, vocal effort and recording quality. Based on the results from these tests we would like to suggest a measure we call the alternative fundamental frequency baseline as the measure that is most robust with respect to the above-mentioned sources of variation. Index Terms: speaker recognition, speaker identification, fundamental frequency, F0.},
  booktitle = {Proceedings of Interspeech 2007, Antwerp, Belgium},
  author = {Lindh, Jonas and Eriksson, Anders},
  year = {2007},
  pages = {2025--2028},
}

% NOTE(review): Lecture Notes in Computer Science is a book series; this may be better
% recorded as a proceedings paper with `series` and `volume` - confirm against the publication.
@article{kokkinakis-thurin-2007-anonymisation-45193,
  title = {Anonymisation of Swedish Clinical Data},
  abstract = {There is a constantly growing demand for exchanging clinical and health-related information electronically. In the era of the Electronic Health Record the release of individual data for research, health care statistics, monitoring of new diagnostic tests and tracking disease outbreak alerts are some of the areas in which the protection of (patient) privacy has become an important concern. In this paper we present a system for automatic anonymisation of Swedish clinical free text, in the form of discharge letters, by applying generic named entity recognition technology.},
  journal = {Lecture Notes in Computer Science},
  author = {Kokkinakis, Dimitrios and Thurin, Anders},
  year = {2007},
  volume = {4594},
  pages = {237--241},
}

@inproceedings{borin-etal-2007-naming-44954,
  title = {Naming the past: Named entity and animacy recognition in 19th century Swedish literature},
  booktitle = {ACL 2007 Workshop on Language Technology for Cultural Heritage Data (LaTeCH 2007)},
  author = {Borin, Lars and Kokkinakis, Dimitrios and Olsson, Leif-Jöran},
  year = {2007},
  pages = {1--8},
}

% Conference paper: the proceedings title was stored in `journal` of an article entry; it is
% now `booktitle`.
@inproceedings{kokkinakis-thurin-2007-identification-45195,
  title = {Identification of Entity References in Hospital Discharge Letters},
  abstract = {In the era of the Electronic Health Record the release of medical narrative textual data for research, for health care statistics, for monitoring of new diagnostic tests and for tracking disease outbreak alerts imposes tough restrictions by various public authority bodies for the protection of (patient) privacy. In this paper we present a system for automatic identification of named entities in Swedish clinical free text, in the form of discharge letters, by applying generic named entity recognition technology with minor adaptations},
  booktitle = {Proceedings of the 16th Nordic Conference of Computational Linguistics (NODALIDA)},
  author = {Kokkinakis, Dimitrios and Thurin, Anders},
  year = {2007},
}

% Conference paper: the proceedings title was stored in `journal` of an article entry (and had
% lost its initial "P"); it is now `booktitle`. "David" capitalised in the author list.
@inproceedings{kokkinakis-etal-2007-lexical-45194,
  title = {Lexical Parameters, Based on Corpus Analysis of English and Swedish Cancer Data, of Relevance for NLG},
  abstract = {This paper reports on a corpus-based, contrastive study of the Swedish and English medical language in the cancer sub-domain. It is focused on the examination of a number of linguistic parameters differentiating two types of cancer-related textual material, one intended for medical experts and one for laymen. Language-dependent and language independent characteristics of the textual data between the two languages and the two registers are examined and compared. The aim of the work is to gain insights into the differences between lay and expert texts in order to support natural language generation (NLG) systems.},
  booktitle = {Proceedings of the 16th Nordic Conference of Computational Linguistics (NODALIDA)},
  author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria and Hallett, Catalina and Hardcastle, David},
  year = {2007},
}

@inproceedings{olsson-2007-exist-66850,
  title = {How do you do eXist},
  booktitle = {Javaforum 2007-05-23},
  author = {Olsson, Leif-Jöran},
  year = {2007},
}

@inproceedings{kokkinakis-2007-automatic-47933,
  title = {Automatic Indexing using the English and Swedish MeSH®, a Note on Coverage},
  abstract = {The identification and mapping of terminology onto a concept hierarchy is the very first stage of semantic, deeper analysis of textual documents. Work regarding automatic terminology recognition using the Swedish MeSH® thesaurus (Medical Subject Headings, edition 2006) and its corresponding English source is reported. A number of transformations and refinements were applied to the original lexical database in order to enhance the automatic process of mapping the extensive variability of lexical terms in authentic data to structured MeSH codes.
Means to increase the coverage of both thesauruses for automatic indexing of Swedish medical data are investigated.},
  booktitle = {Svenska Läkaresällskapets Riksstämma 2007},
  author = {Kokkinakis, Dimitrios},
  year = {2007},
}

% Technical report: the issuing body is now in `institution` (was `publisher`).
@techreport{andreasson-etal-2008-habeas-102220,
  title = {Habeas Corpus: A survey for SNK - a Swedish national corpus},
  author = {Andréasson, Maia and Borin, Lars and Merkel, Magnus},
  year = {2008},
  institution = {University of Gothenburg},
  address = {Göteborg},
}

@inproceedings{kokkinakis-toporowskagronostaj-2008-medlex-73976,
  title = {MEDLEX+: An Integrated Corpus-Lexicon Medical Workbench for Swedish},
  abstract = {This paper reports on ongoing work on developing a medical corpus-lexicon workbench for Swedish, MedLex+. At the moment the workbench incorporates: (i) an annotated collection of medical texts, 25 million tokens, 50,000 documents, (ii) a number of language processing components, including tools for collocation extraction, compound segmentation and thesaurus-based semantic annotation, and (iii) a lexical database of medical terms (5,000 entries). MedLex+ is a multifunctional lexical resource due to its structural design and content which can be easily queried. The medical workbench is intended to support lexicographers in their work on compiling lexicons and also lexicon users more or less initiated in the medical domain. It can also assist researchers working in the fields of lexical semantics and natural language processing (NLP) with focus on medical language. The linguistically and semantically annotated medical texts in combination with a set of queries turn the corpus into a rich repository of semasiological and onomasiological knowledge about medical terminology and their linguistic, lexical and pragmatic properties. These properties are recorded in the lexical database with a cognitive profile. The MedLex+ workbench seems to offer constructive help in many different lexical tasks.},
  booktitle = {Proceedings of the 13th EURALEX},
  author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria},
  year = {2008},
}

% NOTE(review): no `publisher` is recorded for this book; supply it when known.
@book{marinov-2008-dependency-88750,
  title = {Dependency-Based Syntactic Analysis of Bulgarian},
  author = {Marinov, Svetoslav},
  year = {2008},
  isbn = {978-91-977196-2-9},
}

@incollection{borin-2008-lemma-72507,
  title = {Lemma, lexem eller mittemellan? Ontologisk ångest i den digitala domänen},
  booktitle = {Nog ordat? Festskrift till Sven-Göran Malmgren},
  author = {Borin, Lars},
  year = {2008},
  publisher = {University of Gothenburg},
  address = {Göteborg},
  pages = {59--67},
}

@article{borin-2008-review-72506,
  title = {Review of Stig Johansson: Seeing through multilingual corpora: On the use of corpora in contrastive studies},
  journal = {ICAME Journal},
  author = {Borin, Lars},
  year = {2008},
  volume = {32},
  pages = {261--267},
}

@inproceedings{kokkinakis-2008-semantic-73975,
  title = {Semantic Relation Mining of Solid Compounds in Medical Corpora},
  abstract = {In the context of scientific and technical texts, meaning is usually embedded in noun compounds and the semantic interpretation of these compounds deals with the detection and semantic classification of the relation that holds between the compound’s constituents. Semantic relation mining, the technology applied for marking up, interpreting, extracting and classifying relations that hold between pairs of words, is an important enterprise that contribute to deeper means of enhancing document understanding technologies, such as Information Extraction, Question Answering, Summarization, Paraphrasing, Ontology Building and Textual Entailment. This paper explores the application of assigning semantic descriptors taken from a multilingual medical thesaurus to a large sample of solid (closed form) compounds taken from large Swedish medical corpora, and determining the relation(s) that may hold between the compound constituents. Our work is inspired by previous research in the area of using lexical hierarchies for identifying relations between two-word noun compounds in the medical domain. In contrast to previous research, Swedish, as other Germanic languages, require further means of analysis, since compounds are written as one sequence with no white space between the words, e.g. virus diseases vs. virussjukdomar, which makes the problem more challenging, since solid compounds are harder to identify and segment.},
  booktitle = {Proceedings of the 21st Conference on the European Federation for Medical Informatics (MIE 2008)},
  author = {Kokkinakis, Dimitrios},
  year = {2008},
  isbn = {9786611733414},
}

@inproceedings{wilhelmsson-2008-heuristic-79686,
  title = {Heuristic Schema Parsing of Swedish Text},
  abstract = {A method for identification of the primary (main clause) functional constituents of Swedish sentences is outlined. The method gives a robust analysis of the unbounded constituents (phrases which do not have an upper bound on their length: subjects, objects/predicatives and adverbials) by first identifying bounded constituents. Diderichsen’s sentence schema, chunking, syntactic valency data and heuristics are used for the delimitation of the constituents and labelling with grammatical functions.},
  booktitle = {Proceedings of the Swedish Language Technology Conference (SLTC'08)},
  author = {Wilhelmsson, Kenneth},
  year = {2008},
  pages = {41--42},
}

@incollection{borin-etal-2008-hunting-72504,
  title = {The hunting of the BLARK - SALDO, a freely available lexical database for Swedish language technology},
  booktitle = {Resourceful language technology.
Festschrift in honor of Anna Sågvall Hein}, author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart}, year = {2008}, publisher = {Uppsala University}, address = {Uppsala}, pages = {21--32}, } @inProceedings{olsson-2008-valkommen-116044, title = {Välkommen till eXist}, abstract = {A walkthrough of the versatility of the native XML database eXist, (http://exist-db.org), combined with overwiews of XPath and XQuery.}, booktitle = {FSCONS (http://fscons.org), 24–26 oktober 2008, Göteborg}, author = {Olsson, Leif-Jöran}, year = {2008}, } @inProceedings{skoldberg-toporowskagronostaj-2008-from-73087, title = {From Subdomains and Parameters to Collocational Patterns: On the Analysis of Swedish Medical Collocations}, booktitle = {Proceedings of the XIII EURALEX International Congress July 15 - 19 2008}, author = {Sköldberg, Emma and Toporowska Gronostaj, Maria}, year = {2008}, ISBN = {978-84-96742-67-3}, pages = {1421--1432}, } @article{toporowskagronostaj-skoldberg-2008-betydelseindikatorer-70495, title = {Betydelseindikatorer och tematiska slingor. Om jakten på rätt betydelse i framtida svenska digitala ordböcker}, journal = {Nog ordat? 
Festskrift till Sven-Göran Malmgren}, author = {Toporowska Gronostaj, Maria and Sköldberg, Emma}, year = {2008}, pages = {364--372}, } @article{borin-etal-2008-saldo-110525, title = {SALDO 1.0 (Svenskt associationslexikon version 2)}, journal = {Språkbanken, Göteborg universitet}, author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart}, year = {2008}, } @inProceedings{borin-forsberg-2008-something-72502, title = {Something old, something new: A computational morphological description of Old Swedish}, booktitle = {LREC 2008 Workshop on Language Technology for Cultural Heritage Data (LaTeCH 2008)}, author = {Borin, Lars and Forsberg, Markus}, year = {2008}, pages = {9--16}, } @inProceedings{kokkinakis-thurin-2008-applying-73972, title = {Applying MeSH® to the (Swedish) Clinical Domain - Evaluation and Lessons learned}, abstract = {Medical discharge summaries and clinical notes provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a potential goldmine for both medical scientists as well as practitioners in the language technology field. The capability to extract the key concepts and their relationships from such data can be of great value for knowledge management tasks such as indexing, data interchange, data aggregation and clinical decision support. The purpose of this work is to get insights into the feasibility of applying the content of a controlled vocabulary, the Medical Subject Headings (MeSH) to a sample of electronic discharge letters (i.e. free text clinical notes). We explore the application of natural language processing (NLP) techniques to the challenge of efficiently detecting the terminology, as encoded in MeSH and we evaluate MeSH in this setting, showing that a lot of work remains to be done in order to increase the coverage of the resource both in terms of its breadth and depth. 
}, booktitle = {Proceedings of the 6th Scandinavian Health Informatics and the 12th Swedish National Term Conference}, author = {Kokkinakis, Dimitrios and Thurin, Anders}, year = {2008}, } @article{wilhelmsson-2008-automatic-79714, title = {Automatic Variation of Swedish Text by Syntactic Fronting}, abstract = {Ongoing work with a prototype implementation for automatic fronting of primary (main clause) constituents in Swedish input text is described. Linguistic constraints and some technical aspects are also discussed.}, journal = {Proceedings of the Workshop on NLP for Reading and Writing — Resources, Algorithms and Tools November 20, 2008 Stockholm, Sweden SLTC 2008, NEALT Proceedings Series}, author = {Wilhelmsson, Kenneth}, year = {2008}, volume = {3}, pages = {22--23}, } @inProceedings{kokkinakis-2008-meshr-73973, title = {MeSH® - From a Controlled Vocabulary to a Processable Resource}, abstract = {Large repositories of life science data in the form of domain-specific literature, textual databases and other large specialised textual collections (corpora) in electronic form increase on a daily basis to a level beyond the human mind can grasp and interpret. As the volume of data continues to increase, substantial support from new information technologies and computational techniques grounded in the form of the ever increasing applications of the mining paradigm is becoming apparent. These emerging technologies play an increasingly critical role in aiding research productivity, and they provide the means for reducing the workload for information access and decision support and for speeding up and enhancing the knowledge discovery process. In order to accomplish these higher level goals and support the mining approach however, a fundamental and unavoidable starting point is the identification and mapping of terminology from the textual, unstructured data onto biomedical knowledge sources and concept hierarchies.
In this paper, we provide a description of the work regarding terminology recognition using the Swedish MeSH® thesaurus and its corresponding English original source. We explain the various transformation and refinement steps applied to the original database tables into a fully-fledged processing oriented annotating resource. Particular attention has been given to a number of these steps in order to automatically map the extensive variability of lexical terms to structured MeSH® nodes. Issues on annotation and coverage are also discussed. }, booktitle = {Proceedings of the 6th Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios}, year = {2008}, } @inProceedings{kokkinakis-2008-semantic-73977, title = {Semantic Pre-processing for Complexity Reduction in Parsing Medical Texts}, abstract = {Collection and multilayer annotation of textual corpora in specialized fields, such as (bio-) medicine is an important enterprise for empirically-based, data-driven language processing, human language technologies and linguistic research. One of the most important and difficult to achieve piece of annotation that can be made available is at the syntactic and functional level, i.e. parsing, particularly in sublanguages where specialized tools have to be adapted which is considered too expensive for many applications. In this paper, we describe a way to reduce the complexity of parsing in medical discourse by the use of a semantic pre-processing stage guided by annotations provided by medical thesauri and other domain-specific lexical resources. 
Parsing biomedical texts, apart from the challenge it possesses (deviant and idiosyncratic uses of vocabulary and syntax), is required in order to support and improve technologies such as Information Extraction and Retrieval, enhance the acquisition of relations between terminology support terminology management and population of medical semantic resources.}, booktitle = {Proceedings of the 21th Conference on the European Federation for Medical Informatics (MIE 2008)}, author = {Kokkinakis, Dimitrios}, year = {2008}, } @inProceedings{kokkinakis-2008-semantically-73974, title = {A Semantically Annotated Swedish Medical Corpus}, abstract = {With the information overload in the life sciences there is an increasing need for annotated corpora, particularly with biological and biomedical entities, which is the driving force for data-driven language processing applications and the empirical approach to language study. Inspired by the work in the GENIA Corpus, which is one of the very few of such corpora, extensively used in the biomedical field, and in order to fulfil the needs of our research, we have collected a Swedish medical corpus, the MEDLEX Corpus. MEDLEX is a large structurally and linguistically annotated document collection, consisting of a variety of text documents related to various medical text subfields, and does not focus at a particular medical genre, due to the lack of large Swedish resources within a particular medical subdomain. Out of this collection we selected 300 documents which were manually examined by two human experts who inspected, corrected and/or accordingly modified the automatically provided annotations according to a set of provided labelling guidelines. 
The annotations consist of medical terminology provided by the Swedish and English MeSH® (Medical Subject Headings) thesauri as well as named entity labels provided by an enhanced named entity recognition software.}, booktitle = {Proceedings of the 6th Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios}, year = {2008}, } @inProceedings{skoldberg-toporowskagronostaj-2008-modell-91906, title = {Modell för beskrivning av kollokationer i ett medicinskt lexikon (MedLex)}, booktitle = {Nordiske studier i leksikografi 9. Rapport fra konference om lexikografi i Norden, Akureyri 22.-26. maj 2007}, author = {Sköldberg, Emma and Toporowska Gronostaj, Maria}, year = {2008}, ISBN = {978-9979-654-05-6}, pages = {433--445}, } @inProceedings{lindh-2008-robustness-99174, title = {Robustness of Forced Alignment in a Forensic Context}, booktitle = {Proceedings of IAFPA2008, Lausanne, Switzerland}, author = {Lindh, Jonas}, year = {2008}, } @proceedings{lendvai-borin-2009-proceedings-91853, title = {Proceedings of the EACL 2009 Workshop on Language Technology and Resources for Cultural Heritage, Social Sciences, Humanities, and Education (LaTeCH -- SHELT\&R 2009)}, editor = {Lendvai, Piroska and Borin, Lars}, year = {2009}, publisher = {ACL}, address = {Athens}, ISBN = {1-932432-21-3}, } @inProceedings{borin-2009-linguistic-102209, title = {Linguistic diversity in the information society}, booktitle = {Proceedings of the SALTMIL 2009 workshop on Information Retrieval and Information Extraction for Less Resourced Languages}, author = {Borin, Lars}, year = {2009}, ISBN = {978-84-692-4940-6}, pages = {1--7}, } @article{kokkinakis-gerdin-2009-uppbyggandet-105136, title = {Uppbyggandet av en svensk medicinsk korpus för termvalidering och termextrahering - hur bra täcker SNOMED CT olika delfackområden?}, abstract = {Syftet med denna studie är dels att skapa en stor samling svenska medicinska elektroniska texter, en korpus, och dels att validera och
kvalitetssäkra existerande termer ur SNOMED CT (the Systematized NOmenclature of MEDicine - Clinical Terms) gentemot korpusinnehållet. På det sättet kan man få en objektiv uppfattning om SNOMED CT:s validitet, täckning och reliabilitet. Man kan även berika terminologin med nya termer eller termvarianter genom att automatiskt extrahera termkandidater inom olika delfackområden från korpusen med hjälp av olika statistiska och lingvistiska metoder. Resultat av de korpusbaserade, empiriska studierna ska kunna användas av terminologer i deras arbete med att göra SNOMED CT mer täckande, pålitlig och enhetlig. Samtidigt, genom användning av autentisk data, kan man försäkra sig om att termvarianterna (existerande eller nya) är vedertagna termer hos fackmän. I fall flera etablerade termvarianter (nya termkandidater) förekommer i korpusen kan dessa införas efter manuell granskning som synonymer till rekommenderade termer (med stöd av ett lämpligt granskningsgränssnitt) och därmed vidare utveckla innehållet i SNOMED CT. Följaktligen kommer vår presentation att innehålla en redovisning som bygger på tre huvudpelare – korpusuppbyggnad – termvalidering – termextrahering. Korpusen samlades in från två källor efter erhållet tillstånd. Texternas ursprung i korpusen kommer dels från Läkartidningens (LT) digitala arkiv <http://ltarkiv.lakartidningen.se> och dels från DiabetologNytts (DN) digitala arkiv <http://diabetolognytt.se/aterkommande/arkivet.html>. 
}, journal = {2009 års nationella termkonferens Språk och Kommunikation}, author = {Kokkinakis, Dimitrios and Gerdin, Ulla}, year = {2009}, } @inProceedings{kokkinakis-2009-shallow-94705, title = {Shallow Features for Differentiating Disease-Treatment Relations using Supervised Learning, a pilot study}, abstract = {Clinical narratives provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a challenge for practitioners in the language technology field, particularly because of the nature of the texts (excessive use of terminology, abbreviations, orthographic term variation), the significant opportunities for clinical research that such material can provide and the potentially broad impact that clinical findings may have in every day life. It is therefore recognized that the capability to automatically extract key concepts and their relationships from such data will allow systems to properly understand the content and knowledge embedded in the free text which can be of great value for applications such as information extraction and question & answering. This paper gives a brief presentation of such textual data and its semantic annotation, and discuss the set of semantic relations that can be observed between diseases and treatments in the sample. The problem is then designed as a machine learning task in which the relations are tried to be learned in a supervised fashion, using pre-annotated data. The challenges designing the problem and empirical results are presented.}, booktitle = {Proceedings of the 12th International Conference TSD (Text, Speech and Dialogue). 
Springer Verlag, LNCS/LNAI series.}, author = {Kokkinakis, Dimitrios}, year = {2009}, } @inProceedings{ljunglof-2009-trindikit-99883, title = {trindikit.py: An open-source Python library for developing ISU-based dialogue systems}, abstract = {TrindiKit is one of the main tools for developing ISU-based dialogue systems, but it is implemented in a non-free dialect of the programming language Prolog. Therefore we have translated the TrindiKit toolkit into an open-source Python package. We have tried to remain close to the original TrindiKit formulation, while making the most of Python classes and objects.}, booktitle = {IWSDS'09, 1st International Workshop on Spoken Dialogue Systems Technology}, author = {Ljunglöf, Peter}, year = {2009}, } @inProceedings{lindh-2009-perception-99180, title = {Perception of voice similarity and the results of a voice line-up}, booktitle = {The XXIInd Swedish Phonetics Conference, Department of Linguistics, Stockholm University, 2009.}, author = {Lindh, Jonas}, year = {2009}, ISBN = {978-91-633-4892-1}, pages = {186--189}, } @article{skoldberg-toporowskagronostaj-2009-charmknutte-101453, title = {Charmknutte, viktigpetter och kladdmaja. 
Substantiverade förnamn i sammansättningar ur ett lexikografiskt perspektiv}, journal = {Studia anthroponymica Scandinavica}, author = {Sköldberg, Emma and Toporowska Gronostaj, Maria}, year = {2009}, volume = {27}, pages = {73--96}, } @techreport{borin-2009-bush-102214, title = {One in the bush: Low-density language technology}, author = {Borin, Lars}, year = {2009}, publisher = {University of Gothenburg}, address = {Göteborg}, } @article{kokkinakis-2009-shallow-105133, title = {Shallow Features for Differentiating Disease-Treatment Relations using Supervised Learning; a pilot study.}, abstract = {Clinical narratives provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a challenge for practitioners in the language technology field, particularly because of the nature of the texts (excessive use of terminology, abbreviations, orthographic term variation), the significant opportunities for clinical research that such material can provide and the potentially broad impact that clinical findings may have in every day life. It is therefore recognized that the capability to automatically extract key concepts and their relationships from such data will allow systems to properly understand the content and knowledge embedded in the free text which can be of great value for applications such as information extraction and question & answering. This paper gives a brief presentation of such textual data and its semantic annotation, and discusses the set of semantic relations that can be observed between diseases and treatments in the sample. The problem is then designed as a supervised machine learning task in which the relations are tried to be learned using pre-annotated data. 
The challenges designing the problem and empirical results are presented.}, journal = {Lecture Notes in Computer Science}, author = {Kokkinakis, Dimitrios}, year = {2009}, volume = {5729}, pages = {395--402}, } @inProceedings{lindh-2009-first-99189, title = {A first step towards a text-independent speaker verification Praat plug-in using Mistral/Alize tools}, booktitle = {The XXIInd Swedish Phonetics Conference, Department of Linguistics, Stockholm University, 2009.}, author = {Lindh, Jonas}, year = {2009}, ISBN = {978-91-633-4892-1}, pages = {194--197}, } @inProceedings{ljunglof-2009-trik-99886, title = {TRIK: en talande och ritande robot för barn med kommunikativa funktionshinder}, booktitle = {Presentation på ID-dagarna, 7–9 oktober 2009, Stockholm}, author = {Ljunglöf, Peter}, year = {2009}, } @inProceedings{lindh-2009-pick-123922, title = {Pick a Voice among Wolves, Goats and Lambs}, booktitle = {Proceedings of the 18th Annual Conference of the International Association for Forensic Phonetics and Acoustics, Cambridge, UK}, author = {Lindh, Jonas}, year = {2009}, number = {18}, } @inProceedings{derbring-etal-2009-subtts-148340, title = {SubTTS: Light-weight automatic reading of subtitles}, abstract = {We present a simple tool that enables the computer to read subtitles of movies and TV shows aloud. The tool works by reading subtitle files, which can be freely downloaded or extracted from a DVD using existing tools, and read the text aloud through a speech synthesizer. The target audience are people who have trouble reading subtitles while watching a movie, for example people with visual impairments and people with reading difficulties, such as dyslexia. The application will be evaluated together with user from these groups to see if this could be an accepted solution to their need. 
}, booktitle = {Proceedings of the 17th Nordic Conference of Computational Linguistics NODALIDA 2009}, author = {Derbring, Sandra and Ljunglöf, Peter and Olsson, Maria}, year = {2009}, } @inProceedings{ljunglof-2009-dialogue-95890, title = {Dialogue Management as Interactive Tree Building}, abstract = {We introduce a new dialogue model and a formalism for limited-domain dialogue systems, which works by interactively building dialogue trees. The model borrows its fundamental ideas from type theoretical grammars and Dynamic Syntax. The resulting dialogue theory is a simple and light-weight formalism, which is still capable of advanced dialogue behaviour.}, booktitle = {DiaHolmia'09, 13th Workshop on the Semantics and Pragmatics of Dialogue}, author = {Ljunglöf, Peter}, year = {2009}, } @incollection{kokkinakis-2009-lexical-73979, title = {Lexical granularity for automatic indexing and means to achieve it - the case of Swedish MeSH®}, abstract = {The identification and mapping of terminology from large repositories of life science data onto concept hierarchies constitute an important initial step for a deeper semantic exploration of unstructured textual content. Accurate and efficient mapping of this kind is likely to provide better means of enhancing indexing and retrieval of text, uncovering subtle differences, similarities and useful patterns, and hopefully new knowledge, among complex surface realisations, overlooked by shallow techniques based on various forms of lexicon look-up approaches. However, a finer-grained level of mapping between terms as they occur in natural language and domain concepts is a cumbersome enterprise that requires various levels of processing in order to make explicit relevant linguistic structures. This chapter highlights some of the challenges encountered in the process of bridging free to controlled vocabularies and thesauri and vice versa. 
We investigate how the extensive variability of lexical terms in authentic data can be efficiently projected to hierarchically structured codes, while means to increase the coverage of the underlying lexical resources are also investigated.}, booktitle = {Information Retrieval in Biomedicine: Natural Language Processing for Knowledge Integration}, author = {Kokkinakis, Dimitrios}, year = {2009}, publisher = {IGI Global}, address = {Hershey, Pennsylvania}, } @inProceedings{ljunglof-etal-2009-trik-91892, title = {TRIK: A talking and drawing robot for children with communication disabilities}, abstract = {This paper describes an ongoing project where we develop and evaluate setup involving a communication board (for manual sign communication) and a drawing robot, which can communicate with each other via spoken language. The purpose is to help children with severe communication disabilities to learn language, language use and cooperation, in a playful and inspiring way. The communication board speaks and the robot is able to understand and talk back. This encourages the child to use the language and learn to cooperate to reach a common goal, which in this case is to get the robot to draw figures on a paper.}, booktitle = {Proceedings of the 17th Nordic Conference of Computational Linguistics NODALIDA 2009}, author = {Ljunglöf, Peter and Larsson, Staffan and Thunberg, Gunilla and Mühlenbock, Katarina}, year = {2009}, volume = {4}, } @inProceedings{kokkinakis-gerdin-2009-kvalitetssakring-105141, title = {Kvalitetssäkring av SNOMED CT med hjälp av Läkartidningens arkiv.}, abstract = {Inom ramen för regeringens satsning ”Nationell IT-strategi för vård och omsorg” har Socialstyrelsen fått i uppdrag att översätta och anpassa begreppssystemet ’the Systematized Nomenclature of Medicine, Clinical Terms’ (SNOMED CT) till svenska. Arbetet är både omfattande och tidskrävande samtidigt som uppdragstagaren har krav om kvalitetssäkring av översättningen.
Hur kan Läkartidningens arkiv bidra till kvalitetssäkringen? Med hjälp av Läkartidningens digitala arkiv, LDA, (årgångarna 1996-2009) har vi utvecklat metoder för att effektivisera kvalitetssäkringen av olika SNOMED CT-urval (t.ex. diabetestermer). Det innebär att vi underlättar för utförandet av empiriska, SNOMED CT-relaterade studier, som t.ex. framtagning av underlag om termernas användning, variation och frekvensdistribution över tid. Arkivets förädling: LDA:t omvandlades till ett enhetligt textbaserat format och textinnehållet normaliserades med avseenden på dokumentformat och teckenkodning för att kunna skapa ett bra underlag för den efterföljande språkteknologiska analysen. Alla artiklar i varje publicerad årgång extraherades och märktes upp dels med olika slags metainformation (t.ex. genretillhörighet) dels med lingvistisk och semantisk information, sammanlagt 27 000 artiklar. Den språkteknologiska bearbetningen innefattade automatiskt tillägg av lingvistisk information som t.ex. ordklasstillhörighet för varje ord i korpusen och automatiskt, semantisk mappning dels till den svenska MeSH-tesaurusen och dels till delar av den svensköversatta SNOMED-hierarkin. LDA i en ny skepnad: LDA utgör sedan länge en värdefull svensk medicinsk resurs för alla som yrkesmässigt jobbar med termer och språk. Vi har dock bidragit med att göra textmaterialet ännu mer välstrukturerat och förädlat, som kan vara till hjälp för explorativa studier där sökningar kan förfinas på ett flertal sätt och därmed ge forskare möjligheter att göra djupare innehållsanalyser av texterna och samla grundläggande kunskaper inom olika ämnesområden. Kombinationen av enstaka termer och ord med lingvistisk och semantisk information ger unika möjligheter till att skaffa information och generera fakta som kan leda till nya hypoteser och eventuellt ny kunskap om olika aspekter som gäller termanvändning och variation och vi kommer att redovisa exempel på sådana analyser. 
}, booktitle = {Svenska Läkaresällskapets Riksstämman}, author = {Kokkinakis, Dimitrios and Gerdin, Ulla}, year = {2009}, } @inProceedings{ljunglof-olsson-2009-trik-99885, title = {TRIK: en talande och ritande robot för barn med kommunikativa funktionshinder}, booktitle = {Presentation vid 8:e Västsvenska Kommunikationskarnevalen, 1–2 juni 2009}, author = {Ljunglöf, Peter and Olsson, Maria}, year = {2009}, } @article{kokkinakis-gerdin-2009-issues-105140, title = {Issues on Quality Assessment of SNOMED CT® Subsets - Term Validation and Term Extraction}, abstract = {The aim of this paper is to apply and develop methods based on Natural Language Processing for automatically testing the validity, reliability and coverage of various Swedish SNOMED-CT subsets, the Systematized NOmenclature of MEDicine - Clinical Terms a multiaxial, hierarchical classification system which is currently being translated from English to Swedish. Our work has been developed across two dimensions. Initially a Swedish electronic text collection of scientific medical documents has been collected and processed to a uniform format. Secondly, a term processing activity has been taken place. In the first phase of this activity, various SNOMED CT subsets have been mapped to the text collection for evaluating the validity and reliability of the translated terms. In parallel, a large number of term candidates have been extracted from the corpus in order to examine the coverage of SNOMED CT. Term candidates that are currently not included in the Swedish SNOMED CT can be either parts of compounds, parts of potential multiword terms, terms that are not yet been translated or potentially new candidates. In order to achieve these goals a number of automatic term recognition algorithms have been applied to the corpus.
The results of the later process is to be reviewed by domain experts (relevant to the subsets extracted) through a relevant interface who can decide whether a new set of terms can be incorporated in the Swedish translation of SNOMED CT or not. }, journal = {Proceedings of RANLP-2009 Workshop: Biomedical Information Extraction.}, author = {Kokkinakis, Dimitrios and Gerdin, Ulla}, year = {2009}, } @article{lindh-eriksson-2009-swedat-118616, title = {The SweDat Project and Swedia Database for Phonetic and Acoustic Research}, abstract = {The project described here may be seen as a continuation of an earlier project, SweDia 2000, aimed at transforming the database collected in that project to a full-fledged e-science database. The database consists of recordings of Swedish dialects from 107 locations in Sweden and Swedish speaking parts of Finland. The goal of the present project is to make the material searchable in a flexible and simple way to make it available to a much wider sector of the research community than is the case at present. The database will be accessible over the Internet via user-friendly interfaces specifically designed for this type of data. Other more specialized research interfaces will also be designed to facilitate phonetic acoustic research and orientation of the database.}, journal = {Proceeding E-SCIENCE '09 Proceedings of the 2009 Fifth IEEE International Conference on e-Science}, author = {Lindh, Jonas and Eriksson, Anders}, year = {2009}, pages = {45--49}, } @article{malmgren-toporowskagronostaj-2009-valensbeskrivning-109243, title = {Valensbeskrivning i svenska ordböcker — och några andra}, journal = {LexicoNordica}, author = {Malmgren, Sven-Göran and Toporowska Gronostaj, Maria}, year = {2009}, volume = {2009}, number = {16}, pages = {181--196}, } @inProceedings{andreasson-etal-2009-swedish-102211, title = {Swedish CLARIN activities}, booktitle = {Proceedings of the Nodalida 2009 workshop on CLARIN activities in the Nordic countries. 
NEALT Proceedings Series}, author = {Andréasson, Maia and Borin, Lars and Forsberg, Markus and Beskow, Jonas and Carlson, Rolf and Edlund, Jens and Elenius, Kjell and Hellmer, Kahl and House, David and Merkel, Magnus and Forsbom, Eva and Megyesi, Beáta and Eriksson, Anders and Strömqvist, Sven}, year = {2009}, volume = {5}, pages = {1--5}, } @book{allen-etal-2009-svensk-99825, title = {Svensk ordbok utgiven av Svenska Akademien. 1-2}, editor = {Allén, Sture (vetenskaplig rådgivare) and Berg, Daniel and Berg, Sture and Gellerstam, Martin and Holmer, Louise and Hult, Ann-Kristin and Lindstrand, Susanne and Lövfors, Sven and Malmgren, Sven-Göran and Sjögreen, Christian and Sköldberg, Emma and Tegner, Lennart and Toporowska Gronostaj, Maria}, year = {2009}, ISBN = {978-91-1-302267-3}, } @inProceedings{borin-etal-2009-thinking-110343, title = {Thinking Green: Toward Swedish FrameNet++}, abstract = {Access to multi-layered lexical, grammatical and semantic information representing text content is a prerequisite for efficient automatic understanding and generation of natural language. A FrameNet is considered a valuable resource for both linguistics and language technology research that may contribute to the achievement of these goals. Currently, FrameNet-like resources exist for a few languages, including some domain-specific and multilingual initiatives (Dolbey et al., 2006; Boas, 2009; Uematsu et al., 2009; Venturi et al., 2009), but are unavailable for most languages, including Swedish, although there have been some pilot studies exploring the semi-automatic acquisition of Swedish frames (Johansson & Nugues, 2006; Borin et al., 2007). At the University of Gothenburg, we are now embarking on a project to build a Swedish FrameNet-like resource. A novel feature of this project is that the Swedish FrameNet will be an integral part of a larger many-faceted lexical resource. Hence the name Swedish FrameNet++ (SweFN++).
},
  booktitle = {FrameNet Masterclass and Workshop},
  author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  year = {2009},
}

% The series name had been appended to booktitle; it now sits in series, which
% is the field the volume number refers to.
@inproceedings{borin-forsberg-2009-family-102212,
  author = {Borin, Lars and Forsberg, Markus},
  title = {All in the family: A comparison of SALDO and WordNet},
  booktitle = {Proceedings of the Nodalida 2009 Workshop on WordNets and other Lexical Semantic Resources - between Lexical Semantics, Lexicography, Terminology and Formal Ontologies},
  series = {NEALT Proceedings Series},
  volume = {7},
  year = {2009},
}

% Abstract: restored a missing space after "(DBS)." and corrected the spelling
% of "unmeasurable".
@inproceedings{akesson-etal-2010-post-122323,
  title = {Post surgery effects on VOT for Parkinson Disease STN/DBS patients},
  abstract = {In this paper we discuss and analyse voice onset time (VOT) pre and post surgical treatment with deep brain stimulation (DBS) in 17 patients diagnosed with Parkinson’s disease (PD) at Sahlgrenska University Hospital in Gothenburg, Sweden. The patients were all at different stages of the disease but with the common denominator they have all undergone surgery to enhance synaptic responses through bilateral electrode implants in the subthalamic nucleus (STN) region of the brain, also known as Deep Brain Stimulation (DBS). The main focal point of the paper is to compare the pre and post surgery VOT data to see if there were any effects stemming from the STN surgery. Preliminary results for Mean VOT, Standard deviation VOT and percent of unsuccessfully produced/unmeasurable diadochokinetic syllable repetitions are presented and discussed. We found that the standard deviation decreased significantly for the consonant /p/ and this is discussed in the perspective of the ease of articulation of the different plosives. 
},
  booktitle = {Proceedings from FONETIK 2010, Working Papers},
  author = {Åkesson, Joel and Lindh, Jonas and Hartelius, Lena},
  year = {2010},
  volume = {54},
  pages = {119--124},
}

% Abstract: "state-of-the art" corrected to the hyphenated compound.
@inproceedings{lindh-eriksson-2010-voice-122326,
  title = {Voice similarity — a comparison between judgements by human listeners and automatic voice comparison},
  abstract = {Comparison between the way human listeners judge voice similarity and how state-of-the-art GMM-UBM systems for voice recognition compare voices is a little explored area of research. In this study groups of informants judged the similarity between voice samples taken from a set of fairly similar male voices that had previously been used in a voice line-up experiment. The result from the listening tests was then compared to the scores from a UBM-GMM automatic voice comparison system, built on the Mistral LIA_RAL open source platform. The results show a correlation between scores obtained from the automatic system and the judgements by the listeners. Listeners are, however, more sensitive to language dependent parameters or idiosyncratic phonetic features such as speaking tempo, while the system only bases its likelihood ratios on spectral similarities, i.e. 
timbre.},
  booktitle = {Proceedings from FONETIK 2010, Working Papers},
  author = {Lindh, Jonas and Eriksson, Anders},
  year = {2010},
  volume = {54},
  pages = {63--69},
}

@inproceedings{lindh-etal-2010-methodological-123919,
  author = {Lindh, Jonas and Eriksson, Anders and Nelhans, Gustaf},
  title = {Methodological Issues in the Presentation and Evaluation of Speech Evidence in Sweden},
  booktitle = {Proceedings of the 19th Annual Conference of the International Association for Forensic Phonetics and Acoustics, Trier, Germany},
  year = {2010},
  number = {19},
}

@inproceedings{ljunglof-2010-trik-130134,
  author = {Ljunglöf, Peter},
  title = {TRIK: A Talking and Drawing Robot for Children with Communication Disabilities},
  abstract = {In this project we have developed and evaluated a setup involving a touch-screen computer with a dynamic screen software, and a drawing robot, which can communicate with each other via spoken language. The purpose is to help children with severe communication disabilities to learn language, language use and cooperation, in a playful and inspiring way. The communication board speaks and the robot is able to understand and talk back. This encourages the child to use language and learn to cooperate to reach a common goal, which in this case is to get the robot to draw figures on a paper. The robot has been tested on three children, two with cerebral palsy and one with autism spectrum disorder. During this session we present the preliminary results.},
  booktitle = {ISAAC-2010, 14th Biennial Conference for Augmentative and Alternative Communication},
  year = {2010},
}

@book{wilhelmsson-2010-heuristisk-126092,
  title = {Heuristisk analys med Diderichsens satsschema - Tillämpningar för svensk text},
  abstract = {A heuristic method for parsing Swedish text, heuristic schema parsing, is described and implemented. Focusing on main clause (primary) analysis, a collection of licensing techniques for removing non-primary verb candidates is employed, leaving e.g. 
the primary verbs, particles and conjunctions (bounded key constituents) that delimit the content of the fields in Diderichsen’s sentence schema. Hereby, the subsequent identification of constituents which do not have an upper bound on their length (subject, object/predicatives and adverbials) can be identified relying to a lesser extent on explicit pattern matching, and more on different heuristic rules. For phrase type identification and delimitation of these constituents, when adjacent to each other, a novel chunking technique, rank-based chunking, is applied. Following this, a series of further rules merge chunks into larger ones, aiming at a final number of nominal chunks compatible with the valency information of the main verb. The aim is to identify full nominal and adverbial constituents, including post-modifiers. The implementation uses the Stockholm Umeå Corpus 2.0, a corpus which is balanced for different genres in published Swedish text. SUC’s tagset is also used unmodified in part-of-speech tagging which enables the program to deal with input text. The functional parsing, which includes no explicit language-defining grammar component, is carried out technically using an object-based representation of clause structure. Although output formats and types of evaluations of correctness are very different in parsers for Swedish text, it is claimed that the manual approach presented can provide high accuracy, which can be improved given more time for development. The thesis work also includes two prototype applications, both requiring high accuracy of the sort of functional syntactic analysis described here. The first one is an implementation of automatic syntactic fronting in the area of text editing for Swedish, where the user is presented with a syntactically analyzed copy of her writing, from which paraphrases easily can be generated. 
The second application is in the field of natural language query systems and produces questions with answers from an arbitrary declarative input text. This prototype incorporates a text database from Swedish Wikipedia, and investigates primarily generation of WH-questions formed via fronting of unbounded primary constituents. The questions are generated as a text is opened and thus permits users to only ask the available ones, thus aiming at a high precision value.},
  author = {Wilhelmsson, Kenneth},
  year = {2010},
  publisher = {University of Gothenburg},
  address = {Göteborg},
}

@inproceedings{ljunglof-2010-grasp-130137,
  author = {Ljunglöf, Peter},
  title = {GRASP: Grammar-based Language Learning},
  abstract = {We are developing a pedagogical tool to support language learning and training for children with communicative disabilities. The system has a graphical interface, where the user can move, replace, add, and in other ways modify, words or phrases. The system keeps the sentence grammatical, by automatically rearranging the words and changing inflection, if necessary. In this way we hope that the system stimulates the child to explore the possibilities of language.},
  booktitle = {SLTC-2010, 3rd Swedish Language Technology Conference},
  year = {2010},
}

@book{volodina-2010-corpora-127225,
  title = {Corpora in Language Classroom: Reusing Stockholm Umeå Corpus in a vocabulary exercise generator},
  abstract = {Authentic examples as teaching material are not easy to obtain. Corpora are able to solve this problem, as has been witnessed before. Most experiments with corpora in language classroom describe concordances. However, there are numerous other ways of bringing corpora into language education, as shown in this research. A selective learner-oriented exercise generator has been implemented on the basis of Stockholm Umeå Corpus (SUC). SUC texts have been tested for readability and levels were assigned. 
This generator assists in automatic selection of authentic examples of appropriate learner levels as well as in construction of wordbank-, multiple choice items and c-tests for a specified proficiency level, frequency band and word class. In Vocabulary Size Test potential words are generated on the basis of existing morphemes and SUC-based frequency lists. Interesting practical and theoretical questions connected with reusage of corpora in an exercise generator are described in this book. The research might inspire computational linguists, language teachers and everyone interested in Computer-Assisted Language Learning and Corpus Linguistics to test similar techniques in their practices. },
  author = {Volodina, Elena},
  year = {2010},
  publisher = {Lambert Academic Publishing},
  address = {Saarbrücken},
  isbn = {978-3-8433-5256-7},
}

@article{kokkinakis-2010-complementary-125644,
  title = {Complementary Methods for De-identifying Sensitive Data with a focus on Clinical Discourse},
  abstract = {In the era of the Electronic Health Record (EHR) the release of individual data for research, public health planning, health care statistics, monitoring of diagnostic tests, automated data collection for health care registries and tracking disease outbreaks are some of the areas in which the protection of Personal Health Information (PHI) has become an important concern. The purpose of this study is to adapt and apply synergetic methods to document de-identification, particularly clinical, or other sources of sensitive data. 
The main challenge and goal of this research is to retain important concepts and PHI in the documents in a standardized and neutral manner as means of encryption without violating the integrity of the PHI and without sacrificing the quality and intended meaning of the authors.},
  journal = {Revista de Procesamiento de Lenguaje Natural (SEPLN)},
  author = {Kokkinakis, Dimitrios},
  year = {2010},
  volume = {45},
  pages = {243--246},
}

@article{wilhelmsson-2010-automatisk-137859,
  title = {Automatisk generering av frågor som svensk text besvarar: ett informationssystem},
  abstract = {Vilken information kan en text sägas innehålla? Ett enkelt svar är ”de frågor som den besvarar.” I vilken grad går det i så fall att automatiskt generera dessa frågor och därmed programmera ett frågebesvarande informationssystem för svensk text? Ett prototypsystem för denna uppgift har skapats som en del av ett avhandlingsprojekt inom språkteknologi. Det vore till exempel möjligt att vidareutveckla det system som här visas till en allmän teknisk tjänst, t.ex. webbaserad, som ger användare möjlighet att söka efter information med naturligt språk i en valfri digital text. Denna text tar upp de allmänna förutsättningarna för automatisk generering av de frågor som en svensk text besvarar. Själva den teoretiska uppgiften har egenskaper som kan sägas vara lingvistiska eller informationsteoretiska. För att skapa det program som här beskrivs har dessutom naturligtvis en programmeringsinsats krävts, men denna kommer inte att tas upp här, den rent praktiska sidan av uppgiften är möjlig att lösa på många sätt. 
},
  url = {http://hdl.handle.net/2320/7176},
  note = {See also: http://www.hum.gu.se/samverkan/popularvetenskap/roster-fran-humanisten-2010/},
  journal = {Röster från Humanisten, 2010},
  author = {Wilhelmsson, Kenneth},
  year = {2010},
  volume = {2010},
}

@article{borin-2010-zipf-130257,
  author = {Borin, Lars},
  title = {Med Zipf mot framtiden - en integrerad lexikonresurs för svensk språkteknologi},
  journal = {LexicoNordica},
  year = {2010},
  volume = {17},
  pages = {35--54},
}

% The two links that trailed the preceding entry's abstract are now stored in
% its url and note fields (handle link in url, web page in note).
% Below: the edition marker ", 2 uppl" was part of the title; it is now the
% edition field so the style formats it. The abstract's transposed
% "to a lesser on extent" is corrected to "to a lesser extent on".
@book{wilhelmsson-2010-heuristisk-132135,
  title = {Heuristisk analys med Diderichsens satsschema – Tillämpningar för svensk text},
  edition = {2},
  abstract = {A heuristic method for parsing Swedish text, heuristic schema parsing, is described and implemented. Focusing on main clause (primary) analysis, a collection of licensing techniques for removing non-primary verb candidates is employed, leaving e.g. the primary verbs, particles and conjunctions (bounded key constituents) that delimit the content of the fields in Diderichsen’s sentence schema. Hereby, the subsequent identification of constituents which do not have an upper bound on their length (subject, object/predicatives and adverbials) can be identified relying to a lesser extent on explicit pattern matching, and more on different heuristic rules. For phrase type identification and delimitation of these constituents, when adjacent to each other, a novel chunking technique, rank-based chunking, is applied. Following this, a series of further rules merge chunks into larger ones, aiming at a final number of nominal chunks compatible with the valency information of the main verb. The aim is to identify full nominal and adverbial constituents, including post-modifiers. The implementation uses the Stockholm Umeå Corpus 2.0, a corpus which is balanced for different genres in published Swedish text. SUC’s tagset is also used unmodified in part-of-speech tagging which enables the program to deal with input text. 
The functional parsing, which includes no explicit language-defining grammar component, is carried out technically using an object-based representation of clause structure. The thesis work also includes two prototype applications, both requiring high accuracy of the sort of functional syntactic analysis described here. The first one is an implementation of automatic syntactic fronting in the area of text editing for Swedish, where the user is presented with a syntactically analyzed copy of her writing, from which paraphrases easily can be generated. The second application is in the field of natural language query systems and produces questions with answers from an arbitrary declarative input text. This prototype incorporates a text database from Swedish Wikipedia, and investigates primarily generation of WH-questions formed via fronting of unbounded primary constituents. The questions are generated as a text is opened and thus permits users to only ask the available ones, thus aiming at a high precision value. 
},
  author = {Wilhelmsson, Kenneth},
  year = {2010},
  publisher = {University of Gothenburg},
  address = {Göteborg},
  isbn = {978-91-977196-9-8},
}

@inproceedings{borin-etal-2010-diabase-118907,
  author = {Borin, Lars and Forsberg, Markus and Kokkinakis, Dimitrios},
  title = {Diabase: Towards a diachronic BLARK in support of historical studies},
  booktitle = {Proceedings of LREC 2010},
  year = {2010},
}

@inproceedings{wittenburg-etal-2010-resource-118909,
  author = {Wittenburg, Peter and Bel, Nuria and Borin, Lars and Budin, Gerhard and Calzolari, Nicoletta and Hajicova, Eva and Koskenniemi, Kimmo and Lemnitzer, Lothar and Mægaard, Bente and Piasecki, Maciej and Pierrel, Jean-Marie and Piperidis, Stelios and Skadina, Inguna and Tufis, Dan and van Veenendal, Remco and Váradi, Tamás and Wynne, Martin},
  title = {Resource and service centres as the backbone for a sustainable service infrastructure},
  booktitle = {Proceedings of LREC 2010},
  year = {2010},
}

@article{kokkinakis-2010-initiala-130210,
  title = {Initiala resultat av en storskalig automatisk indexering av vetenskaplig litteratur med hela det svenska SNOMED CT - problem och möjligheter.},
  abstract = {Syftet med denna studie är dels att skapa en stor samling svenska medicinska elektroniska texter, en korpus, och dels att validera och kvalitetssäkra existerande termer ur SNOMED CT (the Systematized NOmenclature of MEDicine - Clinical Terms) gentemot korpusinnehållet. På det sättet kan man få en objektiv uppfattning om SNOMED CT:s validitet, täckning och reliabilitet. Man kan även berika terminologin med nya termer eller termvarianter genom att automatiskt extrahera termkandidater inom olika delfackområden från korpusen med hjälp av olika statistiska och lingvistiska metoder. Resultat av de korpusbaserade, empiriska studierna ska kunna användas av terminologer i deras arbete med att göra SNOMED CT mer täckande, pålitlig och enhetlig. 
Samtidigt, genom användning av autentisk data, kan man försäkra sig om att termvarianterna (existerande eller nya) är vedertagna termer hos fackmän. I fall flera etablerade termvarianter (nya termkandidater) förekommer i korpusen kan dessa införas efter manuell granskning som synonymer till rekommenderade termer (med stöd av ett lämpligt granskningsgränssnitt) och därmed vidare utveckla innehållet i SNOMED CT. Följaktligen kommer vår presentation att innehålla en redovisning som bygger på tre huvudpelare – korpusuppbyggnad – termvalidering – termextrahering. Korpusen samlades in från två källor efter erhållet tillstånd. Texternas ursprung i korpusen kommer dels från Läkartidningens (LT) digitala arkiv <http://ltarkiv.lakartidningen.se> och dels från DiabetologNytts (DN) digitala arkiv <http://diabetolognytt.se/aterkommande/arkivet.html>.},
  journal = {2010-års nationella termkonferens: Professionen i språket - språket i professionen.},
  author = {Kokkinakis, Dimitrios},
  year = {2010},
}

% The title was stored in ALL CAPS, which hard-codes a presentation choice and
% cannot be recased by a style. It is now in Title Case with the language
% names brace-protected. Abstract: "2- syllable" rejoined.
@inproceedings{hu-lindh-2010-perceptual-125330,
  title = {Perceptual Mistakes of {Chinese} Tones in 2-Syllable Words by {Swedish} Listeners},
  abstract = {Earlier studies on the perception of Chinese tones have almost exclusively used 1-syllable words for the listening tests (Kiriloff, 1969; Chuang, 1971; Klatt, 1973; Gandour, 1978). In these earlier studies the misperception between tone 2 and tone 3 has been shown to be the most common. However, no studies that we have found have looked at the perception of 2-syllable words besides Chuang (1971), who only used nonsense words. By tradition the teaching of Chinese as a foreign language has been concentrated on training of perception and production of tones since adult students have been shown to show particular difficulties in perceiving their difference. Experienced teachers have through tests established that this assumption is not valid when it comes to the so-called static tone. 
When it comes to communicating in Chinese and to be able to use the separate tones it is not enough to know the difference in 1-syllable words especially since most modern words in standard Chinese contain 2 or more. Guo (1993) has shown that the more syllables a word contains the higher ratio of misperceived tones. So far, no investigations for Swedish students have been performed. A possible hypothesis could be that Swedish listeners would perform better due to the Swedish grave and acute accents. By asking experienced teachers in Sweden, we knew that this should not be the case however. The general impressions from teachers are also that Swedish students have the largest proportion misperceptions between tone 2 and 3. To test this we conducted a listening test on 27 native speakers of Swedish (9 bilingual Chinese speakers with native ability in Swedish) on 25 Chinese 2-syllable lexical words with 15 different tone combinations. One male and one female native speaker of Chinese pronounced the words in isolation. The words were taken from a random number of 2-syllable glossary. Each speaker repeated the words once with 1 second pause in between the repetition and then 2 seconds pause before the new word. The audio was presented in high quality headphones in the student language lab at the University of Gothenburg. The participants were all second semester students of Chinese and the listening test was also an exam, which made the participants wanting to perform as well as possible. If they wanted they could repeat the sequence as many times as they wanted until satisfied with their answer. The results show that produced tone 1 and tone 2 are confused more than 3 and 4 (tone 4 more than 3, see figure 1). However, the distribution of misperceptions seems to be rather equally distributed if we exclude the static tone (below called 0) in contradiction to earlier studies claiming misperception mostly between tone 2 and 3. 
However, we also notice that certain types of syllables containing different vowels are misperceived differently. The next step is to figure out if certain syllable nucleuses are more misperceived than others and in certain positions. These conclusions can in the future lead to new approaches when it comes to teaching students production and perception of tones.},
  booktitle = {Proceedings of the Fourth European Conference on Tone and Intonation (TIE4)},
  author = {Hu, Guohua and Lindh, Jonas},
  year = {2010},
}

% Conference contribution: the venue was stored in journal of an article
% entry; it is now the booktitle of an inproceedings entry. The title ended in
% "?." which prints both marks; the question mark alone is kept.
@inproceedings{kokkinakis-2010-data-130212,
  author = {Kokkinakis, Dimitrios},
  title = {Is data scrubbing useful for anonymizing sensitive data?},
  abstract = {The release of individual data for research, public health planning, health care statistics, monitoring of diagnostic tests, automated data collection for health care registries and tracking disease outbreaks are some of the areas in which the protection of Personal Health Information (PHI) has become an important concern. The purpose of this study is to adapt and apply synergetic methods to document de-identification, particularly in the clinical setting. The main challenge is to retain important concepts and PHI in the documents in a standardized and neutral manner as means of encryption without violating the integrity of the PHI and without sacrificing the quality and intended meaning of the authors.},
  booktitle = {Proceedings of the Third Swedish Language Technology Conference (SLTC)},
  year = {2010},
}

% Title: trailing "?." reduced to "?" as above.
@article{kokkinakis-2010-data-130213,
  title = {Är "data scrubbing" en användbar metod för att anonymisera känsliga patientdata?},
  abstract = {De senaste årens ökande användning av modern informationsteknik inom sjukvården har medfört en kraftig ökning av elektronisk dokumentation som rör patientens hälsotillstånd, vård och behandling. Vårddokumentationen blir både mer detaljerad och mer individuell, samtidigt som den uppdateras och förändras regelbundet. 
Patientjournalen är i första hand till för att bidra till en god och säker vård av patienten, men också en viktig informationskälla för FoU. Ett stort hinder för utnyttjandet av journalinformation som forskningskälla är de etiska och rättsliga problemen. För att kunna hantera och utnyttja dessa stora och ständigt växande informationsmängder ställs därmed högre krav på säker, skyddad och effektiv informationshantering.},
  journal = {Svenska Läkaresällskapets Riksstämman},
  author = {Kokkinakis, Dimitrios},
  year = {2010},
}

% "1/2010" was stored as the volume. It presumably means issue 1 of 2010
% (verify against the magazine); the year is already in the year field, so
% only the issue number is kept, in number.
% The preceding entry's journal value also lost a stray trailing space.
@article{kokkinakis-gerdin-2010-lakartidningens-120480,
  author = {Kokkinakis, Dimitrios and Gerdin, Ulla},
  title = {Läkartidningens arkiv i en ny skepnad - En resurs för forskare, läkare och allmänhet},
  abstract = {I Sverige har det tagits fram en medicinsk korpus baserad på Läkartidningens digitala arkiv. Denna resurs möjliggör precisa sökningar och värdefull tillgång till medicinsk terminologisk information på olika nivåer. Dimitrios Kokkinakis från Göteborgs universitet och Ulla Gerdin från Socialstyrelsen presenterar projektet. },
  journal = {Språkbruk},
  year = {2010},
  number = {1},
  pages = {22--28},
}

@inproceedings{allvin-etal-2010-characteristics-120479,
  title = {Characteristics and Analysis of Finnish and Swedish Clinical Intensive Care Nursing Narratives},
  abstract = {We present a comparative study of Finnish and Swedish free-text nursing narratives from intensive care. Although the two languages are linguistically very dissimilar, our hypothesis is that there are similarities that are important and interesting from a language technology point of view. This may have implications when building tools to support producing and using health care documentation. We perform a comparative qualitative analysis based on structure and content, as well as a comparative quantitative analysis on Finnish and Swedish Intensive Care Unit (ICU) nursing narratives. 
Our findings are that ICU nursing narratives in Finland and Sweden have many properties in common, but that many of these are challenging when it comes to developing language technology tools. },
  booktitle = {Proceedings of the NAACL HLT 2010 Second Louhi Workshop on Text and Data Mining of Health Documents},
  author = {Allvin, H. and Carlsson, E. and Dalianis, H. and Danielsson-Ojala, R. and Daudaravicius, V. and Hassel, M. and Kokkinakis, Dimitrios and Lundgren-Laine, H. and Nilsson, G. and Nytrø, Ø. and Salanterä, S. and Skeppstedt, M. and Suominen, H. and Velupillai, S.},
  year = {2010},
  pages = {53--60},
}

% The page range above is now written as 53--60, without spaces around the
% double hyphen, matching every other pages field in this file.
% Abstract below: "as well results" completed to "as well as results".
@inproceedings{kokkinakis-gerdin-2010-swedish-113194,
  title = {A Swedish Scientific Medical Corpus for Terminology Management and Linguistic Exploration},
  abstract = {This paper describes the development of a new Swedish scientific medical corpus. We provide a detailed description of the characteristics of this new collection as well as results for a number of term management tasks, including terminology validation and terminology extraction based on this material. Although the corpus is representative for the scientific medical domain it still covers a lot of specialised sub-disciplines such as “diabetes” and “osteoporosis” which makes it suitable for facilitating the production of smaller and more focused subcorpora. 
We have tried to address this issue by making explicit some features of the corpus in order to demonstrate the corpus usefulness particularly for the quality assessment of official terminologies such as the Systematized NOmenclature of MEDicine - Clinical Terms (SNOMED CT).},
  booktitle = {Proceedings of the 7th international conference on Language Resources and Evaluation (LREC), Malta},
  author = {Kokkinakis, Dimitrios and Gerdin, Ulla},
  year = {2010},
}

% NOTE(review): the venue reads as a symposium title rather than a periodical
% (no volume or pages are recorded), so it is stored as the booktitle of an
% inproceedings entry instead of journal; confirm against the publication.
@inproceedings{borin-forsberg-2010-beyond-129125,
  author = {Borin, Lars and Forsberg, Markus},
  title = {Beyond the synset: Swesaurus – a fuzzy Swedish wordnet},
  booktitle = {Re-thinking synonymy: semantic sameness and similarity in languages and their description},
  year = {2010},
}

@inproceedings{kokkinakis-toporowskagronostaj-2010-linking-119441,
  title = {Linking SweFN++ with Medical Resources, towards a MedFrameNet for Swedish},
  abstract = {In this pilot study we define and apply a methodology for building an event extraction system for the Swedish scientific medical and clinical language. Our aim is to find and describe linguistic expressions which refer to medical events, such as events related to diseases, symptoms and drug effects. In order to achieve this goal we have initiated actions that aim to extend and refine parts of the ongoing compilation of the Swedish FrameNet++ (SFN++), which, as its English original predecessor, is grounded in Frame Semantics which provides a sound theoretical ground for modeling and linking linguistic structures encountered in general language and in specific domains (after specialization). Using such resource we manually annotate domain texts to be used as training data for automatic event extraction by automated techniques.},
  booktitle = {Proceedings of the Second Louhi Workshop on Text and Data Mining of Health Documents. 
A NAACL-HLT Workshop},
  author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria},
  year = {2010},
}

% The booktitle fragment above had the acronym misspelled as "HTL"; the same
% workshop is spelled "NAACL HLT" in the allvin-etal-2010 entry of this file.
@inproceedings{lindh-2010-preliminary-123920,
  author = {Lindh, Jonas},
  title = {Preliminary Formant Data of the Swedia Dialect Database in a Forensic Phonetic Perspective},
  booktitle = {Proceedings of the 19th Annual Conference of the International Association for Forensic Phonetics and Acoustics, Trier, Germany},
  year = {2010},
  number = {19},
}

@inproceedings{borin-forsberg-2010-from-118908,
  author = {Borin, Lars and Forsberg, Markus},
  title = {From the People’s Synonym Dictionary to fuzzy synsets - first steps},
  booktitle = {Proceedings of the LREC 2010 workshop Semantic relations. Theory and Applications},
  year = {2010},
  pages = {18--25},
}

@inproceedings{forsbom-wilhelmsson-2010-revision-259876,
  author = {Forsbom, Eva and Wilhelmsson, Kenneth},
  title = {Revision of Part-of-Speech Tagging in Stockholm Umeå Corpus 2.0},
  abstract = {Many parsers use a part-of-speech tagger as a first step in parsing. The accuracy of the tagger naturally affects the performance of the parser. In this experiment, we revise 1500+ proposed errors in SUC 2.0 that were mainly found during work with schema parsing, and evaluate tagger instances trained on the revised corpus. The revisions turned out to be beneficial also for the taggers.},
  booktitle = {Proceedings of the Third Swedish Language Technology Conference (SLTC), Linköping, Sverige},
  year = {2010},
  address = {Linköping},
}

@incollection{toporowskagronostaj-skoldberg-2010-swedish-121119,
  title = {Swedish Medical Collocations: A Lexicographic Approach},
  booktitle = {Korpora, Web und Datenbanken. 
Computergestützte Methoden in der modernen Phraseologie und Lexikographie (Phraseologie und Parömiologie 25)},
  author = {Toporowska Gronostaj, Maria and Sköldberg, Emma},
  year = {2010},
  publisher = {Schneider Verlag Hohengehren GmbH},
  address = {Baltmannsweiler, Germany},
  isbn = {978-3-8340-0733-9},
  pages = {181--195},
}

@inproceedings{kokkinakis-2010-korpus-119444,
  author = {Kokkinakis, Dimitrios},
  title = {Korpus för vårdens och omsorgens fackspråk.},
  abstract = {Inom ramen för regeringens satsning ”Nationell IT-strategi för vård och omsorg” har Socialstyrelsen fått i uppdrag att översätta och anpassa begreppssystemet ’the Systematized Nomenclature of Medicine, Clinical Terms’ till svenska. Med hjälp av Läkartidningens digitala arkiv har vi utvecklat metoder för att effektivisera kvalitetssäkringen av terminnehållet. },
  booktitle = {Humanistdagen 2010 - humaniora i dagens samhälle.},
  year = {2010},
}

@inproceedings{wilhelmsson-2010-automatisk-247440,
  author = {Wilhelmsson, Kenneth},
  title = {Automatisk generering av frågor som svensk text besvarar: ett informationssystem},
  abstract = {Vilken information kan en text sägas innehålla? Ett enkelt svar är ”de frågor som den besvarar.” I vilken grad går det i så fall att automatiskt generera dessa frågor och därmed programmera ett frågebesvarande informationssystem för svensk text?},
  booktitle = {Röster från Humanisten 2010},
  year = {2010},
}

@inproceedings{borin-etal-2010-past-110368,
  title = {The past meets the present in Swedish FrameNet++},
  abstract = {The paper is about a recently initiated project which aims at the development of a Swedish FrameNet as an integral part of a larger lexical resource, hence the name “Swedish FrameNet++” (SweFN++). It focuses on reuse of free electronic resources and their role in the acquisition and population of Swedish frames. After a brief overview of Swedish resources, we reflect on three approaches to recycling the available lexical data in a semi-automatic manner. 
SweFN++ will be a multi-functional resource supporting research within lexicology and linguistics as well as different applications within computational lexicography and language technology, not to mention e-science.},
  booktitle = {14th EURALEX International Congress},
  author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  year = {2010},
  pages = {269--281},
}

@incollection{borin-kokkinakis-2010-literary-124517,
  author = {Borin, Lars and Kokkinakis, Dimitrios},
  title = {Literary onomastics and language technology},
  booktitle = {Literary education and digital learning},
  year = {2010},
  publisher = {Information Science Reference},
  address = {Hershey - New York},
  isbn = {978-1-60566-932-8},
  pages = {53--78},
}

% Conference contribution: a conference name is not a journal, so the venue is
% stored as the booktitle of an inproceedings entry (the value is unchanged).
@inproceedings{borin-etal-2010-swedish-129126,
  author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  title = {Swedish FrameNet++},
  booktitle = {Swedish Language Technology Conference 2010},
  year = {2010},
}

@incollection{ljunglof-wiren-2010-syntactic-99884,
  title = {Syntactic parsing},
  abstract = {This chapter presents basic techniques for grammar-driven natural language parsing, that is, analysing a string of words (typically a sentence) to determine its structural description according to a formal grammar. 
Basic parsing concepts are explained after which a number of well-known parsing techniques are described.},
  booktitle = {Handbook of Natural Language Processing, 2nd edition},
  author    = {Ljunglöf, Peter and Wirén, Mats},
  year      = {2010},
  publisher = {CRC Press, Taylor and Francis},
  ISBN      = {978-1420085921},
}

@incollection{borin-2010-avtryck-136656,
  title     = {Avtryck från WGLN-projekten i forskningen},
  booktitle = {Kunskapens nya världar},
  author    = {Borin, Lars},
  year      = {2010},
  publisher = {Uppsala universitet, Uppsala Learning Lab},
  address   = {Uppsala},
  ISBN      = {978-91-506-2189-1},
  pages     = {127--133},
}

@article{forsberg-2011-green-140694,
  title    = {Green resources in plain sight: opening up the SweFN++ project},
  abstract = {SweFN++ is a project focused on the creation and curation of Swedish lexical resources geared towards language technology applications. An important theme of the project is openness and its realization as a lexical infrastructure. We give a short overview of the project, elaborate on what we mean by openness, and present the current state of the lexical infrastructure.
},
  journal = {Proceedings of the Nodalida 2011 Workshop on visibility and availability of LT resources},
  author  = {Forsberg, Markus},
  year    = {2011},
}

@incollection{borin-forsberg-2011-diachronic-144291,
  title     = {A diachronic computational lexical resource for 800 years of Swedish},
  booktitle = {Language technology for cultural heritage},
  author    = {Borin, Lars and Forsberg, Markus},
  year      = {2011},
  publisher = {Springer},
  address   = {Berlin},
  ISBN      = {978-3-642-20226-1},
  pages     = {41--61},
}

@article{borin-forsberg-2011-swesaurus-151331,
  title   = {Swesaurus – ett svenskt ordnät med fria tyglar},
  journal = {LexicoNordica},
  author  = {Borin, Lars and Forsberg, Markus},
  year    = {2011},
  volume  = {18},
  pages   = {17--39},
}

@inProceedings{smith-etal-2011-developing-152723,
  title    = {Developing a toolkit for written information materials for patients with colorectal cancer undergoing elective surgery},
  abstract = {This study examines language complexity, readability and suitability, of written health information materials given to patients undergoing colorectal cancer (CRC) surgery. The overall aim is to investigate whether the implementation of adapted, person-centred information and communication for patients with CRC undergoing elective surgery, can enhance the patients’ self-care beliefs and well-being during recovery in the phase following diagnosis and initial treatment. Several explorative, qualitative studies are planned and will function both as a basis for the proposed interventions and provide explanations for the actual processes leading to the desired outcomes. Patients’ knowledge enablement will be reached by several interrelated intervention strategies and specific activities. One of these strategies deals with means to facilitate patients’ information seeking patterns and the goal is to provide patients with written information materials according to preferences for complex and detailed or legible texts.
Thus, the interventions planed aim to enhance a movement from receiving information and instructions to participating in knowing. Written and printed patient information material from 28 Swedish clinics for patients diagnosed with CRC undergoing elective surgery were selected for analysis by means of standard metrics and more elaborate language technology techniques. Various text parameters such as lexical variation, frequency bands and the use of terminology were examined. The material was also analysed using a Suitability Assessment Instrument in order to examine content, literacy demand, graphic illustrations, layout and typography, learning stimulation and finally cultural appropriateness. In addition, five focusgroups were conducted where patients were asked to give their experiences of using written information. Results from the language technology analysis showed a variety in materials, where it could be divided in to easy, medium and difficult to read and comprehend. Patients in focusgroups told they would like written materials to be levelled in order to gain information stepwise, but also stressed the importance of information given both orally and in writing, and that they must correspond. Using the SAM-instrument was a good complement for deeper understanding, and taking all three analyses in account, we aim to design a balanced toolkit for how to best design written information materials where a person tailored approach can be offered. 
},
  booktitle = {Svenska Läkaresällskapets Riksstämman},
  author    = {Smith, Frida and Carlsson, Eva and Friberg, Febe and Kokkinakis, Dimitrios and Forsberg, Markus and Öhrn, Matilda and Öhlén, Joakim},
  year      = {2011},
}

@inProceedings{borin-etal-2011-semantic-140686,
  title     = {Semantic Search in Literature as an e-Humanities Research Tool: CONPLISIT – Consumption Patterns and Life-Style in 19th Century Swedish Literature},
  abstract  = {We present our ongoing work on language technology-based e-science in the humanities, with a focus on text-based research in the historical sciences. Currently, we are working on the adaptation and integration of lexical resources representing different historical stages of Swedish into a lexical and morphological toolbox that will allow us to develop semantically oriented text search applications for historical research on Swedish text. We describe a semantic search prototype which was built using REST web services from this toolbox as components, and which has been evaluated by historians interested in using digitized 19th century novels as primary data for an historical investigation of the emerging consumer society in 19th century Sweden.},
  booktitle = {NEALT Proceedings Series (NODALIDA 2011 Conference Proceedings)},
  author    = {Borin, Lars and Forsberg, Markus and Ahlberger, Christer},
  year      = {2011},
  volume    = {11},
  pages     = {58--65},
}

@inProceedings{kokkinakis-2011-evaluating-139977,
  title    = {Evaluating the Coverage of three Controlled Health Vocabularies with Focus on Findings, Signs \& Symptoms},
  abstract = {The medical domain is blessed with a magnitude of terminological resources of various characteristics, sizes, structure, depth and breadth of descriptive power, granularity etc.
In this domain a particularly interesting and difficult entity type are signs, symptoms and findings which to a large extend are expressed in a periphrastic manner, sometimes by the use of figurative or metaphorical language, or contextualized using a wealth of vague variant expressions. We hypothesize therefore that no major official terminology source alone can accommodate for the variation and complexity present in real text data, such as electronic medical records, notes or health related documents. In this paper we evaluate the content of the three largest medical control vocabularies available for Swedish on extracted reference symptom lists and initiate a discussion on how we should proceed in order to accommodate for increased coverage on similar genres. },
  booktitle = {Workshop on Creation, Harmonization and Application of Terminology Resources Co-located with NODALIDA 2011},
  author    = {Kokkinakis, Dimitrios},
  year      = {2011},
  pages     = {5},
}

@inProceedings{saxena-borin-2011-dialect-140689,
  title     = {Dialect Classification in the Himalayas: a Computational Approach},
  abstract  = {Linguistic fieldwork data – in the form of basic vocabulary lists – for nine closely related language varieties are compared using an automatic procedure with manual feedback, whose major advantage is its complete consistency. The results of the vocabulary comparison turn out to be in accord with other linguistic features, making this methodology a promising addition to the toolbox of genetic linguistics.},
  booktitle = {NEALT Proceedings Series (NODALIDA 2011 Conference Proceedings)},
  author    = {Saxena, Anju and Borin, Lars},
  year      = {2011},
  volume    = {11},
  pages     = {307--310},
}

@inProceedings{skadina-etal-2011-meta-148648,
  title    = {META-NORD: Towards sharing of language resources in Nordic and Baltic countries},
  abstract = {This paper introduces the META-NORD project which develops Nordic and Baltic part of the European open language resource infrastructure.
META-NORD works on assembling, linking across languages, and making widely available the basic language resources used by developers, professionals and researchers to build specific products and applications. The goals of the project, overall approach and specific action lines on wordnets, terminology resources and treebanks are described. Moreover, results achieved in first five months of the project, i.e. language whitepapers, metadata specification and IPR management, are presented.},
  booktitle = {Proceedings of the Workshop on Language Resources, Technology and Services in the Sharing Paradigm},
  author    = {Skadina, Inguna and Vasiljevs, Andrejs and Borin, Lars and De Smedt, Koenraad and Lindén, Krister and Rögnvaldsson, Eiríkur},
  year      = {2011},
  pages     = {107--114},
}

@inProceedings{kokkinakis-2011-reducing-143877,
  title     = {Reducing Complexity in Parsing Scientific Medical Data, a Diabetes Case Study},
  abstract  = {The aim of this study is to assemble and deploy various NLP components and resources in order to parse scientific medical data and evaluate the degree in which these resources contribute to the overall parsing performance. With parsing we limit our efforts to the identification of unrestricted noun phrases with full phrase structure and investigate the effects of using layers of semantic annotations prior to parsing. Scientific medical texts exhibit complex linguistic structure but also regularities that can be captured by pre-processing the texts with specialized semantically-aware tools. Our results show evidence of improved performance while the complexity of parsing is reduced. Parsed scientific texts and inferred syntactic information can be leveraged to improve the accuracy of higher-level tasks such as information extraction and enhance the acquisition of semantic relations and events.},
  booktitle = {Workshop: Biomedical Natural Language Processing in conjunction with Recent Advances in Natural Language Processing (RANLP).
Hissar, Bulgaria.},
  author = {Kokkinakis, Dimitrios},
  year   = {2011},
}

@inProceedings{kokkinakis-2011-health-141311,
  title     = {Health Portals and Clinical Phenotypes - Recognition using SNOMED CT},
  abstract  = {The medical domain is particularly well endowed with various sources of terminology. Usually, such sources vary with respect to size, structure, depth and breadth of descriptive power, granularity and applicability. This paper investigates the extent by which the largest available medical nomenclature for Swedish can cope with a particularly challenging and difficult to automatically acquire type of terminology, namely (clinical) phenotypes. We evaluated the content of the resource on extracted reference symptom lists from several popular health portals. The results indicate that a large number of such phenotypes are expressed using figurative language, or contextualized using a number of variant expressions. SNOMED CT cannot easily accommodate for such variation and vagueness expressed in real text data, unless we devise means to handle such variation, e.g. by the use of near synonym dictionaries, development and linking of consumer health vocabularies. The presented research has several implications since accurate identification of phenotypes can for instance increase the value of available data in decision making and thus allow automatic systems to dynamically correct inappropriate domain decisions.},
  booktitle = {9th Scandinavian Conference on Health Informatics},
  author    = {Kokkinakis, Dimitrios},
  year      = {2011},
}

@article{kokkinakis-2011-medicinska-149931,
  title    = {Medicinska terminologier - officiella standarder och verklighet},
  abstract = {Officiella medicinska termlistor hinner aldrig bli helt kompletta eller uppdaterade i tid med de senaste upptäckterna inom det (bio)medicinska fältet Växande behov av koppling mellan fack- och allmänspråk för praktiska (medicinskt orienterade) tillämpningar, t.ex.
"din journal på nätet"-projektet Applikationer med indata som innehåller både fackspråk och allmänspråk - brist på täckande medicinska (elektroniska) ordböcker/termlistor med integrerad utförlig språklig och medicinsk information för lekmän finns inte transkriberade patient-läkarsamtal Använda existerande medicinska terminologier i språkteknologisk forskning som stöd för informationsutvinning - skapa strukturerade representationer av texter (samförekomstanalys; faktaextraktion och syntes; relation- och händelseextraktion; t.ex. mellan sjukdom - behandling - utfall få att kunna få ett bra underlag för att kunna förutsäga hur framtida behandlingar slår) Använda terminologin som ett medium för att underlätta kommunikationen mellan hälsotagare och hälsogivare t.ex. underlätta förståelse av medicinska termer av allmänheten },
  journal  = {Terminologiworkshop i Karlstad},
  author   = {Kokkinakis, Dimitrios},
  year     = {2011},
}

@inProceedings{rama-borin-2011-estimating-140688,
  title    = {Estimating Language Relationships from a Parallel Corpus. A Study of the Europarl Corpus},
  abstract = {Since the 1950s, linguists have been using short lists (40–200 items) of basic vocabulary as the central component in a methodology which is claimed to make it possible to automatically calculate genetic relationships among languages. In the last few years these methods have experienced something of a revival, in that more languages are involved, different distance measures are systematically compared and evaluated, and methods from computational biology are used for calculating language family trees. In this paper, we explore how this methodology can be extended in another direction, by using larger word lists automatically extracted from a parallel corpus using word alignment software.
We present preliminary results from using the Europarl parallel corpus in this way for estimating the distances between some languages in the Indo-European language family.},
  booktitle = {NEALT Proceedings Series (NODALIDA 2011 Conference Proceedings)},
  author    = {Rama, Taraka and Borin, Lars},
  year      = {2011},
  volume    = {11},
  pages     = {161--167},
}

@article{johanssonkokkinakis-volodina-2011-corpus-148533,
  title    = {Corpus-based approaches for the creation of a frequency based vocabulary list in the EU project KELLY – issues on reliability, validity and coverage},
  abstract = {At present there are relatively few vocabulary lists for Swedish describing modern vocabulary as well as being adapted to language learners’ needs. In Europe including Sweden there exist approaches to unify ways of working consistently with language learning, one example worth naming in this respect is the Common European Framework of Reference (CEFR) which provides guidelines for systematic approach to language teaching and assessment of language proficiency. This article describes EU project Kelly (KEywords for Language Learning for Young and adults alike, 2009-2012), the main objective of which was to create vocabulary lists for nine languages (Swedish, English, Norwegian, Greek, Italian, Polish, Arabic, Chinese and Russian) and adapt them to CEFR levels. We describe the process of compiling and validating the Swedish Kelly-list, dwell on benefits and limitations of using a corpus based approach in this project; as well as mention the impact of the methodological approach for compiling vocabulary lists for specific purposes.
},
  journal = {eLex, 10-12 November 2011, Slovenia},
  author  = {Johansson Kokkinakis, Sofie and Volodina, Elena},
  year    = {2011},
  volume  = {2011},
}

@inProceedings{kokkinakis-malm-2011-character-143875,
  title     = {Character Profiling in 19th Century Fiction},
  abstract  = {This paper describes the way in which personal relationships between main characters in 19th century Swedish prose fiction can be identified using information guided by named entities, provided by a entity recognition system adapted to the 19th century Swedish language characteristics. Interpersonal relation extraction is based on the context between two relevant, identified person entities. The extraction process of the relationships also utilize the content of on-line available lexical semantic resources (suitable vocabularies) and fairly standard context matching methods that provide a basic mechanism for identifying a wealth of interpersonal relations that hopefully can aid the reader of a 19th-century Swedish literary work to better understand its content and plot, and get a bird’s eye view on the landscape of the core story.},
  booktitle = {Workshop: Language Technologies for Digital Humanities and Cultural Heritage in conjunction with the Recent Advances in Natural Language Processing (RANLP). Hissar, Bulgaria.},
  author    = {Kokkinakis, Dimitrios and Malm, Mats},
  year      = {2011},
}

@article{kokkinakis-2011-natural-149930,
  title    = {Natural language processing of clinical data with a focus on diffuse symptoms},
  abstract = {The medical domain is well supported with a wealth of large, rich and varied controlled vocabularies and terminological resources. This paper investigates the extent by which the largest available medical nomenclature for Swedish, the Systematized Nomenclature of Medicine Clinical Terms (SNOMED CT), can handle a particularly challenging and difficult to automatically acquire type of terminology, namely (clinical) phenotypes.
The aim of the study is to better understand phenotype contextualization in order to improve and enhance our knowledge of communicative events in various healthcare settings. Our approach can be seen as an exploratory one in which we believe to yield useful insights into the nature of how findings, symptoms and signs (i.e. clinical phenotypes in general) are expressed in real data. This study is initiated in the context of the project "Interpretation and understanding of functional symptoms in primary health care". The main research goal of which is to study health care interactions with patients suffering from Functional Somatic Syndromes (FSS). FSS are characterized by particular constellations of medically unexplained, often chronic symptoms, such as dizziness, fatigue, dyspepsia, muscle and joint pain. We use methods from the natural language processing field in order to investigate how symptom mentions are expressed and how available successful automated means are for capturing symptom descriptions both on collected written (patient records) and transcribed material (patient/nurse and patient/doctor encounters). We manually evaluated the content of the resource on the collected data and our results indicate that a large number of such phenotypes are expressed using figurative language, or contextualized using a number of variant expressions. SNOMED CT cannot easily accommodate for such variation and vagueness expressed in real text data, unless we devise means to handle such variation, e.g. by the use of near synonym dictionaries, development and linking of consumer health vocabularies. The presented research has several implications since accurate identification of phenotypes can for instance increase the value of available data in decision making and thus allow automatic systems to dynamically correct inappropriate domain decisions. 
We have evaluated the content of a large controlled vocabulary for Swedish on symptom descriptions in clinical texts.},
  journal = {Läkaresällskapets Riksstämman},
  author  = {Kokkinakis, Dimitrios},
  year    = {2011},
}

@article{lindh-2011-peter-142484,
  title    = {Peter French},
  abstract = {The Encyclopedia of Applied Linguistics is a ground-breaking resource, spanning the entire field. Truly international in scope, it brings together contributions from the world’s most respected scholars in applied linguistics. Available online or as a 10-volume print set, this comprehensive print and electronic resource provides an overview of all the key areas in applied linguistics, from language learning and language policy, to qualitative methods in applied linguistics, and technology and language. Comprising over 3.5 million words, across 1,200 entries, it spans key developments and ideas in applied linguistics, historic and emerging areas of research, and includes 250 biographies of prominent figures who have helped shaped this diverse, and ever-growing field.},
  journal  = {The Encyclopedia of Applied Linguistics},
  author   = {Lindh, Jonas},
  year     = {2011},
  pages    = {2},
}

@inProceedings{vancoppenolle-etal-2011-german-154315,
  title    = {A German Grammar for Generation in OpenCCG},
  abstract = {We present a freely available CCG fragment for German that is being developed for natural language generation tasks in the domain of share price statistics. It is implemented in OpenCCG, an open source Java implementation of the computationally attractive CCG formalism. Since generation requires lexical categories to have semantic representations, so that possible realizations can be produced, the underlying grammar needs to define semantics. Hybrid Logic Dependency Semantics, a logic calculus especially suited for encodings linguistic meaning, is used to declare the semantics layer. To our knowledge, related work on German CCG development has not yet focused on the semantics layer.
In terms of syntax, we concentrate on aspects of German as a partially free constituent order language. Special attention is payed to scrambling, where we employ CCG's type-changing mechanism in a manner that is somewhat unusual, but allows us to a) minimize the amount of syntactic categories that are needed to model scrambling, compared to providing categories for all possible argument orders, and b) retain enough control to impose restrictions on scrambling.},
  booktitle = {H. Hedeland, T. Schmidt, K. Wörner (eds.): Multilingual Resources and Multilingual Applications. Proc. of the Conference of the German Society for Computational Linguistics and Language Technology (GSCL), Hamburg, 2011. Working Papers in Multilingualism, Series B},
  author    = {Vancoppenolle, Jean and Tabbert, Eric and Bouma, Gerlof and Stede, Manfred},
  year      = {2011},
  number    = {96},
  pages     = {145--150},
}

@article{lindh-2011-francis-142483,
  title    = {Francis Nolan},
  abstract = {The Encyclopedia of Applied Linguistics is a ground-breaking resource, spanning the entire field. Truly international in scope, it brings together contributions from the world’s most respected scholars in applied linguistics. Available online or as a 10-volume print set, this comprehensive print and electronic resource provides an overview of all the key areas in applied linguistics, from language learning and language policy, to qualitative methods in applied linguistics, and technology and language.
Comprising over 3.5 million words, across 1,200 entries, it spans key developments and ideas in applied linguistics, historic and emerging areas of research, and includes 250 biographies of prominent figures who have helped shaped this diverse, and ever-growing field.},
  journal = {The Encyclopedia of Applied Linguistics},
  author  = {Lindh, Jonas},
  year    = {2011},
  pages   = {2},
}

@inProceedings{lindh-morrison-2011-humans-146100,
  title     = {Humans versus machine: forensic voice comparison on a small database of Swedish voice recordings},
  abstract  = {A procedure for comparing the performance of humans and machines on speaker recognition and on forensic voice comparison is proposed and demonstrated. The procedure is consistent with the new paradigm for forensic-comparison science (use of the likelihood-ratio framework and testing of the validity and reliability of the results). The use of the procedure is demonstrated using a small database of Swedish voice recordings.},
  booktitle = {Proceedings of ICPhS2011},
  author    = {Lindh, Jonas and Morrison, Geoffrey Stewart},
  year      = {2011},
  volume    = {17},
  pages     = {4},
}

@inProceedings{kokkinakis-2011-what-141312,
  title    = {What is the Coverage of SNOMED CT® on Scientific Medical Corpora?},
  abstract = {This paper reports on the results of a large scale mapping of SNOMED CT on scientific medical corpora. The aim is to automatically access the validity, reliability and coverage of the Swedish SNOMED-CT translation, the largest, most extensive available resource of medical terminology. The method described here is based on the generation of predominantly safe harbor term variants which together with simple linguistic processing and the already available SNOMED term content are mapped to large corpora. The results show that term variations are very frequent and this may have implication on technological applications (such as indexing and information retrieval, decision support systems, text mining) using SNOMED CT.
Naïve approaches to terminology mapping and indexing would critically affect the performance, success and results of such applications. SNOMED CT appears not well-suited for automatically capturing the enormous variety of concepts in scientific corpora (only 6,3\% of all SNOMED terms could be directly matched to the corpus) unless extensive variant forms are generated and fuzzy and partial matching techniques are applied with the risk of allowing the recognition of a large number of false positives and spurious results.},
  booktitle = {Studies in Health Technology and Informatics / XXIII International Conference of the European Federation for Medical Informatics},
  author    = {Kokkinakis, Dimitrios},
  year      = {2011},
  volume    = {169},
}

@comment{The next entry is an edited volume. It uses the standard book type with an editor field, because the non-standard type edited_book is not defined by bibliography styles and makes them fall back to misc, which drops the editor list.}
@book{malmgren-etal-2011-lexins-174145,
  title     = {Lexins svenska lexikon (4 uppl.)},
  editor    = {Malmgren, Sven-Göran and Berg, Daniel and Berg, Sture and Hult, Ann-Kristin and Holmer, Louise and Sjögreen, Christian and Sköldberg, Emma and Toporowska Gronostaj, Maria},
  year      = {2011},
  publisher = {Internetpublikation},
  address   = {Stockholm},
}

@article{hammarstrom-borin-2011-unsupervised-141707,
  title   = {Unsupervised learning of morphology},
  journal = {Computational Linguistics},
  author  = {Hammarström, Harald and Borin, Lars},
  year    = {2011},
  volume  = {37},
  number  = {2},
  pages   = {309--350},
}

@techreport{borin-etal-2011-metadata-142495,
  title    = {Metadata descriptions and other interoperability standards},
  abstract = {An important aim of META-NORD is to upgrade and harmonize national language resources and tools in order to make them interoperable, within languages and across languages, with respect to their data formats and as far as possible also as regards their content.
Since resources and to some extent tools will remain in one location – one of a number of META-NORD centers – the preferred way of accessing and utilizing resources and tools will be through metadata and APIs, allowing the assembly of on-the-fly tool-chains made up of standardized component language technology tools, processing distributed – and in many cases interlinked – language resources in standardized formats.},
  author = {Borin, Lars and Lindh, Jonas and Brandt, Martha and Olsson, Leif-Jöran},
  year   = {2011},
}

@inProceedings{ju-etal-2011-towards-151361,
  title     = {Towards Using Reranking in Hierarchical Classification},
  abstract  = {We consider the use of reranking as a way to relax typical independence assumptions often made in hierarchical multilabel classification. Our reranker is based on (i) an algorithm that generates promising k-best classification hypotheses from the output of local binary classifiers that classify nodes of a target tree-shaped hierarchy; and (ii) a tree kernel-based reranker applied to the classification tree associated with the hypotheses above. We carried out a number of experiments with this model on the Reuters corpus: we firstly show the potential of our algorithm by computing the oracle classification accuracy. This demonstrates that there is a significant room for potential improvement of the hierarchical classifier. Then, we measured the accuracy achieved by the reranker, which shows a significant performance improvement over the baseline. },
  booktitle = {Proceedings of the Joint ECML/PKDD-PASCAL Workshop on Large-Scale Hierarchical Classification; September 5, 2011; Athens, Greece},
  author    = {Ju, Qi and Johansson, Richard and Moschitti, Alessandro},
  year      = {2011},
}

@inProceedings{wilhelmsson-2011-automatic-259874,
  title    = {Automatic Question Generation from Swedish Documents as a Tool for Information Extraction},
  abstract = {An implementation of automatic question generation (QG) from raw Swedish text is presented.
QG is here chosen as an alternative to natural query systems where any query can be posed and no indication is given of whether the current text database includes the information sought for. The program builds on parsing with grammatical functions from which corresponding questions are generated and it incorporates the article database of Swedish Wikipedia. The pilot system is meant to work with a text shown in the GUI and auto-completes user input to help find available questions. The act of question generation is here described together with early test results regarding the current produced questions.},
  booktitle = {Proceedings of the 18th Nordic Conference of Computational Linguistics NODALIDA 2011, NEALT Proceedings Series Vol. 11},
  author    = {Wilhelmsson, Kenneth},
  year      = {2011},
  publisher = {Northern European Association for Language Technology (NEALT)},
  address   = {Tartu},
}

@inProceedings{ghosh-etal-2011-shallow-151356,
  title    = {Shallow Discourse Parsing with Conditional Random Fields},
  abstract = {Parsing discourse is a challenging natural language processing task. In this paper we take a data driven approach to identify arguments of explicit discourse connectives. In contrast to previous work we do not make any assumptions on the span of arguments and consider parsing as a token-level sequence labeling task. We design the argument segmentation task as a cascade of decisions based on conditional random fields (CRFs). We train the CRFs on lexical, syntactic and semantic features extracted from the Penn Discourse Treebank and evaluate feature combinations on the commonly used test split. We show that the best combination of features includes syntactic and semantic features.
The comparative error analysis investigates the performance variability over connective types and argument positions.},
  booktitle = {Proceedings of 5th International Joint Conference on Natural Language Processing; editors Haifeng Wang and David Yarowsky; Chiang Mai, Thailand; November 8-13, 2011},
  author    = {Ghosh, Sucheta and Johansson, Richard and Riccardi, Giuseppe and Tonelli, Sara},
  year      = {2011},
  pages     = {1071--1079},
}

@inProceedings{ghosh-etal-2011-discourse-151350,
  title     = {End-to-End Discourse Parser Evaluation},
  abstract  = {We are interested in the problem of discourse parsing of textual documents. We present a novel end-to-end discourse parser that, given a plain text document in input, identifies the discourse relations in the text, assigns them a semantic label and detects discourse arguments spans. The parsing architecture is based on a cascade of decisions supported by Conditional Random Fields (CRF). We train and evaluate three different parsers using the PDTB corpus. The three system versions are compared to evaluate their robustness with respect to deep/shallow and automatically extracted syntactic features.},
  booktitle = {Fifth IEEE International Conference on Semantic Computing (ICSC), 2011; September 18-21, 2011; Palo Alto, United States},
  author    = {Ghosh, Sucheta and Tonelli, Sara and Riccardi, Giuseppe and Johansson, Richard},
  year      = {2011},
  ISBN      = {978-1-4577-1648-5},
}

@comment{The citation key below keeps its original spelling so existing citations still resolve; only the author and booktitle fields carry the corrected spellings.}
@inProceedings{vasljevs-etal-2011-meta-140690,
  title     = {META-NORD: Baltic and Nordic Branch of the European Open Linguistic Infrastructure},
  booktitle = {Proceedings of the Nodalida 2011 Workshop on visibility and availability of LT resources},
  author    = {Vasiljevs, Andrejs and Pedersen, Bolette Sandford and De Smedt, Koenraad and Borin, Lars and Skadina, Inguna},
  year      = {2011},
}

@inProceedings{smith-etal-2012-forbattra-170895,
  title    = {Hur kan vi förbättra skriftligt informations- och utbildningsmaterial för patienter som opereras elektivt för kolorektal cancer?},
  abstract =
{Kolorektal cancer (KRC) är den tredje största cancerdiagnosen i Sverige med drygt 5500 drabbade årligen. Primär behandling är kirurgi kompletterad av pre- och postoperativ onkologisk behandling. Standardiserade koncept för accelererat vårdförlopp med kortare vårdtider lägger mycket fokus på fysisk rehabilitering, men mindre på den psykiska påfrestning det innebär att bli opererad för en cancerdiagnos. Patienter förväntas ta stort ansvar för sin rehabilitering, både på sjukhuset och hemma. För att vara förberedd behövs både skriftlig och muntlig information. Syftet med studien var att kartlägga och karaktärisera det skriftliga informations- och utbildningsmaterial (IOU) som används till patienter som opereras elektivt för KRC. Vidare var syftet att beskriva patienters uppfattning om struktur och innehåll på IOU. IOU från 28 kliniker som opererar patienter med KRC samlades in (totalt 220 st). För att kunna ge ett mått på texternas svårighetsgrad gjordes språkteknologisk analys på samtliga IOU, där bl.a. ordlängd, meningsbyggnad och jämförelse med annan typ av litteratur mättes. På 117 st gjordes en suitabilityanalys med instrumentet SAM+CAM där domän som innehåll, läsbarhet, bilder, layout samt stimulans och motivation för lärande bedömdes. Fem fokusgrupper med patienter genomfördes där patienterna uppmanades att berätta om vad de tycker utmärker ett bra respektive dåligt IOU, vad de saknar i innehåll och när och på vilket sätt de vill ha materialet utlämnat. Resultatet av språkteknologiska- och suitabilityanalysen visar att de flesta IOU bedömdes som ”adequate”, men spridningen var stor. Patienterna hade önskemål om mer nivåuppdelat/nivåriktat material, där man själv kan välja hur mycket information man vill ha vid ett visst tillfälle. Flera ämnen saknades, eller var för otydligt beskrivna för att patienterna skulle känna sig trygga vid hemgång. 
Resultatet av de tre analysmetoderna bör kunna användas för att utveckla en ”verktygslåda” för att i framtiden kunna utforma bättre riktat IOU för patientgruppen. },
  booktitle = {Nationella konferensen i Cancervård, 24-25 maj 2012, Stockholm},
  author = {Smith, Frida and Öhlén, Joakim and Carlsson, Eva and Forsberg, Markus and Kokkinakis, Dimitrios and Friberg, Febe},
  year = {2012},
}

@inProceedings{lyngfelt-etal-2012-adding-163582,
  title = {Adding a constructicon to the Swedish resource network of Språkbanken},
  abstract = {This paper presents the integrated Swedish resource network of Språkbanken in general, and its latest addition – a constructicon – in particular. The constructicon, which is still in its early stages, is a collection of (partially) schematic multi-word units, constructions, developed as an addition to the Swedish FrameNet (SweFN). SweFN and the constructicon are integrated with other parts of Språkbanken, both lexical resources and corpora, through the lexical resource SALDO. In most respects, the constructicon is modeled on its English counterpart in Berkeley, and, thus, following the FrameNet format. The most striking differences are the inclusion of so-called collostructional elements and the treatment of semantic roles, which are defined globally instead of locally as in FrameNet.
Incorporating subprojects such as developing methods for automatic identification of constructions in authentic text on the one hand, and accounting for constructions problematic for L2 acquisition on the other, the approach is highly cross-disciplinary in nature, combining various theoretical linguistic perspectives on construction grammar with language technology, lexicography, and L2 research.},
  booktitle = {11th Conference on Natural Language Processing (KONVENS) Proceedings},
  author = {Lyngfelt, Benjamin and Borin, Lars and Forsberg, Markus and Prentice, Julia and Rydstedt, Rudolf and Sköldberg, Emma and Tingsell, Sofia},
  year = {2012},
  ISBN = {3-85027-005-X},
  pages = {452--461},
}

@misc{andersen-forsberg-2012-sibirientyska-162958,
  title = {Sibirientyska},
  abstract = {German in Siberia are transcriptions of German spoken in the region of Krasnoyarsk (Russia). The corpus contains about 34 000 running words. Codeswitching to Russian and verb forms are annotated (Russian word forms in brackets like [vot], finite verb forms (FINIT), infinite verb forms (INFIN)). The transcription and annotation of the corpus have been established in collaboration with the Astafyev University Krasnoyarsk. The corpus is a part of a research project at the University of Gothenburg, see http://www.sprak.gu.se/kontakta-oss/larare/andersen-christiane/syntax-in-contact/ The data base is currently in the test phase. },
  author = {Andersen, Christiane and Forsberg, Markus},
  year = {2012},
  publisher = {University of Gothenburg},
  address = {Göteborg},
}

@inProceedings{borin-etal-2012-transferring-157213,
  title = {Transferring Frames: Utilization of Linked Lexical Resources},
  abstract = {In our experiment, we evaluate the transferability of frames from Swedish to Finnish in parallel corpora. We evaluate both the theoretical possibility of transferring frames and the possibility of performing it using available lexical resources.
We add the frame information to an extract of the Swedish side of the Kotus and JRC-Acquis corpora using an automatic frame labeler and copy it to the Finnish side. We focus on evaluating the results to get an estimation on how often the parallel sentences can be said to express the same frame. This sheds light to the questions: Are the same situations in the two languages expressed using different frames, i.e. are the frames transferable even in theory? How well can the frame information of running text be transferred from language to another? },
  booktitle = {Proceedings of the Workshop on Inducing Linguistic Structure Submission (WILS)},
  author = {Borin, Lars and Forsberg, Markus and Johansson, Richard and Muhonen, Kristiina and Purtonen, Tanja and Voionmaa, Kaarlo},
  year = {2012},
  pages = {8--15},
}

@inProceedings{forsberg-lager-2012-cloud-156078,
  title = {Cloud Logic Programming for Integrating Language Technology Resources},
  abstract = {The main goal of the CLT Cloud project is to equip lexica, morphological processors, parsers and other software components developed within CLT (Centre of Language Technology) with so called web API:s, thus making them available on the Internet in the form of web services. We present a proof-of-concept implementation of the CLT Cloud server where we use the logic programming language Prolog for composing and aggregating existing web services into new web services in a way that encourages creative exploration and rapid prototyping of LT applications. },
  booktitle = {Proceedings of LREC 2012},
  author = {Forsberg, Markus and Lager, Torbjörn},
  year = {2012},
  note = {Accepted},
}

@techreport{hoglund-etal-2012-maskininlarningsbaserad-159347,
  title = {Maskininlärningsbaserad indexering av digitaliserade museiartefakter - projektrapport},
  abstract = {Projektet har genomfört försök med maskinbaserad analys och maskininlärning för automatisk indexering och analys av bilder som stöd för registrering av föremål i museibestånd.
Resultaten visar att detta är möjligt för avgränsade delmängder i kombination med maskininlärning som stöd för, men inte som ersättning för, manuell analys. Projektet har också funnit behov av utveckling av ett användargränssnitt för både text och bildsökning och utvecklat en prototyplösning för detta, vilket finns dokumenterat i denna rapport och i ett separat appendix till rapporten. Materialet utgör grundunderlag för implementeringar som innebär utökade sökmöjligheter, effektivare registrering samt ett användarvänligt gränssnitt. Arbetet ligger i framkant av forskningsområdets resultat och etablerade metoder och kombinerar statistiska, lingvistiska och datavetenskapliga metoder. Se länk till rapport och även länk till appendix längre ned. },
  author = {Höglund, Lars and Eklund, Johan and Wilhelmsson, Kenneth},
  year = {2012},
  institution = {University of Gothenburg},
  publisher = {University of Gothenburg},
  address = {Göteborg},
}

@inProceedings{borin-etal-2012-growing-171988,
  title = {Growing a Swedish constructicon in lexical soil},
  booktitle = {Proceedings of the Swedish Language Technology Conference. Lund, October 24-26, 2012},
  author = {Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Prentice, Julia and Rydstedt, Rudolf and Sköldberg, Emma and Tingsell, Sofia},
  year = {2012},
  pages = {10--11},
}

@inProceedings{junger-etal-2012-scxml-164522,
  title = {SCXML for Building Conversational Agents in the Dialog Web Lab},
  abstract = {The W3C has selected Harel Statecharts, under the name of State Chart XML (SCXML), as the basis for future standards in the area of (multimodal) dialog systems (Barnett et al. 2012).
In an effort to educate people about SCXML we are building a web-based development environment where the dialogs of embodied, spoken conversational agents can be managed and controlled using SCXML, in a playful and interesting manner.},
  booktitle = {Proceedings of The Swedish Language Technology Conference (SLTC) 2012},
  author = {Junger, David and Lager, Torbjörn and Roxendal, Johan},
  year = {2012},
}

@inProceedings{borin-etal-2012-korp-156080,
  title = {Korp – the corpus infrastructure of Språkbanken},
  abstract = {We present Korp, the corpus infrastructure of Språkbanken (the Swedish Language Bank). The infrastructure consists of three main components: the Korp corpus pipeline, the Korp backend, and the Korp frontend. The Korp corpus pipeline is used for importing corpora, annotating them, and then exporting the annotated corpora into different formats. An essential feature of the pipeline is the ability to leave existing annotations untouched, both structural and word level annotations, and to use the existing annotations as the foundation of other annotations. The Korp backend consists of a set of REST-based web services for searching in and retrieving information about the corpora. Finally, the Korp frontend is a graphical search interface that interacts with the Korp backend. The interface has been inspired by corpus search interfaces such as SketchEngine, Glossa, and DeepDict, and it uses State Chart XML (SCXML) in order to enable users to bookmark interaction states. We give a functional and technical overview of the three components, followed by a discussion of planned future work. },
  booktitle = {Proceedings of LREC 2012.
Istanbul: ELRA},
  author = {Borin, Lars and Forsberg, Markus and Roxendal, Johan},
  year = {2012},
  note = {Accepted},
  pages = {474--478},
}

@inProceedings{pedersen-etal-2012-linking-155599,
  title = {Linking and validating Nordic and Baltic wordnets},
  booktitle = {Proceedings of the 6th International Global Wordnet Conference},
  author = {Pedersen, Bolette Sandford and Borin, Lars and Forsberg, Markus and Lindén, Krister and Orav, Heili and Rögnvaldsson, Eiríkur},
  year = {2012},
  note = {Accepted},
  pages = {254--260},
}

@inProceedings{ahlberg-enache-2012-type-166722,
  title = {A Type-Theoretical Wide-Coverage Computational Grammar for Swedish},
  booktitle = {Proceedings of the 15th International Conference, TSD (Text, Speech and Dialogue) 2012, Brno, Czech Republic, September 3-7, 2012, LNCS series "Text, Speech and Dialogue"},
  author = {Ahlberg, Malin and Enache, Ramona},
  year = {2012},
  volume = {7499},
  ISBN = {978-3-642-32790-2},
  pages = {183--190},
}

@inProceedings{ahlberg-bouma-2012-best-172769,
  title = {A best-first anagram hashing filter for approximate string matching with generalized edit distance},
  abstract = {This paper presents an efficient method for approximate string matching against a lexicon. We define a filter that for each source word selects a small set of target lexical entries, from which the best match is then selected using generalized edit distance, where edit operations can be assigned an arbitrary weight. The filter combines a specialized hash function with best-first search. Our work extends and improves upon a previously proposed hash-based filter, developed for matching with uniform-weight edit distance. We evaluate an approximate matching system implemented with the new best-first filter, by conducting several experiments on a historical corpus and a set of weighted rules taken from the literature. We present running times and discuss how performance varies using different stopping criteria and target lexica.
The results show that the filter is suitable for large rule sets and million word corpora, and encourage further development. },
  booktitle = {24th International Conference on Computational Linguistics COLING, 8-15 December 2012, Mumbai, India. Proceedings},
  author = {Ahlberg, Malin and Bouma, Gerlof},
  year = {2012},
}

@article{smith-etal-2012-studie-170897,
  title = {Ny studie visar hur information till patienter med kolorektal cancer kan förbättras},
  abstract = {Skriftligt informationsmaterial är ofta skrivet på för hög nivå och ställer höga krav på den tänkta läsaren (patienten). Förutom läsbarhet finns det fler faktorer att utvärdera för att se om materialet är lämpligt. Innehåll, struktur, layout och typsnitt, illustrationer och lärande och motivation är sådant som bör tas hänsyn till. Ett lämpligare, bättre anpassat material kan hjälpa personer med sjukdom att ställa bättre frågor när de har samtal med vårdpersonal och det kan göra personen mindre osäker och orolig för det okända som väntar. En ny studie som ingår i forskningsprojektet PINCORE (personcentred information and communication in colorectal cancer care) syftar till att förbättra information och kommunikation vid kolorektal cancer.},
  journal = {Cancervården},
  author = {Smith, Frida and Öhlén, Joakim and Carlsson, Eva and Friberg, Febe and Forsberg, Markus and Kokkinakis, Dimitrios},
  year = {2012},
  number = {5},
  pages = {18--21},
}

@inProceedings{wilhelmsson-2012-automatic-165989,
  title = {Automatic question generation for Swedish: The current state},
  abstract = {The research area of question generation (QG), in its current form, has a relatively brief history within NLP. A description of the current question generation implementation for Swedish text built on schema parsing is here presented and exemplified.
Underlying the current approach is the view of ‘all textual information as answers to questions.’ This paper discusses strategies for enhanced functionality for arbitrary Swedish text through extended question generation. It also brings up some theoretical issues regarding the nature of the task, and concerns practical considerations in an area such as Intelligent CALL (ICALL) where this type of application has been considered for English. ISSN (print): 1650-3686, ISSN (online): 1650-3740},
  booktitle = {Proceedings of the SLTC 2012 workshop on NLP for CALL, Lund, 25th October, 2012, Linköping Electronic Conference Proceedings},
  author = {Wilhelmsson, Kenneth},
  year = {2012},
  volume = {80},
  pages = {71--79},
}

@inProceedings{vasijevs-etal-2012-creation-156083,
  title = {Creation of an Open Shared Language Resource Repository in the Nordic and Baltic Countries},
  abstract = {The META-NORD project has contributed to an open infrastructure for language resources (data and tools) under the META-NET umbrella. This paper presents the key objectives of META-NORD and reports on the results achieved in the first year of the project. META-NORD has mapped and described the national language technology landscape in the Nordic and Baltic countries in terms of language use, language technology and resources, main actors in the academy, industry, government and society; identified and collected the first batch of language resources in the Nordic and Baltic countries; documented, processed, linked, and upgraded the identified language resources to agreed standards and guidelines. The three horizontal multilingual actions in META-NORD are overviewed in this paper: linking and validating Nordic and Baltic wordnets, the harmonisation of multilingual Nordic and Baltic treebanks, and consolidating multilingual terminology resources across European countries. This paper also touches upon intellectual property rights for the sharing of language resources.
},
  booktitle = {Proceedings of LREC 2012},
  author = {Vasiļjevs, Andrejs and Forsberg, Markus and Gornostay, Tatiana and Hansen, Dorte H. and Jóhannsdóttir, Kristín M. and Lindén, Krister and Lyse, Gunn I. and Offersgaard, Lene and Oksanen, Ville and Olsen, Sussi and Pedersen, Bolette S. and Rögnvaldsson, Eiríkur and Rozis, Roberts and Skadiņa, Inguna and De Smedt, Koenraad},
  year = {2012},
  ISBN = {978-2-9517408-7-7},
}

@techreport{wilhelmsson-2012-adverbialkarakteristik-160440,
  title = {Adverbialkarakteristik för praktisk informationsextraktion i svensk text - Projektrapport},
  abstract = {Den aktuella rapporten beskriver ett projekt som i första hand har inneburit ett praktiskt arbete syftande till att skapa en automatiserad process som returnerar frågeled, t.ex. varifrån, för adverbialled, t.ex. inifrån rummet, i svensk digital text. Det är en utbytesprocess som behövs av rent praktiska skäl i uppgiften frågegenerering, vilken innebär att en samling frågor som en text besvarar genereras snabbt automatiskt. Denna process finner sin plats i program som på olika sätt syftar till att ge informationsåtkomst i godtycklig okänd svensk text. Det är i detta tillämpningsfall fråga om att på något sätt öppna upp för den stora informationsmängd som i datalogiskt perspektiv ligger ’ostrukturerad’, dvs. i naturligt språk-form. Syftet med att avgöra lämpliga frågeled (ofta till en hv-form) för förekommande satsled i text har dock förmodligen en mer allmän relevans än användning i nämnda programtyp. Förutom att också behövas i andra liknande datalingvistiska applikationer kan själva frågeställningen rymmas inom ramarna för grundforskningen. De vanliga semantiskt grundade adverbialkategorierna (vilka skiljer sig åt mellan olika grammatikor) definierar gärna adverbialkategorier just genom att beskriva vilka slags frågor de besvarar. Att som här sikta på att avgöra frågeled för adverbial är en mer detaljerad uppgift än att avgöra adverbialkategori.
Den praktiska metod som implementerats i projektet kan sönderdelas i ett antal steg som antas vara allmängiltiga och svåra att undgå med det aktuella syftet. Indata till programmet är ett i princip godtyckligt adverbialled som användaren i prototypprogrammet kan skriva in. De nämnda steg som tar vid är de följande. 1) En uppmärkning med ordklass- och annan grammatisk information för varje löpord inleder. Detta sker med en statistisk trigrambaserad s.k. Hidden Markov-modell. 2/3) Ett avgörande av vilken strukturtyp som ledet har (bisats, PP, etc.) görs utifrån löporden med informationen i föregående steg. Intimt förknippat med denna uppgift är bestämning av huvudord, och för flera led även bestämning av andra signifikanta komponenter som rektionshuvudord. Lösningen till detta delsteg heter rangbaserad chunkning. 4) De steg som följer härefter skiljer sig mycket åt beroende på den aktuella strukturtypen. För prepositionsfraser undersöks t.ex. preposition och, beroende på vilken preposition det är fråga om, rektionshuvudord, dess grundform och andra ingående textsegment. I arbetet har t.ex. SweFN (Borin, Dannélls, Forsberg, Toporowska Gronostaj, & Kokkinakis, 2010) delvis undersökts för att eventuellt förbättra avgörandet av substantivsemantik, vilket ofta blir relevant för PP-adverbial. Rapporten visar hur uppgiften praktiskt sett varierar mycket i svårighetsgrad, från de fall där adverbialet utgörs av t.ex. particip-, adverbfraser eller bisatser, då en mappning till motsvarande frågeled ofta kan ske direkt utifrån huvudordet – till de mest komplicerade fallen av PP och s.k. som-fraser där kombinationer av huvudord, rektionshuvudord, dess grundform samt annan syntaktisk och semantisk information krävs för att urskilja förekomsters särskilda frågemotsvarigheter. Ett återkommande tema i det praktiska arbetet är undantag som behöver kännas igen. 
Exempelvis kategorin satsadverbial, som kan anta många olika strukturella former men som ändå oftast renderar resultatet ’ingen frågemotsvarighet’, måste kännas igen uttryckligen (ev. tillsammans med andra med samma frågeledsresultat). Även processen som helhet bygger emellertid programmeringstekniskt på grundfall och undantag. I många fall, som t.ex. för i-PP finns det en mängd olika motsvarigheter och vad som får utgöra grundfall i programmet blir en empirisk/heuristisk fråga under det att regler skrivs mot faktiska förekomster av adverbial i Stockholm Umeå Corpus (Hädanefter SUC). Att i liksom andra prepositioner kan sägas ha en prototypisk riktningsbetydelse betyder inte att var nödvändigtvis ska fungera som utgångsfall. Det förekommer ’lager’ av undantag inom olika strukturslag i programmet men även externt motiverade sådana utgående från huvudverbet, som genom valensmatchning kan klargöra att ett adverbial är ’prepositionsobjekt’ och därmed får andra omfrågningsegenskaper. De användargränssnitt som skapats och använts för regelskrivande utifrån faktiska exempel har tillåtit viss omedelbar regeluppdatering och återkontroll vid åsynen av felaktiga resultat. Det är också genom tillägg av nya undantagsregler i någon mening som programmet rimligen ska kunna förbättras framöver från den aktuella kvalitetsnivån. Korrektheten som uppnåtts hittills är inte kvantitativt övertygande men detta arbete som saknar föregångare möjliggör kontinuerlig förbättring genom programmet. 
Projektet visar att mappningsuppgiften…},
  author = {Wilhelmsson, Kenneth},
  year = {2012},
  institution = {University of Gothenburg},
  publisher = {University of Gothenburg},
  address = {Göteborg},
}

@book{ranta-forsberg-2012-implementing-168685,
  title = {Implementing Programming Languages},
  author = {Ranta, Aarne and Forsberg, Markus},
  year = {2012},
  publisher = {College Publications},
  address = {London},
  ISBN = {978-1-84890-064-6},
}

@inProceedings{adesam-etal-2012-processing-166657,
  title = {Processing spelling variation in historical text},
  booktitle = {Proceedings of the Fourth Swedish Language Technology Conference (SLTC)},
  author = {Adesam, Yvonne and Ahlberg, Malin and Bouma, Gerlof},
  year = {2012},
}

@inProceedings{eklund-kokkinakis-2012-drug-165309,
  title = {Drug interests revealed by a public health portal},
  abstract = {Online health information seeking has become an important part of people's everyday lives. However, studies have shown that many of those have problems forming effective queries. In order to develop better support and tools for assisting people in health-related query formation we have to gain a deeper understanding into their information seeking behaviour in relation to key issues, such as medication and drugs. The present study attempts to understand the semantics of the users' information needs with respect to medication-related information. Search log queries from the Swedish 1177.se health portal were automatically annotated and categorized according to relevant background knowledge sources. Understanding the semantics of information needs can enable optimization and tailoring of (official) health related information presented to the online consumer, provide better terminology support and thematic coding of the queries and in the long run better models of consumers’ information needs. },
  booktitle = {Proceedings of the SLTC-Workshop: Exploratory Query-log Analysis.
Lund, Sweden.},
  author = {Eklund, Ann-Marie and Kokkinakis, Dimitrios},
  year = {2012},
  pages = {2},
}

@inProceedings{bouma-2012-real-158261,
  title = {Real-Time Persistent Queues and Deques with Logic Variables (Declarative Pearl)},
  abstract = { We present a Prolog implementation of real-time persistent queues and double-ended queues. Our implementation is inspired by Okasaki’s lazy-functional approach, but relies only on standard Prolog, comprising of the pure subset plus if-then-else constructs to efficiently implement guards and meta-calls for convenience. The resulting data structure is a nice demonstration of the fact that the use of logic variables to hold the outcome of an unfinished computation can sometimes give the same kind of elegant and compact solutions as lazy evaluation. },
  booktitle = {Proceedings of the 11th International Symposium on Functional and Logic Programming (FLOPS 2012)},
  author = {Bouma, Gerlof},
  year = {2012},
  ISBN = {978-3-642-29821-9},
  pages = {62--73},
}

@inProceedings{fribergheppin-toporowskagronostaj-2012-rocky-158473,
  title = {The Rocky Road towards a Swedish FrameNet – Creating SweFN},
  abstract = {The Swedish FrameNet project, SweFN, is a lexical resource under development, designed to support both humans and different applications within language technology, such as text generation, text understanding and information extraction. SweFN is constructed in line with the Berkeley FrameNet and the project is aiming to make it a free, full-scale, multi-functional lexical resource covering morphological, syntactic, and semantic descriptions of 50,000 entries. Frames populated by lexical units belonging to the general vocabulary dominate in SweFN, but there are also frames from the medical and the art domain. As Swedish is a language with very productive compounding, special attention is paid to semantic relations within the one word compounds which populate the frames.
This is of relevance for understanding the meaning of the compounds and for capturing the semantic and syntactic alternations which are brought about in the course of compounding. SweFN is a component within a complex of modern and historical lexicon resources named SweFN++, available at <http://spraakbanken.gu.se/eng/swefn>.},
  booktitle = {Proceedings of the Eighth conference on International Language Resources and Evaluation (LREC-2012); Istanbul, Turkey},
  author = {Friberg Heppin, Karin and Toporowska Gronostaj, Maria},
  year = {2012},
  pages = {256--261},
}

@inProceedings{johansson-2012-atomic-156993,
  title = {Non-atomic Classification to Improve a Semantic Role Labeler for a Low-resource Language},
  abstract = {Semantic role classification accuracy for most languages other than English is constrained by the small amount of annotated data. In this paper, we demonstrate how the frame-to-frame relations described in the FrameNet ontology can be used to improve the performance of a FrameNet-based semantic role classifier for Swedish, a low-resource language. In order to make use of the FrameNet relations, we cast the semantic role classification task as a non-atomic label prediction task. The experiments show that the cross-frame generalization methods lead to a 27% reduction in the number of errors made by the classifier. For previously unseen frames, the reduction is even more significant: 50%. },
  booktitle = {Proceedings of the First Joint Conference on Lexical and Computational Semantics (*SEM); June 7-8; Montréal, Canada},
  author = {Johansson, Richard},
  year = {2012},
  publisher = {Association for Computational Linguistics},
  address = {Montréal, Canada},
}

@techreport{lyngfelt-forsberg-2012-svenskt-158226,
  title = {Ett svenskt konstruktikon.
Utgångspunkter och preliminära ramar},
  author = {Lyngfelt, Benjamin and Forsberg, Markus},
  year = {2012},
  institution = {University of Gothenburg},
  publisher = {University of Gothenburg},
  address = {Göteborg},
}

@inProceedings{borin-etal-2012-search-157338,
  title = {Search Result Diversification Methods to Assist Lexicographers},
  abstract = {We show how the lexicographic task of finding informative and diverse example sentences can be cast as a search result diversification problem, where an objective based on relevance and diversity is maximized. This problem has been studied intensively in the information retrieval community during recent years, and efficient algorithms have been devised. We finally show how the approach has been implemented in a lexicographic project, and describe the relevance and diversity functions used in that context. },
  booktitle = {Proceedings of the 6th Linguistic Annotation Workshop},
  author = {Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Johansson, Richard and Kjellandsson, Annika},
  year = {2012},
  pages = {113--117},
}

@inProceedings{theiler-bouma-2012-price-172733,
  title = {Two for the price of one: an LFG treatment of sentence initial object es in German.},
  abstract = { We present an analysis of sentence initial object es ‘it’ in German. The weak pronoun es may only realize such an object under specific information structural conditions. We follow recent work suggesting these conditions are exactly those that licence the use of the presentational construction, marked by a sentence initial dummy es. We propose that the initial objects are an example of function amalgamation, show that only objects that may also appear in the clause-internal postverbal domain can participate in this fusion and make this precise in LFG. We end the paper with a contrastive discussion. },
  booktitle = {Proceedings of LFG'12.
Miriam Butt and Tracy Holloway King (Eds.)},
  author = {Theiler, Nadine and Bouma, Gerlof},
  year = {2012},
  pages = {603--623},
}

@inProceedings{adesam-etal-2012-bokstaffua-163218,
  title = {bokstaffua, bokstaffwa, bokstafwa, bokstaua, bokstawa... Towards lexical link-up for a corpus of Old Swedish},
  booktitle = {Proceedings of the LTHist workshop at Konvens},
  author = {Adesam, Yvonne and Ahlberg, Malin and Bouma, Gerlof},
  year = {2012},
}

@inProceedings{morrison-etal-2012-calculating-167148,
  title = {Calculating the reliability of likelihood ratios: Addressing modelling problems related to small n and tails},
  abstract = {In forensic speech science we are often faced with the problem of having a relatively small amount of data which is also multivariate and distributionally complex. This results in a serious problem exactly in the scenario where potentially large strengths of evidence could be obtained, i.e., when the trace data are on a tail of the distribution which models either the prosecution or defence hypothesis and a large magnitude log likelihood ratio is calculated. By definition the sampling of a distribution is sparse on its tails and this problem is compounded if the model is trained on a small amount of data – small fluctuations in the training data can lead to large changes in the calculated likelihoods on the tails and thus large changes in the calculated likelihood ratios for trace data on the tails.
Large-magnitude calculated log likelihood ratios are therefore inherently unreliable.},
  booktitle = {Proceedings of 14th Australasian International Conference on Speech Science and Technology},
  author = {Morrison, Geoffrey Stewart and Ochoa, Felipe and Lindh, Jonas},
  year = {2012},
  volume = {14},
}

@inProceedings{kokkinakis-2012-initial-164788,
  title = {Initial Experiments of Medication Event Extraction Using Frame Semantics},
  abstract = {Semantic annotation of text corpora for mining complex relations and events has gained a considerable growing attention in the medical domain. The goal of this paper is to present a snapshot of ongoing work that aims to develop and apply an appropriate infrastructure for automatic event labelling and extraction in the Swedish medical domain. Annotated text samples, appropriate lexical resources (e.g. term lists and the Swedish Frame-Net++) and hybrid techniques are currently developed in order to alleviate some of the difficulties of the task. As a case study this paper presents a pilot approach based on the application of the theory of frame semantics to automatically identify and extract detailed medication information from medical texts. Medication information is often written in narrative form (e.g. in clinical records) and is therefore difficult to be acquired and used in computerized systems (e.g. decision support). Currently our approach uses a combination of generic entity and terminology taggers, specifically designed medical frames and various frame-related patterns.
Future work intends to improve and enhance current results by using more annotated samples, more medically-relevant frames and combination of supervised learning techniques with the regular expression patterns.},
  booktitle = {Scandinavian Conference on Health Informatics (SHI)},
  author = {Kokkinakis, Dimitrios},
  year = {2012},
  series = {Linköping Electronic Conference Proceedings},
  ISBN = {978-91-7519-758-6},
  pages = {41--47},
}

@book{borin-etal-2012-svenska-163410,
  title = {Svenska språket i den digitala tidsåldern},
  author = {Borin, Lars and Brandt, Martha and Edlund, Jens and Lindh, Jonas and Parkvall, Mikael},
  year = {2012},
  publisher = {Springer},
  address = {Berlin},
  ISBN = {978-3-642-30831-4},
}

@inProceedings{charalabopoulou-etal-2012-building-168525,
  title = {Building Corpus-Informed Word Lists for L2 Vocabulary Learning in Nine Languages},
  abstract = {Lexical competence constitutes a crucial aspect in L2 learning, since building a rich repository of words is considered indispensable for successful communication. CALL practitioners have experimented with various kinds of computer-mediated glosses to facilitate L2 vocabulary building in the context of incidental vocabulary learning. Intentional learning, on the other hand, is generally underestimated, since it is considered out of fashion and not in line with the communicative L2 learning paradigm. Yet, work is still being done in this area and substantial body of research indicates that the usefulness of incidental vocabulary learning does not exclude the use of dedicated vocabulary study and that by using aids explicitly geared to building vocabularies (such as word lists and word cards) L2 learners exhibit good retention rates and faster learning gains. Intentional vocabulary study should, therefore, have its place in the instructional and learning context.
Regardless of the approach, incidental or intentional, the crucial question with respect to vocabulary teaching/learning remains: which and how many words should we teach/learn at different language levels? An attempt to answer the above question was made within the framework of the EU-funded project titled “KELLY” (Keywords for Language Learning for Young and Adults Alike) presented here. The project aimed at building corpus-informed vocabulary lists for L2 learners ranging from A1 to C2 levels for nine languages: Arabic, Chinese, English, Greek, Italian, Norwegian, Polish, Russian and Swedish.},
  booktitle = {CALL: Using, Learning, Knowing. EuroCALL Conference, Gothenburg, Sweden, 22-25 August 2012, Proceedings. Eds. Linda Bradley and Sylvie Thouësny. Research-publishing.net, Dublin, Ireland},
  author = {Charalabopoulou, Frieda and Gavrilidou, Maria and Johansson Kokkinakis, Sofie and Volodina, Elena},
  year = {2012},
  volume = {2012},
  ISBN = {978-1-908416-03-2},
}

@comment{borin-2012-core-162377: removed the stray blank before the question mark in the book title.}
@incollection{borin-2012-core-162377,
  title = {Core vocabulary: A useful but mystical concept in some kinds of linguistics},
  booktitle = {Shall we play the festschrift game? Essays on the Occasion of Lauri Carlson's 60th Birthday},
  author = {Borin, Lars},
  year = {2012},
  publisher = {Springer},
  address = {Berlin},
  ISBN = {978-3-642-30772-0},
  pages = {53--65},
}

@inProceedings{borin-etal-2012-open-156079,
  title = {The open lexical infrastructure of Språkbanken},
  abstract = {We present our ongoing work on Karp, Språkbanken’s (the Swedish Language Bank) open lexical infrastructure, which has two main functions: (1) to support the work on creating, curating, and integrating our various lexical resources; and (2) to publish daily versions of the resources, making them searchable and downloadable. An important requirement on the lexical infrastructure is also that we maintain a strong bidirectional connection to our corpus infrastructure. 
At the heart of the infrastructure is the SweFN++ project with the goal to create free Swedish lexical resources geared towards language technology applications. The infrastructure currently hosts 15 Swedish lexical resources, including historical ones, some of which have been created from scratch using existing free resources, both external and in-house. The resources are integrated through links to a pivot lexical resource, SALDO, a large morphological and lexical-semantic resource for modern Swedish. SALDO has been selected as the pivot partly because of its size and quality, but also because its form and sense units have been assigned persistent identifiers (PIDs) to which the lexical information in other lexical resources and in corpora are linked.},
  booktitle = {Proceedings of the 8th International Conference on Language Resources and Evaluation : May 23-25, 2012 / eds. Nicoletta Calzolari},
  author = {Borin, Lars and Forsberg, Markus and Olsson, Leif-Jöran and Uppström, Jonatan},
  year = {2012},
  ISBN = {978-2-9517408-7-7},
  pages = {3598--3602},
}

@comment{akesson-etal-2012-voice-162453: the placeholder abstract "Without abstract" was dropped; an absent field is the correct way to record a missing abstract.}
@inProceedings{akesson-etal-2012-voice-162453,
  title = {Voice Onset Time before and after STN-surgery in patients with Parkinson’s disease},
  booktitle = {ICPLA2012},
  author = {Åkesson, Joel and Lindh, Jonas and Hartelius, Lena and Carlsson, Emilia},
  year = {2012},
  volume = {14},
}

@comment{volodina-etal-2012-semi-165961: no trailing period in the title; the bibliography style supplies punctuation.}
@inProceedings{volodina-etal-2012-semi-165961,
  title = {Semi-automatic selection of best corpus examples for Swedish: Initial algorithm evaluation},
  abstract = {The study presented here describes the results of the initial evaluation of two sorting approaches to automatic ranking of corpus examples for Swedish. Representatives from two potential target user groups have been asked to rate top three hits per approach for sixty search items from the point of view of the needs of their professional target groups, namely second/foreign language (L2) teachers and lexicographers. 
This evaluation has shown, on the one hand, which of the two approaches to example rating (called in the text below algorithms #1 and #2) performs better in terms of finding better examples for each target user group; and on the other hand, which features evaluators associate with good examples. It has also facilitated statistic analysis of the “good” versus “bad” examples with reference to the measurable features, such as sentence length, word length, lexical frequency profiles, PoS constitution, dependency structure, etc. with a potential to find out new reliable classifiers.},
  booktitle = {Proceedings of the SLTC 2012 workshop on NLP for CALL, Lund, 25th October, 2012},
  author = {Volodina, Elena and Johansson, Richard and Johansson Kokkinakis, Sofie},
  year = {2012},
  number = {080},
  pages = {59--70},
}

@comment{sundqvist-etal-2012-acoustic-162452: placeholder abstract "Without abstract" dropped; trailing period removed from the title.}
@inProceedings{sundqvist-etal-2012-acoustic-162452,
  title = {Acoustic and perceptual characteristics of speech in 22q11 deletion syndrome: Measures of voice onset time and syllable durations related to articulation and prosody},
  booktitle = {Proceedings of ICPLA2012},
  author = {Sundqvist, Maria and Lindh, Jonas and Hartelius, Lena and Persson, Christina},
  year = {2012},
  volume = {14},
}

@comment{laakso-etal-2012-swedish-162454: placeholder abstract "Without abstract" dropped.}
@inProceedings{laakso-etal-2012-swedish-162454,
  title = {Swedish Test of Intelligibility (STI) – Development of computerized assessment of word and sentence intelligibility and the performance of adult control speakers},
  booktitle = {ICPLA2012},
  author = {Laakso, Katja and Lindh, Jonas and Hartelius, Lena},
  year = {2012},
  volume = {14},
}

@inProceedings{kokkinakis-etal-2012-literacy-164587,
  title = {Literacy Demands and Information to Cancer Patients},
  abstract = {This study examines language complexity of written health information materials for patients undergoing colorectal cancer surgery. Written and printed patient information from 28 Swedish clinics are automatically analyzed by means of language technology. 
The analysis reveals different problematic issues that might have impact on readability. The study is a first step, and part of a larger project about patients’ health information seeking behavior in relation to written information material. Our study aims to provide support for producing more individualized, person centered information materials according to preferences for complex and detailed or legible texts and thus enhance a movement from receiving information and instructions to participating in knowing. In the near future the study will continue by integrating focus groups with patients that may provide valuable feedback and enhance our knowledge about patients’ use and preferences of different information material.},
  booktitle = {Proceedings of the 15th International Conference on Text, Speech and Dialogue},
  author = {Kokkinakis, Dimitrios and Forsberg, Markus and Johansson Kokkinakis, Sofie and Smith, Frida and Öhlén, Joakim},
  year = {2012},
  ISBN = {978-3-642-32789-6},
}

@comment{kokkinakis-etal-2012-literacy-164587: ISBN re-hyphenated into the standard five groups; the digits are unchanged.}
@inProceedings{kokkinakis-etal-2012-contextualisation-155530,
  title = {Contextualisation of functional symptoms in primary health care},
  abstract = {Background: a number of patients consulting primary health care have physical symptoms that may be labeled “medically unexplained”, i.e. absence of a demonstrable organic etiology. Common functional somatic symptoms (FSS) are irritable bowel, tension headache and chronic fatigue. FSS-patients are generally frustrated with the inability of health care to alleviate their illness. Health care staff often also feel frustration. The communication between patient and care giver is the key for coming to terms with the problem. Objective: to investigate how complex, vague and long-standing symptoms with no identified organic cause are put into context, interpreted and acted upon in primary health-care interactions. 
Two types of interventions are envisaged (i) methods for early identification of patients at risk of entering a vicious circle of functional symptoms and (ii) methods for re-interpreting symptoms in alternative and more purposeful ways. Methods: the project studies interactions between patients and nurses giving advice over telephone, consultations between patients and physicians, interviews and study patients' medical case notes. Eligible patients (18-65 y.o.) contact their primary health care centre by telephone, have had at least eight physical consultations with nurses or physicians in the last 12 months and if a majority of the symptoms within this time span had no clear organic or psychiatric cause. The project contains a number of subprojects, according to the type of data collected. Several methods of analysis will be used, mainly critical discourse analysis, phenomenologic-hermeneutic and computation linguistic analyses. (Expected) Results: using the collected data, we describe characteristics of the communication that takes place in these settings and the way symptoms and diseases are represented. This will facilitate the development of future interventions aimed at decreasing the morbidity due to FSS and give further insights into the problem.},
  booktitle = {The 5th GENEVA Conference on Person-Centered Medicine. Geneva, Switzerland.},
  author = {Kokkinakis, Dimitrios and Lidén, Eva and Svensson, Staffan and Björk Brämberg, Elisabeth and Määttä, Sylvia},
  year = {2012},
}

@comment{kokkinakis-etal-2012-contextualisation-155530: stray line break and trailing blanks removed from the booktitle and abstract values.}
@inProceedings{gustavsson-etal-2012-neural-162455,
  title = {Neural processing of familiar and unfamiliar voices},
  booktitle = {Proceedings of IAFPA2012},
  author = {Gustavsson, Lisa and Lindh, Jonas and Kallioinen, Petter and Markelius, Marie and Ericsson, Anna and Moniri, Sadegheh Farah and Klintfors, Eeva},
  year = {2012},
  volume = {21},
}

@comment{kokkinakis-2012-journal-155893: no trailing period in the title; the bibliography style supplies punctuation.}
@inProceedings{kokkinakis-2012-journal-155893,
  title = {The Journal of the Swedish Medical Association - a Corpus Resource for Biomedical Text Mining in Swedish},
  abstract = {Biomedical text mining applications are largely dependent on high quality knowledge resources. Traditionally, these include lexical databases, terminologies, nomenclatures and ontologies and, during the last decade, also corpora of various sizes, variety and diversity. Some of these corpora are annotated with an expanding range of information types and metadata while others become available with a minimal set of annotations. At the same time, it is of great importance that biomedical corpora for lesser-spoken languages also get developed in order to support and facilitate the implementation of practical applications for such languages and to stimulate the development of language technology research and innovation infrastructures in the domain. This paper provides a detailed description of a Swedish biomedical corpus based on the electronic editions of the Journal of the Swedish Medical Association "Läkartidningen" of the years 1996-2010. The corpus consists of a variety of documents that can be related to different medical domains, developed as a response to the increasing needs for large and reliable medical information for Swedish biomedical NLP. 
The corpus has been structurally annotated with a minimal set of meta information and automatically indexed with the largest and systematically organised computer processable collection of medical terminology, the Swedish SNOMED CT (Systematized Nomenclature of Medicine -- Clinical Terms). This way topic-focused subcorpora, e.g. with diabetes-related content, can be easily developed.},
  booktitle = {The Third Workshop on Building and Evaluating Resources for Biomedical Text Mining (BioTxtM), an LREC Workshop. Turkey.},
  author = {Kokkinakis, Dimitrios},
  year = {2012},
  note = {Accepted},
}

@comment{kokkinakis-2012-journal-155893: "Accepted" is a publication status, so it is recorded in "note" rather than "volume".}
@inProceedings{lindh-etal-2012-calculating-162456,
  title = {Calculating the reliability of a likelihood ratio from a disputed utterance},
  booktitle = {Proceedings of IAFPA2012},
  author = {Lindh, Jonas and Ochoa, Felipe and Morrison, Geoffrey Stewart},
  year = {2012},
  volume = {21},
}

@inProceedings{johansson-etal-2012-semantic-156400,
  title = {Semantic Role Labeling with the Swedish FrameNet},
  abstract = {We present the first results on semantic role labeling using the Swedish FrameNet, which is a lexical resource currently in development. Several aspects of the task are investigated, including the selection of machine learning features, the effect of choice of syntactic parser, and the ability of the system to generalize to new frames and new genres. In addition, we evaluate two methods to make the role label classifier more robust: cross-frame generalization and cluster-based features. Although the small amount of training data limits the performance achievable at the moment, we reach promising results. 
In particular, the classifier that extracts the boundaries of arguments works well for new frames, which suggests that it already at this stage can be useful in a semi-automatic setting.},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12); Istanbul, Turkey; May 23-25},
  author = {Johansson, Richard and Friberg Heppin, Karin and Kokkinakis, Dimitrios},
  year = {2012},
  ISBN = {978-2-9517408-7-7},
  pages = {3697--3700},
}

@comment{volodina-etal-2012-towards-168516: author "Loftsson, Hrafn" was entered with family and given name swapped. The editor, publisher and publisher city that were packed into booktitle after a hard-coded "In" now live in their own fields.}
@inProceedings{volodina-etal-2012-towards-168516,
  title = {Towards a system architecture for ICALL},
  abstract = {In this paper, we present an on-going project whose overall aim is to develop open-source system architecture for supporting ICALL systems that will facilitate re-use of existing NLP tools and resources on a plug-and-play basis. We introduce the project, describe the approaches adopted by the two language teams, and present two applications being developed using the proposed architecture.},
  booktitle = {Proceedings of the 20th International Conference on Computers in Education},
  editor = {Biswas, G. and others},
  publisher = {Asia-Pacific Society for Computers in Education},
  address = {Singapore},
  author = {Volodina, Elena and Loftsson, Hrafn and Arnbjörnsdóttir, Birna and Borin, Lars and Leifsson, Guðmundur Örn},
  year = {2012},
  volume = {2012},
  ISBN = {978-981-07-4649-0},
}

@comment{oelke-etal-2012-advanced-155493: leading blank removed from the title value.}
@inProceedings{oelke-etal-2012-advanced-155493,
  title = {Advanced Visual Analytics Methods for Literature Analysis},
  abstract = {The volumes of digitized literary collections in various languages increase at a rapid pace, which results also in a growing demand for computational support to analyze such linguistic data. This paper combines robust text analysis with advanced visual analytics and bring a new set of tools to literature analysis. Visual analytics techniques can offer new and unexpected insights and knowledge to the literary scholar. 
We analyzed a small subset of a large literary collection, the Swedish Literature Bank, by focusing on the extraction of persons’ names, their gender and their normalized, linked form, including mentions of theistic beings (e.g., Gods’ names and mythological figures), and examined their appearance over the course of the novel. A case study based on 13 novels, from the aforementioned collection, shows a number of interesting applications of visual analytics methods to literature problems, where named entities can play a prominent role, demonstrating the advantage of visual literature analysis. Our work is inspired by the notion of distant reading or macroanalysis for the analyses of large literature collections.},
  booktitle = {Language Technology for Cultural Heritage, Social Sciences, and Humanities (LaTeCH). An EACL 2012 workshop. Avignon, France.},
  author = {Oelke, Daniela and Kokkinakis, Dimitrios and Malm, Mats},
  year = {2012},
  note = {Accepted},
  pages = {10},
}

@comment{oelke-etal-2012-advanced-155493: "Accepted" is a publication status, so it is recorded in "note" rather than "volume".}
@comment{borin-volodina-2012-proceedings-188679: the non-standard entry type "edited_book" is unknown to standard styles; an edited proceedings volume is a "proceedings" entry.}
@proceedings{borin-volodina-2012-proceedings-188679,
  title = {Proceedings of the SLTC 2012 workshop on NLP for CALL},
  editor = {Borin, Lars and Volodina, Elena},
  year = {2012},
  publisher = {LiU Electronic Press},
  address = {Linköping},
}

@inProceedings{ghosh-etal-2012-improving-156399,
  title = {Improving the Recall of a Discourse Parser by Constraint-based Postprocessing},
  abstract = {We describe two constraint-based methods that can be used to improve the recall of a shallow discourse parser based on conditional random field chunking. These methods use a set of natural structural constraints as well as others that follow from the annotation guidelines of the Penn Discourse Treebank. We evaluated the resulting systems on the standard test set of the PDTB and achieved a rebalancing of precision and recall with improved F-measures across the board. 
This was especially notable when we used evaluation metrics taking partial matches into account; for these measures, we achieved F-measure improvements of several points.},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12); Istanbul, Turkey; May 23-25},
  author = {Ghosh, Sucheta and Johansson, Richard and Riccardi, Giuseppe and Tonelli, Sara},
  year = {2012},
  ISBN = {978-2-9517408-7-7},
  pages = {2791--2794},
}

@inProceedings{bergenmar-olsson-2012-connecting-169845,
  title = {Connecting European Women Writers. The Selma Lagerlöf Archive and Women Writers Database},
  booktitle = {Digital Humanities 2012. 16-20 July 2012, Hamburg. Book of Abstracts},
  author = {Bergenmar, Jenny and Olsson, Leif-Jöran},
  year = {2012},
}

@inProceedings{moschitti-etal-2012-modeling-156401,
  title = {Modeling Topic Dependencies in Hierarchical Text Categorization},
  abstract = {In this paper, we encode topic dependencies in hierarchical multi-label Text Categorization (TC) by means of rerankers. We represent reranking hypotheses with several innovative kernels considering both the structure of the hierarchy and the probability of nodes. Additionally, to better investigate the role of category relationships, we consider two interesting cases: (i) traditional schemes in which node-fathers include all the documents of their child-categories; and (ii) more general schemes, in which children can include documents not belonging to their fathers. 
The extensive experimentation on Reuters Corpus Volume 1 shows that our rerankers inject effective structural semantic dependencies in multi-classifiers and significantly outperform the state of the art.},
  booktitle = {Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (ACL 2012); Jeju, Korea; July 8-14},
  author = {Moschitti, Alessandro and Ju, Qi and Johansson, Richard},
  year = {2012},
  pages = {759--767},
}

@comment{kokkinakis-oelke-2012-women-155537: "Accepted" is a publication status, so it is recorded in "note" rather than "volume".}
@inProceedings{kokkinakis-oelke-2012-women-155537,
  title = {Men, Women and Gods: Distant Reading in Literary Collections - Combining Visual Analytics with Language Technology},
  abstract = {The volumes of digitized literary collections in various languages increase at a rapid pace and so increases the need to computationally support the analysis of such data. Literature can be studied in a number of different ways and from many different perspectives and text analysis make up a central component of literature studies. If such analysis can be integrated with advanced visual methods and fed back to the daily work of the literature researcher, then it is likely to reveal the presence of useful and nuanced insights into the complex daily lives, ideas and beliefs of the main characters found in many of the literary works. In this paper we describe the combination of robust text analysis with visual analytics and bring a new set of tools to literary analysis. As a show case, we analyzed a small subset (13 novels of a single author) taken from a large literary collection, the Swedish Literature Bank <http://litteraturbanken.se/#!om/inenglish>. The analysis is based upon two levels of inquiry, namely by focusing on mentions of theistic beings (e.g. Gods' names) as well as mentions of persons' names, including their gender and their normalized, linked variant forms, and examining their appearance in sentences, paragraphs and chapters. 
The case study shows several successful applications of visual analytics methods to various literature problems and demonstrates the advantages of the implementation of visual literature fingerprinting. Our work is inspired by the notion of distant reading or macronalysis for the analyses of literature collections. We start by recognizing all characters in the novels using a mature language technology (named entity recognition) which can be turned into a tool in aid of text analysis in this field. We apply context cues, lists of animacy and gender markers and inspired by the document centered approach and the labelled consistency principle which is a form of on-line learning from documents under processing which looks at unambiguous usages of words or names for assigning annotations in ambiguous words or names. For instance, if in an unambiguous context where there is a strong gender indicator, such as 'Mrs Alexander' the name 'Alexander' is assigned a feminine gender, then subsequent mentions of the same name in the same discourse will be assigned the feminine gender as well unless there is a conflict with another person with the same name. We argue, that the integration of text analysis such as the one briefly outlined and visualization techniques, such as higher resolution pixel-based fingerprinting, could be put to effective use also in literature studies. We also see an opportunity to devise new ways of exploring the large volumes of literary texts being made available through national cultural heritage digitization projects, for instance by exploring the possibility to show several literary texts (novels) at once. 
We will illustrate some of the applied techniques using several examples from our case study, such as summary plots based on all the characters in these novels as well as fingerprints based on the distribution of characters across the novels.},
  booktitle = {Proceedings of the Advances in Visual Methods for Linguistics (AVML)},
  author = {Kokkinakis, Dimitrios and Oelke, Daniela},
  year = {2012},
  note = {Accepted},
}

@inProceedings{ghosh-etal-2012-global-157440,
  title = {Global Features for Shallow Discourse Parsing},
  abstract = {A coherently related group of sentences may be referred to as a discourse. In this paper we address the problem of parsing coherence relations as defined in the Penn Discourse Tree Bank (PDTB). A good model for discourse structure analysis needs to account both for local dependencies at the token-level and for global dependencies and statistics. We present techniques on using inter-sentential or sentence-level (global), data-driven, non-grammatical features in the task of parsing discourse. The parser model follows up previous approach based on using token-level (local) features with conditional random fields for shallow discourse parsing, which is lacking in structural knowledge of discourse. The parser adopts a two-stage approach where first the local constraints are applied and then global constraints are used on a reduced weighted search space (n-best). In the latter stage we experiment with different rerankers trained on the first stage n-best parses, which are generated using lexico-syntactic local features. 
The two-stage parser yields significant improvements over the best performing model of discourse parser on the PDTB corpus.},
  booktitle = {Proceedings of the 13th Annual Meeting of the Special Interest Group on Discourse and Dialogue (SIGDIAL)},
  author = {Ghosh, Sucheta and Riccardi, Giuseppe and Johansson, Richard},
  year = {2012},
  pages = {150--159},
}

@comment{bennaceur-etal-2012-machine-160393: "Accepted" is a publication status, so it is recorded in "note" rather than "volume". The abstract opened a quotation with a straight double quote but closed it with two apostrophes; the opening is now the matching LaTeX double backtick.}
@inProceedings{bennaceur-etal-2012-machine-160393,
  title = {Machine Learning for Emergent Middleware},
  abstract = {Highly dynamic and heterogeneous distributed systems are challenging today's middleware technologies. Existing middleware paradigms are unable to deliver on their most central promise, which is offering interoperability. In this paper, we argue for the need to dynamically synthesise distributed system infrastructures according to the current operating environment, thereby generating ``Emergent Middleware'' to mediate interactions among heterogeneous networked systems that interact in an ad hoc way. The paper outlines the overall architecture of Enablers underlying Emergent Middleware, and in particular focuses on the key role of learning in supporting such a process, spanning statistical learning to infer the semantics of networked system functions and automata learning to extract the related behaviours of networked systems.},
  booktitle = {Proceedings of the Joint Workshop on Intelligent Methods for Software System Engineering (JIMSE)},
  author = {Bennaceur, Amel and Howar, Falk and Issarny, Valérie and Johansson, Richard and Moschitti, Alessandro and Spalazzese, Romina and Steffen, Bernhard and Sykes, Daniel},
  year = {2012},
  note = {Accepted},
}

@inProceedings{hughes-etal-2012-operavox-201897,
  title = {operAVoX - On PErson RApid VOice eXaminer},
  abstract = {At present, objective analysis of voice quality using acoustic parameters is only possible within a voice laboratory using specialist hardware and software. 
We have developed an easy-to-use portable voice analysis and feedback application running on the Apple iPhone, iPad, or iPod Touch. OperaVOX™ combines the signal processing power, easy connectivity, user-friendly interface, high-quality microphones and portability of these handheld devices with novel acoustic voice analysis algorithms to provide a powerful voice quality measurement tool that you can carry in your pocket. OperaVOX™ is designed for anyone who is interested in measuring the quality of their voice, such as a patient recovering following a stroke, a professional voice user such as singers or an aspiring actor. Built into OperaVOX™ are the validated Voice Handicap Index questionnaires and the ability for the user to record their voice for acoustic and perceptual analysis both on board the device and externally in the voice laboratory. Furthermore, the user can instruct OperaVOX™ to automatically and confidentially send these data via email to their speech therapist, voice coach or researcher team. OperaVOX™ makes it easy for everyone to accurately measure changes in the quality of their voice every hour, day, or week and without having to travel to the hospital. Two versions of OperaVOX™ will soon be available on the Apple App Store, one for the general public and another for professionals such as speech and language therapists. 
We have also worked with world-leading University research teams both in the UK and North America to develop bespoke versions of OperaVOX™ specifically tailored for their research and clinical requirements.},
  booktitle = {5th national Conference in Logopedics},
  author = {Hughes, Owain Rhys and Alexander, Anil and Forth, Oscar and Lindh, Jonas},
  year = {2012},
  number = {5},
}

@inProceedings{volodina-borin-2012-developing-168523,
  title = {Developing an Open-Source Web-Based Exercise Generator for Swedish},
  abstract = {This paper reports on the ongoing international project System architecture for ICALL and the progress made by the Swedish partner. The Swedish team is developing a web-based exercise generator reusing available annotated corpora and lexical resources. Apart from the technical issues like implementation of the user interface and the underlying processing machinery, a number of interesting pedagogical questions need to be solved, e.g., adapting learner-oriented exercises to proficiency levels; selecting authentic examples of an appropriate difficulty level; automatically ranking corpus examples by their quality; providing feedback to the learner, and selecting vocabulary for training domain-specific, academic or general-purpose vocabulary. In this paper we describe what has been done so far, mention the exercise types that can be generated at the moment as well as describe the tasks left for the future.},
  booktitle = {CALL: Using, Learning, Knowing. EuroCALL Conference, Gothenburg, Sweden, 22-25 August 2012, Proceedings. Eds. Linda Bradley and Sylvie Thouësny. 
Research-publishing.net, Dublin, Ireland},
  author = {Volodina, Elena and Borin, Lars},
  year = {2012},
  volume = {2012},
  ISBN = {978-1-908416-03-2},
}

@comment{larsson-borin-2012-from-167661: the non-standard entry type "edited_book" is unknown to standard styles; an edited volume is a "book" entry carrying an editor field instead of an author field.}
@book{larsson-borin-2012-from-167661,
  title = {From Quantification to Conversation},
  editor = {Larsson, Staffan and Borin, Lars},
  year = {2012},
  publisher = {College Publications},
  address = {London},
  ISBN = {978-1-84890-091-2},
}

@inProceedings{volodina-johanssonkokkinakis-2012-introducing-154723,
  title = {Introducing Swedish Kelly-list, a new free e-resource for Swedish},
  abstract = {Frequency lists and/or lexicons contain information about the words and their statistics. They tend to find their “readers” among linguists, lexicographers, language teachers. Making them available in electronic format helps to expand the target group to cover language engineers, computer programmers and other specialists working in such areas as information retrieval, spam filtering, text readability analysis, test generation, etc. This article describes a new freely available electronic frequency list of modern Swedish that was created in the EU project KELLY. We describe the state of affairs for Swedish frequency lexicons; provide a short description of the KELLY project; mention the corpus the list has been derived from. Further, we dwell on the type of information the list contains, describe shortly the steps for list generation; provide information on the coverage and some other statistics over the items in the list. Finally, some practical information on the license for the Swedish Kelly-list distribution is given; potential application areas are suggested; and future plans for its expansion are mentioned. We hope that with some publicity we can help this list find its users. 
},
  booktitle = {LREC 2012 Proceedings},
  author = {Volodina, Elena and Johansson Kokkinakis, Sofie},
  year = {2012},
  volume = {2012},
}

@comment{volodina-johanssonkokkinakis-2012-swedish-165964: a techreport entry requires "institution"; the issuing body was stored under "publisher", which standard styles ignore for this type. Trailing period removed from the title.}
@techreport{volodina-johanssonkokkinakis-2012-swedish-165964,
  title = {Swedish Kelly: Technical Report},
  author = {Volodina, Elena and Johansson Kokkinakis, Sofie},
  year = {2012},
  institution = {University of Gothenburg},
  address = {Göteborg},
}

@inProceedings{dannells-borin-2012-toward-156502,
  title = {Toward language independent methodology for generating artwork descriptions – Exploring FrameNet information},
  abstract = {Today museums and other cultural heritage institutions are increasingly storing object descriptions using semantic web domain ontologies. To make this content accessible in a multilingual world, it will need to be conveyed in many languages, a language generation task which is domain specific and language dependent. This paper describes how semantic and syntactic information such as that provided in a framenet can contribute to solving this task. It is argued that the kind of information offered by such lexical resources enhances the output quality of a multilingual language generation application, in particular when generating domain specific content.},
  booktitle = {EACL 2012 workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities (LaTeCH)},
  author = {Dannélls, Dana and Borin, Lars},
  year = {2012},
}

@inProceedings{oelke-etal-2012-visual-155495,
  title = {Visual Analytics and the Language of Web Query Logs - A Terminology Perspective},
  abstract = {This paper explores means to integrate natural language processing methods for terminology and entity identification in medical web session logs with visual analytics techniques. 
The aim of the study is to examine whether the vocabulary used in queries posted to a Swedish regional health web site can be assessed in a way that will enable a terminologist or medical data analysts to instantly identify new term candidates and their relations based on significant co-occurrence patterns. We provide an example application in order to illustrate how the visualizations of co-occurrence relationships between medical and general entities occurring in such logs can be visualized, accessed and explored. To enable a visual exploration of the generated co-occurrence graphs, we employ a general purpose social network analysis tool, Visone (http://visone.info), that permits to visualize and analyze various types of graph structures. Our examples show that visual analytics based on co-occurrence analysis provides insights into the use of layman language in relation to established (professional) terminologies, which may help terminologists decide which terms to include in future terminologies. Increased understanding of the used querying language is also of interest in the context of public health web sites. The query results should reflect the intentions of the information seekers, who may express themselves in layman language that differs from the one used on the available web sites provided by medical professionals.},
  booktitle = {The 15th EURALEX International Congress (European Association of Lexicography). Oslo, Norway.},
  author = {Oelke, Daniela and Eklund, Ann-Marie and Marinov, Svetoslav and Kokkinakis, Dimitrios},
  year = {2012},
  pages = {8},
}

@inProceedings{johansson-2012-bridging-163602,
  title = {Bridging the Gap between Two Different Swedish Treebanks},
  abstract = {We present two simple adaptation methods to train a dependency parser in the situation when there are multiple treebanks available, and these treebanks are annotated according to different linguistic conventions. 
To test the methods, we train parsers on the Talbanken and Syntag treebanks of Swedish. The results show that the methods are effective for low-to-medium training set sizes.},
  booktitle = {Proceedings of the Fourth Swedish Language Technology Conference (SLTC)},
  author = {Johansson, Richard},
  year = {2012},
  note = {Accepted},
}

@inProceedings{rama-borin-2012-properties-164449,
  title = {Properties of phoneme N-grams across the world’s language families},
  abstract = {In this article, we investigate the properties of phoneme N-grams across half of the world’s languages. The sizes of three different N-gram distributions of the world’s language families obey a power law. Further, the N-gram distributions of language families parallel the sizes of the families, which also follow a power law distribution. The correlation between N-gram distributions and language family sizes improves with increasing values of N. The study also raises some new questions about the use of N-gram distributions in linguistic research, which we hope to be able to investigate in the future.},
  booktitle = {Proceedings of the Fourth Swedish Language Technology Conference (SLTC)},
  author = {Rama, Taraka and Borin, Lars},
  year = {2012},
}

@inProceedings{volodina-etal-2012-waste-165936,
  title = {Waste not, want not: Towards a system architecture for ICALL based on NLP component re-use},
  booktitle = {Proceedings of the SLTC 2012 workshop on NLP for CALL, Lund, 25th October, 2012},
  author = {Volodina, Elena and Borin, Lars and Loftsson, Hrafn and Arnbjörnsdóttir, Birna and Leifsson, Guðmundur Örn},
  year = {2012},
  pages = {47--58},
}

@inProceedings{akesson-lindh-2013-describing-188836,
  title = {Describing a database collection procedure for studying ‘double filtering’ effects},
  booktitle = {22nd conference of the International Association for Forensic Phonetics and Acoustics (IAFPA).
July 21st-24th, 2013, Tampa, Florida, USA},
  author = {Åkesson, Joel and Lindh, Jonas},
  year = {2013},
}

@inProceedings{borin-etal-2013-lexical-186032,
  title = {The lexical editing system of Karp},
  abstract = {Karp is the open lexical infrastructure of Språkbanken (the Swedish Language Bank). The infrastructure has three main functions: (1) to support the work on creating, curating, and integrating our various lexical resources; (2) to publish the resources, making them searchable and downloadable; and (3) to offer advanced editing functionalities. An important feature of the lexical infrastructure is also that we maintain a strong bidirectional connection to our corpus infrastructure. At the heart of the infrastructure is the SweFN++ project with the goal to create free Swedish lexical resources geared towards language technology applications. The infrastructure currently hosts 23 Swedish lexical resources. The resources are integrated through links to a pivot lexical resource, SALDO, a large morphological and lexical-semantic resource for modern Swedish.},
  booktitle = {Kosem, I., Kallas, J., Gantar, P., Krek, S., Langemets, M., Tuulik, M. (eds.) 2013. Electronic lexicography in the 21st century: thinking outside the paper. Proceedings of the eLex 2013 conference, 17-19 October 2013, Tallinn, Estonia.},
  author = {Borin, Lars and Forsberg, Markus and Olsson, Leif-Jöran and Olsson, Olof and Uppström, Jonatan},
  year = {2013},
  publisher = {Trojina, Institute for Applied Slovene Studies / Eesti Keele Instituut},
  address = {Ljubljana/Tallinn},
  ISBN = {978-961-93594-0-2},
}

@inProceedings{ju-etal-2013-learning-166990,
  title = {Learning to Rank from Structures in Hierarchical Text Classification},
  abstract = {In this paper, we model learning to rank algorithms based on structural dependencies in hierarchical multi-label text categorization (TC).
Our method uses the classification probability of the binary classifiers of a standard top-down approach to generate k-best hypotheses. The latter are generated according to their global probability while at the same time satisfy the structural constraints between father and children nodes. The rank is then refined using Support Vector Machines and tree kernels applied to a structural representation of hypotheses, i.e., a hierarchy tree in which the outcome of binary one-vs-all classifiers is directly marked in its nodes. Our extensive experiments on the whole Reuters Corpus Volume 1 show that our models significantly improve over the state of the art in TC, thanks to the use of structural dependencies.},
  booktitle = {Advances in Information Retrieval; 35th European Conference on IR Research, ECIR 2013, Moscow, Russia, March 24-27, 2013; P. Serdyukov et al. (ed)},
  author = {Ju, Qi and Moschitti, Alessandro and Johansson, Richard},
  year = {2013},
  series = {Lecture Notes in Computer Science},
  volume = {7814},
  ISBN = {978-3-642-36972-8},
  pages = {183--194},
}

@inProceedings{johansson-2013-training-173587,
  title = {Training Parsers on Incompatible Treebanks},
  abstract = {We consider the problem of training a statistical parser in the situation when there are multiple treebanks available, and these treebanks are annotated according to different linguistic conventions. To address this problem, we present two simple adaptation methods: the first method is based on the idea of using a shared feature representation when parsing multiple treebanks, and the second method on guided parsing where the output of one parser provides features for a second one. To evaluate and analyze the adaptation methods, we train parsers on treebank pairs in four languages: German, Swedish, Italian, and English. We see significant improvements for all eight treebanks when training on the full training sets. However, the clearest benefits are seen when we consider smaller training sets.
Our experiments were carried out with unlabeled dependency parsers, but the methods can easily be generalized to other feature-based parsers.},
  booktitle = {Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  author = {Johansson, Richard},
  year = {2013},
  pages = {127--137},
}

% The workshop location is kept in its own venue field instead of being appended to the paper title.
@inProceedings{kokkinakis-2013-figurative-168227,
  title = {Figurative Language in Swedish Clinical Texts},
  venue = {Potsdam, Germany},
  abstract = {Automated processing of clinical texts with the intention to link all important text fragments to various established terminologies and ontologies for relation or event extraction is commonly faced with various less exposed, and not so regularly discussed linguistically motivated issues that need to be addressed. One of these issues is the usage of figurative language. Figurative language, that is the use of words that go beyond their ordinary meaning, is not only a linguistically complex and challenging problem but also a problem that causes great difficulty for the field of natural language processing (NLP), both for the processing of general language and of various sublanguages, such as clinical medicine. Therefore, a comprehensive model of e.g. clinical language processing needs to account for figurative language usage and this paper provides a description towards this goal. Since the empirical, clinical data used in the study is limited in size, there is no formal distinction made between different sub-classifications of figurative language, e.g., metaphors, idioms or simile. As a matter of fact, all these types of expressions form a continuum with fuzzy boundaries, and most of the NLP-oriented approaches discussed in the past have used either very large data for the analysis or hand-annotated samples, a situation that has been prohibitive so far in our project.
Therefore distinction is solely based on a more general level, namely between literal versus figurative language, and on a more quantitative and corpus-based level, supported with concrete examples that illustrate several types of figurative expressions in the clinical discourse. The main research questions that this paper asks are whether there are traces of figurative language (or at least a subset of such types) in patient doctor and patient nurse interactions, how can they be found in a convenient way and whether these are transferred in the electronic health records and to what degree. },
  booktitle = {Computational Semantics in Clinical Text workshop. Part of the 10th International Conference on Computational Semantics},
  author = {Kokkinakis, Dimitrios},
  year = {2013},
  ISBN = {978-1-62748-398-8},
  pages = {6},
}

@inProceedings{lindh-akesson-2013-pilot-188837,
  title = {A pilot study on the effect of different phonetic acoustic input to a GMM - UBM system for voice comparison},
  booktitle = {22nd conference of the International Association for Forensic Phonetics and Acoustics (IAFPA). July 21st-24th, 2013, Tampa, Florida, USA},
  author = {Lindh, Jonas and Åkesson, Joel},
  year = {2013},
}

@inProceedings{backstrom-etal-2013-automatic-178351,
  title = {Automatic identification of construction candidates for a Swedish constructicon},
  abstract = {We present an experiment designed for extracting construction candidates for a Swedish constructicon from text corpora. We have explored the use of hybrid n-grams with the practical goal to discover previously undescribed partially schematic constructions. The experiment was successful, in that quite a few new constructions were discovered. The precision is low, but as a push-button tool for construction discovery, it has proven a valuable tool for the work on a Swedish constructicon.},
  booktitle = {Proceedings of the workshop on lexical semantic resources for NLP at NODALIDA 2013, May 22-24, 2013, Oslo, Norway.
NEALT Proceedings Series 19},
  author = {Bäckström, Linnéa and Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Prentice, Julia and Sköldberg, Emma},
  year = {2013},
  pages = {2--11},
}

% Technical report: the issuing body is stored in the institution field, which report entries require.
@techreport{roxendal-2013-state-189377,
  title = {State Chart XML (SCXML): State Machine Notation for Control Abstraction – W3C Working Draft 6 December 2012},
  author = {Roxendal, Johan},
  year = {2013},
  institution = {MIT},
  address = {Cambridge, USA},
}

@inProceedings{hamon-etal-2013-medication-189545,
  title = {Medication Extraction and Guessing in Swedish, French and English.},
  abstract = {Extraction of information related to the medication is an important task within the biomedical area. While the elaboration and updating of the drug vocabularies cannot follow the rapid evolution of the drug development, we propose an automatic method for the extraction of known and new drug names. Our method combines internal and contextual clues. The method is applied to different types of documents in three languages (Swedish, French and English). The results indicate that with this kind of approach, we can efficiently update and enrich the existing drug vocabularies (probably with rapid manual browsing). Precision and recall scores varied between 81%-91% for precision and 85%-100% for recall. As a future work we intend to continuously refine the approach, by for instance better integration of semantic patterns and fuzzy matching that should hopefully enable further increase of the obtained results.},
  booktitle = {Proceedings of the 14th World Congress on Medical and Health Informatics (MEDINFO). Studies in Health Technology and Informatics.
Copenhagen, Denmark.},
  author = {Hamon, Thierry and Grabar, Natalia and Kokkinakis, Dimitrios},
  year = {2013},
  volume = {192},
}

@article{andersson-ahlberg-2013-towards-181972,
  title = {Towards automatic tracking of lexical change: linking historical lexical resources},
  journal = {NEALT Proceedings Series},
  author = {Andersson, Peter and Ahlberg, Malin},
  year = {2013},
  volume = {18},
}

@inProceedings{dannells-etal-2013-mapserver-178095,
  title = {MapServer for Swedish Language Technology},
  abstract = {The MapServer application used by the Swedish Language Bank provides new opportunities for visualizing geographical information found in its large repository of written texts, in particular literary texts. The application is capable of performing coordinate search on the basis of recognized place names and rendering both static and dynamic maps that display their geographical locations. },
  booktitle = {Digital Humanities},
  author = {Dannélls, Dana and Borin, Lars and Olsson, Leif-Jöran},
  year = {2013},
}

@inProceedings{kokkinakis-eklund-2013-query-189552,
  title = {Query Logs as a Corpus.},
  abstract = {This paper provides a detailed description of a large Swedish health-related query log corpus and explores means to derive useful statistics, their distributions and analytics from its content across several dimensions. Information acquisition from query logs can be useful for several purposes and potential types of users, such as terminologists, infodemiologists / epidemiologists, medical data and web analysts, specialists in NLP technologies such as information retrieval and text mining but also public officials in health and safety organizations.},
  booktitle = {Corpus Linguistics 2013 : abstract book.
Lancaster: UCREL},
  editor = {Andrew Hardie and Robbie Love},
  author = {Kokkinakis, Dimitrios and Eklund, Ann-Marie},
  year = {2013},
  pages = {329},
}

@inProceedings{kokkinakis-2013-terminologihantering-189541,
  title = {Terminologihantering i medicinska loggfiler.},
  booktitle = {Proceedings of the "Nationell termkonferens". Göteborg},
  author = {Kokkinakis, Dimitrios},
  year = {2013},
}

@inProceedings{kokkinakis-malm-2013-macroanalytic-188518,
  title = {A Macroanalytic View of Swedish Literature using Topic Modeling.},
  abstract = {New research opportunities are plentiful for digital and literature scholars who are currently faced with increasingly large portions of large digitized archives produced during the last decades. Conventional methods of analysis involving a so called close reading view are not enough. Distant reading or macroanalysis is proposed instead, as a better, viable and more pragmatic alternative to the traditional methods of analyzing e.g., literature. According to this view, understanding literature is not accomplished by studying individual texts, but by aggregating and analyzing massive amounts of data. Therefore, applying macroanalytic methods and technologies is a priority among many research groups in the humanities worldwide. In this paper we explore topic modeling, an increasingly popular statistical method used for uncovering themes, topics and patterns in large amounts of text. We use available topic modeling software and, as empirical data, the content of the Swedish literature bank, a constantly growing body of Swedish fiction corpus from the 18th and 19th century.
We present preliminary results on a sample of this corpus and discuss how humanistic research can be conducted through this type of computation, as a means to identify potential issues of interest e.g., for historians.},
  booktitle = {Corpus Linguistics 2013 : abstract book (Lancaster)},
  editor = {Andrew Hardie and Robbie Love},
  author = {Kokkinakis, Dimitrios and Malm, Mats},
  year = {2013},
}

@article{johansson-moschitti-2013-relational-158811,
  title = {Relational Features in Fine-grained Opinion Analysis},
  abstract = {Fine-grained opinion analysis often makes use of linguistic features but typically does not take the interaction between opinions into account. This article describes a set of experiments that demonstrate that relational features, mainly derived from dependency-syntactic and semantic role structures, can significantly improve the performance of automatic systems for a number of fine-grained opinion analysis tasks: marking up opinion expressions, finding opinion holders, and determining the polarities of opinion expressions. These features make it possible to model the way opinions expressed in natural-language discourse interact in a sentence over arbitrary distances. The use of relations requires us to consider multiple opinions simultaneously, which makes exact inference intractable. However, a reranker can be used as a sufficiently accurate and efficient approximation. A number of feature sets and machine learning approaches for the rerankers are evaluated. For the task of opinion expression extraction, the best model shows a 10-point absolute improvement in soft recall on the MPQA corpus over a conventional sequence labeler based on local contextual features, while precision decreases only slightly. Significant improvements are also seen for the extended tasks where holders and polarities are considered: 10 and 7 points in recall, respectively.
In addition, the systems outperform previously published results for unlabeled (6 F-measure points) and polarity-labeled (10–15 points) opinion expression extraction. Finally, as an extrinsic evaluation, the extracted MPQA-style opinion expressions are used in practical opinion mining tasks. In all scenarios considered, the machine learning features derived from the opinion expressions lead to statistically significant improvement.},
  journal = {Computational Linguistics},
  author = {Johansson, Richard and Moschitti, Alessandro},
  year = {2013},
  volume = {39},
  number = {3},
  pages = {473--509},
}

@inProceedings{kokkinakis-2013-annotation-189536,
  title = {Annotation of interpersonal relations in Swedish prose fiction.},
  abstract = {This paper describes the manual annotation of a small sample of Swedish 19th and 20th century prose fiction with interpersonal relations between characters in six literary works. An interpersonal relationship is an association between two or more people that may range in duration from brief to enduring. The annotation is guided by a named entity recognition step. Our goal is to get an in-depth understanding of the difficulties of such a task and elaborate a model that can be applied for similar annotation on a larger scale, both manually as well as automatically. The identification of interpersonal relations can, hopefully, aid the reader of a Swedish literary work to better understand its content and plot, and get a bird’s eye view on the landscape of the core story.
Our aim is to use such annotations in a hybrid context, i.e., using machine learning and rule-based methods, which, in conjunction with named entity recognition, can provide the necessary infrastructure for creating detailed biographical sketches and extracting facts for various named entities which can be exploited in various possible ways by Natural Language Processing (NLP) technologies such as summarization, question answering, as well as visual analytic techniques.},
  booktitle = {Proceedings of the 3rd Workshop on Annotation of Corpora for Research in the Humanities (ACRH-3). Sofia, Bulgaria.},
  author = {Kokkinakis, Dimitrios},
  year = {2013},
  ISBN = {978-954-91700-5-4},
  pages = {37--47},
}

% The container is the Springer series; its name is spelled in the canonical form.
@inProceedings{bennaceur-etal-2013-automatic-158812,
  title = {Automatic Service Categorisation through Machine Learning in Emergent Middleware},
  booktitle = {Lecture Notes in Computer Science},
  author = {Bennaceur, Amel and Johansson, Richard and Moschitti, Alessandro and Sykes, Daniel and Issarny, Valérie},
  year = {2013},
  volume = {7542},
  pages = {133--149},
}

@inProceedings{gustavsson-etal-2013-neural-177670,
  title = {Neural processing of voices - Familiarity},
  abstract = {Brain responses to familiar and unfamiliar voices were investigated with ERPs (Event Related Potentials). Presentation of a stream of one syllable utterances from a female voice established a standard expectation, and similar samples from four other male voices were inserted as unexpected deviants in a typical mismatch paradigm. The participants were 12 students from the basic course in linguistics. Two of the deviant voices were familiar voices of their teachers. The two other deviant voices were matched (same age, sex and dialect) but unfamiliar to the participants. A typical MMN (Mismatch Negativity) was elicited, i.e. a more negative response to the deviants compared to the standards.
In contrast to verbal reports, where only one participant identified any of the deviant voices, the MMN response differed on group level between familiar and unfamiliar voices. MMN to familiar voices was larger. Using teachers' voices ensured naturalistic long term exposure, but did not allow for random assignment to conditions of familiarity making the design quasi-experimental. Thus acoustic analysis of voice characteristics as well as follow up studies with randomized exposure to voices are needed to rule out possible confounds and establish a causal effect of voice familiarity.},
  booktitle = {Proceedings of 21st International Congress on Acoustics},
  author = {Gustavsson, Lisa and Kallioinen, Petter and Klintfors, Eeva and Lindh, Jonas},
  year = {2013},
  volume = {19},
  number = {I},
  pages = {060204--6},
}

@inProceedings{ghosh-etal-2013-mining-188844,
  title = {Mining Fine-grained Opinion Expressions with Shallow Parsing},
  abstract = {Opinion analysis deals with public opinions and trends, but subjective language is highly ambiguous. In this paper, we follow a simple data-driven technique to learn fine-grained opinions. We select an intersection set of Wall Street Journal documents that is included both in the Penn Discourse Tree Bank (PDTB) and in the Multi-Perspective Question Answering (MPQA) corpus. This is done in order to explore the usefulness of discourse-level structure to facilitate the extraction of fine-grained opinion expressions. Here we perform shallow parsing of MPQA expressions with connective based discourse structure, and then also with Named Entities (NE) and some syntax features using conditional random fields; the latter feature set is basically a collection of NEs and a bundle of features that is proved to be useful in a shallow discourse parsing task.
We found that both of the feature-sets are useful to improve our baseline at different levels of this fine-grained opinion expression mining task.},
  booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing},
  author = {Ghosh, Sucheta and Tonelli, Sara and Johansson, Richard},
  year = {2013},
  pages = {302--310},
}

@inProceedings{skoldberg-etal-2013-between-186041,
  title = {Between Grammars and Dictionaries: a Swedish Constructicon },
  abstract = {This paper introduces the Swedish Constructicon (SweCxn), a database of Swedish constructions currently under development. We also present a small study of the treatment of constructions in Swedish (paper) dictionaries, thus illustrating the need for a constructionist approach, and discuss three different methods used to identify potential constructions for inclusion in the constructicon. SweCxn is a freely available electronic resource, with a particular focus on semi-general linguistic patterns of the type that are difficult to account for from a purely lexicographic or a purely grammatical perspective, and which therefore have tended to be neglected in both dictionaries and grammars. Far from being a small set of borderline cases, such constructions are both numerous and common. They are also quite problematic for second language acquisition as well as LT applications. Accordingly, various kinds of multi-word units have received more attention in recent years, not least from a lexicographic perspective. The coverage, however, is only partial, and the productivity of many constructions is hard to capture from a lexical viewpoint. To identify constructions for SweCxn, we use a combination of methods, such as working from existing construction descriptions for Swedish and other languages, applying LT tools to discover recurring patterns in texts, and extrapolating constructional information from dictionaries.
},
  booktitle = {Kosem, I., Kallas, J., Gantar, P., Krek, S., Langemets, M., Tuulik, M. (eds.) 2013. Electronic lexicography in the 21st century: thinking outside the paper. Proceedings of the eLex 2013 conference, 17-19 October 2013, Tallinn, Estonia. Ljubljana/Tallinn: Trojina, Institute for Applied Slovene Studies/Eesti Keele Instituut.},
  author = {Sköldberg, Emma and Bäckström, Linnéa and Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Olsson, Leif-Jöran and Prentice, Julia and Rydstedt, Rudolf and Tingsell, Sofia and Uppström, Jonatan},
  year = {2013},
  pages = {310--327},
}

% Edited workshop volume: typed as a standard proceedings entry so that styles recognise it and print the editors.
@proceedings{eyorsson-etal-2013-proceedings-190256,
  title = {Proceedings of the workshop on computational historical linguistics at NODALIDA 2013, May 22-24, 2013, Oslo, Norway},
  editor = {Eyþórsson, Þórhallur and Borin, Lars and Haug, Dag and Rögnvaldsson, Eiríkur},
  year = {2013},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-7519-587-2},
}

@inProceedings{bouma-adesam-2013-experiments-177631,
  title = {Experiments on sentence segmentation in Old Swedish editions},
  booktitle = {NEALT Proceedings Series},
  author = {Bouma, Gerlof and Adesam, Yvonne},
  year = {2013},
  volume = {18},
  ISBN = {978-91-7519-587-2},
}

@article{borin-etal-2013-close-187063,
  title = {Close encounters of the fifth kind: Some linguistic and computational aspects of the Swedish FrameNet++ project},
  abstract = {The Swedish FrameNet++ (SweFN++) project aims at developing an integrated Swedish lexical macro-resource to be used primarily in language technology R&D to build natural language processing (NLP) applications. Most of the component resources making up SweFN++ are existing digital lexical resources; in their case the central project effort is directed at making them interoperable on as many levels as possible. An important new resource being created in the project is a Swedish framenet.
Now a sister project is starting with the aim of adding a Swedish constructicon (SweCxn) to the macro-resource. In this paper, we discuss some theoretical and conceptual issues which have arisen in the course of our work on the SweFN++ and the planning of the SweCxn, in the close encounter between the practical requirements of NLP and the theory and practice of linguistic – lexical and grammatical – description. },
  journal = {Veredas},
  author = {Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin},
  year = {2013},
  volume = {17},
  number = {1},
  pages = {28--43},
}

@inProceedings{pedersen-etal-2013-nordic-178357,
  title = {Nordic and Baltic wordnets aligned and compared through “WordTies”},
  abstract = {During the last few years, extensive wordnets have been built locally for the Nordic and Baltic languages applying very different compilation strategies. The aim of the present investigation is to consolidate and examine these wordnets through an alignment via Princeton Core WordNet and thereby compare them along the measures of taxonomical structure, synonym structure, and assigned relations to approximate to a best practice. A common web interface and visualizer “WordTies” is developed to facilitate this purpose. Four bilingual wordnets are automatically processed and evaluated exposing interesting differences between the wordnets. Even if the alignments are judged to be of a good quality, the precision of the translations vary due to considerable differences in hyponymy depth and interpretation of the synset. All seven monolingual and four bilingual wordnets as well as WordTies have been made available via META-SHARE through the META-NORD project.},
  booktitle = {Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013), May 22–24, 2013, Oslo University, Norway.
NEALT Proceedings Series 16},
  author = {Pedersen, Bolette and Borin, Lars and Forsberg, Markus and Kahusk, Neeme and Lindén, Krister and Niemi, Jyrki and Nisbeth, Niklas and Nygaard, Lars and Orav, Heili and Rögnvaldsson, Eiríkur and Seaton, Mitchel and Vider, Kadri and Voionmaa, Kaarlo},
  year = {2013},
  number = {16},
  pages = {147--162},
}

% The conference location is kept in its own venue field instead of being appended to the paper title.
@inProceedings{kokkinakis-2013-medical-188517,
  title = {Medical Event Extraction using Frame Semantics - Challenges and Opportunities},
  venue = {Samos, Greece},
  abstract = {The aim of this paper is to present some findings from a study into how a large scale semantic resource, FrameNet, can be applied for event extraction in the (Swedish) biomedical domain. Combining lexical resources with domain specific knowledge provide a powerful modeling mechanism that can be utilized for event extraction and other advanced text mining-related activities. The results, from developing a rule-based approach, showed that only small discrepancies and omissions were found between the semantic descriptions, the corpus data examined and the domain-specific semantics provided by SNOMED CT (medical terminology), NPL (medicinal products) and various semi-automatically developed clue lists (e.g., domain-related abbreviations).
Although the described experiment is only based on four different domain-specific frames, the methodology is extendable to the rest ones and there is much room for improvements, for instance by combining rule-based with machine learning techniques, and using more advanced syntactic representations.},
  booktitle = {Proceedings of the 14th International Conference on Intelligent Text Processing and Computational Linguistics (CICLing)},
  author = {Kokkinakis, Dimitrios},
  year = {2013},
}

% Edited workshop volume: typed as a standard proceedings entry so that styles recognise it and print the editors.
@proceedings{borin-etal-2013-proceedings-190260,
  title = {Proceedings of the workshop on lexical semantic resources for NLP at NODALIDA 2013, May 22-24, 2013, Oslo, Norway},
  editor = {Borin, Lars and Fjeld, Ruth Vatvedt and Forsberg, Markus and Nimb, Sanni and Nugues, Pierre and Pedersen, Bolette Sandford},
  year = {2013},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-7519-586-5},
}

@article{oelke-etal-2013-fingerprint-181484,
  title = {Fingerprint Matrices: Uncovering the dynamics of social networks in prose literature},
  abstract = {In prose literature often complex dynamics of interpersonal relationships can be observed between the different characters. Traditionally, node-link diagrams are used to depict the social network of a novel. However, static graphs can only visualize the overall social network structure but not the development of the networks over the course of the story, while dynamic graphs have the serious problem that there are many sudden changes between different portions of the overall social network. In this paper we explore means to show the relationships between the characters of a plot and at the same time their development over the course of a novel. Based on a careful exploration of the design space, we suggest a new visualization technique called Fingerprint Matrices.
A case study exemplifies the usage of Fingerprint Matrices and shows that they are an effective means to analyze prose literature with respect to the development of relationships between the different characters.},
  journal = {Computer Graphics Forum},
  author = {Oelke, Daniela and Kokkinakis, Dimitrios and Keim, D. A.},
  year = {2013},
  volume = {32},
  number = {3},
  pages = {371--380},
}

@article{borin-etal-2013-saldo-188604,
  title = {SALDO: a touch of yin to WordNet's yang},
  abstract = {The English-language Princeton WordNet (PWN) and some wordnets for other languages have been extensively used as lexical–semantic knowledge sources in language technology applications, due to their free availability and their size. The ubiquitousness of PWN-type wordnets tends to overshadow the fact that they represent one out of many possible choices for structuring a lexical-semantic resource, and it could be enlightening to look at a differently structured resource both from the point of view of theoretical–methodological considerations and from the point of view of practical text processing requirements. The resource described here—SALDO—is such a lexical–semantic resource, intended primarily for use in language technology applications, and offering an alternative organization to PWN-style wordnets. We present our work on SALDO, compare it with PWN, and discuss some implications of the differences.
We also describe an integrated infrastructure for computational lexical resources where SALDO forms the central component.},
  journal = {Language Resources and Evaluation},
  author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart},
  year = {2013},
  volume = {47},
  number = {4},
  pages = {1191--1211},
}

% Technical report: the issuing body is stored in the institution field, which report entries require.
@techreport{roxendal-2013-state-189376,
  title = {State Chart XML (SCXML): State Machine Notation for Control Abstraction – W3C Last Call Working Draft 1 August 2013},
  author = {Roxendal, Johan},
  year = {2013},
  institution = {MIT},
  address = {Cambridge, USA},
}

% Edited workshop volume: typed as a standard proceedings entry so that styles recognise it and print the editors.
@proceedings{desmedt-etal-2013-proceedings-190263,
  title = {Proceedings of the workshop on Nordic language research infrastructure at NODALIDA 2013, May 22-24, 2013, Oslo, Norway},
  editor = {De Smedt, Koenraad and Borin, Lars and Lindén, Krister and Maegaard, Bente and Rögnvaldsson, Eiríkur and Vider, Kadri},
  year = {2013},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-7519-585-8},
}

@inProceedings{volodina-johanssonkokkinakis-2013-compiling-188550,
  title = {Compiling a corpus of CEFR-related texts.},
  abstract = {This paper reports on initial efforts to compile a corpus of course book texts used for teaching CEFR-based courses of Swedish to adult immigrants. The research agenda behind compiling such a corpus comprises the study of normative “input” texts that can reveal a number of facts about what is being taught in terms of explicit grammar, receptive vocabulary, text and sentence readability; as well as build insights into linguistic characteristics of normative texts which can help anticipate learner performance in terms of active vocabulary, grammatical competence, etc. in classroom and testing settings. The CEFR “can-do” statements are known to offer flexibility in interpreting them for different languages and target groups.
However, they are nonspecific and therefore it is difficult to associate different kinds of competences and levels of accuracy learners need in order to perform the communicative tasks with the different CEFR levels. To address this problem a systematic study needs to be performed for each individual language, both for “input” normative texts and “output” learner-produced texts. In this project we take the first step to collect and study normative texts for Swedish. The article describes the process of corpus compilation, annotation scheme of CEFR-relevant parameters, and methods proposed for text analysis, namely statistic and empiric methods, as well as techniques coming from computational linguistics/machine learning.},
  booktitle = {Proceedings of the Language Testing and CEFR conference, Antwerpen, Belgium, May 27-29, 2013},
  author = {Volodina, Elena and Johansson Kokkinakis, Sofie},
  year = {2013},
}

@inProceedings{skadina-etal-2013-baltic-194532,
  title = {Baltic and Nordic parts of the European linguistic infrastructure},
  booktitle = {71. Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013) 22-24, May 2013 Oslo, Norway},
  author = {Skadina, Inguna and Vasiljevs, Andrejs and Borin, Lars and Lindén, Krister and Losnegaard, Gyri and Pedersen, Bolette Sandford and Rozis, Roberts and De Smedt, Koenraad},
  year = {2013},
  ISBN = {978-91-7519-589-6},
  pages = {195--211},
}

@inProceedings{pijetlovic-volodina-2013-developing-188543,
  title = {Developing a Swedish spelling game on an ICALL platform},
  abstract = {In this project we developed web services on the ICALL platform Lärka for automatic generation of Swedish spelling exercises using Text-To-Speech (TTS) technology which allows L2 learners to train their spelling and listening individually performance based levels. The embedded avatar pronounces a random item of the desired level, which the user has to spell.
Furthermore, the users have the possibility to train their own words for different linguistic levels. A result tracker containing a total and correct answer score keeps track of the language learner’s performance. In order to analyse typical spelling errors and provide better feedback, misspellings are collected in a database. The usability of the spelling exercises, concerning the different linguistic levels and the quality of speech, has been evaluated through a questionnaire with 10 participants.},
  booktitle = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, Évora, Portugal, Proceedings.},
  author = {Pijetlovic, Dijana and Volodina, Elena},
  year = {2013},
  ISBN = {978-1-908416-12-4},
}

@edited_book{volodina-etal-2013-proceedings-188675,
  title = {Proceedings of the second workshop on NLP for computer-assisted language learning at NODALIDA 2013 May 22-24, 2013, Oslo, Norway},
  editor = {Volodina, Elena and Borin, Lars and Loftsson, Hrafn},
  year = {2013},
  publisher = {Linköping University Press},
  address = {Linköping, Sweden},
  ISBN = {978-91-7519-588-9},
}

@inProceedings{borin-etal-2013-mining-188846,
  title = {Mining semantics for culturomics: towards a knowledge-based approach},
  abstract = {The massive amounts of text data made available through the Google Books digitization project have inspired a new field of big-data textual research. Named culturomics, this field has attracted the attention of a growing number of scholars over recent years. However, initial studies based on these data have been criticized for not referring to relevant work in linguistics and language technology. This paper provides some ideas, thoughts and first steps towards a new culturomics initiative, based this time on Swedish data, which pursues a more knowledge-based approach than previous work in this emerging field.
The amount of new Swedish text produced daily and older texts being digitized in cultural heritage projects grows at an accelerating rate. These volumes of text being available in digital form have grown far beyond the capacity of human readers, leaving automated semantic processing of the texts as the only realistic option for accessing and using the information contained in them. The aim of our recently initiated research program is to advance the state of the art in language technology resources and methods for semantic processing of Big Swedish text and focus on the theoretical and methodological advancement of the state of the art in extracting and correlating information from large volumes of Swedish text using a combination of knowledge-based and statistical methods.},
  booktitle = {2013 ACM International Workshop on Mining Unstructured Big Data Using Natural Language Processing, UnstructureNLP 2013, Held at 22nd ACM International Conference on Information and Knowledge Management, CIKM 2013; San Francisco, CA; United States; 28 October 2013 through 28 October 2013},
  author = {Borin, Lars and Dubhashi, Devdatt and Forsberg, Markus and Johansson, Richard and Kokkinakis, Dimitrios and Nugues, Pierre},
  year = {2013},
  ISBN = {978-1-4503-2415-1},
  pages = {3--10},
}

@inProceedings{ahlberg-etal-2013-korp-178355,
  title = {Korp and Karp – a bestiary of language resources: the research infrastructure of Språkbanken},
  abstract = {A central activity in Språkbanken, an R&D unit at the University of Gothenburg, is the systematic construction of a research infrastructure based on interoperability and widely accepted standards for metadata and data. The two main components of this infrastructure deal with text corpora and with lexical resources. For modularity and flexibility, both components have a backend, or server-side part, accessed through an API made up of a set of well-defined web services.
This means that there can be any number of different user interfaces to these components, corresponding, e.g., to different research needs. Here, we will demonstrate the standard corpus and lexicon search interfaces, designed primarily for linguistic searches: Korp and Karp.},
  booktitle = {Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013), May 22–24, 2013, Oslo University, Norway. NEALT Proceedings Series 16},
  author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Hammarstedt, Martin and Olsson, Leif-Jöran and Olsson, Olof and Roxendal, Johan and Uppström, Jonatan},
  year = {2013},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
}

@edited_book{borin-saxena-2013-approaches-184757,
  title = {Approaches to Measuring Linguistic Differences},
  abstract = {The present volume collects contributions addressing different aspects of the measurement of linguistic differences, a topic which probably is as old as language itself but at the same time has acquired renewed interest over the last decade or so, reflecting a rapid development of data-intensive computing in all fields of research, including linguistics.},
  editor = {Borin, Lars and Saxena, Anju},
  year = {2013},
  publisher = {De Gruyter Mouton},
  address = {Berlin},
  ISBN = {978-3-11-030525-8},
}

@inProceedings{pilan-etal-2013-automatic-188465,
  title = {Automatic Selection of Suitable Sentences for Language Learning Exercises},
  abstract = {In this study we investigated second and foreign language (L2) sentence readability, an area little explored so far in the case of several languages, including Swedish. The outcome of our research consists of two methods for sentence selection from native language corpora based on Natural Language Processing (NLP) and machine learning (ML) techniques.
The two approaches have been made available online within Lärka, an Intelligent CALL (ICALL) platform offering activities for language learners and students of linguistics. Such an automatic selection of suitable sentences can be valuable for L2 teachers during the creation of new teaching materials, for L2 students who look for additional self-study exercises as well as for lexicographers in search of example sentences to illustrate the meaning of a vocabulary item. Members from all these potential user groups evaluated our methods and found the majority of the sentences selected suitable for L2 learning purposes.},
  booktitle = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, 11th to 14th September 2013 Évora, Portugal, Proceedings.},
  author = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard},
  year = {2013},
  ISBN = {978-1-908416-12-4},
  pages = {218--225},
}

@incollection{borin-etal-2013-intercontinental-184760,
  title = {The Intercontinental Dictionary Series – a rich and principled database for language comparison},
  booktitle = {Approaches to Measuring Linguistic Differences},
  editor = {Borin, Lars and Saxena, Anju},
  author = {Borin, Lars and Comrie, Bernard and Saxena, Anju},
  year = {2013},
  publisher = {De Gruyter Mouton},
  address = {Berlin},
  ISBN = {978-3-11-030525-8},
  pages = {285--302},
}

@incollection{borin-2013-measuring-184758,
  title = {The why and how of measuring linguistic differences},
  booktitle = {Approaches to Measuring Linguistic Differences},
  editor = {Borin, Lars and Saxena, Anju},
  author = {Borin, Lars},
  year = {2013},
  publisher = {De Gruyter Mouton},
  address = {Berlin},
  ISBN = {978-3-11-030525-8},
  pages = {3--26},
}

@inProceedings{volodina-etal-2013-towards-188549,
  title = {Towards a gold standard for Swedish CEFR-based ICALL},
  abstract = {In qualitative projects on ICALL (Intelligent Computer-Assisted Language Learning), research and development always go hand in hand: development both depends
upon the research results and dictates the research agenda. Likewise, in the development of the Swedish ICALL platform Lärka, the practical issues of development have dictated its research agenda. With NLP approaches, sooner or later, the necessity for reliable training data becomes unavoidable. At the moment Lärka's research agenda cannot be addressed without access to reliable training data, so-called “gold standard”. This paper gives an overview of the current state of the Swedish ICALL platform development and related research agenda, and describes the first attempts to collect the reference corpus (“gold standard”) coming from course books used in CEFR-based language teaching.},
  booktitle = {Proceedings of the Second Workshop on NLP for Computer-Assisted Language Learning. NEALT Proceedings Series 17. Nodalida 2013, Oslo, Norway.},
  author = {Volodina, Elena and Pijetlovic, Dijana and Pilán, Ildikó and Johansson Kokkinakis, Sofie},
  year = {2013},
  ISBN = {978-91-7519-588-9},
}

@incollection{saxena-borin-2013-carving-184759,
  title = {Carving Tibeto-Kanauri by its joints: Using basic vocabulary lists for genetic grouping of languages},
  booktitle = {Approaches to Measuring Linguistic Differences},
  editor = {Borin, Lars and Saxena, Anju},
  author = {Saxena, Anju and Borin, Lars},
  year = {2013},
  publisher = {De Gruyter Mouton},
  address = {Berlin},
  ISBN = {978-3-11-030525-8},
  pages = {175--198},
}

@inProceedings{lenkiewicz-etal-2014-dwan-216695,
  title = {The DWAN framework: Application of a web annotation framework for the general humanities to the domain of language resources},
  abstract = {Researchers share large amounts of digital resources, which offer new chances for cooperation. Collaborative annotation systems are meant to support this. Often, these systems are targeted at a specific task or domain, e.g., annotation of a corpus. The DWAN framework for web annotation is generic and can support a wide range of tasks and domains.
A key feature of the framework is its support for caching representations of the annotated resource. This allows showing the context of the annotation even if the resource has changed or has been removed. The paper describes the design and implementation of the framework. Use cases provided by researchers are well in line with the key characteristics of the DWAN annotation framework.},
  booktitle = {LREC 2014, Reykjavik, Iceland},
  author = {Lenkiewicz, Przemyslaw and Shkaravska, Olha and Goosen, Twan and Windhouwer, Menzo and Broeder, Daan and Roth, Stephanie S. and Olsson, Olof},
  year = {2014},
  url = {http://lrec2014.lrec-conf.org/en/conference-programme/list-accepted-papers/},
}

@inProceedings{dannells-gruzitis-2014-extracting-198499,
  title = {Extracting a bilingual semantic grammar from FrameNet-annotated corpora},
  abstract = {We present the creation of an English-Swedish FrameNet-based grammar in Grammatical Framework. The aim of this research is to make existing framenets computationally accessible for multilingual natural language applications via a common semantic grammar API, and to facilitate the porting of such grammar to other languages. In this paper, we describe the abstract syntax of the semantic grammar while focusing on its automatic extraction possibilities. We have extracted a shared abstract syntax from ~58,500 annotated sentences in Berkeley FrameNet (BFN) and ~3,500 annotated sentences in Swedish FrameNet (SweFN). The abstract syntax defines 769 frame-specific valence patterns that cover 77,8% examples in BFN and 74,9% in SweFN belonging to the shared set of 471 frames.
As a side result, we provide a unified method for comparing semantic and syntactic valence patterns across framenets.},
  booktitle = {Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC)},
  author = {Dannélls, Dana and Gruzitis, Normunds},
  year = {2014},
  publisher = {European Language Resources Association},
  ISBN = {978-2-9517408-8-4},
}

@inProceedings{borin-etal-2014-bring-198549,
  title = {Bring vs. MTRoget: Evaluating automatic thesaurus translation},
  booktitle = {Proceedings of LREC 2014, May 26-31, 2014 Reykjavik, Iceland},
  author = {Borin, Lars and Allwood, Jens and de Melo, Gerard},
  year = {2014},
  publisher = {European Language Resources Association},
  ISBN = {978-2-9517408-8-4},
}

@incollection{damova-etal-2014-natural-178094,
  title = {Natural Language Interaction with Semantic Web Knowledge Bases and Linked Open Data},
  abstract = {Cultural heritage appears to be a very useful use case for Semantic Web technologies. The domain provides with plenty of circumstances where linkages between different knowledge sources are required to ensure access to rich information and respond to the needs of professionals dealing with cultural heritage content. Semantic Web technologies offer the technological backbone to meet the requirement of integrating heterogeneous data easily, but they are still more adapted to be consumed by computers than by humans, especially non-engineers or developers. This chapter is about a technique which allows interaction in natural language with semantic knowledge bases. The proposed technique offers a method that allows querying a semantic repository in natural language and obtaining results from it as a coherent text. This unique solution includes several steps of transition from natural language to SPARQL and from RDF to coherent multilingual descriptions, using the Grammatical Framework, GF.
The approach builds on a semantic knowledge infrastructure in RDF, it is based on OWLIM-SE and the data integration method Reason-able View supplied with an ontological reference layer. The latter is connected via formal rules with abstract representations derived from the syntactic trees of natural language input using the GF resource grammar library.},
  booktitle = {Towards multilingual Semantic Web},
  author = {Damova, Mariana and Dannélls, Dana and Mateva, Maria and Enache, Ramona and Ranta, Aarne},
  year = {2014},
  publisher = {Springer},
  address = {Berlin},
  ISBN = {978-3-662-43585-4},
  pages = {211--226},
}

@inProceedings{volodina-lindstromtiedemann-2014-evaluating-206141,
  title = {Evaluating students' metalinguistic knowledge with Lärka.},
  booktitle = {Proceedings of the 5th Swedish Language Technology Conference, Uppsala University 13-14 November 2014},
  author = {Volodina, Elena and Lindström Tiedemann, Therese},
  year = {2014},
}

@inProceedings{adesam-etal-2014-koala-211376,
  title = {Koala – Korp’s Linguistic Annotations Developing an infrastructure for text-based research with high-quality annotations},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
  author = {Adesam, Yvonne and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Johansson, Richard},
  year = {2014},
}

@article{dupplaw-etal-2014-information-195563,
  title = {Information extraction from multimedia web documents: an open-source platform and testbed},
  abstract = {The LivingKnowledge project aimed to enhance the current state of the art in search, retrieval and knowledge management on the web by advancing the use of sentiment and opinion analysis within multimedia applications. To achieve this aim, a diverse set of novel and complementary analysis techniques have been integrated into a single, but extensible software platform on which such applications can be built.
The platform combines state-of-the-art techniques for extracting facts, opinions and sentiment from multimedia documents, and unlike earlier platforms, it exploits both visual and textual techniques to support multimedia information retrieval. Foreseeing the usefulness of this software in the wider community, the platform has been made generally available as an open-source project. This paper describes the platform design, gives an overview of the analysis algorithms integrated into the system and describes two applications that utilise the system for multimedia information retrieval.},
  journal = {International Journal of Multimedia Information Retrieval},
  author = {Dupplaw, David and Matthews, Michael and Johansson, Richard and Boato, Giulia and Costanzo, Andrea and Fontani, Marco and Minack, Enrico and Demidova, Elena and Blanco, Roi and Griffiths, Thomas and Lewis, Paul and Hare, Jonathon and Moschitti, Alessandro},
  year = {2014},
  volume = {3},
  number = {2},
  pages = {97--111},
}

@inProceedings{kokkinakis-etal-2014-hfst-209800,
  title = {HFST-SweNER. A New NER Resource for Swedish},
  abstract = {Named entity recognition (NER) is a knowledge-intensive information extraction task that is used for recognizing textual mentions of entities that belong to a predefined set of categories, such as locations, organizations and time expressions. NER is a challenging, difficult, yet essential preprocessing technology for many natural language processing applications, and particularly crucial for language understanding. NER has been actively explored in academia and in industry especially during the last years due to the advent of social media data. This paper describes the conversion, modeling and adaptation of a Swedish NER system from a hybrid environment, with integrated functionality from various processing components, to the Helsinki Finite-State Transducer Technology (HFST) platform.
This new HFST-based NER (HFST-SweNER) is a full-fledged open source implementation that supports a variety of generic named entity types and consists of multiple, reusable resource layers, e.g., various n-gram-based named entity lists (gazetteers).},
  booktitle = {Proceedings of the 9th edition of the Language Resources and Evaluation Conference (LREC), Reykjavik 26 - 31 May 2014.},
  author = {Kokkinakis, Dimitrios and Niemi, Jyrki and Hardwick, Sam and Lindén, Krister and Borin, Lars},
  year = {2014},
  ISBN = {978-2-9517408-8-4},
  pages = {2537--2543},
}

@inProceedings{borin-forsberg-2014-swesaurus-193085,
  title = {Swesaurus; or, The Frankenstein Approach to Wordnet Construction},
  abstract = {Swesaurus is a freely available (under a CC-BY license) Swedish wordnet under construction, built primarily by scavenging and recycling information from a number of existing lexical resources. Among its more unusual characteristics are graded lexical-semantic relations and inclusion of all parts of speech, not only open-class items.},
  booktitle = {Proceedings of the Seventh Global WordNet Conference (GWC 2014)},
  author = {Borin, Lars and Forsberg, Markus},
  year = {2014},
  ISBN = {978-9949-32-492-7},
}

@inProceedings{lyngfelt-etal-2014-svenskt-208457,
  title = {Ett svenskt konstruktikon. Grammatik möter lexikon},
  booktitle = {Svenskans beskrivning : Förhandlingar vid Trettiotredje sammankomsten för svenskans beskrivning.
Helsingfors den 15–17 maj 2013},
  author = {Lyngfelt, Benjamin and Borin, Lars and Bäckström, Linnéa and Forsberg, Markus and Olsson, Leif-Jöran and Prentice, Julia and Rydstedt, Rudolf and Sköldberg, Emma and Tingsell, Sofia and Uppström, Jonatan},
  year = {2014},
  volume = {33},
  ISBN = {978-951-51-0120-4},
  pages = {268--279},
}

@article{borin-etal-2014-geographic-198286,
  title = {Geographic visualization of place names in Swedish literary texts},
  abstract = {This article describes the development of a geographical information system (GIS) at Språkbanken as part of a visualization solution to be used in an archive of historical Swedish literary texts. The research problems we are aiming to address concern orthographic and morphological variation, missing place names, and missing place name coordinates. Some of these problems form a central part in the development of methods and tools for the automatic analysis of historical Swedish literary texts at our research unit. We discuss the advantages and challenges of covering large-scale spelling variation in place names from different sources and in generating maps with focus on different time periods.
},
  journal = {Literary \& Linguistic Computing},
  author = {Borin, Lars and Dannélls, Dana and Olsson, Leif-Jöran},
  year = {2014},
  volume = {29},
  number = {3},
  pages = {400--404},
}

@inProceedings{ahlberg-etal-2014-swedish-210083,
  title = {Swedish FrameNet++ The Beginning of the End and the End of the Beginning},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
  author = {Ahlberg, Malin and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Friberg Heppin, Karin and Johansson, Richard and Kokkinakis, Dimitrios and Olsson, Leif-Jöran and Uppström, Jonatan},
  year = {2014},
}

@inProceedings{agfjord-etal-2014-grammar-208776,
  title = {Grammar-based Suggestion Engine with Keyword Search.},
  booktitle = {The Fifth Swedish Language Technology Conference},
  author = {Agfjord, Martin and Angelov, Krasimir and Fredelius, Per and Marinov, Svetoslav},
  year = {2014},
}

@article{johansson-2014-automatic-201874,
  title = {Automatic Expansion of the Swedish FrameNet Lexicon},
  abstract = {We evaluate several lexicon-based and corpus-based methods to automatically induce new lexical units for the Swedish FrameNet, and we see that the best-performing setup uses a combination of both types of methods. A particular challenge for Swedish is the absence of a lexical resource such as WordNet; however, we show that the semantic network SALDO, which is organized according to lexicographical principles quite different from those of WordNet, is very useful for our purposes.},
  journal = {Constructions and Frames},
  author = {Johansson, Richard},
  year = {2014},
  volume = {6},
  number = {1},
  pages = {92--113},
}

@inProceedings{ahlberg-etal-2014-semi-198791,
  title = {Semi-supervised learning of morphological paradigms and lexicons},
  abstract = {We present a semi-supervised approach to the problem of paradigm induction from inflection tables.
Our system extracts generalizations from inflection tables, representing the resulting paradigms in an abstract form. The process is intended to be language-independent, and to provide human-readable generalizations of paradigms. The tools we provide can be used by linguists for the rapid creation of lexical resources. We evaluate the system through an inflection table reconstruction task using Wiktionary data for German, Spanish, and Finnish. With no additional corpus information available, the evaluation yields per word form accuracy scores on inflecting unseen base forms in different languages ranging from 87.81% (German nouns) to 99.52% (Spanish verbs); with additional unlabeled text corpora available for training the scores range from 91.81% (German nouns) to 99.58% (Spanish verbs). We separately evaluate the system in a simulated task of Swedish lexicon creation, and show that on the basis of a small number of inflection tables, the system can accurately collect from a list of noun forms a lexicon with inflection information ranging from 100.0% correct (collect 100 words), to 96.4% correct (collect 1000 words).},
  booktitle = {Proceedings of the 14th Conference of the European Chapter of the Association for Computational Linguistics, Gothenburg, Sweden 26–30 April 2014},
  author = {Ahlberg, Malin and Forsberg, Markus and Hulden, Mans},
  year = {2014},
  ISBN = {978-1-937284-78-7},
  pages = {569--578},
}

@inProceedings{gunther-etal-2014-rtrgo-201512,
  title = {RTRGO: Enhancing the GU-MLT-LT System for Sentiment Analysis of Short Messages},
  abstract = {This paper describes the enhancements made to our GU-MLT-LT system (Günther and Furrer, 2013) for the SemEval-2014 re-run of the SemEval-2013 shared task on sentiment analysis in Twitter. The changes include the usage of a Twitter-specific tokenizer, additional features and sentiment lexica, feature weighting and random subspace learning.
The improvements result in an increase of 4.18 F-measure points on this year’s Twitter test set, ranking 3rd. },
  booktitle = {Proceedings of the 8th International Workshop on Semantic Evaluation (SemEval 2014) August 23-24, 2014 Dublin, Ireland},
  author = {Günther, Tobias and Vancoppenolle, Jean and Johansson, Richard},
  year = {2014},
  ISBN = {978-1-941643-24-2},
  pages = {497--502},
}

@inProceedings{dannells-gruzitis-2014-controlled-201944,
  title = {Controlled Natural Language Generation from a Multilingual FrameNet-based Grammar},
  abstract = {This paper presents a currently bilingual but potentially multilingual FrameNet-based grammar library implemented in Grammatical Framework. The contribution of this paper is two-fold. First, it offers a methodological approach to automatically generate the grammar based on semantico-syntactic valence patterns extracted from FrameNet-annotated corpora. Second, it provides a proof of concept for two use cases illustrating how the acquired multilingual grammar can be exploited in different CNL applications in the domains of arts and tourism.},
  booktitle = {Lecture Notes in Computer Science},
  author = {Dannélls, Dana and Gruzitis, Normunds},
  year = {2014},
  volume = {8625},
  ISBN = {978-3-319-10222-1},
  pages = {155--166},
}

@article{borin-etal-2014-introduction-202127,
  title = {Introduction: Constructions and frames meet language technology},
  journal = {Constructions and Frames},
  author = {Borin, Lars and de Melo, Gerard and Friberg Heppin, Karin and Torrent, Tiago Timponi},
  year = {2014},
  volume = {6},
  number = {1},
  pages = {1--8},
}

@incollection{ribeck-borin-2014-lexical-201965,
  title = {Lexical Bundles in Swedish Secondary School Textbooks},
  abstract = {The present paper describes the process of identifying lexical bundles, i.e., frequently recurring word sequences such as by means of and in the end of, in secondary school history and physics textbooks. In its determination of finding genuine lexical bundles, i.e.
the word boundaries between lexical bundles and surrounding arbitrary words, it proposes a new approach to come to terms with the problem of extracting overlapping bundles of different lengths. The results of the structural classification indicate that history uses more NP/PP-based and less dependent-clause-based bundles than physics. The comparative analysis manages to restrict this difference to the referential function. History almost only refers to phrases, i.e. within clauses, while physics much more tends to make references across clauses. The article also includes a report on an extension of the study, ongoing work where the automatic identification of multi-word expressions in general is in focus.},
  booktitle = {Human Language Technology Challenges for Computer Science and Linguistics 5th Language and Technology Conference, LTC 2011, Poznań, Poland, November 25--27, 2011, Revised Selected Papers},
  editor = {Vetulani, Zygmunt and Mariani, Joseph},
  author = {Ribeck, Judy Carola and Borin, Lars},
  year = {2014},
  publisher = {Springer International Publishing},
  volume = {2014},
  number = {XVI},
  address = {Cham},
  ISBN = {978-3-319-08958-4},
  pages = {238--249},
}

@inProceedings{volodina-etal-2014-flexible-201885,
  title = {A flexible language learning platform based on language resources and web services.},
  abstract = {We present Lärka, the language learning platform of Språkbanken (the Swedish Language Bank). It consists of an exercise generator which reuses resources available through Språkbanken: mainly Korp, the corpus infrastructure, and Karp, the lexical infrastructure. Through Lärka we reach new user groups – students and teachers of Linguistics as well as second language learners and their teachers – and this way bring Språkbanken's resources in a relevant format to them. Lärka can therefore be viewed as a case of a real-life language resource evaluation with end users.
In this article we describe Lärka's architecture, its user interface, and the five exercise types that have been released for users so far. The first user evaluation following in-class usage with students of linguistics, speech therapy and teacher candidates are presented. The outline of future work concludes the paper.},
  booktitle = {Proceedings of LREC 26-31 May 2014, Reykjavik, Iceland},
  author = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Lindström Tiedemann, Therese},
  year = {2014},
  ISBN = {978-2-9517408-8-4},
  pages = {3973--3978},
}

@inProceedings{borin-etal-2014-linguistic-198551,
  title = {Linguistic landscaping of South Asia using digital language resources: Genetic vs. areal linguistics},
  booktitle = {Proceedings of LREC, May 26-31, 2014, Reykjavik, Iceland},
  author = {Borin, Lars and Saxena, Anju and Rama, Taraka and Comrie, Bernard},
  year = {2014},
  ISBN = {978-2-9517408-8-4},
  pages = {3137--3144},
}

@article{forsberg-etal-2014-from-208123,
  title = {From construction candidates to constructicon entries: An experiment using semi-automatic methods for identifying constructions in corpora},
  abstract = {We present an experiment where natural language processing tools are used to automatically identify potential constructions in a corpus. The experiment was conducted as part of the ongoing efforts to develop a Swedish constructicon. Using an automatic method to suggest constructions has advantages not only for efficiency but also methodologically: it forces the analyst to look more objectively at the constructions actually occurring in corpora, as opposed to focusing on “interesting” constructions only.
As a heuristic for identifying potential constructions, the method has proved successful, yielding about 200 (out of 1,200) highly relevant construction candidates.},
  journal = {Constructions and Frames},
  author = {Forsberg, Markus and Johansson, Richard and Bäckström, Linnéa and Borin, Lars and Lyngfelt, Benjamin and Olofsson, Joel and Prentice, Julia},
  year = {2014},
  volume = {6},
  number = {1},
  pages = {114--135},
}

@edited_book{volodina-etal-2014-proceedings-206135,
  title = {Proceedings of the third workshop on NLP for computer-assisted language learning at SLTC 2014, Uppsala University},
  abstract = {The workshop series on NLP for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The papers in the proceedings volume from the third NLP4CALL workshop cover three main topic areas: resources for development of ICALL applications (e.g., learner corpora and coursebook corpora), tools and algorithms for the analysis of learner language (e.g., focusing on collocations, reading tasks, cloze items, pronunciation, spelling, level classification of learner production), and the generation of learning materials (e.g., exercise generators).},
  editor = {Volodina, Elena and Borin, Lars and Pilán, Ildikó},
  year = {2014},
  publisher = {Linköping University Press},
  address = {Linköping},
  ISBN = {978-91-7519-175-1},
}

@article{rama-borin-2014-gram-187121,
  title = {N-Gram Approaches to the Historical Dynamics of Basic Vocabulary},
  journal = {Journal of Quantitative Linguistics},
  author = {Rama, Taraka and Borin, Lars},
  year = {2014},
  volume = {21},
  number = {1},
  pages = {50--64},
}

@inProceedings{rehm-etal-2014-strategic-198556,
  title = {The strategic impact of META-NET on the regional, national and international level},
  booktitle = {Proceedings of LREC 2014, 26-31
May, Reykjavik, Iceland}, author = {Rehm, Georg and Uszkoreit, Hans and Ananiadou, Sophia and Bel, Núria and Bieleviciene, Audrone and Borin, Lars and Branco, António and Budin, Gerhard and Calzolari, Nicoletta and Daelemans, Walter and Garabík, Radovan and Grobelnik, Marko and Garcia-Mateo, Carmen and van Genabith, Josef and Hajic, Jan and Hernaez, Inma and Judge, John and Koeva, Svetla and Krek, Simon and Krstev, Cvetana and Lindén, Krister and Magnini, Bernardo and Mariani, Joseph and McNaught, John and Melero, Maite and Monachini, Monica and Moreno, Asuncion and Odijk, Jan and Ogrodniczuk, Maciej and Pezik, Piotr and Piperidis, Stelios and Przepiórkowski, Adam and Rögnvaldsson, Eiríkur and Rosner, Michael and Pedersen, Bolette Sandford and Skadina, Inguna and De Smedt, Koenraad and Tadić, Marko and Thompson, Paul and Tufiș, Dan and Váradi, Tamás and Vasiljevs, Andrejs and Vider, Kadri and Zabarskaite, Jolanta}, year = {2014}, ISBN = {978-2-9517408-8-4}, pages = {1517--1524}, } @inProceedings{borin-etal-2014-representing-204731, title = {Representing Swedish Lexical Resources in RDF with lemon}, abstract = {The paper presents an ongoing project which aims to publish Swedish lexical-semantic resources using Semantic Web and Linked Data technologies. 
In this article, we highlight the practical conversion methods and challenges of converting three of the Swedish language resources in RDF with lemon.}, booktitle = {Proceedings of the ISWC 2014 Posters \& Demonstrations Track a track within the 13th International Semantic Web Conference (ISWC 2014)}, author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and McCrae, John P.}, year = {2014}, volume = {1272}, pages = {329--332}, } @inProceedings{dannells-etal-2014-using-201951, title = {Using language technology resources and tools to construct Swedish FrameNet}, abstract = {Having access to large lexical and grammatical resources when creating a new language resource is essential for its enhancement and enrichment. This paper describes the interplay and interactive utilization of different language technology tools and resources, in particular the Swedish lexicon SALDO and Swedish Constructicon, in the creation of Swedish FrameNet. We show how integrating resources in a larger infrastructure is much more than the sum of the parts. 
}, booktitle = {Proceedings of the Workshop on Lexical and Grammatical Resources for Language Processing, Dublin Ireland, August 24, 2014}, author = {Dannélls, Dana and Friberg Heppin, Karin and Ehrlemark, Anna}, year = {2014}, ISBN = {978-1-873769-44-7}, pages = {8--17}, } @inProceedings{dannells-etal-2014-multilingual-204733, title = {A Multilingual SPARQL-Based Retrieval Interface for Cultural Heritage Objects}, booktitle = {Proceedings of the ISWC 2014 Posters \& Demonstrations Track a track within the 13th International Semantic Web Conference (ISWC 2014)}, author = {Dannélls, Dana and Enache, Ramona and Damova, Mariana}, year = {2014}, volume = {1272}, pages = {205--208}, } @article{borin-johansson-2014-kulturomik-192931, title = {Kulturomik: Att spana efter språkliga och kulturella förändringar i digitala textarkiv}, journal = {Historia i en digital värld}, author = {Borin, Lars and Johansson, Richard}, year = {2014}, } @inProceedings{adesam-etal-2014-computer-198794, title = {Computer-aided Morphology Expansion for Old Swedish}, abstract = {In this paper we describe and evaluate a tool for paradigm induction and lexicon extraction that has been applied to Old Swedish. The tool is semi-supervised and uses a small seed lexicon and unannotated corpora to derive full inflection tables for input lemmata. In the work presented here, the tool has been modified to deal with the rich spelling variation found in Old Swedish texts. 
We also present some initial experiments, which are the first steps towards creating a large-scale morphology for Old Swedish.}, booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14) May 26-31, 2014 Reykjavik, Iceland}, author = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Bouma, Gerlof and Forsberg, Markus and Hulden, Mans}, year = {2014}, ISBN = {978-2-9517408-8-4}, pages = {1102--1105}, } @article{smith-etal-2014-readability-188146, title = {Readability, suitability and comprehensibility in patient education materials for Swedish patients with colorectal cancer undergoing elective surgery: A mixed method design.}, abstract = {To characterize education materials provided to patients undergoing colorectal cancer surgery to gain a better understanding of how to design readable, suitable, comprehensible materials.}, journal = {Patient education and counseling}, author = {Smith, Frida and Carlsson, Eva and Kokkinakis, Dimitrios and Forsberg, Markus and Kodeda, Karl and Sawatzky, Richard and Friberg, Febe and Öhlén, Joakim}, year = {2014}, volume = {94}, number = {2}, pages = {202--209}, } @article{fribergheppin-toporowskagronostaj-2014-exploiting-210058, title = {Exploiting FrameNet for Swedish: Mismatch?}, abstract = {This paper presents work on developing Swedish FrameNet (SweFN) as a resource analogous to the original Berkeley-based FrameNet. We describe the theoretical and practical basics of FrameNet, and articulate some multilingual issues that arise in expanding a linguistic resource from one language to another. SweFN uses FrameNet as a starting point in order to save time and effort, and to make it compatible with other FrameNet-based resources. The lexical units are from the pivot lexicon SALDO, making SweFN compatible with other resources of the larger project SweFN++. 
It is a corpus-based resource, meant to support tasks within natural language processing relying on semantic data.}, journal = {Constructions and Frames}, author = {Friberg Heppin, Karin and Toporowska Gronostaj, Maria}, year = {2014}, volume = {6}, number = {1}, pages = {52--72}, } @inProceedings{pilan-etal-2014-rule-210940, title = {Rule-based and machine learning approaches for second language sentence-level readability}, abstract = {We present approaches for the identification of sentences understandable by second language learners of Swedish, which can be used in automatically generated exercises based on corpora. In this work we merged methods and knowledge from machine learning-based readability research, from rule-based studies of Good Dictionary Examples and from second language learning syllabuses. The proposed selection methods have also been implemented as a module in a free web-based language learning platform. Users can use different parameters and linguistic filters to personalize their sentence search with or without a machine learning component assessing readability. The sentences selected have already found practical use as multiple-choice exercise items within the same platform. Out of a number of deep linguistic indicators explored, we found mainly lexical-morphological and semantic features informative for second language sentence-level readability. We obtained a readability classification accuracy result of 71%, which approaches the performance of other models used in similar tasks. 
Furthermore, during an empirical evaluation with teachers and students, about seven out of ten sentences selected were considered understandable, the rule-based approach slightly outperforming the method incorporating the machine learning model.}, booktitle = {Proceedings of the Ninth Workshop on Innovative Use of NLP for Building Educational Applications, June 26, 2014 Baltimore, Maryland, USA}, author = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard}, year = {2014}, ISBN = {978-1-941643-03-7}, pages = {174--184}, } @article{morrison-etal-2014-likelihood-188784, title = {Likelihood ratio calculation for a disputed-utterance analysis with limited available data}, abstract = {We present a disputed-utterance analysis using relevant data, quantitative measurements and statistical models to calculate likelihood ratios. The acoustic data were taken from an actual forensic case in which the amount of data available to train the statistical models was small and the data point from the disputed word was far out on the tail of one of the modelled distributions. A procedure based on single multivariate Gaussian models for each hypothesis led to an unrealistically high likelihood ratio value with extremely poor reliability, but a procedure based on Hotelling’s T2 statistic and a procedure based on calculating a posterior predictive density produced more acceptable results. The Hotelling’s T2 procedure attempts to take account of the sampling uncertainty of the mean vectors and covariance matrices due to the small number of tokens used to train the models, and the posterior-predictive-density analysis integrates out the values of the mean vectors and covariance matrices as nuisance parameters. 
Data scarcity is common in forensic speech science and we argue that it is important not to accept extremely large calculated likelihood ratios at face value, but to consider whether such values can be supported given the size of the available data and modelling constraints.}, journal = {Speech Communication}, author = {Morrison, Geoffrey Stewart and Lindh, Jonas and Curran, James M.}, year = {2014}, volume = {58}, pages = {81--90}, } @inProceedings{hu-lindh-2014-effects-203082, title = {Effects of initial sounds on the perception of Chinese disyllable tones by Swedish students of Chinese}, abstract = {This paper extends previous research on the effects of initial sounds on perception of Chinese disyllable tones. A perception test was performed on Swedish adult students of Chinese using disyllable words (most previous studies have been made using solely monosyllable words). The main results indicate that voiced initial sounds e.g. [l] have a strong connection to the tone confusion pattern Tone 2 perceived as Tone 3. On the contrary, a voiceless aspirated initial sound e.g. [th] is mostly connected to misidentifications between Tone 3 to Tone 2. Unvoiced unaspirated initial sounds affect tone perception heavily, especially when they occur in the second syllable of a disyllabic word. }, booktitle = {2014 International Conference on Phonetic Research and Language Learning (ICPRLL) \& English Phonetic Conference in China (EPCC)}, author = {Hu, Guohua and Lindh, Jonas}, year = {2014}, } @inProceedings{kokkinakis-grahn-2014-corpus-209807, title = {A corpus-based approach to the identification of non-literal language in a medical setting.}, abstract = {Automated processing of clinical texts is commonly faced with various less exposed, and not so regularly discussed linguistically complex problems that need to be addressed. One of these issues concerns the usage of figurative language. 
Figurative language implies the use of words that go beyond their ordinary meaning, a linguistically complex and challenging problem and also a problem that causes great difficulty for the field of natural language processing (NLP). The problem is equally prevalent in both general language and also in various sublanguages, such as clinical medicine. Therefore we believe that a comprehensive model of e.g. clinical language processing needs to account for figurative language usage, and this paper provides a description, and preliminary results towards this goal. Since the empirical, clinical data used in the study is limited in size, there is no formal distinction made between different sub-classifications of figurative language. e.g., metaphors, idioms or simile. We illustrate several types of figurative expressions in the clinical discourse and apply a rather quantitative and corpus-based level analysis. The main research questions that this paper asks are whether there are traces of figurative language (or at least a subset of such types) in patient-doctor and patient-nurse interactions, how can they be found in a convenient way and whether these are transferred in the electronic health records and to what degree.}, booktitle = {Proceedings of the Conference on Communication, Medicine and Ethics (COMET), Lugano, 26-28 June 2014}, author = {Kokkinakis, Dimitrios and Grahn, Inga-Lill}, year = {2014}, pages = {1}, } @inProceedings{kokkinakis-etal-2014-semantics-209802, title = {Semantics in Storytelling in Swedish Fiction}, abstract = {In this paper, we aim to define foundations and research questions for future large scale exploration of various types of semantic relationships in literature, namely Swedish prose fiction. 
More specifically, we are interested to get an in-depth understanding of storytelling in Swedish fiction by analyzing and mining the narrative discourse in a small sample of such data, focusing on interpersonal relationships and answering various questions such as how to recognize and assess gender patterns. Our intention is to apply our findings into a much larger scale in the near future in order to obtain useful insights about the social relations, structures, behavior and everyday life of characters found in literary works, thus enhancing the use of prose fiction as a source for research within the humanities and social sciences. Our work is inspired by the notions of distant reading and macroanalysis, a relatively new and often contested paradigm of literary research. In order to achieve our goal we strive for a combination of natural language processing techniques and simple visualizations that allow the user to rapidly focus on key areas of interest and provide the ability to discover latent semantic patterns and structures. }, booktitle = {Proceedings of the Digital Access to textual Cultural Heritage (DATeCH).}, author = {Kokkinakis, Dimitrios and Malm, Mats and Bergenmar, Jenny and Ighe, Ann}, year = {2014}, ISBN = {978-1-4503-2588-2}, pages = {6}, } @inProceedings{moradi-etal-2014-graph-197533, title = {A Graph-Based Analysis of Medical Queries of a Swedish Health Care Portal}, abstract = {Today web portals play an increasingly important role in health care allowing information seekers to learn about diseases and treatments, and to administrate their care. Therefore, it is important that the portals are able to support this process as well as possible. 
In this paper, we study the search logs of a public Swedish health portal to address the questions if health information seeking differs from other types of Internet search and if there is a potential for utilizing network analysis methods in combination with semantic annotation to gain insights into search behaviors. Using a semantic-based method and a graph-based analysis of word cooccurrences in queries, we show there is an overlap among the results indicating a potential role of these types of methods to gain insights and facilitate improved information search. In addition we show that samples, windows of a month, of search logs may be sufficient to obtain similar results as using larger windows. We also show that medical queries share the same structural properties found for other types of information searches, thereby indicating an ability to reuse existing analysis methods for this type of search data.}, booktitle = {The Fifth International Workshop on Health Text Mining and Information Analysis (Louhi)}, author = {Moradi, Farnaz and Eklund, Ann-Marie and Kokkinakis, Dimitrios and Olovsson, Tomas and Tsigas, Philippas}, year = {2014}, ISBN = {978-1-937284-90-9}, pages = {2--10}, } @inProceedings{grahn-kokkinakis-2014-legitimating-216142, title = {Legitimating the visit - a recurrent challenge among patients with medically unexplained symptoms}, abstract = {The doctor’s evaluation of presented symptoms as doctorable, is a legitimation of the patient’s decision to seek medical care. It is also a confirmation of the rational, and even the moral, status of the patient, since consulting a doctor without good reasons is considered irrational. The analysis focuses on how patients take initiatives to present problems and on the doctors’ responses and evaluations regarding the doctorability. Situations where participants seem to have different views of the doctorability of the problems are examined in relation to conversational practices and social actions. 
The analyses shows that the doctor as well as the patient orients to the potential doctorability of the problems and to the moral challenges related to it, but that their different expectations and roles lead to communicatively unclear situations. Further analyses will illustrate in what ways the MUS-patients’ recurrent challenge of legitimating their visits could be influenced by the interaction, and hence in what ways conscious conversational practices from the care givers might facilitate these situations.}, booktitle = {Conference on Communication, Medicine and Ethics (COMET), Lugano, 26-28 June 2014}, author = {Grahn, Inga-Lill and Kokkinakis, Dimitrios}, year = {2014}, } @inProceedings{kageback-etal-2014-extractive-210878, title = {Extractive Summarization using Continuous Vector Space Models}, abstract = {Automatic summarization can help users extract the most important pieces of information from the vast amount of text digitized into electronic form everyday. Central to automatic summarization is the notion of similarity between sentences in text. In this paper we propose the use of continuous vector representations for semantically aware representations of sentences as a basis for measuring similarity. We evaluate different compositions for sentence representation on a standard dataset using the ROUGE evaluation measures. 
Our experiments show that the evaluated methods improve the performance of a state-of-the-art summarization framework and strongly indicate the benefits of continuous word vector representations for automatic summarization.}, booktitle = {Proceedings of the 2nd Workshop on Continuous Vector Space Models and their Compositionality (CVSC) EACL, April 26-30, 2014 Gothenburg, Sweden}, author = {Kågebäck, Mikael and Mogren, Olof and Tahmasebi, Nina and Dubhashi, Devdatt}, year = {2014}, ISBN = {978-1-937284-94-7}, pages = {31--39}, } @inProceedings{pilan-volodina-2014-reusing-200967, title = {Reusing Swedish FrameNet for training semantic roles}, abstract = {In this article we present the first experiences of reusing the Swedish FrameNet (SweFN) as a resource for training semantic roles. We give an account of the procedure we used to adapt SweFN to the needs of students of Linguistics in the form of an automatically generated exercise. During this adaptation, the mapping of the fine-grained distinction of roles from SweFN into learner-friendlier coarse-grained roles presented a major challenge. Besides discussing the details of this mapping, we describe the resulting multiple-choice exercise and its graphical user interface. The exercise was made available through Lärka, an online platform for students of Linguistics and learners of Swedish as a second language. We outline also aspects underlying the selection of the incorrect answer options which include semantic as well as frequency-based criteria. Finally, we present our own observations and initial user feedback about the applicability of such a resource in the pedagogical domain. Students' answers indicated an overall positive experience, the majority found the exercise useful for learning semantic roles. 
}, booktitle = {Proceedings of LREC 2014, May 26-31, 2014, Reykjavik, Iceland}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2014}, ISBN = { 978-2-9517408-8-4}, pages = {1359--1363}, } @inProceedings{volodina-etal-2014-what-206132, title = {You get what you annotate: a pedagogically annotated corpus of coursebooks for Swedish as a Second Language.}, abstract = {We present the COCTAILL corpus, containing over 700.000 tokens of Swedish texts from 12 coursebooks aimed at second/foreign language (L2) learning. Each text in the corpus is labelled with a proficiency level according to the CEFR proficiency scale. Genres, topics, associated activities, vocabulary lists and other types of information are annotated in the coursebooks to facilitate Second Language Acquisition (SLA)-aware studies and experiments aimed at Intelligent Computer-Assisted Language Learning (ICALL). Linguistic annotation in the form of parts-of-speech (POS; e.g. nouns, verbs), base forms (lemmas) and syntactic relations (e.g. subject, object) has been also added to the corpus. In the article we describe our annotation scheme and the editor we have developed for the content mark-up of the coursebooks, including the taxonomy of pedagogical activities and linguistic skills. Inter-annotator agreement has been computed and reported on a subset of the corpus. Surprisingly, we have not found any other examples of pedagogically marked-up corpora based on L2 coursebooks to draw on existing experiences. Hence, our work may be viewed as “groping in the darkness” and eventually a starting point for others. The paper also presents our first quantitative exploration of the corpus where we focus on textually and pedagogically annotated features of the coursebooks to exemplify what types of studies can be performed using the presented annotation scheme. We explore trends shown in use of topics and genres over proficiency levels and compare pedagogical focus of exercises across levels. 
The final section of the paper summarises the potential this corpus holds for research within SLA and various ICALL tasks. }, booktitle = {NEALT Proceedings Series}, author = {Volodina, Elena and Pilán, Ildikó and Rødven-Eide, Stian and Heidarsson, Hannes}, year = {2014}, volume = {22}, ISBN = {978-91-7519-175-1}, pages = {128--144}, } @article{kilgariff-etal-2014-corpus-188541, title = {Corpus-Based Vocabulary lists for Language Learners for Nine Languages.}, abstract = {We present the KELLY project and its work on developing monolingual and bilingual word lists for language learning, using corpus methods, for nine languages and thirty-six language pairs. We describe the method and discuss the many challenges encountered. We have loaded the data into an online database to make it accessible for anyone to explore and we present our own first explorations of it. The focus of the paper is thus twofold, covering pedagogical and methodological aspects of the lists’ construction, and linguistic aspects of the by-product of the project, the KELLY database. }, journal = {Language resources and evaluation}, author = {Kilgarriff, Adam and Charalabopoulou, Frieda and Gavrilidou, Maria and Bondi Johannessen, Janne and Khalil, Saussan and Johansson Kokkinakis, Sofie and Lew, Robert and Sharoff, Serge and Vadlapudi, R. and Volodina, Elena}, year = {2014}, volume = {48}, number = {1}, pages = {121--163}, } @inProceedings{lindh-akesson-2014-effect-218075, title = {Effect of the Double-Filtering effect on Automatic Voice Comparison}, abstract = {In forensic casework today it is not uncommon to receive material recorded with mobile phones or other handheld recording devices. From experience we know most people do not treat recordings with as much care as a person well versed in audio technology. Especially given the varying circumstances under which the material can be recorded. 
Thus it is important we learn more about what sort of acoustic effects take place under particular conditions and how these effects can influence Automatic Voice Comparison (AVC). The current study aims at evaluating the effects of recording material consisting of what could be described as ‘doublefiltered’ sound, henceforth referred to as DF, e.g. when a phone call is recorded using a handheld recorder placed in the vicinity of the mobile device. This filtering effect constitutes sound transmitted via GSM communication (1st filter) which then passes an indeterminable distance through the air before being captured by another recording device, such as a mobile phone or handheld recorder’s microphone (2nd filter). This effect affects the energy in the signal. The energy decreases in both the low and the high frequencies, while the middle frequencies are boosted. In this study we have used a database consisting of 150 female speakers of Swedish, all students of speech and language pathology. The recordings were made in a sound treated recording booth using a setup of one computer equipped with an internal MAudio soundcard and a high quality headset microphone. Each recording consists of solicited spontaneous speech together with read speech material (Swedish standard reading passage called ‘Ett svårt fall’). Each speaker is informed and encouraged to finish the task at their own pace. Mean duration of the full recording among the speakers was 69.3 seconds (std 16 seconds).}, booktitle = {Proceedings of IAFPA 2014. International Association for Forensic Phonetics and Acoustics Annual Conference 31 August - 3 September 2014}, author = {Lindh, Jonas and Åkesson, Joel}, year = {2014}, pages = {2}, } @inProceedings{kokkinakis-etal-2014-vocation-209808, title = {Vocation Identification in Swedish Fiction. }, abstract = {This paper presents a system for automatic annotation of vocational signals in 19th century Swedish prose fiction. 
Besides vocation identification, the system assigns gender (male, female, unknown) to the vocation words. Since gender is a prominent attribute of first names, we apply a named-entity recognizer (NER) that uses first name gazetteers where each name has been pre-assigned gender, which aids gender assignment to vocations with unknown gender if appropriate context is available. We also use a statistical modelling method, conditional random fields (CRF), for learning gender-assigned vocations in combination with the results of the NER and other pattern matching techniques. The purpose of this work is to develop and apply tools to literature as means to expand our understanding of history in the area of literature-based gender studies, e.g. investigate how women enter literature, which functions do they assume and their working patterns. Vocation identification can be used as one such indicator for achieving some these goals.}, booktitle = {Proceedings of the Fifth Swedish Language Technology Conference (SLTC)}, author = {Kokkinakis, Dimitrios and Ighe, Ann and Malm, Mats}, year = {2014}, pages = {3}, } @inProceedings{forsberg-etal-2015-forensic-222113, title = {A forensic and sociophonetic perspective on a new corpus of young urban Swedish}, booktitle = {10th UK Language Variation and Change (UKLVC) conference 1-3/9 2015, York, UK}, author = {Forsberg, Julia and Gross, Johan and Lindh, Jonas and Åkesson, Joel}, year = {2015}, } @edited_book{volodina-etal-2015-proceedings-226574, title = {Proceedings of the 4th workshop on NLP for computer assisted language learning at Nodalida 2015, Vilnius, 11th May, 2015}, editor = {Volodina, Elena and Borin, Lars and Pilán, Ildikó}, year = {2015}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-7519-036-5}, } @incollection{rama-borin-2015-comparative-197484, title = {Comparative evaluation of string similarity measures for automatic language classification.}, booktitle = {Sequences in Language and 
Text}, author = {Rama, Taraka and Borin, Lars}, year = {2015}, publisher = {De Gruyter Mouton}, ISBN = {978-3-11-036287-9}, } @inProceedings{kokkinakis-malm-2015-detecting-225762, title = {Detecting Reuse of Biblical Quotes in Swedish 19th Century Fiction using Sequence Alignment}, abstract = {Text reuse, a form of text repetition, recycling or borrowing, is a theoretically and practically interesting problem that has attracted considerable attention during the last years e.g. in the cultural heritage context (historical and comparative linguistics); in the context of social network propagation of ideas and in the measuring of journalistic reuse. In this paper we briefly outline and experiment with a method used for biological sequence alignment that have been also used in humanities research for e.g. the detection of similar passages in the complete works of Voltaire and 18th century French encyclopedias or for tracing how and which ideas spread in 19th century US-newspaper collections. We use available software (text-PAIR: Pairwise Alignment for Intertextual Relations) and experiment with the Charles XII Bible translation into Swedish, completed in 1703, against the content of the Swedish prose fiction 1800-1900, in order to automatically detect passages taken from this particular Bible translation in the prose fiction corpus.}, booktitle = {Corpus-based Research in the Humanities workshop (CRH), 10 December 2015 Warsaw, Poland }, author = {Kokkinakis, Dimitrios and Malm, Mats}, year = {2015}, ISBN = {978-83-63159-19-1}, pages = {79--86}, } @inProceedings{volodina-pijetlovic-2015-lark-226543, title = {Lark Trills for Language Drills: Text-to-speech technology for language learners.}, abstract = {This paper reports on the development and the initial evaluation of a dictation&spelling prototype exercise for second language (L2) learners of Swedish based on text-to-speech (TTS) technology. 
Implemented on an already existing Intelligent Computer-Assisted Language Learning (ICALL) platform, the exercise has not only served as a test case for TTS in L2 environment, but has also shown a potential to train listening and orthographic skills, as well as has become a way of collecting learner-specific spelling errors into a database. Exercise generation re-uses well-annotated corpora, lexical resources, and text-to-speech technology with an accompanying talking head. }, booktitle = {Proceedings of the Tenth Workshop on Innovative Use of NLP for Building Educational Applications, June 4, 2015, Denver, Colorado, USA}, author = {Volodina, Elena and Pijetlovic, Dijana}, year = {2015}, ISBN = {978-1-941643-35-8}, pages = {107--117}, } @inProceedings{fribergheppin-dannells-2015-polysemy-218276, title = {Polysemy and questions of lumping or splitting in the construction of Swedish FrameNet}, abstract = {When working on a lexical resource, such as Swedish FrameNet (SweFN), assumptions based on linguistic theories are made, and methodological directions based upon them are taken. These directions often need to be revised when not beforehand foreseen problems arise. One assumption that was made already in the early development stages of SweFN was that each lexical entry from the reference lexicon, SALDO, would evoke only one semantic frame in SweFN. If a lexical entry evoked more than one frame, it entailed more than one sense and therefore required a new entry in the lexicon. As work progressed, this inclination towards splitting, in the perpetual lumpers and splitters discussion proved to be progressively untenable. 
This paper will give an account of the problems which were encountered and suggestions for solutions on polysemy issues forcing a discussion on lumping or splitting.}, booktitle = {Proceedings of the Workshop on Semantic resources and Semantic Annotation for Natural Language Processing and the Digital Humanities at NODALIDA 2015, Vilnius, 11th May, 2015}, author = {Friberg Heppin, Karin and Dannélls, Dana}, year = {2015}, pages = {12--20}, } @misc{andersen-etal-2015-sibirientyska-215757, title = {Sibirientyska kvinnor (Siberian German women)}, abstract = {Siberian German women The corpus consists of dialogs between four women born in 1927 to 1937 in the Soviet Volga Republic. Their mother tongue is a German variety spoken in Russia since the second half of the 18th century. Since the end of the Second World War, the women have lived in the region of Krasnoyarsk. They talk about their backgrounds and their everyday lives in the village. The corpus consists of about 16 000 words. Russian words and hybrids are given in [brackets], the turns of the interviewers are in {brackets}; all verb forms have got the attribute FINIT or INFINIT. More information on the research project see Syntax in contact. }, author = {Andersen, Christiane and Forsberg, Markus and Hammarstedt, Martin and Pankow, Alexander}, year = {2015}, publisher = {University of Gothenburg}, address = {Göteborg}, } @inProceedings{bergenmar-olsson-2015-tracing-228773, title = {Tracing Cultural Transfer Through Multiple Translation Analysis. The Case of the Swedish 19th-Century Bourgeois Novel in German and Czech}, abstract = {In the last decades, Comparative Literature has become more directed towards questions of transculturality. This renders translations of literary texts an important role as a vehicle not just for the transfer of text and language, but also of ideas and cultures. 
Digital methods for comparing multiple translations within and across languages might prove to be important for exploring how, for example, a Swedish 19th century bourgeois novel is reframed in Czech translations. The chosen example is A Merchant House (1859) by Emilie Flygare–Carlén (1807–1892) who was one of the most popular authors in Czech speaking regions in the late 19th Century. In this paper existing collation tools are used for comparing two different Czech translations (1872 and 1910), by two different translators. This might both reveal how the gender, context and position of the translator colours the literary text and how the translations are adapted to changing literary trends. Furthermore, parallel text alignment is tried as a method for comparing across languages, since the Czech translation is made from a German translation. Are the Czech translations subject to “foreignization” or “domestication”? Or do they retain the same traits as the German translation, which is the source of the first Czech translation? Does the systematic comparison of multiple translations contribute to the understanding of how texts move from certain gendered cultural contexts and ideologies to others? }, booktitle = {Digital Literary Studies. International Conference May 14-15 2015, Coimbra, Portugal}, author = {Bergenmar, Jenny and Olsson, Leif-Jöran}, year = {2015}, } @inProceedings{borin-etal-2015-here-217351, title = {Here be dragons? The perils and promises of inter-resource lexical-semantic mapping}, abstract = {Lexical-semantic knowledges sources are a stock item in the language technologist’s toolbox, having proved their practical worth in many and diverse natural language processing (NLP) applications. In linguistics, lexical semantics comes in many flavors, but in the NLP world, wordnets reign more or less supreme. 
There has been some promising work utilizing Roget-style thesauruses instead, but wider experimentation is hampered by the limited availability of such resources. The work presented here is a first step in the direction of creating a freely available Roget-style lexical resource for modern Swedish. Here, we explore methods for automatic disambiguation of interresource mappings with the longer-term goal of utilizing similar techniques for automatic enrichment of lexical-semantic resources.}, booktitle = {Linköping Electronic Conference Proceedings. Semantic resources and semantic annotation for Natural Language Processing and the Digital Humanities. Workshop at NODALIDA , May 11, 13-18 2015, Vilnius}, author = {Borin, Lars and Nieto Piña, Luis and Johansson, Richard}, year = {2015}, volume = {112}, ISBN = {978-91-7519-049-5}, pages = {1--11}, } @inProceedings{kokkinakis-etal-2015-gender-215535, title = {Gender-Based Vocation Identification in Swedish 19th Century Prose Fiction using Linguistic Patterns, NER and CRF Learning}, abstract = {This paper investigates how literature could be used as a means to expand our understanding of history. By applying macroanalytic techniques we are aiming to investigate how women enter literature and particularly which functions they assume, their working patterns and if we can spot differences in how often male and female characters are mentioned with various types of occupational titles (vocation) in Swedish literary texts. Modern historiography, and especially feminist and women’s history has emphasized a relative invisibility of women’s work and women workers. The reasons behind this are manifold, and the extent, the margin of error in terms of women’s work activities is of course hard to assess. Therefore, vocation identification can be used as an indicator for such exploration and we present a hybrid system for automatic annotation of vocational signals in 19th century Swedish prose fiction. 
Beside vocations, the system also assigns gender (male, female or unknown) to the vocation words, a prerequisite for the goals of the study and future in-depth explorations of the corpora.}, booktitle = {Proceedings of the Fourth Workshop on Computational Linguistics for Literature (Clfl). Co-located with the NAACL/HLT. Denver, Colorado, USA}, author = {Kokkinakis, Dimitrios and Ighe, Ann and Malm, Mats}, year = {2015}, pages = {9}, } @inProceedings{forsberg-etal-2015-speaker-220340, title = {Speaker comparison evaluation using a new corpus of urban speech}, booktitle = {24th Annual Conference of the International Association for Forensic Phonetics and Acoustics, 8-10/7 2015, Leiden}, author = {Forsberg, Julia and Gross, Johan and Lindh, Jonas and Åkesson, Joel}, year = {2015}, pages = {46--47}, } @inProceedings{lindh-2015-forensic-222517, title = {Forensic speaker comparison evaluations}, booktitle = {Proceedings of Roundtable in Forensic Linguistics 2015, September 4th- 6th, Mainz, Germany}, author = {Lindh, Jonas}, year = {2015}, } @inProceedings{ghanimifard-johansson-2015-enriching-222749, title = {Enriching Word-sense Embeddings with Translational Context}, abstract = {Vector-space models derived from corpora are an effective way to learn a representation of word meaning directly from data, and these models have many uses in practical applications. A number of unsupervised approaches have been proposed to automatically learn representations of word senses directly from corpora, but since these methods use no information but the words themselves, they sometimes miss distinctions that could be possible to make if more information were available. In this paper, we present a general framework that we call context enrichment that incorporates external information during the training of multi-sense vector-space models. 
Our approach is agnostic as to which external signal is used to enrich the context, but in this work we consider the use of translations as the source of enrichment. We evaluated the models trained using the translation-enriched context using several similarity benchmarks and a word analogy test set. In all our evaluations, the enriched model outperformed the purely word-based baseline soundly. }, booktitle = {Proceedings of Recent Advances in Natural Language Processing. International Conference, Hissar, Bulgaria, 7–9 September 2015}, editor = {Angelova, Galia and Bontcheva, Kalina and Mitkov, Ruslan}, author = {Ghanimifard, Mehdi and Johansson, Richard}, year = {2015}, pages = {208--215}, } @inProceedings{adesam-etal-2015-multiwords-228833, title = {Multiwords, Word Senses and Multiword Senses in the Eukalyptus Treebank of Written Swedish}, abstract = {Multiwords reside at the intersection of the lexicon and syntax and in an annotation project, they will affect both levels. In the Eukalyptus treebank of written Swedish, we treat multiwords formally as syntactic objects, which are assigned a lexical type and sense. With the help of a simple dichotomy, analyzed vs unanalyzed multiwords, and the expressiveness of the syntactic annotation formalism employed, we are able to flexibly handle most multiword types and usages.}, booktitle = {Proceedings of the Fourteenth International Workshop on Treebanks and Linguistic Theories (TLT14), 11–12 December 2015 Warsaw, Poland}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2015}, ISBN = {978-83-63159-18-4}, pages = {3--12}, } @inProceedings{gruzitis-etal-2015-formalising-220419, title = {Formalising the Swedish Constructicon in Grammatical Framework}, abstract = {This paper presents a semi-automatic approach to acquire a computational construction grammar from the semi-formal Swedish Constructicon. 
The implementation is based on the resource grammar library provided by Grammatical Framework and can be seen as an extension to the existing Swedish resource grammar. An important consequence of this work is that it generates feedback, explicit and implicit, on how to improve the annotation consistency and adequacy of the original construction resource. }, booktitle = {Proceedings of the Grammar Engineering Across Frameworks (GEAF) Workshop, 53rd Annual Meeting of the ACL and 7th IJCNLP, Beijing, China, July 26-31, 2015}, author = {Gruzitis, Normunds and Dannélls, Dana and Lyngfelt, Benjamin and Ranta, Aarne}, year = {2015}, ISBN = {978-1-932432-66-4}, pages = {49--56}, } @inProceedings{adesam-etal-2015-defining-217815, title = {Defining the Eukalyptus forest – the Koala treebank of Swedish}, abstract = {This paper details the design of the lexical and syntactic layers of a new annotated corpus of Swedish contemporary texts. In order to make the corpus adaptable into a variety of representations, the annotation is of a hybrid type with head-marked constituents and function-labeled edges, and with a rich annotation of non-local dependencies. The source material has been taken from public sources, to allow the resulting corpus to be made freely available.}, booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, NODALIDA 2015, May 11-13, 2015, Vilnius, Lithuania. Edited by Beáta Megyesi}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2015}, ISBN = {978-91-7519-098-3}, pages = {1--9}, } @article{tahmasebi-etal-2015-visions-212969, title = {Visions and open challenges for a knowledge-based culturomics}, abstract = {The concept of culturomics was born out of the availability of massive amounts of textual data and the interest to make sense of cultural and language phenomena over time. Thus far however, culturomics has only made use of, and shown the great potential of, statistical methods. 
In this paper, we present a vision for a knowledge-based culturomics that complements traditional culturomics. We discuss the possibilities and challenges of combining knowledge-based methods with statistical methods and address major challenges that arise due to the nature of the data; diversity of sources, changes in language over time as well as temporal dynamics of information in general. We address all layers needed for knowledge-based culturomics, from natural language processing and relations to summaries and opinions.}, journal = {International Journal on Digital Libraries}, author = {Tahmasebi, Nina and Borin, Lars and Capannini, Gabriele and Dubhashi, Devdatt and Exner, Peter and Forsberg, Markus and Gossen, Gerhard and Johansson, Fredrik and Johansson, Richard and Kågebäck, Mikael and Mogren, Olof and Nugues, Pierre and Risse, Thomas}, year = {2015}, volume = {15}, number = {2-4}, pages = {169--187}, } @inProceedings{ahlberg-etal-2015-case-217988, title = {A case study on supervised classification of Swedish pseudo-coordination}, abstract = {We present a case study on supervised classification of Swedish pseudo-coordination (SPC). The classification is attempted on the type-level with data collected from two data sets: a blog corpus and a fiction corpus. Two small experiments were designed to evaluate the feasability of this task. The first experiment explored a classifier’s ability to discriminate pseudo-coordinations from ordinary verb coordinations, given a small labeled data set created during the experiment. 
The second experiment evaluated how well the classifier performed at detecting and ranking SPCs in a set of unlabeled verb coordinations, to investigate if it could be used as a semi-automatic discovery procedure to find new SPCs.}, booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, NODALIDA 2015, May 11-13, 2015, Vilnius, Lithuania}, author = {Ahlberg, Malin and Andersson, Peter and Forsberg, Markus and Tahmasebi, Nina}, year = {2015}, publisher = {Linköping University Electronic Press}, address = {Linköpings universitet}, ISBN = {978-91-7519-098-3}, } @techreport{barnett-etal-2015-state-234687, title = {State Chart XML (SCXML): State Machine Notation for Control Abstraction}, abstract = {This document describes SCXML, or the "State Chart extensible Markup Language". SCXML provides a generic state-machine based execution environment based on CCXML and Harel State Tables.}, author = {Barnett, Jim and Akolkar, Rahul and Auburn, RJ and Bodell, Michael and Burnett, Daniel C. and Carter, Jerry and McGlashan, Scott and Lager, Torbjörn and Helbing, Mark and Hosn, Rafah and Raman, T.V. and Reifenrath, Klaus and Rosenthal, No'am and Roxendal, Johan}, year = {2015}, publisher = {World Wide Web Consortium}, address = {Massachusetts, USA}, } @inProceedings{ahlberg-etal-2015-paradigm-217987, title = {Paradigm classification in supervised learning of morphology}, abstract = {Supervised morphological paradigm learning by identifying and aligning the longest common subsequence found in inflection tables has recently been proposed as a simple yet competitive way to induce morphological patterns. We combine this non-probabilistic strategy of inflection table generalization with a discriminative classifier to permit the reconstruction of complete inflection tables of unseen words. 
Our system learns morphological paradigms from labeled examples of inflection patterns (inflection tables) and then produces inflection tables from unseen lemmas or base forms. We evaluate the approach on datasets covering 11 different languages and show that this approach results in consistently higher accuracies vis-a-vis other methods on the same task, thus indicating that the general method is a viable approach to quickly creating high-accuracy morphological resources.}, booktitle = {Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, author = {Ahlberg, Malin and Forsberg, Markus and Huldén, Måns}, year = {2015}, } @article{holzmann-etal-2015-named-209780, title = {Named entity evolution recognition on the Blogosphere}, abstract = {Advancements in technology and culture lead to changes in our language. These changes create a gap between the language known by users and the language stored in digital archives. It affects user’s possibility to firstly find content and secondly interpret that content. In a previous work, we introduced our approach for named entity evolution recognition (NEER) in newspaper collections. Lately, increasing efforts in Web preservation have led to increased availability of Web archives covering longer time spans. However, language on the Web is more dynamic than in traditional media and many of the basic assumptions from the newspaper domain do not hold for Web data. In this paper we discuss the limitations of existing methodology for NEER. We approach these by adapting an existing NEER method to work on noisy data like the Web and the Blogosphere in particular. We develop novel filters that reduce the noise and make use of Semantic Web resources to obtain more information about terms. 
Our evaluation shows the potentials of the proposed approach.}, journal = {International Journal on Digital Libraries}, author = {Holzmann, Helge and Tahmasebi, Nina and Risse, Thomas}, year = {2015}, volume = {15}, number = {2-4}, pages = {209--235}, } @inProceedings{johansson-nietopina-2015-embedding-217863, title = {Embedding a Semantic Network in a Word Space}, abstract = {We present a framework for using continuous-space vector representations of word meaning to derive new vectors representing the meaning of senses listed in a semantic network. It is a post-processing approach that can be applied to several types of word vector representations. It uses two ideas: first, that vectors for polysemous words can be decomposed into a convex combination of sense vectors; secondly, that the vector for a sense is kept similar to those of its neighbors in the network. This leads to a constrained optimization problem, and we present an approximation for the case when the distance function is the squared Euclidean. We applied this algorithm on a Swedish semantic network, and we evaluate the quality of the resulting sense representations extrinsically by showing that they give large improvements when used in a classifier that creates lexical units for FrameNet frames. }, booktitle = {Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Denver, United States, May 31 – June 5, 2015}, author = {Johansson, Richard and Nieto Piña, Luis}, year = {2015}, ISBN = {978-1-941643-49-5}, pages = {1428--1433}, } @inProceedings{johansson-nietopina-2015-combining-216865, title = {Combining Relational and Distributional Knowledge for Word Sense Disambiguation}, abstract = {We present a new approach to word sense disambiguation derived from recent ideas in distributional semantics. 
The input to the algorithm is a large unlabeled corpus and a graph describing how senses are related; no sense-annotated corpus is needed. The fundamental idea is to embed meaning representations of senses in the same continuous-valued vector space as the representations of words. In this way, the knowledge encoded in the lexical resource is combined with the information derived by the distributional methods. Once this step has been carried out, the sense representations can be plugged back into e.g. the skip-gram model, which allows us to compute scores for the different possible senses of a word in a given context. We evaluated the new word sense disambiguation system on two Swedish test sets annotated with senses defined by the SALDO lexical resource. In both evaluations, our system soundly outperformed random and first-sense baselines. Its accuracy was slightly above that of a well-known graph-based system, while being computationally much more efficient.}, booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, May 12-13, Vilnius, Lithuania. Linköping Electronic Conference Proceedings 109, Linköping University Electronic Press.}, author = {Johansson, Richard and Nieto Piña, Luis}, year = {2015}, ISBN = {978-91-7519-098-3}, pages = {69--78}, } @inProceedings{nietopina-johansson-2015-simple-222611, title = {A Simple and Efficient Method to Generate Word Sense Representations}, abstract = {Distributed representations of words have boosted the performance of many Natural Language Processing tasks. However, usually only one representation per word is obtained, not acknowledging the fact that some words have multiple meanings. This has a negative effect on the individual word representations and the language model as a whole. In this paper we present a simple model that enables recent techniques for building word vectors to represent distinct senses of polysemic words. 
In our assessment of this model we show that it is able to effectively discriminate between words’ senses and to do so in a computationally efficient manner.}, booktitle = {Proceedings of International Conference in Recent Advances in Natural Language Processing, Hissar, Bulgaria, 7–9 September 2015}, editor = {Angelova, Galia and Bontcheva, Kalina and Mitkov, Ruslan}, author = {Nieto Piña, Luis and Johansson, Richard}, year = {2015}, pages = {465--472}, } @inProceedings{aristodemou-etal-2015-acoustics-239890, title = {The Acoustics of Cypriot Greek Fricatives}, booktitle = {Proceedings of the 6th ISEL Conference on Experimental Linguistics ExLing 2015 26 - 27 June 2015 Athens, Greece Edited by Antonis Botinis }, author = {Aristodemou, Andrie and Savva, Angelandria and Themistocleous, Charalambos}, year = {2015}, publisher = {University of Athens}, address = {Athens}, pages = {9--12}, } @inProceedings{kageback-etal-2015-neural-217864, title = {Neural context embeddings for automatic discovery of word senses}, abstract = {Word sense induction (WSI) is the problem of automatically building an inventory of senses for a set of target words using only a text corpus. We introduce a new method for embedding word instances and their context, for use in WSI. The method, Instance-context embedding (ICE), leverages neural word embeddings, and the correlation statistics they capture, to compute high quality embeddings of word contexts. In WSI, these context embeddings are clustered to find the word senses present in the text. ICE is based on a novel method for combining word embeddings using continuous Skip-gram, based on both semantic and a temporal aspects of context words. ICE is evaluated both in a new system, and in an extension to a previous system for WSI. In both cases, we surpass previous state-of-the-art, on the WSI task of SemEval-2013, which highlights the generality of ICE. 
Our proposed system achieves a 33% relative improvement.}, booktitle = {Proceedings of the 1st Workshop on Vector Space Modeling for Natural Language Processing. Denver, United States}, author = {Kågebäck, Mikael and Johansson, Fredrik and Johansson, Richard and Dubhashi, Devdatt}, year = {2015}, pages = {25--32}, } @misc{wilhelmsson-2015-autentiska-249227, title = {Autentiska och artificiella frågor till svensk text Automatisk frågegenerering jämfört med användares frågor för informationsåtkomst}, abstract = {Informationssökning mot ostrukturerade datakällor som fri text är ett av de områden där användargränssnitt med fri formulering i naturligt språk har tagits fram. I ett sådant, eventuellt AI-betonat, system kan några grundläggande svårigheter från användarperspektivet märkas. En sådan svårighet är att en användare inte känner till huruvida en fråga som hon avser att ställa egentligen kan besvaras av den aktuella texten. Denna svårighet, tillsammans med andra, som de kraftiga variationsmöjligheterna för formen för ett giltigt svar på en ställd fråga, riskerar att leda till att användarintrycken av systemtypen blir negativa. De moment som behöver ingå i ett sådant frågebaserat informationssystems funktionssätt måste på något sätt inbegripa en mappning av frågeled i frågan (t.ex. när) till den form och grammatisk funktion som svaret i texten måste ha (för frågan när normalt ett tidsadverbial). Bland annat denna iakttagelse inbjuder till användning av automatisk frågegenerering (question generation, QG). Frågegenerering innebär att frågor som en naturlig text besvarar initialt utvinns av ett program som samlar in dem i explicit form. Tanken för användning i informationssökning är att en användare i gränssnittet enbart ska kunna ställa just dessa frågor, vilka faktiskt besvaras av texten. 
Denna studie gäller just de frågor som ett automatiskt frågegenereringssystem för svenska kan, och genom vidare utveckling, skulle kunna generera för godtycklig digital svensk text. Även om mängden automatiskt genererade frågor och frågeformuleringar kan bli mycket stor, utrymmesmässigt många gånger större än ursprungstexten, så är det tydligt att den beskrivna metoden för frågegenerering för svenska inte kan och troligen inte heller kommer att kunna förmås att skapa alla de frågor och frågeformuleringar som en vanlig användare skulle anse att en viss text besvarar. Men hur väl fungerar då automatiskt genererade frågor i detta sammanhang? Denna uppsats kretsar kring en användarundersökning där undersökningsdeltagare har ombetts att formulera frågor som texter besvarar, och som anses vara relevanta frågor. Den resulterande samlingen frågor undersöktes och kategoriserades. Resultatet av undersökningens huvudfråga visar att bara 20-25 % av användarnas frågeformuleringar skulle kunna genereras direkt automatiskt med aktuell ansats – utan vissa informationstekniska förbättringar. Uppsatsen föreslår viss ny terminologi för detta outforskade område, bl.a. för att skilja mellan de olika grader av processkrav som generering av olika frågeslag från text kräver.}, author = {Wilhelmsson, Kenneth}, year = {2015}, address = {Göteborgs universitet, Inst för tillämpad IT}, } @inProceedings{themistocleous-muller-2015-intonation-232414, title = {The intonation of Albanian polar questions and statements}, abstract = {This studyaims to provide an account of the effects of sentence type (statements vs. 
polar questions) on Standard Albanian prenuclear rises through a polynomial model representing the dynamic characteristics of tonal contours. Results show that the main difference in contour shape between Albanian statements and polar questions is located in the shape of the prenuclear rise, and this difference was significant; onset timing of the prenuclear rise, however, did not differ significantly between the two types of sentence.}, booktitle = {6th International Conference of Experimental Linguistics. ExLing 2015, 26-27 June 2015, Athens, Greece / Edited by Antonis Botinis}, author = {Themistocleous, Charalambos and Müller, Daniela}, year = {2015}, publisher = {University of Athens}, address = {Athens}, ISBN = {978-960-466-160-2}, } @inProceedings{lindh-2015-forensic-222514, title = {Forensic speaker comparison using machine and mind}, booktitle = {24th Annual Conference of the International Association for Forensic Phonetics and Acoustics, 8 - 10 July 2015, Leiden, Netherlands}, author = {Lindh, Jonas}, year = {2015}, } @inProceedings{pilan-2015-helping-227313, title = {Helping Swedish words come to their senses: word-sense disambiguation based on sense associations from the SALDO lexicon}, abstract = {This paper describes a knowledge-based approach to word-sense disambiguation using a lexical-semantic resource, SALDO. This hierarchically organized lexicon defining senses in terms of other related senses has not been previously explored for this purpose. The proposed method is based on maximizing the overlap between associated word senses of nouns and verbs co-occurring within a sentence. The results of a small-scale experiment using this method are also reported. Overall, the approach proved more efficient for nouns, since not only was the accuracy score higher for this category (56%) than for verbs (46%), but for nouns in 22% more of the cases was a sense overlap found. 
As a result of an in-depth analysis of the predictions, we identified a number of ways the system could be modified or extended for an improved performance.}, booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics (NODALIDA 2015). May 11–13, 2015, Vilnius, Lithuania}, editor = {Beáta Megyesi}, author = {Pilán, Ildikó}, year = {2015}, number = {109}, ISBN = {9789175190983}, pages = {275--279}, } @inProceedings{adouane-johansson-2016-gulf-242243, title = {Gulf Arabic Resource Building for Sentiment Analysis}, abstract = {This paper deals with building linguistic resources for Gulf Arabic, one of the Arabic variations, for sentiment analysis task using machine learning. To our knowledge, no previous works were done for Gulf Arabic sentiment analysis despite the fact that it is present in different online platforms. Hence, the first challenge is the absence of annotated data and sentiment lexicons. To fill this gap, we created these two main linguistic resources. Then we conducted different experiments: use Naive Bayes classifier without any lexicon; add a sentiment lexicon designed basically for MSA; use only the compiled Gulf Arabic sentiment lexicon and finally use both MSA and Gulf Arabic sentiment lexicons. The Gulf Arabic lexicon gives a good improvement of the classifier accuracy (90.54 %) over a baseline that does not use the lexicon (82.81%), while the MSA lexicon causes the accuracy to drop to (76.83%). Moreover, mixing MSA and Gulf Arabic lexicons causes the accuracy to drop to (84.94%) compared to using only Gulf Arabic lexicon. 
This indicates that it is useless to use MSA resources to deal with Gulf Arabic due to the considerable differences and conflicting structures between these two languages.}, booktitle = {Proceedings of the Language Resources and Evaluation Conference (LREC), 23-28 May 2016, Portorož, Slovenia}, author = {Adouane, Wafia and Johansson, Richard}, year = {2016}, publisher = {European Language Resources Association}, ISBN = {978-2-9517408-9-1}, } @inProceedings{alfter-bizzoni-2016-hybrid-246348, title = {Hybrid Language Segmentation for Historical Documents}, booktitle = {Proceedings CLiC-it 2016 and EVALITA 2016, Napoli, Italy, December 5-7, 2016. Edited by : Pierpaolo Basile, Anna Corazza, Franco Cutugno, Simonetta Montemagni, Malvina Nissim, Viviana Patti, Giovanni Semeraro, Rachele Sprugnoli}, author = {Alfter, David and Bizzoni, Yuri}, year = {2016}, } @inProceedings{kelly-etal-2016-identifying-242814, title = {Identifying Perceptually Similar Voices with a Speaker Recognition System Using Auto-Phonetic Features}, booktitle = {17th Annual Conference of the International-Speech-Communication-Association (Interspeech 2016). San Francisco, CA, USA. 8-12 september 2016.}, author = {Kelly, Finnian and Alexander, Anil and Forth, Oscar and Kent, Samuel and Lindh, Jonas and Åkesson, Joel}, year = {2016}, pages = {1567--1568}, } @article{sundqvist-etal-2016-syllable-227628, title = {Syllable Repetition vs. Finger Tapping: Aspects of Motor Timing in 100 Healthy Adults.}, abstract = {In this study we systematically compared syllable repetition and finger tapping in healthy adults, and explored possible impacts of tempi, metronome, musical experience, and age on motor timing ability. One hundred healthy adults used finger-tapping and syllable repetition to perform an isochronous pulse in three different tempi, with and without a metronome. 
Results showed that the motor timing was more accurate with finger tapping than with syllable repetition in the slowest tempo, and the motor timing ability was better with the metronome than without. Persons with musical experience showed better motor timing accuracy than persons without such experience, and the timing asynchrony increased with increasing age. The slowest tempo 90 bpm posed extra challenges to the participants. We speculate that this pattern reflects the fact that the slow tempo lies outside the 3-8 Hz syllable rate of natural speech, which in turn has been linked to theta-based oscillations in the brain.}, journal = {Motor control}, author = {Sundqvist, Maria and Åsberg Johnels, Jakob and Lindh, Jonas and Laakso, Katja and Hartelius, Lena}, year = {2016}, volume = {20}, number = {3}, pages = {233--254}, } @inProceedings{pilan-volodina-2016-classification-248099, title = {Classification of Language Proficiency Levels in Swedish Learners' Texts}, abstract = {We evaluate a system for the automatic classification of texts written by learners of Swedish as a second language into levels of language proficiency. Since the amount of available annotated learner essay data for our target language is rather small, we explore also the potentials of domain adaptation for this task. The additional domain consists of coursebook texts written by experts for learners. 
We find that already with a smaller amount of in-domain Swedish learner essay data it is possible to obtain results that compare well to state-of-the-art systems for other languages, with domain adaptation methods yielding a slight improvement.}, booktitle = {The Sixth Swedish Language Technology Conference (SLTC), Umeå University, 17-18 November, 2016}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2016}, } @article{themistocleous-2016-bursts-243451, title = {The bursts of stops can convey dialectal information}, abstract = {This study investigates the effects of the dialect of the speaker on the spectral properties of stop bursts. Forty-five female speakers—20 Standard Modern Greek and 25 Cypriot Greek speakers—participated in this study. The spectral properties of stop bursts were calculated from the burst spectra and analyzed using spectral moments. The findings show that besides linguistic information, i.e., the place of articulation and the stress, the speech signals of bursts can encode social information, i.e., the dialects. A classification model using decision trees showed that skewness and standard deviation have a major contribution for the classification of bursts across dialects.}, journal = {Journal of the Acoustical Society of America}, author = {Themistocleous, Charalambos}, year = {2016}, volume = {140}, number = {4}, pages = {EL334--EL339}, } @article{themistocleous-logotheti-2016-standard-239899, title = {Standard Modern Greek and Cypriot Greek vowels: a sociophonetic study}, abstract = {This study is a comparative analysis of Standard Modern Greek (SMG) and Cypriot Greek (CG) vowels. Specifically, the study examines the effects of vowel (/e i a o u/), language variety (SMG vs CG), and stress (stressed vs unstressed vowels) on vowel formants F1 and F2, vowel duration, and fundamental frequency (f0). 45 female speakers were recorded: 20 SMG speakers and 25 CG speakers from Athens and Nicosia respectively. 
The results showed significant effects of vowel, stress, and language variety on formants, duration and f0. The study confirms the findings of earlier studies on SMG vowels, provides the first report on CG vowels’ acoustic structure, and constitutes the first comparative sociophonetic research on SMG and CG vowels. }, journal = {Proceedings of the international conference on Modern Greek dialects and Linguistic Theory, Patras, 25-28 September 2014}, author = {Themistocleous, Charalambos and Logotheti, Angeliki}, year = {2016}, volume = {6}, number = {1}, pages = {178--184}, } @inProceedings{pilan-2016-detecting-243715, title = {Detecting Context Dependence in Exercise Item Candidates Selected from Corpora}, abstract = {We explore the factors influencing the dependence of single sentences on their larger textual context in order to automatically identify candidate sentences for language learning exercises from corpora which are presentable in isolation. An in-depth investigation of this question has not been previously carried out. Understanding this aspect can contribute to a more efficient selection of candidate sentences which, besides reducing the time required for item writing, can also ensure a higher degree of variability and authenticity. We present a set of relevant aspects collected based on the qualitative analysis of a smaller set of context-dependent corpus example sentences. Furthermore, we implemented a rule-based algorithm using these criteria which achieved an average precision of 0.76 for the identification of different issues related to context dependence. 
The method has also been evaluated empirically where 80% of the sentences in which our system did not detect context-dependent elements were also considered context-independent by human raters.}, booktitle = {Proceedings of the 11th Workshop on Innovative Use of NLP for Building Educational Applications, June 12 to June 17, 2016, San Diego, USA}, author = {Pilán, Ildikó}, year = {2016}, } @book{rosen-2016-theory-231969, title = {Theory Exploration and Inductive Theorem Proving}, abstract = {We have built two state-of-the-art inductive theorem provers named HipSpec and Hipster. The main issue when automating proofs by induction is to discover essential helper lemmas. Our theorem provers use the technique theory exploration, which is a method to systematically discover interesting conclusions about a mathematical theory. We use the existing theory exploration system QuickSpec which conjectures properties for a program that seem to hold based on testing. The idea is to try to prove these explored conjectures together with the user-stated goal conjecture. By using this idea and connecting it with our previous work on Hip, the Haskell Inductive Prover, we were able to take new leaps in field of inductive theorem proving. Additionally, we have developed a benchmark suite named TIP, short for Tons of Inductive Problems, with benchmark problems for inductive theorem provers, and a tool box for converting and manipulating problems expressed in the TIP format. There were two main reasons to this initiative. Firstly, the inductive theorem proving field lacked a shared benchmark suite as well as a format. Secondly, the benchmarks that have been used were outdated: all contemporary provers would solve almost every problem. We have so far added hundreds of new challenges to the TIP suite to encourage further research. 
}, author = {Rosén, Dan}, year = {2016}, publisher = {Chalmers University of Technology}, address = {Göteborg}, } @inProceedings{nietopina-johansson-2016-benchmarking-251412, title = {Benchmarking Word Sense Disambiguation Systems for Swedish}, abstract = {We compare several word sense disambiguation systems for Swedish and evaluate them on seven different sense-annotated corpora. Our results show that unsupervised systems beat a random baseline, but generally do not outperform a first-sense baseline considerably. On a lexical-sample dataset that allows us to train a supervised system, the unsupervised disambiguators are strongly outperformed by the supervised one.}, booktitle = {The Sixth Swedish Language Technology Conference (SLTC), Umeå University, 17-18 November, 2016}, author = {Nieto Piña, Luis and Johansson, Richard}, year = {2016}, } @inProceedings{themistocleous-etal-2016-effects-239893, title = {Effects of stress on fricatives: Evidence from Standard Modern Greek}, abstract = {This study investigates the effects of stress on the spectral properties of fricative noise in Standard Modern Greek (SMG). Twenty female speakers of SMG participated in the study. Fricatives were produced in stressed and unstressed positions in two vowel place positions: back and front vowels. Acoustic measurements were taken and the temporal and spectral properties of fricatives using spectral moments were calculated. Stressed fricatives are produced with increased duration, center of gravity, standard deviation, and normalized intensity. The machine learning and classification algorithm C5.0 has been employed to estimate the contribution of the temporal and spectral parameters for the classification of fricatives. Overall, duration and center of gravity contribute the most to the classification of stressed vs. 
unstressed fricatives.}, booktitle = {17th Annual Conference of the International Speech Communication Association, Interspeech 2016, 8-12 Sep 2016, San Francisco, USA}, author = {Themistocleous, Charalambos and Savva, Angelandria and Aristodemou, Andrie}, year = {2016}, ISBN = {978-1-5108-3313-5}, } @inProceedings{kokkinakis-etal-2016-specifications-243183, title = {Specifications and Methodology for Language-Related Data Acquisition and Analysis in the Domain of Dementia Diagnostics}, abstract = {This paper outlines the initial stages of a project that aims to build and use a corpus with data samples acquired from people diagnosed with subjective or mild cognitive impairment and healthy, age-matched controls. The data we are currently collecting consists of audio-recorded spoken language samples; transcripts of the audio recordings and eye tracking measurements. From these data we plan to extract, evaluate and model features to be used for learning classification models in order to test how well a differentiation between the aforementioned subject groups can be made. Features will be also correlated with outcomes from e.g. other language-related scores, such as word fluency, in order to investigate whether there are relationships between various variables.}, booktitle = {The Sixth Swedish Language Technology Conference (SLTC), Umeå University, 17-18 November, 2016}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Björkner, Eva and Nordlund, Arto}, year = {2016}, } @article{themistocleous-2016-seeking-239901, title = {Seeking an Anchorage. Stability and Variability in Tonal Alignment of Rising Prenuclear Pitch Accents in Cypriot Greek}, abstract = {Although tonal alignment constitutes a quintessential property of pitch accents, its exact characteristics remain unclear. 
This study, by exploring the timing of the Cypriot Greek L*+H prenuclear pitch accent, examines the predictions of three hypotheses about tonal alignment: the invariance hypothesis, the segmental anchoring hypothesis, and the segmental anchorage hypothesis. The study reports on two experiments: the first of which manipulates the syllable patterns of the stressed syllable, and the second of which modifies the distance of the L*+H from the following pitch accent. The findings on the alignment of the low tone (L) are illustrative of the segmental anchoring hypothesis predictions: the L persistently aligns inside the onset consonant, a few milliseconds before the stressed vowel. However, the findings on the alignment of the high tone (H) are both intriguing and unexpected: the alignment of the H depends on the number of unstressed syllables that follow the prenuclear pitch accent. The ‘wandering’ of the H over multiple syllables is extremely rare among languages, and casts doubt on the invariance hypothesis and the segmental anchoring hypothesis, as well as indicating the need for a modified version of the segmental anchorage hypothesis. 
To address the alignment of the H, we suggest that it aligns within a segmental anchorage–the area that follows the prenuclear pitch accent–in such a way as to protect the paradigmatic contrast between the L*+H prenuclear pitch accent and the L+H* nuclear pitch accent.}, journal = {Language and Speech}, author = {Themistocleous, Charalambos}, year = {2016}, volume = {59}, number = {4}, pages = {433--461}, } @inProceedings{volodina-pilan-2016-svalex-248116, title = {SVALex: en andraspråksordlista graderad enligt CEFR nivåer.}, booktitle = {Svenskans Beskrivning 35, Göteborg 2016}, author = {Volodina, Elena and Pilán, Ildikó}, year = {2016}, } @inProceedings{francois-etal-2016-svalex-248142, title = {SVALex: a CEFR-graded lexical resource for Swedish foreign and second language learners.}, abstract = {The paper introduces SVALex, a lexical resource primarily aimed at learners and teachers of Swedish as a foreign and second language that describes the distribution of 15,681 words and expressions across the Common European Framework of Reference (CEFR). The resource is based on a corpus of coursebook texts, and thus describes receptive vocabulary learners are exposed to during reading activities, as opposed to productive vocabulary they use when speaking or writing. The paper describes the methodology applied to create the list and to estimate the frequency distribution. It also discusses some characteristics of the resulting resource and compares it to other lexical resources for Swedish. An interesting feature of this resource is the possibility to separate the wheat from the chaff, identifying the core vocabulary at each level, i.e. 
vocabulary shared by several coursebook writers at each level, from peripheral vocabulary which is used by the minority of the coursebook writers.}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23-28, 2016 Portorož, Slovenia}, author = {François, Thomas and Volodina, Elena and Pilán, Ildikó and Tack, Anaïs}, year = {2016}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {978-2-9517408-9-1}, } @inProceedings{pilan-etal-2016-coursebook-246349, title = {Coursebook texts as a helping hand for classifying linguistic complexity in language learners' writings}, abstract = {We bring together knowledge from two different types of language learning data, texts learners read and texts they write, to improve linguistic complexity classification in the latter. Linguistic complexity in the foreign and second language learning context can be expressed in terms of proficiency levels. We show that incorporating features capturing lexical complexity information from reading passages can boost significantly the machine learning based classification of learner-written texts into proficiency levels. With an F1 score of .8 our system rivals state-of-the-art results reported for other languages for this task. Finally, we present a freely available web-based tool for proficiency level classification and lexical complexity visualization for both learner writings and reading texts. 
}, booktitle = {Proceedings of the workshop on Computational Linguistics for Linguistic Complexity}, author = {Pilán, Ildikó and Alfter, David and Volodina, Elena}, year = {2016}, ISBN = {978-4-87974-709-9}, } @inProceedings{volodina-etal-2016-swell-248141, title = {SweLL on the rise: Swedish Learner Language corpus for European Reference Level studies.}, abstract = {We present a new resource for Swedish, SweLL, a corpus of Swedish Learner essays linked to learners’ performance according to the Common European Framework of Reference (CEFR). SweLL consists of three subcorpora – SpIn, SW1203 and Tisus, collected from three different educational establishments. The common metadata for all subcorpora includes age, gender, native languages, time of residence in Sweden, type of written task. Depending on the subcorpus, learner texts may contain additional information, such as text genres, topics, grades. Five of the six CEFR levels are represented in the corpus: A1, A2, B1, B2 and C1 comprising in total 339 essays. C2 level is not included since courses at C2 level are not offered. The work flow consists of collection of essays and permits, essay digitization and registration, meta-data annotation, automatic linguistic annotation. Inter-rater agreement is presented on the basis of SW1203 subcorpus. The work on SweLL is still ongoing with more than 100 essays waiting in the pipeline. 
This article both describes the resource and the “how-to” behind the compilation of SweLL.}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016), May 23-28, 2016, Portorož, Slovenia}, author = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Llozhi, Lorena and Lundkvist, Peter and Sundberg, Gunlög and Sandell, Monica}, year = {2016}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {978-2-9517408-9-1}, } @inProceedings{tahmasebi-etal-2016-clarin-233899, title = {SWE-CLARIN – the Swedish CLARIN project – aims and activities}, booktitle = {Digital Humanities in the Nordic countries, Oslo, March 15-17 2016}, author = {Tahmasebi, Nina and Borin, Lars and Jordan, Caspar and Ekman, Stefan}, year = {2016}, pages = {122--123}, } @inProceedings{adouane-etal-2016-arabicized-252492, title = {Arabicized and Romanized Berber Automatic Identification}, abstract = {We present an automatic language identification tool for both Arabicized Berber (Berber written in the Arabic script) and Romanized Berber (Berber written in the Latin script). The focus is on short texts (social media content). We use supervised machine learning method with character and word-based n-gram models as features. We also describe the corpora used in this paper. For both Arabicized and Romanized Berber, character-based 5-grams score the best giving an F-score of 99.50%.}, booktitle = {Proceedings of TICAM 2016}, author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard}, year = {2016}, publisher = {IRCAM}, address = {Morocco}, } @inProceedings{adouane-etal-2016-romanized-246849, title = {Romanized Berber and Romanized Arabic Automatic Language Identification Using Machine Learning}, abstract = {The identification of the language of text/speech input is the first step to be able to properly do any language-dependent natural language processing. 
The task is called Automatic Language Identification (ALI). Being a well-studied field since early 1960’s, various methods have been applied to many standard languages. The ALI standard methods require datasets for training and use character/word-based n-gram models. However, social media and new technologies have contributed to the rise of informal and minority languages on the Web. The state-of-the-art automatic language identifiers fail to properly identify many of them. Romanized Arabic (RA) and Romanized Berber (RB) are cases of these informal languages which are under-resourced. The goal of this paper is twofold: detect RA and RB, at a document level, as separate languages and distinguish between them as they coexist in North Africa. We consider the task as a classification problem and use supervised machine learning to solve it. For both languages, character-based 5-grams combined with additional lexicons score the best, F-score of 99.75% and 97.77% for RB and RA respectively.}, booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects, December 12, 2016, Osaka, Japan}, author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard}, year = {2016}, publisher = {Association for Computational Linguistics}, pages = {53--61}, } @inProceedings{alfter-2016-learning-241664, title = {Learning the Learner: User Modeling in Intelligent Computer Assisted Language Learning Systems}, booktitle = {CEUR Workshop Proceedings, v.1618. UMAP 2016 Extended Proceedings. Halifax, Canada, July 13-16, 2016. Edited by : Federica Cena, Michel Desmarais, Darina Dicheva, Jie Zhang}, author = {Alfter, David}, year = {2016}, } @inProceedings{pilan-etal-2016-predicting-247240, title = {Predicting proficiency levels in learner writings by transferring a linguistic complexity model from expert-written coursebooks}, abstract = {The lack of a sufficient amount of data tailored for a task is a well-recognized problem for many statistical NLP methods. 
In this paper, we explore whether data sparsity can be successfully tackled when classifying language proficiency levels in the domain of learner-written output texts. We aim at overcoming data sparsity by incorporating knowledge in the trained model from another domain consisting of input texts written by teaching professionals for learners. We compare different domain adaptation techniques and find that a weighted combination of the two types of data performs best, which can even rival systems based on considerably larger amounts of in-domain data. Moreover, we show that normalizing errors in learners’ texts can substantially improve classification when in-domain data with annotated proficiency levels is not available.}, booktitle = {Proceedings of the 26th International Conference on Computational Linguistics (COLING), December 13-16, 2016, Osaka}, author = {Pilán, Ildikó and Volodina, Elena and Zesch, Torsten}, year = {2016}, ISBN = {978-4-87974-702-0}, } @inProceedings{adesam-bouma-2016-swedish-251827, title = {Old Swedish Part-of-Speech Tagging between Variation and External Knowledge}, booktitle = {Proceedings of the 10th SIGHUM Workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities, Berlin, Germany, August 11, 2016}, author = {Adesam, Yvonne and Bouma, Gerlof}, year = {2016}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA }, ISBN = {978-1-945626-09-8}, } @inProceedings{daudaravicius-etal-2016-report-248143, title = {A report on the Automatic Evaluation of Scientific Writing Shared Task.}, abstract = {The Automated Evaluation of Scientific Writing, or AESW, is the task of identifying sentences in need of correction to ensure their appropriateness in a scientific prose. The data set comes from a professional editing company, VTeX, with two aligned versions of the same text – before and after editing – and covers a variety of textual infelicities that proofreaders have edited. 
While previous shared tasks focused solely on grammatical errors (Dale and Kilgarriff, 2011; Dale et al., 2012; Ng et al., 2013; Ng et al., 2014), this time edits cover other types of linguistic misfits as well, including those that almost certainly could be interpreted as style issues and similar “matters of opinion”. The latter arise because of different language editing traditions, experience, and the absence of uniform agreement on what “good” scientific language should look like. Initiating this task, we expected the participating teams to help identify the characteristics of “good” scientific language, and help create a consensus of which language improvements are acceptable (or necessary). Six participating teams took on the challenge.}, booktitle = {Workshop on Innovative Use of NLP for Building Educational Applications, June 16, 2016, San Diego, CA, USA}, author = {Daudaravicius, Vidas and Banchs, Rafael E. and Volodina, Elena and Napoles, Courtney}, year = {2016}, ISBN = {978-1-941643-83-9}, } @inProceedings{ehrlemark-etal-2016-retrieving-242241, title = {Retrieving Occurrences of Grammatical Constructions}, abstract = {Finding authentic examples of grammatical constructions is central in constructionist approaches to linguistics, language processing, and second language learning. In this paper, we address this problem as an information retrieval (IR) task. To facilitate research in this area, we built a benchmark collection by annotating the occurrences of six constructions in a Swedish corpus. Furthermore, we implemented a simple and flexible retrieval system for finding construction occurrences, in which the user specifies a ranking function using lexical-semantic similarities (lexicon-based or distributional). The system was evaluated using standard IR metrics on the new benchmark, and we saw that lexical-semantical rerankers improve significantly over a purely surface-oriented system, but must be carefully tailored for each individual construction. 
}, booktitle = {Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics : Technical Papers, December 11–17; Osaka, Japan}, author = {Ehrlemark, Anna and Johansson, Richard and Lyngfelt, Benjamin}, year = {2016}, ISBN = {978-4-87974-702-0}, } @techreport{wilhelmsson-2016-huvudansatser-247442, title = {Huvudansatser för parsningsmetoder. Om programutvecklingens förutsättningar i en svensk kontext}, abstract = {Syftet med denna text var att ge en inblick i området (syntaktisk) parsning. Tanken var att ge en bild av utvecklingen som var 1) fri från alltför tekniska detaljer, då området är programmeringstekniskt, och 2) beskriven ur ett svenskt perspektiv. Bakgrunden till valet av ämne till texten, som var tänkt att finnas med i antologin Text och kontext, var att parsning är relativt okänt för många personer verksamma inom närliggande områden, samtidigt som det är ett absolut nyckelbegrepp för den som ägnar sig åt datorlingvistik eller språkteknologi. Målet var alltså att ge en ganska allmän utifrånblick på några centrala sidor av utvecklingen, samtidigt som det tydligt är så att den som själv arbetat med utveckling kan ha starka åsikter och preferenser rörande metodval, något som i ärlighetens namn kanske inte heller denna text är lösgjord från. Hur ska det göras? Konsten att utveckla automatisk syntaxanalys av naturlig text kan läras ut från ett flertal perspektiv. Det kan t.ex. ske med fokus på användandet av en viss grammatikformalism, med fokus på beräkningssnabbhet, med fokus på entydiggörande av möjliga ambiguiteter. Tolkningsval kan göras med hjälp av antingen handskrivna regler eller inhämtad statistik. En sorts huvudtema i denna text är hur metoder för parsning på senare år uppvisar förändringar som kanske kan förklaras med att programmen har fått andra användningsområden och att metoderna har anpassats därefter (en annan tolkning är att flera senare system inte längre gör parsning i strikt mening). 
När detta tänkta ”kapitel” var färdigt fick det kommentaren att det inte var anpassat för antologins målgrupp. Det fick skrivas en annan kapiteltext, men det kom samtidigt ett förslag att publicera texten om parsning här som denna rapport.}, author = {Wilhelmsson, Kenneth}, year = {2016}, publisher = {Göteborgs universitet}, address = {Göteborg}, } @inProceedings{forsberg-hulden-2016-deriving-237061, title = {Deriving Morphological Analyzers from Example Inflections}, abstract = {This paper presents a semi-automatic method to derive morphological analyzers from a limited number of example inflections suitable for languages with alphabetic writing systems. The system we present learns the inflectional behavior of morphological paradigms from examples and converts the learned paradigms into a finite-state transducer that is able to map inflected forms of previously unseen words into lemmas and corresponding morphosyntactic descriptions. We evaluate the system when provided with inflection tables for several languages collected from the Wiktionary.}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC-2016) May 23-28, 2016, Portorož, Slovenia}, author = {Forsberg, Markus and Hulden, Mans}, year = {2016}, ISBN = {978-2-9517408-9-1}, } @inProceedings{adouane-etal-2016-romanized-255457, title = {Romanized Arabic and Berber Detection Using Prediction by Partial Matching and Dictionary Methods}, abstract = {Arabic is one of the Semitic languages written in Arabic script in its standard form. However, the recent rise of social media and new technologies has contributed considerably to the emergence of a new form of Arabic, namely Arabic written in Latin scripts, often called Romanized Arabic or Arabizi. While Romanized Arabic is an informal language, Berber or Tamazight uses Latin script in its standard form with some orthography differences depending on the country it is used in. 
Both these languages are under-resourced and unknown to the state-of-the-art language identifiers. In this paper, we present a language automatic identifier for both Romanized Arabic and Romanized Berber. We also describe the built linguistic resources (large dataset and lexicons) including a wide range of Arabic dialects (Algerian, Egyptian, Gulf, Iraqi, Levantine, Moroccan and Tunisian dialects) as well as the most popular Berber varieties (Kabyle, Tashelhit, Tarifit, Tachawit and Tamzabit). We use the Prediction by Partial Matching (PPM) and dictionary-based methods. The methods reach a macro-average F-Measure of 98.74% and 97.60% respectively.}, booktitle = {2016 IEEE/ACS 13th International Conference of Computer Systems and Applications (AICCSA)}, author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard}, year = {2016}, ISBN = {978-1-5090-4320-0}, } @techreport{borin-etal-2016-free-233768, title = {A free cloud service for OCR / En fri molntjänst för OCR}, author = {Borin, Lars and Bouma, Gerlof and Dannélls, Dana}, year = {2016}, publisher = {University of Gothenburg}, address = {Göteborg}, } @inProceedings{gruzitis-etal-2016-grammatical-233921, title = {Grammatical Framework for implementing multilingual frames and constructions}, booktitle = {Book of Abstracts. The 9th International Conference on Construction Grammar (ICCG9) theme session on Computational Semantics with Frames and Constructions. October 05-09, 2016, Juiz de Fora, Brazil}, author = {Gruzitis, Normunds and Dannélls, Dana and Ranta, Aarne and Tyers, Francis M.}, year = {2016}, } @inProceedings{nietopina-johansson-2016-embedding-241139, title = {Embedding Senses for Efficient Graph-based Word Sense Disambiguation}, abstract = {We propose a simple graph-based method for word sense disambiguation (WSD) where sense and context embeddings are constructed by applying the Skip-gram method to random walks over the sense graph. 
We used this method to build a WSD system for Swedish using the SALDO lexicon, and evaluated it on six different annotated test sets. In all cases, our system was several orders of magnitude faster than a state-of-the-art PageRank-based system, while outperforming a random baseline soundly.}, booktitle = { Proceedings of TextGraphs-10: the Workshop on Graph-based Methods for Natural Language Processing}, author = {Nieto Piña, Luis and Johansson, Richard}, year = {2016}, publisher = {Association for Computational Linguistics}, } @inProceedings{ahlberg-etal-2016-sprakbankens-246063, title = {Språkbanken’s Open Lexical Infrastructure}, abstract = {Karp is an open lexical infrastructure and a web based tool for searching, exploring and developing lexical resources. Språkbanken currently hosts a number of lexicons in Karp and on-going work aims at broadening the type of resources that can be developed in the system. This abstract gives a short overview of Karp's basic functionality, and describes some current projects and on-going work.}, booktitle = {SLTC 2016. The Sixth Swedish Language Technology Conference. 
Umeå University, 17-18 November, 2016}, author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Olsson, Olof and Schumacher, Anne and Uppström, Jonatan}, year = {2016}, } @incollection{borin-2016-lexikografi-246607, title = {Lexikografi för maskiner och lexikografi för människor}, booktitle = {Framtidens lexikografi: Rapport från ett symposium i Göteborg 5 oktober 2012}, author = {Borin, Lars}, year = {2016}, publisher = {Meijerbergs institut vid Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-01-1}, pages = {9--27}, } @inProceedings{bouma-adesam-2016-multiword-251825, title = {Multiword Annotation in the Eukalyptus Treebank of Written Swedish}, booktitle = {PARSEME, 6th general meeting, 7-8 April 2016, Struga, FYR Macedonia }, author = {Bouma, Gerlof and Adesam, Yvonne}, year = {2016}, } @inProceedings{ahlberg-etal-2016-karp-246072, title = {Karp: Språkbanken’s Open Lexical Infrastructure}, booktitle = {Globalex 2016, May 24, Portorož, Slovenia}, author = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Olsson, Olof and Schumacher, Anne and Uppström, Jonatan}, year = {2016}, } @inProceedings{johansson-etal-2016-multi-233140, title = {A Multi-domain Corpus of Swedish Word Sense Annotation}, abstract = {We describe the word sense annotation layer in Eukalyptus, a freely available five-domain corpus of contemporary Swedish with several annotation layers. The annotation uses the SALDO lexicon to define the sense inventory, and allows word sense annotation of compound segments and multiword units. We give an overview of the new annotation tool developed for this project, and finally present an analysis of the inter-annotator agreement between two annotators. 
}, booktitle = {10th edition of the Language Resources and Evaluation Conference, 23-28 May 2016, Portorož (Slovenia)}, author = {Johansson, Richard and Adesam, Yvonne and Bouma, Gerlof and Hedberg, Karin}, year = {2016}, publisher = {European Language Resources Association}, ISBN = {978-2-9517408-9-1}, } @inProceedings{kelly-etal-2016-automatically-242810, title = {Automatically identifying perceptually similar voices for voice parades}, booktitle = {Proceedings of IAFPA25. 25th Annual Conference of the International Association for Forensic Phonetics and Acoustics. York, UK 24th – 27th July 2016}, author = {Kelly, Finnian and Alexander, Anil and Forth, Oscar and Kent, Samuel and Lindh, Jonas and Åkesson, Joel}, year = {2016}, pages = {25--26}, } @article{adesam-etal-2016-sprakteknologi-237884, title = {Språkteknologi för svenska språket genom tiderna}, abstract = {Språkbanken, the Swedish Language Bank, is a language technology research unit at the Department of Swedish, University of Gothenburg. We develop language resources – such as corpora, lexical resources, and analytical tools – for all variants of Swedish, from Old Swedish laws to present-day social media. Historical texts offer exciting theoretical and methodological challenges for language technology because they often defy the assumption inherent in most automatic analysis tools that the texts contain a standardized written language. In this article, we describe our ongoing work on the development of annotated historical corpora, as well as our efforts on linking various resources (both corpora and lexical resources). 
This research advances the state of the art of language technology as well as enables new research for scholars in other disciplines.}, journal = {Kungliga Skytteanska Samfundets Handlingar}, author = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Borin, Lars and Bouma, Gerlof and Forsberg, Markus}, year = {2016}, volume = {76}, number = {Studier i svensk språkhistoria 13}, pages = {65--87}, } @inProceedings{adouane-etal-2016-automatic-246765, title = {Automatic Detection of Arabicized Berber and Arabic Varieties}, abstract = {Automatic Language Identification (ALI) is the detection of the natural language of an input text by a machine. It is the first necessary step to do any language-dependent natural language processing task. Various methods have been successfully applied to a wide range of languages, and the state-of-the-art automatic language identifiers are mainly based on character n-gram models trained on huge corpora. However, there are many languages which are not yet automatically processed, for instance minority and informal languages. Many of these languages are only spoken and do not exist in a written format. Social media platforms and new technologies have facilitated the emergence of written format for these spoken languages based on pronunciation. The latter are not well represented on the Web, commonly referred to as under-resourced languages, and the current available ALI tools fail to properly recognize them. In this paper, we revisit the problem of ALI with the focus on Arabicized Berber and dialectal Arabic short texts. We introduce new resources and evaluate the existing methods. 
The results show that machine learning models combined with lexicons are well suited for detecting Arabicized Berber and different Arabic varieties and distinguishing between them, giving a macro-average F-score of 92.94%.}, booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects; December 12; Osaka, Japan}, author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard and Bobicev, Victoria}, year = {2016}, pages = {63--72}, } @inProceedings{bouma-adesam-2016-part-254389, title = {Part-of-speech and Morphology Tagging Old Swedish}, booktitle = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016}, author = {Bouma, Gerlof and Adesam, Yvonne}, year = {2016}, } @inProceedings{borin-etal-2016-towards-253952, title = {Towards a Big Data View on South Asian Linguistic Diversity}, abstract = {South Asia with its rich and diverse linguistic tapestry of hundreds of languages, including many from four major language families, and a long history of intensive language contact, provides rich empirical data for studies of linguistic genealogy, linguistic typology, and language contact. South Asia is often referred to as a linguistic area, a region where, due to close contact and widespread multilingualism, languages have influenced one another to the extent that both related and unrelated languages are more similar on many linguistic levels than we would expect. However, with some rare exceptions, most studies are largely impressionistic, drawing examples from a few languages. In this paper we present our ongoing work aiming at turning the linguistic material available in Grierson’s Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia.
In addition to this, we aim to contribute to the methodological development of large-scale comparative linguistics drawing on digital language resources, by exploring NLP techniques for extracting linguistic information from free-text language descriptions of the kind found in the LSI.}, booktitle = {WILDRE-3 – 3rd Workshop on Indian Language Data: Resources and Evaluation}, author = {Borin, Lars and Virk, Shafqat and Saxena, Anju}, year = {2016}, publisher = {ELRA}, address = {Paris}, } @inProceedings{forsberg-hulden-2016-learning-240208, title = {Learning Transducer Models for Morphological Analysis from Example Inflections}, abstract = {In this paper, we present a method to convert morphological inflection tables into unweighted and weighted finite transducers that perform parsing and generation. These transducers model the inflectional behavior of morphological paradigms induced from examples and can map inflected forms of previously unseen word forms into their lemmas and give morphosyntactic descriptions of them. The system is evaluated on several languages with data collected from the Wiktionary.}, booktitle = {Proceedings of the SIGFSM Workshop on Statistical NLP and Weighted Automata. Association for Computational Linguistics. August 12, 2016 Berlin, Germany}, author = {Forsberg, Markus and Hulden, Mans}, year = {2016}, publisher = {ACL}, address = {Stroudsburg, PA, USA}, ISBN = {978-1-945626-13-5}, pages = {42--50}, } @inProceedings{kokkinakis-etal-2016-data-243069, title = {Data Resource Acquisition from People at Various Stages of Cognitive Decline – Design and Exploration Considerations}, abstract = {In this paper we are introducing work in progress towards the development of an infrastructure (i.e., design, methodology, creation and description) of linguistic and extra-linguistic data samples acquired from people diagnosed with subjective or mild cognitive impairment and healthy, age-matched controls.
The data we are currently collecting consists of various types of modalities; i.e. audio-recorded spoken language samples; transcripts of the audio recordings (text) and eye tracking measurements. The integration of the extra-linguistic information with the linguistic phenotypes and measurements elicited from audio and text, will be used to extract, evaluate and model features to be used in machine learning experiments. In these experiments, classification models that will be trained, that will be able to learn from the whole or a subset of the data to make predictions on new data in order to test how well a differentiation between the aforementioned groups can be made. Features will be also correlated with measured outcomes from e.g. language-related scores, such as word fluency, in order to investigate whether there are relationships between various variables.}, booktitle = {The Seventh International Workshop on Health Text Mining and Information Analysis (Louhi). November 5, 2016, Austin, Texas, USA}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Nordlund, Arto}, year = {2016}, } @article{pilan-etal-2016-readable-226565, title = {A readable read: Automatic Assessment of Language Learning Materials based on Linguistic Complexity.}, abstract = {Corpora and web texts can become a rich language learning resource if we have a means of assessing whether they are linguistically appropriate for learners at a given proficiency level. In this paper, we aim at addressing this issue by presenting the first approach for predicting linguistic complexity for Swedish second language learning material on a 5-point scale. 
After showing that the traditional Swedish readability measure, Läsbarhetsindex (LIX), is not suitable for this task, we propose a supervised machine learning model, based on a range of linguistic features, that can reliably classify texts according to their difficulty level.Our model obtained an accuracy of 81.3% and an F-score of 0.8, which is comparable to the state of the art in English and is considerably higher than previously reported results for other languages. We further studied the utility of our features with single sentences instead of full texts since sentences are a common linguistic unit in language learning exercises. We trained a separate model on sentence-level data with five classes, which yielded 63.4% accuracy. Although this is lower than the document level performance, we achieved an adjacent accuracy of 92%. Furthermore, we found that using a combination of different features, compared to using lexical features alone, resulted in 7% improvement in classification accuracy at the sentence level, whereas at the document level, lexical features were more dominant. Our models are intended for use in a freely accessible web-based language learning platform for the automatic generation of exercises, and they will be available also in the form of web-services.}, journal = {Computational Linguistics and Applications}, author = {Pilán, Ildikó and Vajjala, Sowmya and Volodina, Elena}, year = {2016}, volume = {7}, number = {1}, pages = {143--159}, } @inProceedings{alfter-volodina-2016-modeling-246347, title = {Modeling Individual Learner Knowledge in a Computer Assisted Language Learning System}, booktitle = {Proceedings of the Sixth Swedish Language Technology Conference. 
Umeå University, 17-18 November, 2016}, author = {Alfter, David and Volodina, Elena}, year = {2016}, } @inProceedings{lundholmfors-breitholtz-2016-mocking-240344, title = {Are you mocking me or are you laughing with me?}, booktitle = { SEMDIAL 2016, JerSem, Proceedings of the 20th Workshop on the Semantics and Pragmatics of Dialogue, 16-18 July 2016 Rutgers, New Brunswick, NJ, USA / Julie Hunter, Mandy Simons, and Matthew Stone (eds.)}, author = {Lundholm Fors, Kristina and Breitholtz, Ellen}, year = {2016}, } @inProceedings{borin-etal-2016-sparv-246053, title = {Sparv: Språkbanken’s corpus annotation pipeline infrastructure}, abstract = {Sparv is Språkbanken's corpus annotation pipeline infrastructure. The easiest way to use the pipeline is from its web interface with a plain text document. The pipeline uses in-house and external tools on the text to segment it into sentences and paragraphs, tokenise, tag parts-of-speech, look up in dictionaries and analyse compounds. The pipeline can also be run using a web API with XML results, and it is run locally at Språkbanken to prepare the documents in Korp, our corpus search tool. While the most sophisticated support is for modern Swedish, the pipeline supports 15 languages.}, booktitle = {SLTC 2016. 
The Sixth Swedish Language Technology Conference, Umeå University, 17-18 November, 2016}, author = {Borin, Lars and Forsberg, Markus and Hammarstedt, Martin and Rosén, Dan and Schäfer, Roland and Schumacher, Anne}, year = {2016}, } @inProceedings{borin-kosiski-2016-towards-238147, title = {Towards interactive visualization of public discourse in time and space}, abstract = {We report on a proof-of-concept study where we (1) apply NLP tools for extracting political-discourse topics from a large Swedish Twitter dataset; and (2) design an interactive spatiotemporal visualization application allowing humanities and social-science scholars to explore how the tweet topics vary over space and time.}, booktitle = {Linköping Electronic Conference Proceedings}, author = {Borin, Lars and Kosiński, Tomasz}, year = {2016}, volume = {126}, ISBN = {978-91-7685-733-5}, pages = {1--7}, } @inProceedings{rodven-eide-etal-2016-swedish-250073, title = {The Swedish Culturomics Gigaword Corpus: A One Billion Word Swedish Reference Dataset for NLP}, abstract = {In this paper we present a dataset of contemporary Swedish containing one billion words. The dataset consists of a wide range of sources, all annotated using a state-of-the-art corpus annotation pipeline, and is intended to be a static and clearly versioned dataset. This will facilitate reproducibility of experiments across institutions and make it easier to compare NLP algorithms on contemporary Swedish. The dataset contains sentences from 1950 to 2015 and has been carefully designed to feature a good mix of genres balanced over each included decade. The sources include literary, journalistic, academic and legal texts, as well as blogs and web forum entries.}, booktitle = {Linköping Electronic Conference Proceedings. Digital Humanities 2016. 
From Digitization to Knowledge 2016: Resources and Methods for Semantic Processing of Digital Works/Texts, July 11, 2016, Krakow, Poland}, author = {Rødven-Eide, Stian and Tahmasebi, Nina and Borin, Lars}, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-733-5}, } @inProceedings{volodina-etal-2016-classification-246346, title = {Classification of Swedish learner essays by CEFR levels}, abstract = {The paper describes initial efforts on creating a system for the automatic assessment of Swedish second language (L2) learner essays from two points of view: holistic evaluation of the reached level according to the Common European Framework of Reference (CEFR), and the lexical analysis of texts for receptive and productive vocabulary per CEFR level. We describe the data and resources that our experiments were based on, provide a short introduction to the algorithm for essay classification and experiment results, present the user interface we developed for testing new essays and outline future work. }, booktitle = {Proceedings of EuroCALL 2016. 24-27th August 2016, Cyprus.}, author = {Volodina, Elena and Pilán, Ildikó and Alfter, David}, year = {2016}, publisher = {Research-publishing.net}, ISBN = {978-1-908416-44-5}, } @misc{volodina-etal-2016-preface-248087, title = {Preface. Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016}, abstract = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) & NLP for Language Acquisition (LA) – shorthand NLP4CALL&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field.
Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition. 
}, author = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars}, year = {2016}, number = {130}, pages = {i--viii}, } @inProceedings{volodina-etal-2016-swellex-248090, title = {SweLLex: second language learners' productive vocabulary.}, abstract = {This paper presents a new lexical resource for learners of Swedish as a second language, SweLLex, and a know-how behind its creation. We concentrate on L2 learners’ productive vocabulary, i.e. words that they are actively able to produce, rather than the lexica they comprehend (receptive vocabulary). The proposed list covers productive vocabulary used by L2 learners in their essays. Each lexical item on the list is connected to its frequency distribution over the six levels of proficiency defined by the Common European Framework of Reference (CEFR) (Council of Europe, 2001). To make this list a more reliable resource, we experiment with normalizing L2 word-level errors by replacing them with their correct equivalents. SweLLex has been tested in a prototype system for automatic CEFR level classification of essays as well as in a visualization tool aimed at exploring L2 vocabulary contrasting receptive and productive vocabulary usage at different levels of language proficiency.}, booktitle = {Linköping Electronic Conference Proceedings.
Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016}, author = {Volodina, Elena and Pilán, Ildikó and Llozhi, Lorena and Degryse, Baptiste and François, Thomas}, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-633-8}, } @misc{volodina-etal-2016-proceedings-248081, title = {Proceedings of the joint workshop on NLP for Computer Assisted Language Learning and NLP for Language Acquisition at SLTC, Umeå, 16th November 2016}, abstract = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) & NLP for Language Acquisition (LA) – shorthand NLP4CALL&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. 
The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.}, author = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars}, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-633-8}, } @inProceedings{volodina-etal-2016-swell-248145, title = {SweLL – en korpus med L2 uppsatser för CEFR studier.}, booktitle = {Svenskans Beskrivning 35, Göteborg 2016}, author = {Volodina, Elena and Pilán, Ildikó and Enström, Ingegerd and Lundkvist, Peter and Sundberg, Gunlög and Llozhi, Lorena and Sandell, Monica}, year = {2016}, } @inProceedings{volodina-etal-2016-friend-248093, title = {A Friend in Need? Research agenda for electronic Second Language infrastructure.}, abstract = {In this article, we describe the research and societal needs as well as ongoing efforts to shape Swedish as a Second Language (L2) infrastructure. 
Our aim is to develop an electronic research infrastructure that would stimulate empiric research into learners’ language development by preparing data and developing language technology methods and algorithms that can successfully deal with deviations in the learner language.}, booktitle = {Proceedings of the Swedish Language Technology Conference}, author = {Volodina, Elena and Megyesi, Beata and Wirén, Mats and Granstedt, Lena and Prentice, Julia and Reichenberg, Monica and Sundberg, Gunlög}, year = {2016}, publisher = {Umeå Universitet}, } @inProceedings{lindstromtiedemann-volodina-2016-larka-248119, title = {Lärka som didaktiskt verktyg. Undersökning om studenternas metaspråkliga kunskap.}, booktitle = {Svenskans Beskrivning 35, 11-13 maj 2016, Göteborg}, author = {Lindström Tiedemann, Therese and Volodina, Elena}, year = {2016}, } @inProceedings{viklund-borin-2016-data-236738, title = {How can big data help us study rhetorical history?}, abstract = {Rhetorical history is traditionally studied through rhetorical treatises or selected rhetorical practices, for example the speeches of major orators. Although valuable sources, these do not give us the answers to all our questions. Indeed, focus on a few canonical works or the major historical key figures might even lead us to reproduce cultural self-identifications and false generalizations. However, thanks to increasing availability of relevant digitized texts, we are now at a point where it is possible to see how new research questions can be formulated – and how old research questions can be addressed from a new angle or established results verified – on the basis of exhaustive collections of data, rather than small samples, but where a methodology has not yet established itself. 
The aim of this paper is twofold: (1) We wish to demonstrate the usefulness of large-scale corpus studies (“text mining”) in the field of rhetorical history, and hopefully point to some interesting research problems and how they can be analyzed using “big-data” methods. (2) In doing this, we also aim to make a contribution to method development in e-science for the humanities and social sciences, and in particular in the framework of CLARIN. }, booktitle = {Linköping Electronic Conference Proceedings, No. 123. Edited by Koenraad De Smedt. Selected Papers from the CLARIN Annual Conference 2015. October 14–16, 2015, Wroclaw, Poland}, author = {Viklund, Jon and Borin, Lars}, year = {2016}, volume = {123}, ISBN = {978-91-7685-765-6}, pages = {79--93}, } @article{lindstromtiedemann-etal-2016-larka-248112, title = {Lärka: ett verktyg för träning av språkterminologi och grammatik}, abstract = {Lärka is a corpus-based tool, which allows students to practise and learn grammar based on authentic material. In this study we present how this has been used at four universities. We also use our logs to try to assess the students metalinguistic awareness in relation to international studies, and discuss how these logs can be used in the future.}, journal = {LexicoNordica}, author = {Lindström Tiedemann, Therese and Volodina, Elena and Jansson, Håkan}, year = {2016}, volume = {23}, pages = {161--181}, } @article{rehm-etal-2016-strategic-237609, title = {The strategic impact of META-NET on the regional, national and international level}, abstract = {This article provides an overview of the dissemination work carried out in META-NET from 2010 until 2015; we describe its impact on the regional, national and international level, mainly with regard to politics and the funding situation for LT topics. 
The article documents the initiative’s work throughout Europe in order to boost progress and innovation in our field.}, journal = {Language resources and evaluation}, author = {Rehm, Georg and Uszkoreit, Hans and Ananiadou, Sophia and Bel, Núria and Bielevičienė, Audronė and Borin, Lars and Branco, António and Budin, Gerhard and Calzolari, Nicoletta and Daelemans, Walter and Garabík, Radovan and Grobelnik, Marko and García-Mateo, Carmen and van Genabith, Josef and Hajič, Jan and Hernáez, Inma and Judge, John and Koeva, Svetla and Krek, Simon and Krstev, Cvetana and Lindén, Krister and Magnini, Bernardo and Mariani, Joseph and McNaught, John and Melero, Maite and Monachini, Monica and Moreno, Asunción and Odijk, Jan and Ogrodniczuk, Maciej and Pęzik, Piotr and Piperidis, Stelios and Przepiórkowski, Adam and Rögnvaldsson, Eiríkur and Rosner, Mike and Pedersen, Bolette Sandford and Skadiņa, Inguna and De Smedt, Koenraad and Tadić, Marko and Thompson, Paul and Tufiş, Dan and Váradi, Tamás and Vasiļjevs, Andrejs and Vider, Kadri and Zabarskaitė, Jolanta}, year = {2016}, volume = {50}, number = {2}, pages = {351--374}, } @inProceedings{alfter-etal-2016-from-246345, title = {From Distributions to Labels: A Lexical Proficiency Analysis using Learner Corpora}, abstract = {In this work we look at how information from second language learner essay corpora can be used for the evaluation of unseen learner essays. Using a corpus of learner essays which have been graded by well-trained human assessors using the CEFR scale, we extract a list of word distributions over CEFR levels. For the analysis of unseen essays, we want to map each word to a so-called target CEFR level using this word list. However, the task of mapping from a distribution to a single label is not trivial. We are also investigating how we can evaluate the mapping from distribution to label.
We show that the distributional profile of words from the essays, informed with the essays’ levels, consistently overlaps with our frequency-based method, in the sense that words holding the same level of proficiency as predicted by our mapping tend to cluster together in a semantic space. In the absence of a gold standard, this information can be useful to see how often a word is associated with the same level in two different models. Also, in this case we have a similarity measure that can show which words are more central to a given level and which words are more peripheral. }, booktitle = {Linköping Electronic Conference Proceedings}, author = {Alfter, David and Bizzoni, Yuri and Agebjörn, Anders and Volodina, Elena and Pilán, Ildikó}, year = {2016}, publisher = {Linköping University Electronic Press}, ISBN = {978-91-7685-633-8}, } @inProceedings{kokkinakis-2016-linguistic-243100, title = {Linguistic and extra-linguistic parameters for early detection of cognitive impairment}, abstract = {AIM: to adapt, develop and test methods that in isolation have shown promising outcomes on tasks related to (early) detection of dementia, differentiating between various dementia types and controls and also increase our understanding of the cognitive processes that underlie written text and certain forms of spoken language production. Unlike previous models, based solely on a certain aspect of language abilities (i.e. on written or spoken language alone), the project is comprehensive and more likely to provide new insights in the area of dementia detection and improve practices applied so far. The project builds on the success stories of the past and focus on the interplay between various types of technologies that hold the potential to provide reliable estimates for the detection of cognitive decline. 
The project emphasizes its interdisciplinary nature, by bringing together researchers from humanities (computational linguistics / language technology), computer science and medicine, and foresees the development of a comprehensive set of novel analytic approaches not explored jointly in the past GOAL: discovering evidence about linguistic performance and identifying whether the addition of new ways for investigating, combining and evaluating measurement and other parameters for improvement of established models can advance our understanding of: i) the boundaries between normal aging and dementia; ii) its effects on linguistic performance extrapolated from various sources and iii) whether effects of cognitive decline can be seen across (daily) language production. }, booktitle = {European Summer School on Eye Movements (ESSEM), 11-17 september, 2016 Athens, Greece.}, author = {Kokkinakis, Dimitrios}, year = {2016}, } @misc{kokkinakis-2016-proceedings-252412, title = {Proceedings of LREC 2016 Workshop: Resources and Processing of Linguistic and Extra-Linguistic Data from People with Various Forms of Cognitive/Psychiatric Impairments (RaPID-2016), Monday 23rd of May 2016. 
Linköping electronic conference proceedings.}, abstract = {The purpose of the Workshop on “Resources and ProcessIng of linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments” (RaPID-2016) was to provide a snapshot view of some of the current technological landscape, resources, data samples and also needs and challenges in the area of processing various data from individuals with various types of mental and neurological health impairments and similar conditions at various stages; increase the knowledge, understanding, awareness and ability to achieve useful outcomes in this area and strengthen the collaboration between researchers and workers in the field of clinical/nursing/medical sciences and those in the field of language technology/computational linguistics/Natural Language Processing (NLP). Although many of the causes of cognitive and neuropsychiatric impairments are difficult to foresee and accurately predict, physicians and clinicians work with a wide range of factors that potentially contribute to such impairments, e.g., traumatic brain injuries, genetic predispositions, side effects of medication, and congenital anomalies. In this context, there is new evidence that the acquisition and processing of linguistic data (e.g., spontaneous story telling) and extra-linguistic and production measures (e.g., eye tracking) could be used as a complement to clinical diagnosis and provide the foundation for future development of objective criteria to be used for identifying progressive decline or degeneration of normal mental and brain functioning. An important new area of research in NLP emphasizes the processing, analysis, and interpretation of such data and current research in this field, based on linguistic-oriented analysis of text and speech produced by such a population and compared to healthy adults, has shown promising outcomes. 
This is manifested in early diagnosis and prediction of individuals at risk, the differentiation of individuals with various degrees of severity forms of brain and mental illness, and for the monitoring of the progression of such conditions through the diachronic analysis of language samples or other extralinguistic measurements. Initially, work was based on written data but there is a rapidly growing body of research based on spoken samples and other modalities. Nevertheless, there remains significant work to be done to arrive at more accurate estimates for prediction purposes in the future and more research is required in order to reliably complement the battery of medical and clinical examinations currently undertaken for the early diagnosis or monitoring of, e.g., neurodegenerative and other brain and mental disorders and accordingly, aid the development of new, non-invasive, time and cost-effective and objective (future) clinical tests in neurology, psychology, and psychiatry.}, author = {Kokkinakis, Dimitrios}, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-730-4}, } @inProceedings{adouane-etal-2016-asirem-246853, title = {ASIREM Participation at the Discriminating Similar Languages Shared Task 2016}, booktitle = {Proceedings of the Third Workshop on NLP for Similar Languages, Varieties and Dialects; December 12; Osaka, Japan}, author = {Adouane, Wafia and Semmar, Nasredine and Johansson, Richard}, year = {2016}, pages = {163--169}, } @inProceedings{lindh-etal-2016-comparison-242808, title = {Comparison of Perceptual and ASR Results on the SweEval2016 Corpus}, booktitle = {Proceedings of IAFPA25. 25th Annual Conference of the International Association for Forensic Phonetics and Acoustics.
York, UK 24th – 27th July 2016.}, author = {Lindh, Jonas and Åkesson, Joel and Sundqvist, Maria}, year = {2016}, pages = {110--111}, } @inProceedings{lindh-akesson-2016-evaluation-242811, title = {Evaluation of Software ‘Error checks’ on the SweEval2016 Corpus for Forensic Speaker Comparison}, booktitle = {Proceedings of IAFPA25. 25th Annual Conference of the International Association for Forensic Phonetics and Acoustics. York, UK 24th – 27th July 2016}, author = {Lindh, Jonas and Åkesson, Joel}, year = {2016}, pages = {57--58}, } @inProceedings{nusko-etal-2016-building-238135, title = {Building a Sentiment Lexicon for Swedish}, abstract = {In this paper we will present our ongoing project to build and evaluate a sentiment lexicon for Swedish. Our main resource is SALDO, a lexical resource of modern Swedish developed at Språkbanken, University of Gothenburg. Using a semi-supervised approach, we expand a manually chosen set of six core words using parent-child relations based on the semantic network structure of SALDO. At its current stage the lexicon consists of 175 seeds, 633 children, and 1319 grandchildren.}, booktitle = {Linköping Electronic Conference Proceedings}, author = {Nusko, Bianka and Tahmasebi, Nina and Mogren, Olof}, year = {2016}, volume = {126}, number = {006}, ISBN = {978-91-7685-733-5}, pages = {32--37}, } @inProceedings{cap-etal-2016-sword-254388, title = {SWORD: Towards Cutting-Edge Swedish Word Processing}, abstract = {Despite many years of research on Swedish language technology, there is still no well-documented standard for Swedish word processing covering the whole spectrum from low-level tokenization to morphological analysis and disambiguation. SWORD is a new initiative within the SWE-CLARIN consortium aiming to develop documented standards for Swedish word processing. In this paper, we report on a pilot study of Swedish tokenization, where we compare the output of six different tokenizers on four different text types.
For one text type (Wikipedia articles), we also compare to the tokenization produced by six manual annotators.}, booktitle = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016}, author = {Cap, Fabienne and Adesam, Yvonne and Ahrenberg, Lars and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Kann, Viggo and Östling, Robert and Smith, Aaron and Wirén, Mats and Nivre, Joakim}, year = {2016}, } @inProceedings{nord-forsberg-2017-enklare-259902, title = {Enklare efter klarspråk? Myndighetstexter före och efter ett klarspråksprojekt}, booktitle = {Saga Bendegard, Ulla Melander Marttala \& Maria Westman (red.), Språk och norm: Rapport från ASLA:s symposium, Uppsala universitet 21–22 april 2016}, author = {Nord, Andreas and Forsberg, Markus}, year = {2017}, publisher = {ASLA}, address = {Uppsala}, ISBN = {978-91-87884-26-9}, } @article{gruzitis-dannells-2017-multilingual-225789, title = {A multilingual FrameNet-based grammar and lexicon for controlled natural language}, abstract = {Berkeley FrameNet is a lexico-semantic resource for English based on the theory of frame semantics. It has been exploited in a range of natural language processing applications and has inspired the development of framenets for many languages. We present a methodological approach to the extraction and generation of a computational multilingual FrameNet-based grammar and lexicon. The approach leverages FrameNet-annotated corpora to automatically extract a set of cross-lingual semantico-syntactic valence patterns. Based on data from Berkeley FrameNet and Swedish FrameNet, the proposed approach has been implemented in Grammatical Framework (GF), a categorial grammar formalism specialized for multilingual grammars. 
The implementation of the grammar and lexicon is supported by the design of FrameNet, providing a frame semantic abstraction layer, an interlingual semantic application programming interface (API), over the interlingual syntactic API already provided by GF Resource Grammar Library. The evaluation of the acquired grammar and lexicon shows the feasibility of the approach. Additionally, we illustrate how the FrameNet-based grammar and lexicon are exploited in two distinct multilingual controlled natural language applications. The produced resources are available under an open source license.}, journal = {Language resources and evaluation}, author = {Gruzitis, Normunds and Dannélls, Dana}, year = {2017}, volume = {51}, number = {1}, pages = {37--66}, } @inProceedings{hammarstrom-etal-2017-poor-261851, title = {Poor man's OCR post-correction: Unsupervised recognition of variant spelling applied to a multilingual document collection}, abstract = {© 2017 Copyright held by the owner/author(s). The accuracy of Optical Character Recognition (OCR) is sets the limit for the success of subsequent applications used in text analyzing pipeline. Recent models of OCR postprocessing significantly improve the quality of OCR-generated text but require engineering work or resources such as human-labeled data or a dictionary to perform with such accuracy on novel datasets. In the present paper we introduce a technique for OCR post-processing that runs off-the-shelf with no resources or parameter tuning required. In essence, words which are similar in form that are also distributionally more similar than expected at random are deemed OCR-variants. As such it can be applied to any language or genre (as long as the orthography segments the language at the word-level). 
The algorithm is illustrated and evaluated using a multilingual document collection and a benchmark English dataset.}, booktitle = {DATeCH2017, Proceedings of the 2nd International Conference on Digital Access to Textual Cultural Heritage, Göttingen, Germany — June 01 - 02, 2017 }, author = {Hammarström, Harald and Virk, Shafqat and Forsberg, Markus}, year = {2017}, publisher = {Association for Computing Machinery (ACM)}, address = {New York}, ISBN = {978-1-4503-5265-9}, } @inProceedings{borin-etal-2017-clarin-261157, title = {Swe-Clarin: Language resources and technology for Digital Humanities}, abstract = {CLARIN is a European Research Infrastructure Consortium (ERIC), which aims at (a) making extensive language-based materials available as primary research data to the humanities and social sciences (HSS); and (b) offering state-of-the-art language technology (LT) as an e-research tool for this purpose, positioning CLARIN centrally in what is often referred to as the digital humanities (DH). The Swedish CLARIN node Swe-Clarin was established in 2015 with funding from the Swedish Research Council. In this paper, we describe the composition and activities of Swe-Clarin, aiming at meeting the requirements of all HSS and other researchers whose research involves using text and speech as primary research data, and spreading the awareness of what Swe-Clarin can offer these research communities. We focus on one of the central means for doing this: pilot projects conducted in collaboration between HSS researchers and Swe-Clarin, together formulating a research question, the addressing of which requires working with large language-based materials. Four such pilot projects are described in more detail, illustrating research on rhetorical history, second-language acquisition, literature, and political science. 
A common thread to these projects is an aspiration to meet the challenge of conducting research on the basis of very large amounts of textual data in a consistent way without losing sight of the individual cases making up the mass of data, i.e., to be able to move between Moretti’s “distant” and “close reading” modes. While the pilot projects clearly make substantial contributions to DH, they also reveal some needs for more development, and in particular a need for document-level access to the text materials. As a consequence of this, work has now been initiated in Swe-Clarin to meet this need, so that Swe-Clarin together with HSS scholars investigating intricate research questions can take on the methodological challenges of big-data language-based digital humanities.}, booktitle = {Digital Humanities 2016. Extended Papers of the International Symposium on Digital Humanities (DH 2016) Växjö, Sweden, November, 7-8, 2016. Edited by Koraljka Golub, Marcelo Milra. Vol-2021}, author = {Borin, Lars and Tahmasebi, Nina and Volodina, Elena and Ekman, Stefan and Jordan, Caspar and Viklund, Jon and Megyesi, Beáta and Näsman, Jesper and Palmér, Anne and Wirén, Mats and Björkenstam, Kristina and Grigonyte, Gintare and Gustafson Capková, Sofia and Kosiński, Tomasz}, year = {2017}, publisher = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik V, RWTH Aachen.}, address = {Aachen}, } @inProceedings{virk-etal-2017-automatic-261789, title = {Automatic extraction of typological linguistic features from descriptive grammars}, abstract = {The present paper describes experiments on automatically extracting typological linguistic features of natural languages from traditional written descriptive grammars. The feature-extraction task has high potential value in typological, genealogical, historical, and other related areas of linguistics that make use of databases of structural features of languages. 
Until now, extraction of such features from grammars has been done manually, which is highly time and labor consuming and becomes prohibitive when extended to the thousands of languages for which linguistic descriptions are available. The system we describe here starts from semantically parsed text over which a set of rules are applied in order to extract feature values. We evaluate the system’s performance on the manually curated Grambank database as the gold standard and report the first measures of precision and recall for this problem.}, booktitle = {Text, Speech, and Dialogue 20th International Conference, TSD 2017, Prague, Czech Republic, August 27-31, 2017, Proceedings}, editor = {Ekštein, Kamil and Matoušek, Václav}, author = {Virk, Shafqat and Borin, Lars and Saxena, Anju and Hammarström, Harald}, year = {2017}, publisher = {Springer International Publishing}, address = {Cham}, ISBN = {978-3-319-64205-5}, } @misc{volodina-etal-2017-preface-262846, title = {Preface. Proceedings of the Joint 6th Workshop on NLP for Computer Assisted Language Learning and 2nd Workshop on NLP for Research on Language Acquisition at NoDaLiDa 2017, Gothenburg, 22nd May 2017}, abstract = {For the second year in a row we brought two related themes of NLP for Computer-Assisted Language Learning and NLP for Language Acquisition together. The goal of organizing joint workshops is to provide a meeting place for researchers working on language learning issues including both empirical and experimental studies and NLP-based applications. The resulting volume covers a variety of topics from the two fields and - hopefully - showcases the challenges and achievements in the field. 
The seven papers in this volume cover native language identification in learner writings, using syntactic complexity development in language learner language to identify reading comprehension texts of appropriate level, exploring the potential of parallel corpora to predict mother-language specific problem areas for learners of another language, tools for learning languages - both well-resourced ones such as English as well as endangered or under-resourced ones such as Yakut and Võro, as well as exploring the potential of automatically identifying and correcting word-level errors in Swedish learner writing.}, author = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Grigonyte, Gintare and Nilsson Björkenstam, Kristina}, year = {2017}, volume = {30}, pages = {i--vi}, } @article{pilan-etal-2017-candidate-260382, title = {Candidate sentence selection for language learning exercises: From a comprehensive framework to an empirical evaluation}, abstract = {We present a framework and its implementation relying on Natural Language Processing methods, which aims at the identification of exercise item candidates from corpora. The hybrid system combining heuristics and machine learning methods includes a number of relevant selection criteria. We focus on two fundamental aspects: linguistic complexity and the dependence of the extracted sentences on their original context. Previous work on exercise generation addressed these two criteria only to a limited extent, and a refined overall candidate sentence selection framework appears also to be lacking. In addition to a detailed description of the system, we present the results of an empirical evaluation conducted with language teachers and learners which indicate the usefulness of the system for educational purposes. We have integrated our system into a freely available online learning platform.}, journal = {Revue Traitement Automatique des Langues. 
Special issue on NLP for Learning and Teaching}, author = {Pilán, Ildikó and Volodina, Elena and Borin, Lars}, year = {2017}, volume = {57}, number = {3}, pages = {67--91}, } @inProceedings{volodina-etal-2017-svalex-262848, title = {SVALex. En andraspråksordlista med CEFR-nivåer}, abstract = {När man planerar att utveckla en språkkurs i ett andra- eller främmandespråk (L2) ställs man inför utmaningen att definiera vilket ordförråd inlärarna behöver tillägna sig. Forskning inom andraspråksinlärning tyder på att läsaren behöver kunna 95–98 % av löporden i en text för att förstå den (Laufer & Ravenhorst-Kalovski 2010). Sådana studier är användbara för att uppskatta storleken på det ordförråd som behövs för att tillägna sig innehållet i en text, men de ger ingen närmare metodologisk vägledning för den som vill utveckla nivåstrukturerade läromedel eller kurser för andraspråksundervisning. Speciellt tydligt är detta inom CALL, Computer-Assisted Language Learning, där läromaterial (t.ex. övningar) genereras automatiskt, och behöver elektroniska resurser som kunskapskälla. Man kan istället angripa problemet från andra hållet. Om man har en samling nivåklassificerade texter för andraspråksinlärare kan man utifrån dem bygga ordlistor där varje ord är placerat på en färdighetsskala. Om man känner till den förutsatta färdighetsnivån hos läsaren, kan man helt enkelt anta att den textnivå där ett ord dyker upp första gången också anger ordets svårighetsgrad. SVALex är ett lexikon som har byggts enligt den principen. Resursen ska kunna användas av inlärare och lärare i svenska som andraspråk, men även av lexikografer, av kursutvecklare och provkonstruktörer samt av dem som likt oss själva ägnar sig åt utveckling av språkteknologibaserade datorstöd för språkinlärning och språktestning. 
SVALex utgör en vidareutveckling i förhållande till tidigare lexikonresurser för svenska som andraspråk (se avsnitt 2), genom att den konsekvent relaterar de 15 681 lexikoningångarna till en vida använd färdighetsskala för andra- och främmandespråksinlärning, Europarådets gemensamma europeiska referensram för språk (Common European Framework of Reference, i fortsättningen refererad till som CEFR) (Council of Europe 2001; Skolverket 2009). Nivåklassningen av lexikonenheterna i SVALex görs på basis av deras distribution i COCTAILL, en korpus innehållande lärobokstexter i svenska som andraspråk, där lärare har placerat in varje text i någon av CEFR-nivåerna (Volodina et al. 2014).}, booktitle = {Svenskans beskrivning. 35, Förhandlingar vid trettiofemte sammankomsten : Göteborg 11–13 maj 2016 / Redigerad av Emma Sköldberg, Maia Andréasson, Henrietta Adamsson Eryd, Filippa Lindahl, Sven Lindström, Julia Prentice \& Malin Sandberg}, author = {Volodina, Elena and Borin, Lars and Pilán, Ildikó and François, Thomas and Tack, Annaïs}, year = {2017}, publisher = {Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-64-6}, } @misc{volodina-etal-2017-proceedings-262838, title = {Proceedings of the Joint 6th Workshop on NLP for Computer Assisted Language Learning and 2nd Workshop on NLP for Research on Language Acquisition at NoDaLiDa 2017, Gothenburg, 22nd May 2017}, abstract = {For the second year in a row we have brought the two related themes of NLP for Computer-Assisted Language Learning and NLP for Language Acquisition together under one umbrella. 
The goal of organizing these joint workshops is to provide a meeting place for researchers working on language learning issues including both empirical and experimental studies and NLP-based applications.}, author = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Grigonyte, Gintare and Nilsson Björkenstam, Kristina}, year = {2017}, publisher = {Linköping University Press}, address = {Linköping, Sweden}, ISBN = {978-91-7685-502-7}, } @inProceedings{pilan-etal-2017-larka-289884, title = {Lärka: an online platform where language learning meets natural language processing}, booktitle = {7th ISCA Workshop on Speech and Language Technology in Education, 25-26 August 2017, Stockholm, Sweden}, author = {Pilán, Ildikó and Alfter, David and Volodina, Elena}, year = {2017}, } @techreport{hammarstedt-etal-2017-korp-256056, title = {Korp 6 - Användarmanual}, author = {Hammarstedt, Martin and Borin, Lars and Forsberg, Markus and Roxendal, Johan and Schumacher, Anne and Öhrman, Maria}, year = {2017}, institution = {Institutionen för svenska språket, Göteborgs universitet}, publisher = {Institutionen för svenska språket, Göteborgs universitet}, } @techreport{hammarstedt-etal-2017-korp-256055, title = {Korp 6 - Technical Report}, author = {Hammarstedt, Martin and Roxendal, Johan and Öhrman, Maria and Borin, Lars and Forsberg, Markus and Schumacher, Anne}, year = {2017}, institution = {Institutionen för svenska språket, Göteborgs universitet}, publisher = {Institutionen för svenska språket, Göteborgs universitet}, } @inProceedings{tahmasebi-risse-2017-uses-256649, title = {On the Uses of Word Sense Change for Research in the Digital Humanities}, abstract = {With advances in technology and culture, our language changes. We invent new words, add or change meanings of existing words and change names of existing things. Unfortunately, our language does not carry a memory; words, expressions and meanings used in the past are forgotten over time. When searching and interpreting content from archives, language changes pose a great challenge. 
In this paper, we present results of automatic word sense change detection and show the utility for archive users as well as digital humanities’ research. Our method is able to capture changes that relate to the usage and culture of a word that cannot easily be found using dictionaries or other resources.}, booktitle = {Research and Advanced Technology for Digital Libraries - 21st International Conference on Theory and Practice of Digital Libraries, TPDL 2017, Thessaloniki, Greece, September 18-21, 2017. Proceedings}, editor = {Jaap Kamps and Giannis Tsakonas and Yannis Manolopoulos and Lazaros Iliadis and Ioannis Karydis}, author = {Tahmasebi, Nina and Risse, Thomas}, year = {2017}, publisher = {Springer Verlag}, address = {Cham}, ISBN = {978-3-319-67007-2}, } @inProceedings{abualhajia-etal-2017-parameter-256642, title = {Parameter Transfer across Domains for Word Sense Disambiguation}, abstract = {Word sense disambiguation is defined as finding the corresponding sense for a target word in a given context, which comprises a major step in text applications. Recently, it has been addressed as an optimization problem. The idea behind is to find a sequence of senses that corresponds to the words in a given context with a maximum semantic similarity. Metaheuristics like simulated annealing and D-Bees provide approximate good-enough solutions, but are usually influenced by the starting parameters. In this paper, we study the parameter tuning for both algorithms within the word sense disambiguation problem. The experiments are conducted on different datasets to cover different disambiguation scenarios. 
We show that D-Bees is robust and less sensitive towards the initial parameters compared to simulated annealing, hence, it is sufficient to tune the parameters once and reuse them for different datasets, domains or languages.}, booktitle = {Proceedings of Recent Advances in Natural Language Processing Meet Deep Learning, Varna, Bulgaria 2–8 September 2017}, editor = {Galia Angelova and Kalina Bontcheva and Ruslan Mitkov and Ivelina Nikolova and Irina Temnikova}, author = {Abualhajia, Sallam and Tahmasebi, Nina and Forin, Diane and Zimmermann, Karl-Heinz}, year = {2017}, ISBN = {978-954-452-048-9}, } @article{themistocleous-2017-nature-251205, title = {The Nature of Phonetic Gradience across a Dialect Continuum: Evidence from Modern Greek Vowels.}, abstract = {This study investigates the acoustic properties of vowels in 2 Modern Greek varieties: Standard Modern Greek (SMG) and Cypriot Greek (CG). Both varieties contain in their phonetic inventories the same 5 vowels. Forty-five female speakers between 19 and 29 years old participated in this study: 20 SMG speakers and 25 CG speakers, born and raised in Athens and Nicosia, respectively. Stimuli consisted of a set of nonsense CVCV and VCV words, each containing 1 of the 5 Greek vowels in stressed and unstressed position. Gaining insights from the controlled experimental design, the study sheds light on the gradient effects of vowel variation in Modern Greek. It shows that (1) stressed vowels are more peripheral than unstressed vowels, (2) SMG unstressed /i a u/ vowels are more raised than the corresponding CG vowels, (3) SMG unstressed vowels are shorter than CG unstressed vowels, and (4) SMG /i·u/ are more rounded than the corresponding CG vowels. Moreover, it shows that variation applies to specific subsystems, as it is the unstressed vowels that vary cross-varietally whereas the stressed vowels display only minor differences. 
The implications of these findings with respect to vowel raising and vowel reduction are discussed.}, journal = {Phonetica}, author = {Themistocleous, Charalambos}, year = {2017}, volume = {74}, number = {3}, pages = {157--172}, } @article{nautsch-etal-2017-making-258734, title = {Making Likelihood Ratios Digestible for Cross-Application Performance Assessment}, abstract = {Performance estimation is crucial to the assessment of novel algorithms and systems. In detection error tradeoff (DET) diagrams, discrimination performance is solely assessed targeting one application, where cross-application performance considers risks resulting from decisions, depending on application constraints. For the purpose of interchangeability of research results across different application constraints, we propose to augment DET curves by depicting systems regarding their support of security and convenience levels. Therefore, application policies are aggregated into levels based on verbal likelihood ratio scales, providing an easy to use concept for business-to-business communication to denote operative thresholds. We supply a reference implementation in Python, an exemplary performance assessment on synthetic score distributions, and a fine-tuning scheme for Bayes decision thresholds, when decision policies are bounded rather than fix.}, journal = {IEEE Signal Processing Letters}, author = {Nautsch, A. and Meuwly, D. and Ramos, D. and Lindh, Jonas and Busch, C.}, year = {2017}, volume = {24}, number = {10}, pages = {1552--1556}, } @inProceedings{bjorkner-etal-2017-voice-256522, title = {Voice acoustic parameters for detecting signs of early cognitive impairment}, abstract = {Aiding the detection of very early cognitive impairment in Alzheimer's disease (AD) and assessing the disease progression are essential foundations for effective psychological assessment, diagnosis and planning. 
Efficient tools for routine dementia screening in primary health care, particularly non-invasive and cost-effective methods, are desirable. The aim of this study is to find out if voice acoustic analysis can be a useful tool for detecting signs of early cognitive impairment.}, booktitle = {PEVOC (PanEuropean Voice Conference) 12, August 30th - September 1st 2017, Ghent, Belgium}, author = {Björkner, Eva and Lundholm Fors, Kristina and Kokkinakis, Dimitrios and Nordlund, Arto}, year = {2017}, } @misc{tidemann-tahmasebi-2017-proceedings-264302, title = {Proceedings of the 21st Nordic Conference on Computational Linguistics, NODALIDA 2017, Gothenburg, Sweden, May 22-24, 2017}, author = {Tiedemann, Jörg and Tahmasebi, Nina}, year = {2017}, publisher = {Association for Computational Linguistics}, ISBN = {978-91-7685-601-7}, } @inProceedings{bernardy-themistocleous-2017-modelling-258661, title = {Modelling prosodic structure using Artificial Neural Networks}, abstract = {The ability to accurately perceive whether a speaker is asking a question or is making a statement is crucial for any successful interaction. However, learning and classifying tonal patterns has been a challenging task for automatic speech recognition and for models of tonal representation, as tonal contours are characterized by significant variation. This paper provides a classification model of Cypriot Greek questions and statements. We evaluate two state-of-the-art network architectures: a Long Short-Term Memory (LSTM) network and a convolutional network (ConvNet). The ConvNet outperforms the LSTM in the classification task and exhibited an excellent performance with 95\% classification accuracy.}, booktitle = {ExLing 2017. 
Proceedings of 8th Tutorial and Research Workshop on Experimental Linguistics, 19-22 June 2017, Heraklion, Crete, Greece}, editor = {Antonis Botinis}, author = {Bernardy, Jean-Philippe and Themistocleous, Charalambos}, year = {2017}, publisher = {University of Athens}, address = {Athens}, ISBN = {978-960-466-162-6}, } @incollection{wilhelmsson-2017-forutsattningarna-249467, title = {Om förutsättningarna för språkligt datorstöd på ordnivån och uppåt}, booktitle = {Text och kontext - perspektiv på textanalys / Karin Helgesson, Hans Lundqvist, Anna Lyngfelt, Andreas Nord \& Åsa Wengelin (red.)}, author = {Wilhelmsson, Kenneth}, year = {2017}, publisher = {Gleerups}, address = {Malmö}, ISBN = {978-91-40-69364-8}, pages = {207--228}, } @article{themistocleous-2017-classifying-254040, title = {Classifying linguistic and dialectal information from vowel acoustic parameters}, abstract = {This study provides a classification model of two Modern Greek dialects, namely Athenian Greek and Cypriot Greek, using information from formant dynamics of F1, F2, F3, F4 and vowel duration. To this purpose, a large corpus of vowels from 45 speakers of Athenian Greek and Cypriot Greek was collected. The first four formant frequencies were measured at multiple time points and modelled using second degree polynomials. The measurements were employed in classification experiments, using three classifiers: Linear Discriminant Analysis, Flexible Discriminant Analysis, and C5.0. The latter outperformed the other classification models, resulting in a higher classification accuracy of the dialect. 
C5.0 classification shows that duration and the zeroth coefficient of F2, F3 and F4 contribute more to the classification of the dialect than the other measurements; it also shows that formant dynamics are important for the classification of dialect.}, journal = {Speech Communication}, author = {Themistocleous, Charalambos}, year = {2017}, volume = {92}, pages = {13--22}, } @inProceedings{tahmasebi-risse-2017-finding-256637, title = {Finding Individual Word Sense Changes and their Delay in Appearance}, abstract = {We present a method for detecting word sense changes by utilizing automatically induced word senses. Our method works on the level of individual senses and allows a word to have e.g. one stable sense and then add a novel sense that later experiences change. Senses are grouped based on polysemy to find linguistic concepts and we can find broadening and narrowing as well as novel (polysemous and homonymic) senses. We evaluate on a testset, present recall and estimates of the time between expected and found change.}, booktitle = {Proceedings of Recent Advances in Natural Language Processing 2017. Varna, Bulgaria 2–8 September, 2017}, editor = {Galia Angelova and Kalina Bontcheva and Ruslan Mitkov and Ivelina Nikolova and Irina Temnikova}, author = {Tahmasebi, Nina and Risse, Thomas}, year = {2017}, ISBN = {978-954-452-048-9}, } @inProceedings{fraser-etal-2017-analysis-257840, title = {An analysis of eye-movements during reading for the detection of mild cognitive impairment}, abstract = {We present a machine learning analysis of eye-tracking data for the detection of mild cognitive impairment, a decline in cognitive abilities that is associated with an increased risk of developing dementia. We compare two experimental configurations (reading aloud versus reading silently), as well as two methods of combining information from the two trials (concatenation and merging). 
Additionally, we annotate the words being read with information about their frequency and syntactic category, and use these annotations to generate new features. Ultimately, we are able to distinguish between participants with and without cognitive impairment with up to 86\% accuracy.}, booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing. September 9-11, 2017 Copenhagen, Denmark / Editors Martha Palmer, Rebecca Hwa, Sebastian Riedel}, author = {Fraser, Kathleen and Lundholm Fors, Kristina and Kokkinakis, Dimitrios and Nordlund, Arto}, year = {2017}, publisher = {Association for Computational Linguistics}, ISBN = {978-1-945626-83-8}, } @inProceedings{kokkinakis-etal-2017-data-256955, title = {Data Collection from Persons with Mild Forms of Cognitive Impairment and Healthy Controls - Infrastructure for Classification and Prediction of Dementia}, abstract = {Cognitive and mental deterioration, such as difficulties with memory and language, are some of the typical phenotypes for most neurodegenerative diseases including Alzheimer’s disease and other dementia forms. This paper describes the first phases of a project that aims at collecting various types of cognitive data, acquired from human subjects in order to study relationships among linguistic and extra-linguistic observations. The project’s aim is to identify, extract, process, correlate, evaluate, and disseminate various linguistic phenotypes and measurements and thus contribute with complementary knowledge in early diagnosis, monitor progression, or predict individuals at risk. In the near future, automatic analysis of these data will be used to extract various types of features for training, testing and evaluating automatic classifiers that could be used to differentiate individuals with mild symptoms of cognitive impairment from healthy, age-matched controls and identify possible indicators for the early detection of mild forms of cognitive impairment. 
Features will be extracted from audio recordings (speech signal), the transcription of the audio signals (text) and the raw eye-tracking data.}, booktitle = {Proceedings of the 21st Nordic Conference on Computational Linguistics, NoDaLiDa, 22-24 May 2017, Gothenburg, Sweden}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Björkner, Eva and Nordlund, Arto}, year = {2017}, publisher = {Linköping University Electronic Press, Linköpings universitet}, address = {Linköping}, ISBN = {978-91-7685-601-7}, } @inProceedings{fyndanis-etal-2017-time-260585, title = {Time reference and aspect in agrammatic aphasia: Evidence from Greek}, abstract = {Time reference, which has been found to be selectively impaired in agrammatic aphasia (e.g., Bastiaanse et al., 2011), is often interwoven with grammatical aspect. Dragoy and Bastiaanse (2013) investigated the relationship between time reference/tense and aspect focusing on Russian aphasia and found that the two interact: past reference was less impaired when tested within perfective aspect (compared to when tested within imperfective aspect), and reference to the nonpast was less impaired when tested within imperfective aspect (compared to when tested within perfective aspect). To account for this pattern, Dragoy and Bastiaanse (2013: 114) claimed that “perfectives primarily refer to completed, past events while imperfectives prototypically describe ongoing, non-past events”. This study explores the relationship between time reference and aspect focusing on Greek aphasia. In Greek, verb forms referring to the past and future encode the perfective-imperfective contrast. Dragoy and Bastiaanse (2013) would make predictions PR1–PR4 for Greek. 
(PR1) past reference within perfective aspect > past reference within imperfective aspect; (PR2) future reference within perfective aspect < future reference within imperfective aspect; (PR3) perfective aspect within past reference > imperfective aspect within past reference; (PR4) perfective aspect within future reference < imperfective aspect within future reference. Methods Eight Greek-speaking persons with agrammatic aphasia (PWA) and eight controls were administered a sentence completion task consisting of 128 experimental source sentence (SS)-target sentence (TS) pairs. There were eight subconditions, each of which consisted of 16 items: past reference within perfective aspect; past reference within imperfective aspect; future reference within perfective aspect; future reference within imperfective aspect; perfective aspect within past reference; imperfective aspect within past reference; perfective aspect within future reference; imperfective aspect within future reference. Participants were auditorily presented with a SS and the beginning of the TS, and were asked to orally complete the TS producing the missing Verb Phrase. We fitted generalized linear mixed-effect models and employed Fisher’s exact tests to make within-participant comparisons. Results Overall, the aphasic group fared significantly worse than the control group (p < 0.001). At the group level, none of the four relevant comparisons (see PR1–PR4) yielded significant differences for PWA (Table 1). Four PWA (P1, P3, P7, P8) exhibited dissociations, with three of them making up a double dissociation: P1 performed better on imperfective aspect-future reference than on perfective aspect-future reference (p < 0.001), and P7 and P8 exhibited the opposite pattern (p = 0.016 and p < 0.001 for P7 and P8, respectively). 
Discussion Results are not consistent with Dragoy and Bastiaanse’s (2013) findings, which challenges the idea of prototypical and non-prototypical associations between time reference and aspect. The double dissociation that emerged in the aspect condition indicates that a given time reference-aspect combination may be relatively easy to process for some PWA but demanding for some others. Thus, studies investigating tense/time reference in aphasia should ensure that this grammatical/semantic category is not confounded by aspect.},
  booktitle = {Frontiers in Human Neuroscience. Conference Abstract: Academy of Aphasia, 55th Annual Meeting, Baltimore, United States, 5 Nov - 7 Nov, 2017},
  author = {Fyndanis, Valantis and Themistocleous, Charalambos and Christidou, Paraskevi},
  year = {2017},
}

% Conference paper (IJCNLP 2017); the abstract continues on the following line of the file.
@inProceedings{nietopina-johansson-2017-training-261938,
  title = {Training Word Sense Embeddings With Lexicon-based Regularization},
  abstract = {We propose to improve word sense embeddings by enriching an automatic corpus-based method with lexicographic data. Information from a lexicon is introduced into the learning algorithm’s objective function through a regularizer. The incorporation of lexicographic data yields embeddings that are able to reflect expert-defined word senses, while retaining the robustness, high quality, and coverage of automatic corpus-based methods. These properties are observed in a manual inspection of the semantic clusters that different degrees of regularizer strength create in the vector space. Moreover, we evaluate the sense embeddings in two downstream applications: word sense disambiguation and semantic frame prediction, where they outperform simpler approaches.
Our results show that a corpus-based model balanced with lexicographic data learns better representations and improve their performance in downstream tasks},
  booktitle = {Proceedings of the Eighth International Joint Conference on Natural Language Processing (Volume 1: Long Papers), Taipei, Taiwan, November 27 – December 1, 2017},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2017},
  publisher = {Asian Federation of Natural Language Processing},
  ISBN = {978-1-948087-00-1},
}

% Journal article; only a volume is recorded for it (no issue or page range).
@article{grohmann-etal-2017-acquiring-252175,
  title = {Acquiring Clitic Placement in Bilectal Settings: Interactions between Social Factors},
  abstract = {This paper examines the development of object clitic placement by children acquiring Cypriot Greek. Greek-speaking Cyprus is sociolinguistically characterized by diglossia between two varieties of Greek, the local Cypriot Greek and the official Standard Modern Greek. Arguably as a result of this situation, clitics may be placed post- (enclisis) or preverbally (proclisis) in the same syntactic environment; while the former is a property of Cypriot Greek, the latter is typically considered an effect of the standard language. The following issues are investigated here: (a) how such bilectal speakers distinguish between the two Greek varieties with respect to clitic placement; (b) how the acquisition of clitics develops over time; (c) how, and which, sociolinguistic factors determine clitic placement; and (d) how schooling may affect clitic placement. To address (a)–(d), a sentence completion task was used to elicit clitic productions, administered to 431 children around Cyprus ranging from 2;8 to 8;11. The C5.0 machine-learning algorithm was employed to model the interaction of (socio-)linguistic factors on the development of clitic placement.
The model shows that speakers acquire the relevant features very early, yet compartmentalization of form and function according to style emerges only as they engage in the larger speech community. In addition, the effects of sociolinguistic factors on clitic placement appear gradually.}, journal = {Frontiers in Communication}, author = {Grohmann, Kleanthes and Papadopoulou, Elena and Themistocleous, Charalambos}, year = {2017}, volume = {2}, } @book{lindh-2017-forensic-261214, title = {Forensic comparison of voices, speech and speakers : tools and methods in forensic phonetics}, abstract = {This thesis has three main objectives. The first objective (A) includes Study I, which investigates the parameter fundamental frequency (F0) and its robustness in different acoustic contexts by using different measures. The outcome concludes that using the alternative baseline as a measure will diminish the effect of low-quality recordings or varying speaking liveliness. However, both creaky voice and raised vocal effort induce intra-variation problems that are yet to be solved. The second objective (B) includes study II, III and IV. Study II investigates the differences between the results from an ear witness line-up experiment and the pairwise perceptual judgments of voice similarity performed by a large group of listeners. The study shows that humans seem to be much more focused on similarities of speech style than features connected to voice quality, even when recordings are played backwards. Study III investigates the differences between an automatic voice comparison system and humans’ perceptual judgments of voice similarity. The experiments’ results show that it is possible to see a correlation between how speakers were judged as more or less different using multidimensional scaling of similarity ranks compared to both the automatic system and the listeners. 
However, there are also differences due to the fact that human listeners include information about speech style and have difficulties weighting the parameters, i.e. ignoring them when they are contradictory. Study IV successfully investigates a new functional method for how to convert the perceptual similarity judgments made by humans and then compare those to the automatic system results within the likelihood ratio framework. It was discovered that the automatic system outperformed the naïve human listeners in this task (using a very small dataset). The third objective (C) includes study V. Study V investigates several statistical modelling techniques to calculate relevant likelihood ratios using simulations based on existing reference data in an authentic forensic case of a disputed utterance. The study presents several problems with modelling small datasets and develops methods to take into account the lack of data within the likelihood ratio framework. In summary, the thesis contains a larger historical background to forensic speaker comparison to guide the reader into the current research situation within forensic phonetics. The work further seeks to build a bridge between forensic phonetics and automatic voice recognition. Practical casework implications have been considered throughout the work on the basis of own experience as a forensic caseworker and through collaborative interaction with other parties working in the field, both in research and in forensic practice and law enforcement. 
Since 2005, the author has been involved in over 400 forensic cases and given testimony in several countries.}, author = {Lindh, Jonas}, year = {2017}, publisher = {Department of Philosophy, Linguistics, and Theory of Science, University of Gothenburg}, address = {Gothenburg}, ISBN = {978-91-629-0141-7}, } @article{themistocleous-2017-effects-259668, title = {Effects of Two Linguistically Proximal Varieties on the Spectral and Coarticulatory Properties of Fricatives: Evidence from Athenian Greek and Cypriot Greek}, abstract = {Several studies have explored the acoustic structure of fricatives, yet there has been very little acoustic research on the effects of dialects on the production of fricatives. This article investigates the effects of two linguistically proximal Modern Greek dialects, Athenian Greek and Cypriot Greek on the temporal, spectral, and coarticulatory properties of fricatives and aims to determine the acoustic properties that convey information about these two dialects. Productions of voiced and voiceless labiodental, dental, alveolar, palatal, and velar fricatives were extracted from a speaking task from typically speaking female adult speakers (25 Cypriot Greek and 20 Athenian Greek speakers). Measures were made of spectral properties, using a spectral moments analysis. The formants of the following vowel were measured and second degree polynomials of the formant contours were calculated. The findings showed that Athenian Greek and Cypriot Greek fricatives differ in all spectral properties across all places of articulation. Also, the co-articulatory effects of fricatives on following vowel were different depending on the dialect. Duration, spectral moments, and the starting frequencies of F1, F2, F3, and F4 contributed the most to the classification of dialect. 
These findings provide a solid evidence base for the manifestation of dialectal information in the acoustic structure of fricatives.},
  journal = {Frontiers in Psychology},
  author = {Themistocleous, Charalambos},
  year = {2017},
  volume = {8},
  number = {1945},
  pages = {1--19},
}

% Whole proceedings volume: typed as proceedings, with the responsible people
% recorded as editors instead of authors. The citation key is unchanged.
@proceedings{bouma-adesam-2017-proceedings-254435,
  title = {Proceedings of the NoDaLiDa 2017 Workshop on Processing Historical Language},
  editor = {Bouma, Gerlof and Adesam, Yvonne},
  year = {2017},
  publisher = {Linköping University Electronic Press, Linköpings universitet},
  address = {Linköping},
  ISBN = {978-91-7685-503-4},
}

% Book review published in a journal-like venue; no volume or pages recorded.
@article{alfter-agebjorn-2017-review-253359,
  title = {Review of Developing, Modelling and Assessing Second Languages},
  journal = {Linguistlist},
  author = {Alfter, David and Agebjörn, Anders},
  year = {2017},
}

% Workshop paper; the abstract continues on the following line of the file.
@inProceedings{nietopina-johansson-2018-automatically-270261,
  title = {Automatically Linking Lexical Resources with Word Sense Embedding Models},
  abstract = {Automatically learnt word sense embeddings are developed as an attempt to refine the capabilities of coarse word embeddings. The word sense representations obtained this way are, however, sensitive to underlying corpora and parameterizations, and they might be difficult to relate to word senses as formally defined by linguists. We propose to tackle this problem by devising a mechanism to establish links between word sense embeddings and lexical resources created by experts.
We evaluate the applicability of these links in a task to retrieve instances of Swedish word senses not present in the lexicon.},
  booktitle = {The Third Workshop on Semantic Deep Learning (SemDeep-3), August 20th, 2018, Santa Fe, New Mexico, USA},
  editor = {Espinosa Anke, Luis and Declerck, Thierry and Gromann, Dagmar},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2018},
  ISBN = {978-1-948087-56-8},
}

% Technical report. The issuing body is given as institution (the field the
% techreport type requires); publisher is kept for tools that read it.
@techreport{bjorklund-etal-2018-erfarenhetsutbyte-331777,
  title = {Kan erfarenhetsutbyte med andra med samma funktionsnedsättning leda till förbättrad kommunikation hos vuxna med hörselnedsättning?},
  author = {Björklund, Kerstin and Grindborg, Kristin and Lundholm Fors, Kristina and Malmberg, Milijana and Tovetjärn, Margareta and Wickman, Jenny and Öhman, Anna-Karin},
  year = {2018},
  institution = {Föreningen Sveriges HörselChefer},
  publisher = {Föreningen Sveriges HörselChefer},
  ISBN = {978-91-639-9120-2},
}

% Conference paper; the abstract continues on the following line of the file.
@inProceedings{volodina-etal-2018-interoperability-275365,
  title = {Interoperability of Second Language Resources and Tools},
  abstract = {Language learning based on learner corpora is an increasingly active area of research in CLARIN centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized.
A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  year = {2018},
}

% Conference paper without an abstract.
@inProceedings{adesam-etal-2018-exploring-273835,
  title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist},
  booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018},
  author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina},
  year = {2018},
}

% Shared-task system paper; the team name in the title is written without an inner space.
@inProceedings{alfter-pilan-2018-complex-276407,
  title = {SB@GU at the Complex Word Identification 2018 Shared Task},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author = {Alfter, David and Pilán, Ildikó},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  address = {Stroudsburg, PA, USA},
  ISBN = {978-1-948087-11-7},
}

% Workshop paper; the abstract continues on the following line of the file.
@inProceedings{malm-etal-2018-uneek-267351,
  title = {Uneek: a Web Tool for Comparative Analysis of Annotated Texts},
  abstract = {In this paper, we present Uneek, a web based linguistic tool that performs set operations on raw or annotated texts. The tool may be used for automatic distributional analysis, and for disambiguating polysemy with a method that we refer to as semi-automatic uniqueness differentiation (SUDi). Uneek outputs the intersection and differences between their listed attributes, e.g. POS, dependencies, word forms, frame elements. This makes it an ideal supplement to methods for lumping or splitting in frame development processes. In order to make some of Uneek’s functions more clear, we employ SUDi on a small data set containing the polysemous verb "bake".
As of now, Uneek may only run two files at a time, but there are plans to develop the tool so that it may simultaneously operate on multiple files. Finally, we relate the developmental plans for added functionality, to how such functions may support FrameNet work in the future.},
  booktitle = {Proceedings of the LREC 2018 Workshop International FrameNet Workshop 2018: Multilingual Framenets and Constructicons, 7-12 May 2018, Miyazaki (Japan)},
  editor = {Torrent, Tiago Timponi and Borin, Lars and Baker, Collin F.},
  author = {Malm, Per and Ahlberg, Malin and Rosén, Dan},
  year = {2018},
  ISBN = {979-10-95546-04-7},
}

% Journal article; the author list continues on the following line of the file.
@article{eckhoff-etal-2018-proiel-265108,
  title = {The PROIEL treebank family: a standard for early attestations of Indo-European languages},
  abstract = {This article describes a family of dependency treebanks of early attestations of Indo-European languages originating in the parallel treebank built by the members of the project pragmatic resources in old Indo-European languages. The treebanks all share a set of open-source software tools, including a web annotation interface, and a set of annotation schemes and guidelines developed especially for the project languages. The treebanks use an enriched dependency grammar scheme complemented by detailed morphological tags, which have proved sufficient to give detailed descriptions of these richly inflected languages, and which have been easy to adapt to new languages. We describe the tools and annotation schemes and discuss some challenges posed by the various languages that have been annotated. We also discuss problems with tokenisation, sentence division and lemmatisation, commonly encountered in ancient and mediaeval texts, and challenges associated with low levels of standardisation and ongoing morphological and syntactic change.},
  journal = {Language Resources and Evaluation},
  author = {Eckhoff, H. and Bech, K. and Bouma, Gerlof and Eide, K. and Haug, D. and Haugen, O. E.
and Jøhndal, M.},
  year = {2018},
  volume = {52},
  number = {1},
  pages = {29--65},
}

% Conference paper in an edited proceedings volume; editors are given in
% "Last, First" form with no trailing punctuation.
@inProceedings{lundholmfors-etal-2018-automated-263790,
  title = {Automated Syntactic Analysis of Language Abilities in Persons with Mild and Subjective Cognitive Impairment},
  abstract = {In this work we analyze the syntactic complexity of transcribed picture descriptions using a variety of automated syntactic features, and investigate the features’ predictive power in classifying narratives from people with subjective and mild cognitive impairment and healthy controls. Our results indicate that while there are no statistically significant differences, syntactic features can still be moderately successful at distinguishing the participant groups when used in a machine learning framework.},
  booktitle = {Building continents of knowledge in oceans of data : the future of co-created eHealth: proceedings of MIE2018, 24-26 April 2018, Gothenburg, Sweden},
  editor = {Ugon, Adrien and Karlsson, Daniel and Klein, Gunnar O. and Moen, Anne},
  author = {Lundholm Fors, Kristina and Fraser, Kathleen and Kokkinakis, Dimitrios},
  year = {2018},
  publisher = {IOS Press},
  address = {Amsterdam},
  ISBN = {978-1-61499-851-8},
}

% Conference paper; the abstract continues on the following line of the file.
@inProceedings{rouces-etal-2018-defining-264721,
  title = {Defining a gold standard for a Swedish sentiment lexicon: Towards higher-yield text mining in the digital humanities},
  abstract = {There is an increasing demand for multilingual sentiment analysis, and most work on sentiment lexicons is still carried out based on English lexicons like WordNet. In addition, many of the non-English sentiment lexicons that do exist have been compiled by (machine) translation from English resources, thereby arguably obscuring possible language-specific characteristics of sentiment-loaded vocabulary.
In this paper we describe the creation from scratch of a gold standard for the sentiment annotation of Swedish terms as a first step towards the creation of a full-fledged sentiment lexicon for Swedish.},
  booktitle = {Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference, Helsinki, Finland, March 7-9, 2018},
  editor = {Mäkelä, Eetu and Tolonen, Mikko and Tuominen, Jouni},
  series = {CEUR Workshop Proceedings},
  volume = {2084},
  author = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina and Rødven-Eide, Stian},
  year = {2018},
  publisher = {University of Helsinki, Faculty of Arts},
  address = {Helsinki},
}

% Chapter in an edited book; page range and book editors are recorded.
@incollection{borin-etal-2018-linguistics-269084,
  title = {Linguistics vs. language technology in constructicon building and use},
  abstract = {In this chapter, we describe the close interaction of linguists and language technologists in the Swedish constructicon project. This kind of collaboration is not so common today, because of the way that language technology has developed in recent decades, but in our case the collaboration has been very successful, and constituted a genuine instance of cross-fertilization, where an evolving language technology infrastructure and a computational lexical macroresource described in the chapter has formed an integral part of the Swedish constructicon development environment, while at the same time the structured linguistic knowledge described in the constructicon has informed the language technology making up the infrastructure.},
  booktitle = {Constructicography: Constructicon development across languages},
  editor = {Benjamin Lyngfelt and Lars Borin and Kyoko Ohara and Tiago Timponi Torrent},
  author = {Borin, Lars and Dannélls, Dana and Gruzitis, Normunds},
  year = {2018},
  publisher = {John Benjamins},
  address = {Amsterdam},
  ISBN = {9789027263865},
  pages = {229--253},
}

% Conference paper; the abstract continues on the following line of the file.
@inProceedings{tahmasebi-2018-study-264722,
  title = {A Study on Word2Vec on a Historical Swedish Newspaper Corpus},
  abstract = {Detecting word sense changes can be of great
interest in the field of digital humanities. Thus far, most investigations and automatic methods have been developed and carried out on English text and most recent methods make use of word embeddings. This paper presents a study on using Word2Vec, a neural word embedding method, on a Swedish historical newspaper collection. Our study includes a set of 11 words and our focus is the quality and stability of the word vectors over time. We investigate if a word embedding method like Word2Vec can be effectively used on texts where the volume and quality is limited.},
  booktitle = {Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference, Helsinki, Finland, March 7-9, 2018},
  editor = {Mäkelä, Eetu and Tolonen, Mikko and Tuominen, Jouni},
  series = {CEUR Workshop Proceedings},
  volume = {2084},
  author = {Tahmasebi, Nina},
  year = {2018},
  publisher = {University of Helsinki, Faculty of Arts},
  address = {Helsinki},
}

% Conference paper; the title is stored without terminal punctuation so the
% bibliography style can add its own. The abstract continues on the next line.
@inProceedings{rosen-etal-2018-error-275363,
  title = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts.
Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year = {2018},
}

% Conference paper without an abstract; fields are listed author-first.
@inProceedings{jatowt-etal-2018-every-272054,
  author    = {Jatowt, Adam and Campos, Ricardo and Bhowmick, Sourav S. and Tahmasebi, Nina and Doucet, Antoine},
  title     = {Every Word Has Its History: Interactive Exploration and Visualization of Word Sense Evolution},
  booktitle = {CIKM '18 Proceedings of the 27th ACM International Conference on Information and Knowledge Management, October 22 - 26, 2018, Torino, Italy},
  year      = {2018},
  publisher = {ACM},
  address   = {New York, NY, USA},
  ISBN      = {978-1-4503-6014-2},
}

% Conference paper; the abstract continues on the following line of the file.
@inProceedings{rouces-etal-2018-sensaldo-264720,
  title = {SenSALDO: Creating a Sentiment Lexicon for Swedish},
  abstract = {The natural language processing subfield known as sentiment analysis or opinion mining has seen an explosive expansion over the last decade or so, and sentiment analysis has become a standard item in the NLP toolbox. Still, many theoretical and methodological questions remain unanswered and resource gaps unfilled. Most work on automated sentiment analysis has been done on English and a few other languages; for most written languages of the world, this tool is not available. This paper describes the development of an extensive sentiment lexicon for written (standard) Swedish. We investigate different methods for developing a sentiment lexicon for Swedish. We use an existing gold standard dataset for training and testing. For each word sense from the SALDO Swedish lexicon, we assign a real value sentiment score in the range [-1,1] and produce a sentiment label.
We implement and evaluate three methods: a graph-based method that iterates over the SALDO structure, a method based on random paths over the SALDO structure and a corpus-driven method based on word embeddings. The resulting sense-disambiguated sentiment lexicon (SenSALDO) is an open source resource and freely available from Språkbanken, The Swedish Language Bank at the University of Gothenburg.},
  booktitle = {LREC 2018, Eleventh International Conference on Language Resources and Evaluation, 7-12 May 2018, Miyazaki (Japan)},
  author = {Rouces, Jacobo and Tahmasebi, Nina and Borin, Lars and Rødven-Eide, Stian},
  year = {2018},
  publisher = {ELRA},
  address = {Paris},
  ISBN = {979-10-95546-00-9},
}

% Conference paper; the abstract is closed at the start of the following line of the file.
@inProceedings{wiren-etal-2018-svala-285624,
  title = {SVALA: Annotation of Second-Language Learner Text Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract = {Annotation of second-language learner text is a cumbersome manual task which in turn requires interpretation to postulate the intended meaning of the learner’s language. This paper describes SVALA, a tool which separates the logical steps in this process while providing rich visual support for each of them. The first step is to pseudonymize the learner text to fulfil the legal and ethical requirements for a distributable learner corpus. The second step is to correct the text, which is carried out in the simplest possible way by text editing. During the editing, SVALA automatically maintains a parallel corpus with alignments between words in the learner source text and corrected text, while the annotator may repair inconsistent word alignments. Finally, the actual labelling of the corrections (the postulated errors) is performed. We describe the objectives, design and workflow of SVALA, and our plans for further development.
},
  booktitle = {Selected papers from the CLARIN Annual Conference 2018, Pisa, 8-10 October 2018},
  editor = {Inguna Skadina and Maria Eskevich},
  author = {Wirén, Mats and Matsson, Arild and Rosén, Dan and Volodina, Elena},
  year = {2018},
  publisher = {Linköping University Electronic Press, Linköpings universitet},
  address = {Linköping},
  ISBN = {978-91-7685-034-3},
}

% Conference abstract; the abstract field is closed at the start of the following line of the file.
@inProceedings{themistocleous-kokkinakis-2018-themis-265112,
  title = {THEMIS-SV: Automatic classification of language disorders from speech signals},
  abstract = {Background and Aims: Brain injuries resulting from stroke can affect the production of speech resulting in different types of language impairments, such as aphasia. Studying these productions manually is an extremely cumbersome and time consuming process. The aim of this paper is to present THEMIS-SV: a system that enables the automatic transcription of speech signals and the segmentation of vowels and consonants in Swedish. Method: The input of the system are recordings of speech. The system processes the recordings and returns an output with three tiers: the utterance tier, the word tier, and the vowels/consonants tier. Results: The output of the system is a fast and reliable transcription and segmentation of speech, which is very close to transcriptions and segmentations performed manually. The automatic segmentation of speech enables targeted acoustic measurements, such as measurements of consonant spectra, formant frequencies of vowels, fundamental frequency, pauses, speech rate, etc. and other acoustic measurements that have been known to differentiate between the different types of language disorders. Conclusion: The method proposed here can be employed for the analysis of speech of individuals with post-stroke aphasia and other language disorders and constitutes a promising step towards a fully automated differential diagnostic tool for language disorders.
},
  booktitle = {Abstracts of the 4th European Stroke Organisation Conference (ESOC 2018). Gothenburg, Sweden, 16-18 May, 2018},
  author = {Themistocleous, Charalambos and Kokkinakis, Dimitrios},
  year = {2018},
}

% Conference abstract. Only unambiguous fusing errors in the abstract are repaired
% (a missing closing parenthesis and missing sentence breaks); wording that is
% garbled in the source record is left as delivered. The abstract continues on
% the following line of the file.
@inProceedings{edstrom-etal-2018-ageism-267250,
  title = {Ageism and Swedish news media},
  abstract = {Ageism can be seen as a “social disease”, a casual or systematic prejudice, stereotyping and discriminating against individuals or groups on the basis of their age. This is an area of growing concern, particularly the role of mainstream media in relationship to ageism. A valuable and important step is to understand the presence of ageing and older age how different types of online news media. The main objective of this pilot work is to test, collate and produce evidence from Swedish news media representations of older ages and ageing. METHOD(S) Two pilot studies/experiments; first names and their frequencies of the carriers’ age according to Statistics Sweden (SCB) and their presence in 39 online news between 2015 and 2018. ( 4, 7 millions texts). using general pattern matching techniques with regular expressions and applying them to 13 issues (1994, 2001-13) of Göteborgs-Posten (Swedish news corpora). Definition: Older persons ≥60 years. (25 % of the population in Sweden is over 60 years). RESULTS AND CONCLUSIONS: Clear and consistent differences of how various age spans are represented in the news. 20-50 year olds is highly over represented compared with the Swedish population, while 0-24 and people over 54 are underrepresented, especially women. Pattern matching exhibits similar characteristics with the exception of obituaries where the elderly mentions are much more frequent. Our pilot studies confirm the introspective view of underrepresentation of old age and older people in or trends can be revealed within a larger time span and synchronic media sources.
More studies are required and in the near future we plan to improve, scale and apply our methodology on both synchronic and diachronic data using e.g. available text corpora and try to get a solid perspective on whether any differences or trends can be revealed within a larger time span},
  booktitle = {24th Nordic Congress of Gerontology (NKG). Oslo, Norway: 2-4 May 2018},
  author = {Edström, Maria and Kokkinakis, Dimitrios and Berggren, Max},
  year = {2018},
}

% Conference abstract; the abstract continues on the following lines of the file.
@inProceedings{denouden-etal-2018-comparison-268339,
  title = {Comparison of Automated Methods for Vowel Segmentation and Extraction of Acoustic Variables},
  abstract = {Introduction: Primary Progressive Aphasia (PPA) is a neurodegenerative syndrome in which linguistic abilities become gradually impaired. There are three primary variants of PPA: the non-fluent agrammatic PPA, the fluent type semantic PPA, and the logopenic PPA, which is also considered an atypical form of Alzheimer’s disease (Mesulam et al., 1982; Gorno-Tempini et al., 2011). Along with the three main variants, a fourth variant has been proposed, a non-fluent apraxia of speech (AOS), though this is currently the subject of an open debate (e.g., Duffy et al., 2017; Henry et al., 2013). According to sophisticated criteria established a few years ago, PPA subtyping for a given patient presented in clinic requires clinical, neuropsychological, and imaging information (Gorno-Tempini et al., 2011). Nevertheless, quantifying the decline of linguistic abilities and subtyping the variants of PPA manually is both hard and laborious, so there is great demand for algorithms that subtype a given patient automatically. Picture description samples of connected speech and random forests techniques have been used for this purpose (de Aguiar et al., 2017; Wilson et al., 2010, Fraser et al. 2013, 2014). In the present study, we compared existing models and we propose a new one.
Aims: In this study, we provide an automated classification model of PPA variants trained on known morphological and acoustic predictors and on predictors related to the clinical and linguistic profile of individuals with PPA (e.g., Mack et al., 2015; Gorno-Tempini et al., 2011; Wilson et al., 2010). Method: Speech materials for this study come from the Transcranial Direct Current Stimulation for Primary Progressive Aphasia study at Johns Hopkins University. Twenty-six individuals with PPA (Mean(SD) age = 68.6 (7.8) years, Mean(SD) education = 16.1 (2.9) years) participated in this study. PPA participants were diagnosed based on the established consensus criteria (Gorno-Tempini et al., 2011), i.e., imaging, clinical, and neuropsychological examination by trained neurologists. Individuals with PPA included non-fluent with AOS (N=5), non fluent without AOS (N=7), logopenic (N=8), and semantic (N=6) variants. Recordings of the Cookie Theft picture description from the Boston Diagnostic Aphasia Examination (BDAE) were computationally analyzed. All speech productions were automatically transcribed and segmented using an end-to-end speech-to-transcription platform. From the speech signals, we measured morphological and acoustic predictors, including vowel formants F1 ... F3, measured at 15%, 50%, and 75% of vowel’s duration, vowel duration, fundamental frequency, and pause duration. The analysis and the statistics were conducted using Python and R programming languages (R Core Team, 2017; Rossum, 1995). Three different machine learning algorithms: C5.0 decision trees, Classification and Regression Trees (CART) and random forests were trained on the predictors (Breiman, 2001; Quinlan, 1993; Hastie et al., 2009). All models were trained on the 80% of the speakers (training set), with 3-fold cross-validation. All predictor variables were centered and scaled. C5.0 was trained with winnowing and without winnowing. 
(Winnowing facilitates the automatic pre-selection of the predictors that are used in the decision tree.) After the training we evaluated the trained models on the unknown dataset, namely the 20% of the speakers (evaluation set). Results: C5.0 provided 86% (95% CI[81, 88], kappa = 0.76) and Random Forests 85% (95% CI[81, 88], kappa = 0.76) classification accuracy on the test data; CART provided the lowest overall classification accuracy. Overall, C5.0 outperformed both the random forests and CART, with high classification accuracy on unknown data. Non-fluent PPA with AOS was correctly predicted by both C5.0 and random forests. Discussion: The C5.0 classification model provides support for the known predictors employed in the literature. Also, it provides some objective ways to distinguish the presence of AOS in PPA and corroborate research on classification of AOS using acoustic properties especially those related to vowel production (Den Ouden et al. 2017). However, given the low number of participants employed in this study, further research is required, with a larger number of participants. Nevertheless, the proposed methods employed here constitute a promising step towards a computational differential diagnostic tool of PPA that is easy to use, quick and accurate. }, booktitle = {Clinical Aphasiology Conference, CAC 2018, Austin, Texas USA.}, author = {den Ouden, Dirk B. and Hutchinson, Angelica and Tsapkini, Kyrana and Themistocleous, Charalambos}, year = {2018}, } @inProceedings{rouces-etal-2018-generating-264719, title = {Generating a Gold Standard for a Swedish Sentiment Lexicon}, abstract = {We create a gold standard for sentiment annotation of Swedish terms, using the freely available SALDO lexicon and the Gigaword corpus. For this purpose, we employ a multi-stage approach combining corpus-based frequency sampling, direct score annotation and Best-Worst Scaling. 
In addition to obtaining a gold standard, we analyze the data from our process and we draw conclusions about the optimal sentiment model.}, booktitle = {LREC 2018, Eleventh International Conference on Language Resources and Evaluation, May 7-12, 2018, Miyazaki (Japan)}, author = {Rouces, Jacobo and Tahmasebi, Nina and Borin, Lars and Rødven-Eide, Stian}, year = {2018}, publisher = {ELRA}, address = {Miyazaki}, ISBN = {979-10-95546-00-9}, } @misc{kokkinakis-2018-resources-265118, title = {Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments (RaPID-2)}, abstract = {Proceedings of the second RaPID: "Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments". An LREC workshop. 8th of May 2018, Miyazaki, Japan}, author = {Kokkinakis, Dimitrios}, year = {2018}, ISBN = {979-10-95546-26-9}, } @inProceedings{kokkinakis-etal-2018-textforskning-265113, title = {Kan textforskning bidra till tidigare och säkrare demensdiagnostik?}, abstract = {Tidigare forskning har visat att subtila språkstörningar kan finnas vid de tidigaste förstadierna till demens, flera år innan en klinisk diagnos kan ställas. Inom ramen för projektet ”Språkliga och extra-lingvistiska parametrar för tidig upptäckt av kognitiv svikt” (finansierat av Riksbankens Jubileumsutlysning, 2016-19) undersöker vi med hjälp av språkteknologi och språkanalysstudier hur dessa språkstörningar yttrar sig. Kan språkteknologi användas för att upptäcka dessa tidiga språkrelaterade symtom och därmed bidra med nyanserad, komplementär och användbar kunskap? Kan användning av språkteknologi särskilja personer med de allra tidigaste kognitiva avvikelserna från personer med mer godartad, åldersrelaterad kognitiv svikt? Vilka språkliga förmågor drabbas? Hur yttrar sig dessa förändringar och vilka slags empiriska material finns att tillgå? 
Dessa är några av de frågor vi söker svar på. Vi gör inspelningar som vi analyserar för att kunna ta fram ny kunskap om subtila språkliga kännetecken som kan föregå demensutveckling. Denna kunskap kan användas för att eventuellt kunna förutsäga vilka individer som befinner sig i riskzonen för att utveckla demens, och kan vara användbar som komplementerande beslutsunderlag till domänexperter. Vi utvinner, analyserar och undersöker om det finns samband mellan olika språkrelaterade parametrar från spontan talinteraktion, transkriptioner men även ögonrörelser och neuropsykologiska tester från personer med subjektiv eller lindrig kognitiv nedsättning och friska kontrollpersoner. Många gånger är det svårt att avgöra huruvida lindriga kognitiva symtom är en del av det normala åldrandet eller början på en neurodegenerativ process. Vi förväntar oss inte heller att varje enskild person med kognitiv nedsättning kommer att uttrycka sig eller läsa på samma sätt utan snarare att dessa personer tidigt i sjukdomsförloppet kommer att börja uppvisa olika slags avvikande läsmönster, eller göra fonologiska, lexikala, syntaktiska eller semantiska fel. I studien utvecklar vi verktyg för att automatiskt hitta dessa avvikelser, och målet är att detta sedan ska kunna användas som komplement till tidig diagnostik samt som prognostiskt eller screeningverktyg. Deltagarna i vår studie har rekryterats från en pågående longitudinell studie, ”Demens i Tidigt Skede”, (eng. ”The Gothenburg MCI study”) på Minnesmottagningen i Göteborg, och vårt projekt har godkänts av den lokala etiknämnden. Alla deltagare i studien (kontrollgruppen [HC], personer med subjektiv kognitiv nedsättning [SCI] och personer med mild kognitiv nedsättning [MCI]) har genomgått baslinjeundersökning och gett informerat skriftligt samtycke (demografisk information finns i tabell 1). Vårt projekt är f.n. pågående och vi kommer presentera resultat baserade på inspelningstillfälle nr ett (aug. 2016-mars 2017). 
En ny inspelningsomgång, med samma deltagare, började i februari 2018 och förväntas vara avslutat i december 2018. Under presentationen kommer vi ge exempel på olika tal-, text- och ögonrörelseanalyser vi har genomfört och diskutera metodval och resultat baserade på studiens första fas. Vi kommer vidare ge en kort inblick i den nya, pågående inspelningsomgången och de nya testmoment vi använder. Vi vill med vårt arbete visa hur språkteknologisk analys kan bidra till att utöka vår kunskap inom området så att den kan vara användbar för tidig diagnostik och optimal omvårdnad. Enligt Socialstyrelsen (2017) finns det i Sverige över 160 000 personer med någon demenssjukdom. Våra resultat kan ha en betydelse för vårdpersonal som snabbare vill diagnostisera och identifiera individer med olika former av kognitiv funktionsnedsättning innan allvarliga symtom blir påtagliga. Utvecklingsmöjligheterna är många: nya eller förbättrade kognitiva screeningtester som skulle kunna användas inom primär- och specialistvården, samt utveckling och tillämpning av insatser som kan påverka beteendemönster och träna upp individens kommunikativa förmåga, kan på sikt leda till positiva konsekvenser som minskade vårdköer samt effektivare behandling avseende kostnader och behandlingsutfall.}, booktitle = {Forum för textforskning 13 , Lund 7 – 8 juni 2018}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Eckerström, Marie and Themistocleous, Charalambos}, year = {2018}, } @inProceedings{kokkinakis-etal-2018-swedish-262851, title = {A Swedish Cookie-Theft Corpus}, abstract = {Language disturbances can be a diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, at earlier stages, and connected speech analysis provides a non-invasive and easy-to-assess measure for determining aspects of the severity of language impairment. 
In this paper we focus on the development of a corpus consisting of audio recordings of picture descriptions of the Cookie-theft, produced by Swedish speakers, and accompanying transcriptions. The speech elicitation procedure provides an established method of obtaining highly constrained samples of connected speech that can allow us to study the intricate interactions between various linguistic levels and cognition. We chose the Cookie-theft picture since it is a standardized test that has been used in various studies in the past, and therefore comparisons can be made based on previous results. This type of picture description task might be useful for detecting subtle language deficits in patients with subjective and mild cognitive impairment. The resulting corpus is a new, rich and multi-faceted resource for the investigation of linguistic characteristics of connected speech and a unique data set that provides a rich resource for (future) research and experimentation in many areas, and of language impairment in particular. The information in the corpus can also be combined and correlated with other collected data about the speakers, such as neuropsychological tests, imaging and brain physiology markers and cerebrospinal fluid markers.}, booktitle = {LREC 2018, 11th edition of the Language Resources and Evaluation Conference, 7-12 May 2018, Miyazaki (Japan) / Editors: Nicoletta Calzolari (Conference chair), Khalid Choukri, Christopher Cieri, Thierry Declerck, Sara Goggi, Koiti Hasida, Hitoshi Isahara, Bente Maegaard, Joseph Mariani, Hélène Mazo, Asuncion Moreno, Jan Odijk, Stelios Piperidis, Takenobu Tokunaga}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Nordlund, Arto}, year = {2018}, publisher = {European Language Resources Association}, ISBN = {979-10-95546-00-9}, } @inProceedings{lundholmfors-etal-2018-voice-264400, title = {Eye-voice span in adults with mild cognitive impairment (MCI) and healthy controls. 
}, abstract = {Objectives: This study is part of a larger project focused on developing new techniques for identification of early linguistic and extra-linguistic signs of cognitive impairment, with the overall goal of identifying dementia in the preclinical stage. In a previous study, we found that eye movements during reading can be used to distinguish between subjects with mild cognitive impairment (MCI) and healthy controls with up to 86% accuracy. In this study, we are investigating the process of reading aloud, by exploring the eye-voice span in subjects with and without cognitive impairment. The aim of the study is to identify differences in the reading processes and evaluate whether these differences can be used to discriminate between the two groups. Methods: The eye-voice span is a measurement of the temporal and spatial organization between the eye and the voice, and is affected by for example working memory and automaticity, but also by the familiarity and length of words. In previous work, differences between eye movements when reading in healthy controls and subjects with cognitive impairments have been identified, and it has been shown that subjects with Alzheimer’s disease show impairments when reading aloud, specifically with regards to speech and articulation rate. Results: We present a quantitative and qualitative analysis of the reading process in the subjects, focusing both on general measures of eye-voice span, but also specifically on instances of hesitation and mistakes in the speech, and the correlated eye movements. Conclusions/Take home message: Early detection of dementia is important for a number of reasons, such as giving the person access to interventions and medications, and allowing the individual and families time to prepare. 
By expanding the knowledge about reading processes in subjects with MCI, we are adding to the potential of using reading analysis as an avenue of detecting early signs of dementia.}, booktitle = {Book of Abstracts 10th CPLOL Congress 10-12 May 2018, Cascais, Portugal / editor : Trinite, Baiba }, author = {Lundholm Fors, Kristina and Fraser, Kathleen and Kokkinakis, Dimitrios}, year = {2018}, } @inProceedings{dannells-olsson-2018-integrating-271181, title = {Integrating language resources in two OCR engines to improve processing of historical Swedish text.}, abstract = {We are aiming to address the difficulties that many History and Social Sciences researchers struggle with to bring in non-digitized text into language analysis workflows. In this paper we present the language resources and material we used for training two Optical Character Recognition engines for processing historical Swedish text written in Fraktur (blackletter). The trained models, resources and dictionaries are freely available and accessible through our web service, hosted at Språkbanken, to enable users and developers easy access for extraction of historical Swedish texts that are only available in images for further processing.}, booktitle = {CLARIN Annual Conference}, author = {Dannélls, Dana and Olsson, Leif-Jöran}, year = {2018}, } @inProceedings{lange-ljunglof-2018-putting-274013, title = {Putting Control into Language Learning}, abstract = {Controlled Natural Languages (CNLs) have many applications including document authoring, automatic reasoning on texts and reliable machine translation, but their application is not limited to these areas. We explore a new application area of CNLs, the use of CNLs in computer-assisted language learning.
In this paper we present a web application for language learning using CNLs as well as a detailed description of the properties of the family of CNLs it uses.}, booktitle = {CNL 2018, the 6th International Workshop on Controlled Natural Language, Maynooth, Co Kildare, 27-28th August 2018; published as volume 304 of Frontiers in Artificial Intelligence and Applications}, author = {Lange, Herbert and Ljunglöf, Peter}, year = {2018}, publisher = {IOS Press}, address = {Amsterdam}, ISBN = {978-1-61499-904-1}, } @inProceedings{alfter-volodina-2018-whole-275362, title = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions.}, abstract = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known. In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.}, booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018}, author = {Alfter, David and Volodina, Elena}, year = {2018}, } @inProceedings{pilan-volodina-2018-exploring-275366, title = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors.}, abstract = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose.
Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors. }, booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018.}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2018}, publisher = {Association for Computational Linguistics}, ISBN = {978-1-948087-61-2}, } @inProceedings{pilan-volodina-2018-investigating-275367, title = {Investigating the importance of linguistic complexity features across different datasets related to language learning.}, abstract = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.
}, booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018.}, author = {Pilán, Ildikó and Volodina, Elena}, year = {2018}, publisher = {Association for Computational Linguistics}, ISBN = {978-1-948087-62-9}, } @inProceedings{alfter-volodina-2018-towards-275368, title = {Towards Single Word Lexical Complexity Prediction.}, abstract = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.}, booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018}, author = {Alfter, David and Volodina, Elena}, year = {2018}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-948087-11-7}, } @inProceedings{volodina-etal-2018-annotation-275361, title = {Annotation of learner corpora: first SweLL insights.}, abstract = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation.
}, booktitle = {Proceedings of SLTC 2018, Stockholm, October 7-9, 2018}, author = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats}, year = {2018}, } @inProceedings{fraser-etal-2018-improving-264397, title = {Improving the Sensitivity and Specificity of MCI Screening with Linguistic Information.}, abstract = {The Mini-Mental State Exam (MMSE) is a screening tool for cognitive impairment. It has been extensively validated and is widely used, but has been criticized as not being effective in detecting mild cognitive impairment (MCI). In this study, we examine the utility of augmenting MMSE scores with automatically extracted linguistic information from a narrative speech task to better differentiate between individuals with MCI and healthy controls in a Swedish population. We find that with the addition of just four linguistic features, the F score (measuring a trade-off between sensitivity and specificity) is improved from 0.67 to 0.81 in logistic regression classification. These preliminary results suggest that the accuracy of traditional screening tools may be improved through the addition of computerized language analysis.}, booktitle = {Proceedings of the LREC workshop: Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments (RaPID-2). 8th of May 2018, Miyazaki, Japan / Dimitrios Kokkinakis (ed.)}, author = {Fraser, Kathleen and Lundholm Fors, Kristina and Eckerström, Marie and Themistocleous, Charalambos and Kokkinakis, Dimitrios}, year = {2018}, ISBN = {979-10-95546-26-9}, } @inProceedings{lange-ljunglof-2018-demonstrating-274016, title = {Demonstrating the MUSTE Language Learning Environment}, abstract = {We present a language learning application that relies on grammars to model the learning outcome. 
Based on this concept we can provide a powerful framework for language learning exercises with an intuitive user interface and a high reliability. Currently the application aims to augment existing language classes and support students by improving the learner attitude and the general learning outcome. Extensions beyond that scope are promising and likely to be added in the future.}, booktitle = {NLP4CALL 2018, the 7th Workshop on NLP for Computer Assisted Language Learning, Stockholm, 7th November 2018; published as issue 152 of Linköping Electronic Conference Proceedings}, author = {Lange, Herbert and Ljunglöf, Peter}, year = {2018}, publisher = {Linköping University Electronic Press, Linköpings universitet}, address = {Linköping}, ISBN = {978-91-7685-173-9}, } @inProceedings{virk-prasad-2018-towards-295336, title = {Towards Hindi/Urdu FrameNets via the Multilingual FrameNet.}, booktitle = {Proceedings of the LREC 2018 Workshop. International FrameNet Workshop 2018 : Multilingual Framenets and Constructicons, 12 May 2018 – Miyazaki, Japan / Edited by Tiago Timponi Torrent, Lars Borin and Collin F.
Baker }, author = {Virk, Shafqat and Prasad, K.V.S}, year = {2018}, publisher = {European Language Resources Association (ELRA).}, ISBN = {979-10-95546-00-9}, } @inProceedings{angelopoulou-etal-2018-pause-268338, title = {Pause patterns and speech errors in stroke patients with aphasia: cross-linguistic evidence from narrative speech.}, booktitle = {Clinical Aphasiology Conference, CAC 2018, Austin, Texas USA.}, author = {Angelopoulou, Georgia and Kiran, Swathi and Kasselimis, Dimitrios and Varkanitsa, Maria and Meier, Erin and Yue, Pan and Tsolakopoulos, Dimitrios and Themistocleous, Charalambos and Vassilopoulou, Sofia and Korompoki, Eleni and Tountopoulou, Argyro and Papageorgiou, Georgios and Goutsos, Dionysis and Evdokimidis, Ioannis and Potagas, Constantin}, year = {2018}, } @inProceedings{adesam-etal-2018-fsvreader-267311, title = {FSvReader – Exploring Old Swedish Cultural Heritage Texts}, abstract = {This paper describes FSvReader, a tool for easier access to Old Swedish (13th–16th century) texts. Through automatic fuzzy linking of words in a text to a dictionary describing the language of the time, the reader has direct access to dictionary pop-up definitions, in spite of the large amount of morphological and spelling variation. The linked dictionary entries can also be used for simple searches in the text, highlighting possible further instances of the same entry. }, booktitle = {CEUR Workshop Proceedings, vol. 2084. Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference Helsinki, Finland, March 7-9, 2018.
Edited by Eetu Mäkelä, Mikko Tolonen, Jouni Tuominen}, author = {Adesam, Yvonne and Ahlberg, Malin and Bouma, Gerlof}, year = {2018}, publisher = {University of Helsinki, Faculty of Arts}, address = {Helsinki}, } @inProceedings{adesam-etal-2018-koala-273841, title = {The Koala Part-of-Speech and Morphological Tagset for Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November, 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard}, year = {2018}, } @inProceedings{karsvall-borin-2018-sdhk-265603, title = {SDHK meets NER: Linking place names with medieval charters and historical maps}, booktitle = {CEUR Workshop Proceedings, vol. 2084. Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference Helsinki, Finland, March 7-9, 2018. Edited by Eetu Mäkelä, Mikko Tolonen, Jouni Tuominen}, author = {Karsvall, Olof and Borin, Lars}, year = {2018}, publisher = {University of Helsinki, Faculty of Arts}, address = {Helsinki}, } @article{themistocleous-etal-2018-identification-273026, title = {Identification of Mild Cognitive Impairment From Speech in Swedish Using Deep Sequential Neural Networks}, abstract = {While people with mild cognitive impairment (MCI) portray noticeably incipient memory difficulty in remembering events and situations along with problems in decision making, planning, and finding their way in familiar environments, detailed neuropsychological assessments also indicate deficits in language performance. To this day, there is no cure for dementia but early-stage treatment can delay the progression of MCI; thus, the development of valid tools for identifying early cognitive changes is of great importance. In this study, we provide an automated machine learning method, using Deep Neural Network Architectures, that aims to identify MCI. Speech materials were obtained using a reading task during evaluation sessions, as part of the Gothenburg MCI research study.
Measures of vowel duration, vowel formants (F1 to F5), and fundamental frequency were calculated from speech signals. To learn the acoustic characteristics associated with MCI vs. healthy controls, we have trained and evaluated ten Deep Neural Network Architectures and measured how accurately they can diagnose participants that are unknown to the model. We evaluated the models using two evaluation tasks: a 5-fold crossvalidation and by splitting the data into 90% training and 10% evaluation set. The findings suggest first, that the acoustic features provide significant information for the identification of MCI; second, the best Deep Neural Network Architectures can classify MCI and healthy controls with high classification accuracy (M = 83%); and third, the model has the potential to offer higher accuracy than 84% if trained with more data (cf., SD≈15%). The Deep Neural Network Architecture proposed here constitutes a method that contributes to the early diagnosis of cognitive decline, quantify the progression of the condition, and enable suitable therapeutics.}, journal = {Frontiers in Neurology}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2018}, volume = {9}, pages = {1--10}, } @incollection{borin-edlund-2018-language-269047, title = {Language technology and 3rd wave HCI: Towards phatic communication and situated interaction}, abstract = {In the field of language technology, researchers are starting to pay more attention to various interactional aspects of language – a development prompted by a confluence of factors, and one which applies equally to the processing of written and spoken language. Notably, the so-called ‘phatic’ aspects of linguistic communication are coming into focus in this work, where linguistic interaction is increasingly recognized as being fundamentally situated. 
This development resonates well with the concerns of third wave HCI, which involves a shift in focus from stating the requirements on HCI design primarily in terms of “context-free” information flow, to a view where it is recognized that HCI – just like interaction among humans – is indissolubly embedded in complex, shifting contexts. These – together with the different backgrounds and intentions of interaction participants – shape the interaction in ways which are not readily understandable in terms of rational information exchange, but which are nevertheless central aspects of the interaction, and which therefore must be taken into account in HCI design, including its linguistic aspects, forming the focus of this chapter.}, booktitle = {New Directions in Third Wave Human-Computer Interaction: Volume 1 - Technologies}, editor = {Michael Filimowicz and Veronika Tzankova.}, author = {Borin, Lars and Edlund, Jens}, year = {2018}, publisher = {Springer International Publishing}, address = {Cham}, ISBN = {978-3-319-73355-5}, pages = {251--264}, } @misc{torrent-etal-2018-proceedings-267405, title = {Proceedings of the LREC 2018 Workshop International FrameNet Workshop 2018: Multilingual Framenets and Constructicons. 12 May 2018 – Miyazaki, Japan}, abstract = {The International FrameNet Workshop 2018 brought together researchers in Frame Semantics and Construction Grammar, two areas which have traditionally been interrelated, but which have been developing somewhat independently in recent years. It is also addressed at language technology researchers working with language resources based on Frame Semantics or Construction Grammar. 
The workshop follows on from similar joint meetings in Berkeley, California in 2013 (IFNW 2013, sponsored by the Swedish FrameNet group) and in Juiz de Fora, Brazil in 2016 (IFNW 2016, sponsored by FrameNet Brasil), and will cover the rapidly unfolding developments in both areas and recent research on their interconnections.}, author = {Torrent, Tiago Timponi and Borin, Lars and Baker, Collin}, year = {2018}, publisher = {ELRA}, address = {Miyazaki}, ISBN = {979-10-95546-04-7}, } @inProceedings{neofytou-etal-2018-understanding-271916, title = {Understanding and classifying the different variants of Primary Progressive Aphasia based on spelling performance}, abstract = {Introduction: Previous findings suggest differences in the written spelling performance between the three variants of Primary Progressive Aphasia (PPA) - semantic (svPPA), logopenic (lvPPA) and non-fluent (nfvPPA) (Shim et al., 2012; Sepelyak et al., 2011). However, no attempts have been made to systematically distinguish the three variants in terms of their spelling performance. The challenges of classification are considerable and given the ease of administering a spelling test, we aimed to determine to what extent a spelling task can provide accurate classification of the PPA variants. Method: Thirty-three participants with PPA were included - 14 lvPPAs, 11 nfvPPAs and 8 svPPAs – originally classified using the neuropsychological and spoken language criteria defined by Gorno-Tempini et al. (2011). Data were collected prior to spelling treatment, using a spelling to dictation task with both real-words and pseudowords (92-138 items/per participant), scored for each grapheme (i.e., letter) and analyzed for each participant individually using generalized linear mixed effects models (GLMEM) for real-words and pseudowords separately. The variables of interest for both real-words and pseudowords were word length, phoneme-grapheme conversion probability and grapheme position. 
The real-word models also included frequency, imageability, and the orthographic and phonological neighborhood density of the target words. The coefficients from the output of the GLMEMs, together with 3 additional variables – verb/noun and pseudoword/word accuracy differences from the spelling task, and language impairment severity according to FTD-CDR (Knopman, 2008) - were used as predictors in a Random Forests (RFs) model implemented in Python, to identify the variables that contribute the most in distinguishing the three variants. Then, the three most significant predictors identified with RFs were used in multinomial models implemented in R to classify the PPA variants. The model was trained on a training set of all participants minus one (i.e. the left-out participant) and evaluated on the left-out participant, known as Leave-One-Out cross-validation. This process was repeated 33 times to evaluate all participants. Results: The three most significant predictors of the RFs analysis were: (1) grapheme position in real-words, (2) pseudoword/word accuracy difference, and (3) length of real-words (Figure 1). The overall accuracy of the multinomial models with these three predictors only was 67%: lvPPA=71%, nfvPPA=64% and svPPA=63%. When severely impaired cases (language severity =3 in Knopman et al., 2008; FTD-CDR criteria) were excluded (giving a new dataset of 22 participants), the overall accuracy increased to 91%: lvPPA=90%, nfvPPA=86% and svPPA=100%. Discussion: Our study provides evidence of the value of considering spelling performance in understanding and classifying the different variants of PPA. The results suggest that lexical status, word length and grapheme position are useful parameters for classification, which index key components of the cognitive architecture of spelling (Rapp, 2002). 
Also, the finding that prediction accuracy increased when more severe cases were excluded supports previous findings (Mesulam et al., 2012), as severity increases variants become less differentiated and classification is more difficult. In sum, a relatively short, easy-to-administer spelling test, provides useful information for PPA variant classification and can potentially be used as a clinical tool.}, booktitle = {Frontiers in Human Neuroscience}, author = {Neofytou, Kyriaci and Themistocleous, Charalambos and Wiley, Robert and Tsapkini, Kyrana and Rapp, Brenta}, year = {2018}, } @inProceedings{themistocleous-etal-2018-classification-268340, title = {A classification study of the variants of Primary Progressive Aphasia using Machine Learning.}, abstract = {Introduction: Primary Progressive Aphasia (PPA) is a neurodegenerative syndrome in which linguistic abilities become gradually impaired. There are three primary variants of PPA: the non-fluent agrammatic PPA, the fluent type semantic PPA, and the logopenic PPA, which is also considered an atypical form of Alzheimer’s disease (Mesulam et al., 1982; Gorno-Tempini et al., 2011). Along with the three main variants, a fourth variant has been proposed, a non-fluent apraxia of speech (AOS), though this is currently the subject of an open debate (e.g., Duffy et al., 2017; Henry et al., 2013). According to sophisticated criteria established a few years ago, PPA subtyping for a given patient presented in clinic requires clinical, neuropsychological, and imaging information (Gorno-Tempini et al., 2011). Nevertheless, quantifying the decline of linguistic abilities and subtyping its variants manually is both hard and laborious, so there is a great demand for algorithms that subtype a given patient automatically. Picture description samples of connected speech and random forests techniques have been used for this purpose (de Aguiar et al., 2017; Wilson et al., 2010, Fraser et al. 2013, 2014). 
In the present study, we compared existing models and we propose a new one. Aims: In this study, we provide an automated classification model of the four PPA variants trained on known morphological and acoustic predictors and on predictors related to the clinical and linguistic profile of individuals with PPA (e.g., Mack et al., 2015; Gorno-Tempini et al., 2011; Wilson et al., 2010). Method: Speech materials for this study come from the Transcranial Direct Current Stimulation for Primary Progressive Aphasia study at Johns Hopkins University. Twenty-six individuals with PPA (Mean(SD) age = 68.6 (7.8) years, Mean(SD) education = 16.1 (2.9) years) participated in this study. PPA participants were diagnosed based on the established consensus criteria (Gorno-Tempini et al., 2011) based on imaging, clinical, and neuropsychological examination by trained neurologists. Individuals with PPA included non-fluent AOS (N=5), non fluent (N=7), logopenic (N=8), and semantic (N=6) variants. Recordings of the Cookie Theft picture description from the Boston Diagnostic Aphasia Examination (BDAE) were computationally analyzed. All speech productions were automatically transcribed and segmented using an end-to-end speech-to-transcription platform. From the speech signals, we measured morphological and acoustic predictors, including vowel formants F1 ... F3, measured at 15%, 50%, and 75% of vowel’s duration, vowel duration, fundamental frequency, and pause duration. The analysis and the statistics were conducted using Python and R programming languages (R Core Team, 2017; Rossum, 1995). Three different machine learning algorithms: C5.0 decision trees, Classification and Regression Trees (CART) and random forests were trained on the predictors (Breiman, 2001; Quinlan, 1993; Hastie et al., 2009). All models were trained on the 80% of the speakers (training set), with 3-fold cross-validation. All predictor variables were centered and scaled. 
C5.0 was trained with winnowing and without winnowing. (Winnowing facilitates the automatic pre-selection of the predictors that are used in the decision tree.) After the training we evaluated the trained models on the unknown dataset, namely the 20% of the speakers (evaluation set). Results: C5.0 provided 86% (95% CI[81, 88], kappa = 0.76) and Random Forests 85% (95% CI[81, 88], kappa = 0.76) classification accuracy on the test data; CART provided the lowest overall classification accuracy. Overall, C5.0 outperformed both the random forests and CART, with high classification accuracy on unknown data. Non-fluent AOS was correctly predicted by both C5.0 and random forests. Discussion: C5.0 classification model provides support for the known predictors employed in the literature. Also, it provides initial support for the distinct properties of the non-fluent AOS variant and corroborate research on classification of AOS using acoustic properties especially those related to vowel production (Den Ouden et al. 2017). However, given the low number of participants employed in this study, further research is required, with a largest number of participants. Nevertheless, the proposed methods employed here constitute a promising step towards a computational differential diagnostic tool of PPA that is easy to use, quick and accurate. }, booktitle = {Clinical Aphasiology Conference, CAC 2018, Austin, Texas USA}, author = {Themistocleous, Charalambos and Ficek, Bronte and Webster, Kimberly and Wendt, Haley and Hillis, Argye E. and den Ouden, Dirk Bart and Tsapkini, Kyrana}, year = {2018}, } @inProceedings{themistocleous-etal-2018-acoustic-271915, title = {Acoustic markers of PPA variants using machine learning.}, abstract = {Introduction. Speakers’ acoustic profile carries significant linguistic and non-linguistic information. Employed in clinical practice, it can provide behavioral markers for a quick assessment of primary progressive aphasia (PPA). 
PPA is a complex language syndrome where different speech and language properties such as prosody, lexical retrieval, and motor speech functioning may be affected. It is classified into three main variants: the nonfluent (nfvPPA), semantic (svPPA), and logopenic (lvPPA). Primary progressive apraxia of speech (PPAOS) is also distinguished (Duffy et al. 2017) but may fall into the category of nfvPPA (Gorno-Tempini et al. 2011). The present study aims to determine the contribution of the acoustic properties of vowels, prosody, and voice quality in the classification of PPA variants by using machine learning models. Methods. Oral samples from picture description tasks of 50 individuals with PPA (lvPPA:17, svPPA:14, nfvPPA:11, PPAOS:8) were automatically transcribed and segmented into vowels and consonants using the new acoustic analysis platform THEMIS. From the segmented vowels, we measured: i. Vowel formants (F1…F5) (den Ouden, et al. 2017); ii. vowel duration (Duffy, et al., 2017); iii. Mean fundamental frequency (F0), min F0 and max F0 (Hillis, 2014); iv. Pause duration (Mack et al. 2015), and v. H1–H2, H1–A1, H1–A2, H1–A3 measures of voice quality. We compared three machine learning models: support vector machines (SVM) (Cortes and Vapnik, 1995), random forests (RF) (Breiman, 2001), and decision trees (DT) (Hastie et al. 2009) in an one-against all strategy, where each variant was tested against all others. We run all models with a 3-fold group-cross-validation to ensure that the speakers in the training and evaluation sets are different. The models were implemented in Python (Pedregosa et al. 2011). Results. We report the mean cross-validated accuracy of the best performing model that resulted from model comparison: i. RF model provided the highest classification accuracy for nfvPPA [Mean 82%, SD: 9%], ii. SVM had the highest accuracy for svPPA [Mean 66%, SD: 8%], iii. RF had the highest accuracy for lvPPA [Mean 57%, SD: 15%] and iv. 
RF provided the highest classification accuracy for PPAOS [Mean 80%, SD: 8%] (Figure 1). In all models, pause duration and F0 measures were ranked higher than most other features (Figure 2). Discussion. This study employed an innovative method for the classification of PPA variants, using an automated speech transcription, segmentation, feature extraction and modeling. Using just acoustic features the best model classified nfvPP, svPPA, and PPAOS with high accuracy. However, acoustic features alone could not classify lvPPA with such high accuracy. More linguistic markers might be needed for a more accurate classification of lvPPA. Furthermore, we showed that prosody, which is measured by fundamental frequency and pause duration, contributes more than any other factor to the classification of PPA variants as alluded in previous research by our group and others (Hillis 2014, Patel et al. 2018, Mack 2015). Finally, the findings demonstrate the potential benefit of using machine learning models in clinical practice for the subtyping of PPA variants.}, booktitle = {Frontiers in Human Neuroscience. Conference Abstract: Academy of Aphasia 56th Annual Meeting, October 21-23, 2018, Montreal, Canada}, author = {Themistocleous, Charalambos and Ficek, Bronte and Webster, Kimberly and Wendt, Haley and Hillis, Argye and Den Ouden, Dirk-Bart and Tsapkini, Kyrana}, year = {2018}, } @inProceedings{malm-etal-2018-lingfn-267404, title = {LingFN: Towards a framenet for the linguistics domain}, abstract = {Framenets and frame semantics have proved useful for a number of natural language processing (NLP) tasks. However, in this connection framenets have often been criticized for limited coverage. A proposed reasonable-effort solution to this problem is to develop domain-specific (sublanguage) framenets to complement the corresponding general-language framenets for particular NLP tasks, and in the literature we find such initiatives covering, e.g., medicine, soccer, and tourism. 
In this paper, we report on our experiments and first results on building a framenet to cover the terms and concepts encountered in descriptive linguistic grammars. A contextual statistics based approach is used to judge the polysemous nature of domain-specific terms, and to design new domain-specific frames. The work is part of a more extensive research undertaking where we are developing NLP methodologies for automatic extraction of linguistic information from traditional linguistic descriptions to build typological databases, which otherwise are populated using a labor intensive manual process.}, booktitle = {Proceedings : LREC 2018 Workshop, International FrameNet Workshop 2018. Multilingual Framenets and Constructicons, May 12, 2018, Miyazaki, Japan / Edited by Tiago Timponi Torrent, Lars Borin and Collin F. Baker}, author = {Malm, Per and Virk, Shafqat and Borin, Lars and Saxena, Anju}, year = {2018}, publisher = {ELRA}, address = {Miyazaki}, ISBN = {979-10-95546-04-7}, } @book{lyngfelt-etal-2018-constructicography-269082, title = {Constructicography: Constructicon development across languages}, abstract = {In constructionist theory, a constructicon is an inventory of constructions making up the full set of linguistic units in a language. In applied practice, it is a set of construction descriptions – a “dictionary of constructions”. The development of constructicons in the latter sense typically means combining principles of both construction grammar and lexicography, and is probably best characterized as a blend between the two traditions. We call this blend constructicography. The present volume is a comprehensive introduction to the emerging field of constructicography. After a general introduction follow six chapters presenting constructicon projects for English, German, Japanese, Brazilian Portuguese, Russian, and Swedish, respectively, often in relation to a framenet of the language. 
In addition, there is a chapter addressing the interplay between linguistics and language technology in constructicon development, and a final chapter exploring the prospects for interlingual constructicography. This is the first major publication devoted to constructicon development and it should be particularly relevant for those interested in construction grammar, frame semantics, lexicography, the relation between grammar and lexicon, or linguistically informed language technology. }, editor = {Lyngfelt, Benjamin and Borin, Lars and Ohara, Kyoko and Torrent, Tiago Timponi}, year = {2018}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027263865}, } @inProceedings{borin-etal-2018-many-267534, title = {Many a little makes a mickle - infrastructure component reuse for a massively multilingual linguistic study}, abstract = {We present ongoing work aiming at turning the linguistic material available in Grierson’s classical Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia and studies relating to language typology and contact linguistics. The project has two concrete main aims: (1) to conduct a linguistic investigation of the claim that South Asia constitutes a linguistic area; (2) to develop state-of-the-art language technology for automatically extracting the relevant information from the text of the LSI. 
In this presentation we focus on how, in the first part of the project, a number of existing research infrastructure components provided by Swe-Clarin, the Swedish CLARIN consortium, have been ‘recycled’ in order to allow the linguists involved in the project to quickly orient themselves in the vast LSI material, and to be able to provide input to the language technologists designing the tools for information extraction from the descriptive grammars.}, booktitle = {Selected papers from the CLARIN Annual Conference 2017, Budapest, 18–20 September 2017}, author = {Borin, Lars and Virk, Shafqat and Saxena, Anju}, year = {2018}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-273-6}, } @inProceedings{alfter-etal-2018-from-275364, title = {From Language Learning Platform to Infrastructure for Research on Language Learning}, abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus- based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. 
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.}, booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy}, author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2018}, } @incollection{haugen-borin-2018-danish-267403, title = {Danish, Norwegian and Swedish}, booktitle = {The world's major languages}, editor = {Bernard Comrie}, author = {Haugen, Einar and Borin, Lars}, year = {2018}, publisher = {Routledge}, address = {London and New York}, ISBN = {9781138184824}, pages = {127--150}, } @inProceedings{borin-etal-2018-language-290841, title = {Language technology for digital linguistics: Turning the Linguistic Survey of India into a rich source of linguistic information}, abstract = {We present our work aiming at turning the linguistic material available in Grierson’s classical Linguistic Survey of India (LSI) from a printed discursive textual description into a formally structured digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. While doing so, we develop state-of-the-art language technology for automatically extracting the relevant grammatical information from the text of the LSI, and interactive linguistic information visualization tools for better analysis and comparisons of languages based on their structural and functional features.}, booktitle = {Lecture Notes in Computer Science. 
Computational Linguistics and Intelligent Text Processing, 18th International Conference, CICLing 2017, Budapest, Hungary, April 17–23, 2017}, author = {Borin, Lars and Virk, Shafqat and Saxena, Anju}, year = {2018}, publisher = {Springer}, address = {Cham}, } @misc{pilan-etal-2018-proceedings-275358, title = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018 }, abstract = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. 
The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.}, author = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars}, year = {2018}, publisher = {Linköping University Electronic Press}, address = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @inProceedings{ljunglof-kjellberg-2018-interactive-274247, title = {Interactive correction of speech recognition errors: implementation and evaluation for English and Swedish}, booktitle = {SLTC 2018, the 7th Swedish Language Technology Conference, Stockholm, 7-9th November 2018}, author = {Ljunglöf, Peter and Kjellberg, J. Magnus}, year = {2018}, } @incollection{lyngfelt-etal-2018-constructicography-269085, title = {Constructicography at work: Theory meets practice in the Swedish constructicon}, abstract = {This chapter addresses central topics in constructicography from the viewpoint of the Swedish constructicon project (SweCcn), focusing on practical constructicon development. The full process of construction description is described and discussed, from selection via corpus analysis to finished constructicon entry and beyond, towards structuring the set of entries into a network. Particular attention is given to the description format and the treatment of constructional variation. A main theme in the chapter is the interdependence and alignment of SweCcn and related resources, on the one hand in the local context, notably the infrastructure of Språkbanken (the Swedish language bank), and on the other hand with respect to corresponding resources for other languages. 
Of key concern is the relation to FrameNet, both the Swedish and other framenets, and a major section is devoted to conditions for linking constructions and frames.}, booktitle = {Constructicography: Constructicon development across languages}, editor = {Benjamin Lyngfelt and Lars Borin and Kyoko Ohara and Tiago Timponi Torrent}, author = {Lyngfelt, Benjamin and Bäckström, Linnéa and Borin, Lars and Ehrlemark, Anna and Rydstedt, Rudolf}, year = {2018}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027263865}, pages = {41--106}, } @inProceedings{fyndanis-themistocleous-2018-morphosyntactic-271917, title = {Morphosyntactic production in agrammatic aphasia: A cross-linguistic machine learning approach.}, abstract = {Introduction Recent studies on agrammatic aphasia by Fyndanis et al. (2012, 2017) reported evidence against the cross-linguistic validity of unitary accounts of agrammatic morphosyntactic impairment, such as the Distributed Morphology Hypothesis (DMH) (Wang et al., 2014), the two versions of the Interpretable Features’ Impairment Hypothesis (IFIH-1: Fyndanis et al., 2012; IFIH-2: Fyndanis et al., 2018b), and the Tree Pruning Hypothesis (TPH) (Friedmann & Grodzinsky, 1997). However, some of the features/factors emphasized by the accounts above (i.e. involvement of inflectional alternations (DMH), involvement of integration processes (IFIH-1), involvement of both integration processes and inflectional alternations (IFIH-2), position of a morphosyntactic feature/category in the syntactic hierarchy (TPH)) may still play a role in agrammatic morphosyntactic production. These features may act in synergy with other factors in determining the way in which morphosyntactic production is impaired across persons with agrammatic aphasia (PWA) and across languages. Relevant factors may include language-independent and language-specific properties of morphosyntactic categories, as well as subject-specific and task/material-specific variables. 
The present study addresses which factors determine verb-related morphosyntactic production in PWA and what is their relative importance. Methods We collapsed the datasets of the 24 Greek-, German-, and Italian-speaking PWA underlying Fyndanis et al.’s (2017) study, added the data of two more Greek-speaking PWA, and employed machine learning algorithms to analyze the data. The unified dataset consisted of data on subject-verb agreement, time reference (past reference, future reference), grammatical mood (indicative, subjunctive), and polarity (affirmatives, negatives). All items/conditions were represented as clusters of theoretically motivated features: ±involvement of integration processes, ±involvement of inflectional alternations, ±involvement of both integration processes and inflectional alternations, and low/middle/high position in the syntactic hierarchy. We included 14 subject-specific, category-specific and task/material-specific predictors: Verbal Working Memory (WM), (years of formal) Education, Age, Gender, Mean Length of Utterance in (semi)spontaneous speech (Index 1 of severity of agrammatism), Proportion of Grammatical Sentences in (semi)spontaneous speech (Index 2 of severity of agrammatism), Words per Minute in (semi)spontaneous speech (Index of fluency), Involvement of inflectional alternations, Involvement of integration processes, Involvement of both integration processes and inflectional alternations, Position of a given morphosyntactic category in the syntactic hierarchy (high, middle, low), Item Presentation mode (cross-modal, auditory), Response mode (oral, written), and Language (Greek, German, Italian). Different machine learning models were employed: Random Forest, C5.0 decision tree, RPart, and Support Vector Machine. Results & Discussion Random Forest model outperformed all the other models achieving the highest accuracy (0.786). 
As shown in Figure 1, the best predictors of accuracy on tasks tapping morphosyntactic production were the involvement of both integration processes and inflectional alternations (categories involving both integration processes and inflectional alternations were more impaired than categories involving one or neither of them), verbal WM capacity (the greater the WM capacity, the better the morphosyntactic production), and severity of agrammatism (the more severe the agrammatism, the worse the morphosyntactic production). Results are consistent with IFIH-2 (Fyndanis et al., 2018b) and studies highlighting the role of verbal WM in morphosyntactic production (e.g., Fyndanis et al., 2018a; Kok et al., 2007).}, booktitle = {Frontiers in Human Neuroscience. Academy of Aphasia 56th Annual Meeting, Montreal, Canada, 21 Oct - 23 Oct, 2018. }, author = {Fyndanis, Valantis and Themistocleous, Charalambos}, year = {2018}, } @inProceedings{megyesi-etal-2018-learner-275359, title = {Learner Corpus Anonymization in the Age of GDPR: Insights from the Creation of a Learner Corpus of Swedish}, abstract = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish. As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. 
Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.}, booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018}, editor = {Ildikó Pilán and Elena Volodina and David Alfter and Lars Borin}, author = {Megyesi, Beata and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena}, year = {2018}, publisher = {Linköping University Electronic Press}, address = {Linköpings universitet}, ISBN = {978-91-7685-173-9}, } @inProceedings{lange-ljunglof-2018-mulle-274014, title = {MULLE: A grammar-based Latin language learning tool to supplement the classroom setting}, abstract = {MULLE is a tool for language learning that focuses on teaching Latin as a foreign language. It is aimed for easy integration into the traditional classroom setting and syllabus, which makes it distinct from other language learning tools that provide standalone learning experience. It uses grammar-based lessons and embraces methods of gamification to improve the learner motivation. 
The main type of exercise provided by our application is to practice translation, but it is also possible to shift the focus to vocabulary or morphology training.}, booktitle = {NLPTEA 2018, the 5th Workshop on Natural Language Processing Techniques for Educational Applications, Melbourne, Australia, 19th July 2018}, author = {Lange, Herbert and Ljunglöf, Peter}, year = {2018}, publisher = {Association for Computational Linguistics}, address = {Melbourne, Australia}, } @inProceedings{themistocleous-etal-2018-effects-270215, title = {Effects of Mild Cognitive Impairment on vowel duration }, abstract = {Mild cognitive impairment (MCI) is a neurological condition, which is characterized by a noticeable decline of cognitive abilities, including communicative and linguistic skills. In this study, we have measured the duration of vowels produced in a reading task by 55 speakers— 30 healthy controls and 25 MCI—. The main results showed that MCI speakers differed significantly from HC in vowel duration as MCI speakers produced overall longer vowels. Also, we found that gender effects on vowel duration were different in MCI and HC. 
One significant aspect of this finding is that they highlight the contribution of vowel acoustic features as markers of MCI.}, booktitle = {Proceedings of the 9th Tutorial \& Research Workshop on Experimental Linguistics, 28 - 30 August 2018, Paris, France}, editor = {Antonis Botinis}, author = {Themistocleous, Charalambos and Kokkinakis, Dimitrios and Eckerström, Marie and Fraser, Kathleen and Lundholm Fors, Kristina}, year = {2018}, ISBN = {978-960-466-162-6}, } @inProceedings{adesam-etal-2018-eukalyptus-273839, title = {The Eukalyptus Treebank of Written Swedish}, booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018}, author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus}, year = {2018}, } @inProceedings{dubossarsky-etal-2019-time-281304, title = {Time-Out: Temporal Referencing for Robust Modeling of Lexical Semantic Change}, abstract = {State-of-the-art models of lexical semantic change detection suffer from noise stemming from vector space alignment. We have empirically tested the Temporal Referencing method for lexical semantic change and show that, by avoiding alignment, it is less affected by this noise. We show that, trained on a diachronic corpus, the skip-gram with negative sampling architecture with temporal referencing outperforms alignment models on a synthetic task as well as a manual testset. We introduce a principled way to simulate lexical semantic change and systematically control for possible biases. 
}, booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Florence, Italy, July 28 - August 2, 2019 / Anna Korhonen, David Traum, Lluís Màrquez (Editors)}, author = {Dubossarsky, Haim and Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik}, year = {2019}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-950737-48-2}, } @article{tahmasebi-hengchen-2019-strengths-291189, title = {The Strengths and Pitfalls of Large-Scale Text Mining for Literary Studies}, abstract = {This paper is an overview of the opportunities and challenges of using large-scale text mining to answer research questions that stem from the humanities in general and literature specifically. In this paper, we will discuss a data-intensive research methodology and how different views of digital text affect answers to research questions. We will discuss results derived from text mining, how these results can be evaluated, and their relation to hypotheses and research questions. Finally, we will discuss some pitfalls of computational literary analysis and give some pointers as to how these can be avoided.}, journal = {Samlaren : tidskrift för svensk litteraturvetenskaplig forskning}, author = {Tahmasebi, Nina and Hengchen, Simon}, year = {2019}, volume = {140}, pages = {198--227}, } @inProceedings{alfter-etal-2019-legato-285625, title = {LEGATO: A flexible lexicographic annotation tool.}, abstract = {This article is a report from an ongoing project aiming at analyzing lexical and grammatical competences of Swedish as a Second language (L2). To facilitate lexical analysis, we need access to metalinguistic information about relevant vocabulary that L2 learners can use and understand. The focus of the current article is on the lexical annotation of the vocabulary scope for a range of lexicographical aspects, such as morphological analysis, valency, types of multi-word units, etc. 
We perform parts of the analysis automatically, and other parts manually. The rationale behind this is that where there is no possibility to add information automatically, manual effort needs to be added. To facilitate the latter, a tool LEGATO has been designed, implemented and currently put to active testing.}, booktitle = {Linköping Electronic Conference Proceedings, No. 167, NEAL Proceedings of the 22nd Nordic Conference on Computational Linguistics (NoDaLiDa), September 30-October 2, Turku, Finland Editor(s): Mareike Hartman and Barbara Plank}, author = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2019}, publisher = {Linköping University Electronic Press}, address = {Linköping university}, ISBN = {978-91-7929-995-8}, } @inProceedings{stemle-etal-2019-working-319453, title = {Working together towards an ideal infrastructure for language learner corpora}, abstract = {In this article we provide an overview of first-hand experiences and vantage points for best practices from projects in seven European countries dedicated to learner corpus research (LCR) and the creation of language learner corpora. The corpora and tools involved in LCR are becoming more and more important, as are careful preparation and easy retrieval and reusability of corpora and tools. But the lack of commonly agreed solutions for many aspects of LCR, interoperability between learner corpora and the exchange of data from different learner corpus projects remains a challenge. We show how concepts like metadata, anonymization, error taxonomies and linguistic annotations as well as tools, toolchains and data formats can be individually challenging and how the challenges can be solved. }, booktitle = {Widening the Scope of Learner Corpus Research. Selected papers from the fourth Learner Corpus Research Conference. 
Corpora and Language in Use – Proceedings 5 / Andrea Abel, Aivars Glaznieks, Verena Lyding and Lionel Nicolas (eds.)}, author = {Stemle, Egon and Boyd, Adriane and Janssen, Maarten and Preradović, Nives Mikelić and Rosen, Alexandr and Rosén, Dan and Volodina, Elena}, year = {2019}, publisher = {PUL, Presses Universitaires de Louvain}, address = {Louvain-la-Neuve }, ISBN = {978-2-87558-868-5}, } @inProceedings{dubossarsky-etal-2019-time-295438, title = {Time for change: Evaluating models of semantic change without evaluation tasks}, booktitle = {Cambridge Language Sciences Annual Symposium 2019 : Perspectives on Language Change}, author = {Dubossarsky, Haim and Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik}, year = {2019}, } @inProceedings{alfter-volodina-2019-from-285728, title = {From river to bank: The importance of sense-based graded word lists}, booktitle = { EUROCALL 2019 - CALL and Complexity, Book of Abstracts, Louvain-la-Neuve, Belgium, 28-31 August 2019}, author = {Alfter, David and Volodina, Elena}, year = {2019}, } @inProceedings{tahmasebi-etal-2019-convergence-280684, title = {A Convergence of Methodologies: Notes on Data-Intensive Humanities Research}, abstract = {In this paper, we discuss a data-intensive research methodology for the digital humanities. We highlight the differences and commonalities between quantitative and qualitative research methodologies in relation to a data-intensive research process. We argue that issues of representativeness and reduction must be in focus for all phases of the process; from the status of texts as such, over their digitization to pre-processing and methodological exploration.}, booktitle = {CEUR workshop proceedings ; 2364. 
Proceedings of the 4th Conference on Digital Humanities in the Nordic Countries, Copenhagen, Denmark, March 5-8, 2019}, editor = {Costanza Navarretta and Manex Agirrezabal and Bente Maegaard}, author = {Tahmasebi, Nina and Hagen, Niclas and Brodén, Daniel and Malm, Mats}, year = {2019}, publisher = {CEUR workshop proceedings}, address = {Aachen }, } @article{adesam-bouma-2019-koala-288026, title = {The Koala Part-of-Speech Tagset}, abstract = {We present the Koala part-of-speech tagset for written Swedish. The categorization takes the Swedish Academy Grammar (SAG) as its main starting point, to fit with the current descriptive view on Swedish grammar. We argue that neither SAG, as is, nor any of the existing part-of-speech tagsets meet our requirements for a broadly applicable categorization. Our proposal is outlined and compared to the other descriptions, and motivations for both the tagset as a whole as well as decisions about individual tags are discussed.}, journal = {Northern European Journal of Language Technology}, author = {Adesam, Yvonne and Bouma, Gerlof}, year = {2019}, volume = {6}, pages = {5--41}, } @inProceedings{matsson-etal-2019-imagettr-284011, title = {ImageTTR: Grounding Type Theory with Records in Image Classification for Visual Question Answering}, abstract = {We present ImageTTR, an extension to the Python implementation of Type Theory with Records (pyTTR) which connects formal record type representation with image classifiers implemented as deep neural networks. The Type Theory with Records framework serves as a knowledge representation system for natural language the representations of which are grounded in perceptual information of neural networks. 
We demonstrate the benefits of this symbolic and data-driven hybrid approach on the task of visual question answering.}, booktitle = {Proceedings of the IWCS 2019 Workshop on Computing Semantics with Types, Frames and Related Structures, May 24, 2019, Gothenburg, Sweden / Rainer Osswald, Christian Retoré, Peter Sutton (Editors)}, author = {Matsson, Arild and Dobnik, Simon and Larsson, Staffan}, year = {2019}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA }, ISBN = {978-1-950737-25-3}, } @article{fraser-etal-2019-predicting-282807, title = {Predicting MCI Status From Multimodal Language Data Using Cascaded Classifiers}, abstract = {Recent work has indicated the potential utility of automated language analysis for the detection of mild cognitive impairment (MCI). Most studies combining language processing and machine learning for the prediction of MCI focus on a single language task; here, we consider a cascaded approach to combine data from multiple language tasks. A cohort of 26 MCI participants and 29 healthy controls completed three language tasks: picture description, reading silently, and reading aloud. Information from each task is captured through different modes (audio, text, eye-tracking, and comprehension questions). Features are extracted from each mode, and used to train a series of cascaded classifiers which output predictions at the level of features, modes, tasks, and finally at the overall session level. The best classification result is achieved through combining the data at the task level (AUC = 0.88, accuracy = 0.83). This outperforms a classifier trained on neuropsychological test scores (AUC = 0.75, accuracy = 0.65) as well as the "early fusion" approach to multimodal classification (AUC = 0.79, accuracy = 0.70). By combining the predictions from the multimodal language classifier and the neuropsychological classifier, this result can be further improved to AUC = 0.90 and accuracy = 0.84. 
In a correlation analysis, language classifier predictions are found to be moderately correlated (rho = 0.42) with participant scores on the Rey Auditory Verbal Learning Test (RAVLT). The cascaded approach for multimodal classification improves both system performance and interpretability. This modular architecture can be easily generalized to incorporate different types of classifiers as well as other heterogeneous sources of data (imaging, metabolic, etc.).}, journal = {Frontiers in Aging Neuroscience}, author = {Fraser, Kathleen and Lundholm Fors, Kristina and Eckerström, Marie and Öhman, Fredrik and Kokkinakis, Dimitrios}, year = {2019}, volume = {11}, number = {205}, } @article{volodina-etal-2019-swell-285609, title = {The SweLL Language Learner Corpus: From Design to Annotation}, abstract = {The article presents a new language learner corpus for Swedish, SweLL, and the methodology from collection and pseudonymisation to protect personal information of learners to annotation adapted to second language learning. The main aim is to deliver a well-annotated corpus of essays written by second language learners of Swedish and make it available for research through a browsable environment. To that end, a new annotation tool and a new project management tool have been implemented, – both with the main purpose to ensure reliability and quality of the final corpus. In the article we discuss reasoning behind metadata selection, principles of gold corpus compilation and argue for separation of normalization from correction annotation.}, journal = {Northern European Journal of Language Technology}, author = {Volodina, Elena and Granstedt, Lena and Matsson, Arild and Megyesi, Beáta and Pilán, Ildikó and Prentice, Julia and Rosén, Dan and Rudebeck, Lisa and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats}, year = {2019}, volume = {6}, pages = {67--104}, } @article{kosem-etal-2019-image-275354, title = {The image of the monolingual dictionary across Europe. 
Results of the European survey of dictionary use and culture}, abstract = {The article presents the results of a survey on dictionary use in Europe, focusing on general monolingual dictionaries. The survey is the broadest survey of dictionary use to date, covering close to 10,000 dictionary users (and non-users) in nearly thirty countries. Our survey covers varied user groups, going beyond the students and translators who have tended to dominate such studies thus far. The survey was delivered via an online survey platform, in language versions specific to each target country. It was completed by 9,562 respondents, over 300 respondents per country on average. The survey consisted of the general section, which was translated and presented to all participants, as well as country-specific sections for a subset of 11 countries, which were drafted by collaborators at the national level. The present report covers the general section.}, journal = {International Journal of Lexicography}, author = {Kosem, Iztok and Lew, Robert and Müller-Spitzer, Carolin and Ribeiro Silveira, Maria and Wolfer, Sascha and Volodina, Elena and Pilán, Ildikó and Sköldberg, Emma and Holmer, Louise and Dorn, Amelie and Gurrutxaga, Antton and Lorentzen, Henrik and Kallas, Jelena and Abel, Andrea and Tiberius, Carole and Partners, Local}, year = {2019}, volume = {32}, number = {1}, pages = {92--114}, } @inProceedings{fridlund-etal-2019-trawling-287968, title = {Trawling for Terrorists: A Big Data Analysis of Conceptual Meanings and Contexts in Swedish Newspapers, 1780–1926}, abstract = {The conceptual history of terrorism has to a significant extent been studied through canonical texts or historical key figures or organisations. However, through the increasing digitization of text materials conventional research questions can now be approached from new angles or established results verified on the basis of exhaustive collections of data, rather than limited samples. 
Specifically, we are interested in evaluating and expanding on prior research claims regarding the meanings and contexts associated with the concepts terrorism and terrorist up until the twentieth century in a Swedish context. The investigation is guided by the following research questions: What historical meanings of the concept of terrorism were expressed in the Swedish newspaper discourse? What social and ideological contexts and violent political practices was the concept primarily associated with before the First World War?}, booktitle = {Proceedings of the 5th International Workshop on Computational History (HistoInformatics 2019) co-located with the 23rd International Conference on Theory and Practice of Digital Libraries (TPDL 2019) Oslo, Norway, September 12th, 2019, Melvin Wevers, Mohammed Hasanuzzaman, Gaël Dias, Marten Düring, \& Adam Jatowt, eds. }, author = {Fridlund, Mats and Olsson, Leif-Jöran and Brodén, Daniel and Borin, Lars}, year = {2019}, publisher = {CEUR-WS}, address = {Aachen}, } @inProceedings{fraser-etal-2019-multilingual-280280, title = {Multilingual prediction of Alzheimer’s disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. 
These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), June 2 - June 7, 2019, Minneapolis, Minnesota / Jill Burstein, Christy Doran, Thamar Solorio (Editors) }, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA }, ISBN = {978-1-950737-13-0}, } @article{kokkinakis-edstrom-2019-alderism-284251, title = {Ålderism i dagens mediala Sverige }, journal = {Språkbruk}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, number = {3/2019}, pages = {22--27}, } @inProceedings{lundholmfors-etal-2019-reading-284036, title = {Reading and mild cognitive impairment}, abstract = {In the present study, we investigated the discriminatory power of eye-tracking features in distinguishing between individuals with mild cognitive impairment (MCI) and healthy controls (HC). The eye movements of the study participants were recorded at two different time points, 18 months apart. Using a machine learning approach with leave-one-out cross-validation, we were able to discriminate between the groups with 73.6 AUC. However, somewhat surprisingly the classification was less successful using data from the second recording session, which might be attributed to the non-static nature of cognitive status. Still, the outcome suggests that eye-tracking measures can be exploited as useful markers of MCI. 
}, booktitle = {Proceedings of the 10th International Conference of Experimental Linguistics, 25-27 September 2019, Lisbon, Portugal}, editor = {Antonis Botinis}, author = {Lundholm Fors, Kristina and Antonsson, Malin and Kokkinakis, Dimitrios and Fraser, Kathleen}, year = {2019}, ISBN = {978-618-84585-0-5}, } @inProceedings{volodina-etal-2019-svala-285617, title = {SVALA: an Annotation Tool for Learner Corpora generating parallel texts}, abstract = {Learner corpora are actively used for research on Language Acquisition and in Learner Corpus Research (LCR). The data is, however, very expensive to collect and manually annotate, and includes steps like anonymization, normalization, error annotation, linguistic annotation. In the past, projects often re-used tools from a number of different projects for the above steps. As a result, various input and output formats between the tools needed to be converted, which increased the complexity of the task. In the present project, we are developing a tool that handles all of the above-mentioned steps in one environment maintaining a stable interpretable format between the steps. A distinguishing feature of the tool is that users work in a usual environment (plain text) while the tool visualizes all performed edits via a graph that links an original learner text with an edited one, token by token.}, booktitle = {Learner Corpus Research conference (LCR-2019), Warsaw, 12-14 September 2019, Book of abstracts}, author = {Volodina, Elena and Matsson, Arild and Rosén, Dan and Wirén, Mats}, year = {2019}, } @article{fraser-etal-2019-multilingual-270713, title = {Multilingual word embeddings for the assessment of narrative speech in mild cognitive impairment}, abstract = {We analyze the information content of narrative speech samples from individuals with mild cognitive impairment (MCI), in both English and Swedish, using a combination of supervised and unsupervised learning techniques. 
We extract information units using topic models trained on word embeddings in monolingual and multilingual spaces, and find that the multilingual approach leads to significantly better classification accuracies than training on the target language alone. In many cases, we find that augmenting the topic model training corpus with additional clinical data from a different language is more effective than training on additional monolingual data from healthy controls. Ultimately we are able to distinguish MCI speakers from healthy older adults with accuracies of up to 63% (English) and 72% (Swedish) on the basis of information content alone. We also compare our method against previous results measuring information content in Alzheimer's disease, and report an improvement over other topic-modeling approaches. Furthermore, our results support the hypothesis that subtle differences in language can be detected in narrative speech, even at the very early stages of cognitive decline, when scores on screening tools such as the Mini-Mental State Exam are still in the “normal” range.}, journal = {Computer Speech and Language}, author = {Fraser, Kathleen and Lundholm Fors, Kristina and Kokkinakis, Dimitrios}, year = {2019}, volume = {53}, pages = {121--139}, } @inProceedings{themistocleous-kokkinakis-2019-speech-289021, title = {Speech and Mild Cognitive Impairment detection}, abstract = {It is of great importance to detect objective markers that can enable the early and fast identification of individuals with Mild Cognitive Impairment (MCI) from healthy individuals to inform, patient care, family and treatment planning. Connected speech productions can offer such markers. This study analyses recordings from picture description tasks by Swedish individuals with MCI and healthy control individuals (HC) and shows that voice quality, periodicity, and speech rate distinguish individuals with MCI from HC. 
}, booktitle = {Proceedings of the 10th International Conference of Experimental Linguistics, 25-27 September 2019, Lisbon, Portugal}, editor = {Antonis Botinis}, author = {Themistocleous, Charalambos and Kokkinakis, Dimitrios}, year = {2019}, publisher = { ExLing Society}, ISBN = {978-618-84585-0-5}, } @inProceedings{kokkinakis-lundholmfors-2019-hund-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre.}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. 
semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{kokkinakis-edstrom-2019-alderism-279386, title = {Ålderism i svenska nyhetsmedier.}, abstract = {Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. 
Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, } @inProceedings{kokkinakis-etal-2019-multifaceted-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a Swedish Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. 
Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment (SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicit speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). 
The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. The corpus may serve as a basis for many linguistic and/or speech technological investigations and has already been used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{antonsson-etal-2019-discourse-284038, title = {Discourse in Mild Cognitive Impairment }, abstract = {This paper reports on how persons with mild cognitive impairment (MCI) perform on two types of narrative tasks compared to a group of healthy controls (HC). The first task is a widely used picture description task and the other task is a more complex discourse task. Since the latter task puts higher demands on cognitive linguistic skills, as seen in previous research, we expected this task to be more efficient in discriminating between the two groups. The results confirm this hypothesis. 
}, booktitle = {Proceedings of the 10th International Conference of Experimental Linguistics, 25-27 September 2019, Lisbon, Portugal}, editor = {Antonis Botinis}, author = {Antonsson, Malin and Lundholm Fors, Kristina and Kokkinakis, Dimitrios}, year = {2019}, publisher = { ExLing Society}, ISBN = {978-618-84585-0-5}, } @inProceedings{linz-etal-2019-temporal-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment.}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. 
Minneapolis, Minnesota, USA, June 6, 2019 / Kate Niederhoffer, Kristy Hollingshead, Philip Resnik, Rebecca Resnik, Kate Loveys (Editors)}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, publisher = {Association for Computational Linguistics }, address = {Stroudsburg, PA }, ISBN = {978-1-948087-95-7}, } @article{skoldberg-etal-2019-state-279701, title = {State-of-the-art on monolingual lexicography for Sweden}, abstract = {The minireview describes the state-of-the-art of Swedish monolingual lexicography. The main actors in the field, both commercial and non-commercial, are mentioned alongside with the description of lexicographic products that have been offered by them to the lexicon users. The minireview makes it clear that there is an obvious tendency among the Swedish dictionary users to abandon paper-based dictionaries and switch over to online portals and apps, which influences the practices adopted by commercial publishing houses, such as Norstedts, Bonniers, Natur & Kultur. Among the leading non-commercial players, the Swedish Academy, the Swedish Language Bank, Institute for Language and Folklore are named. Swedish monolingual lexicography offers, however, dictionaries produced not only by experts but also by non-experts (i.e. using the efforts of the crowd).}, journal = {Slovenščina 2.0: Empirical, Applied and Interdisciplinary Research}, author = {Sköldberg, Emma and Holmer, Louise and Volodina, Elena and Pilán, Ildikó}, year = {2019}, volume = {7}, number = {1}, pages = {13--24}, } @inProceedings{johansson-etal-2019-lexical-284330, title = {Lexical diversity and mild cognitive impairment}, abstract = {This paper explores the role that various lexical-based measures play for differentiating between individuals with mild forms of cognitive impairment (MCI) and healthy controls (HC). 
Recent research underscores the importance of language and linguistic analysis as essential components that can contribute to a variety of sensitive cognitive measures for the identification of milder forms of cognitive impairment. Subtle language changes serve as a sign that an individual’s cognitive functions have been impacted, potentially leading to early diagnosis. Our research aims to identify linguistic biomarkers that could distinguish between individuals with MCI and HC and also be useful in predicting MCI.}, booktitle = {Proceedings of the 10th International Conference of Experimental Linguistics, 25-27 September 2019, Lisbon, Portugal}, editor = {Antonis Botinis}, author = {Johansson, Sofie and Lundholm Fors, Kristina and Antonsson, Malin and Kokkinakis, Dimitrios}, year = {2019}, publisher = {ExLing Society}, address = {Athens, Greece}, ISBN = {978-618-84585-0-5}, } @inProceedings{dannells-etal-2019-evaluation-278761, title = {Evaluation and refinement of an enhanced OCR process for mass digitisation. }, abstract = {Great expectations are placed on the capacity of heritage institutions to make their collections available in digital format. Data-driven research is becoming a key concept within the humanities and social sciences. Kungliga biblioteket’s (National Library of Sweden, KB) collections of digitised newspapers can thus be regarded as unique cultural data sets with information that rarely is conveyed in other media types. The digital format makes it possible to explore these resources in ways not feasible while in printed form. As texts are no longer only read but also subjected to computer based analysis the demand on the correct rendering of the original text increases. OCR technologies for converting images to machine-readable text play a fundamental part in making these resources available, but the effectiveness varies with the type of document being processed. 
This is evident in relation to the digitisation of newspapers where factors relating to their production, layout and paper quality often impair the OCR production. In order to improve the machine readable text, especially in relation to the digitisation of newspapers, KB initiated the development of an OCR-module where key parameters can be adjusted according to the characteristics of the material being processed. The purpose of this paper is to present the project goals and methods.}, booktitle = {Proceedings of the Digital Humanities in the Nordic Countries 4th Conference (DHN 2019), Copenhagen, Denmark, March 5-8, 2019. Edited by: Costanza Navarretta, Manex Agirrezabal, Bente Maegaard}, author = {Dannélls, Dana and Johansson, Torsten and Björk, Lars}, year = {2019}, publisher = {University of Copenhagen, Faculty of Humanities}, address = {Copenhagen}, } @inProceedings{bamutura-ljunglof-2019-towards-284293, title = {Towards a resource grammar for Runyankore and Rukiga}, abstract = {Currently, there is a lack of computational grammar resources for many under-resourced languages which limits the ability to develop Natural Language Processing (NLP) tools and applications such as Multilingual Document Authoring, Computer-Assisted Language Learning (CALL) and Low-Coverage Machine Translation (MT) for these languages. In this paper, we present our attempt to formalise the grammar of two such languages: Runyankore and Rukiga. 
For this formalisation we use the Grammatical Framework (GF) and its Resource Grammar Library (GF-RGL).}, booktitle = {WiNLP 2019, the 3rd Workshop on Widening NLP, Florence, Italy, 28th July 2019}, author = {Bamutura, David and Ljunglöf, Peter}, year = {2019}, } @inProceedings{lindahl-etal-2019-towards-286588, title = {Towards Assessing Argumentation Annotation - A First Step}, abstract = {This paper presents a first attempt at using Walton’s argumentation schemes for annotating arguments in Swedish political text and assessing the feasibility of using this particular set of schemes with two linguistically trained annotators. The texts are not pre-annotated with argumentation structure beforehand. The results show that the annotators differ both in number of annotated arguments and selection of the conclusion and premises which make up the arguments. They also differ in their labeling of the schemes, but grouping the schemes increases their agreement. The outcome from this will be used to develop guidelines for future annotations.}, booktitle = {Proceedings of the 6th Workshop on Argument Mining, August 1, 2019, Florence, Italy / Benno Stein, Henning Wachsmuth (Editors)}, author = {Lindahl, Anna and Borin, Lars and Rouces, Jacobo}, year = {2019}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-950737-33-8}, } @misc{tahmasebi-etal-2019-proceedings-285886, title = {Proceedings of the 1st International Workshop on Computational Approaches to Historical Language Change, August 2, 2019, Florence, Italy}, author = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang}, year = {2019}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-950737-31-4}, } @book{nietopina-2019-splitting-282680, title = {Splitting rocks: Learning word sense representations from corpora and lexica}, abstract = {The representation of written language semantics is a central problem of 
language technology and a crucial component of many natural language processing applications, from part-of-speech tagging to text summarization. These representations of linguistic units, such as words or sentences, allow computer applications that work with language to process and manipulate the meaning of text. In particular, a family of models has been successfully developed based on automatically learning semantics from large collections of text and embedding them into a vector space, where semantic or lexical similarity is a function of geometric distance. Co-occurrence information of words in context is the main source of data used to learn these representations. Such models have typically been applied to learning representations for word forms, which have been widely applied, and proven to be highly successful, as characterizations of semantics at the word level. However, a word-level approach to meaning representation implies that the different meanings, or senses, of any polysemic word share one single representation. This might be problematic when individual word senses are of interest and explicit access to their specific representations is required. For instance, in cases such as an application that needs to deal with word senses rather than word forms, or when a digital lexicon's sense inventory has to be mapped to a set of learned semantic representations. In this thesis, we present a number of models that try to tackle this problem by automatically learning representations for word senses instead of for words. In particular, we try to achieve this by using two separate sources of information: corpora and lexica for the Swedish language. Throughout the five publications compiled in this thesis, we demonstrate that it is possible to generate word sense representations from these sources of data individually and in conjunction, and we observe that combining them yields superior results in terms of accuracy and sense inventory coverage. 
Furthermore, in our evaluation of the different representational models proposed here, we showcase the applicability of word sense representations both to downstream natural language processing applications and to the development of existing linguistic resources.}, author = {Nieto Piña, Luis}, year = {2019}, publisher = {University of Gothenburg}, address = {Gothenburg}, ISBN = {978-91-87850-75-2}, } @inProceedings{rodven-eide-2019-swedish-289474, title = {The Swedish PoliGraph}, abstract = {As part of a larger project on argument mining of Swedish parliamentary data, we have created a semantic graph that, together with named entity recognition and resolution (NER), should make it easier to establish connections between arguments in a given debate. The graph is essentially a semantic database that keeps track of Members of Parliament (MPs), in particular their presence in the parliament and activity in debates, but also party affiliation and participation in commissions. The hope is that the Swedish PoliGraph will enable us to perform named entity resolution on debates in the Swedish parliament with a high accuracy, with the aim of determining to whom an argument is directed.}, booktitle = {Proceedings of the 6th Workshop on Argument Mining, August 1, 2019 Florence, Italy / Benno Stein, Henning Wachsmuth (Editors)}, author = {Rødven-Eide, Stian}, year = {2019}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-950737-33-8}, } @inProceedings{bouma-etal-2019-building-289485, title = {Building a Diachronic and Contrastive Parallel Corpus – and an Intended Application in the Form of a Study of Germanic Complex Verb Constructions }, abstract = {We present a parallel corpus under construction, which is parallel in diachronically (through time) as well as contrastively (between languages). The corpus is made up of Bible texts spanning almost 6 centuries in 4 languages. 
Our project's direct purpose of building the corpus is to track the development of verb combinations containing multiple auxiliary verbs through time in German, Dutch, English and Swedish. We will also make the corpus available to other researchers. In this poster, we discuss the design of the corpus, our selection of sources, issues with bringing together a wide variety of sources, and alignment of the data. We will also touch upon intended future work concerning the automatic linguistic processing needed to facilitate the study of verb constructions, and the methodological challenges of doing corpus linguistic research on the varying quality of annotations produced by automatic methods on materials from such a wide range of origins.}, booktitle = {Digital Humanities 2019, 9-12 July 2019, Utrecht, the Netherlands}, author = {Bouma, Gerlof and Coussé, Evie and de Kooter, Dirk-Jan and van der Sijs, Nicoline}, year = {2019}, } @techreport{ljunglof-etal-2019-assessing-281222, title = {Assessing the quality of Språkbanken’s annotations}, abstract = {Most of the corpora in Språkbanken Text consist of unannotated plain text, such as almost all newspaper texts, social media texts, novels and official documents. We also have some corpora that are manually annotated in different ways, such as Talbanken (annotated for part-of-speech and syntactic structure), and the Stockholm Umeå Corpus (annotated for part-of-speech). Språkbanken’s annotation pipeline Sparv aims to automatise the work of automatically annotating all our corpora, while still keeping the manual annotations intact. When all corpora are annotated, they can be made available, e.g., in the corpus search tools Korp and Strix. Until now there has not been any comprehensive overview of the annotation tools and models that Sparv has been using for the last eight years. Some of them have not been updated since the start, such as the part-of-speech tagger Hunpos and the dependency parser MaltParser.
There are also annotation tools that we still have not included, such as a constituency-based parser. Therefore Språkbanken initiated a project with the aim of conducting such an overview. This document is the outcome of that project, and it contains descriptions of the types of manual and automatic annotations that we currently have in Språkbanken, as well as an incomplete overview of the state-of-the-art with regards to annotation tools and models. }, author = {Ljunglöf, Peter and Zechner, Niklas and Nieto Piña, Luis and Adesam, Yvonne and Borin, Lars}, year = {2019}, } @inProceedings{hoang-etal-2019-aspect-284269, title = {Aspect-Based Sentiment Analysis using BERT}, booktitle = {Proceedings of the 22nd Nordic Conference on Computational Linguistics, 30 September–2 October, 2019, Turku, Finland / Mareike Hartmann, Barbara Plank (Editors)}, author = {Hoang, M. and Bihorac, O. A. and Rouces, Jacobo}, year = {2019}, publisher = {Linköping University Electronic Press}, address = {Sweden}, ISBN = {978-91-7929-995-8}, } @article{themistocleous-2019-dialect-341526, title = {Dialect Classification From a Single Sonorant Sound Using Deep Neural Networks}, abstract = {During spoken communication, the fine acoustic properties of human speech can reveal vital sociolinguistic and linguistic information about speakers and thus, these properties can function as reliable identification markers of speakers' identity. One key piece of information speech reveals is speakers' dialect. The first aim of this study is to provide a machine learning method that can distinguish the dialect from acoustic productions of sonorant sounds. The second aim is to determine the classification accuracy of dialects from the temporal and spectral information of a single sonorant sound and the classification accuracy of dialects using additional co-articulatory information from the adjacent vowel. To this end, this paper provides two classification approaches. 
The first classification approach aims to distinguish two Greek dialects, namely Athenian Greek, the prototypical form of Standard Modern Greek and Cypriot Greek using measures of temporal and spectral information (i.e., spectral moments) from four sonorant consonants /m n l r/. The second classification study aims to distinguish the dialects using coarticulatory information (e.g., formants frequencies F1 - F5, F0, etc.) from the adjacent vowel in addition to spectral and temporal information from sonorants. In both classification approaches, we have employed Deep Neural Networks, which we compared with Support Vector Machines, Random Forests, and Decision Trees. The findings show that neural networks distinguish the two dialects using a combination of spectral moments, temporal information, and formant frequency information with 81% classification accuracy, which is a 14% accuracy gain over employing temporal properties and spectral moments alone. In conclusion, Deep Neural Networks can classify the dialect from single consonant productions, making them capable of identifying sociophonetic shibboleths.}, journal = {FRONTIERS IN COMMUNICATION}, author = {Themistocleous, Charalambos}, year = {2019}, volume = {4}, } @article{sandberg-etal-2019-issue-285614, title = {Issue Salience on Twitter During Swedish Party Leaders’ Debates }, abstract = {The objective of this study is to contribute knowledge about formation of political agendas on Twitter during mediated political events, using the party leaders’ debates in Sweden before the general election of 2014 as a case study. Our findings show that issues brought up during the debates were largely mirrored on Twitter, with one striking discrepancy. 
Contrary to our expectations, issues on the left-right policy dimension were more salient on Twitter than in the debates, whereas issues such as the environment, immigration and refugees, all tied to a liberal-authoritarian value axis, were less salient on Twitter.}, journal = {Nordicom Review}, author = {Sandberg, Linn and Bjereld, Ulf and Bunyik, Karina and Forsberg, Markus and Johansson, Richard}, year = {2019}, volume = {40}, number = {2}, pages = {49--61}, } @inProceedings{bouma-2019-exploring-289484, title = {Exploring Combining Training Datasets for the CLIN 2019 Shared Task on Cross-genre Gender Detection in Dutch}, abstract = {We present our entries to the Shared Task on Cross-genre Gender Detection in Dutch at CLIN 2019. We start from a simple logistic regression model with commonly used features, and consider two ways of combining training data from different sources. Our in-genre models do reasonably well, but the cross-genre models are a lot worse. Post-task experiments show no clear systematic advantage of one way of combining training data sources over the other, but do suggest accuracy can be gained from a better way of setting model hyperparameters.}, booktitle = {CEUR Workshop Proceedings, vol 2453. Proceedings of the Shared Task on Cross-Genre Gender Prediction in Dutch at CLIN29 (GxG-CLIN29) co-located with the 29th Conference on Computational Linguistics in The Netherlands (CLIN29). Groningen, The Netherlands, January 31, 2019.
Edited by Hessel Haagsma, Tim Kreutz, Masha Medvedeva, Walter Daelemans and Malvina Nissim}, author = {Bouma, Gerlof}, year = {2019}, publisher = {CEUR-WS.org}, address = {Aachen }, } @misc{alfter-etal-2019-proceedings-285613, title = {Proceedings of the 8th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2019), September 30, Turku Finland}, abstract = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promote development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. 
The NLP4CALL workshop series is aimed at bringing together competences from these areas for sharing experiences and brainstorming around the future of the field. }, author = {Alfter, David and Volodina, Elena and Borin, Lars and Pilán, Ildikó and Lange, Herbert}, year = {2019}, publisher = {Linköping University Electronic Press, Linköpings universitet}, address = {Linköping}, ISBN = {978-91-7929-998-9}, }

@article{agebjorn-alfter-2019-review-281196,
  title = {Review of Advanced Proficiency and Exceptional Ability in Second Languages},
  journal = {Linguist List},
  author = {Agebjörn, Anders and Alfter, David},
  year = {2019},
  number = {Jan 16},
}

@inProceedings{virk-etal-2019-exploiting-290903, title = {Exploiting frame semantics and frame-semantic parsing for automatic extraction of typological information from descriptive grammars of natural languages}, abstract = {We describe a novel system for automatic extraction of typological linguistic information from descriptive grammars of natural languages, applying the theory of frame semantics in the form of frame-semantic parsing. The current proof-of-concept system covers a few selected linguistic features, but the methodology is general and can be extended not only to other typological features but also to descriptive grammars written in languages other than English.
Such a system is expected to be a useful assistance for automatic curation of typological databases which otherwise are built manually, a very labor and time consuming as well as cognitively taxing enterprise.}, booktitle = {12th International Conference on Recent Advances in Natural Language Processing, RANLP 2019, Varna, Bulgaria, 2-4 September 2019}, author = {Virk, Shafqat and Muhammad, Azam Sheikh and Borin, Lars and Aslam, Muhammad Irfan and Iqbal, Saania and Khurram, Nazia}, year = {2019}, publisher = {INCOMA Ltd.}, address = {Shoumen, Bulgaria}, ISBN = {978-954-452-055-7}, } @inProceedings{alfter-etal-2019-larka-281344, title = {Lärka: From Language Learning Platform to Infrastructure for Research on Language Learning}, abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpusbased exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics. Nowadays, Lärka is being adapted into a building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN. Lärka has recently received a new responsive user interface adapted to different devices with different screen sizes. Moreover, the system has also been augmented with new functionalities. These recent additions aim at improving the usability and the usefulness of the platform for pedagogical purposes. The most important development, though, is the adaptation of the platform to serve as a component in an e-infrastructure supporting research on language learning and multilingualism. 
Thanks to Lärka’s service-oriented architecture, most functionalities are also available as web services which can be easily re-used by other applications.}, booktitle = {Linköping Electronic Conference Proceedings}, author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2019}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-7685-034-3}, } @article{fyndanis-themistocleous-2019-there-268753, title = {Are there prototypical associations between time frames and aspectual values? Evidence from Greek aphasia and healthy ageing}, abstract = {Time reference, which has been found to be selectively impaired in agrammatic aphasia, is often interwoven with grammatical aspect. A recent study on Russian aphasia found that time reference and aspect interact: Past reference was less impaired when tested within a perfective aspect context (compared to when tested within an imperfective aspect context), and reference to the non-past was less impaired when tested within an imperfective aspect context (compared to when tested within a perfective aspect context). To explain this pattern, the authors argued that there are prototypical associations between time frames and aspectual values. The present study explores the relationship between time reference and aspect focusing on Greek aphasia and healthy ageing and using a sentence completion task that crosses time reference and aspect. The findings do not support prototypical matches between different time frames and aspectual values. Building on relevant studies, we propose that patterns of performance of healthy or language-impaired speakers on constrained tasks tapping different combinations of time frames with aspectual values should reflect the relative frequency of these combinations in a given language. 
The analysis of the results at the individual level revealed a double dissociation, which indicates that a given time frame–aspectual value combination may be relatively easy to process for some persons with aphasia but demanding for some others.}, journal = {Clinical Linguistics \& Phonetics}, author = {Fyndanis, Valantis and Themistocleous, Charalambos}, year = {2019}, volume = {33}, number = {1-2}, pages = {191--217}, }

@inProceedings{rouces-etal-2019-tracking-281308,
  title = {Tracking Attitudes Towards Immigration in Swedish Media},
  abstract = {We use a gold standard under construction for sentiment analysis in Swedish to explore how attitudes towards immigration change across time and media. We track the evolution of attitude starting from the year 2000 for three different Swedish media: the national newspapers Aftonbladet and Svenska Dagbladet, representing different halves of the left–right political spectrum, and the online forum Flashback.},
  booktitle = {CEUR Workshop Proceedings (Vol. 2364). Digital Humanities in the Nordic Countries 4th Conference, Copenhagen, Denmark, March 5-8, 2019.},
  author = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina},
  year = {2019},
  publisher = {CEUR Workshop Proceedings},
  address = {Aachen},
}

@inProceedings{rouces-etal-2019-political-281307, title = {Political Stance Analysis Using Swedish Parliamentary Data}, abstract = {We process and visualize Swedish parliamentary data using methods from statistics and machine learning, which allows us to obtain insight into the political processes behind the data. We produce plots that let us infer the relative stance of political parties and their members on different topics. In addition, we can infer the degree of homogeneity of individual votes within different parties, as well as the degree of multi-dimensionality of Swedish politics.}, booktitle = {CEUR Workshop Proceedings (Vol. 2364).
Digital Humanities in the Nordic Countries 4th Conference, Copenhagen, Denmark, March 5-8, 2019.}, author = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina}, year = {2019}, publisher = {CEUR}, address = {Aachen}, }

@inProceedings{alfter-graen-2019-interconnecting-285731,
  title = {Interconnecting lexical resources and word alignment: How do learners get on with particle verbs?},
  abstract = {In this paper, we present a prototype for an online exercise aimed at learners of English and Swedish that serves multiple purposes. The exercise allows learners of the aforementioned languages to train their knowledge of particle verbs receiving clues from the exercise application. The user themselves decide which clue to receive and pay in virtual currency for each, which provides us with valuable information about the utility of the clues that we provide as well as the learners willingness to trade virtual currency versus accuracy of their choice. As resources, we use list with annotated levels from the proficiency scale defined by the Common European Framework of Reference (CEFR) and a multilingual corpus with syntactic dependency relations and word annotation for all language pairs. From the latter resource, we extract translation equivalents for particle verb construction together with a list of parallel corpus examples that can be used as clues in the exercise.},
  booktitle = {Linköping Electronic Conference Proceedings, No. 167, NEAL Proceedings of the 22nd Nordic Conference on Computational Linguistics (NoDaLiDa), September 30-October 2, Turku, Finland / Editor(s): Mareike Hartmann and Barbara Plank},
  author = {Alfter, David and Graën, Johannes},
  year = {2019},
  publisher = {Linköping University Electronic Press, Linköpings universitet},
  address = {Linköping},
  ISBN = {978-91-7929-995-8},
}

@inProceedings{adesam-etal-2019-exploring-279948,
  title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist},
  abstract = {The KubHist Corpus is a massive corpus of Swedish historical newspapers, digitized by the Royal Swedish library, and available through the Språkbanken corpus infrastructure Korp. This paper contains a first overview of the KubHist corpus, exploring some of the difficulties with the data, such as OCR errors and spelling variation, and discussing possible paths for improving the quality and the searchability.},
  booktitle = {Proceedings of the 4th Conference of The Association Digital Humanities in the Nordic Countries (DHN), Copenhagen, Denmark, March 5-8, 2019},
  editor = {Costanza Navarretta and Manex Agirrezabal and Bente Maegaard},
  author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina},
  year = {2019},
  publisher = {CEUR Workshop Proceedings},
  address = {Aachen},
}

@inProceedings{berdicevskis-eckhoff-2020-diachronic-293349, title = {A Diachronic Treebank of Russian Spanning More Than a Thousand Years}, booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), May 11-16, 2020, Marseille, France / ed. Nicoletta Calzolari (Conference chair).
}, author = {Berdicevskis, Aleksandrs and Eckhoff, Hanne}, year = {2020}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {979-10-95546-34-4}, pages = {5251--5256}, }

@article{roberts-etal-2020-chield-292421,
  title = {CHIELD: the causal hypotheses in evolutionary linguistics database},
  journal = {Journal of Language Evolution},
  author = {Roberts, Sean and Killin, Anton and Deb, Angarika and Sheard, Catherine and Greenhill, Simon and Sinnemäki, Kaius and Segovia-Martin, José and Nölle, Jonas and Berdicevskis, Aleksandrs and Humphreys-Balkwill, Archie and Little, Hannah and Opie, Cristopher and Jacques, Guillaume and Bromham, Lindell and Tinits, Peeter and Ross, Robert and Lee, Sean and Gasser, Emily and Calladine, Jasmine and Spike, Matthew and Mann, Stephen and Shcherbakova, Olena and Singer, Ruth and Zhang, Shuya and Benítez-Burraco, Antonio and Kliesch, Christian and Thomas-Colquhoun, Ewan and Skirgård, Hedvig and Tamariz, Monica and Passmore, Sam and Pellard, Thomas and Jordan, Fiona},
  year = {2020},
  volume = {5},
  number = {2},
  pages = {101--120},
}

@techreport{adesam-etal-2020-swedishglue-299130,
  title = {SwedishGLUE – Towards a Swedish Test Set for Evaluating Natural Language Understanding Models},
  author = {Adesam, Yvonne and Berdicevskis, Aleksandrs and Morger, Felix},
  year = {2020},
  institution = {University of Gothenburg},
  publisher = {University of Gothenburg},
}

@misc{berdicevskis-2020-pizzas-297688,
  title = {Pizzas and vermouth},
  author = {Berdicevskis, Aleksandrs},
  year = {2020},
  publisher = {Faber \& Faber},
  ISBN = {9781783352203},
  pages = {150--151},
}

@incollection{berdicevskis-2020-kogda-296607,
  title = {Kogda morfologija bessil'na},
  booktitle = {VAProsy jazykoznanija: megasbornik nanostatej},
  author = {Berdicevskis, Aleksandrs},
  year = {2020},
  publisher = {Buki-Vedi},
  address = {Moskva},
  ISBN = {978-5-4465-2882-0},
  pages = {56--60},
}

@misc{alfter-etal-2020-proceedings-300071, title = {Proceedings of the 9th Workshop on Natural Language Processing for
Computer Assisted Language Learning 2020}, abstract = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures. This collection presents four selected papers describing use of Language Technology for language learning.}, author = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Lange, Herbert and Borin, Lars}, year = {2020}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7929-732-9}, }

@inProceedings{berdicevskis-piperski-2020-corpus-298524,
  title = {Corpus evidence for word order freezing in Russian and German},
  booktitle = {Proceedings of the Fourth Workshop on Universal Dependencies (UDW 2020), December 13, 2020, Barcelona, Spain (Online) / Marie-Catherine de Marneffe, Miryam de Lhoneux, Joakim Nivre, Sebastian Schuster (Editors).},
  author = {Berdicevskis, Aleksandrs and Piperski, Alexander},
  year = {2020},
  publisher = {Association for Computational Linguistics},
  ISBN = {978-1-952148-48-4},
  pages = {26--33},
}

@inProceedings{berdicevskis-2020-older-290636, title = {Older English Words Are More Polysemous}, booktitle = {The Evolution of Language: Proceedings of the 13th International Conference (EvoLang13), 14-17 April, 2020, Brussels, Belgium / Ravignani, A., Barbieri, C., Flaherty, M., Jadoul, Y., Lattenkamp, E. Z., Little, H., et al.
(Eds.)}, author = {Berdicevskis, Aleksandrs}, year = {2020}, publisher = {The Evolution of Language Conferences }, address = {Nijmegen }, pages = {14--21}, } @misc{mcgillivray-etal-2020-challenges-295208, title = {The challenges and prospects of the intersection of humanities and data science: A White Paper from The Alan Turing Institute}, abstract = {Since their beginnings, the digital humanities have engaged in an energetic debate about their scope, defining features, and relationship to the wider humanities, and have established themselves as a community of practice (Schreibman et al., 2004; Terras, 2010; Terras, 2013; Terras et al., 2013; Gold and Klein, 2016; The Digital Humanities Manifesto 2.0). The computational focus has characterised the field from its initial explorations (Hockey, 2004; Vanhoutte, 2013; Nyhan and Flinn, 2016) and the shift from the label ‘Humanities Computing’ to ‘Digital Humanities’ was a catalyst for change. In the history of the field, recurring cycles and productive tensions have arisen from the interfolding of computational methodologies and approaches with hermeneutic and critical modes of analysis (see McCarty, 2005; Rockwell and Sinclair, 2016; Jones, 2016). This document postulates that we are currently witnessing another one of these junctures, one that is calling for a critical involvement with data science. In many ways, we are seeing earlier methods blending into, or being extended by data science. Digitisation workflows are being augmented with automatic information extraction, data analysis, automated transcription of handwritten documents, and visualisation of transcribed content. Techniques developed for history, literary studies, and linguistics are being scaled towards larger datasets and more complex problems raising the bar of interpretability and questioning the validity of data collection and analysis methods. 
On the other hand, the field of data science has recently started to engage with non-STEM (Science, Technology, Engineering, and Mathematics) disciplines, by offering new data-driven modelling frameworks for addressing long-standing research questions (Kitchin, 2014; Lazer et al., 2009) and proposing so-called ‘human-centred approaches’ to data science, focussed on the interpretability of machine learning models and a more active role for human input in algorithms (See Chen et al., 2016). Moreover, in the current historical context we are witnessing an increased awareness of the questions of diversity and inclusion in research and academia, and we are seeing the creation of a strong movement aimed at addressing such issues globally. We believe that this paper can play a role in reinforcing a positive message in this respect.}, author = {McGillivray, Barbara and Alex, Beatrice and Ames, Sarah and Armstrong, Guyda and Beavan, David and Ciula, Arianna and Colavizza, Giovanni and Cummings, James and De Roure, David and Farquhar, Adam and Hengchen, Simon and Lang, Anouk and Loxley, James and Goudarouli, Eirini and Nanni, Federico and Nini, Andrea and Nyhan, Julianne and Osborne, Nicola and Poibeau, Thierry and Ridge, Mia and Ranade, Sonia and Smithies, James and Terras, Melissa and Vlachidis, Andreas and Willcox, Pip}, year = {2020}, } @article{themistocleous-etal-2020-voice-295469, title = {Voice quality and speech fluency distinguish individuals with Mild Cognitive Impairment from Healthy Controls}, abstract = {Mild Cognitive Impairment (MCI) is a syndrome characterized by cognitive decline greater than expected for an individual's age and education level. This study aims to determine whether voice quality and speech fluency distinguish patients with MCI from healthy individuals to improve diagnosis of patients with MCI. 
We analyzed recordings of the Cookie Theft picture description task produced by 26 patients with MCI and 29 healthy controls from Sweden and calculated measures of voice quality and speech fluency. The results show that patients with MCI differ significantly from HC with respect to acoustic aspects of voice quality, namely H1-A3, cepstral peak prominence, center of gravity, and shimmer; and speech fluency, namely articulation rate and averaged speaking time. The method proposed along with the obtainability of connected speech productions can enable quick and easy analysis of speech fluency and voice quality, providing accessible and objective diagnostic markers of patients with MCI.}, journal = {PloS one}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2020}, volume = {15}, number = {7}, pages = {e0236009}, } @misc{kokkinakis-etal-2020-proceedings-305214, title = {Proceedings of the LREC 2020. Workshop on: Resources and Processing of Linguistic, Para-linguistic and Extra-linguistic Data from People with Various Forms of Cognitive/Psychiatric/Developmental Impairments (RaPID-3), May 11-16, 2020, Marseille, France}, abstract = {RaPID-3 aims to be an interdisciplinary forum for researchers to share information, findings, methods, models and experience on the collection and processing of data produced by people with various forms of mental, cognitive, neuropsychiatric, or neurodegenerative impairments, such as aphasia, dementia, autism, bipolar disorder, Parkinson’s disease or schizophrenia. Particularly, the workshop’s focus is on creation, processing and application of data resources from individuals at various stages of these impairments and with varying degrees of severity. Creation of resources includes e.g. 
annotation, description, analysis and interpretation of linguistic, paralinguistic and extra-linguistic data (such as spontaneous spoken language, transcripts, eyetracking measurements, wearable and sensor data, etc). Processing is done to identify, extract, correlate, evaluate and disseminate various linguistic or multimodal phenotypes and measurements, which then can be applied to aid diagnosis, monitor the progression or predict individuals at risk. A central aim is to facilitate the study of the relationships among various levels of linguistic, paralinguistic and extra-linguistic observations (e.g., acoustic measures; phonological, syntactic and semantic features; eye tracking measurements; sensors, signs and multimodal signals). Submission of papers are invited in all of the aforementioned areas, particularly emphasizing multidisciplinary aspects of processing such data and the interplay between clinical/nursing/medical sciences, language technology, computational linguistics, natural language processing (NLP) and computer science. The workshop will act as a stimulus for the discussion of several ongoing research questions driving current and future research by bringing together researchers from various research communities. 
}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Themistocleous, Charalambos and Antonsson, Malin and Eckerström, Marie}, year = {2020}, publisher = {European Language Resources Association (ELRA)}, address = {Paris}, ISBN = {979-10-95546-45-0}, } @inProceedings{berdicevskis-etal-2020-subjects-297403, title = {Subjects tend to be coded only once: Corpus-based and grammar-based evidence for an efficiency-driven trade-off}, booktitle = {Proceedings of the 19th International Workshop on Treebanks and Linguistic Theories, TLT 2020, 27–28 October 2020, Düsseldorf, Germany}, author = {Berdicevskis, Aleksandrs and Schmidtke-Bode, Karsten and Seržant, Ilja}, year = {2020}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = { 978-1-952148-01-9}, pages = {79--92}, } @inProceedings{frossard-etal-2020-dataset-293923, title = {Dataset for Temporal Analysis of English-French Cognates}, abstract = {Languages change over time and, thanks to the abundance of digital corpora, their evolutionary analysis using computational techniques has recently gained much research attention. In this paper, we focus on creating a dataset to support investigating the similarity in evolution between different languages. We look in particular into the similarities and differences between the use of corresponding words across time in English and French, two languages from different linguistic families yet with shared syntax and close contact. For this we select a set of cognates in both languages and study their frequency changes and correlations over time. We propose a new dataset for computational approaches of synchronized diachronic investigation of language pairs, and subsequently show novel findings stemming from the cognate-focused diachronic comparison of the two chosen languages. 
To the best of our knowledge, the present study is the first in the literature to use computational approaches and large data to make a cross-language diachronic analysis.}, booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference}, author = {Frossard, Esteban and Coustaty, Mickael and Doucet, Antoine and Jatowt, Adam and Hengchen, Simon}, year = {2020}, publisher = {European Language Resources Association}, address = {Marseille, France}, ISBN = {979-10-95546-34-4}, } @incollection{berdicevskis-semenuks-2020-different-296274, title = {Different trajectories of morphological overspecification and irregularity under imperfect language learning}, booktitle = {The Complexities of Morphology}, editor = {Peter Arkadiev and Francesco Gardani}, author = {Berdicevskis, Aleksandrs and Semenuks, Arturs}, year = {2020}, publisher = {Oxford University Press}, address = {Oxford}, ISBN = {9780198861287}, pages = {283--305}, } @inProceedings{themistocleous-etal-2020-automatic-305224, title = {Automatic analysis of voice quality and prosody in patients with Mild Cognitive Impairment.}, abstract = {http://demo.spraakdata.gu.se/svedk/pbl/SNL2020.pdf}, booktitle = {The 12th Annual Society for the Neurobiology of Language Meeting (SNL) -- virtual conference}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2020}, } @inProceedings{fridlund-etal-2020-trawling-299694, title = {Trawling the Gulf of Bothnia of News: A Big Data Analysis of the Emergence of Terrorism in Swedish and Finnish Newspapers, 1780–1926}, abstract = {This study combines history domain knowledge and language technology expertise to evaluate and expand on research claims regarding the historical meanings associated with terrorism in Swedish and Finnish contexts. 
Using a cross-border comparative approach and large newspaper corpora made available by the CLARIN research infrastructure, we explore overlapping national discourses on terrorism, the concept’s historical diversity and its relations to different national contexts. We are particularly interested in testing the hypothesis that substate terrorism’s modern meaning was not yet established in the 19th century but primarily restricted to Russian terrorism. We conclude that our comparative study finds both uniquely national and shared meanings of terrorism and that our study strengthen the hypothesis. In extension, the study also serves as an exploration of the potentials of cross-disciplinary evaluative studies based on extensive corpora and of cross-border comparative approaches to Swedish and Finnish newspaper corpora.}, booktitle = {CLARIN Annual Conference Proceedings 2020. Edited by Costanza Navarretta, Maria Eskevich, 05–07 October 2020, Virtual Edition}, author = {Fridlund, Mats and Olsson, Leif-Jöran and Brodén, Daniel and Borin, Lars}, year = {2020}, publisher = {CLARIN}, } @inProceedings{virk-etal-2020-from-295339, title = {From Linguistic Descriptions to Language Profiles}, abstract = {Language catalogues and typological databases are two important types of resources containing different types of knowledge about the world’s natural languages. The former provide metadata such as number of speakers, location (in prose descriptions and/or GPS coordinates), language code, literacy, etc., while the latter contain information about a set of structural and functional attributes of languages. Given that both types of resources are developed and later maintained manually, there are practical limits as to the number of languages and the number of features that can be surveyed. 
We introduce the concept of a language profile, which is intended to be a structured representation of various types of knowledge about a natural language extracted semi-automatically from descriptive documents and stored at a central location. It has three major parts: (1) an introductory; (2) an attributive; and (3) a reference part, each containing different types of knowledge about a given natural language. As a case study, we develop and present a language profile of an example language. At this stage, a language profile is an independent entity, but in the future it is envisioned to become part of a network of language profiles connected to each other via various types of relations. Such a representation is expected to be suitable both for humans and machines to read and process for further deeper linguistic analyses and/or comparisons.}, booktitle = {Proceedings of the 7th Workshop on Linked Data in Linguistics (LDL-2020). Language Resources and Evaluation Conference (LREC 2020), Marseille, 11–16 May 2020 / Edited by : Maxim Ionov, John P. McCrae, Christian Chiarcos, Thierry Declerck, Julia Bosque-Gil, and Jorge Gracia}, author = {Virk, Shafqat and Hammarström, Harald and Borin, Lars and Forsberg, Markus and Wichmann, Søren}, year = {2020}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {979-10-95546-36-8}, } @inProceedings{themistocleous-etal-2020-automated-305223, title = {Automated speech analysis improves MCI diagnosis}, abstract = {Mild Cognitive Impairment (MCI) is a condition characterized by cognitive decline greater than expected for an individual's age and education level. In this study, we are investigating whether acoustic properties of speech production can improve the classification of individuals with MCI from healthy controls augmenting the Mini Mental State Examination, a traditional screening tool, with automatically extracted acoustic information. 
We found that just one acoustic feature, can improve the AUC score (measuring a trade-off between sensitivity and specificity) from 0.77 to 0.89 in a boosting classification task. These preliminary results suggest that computerized language analysis can improve the accuracy of traditional screening tools}, booktitle = {Proceedings of the 11th Experimental Linguistics Conference (ExLing)}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2020}, } @inProceedings{dannells-simon-2020-supervised-289944, title = {Supervised OCR Post-Correction of Historical Swedish Texts: What Role Does the OCR System Play?}, abstract = {Current approaches for post-correction of OCR errors offer solutions that are tailored to a specific OCR system. This can be problematic if the post-correction method was trained on a specific OCR system but have to be applied on the result of another system. Whereas OCR post-correction of historical text has received much attention lately, the question of what role does the OCR system play for the post-correction method has not been addressed. In this study we explore a dataset of 400 documents of historical Swedish text which has been OCR processed by three state-of-the-art OCR systems: Abbyy Finereader, Tesseract and Ocropus. We examine the OCR results of each system and present a supervised machine learning post-correction method that tries to approach the challenges exhibited by each system. We study the performance of our method by using three evaluation tools: PrimA, Språkbanken evaluation tool and Frontiers Toolkit. Based on the evaluation analysis we discuss the impact each of the OCR systems has on the results of the post-correction method. 
We report on quantitative and qualitative results showing varying degrees of OCR post-processing complexity that are important to consider when developing an OCR post-correction method.}, booktitle = {Proceedings of the Digital Humanities in the Nordic Countries, 5th Conference, Riga, Latvia, October 21-23, 2020}, editor = {Sanita Reinsone and Inguna Skadiņa and Anda Baklāne and Jānis Daugavietis}, author = {Dannélls, Dana and Simon, Persson}, year = {2020}, publisher = {CEUR-WS}, } @inProceedings{rodven-eide-2020-anforanden-302449, title = {Anföranden: Annotated and Augmented Parliamentary Debates from Sweden}, abstract = {The Swedish parliamentary debates have been available since 2010 through the parliament’s open data web site Riksdagens öppna data. While fairly comprehensive, the structure of the data can be hard to understand and its content is somewhat noisy for use as a quality language resource. In order to make it easier to use and process – in particular for language technology research, but also for political science and other fields with an interest in parliamentary data – we have published a large selection of the debates in a cleaned and structured format, annotated with linguistic information and augmented with semantic links. Especially prevalent in the parliament’s data were end-line hyphenations – something that tokenisers generally are not equipped for – and a lot of the effort went into resolving these. 
In this paper, we provide detailed descriptions of the structure and contents of the resource, and explain how it differs from the parliament’s own version.}, booktitle = {Proceedings of the LREC 2020 Workshop on Creating, Using and Linking of Parliamentary Corpora with Other Types of Political Discourse, 11–16 May 2020}, author = {Rødven-Eide, Stian}, year = {2020}, publisher = {European Language Resources Association}, address = {Marseille, France}, ISBN = {979-10-95546-47-4}, } @inProceedings{bamutura-etal-2020-towards-296511, title = {Towards Computational Resource Grammars for Runyankore and Rukiga}, abstract = {In this paper, we present computational resource grammars of Runyankore and Rukiga (R&R) languages. Runyankore and Rukiga are two under-resourced Bantu Languages spoken by about 6 million people indigenous to South Western Uganda, East Africa. We used Grammatical Framework (GF), a multilingual grammar formalism and a special-purpose functional programming language to formalise the descriptive grammar of these languages. To the best of our knowledge, these computational resource grammars are the first attempt to the creation of language resources for R&R. In Future Work, we plan to use these grammars to bootstrap the generation of other linguistic resources such as multilingual corpora that make use of data-driven approaches to natural language processing feasible. 
In the meantime, they can be used to build Computer-Assisted Language Learning (CALL) applications for these languages among others.}, booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference}, author = {Bamutura, David and Ljunglöf, Peter and Nabende, Peter}, year = {2020}, publisher = {European Language Resources Association}, } @inProceedings{lindahl-2020-annotating-302453, title = {Annotating argumentation in Swedish social media}, booktitle = {Proceedings of the 7th Workshop on Argument Mining, Barcelona, Spain (Online), December 13, 2020.}, author = {Lindahl, Anna}, year = {2020}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-952148-44-6}, } @inProceedings{bouma-etal-2020-edges-298473, title = {The EDGeS Diachronic Bible Corpus}, abstract = {We present the EDGeS Diachronic Bible Corpus: a diachronically and synchronically parallel corpus of Bible translations in Dutch, English, German and Swedish, with texts from the 14th century until today. It is compiled in the context of an intended longitudinal and contrastive study of complex verb constructions in Germanic. The paper discusses the corpus design principles, its selection of 36 Bibles, and the information and metadata encoded for the corpus texts. 
The EDGeS corpus will be available in two forms: the whole corpus will be accessible for researchers behind a login in the well-known OPUS search infrastructure, and the open subpart of the corpus will be available for download.}, booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), May 11-16, 2020, Marseille, France}, author = {Bouma, Gerlof and Coussé, Evie and Dijkstra, Trude and van der Sijs, Nicoline}, year = {2020}, publisher = {European Language Resources Association (ELRA)}, ISBN = {979-10-95546-34-4}, } @inProceedings{dannells-virk-2020-error-297714, title = {OCR Error Detection on Historical Text Using Uni-Feature and Multi-Feature Based Machine Learning Models}, abstract = {Detecting errors that are caused by Optical Character Recognition (OCR) systems is a challenging task that has received much attention over the years. Recent work has explored machine learning methods using hand-crafted feature engineering, which, in addition to the difficulty in identifying the best feature combinations, is often very time and resources expensive. This raises the question: Do we always need many features to achieve better results? This is an open-ended question and its answer might depend on the task at hand. For OCR error detection, we experimented and found that interestingly a uni-feature based system conquered multi-feature based systems on a Swedish data set achieving state-of-the-art results, and performed equally well on an English dataset. We also experimented to find which machine learning algorithm is more suitable for the task at hand by comparing the performance of five well-known machine learning algorithms, namely Logistic regression, Decision Trees, Bernoulli Naive Bayes, Naive Bayes, and Support Vector Machines. 
}, booktitle = {Swedish Language Technology Conference (SLTC), 25-27 November 2020, University of Gothenburg }, author = {Dannélls, Dana and Virk, Shafqat}, year = {2020}, } @article{broden-2020-acknowledging-294579, title = {Acknowledging Ambivalence in Teaching about Art and Aesthetics}, abstract = {In this article Daniel Broden explores the ambivalence in teaching about art and aesthetics in the humanities. By comparing and contrasting Gert J. J. Biesta's educational theory and Jacques Ranciere's writing on aesthetics, he hopes to bring some of the particularities of aesthetic experiences into focus and to discuss a tension in educational situations that concern students' interpretation of aesthetic texts: how the teacher, on the one hand, will serve as a representative for a formal system of education - or what Ranciere calls a system of inequality - and, on the other hand, should respect the autonomy of the aesthetic experience. Broden argues, however, that more interesting than the ambivalence itself is the question of how we can acknowledge this tension in productive ways. Thus, his aim here is to show how the teacher can contribute to the verification of an interpretive approach to art, with Ranciere's axiom of equality in mind. Drawing on Biesta's writings, Broden also highlights how the teacher can provide students with possibilities to pursue a subject-ness and how the risks involved call for a deconstructive approach to the enactment of teacher power. 
The article concludes by suggesting that we would do better not to view the ambivalence in focus as a problem, but instead to see it as something that calls for continuous engagement and critical reflection.}, journal = {Educational Theory}, author = {Brodén, Daniel}, year = {2020}, volume = {70}, number = {1}, pages = {31--42}, } @inProceedings{veeman-etal-2020-cross-297782, title = {Cross-lingual Embeddings Reveal Universal and Lineage-Specific Patterns in Grammatical Gender Assignment}, booktitle = {Proceedings of the 24th Conference on Computational Natural Language Learning, Online, November 19-20, 2020. }, author = {Veeman, Hartger and Allassonnière-Tang, Marc and Berdicevskis, Aleksandrs and Basirat, Ali}, year = {2020}, publisher = {Association for Computational Linguistics}, ISBN = {978-1-952148-63-7}, pages = {265--275}, } @inProceedings{wichmann-virk-2020-towards-298431, title = { Towards a data-driven network of linguistic terms}, abstract = {Starting from close to 20,000 text documents from the literature of language descriptions, from documents either born digitally or scanned and OCR’d, we extract keywords and pass them through a pruning pipeline where mainly keywords that can be considered as belonging to linguistic terminology survive. Subsequently we quantify relations among those terms using Normalized Pointwise Mutual Information (NPMI) and use the resulting measures, in conjunction with the Google PageRank (GPR), to build networks of linguistic terms. 
Two uses of the work are envisaged: (1) developing a search machine adapted to the large DReaM corpus of linguistic descriptive literature and (2) getting insights into how a data-driven ontology of linguistic terminology might be built.}, booktitle = {Swedish Language Technology Conference (SLTC)}, author = {Wichmann, Søren and Virk, Shafqat}, year = {2020}, } @inProceedings{berdicevskis-2020-foreigner-297766, title = {Foreigner-directed speech is simpler than native-directed: Evidence from social media}, booktitle = {Proceedings of the Fourth Workshop on Natural Language Processing and Computational Social Science, NLP+CSS 2020, November 20, 2020, Online / David Bamman, Dirk Hovy, David Jurgens, Brendan O'Connor, Svitlana Volkova (eds.)}, author = {Berdicevskis, Aleksandrs}, year = {2020}, publisher = {Association for Computational Linguistics}, ISBN = {978-1-952148-80-4}, pages = {163--172}, } @inProceedings{dannells-etal-2020-evaluation-296165, title = {Evaluation of a Two-OCR Engine Method: First Results on Digitized Swedish Newspapers Spanning over nearly 200 Years}, abstract = {In this paper we present a two-OCR engine method that was developed at Kungliga biblioteket (KB), the National Library of Sweden, for improving the correctness of the OCR for mass digitization of Swedish newspapers. We report the first quantitative evaluation results on a material spanning over nearly 200 years. In this first evaluation phase we experimented with word lists for different time periods. Although there was no significant overall improvement of the OCR results, the evaluation shows that some combinations of word lists are successful for certain periods and should therefore be explored further. }, booktitle = { CLARIN Annual Conference 2020, (Virtual Event), 5-7 October, 2020. 
Book of Abstracts}, author = {Dannélls, Dana and Björk, Lars and Dirdal, Ove and Johansson, Torsten}, year = {2020}, } @inProceedings{schlechtweg-etal-2020-semeval-295463, title = {SemEval-2020 Task 1: Unsupervised Lexical Semantic Change Detection}, abstract = {Lexical Semantic Change detection, i.e., the task of identifying words that change meaning over time, is a very active research area, with applications in NLP, lexicography, and linguistics. Evaluation is currently the most pressing problem in Lexical Semantic Change detection, as no gold standards are available to the community, which hinders progress. We present the results of the first shared task that addresses this gap by providing researchers with an evaluation framework and manually annotated, high-quality datasets for English, German, Latin, and Swedish. 33 teams submitted 186 systems, which were evaluated on two subtasks. }, booktitle = {Proceedings of the Fourteenth Workshop on Semantic Evaluation (SemEval2020), Barcelona, Spain (Online), December 12, 2020.}, author = {Schlechtweg, Dominik and McGillivray, Barbara and Hengchen, Simon and Dubossarsky, Haim and Tahmasebi, Nina}, year = {2020}, publisher = {ACL}, } @inProceedings{waldispuhl-etal-2020-material-293332, title = {Material Philology Meets Digital Onomastic Lexicography: The NordiCon Database of Medieval Nordic Personal Names in Continental Sources}, abstract = {We present NordiCon, a database containing medieval Nordic personal names attested in Continental sources. The database combines formally interpreted and richly interlinked onomastic data with digitized versions of the medieval manuscripts from which the data originate and information on the tokens' context. The structure of NordiCon is inspired by other online historical given name dictionaries. 
It takes up challenges reported on in previous works, such as how to cover material properties of a name token and how to define lemmatization principles, and elaborates on possible solutions. The lemmatization principles for NordiCon are further developed in order to facilitate the connection to other name dictionaries and corpuses, and the integration of the database into Språkbanken Text, an infrastructure containing modern and historical written data.}, booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference, Marseille, 11–16 May 2020 / editors: Nicoletta Calzolari... [et. al.]}, author = {Waldispühl, Michelle and Dannélls, Dana and Borin, Lars}, year = {2020}, publisher = {European Language Resources Association}, address = {Marseille}, ISBN = {979-10-95546-34-4}, } @article{arharholdt-etal-2020-language-300072, title = {Language teachers and crowdsourcing: Insights from a cross-European survey.}, abstract = {The paper presents a cross-European survey on teachers and crowdsourcing. The survey examines how familiar language teachers are with the concept of crowdsourcing and addresses their attitude towards including crowdsourcing into language teaching activities. The survey was administrated via an online questionnaire and collected volunteers’ data on: (a) teachers’ experience with organizing crowdsourcing activities for students/pupils, (b) the development of crowdsourced resources and materials as well as (c) teachers’ motivation for participating in or employing crowdsourcing activities. The questionnaire was disseminated in over 30 European countries. The final sample comprises 1129 language teachers aged 20 to 65, mostly working at institutions of tertiary education. The data indicates that many participants are not familiar with the concept of crowdsourcing resulting in a low rate of crowdsourcing activities in the classroom. 
However, a high percentage of responding teachers is potentially willing to crowdsource teaching materials for the language(s) they teach. They are particularly willing to collaborate with other teachers in the creation of interactive digital learning materials, and to select, edit, and share language examples for exercises or tests. Since the inclusion of crowdsourcing activities in language teaching is still in its initial stage, steps for further research are highlighted.}, journal = {Rasprave: Časopis Instituta za hrvatski jezik i jezikoslovlje}, author = {Arhar Holdt, Špela and Zviel-Girshin, Rina and Gajek, Elżbieta and Durán-Muñoz, Isabel and Bago, Petra and Fort, Karën and Hatipoglu, Ciler and Kasperavičienė, Ramunė and Koeva, Svetla and Lazić Konjik, Ivana and Miloshevska, Lina and Ordulj, Antonia and Rodosthenous, Christos and Volodina, Elena and Weber, Tassja and Zanasi, Lorenzo}, year = {2020}, volume = {46}, number = {1}, pages = {1--28}, } @inProceedings{virk-etal-2020-dream-295338, title = {The DReaM Corpus: A Multilingual Annotated Corpus of Grammars for the World’s Languages}, booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), Marseille, 11–16 May 2020 / Editors : Nicoletta Calzolari, Frédéric Béchet, Philippe Blache, Khalid Choukri, Christopher Cieri, Thierry Declerck, Sara Goggi, Hitoshi Isahara, Bente Maegaard, Joseph Mariani, Hélène Mazo, Asuncion Moreno, Jan Odijk, Stelios Piperidis}, author = {Virk, Shafqat and Hammarström, Harald and Forsberg, Markus and Wichmann, Søren}, year = {2020}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {979-10-95546-34-4 }, } @inProceedings{volodina-etal-2020-towards-300069, title = {Towards Privacy by Design in Learner Corpora Research: A Case of On-the-fly Pseudonymization of Swedish Learner Essays}, abstract = {This article reports on an ongoing project aiming at automatization of pseudonymization of learner essays. 
The process includes three steps: identification of personal information in an unstructured text, labeling for a category, and pseudonymization. We experiment with rule-based methods for detection of 15 categories out of the suggested 19 (Megyesi et al., 2018) that we deem important and/or doable with automatic approaches. For the detection and labeling steps, we use resources covering personal names, geographic names, company and university names and others. For the pseudonymization step, we replace the item using another item of the same type from the above-mentioned resources. Evaluation of the detection and labeling steps are made on a set of manually anonymized essays. The results are promising and show that 89% of the personal information can be successfully identified in learner data, and annotated correctly with an inter-annotator agreement of 86% measured as Fleiss kappa and Krippendorff's alpha.}, booktitle = {Proceedings of the 28th International Conference on Computational Linguistics (COLING), December 8-13, 2020, Barcelona, Spain (Online)}, author = {Volodina, Elena and Ali Mohammed, Yousuf and Derbring, Sandra and Matsson, Arild and Megyesi, Beata}, year = {2020}, publisher = {International Committee on Computational Linguistics}, ISBN = {978-1-952148-27-9}, } @inProceedings{themistocleous-etal-2020-improving-305222, title = {Improving the Diagnosis of Mild Cognitive Impairment in elderly individuals using a multifactorial automatic analysis of voice quality and prosody.}, abstract = {http://demo.spraakdata.gu.se/svedk/pbl/AEC-30-Paper.JPG}, booktitle = {30th Alzheimer Europe Conference #30AEC -- virtual conference }, author = {Themistocleous, Charalambos and Eckerström, Marie and Lundholm Fors, Kristina and Kokkinakis, Dimitrios}, year = {2020}, } @misc{schlechtweg-etal-2020-post-295466, title = {Post-Evaluation Data for SemEval-2020 Task 1: Unsupervised Lexical Semantic Change Detection}, abstract = {This data collection contains the 
post-evaluation data for SemEval-2020 Task 1: Unsupervised Lexical Semantic Change Detection: (1) the starting kit to download data, and examples for competing in the CodaLab challenge including baselines; (2) the true binary change scores of the targets for Subtask 1, and their true graded change scores for Subtask 2 (test_data_truth/); (3) the scoring program used to score submissions against the true test data in the evaluation and post-evaluation phase (scoring_program/); and (4) the results of the evaluation phase including, for example, analysis plots (plots/) displaying the results:}, author = {Schlechtweg, Dominik and McGillivray, Barbara and Hengchen, Simon and Dubossarsky, Haim and Tahmasebi, Nina}, year = {2020}, publisher = {Zenodo}, } @article{zechner-2020-derivatives-303708, title = {Derivatives of regular expressions with cuts}, abstract = {Derivatives of regular expressions are an operation which for a given expression produces an expression for what remains after a specific symbol has been read. This can be used as a step in the process of transforming an expression into a finite string automaton. Cuts are an extension of the ordinary regular expressions; the cut operator is essentially a concatenation without backtracking, formalising a behaviour found in many programming languages. Just as for concatenation, we can also define an iterated cut operator. We show and derive expressions for the derivatives of regular expressions with cuts and iterated cuts. © Institut für Informatik · Justus-Liebig-Universität Giessen.}, journal = {Journal of Automata, Languages and Combinatorics}, author = {Zechner, Niklas}, year = {2020}, volume = {25}, number = {4}, pages = {349--355}, } @misc{tahmasebi-etal-2020-swedish-295465, title = {Swedish Test Data for SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection}, abstract = {This data collection contains the Swedish test data for SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection. 
It consists of a Swedish text corpus pair (corpus1/, corpus2/) and 31 lemmas which have been annotated for their lexical semantic change between the two corpora (targets.txt). We sample from the KubHist2 corpus, digitized by the National Library of Sweden, and available through the Språkbanken corpus infrastructure Korp (Borin et al., 2012). The full corpus is available through a CC BY (attribution) license. Each word for which the lemmatizer in the Korp pipeline has found a lemma is replaced with the lemma. In cases where the lemmatizer cannot find a lemma, we leave the word as is (i.e., unlemmatized, no lower-casing). KubHist contains very frequent OCR errors, especially for the older data. More detail about the properties and quality of the Kubhist corpus can be found in (Adesam et al., 2019).}, author = {Tahmasebi, Nina and Hengchen, Simon and Schlechtweg, Dominik and McGillivray, Barbara and Dubossarsky, Haim}, year = {2020}, } @inProceedings{rouces-etal-2020-creating-290695, title = {Creating an Annotated Corpus for Aspect-Based Sentiment Analysis in Swedish}, abstract = {Aspect-Based Sentiment Analysis constitutes a more fine-grained alternative to traditional sentiment analysis at sentence level. In addition to a sentiment value denoting how positive or negative a particular opinion or sentiment expression is, it identifies additional aspects or 'slots' that characterize the opinion. Some typical aspects are target and source, i.e. who holds the opinion and about which entity or aspect is the opinion. We present a large Swedish corpus annotated for Aspect-Based Sentiment Analysis. Each sentiment expression is annotated as a tuple that contains the following fields: one among 5 possible sentiment values, the target, the source, and whether the sentiment expressed is ironic. In addition, the linguistic element that conveys the sentiment is identified too. Sentiment for a particular topic is also annotated at title, paragraph and document level. 
The documents are articles obtained from two Swedish media (Svenska Dagbladet and Aftonbladet) and one online forum (Flashback), totalling around 4000 documents. The corpus is freely available and we plan to use it for training and testing an Aspect-Based Sentiment Analysis system.}, booktitle = {Proceedings of the 5th conference in Digital Humanities in the Nordic Countries, Riga, Latvia, October 21-23, 2020.}, author = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina}, year = {2020}, publisher = {CEUR Workshop Proceedings}, } @inProceedings{lange-ljunglof-2020-learning-291243, title = {Learning Domain-specific Grammars from a Small Number of Examples}, abstract = {In this paper we investigate the problem of grammar inference from a different perspective. The common approach is to try to infer a grammar directly from example sentences, which either requires a large training set or suffers from bad accuracy. We instead view it as a problem of grammar restriction or sub-grammar extraction. We start from a large-scale resource grammar and a small number of examples, and find a sub-grammar that still covers all the examples. To do this we formulate the problem as a constraint satisfaction problem, and use an existing constraint solver to find the optimal grammar. We have made experiments with English, Finnish, German, Swedish and Spanish, which show that 10–20 examples are often sufficient to learn an interesting domain grammar. 
Possible applications include computer-assisted language learning, domain-specific dialogue systems, computer games, Q/A-systems, and others.}, booktitle = {12th International Conference on Agents and Artificial Intelligence - Volume 1: NLPinAI}, author = {Lange, Herbert and Ljunglöf, Peter}, year = {2020}, publisher = {SciTePress}, ISBN = {978-989-758-395-7}, } @inProceedings{dannells-broden-2020-building-297061, title = {Building a Language Technology Infrastructure for Digital Humanities: Challenges, Opportunities and Progress}, abstract = {Språkbanken Text, a research unit at the University of Gothenburg, forms part of the National Language Bank of Sweden and is the main coordinating node of Swe-Clarin, the Swedish national CLARIN node. During the past years, Språkbanken Text has been actively engaged in a number of humanities and social sciences related research projects. This engagement has primarily concerned the development of new resources, methods and tools to accurately process large amounts of digitized material, in addition to interfaces for visualizing the materials, making them easily accessible for further analysis. The activities within Swe-Clarin have been essential for the progress and the success of this work. In this paper we present what was required from Språkbanken Text in order to meet the expectations of researchers from the humanities and social sciences. 
We discuss some of the challenges this work involves and describe the opportunities this field brings with it and how these opportunities could help to progress the work of Språkbanken Text toward building a language technology infrastructure that supports interdisciplinary research.}, booktitle = {Proceedings of the Twin Talks 2 and 3 Workshops at DHN 2020 and DH 2020 Ottawa Canada and Riga Latvia, July 23 and October 20, 2020}, editor = {Steven Krauwer and Darja Fišer}, author = {Dannélls, Dana and Brodén, Daniel}, year = {2020}, publisher = {CEUR-WS.org}, pages = {75--83}, } @inProceedings{alfter-etal-2020-expert-300074, title = {Expert judgments versus crowdsourcing in ordering multi-word expressions}, abstract = {In this study we investigate to which degree experts and non-experts agree on questions of linguistic complexity in a crowdsourcing experiment. We ask non-experts (second language learners of Swedish) and two groups of experts (teachers of Swedish as a second/foreign language and CEFR experts) to rank multi-word expressions in a crowdsourcing experiment. We find that the resulting rankings by all the three tested groups correlate to a very high degree, which suggests that judgments produced in a comparative setting are not influenced by professional insights into Swedish as a second language. }, booktitle = {Proceedings of the Swedish Language Technology Conference (SLTC), 25–27 November 2020, (Online)}, author = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2020}, } @inProceedings{kokkinakis-lundholmfors-2020-digital-295582, title = {Digital Neuropsychological Tests and Biomarkers: Resources for NLP and AI Exploration in the Neuropsychological Domain}, abstract = {Non-invasive, time and cost-effective, easy-to-measure techniques for the early diagnosis or monitoring the progression of brain and mental disorders are at the forefront of recent research in this field. 
Natural Language Processing and Artificial Intelligence can play an important role in supporting and enhancing data driven approaches to improve the accuracy of prediction and classification. However, large datasets of e.g. recorded speech in the domain of cognitive health are limited. To improve the performance of existing models we need to train them on larger datasets, which could raise the accuracy of clinical diagnosis, and contribute to the detection of early signs at scale. In this paper, we outline our ongoing work to collect such data from a large population in order to support and conduct future research for modelling speech and language features in a cross-disciplinary manner. The final goal is to explore and combine linguistic with multimodal biomarkers from the same population and compare hybrid models that could increase the predictive accuracy of the algorithms that operate on them.}, booktitle = {CLARIN Annual Conference 2020 in Virtual Form}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2020}, } @article{kokkinakis-lundholmfors-2020-manga-294522, title = {Hur många djur du kommer på kan avslöja hur din hjärna mår}, journal = {Språkbruk}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2020}, volume = {2}, pages = {48--51}, } @inProceedings{johansson-adesam-2020-training-293365, title = {Training a Swedish Constituency Parser on Six Incompatible Treebanks}, abstract = {We investigate a transition-based parser that uses Eukalyptus, a function-tagged constituent treebank for Swedish which includes discontinuous constituents. 
In addition, we show that the accuracy of this parser can be improved by using a multitask learning architecture that makes it possible to train the parser on additional treebanks that use other annotation models.}, booktitle = {Proceedings of the 12th International Conference on Language Resources and Evaluation (LREC 2020)}, author = {Johansson, Richard and Adesam, Yvonne}, year = {2020}, publisher = {European Language Resources Association (ELRA)}, } @inProceedings{zechner-borin-2020-towards-296900, title = {Towards a Swedish Roget-Style Thesaurus for NLP}, abstract = {Bring’s thesaurus (Bring) is a Swedish counterpart of Roget, and its digitized version could make a valuable language resource for use in many and diverse natural language processing (NLP) applications. Fromlexicon, word sense disambiguation, topic detection the literature we know that Roget-style thesauruses and wordnets have complementary strengths in this context, so both kinds of lexical-semantic resource are good to have. However, Bring was published in 1930, and its lexical items are in the form of lemma–POS pairings. In order to be useful in our NLP systems, polysemous lexical items need to be disambiguated, and a large amount of modern vocabulary must be added in the proper places in Bring. The work presented here describes experiments aiming at automating these two tasks, at least in part, where we use the structure of an existing Swedish semantic lexicon – Saldo – both for disambiguation of ambiguous Bring entries and for addition of new entries to Bring.}, booktitle = {Proceedings of the 2020 Globalex Workshop on Linked Lexicography. 
Language Resources and Evaluation Conference (LREC 2020), Marseille, 11–16 May 2020}, author = {Zechner, Niklas and Borin, Lars}, year = {2020}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {979-10-95546-46-7}, } @inProceedings{virk-etal-2021-novel-306962, title = {A Novel Machine Learning Based Approach for Post-OCR Error Detection}, abstract = {Post processing is the most conventional approach for correcting errors that are caused by Optical Character Recognition (OCR) systems. Two steps are usually taken to correct OCR errors: detection and corrections. For the first task, supervised machine learning methods have shown state-of-the-art performances. Previously proposed approaches have focused most prominently on combining lexical, contextual and statistical features for detecting errors. In this study, we report a novel system to error detection which is based merely on the n-gram counts of a candidate token. In addition to being simple and computationally less expensive, our proposed system beats previous systems reported in the ICDAR2019 competition on OCR-error detection with notable margins. We achieved state-of-the-art F1-scores for eight out of the ten involved European languages. The maximum improvement is for Spanish which improved from 0.69 to 0.90, and the minimum for Polish from 0.82 to 0.84. 
}, booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021}, editor = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina}, author = {Virk, Shafqat and Dannélls, Dana and Muhammad, Azam Sheikh}, year = {2021}, publisher = {INCOMA}, address = {Shoumen, Bulgaria}, ISBN = {978-954-452-072-4}, } @book{skoldberg-etal-2021-svensk-305729, title = {Svensk ordbok utgiven av Svenska Akademien som app 2021 (för Android)}, abstract = {Andra upplagan av Svensk ordbok utgiven av Svenska Akademien har utarbetats av en forskargrupp vid Institutionen för svenska språket, Göteborgs universitet. Den är utgiven av Svenska Akademien och publicerad i ordboksportalen svenska.se. Appen är utgiven av Svenska Akademien och utvecklad av Petrus Wang i samarbete med Institutionen för svenska språket, Göteborgs universitet. }, editor = {Sköldberg, Emma and Blensenius, Kristian and Hannesdottir, Anna Helga and Holmer, Louise and Landqvist, Hans and Martens, Monica and Petersson, Stellan and Hult, Ann-Kristin and Ribeck, Judy Carola and Wenner, Lena and Wang, Petrus}, year = {2021}, publisher = {Svenska Akademien}, address = {Stockholm}, } @techreport{megyesi-etal-2021-swell-311730, title = {SweLL pseudonymization guidelines}, abstract = {The current document is a part of the SweLL guidelines series consisting of four parts which aim to report how we have worked on the material and which decisions we have made. 
Guidelines are available for each step in the manual annotation process, including: • Transcription guidelines • Pseudonymization guidelines • Normalization guidelines • Correction annotation guidelines We specifically described all processes in English to make sure our principles and experience can be of help to people working on other learner infrastructure projects independent of the language.}, author = {Megyesi, Beáta and Rudebeck, Lisa and Volodina, Elena}, year = {2021}, publisher = {Institutionen för svenska språket, Göteborgs universitet}, address = {Göteborg}, ISSN = {1401-5919}, } @inProceedings{kokkinakis-2021-insights-307200, title = {Insights on a Swedish Covid-19 corpus}, abstract = {The COVID-19 pandemic has had a serious impact on people all over the world, from mental and physical health to economic downturn to education and social relationships, while political decisions in many countries have had a profound impact on the lives of all people regardless of age. Many of these effects can be studied with statistical and qualitative data such as collected questionnaires and sickness absence rates. But large-scale studies require expertise in multiple domains and from many points of view. SpråkbankenText continuously collects text from various sources. In order to fill the gap in the lack of an available Swedish COVID-19-related dataset, we started to build a Swedish COVID-19 corpus (sv-COVID-19). Various tools for e.g. lexical, semantic or pragmatic/discourse analyses can be then applied in order to answer relevant questions on e.g. how people, on a larger scale than what can be obtained through qualitative studies, experienced their everyday life through the different phases of COVID-19 crisis, or how political decisions and their consequences are described and discussed.}, booktitle = {CLARIN Annual Conference (Virtual Event). 27 – 29 September 2021},
editor = {Monachini, Monica and Eskevich, Maria}, pages = {31--34}, author = {Kokkinakis, Dimitrios}, year = {2021}, } @inProceedings{hengchen-etal-2021-sbx--305550, title = {SBX-HY at RuShiftEval 2021: Доверяй, но проверяй}, abstract = {Research in computational lexical semantic change, due to the inherent nature of language change, has been notoriously difficult to evaluate. This led to the creation of many new exciting models that cannot be easily compared. In this system paper, we describe our submissions at RuShiftEval 2021 – one of the few recently shared tasks that enable researchers, through a standard evaluation set and control conditions, to systematically compare models and gain insights from previous work. We show that despite top results in similar tasks on other languages, Temporal Referencing does not seem to perform as well on Russian.}, booktitle = {Computational Linguistics and Intellectual Technologies: Proceedings of the International Conference “Dialogue 2021,” Moscow, June 16–19, 2021}, author = {Hengchen, Simon and Viloria, Kate and Indukaev, Andrey}, year = {2021}, publisher = {Rossiiskii Gosudarstvennyi Gumanitarnyi Universitet}, address = {Moscow}, } @incollection{adesam-etal-2021-lexical-310933, title = {A lexical resource for computational historical linguistics}, abstract = {In this chapter we present the diachronic dimension of Swedish FrameNet++. We describe the historical lexical resources currently available for Swedish, linked to the Contemporary Swedish lexicon Saldo. We present a case study of how interlinking the dictionaries simultaneously allows us to study lexical change. We also present a method of linking text words to lexicon entries, facilitating interactive exploration of historical texts. Diachronical language resources present both a high-variation challenge from a wider language technology perspective, and an interesting object of linguistic study. 
While a number of improvements of the parts of the diachronic lexical macroresource are still needed, this resource is invaluable for analysing and accessing historical texts, as well as for both synchronic historical and diachronic lexical studies.}, booktitle = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications}, author = {Adesam, Yvonne and Andersson, Peter and Borin, Lars and Bouma, Gerlof}, year = {2021}, publisher = {John Benjamins Publishing Company}, address = {Amsterdam / Philadelphia}, ISBN = {978 90 272 5848 9}, pages = {98--121}, } @incollection{blensenius-2021-aspekter-308020, title = {Aspekter som sällan uppmärksammas}, booktitle = {Nyanser av grammatik : gränser, mångfald, fördjupning}, editor = {Brandtler, Johan and Kalm, Mikael}, author = {Blensenius, Kristian}, year = {2021}, publisher = {Studentlitteratur}, address = {Lund}, ISBN = {9789144136233}, pages = {263--274}, } @article{antonsson-etal-2021-using-301490, title = {Using a Discourse Task to Explore Semantic Ability in Persons With Cognitive Impairment.}, abstract = {This paper uses a discourse task to explore aspects of semantic production in persons with various degree of cognitive impairment and healthy controls. The purpose of the study was to test if an in-depth semantic analysis of a cognitive-linguistic challenging discourse task could differentiate persons with a cognitive decline from those with a stable cognitive impairment. Both quantitative measures of semantic ability, using tests of oral lexical retrieval, and qualitative analysis of a narrative were used to detect semantic difficulties. Besides group comparisons a classification experiment was performed to investigate if the discourse features could be used to improve classification of the participants who had a stable cognitive impairment from those who had cognitively declined. 
In sum, both types of assessment methods captured difficulties between the groups, but tests of oral lexical retrieval most successfully differentiated between the cognitively stable and the cognitively declined group. Discourse features improved classification accuracy and the best combination of features discriminated between participants with a stable cognitive impairment and those who had cognitively declined with an area under the curve (AUC) of 0.93.}, journal = {Frontiers in aging neuroscience}, author = {Antonsson, Malin and Lundholm Fors, Kristina and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2021}, volume = {12}, } @techreport{volodina-megyesi-2021-swell-311729, title = {SweLL transcription guidelines, L2 essays}, abstract = {The current document is a part of the SweLL guidelines series consisting of four parts which aim to report how we have worked on the material and which decisions we have made. Guidelines are available for each step in the manual annotation process, including: • Transcription guidelines • Pseudonymization guidelines • Normalization guidelines • Correction annotation guidelines We specifically described all processes in English to make sure our principles and experience can be of help to people working on other learner infrastructure projects independent of the language.}, author = {Volodina, Elena and Megyesi, Beáta}, year = {2021}, publisher = {Institutionen för svenska språket, Göteborgs universitet}, address = {Göteborg}, } @misc{gagliardi-etal-2021-editorial-307124, title = {Editorial: Digital Linguistic Biomarkers: Beyond Paper and Pencil Test}, abstract = {Over the last decades, a growing body of linguistic studies have been devoted to the clinical domain (Perkins 2011), while the amount of experimental linguistic research focusing on neuroscience and mental health has increased exponentially during the last few years. 
Considering that many of the factors underlying cognitive and neuropsychiatric disorders may yield to late symptoms that are hard to foresee, it is often difficult to predict the existence of a presence or risk of a disease, as well as the disease’s trajectory. In this context, interdisciplinary approaches gain increasing popularity, and the analysis of complex behaviour – such as speech and language – emerges as a natural candidate to identify and analyse the extent to which a given neuropathology can impact the cognitive system at the very early stages. In this context, the development of cognitive evaluation and intervention tools focusing on linguistic biomarkers becomes a critical scientific arena both in and outside the clinic and laboratory (see Petrizzo & Popolo, 2020). Recent international research has demonstrated that automated collected and analysed quantitative linguistic features, easily extractable from a patient’s verbal productions, can be very useful in separating people with various cognitive or mental impairment from healthy subjects, even at a very early stage (see Bedi et al., 2015), and even to predict the outcomes of clinical interventions (see Carrillo et al., 2018). In this line, machine learning-based language technology methods and tools based on artificial intelligence are particularly promising to address this task (Locke et al. 2021; Sigman et al., 2021). Indeed, subtle language disruptions can be employed as digital linguistic biomarkers, namely objective, quantifiable behavioural data that can be collected and measured by means of digital devices, allowing for a low-cost pathology detection, classification and monitoring. Compared to classical pen-and-paper neuropsychological tests, the use of these instruments shows many advantages – such as its non-intrusive and time-effective application – providing not only offline, but also online measures that serve as a proxy for cognitive processing and its underlying mechanisms. 
The aim of the Research Topic Digital Linguistic Biomarkers: Beyond Paper and Pencil Tests is to provide a state-of-the-art overview of this multidisciplinary and constantly evolving area of research, bringing together contributions from different quarters of the cognitive sciences. The collection comprises one systematic review, six original research papers, and one opinion paper. The articles are based on empirical and theoretical research from several disciplines (i.e., linguistics, psychology, Artificial Intelligence), and they tackle a range of developmental and acquired disorders. Most probably, dementia assessment has been one of the most rapidly evolving domain of Natural Language Processing (NLP) application for medical science (Petti, Baker & Korhonen 2020), but this approach is spreading rapidly through the community, with encouraging results on both developmental and acquired pathologies, as shown in the current article collection (i.e., autism, developmental language disorder, attention-deficit hyperactivity disorder, Alzheimer’s disease and mild cognitive impairment, or Parkinson’s disease). 
Furthermore, this Research Topic covers a variety of test languages showing the degree of internationalization of the research on the analysis verbal productions (i.e., English, Italian, German, and Japanese).}, author = {Gagliardi, Gloria and Kokkinakis, Dimitrios and Dunabeitia, Jon Andoni}, year = {2021}, volume = {12}, pages = {752238}, } @incollection{borin-2021-multiword-311388, title = {Multiword expressions – a tough typological nut for Swedish FrameNet++}, booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications}, editor = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin}, author = {Borin, Lars}, year = {2021}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027209900}, pages = {221--259}, } @inProceedings{blensenius-etal-2021-finding-306723, title = {Finding gaps in semantic descriptions. Visualisation of the cross-reference network in a Swedish monolingual dictionary}, abstract = {Providing lexical information in dictionary entries by cross-referencing between semantically related headwords is very important, both from a reception-oriented and a production-oriented perspective. This study presents a survey of cross-references in a comprehensive monolingual dictionary of Swedish. It discusses cross-referencing in dictionaries in general as well as in the Swedish dictionary, focusing on the following four types of paradigmatic cross-references: SEE, COMPARE, SYNONYM, and OPPOSITE. By using data-visualisation software, the semantic network in the dictionary is overviewed in a new way. Furthermore, errors, gaps as well as other areas of improvement in the dictionary related to cross-referencing are discovered. Moreover, the relationships between the existing cross-references, how they are introduced in the dictionary and the dictionary's intended target groups are addressed. 
The study also reveals that the traditional lexicographic policies of the dictionary need to be adjusted to take advantage of the transition from paper to electronic publication.}, booktitle = {Electronic lexicography in the 21st century. Proceedings of the eLex 2021 conference. 5–7 July 2021, virtual}, editor = {Kosem, I. and Cukr, M. and Jakubíček, M. and Kallas, J. and Krek, S. and Tiberius, C.}, author = {Blensenius, Kristian and Sköldberg, Emma and Bäckerud, Erik}, year = {2021}, publisher = {Lexical Computing CZ s.r.o}, address = {Brno}, } @book{alfter-2021-exploring-304548, title = {Exploring natural language processing for single-word and multi-word lexical complexity from a second language learner perspective}, abstract = {In this thesis, we investigate how natural language processing (NLP) tools and techniques can be applied to vocabulary aimed at second language learners of Swedish in order to classify vocabulary items into different proficiency levels suitable for learners of different levels. In the first part, we use feature-engineering to represent words as vectors and feed these vectors into machine learning algorithms in order to (1) learn CEFR labels from the input data and (2) predict the CEFR level of unseen words. Our experiments corroborate the finding that feature-based classification models using 'traditional' machine learning still outperform deep learning architectures in the task of deciding how complex a word is. In the second part, we use crowdsourcing as a technique to generate ranked lists of multi-word expressions using both experts and non-experts (i.e. language learners). Our experiment shows that non-expert and expert rankings are highly correlated, suggesting that non-expert intuition can be seen as on-par with expert knowledge, at least in the chosen experimental configuration. The main practical output of this research comes in two forms: prototypes and resources. 
We have implemented various prototype applications for (1) the automatic prediction of words based on the feature-engineering machine learning method, (2) language learning applications using graded word lists, and (3) an annotation tool for the manual annotation of expressions across a variety of linguistic factors.}, author = {Alfter, David}, year = {2021}, publisher = {Göteborgs universitet}, ISBN = {978-91-87850-79-0}, } @incollection{johansson-etal-2021-semantic-310775, title = {Semantic Role Labeling}, booktitle = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications}, editor = {Dana Dannélls and Lars Borin and Karin Friberg Heppin}, author = {Johansson, Richard and Friberg Heppin, Karin and Kokkinakis, Dimitrios}, year = {2021}, publisher = {John Benjamins Publishing Company}, address = {Amsterdam / Philadelphia}, ISBN = {978 90 272 5848 9}, pages = {264--280}, } @article{alfter-etal-2021-crowdsourcing-311721, title = {Crowdsourcing Relative Rankings of Multi-Word Expressions: Experts versus Non-Experts}, abstract = {In this study we investigate to which degree experts and non-experts agree on questions of difficulty in a crowdsourcing experiment. We ask non-experts (second language learners of Swedish) and two groups of experts (teachers of Swedish as a second/foreign language and CEFR experts) to rank multi-word expressions in a crowdsourcing experiment. We find that the resulting rankings by all the three tested groups correlate to a very high degree, which suggests that judgments produced in a comparative setting are not influenced by professional insights into Swedish as a second language.}, journal = {Northern European Journal of Language Technology (NEJLT)}, author = {Alfter, David and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2021}, volume = {7}, number = {1}, } @inProceedings{backerud-etal-2021-anvands-299384, title = {Så används Svenska Akademiens ordböcker på nätet. 
Implicit och explicit feedback från användarna}, abstract = {This study presents and analyses search strings and user data for different Swedish lexicographical websites. The underlying empirical material was sourced from two relatively new websites, www.saob.se and the joint dictionary portal www.svenska.se, which collects and provides a single point of entry to three Swedish monolingual dictionaries financed by the Swedish Academy. Statistics are presented on the most common search strings, when and where the searches take place, and what devices and digital platforms that are commonly used while visiting the sites. In addition, the study addresses a number of questions and other forms of feedback received from dictionary users. Furthermore, the study provides suggestions and examples of how the collected data can be utilised in upcoming updates and revisions of the dictionaries.}, booktitle = {Nordiska studier i lexikografi 15}, editor = {Sandström, C. and Forsberg, U.-M. and af Hällström-Reijonen, C. and Lehtonen, M. and Ruppel, K.}, author = {Bäckerud, Erik and Nilsson, Pär and Sköldberg, Emma}, year = {2021}, ISBN = {978-952-7359-03-7}, } @article{hengchen-tahmasebi-2021-collection-301262, title = {A Collection of Swedish Diachronic Word Embedding Models Trained on Historical Newspaper Data}, abstract = {This paper describes the creation of several word embedding models based on a large collection of diachronic Swedish newspaper material available through Språkbanken Text, the Swedish language bank. 
This data was produced in the context of Språkbanken Text’s continued mission to collaborate with humanities and natural language processing (NLP) researchers and to provide freely available language resources, for the development of state-of-the-art NLP methods and tools.}, journal = {Journal of Open Humanities Data}, author = {Hengchen, Simon and Tahmasebi, Nina}, year = {2021}, volume = {7}, number = {2}, pages = {1--7}, } @article{blensenius-etal-2021-saol-309798, title = {SAOL 14 som rättesnöre - diskussion kring den senaste upplagan}, abstract = {The article discusses recommendations concerning orthography, morphology, semantics, etc. provided in the most recent edition of the Swedish Academy Glossary (2015). These issues are discussed particularly in relation to the glossary users and to the other language resources of the Swedish Academy, which are now easily compared online. Points for improvement are identified.}, journal = {LexicoNordica}, author = {Blensenius, Kristian and Holmer, Louise and Sköldberg, Emma}, year = {2021}, volume = {28}, pages = {39--58}, } @inProceedings{skelbye-dannells-2021-processing-306957, title = {OCR Processing of Swedish Historical Newspapers Using Deep Hybrid CNN–LSTM Networks}, abstract = {Deep CNN–LSTM hybrid neural networks have proven to improve the accuracy of Optical Character Recognition (OCR) models for different languages. In this paper we examine to what extent these networks improve the OCR accuracy rates on Swedish historical newspapers. By experimenting with the open source OCR engine Calamari, we are able to show that mixed deep CNN–LSTM hybrid models outperform previous models on the task of character recognition of Swedish historical newspapers spanning 1818–1848. We achieved an average character accuracy rate (CAR) of 97.43% which is a new state–of–the–art result on 19th century Swedish newspaper text. 
Our data, code and models are released under CC BY licence.},
  booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
  editor = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
  author = {Skelbye, Molly and Dannélls, Dana},
  year = {2021},
  publisher = {INCOMA},
  address = {Shoumen, Bulgaria},
  ISBN = {978-954-452-072-4},
}

@incollection{borin-etal-2021-introduction-310200,
  title = {Introduction: Swedish FrameNet++},
  abstract = {The Swedish FrameNet++ was designed to be several things. As a digital artifact, it is an integrated panchronic lexical macroresource, primarily for Swedish, but including several other languages, intended as a basic infrastructural component in Swedish language technology research and for developing natural language processing applications. As an activity, it is a long-term R\&D initiative, initially aimed at bringing about this macroresource, and now at maintaining and extending it, at promoting its use in language technology research and application development, as well as ensuring that the results of this research and development in their turn are incorporated in the macroresource. As a product of research, it reflects both computational and linguistic approaches to lexicology, lexical semantics, and lexical typology.},
  booktitle = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
  editor = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  author = {Borin, Lars and Dannélls, Dana and Friberg Heppin, Karin},
  year = {2021},
  publisher = {John Benjamins Publishing Company},
  address = {Amsterdam / Philadelphia},
  ISBN = {978-90-272-5848-9},
  pages = {3--36},
}

@inProceedings{virk-etal-2021-deep-319450,
  title = {A Deep Learning System for Automatic Extraction of Typological Linguistic Information from Descriptive Grammars},
  abstract = {Linguistic typology is an area of linguistics concerned with analysis of and comparison between natural languages of the world based on their certain linguistic features. For that purpose, historically, the area has relied on manual extraction of linguistic feature values from textural descriptions of languages. This makes it a laborious and time expensive task and is also bound by human brain capacity. In this study, we present a deep learning system for the task of automatic extraction of linguistic features from textual descriptions of natural languages. First, textual descriptions are manually annotated with special structures called semantic frames. Those annotations are learned by a recurrent neural network, which is then used to annotate un-annotated text. Finally, the annotations are converted to linguistic feature values using a separate rule based module. Word embeddings, learned from general purpose text, are used as a major source of knowledge by the recurrent neural network. We compare the proposed deep learning system to a previously reported machine learning based system for the same task, and the deep learning system wins in terms of F1 scores with a fair margin. 
Such a system is expected to be a useful contribution for the automatic curation of typological databases, which otherwise are manually developed},
  booktitle = {Proceedings of Recent Advances in Natural Language Processing, Sep 1–3, 2021},
  editor = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
  author = {Virk, Shafqat and Foster, Daniel and Sheikh Muhammad, Azam and Saleem, Raheela},
  year = {2021},
  publisher = {Association for Computational Linguistics (ACL)},
  ISBN = {978-954-452-072-4},
}

@inProceedings{hengchen-tahmasebi-2021-supersim-305157,
  title = {SuperSim: a test set for word similarity and relatedness in Swedish},
  abstract = {Language models are notoriously difficult to evaluate. We release SuperSim, a large-scale similarity and relatedness test set for Swedish built with expert human judgments. The test set is composed of 1,360 word-pairs independently judged for both relatedness and similarity by five annotators. We evaluate three different models (Word2Vec, fastText, and GloVe) trained on two separate Swedish datasets, namely the Swedish Gigaword corpus and a Swedish Wikipedia dump, to provide a baseline for future comparison. We release the fully annotated test set, code, baseline models, and data.},
  booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31–June 2, 2021, Reykjavik, Iceland (online)},
  author = {Hengchen, Simon and Tahmasebi, Nina},
  year = {2021},
  publisher = {Linköping Electronic Conference Proceedings},
  address = {Linköping},
  ISBN = {978-91-7929-614-8},
}

@edited_book{skoldberg-etal-2021-svensk-305242,
  title = {Svensk ordbok utgiven av Svenska Akademien, andra upplagan},
  abstract = {Andra upplagan av Svensk ordbok utgiven av Svenska Akademien har utarbetats av en forskargrupp vid Institutionen för svenska språket, Göteborgs universitet. 
Den är utgiven av Svenska Akademien och publicerad i ordboksportalen svenska.se},
  editor = {Sköldberg, Emma and Blensenius, Kristian and Hannesdottir, Anna Helga and Holmer, Louise and Landqvist, Hans and Martens, Monica and Petersson, Stellan and Hult, Ann-Kristin and Ribeck, Judy Carola and Wenner, Lena and Wang, Petrus and Bäckerud, Erik},
  year = {2021},
  publisher = {Svenska Akademien},
  address = {Stockholm},
}

@book{dannells-etal-2021-swedish-310036,
  title = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
  abstract = {Large computational lexicons are central NLP resources. Swedish FrameNet++ aims to be a versatile full-scale lexical resource for NLP containing many kinds of linguistic information. Although focused on Swedish, this ongoing effort, which includes building a new Swedish framenet and recycling existing lexicons, has offered valuable insights into general aspects of lexical-resource building for NLP, which are discussed in this book: computational and linguistic problems of lexical semantics and lexical typology, the nature of lexical items (words and multiword expressions), achieving interoperability among heterogeneous lexical content, NLP methods for extending and interlinking existing lexicons, and deploying the new resource in practical NLP applications. This book is targeted at everyone with an interest in lexicography, computational lexicography, lexical typology, lexical semantics, linguistics, computational linguistics and related fields. 
We believe it should be of particular interest to those who are or have been involved in language resource creation, development and evaluation.},
  author = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  year = {2021},
  publisher = {John Benjamins Publishing Company},
  address = {Amsterdam, Philadelphia},
  ISBN = {9789027209900},
}

@incollection{jatowt-etal-2021-computational-307061,
  title = {Computational approaches to lexical semantic change: Visualization systems and novel applications},
  abstract = {The purpose of this chapter is to survey visualization and user interface solutions for understanding lexical semantic change as well as to survey a number of applications of techniques developed in computational analysis of lexical semantic change. We first overview approaches aiming to develop systems that support understanding semantic change in an interactive and visual way. It is generally accepted that computational techniques developed for analyzing and uncovering semantic change are beneficial to linguists, historians, sociologists, and practitioners in numerous related fields, especially within the humanities. However, quite a few non-professional users are equally interested in the histories of words. Developing interactive, visual, engaging, and easy-to-understand systems can help them to acquire relevant knowledge. Second, we believe that other fields could benefit from the research outcomes of computational approaches to lexical semantic change. In general, properly representing the meaning of terms used in the past should be important for a range of natural language processing, information retrieval and other tasks that operate on old texts. 
In the latter part of the chapter, we then focus on current and potential applications related to computer and information science with the underlying question: “How can modeling semantic change benefit wider downstream applications in these disciplines?”},
  booktitle = {Computational approaches to semantic change},
  author = {Jatowt, Adam and Tahmasebi, Nina and Borin, Lars},
  year = {2021},
  publisher = {Language Science Press},
  address = {Berlin},
  ISBN = {978-3-96110-312-6},
  pages = {311--339},
}

@inProceedings{duong-etal-2021-unsupervised-305156,
  title = {An Unsupervised method for OCR Post-Correction and Spelling Normalisation for Finnish},
  abstract = {Historical corpora are known to contain errors introduced by OCR (optical character recognition) methods used in the digitization process, often said to be degrading the performance of NLP systems. Correcting these errors manually is a time-consuming process and a great part of the automatic approaches have been relying on rules or supervised machine learning. We build on previous work on fully automatic unsupervised extraction of parallel data to train a character-based sequence-to-sequence NMT (neural machine translation) model to conduct OCR error correction designed for English, and adapt it to Finnish by proposing solutions that take the rich morphology of the language into account. Our new method shows increased performance while remaining fully unsupervised, with the added benefit of spelling normalisation. 
The source code and models are available on GitHub and Zenodo.},
  booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31–June 2, 2021, Reykjavik, Iceland (online)},
  author = {Duong, Quan and Hämäläinen, Mika and Hengchen, Simon},
  year = {2021},
  publisher = {Linköping Electronic Conference Proceedings},
  address = {Linköping},
  ISBN = {978-91-7929-614-8},
}

@inProceedings{hammarlin-etal-2021-vaccine-307227,
  title = {Vaccine hesitancy – trust and distrust in medical expertise and authorities},
  abstract = {The increase of vaccine hesitancy is singled out by WHO as one of the ten most important and urgent threats to global health (https://www.who.int/emergencies/ten-threats-to-global-health-in-2019). Diseases like measles are returning in different parts of Europe, partly as a result of the activities of the anti-vaccination movement. The herd immunity in most Western countries is high but even a small decrease in vaccination would have immediate negative effects for the population. Sweden offers a perfect site for future anti-vaccination studies due to its high vaccination covering. A decline in the numbers of children vaccinated has had immediate effects. For example, the incident rate in the country of pertussis rose from 700 cases to 3,200 cases per 100,000 children in 4 years due to a rather small decrease in vaccinations. This constitutes a strong argument for the civic importance of the case. The aim of this presentation is to introduce a new 4-year research project (2020–2023), independently financed by the Bank of Sweden Foundation (Riksbankens jubileumsfond), with the goal to investigate the role and importance of rumouring for the vaccination skepticism growing on the internet, and how it can be understood as an expression of civic engagement in the present digital times entailing crucial transformations for everyday civic culture. 
Theoretically, the project builds upon, and develop, media researcher Dahlgren’s work on civic culture and Kitta’s studies of the anti-vaccination movement. The overarching research question is: How have the everyday practice and experience of, and the conditions for, rumours been shaped and reshaped in the digital age, and what do these processes mean for civic engagement and participation? The project will offer an understanding of how everyday interaction on the internet has a powerful impact on the spreading of false information, which in the long run may challenge democracy. On a more concrete level the project will answer the following questions in relation to the case of vaccine skepticism: How are rumours about alleged risks and dangers of vaccination propagated and established on the internet? Are there specific patterns and correlations connecting topics, assumptions, myths, argumentation schemes, popularity and time? What do everyday practices, on- and offline, of rumouring mean for its adherents’ civic engagement in the anti-vaccination movement? Which are the civic implications of the spreading and circulation of vaccination hostile rumours on individual citizens and society at large?},
  booktitle = {8th European Communication Conference (ECREA)},
  author = {Hammarlin, Mia-Marie and Miegel, Fredrik and Borin, Lars and Kokkinakis, Dimitrios and Jaakonaho, Anna},
  year = {2021},
}

@edited_book{skoldberg-etal-2021-svensk-305730,
  title = {Svensk ordbok utgiven av Svenska Akademien som app 2021 (för iOS)},
  abstract = {Andra upplagan av Svensk ordbok utgiven av Svenska Akademien har utarbetats av en forskargrupp vid Institutionen för svenska språket, Göteborgs universitet. Den är utgiven av Svenska Akademien och publicerad i ordboksportalen svenska.se. 
Appen är utgiven av Svenska Akademien och utvecklad av Petrus Wang i samarbete med Institutionen för svenska språket, Göteborgs universitet.},
  editor = {Sköldberg, Emma and Blensenius, Kristian and Hannesdottir, Anna Helga and Holmer, Louise and Landqvist, Hans and Martens, Monica and Petersson, Stellan and Hult, Ann-Kristin and Ribeck, Judy Carola and Wenner, Lena and Wang, Petrus},
  year = {2021},
  publisher = {Svenska Akademien},
  address = {Stockholm},
}

@inProceedings{marjanen-etal-2021-topic-304736,
  title = {Topic Modelling Discourse Dynamics in Historical Newspapers},
  abstract = {This paper addresses methodological issues in diachronic data analysis for historical research. We apply two families of topic models (LDA and DTM) on a relatively large set of historical newspapers, with the aim of capturing and understanding discourse dynamics. Our case study focuses on newspapers and periodicals published in Finland between 1854 and 1917, but our method can easily be transposed to any diachronic data. Our main contributions are a) a combined sampling, training and inference procedure for applying topic models to huge and imbalanced diachronic text collections; b) a discussion on the differences between two topic models for this type of data; c) quantifying topic prominence for a period and thus a generalization of document-wise topic assignment to a discourse level; and d) a discussion of the role of humanistic interpretation with regard to analysing discourse dynamics through topic models.},
  booktitle = {CEUR Workshop Proceedings. Post-Proceedings of the 5th Conference Digital Humanities in the Nordic Countries (DHN 2020), Riga, Latvia, October 21-23, 2020},
  author = {Marjanen, Jani and Zosa, Elaine and Hengchen, Simon and Pivovarova, Lidia and Tolonen, Mikko},
  year = {2021},
  publisher = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik},
  address = {Aachen},
}

@incollection{petersson-skoldberg-2021-semantic-307114,
  title = {Semantic change in Swedish – from a lexicographic perspective},
  abstract = {In this chapter, we examine semantic change in the general vocabulary of present-day Swedish and its lexicographic description. We discuss the question of whether automatic and semi-automatic methods of computational linguistics are relevant to lexicography and conclude that such methods can facilitate, formalize, and sharpen lexicographic investigations of semantic change.},
  booktitle = {Computational approaches to semantic change},
  editor = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
  author = {Petersson, Stellan and Sköldberg, Emma},
  year = {2021},
  publisher = {Language Science Press},
  address = {Berlin},
  ISBN = {978-3-98554-008-2},
  pages = {149--167},
}

@article{borin-etal-2021-birds-309082,
  title = {A bird’s-eye view on South Asian languages through LSI: Areal or genetic relationships?},
  abstract = {We present initial exploratory work on illuminating the long-standing question of areal versus genealogical connections in South Asia using computational data visualization tools. With respect to genealogy, we focus on the subclassification of Indo-Aryan, the most ubiquitous language family of South Asia. The intent here is methodological: we explore computational methods for visualizing large datasets of linguistic features, in our case 63 features from 200 languages representing four language families of South Asia, coming out of a digitized version of Grierson’s Linguistic Survey of India. To this dataset we apply phylogenetic software originally developed in the context of computational biology for clustering the languages and displaying the clusters in the form of networks. 
We further explore multiple correspondence analysis as a way of illustrating how linguistic feature bundles correlate with extrinsically defined groupings of languages (genealogical and geographical). Finally, map visualization of combinations of linguistic features and language genealogy is suggested as an aid in distinguishing genealogical and areal features. On the whole, our results are in line with the conclusions of earlier studies: Areality and genealogy are strongly intertwined in South Asia, the traditional lower-level subclassification of Indo-Aryan is largely upheld, and there is a clearly discernible areal east–west divide cutting across language families.},
  journal = {Journal of South Asian Languages and Linguistics},
  author = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard},
  year = {2021},
  volume = {7},
  number = {2},
  pages = {151--185},
}

@incollection{bouma-cousse-2021-hulpwerkwoorden-311029,
  title = {Hulpwerkwoorden stapelen – toen en nu.},
  booktitle = {Wat gebeurt er in het Nederlands? : over taal, frequentie en variatie},
  editor = {van der Sijs, Nicoline and Fonteyn, Lauren and van der Meulen, Marten},
  author = {Bouma, Gerlof and Coussé, Evie},
  year = {2021},
  publisher = {Sterck \& de Vreese},
  address = {Gorredijk},
  ISBN = {9789056158033},
  pages = {36--40},
}

@edited_book{alfter-etal-2021-proceedings-311727,
  title = {Proceedings of the 10th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2021)},
  abstract = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. 
The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.},
  editor = {Alfter, David and Volodina, Elena and Pilán, Ildikó and Graën, Johannes and Borin, Lars},
  year = {2021},
  publisher = {Linköping Electronic Conference Proceedings 177},
  address = {Linköping, Sweden},
  ISBN = {978-91-7929-625-4},
}

@incollection{hengchen-etal-2021-challenges-306972,
  title = {Challenges for computational lexical semantic change},
  abstract = {The computational study of lexical semantic change (LSC) has taken off in the past few years and we are seeing increasing interest in the field, from both computational sciences and linguistics. Most of the research so far has focused on methods for modelling and detecting semantic change using large diachronic textual data, with the majority of the approaches employing neural embeddings. While methods that offer easy modelling of diachronic text are one of the main reasons for the spiking interest in LSC, neural models leave many aspects of the problem unsolved. The field has several open and complex challenges. 
In this chapter, we aim to describe the most important of these challenges and outline future directions.},
  booktitle = {Computational approaches to semantic change},
  editor = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
  author = {Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik and Dubossarsky, Haim},
  year = {2021},
  publisher = {Language Science Press},
  address = {Berlin},
  ISBN = {978-3-98554-008-2},
  pages = {341--372},
}

@inProceedings{dannells-etal-2021-engine-305700,
  title = {A Two-OCR Engine Method for Digitized Swedish Newspapers},
  abstract = {In this paper we present a two-OCR engine method that was developed at Kungliga biblioteket (KB), the National Library of Sweden, for improving the correctness of the OCR for mass digitization of Swedish newspapers. To evaluate the method a reference material spanning the years 1818–2018 was prepared and manually transcribed. A quantitative evaluation was then performed against the material. In this first evaluation we experimented with word lists for different time periods. The results show that even though there was no significant overall improvement of the OCR results, some combinations of word lists are successful for certain periods and should therefore be explored further.},
  booktitle = {Selected Papers from the CLARIN Annual Conference 2020, Linköping Electronic Conference Proceedings 180},
  author = {Dannélls, Dana and Björk, Lars and Dirdal, Ove and Johansson, Torsten},
  year = {2021},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-7929-609-4},
}

@inProceedings{hansson-etal-2021-swedish-305126,
  title = {The Swedish Winogender Dataset},
  abstract = {We introduce the SweWinogender test set, a diagnostic dataset to measure gender bias in coreference resolution. 
It is modelled after the English Winogender benchmark, and is released with reference statistics on the distribution of men and women between occupations and the association between gender and occupation in modern corpus material. The paper discusses the design and creation of the dataset, and presents a small investigation of the supplementary statistics.},
  booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31–June 2, 2021, Reykjavik, Iceland (online)},
  author = {Hansson, Saga and Mavromatakis, Konstantinos and Adesam, Yvonne and Bouma, Gerlof and Dannélls, Dana},
  year = {2021},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-7929-614-8},
}

@inProceedings{dannells-virk-2021-supervised-310123,
  title = {A Supervised Machine Learning Approach for Post-OCR Error Detection for Historical Text},
  abstract = {Training machine learning models with high accuracy requires careful feature engineering, which involves finding the best feature combinations and extracting their values from the data. The task becomes extremely laborious for specific problems such as post Optical Character Recognition (OCR) error detection because of the diversity of errors in the data. In this paper we present a machine learning approach which exploits character n-gram statistics as the only feature for the OCR error detection task. Our method achieves a significant improvement over the baseline reaching state-of-the-art results of 91\% and 89\% F1 measure on English and Swedish datasets respectively. We report various experiments to select the appropriate machine learning algorithm and to compare our approach to previously reported traditional approaches.},
  booktitle = {Linköping Electronic Press Workshop and Conference Collection. Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25-27 November, 2020},
  author = {Dannélls, Dana and Virk, Shafqat},
  year = {2021},
  publisher = {Linköping Electronic Press},
  address = {Linköping},
}

@incollection{perrone-etal-2021-lexical-306974,
  title = {Lexical semantic change for Ancient Greek and Latin},
  abstract = {Change and its precondition, variation, are inherent in languages. Over time, new words enter the lexicon, others become obsolete, and existing words acquire new senses. Associating a word with its correct meaning in its historical context is a central challenge in diachronic research. Historical corpora of classical languages, such as Ancient Greek and Latin, typically come with rich metadata, and existing models are limited by their inability to exploit contextual information beyond the document timestamp. While embedding-based methods feature among the current state of the art systems, they are lacking in their interpretative power. In contrast, Bayesian models provide explicit and interpretable representations of semantic change phenomena. In this chapter we build on GASC, a recent computational approach to semantic change based on a dynamic Bayesian mixture model. In this model, the evolution of word senses over time is based not only on distributional information of lexical nature, but also on text genres. We provide a systematic comparison of dynamic Bayesian mixture models for semantic change with state-of-the-art embedding-based models. On top of providing a full description of meaning change over time, we show that Bayesian mixture models are highly competitive approaches to detect binary semantic change in both Ancient Greek and Latin.},
  booktitle = {Computational approaches to semantic change},
  author = {Perrone, Valerio and Hengchen, Simon and Palma, Marco and Vatri, Alessandro and Smith, Jim Q. and McGillivray, Barbara},
  year = {2021},
  publisher = {Language Science Press},
  address = {Berlin},
  ISBN = {978-3-98554-008-2},
  pages = {287--310},
}

@incollection{dannells-grztis-2021-computational-310047,
  title = {Computational representation of FrameNet for multilingual natural language generation},
  abstract = {Multilingual natural language generation, the process of producing written or spoken utterances in parallel languages from either structured or unstructured representations requires large amounts of syntactic and semantic information to generate an expression that is tailored to the target audience. This information is offered by FrameNet-like resources, which have been developed for a number of languages. In this chapter, we present a computational FrameNet grammar resource for multilingual natural language generation. We compare between English and Swedish framenets to illustrate how these can be unified under a shared computational representation using Grammatical Framework. We demonstrate how the grammar was exploited in two practical multilingual natural language generation applications to facilitate tourist communication and empower museum users with coherent artwork descriptions.},
  booktitle = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
  author = {Dannélls, Dana and Grūzītis, Normunds},
  year = {2021},
  publisher = {John Benjamins Publishing Company},
  address = {Amsterdam / Philadelphia},
  ISBN = {9789027258489},
  pages = {281--301},
}

@article{hengchen-etal-2021-data-309329,
  title = {A data-driven approach to studying changing vocabularies in historical newspaper collections},
  abstract = {Nation and nationhood are among the most frequently studied concepts in the field of intellectual history. At the same time, the word ‘nation’ and its historical usage are very vague. 
The aim in this article was to develop a data-driven method using dependency parsing and neural word embeddings to clarify some of the vagueness in the evolution of this concept. To this end, we propose the following two-step method. First, using linguistic processing, we create a large set of words pertaining to the topic of nation. Second, we train diachronic word embeddings and use them to quantify the strength of the semantic similarity between these words and thereby create meaningful clusters, which are then aligned diachronically. To illustrate the robustness of the study across languages, time spans, as well as large datasets, we apply it to the entirety of five historical newspaper archives in Dutch, Swedish, Finnish, and English. To our knowledge, thus far there have been no large-scale comparative studies of this kind that purport to grasp long-term developments in as many as four different languages in a data-driven way. A particular strength of the method we describe in this article is that, by design, it is not limited to the study of nationhood, but rather expands beyond it to other research questions and is reusable in different contexts.},
  journal = {Digital Scholarship in the Humanities},
  author = {Hengchen, Simon and Ros, Ruben and Marjanen, Jani and Tolonen, Mikko},
  year = {2021},
  volume = {36},
  number = {Supplement 2},
  pages = {109--126},
}

@edited_book{tahmasebi-etal-2021-computational-306968,
  title = {Computational approaches to semantic change},
  abstract = {Semantic change — how the meanings of words change over time — has preoccupied scholars since well before modern linguistics emerged in the late 19th and early 20th century, ushering in a new methodological turn in the study of language change. Compared to changes in sound and grammar, semantic change is the least understood. 
Ever since, the study of semantic change has progressed steadily, accumulating a vast store of knowledge for over a century, encompassing many languages and language families. Historical linguists also early on realized the potential of computers as research tools, with papers at the very first international conferences in computational linguistics in the 1960s. Such computational studies still tended to be small-scale, method-oriented, and qualitative. However, recent years have witnessed a sea-change in this regard. Big-data empirical quantitative investigations are now coming to the forefront, enabled by enormous advances in storage capability and processing power. Diachronic corpora have grown beyond imagination, defying exploration by traditional manual qualitative methods, and language technology has become increasingly data-driven and semantics-oriented. These developments present a golden opportunity for the empirical study of semantic change over both long and short time spans. A major challenge presently is to integrate the hard-earned knowledge and expertise of traditional historical linguistics with cutting-edge methodology explored primarily in computational linguistics. The idea for the present volume came out of a concrete response to this challenge. The 1st International Workshop on Computational Approaches to Historical Language Change (LChange'19), at ACL 2019, brought together scholars from both fields. 
This volume offers a survey of this exciting new direction in the study of semantic change, a discussion of the many remaining challenges that we face in pursuing it, and considerably updated and extended versions of a selection of the contributions to the LChange'19 workshop, addressing both more theoretical problems — e.g., discovery of "laws of semantic change" — and practical applications, such as information retrieval in longitudinal text archives.},
  editor = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
  year = {2021},
  publisher = {Language Science Press},
  address = {Berlin},
  ISBN = {978-3-98554-008-2},
}

@incollection{prentice-etal-2021-language-310517,
  title = {Language learning and teaching with Swedish FrameNet++: Two examples},
  booktitle = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
  editor = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
  author = {Prentice, Julia and Håkansson, Camilla and Linström Tiedemann, Therese and Pilán, Ildikó and Volodina, Elena},
  year = {2021},
  publisher = {John Benjamins Publishing Company},
  address = {Amsterdam, Philadelphia},
  ISBN = {9789027258489},
  pages = {304--329},
}

@incollection{tahmasebi-etal-2021-survey-307058,
  title = {Survey of computational approaches to lexical semantic change detection},
  abstract = {Our languages are in constant flux driven by external factors such as cultural, societal and technological changes, as well as by only partially understood internal motivations. Words acquire new meanings and lose old senses, new words are coined or borrowed from other languages and obsolete words slide into obscurity. Understanding the characteristics of shifts in the meaning and in the use of words is useful for those who work with the content of historical texts, the interested general public, but also in and of itself. 
The findings from automatic lexical semantic change detection and the models of diachronic conceptual change are also currently being incorporated in approaches for measuring document across-time similarity, information retrieval from long-term document archives, the design of OCR algorithms, and so on. In recent years we have seen a surge in interest in the academic community in computational methods and tools supporting inquiry into diachronic conceptual change and lexical replacement. This article provides a comprehensive survey of recent computational techniques to tackle both.}, booktitle = {Computational approaches to semantic change}, editor = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon}, author = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam}, year = {2021}, publisher = {Language Science Press}, address = {Berlin}, ISBN = {978-3-96110-312-6}, pages = {1--91}, } @misc{romanello-hengchen-2021-detecting-304990, title = {Detecting Text Reuse with Passim}, abstract = {In this lesson you will learn about text reuse detection – the automatic identification of reused passages in texts – and why you might want to use it in your research. 
Through a detailed installation guide and two case studies, this lesson will teach you the ropes of Passim, an open source and scalable tool for text reuse detection.}, author = {Romanello, Matteo and Hengchen, Simon}, year = {2021}, volume = {10}, } @incollection{borin-etal-2021-swedish-311387, title = {Swedish FrameNet++ and comparative linguistics}, booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications}, editor = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin}, author = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard}, year = {2021}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027209900}, pages = {139--165}, } @article{zanetti-etal-2021-automatic-311723, title = {Automatic Generation of Exercises for Second Language Learning from Parallel Corpus Data}, abstract = {Creating language learning exercises is a time-consuming task and made-up sample sentences frequently lack authenticity. Authentic samples can be obtained from corpora, but it is necessary to identify material that is suitable for language learners. Parallel corpora of written text consist of translated material. Comparing the text in one language with its translation into another (known) language makes the structure accessible to the learner. However, the correspondence of words between the two languages is more important. By carefully selecting well-suited parallel sentences, a learner can explore the target language in a guided way. We present an approach to generate a novel type of language learning exercise from a large parallel corpus based on movie subtitles. The size of the corpus allows for defining selective criteria, favoring precision over recall. It is a non-trivial task to give reliable feedback to automatically generated exercises. ICALL literature often deals with fill-in-the-blanks exercises or multiple-choice questions, which allow for very limited answer options. 
Our proposed exercise is a special case of sentence reconstruction on bilingual sentence pairs. It combines two elements which have proven to be effective for language learning: a gamified approach, to awaken the students’ competitive desire, and the identification of syntactic structures and vocabulary use, to improve language sensitivity. This article presents the methods used to select example pairs and to implement a prototype.}, journal = {International Journal of TESOL Studies}, author = {Zanetti, Arianna and Volodina, Elena and Graën, Johannes}, year = {2021}, volume = {3}, number = {2}, pages = {55--71}, } @incollection{borin-etal-2021-swedish-311385, title = {Swedish FrameNet++ – lexical samsara}, booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications}, editor = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin}, author = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart and Zechner, Niklas}, year = {2021}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027209900}, pages = {69--95}, } @inProceedings{virk-etal-2021-data-306964, title = {A Data-Driven Semi-Automatic Framenet Development Methodology}, abstract = {FrameNet is a lexical semantic resource based on the linguistic theory of frame semantics. A number of framenet development strategies have been reported previously and all of them involve exploration of corpora and a fair amount of manual work. Despite previous efforts, there does not exist a well-thought-out automatic/semi-automatic methodology for frame construction. In this paper we propose a data-driven methodology for identification and semi-automatic construction of frames. As a proof of concept, we report on our initial attempts to build a wider-scale framenet for the legal domain (LawFN) using the proposed methodology. 
The constructed frames are stored in a lexical database and together with the annotated example sentences they have been made available through a web interface.}, booktitle = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021}, editor = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina}, author = {Virk, Shafqat and Dannélls, Dana and Borin, Lars and Forsberg, Markus}, year = {2021}, publisher = {INCOMA}, address = {Shoumen, Bulgaria}, ISBN = {978-954-452-072-4}, } @inProceedings{adesam-berdicevskis-2021-part-304973, title = {Part-of-speech tagging of Swedish texts in the neural era}, booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics, NoDaLiDa, May 31–2 June, 2021, Reykjavik, Iceland (online)}, editor = {Dobnik, Simon and Øvrelid, Lilja}, author = {Adesam, Yvonne and Berdicevskis, Aleksandrs}, year = {2021}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7929-614-8}, pages = {200--209}, } @article{landqvist-pilke-2021-intresse-311187, title = {Intresse och engagemang: Kungliga Tekniska högskolans insatser i ett svenskt terminologiskt nätverk 1941–1983}, abstract = {In this paper, we study how KTH Royal Institute of Technology (Kungliga Tekniska högskolan – KTH) has participated in and influenced terminology work coordinated by the national terminology centre, the Swedish Centre for Technical Terminology – the TNC, in Sweden during the period 1941–1983. The aim of this paper is to shed light on the development of Swedish (technical) terminology based on networking and experts’ efforts. Based on archive material, we analyze who have been the active KTH experts, in what ways they were involved in the development processes and what effects their efforts had on the term recommendations given by the TNC. 
The archive material consists of written documents relating to the work process developed by John Wennerberg, who led the TNC between 1941 and 1957. The process was carried out in the form of 373 formal survey letters representing 17 subject fields, with both the TNC and external parties participating. Our results show that the 31 identified KTH experts play a visible role in the processes by 480 received survey letters within 14 subject fields. The response rate, 80 percent, reveal the experts’ involvement in the process and their high esteem of TNC’s work. The analysis of the comprehensive survey letter R198 shows that Wennerberg has considered the experts’ answers regarding terms (selection, linguistic form, acceptance/discourage) and definitions when he has published TNC’s recommendations. Our study shows that networking and experts representing the educational sector and furthermore universities have been an inseparable part of the development of Swedish (technical) terminology during several decades when the national terminology centre in Sweden began to operate and the working methods were established.}, journal = {Folkmålsstudier Meddelanden från Föreningen för nordisk filologi}, author = {Landqvist, Hans and Pilke, Nina}, year = {2021}, volume = {59}, pages = {103--133}, } @incollection{linden-etal-2021-multilingual-311386, title = {A multilingual net of lexical resources}, booktitle = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications / editor(s): Dana Dannélls, Lars Borin and Karin Friberg Heppin}, author = {Lindén, Krister and Niemi, Jyrki and Borin, Lars and Forsberg, Markus and Pedersen, Bolette S. 
and Nimb, Sanni and Orav, Heili and Kahusk, Neeme and Vider, Kadri}, year = {2021}, publisher = {John Benjamins}, address = {Amsterdam}, ISBN = {9789027209900}, pages = {123--137}, } @inProceedings{landqvist-2021-finlandssvenska-304467, title = {”finlandssvenska” + ”betydelsefulla” + ”översättare till svenska språket” = ? Upplysningar och urval i Svenskt översättarlexikon}, booktitle = {Sektionsföredrag vid Svenskan i Finland 19, 6–7 maj 2021, Åbo Akademi i Vasa}, author = {Landqvist, Hans}, year = {2021}, } @article{basirat-etal-2021-empirical-302492, title = {An empirical study on the contribution of formal and semantic features to the grammatical gender of nouns}, abstract = {This study conducts an experimental evaluation of two hypotheses about the contributions of formal and semantic features to the grammatical gender assignment of nouns. One of the hypotheses (Corbett and Fraser 2000) claims that semantic features dominate formal ones. The other hypothesis, formulated within the optimal gender assignment theory (Rice 2006), states that form and semantics contribute equally. Both hypotheses claim that the combination of formal and semantic features yields the most accurate gender identification. In this paper, we operationalize and test these hypotheses by trying to predict grammatical gender using only character-based embeddings (that capture only formal features), only context-based embeddings (that capture only semantic features) and the combination of both. We performed the experiment using data from three languages with different gender systems (French, German and Russian). Formal features are a significantly better predictor of gender than semantic ones, and the difference in prediction accuracy is very large. 
Overall, formal features are also significantly better than the combination of form and semantics, but the difference is very small and the results for this comparison are not entirely consistent across languages.}, journal = {Linguistics Vanguard}, author = {Basirat, Ali and Allassonnière-Tang, Marc and Berdicevskis, Aleksandrs}, year = {2021}, volume = {7}, number = {1}, } @inProceedings{goldfarb-tarrant-etal-2021-intrinsic-312616, title = {Intrinsic Bias Metrics Do Not Correlate with Application Bias}, abstract = {Natural Language Processing (NLP) systems learn harmful societal biases that cause them to amplify inequality as they are deployed in more and more situations. To guide efforts at debiasing these systems, the NLP community relies on a variety of metrics that quantify bias in models. Some of these metrics are intrinsic, measuring bias in word embedding spaces, and some are extrinsic, measuring bias in downstream tasks that the word embeddings enable. Do these intrinsic and extrinsic metrics correlate with each other? We compare intrinsic and extrinsic metrics across hundreds of trained models covering different tasks and experimental conditions. Our results show no reliable correlation between these metrics that holds in all scenarios across tasks and languages. We urge researchers working on debiasing to focus on extrinsic measures of bias, and to make using these measures more feasible via creation of new challenge sets and annotated test data. 
To aid this effort, we release code, a new intrinsic metric, and an annotated test set focused on gender bias in hate speech.}, booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), 1–6 August 2021, Online}, author = {Goldfarb-Tarrant, Seraphina and Marchant, Rebecca and Muñoz Sánchez, Ricardo and Pandya, Mugdha and Lopez, Adam}, year = {2021}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-954085-52-7}, } @inProceedings{berdicevskis-2021-successes-311655, title = {Successes and failures of Menzerath’s law at the syntactic level}, booktitle = {Proceedings of the Second Workshop on Quantitative Syntax (Quasy, SyntaxFest 2021), 21–25 March, 2022, Sofia, Bulgaria}, editor = {Čech, Radek and Chen, Xinying}, author = {Berdicevskis, Aleksandrs}, year = {2021}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-955917-15-5}, pages = {17--33}, } @misc{pilke-etal-2021-terminology-305860, title = {Terminology as a Societal Resource Possibilities and Responsibilities in a Changing World}, author = {Pilke, Nina and Nissilä, Niina and Landqvist, Hans}, year = {2021}, volume = {27}, number = {1}, pages = {177}, } @inProceedings{volodina-etal-2021-coderoomor-311724, title = {CoDeRooMor: A new dataset for non-inflectional morphology studies of Swedish}, abstract = {The paper introduces a new resource, CoDeRooMor, for studying the morphology of modern Swedish word formation. The approximately 16.000 lexical items in the resource have been manually segmented into word-formation morphemes, and labeled for their categories, such as prefixes, suffixes, roots, etc. Word-formation mechanisms, such as derivation and compounding have been associated with each item on the list. 
The article describes the selection of items for manual annotation and the principles of annotation, reports on the reliability of the manual annotation, and presents tools, resources and some first statistics. Given the ”gold” nature of the resource, it is possible to use it for empirical studies as well as to develop linguistically-aware algorithms for morpheme segmentation and labeling (cf statistical subword approach). The resource is freely available through Språkbanken-Text.}, booktitle = {23rd Nordic Conference on Computational Linguistics (NoDaLiDa) Proceedings, May 31–2 June, 2021, Reykjavik, Iceland Online}, editor = {Dobnik, Simon and Øvrelid, Lilja}, author = {Volodina, Elena and Ali Mohammed, Yousuf and Lindström Tiedemann, Therese}, year = {2021}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7929-614-8}, } @inProceedings{landqvist-etal-2021-vill-310091, title = {Vem vill ta hand om termerna? Terminologiskt arbete som språklig och samhällelig infrastruktur då, nu och sedan}, abstract = {Presentation av arbetet inom det pågående projektet Termer i tid – tidens termer, https://sites.uwasa.fi/term/}, booktitle = {Terminologifrämjandets höst-term-in 2021}, author = {Landqvist, Hans and Nissilä, Niina and Pilke, Nina}, year = {2021}, } @incollection{dannells-etal-2021-swedish-310041, title = {Swedish FrameNet}, abstract = {This chapter describes the development of Swedish FrameNet. A new framenet project often follows one of two methodological approaches: (1) extension, through translation of a different-language – often English – framenet into the target language, and (2) merging, where the resource is built from scratch in the target language. Both approaches have their pros and cons, which have been extensively discussed in the literature. Swedish FrameNet is mainly developed through the extension approach, although balanced with the merging approach. 
Drawing on the two approaches simultaneously, we describe how integrated language resources and tools have been exploited to create and develop Swedish FrameNet: how it was constructed, what it contains, and the basic assumptions underlying the annotation of its contents.}, booktitle = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications}, editor = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin}, author = {Dannélls, Dana and Borin, Lars and Forsberg, Markus and Friberg Heppin, Karin and Toporowska Gronostaj, Maria}, year = {2021}, publisher = {John Benjamins Publishing Company}, address = {Amsterdam / Philadelphia}, ISBN = {9789027258489}, pages = {37--66}, } @article{ehret-etal-2021-meaning-304914, title = {Meaning and Measures: Interpreting and Evaluating Complexity Metrics}, journal = {Frontiers in communication}, author = {Ehret, Katharina and Blumenthal-Dramé, Alice and Bentz, Christian and Berdicevskis, Aleksandrs}, year = {2021}, volume = {6}, } @book{berdicevskis-piperski-2021-skljanki-311612, title = {Tri skljanki popoludni i drugie zadachi po lingvistike}, editor = {Berdicevskis, Aleksandrs and Piperski, Alexander}, year = {2021}, publisher = {Alpina Non-Fiction}, address = {Moskva}, ISBN = {978-5-00139-130-2}, } @inProceedings{volodina-etal-2021-dalaj-311725, title = {DaLAJ - a dataset for linguistic acceptability judgments for Swedish}, abstract = {We present DaLAJ 1.0, a Dataset for Linguistic Acceptability Judgments for Swedish, comprising 9 596 sentences in its first version. DaLAJ is based on the SweLL second language learner data (Volodina et al., 2019), consisting of essays at different levels of proficiency. To make sure the dataset can be freely available despite the GDPR regulations, we have sentence-scrambled learner essays and removed part of the metadata about learners, keeping for each sentence only information about the mother tongue and the level of the course where the essay has been written. 
We use the normalized version of learner language as the basis for DaLAJ sentences, and keep only one error per sentence. We repeat the same sentence for each individual correction tag used in the sentence. For DaLAJ 1.0 four error categories of 35 available in SweLL are used, all connected to lexical or word-building choices. The dataset is included in the SwedishGlue benchmark. Below, we describe the format of the dataset, our insights and motivation for the chosen approach to data sharing.}, booktitle = {Proceedings of the 10th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2021), Online}, author = {Volodina, Elena and Ali Mohammed, Yousuf and Klezl, Julia}, year = {2021}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7929-625-4}, } @misc{pilke-etal-2021-terminology-305859, title = {Terminology as a Societal Resource. Possibilities and Responsibilities in a Changing World.}, author = {Pilke, Nina and Nissilä, Niina and Landqvist, Hans}, year = {2021}, volume = {27}, number = {1}, pages = {3--9}, } @article{pilke-etal-2021-organising-305858, title = {Organising Terminology Work in Sweden from the 1940s onwards – Participatory Expert Roles in Networks}, abstract = {The present study deals with organised terminology work in Sweden from the 1940s to the late 2010s. Using archive material, we describe how practical terminology work was carried out in Sweden during the period 1941–2018/2019, when the Swedish Centre for Technical Terminology/the Swedish Centre for Terminology (TNC) was the central actor. Thereafter, we discuss models for building a new infrastructure for terminology work after the closure of the TNC in 2018/2019. This discussion is based on interviews and analyses of articles and current reports. The study shows that multifaceted contacts with experts, academia, industry and society have played an essential role for terminology work in Sweden since the 1930s. 
In the current situation (2019), the activities are being reorganised and responsibility for terminology work is distributed between several actors. A new main actor is the government agency known as the Institute of Language and Folklore (Isof). Finally, we discuss future visions for terminology work in Sweden.}, journal = {Terminology as a Societal Resource. Possibilities and Responsibilities in a Changing World. Special Issue of Terminology International Journal of Theoretical and Applied Issues in Specialized Communication}, author = {Pilke, Nina and Nissilä, Niina and Landqvist, Hans}, year = {2021}, volume = {27}, number = {1}, pages = {80--109}, } @article{nissila-etal-2021-av-311592, title = {”Av intresse för saken dristar jag mig att till diskussion framlägga ett par spörsmål” – Kaksi suomalaista akateemista uranuurtajaa terminologiaverkoston kirjeenvaihdossa}, abstract = {The Swedish Tekniska Nomenklaturcentralen TNC (2000–2018 Terminologicentrum TNC) has been Sweden's national center for special languages and terminology work for more than 75 years. Since its founding in 1941, the TNC has been active not only in Sweden, but also in establishing and maintaining international contacts. The article describes the contacts between actors in the Swedish and Finnish terminology field, looking in particular at the contacts between the TNC and actors in the Finnish higher education sector between the 1940s and the 1990s. The method utilized is close reading and content analysis. The research material used is the collection of foreign correspondence in the TNC's document archive, and in particular the section stored in connection with the code Ufin, i.e. sections concerning communication between the TNC and Finnish actors. The article describes the topics covered in the communication, the objectives and consequences of the communication and the results achieved. 
The article focuses especially on the contacts between the TNC and two active actors in Finland, professor Jarl Salin at the Åbo Akademi University and professor Christer Laurén at the University of Vaasa. In the analysis, the Ufin themes of the letters were categorized in four main categories: publications, communication, information and language issues. In professor Jarl Salin's letters, the most common theme were language issues, whereas Professor Christer Laurén contacted TNC especially in connection with publications and publishing.}, journal = {Workplace Communication IV (VAKKI Publications 13.) Eds H. Katajamäki, M. Enell-Nilsson, H. Kauppinen-Räisänen, L. Kääntä \& H. Salovaara}, author = {Nissilä, Niina and Heittola, Sanna and Pilke, Nina and Landqvist, Hans}, year = {2021}, volume = {13}, pages = {153--168}, } @inProceedings{landqvist-2021-inequality-305861, title = {(In)equality? A Case Study of Male and Female Translators in Svenskt översättarlexikon}, booktitle = {KäTu2021 Kääntämisen ja tulkkauksen tutkimuksen symposiumi Ohjelma ja abstraktit, 20–22.5.2021, Helsingin yliopisto}, author = {Landqvist, Hans}, year = {2021}, } @misc{ljunglof-etal-2021-selected-306645, title = {Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25-27 November 2020}, abstract = {Selected extended papers from the Eighth Swedish Language Technology Conference (SLTC-2020) which was held between 25-27 November 2020 in Gothenburg and online.}, author = {Ljunglöf, Peter and Dobnik, Simon and Johansson, Richard}, year = {2021}, publisher = {Linköping University Electronic Press}, address = {Linköping, Sweden}, ISBN = {978-91-7929-031-3}, } @incollection{lange-ljunglof-2021-learning-305146, title = {Learning Domain-Specific Grammars from a Small Number of Examples}, abstract = {In this chapter we investigate the problem of grammar learning from a perspective that diverges from previous approaches. 
These prevailing approaches to learning grammars usually attempt to infer a grammar directly from example corpora without any additional information. This either requires a large training set or suffers from bad accuracy. We instead view learning grammars as a problem of grammar restriction or subgrammar extraction. We start from a large-scale grammar (called a resource grammar) and a small number of example sentences, and find a subgrammar that still covers all the examples. To accomplish this, we formulate the problem as a constraint satisfaction problem, and use a constraint solver to find the optimal grammar. We created experiments with English, Finnish, German, Swedish, and Spanish, which show that 10–20 examples are often sufficient to learn an interesting grammar for a specific application. We also present two extensions to this basic method: we include negative examples and allow rules to be merged. The resulting grammars can more precisely cover specific linguistic phenomena. Our method, together with the extensions, can be used to provide a grammar learning system for specific applications. This system is easy-to-use, human-centric, and can be used by non-syntacticians. 
Based on this grammar learning method, we can build applications for computer-assisted language learning and interlingual communication, which rely heavily on the knowledge of language and domain experts who often lack the competence to develop required grammars themselves.}, booktitle = {Natural Language Processing in Artificial Intelligence—NLPinAI 2020}, author = {Lange, Herbert and Ljunglöf, Peter}, year = {2021}, publisher = {Springer International Publishing}, ISBN = {978-3-030-63787-3}, } @inProceedings{zechner-2021-cross-322911, title = {Cross-Topic Author Identification – a Case Study on Swedish Literature}, booktitle = {The Eighth Swedish Language Technology Conference (SLTC-2020), Selected Contributions, 25–27 November 2020, Gothenburg, Sweden, Online}, author = {Zechner, Niklas}, year = {2021}, publisher = {Linköping University Electronic Press}, address = {Linköping }, ISBN = {978-91-7929-614-8}, } @article{borin-2022-that-322872, title = {All that glitters . . . : Interannotator agreement in natural language processing}, abstract = {Evaluation has emerged as a central concern in natural language processing (NLP) over the last few decades. Evaluation is done against a gold standard, a manually linguistically annotated dataset, which is assumed to provide the ground truth against which the accuracy of the NLP system can be assessed automatically. 
In this article, some methodological questions in connection with the creation of gold standard datasets are discussed, in particular (non-)expectations of linguistic expertise in annotators and the interannotator agreement measure standardly but unreflectedly used as a kind of quality index of NLP gold standards.}, journal = {Nordlyd}, author = {Borin, Lars}, year = {2022}, volume = {46}, number = {1}, pages = {19--26}, } @incollection{blensenius-anderssonlilja-2022-search-314515, title = {In search of subjective meaning in Swedish pseudocoordination}, abstract = {This study provides a discussion of the development of subjective meaning associated with the motion-verb pseudocoordination gå och V 'go/walk and V' and the posture-verb pseudocoordination sitta och V 'sit and V', using historical and present-day linguistic data. It is claimed that an interpretation in terms of item-based analogy and entrenchment of frequent meaning clusters is the most plausible analysis for the development of subjective (and pejorative) meaning associated with gå och V. 
The study of sitta och V is preliminary, but the results indicate that the subjective meaning of this construction is less entrenched than that of the gå och V construction and that the subjective overtone of subjectivity may be a result of the combination of the social/cultural meaning of the posture and certain intrinsically pejorative verbs, together with certain locatives.}, booktitle = {Pseudo-Coordination and Multiple Agreement Constructions}, editor = {Giusti, Giuliana and Di Caro, Vincenzo Nicolò and Ross, Daniel}, author = {Blensenius, Kristian and Andersson Lilja, Peter}, year = {2022}, publisher = {John Benjamins}, address = {Amsterdam, Philadelphia}, ISBN = {9789027210883}, pages = {213--229}, } @article{skoldberg-2022-andra-320475, title = {Andra upplagan av Svensk ordbok: förutsättningar och redaktionella val}, abstract = {In the article, the editor-in-chief of the second edition of Svensk ordbok utgiven av Svenska Akademien (SO2, 2021) gives an overall picture of, e.g., the technical conditions, financial framework and agreements with the financier which have guided the work with the edition. Furthermore, examples are provided of some of the lexicographical work initiatives that have taken place prior to the second edition and the motives behind these, as well as the priorities that have been necessary. 
}, journal = {LexicoNordica}, author = {Sköldberg, Emma}, year = {2022}, volume = {29}, pages = {139--152}, } @incollection{saxena-etal-2022-clues-317928, title = {Clues to Kanashi prehistory 1: Loanword adaptation in nouns and adjectives}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Saxena, Anju and Borin, Lars}, author = {Saxena, Anju and Borin, Lars and Comrie, Bernard}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {173--213}, } @inProceedings{ljunglof-smallbone-2022-efficient-325303, title = {Efficient corpus search using unary and binary indexes}, abstract = {We investigate how disk-based inverted indexes can be used for efficient searching in large annotated corpora. We give a formal semantics for simple corpus queries, and show how they can be translated into lookups in unary and binary indexes.}, booktitle = {Swedish Language Technology Conference}, author = {Ljunglöf, Peter and Smallbone, Nicholas}, year = {2022}, } @book{blensenius-2022-valency-318684, title = {Valency and constructions. Perspectives on combining words}, abstract = {This volume contains papers on the theme valency and constructions, including papers from an international workshop on the same topic held at the University of Gothenburg. The aim is to cover many aspects of the broad topic of valency and constructions. Different languages are represented, for example Japanese, Brazilian Portuguese, and Swedish. 
Different perspectives on the topic can be identified: lexicographic, constructionist, event-structure, and frame semantics, to name but a few.}, editor = {Blensenius, Kristian}, year = {2022}, publisher = {Meijerbergs institut för svensk etymologisk forskning, Göteborgs universitet}, address = {Göteborg}, ISBN = {9789198679120}, } @incollection{zechner-2022-other-322912, title = {The other SAT-Solver: Applying lexicons to SweSAT word questions}, booktitle = {Live and Learn – Festschrift in honor of Lars Borin}, editor = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat}, author = {Zechner, Niklas}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-83-7}, pages = {167--169}, } @book{saxena-borin-2022-synchronic-317920, title = {Synchronic and diachronic aspects of Kanashi}, editor = {Saxena, Anju and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, } @incollection{saxena-etal-2022-kanashi-317930, title = {Kanashi and West Himalayish: Genealogy, language contact, prehistoric migrations}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Saxena, Anju and Borin, Lars}, author = {Saxena, Anju and Borin, Lars and Comrie, Bernard}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {237--254}, } @incollection{saxena-etal-2022-linguistic-317924, title = {Linguistic variation: A challenge for describing the phonology of Kanashi}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Saxena, Anju and Borin, Lars}, author = {Saxena, Anju and Sjöberg, Anna and Sagar, Padam and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {131--144}, } @inProceedings{kokkinakis-etal-2022-necessity-321865, title = {The necessity of digital 
health communication in social media to boost COVID-19 vaccine acceptance.}, booktitle = {ICA Post Conference: Digital Health Communication: Issues and Perspectives. University of Burgundy Franche-Comté, Dijon, France.}, author = {Kokkinakis, Dimitrios and Hammarlin, Mia-Marie and Borin, Lars and Miegel, Fredrik}, year = {2022}, } @misc{tahmasebi-etal-2022-proceedings-316661, title = {Proceedings of the 3rd Workshop on Computational Approaches to Historical Language Change, May 26-27, 2022, Dublin, Ireland}, author = {Tahmasebi, Nina and Montariol, Syrielle and Kutuzov, Andrey and Hengchen, Simon and Dubossarsky, Haim and Borin, Lars}, year = {2022}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-955917-42-1}, } @incollection{forsberg-skoldberg-2022-ordvektorer-320472, title = {Ordvektorer i lexikografiskt arbete}, abstract = {We present a preliminary case study on the use of word vectors in lexicographic practice. The study shows the potential of using vector models in the revision of existing dictionary entries as well as creating new entries.}, booktitle = {Live and learn. Festschrift in honor of Lars Borin (eds. Elena Volodina, Dana Dannélls, Aleksandrs Berdicevskis, Markus Forsberg \& Shafqat Virk)}, author = {Forsberg, Markus and Sköldberg, Emma}, year = {2022}, publisher = {Department of Swedish, Multilingualism, Language Technology}, address = {Gothenburg}, ISBN = {978-91-87850-82-0}, pages = {37--41}, } @incollection{blensenius-holmer-2022-avokado-321648, title = {avokado-r/-er/-s/-sar}, abstract = {The article discusses lexicographic perspectives of the Swedish plural with the suffix -s. Traditionally, plural nouns ending in -s, for example avokados ‘avocados’, are considered colloquial speech; the formal way of writing the plural in question is avokador or avokadoer. 
However, since the Swedish Academy grammar included a noun declension indicating plurals with the suffix -s, plural with -s seems to have become more accepted, at least among language planners.}, booktitle = {Live and Learn. Festschrift in honor of Lars Borin (red. Elena Volodina, Dana Dannélls, Aleksandrs Berdicevskis, Markus Forsberg \& Shafqat Virk)}, author = {Blensenius, Kristian and Holmer, Louise}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-83-7}, pages = {13--16}, } @incollection{saxena-etal-2022-linguistic-317923, title = {A linguistic sketch of Kanashi}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju and Borin, Lars and Comrie, Bernard and Sagar, Padam}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {53--127}, } @incollection{skoldberg-2022-phraseological-318544, title = {Phraseological theory, evidence in corpora and lexicographical practice. On collocations in a monolingual dictionary of Swedish}, abstract = {In this paper, I examine the treatment of collocations in a comprehensive monolingual dictionary of Swedish, namely Svensk ordbok utgiven av Svenska Akademien (‘Contemporary Dictionary of the Swedish Academy’). Based on research on phraseology, studies of the mastery of Swedish collocations among L2 students, and metalexicographic approaches to collocations, I discuss the identification, selection, lemmatisation, and microstructural presentation of collocations in the dictionary. I also examine and assess corpora findings and the advanced tools provided by the Språkbanken Text research unit. These corpora and advanced tools play an essential role in lexicological and phraseological research of Swedish and in the work done on the Swedish Academy’s dictionaries.}, booktitle = {Valency and constructions. 
Perspectives on combining words. Ed. by Kristian Blensenius. (Meijerbergs arkiv för svensk ordforskning 46.)}, author = {Sköldberg, Emma}, year = {2022}, publisher = {Meijerbergs institut för svensk etymologisk forskning, Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-986791-2-0}, pages = {155--182}, } @incollection{saxena-borin-2022-introduction-317921, title = {Introduction: Kanashi, its speakers, its linguistic and extralinguistic context}, booktitle = {Synchronic and diachronic aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {3--11}, } @incollection{petersson-etal-2022-vagar-321666, title = {Vägar in i en profession – ämneslärarprogrammet i svenska}, booktitle = {Använd rummet Högskolepedagogiska metoder för aktiva lärsalar / Veronica Alfredsson, Noomi Asker, Christel Backman, Sara Uhnoo (red.).}, author = {Petersson, Stellan and Andréasson, Maia and Malmberg, Anja}, year = {2022}, publisher = {Studentlitteratur}, address = {Lund}, ISBN = {9789144157795}, pages = {303--310}, } @incollection{blensenius-holmer-2022-verbal-318518, title = {How do verbal constructional alternations reflect (sub-)sense distinctions in dictionaries? A case study of a Swedish monolingual dictionary}, booktitle = {Blensenius, Kristian (ed.) Valency and constructions. Perspectives on combining words. 
Meijerbergs arkiv för svensk ordforskning 46}, author = {Blensenius, Kristian and Holmer, Louise}, year = {2022}, address = {Göteborg}, ISBN = {978-91-986791-2-0}, pages = {9--30}, } @incollection{saxena-etal-2022-clues-317929, title = {Clues to Kanashi prehistory 2: Loanword adaptation in verbs}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju and Borin, Lars and Comrie, Bernard}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {215--233}, } @inProceedings{casademontmoner-volodina-2022-swedish-321955, title = {Swedish MuClaGED: A new dataset for Grammatical Error Detection in Swedish}, abstract = {This paper introduces the Swedish MuClaGED dataset, a new dataset specifically built for the task of Multi-Class Grammatical Error Detection (GED). The dataset has been produced as a part of the multilingual Computational SLA shared task initiative. In this paper we elaborate on the generation process and the design choices made to obtain Swedish MuClaGED. 
We also show initial baseline results for the performance on the dataset in a task of Grammatical Error Detection and Classification on the sentence level, which have been obtained through (Bi)LSTM ((Bidirectional) Long-Short Term Memory) methods.}, booktitle = {Proceedings of the 11th Workshop on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL 2022) }, author = {Casademont Moner, Judit and Volodina, Elena}, year = {2022}, publisher = {Linköping University Electronic Press}, address = {Linköping, Sweden}, ISBN = {978-91-7929-459-5}, } @article{landqvist-etal-2022-organisationer-321827, title = {Organisationer, frågor och (an)svar – Institutionellt terminologiskt samarbete mellan Sverige och Finland 1975–1998}, abstract = {This article describes and analyses the contacts between the Swedish and Finnish national terminology organisations between the 1970s and the 1990s. The Swedish Tekniska nomenklaturcentralen TNC (1941–2018; 2000–2018 Terminologicentrum TNC) was Sweden’s national center for special languages and terminology work for more than 75 years. Since its founding in 1941, the TNC was active not only in Sweden, but also in establishing and maintaining international contacts. The Finnish Centre for Technical Terminology TSK (after 2004 Finnish Terminology Centre TSK) was founded in 1974. The research material used is the collection of correspondence in the TNC’s document archive, in particular the section stored with the code “Ufin”, i.e., letters (N=98) documenting written communication between the TNC and the TSK. The article describes the topics covered in the communication, the individuals involved, the objectives and consequences of the communication and the results achieved. In a quantitative analysis, the themes of the letters are categorized in four main categories: publications, communication, information, and language issues. 
In a qualitative analysis, a specific terminological issue in the field of wood technology is analysed by close reading and content analysis. Finally, further studies, which complete the picture of terminological co-operation on institutional level in the Nordic countries, are proposed.}, journal = {Responsible Communication. VAKKI Publications. Eds H. Katajamäki, M. Enell-Nilsson, H. Kauppinen-Räisänen \& H. Limatius}, author = {Landqvist, Hans and Nissilä, Niina and Pilke, Nina and Sjöberg, Sannina}, year = {2022}, volume = {14}, pages = {89--104}, } @book{holmer-2022-neutrala-318517, title = {Neutrala substantiv på -ande i text och ordbok}, author = {Holmer, Louise}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi}, address = {Göteborg}, } @inProceedings{morger-etal-2022-cross-325984, title = {A Cross-lingual Comparison of Human and Model Relative Word Importance}, abstract = {Relative word importance is a key metric for natural language processing. In this work, we compare human and model relative word importance to investigate if pretrained neural language models focus on the same words as humans cross-lingually. We perform an extensive study using several importance metrics (gradient-based saliency and attention-based) in monolingual and multilingual models, including eye-tracking corpora from four languages (German, Dutch, English, and Russian). We find that gradient-based saliency, first-layer attention, and attention flow correlate strongly with human eye-tracking data across all four languages. We further analyze the role of word length and word frequency in determining relative importance and find that it strongly correlates with length and frequency, however, the mechanisms behind these non-linear relations remain elusive. 
We obtain a cross-lingual approximation of the similarity between human and computational language processing and insights into the usability of several importance metrics.}, booktitle = {Proceedings of the 2022 CLASP Conference on (Dis)embodiment, Gothenburg and online 15–16 September 2022 / Simon Dobnik, Julian Grove and Asad Sayeed (eds.)}, author = {Morger, Felix and Brandl, Stephanie and Beinborn, Lisa and Hollenstein, Nora}, year = {2022}, publisher = {Association for Computational Linguistics}, address = {Gothenburg, Sweden}, ISBN = {978-1-955917-67-4}, } @book{holmer-2022-neutrala-330060, title = {Neutrala substantiv på -ande i text och ordbok}, abstract = {Denna utgåva bygger på en doktorsavhandling med samma titel från september 2022. }, author = {Holmer, Louise}, year = {2022}, publisher = {Meijerbergs institut för svensk etymologisk forskning}, address = {Göteborg}, ISBN = {978-91-986791-3-7}, } @incollection{saxena-borin-2022-then-317927, title = {And then there was one: Kanashi numerals from borrowed superdiversity to borrowed uniformity}, booktitle = {Synchronic and Diachronic Aspects of Kanashi}, editor = {Anju Saxena and Lars Borin}, author = {Saxena, Anju and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {9783110703153}, pages = {145--170}, } @incollection{pettersson-borin-2022-swedish-323276, title = {Swedish Diachronic Corpus}, abstract = {The recently compiled Swedish Diachronic Corpus offers access to a total of approximately 16 billion words, covering texts from the 13th century onwards. The corpus contains 14 main genres, with a number of subgenres, compiled from a wide range of sources, including corpus providers and libraries as well as individual researchers and private citizens. All texts in the corpus follow a consistent format, are extensively annotated with metadata, and freely available for download. 
We firmly believe that the existence of a Swedish diachronic corpus among the resources offered by CLARIN will open up avenues to new, interesting research questions within humanities research, and be a valuable resource for large-scale studies of the Swedish language throughout history – studies that have previously been impossible to conduct in a thorough and consistent manner. Thanks to its embedding in the CLARIN context it also carries the potential to enable broad historical studies from a comparative European perspective.}, booktitle = {CLARIN: The infrastructure for language resources}, editor = {Darja Fišer and Andreas Witt}, author = {Pettersson, Eva and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin}, ISBN = {978-3-11-076734-6}, pages = {561--585}, } @article{landqvist-2022-telefoner-316926, title = {Telefoner, tidpunkter och termometrar – i verkligheten och i språket}, journal = {GU Journalen}, author = {Landqvist, Hans}, year = {2022}, volume = {2022}, number = {3}, pages = {54--55}, } @incollection{dannells-etal-2022-beyond-321730, title = {Beyond strings of characters: Resources meet NLP – Again}, abstract = {FrameNet (FN) resources have existed for many languages for over a decade but their adoption in real world applications has been limited. To celebrate the 65 anniversary of Lars Borin, the initiator and leader of Swedish FrameNet, among others, we take a standpoint to motivate why language resources are crucial for moving NLP forward. 
We present our position on (a) the need for language resources to embrace other dimensions of text and language use, and (b) the need for them to relate to other representations through multimodality.}, booktitle = {Live and learn: Festschrift in honor of Lars Borin / Editors: Elena Volodina, Dana Dannélls, Aleksandrs Berdicevskis, Markus Forsberg, Shafqat Virk}, author = {Dannélls, Dana and Torrent, Tiago Timponi and Sigiliano, Natalia Sathler and Dobnik, Simon}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-83-7}, pages = {29--37}, } @inProceedings{ingvarsson-etal-2022-order-323627, title = {The New Order of Criticism. Explorations of Book Reviews Between the Interpretative and Algorithmic}, booktitle = {The 6th Digital Humanities in the Nordic and Baltic Countries Conference (DHNB 2022), Uppsala, Sweden, March 15-18, 2022}, author = {Ingvarsson, Jonas and Brodén, Daniel and Samuelsson, Lina and Wåhlstrand Skärström, Victor and Zechner, Niklas}, year = {2022}, publisher = {CEUR Workshop Proceedings}, } @article{cousse-bouma-2022-semantic-311027, title = {Semantic scope restrictions in complex verb constructions in Dutch}, abstract = {This article addresses the question of how and why verbs combine in complex verb constructions in Dutch. We discuss introspective data reported in reference grammars and add evidence from corpus data to uncover the systematic ways in which Dutch verbs combine. Our analysis shows that verbs expressing meanings such as tense, aspect, modality and evidentiality are organized in a semantic scope hierarchy; that is, some verb meanings systematically have scope over others but not the other way round. 
We argue that this scope hierarchy reflects hierarchies of functional categories, elaborated in both functional and generative frameworks.}, journal = {Linguistics}, author = {Coussé, Evie and Bouma, Gerlof}, year = {2022}, volume = {60}, number = {1}, pages = {123--176}, } @inProceedings{ingvarsson-etal-2022-order-324051, title = {The New Order of Criticism. Explorations of Book Reviews Between the Interpretative and Algorithmic}, abstract = {The New Order of Criticism (2020–2024) is a mixed-methods project combining algorithmic and interpretative approaches to the study of literary criticism. The project expands on a prior study of Swedish book reviews from the years 1906, 1956 and 2006 (‘The Order of Criticism’, Samuelsson 2013), re-examining and re-evaluating the original results through the uses of computational tools, language technology and big data. The aim of the present paper is to discuss early experiences and results from the interdisciplinary approach utilized by the current project, a collaborative process where interpreter and programmer are in dialogue, and where methodologies, and their instantiation in tools, are reflexively discussed from an epistemological point of view. In our analysis we ask: How can insights from working with digital methodologies and tools inform traditional scholarship on literary criticism? How can interpretative approaches and results inform digital methods?}, booktitle = {Digital Humanities in Action: The Sixth Digital Humanities in the Nordic and Baltic Countries conference, Uppsala, Sweden, March 15-18, 2022.}, author = {Ingvarsson, Jonas and Brodén, Daniel and Samuelsson, Lina and Wåhlstrand Skärström, Victor and Zechner, Niklas}, year = {2022}, publisher = {CEUR-WS}, address = {Aachen}, pages = {228--234}, } @inProceedings{heittola-etal-2022-finland-317133, title = {TNC och Finland. Korrespondens inom terminologiområdet 1940–1999}, booktitle = {Svenskan i Finland 19. 
Föredrag vid den nittonde sammankomsten för beskrivningen av svenskan i Finland, Vasa 6–7 maj 2021. Red. Siv Björklund, Bodil Haagensen, Marianne Nordman och Anders Westerlund}, author = {Heittola, Sanna and Landqvist, Hans and Nissilä, Niina and Pilke, Nina}, year = {2022}, publisher = {Åbo Akademi/Svensk-Österbottniska Samfundet}, address = {Vasa}, ISBN = {978-952-69650-4-8}, } @incollection{lindahl-rodven-eide-2022-argumentative-325260, title = {Argumentative Language Resources at Språkbanken Text}, abstract = {Språkbanken Text at the University of Gothenburg is a CLARIN B-centre providing language resources in Swedish, as well as tools to use them, for a wide range of disciplines. In 2017, we began exploring the field of argument mining – the process of automatically identifying and classifying arguments in text – partly aimed at establishing language resources and tools for argument analysis and mining in Swedish.}, booktitle = {CLARIN: The Infrastructure for Language Resources, eds. Darja Fišer \& Andreas Witt}, author = {Lindahl, Anna and Rødven-Eide, Stian}, year = {2022}, publisher = {De Gruyter}, address = {Berlin, Boston}, ISBN = {9783110767346}, pages = {667--690}, } @article{volodina-etal-2022-crowdsourcing-336551, title = {Crowdsourcing ratings for single lexical items: a core vocabulary perspective}, abstract = {In this study, we investigate theoretical and practical issues connected to differentiating between core and peripheral vocabulary at different levels of linguistic proficiency using statistical approaches combined with crowdsourcing. We also investigate whether crowdsourcing second language learners’ rankings can be used for assigning levels to unseen vocabulary. The study is performed on Swedish single-word items. 
The four hypotheses we examine are: (1) there is core vocabulary for each proficiency level, but this is only true until CEFR level B2 (upper-intermediate); (2) core vocabulary shows more systematicity in its behavior and usage, whereas peripheral items have more idiosyncratic behavior; (3) given that we have truly core items (aka anchor items) for each level, we can place any new unseen item in relation to the identified core items by using a series of comparative judgment tasks, this way assigning a “target” level for a previously unseen item; and (4) non-experts will perform on par with experts in a comparative judgment setting. The hypotheses have been largely confirmed: In relation to (1) and (2), our results show that there seems to be some systematicity in core vocabulary for early to mid-levels (A1-B1) while we find less systematicity for higher levels (B2-C1). In relation to (3), we suggest crowdsourcing word rankings using comparative judgment with known anchor words as a method to assign a “target” level to unseen words. With regard to (4), we confirm the previous findings that non-experts, in our case language learners, can be effectively used for the linguistic annotation tasks in a comparative judgment setting.}, journal = {Slovenščina 2.0: Empirical, Applied and Interdisciplinary Research}, author = {Volodina, Elena and Alfter, David and Lindström Tiedemann, Therese}, year = {2022}, volume = {10}, number = {2}, pages = {5--61}, } @inProceedings{lindahl-2022-machines-322689, title = {Do machines dream of artificial agreement?}, abstract = {In this paper the (assumed) inconsistency between F1-scores and annotator agreement measures is discussed. This is exemplified in five corpora from the field of argumentation mining. High agreement is important in most annotation tasks and also often deemed important for an annotated dataset to be useful for machine learning. However, depending on the annotation task, achieving high agreement is not always easy. 
This is especially true in the field of argumentation mining, because argumentation can be complex as well as implicit. There are also many different models of argumentation, which can be seen in the increasing number of argumentation annotated corpora. Many of these reach moderate agreement but are still used in machine learning tasks, reaching high F1-score. In this paper we describe five corpora, in particular how they have been created and used, to see how they have handled disagreement. We find that agreement can be raised post-production, but that more discussion regarding evaluating and calculating agreement is needed. We conclude that standardisation of the models and the evaluation methods could help such discussions.}, booktitle = {Proceedings of the 18th Joint ACL - ISO Workshop on Interoperable Semantic Annotation within LREC2022, June 20, 2022, Marseille, France / Harry Bunt (Editor)}, author = {Lindahl, Anna}, year = {2022}, publisher = {European Language Resources Association}, address = {Marseille}, ISBN = {979-10-95546-81-8}, } @edited_book{volodina-etal-2022-live-320415, title = {Live and Learn- Festschrift in honor of Lars Borin}, abstract = {This Festschrift has been compiled to honor Professor Lars Borin on his 65th anniversary. It consists of 30 articles which reflect a fraction of Lars’ scholarly interests within computational linguistics and related fields. They come from his friends and colleagues around the world and deal with topics that have been – in one way or another – inspired by his work. 
A common theme for the articles is the never-ending need to learn, which is alluded to in the title of the volume, Live and Learn.}, editor = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-83-7}, } @incollection{alimohammed-etal-2022-annotation-321989, title = {Annotation Management Tool: A Requirement for Corpus Construction}, abstract = {We present an annotation management tool, SweLL portal, that has been developed for the purposes of the SweLL infrastructure project for building a learner corpus of Swedish (Volodina et al., 2019). The SweLL portal has been used for supervised access to the database, data versioning, import and export of data and metadata, statistical overview, administration of annotation tasks, monitoring of annotation tasks and reliability controls. The development of the portal was driven by visions of longitudinal sustainable data storage and was partially shaped by situational needs reported by portal users, including project managers, researchers, and annotators.}, booktitle = {Selected Papers from the CLARIN Annual Conference 2021, Virtual Event, 2021, 27–29 September / Monica Monachini and Maria Eskevich (eds.)}, author = {Ali Mohammed, Yousuf and Matsson, Arild and Volodina, Elena}, year = {2022}, publisher = {Linköping Electronic Conference }, address = {Linköping, Sweden}, ISBN = {978-91-7929-444-1}, pages = {101--108}, } @incollection{fridlund-etal-2022-trawling-319822, title = {Trawling and Trolling for Terrorists in the Digital Gulf of Bothnia: Cross-lingual Text Mining for the Emergence of Terrorism in Swedish and Finnish Newspapers, 1780–1926}, abstract = {In pursuing the historical emergence of the discourse on terrorism, this study trawls the “digital Gulf of Bothnia” in the form of a corpus of combined Swedish and 
Finnish digitized newspaper texts. Through a cross-lingual exploration of the uses of the concept of terrorism in historical Swedish and Finnish news, we examine meanings anchored in the two culturally close but still decidedly different national political contexts. The study is an outcome of an integrative interdisciplinary effort by Swe-Clarin, using resources accessible through the CLARIN infrastructure to enrich scholarship in the humanities. The capabilities of the corpus tool Korp enable us to affirm prior research on the conceptual history of terrorism, but also to suggest a complex and diverse picture of the connotations of terrorism, both as state and sub-state violence up until the 20th century. At the same time, the study allows us to explore the potentials of cross-lingual text mining for historical analysis of national online newspaper corpora provided by Swe-Clarin and FIN-CLARIN.}, booktitle = {CLARIN: The Infrastructure for Language Resources, eds. Darja Fišer \& Andreas Witt}, author = {Fridlund, Mats and Brodén, Daniel and Jauhiainen, Tommi and Malkki, Leena and Olsson, Leif-Jöran and Borin, Lars}, year = {2022}, publisher = {De Gruyter Mouton}, address = {Berlin, Boston}, ISBN = {9783110767346}, pages = {781--802}, } @inProceedings{landqvist-2022-termer-316458, title = {Termer och begrepp – både nytta och nöje! Rapport från ett pågående forskningsprojekt}, abstract = {Termer, begrepp och begreppsdefinitioner är centrala resurser när experter inom fackområden kommunicerar med varandra, experter kommunicerar med icke-experter och icke-experter kommunicerar med andra icke-experter. Exempel på dessa tre situationer, hämtade från juridikens fackområde, är kommunikation mellan jurister i en tingsrätt, kommunikation mellan jurister och icke-juridiskt insatta i en tingsrätt och kommunikation mellan en privatperson som ingår ett köpeavtal med en annan privatperson. 
Mellan 1941 och 2018 fungerade Tekniska Nomenklaturcentralen TNC (2000–2018: Terminologicentrum TNC) som nationellt centrum för terminologiskt arbete i Sverige. TNC verkade både i den offentliga sektorn och den privata. Efter nedläggningen av TNC vid årsskiftet 2018/2019 har Institutet för språk och folkminnen – Isof börjat bygga upp en verksamhet inriktad på fackspråk och terminologi, med fokus på den offentliga sektorn. Isof har också i uppdrag att förvalta och utveckla termdatabasen Rikstermbanken, som initierades och utvecklades av TNC, samt förvalta TNC:s bibliotek. Vid sidan av Isof finns också andra aktörer som arbetar med terminologi i Sverige, bl.a. företag som fordonstillverkaren Scania, och den ideella föreningen Terminologifrämjandet. Mot denna bakgrund arbetar forskare vid Göteborgs universitet och Vasa universitet i Finland sedan 2016 inom projektet Termer i tid – Tidens termer. Terminologi som språklig infrastruktur då, nu och sedan. Den övergripande målsättningen för projektet är att detta ska kartlägga hur ett antal centrala institutionella aktörer agerar och interagerar i terminologiska frågor i Sverige från 1940-talet och framåt samt klarlägga vilka möjligheter och utmaningar som det finns för terminologiskt arbete som en språklig infrastruktur i dagens och framtidens Sverige. Under seminariet presenteras projektet, ett antal resultat från arbetet inom projektet och några tankar om fortsättningen av projektet. 
}, booktitle = {Högre seminariet för språk och svenska, Linnéuniversitetet, 24 maj 2022}, author = {Landqvist, Hans}, year = {2022}, } @inProceedings{edlund-etal-2022-multimodal-311480, title = {A Multimodal Digital Humanities Study of Terrorism in Swedish Politics: An Interdisciplinary Mixed Methods Project on the Configuration of Terrorism in Parliamentary Debates, Legislation, and Policy Networks 1968–2018}, abstract = {This paper presents the design of one of Sweden’s largest digital humanities projects, SweTerror, that through an interdisciplinary multi-modal methodological approach develops an extensive speech-to-text digital HSS resource. SweTerror makes a major contribution to the study of terrorism in Sweden through a comprehensive mixed methods study of the political discourse on terrorism since the late 1960s. Drawing on artificial intelligence in the form of state-of-the-art language and speech technology, it systematically analyses all forms of relevant parliamentary utterances. It explores and curates an exhaustive but understudied multi-modal collection of primary sources of central relevance to Swedish democracy: the audio recordings of the Swedish Parliament’s debates. The project studies the framing of terrorism both as policy discourse and enacted politics, examining semantic and emotive components of the parliamentary discourse on terrorism as well as major actors and social networks involved. It covers political responses to a range of terrorism-related issues as well as factors influencing policy-makers’ engagement, including political affiliations and gender. SweTerror also develops an online research portal, featuring the complete research material and searchable audio made readily accessible for further exploration. 
Long-term, the project establishes a model for combining extraction technologies (speech recognition and analysis) for audiovisual parliamentary data with text mining and HSS interpretive methods and the portal is designed to serve as a prototype for other similar projects.}, booktitle = { Intelligent Systems and Applications. Proceedings of the 2021 Intelligent Systems Conference, September 2–3, 2021 / Arai K. (eds) }, author = {Edlund, Jens and Brodén, Daniel and Fridlund, Mats and Lindhé, Cecilia and Olsson, Leif-Jöran and Ängsal, Magnus Pettersson and Öhberg, Patrik}, year = {2022}, publisher = {Springer}, address = {Cham}, ISBN = {978-3-030-82195-1}, } @inProceedings{landqvist-2022-finlandssvenska-317134, title = {Finlandssvenska översättare i Svenskt översättarlexikon utifrån översättningssociologiska utgångspunkter}, booktitle = {Svenskan i Finland 19. Föredrag vid den nittonde sammankomsten för beskrivningen av svenskan i Finland, Vasa 6–7 maj 2021. Red. Siv Björklund, Bodil Haagensen, Marianne Nordman och Anders Westerlund}, author = {Landqvist, Hans}, year = {2022}, publisher = {Åbo Akademi/Svensk-Österbottniska Samfundet}, address = {Vasa}, ISBN = {978-952-69650-4-8}, } @inProceedings{kokkinakis-hammarlin-2022-negative-321864, title = {Negative vaccine voices in Swedish social media }, abstract = {Vaccinations are one of the most significant interventions to public health, but vaccine hesitancy creates concerns for a portion of the population in many countries, including Sweden. Since discussions on vaccine hesitancy are often taken on social networking sites, data from Swedish social media are used to study and quantify the sentiment among the discussants on the vaccination-or-not topic during phases of the COVID-19 pandemic. 
Out of all the posts analyzed a majority showed a stronger negative sentiment, prevailing throughout the whole of the examined period, with some spikes or jumps due to the occurrence of certain vaccine-related events distinguishable in the results. Sentiment analysis can be a valuable tool to track public opinions regarding the use, efficacy, safety, and importance of vaccination. }, booktitle = {Proceedings of the 13th International Conference of Experimental Linguistics}, author = {Kokkinakis, Dimitrios and Hammarlin, Mia-Marie}, year = {2022}, } @inProceedings{angsal-etal-2022-linguistic-318676, title = {Linguistic Framing of Political Terror: Distant and Close Readings of the Discourse on Terrorism in the Swedish Parliament 1993–2018}, abstract = {This paper provides a study of the discourse on terrorism in Swedish parliamentary debate 1993–2018. The aim is to explore how terrorism is discursively constructed in parliamentary deliberations, drawing on the resources of Swe-Clarin in the form of the corpus tool Korp and the linguistic concept of ‘frame’. To map meanings attached to terrorism we pursue two research questions: what framing elements are connected to ‘terrorism’ and ‘terrorist’ in parliamentary speeches as 1) simplexes and 2) as part of compounds along the lines of controversies and party affiliations? The latter research question is probed through distant and close readings of the specific compound statsterrorism (‘state terrorism’). Our findings show that terrorism is typically framed as located outside of Sweden and as tied to Islamism, but the question of what countries are associated with state terrorism depends on the political affiliation of the interlocutor. The compound statsterrorism is most prominently used by the left and green parties and then commonly associated with Israel and Turkey. 
We conclude by suggesting that a widened inquiry into compounds, in general as well as diachronically, is likely a productive way of expanding the scope of our research.}, booktitle = {CLARIN Annual Conference Proceedings, 10–12 October 2022, Prague, Czechia. Eds. Tomaž Erjavec \& Maria Eskevich}, author = {Ängsal, Magnus Pettersson and Brodén, Daniel and Fridlund, Mats and Olsson, Leif-Jöran and Öhberg, Patrik}, year = {2022}, address = {Prag}, } @inProceedings{fridlund-etal-2022-codifying-315876, title = {Codifying the Debates of the Riksdag: Towards a Framework for Semi-automatic Annotation of Swedish Parliamentary Discourse}, abstract = {This study provides an exploratory attempt to develop a framework for how to semi-automatically annotate salient topics in Swedish parliamentary debate. The discussion is grounded in the ongoing digital humanities project SweTerror that studies the terrorism discourse in the Riksdag 1968–2018 through a mixed-methods approach. The paper presents our tentative framework through its three main categories: metadata, language data and frame data. While the first two categories are mostly generic and their data could mainly be automatically extracted, the third category is contextual and requires manual interpretation. We discuss the design of the latter through the theoretical concept of ‘framing’ and illustrate the framework’s overall principles through a case study of utterances in the debates 1968–1970 concerning terrorism. We conclude by suggesting that it may be more generally applicable for studies of parliamentary debates in HSS research if further modified for the particular research purposes. }, booktitle = {CEUR Workshop Proceedings. Matti La Mela, Fredrik Norén \& Eero Hyvönen, eds., Proceedings of Digital Parliamentary Data in Action (DiPaDa 2022). 
Workshop Co-located with the 6th Digital Humanities in the Nordic and Baltic Countries Conference (DHNB 2022), Uppsala, Sweden, March 15, 2022.}, author = {Fridlund, Mats and Brodén, Daniel and Olsson, Leif-Jöran and Ängsal, Magnus Pettersson}, year = {2022}, publisher = {CEUR-WS}, address = {Aachen}, pages = {167--175}, } @inProceedings{munozsanchez-etal-2022-first-320225, title = {A First Attempt at Unreliable News Detection in Swedish}, abstract = {Throughout the COVID-19 pandemic, a parallel infodemic has also been going on such that the information has been spreading faster than the virus itself. During this time, every individual needs to access accurate news in order to take corresponding protective measures, regardless of their country of origin or the language they speak, as misinformation can cause significant loss to not only individuals but also society. In this paper we train several machine learning models (ranging from traditional machine learning to deep learning) to try to determine whether news articles come from either a reliable or an unreliable source, using just the body of the article. Moreover, we use a previously introduced corpus of news in Swedish related to the COVID-19 pandemic for the classification task. Given that our dataset is both unbalanced and small, we use subsampling and easy data augmentation (EDA) to try to solve these issues. 
In the end, we realize that, due to the small size of our dataset, using traditional machine learning along with data augmentation yields results that rival those of transformer models such as BERT.}, booktitle = {Proceedings of the Second International Workshop on Resources and Techniques for User Information in Abusive Language Analysis, Marseille, 20-25 June, 2022 / Editors: Johanna Monti, Valerio Basile, Maria Pia Di Buono, Raffaele Manna, Antonio Pascucci, Sara Tonelli}, author = {Muñoz Sánchez, Ricardo and Johansson, Eric and Tayefeh, Shakila and Kad, Shreyash}, year = {2022}, publisher = {European Language Resources Association (ELRA)}, address = {Paris}, ISBN = {979-10-95546-99-3}, pages = {1--7}, } @incollection{volodina-alfter-2022-icall-321984, title = {ICALL: Research versus reality check.}, abstract = {Intelligent Computer-Assisted Language Learning has been one of Lars Borin’s research interests. The work on the Lärka language learning platform has started under his coordination. We see it our mission to make the platform live and prosperous, and through it to stimulate research into Swedish as a second language. Below, we name some weaknesses we have identified in Lärka while working with a course of beginner Swedish and outline our plans for tackling those.}, booktitle = {Live and Learn- Festschrift in honor of Lars Borin}, author = {Volodina, Elena and Alfter, David}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-83-7}, pages = {145--152}, } @inProceedings{lindstromtiedemann-etal-2022-cefr-321899, title = {CEFR-nivåer och svenska flerordsuttryck}, abstract = {När vi lär oss ett nytt språk ska vi inte bara lära oss enstaka ord och hur vi använder dessa, utan vi måste också lära oss vilka ordkombinationer som är ”fasta uttryck” till betydelsen (t.ex. hälsa på någon) eller till formen (t.ex. lättare sagt än gjort) eller båda delarna (t.ex. 
huller om buller). Enligt en del studier kan dessa uttryck utgöra så mycket som 50 % av vokabulären i ett språk som förstaspråk (L1) eller ännu mer (Jackendoff 1997; Erman 2007, 28). Men det är möjligt att de är vanligare i vardagligt språk och talspråk (Prentice & Sköldberg 2013). Flerordsenheter kan vara problematiska för andraspråkstalare (Nesselhauf 2003, 223) till och med på avancerad nivå (jfr Pawley & Syder 1983; Wray & Perkins 2000; Nesselhauf 2003; Prentice 2010). Samtidigt är de en helt nödvändig del av språket (Nesselhauf 2003, 223) och kan utmärka andraspråkstalarna som icke-modersmålstalare (Pawley & Syder 1983; Wray 2002). Flerordsuttryck är alltså en värdefull del av andraspråkskompetensen (se även Paquot 2019) och något som är viktigt att studera hur vi på bästa sätt introducerar för L2-talaren och om de kan kopplas till nivåer i bedömning. I den här studien presenterar vi resultat kring förståelsen av flerordsuttryck i svenska som andraspråk i relation till färdighetsnivåerna enligt Gemensam Europeisk Referensram för Språk (GERS eller CEFR, Common European Framework of Reference) (COE 2001; 2018; Skolverket 2009; Utbildningsstyrelsen 2018) genom crowdsourcing experiment.}, booktitle = {Svenskan i Finland 19 : föredrag vid den nittonde sammankomsten för beskrivningen av svenskan i Finland, Vasa den 6-7 maj 2021 / redigerade av Siv Björklund, Bodil Haagensen, Marianne Nordman och Anders Westerlund}, author = {Lindström Tiedemann, Therese and Alfter, David and Volodina, Elena}, year = {2022}, publisher = {Svensk-Österbottniska Samfundet}, address = {Vasa}, ISBN = {978-952-69650-5-5}, } @article{berdicevskis-semenuks-2022-imperfect-313148, title = {Imperfect language learning reduces morphological overspecification: Experimental evidence}, journal = {PLoS ONE}, author = {Berdicevskis, Aleksandrs and Semenuks, Arturs}, year = {2022}, volume = {17}, number = {1}, pages = {1--26}, } @inProceedings{volodina-etal-2022-swedish-321985, title = {Swedish L2 
profile - a tool for exploring L2 data.}, abstract = {Learner corpus researchers, NLP researchers, as well as Digital Humanities and Social Sciences in general, rely on access to various data sets for empirical analysis, statistical insights, and/or for model building. However, interpretation of data is a non-trivial task and there is a need for data visualization tools. One such attempt is the Swedish L2 profile (SweL2P) – an ongoing project setting up the first digital tool allowing users to explore written Swedish learner language from a linguistic point of view.}, booktitle = {Learner Corpus Research conference, 22-24 September, Padua, Italy}, author = {Volodina, Elena and Lindström Tiedemann, Therese and Ali Mohammed, Yousuf}, year = {2022}, address = {Università degli Studi di Padova, Padua, Italy}, } @inProceedings{klezl-etal-2022-exploring-321958, title = {Exploring Linguistic Acceptability in Swedish Learners’ Language }, abstract = {We present our initial experiments on binary classification of sentences into linguistically correct versus incorrect ones in Swedish using the DaLAJ dataset (Volodina et al., 2021a). The nature of the task is bordering on linguistic acceptability judgments, on the one hand, and on grammatical error detection task, on the other. The experiments include models trained with different input features and on different variations of the training, validation, and test splits. We also analyze the results focusing on different error types and errors made on different proficiency levels. Apart from insights into which features and approaches work well for this task, we present first benchmark results on this dataset. The implementation is based on a bidirectional LSTM network and pre-trained FastText embeddings, BERT embeddings, own word and character embeddings, as well as part-of-speech tags and dependency labels as input features. 
The best model used BERT embeddings and a training and validation set enriched with additional correct sentences. It reached an accuracy of 73% on one of three test sets used in the evaluation. These promising results illustrate that the data and format of DaLAJ make a valuable new resource for research in acceptability judgements in Swedish.}, booktitle = {Proceedings of the 11th Workshop on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL 2022)}, author = {Klezl, Julia and Ali Mohammed, Yousuf and Volodina, Elena}, year = {2022}, publisher = {Linköping University Electronic Press}, address = {Linköping, Sweden}, ISBN = {978-91-7929-459-5}, } @incollection{volodina-etal-2022-lyxig-321974, title = {Lyxig språklig födelsedagspresent from the Swedish Word Family.}, abstract = {Morphology and lexical resources are known to be two of Lars Borin’s biggest research passions. We have, therefore, prepared a short description of a new kind of a lexical resource for Swedish, the Swedish Word Family. The resource is compiled based on learner corpora, and contains lexical items manually analyzed for derivational morphology.}, booktitle = {Live and Learn- Festschrift in honor of Lars Borin}, author = {Volodina, Elena and Ali Mohammed, Yousuf and Lindström Tiedemann, Therese}, year = {2022}, publisher = {Department of Swedish, Multilingualism, Language Technology}, address = {Gothenburg, Sweden}, ISBN = {978-91-87850-83-7}, } @misc{kokkinakis-etal-2022-proceedings-317658, title = {Proceedings of LREC 2022 Workshop: Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments. (RaPID-2012), Saturday 25th of June 2022. 
}, abstract = {RaPID-4 aims to be an interdisciplinary forum for researchers to share information, findings, methods, models and experience on the collection and processing of data produced by people with various forms of mental, cognitive, neuropsychiatric, or neurodegenerative impairments, such as aphasia, dementia, autism, bipolar disorder, Parkinson's disease or schizophrenia. Particularly, the workshop's focus is on creation, processing and application of data resources from individuals at various stages of these impairments and with varying degrees of severity. Creation of resources includes e.g. the annotation, description, analysis and interpretation of linguistic, paralinguistic and extra-linguistic aspects of such data (i.e. spontaneous spoken language, transcripts, eye tracking, wearable and sensor measurements, digital biomarkers, etc.). Processing of such data can be used to identify, extract, correlate, evaluate and disseminate various linguistic or multimodal phenotypes and measurements, which then can be applied to aid diagnosis, monitor the progression or predict individuals at risk. A central aim is to facilitate the study of the relationships among various levels of linguistic, paralinguistic and extra-linguistic observations (e.g., acoustic measures; phonological, syntactic and semantic features; eye tracking, sensors, signs and multimodal signals). Submission of papers are invited in all of the aforementioned areas, particularly emphasizing multidisciplinary aspects of processing such data and the interplay between clinical/nursing/medical sciences, language technology, computational linguistics, natural language processing (NLP) and computer science. The workshop will act as a stimulus for the discussion of several ongoing research questions driving current and future research by bringing together researchers from various research communities. }, author = {Kokkinakis, Dimitrios and Themistocleous, Charalambos K. 
and Lundholm Fors, Kristina and Tsanas, Athanasios and Fraser, Kathleen C.}, year = {2022}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {979-10-95546-77-1}, } @incollection{bouma-adesam-2022-counting-321810, title = {Counting dirty words: The effect of OCR quality on token statistics in historical Swedish corpora}, booktitle = {Live and learn: Festschrift in honor of Lars Borin / Editors: Elena Volodina, Dana Dannélls, Aleksandrs Berdicevskis, Markus Forsberg, Shafqat Virk}, author = {Bouma, Gerlof and Adesam, Yvonne}, year = {2022}, publisher = {University of Gothenburg}, address = {Gothenburg}, ISBN = {978-91-87850-83-7}, pages = {17--24}, } @inProceedings{casademontmoner-volodina-2022-generation-321987, title = {Generation of Synthetic Error Data of Verb Order Errors for Swedish}, abstract = {We report on our work-in-progress to generate a synthetic error dataset for Swedish by replicating errors observed in the authentic error annotated dataset. We analyze a small subset of authentic errors, capture regular patterns based on parts of speech, and design a set of rules to corrupt new data. We explore the approach and identify its capabilities, advantages and limitations as a way to enrich the existing collection of error-annotated data. This work focuses on word order errors, specifically those involving the placement of finite verbs in a sentence.}, booktitle = {NAACL workshop on Innovative Use of NLP for Building Educational Applications, July 15, 2022, Seattle, Washington}, author = {Casademont Moner, Judit and Volodina, Elena}, year = {2022}, publisher = {Association for Computational Linguistics}, address = {Seattle, Washington}, ISBN = {978-1-955917-83-4}, } @misc{alfter-etal-2022-proceedings-321964, title = {Proceedings of the 11th Workshop on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL 2022) }, abstract = {The volume contains articles reviewed and presented at NLP4CALL workshop. 
The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.}, author = {Alfter, David and Volodina, Elena and François, Thomas and Desmet, Piet and Cornillie, Frederik and Jönsson, Arne and Rennes, Evelina}, year = {2022}, publisher = {Linköping Electronic Conference Proceedings }, address = {Linköping, Sweden}, ISBN = {978-91-7929-460-1}, } @techreport{hammarstedt-etal-2022-sparv-318399, title = {Sparv 5 Developer’s Guide}, abstract = {The Sparv Pipeline developed by Språkbanken Text is a text analysis tool run from the command line. This Developer’s Guide describes its general structure and key concepts and serves as an API documentation. Most importantly, it describes how to write plugins for Sparv 5 so that you can add your own functions to the toolkit.}, author = {Hammarstedt, Martin and Schumacher, Anne and Borin, Lars and Forsberg, Markus}, year = {2022}, } @incollection{volodina-etal-2022-reliability-321988, title = {Reliability of Automatic Linguistic Annotation: Native vs Non-native Texts }, abstract = {We present the results of a manual evaluation of the performance of automatic linguistic annotation on three different datasets: (1) texts written by native speakers, (2) essays written by second language (L2) learners of Swedish in the original form and (3) the normalized versions of learner-written essays. The focus of the evaluation is on lemmatization, POS-tagging, word sense disambiguation, multi-word detection and dependency annotation. 
Two annotators manually went through the automatic annotation on a subset of the datasets and marked up all deviations based on their expert judgments and the guidelines provided. We report Inter-Annotator Agreement between the two annotators and accuracy for the linguistic annotation quality for the three datasets, by levels and linguistic features.}, booktitle = {Selected Papers from the CLARIN Annual Conference 2021, Virtual Event, 2021, 27–29 September}, editor = {Monica Monachini and Maria Eskevich}, author = {Volodina, Elena and Alfter, David and Lindström Tiedemann, Therese and Lauriala, Maisa and Piipponen, Daniala}, year = {2022}, publisher = {Linköping Electronic Conference }, address = {Linköping, Sweden}, ISBN = { 978-91-7929-444-1}, pages = {151--167}, } @incollection{berdicevskis-etal-2022-actually-320416, title = {We may actually all die tomorrow... nevertheless: Predicting short-term frequency changes in Swedish neologisms}, abstract = {Predicting the future is difficult, as Lars Borin likes to point out by saying the phrase which is included in the title of this paper. Nevertheless, we attempt to predict short-term changes in the frequency of new Swedish words based on some measures of their linguistic and social dissemination. We show that it is possible to predict the direction of change with a higher-than-baseline accuracy. 
Most interestingly, we show that predictions are much less accurate for those words that denote new phenomena than for those who are new signifiers for already existing phenomena.}, booktitle = {Live and learn: Festschrift in honor of Lars Borin / Editors: Elena Volodina, Dana Dannélls, Aleksandrs Berdicevskis, Markus Forsberg, Shafqat Virk}, author = {Berdicevskis, Aleksandrs and Adesam, Yvonne and Coussé, Evie}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet}, address = {Göteborg}, ISBN = {978-91-87850-83-7}, pages = {5--12}, } @techreport{hammarstedt-etal-2022-sparv-318405, title = {Sparv 5 User Manual}, abstract = {The Sparv Pipeline developed by Språkbanken Text is a text analysis tool run from the command line. This user manual describes how to get Sparv 5 up and running on your own machine, how to configure it and how to use it for annotating your own corpora.}, author = {Hammarstedt, Martin and Schumacher, Anne and Borin, Lars and Forsberg, Markus}, year = {2022}, publisher = {Institutionen för svenska, flerspråkighet och språkteknologi}, address = {Göteborg}, } @misc{dunabeitia-etal-2023-editorial-333441, title = {Editorial: Digital Linguistic Biomarkers: Beyond Paper and Pencil Tests -Volume II }, abstract = {Our first volume laid the foundation for understanding the potential of digital linguistic biomarkers in assessing various cognitive and psychological aspects. In this second volume, we witness a significant advancement in both the scope and depth of research in this area. 
The featured articles in this volume contribute to our understanding of how linguistic biomarkers can transcend traditional paper-and-pencil tests, offering a more nuanced and comprehensive approach to the assessment of cognitive function and psychological well-being. In the first study of the volume [Gonzalez-Recober et al., 2023], the authors employed automated methods to investigate speech production during category and letter fluency tasks, commonly used neuropsychological assessments for evaluating lexical retrieval abilities. Their analysis encompassed a diverse range of linguistic and acoustic features, providing a more comprehensive perspective on these tasks than previous studies. As expected, participants produced more words during the category fluency task than during the letter fluency task. Moreover, several linguistic and acoustic measures displayed distinctions between the two tasks. The automated techniques employed in this study offer a reproducible and scalable approach for analyzing fluency tasks, with potential applications in clinical settings. By implementing these methods, future research endeavors are expected to expand our knowledge of speech feature differences, not only in terms of total scores but also across various speech measures, particularly among clinical populations. In the second article of the volume [Sánchez-Vincitore et al. 2023], the authors present a longitudinal analysis of linguistic biomarkers to detect cognitive decline. Their study underscores the potential of natural language processing techniques in identifying subtle cognitive changes over time. They examined data from over 3,000 participants aged 45 and older to investigate the relationship between age, gender, and language-mediated working memory processes using commercial cognitive tests (in their case, scientific tests developed by CogniFit Inc.). 
The findings revealed that age negatively predicted working memory performance, highlighting the potential of computerized assessments in predicting cognitive functions during aging and the need for further research on gender effects in cognitive aging. This study contributed to the growing body of evidence supporting the utility of linguistic biomarkers in early cognitive assessment. In the third study of our volume [Kim et al. 2023], the focus shifts to postoperative delirium (POD) in elderly patients following spinal surgery. POD has been linked to adverse outcomes in this demographic, prompting researchers to explore potential biomarkers for degenerative cerebral dysfunctions like mild cognitive impairment and dementia. The authors used electroencephalography (EEG) to measure an EEG biomarker reflecting idle cortical states through intrinsic alpha oscillations in the prefrontal regions. Cognitive follow-ups were performed using the Telephone Interview for Cognitive Status™ (TICS). The study observed that among patients diagnosed with POD, neurocognitive disorders could persist for up to 1 year postsurgery. These findings suggest that EEG has the potential to be a novel and valuable tool for identifying elderly surgical patients at a higher risk of developing postoperative delirium, offering opportunities for early intervention and improved patient outcomes. As the fourth article in our volume, the study by [Saccone et al. 2023] delves into the realm of schizophrenia, examining how it affects speech prosody and pragmatic functions. The study conducted corpus-based research, focusing on real-life spontaneous interactions to shed light on the prosodic features of schizophrenia. Notably, the speech patterns of patients revealed distinct characteristics. Their speech was organized into smaller, less structured information chunks, punctuated by frequent silences and extended pauses during turn-taking. 
Fluency was disrupted by retracing phenomena, particularly in complex information structures. Besides, comparing Topic and Comment-prominences between patients and non-pathological individuals revealed a consistent pattern. Patients exhibited higher values for Topic-prominence across all parameters, while the non-pathological group displayed the opposite trend. These findings provide valuable insights into the prosodic and pragmatic aspects of speech in schizophrenia, emphasizing the importance of understanding these linguistic manifestations in the context of the disorder's impact on communication. In closing, the second volume of "Digital Linguistic Biomarkers: Beyond Paper and Pencil Tests" presents a short yet diverse and comprehensive array of research articles that collectively advance the field. These contributions not only underscore the relevance and timeliness of linguistic biomarkers in the digital age but also highlight their potential to revolutionize the way we assess cognitive function, psychological well-being, and aging across diverse populations, extending to pathological and clinical samples.}, author = {Dunabeitia, Jon Andoni and Kokkinakis, Dimitrios and Gagliardi, Gloria}, year = {2023}, volume = {14}, } @inProceedings{kokkinakis-etal-2023-prevalence-324818, title = {The Prevalence of mRNA Related Discussions during the Post-COVID-19 Era}, abstract = {Vaccinations are one of the most significant interventions to public health, but vaccine hesitancy and skepticism are raising serious concerns for a portion of the population in many countries, including Sweden. In this study, we use Swedish social media data and structural topic modeling to automatically identify mRNA-vaccine related discussion themes and gain deeper insights into how people’s refusal or acceptance of the mRNA technology affects vaccine uptake. 
Our point of departure is a scientific study published in February 2022, which seems to once again sparked further suspicion and concern and highlight the necessity to focus on issues about the nature and trustworthiness in vaccine safety. Structural topic modelling is a statistical method that facilitates the study of topic prevalence, temporal topic evolution, and topic correlation automatically. Using such a method, our research goal is to identify the current understanding of the mechanisms on how the public perceives the mRNA vaccine in the light of new experimental findings.}, booktitle = { Caring is Sharing – Exploiting the Value in Data for Health and Innovation / M. Hägglund et al. (eds.) Proceedings of the 33rd Medical Informatics Europe Conference (MIE2023), Gothenburg, Sweden, 22-25 May 2023}, author = {Kokkinakis, Dimitrios and Bruinsma, Sebastianus Cornelis Jacobus and Hammarlin, Mia-Marie}, year = {2023}, publisher = {IOS Press}, ISBN = {978-1-64368-388-1}, } @inProceedings{berdicevskis-erbro-2023-tomato-326355, title = {You say tomato, I say the same: A large-scale study of linguistic accommodation in online communities}, booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa), May 22-24, 2023 Tórshavn, Faroe Islands}, author = {Berdicevskis, Aleksandrs and Erbro, Viktor}, year = {2023}, publisher = {University of Tartu Library}, ISBN = {978-99-1621-999-7}, pages = {415--424}, } @article{cousse-etal-2023-anvands-332468, title = {Hur används de, dem och dom i nutida skriftspråk? En storskalig korpusundersökning av nyheter och sociala medier}, abstract = {This study ties in with a longstanding debate on the Swedish spelling variants de, dem and dom for personal pronouns (third person plural) and definite articles (plural). It charts the usage of de, dem and dom in five large corpora with news and social media texts over the past 25 years. 
The corpora contain more than 1.5 billion tokens, which rules out manual handling of the data. Instead, this study makes use of computational methods (including an AI language model) to automatically identify and classify relevant observations. Analysis of the news corpora shows a relatively stable usage of de, dem and dom over the past 25 years. The forms de and dem are predominantly used according to the norm: de for pronouns in subject position and as a definite article; dem for pronouns in object position. The colloquial form dom is hardly found in news texts. Analysis of the social media corpora shows more variation and change. The colloquial form dom is used in 5–25% of all instances instead of de or dem and has decreased after an initial rise. The forms de and dem are sometimes used in a non-standard way: de occurs in object position in 4–10% of the observations; dem is found in subject position or as a definite article in 1–7% of the cases. Non-standard dem is potentially on the rise with younger writers. 
The corpus analysis also provides details on the usage of de and dem in relative clauses, and on the users’ ratings of posts containing de, dem and dom on the social media platform Reddit.}, journal = {Språk \& Stil}, author = {Coussé, Evie and Adesam, Yvonne and Rekathati, Faton and Berdicevskis, Aleksandrs}, year = {2023}, volume = {NF 33}, pages = {39--70}, } @inProceedings{volodina-etal-2023-dalaj-326817, title = {DaLAJ-GED – a dataset for Grammatical Error Detection tasks on Swedish}, booktitle = {Proceedings of the 12th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2023)}, editor = {David Alfter and Elena Volodina and Thomas François and Arne Jönsson and Evelina Rennes}, author = {Volodina, Elena and Ali Mohammed, Yousuf and Berdicevskis, Aleksandrs and Bouma, Gerlof and Öhman, Joey}, year = {2023}, publisher = {Linköping Electronic Conference Proceedings}, address = {Linköping}, ISBN = {978-91-8075-250-3}, pages = {94--101}, } @misc{cousse-etal-2023-inget-324690, title = {Inget stöd i forskningen för att de/dem slås ut}, author = {Coussé, Evie and Adesam, Yvonne and Berdicevskis, Aleksandrs}, year = {2023}, number = {2023-03-20}, } @misc{themistocleous-etal-2023-assessing-331090, title = {Assessing Language Disorders using Artificial Intelligence: a Paradigm Shift }, abstract = {Speech, language, and communication deficits are present in most neurodegenerative syndromes. They enable the early detection, diagnosis, treatment planning, and monitoring of neurocognitive disease progression as part of traditional neurological assessment. Nevertheless, standard speech and language evaluation is time-consuming and resource-intensive for clinicians. We argue that using machine learning methodologies, natural language processing, and modern artificial intelligence (AI) for Language Assessment is an improvement over conventional manual assessment. 
Using these methodologies, Computational Language Assessment (CLA) accomplishes three goals: (i) provides a neuro-cognitive evaluation of speech, language, and communication in elderly and high-risk individuals for dementia; (ii) facilitates the diagnosis, prognosis, and therapy efficacy in at-risk and language-impaired populations; and (iii) allows easier extensibility to assess patients from a wide range of languages. By employing AI models, CLA may inform neurocognitive theory on the relationship between language symptoms and their neural bases. Finally, it signals a paradigm shift by significantly advancing our ability to optimize the prevention and treatment of elderly individuals with communication disorders, allowing them to age gracefully with social engagement. }, author = {Themistocleous, Charalambos and Tsapkini, Kyrana and Kokkinakis, Dimitrios}, year = {2023}, publisher = {arXiv.org}, } @article{forsgren-etal-2023-text-330978, title = {The use of text-mining software to facilitate screening of literature on centredness in health care.}, abstract = {Research evidence supporting the implementation of centredness in health care is not easily accessible due to the sheer amount of literature available and the diversity in terminology and conceptualisations used. The use of text-mining functions to semi-automate the process of screening and collating citations for a review is a way of tackling the vast amount of research citations available today. There are several programmes that use text-mining functions to facilitate screening and data extraction for systematic reviews. However, the suitability of these programmes for reviews on broad topics of research, as well as the general uptake by researchers, is unclear. 
This commentary has a dual aim, which consists in outlining the challenges of screening literature in fields characterised by vague and overlapping conceptualisations, and to exemplify this by exploratory use of text-mining in the context of a scoping review on centredness in health care.}, journal = {Systematic Reviews}, author = {Forsgren, Emma and Wallström, Sara and Feldthusen, Caroline and Zechner, Niklas and Sawatzky, Richard and Öhlén, Joakim}, year = {2023}, volume = {12}, number = {1}, pages = {73}, } @article{hammarlin-etal-2023-covid-329784, title = {COVID-19 Vaccine Hesitancy: A Mixed Methods Investigation of Matters of Life and Death.}, abstract = {In this article, hesitancy towards COVID-19 vaccinations is investigated as a phenomenon touching upon existential questions. We argue that it encompasses ideas of illness and health, and also of dying and fear of suffering. Building on a specific strand within anti-vaccination studies, we conjecture that vaccine hesitancy is, to some extent, reasonable, and that this scepticism should be studied with compassion. Through a mixed methods approach, vaccine hesitancy, as it is being expressed in a Swedish digital open forum, is investigated and understood as, on the one hand, a perceived need of protecting one’s body from techno-scientific experiments, and thus the risk of becoming a victim of medicine itself. On the other hand, the community members express what we call a tacit belief in modern medicine by demonstrating their own “expert” pandemic knowledge. 
The analysis also shows how the COVID-19 pandemic triggers memories of another pandemic, namely the swine flu in 2009–2010, and what we term a medical crisis that occurred then, due to a vaccine that caused a rare but severe side effect in Sweden and elsewhere.}, journal = {Journal of Digital Social Research (JDSR)}, author = {Hammarlin, Mia-Marie and Kokkinakis, Dimitrios and Borin, Lars}, year = {2023}, volume = {5}, number = {4}, pages = {31--61}, } @misc{ilinykh-etal-2023-proceedings-327035, title = {Proceedings of the Second Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2023), May 22, 2023, Tórshavn, Faroe Islands}, abstract = {The second workshop on resources and representations for under-resourced language and domains was held in Tórshavn, Faroe Islands on May 22nd, 2023. The workshop was conducted in a physical setting, allowing for potential hybrid participation. Continuing with the aim of the first edition in 2020, RESOURCEFUL explored the role of the kind and the quality of resources that are available to us, as well as the challenges and directions for constructing new resources in light of the latest trends in natural language processing. The workshop has provided a forum for discussions between the two communities involved in building data-driven and annotation-driven resources.}, author = {Ilinykh, Nikolai and Morger, Felix and Dannélls, Dana and Dobnik, Simon and Megyesi, Beáta and Nivre, Joakim}, year = {2023}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-959429-73-9}, } @inProceedings{kokkinakis-etal-2023-scaling-326698, title = {Scaling-up the Resources for a Freely Available Swedish VADER (svVADER)}, abstract = {With widespread commercial applications in various domains, sentiment analysis has become a success story for Natural Language Processing (NLP). 
Still, although sentiment analysis has rapidly progressed during the last years, mainly due to the application of modern AI technologies, many approaches apply knowledge-based strategies, such as lexicon-based, to the task. This is particularly true for analyzing short social media content, e.g., tweets. Moreover, lexicon-based sentiment analysis approaches are usually preferred over learning-based methods when training data is unavailable or insufficient. Therefore, our main goal is to scale-up and apply a lexicon-based approach which can be used as a baseline to Swedish sentiment analysis. All scaled-up resources are made available, while the performance of this enhanced tool is evaluated on two short datasets, achieving adequate results. }, booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, author = {Kokkinakis, Dimitrios and Muñoz Sánchez, Ricardo and Hammarlin, Mia-Marie}, year = {2023}, } @inProceedings{masciolini-etal-2023-towards-329384, title = {Towards automatically extracting morphosyntactical error patterns from L1-L2 parallel dependency treebanks}, abstract = {L1-L2 parallel dependency treebanks are UD-annotated corpora of learner sentences paired with correction hypotheses. Automatic morphosyntactical annotation has the potential to remove the need for explicit manual error tagging and improve interoperability, but makes it more challenging to locate grammatical errors in the resulting datasets. We therefore propose a novel method for automatically extracting morphosyntactical error patterns and perform a preliminary bilingual evaluation of its first implementation through a similar example retrieval task. 
The resulting pipeline is also available as a prototype CALL application.}, booktitle = {Proceedings of the 18th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2023), July 13, 2023, Toronto, Canada}, author = {Masciolini, Arianna and Volodina, Elena and Dannélls, Dana}, year = {2023}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-959429-80-7}, } @inProceedings{kokkinakis-etal-2023-investigating-325628, title = {Investigating the Effects of MWE Identification in Structural Topic Modelling }, abstract = {Multiword expressions (MWEs) are common word combinations which exhibit idiosyncrasies in various linguistic levels. For various downstream natural language processing applications and tasks, the identification and discovery of MWEs has been proven to be potentially practical and useful, but still challenging to codify. In this paper we investigate various, relevant to MWE, resources and tools for Swedish, and, within a specific application scenario, we apply structural topic modelling to investigate whether there are any interpretative advantages of identifying MWEs.}, booktitle = {The 19th Workshop on Multiword Expressions (MWE 2023)}, author = {Kokkinakis, Dimitrios and Muñoz Sánchez, Ricardo and Bruinsma, Sebastianus C. J. 
and Hammarlin, Mia-Marie}, year = {2023}, publisher = {Association for Computational Linguistics}, ISBN = {978-1-959429-59-3}, } @inProceedings{bloomstrom-etal-2023-preparing-328710, title = {Preparing a corpus of spoken Xhosa}, booktitle = {Proceedings of the 2023 CLASP Conference on Learning with Small Data (LSD), Gothenburg and online 11–12 September 2023}, author = {Bloom Ström, Eva-Marie and Slater, Onelisa and Zahran, Aron and Berdicevskis, Aleksandrs and Schumacher, Anne}, year = {2023}, publisher = {Association for Computational Linguistics}, address = {Gothenburg, Sweden}, ISBN = {979-8-89176-000-4}, pages = {62--67}, } @incollection{virk-etal-2023-lingfn-337386, title = {LingFN: A Framenet for the Linguistic Domain}, abstract = {Frame semantics is a theory of meaning in natural language, which defines the structure of the lexical semantic resources known as framenets. Both framenets and frame semantics have proved useful for a number of natural language processing (NLP) tasks. However, in this connection framenets have often been criticized for their limited coverage. A proposed reasonable-effort solution to this problem is to develop domain-specific (sublanguage) framenets to complement the corresponding general-language framenets for particular NLP tasks, and in the literature we find such initiatives covering domains such as medicine, soccer, and tourism. In this paper, we report on building a framenet to cover the terms and concepts encountered in descriptive linguistic grammars (written in English) i.e. 
a framenet for the linguistic domain (LingFN) to complement the general-language BFN.}, booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)}, author = {Virk, Shafqat and Klang, Per and Borin, Lars and Saxena, Anju}, year = {2023}, ISBN = {9783031243363}, pages = {367--379}, } @inProceedings{beccaria-etal-2023-extraction-334169, title = {Extraction and Analysis of Acoustic Features from Italian-Speaking Children with Autism Spectrum Disorder }, abstract = {Background: The persistent difficulties in social interaction and communication that characterize Autism Spectrum Disorder can be accessed by investigating the quality of language. Indeed, these deficits involve the presence of anomalies in speech production and understanding, which find an expression at the acoustic and prosodic levels of linguistic analysis. Objectives: The main aim of this work is to propose a speech pipeline for the extraction of Italian speech biomarkers typical of ASD by conducting an acoustic and phonological analysis. Moreover, we will highlight the strengths and difficulties of this kind of investigation introducing new topics for further research. Methods: The poster will present the analysis of a speech corpus of 14 Italian-speaking children with ASD and 14 controls (C). The corpus is demographically balanced (age 6-10, 8;1 ± 1;3. Sex: 3F, 11 M) and homogeneous at the diatopic level (origin: Prato, Pistoia, Florence). First, we extracted the acoustic features by using eGeMAPS (openSMILE; Eyben et al., 2015), specifically ideated for the study of impaired speech. Then, we implemented the Mann-Whitney U-test to select the features with the most statistically significant distance in the production of the two groups. Secondly, we conducted a parallel extraction regarding the pitch (F0 mean and standard deviation). 
We propose this additional analysis because pitch varies according to some demographic traits of the speaker (sex, age, height) and the literature presents opposite trends. For this task, we used Praat to have more flexibility in the manipulation of the extraction. We set the F0 range between 70 and 400 Hz (Patel et al., 2020). Finally, we conducted a comparison between the results of the two methods excluding female participants to verify if the trend of pitch changes when the participants are not mixed. Results: Table 1 shows the features selected between the ones extracted. They are related to prosody, quality of voice, loudness, and spectral distribution. Jitter, shimmer and HNR are usually investigated together to describe the emotional prosody and the quality of voice. The same trend found on our corpus is recorded in previous studies on languages other than Italian (Bone et al. 2015; Kissine & Geelhand 2019). Moreover, spectral flux is usually investigated together with shimmer and jitter to describe speech impairments (Haider et al., 2019). Nevertheless, if we consider the studies related to autistic speech, there are few that describe this feature because of the different methodologies used during the extraction. Finally, the values of pitch extracted by eGeMAPS and Praat show the same trend. It is higher in ASD than in controls, both if we considered the corpus mixed and the one with only the male speakers. However, the pitch does not show a statistically significant difference between the two groups (Table 2). Conclusions: These results, although preliminary, seem to confirm the presence of phonetic alterations of speech associated with the disorder. Further studies could improve the accuracy of the pipeline proposed by doing a qualitative analysis of the results and considering other linguistic and paralinguistic domains (e.g., morphological, pragmatic, and gestural analysis). 
}, booktitle = {The 22nd International Society for Autism Research (INSAR), May 3-4, Stockholm, Sweden}, author = {Beccaria, Federica and Gagliardi, Gloria and Kokkinakis, Dimitrios}, year = {2023}, } @inProceedings{zhou-etal-2023-finer-325541, title = {The Finer They Get: Combining Fine-Tuned Models For Better Semantic Change Detection}, abstract = {In this work we investigate the hypothesis that enriching contextualized models using fine-tuning tasks can improve their capacity to detect lexical semantic change (LSC). We include tasks aimed to capture both low-level linguistic information like part-of-speech tagging, as well as higher level (semantic) information. Through a series of analyses we demonstrate that certain combinations of fine-tuning tasks, like sentiment, syntactic information, and logical inference, bring large improvements to standard LSC models that are based only on standard language modeling. We test on the binary classification and ranking tasks of SemEval-2020 Task 1 and evaluate using both permutation tests and under transfer-learning scenarios.}, booktitle = {24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, author = {Zhou, Wei and Tahmasebi, Nina and Dubossarsky, Haim}, year = {2023}, publisher = {Linköping University Electronic Press}, ISBN = {978-99-1621-999-7}, } @misc{ehret-etal-2023-measuring-326620, title = {Measuring Language Complexity: challenges and opportunities}, author = {Ehret, Katharina and Berdicevskis, Aleksandrs and Bentz, Christian and Blumenthal-Dramé, Alice}, year = {2023}, volume = {9}, number = {s1}, pages = {1--8}, } @inProceedings{volodina-etal-2023-grandma-328176, title = {Grandma Karl is 27 years old – research agenda for pseudonymization of research data}, abstract = {Accessibility of research data is critical for advances in many research fields, but textual data often cannot be shared due to the personal and sensitive information which it contains, e.g. names or political opinions. 
General Data Protection Regulation (GDPR) suggests pseudonymization as a solution to secure open access to research data, but we need to learn more about pseudonymization as an approach before adopting it for manipulation of research data. This paper outlines a research agenda within pseudonymization, namely need of studies into the effects of pseudonymization on unstructured data in relation to e.g. readability and language assessment, as well as the effectiveness of pseudonymization as a way of protecting writer identity, while also exploring different ways of developing context-sensitive algorithms for detection, labelling and replacement of personal information in unstructured data. The recently granted project on pseudonymization ‘Grandma Karl is 27 years old’ addresses exactly those challenges.}, booktitle = {2023 IEEE Ninth International Conference on Big Data Computing Service and Applications (BigDataService), Athens, Greece, 2023}, author = {Volodina, Elena and Dobnik, Simon and Lindström Tiedemann, Therese and Vu, Xuan-Son}, year = {2023}, publisher = {IEEE Computer Society}, address = {Los Alamitos}, ISBN = {979-8-3503-3379-4}, } @inProceedings{kokkinakis-etal-2023-analysis-330230, title = {Analysis of mRNA-vaccine posts on Swedish Twitter data }, abstract = {The aim of this study was to use Swedish social media data to capture public perspectives and sentiments regarding the abovementioned study on possible effect of the novel mRNA vaccines that became massively available to the public during late 2021. 
The intention is to understand the key issues (topics/themes) that have captured public attention in Sweden, as well as the barriers and facilitators to successful or not mRNA vaccines.}, booktitle = {14th International Conference of Experimental Linguistics, 18-20 October 2023, Athens, Greece}, author = {Kokkinakis, Dimitrios and Bruinsma, Bastian and Hammarlin, Mia-Marie}, year = {2023}, } @inProceedings{rodven-eide-etal-2023-unsc-338176, title = {The UNSC-Graph: An Extensible Knowledge Graph for the UNSC Corpus}, abstract = {We introduce the UNSC-Graph, a knowledge graph for a corpus of debates of the United Nations Security Council (UNSC) during the period 1995-2020. The graph combines previously disconnected data sources including from the UNSC Repertoire, the UN Library, Wikidata, and from metadata extracted from the speeches themselves. Beyond existing metadata detailing debates’ topics and participants, we also extended the graph to include all country mentions in a speech, geographical neighbours of countries mentioned, as well as sentiment scores. By linking the graph to Wikidata, we are able to include additional geopolitical information and extract various country name aliases to extend the coverage of country mentions beyond existing NER-based approaches. 
Studying mentions of Ukraine after 2014, we present a use case for the graph as a source for continuous analysis of international politics and geopolitical events discussed in the UNSC.}, booktitle = {Proceedings of the 3rd Workshop on Computational Linguistics for the Political and Social Sciences, September 22, 2023, Ingolstadt, Germany / Christopher Klamm, Gabriella Lapesa, Valentin Gold, Theresa Gessler, Simone Paolo Ponzetto (Editors)}, author = {Rødven-Eide, Stian and Zaczynska, Karolina and Pires, Antonio and Patz, Ronny and Stede, Manfred}, year = {2023}, publisher = {Association for Computational Linguistics}, address = {Ingolstadt, Germany}, pages = {69--74}, } @article{broden-etal-2023-diachrony-330178, title = {The diachrony of the new political terrorism: Neologisms as discursive framing in Swedish parliamentary data 1971–2018}, abstract = {This paper begins to unpack the framing of terrorism in the Swedish Parliament through distant reading and by chronologically extracting neologisms in a comprehensive corpus of transcripts of parliamentary debates. Combining language technology and historical contextualization, we find support for the argument that the term ‘terrorism’ gained much of its modern meaning around 1970. Specifically, our study points to a legislative framing of the issue of terrorism in Swedish parliamentary debate from the early 1970s and onwards. We also find a proliferation in the production of neologisms and compounds after 9/11 2001, reflecting, among other things, the rise of a more distinct counter-terrorism discourse and more ‘specialized’ roles and functions related to terrorism and counter-terrorism activities. 
The paper concludes by emphasizing the analytical benefits of tracing parliamentary discourse through neologisms as an explorative approach to identify significant patterns for further investigation.}, journal = {Digital Humanities in the Nordic and Baltic Countries Publications}, author = {Brodén, Daniel and Olsson, Leif-Jöran and Fridlund, Mats and Ängsal, Magnus Pettersson and Öhberg, Patrik}, year = {2023}, volume = {5}, number = {1}, pages = {79--89}, } @inProceedings{ljunglof-levin-2023-unicodex-331075, title = {UniCoDeX (Universal Construction Dependency Xrammar)}, booktitle = {Dagstuhl Seminar Report. Universals of Linguistic Idiosyncrasy in Multilingual Computational Linguistics, May 7–12, 2023, Dagstuhl, Germany}, author = {Ljunglöf, Peter and Levin, Lori}, year = {2023}, publisher = {Leibniz-Zentrum für Informatik}, address = {Dagstuhl}, } @incollection{bouma-2023-continental-333445, title = {LFG and Continental West-Germanic languages}, booktitle = {Mary Dalrymple (ed.), Handbook of Lexical Functional Grammar}, author = {Bouma, Gerlof}, year = {2023}, publisher = {Language Science Press}, address = {Berlin}, ISBN = {978-3-96110-424-6}, pages = {1407--1468}, } @inProceedings{wilkens-etal-2023-tcfle-337441, title = {TCFLE-8: a Corpus of Learner Written Productions for French as a Foreign Language and its Application to Automated Essay Scoring}, abstract = {Automated Essay Scoring (AES) aims to automatically assess the quality of essays. Automation enables large-scale assessment, improvements in consistency, reliability, and standardization. Those characteristics are of particular relevance in the context of language certification exams. However, a major bottleneck in the development of AES systems is the availability of corpora, which, unfortunately, are scarce, especially for languages other than English. 
In this paper, we aim to foster the development of AES for French by providing the TCFLE-8 corpus, a corpus of 6.5k essays collected in the context of the Test de Connaissance du Français (TCF - French Knowledge Test) certification exam. We report the strict quality procedure that led to the scoring of each essay by at least two raters according to the levels of the Common European Framework of Reference for Languages (CEFR) and to the creation of a balanced corpus. In addition, we describe how linguistic properties of the essays relate to the learners' proficiency in TCFLE-8. We also advance the state-of-the-art performance for the AES task in French by experimenting with two strong baselines (i.e., RoBERTa and feature-based). Finally, we discuss the challenges of AES using TCFLE-8.}, booktitle = {EMNLP 2023 - 2023 Conference on Empirical Methods in Natural Language Processing, Proceedings}, author = {Wilkens, Rodrigo and Pintard, Alice and Alfter, David and Folny, Vincent and François, Thomas}, year = {2023}, ISBN = {9798891760608}, } @inProceedings{berdicevskis-etal-2023-superlim-331445, title = {Superlim: A Swedish Language Understanding Evaluation Benchmark}, booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, December 6-10, 2023, Singapore / Houda Bouamor, Juan Pino, Kalika Bali (Editors)}, author = {Berdicevskis, Aleksandrs and Bouma, Gerlof and Kurtz, Robin and Morger, Felix and Öhman, Joey and Adesam, Yvonne and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Isbister, Tim and Lindahl, Anna and Malmsten, Martin and Rekathati, Faton and Sahlgren, Magnus and Volodina, Elena and Börjeson, Love and Hengchen, Simon and Tahmasebi, Nina}, year = {2023}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {979-8-89176-060-8}, pages = {8137--8153}, } @article{ehret-etal-2023-measuring-326113, title = {Measuring language complexity: challenges and opportunities}, 
journal = {Linguistics Vanguard}, author = {Ehret, Katharina and Berdicevskis, Aleksandrs and Bentz, Christian and Blumenthal-Dramé, Alice}, year = {2023}, volume = {9}, pages = {1--8}, } @incollection{borin-etal-2023-language-337444, title = {Language Report Swedish}, abstract = {Swedish speech and language technology (LT) research goes back over 70 years. This has paid off: there is a national research infrastructure, as well as significant research projects, and Swedish is well-endowed with language resources (LRs) and tools. However, there are gaps that need to be filled, especially high-quality goldstandard LRs required by the most recent deep-learning methods. In the future, we would like to see closer collaborations and communication between the “traditional” LT research community and the burgeoning AI field, the establishment of dedicated academic LT training programmes, and national funding for LT research.}, booktitle = {Cognitive Technologies}, author = {Borin, Lars and Domeij, Rickard and Edlund, Jens and Forsberg, Markus}, year = {2023}, pages = {219--222}, } @proceedings{alfter-etal-2023-proceedings-331649, title = {Proceedings of the 12th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2023)}, abstract = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. 
The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.}, editor = {Alfter, David and Volodina, Elena and François, Thomas and Jönsson, Arne and Rennes, Evelina}, year = {2023}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-8075-250-3}, } @incollection{tahmasebi-dubossarsky-2023-computational-325543, title = {Computational modeling of semantic change}, abstract = {In this chapter we provide an overview of computational modeling for semantic change using large and semi-large textual corpora. We aim to provide a key for the interpretation of relevant methods and evaluation techniques, and also provide insights into important aspects of the computational study of semantic change. We discuss the pros and cons of different classes of models with respect to the properties of the data from which one wishes to model semantic change, and which avenues are available to evaluate the results. This chapter is forthcoming as the book has not yet been published. }, booktitle = {Routledge Handbook of Historical Linguistics, 2nd edition}, author = {Tahmasebi, Nina and Dubossarsky, Haim}, year = {2023}, publisher = {Routledge}, } @inProceedings{hammarlin-etal-2023-fearing-327373, title = {Fearing mRNA: A Mixed Methods Study of Vaccine Rumours}, abstract = {The first mass-distributed vaccines based on mRNA technology were launched in 2021 to protect against COVID-19, sparking rumours among vaccine critical individuals that these “new” vaccines might be more dangerous to the health than other, “traditional” vaccines. Drawing on rumour theories and social cognitive perspectives, the aim of this chapter is to account for the purpose and the spreading of medical rumours that encircle mRNA COVID-19 vaccines. We ask: How are rumours concerning mRNA expressed and established? 
In terms of trust and distrust, what function do the rumours have? We take as our empirical case the fast spreading of a medical journal article written by a group of infectious medicine researchers at Lund University, Sweden, that spawned an already established vaccine rumour, and analyse Swedish-language tweets discussing mRNA vaccines posted between February 10, 2022 and November 10, 2022. Our study follows a mixed methods sequential explanatory design consisting of an initial computational distant reading analysis based on structural topic modeling, followed by a close qualitative reading and thematic analysis of the results. Our analysis shows how mRNA rumours are not primarily based on ignorance, but rather on distrust regarding the officially sanctioned, positive narrative of new vaccine technologies, expressed through what we term counter-scientific argumentation.}, booktitle = {NordMedia23: "Technological Takeover? Social and Cultural Implications – Promises and Pitfalls", 16–18 August 2023, Bergen, Norway}, author = {Hammarlin, Mia-Marie and Kokkinakis, Dimitrios and Miegel, Fredrik and Stoencheva, Jullietta}, year = {2023}, address = {Bergen, Norway}, } @misc{tahmasebi-etal-2023-proceedings-331093, title = {Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change, LChange'23, December 6th, 2023, Singapore}, abstract = {Welcome to the 4th International Workshop on Computational Approaches to Historical Language Change (LChange’23) co-located with EMNLP 2023. LChange is held on December 6th, 2023, as a hybrid event with participation possible both virtually and on-site in Singapore. Characterizing the time-varying nature of language will have broad implications and applications in multiple fields including linguistics, artificial intelligence, digital humanities, computational cognitive and social sciences. 
In this workshop, we bring together the world’s pioneers and experts in computational approaches to historical language change with a focus on digital text corpora. In doing so, this workshop carries out the triple goals of disseminating state-of-the-art research on diachronic modeling of language change, fostering cross-disciplinary collaborations, and exploring the fundamental theoretical and methodological challenges in this growing niche of computational linguistic research.}, author = {Tahmasebi, Nina and Montariol, Syrielle and Dubossarsky, Haim and Kutuzov, Andrey and Hengchen, Simon and Alfter, David and Periti, Francesco and Cassotti, Pierluigi}, year = {2023}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {979-8-89176-043-1}, } @inProceedings{volodina-etal-2023-multiged-331652, title = {MultiGED-2023 shared task at NLP4CALL: Multilingual Grammatical Error Detection }, abstract = {This paper reports on the NLP4CALL shared task on Multilingual Grammatical Error Detection (MultiGED-2023), which included five languages: Czech, English, German, Italian and Swedish. It is the first shared task organized by the Computational SLA working group, whose aim is to promote less represented languages in the fields of Grammatical Error Detection and Correction, and other related fields. The MultiGED datasets have been produced based on second language (L2) learner corpora for each particular language. In this paper we introduce the task as a whole, elaborate on the dataset generation process and the design choices made to obtain MultiGED datasets, provide details of the evaluation metrics and CodaLab setup. We further briefly describe the systems used by participants and report the results. 
}, booktitle = {Proceedings of the 12th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2023)}, author = {Volodina, Elena and Bryant, Christopher and Caines, Andrew and De Clercq, Orphée and Frey, Jennifer-Carmen and Ershova, Elizaveta and Rosen, Alexandr and Vinogradova, Olga}, year = {2023}, publisher = {Linköping University Press}, } @inProceedings{ohlsson-etal-2023-going-329710, title = {Going to the market together. A presentation of a mixed methods project}, booktitle = {TwinTalks Workshop at DH2023, 10 July, Graz, Austria}, author = {Ohlsson, Claes and Virk, Shafqat and Tahmasebi, Nina}, year = {2023}, } @misc{forsberg-etal-2023-words-328244, title = {Words unboxed: discovering new words with Kubord}, author = {Forsberg, Markus and Sikora, Justyna and Sköldberg, Emma}, year = {2023}, publisher = {Kungliga biblioteket}, number = {2023-08-29}, address = {Stockholm}, } @inProceedings{matsson-kristrom-2023-building-329957, title = {Building and Serving the Queerlit Thesaurus as Linked Open Data}, abstract = {This paper describes the creation of the Queer Literature Indexing Thesaurus (QLIT) as well as the digital infrastructure supporting the workflow for editing and publishing it. The purpose of QLIT is to adequately catalogue Swedish fiction with LGBTQI themes. It is continually edited in plain-text RDF and automatically processed for correctness and storage. Finally, it is published online as Linked Open Data and used with external systems. The technical approach relies on scripts and applications developed ad hoc, rather than existing solutions. Code is available on https://github.com/gu-gridh/queerlit-terms}, booktitle = {DHNB2023 Conference Proceedings. Sustainability: Environment - Community - Data. The 7th Digital Humanities in the Nordic and Baltic Countries Conference. Oslo – Stavanger – Bergen, Norway. 
March 8–10, 2023}, author = {Matsson, Arild and Kriström, Olov}, year = {2023}, publisher = {Universitetet i Oslo}, address = {Oslo}, pages = {29--39}, } @inProceedings{landqvist-2023-svenskt-330560, title = {Svenskt översättarlexikon för forskningsändamål: några utgångspunkter, resultat och funderingar}, abstract = {Föredrag (inbjuden talare) vid Textseminariet, SOL-centrum, Lunds universitet, 24 november 2023}, booktitle = {Textseminariet, SOL-centrum, Lunds universitet, 24 november 2023}, author = {Landqvist, Hans}, year = {2023}, } @misc{holmer-etal-2023-nordiska-334604, title = {Nordiska studier i lexikografi 16. Rapport från 16:e konferensen om lexikografi i Norden, Lund 27-29 april 2022.}, abstract = {Nordiska studier i lexikografi 16 rapporterar från den 16:e konferensen i lexikografi, som genomfördes i Lund 27–29 april 2022. Volymen innehåller 30 bidrag som bygger på inlägg från konferensen i form av plenarföreläsningar, sektionsföredrag och posterpresentationer. Artiklarna spänner innehållsligt över ett brett fält, men samtliga anlägger någon form av lexikografiskt perspektiv. Flera av dem anknyter till konferensens tema Lexikografiska utmaningar. Merparten av bidragen är författade på danska, norska eller svenska, men ett mindre antal är skrivna på engelska.}, author = {Holmer, Louise and Horn, Greta and Landqvist, Hans and Nilsson, Pär and Nordgren, Eva and Sköldberg, Emma}, year = {2023}, address = {Göteborg}, ISBN = {978-91-986791-5-1}, } @inProceedings{skoldberg-2023-varfor-334607, title = {"Varför står det olika i SAOL och i SO?" Om (bearbetning av) skillnader mellan Svenska Akademiens samtidsordböcker}, abstract = {The Swedish Academy’s contemporary dictionaries, the glossary SAOL and the definition dictionary SO, have many features in common but they also show a lot of differences, especially in terms of content. Many of these differences can be explained with reference to the perspectives, traditions and publication year of the dictionaries. 
However, some differences are difficult to justify. For this reason, the editorial team of SAOL and SO is currently working on 1) identifying and 2) managing differences with respect to the information given in the two lexical resources. In this article, I discuss different types of differences, both motivated and unmotivated, between the dictionaries. The issue of priorities in the editorial work concerning unmotivated differences between SAOL and SO is also addressed.}, booktitle = {Holmer, Louise, Greta Horn, Hans Landqvist, Pär Nilsson, Eva Nordgren \& Emma Sköldberg (red.). Nordiska studier i lexikografi 16. Rapport från 16:e konferensen om lexikografi i Norden. Lund 27–29 april 2022.}, author = {Sköldberg, Emma}, year = {2023}, publisher = {Nordiska föreningen för lexikografi}, address = {Lund \& Göteborg}, ISBN = {978-91-986791-5-1}, pages = {349--361}, } @article{landqvist-2023-gender-327264, title = {Gender equality and/or inequality? Female and male translators in a Swedish digital encyclopaedia of translators}, abstract = {This article presents a study of publicly available Svenskt översättarlexikon ‘The Swedish Encyclopaedia of Translators’ (SwET 2009), most probably the first digital encyclopaedia of translators. The study is situated in the fields of the sociology of translators, (literary) translator studies, and translation history, and focuses on how female translators are described, characterized and evaluated in the version of SwET from 2022. Three research questions are addressed in the paper: (1) What is the ratio of entries presenting female and male translators in the SwET? (2) What is the quantitative treatment of the partners in the sub-category “Translator Couples”? (3) And what are the descriptions, characterizations and evaluations of the partners in that sub-category? In response to the three RQs, the same three situations emerge: (a) gender equality and inequality, (b) gender equality, and (c) gender inequality. 
Possible explanations for the results reported are presented and discussed. Finally, proposals for future studies of digital translator encyclopaedias are presented.}, journal = {Stridon. Journal of Studies in Translation and Interpreting}, author = {Landqvist, Hans}, year = {2023}, volume = {3}, number = {1}, pages = {93--114}, } @inProceedings{landqvist-etal-2023-terminologisamarbete-327015, title = {Terminologisamarbete i Norden. Teman, fokus och resultat från 1940-talet till 1970-talet}, abstract = {År 2026 kommer Nordterm att kunna fira sin femtioårsdag (jfr Bucher 2016b). Men det nordiska terminologiarbetet har en längre historia: ”i alla nordiska länder har [det] funnits ett visst centralt ansvar för terminologiarbete för ländernas huvudspråk ända sedan 1930-talet […]” (Bucher 2016a:74-75). Några aktörer som tagit sådant ansvar är danska Terminologicentralen – TC (1940–1960), norska Rådet for Teknisk Terminologi – RTT (1938–2001) och svenska Tekniska Nomenklaturcentralen/Terminologicentrum – TNC (1941–2018) (Selander 1972; Bucher 2016b; Store norske leksikon 2020; Terminologifrämjandet 2023). Också i Finland bedrevs terminologi(sam)arbete före Nordterms tid, men Centralen för Teknisk Terminologi rf – TSK/Terminologicentralen TSK rf/Terminologicentralen rf inrättades 1974 och bedriver alltjämt verksamhet (Nissilä et al. 2021; Heittola et al. 2022; Terminologicentralen rf 2023). Inom det pågående projektet Termer i tid – Tidens termer kartlägger vi TNC:s arbete för att trygga tillgången på god terminologi inom olika fackområden och bidra till god terminologisk praxis. För detta utnyttjar vi bl.a. det omfattande TNC-arkivet (Heittola et al. 2022; Landqvist et al. 2022). I vår presentation fokuserar vi på samarbetet mellan de tre nationella terminologiorganisationerna TC, RTT och TNC utifrån material i TNC-arkivet. Våra forskningsfrågor är: 1. Vilka teman är aktuella under olika decennier i kontakterna mellan TC, RTT och TNC? 2. 
Vilka särskilda fokusområden finns mellan å ena sidan TC och TNC och å andra sidan RTT och TNC? 3. Vilka resultat ger kontakterna i fråga om utvalda terminologifrågor? Som material fungerar två delmängder av TNC-arkivet: Utlandskorrespondens Danmark – Udan och Utlandskorrespondens Norge – Unor. Dessa dokumenterar kontakter i skrift mellan TNC, TC och RTT. Vi klarlägger kontakterna över tid och utifrån tema/n, försöker identifiera fokusområden samt redovisar och diskuterar hur utvalda terminologifrågor hanteras (jfr Heittola et al. 2022; Landqvist et al. 2022). Studien anlägger således både ett makro- och ett mikroperspektiv på kontakterna mellan de tre institutionerna. Referenser Bucher, A-L. (2016a). Nationella terminologicentraler – i allmännyttans intresse. I: N. Pilke & N. Nissilä (Red.). Tänkta termer. Terminologihänsyn i nordiskt perspektiv. VAKKI Publications 5. Vasa universitet, 72–99. Tillgänglig: https://vakki.net/wp-content/uploads/2020/08/tankta_termer_72-99_bucher.pdf (citerad 12.2.2023). Bucher, A-L. (2016b). Nordterm 40 år. Terminfo 2016:3. Tillgänglig: http://www.terminfo.fi/sisalto/nordterm-40-ar-359.html (citerad 12.2.2023). Heittola, S., Landqvist, H., Nissilä, N. & Pilke, N. (2022). TNC och Finland. Korrespondens inom terminologiområdet 1941–1999. I: S. Björklund, B. Haagensen, M. Nordman & A. Westerlund (Red.). Svenskan i Finland 19. Föredrag vid den nittonde sammankomsten för beskrivningen av svenskan i Finland. Vasa den 6–7 maj 2021. Skrifter utgivna av Svensk-Österbottniska Samfundet 82. Åbo Akademi & Svensk-Österbottniska Samfundet, 88–103. Tillgänglig: https://www.doria.fi/handle/10024/185549 (citerad 12.2.2023). Landqvist, H., Nissilä, N., Pilke, N. & Sjöberg, S. (2022). Organisationer, frågor och (an)svar – Institutionellt terminologiskt samarbete mellan Sverige och Finland 1975–1998. I: H. Katajamäki, M. Enell-Nilsson, H. Kauppinen-Räisänen & H. Limatius (Red.). Responsible Communication. VAKKI Publications 14. Vaasan yliopisto, 89–104. 
Tillgänglig: https://vakki.net/index.php/2022/12/15/responsible-communication/ (citerad 12.2.2023). Nissilä, N., Heittola, S., Pilke, N. & Landqvist, H. (2021). ”Av intresse för saken dristar jag mig att till diskussion framlägga ett par spörsmål” – Kaksi suomalaista akateemista uranuurtajaa terminologiaverkoston kirjeenvaihdossa. I: H. Katajamäki, M. Enell-Nilsson, H. Kauppinen-Räisänen, L. Kääntä & H. Salovaara (Red.). Workplace Communication IV. VAKKI Publications 13. Vaasan yliopisto, 153–168. Tillgänglig: https://vakki.net/index.php/2021/12/21/workplace-communication-iv/ (citerad 12.2.2023). Selander, E. (1972). Terminologisamarbete i Norden – erfarenheter i Sverige. I: A. Hamburger, A. Sudmann & B. Molde (Red.). Språk i Norden 1972. Årsskrift för de nordiska språknämnderna. Skrifter utgivna av Nämnden för svensk språkvård 47. Nämnden för svensk språkvård, 95–102. Tillgänglig: http://www.diva-portal.org/smash/get/diva2:1179421/FULLTEXT01.pdf (citerad 12.2.2023). Store norske leksikon (2020). Tillgänglig: https://snl.no/R%C3%A5det_for_teknisk_terminologi (citerad 12.2.2023). Termer i tid – Tidens termer (2023). Terms in Time – The Terms of the Time. Tillgänglig: https://sites.uwasa.fi/term/ (citerad 12.2.2023). Terminologicentralen rf (2023). Tillgänglig: https://sanastokeskus.fi/tsk/sv/terminologicentralen_rf-29.html (citerad 12.2.2023). Terminologifrämjandet (2023). Från Tekniska nomenklaturcentralen till Terminologicentrum. Tillgänglig: https://terminologiframjandet.se/h552a9FtZ/sveriges-terminologiska-landskap-2019/fran-tekniska-nomenklatur%d1%81entralen-till-terminologicentrum/ (citerad 12.2.2023).}, booktitle = {Nordterm 2023, 14–15 juni 2023, Stockholm}, author = {Landqvist, Hans and Nissilä, Niina and Sjöberg, Sannina}, year = {2023}, } @inProceedings{morger-2023-there-333596, title = {Are There Any Limits to English-Swedish Language Transfer? 
A Fine-grained Analysis Using Natural Language Inference}, abstract = {The developments of deep learning in natural language processing (NLP) in recent years have resulted in an unprecedented amount of computational power and data required to train state-of-the-art NLP models. This makes lower-resource languages, such as Swedish, increasingly more reliant on language transfer effects from English since they do not have enough data to train separate monolingual models. In this study, we investigate whether there is any potential loss in English-Swedish language transfer by evaluating two types of language transfer on the GLUE/SweDiagnostics datasets and comparing between different linguistic phenomena. The results show that for an approach using machine translation for training there is no considerable loss in overall performance nor by any particular linguistic phenomena, while relying on pre-training of a multilingual model results in considerable loss in performance. This raises questions about the role of machine translation and the use of natural language inference (NLI) as well as parallel corpora for measuring English-Swedish language transfer.}, booktitle = {Proceedings of the Second Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2023), May 22, 2023, Torshavn, the Faroe Islands / Editors: Nikolai Ilinykh, Felix Morger, Dana Dannélls, Simon Dobnik, Beáta Megyesi, Joakim Nivre}, author = {Morger, Felix}, year = {2023}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-959429-73-9}, } @incollection{holmer-blensenius-2023-stavning-323528, title = {Stavning och böjning av lånord. De orange blinkrarna}, abstract = {Holmer & Blensenius har bidragit med underlag till kapitlet i fråga. Den slutliga utformningen har gjorts av Språkrådet.}, booktitle = {Maria Bylin \& Björn Melander (red.). Språkrådet rekommenderar. 
Perspektiv, metoder och avvägningar i språkriktighetsfrågor}, author = {Holmer, Louise and Blensenius, Kristian}, year = {2023}, publisher = {Språkrådet, Institutet för språk och folkminnen}, address = {Stockholm}, ISBN = {978-91-86959-90-6}, pages = {93--104}, } @inProceedings{holmer-blensenius-2023-okynniga-334601, title = {Okynniga pluraler. Normering och bruk av s-plural speglat i SAOL och SO}, booktitle = {Holmer, Louise, Greta Horn, Hans Landqvist, Pär Nilsson, Eva Nordgren, Emma Sköldberg (red.), Nordiska studier i Lexikografi (NSL) 16, NSL 17, Rapport från 16:e konferensen om lexikografi i Norden, Lund 27–29 april 2022, Meijerbergs arkiv för svensk ordforskning 48}, author = {Holmer, Louise and Blensenius, Kristian}, year = {2023}, publisher = {Nordiska föreningen för lexikografi, Meijerbergs institut för svensk etymologisk forskning}, address = {Göteborg}, ISBN = {978-91-986791-5-1}, } @article{landqvist-etal-2023-samarbetet-332277, title = {Samarbetet mellan TNC och RTT 1938–1998: terminologiarbete som resurs för meningsfull kommunikation}, abstract = {This article describes and analyses the contacts between the Swedish and Norwegian national terminology organisations between the 1940s and the 1990s. The Swedish Tekniska nomenklaturcentralen TNC (1941–2018; 2000–2018 Terminologicentrum TNC) was Sweden’s national center for special languages and terminology work for more than 75 years. Since its founding in 1941, the TNC was active not only in Sweden, but also in establishing and maintaining international contacts. The Norwegian Rådet for teknisk terminologi RTT was founded in 1938 and closed down in 2001. The research material used is the collection of correspondence in the TNC’s document archive, in particular the section stored with the code “Unor”, i.e., letters (N=374) documenting written communication between the TNC and the RTT. 
The article describes the topics covered in the communication, the individuals involved, the objectives and consequences of the communication and the results achieved. In a quantitative analysis, the themes of the letters are categorised in four main categories: publications, communication, information, and language issues. In a qualitative analysis, a specific terminological issue in the field of welding technology is analysed by close reading and content analysis. Finally, further studies, which complete the picture of terminological cooperation on institutional level in the Nordic countries, are proposed.}, journal = {Communicating with Purpose. VAKKI Publications. Eds. E. Lillqvist, M. Eronen-Valli, V. Manninen, N. Nissilä \& E. Salmela}, author = {Landqvist, Hans and Nissilä, Niina and Pilke, Nina and Sjöberg, Sannina}, year = {2023}, volume = {15}, pages = {232--250}, } @misc{blensenius-holmer-2023-saol-324993, title = {SAOL: Dröjer innan de och dem blir som dom}, author = {Blensenius, Kristian and Holmer, Louise}, year = {2023}, number = {2023-04-04}, } @inProceedings{blensenius-2023-harmonisk-334638, title = {Mot en harmonisk lemma-lexemmodell och ordklassuppsättning}, booktitle = {Holmer, Louise, Greta Horn, Hans Landqvist, Pär Nilsson, Eva Nordgren, Emma Sköldberg (red.), Nordiska studier i Lexikografi (NSL) 16, NSL 17, Rapport från 16:e konferensen om lexikografi i Norden, Lund 27–29 april 2022, Meijerbergs arkiv för svensk ordforskning 48}, author = {Blensenius, Kristian}, year = {2023}, publisher = {Nordiska föreningen för lexikografi, Meijerbergs institut för svensk etymologisk forskning}, address = {Göteborg}, ISBN = {978-91-986791-5-1}, } @inProceedings{masciolini-2023-query-329383, title = {A query engine for L1-L2 parallel dependency treebanks}, abstract = {L1-L2 parallel dependency treebanks are learner corpora with interoperability as their main design goal. 
They consist of sentences produced by learners of a second language (L2) paired with native-like (L1) correction hypotheses. Rather than explicitly labelled for errors, these are annotated following the Universal Dependencies standard. This implies relying on tree queries for error retrieval. Work in this direction is, however, limited. We present a query engine for L1-L2 treebanks and evaluate it on two corpora, one manually validated and one automatically parsed.}, booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa), May 22-24, 2023 Tórshavn, Faroe Islands / Editors: Tanel Alumäe and Mark Fishel}, author = {Masciolini, Arianna}, year = {2023}, publisher = {University of Tartu Library}, address = {Tartu, Estonia}, ISBN = {978-99-1621-999-7}, } @misc{landqvist-2023-allmansprak-324658, title = {Allmänspråk och fackspråk i en ordbok över allmänspråket}, abstract = {Blogginlägg med anledning av det uppmärksammade "snippa-målet" 2023}, author = {Landqvist, Hans}, year = {2023}, number = {2023-03-20}, } @article{landqvist-2023-ordbockers-325762, title = {Ordböckers möjligheter och begränsningar}, journal = {GU Journalen}, author = {Landqvist, Hans}, year = {2023}, number = {2}, pages = {47--48}, } @article{landqvist-2023-kunskapsorganisering-332279, title = {Kunskapsorganisering, sökmöjligheter och läsvägar: en fallstudie av handböcker för hundägare}, abstract = {This paper explores how the anonymous authors of two Swedish handbooks for dog owners, published in 1823 and 1849 respectively, tried to communicate their knowledge about the treatment of dogs so that the readers of the handbooks could, hopefully, apply the information offered in their everyday life. The study clarifies how the authors organize the knowledge that they want their readers to take part of, how they show their readers opportunities to search for the desired knowledge and which reading paths within the handbooks that the readers are offered. 
The two handbooks are regarded as multimodal, dialogical, final and addressive texts, and the study reported is qualitative and comparative; the handbooks are compared with each other, with other studies of dog owner manuals in Swedish, and with studies of other types of practical handbooks in Swedish. The results show great similarities between the two handbooks regarding the investigated variables, but the handbook from 1849 is judged to be more well planned and easier to use for contemporary knowledge-seeking dog owners. Finally, further studies of the handbooks are suggested, including syntactic-focused studies of them, comparisons with contemporary veterinary medical literature and studies of a larger material of dog owners’ handbooks}, journal = {Communicating with Purpose. VAKKI Publications. Eds. E. Lillqvist, M. Eronen-Valli, V. Manninen, N. Nissilä \& E. Salmela}, author = {Landqvist, Hans}, year = {2023}, volume = {15}, pages = {207--231}, } @article{landqvist-2024-finlandssvenska-335636, title = {Finlandssvenska översättare i Svenskt översättarlexikon}, abstract = {Svenskt översättarlexikon innehåller artiklar om sverigesvenska och finlandssvenska översättare. Vilka översättare i lexikonet kan sägas vara finlandssvenskar? Och finns det några finlandssvenska översättare som inte ingår i lexikonet – men som borde göra det?}, journal = {Språkbruk}, author = {Landqvist, Hans}, year = {2024}, volume = {2024}, number = {2024-03-07}, } @inProceedings{belmonte-etal-2024-automatic-336253, title = {Automatic Detection of Rhythmic Features in Pathological Speech of MCI and Dementia Patients}, abstract = {The presence of linguistic alterations represents one of the prodromal signs of cognitive decline associated with dementia. In recent years, a growing body of work has been devoted to the development of algorithms for the automatic linguistic analysis of both oral and written texts, with diagnostic purposes. 
The extraction of Digital Linguistic Biomarkers from patients' verbal productions can indeed provide a rapid, ecological, and cost-effective system for large-scale screening of the pathology. This article contributes to the ongoing research in the field by exploring a traditionally less studied aspect of language in dementia, namely the rhythmic characteristics of speech. In particular, the paper focuses on the automatic detection of rhythmic features in Italian connected speech. A landmark-based system was developed and evaluated to segment the speech flow into vocalic and consonantal intervals and to calculate several rhythmic metrics. Additionally, the reliability of these metrics in identifying MCI and dementia patients was tested.}, booktitle = {RaPID-5: Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments}, author = {Belmonte, Marica and Gagliardi, Gloria and Kokkinakis, Dimitrios and Tamburini, Fabio}, year = {2024}, publisher = {European Language Resources Association (ELRA)}, ISBN = {978-2-493814-11-1}, } @inProceedings{holmer-2024-svenska-342760, title = {Så kan svenska ordböcker användas i undervisning}, abstract = {Svenska ordböcker – finns sådana fortfarande? Och hur skulle de kunna användas mer aktivt i undervisningen? Med utgångspunkt i dessa frågor visas och diskuteras i föredraget hur allmänt tillgängliga ordboksresurser är uppbyggda och hur de kan användas aktivt i undervisningen.}, booktitle = {Ämnets dag 2024, Svenska, 28 oktober 2024, Göteborg}, author = {Holmer, Louise}, year = {2024}, address = {Göteborg}, } @inProceedings{munozsanchez-etal-2024-harnessing-342122, title = {Harnessing GPT to Study Second Language Learner Essays: Can We Use Perplexity to Determine Linguistic Competence?}, abstract = {Generative language models have been used to study a wide variety of phenomena in NLP. 
This allows us to better understand the linguistic capabilities of those models and to better analyse the texts that we are working with. However, these studies have mainly focused on text generated by L1 speakers of English. In this paper we study whether linguistic competence of L2 learners of Swedish (through their performance on essay tasks) correlates with the perplexity of a decoder-only model (GPT-SW3). We run two sets of experiments, doing both quantitative and qualitative analyses for each of them. In the first one, we analyse the perplexities of the essays and compare them with the CEFR level of the essays, both from an essay-wide level and from a token level. In our second experiment, we compare the perplexity of an L2 learner essay with a normalised version of it. We find that the perplexity of essays tends to be lower for higher CEFR levels and that normalised essays have a lower perplexity than the original versions. Moreover, we find that different factors can lead to spikes in perplexity, not all of them being related to L2 learner language.}, booktitle = {Proceedings of the 19th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2024), June 20, 2024, Mexico City, Mexico}, author = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Volodina, Elena}, year = {2024}, publisher = {Association for Computational Linguistics}, address = {Mexico City, Mexico}, ISBN = {979-8-89176-100-1}, } @misc{volodina-etal-2024-proceedings-336386, title = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, Malta}, author = {Volodina, Elena and Alfter, David and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Szawerna, Maria Irena and Vu, Xuan-Son}, year = {2024}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {979-8-89176-085-1}, } @inProceedings{holdt-etal-2024-towards-341134, title = {Towards an 
Ideal Tool for Learner Error Annotation}, abstract = {Annotation and analysis of corrections in learner corpora have always presented technical challenges, mainly on account of the fact that until now there has not been any standard tool available, and that original and corrected versions of texts have been mostly stored together rather than treated as individual texts. In this paper, we present CJVT Svala 1.0, the Slovene version of the SVALA tool, which was originally used for the annotation of Swedish learner language. The localisation into Slovene resulted in the development of several new features in SVALA such as the support for multiple annotation systems, localisation into other languages, and the support for more complex annotation systems. Adopting the parallel aligned approach to text visualisation and annotation, as well as storing the data, combined with the tool supporting this, i.e. SVALA, are proposed as new standards in Learner Corpus Research.}, booktitle = {2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation, LREC-COLING 2024 - Main Conference Proceedings}, author = {Holdt, Špela Arhar and Erjavec, Tomaž and Kosem, Iztok and Volodina, Elena}, year = {2024}, ISBN = {9782493814104}, } @inProceedings{schlechtweg-etal-2024-durel-336715, title = {The DURel Annotation Tool: Human and Computational Measurement of Semantic Proximity, Sense Clusters and Semantic Change}, abstract = {We present the DURel tool implementing the annotation of semantic proximity between word uses into an online, open source interface. The tool supports standardized human annotation as well as computational annotation, building on recent advances with Word-in-Context models. Annotator judgments are clustered with automatic graph clustering techniques and visualized for analysis. This allows to measure word senses with simple and intuitive micro-task judgments between use pairs, requiring minimal preparation efforts. 
The tool offers additional functionalities to compare the agreement between annotators to guarantee the inter-subjectivity of the obtained judgments and to calculate summary statistics over the annotated data giving insights into sense frequency distributions, semantic variation or changes of senses over time.}, booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, March 17-22, 2024, St. Julians, Malta. }, author = {Schlechtweg, Dominik and Virk, Shafqat and Sander, Pauline and Sköldberg, Emma and Theuer Linke, Lukas and Zhang, Tuo and Tahmasebi, Nina and Schulte im Walde, Sabine}, year = {2024}, publisher = {Association for Computational Linguistics}, ISBN = {979-8-89176-091-2}, } @inProceedings{humlesjo-etal-2024-queerlit-334589, title = {Queerlit – a bibliography of Swedish fiction with LGBTQI topics}, abstract = {This paper summarizes the project Queerlit: Metadata and Searchability for LGBTQ+ Literary Heritage 2020-2023 and discusses some challenges in the development of this resource. The Queerlit project consist of four parts: 1. Creating a bibliography of Swedish fiction with LGBTQI themes 2. Creating a Swedish thesaurus (QLIT), adapted from the of the linked open data thesaurus Homosaurus 3. Assigning all material in the bibliography with subject headings from QLIT. 4. A web user interface for searching the material All four parts are integrated with the Swedish union catalog, Libris, making the results of the project available for all under a CC0 license. QLIT is the first external thesaurus integrated in the linked open data framework used in the technical platform of Libris, XL. The bibliography spans from rune stones from the 7th century to recently published fiction. 
When applying subject headings for the material both general aspects of the work and specific LGBTQI topics are described, making this the most comprehensive retrospective indexing project of Swedish literature to date. The underlying knowledge organization is made a prominent method of interacting with the search interface, which is empirically designed around the needs of various user groups.}, booktitle = {Proceedings of the Huminfra Conference, 10-11 January 2024, Gothenburg, Sweden / Editors: Elena Volodina, Gerlof Bouma, Markus Forsberg, Dimitrios Kokkinakis, David Alfter, Mats Fridlund, Christian Horn, Lars Ahrenberg, Anna Blåder}, author = {Humlesjö, Siska and Bergenmar, Jenny and Matsson, Arild}, year = {2024}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @inProceedings{holmer-etal-2024-time-341975, title = {Time to Say Goodbye Revisited – On the Exclusion of Headwords from the Swedish Academy Glossary (SAOL)}, abstract = {In the revision process of dictionaries, adding new headwords or new senses to already existing headwords is what typically receives the most attention. In this article, we bring into focus the intriguing dilemma of exclusion of headwords from the Swedish Academy Glossary (SAOL), which is still published in print versions. In the e-dictionary-era, removing headwords may seem questionable. SAOL is, however, a contemporary dictionary which aims to reflect present-day Swedish. In order to keep the lemma list up to date, new headwords are added and obsolete words are removed. The editors of SAOL have practised lemma exclusion in connection with the revisions of new editions for almost 150 years. In this paper, we present SAOL and argue that lemma exclusion is crucial to SAOL’s aim and target group. We also present our most recent corpus material, methods and tools included in this process.}, booktitle = {Despot, Kristina Štrkalj, Ostroški, Ana \& Ivana, Anić (eds.). 
Lexicography and Semantics, Proceedings of the XXI EURALEX International Congress, 8–12 October 2024, Cavtat, Croatia}, author = {Holmer, Louise and Lillieström, Ann and Sköldberg, Emma and Uppström, Jonatan}, year = {2024}, publisher = {Institut za hrvatski jezik}, address = {Zagreb}, ISBN = {978-953-7967-77-2}, } @inProceedings{holmer-etal-2024-saol-333679, title = {SAOL och svensk språkvetenskaplig infrastruktur – nu och i framtiden}, abstract = {Svenska Akademiens ordlista (SAOL 14, 2015) spelar en viktig roll inom svensk språkvetenskaplig infrastruktur, något som framkommer i denna artikel. Vidare presenteras preliminära resultat av en undersökning av hur frekventa uppslagsorden i SAOL egentligen är i olika delkorpusar med modern allmänspråklig svenska. För att ordlistan även fortsättningsvis ska kunna användas inom svensk ordforskning, vid språkstudier m.m., men också bli mer central inom språkteknologiska sammanhang, är det avgörande att SAOL:s uppslagsord vilar på vetenskaplig grund, moderna språkteknologiska metoder och uppdaterade korpusmaterial. Fokus i artikeln ligger på de uppslagsord som inte finns belagda i korpusmaterialet, och som därmed kan tänkas mönstras ut inför den kommande femtonde upplagan.}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), Gothenburg, 10–11 January 2024 (eds. 
Elena Volodina, Gerlof Bouma, Markus Forsberg, Dimitrios Kokkinakis, David Alfter, Mats Fridlund, Christian Horn, Lars Ahrenberg, Anna Blåder)}, author = {Holmer, Louise and Lillieström, Ann and Sköldberg, Emma and Uppström, Jonatan}, year = {2024}, publisher = {Linköping Electronic Conference Proceedings}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @inProceedings{skoldberg-etal-2024-revealing-341866, title = {Revealing Semantic Variation in Swedish Using Computational Models of Semantic Proximity–Results From Lexicographical Experiments}, abstract = {The paper reports a pilot study on the detection of lexical semantic variation in modern Swedish. The starting point of the study is the meaning descriptions of around 65,000 headwords in ’The Contemporary Dictionary of the Swedish Academy’ (SO, 2021) covering approximately 100,000 different senses. In our work, we aim to explore the potential of the latest computational methods to discover outdated definitions in SO and update them. For this, we make use of the DURel tool (Schlechtweg et al., 2018, 2024) which relies on state-of-the-art language models for the automatic semantic analysis of word usages. The work resulted in drawing lexicographers’ attention to both main senses and subsenses that should be added to the dictionary. It has also demonstrated that certain meaning descriptions in SO are too general and should be split in accordance with the current principles for the semantic descriptions in the dictionary.}, booktitle = {Lexicography and Semantics. Proceedings of the XXI EURALEX International Congress 8–12 October 2024 Cavtat, Croatia (eds. Kristina Š. 
Despot, Ana Ostroški Anić \& Ivana Brač)}, author = {Sköldberg, Emma and Virk, Shafqat and Sander, Pauline and Hengchen, Simon and Schlechtweg, Dominik}, year = {2024}, publisher = {Institut za hrvatski jezik}, ISBN = {978-953-7967-77-2}, } @book{borin-etal-2024-vaccine-341185, title = {Vaccine Hesitancy in the Nordic Countries: Trust and Distrust During the COVID-19 Pandemic}, abstract = {Bringing together studies from across the Nordic region, this book examines the challenges brought by the COVID-19 pandemic, with a particular focus on vaccine hesitancy. Shedding light on the political tensions that emerged as a result of the pandemic and the debates that ensued both within and between the Nordic nations, it investigates the vociferous discussions surrounding the COVID-19 vaccines and their presumed negative side effects through the lens of trust; trust in and between the neighbouring countries, in healthcare systems, fellow citizens, and experts; in public authorities, politicians, researchers, journalists, and pharmaceutical companies. The first volume to explore vaccine hesitancy in the Scandinavian context, this ground-breaking volume offers fresh perspectives on vaccine scepticism not as a form of ignorance or lack of knowledge, but as a manifestation of a more fundamental lack of faith in modern government and science. As such, it will appeal to scholars of sociology, politics, anthropology, media studies, communication and cultural studies with interests in public health, popular and political discourse and questions of public trust.}, author = {Borin, Lars and Hammarlin, Mia Marie and Kokkinakis, Dimitrios and Miegel, Fredrik}, year = {2024}, publisher = {Taylor and Francis}, ISBN = {9781040011614}, } @inProceedings{munozsanchez-etal-2024-names-336384, title = {Did the Names I Used within My Essay Affect My Score? 
Diagnosing Name Biases in Automated Essay Scoring}, abstract = {Automated essay scoring (AES) of second-language learner essays is a high-stakes task as it can affect the job and educational opportunities a student may have access to. Thus, it becomes imperative to make sure that the essays are graded based on the students’ language proficiency as opposed to other reasons, such as personal names used in the text of the essay. Moreover, most of the research data for AES tends to contain personal identifiable information. Because of that, pseudonymization becomes an important tool to make sure that this data can be freely shared. Thus, our systems should not grade students based on which given names were used in the text of the essay, both for fairness and for privacy reasons. In this paper we explore how given names affect the CEFR level classification of essays of second language learners of Swedish. We use essays containing just one personal name and substitute it for names from lists of given names from four different ethnic origins, namely Swedish, Finnish, Anglo-American, and Arabic. 
We find that changing the names within the essays has no apparent effect on the classification task, regardless of whether a feature-based or a transformer-based model is used.}, booktitle = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, Malta }, author = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Szawerna, Maria Irena and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2024}, publisher = {Association for Computational Linguistics}, ISBN = {979-8-89176-085-1}, } @inProceedings{albertin-kokkinakis-2024-defining-342782, title = {Defining Cohesion Features in the Study of Discourse Properties in Cognitive Impairment }, abstract = {The analysis of discourse and pragmatics, which deteriorate alongside other linguistic levels in cognitive decline, can enhance our understanding of dementia-related language patterns and contribute to the improvement of automated diagnostic tools. This study focuses on discourse cohesion, specifically investigating three linguistic phenomena: reference, lexical repetition, and connectives. Six features related to these categories were defined and automatically extracted from an Italian corpus of semi-spontaneous speech, collected from patients with early dementia, MCI subjects, and healthy controls. Some of these features proved significant in distinguishing among the three groups. 
Additional quantitative analysis revealed notable differences in the use of these elements, suggesting a potential link between their degradation and cognitive decline.}, booktitle = {Tenth Swedish Language Technology Conference (SLTC)}, author = {Albertin, Giorgia and Kokkinakis, Dimitrios}, year = {2024}, pages = {4}, } @inProceedings{munozsanchez-etal-2024-jingle-342259, title = { Jingle BERT, Jingle BERT, Frozen All the Way: Freezing Layers to Identify CEFR Levels of Second Language Learners Using BERT}, abstract = {In this paper, we investigate the question of how much domain adaptation is needed for the task of automatic essay assessment by freezing layers in BERT models. We test our methodology on three different graded language corpora (English, French and Swedish) and find that partially fine-tuning base models improves performance over fully fine-tuning base models, although the number of layers to freeze differs by language. We also look at the effect of freezing layers on different grades in the corpora and find that different layers are important for different grade levels. Finally, our results represent a new state-of-the-art in automatic essay classification for the three languages under investigation.}, booktitle = {Proceedings of the 13th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2024) }, author = {Muñoz Sánchez, Ricardo and Alfter, David and Dobnik, Simon and Szawerna, Maria Irena and Volodina, Elena}, year = {2024}, publisher = {Linköping Electronic Conference Proceedings}, ISBN = {978-91-8075-774-4}, } @inProceedings{landqvist-2024-forutsattningar-337141, title = {Förutsättningar, upplägg och utvärderingar. Utmaningar för och möjligheter med två utbildningar i terminologi vid Göteborgs universitet}, booktitle = {Att undervisa i terminologi – utmaningar och möjligheter. 
Konferens 23–24 maj i Stockholm}, author = {Landqvist, Hans}, year = {2024}, } @inProceedings{szawerna-etal-2024-detecting-336385, title = {Detecting Personal Identifiable Information in Swedish Learner Essays}, abstract = {Linguistic data can — and often does — contain PII (Personal Identifiable Information). Both from a legal and ethical standpoint, the sharing of such data is not permissible. According to the GDPR, pseudonymization, i.e. the replacement of sensitive information with surrogates, is an acceptable strategy for privacy preservation. While research has been conducted on the detection and replacement of sensitive data in Swedish medical data using Large Language Models (LLMs), it is unclear whether these models handle PII in less structured and more thematically varied texts equally well. In this paper, we present and discuss the performance of an LLM-based PII-detection system for Swedish learner essays.}, booktitle = {Proceedings of the Workshop on Computational Approaches to Language Data Pseudonymization (CALD-pseudo 2024), March 21, 2024, St. Julian’s, Malta}, author = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Lindström Tiedemann, Therese and Volodina, Elena}, year = {2024}, publisher = {Association for Computational Linguistics}, ISBN = {979-8-89176-085-1}, } @inProceedings{lorenzi-etal-2024-mocca-338189, title = {MoCCA: A Model of Comparative Concepts for Aligning Constructicons}, abstract = {This paper presents MoCCA, a Model of Comparative Concepts for Aligning Constructicons under development by a consortium of research groups building Constructicons of different languages including Brazilian Portuguese, English, German and Swedish. The Constructicons will be aligned by using comparative concepts (CCs) providing language-neutral definitions of linguistic properties. 
The CCs are drawn from typological research on grammatical categories and constructions, and from FrameNet frames, organized in a conceptual network. Language-specific constructions are linked to the CCs in accordance with general principles. MoCCA is organized into files of two types: a largely static CC Database file and multiple Linking files containing relations between constructions in a Constructicon and the CCs. Tools are planned to facilitate visualization of the CC network and linking of constructions to the CCs. All files and guidelines will be versioned, and a mechanism is set up to report cases where a language-specific construction cannot be easily linked to existing CCs.}, booktitle = {Proceedings of the 20th Joint ACL - ISO Workshop on Interoperable Semantic Annotation @LREC-COLING-2024, 20 May, 2024, Torino, Italia}, author = {Lorenzi, Arthur and Ljunglöf, Peter and Lyngfelt, Benjamin and Torrent, Tiago Timponi and Croft, William and Ziem, Alexander and Böbel, Nina and Bäckström, Linnéa and Uhrig, Peter and Matos, Ely}, year = {2024}, publisher = {ELRA}, ISBN = {978-2-493814-32-6}, } @inProceedings{volodina-2024-swell-345597, title = {On two SweLL learner corpora – SweLL-pilot and SweLL-gold}, abstract = {SweLL – Swedish Learner Language – is a unifying term for the infrastructure module for research on Swedish as a Second Language (L2), deployed and maintained as a part of bigger infrastructure of Språkbanken Text at the University of Gothenburg, Sweden. The SweLL infrastructure module consists of a number of learner data collections, and tools for annotation and management of learner data. As a result, many of its components contain the prefix SweLL in their names, which has created some confusion, especially with regards to the two corpora. 
In this article we shortly introduce the various SweLL-components with a special focus on the differences between the two SweLL corpora.}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden}, editor = {Elena Volodina and Gerlof Bouma and Markus Forsberg and Dimitrios Kokkinakis and David Alfter and Mats Fridlund and Christian Horn and Lars Ahrenberg and Anna Blåder}, author = {Volodina, Elena}, year = {2024}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, pages = {83--94}, } @inProceedings{munozsanchez-etal-2024-name-339981, title = {Name Biases in Automated Essay Assessment}, abstract = {Artificial intelligence is being deployed in high-stakes situations, such as automated grading of second language essays in proficiency assessment. While they can improve the opportunities students have (education, work opportunities, etc.), such systems often display human-like biases. Aldrin (2017) notes that human graders have a slight bias based on names appearing in essay texts. We aim to identify whether the same pattern holds in automated systems. In this study we aim to answer the following research questions: 1) Does changing given names inside a second language learner essay affect the way the text is graded? 2) How much does this differ between feature-based machine learning and deep learning? For this, we use a de-anonymized (i.e. original) version of the Swell-pilot corpus of second language Swedish learner essays (Volodina 2016), which consists of 502 essays annotated with CEFR levels as our source data. First, we compile four lists of given names inspired by those of Aldrin (2017): traditional Swedish names; modern Swedish names of Anglo-American origin; Finnish names (due to the close sociocultural links between both countries); and names of Arabic origin (the most prominent group of learners in the corpus). 
Second, we create a diagnostic dataset to identify biases in the classification task. We select SweLL-pilot essays in which a given name appears only once. Then, we generate an essay version for each name on the lists by substituting the name in the original text with one from the list. Third, we fine-tune a BERT (Devlin et al. 2019) model on the original SweLL-pilot data to predict the CEFR level of a given essay and compare it to an existing feature-based model (Pilan 2016). Finally, we test the two models and compare the equality of opportunity between the different given name groups on the diagnostic dataset. }, booktitle = {The 28th International Congress of Onomastic Sciences (ICOS 28), 19-23 August, 2024, Helsinki, Finland}, author = {Muñoz Sánchez, Ricardo and Dobnik, Simon and Lindström Tiedemann, Therese and Szawerna, Maria Irena and Volodina, Elena}, year = {2024}, } @inProceedings{sander-etal-2024-durel-341867, title = {The DURel Annotation Tool}, booktitle = {Book of Abstracts of the Workshop Large Language Models and Lexicography, 8 October 2024 Cavtat, Croatia / Simon Krek (ed.)}, author = {Sander, Pauline and Hengchen, Simon and Zhao, Wei and Ma, Xiaocheng and Sköldberg, Emma and Virk, Shafqat and Schlechtweg, Dominik}, year = {2024}, } @inProceedings{volodina-etal-2024-profiles-345602, title = {Profiles for Swedish as a Second Language: Lexis, Grammar, Morphology}, abstract = {This article gives a short introduction to the Swedish Second Language Profile, a tool that visualizes language in Swedish learner corpora from different angles, such as vocabulary, grammar and morphology. 
The tool is aimed at research on Second Language Acquisition, development of NLP models, teaching of Swedish as a second language, automatic approaches for second language teaching and learning, and at a number of other fields.}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024) 10-11 January, 2024, Gothenburg, Sweden / Editors: Elena Volodina, Gerlof Bouma, Markus Forsberg, Dimitrios Kokkinakis, David Alfter, Mats Fridlund, Christian Horn, Lars Ahrenberg, Anna Blåder}, author = {Volodina, Elena and Alfter, David and Lindström Tiedemann, Therese}, year = {2024}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, pages = {10--19}, } @techreport{morger-2024-when-342179, title = {When Sparv met Superlim. . . A Sparv Plugin for Natural Language Understanding Analysis of Swedish}, abstract = {This technical report introduces Sparv-Superlim, a Sparv plugin for natural language understanding analysis of Swedish. It uses the reference models trained on the Superlim multi-task benchmark to add additional analyses to the Sparv Pipeline. I show how to install and configure the tool as well as apply it to analyze Swedish political manifestos to see if the predictions the plugin does align with known political positions of Swedish parties. 
These use cases shows that the reference models vary in their applicability to predict correct sentiments on novel data and illustrates the importance of integrating reference models trained on a multi-task benchmark like Superlim to evaluate the ecological validity of the benchmark.}, author = {Morger, Felix}, year = {2024}, publisher = {Språkbanken Text}, address = {Göteborg}, } @inProceedings{szawerna-etal-2024-pseudonymization-338089, title = {Pseudonymization Categories across Domain Boundaries}, abstract = {Linguistic data, a component critical not only for research in a variety of fields but also for the development of various Natural Language Processing (NLP) applications, can contain personal information. As a result, its accessibility is limited, both from a legal and an ethical standpoint. One of the solutions is the pseudonymization of the data. Key stages of this process include the identification of sensitive elements and the generation of suitable surrogates in a way that the data is still useful for the intended task. Within this paper, we conduct an analysis of tagsets that have previously been utilized in anonymization and pseudonymization. We also investigate what kinds of Personally Identifiable Information (PII) appear in various domains. These reveal that none of the analyzed tagsets account for all of the PII types present cross-domain at the level of detailedness seemingly required for pseudonymization. We advocate for a universal system of tags for categorizing PIIs leading up to their replacement. 
Such categorization could facilitate the generation of grammatically, semantically, and sociolinguistically appropriate surrogates for the kinds of information that are considered sensitive in a given domain, resulting in a system that would enable dynamic pseudonymization while keeping the texts readable and useful for future research in various fields.}, booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), LREC-COLING, 2024 20-25 May, 2024, Torino, Italia}, author = {Szawerna, Maria Irena and Dobnik, Simon and Lindström Tiedemann, Therese and Muñoz Sánchez, Ricardo and Vu, Xuan-Son and Volodina, Elena}, year = {2024}, publisher = {ELRA and ICCL}, ISBN = {978-2-493814-10-4}, pages = {13303--13314}, } @book{morger-2024-minds-346391, title = {In the minds of stochastic parrots: Benchmarking, evaluating and interpreting large language models}, abstract = {The arrival of large language models (LLMs) in recent years has changed the landscape of natural language processing (NLP). Their impressive performance on popular benchmarks, ability to solve a range of different tasks and their human-like linguistic interactional abilities, have prompted a debate into whether these are just "stochastic parrots" who are cleverly repeating what humans say without understanding its meaning or whether they are acquiring essential language capabilities, which would be an important stepping stone towards artificial general intelligence. To tackle this question, developing analysis methods to measure and understand the language capabilities of LLMs has become a defining challenge. These include developing benchmarks to reliably measure their performance as well and interpretability methods to gauge their inner-workings. This is especially relevant at a time when these models already are having a considerable impact on our society. 
An increasing amount users are affected by the technology and calls are made for transparent, regulated and thorough evaluation of AI. In these efforts, it is important to estimate the possibilities and limitations of these analysis methods since they will play an important role in holding technologies in AI accountable. In this compilation thesis, I expound on the components and processes involved in analyzing LLMs. The articles included in this compilation thesis use different approaches for analyzing LLMs, from introducing a multi-task benchmark Superlim for Swedish NLU to investigating LLMs' ability to predict language variation. To this effort I explore what the possibilities and limitations are of popular analysis methods and what implications these have for developing LLMs. I argue that integrating explanatory approaches from empirical linguistic research is important to understand the role of both the data and the linguistic features used when analyzing LLMs. Doing so does not only help guide the development of LLMs, but also bring insights into linguistics.}, author = {Morger, Felix}, year = {2024}, ISBN = {978-91-8069-944-0}, } @article{blensenius-2024-aligning-343305, title = {Aligning grammatical information in linguistic resources published by the same authority. The case of participles in Swedish Academy dictionaries and grammar}, abstract = {This article discusses strategies involved in aligning word classes (parts of speech), particularly participles, in two dictionaries and one grammar for Swedish, all issued by the same publishing house, which at the same time is a language authority in Sweden. The dictionaries are Svenska Akademiens ordlista (‘The Swedish Academy glossary’), abbreviated as SAOL, and Svensk ordbok utgiven av Svenska Akademien (‘The Contemporary Dictionary of the Swedish Academy’), abbreviated as SO. The grammar in question is Svenska Akademiens grammatik (‘The Swedish Academy grammar’), abbreviated as SAG. 
I will discuss whether it is possible or desirable to harmonize word classes in dictionaries and grammars from the same publisher, in this case from the Swedish Academy.}, journal = {Lexicographica : Internationales Jahrbuch für Lexikographie}, author = {Blensenius, Kristian}, year = {2024}, volume = {40}, number = {1}, pages = {81--94}, } @inProceedings{kokkinakis-etal-2024-analyzing-342781, title = {Analyzing Segregation Discourse in Sweden: Technological Methods and Empirical Data }, abstract = {This paper outlines some of the empirical resources and language technology tools to be used in the project “Language(s) of segregation: Interdisciplinary perspectives on spatial, social, and symbolic division in cities.” The aim of this project is to examine the construction of segregation discourse in Sweden, its implementation as urban policy, and its impact and experience in everyday life. By integrating perspectives from linguistics, public administration, and urban ethnography, this study analyzes various forms of segregation — such as educational and residential — using large corpora to identify patterns and address related disparities. Key resources include political discourse, social media, and press coverage, while language technology tools like word vectors, network and sentiment analysis, and topic modeling will be employed. Preliminary findings provide early insights into the complex dynamics of segregation across different contexts.}, booktitle = {Tenth Swedish Language Technology Conference (SLTC), Linköping, Sweden, 27–29 November 2024.}, author = {Kokkinakis, Dimitrios and Wojahn, Daniel and Järlehed, Johan}, year = {2024}, } @article{lindahl-borin-2024-annotation-333043, title = {Annotation for computational argumentation analysis: Issues and perspectives}, abstract = {Argumentation has long been studied in a number of disciplines, including several branches of linguistics. 
In recent years, computational processing of argumentation has been added to the list, reflecting a general interest from the field of natural language processing (NLP) in building natural language understanding systems for increasingly intricate language phenomena. Computational argumentation analysis – referred to as argumentation mining in the NLP literature – requires large amounts of real-world text with manually analyzed argumentation. This process is known as annotation in the NLP literature and such annotated datasets are used both as “gold standards” for assessing the quality of NLP applications and as training data for the machine learning algorithms underlying most state of the art approaches to NLP. Argumentation annotation turns out to be complex, both because argumentation can be complex in itself and because it does not come across as a unitary phenomenon in the literature. In this survey we review how argumentation has been studied in other fields, how it has been annotated in NLP and what has been achieved so far. We conclude with describing some important current and future issues to be resolved.}, journal = {Language and Linguistics Compass}, author = {Lindahl, Anna and Borin, Lars}, year = {2024}, volume = {18}, number = {1}, } @inProceedings{kokkinakis-hammarlin-2024-cluster-338476, title = {Cluster-Based BERTopic Modeling on Swedish COVID-19 Vaccine Posts}, abstract = {This paper explores the prevalent themes across multiple threads on the popular Swedish discussion forum Flashback. Among its diverse array of topics, the forum actively engages users in addressing and debating questions pertaining to COVID-19 vaccines and vaccination. Through distinguishing between positive and negative perspectives within posts across 14 relevant thread discussions, we employ BERTopic, a modular topic modeling framework, which utilizes pre-trained language models and applies clustering techniques to identify prevailing topics. 
This enables us to conduct a nuanced exploration of overarching themes, offering valuable insights into the multifaceted nature of the discussions regarding COVID-19 vaccines and vaccination in Sweden.}, booktitle = {34th Medical Informatics Europe Conference, MIE 2024, 25–29 August 2024, Athens, Greece / John Mantas, Arie Hasman, George Demiris, Kaija Saranto, Michael Marschollek, Theodoros N. Arvanitis, Ivana Ognjanović, Arriel Benis, Parisis Gallos, Emmanouil Zoulias, Elisavet Andrikopoulou (eds.)}, author = {Kokkinakis, Dimitrios and Hammarlin, Mia-Marie}, year = {2024}, publisher = {IOS Press}, address = {Amsterdam, Washington, DC}, ISBN = {978-1-64368-533-5}, pages = {1906--1910}, } @inProceedings{morger-2024-swediagnostics-341148, title = {SweDiagnostics: A Diagnostics Natural Language Inference Dataset for Swedish}, abstract = {This paper presents SweDiagnostics, a natural language inference dataset for Swedish based on the GLUE Diagnostic dataset. It is the largest, manually corrected NLI dataset in Swedish to date and can be used to evaluate models on NLI in Swedish as well as estimate English-Swedish language transfer capabilities. We present the dataset, the methodology used for translation, compare existing implementations and discuss limitations of the dataset, in particular those related to translationese.}, booktitle = {17th Workshop on Building and Using Comparable Corpora, BUCC 2024 at LREC-COLING 2024 - Proceedings}, author = {Morger, Felix}, year = {2024}, ISBN = {9782493814319}, } @incollection{hammarlin-etal-2024-fearing-336154, title = {Fearing mRNA - A mixed methods study of vaccine rumours }, abstract = {There are well-spread ideas among vaccine-critical individuals around the world that “new” vaccines might be more dangerous to health than other, “traditional” vaccines, which can lead to vaccine hesitancy; the “delay in acceptance or refusal of vaccination despite availability of vaccination services”. 
For example, a recurring remark made in social media is that mRNA technology resembles a chip that alters the human DNA, which might permanently and irreparably damage the immune system. These ideas sometimes take the shape of rumours and conspiracy theories. Drawing on rumour theories and social cognitive perspectives, the aim of this chapter is to account for the purpose and the spreading of medical rumours that encircle mRNA COVID-19 vaccines. Our research questions are: How are rumours concerning mRNA expressed and established? In terms of trust and distrust, what function do the rumours have?}, booktitle = {Vaccine Hesitancy in the Nordic Countries - Trust and Distrust During the COVID-19 Pandemic / edited By Lars Borin, Mia-Marie Hammarlin, Dimitrios Kokkinakis, Fredrik Miegel}, author = {Hammarlin, Mia-Marie and Kokkinakis, Dimitrios and Miegel, Fredrik and Stoencheva, Jullietta}, year = {2024}, publisher = {Routledge - Taylor \& Francis Group}, address = {New York}, ISBN = {978-1-032-30599-8}, pages = {157--184}, } @article{cousse-etal-2024-auxiliaries-343090, title = {Auxiliaries in Old Dutch. A diachronic parallel corpus exploration}, abstract = {This study explores the use of auxiliaries in the oldest text available for Old Dutch, the Wachtendonck Psalter, dating from the 10th century. Our aim is to understand why there are so few different auxiliaries in this text in comparison to other texts in Old Dutch. We tackle this question by taking a historical comparative perspective, using methodological insights and techniques from corpus-based contrastive linguistics and typology. More specifically, we build a diachronic parallel corpus of psalm translations and compare the contexts in which auxiliaries and inflectional alternatives are used in these parallel texts by means of multidimensional scaling. 
Our historical comparative method results in five proximity maps which allow us to explore and compare the inventory of verb constructions of the Wachtendonck Psalter both retrospectively, with its source text in Latin, and prospectively, with later translations in Dutch. Our analysis examines the role of grammaticalization as well as the specific nature of the text as an interlinear translation as possible motivations for the presence and absence of auxiliaries in the Wachtendonck Psalter.}, journal = {Journal of Historical Linguistics}, author = {Coussé, Evie and Bouma, Gerlof and van der Sijs, Nicoline}, year = {2024}, } @inProceedings{szawerna-etal-2024-swedish-346227, title = {Swedish Learner Essays Revisited: Further Insights into Detecting Personal Information}, abstract = {Personally Identifiable Information (PII) is pervasive in linguistic data, making open sharing thereof complicated from both the legal and ethical perspective. Simply redacting out the PIIs or replacing them with pseudonyms presupposes a detection step, where the personal information is identified. In this study, we expand the existing research on PII detection in unstructured data (learner essays) in Swedish, testing more Large Language Models (LLMs) on a larger amount of data. We compare three different LLMs, two Swedish (KB-BERT and AI Sweden’s RoBERTa) and one multilingual (M-BERT). We found that KB-BERT tends to be better than the other models but that there is some overlap in their performance. }, booktitle = {The Tenth Swedish Language Technology Conference (SLTC), 27-29 November, 2024, Linköping, Sweden}, author = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Volodina, Elena}, year = {2024}, } @inProceedings{borin-holmer-2024-tradita-333774, title = {Tradita innovare, innovata tradere. 
The Gothenburg approach to computational lexicography}, abstract = {Swedish computational lexicography has a long history at the University of Gothenburg, both in its primary role as a central aspect of the scientific study of vocabulary and also as an infrastructural component for conducting research based on language data. Starting in the 1960s, the Språkdata research group pioneered corpus-supported lexicography for Swedish, forming the basis for successive editions of the two main descriptive dictionaries of contemporary Swedish, SAOL and SO. Language technological lexical resources for Swedish have been developed by the research unit/research infrastructure Språkbanken Text since the turn of the millennium, most recently in the framework of the Swedish FrameNet++initiative. After two decades of separation, these two largely mutually independently developed strands of computational lexicography have now joined forces under the umbrella of Språkbanken’s lexical research infrastructure to advance the field technically, methodologically, and scientifically.}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden / (Eds. Elena Volodina, Gerlof Bouma, Markus Forsberg, Dimitrios Kokkinakis, David Alfter, Mats Fridlund, Christian Horn, Lars Ahrenberg, Anna Blåder)}, author = {Borin, Lars and Holmer, Louise}, year = {2024}, publisher = {LiU Electronic Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @incollection{borin-etal-2024-introduction-343467, title = {Introduction: Vaccine Hesitancy and the COVID-19 Crisis in the Nordic Countries}, abstract = {Already in 2019, WHO singled out the increase of vaccine hesitancy as one of the ten most important and urgent threats to global health. Little did people know then of the heated vaccine discussions waiting around the corner, spurred by the COVID-19 pandemic that set some countries on something reminiscent of a war footing. 
The mass vaccinations against the coronavirus in the early 2020s were seen by many as a blessing that promised a return to normalcy after lockdowns and other social restrictions. But some citizens actively resisted vaccination, claiming that the vaccines were not safe and questioning the public authorities’ trustworthiness. At the same time, the Nordic region is regarded as a world leader when it comes to societal trust. This tension between the high-trust Nordic societies and the distrust in the COVID-19 vaccines among a minority is in focus in this volume. It also gives insights into the political tensions between these neighbouring nations, and the public discourses taking place in the region during intense phases of the pandemic. The book explores three interrelated research themes: Nordic societal trust under stress; COVID-19 in Nordic public discourses; and the growing chorus on the margin.}, booktitle = {Vaccine Hesitancy in the Nordic Countries: Trust and Distrust during the COVID-19 Pandemic}, author = {Borin, Lars and Hammarlin, Mia Marie and Kokkinakis, Dimitrios and Miegel, Fredrik}, year = {2024}, ISBN = {9781040011614}, pages = {1--17}, } @inProceedings{landqvist-holmer-2024-finlandismer-342756, title = {Finlandismer i SAOL 14: markeringssätt och informationstyper}, abstract = {Svenska Akademiens ordlista (SAOL) är den inofficiella normen för stavning och böjning av orden som ingår i aktuell upplaga av SAOL (Borin & Holmer 2024:42). Den första upplagan publicerades 1874, den fjortonde och senaste upplagan, SAOL14, utgavs 2015 och SAOL15 är planerad att utkomma 2025 (Holmer et al. 2024:68). Alltsedan SAOL11 (1986) ingår det i ordlistan ett antal uppslagsord och ordbetydelser vilka enbart eller främst används i finlandssvenskan och ofta kallas ”finlandismer” (af Hällström-Reijonen 2015:104–111; se t.ex. af Hällström-Reijonen 2015:99–100 om begreppen ’finlandssvenska’ och ’finlandism’). 
Antalet finlandismer i SAOL14 uppges vara mellan 240 och 260 (SAOL14:XVII; af Hällström-Reijonen 2015:105). Vägledande för urvalet har varit frekvens och aktualitet, geografisk spridning i svensktalande områden i Finland samt acceptabilitet. Den sistnämnda principen innebär att ”man inte ska ta med sådana finlandismer i SAOL som språkvårdarna vid Institutet för de inhemska språken avråder från i andra sammanhang […]” (af Hällström-Reijonen 2015:108–109). Urvalet av finlandismer i SAOL14 har alltså gjorts efter angivna principer. Men hur får användare av SAOL14 veta (1) vilka uppslagsord och ordbetydelser som bedöms vara finlandismer? (2) varför orden och betydelserna klassificeras som finlandismer? För att besvara den första forskningsfrågan görs sökningar i Svenska Akademiens lexikala databas (Salex), som utvecklas inom Språkbanken Text, och olika sätt att markera ”finlandism-status” kartläggs (se Holmer et al. 2024:69 om Salex; jfr Svensén 2004:374–397 om markeringssystem i ordböcker). Utgångspunkten för att besvara den andra forskningsfrågan är Bo Svenséns kategorisering av informationstyper som kan användas i ordböcker (Svensén 2004:9–12). Med hänsyn till informationstyper som faktiskt används i SAOL14 kan ”finlandism-status” i ordlistan, i alla fall potentiellt, avse fyra huvudkategorier: (a) formell information (stavning, uttal, morfologi, då både ordböjning och ordbildning), (b) syntagmatisk information (ordklasstillhörighet, konstruktionssätt, kollokationer, idiom), (c) semantisk information (betydelse) och/eller (d) pragmatisk information (förekomst, bruklighet). Referenser Borin, L. & Holmer, L. (2024). Tradita innovare, innovata tradere. The Gothenburg approach to computational lexicography. I: Proceedings of the Huminfra Conference (HiC 2024), s. 41–50. Red. Volodina, E., Bouma, G., Forsberg, M., Kokkinakis, D., Alfter, D., Fridlund, M., Horn, C., Ahrenberg, L. & Blåder, A. Linköping Electronic Conference Proceedings 205. 
Tillgänglig: https://ecp.ep.liu.se/hic Holmer, L., Lillieström, A., Sköldberg, E. & Uppström, J. (2024). SAOL och svensk språkvetenskaplig infrastruktur – nu och i framtiden. I: Proceedings of the Huminfra Conference (HiC 2024), s. 68–75. Red. Volodina, E., Bouma, G., Forsberg, M., Kokkinakis, D., Alfter, D., Fridlund, M., Horn, C., Ahrenberg, L. & Blåder, A. Linköping Electronic Conference Proceedings 205. Tillgänglig: https://ecp.ep.liu.se/hic af Hällström-Reijonen, C. (2015). Finlandssvenska i SAOL och andra ordböcker. LexicoNordica 22, s. 99–115. Tillgänglig: https://tidsskrift.dk/index.php/lexn/issue/archive SAOL14 = Svenska Akademiens ordlista över svenska språket (2015). 14 uppl. Stockholm: Norstedts i distribution. Tillgänglig: https://www.gu.se/svenska-spraket/saol-svenska-akademiens-ordlista Svensén, B. (2004) [1987]. Handbok i lexikografi. Ordböcker och ordboksarbete i teori och praktik. 2 omarbetade och utökade uppl. Stockholm: Norstedts Akademiska Förlag. }, booktitle = {Svenskan i Finland 21, 3–4 oktober 2024, Uleåborg, Finland}, author = {Landqvist, Hans and Holmer, Louise}, year = {2024}, } @inProceedings{petersson-2024-progressive-346545, title = {Progressive aspect in Swedish and English }, booktitle = {Book of Abstract of Chronos 15. 15th International Conference on Actionality, Tense, Aspect, Modality/Evidentiality, 29-31 May 2024 Toulouse, France}, author = {Petersson, Stellan}, year = {2024}, } @article{holmer-2024-derivatives-343310, title = {Derivatives in Swedish dictionaries. The case of deverbal nouns in -ande}, abstract = {This article deals with known challenges as well as new ones associated with the lexicographical solutions regarding derivatives in dictionaries. Five of Sweden’s major monolingual dictionaries are being examined with the aim of describing and comparing their derivatives, with special focus on deverbal nouns with the suffix -ande. 
The research combines morphology, lexicography and metalexicography, aiming at presenting and discussing some of the key areas of lemma inclusion and word formation principles in Swedish monolingual, contemporary dictionaries.}, journal = {Lexicographica. International Annual for Lexicography / Revue Internationale de Lexicographie / Internationales Jahrbuch für Lexikographie}, author = {Holmer, Louise}, year = {2024}, volume = {40}, number = {1}, pages = {59--79}, } @inProceedings{kokkinakis-2024-from-336089, title = {From Zipf distribution to Universal Dependencies - Interactive Notebooks for Swedish Text Analysis}, abstract = {Notebook-based environments are powerful (web-based) interactive development resources for conducting exploratory (textual) data analysis (EDA). These environments allow the embedding of code (code snippets in ‘code cells’) which can be easily executed with the results immediately presented into the user’s window. This paper introduces some basic exploratory tools and techniques using JupyterLab notebooks, applied to Swedish using a subcorpus that addresses various topics related to the COVID-19 pandemic published during January-December 2021.}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden}, author = {Kokkinakis, Dimitrios}, year = {2024}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @inProceedings{lofgren-dannells-2024-post-336065, title = {Post-OCR Correction of Digitized Swedish Newspapers with ByT5}, abstract = {Many collections of digitized newspapers suffer from poor OCR quality, which impacts readability, information retrieval, and analysis of the material. Errors in OCR output can be reduced by applying machine translation models to translate it into a corrected version. 
Although transformer models show promising results in post-OCR correction and related tasks in other languages, they have not yet been explored for correcting OCR errors in Swedish texts. This paper presents a post-OCR correction model for Swedish 19th to 21th century newspapers based on the pre-trained transformer model ByT5. Three versions of the model were trained on different mixes of training data. The best model, which achieved a 36\% reduction in CER, is made freely available and will be integrated into the automatic processing pipeline of Språkbanken Text, a Swedish language technology infrastructure containing modern and historical written data.}, booktitle = {Proceedings of the 8th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2024), March 22, 2024, Malta}, author = {Löfgren, Viktoria and Dannélls, Dana}, year = {2024}, publisher = {Association for Computational Linguistics}, address = {East Stroudsburg, Pennsylvania, United States}, ISBN = {979-8-89176-069-1}, } @article{landqvist-rogstrom-2024-genreutveckling-341609, title = {Genreutveckling i vetenskaplig prosa. Clas Bjerkanders entomologiska rön i Kungliga Vetenskapsakademiens Handlingar 1775–1795}, abstract = {The purpose of this article is to analyze the early (natural) scientific genre development in Sweden during the 18th century, focusing on the Royal Swedish Academy of Sciences, and its Transactions, as a discursive community. The Transactions are considered an early example of what we now refer to as sakprosa but have never been thoroughly analyzed regarding its genre specific characteristics. Bjerkander’s 26 findings on entomology are chosen as material for this study, grounded in genre analysis (Swales 1990, 2004; Bhatia 2004). The results reveal that Bjerkander’s entomological findings evolve into three subgenres. 
From a genre development perspective, Bjerkander’s texts primarily reflect an establishing stage (Gunnarsson 2011), but they anticipate a specialized stage in three instances: Bjerkander’s use of the so-called CARS model, his methodological awareness, and his use of references to previous research.}, journal = {Folkmålsstudier}, author = {Landqvist, Hans and Rogström, Lena}, year = {2024}, volume = {62}, pages = {89--125}, } @inProceedings{skoldberg-2024-andra-337378, title = {Andra upplagan av Svensk ordbok (SO) – Förutsättningar, teoretiska överväganden, insatser och mottagande}, booktitle = {Svenskans beskrivning 38: Förhandlingar vid trettioåttonde sammankomsten. Örebro 4–6 maj 2022, Del I. Redigerad av Denny Jansson, Ida Melander, Gustav Westberg \& Daroon Yassin Falk}, author = {Sköldberg, Emma}, year = {2024}, publisher = {Örebro universitet}, address = {Örebro}, ISBN = {978-91-87789-89-2}, pages = {165--180}, } @incollection{pfaff-bouma-2024-npegl-335993, title = {The NPEGL noun phrase database: design and construction}, booktitle = {Noun phrases in early Germanic languages}, editor = {Bech, Kristin and Pfaff, Alexander}, author = {Pfaff, Alexander and Bouma, Gerlof}, year = {2024}, publisher = {Language Science Press}, address = {Berlin}, ISBN = {978-3-96110-467-3}, pages = {1--32}, } @article{landqvist-2024-gratis-344586, title = {”Gratis broddar till dig som är 65 år och bor i Göteborg”: En fallstudie av legitimeringsstrategier i svenska kommuners webbtexter riktade till seniorer}, abstract = {Offers of free ice cleats to senior citizens is a rather new phenomenon in Sweden. Therefore, Swedish municipalities should strive to linguistically legitimize such offers to convince senior citizens to use ice cleats during the winter season. The research questions for the reported study are (1) Which main strategies for legitimation appear in the municipalities’ texts? (2) Which subtypes to different main strategies appear in the municipalities’ texts? 
(3) What general picture of the municipalities’ approach to senior citizens can the identified strategies be said to convey? A corpus of 23 texts on municipal websites, published in 2022 or 2023, is analysed, using van Leeuwen's model for legitimation analysis (2008) as a theoretical-methodical basis. The results of the study show that ice cleat offers are legitimized lexicogrammatically mainly through the two main legitimation strategies Authorization and Rationalization. The main strategy Moral Evaluation, with a focus on public health and the health and well-being of senior citizens, is also used. No instances of the main strategy Mythopoesis are identified. Several subtypes to the three main strategies Authorization, Rationalization and Moral Evaluation are identified in the corpus. Some of the strategies for legitimation used can be deemed as an expression of an equal relationship between municipalities and senior citizens, while others can be said to express an unequal relationship.}, journal = {VAKKI Publications: Diversity in Communication}, author = {Landqvist, Hans}, year = {2024}, volume = {16}, pages = {67--89}, } @inProceedings{landqvist-etal-2024-appendicit-337164, title = {Hur kan ”appendicit”, ”blodförgiftning” och ”hyperaktivitetssyndrom” behandlas? Medicinens fackområde i Svensk ordbok utgiven av Svenska Akademien}, booktitle = {Svenskans beskrivning 38. Förhandlingar vid trettioåttonde sammankomsten Örebro 4–6 maj 2022. Del II. Redigerad av Denny Jansson, Ida Melander, Gustav Westberg \& Daroon Yassin Falk}, author = {Landqvist, Hans and Sköldberg, Emma and Holmer, Louise}, year = {2024}, publisher = {Örebro universitet}, address = {Örebro}, ISBN = {978-91-87789-90-8}, pages = {89--106}, } @article{landqvist-etal-2024-termer-339976, title = {Termer för grundämnena H, N och O: resultat av nordiskt terminologisamarbete från 1950-tal till 2020-tal}, abstract = {Den som vill bedriva ett framgångsrikt terminologiarbete måste ha samarbetspartners. 
År 1981 formulerade Christer Laurén detta faktum som att ”[b]åde ingenjören och språkmannen behövs i fackspråklig språkvård”, medan Henrik Nilsson fyrtio år senare konstaterade att ”[t]erminologer behöver experter för att kunna genomföra ett terminologiarbete av god kvalitet” (Laurén 1981, s. 9; Nilsson 2021, s. 77). Samarbetet kan ske inom ett land eller mellan personer och institutioner i flera länder (Bucher 2016a; Bucher 2017; Nilsson 2021). Den här artikeln handlar om terminologi(sam)arbete i Norden.}, journal = {Sprog i Norden 2024/Språk i Norden 2024. Tema: Nordterm 23 Terminologi i samhällets tjänst (Red. Kirsten Lindø Dolberg-Møller)}, author = {Landqvist, Hans and Nissilä, Niina and Sjöberg, Sannina}, year = {2024}, pages = {137--152}, } @article{skoldberg-landqvist-2024-sorry-343321, title = {Sorry, shit and wow: a case study of the handling of interjections in three Nordic monolingual dictionaries}, abstract = {This paper discusses a qualitative and, to some extent, comparative metalexicographical case study on interjections, with an English origin, in three Nordic monolingual dictionaries. In short, the study answers the following research questions: (1) How are three well-established interjections handled in The Contemporary Dictionary of the Swedish Academy (SO) compared to corresponding entries in a Danish and a Norwegian dictionary and how can the SO descriptions be developed?; (2) How can three less established interjections be analyzed and described in an updated version of the SO? The point of departure for answering the research questions is information types that are common in dictionary entries. Furthermore, the use of interjections in corpora and text collections for Swedish are crucial for the investigation. The study shows that interjections as a category imply several challenges for lexicographers. 
Finally, some suggestions are presented concerning the way in which the description of interjections in the SO may be developed}, journal = {Lexicographica. International Annual for Lexicography / Revue Internationale de Lexicographie / Internationales Jahrbuch für Lexikographie }, author = {Sköldberg, Emma and Landqvist, Hans}, year = {2024}, volume = {40}, number = {1}, pages = {29--57}, } @inProceedings{dannells-etal-2024-transformer-338708, title = {Transformer-based Swedish Semantic Role Labeling through Transfer Learning}, abstract = {Semantic Role Labeling (SRL) is a task in natural language understanding where the goal is to extract semantic roles for a given sentence. English SRL has achieved state-of-the-art performance using Transformer techniques and supervised learning. However, this technique is not a viable choice for smaller languages like Swedish due to the limited amount of training data. In this paper, we present the first effort in building a Transformer-based SRL system for Swedish by exploring multilingual and cross-lingual transfer learning methods and leveraging the Swedish FrameNet resource. We demonstrate that multilingual transfer learning outperforms two different cross-lingual transfer models. We also found some differences between frames in FrameNet that can either hinder or enhance the model’s performance. The resulting end-to-end model is freely available and will be made accessible through Språkbanken Text’s research infrastructure.}, booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), 20-25 May, 2024, Torino, Italia}, author = {Dannélls, Dana and Johansson, Richard and Buhr, Lucy Yang}, year = {2024}, publisher = {ELRA and ICCL}, address = {Turin, Italy}, ISBN = {978-2-493814-10-4}, } @article{skoldberg-wenner-2024-varfor-341783, title = {Varför Paris men inte Prag? 
Om namn i SAOL 14}, abstract = {This article reports on a study of the inclusion of names in the latest edition of The Swedish Academy Glossary (SAOL 14). We begin by presenting the principles for including names in Swedish monolingual dictionaries in general. We then discuss the names included in SAOL 14, based on those from previous editions, emails from dictionary users regarding the names, and the results of an online survey on the topic. Finally, we address the question of how the editors could approach this subset of headwords in the next edition of the glossary}, journal = {Nordic Journal of Socio-Onomastics}, author = {Sköldberg, Emma and Wenner, Lena}, year = {2024}, volume = {4}, number = {2}, pages = {131--166}, } @inProceedings{lyngfelt-etal-2024-flersprakig-338191, title = {Flerspråkig konstruktikografi med hjälp av språkneutrala jämförelsebegrepp}, booktitle = {Svenskans beskrivning. Förhandlingar vid trettioåttonde sammankomsten, del 1, Örebro 4–6 maj 2022}, author = {Lyngfelt, Benjamin and Andréasson, Maia and Blensenius, Kristian and Bäckström, Linnéa and Höder, Steffen and Ljunglöf, Peter and Uppström, Jonatan}, year = {2024}, publisher = {Örebro universitet}, address = {Örebro}, ISBN = {978-91-87789-89-2}, } @misc{gaillat-etal-2024-proceedings-345595, title = {Proceedings of the 13th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2024)}, abstract = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. 
The latter includes, among others, the integration of insights from Second Language Acquisition (SLA) research, and the promotion of “Computational SLA” through setting up Second Language research infrastructures.}, author = {Gaillat, Thomas and Mallart, Cyriel and Moreau, Fabienne and Li, Jen-Yu and Drouet, Griselda and Alfter, David and Volodina, Elena and Jönsson, Arne}, year = {2024}, publisher = {Linköping University Press}, address = {Linköping}, ISBN = {978-91-8075-774-4}, } @inProceedings{ljunglof-etal-2024-binary-342402, title = {Binary indexes for optimising corpus queries}, abstract = {To be able to search for patterns in annotated text corpora is crucial for many different research disciplines. However, searching for complex patterns in large corpora can take long time – sometimes several minutes or even hours. We investigate how inverted indexes can be used for efficient searching in large annotated corpora, and in particular binary indexes. We show how corpus queries are translated into lookups in unary and binary inverted indexes, and give efficient strategies for combining the results using efficient set operations. In addition we discuss how to make use of binary indexes for more complex query types.}, booktitle = {Proceedings of the 20th Conference on Natural Language Processing (KONVENS 2024), September 10-13, 2024, Vienna, Austria}, author = {Ljunglöf, Peter and Smallbone, Nicholas and Thoresson, Mijo and Salomonsson, Victor}, year = {2024}, publisher = {Association for Computational Linguistics}, ISBN = {9798331304843}, } @article{landqvist-skoldberg-2024-interjektioner-336473, title = {Interjektioner som lexikografisk utmaning. En fallstudie av interjektioner med engelskt ursprung utifrån Svensk ordbok utgiven av Svenska Akademien}, abstract = {In this article, a qualitative and, to some extent, comparative metalexicographic case study is reported. 
The study will answer two research questions: (1) How are the interjections "sorry", "shit" and "wow" described in The Contemporary Dictionary of the Swedish Academy (SO) compared to the corresponding dictionary articles in The Danish Dictionary (DDO) and the Norwegian Academy’s Dictionary (NAOB) and how can the SO descriptions be developed?; (2) How can the interjections "yes", "nice/najs" and "woho/wohoo" be analyzed and then described in new dictionary articles in an updated version of SO? The point of departure for answering both RQs is a number of information categories that are common in dictionary articles. Furthermore, the use of the current interjections in contemporary corpora and text collections for Swedish are crucial for the investigation. The results of the study show that interjections as a category implies several challenges for lexicographers regarding information about their spelling, pronunciation, and inflection, meaning, language examples, usage comments as well as information about their establishment, origin, and kinship. Finally, some suggestions are presented for how the description of interjections in the dictionary can be developed.}, journal = {ASLA:s skriftserie/ASLA Studies in Applied Linguistics}, author = {Landqvist, Hans and Sköldberg, Emma}, year = {2024}, volume = {31}, pages = {26--55}, } @inProceedings{deworetzki-etal-2024-towards-345085, title = {Towards an Algebraic Approach for Corpus Queries }, abstract = {Analysis of text corpora involves the use of specialised corpus search tools, capable of handling huge amounts of annotated text. The extent to which these tools apply optimisations to reduce query execution times is as diverse as the tools themselves. We argue that the development of a corpus algebra, similar to relational algebra in relational database systems, is a valuable foundation to improve corpus query optimisation. 
We demonstrate a query optimisation approach based on algebraic transformations, which vastly reduces query execution times.}, booktitle = {Swedish Language Technology Conference, 27–29 November 2024, Linköping, Sweden}, author = {Deworetzki, Niklas and Ljunglöf, Peter and Smallbone, Nicholas}, year = {2024}, } @inProceedings{francis-2024-variation-342620, title = {Variation between Credible and Non-Credible News Across Topics}, abstract = {‘Fake News’ continues to undermine trust in modern journalism and politics. Despite continued efforts to study fake news, results have been conflicting. Previous attempts to analyse and combat fake news have largely focused on distinguishing fake news from truth, or differentiating between its various sub-types (such as propaganda, satire, misinformation, etc.). This paper conducts a linguistic and stylistic analysis of fake news, focusing on variation between various news topics. It builds on related work identifying features from discourse and linguistics in deception detection by analysing five distinct news topics: Economy, Entertainment, Health, Science, and Sports. The results emphasize that linguistic features vary between credible and deceptive news in each domain and highlight the importance of adapting classification tasks to accommodate variety-based stylistic and linguistic differences in order to achieve better real-world performance.}, booktitle = {The First International Conference on Natural Language Processing and Artificial Intelligence for Cyber Security}, author = {Francis, Emilie}, year = {2024}, publisher = {NLPAICS’2024}, address = {Lancaster, U.K.}, pages = {86--96}, } @inProceedings{lindahl-2024-disagreement-341074, title = {Disagreement in Argumentation Annotation}, abstract = {Disagreement, perspective or error? There is a growing discussion against the idea of a unified ground truth in annotated data, as well as the usefulness of such a ground truth and resulting gold standard. 
In data perspectivism, this issue is exemplified with tasks such as hate speech or sentiment classification in which annotators’ different perspectives are important to include. In this paper we turn to argumentation, a related field which has had less focus from this point of view. Argumentation is difficult to annotate for several reasons, from the more practical parts of deciding where the argumentation begins and ends to questions of how argumentation is defined and what it consists of. Learning more about disagreement is therefore important in order to improve argument annotation and to better utilize argument annotated data. Because of this, we examine disagreement in two corpora annotated with argumentation both manually and computationally. We find that disagreement is often not because of annotation errors or mistakes but due to the possibility of multiple possible interpretations. More specifically, these interpretations can be over boundaries, label or existence of argumentation. These results emphasize the need for more thorough analysis of disagreement in data, outside of the more common inter-annotator agreement measures.}, booktitle = {3rd Workshop on Perspectivist Approaches to NLP, NLPerspectives 2024 at LREC-COLING 2024 - Workshop Proceedings}, author = {Lindahl, Anna}, year = {2024}, ISBN = {9782493814234}, } @incollection{tiedemann-etal-2024-multiword-343530, title = {Multiword expressions in Swedish as a second language: Taxonomy, annotation, and initial results}, abstract = {This chapter introduces part of the Swedish L2 profiles, a new resource for Swedish as a second language. Multiword expressions (MWEs) in this resource are based on knowledge-based automatic annotation of MWEs, which we show works quite well for Swedish. In contrast, manual annotation of the compositionality of each MWE proved difficult, probably due to different interpretations of "compositionality" by the two annotators. 
We show that experts and non-experts can rank MWEs very similarly according to relative receptive difficulty, with particularly high agreement for the easiest items. A qualitative comparison of the proficiency levels associated with the MWEs based on coursebook occurrences and the results from crowdsourcing and direct ranking indicate that MWEs which appear in few books of the same level are more likely to be difficult to associate with an appropriate level based on coursebook corpus data. Furthermore, results show that compositionality and/or transparency might influence the relative ranking. Finally, there is a clear increase in MWE lemmas at higher proficiency levels at the group level, and at the highest level receptive and productive data include the same percentage of MWEs.}, booktitle = {Multiword Expressions in Lexical Resources: Linguistic, Lexicographic, and Computational Perspectives}, editor = {Giouli, Voula and Barbu Mititelu, Verginica}, author = {Tiedemann, Therese Lindström and Alfter, David and Ali Mohammed, Yousuf and Piipponen, Daniela and Silén, Beatrice and Volodina, Elena}, year = {2024}, publisher = {Language Science Press}, address = {Berlin}, ISBN = {978-3-98554-099-0}, pages = {309--348}, } @inProceedings{szawerna-2024-stanza-336413, title = {Can Stanza be Used for Part-of-Speech Tagging Historical Polish?}, abstract = {The goal of this paper is to evaluate the performance of Stanza, a part-of-speech (POS) tagger developed for modern Polish, on historical text to assess its possible use for automating the annotation of other historical texts. While the issue of the reliability of utilizing POS taggers on historical data has been previously discussed, most of the research focuses on languages whose grammar differs from Polish, meaning that their results need not be fully applicable in this case. 
The evaluation of Stanza is conducted on two sets of 10286 and 3270 manually annotated tokens from a piece of historical Polish writing (1899), and the errors are analyzed qualitatively and quantitatively. The results show a good performance of the tagger, especially when it comes to Universal Part-of-Speech (UPOS) tags, which is promising for utilizing the tagger for automatic annotation in larger projects, and pinpoint some common features of misclassified tokens.}, booktitle = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop, March 21-22, 2024, St. Julian’s, Malta}, author = {Szawerna, Maria Irena}, year = {2024}, publisher = {Association for Computational Linguistics}, ISBN = {979-8-89176-090-5}, } @inProceedings{masciolini-etal-2024-synthetic-338288, title = {Synthetic-Error Augmented Parsing of Swedish as a Second Language: Experiments with Word Order}, abstract = {Ungrammatical text poses significant challenges for off-the-shelf dependency parsers. In this paper, we explore the effectiveness of using synthetic data to improve performance on essays written by learners of Swedish as a second language. Due to their relevance and ease of annotation, we restrict our initial experiments to word order errors. To do that, we build a corrupted version of the standard Swedish Universal Dependencies (UD) treebank Talbanken, mimicking the error patterns and frequency distributions observed in the Swedish Learner Language (SweLL) corpus. We then use the MaChAmp (Massive Choice, Ample tasks) toolkit to train an array of BERT-based dependency parsers, fine-tuning on different combinations of original and corrupted data. We evaluate the resulting models not only on their respective test sets but also, most importantly, on a smaller collection of sentence-correction pairs derived from SweLL. 
Results show small but significant performance improvements on the target domain, with minimal decline on normative data.}, booktitle = {Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024, May 25, 2024, Torino, Italia}, author = {Masciolini, Arianna and Francis, Emilie and Szawerna, Maria Irena}, year = {2024}, publisher = {ELRA and ICCL}, address = {Torino, Italy}, ISBN = {978-2-493814-20-3}, } @inProceedings{munozsanchez-2024-when-341073, title = {When Hieroglyphs Meet Technology: A Linguistic Journey through Ancient Egypt Using Natural Language Processing}, abstract = {Knowing our past can help us better understand our future. The explosive development of NLP in these past few decades has allowed us to study ancient languages and cultures in ways that we couldn’t have done in the past. However, not all languages have received the same level of attention. Despite its popularity in pop culture, the languages spoken in Ancient Egypt have been somewhat overlooked in terms of NLP research. In this survey paper we give an overview of how NLP has been used to study different variations of the Ancient Egyptian languages. This not only includes Old, Middle, and Late Egyptian but also Demotic and Coptic. We begin by giving a short introduction to these languages and their writing systems, before talking about the corpora and lexical resources that are available digitally. We then show the different NLP tasks that have been tackled for different variations of Ancient Egyptian, as well as the approaches that have been used. 
We hope that our work can stoke interest in the study of these languages within the NLP community.}, booktitle = {3rd Workshop on Language Technologies for Historical and Ancient Languages, LT4HALA 2024 at LREC-COLING 2024 - Workshop Proceedings, 25 May, 2024 Torino, Italia}, author = {Muñoz Sánchez, Ricardo}, year = {2024}, publisher = { ELRA Language Resources Association}, ISBN = {9782493814463}, } @inProceedings{masciolini-2024-bootstrapping-338425, title = {Bootstrapping the Annotation of UD Learner Treebanks}, abstract = {Learner data comes in a variety of formats, making corpora difficult to compare with each other. Universal Dependencies (UD) has therefore been proposed as a replacement for the various ad-hoc annotation schemes. Nowadays, the time-consuming task of building a UD treebank often starts with a round of automatic annotation. The performance of the currently available tools trained on standard language, however, tends to decline substantially upon application to learner text. Grammatical errors play a major role, but a significant performance gap has been observed even between standard test sets and normalized learner essays. In this paper, we investigate how to best bootstrap the annotation of UD learner corpora. In particular, we want to establish whether Target Hypotheses (THs), i.e. grammar-corrected learner sentences, are suitable training data for fine-tuning a parser aimed for original (ungrammatical) L2 material. We perform experiments using English and Italian data from two of the already available UD learner corpora. 
Our results show manually annotated THs to be highly beneficial and suggest that even automatically parsed sentences of this kind might be helpful, if available in sufficiently large amounts.}, booktitle = {Proceedings of the 17th Workshop on Building and Using Comparable Corpora (BUCC) @ LREC-COLING 2024, 20 May, 2024, Torino, Italia}, author = {Masciolini, Arianna}, year = {2024}, publisher = {ELRA }, ISBN = {978-2-493814-31-9}, } @inProceedings{adesam-etal-2024-sprakforandring-337166, title = {Språkförändring på bar gärning. En mikrodiakron korpusstudie av pågående förändringar i stavning, lexikon och grammatik}, booktitle = {Svenskans beskrivning 38: Förhandlingar vid trettioåttonde sammankomsten. Örebro 4–6 maj 2022, Del I}, author = {Adesam, Yvonne and Berdicevskis, Aleksandrs and Coussé, Evie}, year = {2024}, publisher = {Örebro universitet}, address = {Örebro}, ISBN = {978-91-87789-89-2}, pages = {234--251}, } @inProceedings{ahlfeldt-matsson-2024-digarv-334595, title = {The DIGARV Platform: A collaborative platform for working with cultural heritage data and research data}, abstract = {This article covers an easy-to-use research tool for collaborative work. The tool has been adapted for structured data and high-resolution images within four research projects at GRIDH. The platform is especially designed for working with temporal and spatial data. Furthermore, the platform gives researchers access to a relational database system through input forms and access to external cultural heritage data including high-resolution images. This way the platform also aims to utilize external data published as Linked Open Data (LOD) and, at the same time, prepare its own research data for publishing as LOD. 
Because of the spatial and temporal nature of the data, it is visualized in time and space through maps and timelines to give overview and context during the data management phase.}, booktitle = {Proceedings of the Huminfra Conference, 10-11 January, 2024, Gothenburg, Sweden}, editor = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna}, author = {Åhlfeldt, Johan and Matsson, Arild}, year = {2024}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @inProceedings{lange-2024-setting-341511, title = {Setting up a Research Data Repository Based on Invenio RDM: An Experience Report}, booktitle = {DHNB 2024: Digital Humanities in the Nordic and Baltic Countries 8th Conference, Reykjavík, Iceland, 27–31 May 2024}, author = {Lange, Herbert}, year = {2024}, publisher = {University of Oslo library}, address = {Oslo, Norway}, } @inProceedings{angsal-etal-2024-terrorism-337182, title = {Terrorism som tolkningsram: en diskurssemantisk studie av svensk riksdagsdebatt 1993–2018}, booktitle = {Svenskans beskrivning 38 Förhandlingar vid trettioåttonde sammankomsten. Örebro 4–6 maj 2022 Del III / Redigerad av Denny Jansson, Ida Melander, Gustav Westberg \& Daroon Yassin Falk.}, author = {Ängsal, Magnus Pettersson and Brodén, Daniel and Fridlund, Mats and Olsson, Leif-Jöran and Öhberg, Patrik}, year = {2024}, publisher = {Örebro universitet}, address = {Örebro}, ISBN = {978-91-87789-91-5}, pages = {194--210}, } @article{volodina-etal-2024-swedish-340630, title = {Swedish word family resource: Construction, applicability, strengths and first experiments}, abstract = {The article introduces a novel lexical resource for Swedish based on word family principles. The development of the Swedish Word Family (SweWF) resource is set into the context of linguistic complexity in second language acquisition. 
The SweWF is particularly appropriate for that, given that it contains lexical items used in second language corpora, namely, in a corpus of coursebook texts, and in a corpus of learner essays. The main focus of the article is on the construction of the resource with its user interface and on its applicability for research, although it also opens vast possibilities for practical applications for language learning, testing and assessment. We demonstrate the value of the resource through several case studies.}, journal = {ITL - International Journal of Applied Linguistics}, author = {Volodina, Elena and Ali Mohammed, Yousuf and Tiedemann, Therese Lindstrom}, year = {2024}, volume = {175}, number = {1}, pages = {127 -- 161}, } @article{forsberg-holmer-2024-datatillgang-343814, title = {Datatillgång, metodutveckling och lexikografiskt arbete vid Språkbanken Text}, journal = {LexicoNordica}, author = {Forsberg, Markus and Holmer, Louise}, year = {2024}, volume = {31}, pages = {61--79}, } @inProceedings{broden-etal-2024-samforfattande-335726, title = {Samförfattande som datadriven tvärvetenskap: Pragmatiska lärdomar från SweTerror-projektet }, abstract = {Terrorism i svensk politik (SweTerror) är ett storskaligt tvärvetenskapligt forskningsprojekt med forskare från såväl human- och samhällsvetenskaperna som datavetenskaperna. Samtidigt använder och utvecklar SweTerror nationell forskningsinfrastruktur för riksdagsdata. Detta paper beskriver användningen av samförfattande som en datadriven tvärvetenskaplig praktik för att integrera olika vetenskapliga perspektiv och skapa samsyn i projektforskningen. Vi tar fasta på betydelsen av valet att koncentrera samarbetsformen kring konferenspapers inom specifikt digital humaniora och diskuterar erfarenheten av att samskrivande försvagar vetenskapligt revirtänkande, liksom ett iterativt förhållningssätt till forskningsdata kopplade till forskningsinfrastrukturer under uppbyggnad. 
Avslutningsvis betonar vi datadrivet samförfattande som en pragmatisk praktik för att stärka kollaborativt samarbete och kunskapsbryggor inom en tvärvetenskaplig forskargrupp.}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden}, author = {Brodén, Daniel and Fridlund, Mats and Olsson, Leif-Jöran and Ängsal, Magnus Pettersson and Öhberg, Patrik}, year = {2024}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @misc{volodina-etal-2024-proceedings-335190, title = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden}, author = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna}, year = {2024}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @inProceedings{bouma-etal-2024-konsten-333683, title = {Konsten att bedriva svensk ordforskning utan att kränka upphovsrätten}, abstract = {Vi beskriver KB-labb och Språkbanken Texts samarbete för att underlätta ordforskning på de upphovsrätts-skyddade korpusar som finns i Kungliga bibliotekets samlingar. Satsningen har hittils lett till två öppna datasamlingar, Kubord 1 och 2, som ger tillgång till ordstatistik och ordsamförekomststatistik. Vi beskriver även Kubord-fastText, en samling vektormodeller som är baserade på samma korpusar, som är underutveckling}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), Gothenburg, 10–11 January, 2024 / eds. 
Elena Volodina, Gerlof Bouma, Markus Forsberg, Dimitrios Kokkinakis, David Alfter, Mats Fridlund, Christian Horn, Lars Ahrenberg, Anna Blåder}, author = {Bouma, Gerlof and Forsberg, Markus and Sikora, Justyna and Sköldberg, Emma}, year = {2024}, publisher = { Linköping University Electronic Press}, address = {Linköping }, ISBN = {978-91-8075-512-2}, } @inProceedings{masciolini-toth-2024-stund-335974, title = {STUnD: ett Sökverktyg för Tvåspråkiga Universal Dependencies-trädbanker }, abstract = {Föreliggande artikel introducerar STUND, ett Sökverktyg för Tvåspråkiga Universal Dependencies-trädbanker som möjliggör parallella syntaktiska sökningar. Vi demonstrerar dess praktiska tillämpning i en fallstudie på tempusformen presens perfekt i svenska och engelska. Resultaten visar att presens perfekt används i ungefär lika stor utsträckning i båda språken, men att det förekommer viss variation som verkar bero på språkspecifika konventioner och översättningsstrategier. }, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), Gothenburg, 10–11 January 2024}, author = {Masciolini, Arianna and Tóth, Márton András}, year = {2024}, publisher = {Linköping University Electronic Press}, address = {Linköping }, ISBN = {978-91-8075-512-2}, } @article{berdicevskis-etal-2024-drop-326112, title = {To drop or not to drop? Predicting the omission of the infinitival marker in a Swedish future construction}, abstract = {We investigate the optional omission of the infinitival marker in a Swedish future tense construction. During the last two decades the frequency of omission has been rapidly increasing, and this process has received considerable attention in the literature. We test whether the knowledge which has been accumulated can yield accurate predictions of language variation and change. We extracted all occurrences of the construction from a very large collection of corpora. 
The dataset was automatically annotated with language-internal predictors which have previously been shown or hypothesized to affect the variation. We trained several models in order to make two kinds of predictions: whether the marker will be omitted in a specific utterance and how large the proportion of omissions will be for a given time period. For most of the approaches we tried, we were not able to achieve a better-than-baseline performance. The only exception was predicting the proportion of omissions using autoregressive integrated moving average models for one-step-ahead forecast, and in this case time was the only predictor that mattered. Our data suggest that most of the language-internal predictors do have some effect on the variation, but the effect is not strong enough to yield reliable predictions.}, journal = {Corpus Linguistics and Linguistic Theory}, author = {Berdicevskis, Aleksandrs and Coussé, Evie and Koplenig, Alexander and Adesam, Yvonne}, year = {2024}, volume = {20}, number = {1}, pages = {219–261}, } @inProceedings{masciolini-etal-2025-multigec-348546, title = {The MultiGEC-2025 Shared Task on Multilingual Grammatical Error Correction at NLP4CALL}, abstract = {This paper reports on MultiGEC-2025, the first shared task in text-level Multilingual Grammatical Error Correction. The shared task features twelve European languages (Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian) and is organized into two tracks, one for systems producing minimally corrected texts, thus preserving as much as possible of the original language use, and one dedicated to systems that prioritize fluency and idiomaticity. 
We introduce the task setup, data, evaluation metrics and baseline; present results obtained by the submitted systems and discuss key takeaways and ideas for future work.}, booktitle = {Proceedings of the 14th Workshop on Natural Language Processing for Computer Assisted Language Learning}, author = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert}, year = {2025}, publisher = {University of Tartu Library}, address = {Tartu, Tallinn}, ISBN = {978-9908-53-112-0}, } @techreport{masciolini-etal-2025-overview-347102, title = {An overview of Grammatical Error Correction for the twelve MultiGEC-2025 languages}, abstract = {This overview is complementary to the comprehensive dataset description article for MultiGEC – a dataset for Multilingual Grammatical Error Correction including data for twelve European languages: Czech, English, Estonian, German, Greek, Icelandic, Italian, Latvian, Russian, Slovene, Swedish and Ukrainian. It is well-known that in the field of Natural Language Processing (NLP) most publications tend to focus on the English language. While this is due to historical reasons (ease of publication, greater outreach, increased number of citations, etc.), it does leave other languages at a disadvantage across multiple tasks. The MultiGEC dataset was created as an attempt to counteract this effect. 
This report provides a historical overview of the evolution of GEC for each of the twelve languages in this dataset and provides a context for the work on the dataset and the related MultiGEC-2025 shared task.}, author = {Masciolini, Arianna and Caines, Andrew and De Clercq, Orphée and Kruijsbergen, Joni and Kurfalı, Murathan and Muñoz Sánchez, Ricardo and Volodina, Elena and Östling, Robert and Allkivi, Kais and Arhar Holdt, Špela and Auzin̦a, Ilze and Darģis, Roberts and Drakonaki, Elena and Frey, Jennifer-Carmen and Glišic, Isidora and Kikilintza, Pinelopi and Nicolas, Lionel and Romanyshyn, Mariana and Rosen, Alexandr and Rozovskaya, Alla and Suluste, Kristjan and Syvokon, Oleksiy and Tantos, Alexandros and Touriki, Despoina-Ourania and Tsiotskas, Konstantinos and Tsourilla, Eleni and Varsamopoulos, Vassilis and Wisniewski, Katrin and Žagar, Aleš and Zesch, Torsten}, year = {2025}, publisher = {University of Gothenburg}, address = {Gothenburg, Sweden}, } @article{cousse-adesam-2025-exploring-346846, title = {Exploring the language of Swedish social media: A contrastive corpus analysis}, abstract = {This article explores the language of social media by analyzing a selection of linguistic features in four corpora of Swedish social media available at Språkbanken Text: Blog mix, Familjeliv, Flashback, and Twitter. Previous research describes the language of these corpora as informal, spoken-like, unedited, non-standard, and innovative. Our corpus analysis confirms the informal and spoken-like nature of social media, while also showing that these traits are unevenly distributed across the various social media corpora and that they are also present in other traditional written corpora, such as novels. 
Our findings also reveal that the social media corpora show traits of involved and interactional language.}, journal = {Nordic Journal of Linguistics}, author = {Coussé, Evie and Adesam, Yvonne}, year = {2025}, } @inProceedings{francis-2025-language-348452, title = {Language of the Swedish Manosphere with Swedish FrameNet}, abstract = {The manosphere is a loose group of online communities centralised around the themes of anti-feminism, misogyny, racism, and hetero-masculinity. It has gained a reputation for violent extremism, particularly from members of the involuntary celibate (incel) community. Sweden sees one of the highest volumes of online traffic to well-known incel forums in all of Europe. In spite of this, there is little information on manosphere/incel cultre in Swedish. This paper uses posts from Flashback’s manosphere subforum automatically annotated with Swedish FrameNet to analyse the language community in a Swedish context. To do so, a lexicon for the Swedish manosphere was created and terms of interest were identified in the Swedish discourse. Analysis of prominent semantic frames linked to these terms of interest presents a detailed look into the language of the Swedish manosphere.}, booktitle = {25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)}, author = {Francis, Emilie}, year = {2025}, publisher = {University of Tartu Library}, address = {Tartu, Estonia}, pages = {10}, } @incollection{borin-lyngfelt-2025-framenets-347629, title = {Framenets and ConstructiCons}, booktitle = {The Cambridge Handbook of Construction Grammar, ed. 
by Mirjam Fried & Kiki Nikoforidou}, author = {Borin, Lars and Lyngfelt, Benjamin}, year = {2025}, publisher = {Cambridge University Press}, address = {Cambridge}, ISBN = { 9781009049139}, pages = {71--100}, } @misc{munozsanchez-etal-2025-proceedings-348545, title = {Proceedings of the 14th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2025)}, abstract = {The workshop series on Natural Language Processing (NLP) for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on integrating Natural Lan- guage Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, the in- tegration of insights from Second Language Acquisition (SLA) research and the promotion of “Computational SLA” through setting up Second Language research infrastructures. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has in- spired the name for this area of research — Intelligent CALL, ICALL for short. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. Therefore, this work- shop invites a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and those where SLA theories (peda- gogical practices or empirical data) and modeled using ICALL tools. 
The NLP4CALL workshop series is aimed at bringing together competences from these areas for sharing experiences and brainstorming around the future of the field.}, author = {Muñoz Sánchez, Ricardo and Alfter, David and Volodina, Elena and Kallas, Jelena}, year = {2025}, publisher = {University of Tartu Library}, address = {Tartu, Estonia}, ISBN = {978-9908-53-112-0}, } @inProceedings{szawerna-etal-2025-devils-348547, title = {The Devil’s in the Details: the Detailedness of Classes Influences Personal Information Detection and Labeling}, abstract = {In this paper, we experiment with the effect of different levels of detailedness or granularity—understood as i) the number of classes, and ii) the classes’ semantic depth in the sense of hypernym and hyponym relations — of the annotation of Personally Identifiable Information (PII) on automatic detection and labeling of such information. We fine-tune a Swedish BERT model on a corpus of Swedish learner essays annotated with a total of six PII tagsets at varying levels of granularity. We also investigate whether the presence of grammatical and lexical correction annotation in the tokens and class prevalence have an effect on predictions. We observe that the fewer total categories there are, the better the overall results are, but having a more diverse annotation facilitates fewer misclassifications for tokens containing correction annotation. We also note that the classes’ internal diversity has an effect on labeling. 
We conclude from the results that while labeling based on the detailed annotation is difficult because of the number of classes, it is likely that models trained on such annotation rely more on the semantic content captured by contextual word embeddings rather than just the form of the tokens, making them more robust against nonstandard language.}, booktitle = {Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025), March 3–4, 2025 Tallinn, Estonia) / Richard Johansson and Sara Stymne (eds.)}, author = {Szawerna, Maria Irena and Dobnik, Simon and Muñoz Sánchez, Ricardo and Vu, Xuan-Son and Volodina, Elena}, year = {2025}, publisher = {University of Tartu Library}, address = {Tartu, Estonia}, ISBN = {978-9908-53-109-0}, pages = { 697–708}, } @incollection{petersson-2025-progressive-346547, title = {Progressive aspect in Swedish and English: a case study of 'ing' and 'hålla på att'}, booktitle = {Building meanings, building connections. A festschrift in honor of Makoto Kanazawa and Christopher Tancredi (Sudo, Yasutada & Uegaki, Wataru, eds.).}, author = {Petersson, Stellan}, year = {2025}, }