@inProceedings{themistocleous-etal-2018-effects-270215,
  title = {Effects of Mild Cognitive Impairment on vowel duration},
  abstract = {Mild cognitive impairment (MCI) is a neurological condition, which is characterized by a noticeable decline of cognitive abilities, including communicative and linguistic skills. In this study, we have measured the duration of vowels produced in a reading task by 55 speakers— 30 healthy controls and 25 MCI—. The main results showed that MCI speakers differed significantly from HC in vowel duration as MCI speakers produced overall longer vowels. Also, we found that gender effects on vowel duration were different in MCI and HC. One significant aspect of this finding is that they highlight the contribution of vowel acoustic features as markers of MCI.},
  booktitle = {Proceedings of the 9th Tutorial \& Research Workshop on Experimental Linguistics, 28 - 30 August 2018, Paris, France},
  editor = {Antonis Botinis},
  author = {Themistocleous, Charalambos and Kokkinakis, Dimitrios and Eckerström, Marie and Fraser, Kathleen and Lundholm Fors, Kristina},
  year = {2018},
  ISBN = {978-960-466-162-6},
}
@inProceedings{rouces-etal-2018-generating-264719,
  title = {Generating a Gold Standard for a Swedish Sentiment Lexicon},
  abstract = {We create a gold standard for sentiment annotation of Swedish terms, using the freely available SALDO lexicon and the Gigaword corpus. For this purpose, we employ a multi-stage approach combining corpus-based frequency sampling, direct score annotation and Best-Worst Scaling.
In addition to obtaining a gold standard, we analyze the data from our process and we draw conclusions about the optimal sentiment model.},
  booktitle = {LREC 2018, Eleventh International Conference on Language Resources and Evaluation, May 7-12, 2018, Miyazaki (Japan)},
  author = {Rouces, Jacobo and Tahmasebi, Nina and Borin, Lars and Rødven-Eide, Stian},
  year = {2018},
  publisher = {ELRA},
  address = {Miyazaki},
  ISBN = {979-10-95546-00-9},
}
@inProceedings{fraser-etal-2018-improving-264397,
  title = {Improving the Sensitivity and Specificity of MCI Screening with Linguistic Information},
  abstract = {The Mini-Mental State Exam (MMSE) is a screening tool for cognitive impairment. It has been extensively validated and is widely used, but has been criticized as not being effective in detecting mild cognitive impairment (MCI). In this study, we examine the utility of augmenting MMSE scores with automatically extracted linguistic information from a narrative speech task to better differentiate between individuals with MCI and healthy controls in a Swedish population. We find that with the addition of just four linguistic features, the F score (measuring a trade-off between sensitivity and specificity) is improved from 0.67 to 0.81 in logistic regression classification. These preliminary results suggest that the accuracy of traditional screening tools may be improved through the addition of computerized language analysis.},
  booktitle = {Proceedings of the LREC workshop: Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments (RaPID-2). 8th of May 2018, Miyazaki, Japan},
  editor = {Kokkinakis, Dimitrios},
  author = {Fraser, Kathleen and Lundholm Fors, Kristina and Eckerström, Marie and Themistocleous, Charalambos and Kokkinakis, Dimitrios},
  year = {2018},
  ISBN = {979-10-95546-26-9},
}
@incollection{borin-edlund-2018-language-269047,
  title = {Language technology and 3rd wave HCI: Towards phatic communication and situated interaction},
  abstract = {In the field of language technology, researchers are starting to pay more attention to various interactional aspects of language – a development prompted by a confluence of factors, and one which applies equally to the processing of written and spoken language. Notably, the so-called ‘phatic’ aspects of linguistic communication are coming into focus in this work, where linguistic interaction is increasingly recognized as being fundamentally situated. This development resonates well with the concerns of third wave HCI, which involves a shift in focus from stating the requirements on HCI design primarily in terms of “context-free” information flow, to a view where it is recognized that HCI – just like interaction among humans – is indissolubly embedded in complex, shifting contexts.
These – together with the different backgrounds and intentions of interaction participants – shape the interaction in ways which are not readily understandable in terms of rational information exchange, but which are nevertheless central aspects of the interaction, and which therefore must be taken into account in HCI design, including its linguistic aspects, forming the focus of this chapter.},
  booktitle = {New Directions in Third Wave Human-Computer Interaction: Volume 1 - Technologies},
  editor = {Michael Filimowicz and Veronika Tzankova},
  author = {Borin, Lars and Edlund, Jens},
  year = {2018},
  publisher = {Springer International Publishing},
  address = {Cham},
  ISBN = {978-3-319-73355-5},
  pages = {251--264},
}
@inProceedings{alfter-etal-2018-from-275364,
  title = {From Language Learning Platform to Infrastructure for Research on Language Learning},
  abstract = {Lärka is an Intelligent Computer-Assisted Language Learning (ICALL) platform developed at Språkbanken, as a flexible and a valuable source of additional learning material (e.g. via corpus-based exercises) and a support tool for both teachers and L2 learners of Swedish and students of (Swedish) linguistics.
Nowadays, Lärka is being adapted into a central building block in an emerging second language research infrastructure within a larger context of the text-based research infrastructure developed by the national Swedish Language bank, Språkbanken, and SWE-CLARIN.},
  booktitle = {Proceedings of CLARIN-2018 conference, Pisa, Italy},
  author = {Alfter, David and Borin, Lars and Pilán, Ildikó and Lindström Tiedemann, Therese and Volodina, Elena},
  year = {2018},
}
@inProceedings{kokkinakis-etal-2018-swedish-262851,
  title = {A Swedish Cookie-Theft Corpus},
  abstract = {Language disturbances can be a diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, at earlier stages, and connected speech analysis provides a non-invasive and easy-to-assess measure for determining aspects of the severity of language impairment. In this paper we focus on the development of a corpus consisting of audio recordings of picture descriptions of the Cookie-theft, produced by Swedish speakers, and accompanying transcriptions. The speech elicitation procedure provides an established method of obtaining highly constrained samples of connected speech that can allow us to study the intricate interactions between various linguistic levels and cognition. We chose the Cookie-theft picture since it is a standardized test that has been used in various studies in the past, and therefore comparisons can be made based on previous results. This type of picture description task might be useful for detecting subtle language deficits in patients with subjective and mild cognitive impairment. The resulting corpus is a new, rich and multi-faceted resource for the investigation of linguistic characteristics of connected speech and a unique data set that provides a rich resource for (future) research and experimentation in many areas, and of language impairment in particular. The information in the corpus can also be combined and correlated with other collected data about the speakers, such as neuropsychological tests, imaging and brain physiology markers and cerebrospinal fluid markers.},
  booktitle = {LREC 2018, 11th edition of the Language Resources and Evaluation Conference, 7-12 May 2018, Miyazaki (Japan)},
  editor = {Calzolari, Nicoletta and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Hasida, Koiti and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, Hélène and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios and Tokunaga, Takenobu},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Nordlund, Arto},
  year = {2018},
  publisher = {European Language Resources Association},
  ISBN = {979-10-95546-00-9},
}
@inProceedings{borin-etal-2018-many-267534,
  title = {Many a little makes a mickle - infrastructure component reuse for a massively multilingual linguistic study},
  abstract = {We present ongoing work aiming at turning the linguistic material available in Grierson’s classical Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia and studies relating to language typology and contact linguistics. The project has two concrete main aims: (1) to conduct a linguistic investigation of the claim that South Asia constitutes a linguistic area; (2) to develop state-of-the-art language technology for automatically extracting the relevant information from the text of the LSI.
In this presentation we focus on how, in the first part of the project, a number of existing research infrastructure components provided by Swe-Clarin, the Swedish CLARIN consortium, have been ‘recycled’ in order to allow the linguists involved in the project to quickly orient themselves in the vast LSI material, and to be able to provide input to the language technologists designing the tools for information extraction from the descriptive grammars.},
  booktitle = {Selected papers from the CLARIN Annual Conference 2017, Budapest, 18–20 September 2017},
  author = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
  year = {2018},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-7685-273-6},
}
% Whole proceedings volume: the people listed are recorded as its editors.
@proceedings{torrent-etal-2018-proceedings-267405,
  title = {Proceedings of the LREC 2018 Workshop International FrameNet Workshop 2018: Multilingual Framenets and Constructicons. 12 May 2018 – Miyazaki, Japan},
  abstract = {The International FrameNet Workshop 2018 brought together researchers in Frame Semantics and Construction Grammar, two areas which have traditionally been interrelated, but which have been developing somewhat independently in recent years. It is also addressed at language technology researchers working with language resources based on Frame Semantics or Construction Grammar. The workshop follows on from similar joint meetings in Berkeley, California in 2013 (IFNW 2013, sponsored by the Swedish FrameNet group) and in Juiz de Fora, Brazil in 2016 (IFNW 2016, sponsored by FrameNet Brasil), and will cover the rapidly unfolding developments in both areas and recent research on their interconnections.},
  editor = {Torrent, Tiago Timponi and Borin, Lars and Baker, Collin},
  year = {2018},
  publisher = {ELRA},
  address = {Miyazaki},
  ISBN = {979-10-95546-04-7},
}
@inProceedings{pilan-volodina-2018-exploring-275366,
  title = {Exploring word embeddings and phonological similarity for the unsupervised correction of language learner errors},
  abstract = {The presence of misspellings and other errors or non-standard word forms poses a considerable challenge for NLP systems. Although several supervised approaches have been proposed previously to normalize these, annotated training data is scarce for many languages. We investigate, therefore, an unsupervised method where correction candidates for Swedish language learners’ errors are retrieved from word embeddings. Furthermore, we compare the usefulness of combining cosine similarity with orthographic and phonological similarity based on a neural grapheme-to-phoneme conversion system we train for this purpose. Although combinations of similarity measures have been explored for finding correction candidates, it remains unclear how these measures relate to each other and how much they contribute individually to identifying the correct alternative. We experiment with different combinations of these and find that integrating phonological information is especially useful when the majority of learner errors are related to misspellings, but less so when errors are of a variety of types including, e.g. grammatical errors.
},
  booktitle = {Proceedings of the Second Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author = {Pilán, Ildikó and Volodina, Elena},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  ISBN = {978-1-948087-61-2},
}
@inProceedings{wiren-etal-2018-svala-285624,
  title = {SVALA: Annotation of Second-Language Learner Text Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract = {Annotation of second-language learner text is a cumbersome manual task which in turn requires interpretation to postulate the intended meaning of the learner’s language. This paper describes SVALA, a tool which separates the logical steps in this process while providing rich visual support for each of them. The first step is to pseudonymize the learner text to fulfil the legal and ethical requirements for a distributable learner corpus. The second step is to correct the text, which is carried out in the simplest possible way by text editing. During the editing, SVALA automatically maintains a parallel corpus with alignments between words in the learner source text and corrected text, while the annotator may repair inconsistent word alignments. Finally, the actual labelling of the corrections (the postulated errors) is performed. We describe the objectives, design and workflow of SVALA, and our plans for further development. },
  booktitle = {Selected papers from the CLARIN Annual Conference 2018, Pisa, 8-10 October 2018},
  editor = {Inguna Skadina and Maria Eskevich},
  author = {Wirén, Mats and Matsson, Arild and Rosén, Dan and Volodina, Elena},
  year = {2018},
  publisher = {Linköping University Electronic Press, Linköpings universitet},
  address = {Linköping},
  ISBN = {978-91-7685-034-3},
}
@inProceedings{lundholmfors-etal-2018-automated-263790,
  title = {Automated Syntactic Analysis of Language Abilities in Persons with Mild and Subjective Cognitive Impairment},
  abstract = {In this work we analyze the syntactic complexity of transcribed picture descriptions using a variety of automated syntactic features, and investigate the features’ predictive power in classifying narratives from people with subjective and mild cognitive impairment and healthy controls. Our results indicate that while there are no statistically significant differences, syntactic features can still be moderately successful at distinguishing the participant groups when used in a machine learning framework.},
  booktitle = {Building continents of knowledge in oceans of data : the future of co-created eHealth: proceedings of MIE2018, 24-26 April 2018, Gothenburg, Sweden},
  editor = {Adrien Ugon and Daniel Karlsson and Gunnar O. Klein and Anne Moen},
  author = {Lundholm Fors, Kristina and Fraser, Kathleen and Kokkinakis, Dimitrios},
  year = {2018},
  publisher = {IOS Press},
  address = {Amsterdam},
  ISBN = {978-1-61499-851-8},
}
@inProceedings{lundholmfors-etal-2018-voice-264400,
  title = {Eye-voice span in adults with mild cognitive impairment (MCI) and healthy controls},
  abstract = {Objectives: This study is part of a larger project focused on developing new techniques for identification of early linguistic and extra-linguistic signs of cognitive impairment, with the overall goal of identifying dementia in the preclinical stage.
In a previous study, we found that eye movements during reading can be used to distinguish between subjects with mild cognitive impairment (MCI) and healthy controls with up to 86% accuracy. In this study, we are investigating the process of reading aloud, by exploring the eye-voice span in subjects with and without cognitive impairment. The aim of the study is to identify differences in the reading processes and evaluate whether these differences can be used to discriminate between the two groups. Methods: The eye-voice span is a measurement of the temporal and spatial organization between the eye and the voice, and is affected by for example working memory and automaticity, but also by the familiarity and length of words. In previous work, differences between eye movements when reading in healthy controls and subjects with cognitive impairments have been identified, and it has been shown that subjects with Alzheimer’s disease show impairments when reading aloud, specifically with regards to speech and articulation rate. Results: We present a quantitative and qualitative analysis of the reading process in the subjects, focusing both on general measures of eye-voice span, but also specifically on instances of hesitation and mistakes in the speech, and the correlated eye movements. Conclusions/Take home message: Early detection of dementia is important for a number of reasons, such as giving the person access to interventions and medications, and allowing the individual and families time to prepare.
By expanding the knowledge about reading processes in subjects with MCI, we are adding to the potential of using reading analysis as an avenue of detecting early signs of dementia.},
  booktitle = {Book of Abstracts 10th CPLOL Congress 10-12 May 2018, Cascais, Portugal},
  editor = {Trinite, Baiba},
  author = {Lundholm Fors, Kristina and Fraser, Kathleen and Kokkinakis, Dimitrios},
  year = {2018},
}
@inProceedings{kokkinakis-etal-2018-textforskning-265113,
  title = {Kan textforskning bidra till tidigare och säkrare demensdiagnostik?},
  abstract = {Tidigare forskning har visat att subtila språkstörningar kan finnas vid de tidigaste förstadierna till demens, flera år innan en klinisk diagnos kan ställas. Inom ramen för projektet ”Språkliga och extra-lingvistiska parametrar för tidig upptäckt av kognitiv svikt” (finansierat av Riksbankens Jubileumsutlysning, 2016-19) undersöker vi med hjälp av språkteknologi och språkanalysstudier hur dessa språkstörningar yttrar sig. Kan språkteknologi användas för att upptäcka dessa tidiga språkrelaterade symtom och därmed bidra med nyanserad, komplementär och användbar kunskap? Kan användning av språkteknologi särskilja personer med de allra tidigaste kognitiva avvikelserna från personer med mer godartad, åldersrelaterad kognitiv svikt? Vilka språkliga förmågor drabbas? Hur yttrar sig dessa förändringar och vilka slags empiriska material finns att tillgå? Dessa är några av de frågor vi söker svar på. Vi gör inspelningar som vi analyserar för att kunna ta fram ny kunskap om subtila språkliga kännetecken som kan föregå demensutveckling. Denna kunskap kan användas för att eventuellt kunna förutsäga vilka individer som befinner sig i riskzonen för att utveckla demens, och kan vara användbar som komplementerande beslutsunderlag till domänexperter.
Vi utvinner, analyserar och undersöker om det finns samband mellan olika språkrelaterade parametrar från spontan talinteraktion, transkriptioner men även ögonrörelser och neuropsykologiska tester från personer med subjektiv eller lindrig kognitiv nedsättning och friska kontrollpersoner. Många gånger är det svårt att avgöra huruvida lindriga kognitiva symtom är en del av det normala åldrandet eller början på en neurodegenerativ process. Vi förväntar oss inte heller att varje enskild person med kognitiv nedsättning kommer att uttrycka sig eller läsa på samma sätt utan snarare att dessa personer tidigt i sjukdomsförloppet kommer att börja uppvisa olika slags avvikande läsmönster, eller göra fonologiska, lexikala, syntaktiska eller semantiska fel. I studien utvecklar vi verktyg för att automatiskt hitta dessa avvikelser, och målet är att detta sedan ska kunna användas som komplement till tidig diagnostik samt som prognostiskt eller screeningverktyg. Deltagarna i vår studie har rekryterats från en pågående longitudinell studie, ”Demens i Tidigt Skede”, (eng. ”The Gothenburg MCI study”) på Minnesmottagningen i Göteborg, och vårt projekt har godkänts av den lokala etiknämnden. Alla deltagare i studien (kontrollgruppen [HC], personer med subjektiv kognitiv nedsättning [SCI] och personer med mild kognitiv nedsättning [MCI]) har genomgått baslinjeundersökning och gett informerat skriftligt samtycke (demografisk information finns i tabell 1). Vårt projekt är f.n. pågående och vi kommer presentera resultat baserade på inspelningstillfälle nr ett (aug. 2016-mars 2017). En ny inspelningsomgång, med samma deltagare, började i februari 2018 och förväntas vara avslutat i december 2018. Under presentationen kommer vi ge exempel på olika tal-, text- och ögonrörelseanalyser vi har genomfört och diskutera metodval och resultat baserade på studiens första fas. Vi kommer vidare ge en kort inblick i den nya, pågående inspelningsomgången och de nya testmoment vi använder.
Vi vill med vårt arbete visa hur språkteknologisk analys kan bidra till att utöka vår kunskap inom området så att den kan vara användbar för tidig diagnostik och optimal omvårdnad. Enligt Socialstyrelsen (2017) finns det i Sverige över 160 000 personer med någon demenssjukdom. Våra resultat kan ha en betydelse för vårdpersonal som snabbare vill diagnostisera och identifiera individer med olika former av kognitiv funktionsnedsättning innan allvarliga symtom blir påtagliga. Utvecklingsmöjligheterna är många: nya eller förbättrade kognitiva screeningtester som skulle kunna användas inom primär- och specialistvården, samt utveckling och tillämpning av insatser som kan påverka beteendemönster och träna upp individens kommunikativa förmåga, kan på sikt leda till positiva konsekvenser som minskade vårdköer samt effektivare behandling avseende kostnader och behandlingsutfall.},
  booktitle = {Forum för textforskning 13, Lund 7 – 8 juni 2018},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Eckerström, Marie and Themistocleous, Charalambos},
  year = {2018},
}
@inProceedings{lange-ljunglof-2018-demonstrating-274016,
  title = {Demonstrating the MUSTE Language Learning Environment},
  abstract = {We present a language learning application that relies on grammars to model the learning outcome. Based on this concept we can provide a powerful framework for language learning exercises with an intuitive user interface and a high reliability. Currently the application aims to augment existing language classes and support students by improving the learner attitude and the general learning outcome.
Extensions beyond that scope are promising and likely to be added in the future.},
  booktitle = {NLP4CALL 2018, the 7th Workshop on NLP for Computer Assisted Language Learning, Stockholm, 7th November 2018; published as issue 152 of Linköping Electronic Conference Proceedings},
  author = {Lange, Herbert and Ljunglöf, Peter},
  year = {2018},
  publisher = {Linköping University Electronic Press, Linköpings universitet},
  address = {Linköping},
  ISBN = {978-91-7685-173-9},
}
@inProceedings{lange-ljunglof-2018-mulle-274014,
  title = {MULLE: A grammar-based Latin language learning tool to supplement the classroom setting},
  abstract = {MULLE is a tool for language learning that focuses on teaching Latin as a foreign language. It is aimed for easy integration into the traditional classroom setting and syllabus, which makes it distinct from other language learning tools that provide standalone learning experience. It uses grammar-based lessons and embraces methods of gamification to improve the learner motivation. The main type of exercise provided by our application is to practice translation, but it is also possible to shift the focus to vocabulary or morphology training.},
  booktitle = {NLPTEA 2018, the 5th Workshop on Natural Language Processing Techniques for Educational Applications, Melbourne, Australia, 19th July 2018},
  author = {Lange, Herbert and Ljunglöf, Peter},
  year = {2018},
  publisher = {Association for Computational Linguistics},
  address = {Melbourne, Australia},
}
@techreport{bjorklund-etal-2018-erfarenhetsutbyte-331777,
  title = {Kan erfarenhetsutbyte med andra med samma funktionsnedsättning leda till förbättrad kommunikation hos vuxna med hörselnedsättning?},
  author = {Björklund, Kerstin and Grindborg, Kristin and Lundholm Fors, Kristina and Malmberg, Milijana and Tovetjärn, Margareta and Wickman, Jenny and Öhman, Anna-Karin},
  year = {2018},
  ISBN = {978-91-639-9120-2},
}
@inProceedings{jatowt-etal-2018-every-272054,
  title = {Every Word Has Its History: Interactive Exploration and Visualization of Word Sense Evolution},
  booktitle = {CIKM '18 Proceedings of the 27th ACM International Conference on Information and Knowledge Management, October 22 - 26, 2018, Torino, Italy},
  author = {Jatowt, Adam and Campos, Ricardo and Bhowmick, Sourav S. and Tahmasebi, Nina and Doucet, Antoine},
  year = {2018},
  publisher = {ACM},
  address = {New York, NY, USA},
  ISBN = {978-1-4503-6014-2},
}
@inProceedings{adesam-etal-2018-fsvreader-267311,
  title = {FSvReader – Exploring Old Swedish Cultural Heritage Texts},
  abstract = {This paper describes FSvReader, a tool for easier access to Old Swedish (13th–16th century) texts. Through automatic fuzzy linking of words in a text to a dictionary describing the language of the time, the reader has direct access to dictionary pop-up definitions, in spite of the large amount of morphological and spelling variation.
The linked dictionary entries can also be used for simple searches in the text, highlighting possible further instances of the same entry. },
  booktitle = {CEUR Workshop Proceedings, vol. 2084. Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference Helsinki, Finland, March 7-9, 2018},
  editor = {Mäkelä, Eetu and Tolonen, Mikko and Tuominen, Jouni},
  author = {Adesam, Yvonne and Ahlberg, Malin and Bouma, Gerlof},
  year = {2018},
  publisher = {University of Helsinki, Faculty of Arts},
  address = {Helsinki},
}
@inProceedings{ljunglof-kjellberg-2018-interactive-274247,
  title = {Interactive correction of speech recognition errors: implementation and evaluation for English and Swedish},
  booktitle = {SLTC 2018, the 7th Swedish Language Technology Conference, Stockholm, 7-9th November 2018},
  author = {Ljunglöf, Peter and Kjellberg, J. Magnus},
  year = {2018},
}
@inProceedings{borin-etal-2018-language-290841,
  title = {Language technology for digital linguistics: Turning the Linguistic Survey of India into a rich source of linguistic information},
  abstract = {We present our work aiming at turning the linguistic material available in Grierson’s classical Linguistic Survey of India (LSI) from a printed discursive textual description into a formally structured digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. While doing so, we develop state-of-the-art language technology for automatically extracting the relevant grammatical information from the text of the LSI, and interactive linguistic information visualization tools for better analysis and comparisons of languages based on their structural and functional features.},
  booktitle = {Computational Linguistics and Intelligent Text Processing, 18th International Conference, CICLing 2017, Budapest, Hungary, April 17–23, 2017},
  series = {Lecture Notes in Computer Science},
  author = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
  year = {2018},
  publisher = {Springer},
  address = {Cham},
}
@inProceedings{karsvall-borin-2018-sdhk-265603,
  title = {SDHK meets NER: Linking place names with medieval charters and historical maps},
  booktitle = {CEUR Workshop Proceedings, vol. 2084. Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference Helsinki, Finland, March 7-9, 2018},
  editor = {Mäkelä, Eetu and Tolonen, Mikko and Tuominen, Jouni},
  author = {Karsvall, Olof and Borin, Lars},
  year = {2018},
  publisher = {University of Helsinki, Faculty of Arts},
  address = {Helsinki},
}
@inProceedings{tahmasebi-2018-study-264722,
  title = {A Study on Word2Vec on a Historical Swedish Newspaper Corpus},
  abstract = {Detecting word sense changes can be of great interest in the field of digital humanities. Thus far, most investigations and automatic methods have been developed and carried out on English text and most recent methods make use of word embeddings. This paper presents a study on using Word2Vec, a neural word embedding method, on a Swedish historical newspaper collection. Our study includes a set of 11 words and our focus is the quality and stability of the word vectors over time. We investigate if a word embedding method like Word2Vec can be effectively used on texts where the volume and quality is limited.},
  booktitle = {CEUR Workshop Proceedings. Vol. 2084. Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference, Helsinki, Finland, March 7-9, 2018.
},
  editor = {Mäkelä, Eetu and Tolonen, Mikko and Tuominen, Jouni},
  author = {Tahmasebi, Nina},
  year = {2018},
  publisher = {University of Helsinki, Faculty of Arts},
  address = {Helsinki},
}
@inProceedings{lange-ljunglof-2018-putting-274013,
  title = {Putting Control into Language Learning},
  abstract = {Controlled Natural Languages (CNLs) have many applications including document authoring, automatic reasoning on texts and reliable machine translation, but their application is not limited to these areas. We explore a new application area of CNLs, the use of CNLs in computer-assisted language learning. In this paper we present a web application for language learning using CNLs as well as a detailed description of the properties of the family of CNLs it uses.},
  booktitle = {CNL 2018, the 6th International Workshop on Controlled Natural Language, Maynooth, Co Kildare, 27-28th August 2018; published as volume 304 of Frontiers in Artificial Intelligence and Applications},
  author = {Lange, Herbert and Ljunglöf, Peter},
  year = {2018},
  publisher = {IOS Press},
  address = {Amsterdam},
  ISBN = {978-1-61499-904-1},
}
@inProceedings{adesam-etal-2018-exploring-273835,
  title = {Exploring the Quality of the Digital Historical Newspaper Archive KubHist},
  booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018},
  author = {Adesam, Yvonne and Dannélls, Dana and Tahmasebi, Nina},
  year = {2018},
}
@inProceedings{adesam-etal-2018-koala-273841,
  title = {The Koala Part-of-Speech and Morphological Tagset for Swedish},
  booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018},
  author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
  year = {2018},
}
@inProceedings{rosen-etal-2018-error-275363,
  title = {Error Coding of Second-Language Learner Texts Based on Mostly Automatic Alignment of Parallel Corpora},
  abstract = {Error coding of second-language learner text, that is, detecting, correcting and annotating errors, is a cumbersome task which in turn requires interpretation of the text to decide what the errors are. This paper describes a system with which the annotator corrects the learner text by editing it prior to the actual error annotation. During the editing, the system automatically generates a parallel corpus of the learner and corrected texts. Based on this, the work of the annotator consists of three independent tasks that are otherwise often conflated: correcting the learner text, repairing inconsistent alignments, and performing the actual error annotation.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author = {Rosén, Dan and Wirén, Mats and Volodina, Elena},
  year = {2018},
}
@inProceedings{alfter-volodina-2018-whole-275362,
  title = {Is the whole greater than the sum of its parts? A corpus-based pilot study of the lexical complexity in multi-word expressions},
  abstract = {Multi-word expressions (MWE) are assumed to be good predictors of language learner proficiency, however, there are no methods to establish at which level which MWEs can be assumed to be known.
In this study we look at whether the target (proficiency) level of MWEs can be calculated based on the known level of its constituents.},
  booktitle = {Proceedings of SLTC 2018, Stockholm, November 7-9, 2018},
  author = {Alfter, David and Volodina, Elena},
  year = {2018},
}
% Whole proceedings volume: the people listed are recorded as its editors.
@proceedings{pilan-etal-2018-proceedings-275358,
  title = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018), SLTC, Stockholm, 7th November 2018},
  abstract = {The primary goal of the workshop series on Natural Language Processing for Computer-Assisted Language Learning (NLP4CALL) is to create a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The latter includes, among others, insights from Second Language Acquisition (SLA) research, on the one hand, and promoting the development of “Computational SLA” through setting up Second Language research infrastructure(s), on the other. The intersection of Natural Language Processing (or Language Technology / Computational Linguistics) and Speech Technology with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has given the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition theories and practices, as well as knowledge of second language pedagogy and didactics. This workshop invites therefore a wide range of ICALL-relevant research, including studies where NLP-enriched tools are used for testing SLA and pedagogical theories, and vice versa, where SLA theories, pedagogical practices or empirical data are modeled in ICALL tools. The NLP4CALL workshop series is aimed at bringing together competencies from these areas for sharing experiences and brainstorming around the future of the field.},
  editor = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  year = {2018},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  ISBN = {978-91-7685-173-9},
}
@inProceedings{volodina-etal-2018-annotation-275361,
  title = {Annotation of learner corpora: first SweLL insights},
  abstract = {This is a concise description of experiences with learner corpus annotation performed within SweLL project. Experiences include work with legal issues, anonymization, error annotation, normalization and questions relating to quality of annotation. },
  booktitle = {Proceedings of SLTC 2018, Stockholm, November 7-9, 2018},
  author = {Volodina, Elena and Granstedt, Lena and Megyesi, Beáta and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats},
  year = {2018},
}
@inProceedings{adesam-etal-2018-eukalyptus-273839,
  title = {The Eukalyptus Treebank of Written Swedish},
  booktitle = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7-9 November 2018},
  author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus},
  year = {2018},
}
% Edited volume: typed as a book, with its editors in the editor field.
@book{lyngfelt-etal-2018-constructicography-269082,
  title = {Constructicography: Constructicon development across languages},
  abstract = {In constructionist theory, a constructicon is an inventory of constructions making up the full set of linguistic units in a language. In applied practice, it is a set of construction descriptions – a “dictionary of constructions”. The development of constructicons in the latter sense typically means combining principles of both construction grammar and lexicography, and is probably best characterized as a blend between the two traditions. We call this blend constructicography.
The present volume is a comprehensive introduction to the emerging field of constructicography. After a general introduction follow six chapters presenting constructicon projects for English, German, Japanese, Brazilian Portuguese, Russian, and Swedish, respectively, often in relation to a framenet of the language. In addition, there is a chapter addressing the interplay between linguistics and language technology in constructicon development, and a final chapter exploring the prospects for interlingual constructicography. This is the first major publication devoted to constructicon development and it should be particularly relevant for those interested in construction grammar, frame semantics, lexicography, the relation between grammar and lexicon, or linguistically informed language technology.},
  editor    = {Lyngfelt, Benjamin and Borin, Lars and Ohara, Kyoko and Torrent, Tiago Timponi},
  year      = {2018},
  publisher = {John Benjamins},
  address   = {Amsterdam},
  isbn      = {9789027263865},
}

@incollection{haugen-borin-2018-danish-267403,
  title     = {{Danish}, {Norwegian} and {Swedish}},
  booktitle = {The world's major languages},
  editor    = {Comrie, Bernard},
  author    = {Haugen, Einar and Borin, Lars},
  year      = {2018},
  publisher = {Routledge},
  address   = {London and New York},
  isbn      = {9781138184824},
  pages     = {127--150},
}

% Series name, volume number and editors were previously embedded in booktitle;
% they are now carried by their own fields.
@inproceedings{rouces-etal-2018-defining-264721,
  title     = {Defining a gold standard for a {Swedish} sentiment lexicon: Towards higher-yield text mining in the digital humanities},
  abstract  = {There is an increasing demand for multilingual sentiment analysis, and most work on sentiment lexicons is still carried out based on English lexicons like WordNet. In addition, many of the non-English sentiment lexicons that do exist have been compiled by (machine) translation from English resources, thereby arguably obscuring possible language-specific characteristics of sentiment-loaded vocabulary.
In this paper we describe the creation from scratch of a gold standard for the sentiment annotation of Swedish terms as a first step towards the creation of a full-fledged sentiment lexicon for Swedish.},
  booktitle = {Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference, Helsinki, Finland, March 7-9, 2018},
  editor    = {Mäkelä, Eetu and Tolonen, Mikko and Tuominen, Jouni},
  series    = {CEUR Workshop Proceedings},
  volume    = {2084},
  author    = {Rouces, Jacobo and Borin, Lars and Tahmasebi, Nina and Rødven-Eide, Stian},
  year      = {2018},
  publisher = {University of Helsinki, Faculty of Arts},
  address   = {Helsinki},
}

@incollection{lyngfelt-etal-2018-constructicography-269085,
  title     = {Constructicography at work: Theory meets practice in the {Swedish} constructicon},
  abstract  = {This chapter addresses central topics in constructicography from the viewpoint of the Swedish constructicon project (SweCcn), focusing on practical constructicon development. The full process of construction description is described and discussed, from selection via corpus analysis to finished constructicon entry and beyond, towards structuring the set of entries into a network. Particular attention is given to the description format and the treatment of constructional variation. A main theme in the chapter is the interdependence and alignment of SweCcn and related resources, on the one hand in the local context, notably the infrastructure of Språkbanken (the Swedish language bank), and on the other hand with respect to corresponding resources for other languages. 
Of key concern is the relation to FrameNet, both the Swedish and other framenets, and a major section is devoted to conditions for linking constructions and frames.},
  booktitle = {Constructicography: Constructicon development across languages},
  editor    = {Lyngfelt, Benjamin and Borin, Lars and Ohara, Kyoko and Torrent, Tiago Timponi},
  author    = {Lyngfelt, Benjamin and Bäckström, Linnéa and Borin, Lars and Ehrlemark, Anna and Rydstedt, Rudolf},
  year      = {2018},
  publisher = {John Benjamins},
  address   = {Amsterdam},
  isbn      = {9789027263865},
  pages     = {41--106},
}

@inproceedings{dannells-olsson-2018-integrating-271181,
  title     = {Integrating language resources in two {OCR} engines to improve processing of historical {Swedish} text},
  abstract  = {We are aiming to address the difficulties that many History and Social Sciences researchers struggle with to bring in non-digitized text into language analysis workflows. In this paper we present the language resources and material we used for training two Optical Character Recognition engines for processing historical Swedish text written in Fraktur (blackletter). The trained models, resources and dictionaries are freely available and accessible through our web service, hosted at Språkbanken, to enable users and developers easy access for extraction of historical Swedish text a that are only available in images for further processing.},
  booktitle = {CLARIN Annual Conference},
  author    = {Dannélls, Dana and Olsson, Leif-Jöran},
  year      = {2018},
}

@inproceedings{rouces-etal-2018-sensaldo-264720,
  title     = {{SenSALDO}: Creating a Sentiment Lexicon for {Swedish}},
  abstract  = {The natural language processing subfield known as sentiment analysis or opinion mining has seen an explosive expansion over the last decade or so, and sentiment analysis has become a standard item in the NLP toolbox. Still, many theoretical and methodological questions remain unanswered and resource gaps unfilled.
Most work on automated sentiment analysis has been done on English and a few other languages; for most written languages of the world, this tool is not available. This paper describes the development of an extensive sentiment lexicon for written (standard) Swedish. We investigate different methods for developing a sentiment lexicon for Swedish. We use an existing gold standard dataset for training and testing. For each word sense from the SALDO Swedish lexicon, we assign a real value sentiment score in the range [-1,1] and produce a sentiment label. We implement and evaluate three methods: a graph-based method that iterates over the SALDO structure, a method based on random paths over the SALDO structure and a corpus-driven method based on word embeddings. The resulting sense-disambiguated sentiment lexicon (SenSALDO) is an open source resource and freely available from Språkbanken, The Swedish Language Bank at the University of Gothenburg.},
  booktitle = {LREC 2018, Eleventh International Conference on Language Resources and Evaluation, 7-12 May 2018, Miyazaki (Japan)},
  author    = {Rouces, Jacobo and Tahmasebi, Nina and Borin, Lars and Rødven-Eide, Stian},
  year      = {2018},
  publisher = {ELRA},
  address   = {Miyazaki},
  isbn      = {979-10-95546-00-9},
}

% Volume editors moved from booktitle into the editor field; venue spelling corrected.
@inproceedings{virk-prasad-2018-towards-295336,
  title     = {Towards {Hindi/Urdu} {FrameNets} via the {Multilingual FrameNet}},
  booktitle = {Proceedings of the LREC 2018 Workshop, International FrameNet Workshop 2018: Multilingual Framenets and Constructicons, May 12, 2018, Miyazaki, Japan},
  editor    = {Torrent, Tiago Timponi and Borin, Lars and Baker, Collin F.},
  author    = {Virk, Shafqat and Prasad, K. V. S.},
  year      = {2018},
  publisher = {European Language Resources Association (ELRA)},
  isbn      = {979-10-95546-00-9},
}

@inproceedings{malm-etal-2018-lingfn-267404,
  title     = {{LingFN}: Towards a framenet for the linguistics domain},
  abstract  = {Framenets and frame semantics have proved useful for a number of natural language processing (NLP) tasks. 
However, in this connection framenets have often been criticized for limited coverage. A proposed reasonable-effort solution to this problem is to develop domain-specific (sublanguage) framenets to complement the corresponding general-language framenets for particular NLP tasks, and in the literature we find such initiatives covering, e.g., medicine, soccer, and tourism. In this paper, we report on our experiments and first results on building a framenet to cover the terms and concepts encountered in descriptive linguistic grammars. A contextual statistics based approach is used to judge the polysemous nature of domain-specific terms, and to design new domain-specific frames. The work is part of a more extensive research undertaking where we are developing NLP methodologies for automatic extraction of linguistic information from traditional linguistic descriptions to build typological databases, which otherwise are populated using a labor intensive manual process.},
  booktitle = {Proceedings of the LREC 2018 Workshop, International FrameNet Workshop 2018: Multilingual Framenets and Constructicons, May 12, 2018, Miyazaki, Japan},
  editor    = {Torrent, Tiago Timponi and Borin, Lars and Baker, Collin F.},
  author    = {Malm, Per and Virk, Shafqat and Borin, Lars and Saxena, Anju},
  year      = {2018},
  publisher = {ELRA},
  address   = {Miyazaki},
  isbn      = {979-10-95546-04-7},
}

% First author spelled as in volodina-etal-2018-annotation-275361 (Beáta).
@inproceedings{megyesi-etal-2018-learner-275359,
  title     = {Learner Corpus Anonymization in the Age of {GDPR}: Insights from the Creation of a Learner Corpus of {Swedish}},
  abstract  = {This paper reports on the status of learner corpus anonymization for the ongoing research infrastructure project SweLL. The main project aim is to deliver and make available for research a well-annotated corpus of essays written by second language (L2) learners of Swedish.
As the practice shows, annotation of learner texts is a sensitive process demanding a lot of compromises between ethical and legal demands on the one hand, and research and technical demands, on the other. Below, is a concise description of the current status of pseudonymization of language learner data to ensure anonymity of the learners, with numerous examples of the above-mentioned compromises.},
  booktitle = {Proceedings of the 7th Workshop on NLP for Computer Assisted Language Learning (NLP4CALL 2018) at SLTC, Stockholm, 7th November 2018},
  editor    = {Pilán, Ildikó and Volodina, Elena and Alfter, David and Borin, Lars},
  author    = {Megyesi, Beáta and Granstedt, Lena and Johansson, Sofia and Prentice, Julia and Rosén, Dan and Schenström, Carl-Johan and Sundberg, Gunlög and Wirén, Mats and Volodina, Elena},
  year      = {2018},
  publisher = {Linköping University Electronic Press},
  address   = {Linköpings universitet},
  isbn      = {978-91-7685-173-9},
}

@inproceedings{alfter-volodina-2018-towards-275368,
  title     = {Towards Single Word Lexical Complexity Prediction},
  abstract  = {In this paper we present work-in-progress where we investigate the usefulness of previously created word lists to the task of single-word lexical complexity analysis and prediction of the complexity level for learners of Swedish as a second language. The word lists used map each word to a single CEFR level, and the task consists of predicting CEFR levels for unseen words. 
In contrast to previous work on word-level lexical complexity, we experiment with topics as additional features and show that linking words to topics significantly increases accuracy of classification.},
  booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018},
  author    = {Alfter, David and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  isbn      = {978-1-948087-11-7},
}

@incollection{borin-etal-2018-linguistics-269084,
  title     = {Linguistics vs. language technology in constructicon building and use},
  abstract  = {In this chapter, we describe the close interaction of linguists and language technologists in the Swedish constructicon project. This kind of collaboration is not so common today, because of the way that language technology has developed in recent decades, but in our case the collaboration has been very successful, and constituted a genuine instance of cross-fertilization, where an evolving language technology infrastructure and a computational lexical macroresource described in the chapter has formed an integral part of the Swedish constructicon development environment, while at the same time the structured linguistic knowledge described in the constructicon has informed the language technology making up the infrastructure.},
  booktitle = {Constructicography: Constructicon development across languages},
  editor    = {Lyngfelt, Benjamin and Borin, Lars and Ohara, Kyoko and Torrent, Tiago Timponi},
  author    = {Borin, Lars and Dannélls, Dana and Gruzitis, Normunds},
  year      = {2018},
  publisher = {John Benjamins},
  address   = {Amsterdam},
  isbn      = {9789027263865},
  pages     = {229--253},
}

% Venue details follow the other CLARIN-2018 paper in this file (8-10 October 2018, Pisa, Italy).
@inproceedings{volodina-etal-2018-interoperability-275365,
  title     = {Interoperability of Second Language Resources and Tools},
  abstract  = {Language learning based on learner corpora is an increasingly active area of research in CLARIN
centres and beyond. In order to promote comparative research, the interoperability of data and tools in this area must be improved, and metadata and error annotation should be harmonized. A closer European collaboration in the field of learner corpus creation is desirable.},
  booktitle = {Proceedings of CLARIN-2018 conference, 8-10 October 2018, Pisa, Italy},
  author    = {Volodina, Elena and Janssen, Maarten and Lindström Tiedemann, Therese and Mikelic Preradovic, Nives and Ragnhildstveit, Silje Karin and Tenfjord, Kari and de Smedt, Koenraad},
  year      = {2018},
}

@inproceedings{pilan-volodina-2018-investigating-275367,
  title     = {Investigating the importance of linguistic complexity features across different datasets related to language learning},
  abstract  = {We present the results of our investigations aiming at identifying the most informative linguistic complexity features for classifying language learning levels in three different datasets. The datasets vary across two dimensions: the size of the instances (texts vs. sentences) and the language learning skill they involve (reading comprehension texts vs. texts written by learners themselves). We present a subset of the most predictive features for each dataset, taking into consideration significant differences in their per-class mean values and show that these subsets lead not only to simpler models, but also to an improved classification performance. Furthermore, we pin-point fourteen central features that are good predictors regardless of the size of the linguistic unit analyzed or the skills involved, which include both morpho-syntactic and lexical dimensions.},
  booktitle = {Proceedings of the Workshop on Linguistic Complexity and Natural Language Processing, COLING, Santa Fe, New Mexico, USA, August 25, 2018},
  author    = {Pilán, Ildikó and Volodina, Elena},
  year      = {2018},
  publisher = {Association for Computational Linguistics},
  isbn      = {978-1-948087-62-9},
}

% Volume editors moved from booktitle into the editor field.
@inproceedings{nietopina-johansson-2018-automatically-270261,
  title     = {Automatically Linking Lexical Resources with Word Sense Embedding Models},
  abstract  = {Automatically learnt word sense embeddings are developed as an attempt to refine the capabilities of coarse word embeddings. The word sense representations obtained this way are, however, sensitive to underlying corpora and parameterizations, and they might be difficult to relate to word senses as formally defined by linguists. We propose to tackle this problem by devising a mechanism to establish links between word sense embeddings and lexical resources created by experts. We evaluate the applicability of these links in a task to retrieve instances of Swedish word senses not present in the lexicon.},
  booktitle = {The Third Workshop on Semantic Deep Learning (SemDeep-3), August 20th, 2018, Santa Fe, New Mexico, USA},
  editor    = {Espinosa Anke, Luis and Declerck, Thierry and Gromann, Dagmar},
  author    = {Nieto Piña, Luis and Johansson, Richard},
  year      = {2018},
  isbn      = {978-1-948087-56-8},
}

@inproceedings{denouden-etal-2018-comparison-268339,
  title     = {Comparison of Automated Methods for Vowel Segmentation and Extraction of Acoustic Variables},
  abstract  = {Introduction: Primary Progressive Aphasia (PPA) is a neurodegenerative syndrome in which linguistic abilities become gradually impaired. There are three primary variants of PPA: the non-fluent agrammatic PPA, the fluent type semantic PPA, and the logopenic PPA, which is also considered an atypical form of Alzheimer’s disease (Mesulam et al., 1982; Gorno-Tempini et al., 2011). 
Along with the three main variants, a fourth variant has been proposed, a non-fluent apraxia of speech (AOS), though this is currently the subject of an open debate (e.g., Duffy et al., 2017; Henry et al., 2013). According to sophisticated criteria established a few years ago, PPA subtyping for a given patient presented in clinic requires clinical, neuropsychological, and imaging information (Gorno-Tempini et al., 2011). Nevertheless, quantifying the decline of linguistic abilities and subtyping the variants of PPA manually is both hard and laborious, so there is great demand for algorithms that subtype a given patient automatically. Picture description samples of connected speech and random forests techniques have been used for this purpose (de Aguiar et al., 2017; Wilson et al., 2010, Fraser et al. 2013, 2014). In the present study, we compared existing models and we propose a new one. Aims: In this study, we provide an automated classification model of PPA variants trained on known morphological and acoustic predictors and on predictors related to the clinical and linguistic profile of individuals with PPA (e.g., Mack et al., 2015; Gorno-Tempini et al., 2011; Wilson et al., 2010). Method: Speech materials for this study come from the Transcranial Direct Current Stimulation for Primary Progressive Aphasia study at Johns Hopkins University. Twenty-six individuals with PPA (Mean(SD) age = 68.6 (7.8) years, Mean(SD) education = 16.1 (2.9) years) participated in this study. PPA participants were diagnosed based on the established consensus criteria (Gorno-Tempini et al., 2011), i.e., imaging, clinical, and neuropsychological examination by trained neurologists. Individuals with PPA included non-fluent with AOS (N=5), non fluent without AOS (N=7), logopenic (N=8), and semantic (N=6) variants. Recordings of the Cookie Theft picture description from the Boston Diagnostic Aphasia Examination (BDAE) were computationally analyzed. 
All speech productions were automatically transcribed and segmented using an end-to-end speech-to-transcription platform. From the speech signals, we measured morphological and acoustic predictors, including vowel formants F1 ... F3, measured at 15%, 50%, and 75% of vowel’s duration, vowel duration, fundamental frequency, and pause duration. The analysis and the statistics were conducted using Python and R programming languages (R Core Team, 2017; Rossum, 1995). Three different machine learning algorithms: C5.0 decision trees, Classification and Regression Trees (CART) and random forests were trained on the predictors (Breiman, 2001; Quinlan, 1993; Hastie et al., 2009). All models were trained on the 80% of the speakers (training set), with 3-fold cross-validation. All predictor variables were centered and scaled. C5.0 was trained with winnowing and without winnowing. (Winnowing facilitates the automatic pre-selection of the predictors that are used in the decision tree.) After the training we evaluated the trained models on the unknown dataset, namely the 20% of the speakers (evaluation set). Results: C5.0 provided 86% (95% CI[81, 88], kappa = 0.76) and Random Forests 85% (95% CI[81, 88], kappa = 0.76) classification accuracy on the test data; CART provided the lowest overall classification accuracy. Overall, C5.0 outperformed both the random forests and CART, with high classification accuracy on unknown data. Non-fluent PPA with AOS was correctly predicted by both C5.0 and random forests. Discussion: The C5.0 classification model provides support for the known predictors employed in the literature. Also, it provides some objective ways to distinguish the presence of AOS in PPA and corroborate research on classification of AOS using acoustic properties especially those related to vowel production (Den Ouden et al. 2017). However, given the low number of participants employed in this study, further research is required, with a larger number of participants. 
Nevertheless, the proposed methods employed here constitute a promising step towards a computational differential diagnostic tool of PPA that is easy to use, quick and accurate.},
  booktitle = {Clinical Aphasiology Conference, CAC 2018, Austin, Texas, USA},
  author    = {den Ouden, Dirk-Bart and Hutchinson, Angelica and Tsapkini, Kyrana and Themistocleous, Charalambos},
  year      = {2018},
}

% Author names aligned with the sibling PPA entries in this file
% (lowercase particle in den Ouden; middle initial for Hillis).
@inproceedings{themistocleous-etal-2018-acoustic-271915,
  title     = {Acoustic markers of {PPA} variants using machine learning},
  abstract  = {Introduction. Speakers’ acoustic profile carries significant linguistic and non-linguistic information. Employed in clinical practice, it can provide behavioral markers for a quick assessment of primary progressive aphasia (PPA). PPA is a complex language syndrome where different speech and language properties such as prosody, lexical retrieval, and motor speech functioning may be affected. It is classified into three main variants: the nonfluent (nfvPPA), semantic (svPPA), and logopenic (lvPPA). Primary progressive apraxia of speech (PPAOS) is also distinguished (Duffy et al. 2017) but may fall into the category of nfvPPA (Gorno-Tempini et al. 2011). The present study aims to determine the contribution of the acoustic properties of vowels, prosody, and voice quality in the classification of PPA variants by using machine learning models. Methods. Oral samples from picture description tasks of 50 individuals with PPA (lvPPA:17, svPPA:14, nfvPPA:11, PPAOS:8) were automatically transcribed and segmented into vowels and consonants using the new acoustic analysis platform THEMIS. From the segmented vowels, we measured: i. Vowel formants (F1…F5) (den Ouden, et al. 2017); ii. vowel duration (Duffy, et al., 2017); iii. Mean fundamental frequency (F0), min F0 and max F0 (Hillis, 2014); iv. Pause duration (Mack et al. 2015), and v. H1–H2, H1–A1, H1–A2, H1–A3 measures of voice quality.
We compared three machine learning models: support vector machines (SVM) (Cortes and Vapnik, 1995), random forests (RF) (Breiman, 2001), and decision trees (DT) (Hastie et al. 2009) in an one-against all strategy, where each variant was tested against all others. We run all models with a 3-fold group-cross-validation to ensure that the speakers in the training and evaluation sets are different. The models were implemented in Python (Pedregosa et al. 2011). Results. We report the mean cross-validated accuracy of the best performing model that resulted from model comparison: i. RF model provided the highest classification accuracy for nfvPPA [Mean 82%, SD: 9%], ii. SVM had the highest accuracy for svPPA [Mean 66%, SD: 8%], iii. RF had the highest accuracy for lvPPA [Mean 57%, SD: 15%] and iv. RF provided the highest classification accuracy for PPAOS [Mean 80%, SD: 8%] (Figure 1). In all models, pause duration and F0 measures were ranked higher than most other features (Figure 2). Discussion. This study employed an innovative method for the classification of PPA variants, using an automated speech transcription, segmentation, feature extraction and modeling. Using just acoustic features the best model classified nfvPPA, svPPA, and PPAOS with high accuracy. However, acoustic features alone could not classify lvPPA with such high accuracy. More linguistic markers might be needed for a more accurate classification of lvPPA. Furthermore, we showed that prosody, which is measured by fundamental frequency and pause duration, contributes more than any other factor to the classification of PPA variants as alluded in previous research by our group and others (Hillis 2014, Patel et al. 2018, Mack 2015). Finally, the findings demonstrate the potential benefit of using machine learning models in clinical practice for the subtyping of PPA variants.},
  booktitle = {Frontiers in Human Neuroscience. Conference Abstract: Academy of Aphasia 56th Annual Meeting, October 21-23, 2018, Montreal, Canada},
  author    = {Themistocleous, Charalambos and Ficek, Bronte and Webster, Kimberly and Wendt, Haley and Hillis, Argye E. and den Ouden, Dirk-Bart and Tsapkini, Kyrana},
  year      = {2018},
}

@inproceedings{themistocleous-etal-2018-classification-268340,
  title     = {A classification study of the variants of Primary Progressive Aphasia using Machine Learning},
  abstract  = {Introduction: Primary Progressive Aphasia (PPA) is a neurodegenerative syndrome in which linguistic abilities become gradually impaired. There are three primary variants of PPA: the non-fluent agrammatic PPA, the fluent type semantic PPA, and the logopenic PPA, which is also considered an atypical form of Alzheimer’s disease (Mesulam et al., 1982; Gorno-Tempini et al., 2011). Along with the three main variants, a fourth variant has been proposed, a non-fluent apraxia of speech (AOS), though this is currently the subject of an open debate (e.g., Duffy et al., 2017; Henry et al., 2013). According to sophisticated criteria established a few years ago, PPA subtyping for a given patient presented in clinic requires clinical, neuropsychological, and imaging information (Gorno-Tempini et al., 2011). Nevertheless, quantifying the decline of linguistic abilities and subtyping its variants manually is both hard and laborious, so there is a great demand for algorithms that subtype a given patient automatically. Picture description samples of connected speech and random forests techniques have been used for this purpose (de Aguiar et al., 2017; Wilson et al., 2010, Fraser et al. 2013, 2014). In the present study, we compared existing models and we propose a new one. 
Aims: In this study, we provide an automated classification model of the four PPA variants trained on known morphological and acoustic predictors and on predictors related to the clinical and linguistic profile of individuals with PPA (e.g., Mack et al., 2015; Gorno-Tempini et al., 2011; Wilson et al., 2010). Method: Speech materials for this study come from the Transcranial Direct Current Stimulation for Primary Progressive Aphasia study at Johns Hopkins University. Twenty-six individuals with PPA (Mean(SD) age = 68.6 (7.8) years, Mean(SD) education = 16.1 (2.9) years) participated in this study. PPA participants were diagnosed based on the established consensus criteria (Gorno-Tempini et al., 2011) based on imaging, clinical, and neuropsychological examination by trained neurologists. Individuals with PPA included non-fluent AOS (N=5), non fluent (N=7), logopenic (N=8), and semantic (N=6) variants. Recordings of the Cookie Theft picture description from the Boston Diagnostic Aphasia Examination (BDAE) were computationally analyzed. All speech productions were automatically transcribed and segmented using an end-to-end speech-to-transcription platform. From the speech signals, we measured morphological and acoustic predictors, including vowel formants F1 ... F3, measured at 15%, 50%, and 75% of vowel’s duration, vowel duration, fundamental frequency, and pause duration. The analysis and the statistics were conducted using Python and R programming languages (R Core Team, 2017; Rossum, 1995). Three different machine learning algorithms: C5.0 decision trees, Classification and Regression Trees (CART) and random forests were trained on the predictors (Breiman, 2001; Quinlan, 1993; Hastie et al., 2009). All models were trained on the 80% of the speakers (training set), with 3-fold cross-validation. All predictor variables were centered and scaled. C5.0 was trained with winnowing and without winnowing. 
(Winnowing facilitates the automatic pre-selection of the predictors that are used in the decision tree.) After the training we evaluated the trained models on the unknown dataset, namely the 20% of the speakers (evaluation set). Results: C5.0 provided 86% (95% CI[81, 88], kappa = 0.76) and Random Forests 85% (95% CI[81, 88], kappa = 0.76) classification accuracy on the test data; CART provided the lowest overall classification accuracy. Overall, C5.0 outperformed both the random forests and CART, with high classification accuracy on unknown data. Non-fluent AOS was correctly predicted by both C5.0 and random forests. Discussion: C5.0 classification model provides support for the known predictors employed in the literature. Also, it provides initial support for the distinct properties of the non-fluent AOS variant and corroborate research on classification of AOS using acoustic properties especially those related to vowel production (Den Ouden et al. 2017). However, given the low number of participants employed in this study, further research is required, with a larger number of participants. Nevertheless, the proposed methods employed here constitute a promising step towards a computational differential diagnostic tool of PPA that is easy to use, quick and accurate.},
  booktitle = {Clinical Aphasiology Conference, CAC 2018, Austin, Texas, USA},
  author    = {Themistocleous, Charalambos and Ficek, Bronte and Webster, Kimberly and Wendt, Haley and Hillis, Argye E. and den Ouden, Dirk-Bart and Tsapkini, Kyrana},
  year      = {2018},
}

@article{themistocleous-etal-2018-identification-273026,
  title     = {Identification of Mild Cognitive Impairment From Speech in {Swedish} Using Deep Sequential Neural Networks},
  abstract  = {While people with mild cognitive impairment (MCI) portray noticeably incipient memory difficulty in remembering events and situations along with problems in decision making, planning, and finding their way in familiar environments, detailed neuropsychological assessments also indicate deficits in language performance. To this day, there is no cure for dementia but early-stage treatment can delay the progression of MCI; thus, the development of valid tools for identifying early cognitive changes is of great importance. In this study, we provide an automated machine learning method, using Deep Neural Network Architectures, that aims to identify MCI. Speech materials were obtained using a reading task during evaluation sessions, as part of the Gothenburg MCI research study. Measures of vowel duration, vowel formants (F1 to F5), and fundamental frequency were calculated from speech signals. To learn the acoustic characteristics associated with MCI vs. healthy controls, we have trained and evaluated ten Deep Neural Network Architectures and measured how accurately they can diagnose participants that are unknown to the model. We evaluated the models using two evaluation tasks: a 5-fold crossvalidation and by splitting the data into 90% training and 10% evaluation set. The findings suggest first, that the acoustic features provide significant information for the identification of MCI; second, the best Deep Neural Network Architectures can classify MCI and healthy controls with high classification accuracy (M = 83%); and third, the model has the potential to offer higher accuracy than 84% if trained with more data (cf., SD≈15%). 
The Deep Neural Network Architecture proposed here constitutes a method that contributes to the early diagnosis of cognitive decline, quantify the progression of the condition, and enable suitable therapeutics.}, journal = {Frontiers in Neurology}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2018}, volume = {9}, pages = {1--10}, } @inProceedings{edstrom-etal-2018-ageism-267250, title = {Ageism and Swedish news media}, abstract = {Ageism can be seen as a “social disease”, a casual or systematic prejudice, stereotyping and discriminating against individuals or groups on the basis of their age. This is an area of growing concern, particularly the role of mainstream media in relationship to ageism. A valuable and important step is to understand the presence of ageing and older age how different types of online news media. The main objective of this pilot work is to test, collate and produce evidence from Swedish news media representations of older ages and ageing. METHOD(S) Two pilot studies/experiments; first names and their frequencies of the carriers’ age according to Statistics Sweden (SCB) and their presence in 39 online news between 2015 and 2018. ( 4, 7 millions texts). using general pattern matching techniques with regular expressions and applying them to 13 issues (1994, 2001-13) of Göteborgs-Posten (Swedish news corpora). Definition: Older persons ≥60 years. (25 % of the population in Sweden is over 60 yearsRESULTS AND CONCLUSIONS: Clear and consistent differences of how various age spans are represented in the news. 20-50 year olds is highly over represented compared with the Swedish population, while 0-24 and people over 54 are underrepresented, especially women. 
Pattern matching exhibits similar characteristics with the exception of obituaries where the elderly mentions are much more frequent.Our pilot studies confirm the introspective view of underrepresentation of old age and older people in or trends can be revealed within a larger time span and synchronic media sources. More studies are required and in the near future we plan to improve, scale and apply our methodology on both synchronic and diachronic data using e.g. available text corpora and try to get a solid perspective on whether any differences or trends can be revealed within a larger time span }, booktitle = {24th Nordic Congress of Gerontoloy (NKG). Oslo, Norway: 2-4 May 2018 }, author = {Edström, Maria and Kokkinakis, Dimitrios and Berggren, Max}, year = {2018}, } @inProceedings{fyndanis-themistocleous-2018-morphosyntactic-271917, title = {Morphosyntactic production in agrammatic aphasia: A cross-linguistic machine learning approach.}, abstract = {Introduction Recent studies on agrammatic aphasia by Fyndanis et al. (2012, 2017) reported evidence against the cross-linguistic validity of unitary accounts of agrammatic morphosyntactic impairment, such as the Distributed Morphology Hypothesis (DMH) (Wang et al., 2014), the two versions of the Interpretable Features’ Impairment Hypothesis (IFIH-1: Fyndanis et al., 2012; IFIH-2: Fyndanis et al., 2018b), and the Tree Pruning Hypothesis (TPH) (Friedmann & Grodzinsky, 1997). However, some of the features/factors emphasized by the accounts above (i.e. involvement of inflectional alternations (DMH), involvement of integration processes (IFIH-1), involvement of both integration processes and inflectional alternations (IFIH-2), position of a morphosyntactic feature/category in the syntactic hierarchy (TPH)) may still play a role in agrammatic morphosyntactic production. 
These features may act in synergy with other factors in determining the way in which morphosyntactic production is impaired across persons with agrammatic aphasia (PWA) and across languages. Relevant factors may include language-independent and language-specific properties of morphosyntactic categories, as well as subject-specific and task/material-specific variables. The present study addresses which factors determine verb-related morphosyntactic production in PWA and what is their relative importance. Methods We collapsed the datasets of the 24 Greek-, German-, and Italian-speaking PWA underlying Fyndanis et al.’s (2017) study, added the data of two more Greek-speaking PWA, and employed machine learning algorithms to analyze the data. The unified dataset consisted of data on subject-verb agreement, time reference (past reference, future reference), grammatical mood (indicative, subjunctive), and polarity (affirmatives, negatives). All items/conditions were represented as clusters of theoretically motivated features: ±involvement of integration processes, ±involvement of inflectional alternations, ±involvement of both integration processes and inflectional alternations, and low/middle/high position in the syntactic hierarchy. 
We included 14 subject-specific, category-specific and task/material-specific predictors: Verbal Working Memory (WM), (years of formal) Education, Age, Gender, Mean Length of Utterance in (semi)spontaneous speech (Index 1 of severity of agrammatism), Proportion of Grammatical Sentences in (semi)spontaneous speech (Index 2 of severity of agrammatism), Words per Minute in (semi)spontaneous speech (Index of fluency), Involvement of inflectional alternations, Involvement of integration processes, Involvement of both integration processes and inflectional alternations, Position of a given morphosyntactic category in the syntactic hierarchy (high, middle, low), Item Presentation mode (cross-modal, auditory), Response mode (oral, written), and Language (Greek, German, Italian). Different machine learning models were employed: Random Forest, C5.0 decision tree, RPart, and Support Vector Machine. Results & Discussion Random Forest model outperformed all the other models achieving the highest accuracy (0.786). As shown in Figure 1, the best predictors of accuracy on tasks tapping morphosyntactic production were the involvement of both integration processes and inflectional alternations (categories involving both integration processes and inflectional alternations were more impaired than categories involving one or neither of them), verbal WM capacity (the greater the WM capacity, the better the morphosyntactic production), and severity of agrammatism (the more severe the agrammatism, the worse the morphosyntactic production). Results are consistent with IFIH-2 (Fyndanis et al., 2018b) and studies highlighting the role of verbal WM in morphosyntactic production (e.g., Fyndanis et al., 2018a; Kok et al., 2007).}, booktitle = {Frontiers in Human Neuroscience. Academy of Aphasia 56th Annual Meeting, Montreal, Canada, 21 Oct - 23 Oct, 2018. 
}, author = {Fyndanis, Valantis and Themistocleous, Charalambos}, year = {2018}, } @article{eckhoff-etal-2018-proiel-265108, title = {The PROIEL treebank family: a standard for early attestations of Indo-European languages}, abstract = {This article describes a family of dependency treebanks of early attestations of Indo-European languages originating in the parallel treebank built by the members of the project pragmatic resources in old Indo-European languages. The treebanks all share a set of open-source software tools, including a web annotation interface, and a set of annotation schemes and guidelines developed especially for the project languages. The treebanks use an enriched dependency grammar scheme complemented by detailed morphological tags, which have proved sufficient to give detailed descriptions of these richly inflected languages, and which have been easy to adapt to new languages. We describe the tools and annotation schemes and discuss some challenges posed by the various languages that have been annotated. We also discuss problems with tokenisation, sentence division and lemmatisation, commonly encountered in ancient and mediaeval texts, and challenges associated with low levels of standardisation and ongoing morphological and syntactic change.}, journal = {Language Resources and Evaluation}, author = {Eckhoff, H. and Bech, K. and Bouma, Gerlof and Eide, K. and Haug, D. and Haugen, O. E. and Johndal, M.}, year = {2018}, volume = {52}, number = {1}, pages = {29--65}, } @misc{kokkinakis-2018-resources-265118, title = {Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments (RaPID-2)}, abstract = {Proceedings of the second RaPID: "Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments". An LREC workshop. 
8th of May 2018, Miyazaki, Japan}, author = {Kokkinakis, Dimitrios}, year = {2018}, ISBN = {979-10-95546-26-9}, } @inProceedings{neofytou-etal-2018-understanding-271916, title = {Understanding and classifying the different variants of Primary Progressive Aphasia based on spelling performance}, abstract = {Introduction: Previous findings suggest differences in the written spelling performance between the three variants of Primary Progressive Aphasia (PPA) - semantic (svPPA), logopenic (lvPPA) and non-fluent (nfvPPA) (Shim et al., 2012; Sepelyak et al., 2011). However, no attempts have been made to systematically distinguish the three variants in terms of their spelling performance. The challenges of classification are considerable and given the ease of administering a spelling test, we aimed to determine to what extent a spelling task can provide accurate classification of the PPA variants. Method: Thirty-three participants with PPA were included - 14 lvPPAs, 11 nfvPPAs and 8 svPPAs – originally classified using the neuropsychological and spoken language criteria defined by Gorno-Tempini et al. (2011). Data were collected prior to spelling treatment, using a spelling to dictation task with both real-words and pseudowords (92-138 items/per participant), scored for each grapheme (i.e., letter) and analyzed for each participant individually using generalized linear mixed effects models (GLMEM) for real-words and pseudowords separately. The variables of interest for both real-words and pseudowords were word length, phoneme-grapheme conversion probability and grapheme position. The real-word models also included frequency, imageability, and the orthographic and phonological neighborhood density of the target words. 
The coefficients from the output of the GLMEMs, together with 3 additional variables – verb/noun and pseudoword/word accuracy differences from the spelling task, and language impairment severity according to FTD-CDR (Knopman, 2008) - were used as predictors in a Random Forests (RFs) model implemented in Python, to identify the variables that contribute the most in distinguishing the three variants. Then, the three most significant predictors identified with RFs were used in multinomial models implemented in R to classify the PPA variants. The model was trained on a training set of all participants minus one (i.e. the left-out participant) and evaluated on the left-out participant, known as Leave-One-Out cross-validation. This process was repeated 33 times to evaluate all participants. Results: The three most significant predictors of the RFs analysis were: (1) grapheme position in real-words, (2) pseudoword/word accuracy difference, and (3) length of real-words (Figure 1). The overall accuracy of the multinomial models with these three predictors only was 67%: lvPPA=71%, nfvPPA=64% and svPPA=63%. When severely impaired cases (language severity =3 in Knopman et al., 2008; FTD-CDR criteria) were excluded (giving a new dataset of 22 participants), the overall accuracy increased to 91%: lvPPA=90%, nfvPPA=86% and svPPA=100%. Discussion: Our study provides evidence of the value of considering spelling performance in understanding and classifying the different variants of PPA. The results suggest that lexical status, word length and grapheme position are useful parameters for classification, which index key components of the cognitive architecture of spelling (Rapp, 2002). Also, the finding that prediction accuracy increased when more severe cases were excluded supports previous findings (Mesulam et al., 2012), as severity increases variants become less differentiated and classification is more difficult. 
In sum, a relatively short, easy-to-administer spelling test, provides useful information for PPA variant classification and can potentially be used as a clinical tool.}, booktitle = {Frontiers in Human Neuroscience}, author = {Neofytou, Kyriaci and Themistocleous, Charalambos and Wiley, Robert and Tsapkini, Kyrana and Rapp, Brenta}, year = {2018}, } @inProceedings{malm-etal-2018-uneek-267351, title = {Uneek: a Web Tool for Comparative Analysis of Annotated Texts}, abstract = {In this paper, we present Uneek, a web based linguistic tool that performs set operations on raw or annotated texts. The tool may be used for automatic distributional analysis, and for disambiguating polysemy with a method that we refer to as semi-automatic uniqueness differentiation (SUDi). Uneek outputs the intersection and differences between their listed attributes, e.g. POS, dependencies, word forms, frame elements. This makes it an ideal supplement to methods for lumping or splitting in frame development processes. In order to make some of Uneek’s functions more clear, we employ SUDi on a small data set containing the polysemous verb "bake". As of now, Uneek may only run two files at a time, but there are plans to develop the tool so that it may simultaneously operate on multiple files. Finally, we relate the developmental plans for added functionality, to how such functions may support FrameNet work in the future.}, booktitle = {Proceedings of the LREC 2018 Workshop International FrameNetWorkshop 2018: Multilingual Framenets and Constructicons, 7-12 May 2018, Miyazaki (Japan) / [ed] Tiago Timponi Torrent, Lars Borin & Collin F. 
Baker, 2018}, author = {Malm, Per and Ahlberg, Malin and Rosén, Dan}, year = {2018}, ISBN = {979-10-95546-04-7}, } @inProceedings{angelopoulou-etal-2018-pause-268338, title = {Pause patterns and speech errors in stroke patients with aphasia: cross-linguistic evidence from narrative speech.}, booktitle = {Clinical Aphasiology Conference, CAC 2018, Austin, Texas USA.}, author = {Angelopoulou, Georgia and Kiran, Swathi and Kasselimis, Dimitrios and Varkanitsa, Maria and Meier, Erin and Yue, Pan and Tsolakopoulos, Dimitrios and Themistocleous, Charalambos and Vassilopoulou, Sofia and Korompoki, Eleni and Tountopoulou, Argyro and Papageorgiou, Georgios and Goutsos, Dionysis, and Evdokimidis, Ioannis and Potagas, Constantin}, year = {2018}, } @inProceedings{alfter-pilan-2018-complex-276407, title = {SB@ GU at the Complex Word Identification 2018 Shared Task}, booktitle = {Proceedings of the Thirteenth Workshop on Innovative Use of NLP for Building Educational Applications, New Orleans, Louisiana, June 5, 2018}, author = {Alfter, David and Pilán, Ildikó}, year = {2018}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA, USA}, ISBN = {978-1-948087-11-7}, } @inProceedings{themistocleous-kokkinakis-2018-themis-265112, title = {THEMIS-SV: Automatic classification of language disorders from speech signals}, abstract = {Background and Aims: Brain injuries resulting from stroke can affect the production of speech resulting in different types of language impairments, such as aphasia. Studying these productions manually is an extremely cumbersome and time consuming process. The aim of this paper is to present THEMIS-SV: a system that enables the automatic transcription of speech signals and the segmentation of vowels and consonants in Swedish. Method: The input of the system are recordings of speech. The system processes the recordings and returns an output with three tiers: the utterance tier, the word tier, and the vowels/consonants tier. 
Results: The output of the system is a fast and reliable transcription and segmentation of speech, which is very close to transcriptions and segmentations performed manually. The automatic segmentation of speech enables targeted acoustic measurements, such as measurements of consonant spectra, formant frequencies of vowels, fundamental frequency, pauses, speech rate, etc. and other acoustic measurements that have been known to differentiate between the different types of language disorders. Conclusion: The method proposed here can be employed for the analysis of speech of individuals with post-stroke aphasia and other language disorders and constitutes a promising step towards a fully automated differential diagnostic tool for language disorders. }, booktitle = {Abstracts of the 4th European Stroke Organisation Conference (ESOC 2018). Gothenburg, Sweden, 16-18 May, 2018. }, author = {Themistocleous, Charalambos and Kokkinakis, Dimitrios}, year = {2018}, }