% Journal article.
@article{Fraser-Kathleen2019-270713,
  title    = {Multilingual word embeddings for the assessment of narrative speech in mild cognitive impairment},
  abstract = {We analyze the information content of narrative speech samples from individuals with mild cognitive impairment (MCI), in both English and Swedish, using a combination of supervised and unsupervised learning techniques. We extract information units using topic models trained on word embeddings in monolingual and multilingual spaces, and find that the multilingual approach leads to significantly better classification accuracies than training on the target language alone. In many cases, we find that augmenting the topic model training corpus with additional clinical data from a different language is more effective than training on additional monolingual data from healthy controls. Ultimately we are able to distinguish MCI speakers from healthy older adults with accuracies of up to 63\% (English) and 72\% (Swedish) on the basis of information content alone. We also compare our method against previous results measuring information content in Alzheimer's disease, and report an improvement over other topic-modeling approaches. Furthermore, our results support the hypothesis that subtle differences in language can be detected in narrative speech, even at the very early stages of cognitive decline, when scores on screening tools such as the Mini-Mental State Exam are still in the “normal” range.},
  author   = {Fraser, Kathleen and Lundholm Fors, Kristina and Kokkinakis, Dimitrios},
  journal  = {Computer Speech \& Language},
  year     = {2019},
  volume   = {53},
  pages    = {121--139},
}

% Conference paper in the NAACL annual conference proceedings.
@inproceedings{Fraser-Kathleen2019-280280,
  title    = {Multilingual prediction of {Alzheimer’s} disease through domain adaptation and concept-based language modelling},
  abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets.
Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.},
  booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  author    = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios},
  year      = {2019},
  address   = {Minneapolis, Minnesota, United States},
}

% Conference contribution (Svenskans beskrivning 37): ageism in Swedish news media.
@inproceedings{Kokkinakis-Dimitrios2019-279386,
  title    = {Ålderism i svenska nyhetsmedier},
  abstract = {“Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem.
För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor.
Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.},
  booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland},
  author    = {Kokkinakis, Dimitrios and Edström, Maria},
  year      = {2019},
}

% Conference contribution (Svenskans beskrivning 37): semantic verbal fluency test
% as an indicator of cognitive impairment in older adults.
@inproceedings{Kokkinakis-Dimitrios2019-279384,
  title    = {{”hund, katt, ko...”}: Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre},
  abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjligt som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a.
semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.},
  booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland},
  author    = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina},
  year      = {2019},
}

% Workshop paper: temporal analysis of semantic verbal fluency tasks.
@inproceedings{Linz-Nicklas2019-279131,
  title     = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment},
  abstract  = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.},
  booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes.
Minneapolis, USA},
  author    = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios},
  year      = {2019},
}

% Conference contribution (CLARe4): a Swedish speech corpus for studying cognitive decline.
@inproceedings{Kokkinakis-Dimitrios2019-278217,
  title    = {A Multifaceted Corpus for the Study of Cognitive Decline in a {Swedish} Population},
  abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment (SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study.
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicit speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone.
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{Fraser-Kathleen2019-280280, title = {Multilingual prediction of Alzheimer’s disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics.}, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, adress = {Minneapolis, Minnesota. 
United States.}, } @inProceedings{Kokkinakis-Dimitrios2019-279386, title = {Ålderism i svenska nyhetsmedier.}, abstract = {Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. 
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre.}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. 
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{Linz-Nicklas2019-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment.}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. 
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a Swedish Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. 
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. 
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{Fraser-Kathleen2019-280280, title = {Multilingual prediction of Alzheimer’s disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics.}, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, adress = {Minneapolis, Minnesota. 
United States.}, } @inProceedings{Kokkinakis-Dimitrios2019-279386, title = {Ålderism i svenska nyhetsmedier.}, abstract = {Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. 
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre.}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. 
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{Linz-Nicklas2019-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment.}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. 
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a Swedish Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. 
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. 
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{Fraser-Kathleen2019-280280, title = {Multilingual prediction of {Alzheimer’s} disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics}, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, address = {Minneapolis, Minnesota. 
United States.}, } @inProceedings{Kokkinakis-Dimitrios2019-279386, title = {Ålderism i svenska nyhetsmedier.}, abstract = {Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. 
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre.}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. 
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{Linz-Nicklas2019-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment.}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. 
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a {Swedish} Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. 
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. 
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{Fraser-Kathleen2019-280280, title = {Multilingual prediction of {Alzheimer’s} disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics}, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, address = {Minneapolis, Minnesota. 
United States.}, } @inProceedings{Kokkinakis-Dimitrios2019-279386, title = {Ålderism i svenska nyhetsmedier.}, abstract = {Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. 
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre.}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. 
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{Linz-Nicklas2019-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment.}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. 
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a {Swedish} Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. 
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. 
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{Fraser-Kathleen2019-280280, title = {Multilingual prediction of {Alzheimer’s} disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics}, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, address = {Minneapolis, Minnesota. 
United States.}, } @inProceedings{Kokkinakis-Dimitrios2019-279386, title = {Ålderism i svenska nyhetsmedier.}, abstract = {Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. 
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre.}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. 
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{Linz-Nicklas2019-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment.}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. 
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a {Swedish} Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. 
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. 
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{Fraser-Kathleen2019-280280, title = {Multilingual prediction of Alzheimer’s disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics.}, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, adress = {Minneapolis, Minnesota. 
United States.}, }

% One entry per citation key: a repeated key makes BibTeX/Biber report a duplicate-entry error.
% Titles and booktitles carry no trailing period; the bibliography style adds punctuation.

% Conference abstract (in Swedish) on ageism in Swedish news media.
@inProceedings{Kokkinakis-Dimitrios2019-279386,
  title     = {Ålderism i svenska nyhetsmedier},
  abstract  = {“Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta tyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.},
  booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland},
  author    = {Kokkinakis, Dimitrios and Edström, Maria},
  year      = {2019},
}

% Conference abstract (in Swedish) on semantic verbal fluency tests as an indicator of cognitive impairment.
@inProceedings{Kokkinakis-Dimitrios2019-279384,
  title     = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre},
  abstract  = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjligt som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.},
  booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland},
  author    = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina},
  year      = {2019},
}

@inProceedings{Fraser-Kathleen2019-280280, title = {Multilingual prediction of Alzheimer’s disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics.}, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, adress = {Minneapolis, Minnesota.
United States.}, } @inProceedings{Kokkinakis-Dimitrios2019-279386, title = {Ålderism i svenska nyhetsmedier.}, abstract = {Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. 
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre.}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. 
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{Linz-Nicklas2019-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment.}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. 
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a Swedish Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. 
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. 
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{Fraser-Kathleen2019-280280, title = {Multilingual prediction of Alzheimer’s disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics.}, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, adress = {Minneapolis, Minnesota. 
United States.}, } @inProceedings{Kokkinakis-Dimitrios2019-279386, title = {Ålderism i svenska nyhetsmedier.}, abstract = {Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. 
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre.}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. 
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{Linz-Nicklas2019-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment.}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. 
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a Swedish Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. 
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. 
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{Fraser-Kathleen2019-280280, title = {Multilingual prediction of Alzheimer’s disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics.}, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, adress = {Minneapolis, Minnesota. 
United States.}, } @inProceedings{Kokkinakis-Dimitrios2019-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre.}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. 
Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{Linz-Nicklas2019-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment.}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. 
Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, }

% Corpus description paper presented at CLARe4 (Helsinki, 2019).
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Kokkinakis-Dimitrios2019-278217,
  author    = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos},
  title     = {A Multifaceted Corpus for the Study of Cognitive Decline in a {Swedish} Population},
  booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland},
  year      = {2019},
  abstract  = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment (SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study.
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone.
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has already been used for various investigations of language features.},
}

% NAACL 2019 paper on cross-lingual dementia classification.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
% The location field was misspelt and therefore ignored; it is now spelt address.
@inProceedings{Fraser-Kathleen2019-280280,
  author    = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios},
  title     = {Multilingual prediction of {Alzheimer’s} disease through domain adaptation and concept-based language modelling},
  booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2019},
  abstract  = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.},
  address   = {Minneapolis, Minnesota.
United States.}, }

% Conference abstract (Svenskans beskrivning 37) on ageism in Swedish news media.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Kokkinakis-Dimitrios2019-279386,
  author    = {Kokkinakis, Dimitrios and Edström, Maria},
  title     = {Ålderism i svenska nyhetsmedier},
  booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland},
  year      = {2019},
  abstract  = {“Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare.
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.},
}

% Conference abstract (Svenskans beskrivning 37) on semantic verbal fluency tests.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Kokkinakis-Dimitrios2019-279384,
  author    = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina},
  title     = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre},
  booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland},
  year      = {2019},
  abstract  = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar.
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.},
}

% Workshop paper on temporal analysis of semantic verbal fluency tasks.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Linz-Nicklas2019-279131,
  title     = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment},
  abstract  = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies.
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, }

% Corpus description paper presented at CLARe4 (Helsinki, 2019).
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Kokkinakis-Dimitrios2019-278217,
  author    = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos},
  title     = {A Multifaceted Corpus for the Study of Cognitive Decline in a {Swedish} Population},
  booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland},
  year      = {2019},
  abstract  = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment (SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study.
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone.
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has already been used for various investigations of language features.},
}

% NAACL 2019 paper on cross-lingual dementia classification.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
% The location field was misspelt and therefore ignored; it is now spelt address.
@inProceedings{Fraser-Kathleen2019-280280,
  author    = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios},
  title     = {Multilingual prediction of {Alzheimer’s} disease through domain adaptation and concept-based language modelling},
  booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2019},
  abstract  = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.},
  address   = {Minneapolis, Minnesota.
United States.}, }

% Conference abstract (Svenskans beskrivning 37) on ageism in Swedish news media.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Kokkinakis-Dimitrios2019-279386,
  author    = {Kokkinakis, Dimitrios and Edström, Maria},
  title     = {Ålderism i svenska nyhetsmedier},
  booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland},
  year      = {2019},
  abstract  = {“Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare.
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.},
}

% Conference abstract (Svenskans beskrivning 37) on semantic verbal fluency tests.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Kokkinakis-Dimitrios2019-279384,
  author    = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina},
  title     = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre},
  booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland},
  year      = {2019},
  abstract  = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar.
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.},
}

% Workshop paper on temporal analysis of semantic verbal fluency tasks.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Linz-Nicklas2019-279131,
  title     = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment},
  abstract  = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies.
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, }

% Corpus description paper presented at CLARe4 (Helsinki, 2019).
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Kokkinakis-Dimitrios2019-278217,
  author    = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos},
  title     = {A Multifaceted Corpus for the Study of Cognitive Decline in a {Swedish} Population},
  booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland},
  year      = {2019},
  abstract  = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment (SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study.
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone.
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has already been used for various investigations of language features.},
}

% NAACL 2019 paper on cross-lingual dementia classification.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
% The location field was misspelt and therefore ignored; it is now spelt address.
@inProceedings{Fraser-Kathleen2019-280280,
  author    = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios},
  title     = {Multilingual prediction of {Alzheimer’s} disease through domain adaptation and concept-based language modelling},
  booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics},
  year      = {2019},
  abstract  = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.},
  address   = {Minneapolis, Minnesota.
United States.}, }

% Conference abstract (Svenskans beskrivning 37) on ageism in Swedish news media.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Kokkinakis-Dimitrios2019-279386,
  author    = {Kokkinakis, Dimitrios and Edström, Maria},
  title     = {Ålderism i svenska nyhetsmedier},
  booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland},
  year      = {2019},
  abstract  = {“Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare.
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.},
}

% Conference abstract (Svenskans beskrivning 37) on semantic verbal fluency tests.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Kokkinakis-Dimitrios2019-279384,
  author    = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina},
  title     = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre},
  booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland},
  year      = {2019},
  abstract  = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar.
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.},
}

% Workshop paper on temporal analysis of semantic verbal fluency tasks.
% NOTE(review): the key below occurs more than once in this file; BibTeX
% rejects repeated keys, so only one copy of this entry should be kept.
@inProceedings{Linz-Nicklas2019-279131,
  title     = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment},
  abstract  = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies.
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a Swedish Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. 
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. 
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.},
  booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos},
  year = {2019},
}

@comment{The location of the following record is stored in the standard `address` field; it is presumably the conference venue - confirm against the publication.}

@inProceedings{Fraser-Kathleen2019-280280,
  title = {Multilingual prediction of Alzheimer’s disease through domain adaptation and concept-based language modelling},
  abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.},
  booktitle = {Proceedings of the Annual Conference of the North American Chapter of the Association for Computational Linguistics.},
  author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios},
  year = {2019},
  address = {Minneapolis, Minnesota.
United States.}, } @inProceedings{Kokkinakis-Dimitrios2019-279386, title = {Ålderism i svenska nyhetsmedier.}, abstract = {Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. 
Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre.}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjlighet som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar. 
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{Linz-Nicklas2019-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment.}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time. This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. 
Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. Minneapolis, USA}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2019-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a Swedish Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. 
The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. 
The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{Kokkinakis-Dimitrios2018-262851, title = {A Swedish Cookie-Theft Corpus}, abstract = {Language disturbances can be a diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, at earlier stages, and connected speech analysis provides a non-invasive and easy-to-assess measure for determining aspects of the severity of language impairment. In this paper we focus on the development of a corpus consisting of audio recordings of picture descriptions of the Cookie-theft, produced by Swedish speakers, and accompanying transcriptions. The speech elicitation procedure provides an established method of obtaining highly constrained samples of connected speech that can allow us to study the intricate interactions between various linguistic levels and cognition. We chose the Cookie-theft picture since it is a standardized test that has been used in various studies in the past, and therefore comparisons can be made based on previous results. This type of picture description task might be useful for detecting subtle language deficits in patients with subjective and mild cognitive impairment. The resulting corpus is a new, rich and multi-faceted resource for the investigation of linguistic characteristics of connected speech and a unique data set that provides a rich resource for (future) research and experimentation in many areas, and of language impairment in particular. 
The information in the corpus can also be combined and correlated with other collected data about the speakers, such as neuropsychological tests, imaging and brain physiology markers and cerebrospinal fluid markers.},
  booktitle = {LREC 2018, 11th edition of the Language Resources and Evaluation Conference, 7-12 May 2018, Miyazaki (Japan)},
  editor = {Calzolari, Nicoletta and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Hasida, Koiti and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, Hélène and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios and Tokunaga, Takenobu},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Nordlund, Arto},
  year = {2018},
  publisher = {European Language Resources Association},
  ISBN = {979-10-95546-00-9},
}

@comment{In the record above, the source export listed Nicoletta Calzolari as conference chair; editors are stored in the `editor` field instead of being appended to `booktitle`.}

@inProceedings{LundholmFors-Kristina2018-263790,
  title = {Automated Syntactic Analysis of Language Abilities in Persons with Mild and Subjective Cognitive Impairment},
  abstract = {In this work we analyze the syntactic complexity of transcribed picture descriptions using a variety of automated syntactic features, and investigate the features’ predictive power in classifying narratives from people with subjective and mild cognitive impairment and healthy controls. Our results indicate that while there are no statistically significant differences, syntactic features can still be moderately successful at distinguishing the participant groups when used in a machine learning framework.},
  booktitle = {Building continents of knowledge in oceans of data : the future of co-created eHealth: proceedings of MIE2018, 24-26 April 2018, Gothenburg, Sweden},
  editor = {Ugon, Adrien and Karlsson, Daniel and Klein, Gunnar O. and Moen, Anne},
  author = {Lundholm Fors, Kristina and Fraser, Kathleen and Kokkinakis, Dimitrios},
  year = {2018},
  publisher = {IOS Press},
  address = {Amsterdam},
  ISBN = {978-1-61499-851-8},
}

@inProceedings{LundholmFors-Kristina2018-264400,
  title = {Eye-voice span in adults with mild cognitive impairment (MCI) and healthy controls.},
  abstract = {Objectives: This study is part of a larger project focused on developing new techniques for identification of early linguistic and extra-linguistic signs of cognitive impairment, with the overall goal of identifying dementia in the preclinical stage. In a previous study, we found that eye movements during reading can be used to distinguish between subjects with mild cognitive impairment (MCI) and healthy controls with up to 86% accuracy. In this study, we are investigating the process of reading aloud, by exploring the eye-voice span in subjects with and without cognitive impairment. The aim of the study is to identify differences in the reading processes and evaluate whether these differences can be used to discriminate between the two groups. Methods: The eye-voice span is a measurement of the temporal and spatial organization between the eye and the voice, and is affected by for example working memory and automaticity, but also by the familiarity and length of words. In previous work, differences between eye movements when reading in healthy controls and subjects with cognitive impairments have been identified, and it has been shown that subjects with Alzheimer’s disease show impairments when reading aloud, specifically with regards to speech and articulation rate. Results: We present a quantitative and qualitative analysis of the reading process in the subjects, focusing both on general measures of eye-voice span, but also specifically on instances of hesitation and mistakes in the speech, and the correlated eye movements.
Conclusions/Take home message: Early detection of dementia is important for a number of reasons, such as giving the person access to interventions and medications, and allowing the individual and families time to prepare. By expanding the knowledge about reading processes in subjects with MCI, we are adding to the potential of using reading analysis as an avenue of detecting early signs of dementia.},
  booktitle = {Book of Abstracts 10th CPLOL Congress 10-12 May 2018, Cascais, Portugal},
  editor = {Trinite, Baiba},
  author = {Lundholm Fors, Kristina and Fraser, Kathleen and Kokkinakis, Dimitrios},
  year = {2018},
}

@inProceedings{Fraser-Kathleen2018-264397,
  title = {Improving the Sensitivity and Specificity of MCI Screening with Linguistic Information.},
  abstract = {The Mini-Mental State Exam (MMSE) is a screening tool for cognitive impairment. It has been extensively validated and is widely used, but has been criticized as not being effective in detecting mild cognitive impairment (MCI). In this study, we examine the utility of augmenting MMSE scores with automatically extracted linguistic information from a narrative speech task to better differentiate between individuals with MCI and healthy controls in a Swedish population. We find that with the addition of just four linguistic features, the F score (measuring a trade-off between sensitivity and specificity) is improved from 0.67 to 0.81 in logistic regression classification. These preliminary results suggest that the accuracy of traditional screening tools may be improved through the addition of computerized language analysis.},
  booktitle = {Proceedings of the LREC workshop: Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments (RaPID-2).
8th of May 2018, Miyazaki, Japan / Dimitrios Kokkinakis (ed.)}, author = {Fraser, Kathleen and Lundholm Fors, Kristina and Eckerström, Marie and Themistocleous, Charalambos and Kokkinakis, Dimitrios}, year = {2018}, ISBN = {979-10-95546-26-9}, }

% Swedish-language conference contribution; the booktitle carries the event name, place and dates.
@inProceedings{Kokkinakis-Dimitrios2018-265113,
  title     = {Kan textforskning bidra till tidigare och säkrare demensdiagnostik?},
  abstract  = {Tidigare forskning har visat att subtila språkstörningar kan finnas vid de tidigaste förstadierna till demens, flera år innan en klinisk diagnos kan ställas. Inom ramen för projektet ”Språkliga och extra-lingvistiska parametrar för tidig upptäckt av kognitiv svikt” (finansierat av Riksbankens Jubileumsutlysning, 2016-19) undersöker vi med hjälp av språkteknologi och språkanalysstudier hur dessa språkstörningar yttrar sig. Kan språkteknologi användas för att upptäcka dessa tidiga språkrelaterade symtom och därmed bidra med nyanserad, komplementär och användbar kunskap? Kan användning av språkteknologi särskilja personer med de allra tidigaste kognitiva avvikelserna från personer med mer godartad, åldersrelaterad kognitiv svikt? Vilka språkliga förmågor drabbas? Hur yttrar sig dessa förändringar och vilka slags empiriska material finns att tillgå? Dessa är några av de frågor vi söker svar på. Vi gör inspelningar som vi analyserar för att kunna ta fram ny kunskap om subtila språkliga kännetecken som kan föregå demensutveckling. Denna kunskap kan användas för att eventuellt kunna förutsäga vilka individer som befinner sig i riskzonen för att utveckla demens, och kan vara användbar som komplementerande beslutsunderlag till domänexperter. Vi utvinner, analyserar och undersöker om det finns samband mellan olika språkrelaterade parametrar från spontan talinteraktion, transkriptioner men även ögonrörelser och neuropsykologiska tester från personer med subjektiv eller lindrig kognitiv nedsättning och friska kontrollpersoner.
    Många gånger är det svårt att avgöra huruvida lindriga kognitiva symtom är en del av det normala åldrandet eller början på en neurodegenerativ process. Vi förväntar oss inte heller att varje enskild person med kognitiv nedsättning kommer att uttrycka sig eller läsa på samma sätt utan snarare att dessa personer tidigt i sjukdomsförloppet kommer att börja uppvisa olika slags avvikande läsmönster, eller göra fonologiska, lexikala, syntaktiska eller semantiska fel. I studien utvecklar vi verktyg för att automatiskt hitta dessa avvikelser, och målet är att detta sedan ska kunna användas som komplement till tidig diagnostik samt som prognostiskt eller screeningverktyg. Deltagarna i vår studie har rekryterats från en pågående longitudinell studie, ”Demens i Tidigt Skede”, (eng. ”The Gothenburg MCI study”) på Minnesmottagningen i Göteborg, och vårt projekt har godkänts av den lokala etiknämnden. Alla deltagare i studien (kontrollgruppen [HC], personer med subjektiv kognitiv nedsättning [SCI] och personer med mild kognitiv nedsättning [MCI]) har genomgått baslinjeundersökning och gett informerat skriftligt samtycke (demografisk information finns i tabell 1). Vårt projekt är f.n. pågående och vi kommer presentera resultat baserade på inspelningstillfälle nr ett (aug. 2016-mars 2017). En ny inspelningsomgång, med samma deltagare, började i februari 2018 och förväntas vara avslutat i december 2018. Under presentationen kommer vi ge exempel på olika tal-, text- och ögonrörelseanalyser vi har genomfört och diskutera metodval och resultat baserade på studiens första fas. Vi kommer vidare ge en kort inblick i den nya, pågående inspelningsomgången och de nya testmoment vi använder. Vi vill med vårt arbete visa hur språkteknologisk analys kan bidra till att utöka vår kunskap inom området så att den kan vara användbar för tidig diagnostik och optimal omvårdnad. Enligt Socialstyrelsen (2017) finns det i Sverige över 160 000 personer med någon demenssjukdom.
    Våra resultat kan ha en betydelse för vårdpersonal som snabbare vill diagnostisera och identifiera individer med olika former av kognitiv funktionsnedsättning innan allvarliga symtom blir påtagliga. Utvecklingsmöjligheterna är många: nya eller förbättrade kognitiva screeningtester som skulle kunna användas inom primär- och specialistvården, samt utveckling och tillämpning av insatser som kan påverka beteendemönster och träna upp individens kommunikativa förmåga, kan på sikt leda till positiva konsekvenser som minskade vårdköer samt effektivare behandling avseende kostnader och behandlingsutfall.},
  booktitle = {Forum för textforskning 13, Lund 7 – 8 juni 2018},
  author    = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Eckerström, Marie and Themistocleous, Charalambos},
  year      = {2018},
}

% Conference abstract; the acronym in the title is braced so sentence-casing styles keep its capitals.
@inProceedings{Themistocleous-Charalambos2018-265112,
  title     = {{THEMIS-SV}: Automatic classification of language disorders from speech signals},
  abstract  = {Background and Aims: Brain injuries resulting from stroke can affect the production of speech resulting in different types of language impairments, such as aphasia. Studying these productions manually is an extremely cumbersome and time consuming process. The aim of this paper is to present THEMIS-SV: a system that enables the automatic transcription of speech signals and the segmentation of vowels and consonants in Swedish. Method: The input of the system are recordings of speech. The system processes the recordings and returns an output with three tiers: the utterance tier, the word tier, and the vowels/consonants tier. Results: The output of the system is a fast and reliable transcription and segmentation of speech, which is very close to transcriptions and segmentations performed manually. The automatic segmentation of speech enables targeted acoustic measurements, such as measurements of consonant spectra, formant frequencies of vowels, fundamental frequency, pauses, speech rate, etc.
    and other acoustic measurements that have been known to differentiate between the different types of language disorders. Conclusion: The method proposed here can be employed for the analysis of speech of individuals with post-stroke aphasia and other language disorders and constitutes a promising step towards a fully automated differential diagnostic tool for language disorders.},
  booktitle = {Abstracts of the 4th European Stroke Organisation Conference (ESOC 2018). Gothenburg, Sweden, 16--18 May, 2018.},
  author    = {Themistocleous, Charalambos and Kokkinakis, Dimitrios},
  year      = {2018},
}

% Whole workshop proceedings volume (the abstract describes the volume itself), so it is
% typed as a proceedings entry and the person responsible is recorded as editor.
@proceedings{Kokkinakis-Dimitrios2018-265118,
  title    = {Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments ({RaPID-2})},
  abstract = {Proceedings of the second RaPID: "Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments". An LREC workshop. 8th of May 2018, Miyazaki, Japan},
  editor   = {Kokkinakis, Dimitrios},
  year     = {2018},
  ISBN     = {979-10-95546-26-9},
}

% Conference paper; the booktitle carries the conference name and venue.
@inProceedings{Themistocleous-Charalambos2018-265821,
  title     = {A classification study of speech productions produced by healthy speakers and speakers with Mild Cognitive Impairment using Deep Sequential Neural Networks},
  abstract  = {Mild cognitive impairment (MCI) is a neurological condition, which is characterized by a noticeable decline of cognitive abilities, including communicative and linguistic skills. Nevertheless, evidence from speech production has been inconsistent with respect to features and factors that are most affected. This study employs speech properties from vowels, produced in a reading task by 55 Swedish speakers—30 healthy controls and 25 MCI — and aims to distinguish MCI and healthy productions. The study presents two machine learning classification tasks.
    The first is a classification of speech productions as MCI or healthy and the second is a classification of speakers as MCI or healthy. The study evaluates several Deep Neural Network Architectures that resulted in high classification accuracy of MCI and healthy speech productions and MCI and healthy speakers. The proposed neural models can be employed in methods of early detection of cognitive decline in order to quantify the progression of the disease and to provide suitable therapeutics.},
  booktitle = {The Sixth IEEE International Conference on Healthcare Informatics (IEEE-ICHI 2018), New York, USA.},
  author    = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios},
  year      = {2018},
}

% Congress abstract. NOTE(review): the citation key contains a non-ASCII letter, which classic
% BibTeX toolchains may reject; it is kept unchanged so existing citations still resolve.
@inProceedings{Edström-Maria2018-267250,
  title     = {Ageism and {Swedish} news media},
  abstract  = {Ageism can be seen as a “social disease”, a casual or systematic prejudice, stereotyping and discriminating against individuals or groups on the basis of their age. This is an area of growing concern, particularly the role of mainstream media in relationship to ageism. A valuable and important step is to understand the presence of ageing and older age how different types of online news media. The main objective of this pilot work is to test, collate and produce evidence from Swedish news media representations of older ages and ageing. METHOD(S) Two pilot studies/experiments; first names and their frequencies of the carriers’ age according to Statistics Sweden (SCB) and their presence in 39 online news between 2015 and 2018. ( 4, 7 millions texts). using general pattern matching techniques with regular expressions and applying them to 13 issues (1994, 2001-13) of Göteborgs-Posten (Swedish news corpora). Definition: Older persons ≥60 years. (25 % of the population in Sweden is over 60 years). RESULTS AND CONCLUSIONS: Clear and consistent differences of how various age spans are represented in the news.
    20-50 year olds is highly over represented compared with the Swedish population, while 0-24 and people over 54 are underrepresented, especially women. Pattern matching exhibits similar characteristics with the exception of obituaries where the elderly mentions are much more frequent. Our pilot studies confirm the introspective view of underrepresentation of old age and older people in or trends can be revealed within a larger time span and synchronic media sources. More studies are required and in the near future we plan to improve, scale and apply our methodology on both synchronic and diachronic data using e.g. available text corpora and try to get a solid perspective on whether any differences or trends can be revealed within a larger time span},
  booktitle = {24th Nordic Congress of Gerontology (NKG). Oslo, Norway: 2--4 May 2018},
  author    = {Edström, Maria and Kokkinakis, Dimitrios and Berggren, Max},
  year      = {2018},
}

% Workshop paper; the ampersand in the booktitle is escaped so LaTeX does not read it as an alignment tab.
@inProceedings{Themistocleous-Charalambos2018-270215,
  title     = {Effects of Mild Cognitive Impairment on vowel duration},
  abstract  = {Mild cognitive impairment (MCI) is a neurological condition, which is characterized by a noticeable decline of cognitive abilities, including communicative and linguistic skills. In this study, we have measured the duration of vowels produced in a reading task by 55 speakers— 30 healthy controls and 25 MCI—. The main results showed that MCI speakers differed significantly from HC in vowel duration as MCI speakers produced overall longer vowels. Also, we found that gender effects on vowel duration were different in MCI and HC.
    One significant aspect of this finding is that they highlight the contribution of vowel acoustic features as markers of MCI.},
  booktitle = {Proceedings of the 9th Tutorial \& Research Workshop on Experimental Linguistics, 28--30 August 2018, Paris, France / edited by Antonis Botinis},
  author    = {Themistocleous, Charalambos and Kokkinakis, Dimitrios and Eckerström, Marie and Fraser, Kathleen and Lundholm Fors, Kristina},
  year      = {2018},
  ISBN      = {978-960-466-162-6},
}

@article{Themistocleous-Charalambos2018-273026, title = {Identification of Mild Cognitive Impairment From Speech in Swedish Using Deep Sequential Neural Networks}, abstract = {While people with mild cognitive impairment (MCI) portray noticeably incipient memory difficulty in remembering events and situations along with problems in decision making, planning, and finding their way in familiar environments, detailed neuropsychological assessments also indicate deficits in language performance. To this day, there is no cure for dementia but early-stage treatment can delay the progression of MCI; thus, the development of valid tools for identifying early cognitive changes is of great importance. In this study, we provide an automated machine learning method, using Deep Neural Network Architectures, that aims to identify MCI. Speech materials were obtained using a reading task during evaluation sessions, as part of the Gothenburg MCI research study. Measures of vowel duration, vowel formants (F1 to F5), and fundamental frequency were calculated from speech signals. To learn the acoustic characteristics associated with MCI vs. healthy controls, we have trained and evaluated ten Deep Neural Network Architectures and measured how accurately they can diagnose participants that are unknown to the model. We evaluated the models using two evaluation tasks: a 5-fold crossvalidation and by splitting the data into 90% training and 10% evaluation set.
The findings suggest first, that the acoustic features provide significant information for the identification of MCI; second, the best Deep Neural Network Architectures can classify MCI and healthy controls with high classification accuracy (M = 83%); and third, the model has the potential to offer higher accuracy than 84% if trained with more data (cf., SD≈15%). The Deep Neural Network Architecture proposed here constitutes a method that contributes to the early diagnosis of cognitive decline, quantify the progression of the condition, and enable suitable therapeutics.},
  author   = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios},
  journal  = {Frontiers in Neurology},
  year     = {2018},
  volume   = {9},
  pages    = {1--10},
}

% NOTE(review): twelve further verbatim copies of the article entry above (same citation key)
% were removed at this point; BibTeX reports a repeated key as an error.
% The journal name of the entry above was supplied during review because the field is required
% for article entries - TODO confirm against the publisher record.
% The entry that starts on the next line reuses the same key once more but continues past the
% reviewed region, so it is left untouched - confirm it is another copy and drop it as well.
@article{Themistocleous-Charalambos2018-273026, title = {Identification of Mild Cognitive Impairment From Speech in Swedish Using Deep Sequential Neural Networks}, abstract = {While people with mild cognitive impairment (MCI) portray noticeably incipient memory difficulty in remembering events and situations along with problems in decision making, planning, and finding their way in familiar environments, detailed neuropsychological assessments also indicate deficits in language performance. To this day, there is no cure for dementia but early-stage treatment can delay the progression of MCI; thus, the development of valid tools for identifying early cognitive changes is of great importance.
In this study, we provide an automated machine learning method, using Deep Neural Network Architectures, that aims to identify MCI. Speech materials were obtained using a reading task during evaluation sessions, as part of the Gothenburg MCI research study. Measures of vowel duration, vowel formants (F1 to F5), and fundamental frequency were calculated from speech signals. To learn the acoustic characteristics associated with MCI vs. healthy controls, we have trained and evaluated ten Deep Neural Network Architectures and measured how accurately they can diagnose participants that are unknown to the model. We evaluated the models using two evaluation tasks: a 5-fold crossvalidation and by splitting the data into 90% training and 10% evaluation set. The findings suggest first, that the acoustic features provide significant information for the identification of MCI; second, the best Deep Neural Network Architectures can classify MCI and healthy controls with high classification accuracy (M = 83%); and third, the model has the potential to offer higher accuracy than 84% if trained with more data (cf., SD≈15%). The Deep Neural Network Architecture proposed here constitutes a method that contributes to the early diagnosis of cognitive decline, quantify the progression of the condition, and enable suitable therapeutics.}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2018}, volume = {9}, pages = {1--10}, } @article{Themistocleous-Charalambos2018-273026, title = {Identification of Mild Cognitive Impairment From Speech in Swedish Using Deep Sequential Neural Networks}, abstract = {While people with mild cognitive impairment (MCI) portray noticeably incipient memory difficulty in remembering events and situations along with problems in decision making, planning, and finding their way in familiar environments, detailed neuropsychological assessments also indicate deficits in language performance. 
To this day, there is no cure for dementia but early-stage treatment can delay the progression of MCI; thus, the development of valid tools for identifying early cognitive changes is of great importance. In this study, we provide an automated machine learning method, using Deep Neural Network Architectures, that aims to identify MCI. Speech materials were obtained using a reading task during evaluation sessions, as part of the Gothenburg MCI research study. Measures of vowel duration, vowel formants (F1 to F5), and fundamental frequency were calculated from speech signals. To learn the acoustic characteristics associated with MCI vs. healthy controls, we have trained and evaluated ten Deep Neural Network Architectures and measured how accurately they can diagnose participants that are unknown to the model. We evaluated the models using two evaluation tasks: a 5-fold crossvalidation and by splitting the data into 90% training and 10% evaluation set. The findings suggest first, that the acoustic features provide significant information for the identification of MCI; second, the best Deep Neural Network Architectures can classify MCI and healthy controls with high classification accuracy (M = 83%); and third, the model has the potential to offer higher accuracy than 84% if trained with more data (cf., SD≈15%). 
The Deep Neural Network Architecture proposed here constitutes a method that contributes to the early diagnosis of cognitive decline, quantify the progression of the condition, and enable suitable therapeutics.}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2018}, volume = {9}, pages = {1--10}, } @article{Themistocleous-Charalambos2018-273026, title = {Identification of Mild Cognitive Impairment From Speech in Swedish Using Deep Sequential Neural Networks}, abstract = {While people with mild cognitive impairment (MCI) portray noticeably incipient memory difficulty in remembering events and situations along with problems in decision making, planning, and finding their way in familiar environments, detailed neuropsychological assessments also indicate deficits in language performance. To this day, there is no cure for dementia but early-stage treatment can delay the progression of MCI; thus, the development of valid tools for identifying early cognitive changes is of great importance. In this study, we provide an automated machine learning method, using Deep Neural Network Architectures, that aims to identify MCI. Speech materials were obtained using a reading task during evaluation sessions, as part of the Gothenburg MCI research study. Measures of vowel duration, vowel formants (F1 to F5), and fundamental frequency were calculated from speech signals. To learn the acoustic characteristics associated with MCI vs. healthy controls, we have trained and evaluated ten Deep Neural Network Architectures and measured how accurately they can diagnose participants that are unknown to the model. We evaluated the models using two evaluation tasks: a 5-fold crossvalidation and by splitting the data into 90% training and 10% evaluation set. 
The findings suggest first, that the acoustic features provide significant information for the identification of MCI; second, the best Deep Neural Network Architectures can classify MCI and healthy controls with high classification accuracy (M = 83%); and third, the model has the potential to offer higher accuracy than 84% if trained with more data (cf., SD≈15%). The Deep Neural Network Architecture proposed here constitutes a method that contributes to the early diagnosis of cognitive decline, quantify the progression of the condition, and enable suitable therapeutics.}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2018}, volume = {9}, pages = {1--10}, } @inProceedings{Björkner-Eva2017-256522, title = {Voice acoustic parameters for detecting signs of early cognitive impairment}, abstract = {Aiding the detection of very early cognitive impairment in Alzheimer's disease (AD) and assessing the disease progression are essential foundations for effective psychological assessment, diagnosis and planning. Efficient tools for routine dementia screening in primary health care, particularly non-invasive and cost-effective methods, are desirable. The aim of this study is to find out if voice acoustic analysis can be a useful tool for detecting signs of early cognitive impairment.}, booktitle = {PEVOC (PanEuropean Voice Conference) 12, August 30th - September 1st 2017, Ghent, Belgium}, author = {Björkner, Eva and Lundholm Fors, Kristina and Kokkinakis, Dimitrios and Nordlund, Arto}, year = {2017}, } @inProceedings{Kokkinakis-Dimitrios2017-256955, title = {Data Collection from Persons with Mild Forms of Cognitive Impairment and Healthy Controls - Infrastructure for Classification and Prediction of Dementia}, abstract = {Cognitive and mental deterioration, such as difficulties with memory and language, are some of the typical phenotypes for most neurodegenerative diseases including Alzheimer’s disease and other dementia forms. 
This paper describes the first phases of a project that aims at collecting various types of cognitive data, acquired from human subjects in order to study relationships among linguistic and extra-linguistic observations. The project’s aim is to identify, extract, process, correlate, evaluate, and disseminate various linguistic phenotypes and measurements and thus contribute with complementary knowledge in early diagnosis, monitor progression, or predict individuals at risk. In the near future, automatic analysis of these data will be used to extract various types of features for training, testing and evaluating automatic classifiers that could be used to differentiate individuals with mild symptoms of cognitive impairment from healthy, age-matched controls and identify possible indicators for the early detection of mild forms of cognitive impairment. Features will be extracted from audio recordings (speech signal), the transcription of the audio signals (text) and the raw eye-tracking data.},
  booktitle = {Proceedings of the 21st Nordic Conference on Computational Linguistics, NoDaLiDa, 22-24 May 2017, Gothenburg, Sweden},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Björkner, Eva and Nordlund, Arto},
  year = {2017},
  publisher = {Linköping University Electronic Press, Linköpings universitet},
  address = {Linköping},
  isbn = {978-91-7685-601-7},
}

@inproceedings{Fraser-Kathleen2017-257840,
  title = {An analysis of eye-movements during reading for the detection of mild cognitive impairment},
  abstract = {We present a machine learning analysis of eye-tracking data for the detection of mild cognitive impairment, a decline in cognitive abilities that is associated with an increased risk of developing dementia. We compare two experimental configurations (reading aloud versus reading silently), as well as two methods of combining information from the two trials (concatenation and merging). 
Additionally, we annotate the words being read with information about their frequency and syntactic category, and use these annotations to generate new features. Ultimately, we are able to distinguish between participants with and without cognitive impairment with up to 86% accuracy.},
  booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing. September 9-11, 2017 Copenhagen, Denmark},
  editor = {Palmer, Martha and Hwa, Rebecca and Riedel, Sebastian},
  author = {Fraser, Kathleen and Lundholm Fors, Kristina and Kokkinakis, Dimitrios and Nordlund, Arto},
  year = {2017},
  publisher = {Association for Computational Linguistics},
  isbn = {978-1-945626-83-8},
}

@inproceedings{Kokkinakis-Dimitrios2016-243069,
  title = {Data Resource Acquisition from People at Various Stages of Cognitive Decline – Design and Exploration Considerations},
  abstract = {In this paper we are introducing work in progress towards the development of an infrastructure (i.e., design, methodology, creation and description) of linguistic and extra-linguistic data samples acquired from people diagnosed with subjective or mild cognitive impairment and healthy, age-matched controls. The data we are currently collecting consists of various types of modalities; i.e. audio-recorded spoken language samples; transcripts of the audio recordings (text) and eye tracking measurements. The integration of the extra-linguistic information with the linguistic phenotypes and measurements elicited from audio and text, will be used to extract, evaluate and model features to be used in machine learning experiments. In these experiments, classification models that will be trained, that will be able to learn from the whole or a subset of the data to make predictions on new data in order to test how well a differentiation between the aforementioned groups can be made. Features will be also correlated with measured outcomes from e.g. 
language-related scores, such as word fluency, in order to investigate whether there are relationships between various variables.},
  booktitle = {The Seventh International Workshop on Health Text Mining and Information Analysis (Louhi). November 5, 2016, Austin, Texas, USA},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Nordlund, Arto},
  year = {2016},
}

@inproceedings{Kokkinakis-Dimitrios2016-243100,
  title = {Linguistic and extra-linguistic parameters for early detection of cognitive impairment},
  abstract = {AIM: to adapt, develop and test methods that in isolation have shown promising outcomes on tasks related to (early) detection of dementia, differentiating between various dementia types and controls and also increase our understanding of the cognitive processes that underlie written text and certain forms of spoken language production. Unlike previous models, based solely on a certain aspect of language abilities (i.e. on written or spoken language alone), the project is comprehensive and more likely to provide new insights in the area of dementia detection and improve practices applied so far. The project builds on the success stories of the past and focus on the interplay between various types of technologies that hold the potential to provide reliable estimates for the detection of cognitive decline. The project emphasizes its interdisciplinary nature, by bringing together researchers from humanities (computational linguistics / language technology), computer science and medicine, and foresees the development of a comprehensive set of novel analytic approaches not explored jointly in the past GOAL: discovering evidence about linguistic performance and identifying whether the addition of new ways for investigating, combining and evaluating measurement and other parameters for improvement of established models can advance our understanding of: i) the boundaries between normal aging and dementia; ii) its effects on linguistic performance extrapolated from various sources and iii) whether effects of cognitive decline can be seen across (daily) language production. },
  booktitle = {European Summer School on Eye Movements (ESSEM), 11-17 september, 2016 Athens, Greece.},
  author = {Kokkinakis, Dimitrios},
  year = {2016},
}

@inproceedings{Kokkinakis-Dimitrios2016-243183,
  title = {Specifications and Methodology for Language-Related Data Acquisition and Analysis in the Domain of Dementia Diagnostics},
  abstract = {This paper outlines the initial stages of a project that aims to build and use a corpus with data samples acquired from people diagnosed with subjective or mild cognitive impairment and healthy, age-matched controls. The data we are currently collecting consists of audio-recorded spoken language samples; transcripts of the audio recordings and eye tracking measurements. From these data we plan to extract, evaluate and model features to be used for learning classification models in order to test how well a differentiation between the aforementioned subject groups can be made. Features will be also correlated with outcomes from e.g. 
other language-related scores, such as word fluency, in order to investigate whether there are relationships between various variables.},
  booktitle = {The Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016},
  author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Björkner, Eva and Nordlund, Arto},
  year = {2016},
}

@misc{Kokkinakis-Dimitrios2016-252412,
  title = {Proceedings of LREC 2016 Workshop: Resources and Processing of Linguistic and Extra-Linguistic Data from People with Various Forms of Cognitive/Psychiatric Impairments (RaPID-2016), Monday 23rd of May 2016. Linköping electronic conference proceedings.},
  abstract = {The purpose of the Workshop on “Resources and ProcessIng of linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments” (RaPID-2016) was to provide a snapshot view of some of the current technological landscape, resources, data samples and also needs and challenges in the area of processing various data from individuals with various types of mental and neurological health impairments and similar conditions at various stages; increase the knowledge, understanding, awareness and ability to achieve useful outcomes in this area and strengthen the collaboration between researchers and workers in the field of clinical/nursing/medical sciences and those in the field of language technology/computational linguistics/Natural Language Processing (NLP). Although many of the causes of cognitive and neuropsychiatric impairments are difficult to foresee and accurately predict, physicians and clinicians work with a wide range of factors that potentially contribute to such impairments, e.g., traumatic brain injuries, genetic predispositions, side effects of medication, and congenital anomalies. In this context, there is new evidence that the acquisition and processing of linguistic data (e.g., spontaneous story telling) and extra-linguistic and production measures (e.g., eye tracking) could be used as a complement to clinical diagnosis and provide the foundation for future development of objective criteria to be used for identifying progressive decline or degeneration of normal mental and brain functioning. An important new area of research in NLP emphasizes the processing, analysis, and interpretation of such data and current research in this field, based on linguistic-oriented analysis of text and speech produced by such a population and compared to healthy adults, has shown promising outcomes. This is manifested in early diagnosis and prediction of individuals at risk, the differentiation of individuals with various degrees of severity forms of brain and mental illness, and for the monitoring of the progression of such conditions through the diachronic analysis of language samples or other extralinguistic measurements. Initially, work was based on written data but there is a rapidly growing body of research based on spoken samples and other modalities. Nevertheless, there remains significant work to be done to arrive at more accurate estimates for prediction purposes in the future and more research is required in order to reliably complement the battery of medical and clinical examinations currently undertaken for the early diagnosis or monitoring of, e.g., neurodegenerative and other brain and mental disorders and accordingly, aid the development of new, non-invasive, time and cost-effective and objective (future) clinical tests in neurology, psychology, and psychiatry.},
  author = {Kokkinakis, Dimitrios},
  year = {2016},
  publisher = {Linköping University Electronic Press},
  address = {Linköping},
  isbn = {978-91-7685-730-4},
}

@inproceedings{Kokkinakis-Dimitrios2015-215535,
  title = {Gender-Based Vocation Identification in Swedish 19th Century Prose Fiction using Linguistic Patterns, NER and CRF Learning},
  abstract = {This paper investigates how literature could be used as a means to expand our understanding of history. By applying macroanalytic techniques we are aiming to investigate how women enter literature and particularly which functions they assume, their working patterns and if we can spot differences in how often male and female characters are mentioned with various types of occupational titles (vocation) in Swedish literary texts. Modern historiography, and especially feminist and women’s history has emphasized a relative invisibility of women’s work and women workers. The reasons behind this are manifold, and the extent, the margin of error in terms of women’s work activities is of course hard to assess. Therefore, vocation identification can be used as an indicator for such exploration and we present a hybrid system for automatic annotation of vocational signals in 19th century Swedish prose fiction. 
Beside vocations, the system also assigns gender (male, female or unknown) to the vocation words, a prerequisite for the goals of the study and future in-depth explorations of the corpora.},
  booktitle = {Proceedings of the Fourth Workshop on Computational Linguistics for Literature (Clfl). Co-located with the NAACL/HLT. Denver, Colorado, USA},
  author = {Kokkinakis, Dimitrios and Ighe, Ann and Malm, Mats},
  year = {2015},
  pages = {9},
}

@inproceedings{Kokkinakis-Dimitrios2015-225762,
  title = {Detecting Reuse of Biblical Quotes in Swedish 19th Century Fiction using Sequence Alignment},
  abstract = {Text reuse, a form of text repetition, recycling or borrowing, is a theoretically and practically interesting problem that has attracted considerable attention during the last years e.g. in the cultural heritage context (historical and comparative linguistics); in the context of social network propagation of ideas and in the measuring of journalistic reuse. In this paper we briefly outline and experiment with a method used for biological sequence alignment that have been also used in humanities research for e.g. the detection of similar passages in the complete works of Voltaire and 18th century French encyclopedias or for tracing how and which ideas spread in 19th century US-newspaper collections. 
We use available software (text-PAIR: Pairwise Alignment for Intertextual Relations) and experiment with the Charles XII Bible translation into Swedish, completed in 1703, against the content of the Swedish prose fiction 1800-1900, in order to automatically detect passages taken from this particular Bible translation in the prose fiction corpus.},
  booktitle = {Corpus-based Research in the Humanities workshop (CRH), 10 December 2015 Warsaw, Poland},
  author = {Kokkinakis, Dimitrios and Malm, Mats},
  year = {2015},
  isbn = {978-83-63159-19-1},
  pages = {79--86},
}

@article{Smith-Frida2014-188146,
  title = {Readability, suitability and comprehensibility in patient education materials for Swedish patients with colorectal cancer undergoing elective surgery: A mixed method design},
  abstract = {To characterize education materials provided to patients undergoing colorectal cancer surgery to gain a better understanding of how to design readable, suitable, comprehensible materials.},
  author = {Smith, Frida and Carlsson, Eva and Kokkinakis, Dimitrios and Forsberg, Markus and Kodeda, Karl and Sawatzky, Richard and Friberg, Febe and Öhlén, Joakim},
  year = {2014},
  volume = {94},
  number = {2},
  pages = {202--209},
}

@inproceedings{Moradi-Farnaz2014-197533,
  title = {A Graph-Based Analysis of Medical Queries of a Swedish Health Care Portal},
  abstract = {Today web portals play an increasingly important role in health care allowing information seekers to learn about diseases and treatments, and to administrate their care. Therefore, it is important that the portals are able to support this process as well as possible. In this paper, we study the search logs of a public Swedish health portal to address the questions if health information seeking differs from other types of Internet search and if there is a potential for utilizing network analysis methods in combination with semantic annotation to gain insights into search behaviors. 
Using a semantic-based method and a graph-based analysis of word cooccurrences in queries, we show there is an overlap among the results indicating a potential role of these types of methods to gain insights and facilitate improved information search. In addition we show that samples, windows of a month, of search logs may be sufficient to obtain similar results as using larger windows. We also show that medical queries share the same structural properties found for other types of information searches, thereby indicating an ability to reuse existing analysis methods for this type of search data.},
  booktitle = {The Fifth International Workshop on Health Text Mining and Information Analysis (Louhi)},
  author = {Moradi, Farnaz and Eklund, Ann-Marie and Kokkinakis, Dimitrios and Olovsson, Tomas and Tsigas, Philippas},
  year = {2014},
  isbn = {978-1-937284-90-9},
  pages = {2--10},
}

@inproceedings{Kokkinakis-Dimitrios2014-209800,
  title = {HFST-SweNER. A New NER Resource for Swedish},
  abstract = {Named entity recognition (NER) is a knowledge-intensive information extraction task that is used for recognizing textual mentions of entities that belong to a predefined set of categories, such as locations, organizations and time expressions. NER is a challenging, difficult, yet essential preprocessing technology for many natural language processing applications, and particularly crucial for language understanding. NER has been actively explored in academia and in industry especially during the last years due to the advent of social media data. This paper describes the conversion, modeling and adaptation of a Swedish NER system from a hybrid environment, with integrated functionality from various processing components, to the Helsinki Finite-State Transducer Technology (HFST) platform. 
This new HFST-based NER (HFST-SweNER) is a full-fledged open source implementation that supports a variety of generic named entity types and consists of multiple, reusable resource layers, e.g., various n-gram-based named entity lists (gazetteers).},
  booktitle = {Proceedings of the 9th edition of the Language Resources and Evaluation Conference (LREC), Reykjavik 26 - 31 May 2014},
  author = {Kokkinakis, Dimitrios and Niemi, Jyrki and Hardwick, Sam and Lindén, Krister and Borin, Lars},
  year = {2014},
  isbn = {978-2-9517408-8-4},
  pages = {2537--2543},
}

@inproceedings{Kokkinakis-Dimitrios2014-209807,
  title = {A corpus-based approach to the identification of non-literal language in a medical setting},
  abstract = {Automated processing of clinical texts is commonly faced with various less exposed, and not so regularly discussed linguistically complex problems that need to be addressed. One of these issues concerns the usage of figurative language. Figurative language implies the use of words that go beyond their ordinary meaning, a linguistically complex and challenging problem and also a problem that causes great difficulty for the field of natural language processing (NLP). The problem is equally prevalent in both general language and also in various sublanguages, such as clinical medicine. Therefore we believe that a comprehensive model of e.g. clinical language processing needs to account for figurative language usage, and this paper provides a description, and preliminary results towards this goal. Since the empirical, clinical data used in the study is limited in size, there is no formal distinction made between different sub-classifications of figurative language. e.g., metaphors, idioms or simile. We illustrate several types of figurative expressions in the clinical discourse and apply a rather quantitative and corpus-based level analysis. 
The main research questions that this paper asks are whether there are traces of figurative language (or at least a subset of such types) in patient-doctor and patient-nurse interactions, how can they be found in a convenient way and whether these are transferred in the electronic health records and to what degree.},
  booktitle = {Proceedings of the Conference on Communication, Medicine and Ethics (COMET), Lugano, 26-28 June 2014},
  author = {Kokkinakis, Dimitrios and Grahn, Inga-Lill},
  year = {2014},
  pages = {1},
}

@inproceedings{Kokkinakis-Dimitrios2014-209802,
  title = {Semantics in Storytelling in Swedish Fiction},
  abstract = {In this paper, we aim to define foundations and research questions for future large scale exploration of various types of semantic relationships in literature, namely Swedish prose fiction. More specifically, we are interested to get an in-depth understanding of storytelling in Swedish fiction by analyzing and mining the narrative discourse in a small sample of such data, focusing on interpersonal relationships and answering various questions such as how to recognize and assess gender patterns. Our intention is to apply our findings into a much larger scale in the near future in order to obtain useful insights about the social relations, structures, behavior and everyday life of characters found in literary works, thus enhancing the use of prose fiction as a source for research within the humanities and social sciences. Our work is inspired by the notions of distant reading and macroanalysis, a relatively new and often contested paradigm of literary research. In order to achieve our goal we strive for a combination of natural language processing techniques and simple visualizations that allow the user to rapidly focus on key areas of interest and provide the ability to discover latent semantic patterns and structures. 
},
  booktitle = {Proceedings of the Digital Access to textual Cultural Heritage (DATeCH)},
  author = {Kokkinakis, Dimitrios and Malm, Mats and Bergenmar, Jenny and Ighe, Ann},
  year = {2014},
  isbn = {978-1-4503-2588-2},
  pages = {6},
}

@inproceedings{Kokkinakis-Dimitrios2014-209808,
  title = {Vocation Identification in Swedish Fiction},
  abstract = {This paper presents a system for automatic annotation of vocational signals in 19th century Swedish prose fiction. Besides vocation identification, the system assigns gender (male, female, unknown) to the vocation words. Since gender is a prominent attribute of first names, we apply a named-entity recognizer (NER) that uses first name gazetteers where each name has been pre-assigned gender, which aids gender assignment to vocations with unknown gender if appropriate context is available. We also use a statistical modelling method, conditional random fields (CRF), for learning gender-assigned vocations in combination with the results of the NER and other pattern matching techniques. The purpose of this work is to develop and apply tools to literature as means to expand our understanding of history in the area of literature-based gender studies, e.g. investigate how women enter literature, which functions do they assume and their working patterns. 
Vocation identification can be used as one such indicator for achieving some these goals.},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference (SLTC), Uppsala, 13--14 November 2014},
  author = {Kokkinakis, Dimitrios and Ighe, Ann and Malm, Mats},
  year = {2014},
  pages = {3},
}

@inproceedings{Ahlberg-Malin2014-210083,
  title = {Swedish FrameNet++: The Beginning of the End and the End of the Beginning},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference (SLTC), Uppsala, 13--14 November 2014},
  author = {Ahlberg, Malin and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Friberg Heppin, Karin and Johansson, Richard and Kokkinakis, Dimitrios and Olsson, Leif-Jöran and Uppström, Jonatan},
  year = {2014},
}

@inproceedings{Grahn-Inga-Lill2014-216142,
  title = {Legitimating the visit - a recurrent challenge among patients with medically unexplained symptoms},
  abstract = {The doctor’s evaluation of presented symptoms as doctorable, is a legitimation of the patient’s decision to seek medical care. It is also a confirmation of the rational, and even the moral, status of the patient, since consulting a doctor without good reasons is considered irrational. The analysis focuses on how patients take initiatives to present problems and on the doctors’ responses and evaluations regarding the doctorability. Situations where participants seem to have different views of the doctorability of the problems are examined in relation to conversational practices and social actions. The analyses shows that the doctor as well as the patient orients to the potential doctorability of the problems and to the moral challenges related to it, but that their different expectations and roles lead to communicatively unclear situations. 
Further analyses will illustrate in what ways the MUS-patients’ recurrent challenge of legitimating their visits could be influenced by the interaction, and hence in what ways conscious conversational practices from the care givers might facilitate these situations.},
  booktitle = {Conference on Communication, Medicine and Ethics (COMET), Lugano, 26--28 June 2014},
  author = {Grahn, Inga-Lill and Kokkinakis, Dimitrios},
  year = {2014},
}

@inproceedings{Kokkinakis-Dimitrios2013-168227,
  title = {Figurative Language in Swedish Clinical Texts},
  address = {Potsdam, Germany},
  abstract = {Automated processing of clinical texts with the intention to link all important text fragments to various established terminologies and ontologies for relation or event extraction is commonly faced with various less exposed, and not so regularly discussed linguistically motivated issues that needs to be addressed. One of these issues is the usage of figurative language. Figurative language, that is the use of words that go beyond their ordinary meaning, is not only a linguistically complex and challenging problem but also a problem that causes great difficulty for the field of natural language processing (NLP), both for the processing of general language and of various sublanguages, such as clinical medicine. Therefore, a comprehensive model of e.g. clinical language processing needs to account for figurative language usage and this paper provides a description towards this goal. Since the empirical, clinical data used in the study is limited in size, there is no formal distinction made between different sub-classifications of figurative language. e.g., metaphors, idioms or simile. As a matter of fact, all these types of expressions form a continuum with fuzzy boundaries, and most of the NLP-oriented approaches discussed in the past have used either very large data for the analysis or hand annotates samples, a situation that has been prohibitive so far in our project. 
Therefore distinction is solely based on a more general level, namely between literal versus figurative language, and on a more quantitative and corpus-based level, supported with concrete examples that illustrate several types of figurative expressions in the clinical discourse. The main research questions that this paper asks are whether there are traces of figurative language (or at least a subset of such types) in patient doctor and patient nurse interactions, how can they be found in a convenient way and whether these are transferred in the electronic health records and to what degree. },
  booktitle = {Computational Semantics in Clinical Text workshop. Part of the 10th International Conference on Computational Semantics},
  author = {Kokkinakis, Dimitrios},
  year = {2013},
  isbn = {978-1-62748-398-8},
  pages = {6},
}

@article{Oelke-D.2013-181484,
  title = {Fingerprint Matrices: Uncovering the dynamics of social networks in prose literature},
  journal = {Computer Graphics Forum},
  abstract = {In prose literature often complex dynamics of interpersonal relationships can be observed between the different characters. Traditionally, node-link diagrams are used to depict the social network of a novel. However, static graphs can only visualize the overall social network structure but not the development of the networks over the course of the story, while dynamic graphs have the serious problem that there are many sudden changes between different portions of the overall social network. In this paper we explore means to show the relationships between the characters of a plot and at the same time their development over the course of a novel. Based on a careful exploration of the design space, we suggest a new visualization technique called Fingerprint Matrices. A case study exemplifies the usage of Fingerprint Matrices and shows that they are an effective means to analyze prose literature with respect to the development of relationships between the different characters.},
  author = {Oelke, Daniela 
and Kokkinakis, Dimitrios and Keim, Daniel A.},
  year = {2013},
  volume = {32},
  number = {3},
  pages = {371--380},
}

@inproceedings{Kokkinakis-Dimitrios2013-188518,
  title = {A Macroanalytic View of Swedish Literature using Topic Modeling},
  abstract = {New research opportunities are plentiful for digital and literature scholars who are currently faced with increasingly large portions of large digitized archives produced during the last decades. Conventional methods of analysis involving a so called close reading view are not enough. Distant reading or macroanalysis is proposed instead, as a better, viable and more pragmatic alternative to the traditional methods of analyzing e.g., literature. According to this view, understanding literature is not accomplished by studying individual texts, but by aggregating and analyzing massive amounts of data. Therefore, applying macroanalytic methods and technologies is a priority among many research groups in the humanities worldwide. In this paper we explore topic modeling, an increasingly popular statistical method used for uncovering themes, topics and patterns in large amounts of text. We use available topic modeling software and, as empirical data, the content of the Swedish literature bank, a constantly growing body of Swedish fiction corpus from the 18th and 19th century. We present preliminary results on a sample of this corpus and discuss how humanistic research can be conducted through this type of computation, as a means to identify potential issues of interest e.g., for historians.},
  booktitle = {Corpus Linguistics 2013: Abstract Book},
  editor = {Hardie, Andrew and Love, Robbie},
  publisher = {UCREL},
  address = {Lancaster},
  author = {Kokkinakis, Dimitrios and Malm, Mats},
  year = {2013},
}

@inproceedings{Kokkinakis-Dimitrios2013-188517,
  title = {Medical Event Extraction using Frame Semantics - Challenges and Opportunities},
  address = {Samos, Greece},
  abstract = {
The aim of this paper is to present some findings from a study into how a large scale semantic resource, FrameNet, can be applied for event extraction in the (Swedish) biomedical domain. Combining lexical resources with domain specific knowledge provide a powerful modeling mechanism that can be utilized for event extraction and other advanced text mining-related activities. The results, from developing a rule-based approach, showed that only small discrepancies and omissions were found between the semantic descriptions, the corpus data examined and the domain-specific semantics provided by SNOMED CT (medical terminology), NPL (medicinal products) and various semi-automatically developed clue lists (e. g., domain-related abbreviations). Although the described experiment is only based on four different domain-specific frames, the methodology is extendable to the rest ones and there is much room for improvements, for instance by combining rule-based with machine learning techniques, and using more advanced syntactic representations.},
  author = {Kokkinakis, Dimitrios},
  booktitle = {Proceedings of the 14th International Conference on Intelligent Text Processing and Computational Linguistics (CICLing)},
  year = {2013},
}

@inproceedings{Borin-Lars2013-188846,
  title = {Mining semantics for culturomics: towards a knowledge-based approach},
  abstract = {The massive amounts of text data made available through the Google Books digitization project have inspired a new field of big-data textual research. Named culturomics, this field has attracted the attention of a growing number of scholars over recent years. However, initial studies based on these data have been criticized for not referring to relevant work in linguistics and language technology. This paper provides some ideas, thoughts and first steps towards a new culturomics initiative, based this time on Swedish data, which pursues a more knowledge-based approach than previous work in this emerging field. 
The amount of new Swedish text produced daily and older texts being digitized in cultural heritage projects grows at an accelerating rate. These volumes of text being available in digital form have grown far beyond the capacity of human readers, leaving automated semantic processing of the texts as the only realistic option for accessing and using the information contained in them. The aim of our recently initiated research program is to advance the state of the art in language technology resources and methods for semantic processing of Big Swedish text and focus on the theoretical and methodological advancement of the state of the art in extracting and correlating information from large volumes of Swedish text using a combination of knowledge-based and statistical methods.},
  booktitle = {2013 ACM International Workshop on Mining Unstructured Big Data Using Natural Language Processing, UnstructureNLP 2013, Held at 22nd ACM International Conference on Information and Knowledge Management, CIKM 2013, San Francisco, CA, United States, 28 October 2013},
  author = {Borin, Lars and Dubhashi, Devdatt and Forsberg, Markus and Johansson, Richard and Kokkinakis, Dimitrios and Nugues, Pierre},
  year = {2013},
  isbn = {978-1-4503-2415-1},
  pages = {3--10},
}

@inproceedings{Hamon-Thierry2013-189545,
  title = {Medication Extraction and Guessing in Swedish, French and English},
  abstract = {Extraction of information related to the medication is an important task within the biomedical area. While the elaboration and updating of the drug vocabularies cannot follow the rapid evolution of the drug development, we propose an automatic method for the extraction of known and new drug names. Our method combines internal and contextual clues. The method is applied to different types of documents in three languages (Swedish, French and English). 
The results indicate that with this kind of approach, we can efficiently update and enrich the existing drug vocabularies (probably with rapid manual browsing). Precision and recall scores varied between 81%-91% for precision and 85%-100% for recall. As a future work we intend to continuously refine the approach, by for instance better integration of semantic patterns and fuzzy matching that should hopefully enable further increase of the obtained results.},
  booktitle = {Proceedings of the 14th World Congress on Medical and Health Informatics (MEDINFO), Copenhagen, Denmark},
  series = {Studies in Health Technology and Informatics},
  volume = {192},
  author = {Hamon, Thierry and Grabar, Natalia and Kokkinakis, Dimitrios},
  year = {2013},
}

@inproceedings{Kokkinakis-Dimitrios2013-189536,
  title = {Annotation of interpersonal relations in Swedish prose fiction},
  abstract = {This paper describes the manual annotation of a small sample of Swedish 19th and 20th century prose fiction with interpersonal relations between characters in six literary works. An interpersonal relationship is an association between two or more people that may range in duration from brief to enduring. The annotation is guided by a named entity recognition step. Our goal is to get an in-depth understanding of the difficulties of such a task and elaborate a model that can be applied for similar annotation on a larger scale, both manually as well as automatically. The identification of interpersonal relations can, hopefully, aid the reader of a Swedish literary work to better understand its content and plot, and get a bird’s eye view on the landscape of the core story. 
Our aim is to use such annotations in a hybrid context, i.e., using machine learning and rule-based methods, which, in conjunction with named entity recognition, can provide the necessary infrastructure for creating detailed biographical sketches and extracting facts for various named entities which can be exploited in various possible ways by Natural Language Processing (NLP) technologies such as summarization, question answering, as well as visual analytic techniques.},
  booktitle = {Proceedings of the 3rd Workshop on Annotation of Corpora for Research in the Humanities (ACRH-3), Sofia, Bulgaria},
  author = {Kokkinakis, Dimitrios},
  year = {2013},
  isbn = {978-954-91700-5-4},
  pages = {37--47},
}

@inproceedings{Kokkinakis-Dimitrios2013-189552,
  title = {Query Logs as a Corpus},
  abstract = {This paper provides a detailed description of a large Swedish health-related query log corpus and explores means to derive useful statistics, their distributions and analytics from its content across several dimensions. Information acquisition from query logs can be useful for several purposes and potential types of users, such as terminologists, infodemiologists / epidemiologists, medical data and web analysts, specialists in NLP technologies such as information retrieval and text mining but also public officials in health and safety organizations.},
  booktitle = {Corpus Linguistics 2013: Abstract Book},
  editor = {Hardie, Andrew and Love, Robbie},
  publisher = {UCREL},
  address = {Lancaster},
  author = {Kokkinakis, Dimitrios and Eklund, Ann-Marie},
  year = {2013},
  pages = {329},
}

@inproceedings{Kokkinakis-Dimitrios2013-189541,
  title = {Terminologihantering i medicinska loggfiler},
  booktitle = {Proceedings of the "Nationell termkonferens". 
Göteborg},
  author = {Kokkinakis, Dimitrios},
  year = {2013},
}

@article{Smith-Frida2013-10,
  title = {Readability, suitability and comprehensibility in Patient Education Materials for Swedish patients with colorectal cancer undergoing elective surgery - a mixed method design},
  journal = {Patient Education and Counseling},
  author = {Smith, Frida and Carlsson, Eva and Kokkinakis, Dimitrios and Forsberg, Markus and Kodeda, Karl and Sawatzky, Richard and Friberg, Febe and Öhlén, Joakim},
  year = {2013},
  note = {Epub ahead of print},
}

@inproceedings{Oelke-Daniela2012-155493,
  title = {Advanced Visual Analytics Methods for Literature Analysis},
  abstract = {The volumes of digitized literary collections in various languages increase at a rapid pace, which results also in a growing demand for computational support to analyze such linguistic data. This paper combines robust text analysis with advanced visual analytics and bring a new set of tools to literature analysis. Visual analytics techniques can offer new and unexpected insights and knowledge to the literary scholar. We analyzed a small subset of a large literary collection, the Swedish Literature Bank, by focusing on the extraction of persons’ names, their gender and their normalized, linked form, including mentions of theistic beings (e.g., Gods’ names and mythological figures), and examined their appearance over the course of the novel. A case study based on 13 novels, from the aforementioned collection, shows a number of interesting applications of visual analytics methods to literature problems, where named entities can play a prominent role, demonstrating the advantage of visual literature analysis. Our work is inspired by the notion of distant reading or macroanalysis for the analyses of large literature collections. },
  booktitle = {Language Technology for Cultural Heritage, Social Sciences, and Humanities (LaTeCH). An EACL 2012 workshop. 
Avignon, France.},
  author = {Oelke, Daniela and Kokkinakis, Dimitrios and Malm, Mats},
  year = {2012},
  note = {Accepted},
  pages = {10},
}

@inproceedings{Oelke-Daniela2012-155495,
  title = {Visual Analytics and the Language of Web Query Logs - A Terminology Perspective},
  abstract = {This paper explores means to integrate natural language processing methods for terminology and entity identification in medical web session logs with visual analytics techniques. The aim of the study is to examine whether the vocabulary used in queries posted to a Swedish regional health web site can be assessed in a way that will enable a terminologist or medical data analysts to instantly identify new term candidates and their relations based on significant co-occurrence patterns. We provide an example application in order to illustrate how the visualizations of co-occurrence relationships between medical and general entities occurring in such logs can be visualized, accessed and explored. To enable a visual exploration of the generated co-occurrence graphs, we employ a general purpose social network analysis tool, Visone (http://visone.info), that permits to visualize and analyze various types of graph structures. Our examples show that visual analytics based on co-occurrence analysis provides insights into the use of layman language in relation to established (professional) terminologies, which may help terminologists decide which terms to include in future terminologies. Increased understanding of the used querying language is also of interest in the context of public health web sites. The query results should reflect the intentions of the information seekers, who may express themselves in layman language that differs from the one used on the available web sites provided by medical professionals.},
  booktitle = {The 15th EURALEX International Congress (European Association of Lexicography). 
Oslo, Norway.},
  author = {Oelke, Daniela and Eklund, Ann-Marie and Marinov, Svetoslav and Kokkinakis, Dimitrios},
  year = {2012},
  pages = {8},
}

@inproceedings{Kokkinakis-Dimitrios2012-155530,
  title = {Contextualisation of functional symptoms in primary health care},
  abstract = {Background: a number of patients consulting primary health care have physical symptoms that may be labeled “medically unexplained”, i.e. absence of a demonstrable organic etiology. Common functional somatic symptoms (FSS) are irritable bowel, tension headache and chronic fatigue. FSS-patients are generally frustrated with the inability of health care to alleviate their illness. Health care staff often also feel frustration. The communication between patient and care giver is the key for coming to terms with the problem. Objective: to investigate how complex, vague and long-standing symptoms with no identified organic cause are put into context, interpreted and acted upon in primary health-care interactions. Two types of interventions are envisaged (i) methods for early identification of patients at risk of entering a vicious circle of functional symptoms and (ii) methods for re-interpreting symptoms in alternative and more purposeful ways. Methods: the project studies interactions between patients and nurses giving advice over telephone, consultations between patients and physicians, interviews and study patients' medical case notes. Eligible patients (18-65 y.o.) contact their primary health care centre by telephone, have had at least eight physical consultations with nurses or physicians in the last 12 months and if a majority of the symptoms within this time span had no clear organic or psychiatric cause. The project contains a number of subprojects, according to the type of data collected. Several methods of analysis will be used, mainly critical discourse analysis, phenomenologic-hermeneutic and computation linguistic analyses. 
(Expected) Results: using the collected data, we describe characteristics of the communication that takes place in these settings and the way symptoms and diseases are represented. This will facilitate the development of future interventions aimed at decreasing the morbidity due to FSS and give further insights into the problem. },
  author = {Kokkinakis, Dimitrios and Lidén, Eva and Svensson, Staffan and Björk Brämberg, Elisabeth and Määttä, Sylvia},
  booktitle = {The 5th GENEVA Conference on Person-Centered Medicine. Geneva, Switzerland. },
  year = {2012},
}

@inproceedings{Kokkinakis-Dimitrios2012-155537,
  title = {Men, Women and Gods: Distant Reading in Literary Collections - Combining Visual Analytics with Language Technology},
  abstract = {The volumes of digitized literary collections in various languages increase at a rapid pace and so increases the need to computationally support the analysis of such data. Literature can be studied in a number of different ways and from many different perspectives and text analysis make up a central component of literature studies. If such analysis can be integrated with advanced visual methods and fed back to the daily work of the literature researcher, then it is likely to reveal the presence of useful and nuanced insights into the complex daily lives, ideas and beliefs of the main characters found in many of the literary works. In this paper we describe the combination of robust text analysis with visual analytics and bring a new set of tools to literary analysis. As a show case, we analyzed a small subset (13 novels of a single author) taken from a large literary collection, the Swedish Literature Bank . The analysis is based upon two levels of inquiry, namely by focusing on mentions of theistic beings (e.g. Gods' names) as well as mentions of persons' names, including their gender and their normalized, linked variant forms, and examining their appearance in sentences, paragraphs and chapters. 
The case study shows several successful applications of visual analytics methods to various literature problems and demonstrates the advantages of the implementation of visual literature fingerprinting. Our work is inspired by the notion of distant reading or macroanalysis for the analyses of literature collections. We start by recognizing all characters in the novels using a mature language technology (named entity recognition) which can be turned into a tool in aid of text analysis in this field. We apply context cues, lists of animacy and gender markers and inspired by the document centered approach and the labelled consistency principle which is a form of on-line learning from documents under processing which looks at unambiguous usages of words or names for assigning annotations in ambiguous words or names. For instance, if in an unambiguous context where there is a strong gender indicator, such as 'Mrs Alexander' the name 'Alexander' is assigned a feminine gender, then subsequent mentions of the same name in the same discourse will be assigned the feminine gender as well unless there is a conflict with another person with the same name. We argue, that the integration of text analysis such as the one briefly outlined and visualization techniques, such as higher resolution pixel-based fingerprinting, could be put to effective use also in literature studies. We also see an opportunity to devise new ways of exploring the large volumes of literary texts being made available through national cultural heritage digitization projects, for instance by exploring the possibility to show several literary texts (novels) at once. 
We will illustrate some of the applied techniques using several examples from our case study, such as summary plots based on all the characters in these novels as well as fingerprints based on the distribution of characters across the novels.},
  booktitle = {Proceedings of the Advances in Visual Methods for Linguistics (AVML)},
  author = {Kokkinakis, Dimitrios and Oelke, Daniela},
  year = {2012},
  note = {Accepted},
}

@inproceedings{Kokkinakis-Dimitrios2012-155893,
  title = {The Journal of the Swedish Medical Association - a Corpus Resource for Biomedical Text Mining in Swedish},
  abstract = {Biomedical text mining applications are largely dependent on high quality knowledge resources. Traditionally, these include lexical databases, terminologies, nomenclatures and ontologies and, during the last decade, also corpora of various sizes, variety and diversity. Some of these corpora are annotated with an expanding range of information types and metadata while others become available with a minimal set of annotations. At the same time, it is of great importance that biomedical corpora for lesser-spoken languages also get developed in order to support and facilitate the implementation of practical applications for such languages and to stimulate the development of language technology research and innovation infrastructures in the domain. This paper provides a detailed description of a Swedish biomedical corpus based on the electronic editions of the Journal of the Swedish Medical Association "Läkartidningen" of the years 1996-2010. The corpus consists of a variety of documents that can be related to different medical domains, developed as a response to the increasing needs for large and reliable medical information for Swedish biomedical NLP. 
The corpus has been structurally annotated with a minimal set of meta information and automatically indexed with the largest and systematically organised computer processable collection of medical terminology, the Swedish SNOMED CT (Systematized Nomenclature of Medicine -- Clinical Terms). This way topic-focused subcorpora, e.g. with diabetes-related content, can be easily developed.},
  booktitle = {The Third Workshop on Building and Evaluating Resources for Biomedical Text Mining (BioTxtM), an LREC Workshop, Turkey},
  author = {Kokkinakis, Dimitrios},
  year = {2012},
  note = {Accepted},
}

@inproceedings{Johansson-Richard2012-156400,
  title = {Semantic Role Labeling with the Swedish FrameNet},
  abstract = {We present the first results on semantic role labeling using the Swedish FrameNet, which is a lexical resource currently in development. Several aspects of the task are investigated, including the selection of machine learning features, the effect of choice of syntactic parser, and the ability of the system to generalize to new frames and new genres. In addition, we evaluate two methods to make the role label classifier more robust: cross-frame generalization and cluster-based features. Although the small amount of training data limits the performance achievable at the moment, we reach promising results. 
In particular, the classifier that extracts the boundaries of arguments works well for new frames, which suggests that it already at this stage can be useful in a semi-automatic setting.},
  author = {Johansson, Richard and Friberg Heppin, Karin and Kokkinakis, Dimitrios},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12); Istanbul, Turkey; May 23-25},
  year = {2012},
  isbn = {978-2-9517408-7-7},
  pages = {3697--3700},
}

@inproceedings{Kokkinakis-Dimitrios2012-164587,
  title = {Literacy Demands and Information to Cancer Patients},
  abstract = {This study examines language complexity of written health information materials for patients undergoing colorectal cancer surgery. Written and printed patient information from 28 Swedish clinics are automatically analyzed by means of language technology. The analysis reveals different problematic issues that might have impact on readability. The study is a first step, and part of a larger project about patients’ health information seeking behavior in relation to written information material. Our study aims to provide support for producing more individualized, person centered information materials according to preferences for complex and detailed or legible texts and thus enhance a movement from receiving information and instructions to participating in knowing. 
In the near future the study will continue by integrating focus groups with patients that may provide valuable feedback and enhance our knowledge about patients’ use and preferences of different information material.},
  booktitle = {Proceedings of the 15th International Conference on Text, Speech and Dialogue},
  author = {Kokkinakis, Dimitrios and Forsberg, Markus and Johansson Kokkinakis, Sofie and Smith, Frida and Öhlén, Joakim},
  year = {2012},
  isbn = {978-3-642-32789-6},
}

@inproceedings{Kokkinakis-Dimitrios2012-164788,
  title = {Initial Experiments of Medication Event Extraction Using Frame Semantics},
  abstract = {Semantic annotation of text corpora for mining complex relations and events has gained a considerable growing attention in the medical domain. The goal of this paper is to present a snapshot of ongoing work that aims to develop and apply an appropriate infrastructure for automatic event labelling and extraction in the Swedish medical domain. Annotated text samples, appropriate lexical resources (e.g. term lists and the Swedish Frame-Net++) and hybrid techniques are currently developed in order to alleviate some of the difficulties of the task. As a case study this paper presents a pilot approach based on the application of the theory of frame semantics to automatically identify and extract detailed medication information from medical texts. Medication information is often written in narrative form (e.g. in clinical records) and is therefore difficult to be acquired and used in computerized systems (e.g. decision support). Currently our approach uses a combination of generic entity and terminology taggers, specifically designed medical frames and various frame-related patterns. 
Future work intends to improve and enhance current results by using more annotated samples, more medically-relevant frames and combination of supervised learning techniques with the regular expression patterns.},
  booktitle = {Scandinavian Conference on Health Informatics (SHI)},
  series = {Linköping Electronic Conference Proceedings},
  author = {Kokkinakis, Dimitrios},
  year = {2012},
  isbn = {978-91-7519-758-6},
  pages = {41--47},
}

@inproceedings{Eklund-Ann-Marie2012-165309,
  title = {Drug interests revealed by a public health portal},
  abstract = {Online health information seeking has become an important part of people's everyday lives. However, studies have shown that many of those have problems forming effective queries. In order to develop better support and tools for assisting people in health-related query formation we have to gain a deeper understanding into their information seeking behaviour in relation to key issues, such as medication and drugs. The present study attempts to understand the semantics of the users' information needs with respect to medication-related information. Search log queries from the Swedish 1177.se health portal were automatically annotated and categorized according to relevant background knowledge sources. Understanding the semantics of information needs can enable optimization and tailoring of (official) health related information presented to the online consumer, provide better terminology support and thematic coding of the queries and in the long run better models of consumers’ information needs. },
  booktitle = {Proceedings of the SLTC-Workshop: Exploratory Query-log Analysis. 
Lund, Sweden.}, author = {Eklund, Ann-Marie and Kokkinakis, Dimitrios}, year = {2012}, pages = {2}, } @inProceedings{Smith-Frida2012-170895, title = {Hur kan vi förbättra skriftligt informations- och utbildningsmaterial för patienter som opereras elektivt för kolorektal cancer?}, abstract = {Kolorektal cancer (KRC) är den tredje största cancerdiagnosen i Sverige med drygt 5500 drabbade årligen. Primär behandling är kirurgi kompletterad av pre- och postoperativ onkologisk behandling. Standardiserade koncept för accelererat vårdförlopp med kortare vårdtider lägger mycket fokus på fysisk rehabilitering, men mindre på den psykiska påfrestning det innebär att bli opererad för en cancerdiagnos. Patienter förväntas ta stort ansvar för sin rehabilitering, både på sjukhuset och hemma. För att vara förberedd behövs både skriftlig och muntlig information. Syftet med studien var att kartlägga och karaktärisera det skriftliga informations- och utbildningsmaterial (IOU) som används till patienter som opereras elektivt för KRC. Vidare var syftet att beskriva patienters uppfattning om struktur och innehåll på IOU. IOU från 28 kliniker som opererar patienter med KRC samlades in (totalt 220 st). För att kunna ge ett mått på texternas svårighetsgrad gjordes språkteknologisk analys på samtliga IOU, där bl.a. ordlängd, meningsbyggnad och jämförelse med annan typ av litteratur mättes På 117 st gjordes en suitabilityanalys med instrumentet SAM+CAM där domän som innehåll, läsbarhet, bilder, layout samt stimulans och motivation för lärande bedömdes. Fem fokusgrupper med patienter genomfördes där patienterna uppmanades att berätta om vad de tycker utmärker ett bra respektive dåligt IOU, vad de saknar i innehåll och när och på vilket sätt de vill ha materialet utlämnat. Resultatet av språkteknologiska- och suitabilityanalysen visar att de flesta IOU bedömdes som ”adequate”, men spridningen var stor. 
Patienterna hade önskemål om mer nivåuppdelat/nivåriktat material, där man själv kan välja hur mycket information man vill ha vid ett visst tillfälle. Flera ämnen saknades, eller var för otydligt beskrivna för att patienterna skulle känna sig trygga vid hemgång. Resultatet av de tre analysmetoderna bör kunna användas för att utveckla en ”verktygslåda” för att i framtiden kunna utforma bättre riktat IOU för patientgruppen. }, booktitle = {Nationella konferensen i Cancervård, 24-25 maj 2012, Stockholm}, author = {Smith, Frida and Öhlén, Joakim and Carlsson, Eva and Forsberg, Markus and Kokkinakis, Dimitrios and Friberg, Febe}, year = {2012}, } @article{Smith-Frida2012-170897, title = {Ny studie visar hur information till patienter med kolorektal cancer kan förbättras}, abstract = {Skriftligt informationsmaterial är ofta skrivet på för hög nivå och ställer höga krav på den tänkta läsaren (patienten). Förutom läsbarhet finns det fler faktorer att utvärdera för att se om materialet är lämpligt. Innehåll, struktur, layout och typsnitt, illustrationer och lärande och motivation är sådant som bör tas hänsyn till. Ett lämpligare, bättre anpassat material kan hjälpa personer med sjukdom att ställa bättre frågor när de har samtal med vårdpersonal och det kan göra personen mindre osäker och orolig för det okända som väntar. 
En ny studie som ingår i forskningsprojektet PINCORE (personcentred information and communication in colorectal cancer care) syftar till att förbättra information och kommunikation vid kolorektal cancer.}, author = {Smith, Frida and Öhlén, Joakim and Carlsson, Eva and Friberg, Febe and Forsberg, Markus and Kokkinakis, Dimitrios}, year = {2012}, number = {5}, pages = {18--21}, } @inProceedings{Kokkinakis-Dimitrios2011-139977, title = {Evaluating the Coverage of three Controlled Health Vocabularies with Focus on Findings, Signs \& Symptoms}, abstract = {The medical domain is blessed with a magnitude of terminological resources of various characteristics, sizes, structure, depth and breadth of descriptive power, granularity etc. In this domain a particularly interesting and difficult entity type are signs, symptoms and findings which to a large extent are expressed in a periphrastic manner, sometimes by the use of figurative or metaphorical language, or contextualized using a wealth of vague variant expressions. We hypothesize therefore that no major official terminology source alone can accommodate for the variation and complexity present in real text data, such as electronic medical records, notes or health related documents. In this paper we evaluate the content of the three largest medical control vocabularies available for Swedish on extracted reference symptom lists and initiate a discussion on how we should proceed in order to accommodate for increased coverage on similar genres. }, booktitle = {Workshop on Creation, Harmonization and Application of Terminology Resources Co-located with NODALIDA 2011}, author = {Kokkinakis, Dimitrios}, year = {2011}, pages = {5}, } @inProceedings{Kokkinakis-Dimitrios2011-141311, title = {Health Portals and Clinical Phenotypes - Recognition using SNOMED CT}, abstract = {The medical domain is particularly well endowed with various sources of terminology. 
Usually, such sources vary with respect to size, structure, depth and breadth of descriptive power, granularity and applicability. This paper investigates the extent by which the largest available medical nomenclature for Swedish can cope with a particularly challenging and difficult to automatically acquire type of terminology, namely (clinical) phenotypes. We evaluated the content of the resource on extracted reference symptom lists from several popular health portals. The results indicate that a large number of such phenotypes are expressed using figurative language, or contextualized using a number of variant expressions. SNOMED CT cannot easily accommodate for such variation and vagueness expressed in real text data, unless we devise means to handle such variation, e.g. by the use of near synonym dictionaries, development and linking of consumer health vocabularies. The presented research has several implications since accurate identification of phenotypes can for instance increase the value of available data in decision making and thus allow automatic systems to dynamically correct inappropriate domain decisions.}, booktitle = {9th Scandinavian Conference on Health Informatics}, author = {Kokkinakis, Dimitrios}, year = {2011}, } @inProceedings{Kokkinakis-Dimitrios2011-141312, title = {What is the Coverage of SNOMED CT® on Scientific Medical Corpora?}, abstract = {This paper reports on the results of a large scale mapping of SNOMED CT on scientific medical corpora. The aim is to automatically assess the validity, reliability and coverage of the Swedish SNOMED-CT translation, the largest, most extensive available resource of medical terminology. The method described here is based on the generation of predominantly safe harbor term variants which together with simple linguistic processing and the already available SNOMED term content are mapped to large corpora. 
The results show that term variations are very frequent and this may have implication on technological applications (such as indexing and information retrieval, decision support systems, text mining) using SNOMED CT. Naïve approaches to terminology mapping and indexing would critically affect the performance, success and results of such applications. SNOMED CT appears not well-suited for automatically capturing the enormous variety of concepts in scientific corpora (only 6,3% of all SNOMED terms could be directly matched to the corpus) unless extensive variant forms are generated and fuzzy and partial matching techniques are applied with the risk of allowing the recognition of a large number of false positives and spurious results.}, booktitle = {Studies in Health Technology and Informatics / XXIII International Conference of the European Federation for Medical Informatics}, author = {Kokkinakis, Dimitrios}, year = {2011}, volume = {169}, } @inProceedings{Kokkinakis-Dimitrios2011-143877, title = {Reducing Complexity in Parsing Scientific Medical Data, a Diabetes Case Study}, abstract = {The aim of this study is to assemble and deploy various NLP components and resources in order to parse scientific medical data and evaluate the degree in which these resources contribute to the overall parsing performance. With parsing we limit our efforts to the identification of unrestricted noun phrases with full phrase structure and investigate the effects of using layers of semantic annotations prior to parsing. Scientific medical texts exhibit complex linguistic structure but also regularities that can be captured by pre-processing the texts with specialized semantically-aware tools. Our results show evidence of improved performance while the complexity of parsing is reduced. 
Parsed scientific texts and inferred syntactic information can be leveraged to improve the accuracy of higher-level tasks such as information extraction and enhance the acquisition of semantic relations and events.}, booktitle = {Workshop: Biomedical Natural Language Processing in conjunction with Recent Advances in Natural Language Processing (RANLP). Hissar, Bulgaria.}, author = {Kokkinakis, Dimitrios}, year = {2011}, } @inProceedings{Kokkinakis-Dimitrios2011-143875, title = {Character Profiling in 19th Century Fiction}, abstract = {This paper describes the way in which personal relationships between main characters in 19th century Swedish prose fiction can be identified using information guided by named entities, provided by an entity recognition system adapted to the 19th century Swedish language characteristics. Interpersonal relation extraction is based on the context between two relevant, identified person entities. The extraction process of the relationships also utilizes the content of on-line available lexical semantic resources (suitable vocabularies) and fairly standard context matching methods that provide a basic mechanism for identifying a wealth of interpersonal relations that hopefully can aid the reader of a 19th-century Swedish literary work to better understand its content and plot, and get a bird’s eye view on the landscape of the core story.}, booktitle = {Workshop: Language Technologies for Digital Humanities and Cultural Heritage in conjunction with the Recent Advances in Natural Language Processing (RANLP). Hissar, Bulgaria.}, author = {Kokkinakis, Dimitrios and Malm, Mats}, year = {2011}, } @article{Kokkinakis-Dimitrios2011-149930, title = {Natural language processing of clinical data with a focus on diffuse symptoms}, abstract = {The medical domain is well supported with a wealth of large, rich and varied controlled vocabularies and terminological resources. 
This paper investigates the extent by which the largest available medical nomenclature for Swedish, the Systematized Nomenclature of Medicine Clinical Terms (SNOMED CT), can handle a particularly challenging and difficult to automatically acquire type of terminology, namely (clinical) phenotypes. The aim of the study is to better understand phenotype contextualization in order to improve and enhance our knowledge of communicative events in various healthcare settings. Our approach can be seen as an exploratory one in which we believe to yield useful insights into the nature of how findings, symptoms and signs (i.e. clinical phenotypes in general) are expressed in real data. This study is initiated in the context of the project "Interpretation and understanding of functional symptoms in primary health care". The main research goal of which is to study health care interactions with patients suffering from Functional Somatic Syndromes (FSS). FSS are characterized by particular constellations of medically unexplained, often chronic symptoms, such as dizziness, fatigue, dyspepsia, muscle and joint pain. We use methods from the natural language processing field in order to investigate how symptom mentions are expressed and how available successful automated means are for capturing symptom descriptions both on collected written (patient records) and transcribed material (patient/nurse and patient/doctor encounters). We manually evaluated the content of the resource on the collected data and our results indicate that a large number of such phenotypes are expressed using figurative language, or contextualized using a number of variant expressions. SNOMED CT cannot easily accommodate for such variation and vagueness expressed in real text data, unless we devise means to handle such variation, e.g. by the use of near synonym dictionaries, development and linking of consumer health vocabularies. 
The presented research has several implications since accurate identification of phenotypes can for instance increase the value of available data in decision making and thus allow automatic systems to dynamically correct inappropriate domain decisions. We have evaluated the content of a large controlled vocabulary for Swedish on symptom descriptions in clinical texts.}, author = {Kokkinakis, Dimitrios}, year = {2011}, } @article{Kokkinakis-Dimitrios2011-149931, title = {Medicinska terminologier - officiella standarder och verklighet}, abstract = {Officiella medicinska termlistor hinner aldrig bli helt kompletta eller uppdaterade i tid med de senaste upptäckterna inom det (bio)medicinska fältet Växande behov av koppling mellan fack- och allmänspråk för praktiska (medicinskt orienterade) tillämpningar, t.ex. "din journal på nätet"-projektet Applikationer med indata som innehåller både fackspråk och allmänspråk - brist på täckande medicinska (elektroniska) ordböcker/termlistor med integrerad utförlig språklig och medicinsk information för lekmän finns inte transkriberade patient-läkarsamtal Använda existerande medicinska terminologier i språkteknologisk forskning som stöd för informationsutvinning - skapa strukturerade representationer av texter (samförekomstanalys; faktaextraktion och syntes; relation- och händelseextraktion; t.ex. mellan sjukdom - behandling - utfall få att kunna få ett bra underlag för att kunna förutsäga hur framtida behandlingar slår) Använda terminologin som ett medium för att underlätta kommunikationen mellan hälsotagare och hälsogivare t.ex. 
underlätta förståelse av medicinska termer av allmänheten }, author = {Kokkinakis, Dimitrios}, year = {2011}, } @inProceedings{Smith-Frida2011-152723, title = {Developing a toolkit for written information materials for patients with colorectal cancer undergoing elective surgery}, abstract = {This study examines language complexity, readability and suitability, of written health information materials given to patients undergoing colorectal cancer (CRC) surgery. The overall aim is to investigate whether the implementation of adapted, person-centred information and communication for patients with CRC undergoing elective surgery, can enhance the patients’ self-care beliefs and well-being during recovery in the phase following diagnosis and initial treatment. Several explorative, qualitative studies are planned and will function both as a basis for the proposed interventions and provide explanations for the actual processes leading to the desired outcomes. Patients’ knowledge enablement will be reached by several interrelated intervention strategies and specific activities. One of these strategies deals with means to facilitate patients’ information seeking patterns and the goal is to provide patients with written information materials according to preferences for complex and detailed or legible texts. Thus, the interventions planed aim to enhance a movement from receiving information and instructions to participating in knowing. Written and printed patient information material from 28 Swedish clinics for patients diagnosed with CRC undergoing elective surgery were selected for analysis by means of standard metrics and more elaborate language technology techniques. Various text parameters such as lexical variation, frequency bands and the use of terminology were examined. 
The material was also analysed using a Suitability Assessment Instrument in order to examine content, literacy demand, graphic illustrations, layout and typography, learning stimulation and finally cultural appropriateness. In addition, five focusgroups were conducted where patients were asked to give their experiences of using written information. Results from the language technology analysis showed a variety in materials, where it could be divided in to easy, medium and difficult to read and comprehend. Patients in focusgroups told they would like written materials to be levelled in order to gain information stepwise, but also stressed the importance of information given both orally and in writing, and that they must correspond. Using the SAM-instrument was a good complement for deeper understanding, and taking all three analyses in account, we aim to design a balanced toolkit for how to best design written information materials where a person tailored approach can be offered. }, booktitle = {Svenska Läkaresällskapets Riksstämman}, author = {Smith, Frida and Carlsson, Eva and Friberg, Febe and Kokkinakis, Dimitrios and Forsberg, Markus and Öhrn, Matilda and Öhlén, Joakim}, year = {2011}, } @inProceedings{Borin-Lars2010-118907, title = {Diabase: Towards a diachronic BLARK in support of historical studies}, booktitle = {Proceedings of LREC 2010}, author = {Borin, Lars and Forsberg, Markus and Kokkinakis, Dimitrios}, year = {2010}, } @inProceedings{Kokkinakis-Dimitrios2010-113194, title = {A Swedish Scientific Medical Corpus for Terminology Management and Linguistic Exploration}, abstract = {This paper describes the development of a new Swedish scientific medical corpus. We provide a detailed description of the characteristics of this new collection as well results for a number of term management tasks, including terminology validation and terminology extraction based on this material. 
Although the corpus is representative for the scientific medical domain it still covers a lot of specialised sub-disciplines such as “diabetes” and “osteoporosis” which makes it suitable for facilitating the production of smaller and more focused subcorpora. We have tried to address this issue by making explicit some features of the corpus in order to demonstrate the corpus usefulness particularly for the quality assessment of official terminologies such as the Systematized NOmenclature of MEDicine - Clinical Terms (SNOMED CT).}, booktitle = {Proceedings of the 7th international conference on Language Resources and Evaluation (LREC), Malta}, author = {Kokkinakis, Dimitrios and Gerdin, Ulla}, year = {2010}, } @inProceedings{Kokkinakis-Dimitrios2010-119441, title = {Linking SweFN++ with Medical Resources, towards a MedFrameNet for Swedish}, abstract = {In this pilot study we define and apply a methodology for building an event extraction system for the Swedish scientific medical and clinical language. Our aim is to find and describe linguistic expressions which refer to medical events, such as events related to diseases, symptoms and drug effects. In order to achieve this goal we have initiated actions that aim to extend and refine parts of the ongoing compilation of the Swedish FrameNet++ (SFN++), which, as its English original predecessor, is grounded in Frame Semantics which provides a sound theoretical ground for modeling and linking linguistic structures encountered in general language and in specific domains (after specialization). Using such resource we manually annotate domain texts to be used as training data for automatic event extraction by automated techniques.}, booktitle = {Proceedings of the Second Louhi Workshop on Text and Data Mining of Health Documents. 
A NAACL-HLT Workshop}, author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria}, year = {2010}, } @inProceedings{Kokkinakis-Dimitrios2010-119444, title = {Korpus för vårdens och omsorgens fackspråk.}, abstract = {Inom ramen för regeringens satsning ”Nationell IT-strategi för vård och omsorg” har Socialstyrelsen fått i uppdrag att översätta och anpassa begreppssystemet ’the Systematized Nomenclature of Medicine, Clinical Terms’ till svenska. Med hjälp av Läkartidningens digitala arkiv har vi utvecklat metoder för att effektivisera kvalitetssäkringen av terminnehållet. }, booktitle = {Humanistdagen 2010 - humaniora i dagens samhälle.}, author = {Kokkinakis, Dimitrios}, year = {2010}, } @article{Kokkinakis-Dimitrios2010-120480, title = {Läkartidningens arkiv i en ny skepnad - En resurs för forskare, läkare och allmänhet}, abstract = {I Sverige har det tagits fram en medicinsk korpus baserad på Läkartidningens digitala arkiv. Denna resurs möjliggör precisa sökningar och värdefull tillgång till medicinsk terminologisk information på olika nivåer. Dimitrios Kokkinakis från Göteborgs universitet och Ulla Gerdin från Socialstyrelsen presenterar projektet. }, author = {Kokkinakis, Dimitrios and Gerdin, Ulla}, year = {2010}, volume = {1/2010}, pages = {22--28}, } @inProceedings{Borin-Lars2010-110368, title = {The past meets the present in Swedish FrameNet++}, abstract = {The paper is about a recently initiated project which aims at the development of a Swedish FrameNet as an integral part of a larger lexical resource, hence the name “Swedish FrameNet++” (SweFN++). It focuses on reuse of free electronic resources and their role in the acquisition and population of Swedish frames. After a brief overview of Swedish resources, we reflect on three approaches to recycling the available lexical data in a semi-automatic manner. 
SweFN++ will be a multi-functional resource supporting research within lexicology and linguistics as well as different applications within computational lexicography and language technology, not to mention e-science.}, booktitle = {14th EURALEX International Congress}, author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios}, year = {2010}, pages = {269--281}, } @incollection{Borin-Lars2010-124517, title = {Literary onomastics and language technology}, booktitle = {Literary education and digital learning}, author = {Borin, Lars and Kokkinakis, Dimitrios}, year = {2010}, publisher = {Information Science Reference}, address = {Hershey - New York}, ISBN = {978-1-60566-932-8}, pages = {53--78}, } @inProceedings{Allvin-H.2010-120479, title = {Characteristics and Analysis of Finnish and Swedish Clinical Intensive Care Nursing Narratives}, abstract = {We present a comparative study of Finnish and Swedish free-text nursing narratives from intensive care. Although the two languages are linguistically very dissimilar, our hypothesis is that there are similarities that are important and interesting from a language technology point of view. This may have implications when building tools to support producing and using health care documentation. We perform a comparative qualitative analysis based on structure and content, as well as a comparative quantitative analysis on Finnish and Swedish Intensive Care Unit (ICU) nursing narratives. Our findings are that ICU nursing narratives in Finland and Sweden have many properties in common, but that many of these are challenging when it comes to developing language technology tools. }, booktitle = {Proceedings of the NAACL HLT 2010 Second Louhi Workshop on Text and Data Mining of Health Documents}, author = {Allvin, H. and Carlsson, E. and Dalianis, H. and Danielsson-Ojala, R. and Daudaravicius, V. and Hassel, M. and Kokkinakis, Dimitrios and Lundgren-Laine, H. 
and Nilsson, G. and Nytrø, Ø. and Salanterä, S. and Skeppstedt, M. and Suominen, H. and Velupillai, S.}, year = {2010}, pages = {53--60}, } @article{Kokkinakis-Dimitrios2010-125644, title = {Complementary Methods for De-identifying Sensitive Data with a focus on Clinical Discourse}, abstract = {In the era of the Electronic Health Record (EHR) the release of individual data for research, public health planning, health care statistics, monitoring of diagnostic tests, automated data collection for health care registries and tracking disease outbreaks are some of the areas in which the protection of Personal Health Information (PHI) has become an important concern. The purpose of this study is to adapt and apply synergetic methods to document de-identification, particularly clinical, or other sources of sensitive data. The main challenge and goal of this research is to retain important concepts and PHI in the documents in a standardized and neutral manner as means of encryption without violating the integrity of the PHI and without sacrificing the quality and intended meaning of the authors.}, author = {Kokkinakis, Dimitrios}, year = {2010}, volume = {45}, pages = {243--246}, } @article{Borin-Lars2010-129126, title = {Swedish FrameNet++}, author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios}, year = {2010}, } @article{Kokkinakis-Dimitrios2010-130212, title = {Is data scrubbing useful for anonymizing sensitive data?}, abstract = {The release of individual data for research, public health planning, health care statistics, monitoring of diagnostic tests, automated data collection for health care registries and tracking disease outbreaks are some of the areas in which the protection of Personal Health Information (PHI) has become an important concern. The purpose of this study is to adapt and apply synergetic methods to document de-identification, particularly in the clinical setting. 
The main challenge is to retain important concepts and PHI in the documents in a standardized and neutral manner as means of encryption without violating the integrity of the PHI and without sacrificing the quality and intended meaning of the authors.}, author = {Kokkinakis, Dimitrios}, year = {2010}, } @article{Kokkinakis-Dimitrios2010-130213, title = {Är "data scrubbing" en användbar metod för att anonymisera känsliga patientdata?}, abstract = {De senaste årens ökande användning av modern informationsteknik inom sjukvården har medfört en kraftig ökning av elektronisk dokumentation som rör patientens hälsotillstånd, vård och behandling. Vårddokumentationen blir både mer detaljerad och mer individuell, samtidigt som den uppdateras och förändras regelbundet. Patientjournalen är i första hand till för att bidra till en god och säker vård av patienten, men också en viktig informationskälla för FoU. Ett stort hinder för utnyttjandet av journalinformation som forskningskälla är de etiska och rättsliga problemen. För att kunna hantera och utnyttja dessa stora och ständigt växande informationsmängder ställs därmed högre krav på säker, skyddad och effektiv informationshantering.}, author = {Kokkinakis, Dimitrios}, year = {2010}, } @article{Kokkinakis-Dimitrios2010-130210, title = {Initiala resultat av en storskalig automatisk indexering av vetenskaplig litteratur med hela det svenska SNOMED CT - problem och möjligheter.}, abstract = {Syftet med denna studie är dels att skapa en stor samling svenska medicinska elektroniska texter, en korpus, och dels att validera och kvalitetssäkra existerande termer ur SNOMED CT (the Systematized NOmenclature of MEDicine - Clinical Terms) gentemot korpusinnehållet. På det sättet kan man få en objektiv uppfattning om SNOMED CT:s validitet, täckning och reliabilitet. 
Man kan även berika terminologin med nya termer eller termvarianter genom att automatiskt extrahera termkandidater inom olika delfackområden från korpusen med hjälp av olika statistiska och lingvistiska metoder. Resultat av de korpusbaserade, empiriska studierna ska kunna användas av terminologer i deras arbete med att göra SNOMED CT mer täckande, pålitlig och enhetlig. Samtidigt, genom användning av autentisk data, kan man försäkra sig om att termvarianterna (existerande eller nya) är vedertagna termer hos fackmän. I fall flera etablerade termvarianter (nya termkandidater) förekommer i korpusen kan dessa införas efter manuell granskning som synonymer till rekommenderade termer (med stöd av ett lämpligt granskningsgränssnitt) och därmed vidare utveckla innehållet i SNOMED CT. Följaktligen kommer vår presentation att innehålla en redovisning som bygger på tre huvudpelare – korpusuppbyggnad – termvalidering – termextrahering. Korpusen samlades in från två källor efter erhållet tillstånd. Texternas ursprung i korpusen kommer dels från Läkartidningens (LT) digitala arkiv och dels från DiabetologNytts (DN) digitala arkiv .}, author = {Kokkinakis, Dimitrios}, year = {2010}, } @inProceedings{Borin-Lars2009-110343, title = {Thinking Green: Toward Swedish FrameNet++}, abstract = {Access to multi-layered lexical, grammatical and semantic information representing text content is a prerequisite for efficient automatic understanding and generation of natural language. A FrameNet is considered a valuable resource for both linguistics and language technology research that may contribute to the achievement of these goals. 
Currently, FrameNet-like resources exist for a few languages, including some domain-specific and multilingual initiatives (Dolbey et al., 2006; Boas, 2009; Uematsu et al., 2009; Venturi et al., 2009), but are unavailable for most languages, including Swedish, although there have been some pilot studies exploring the semi-automatic acquisition of Swedish frames (Johansson & Nugues, 2006; Borin et al., 2007). At the University of Gothenburg, we are now embarking on a project to build a Swedish FrameNet-like resource. A novel feature of this project is that the Swedish FrameNet will be an integral part of a larger many-faceted lexical resource. Hence the name Swedish FrameNet++ (SweFN++). }, booktitle = {FrameNet Masterclass and Workshop}, author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios}, year = {2009}, } @incollection{Kokkinakis-Dimitrios2009-73979, title = {Lexical granularity for automatic indexing and means to achieve it - the case of Swedish MeSH®}, abstract = {The identification and mapping of terminology from large repositories of life science data onto concept hierarchies constitute an important initial step for a deeper semantic exploration of unstructured textual content. Accurate and efficient mapping of this kind is likely to provide better means of enhancing indexing and retrieval of text, uncovering subtle differences, similarities and useful patterns, and hopefully new knowledge, among complex surface realisations, overlooked by shallow techniques based on various forms of lexicon look-up approaches. However, a finer-grained level of mapping between terms as they occur in natural language and domain concepts is a cumbersome enterprise that requires various levels of processing in order to make explicit relevant linguistic structures. This chapter highlights some of the challenges encountered in the process of bridging free to controlled vocabularies and thesauri and vice versa. 
We investigate how the extensive variability of lexical terms in authentic data can be efficiently projected to hierarchically structured codes, while means to increase the coverage of the underlying lexical resources are also investigated.}, booktitle = {Information Retrieval in Biomedicine: Natural Language Processing for Knowledge Integration}, author = {Kokkinakis, Dimitrios}, year = {2009}, publisher = {IGI Global}, address = {Hershey, Pennsylvania}, } @article{Kokkinakis-Dimitrios2009-105133, title = {Shallow Features for Differentiating Disease-Treatment Relations using Supervised Learning; a pilot study.}, abstract = {Clinical narratives provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a challenge for practitioners in the language technology field, particularly because of the nature of the texts (excessive use of terminology, abbreviations, orthographic term variation), the significant opportunities for clinical research that such material can provide and the potentially broad impact that clinical findings may have in every day life. It is therefore recognized that the capability to automatically extract key concepts and their relationships from such data will allow systems to properly understand the content and knowledge embedded in the free text which can be of great value for applications such as information extraction and question & answering. This paper gives a brief presentation of such textual data and its semantic annotation, and discusses the set of semantic relations that can be observed between diseases and treatments in the sample. The problem is then designed as a supervised machine learning task in which the relations are tried to be learned using pre-annotated data. 
The challenges designing the problem and empirical results are presented.}, author = {Kokkinakis, Dimitrios}, year = {2009}, volume = {5729}, pages = {395--402}, } @article{Kokkinakis-Dimitrios2009-105136, title = {Uppbyggandet av en svensk medicinsk korpus för termvalidering och termextrahering - hur bra täcker SNOMED CT olika delfackområden?}, abstract = {Syftet med denna studie är dels att skapa en stor samling svenska medicinska elektroniska texter, en korpus, och dels att validera och kvalitetssäkra existerande termer ur SNOMED CT (the Systematized NOmenclature of MEDicine - Clinical Terms) gentemot korpusinnehållet. På det sättet kan man få en objektiv uppfattning om SNOMED CT:s validitet, täckning och reliabilitet. Man kan även berika terminologin med nya termer eller termvarianter genom att automatiskt extrahera termkandidater inom olika delfackområden från korpusen med hjälp av olika statistiska och lingvistiska metoder. Resultat av de korpusbaserade, empiriska studierna ska kunna användas av terminologer i deras arbete med att göra SNOMED CT mer täckande, pålitlig och enhetlig. Samtidigt, genom användning av autentisk data, kan man försäkra sig om att termvarianterna (existerande eller nya) är vedertagna termer hos fackmän. I fall flera etablerade termvarianter (nya termkandidater) förekommer i korpusen kan dessa införas efter manuell granskning som synonymer till rekommenderade termer (med stöd av ett lämpligt granskningsgränssnitt) och därmed vidare utveckla innehållet i SNOMED CT. Följaktligen kommer vår presentation att innehålla en redovisning som bygger på tre huvudpelare – korpusuppbyggnad – termvalidering – termextrahering. Korpusen samlades in från två källor efter erhållet tillstånd. Texternas ursprung i korpusen kommer dels från Läkartidningens (LT) digitala arkiv och dels från DiabetologNytts (DN) digitala arkiv . 
}, author = {Kokkinakis, Dimitrios and Gerdin, Ulla}, year = {2009}, } @inProceedings{Kokkinakis-Dimitrios2009-105141, title = {Kvalitetssäkring av SNOMED CT med hjälp av Läkartidningens arkiv. }, abstract = {Inom ramen för regeringens satsning ”Nationell IT-strategi för vård och omsorg” har Socialstyrelsen fått i uppdrag att översätta och anpassa begreppssystemet ’the Systematized Nomenclature of Medicine, Clinical Terms’ (SNOMED CT) till svenska. Arbetet är både omfattande och tidskrävande samtidigt som uppdragstagaren har krav om kvalitetssäkring av översättningen. Hur kan Läkartidningens arkiv bidra till kvalitetssäkringen? Med hjälp av Läkartidningens digitala arkiv, LDA, (årgångarna 1996-2009) har vi utvecklat metoder för att effektivisera kvalitetssäkringen av olika SNOMED CT-urval (t.ex. diabetestermer). Det innebär att vi underlättar för utförandet av empiriska, SNOMED CT-relaterade studier, som t.ex. framtagning av underlag om termernas användning, variation och frekvensdistribution över tid. Arkivets förädling: LDA:t omvandlades till ett enhetligt textbaserat format och textinnehållet normaliserades med avseenden på dokumentformat och teckenkodning för att kunna skapa ett bra underlag för den efterföljande språkteknologiska analysen. Alla artiklar i varje publicerad årgång extraherades och märktes upp dels med olika slags metainformation (t.ex. genretillhörighet) dels med lingvistisk och semantisk information, sammanlagt 27 000 artiklar. Den språkteknologiska bearbetningen innefattade automatiskt tillägg av lingvistisk information som t.ex. ordklasstillhörighet för varje ord i korpusen och automatiskt, semantisk mappning dels till den svenska MeSH-tesaurusen och dels till delar av den svensköversatta SNOMED-hierarkin. LDA i en ny skepnad: LDA utgör sedan länge en värdefull svensk medicinsk resurs för alla som yrkesmässigt jobbar med termer och språk. 
Vi har dock bidragit med att göra textmaterialet ännu mer välstrukturerat och förädlat, som kan vara till hjälp för explorativa studier där sökningar kan förfinas på ett flertal sätt och därmed ge forskare möjligheter att göra djupare innehållsanalyser av texterna och samla grundläggande kunskaper inom olika ämnesområden. Kombinationen av enstaka termer och ord med lingvistisk och semantisk information ger unika möjligheter till att skaffa information och generera fakta som kan leda till nya hypoteser och eventuellt ny kunskap om olika aspekter som gäller termanvändning och variation och vi kommer att redovisa exempel på sådana analyser. }, booktitle = {Svenska Läkaresällskapets Riksstämman }, author = {Kokkinakis, Dimitrios and Gerdin, Ulla}, year = {2009}, } @inProceedings{Kokkinakis-Dimitrios2009-94705, title = {Shallow Features for Differentiating Disease-Treatment Relations using Supervised Learning, a pilot study}, abstract = {Clinical narratives provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a challenge for practitioners in the language technology field, particularly because of the nature of the texts (excessive use of terminology, abbreviations, orthographic term variation), the significant opportunities for clinical research that such material can provide and the potentially broad impact that clinical findings may have in every day life. It is therefore recognized that the capability to automatically extract key concepts and their relationships from such data will allow systems to properly understand the content and knowledge embedded in the free text which can be of great value for applications such as information extraction and question & answering. This paper gives a brief presentation of such textual data and its semantic annotation, and discuss the set of semantic relations that can be observed between diseases and treatments in the sample. 
The problem is then designed as a machine learning task in which the relations are tried to be learned in a supervised fashion, using pre-annotated data. The challenges designing the problem and empirical results are presented.}, booktitle = {Proceedings of the 12th International Conference TSD (Text, Speech and Dialogue)}, publisher = {Springer Verlag}, series = {LNCS/LNAI}, author = {Kokkinakis, Dimitrios}, year = {2009}, } @article{Kokkinakis-Dimitrios2009-105140, title = {Issues on Quality Assessment of SNOMED CT® Subsets - Term Validation and Term Extraction}, abstract = {The aim of this paper is to apply and develop methods based on Natural Language Processing for automatically testing the validity, reliability and coverage of various Swedish SNOMED-CT subsets, the Systematized NOmenclature of MEDicine - Clinical Terms a multiaxial, hierarchical classification system which is currently being translated from English to Swedish. Our work has been developed across two dimensions. Initially a Swedish electronic text collection of scientific medical documents has been collected and processed to a uniform format. Secondly, a term processing activity has been taken place. In the first phase of this activity, various SNOMED CT subsets have been mapped to the text collection for evaluating the validity and reliability of the translated terms. In parallel, a large number of term candidates have been extracted from the corpus in order to examine the coverage of SNOMED CT. Term candidates that are currently not included in the Swedish SNOMED CT can be either parts of compounds, parts of potential multiword terms, terms that are not yet been translated or potentially new candidates. In order to achieve these goals a number of automatic term recognition algorithms have been applied to the corpus. 
The results of the later process is to be reviewed by domain experts (relevant to the subsets extracted) through a relevant interface who can decide whether a new set of terms can be incorporated in the Swedish translation of SNOMED CT or not. }, author = {Kokkinakis, Dimitrios and Gerdin, Ulla}, year = {2009}, } @inProceedings{Kokkinakis-Dimitrios2008-73972, title = {Applying MeSH® to the (Swedish) Clinical Domain - Evaluation and Lessons learned}, abstract = {Medical discharge summaries and clinical notes provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a potential goldmine for both medical scientists as well as practitioners in the language technology field. The capability to extract the key concepts and their relationships from such data can be of great value for knowledge management tasks such as indexing, data interchange, data aggregation and clinical decision support. The purpose of this work is to get insights into the feasibility of applying the content of a controlled vocabulary, the Medical Subject Headings (MeSH) to a sample of electronic discharge letters (i.e. free text clinical notes). We explore the application of natural language processing (NLP) techniques to the challenge of efficiently detecting the terminology, as encoded in MeSH and we evaluate MeSH in this setting, showing that a lot of work remains to be done in order to increase the coverage of the resource both in terms of its breadth and depth. 
}, booktitle = {Proceedings of the 6th Scandinavian Health Informatics and the 12th Swedish National Term Conference}, author = {Kokkinakis, Dimitrios and Thurin, Anders}, year = {2008}, } @inProceedings{Kokkinakis-Dimitrios2008-73973, title = {MeSH® - From a Controlled Vocabulary to a Processable Resource}, abstract = {Large repositories of life science data in the form of domain-specific literature, textual databases and other large specialised textual collections (corpora) in electronic form increase on a daily basis to a level beyond the human mind can grasp and interpret. As the volume of data continues to increase, substantial support from new information technologies and computational techniques grounded in the form of the ever increasing applications of the mining paradigm is becoming apparent. These emerging technologies play an increasingly critical role in aiding research productivity, and they provide the means for reducing the workload for information access and decision support and for speeding up and enhancing the knowledge discovery process. In order to accomplish these higher level goals and support the mining approach however, a fundamental and unavoidable starting point is the identification and mapping of terminology from the textual, unstructured data onto biomedical knowledge sources and concept hierarchies. In this paper, we provide a description of the work regarding terminology recognition using the Swedish MeSH® thesaurus and its corresponding English original source. We explain the various transformation and refinement steps applied to the original database tables into a fully-fledged processing oriented annotating resource. Particular attention has been given to a number of these steps in order to automatically map the extensive variability of lexical terms to structured MeSH® nodes. Issues on annotation and coverage are also discussed. 
}, booktitle = {Proceedings of the 6th Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios}, year = {2008}, } @inProceedings{Kokkinakis-Dimitrios2008-73977, title = {Semantic Pre-processing for Complexity Reduction in Parsing Medical Texts}, abstract = {Collection and multilayer annotation of textual corpora in specialized fields, such as (bio-) medicine is an important enterprise for empirically-based, data-driven language processing, human language technologies and linguistic research. One of the most important and difficult to achieve piece of annotation that can be made available is at the syntactic and functional level, i.e. parsing, particularly in sublanguages where specialized tools have to be adapted which is considered too expensive for many applications. In this paper, we describe a way to reduce the complexity of parsing in medical discourse by the use of a semantic pre-processing stage guided by annotations provided by medical thesauri and other domain-specific lexical resources. Parsing biomedical texts, apart from the challenge it possesses (deviant and idiosyncratic uses of vocabulary and syntax), is required in order to support and improve technologies such as Information Extraction and Retrieval, enhance the acquisition of relations between terminology support terminology management and population of medical semantic resources.}, booktitle = {Proceedings of the 21th Conference on the European Federation for Medical Informatics (MIE 2008)}, author = {Kokkinakis, Dimitrios}, year = {2008}, } @inProceedings{Kokkinakis-Dimitrios2008-73976, title = {MEDLEX+: An Integrated Corpus-Lexicon Medical Workbench for Swedish}, abstract = {This paper reports on ongoing work on developing a medical corpus-lexicon workbench for Swedish, MedLex+. 
At the moment the workbench incorporates: (i) an annotated collection of medical texts, 25 million tokens, 50,000 documents, (ii) a number of language processing components, including tools for collocation extraction, compound segmentation and thesaurus-based semantic annotation, and (iii) a lexical database of medical terms (5,000 entries). MedLex+ is a multifunctional lexical resource due to its structural design and content which can be easily queried. The medical workbench is intended to support lexicographers in their work on compiling lexicons and also lexicon users more or less initiated in the medical domain. It can also assist researchers working in the fields of lexical semantics and natural language processing (NLP) with focus on medical language. The linguistically and semantically annotated medical texts in combination with a set of queries turn the corpus into a rich repository of semasiological and onomasiological knowledge about medical terminology and their linguistic, lexical and pragmatic properties. These properties are recorded in the lexical database with a cognitive profile. The MedLex+ workbench seems to offer constructive help in many different lexical tasks. }, booktitle = {Proceedings of the 13th EURALEX}, author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria}, year = {2008}, } @inProceedings{Kokkinakis-Dimitrios2008-73975, title = {Semantic Relation Mining of Solid Compounds in Medical Corpora.}, abstract = {In the context of scientific and technical texts, meaning is usually embedded in noun compounds and the semantic interpretation of these compounds deals with the detection and semantic classification of the relation that holds between the compound’s constituents. 
Semantic relation mining, the technology applied for marking up, interpreting, extracting and classifying relations that hold between pairs of words, is an important enterprise that contribute to deeper means of enhancing document understanding technologies, such as Information Extraction, Question Answering, Summarization, Paraphrasing, Ontology Building and Textual Entailment. This paper explores the application of assigning semantic descriptors taken from a multilingual medical thesaurus to a large sample of solid (closed form) compounds taken from large Swedish medical corpora, and determining the relation(s) that may hold between the compound constituents. Our work is inspired by previous research in the area of using lexical hierarchies for identifying relations between two-word noun compounds in the medical domain. In contrast to previous research, Swedish, as other Germanic languages, require further means of analysis, since compounds are written as one sequence with no white space between the words, e.g. virus diseases vs. virussjukdomar, which makes the problem more challenging, since solid compounds are harder to identify and segment.}, booktitle = {Proceedings of the 21th Conference on the European Federation for Medical Informatics (MIE 2008)}, author = {Kokkinakis, Dimitrios}, year = {2008}, ISBN = {9786611733414}, } @inProceedings{Kokkinakis-Dimitrios2008-73974, title = {A Semantically Annotated Swedish Medical Corpus}, abstract = {With the information overload in the life sciences there is an increasing need for annotated corpora, particularly with biological and biomedical entities, which is the driving force for data-driven language processing applications and the empirical approach to language study. Inspired by the work in the GENIA Corpus, which is one of the very few of such corpora, extensively used in the biomedical field, and in order to fulfil the needs of our research, we have collected a Swedish medical corpus, the MEDLEX Corpus. 
MEDLEX is a large structurally and linguistically annotated document collection, consisting of a variety of text documents related to various medical text subfields, and does not focus at a particular medical genre, due to the lack of large Swedish resources within a particular medical subdomain. Out of this collection we selected 300 documents which were manually examined by two human experts who inspected, corrected and/or accordingly modified the automatically provided annotations according to a set of provided labelling guidelines. The annotations consist of medical terminology provided by the Swedish and English MeSH® (Medical Subject Headings) thesauri as well as named entity labels provided by an enhanced named entity recognition software.}, booktitle = {Proceedings of the 6th Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios}, year = {2008}, } @inProceedings{Borin-Lars2007-44951, title = {Medical frames as target and tool}, booktitle = {FRAME 2007: Building Frame Semantics resources for Scandinavian and Baltic languages. 
(Nodalida 2007 workshop proceedings)}, author = {Borin, Lars and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios}, year = {2007}, ISBN = {978-91-976939-0-5}, pages = {11--18}, } @inProceedings{Borin-Lars2007-44954, title = {Naming the past: Named entity and animacy recognition in 19th century Swedish literature}, booktitle = {ACL 2007 Workshop on Language Technology for Cultural Heritage Data (LaTeCH 2007)}, author = {Borin, Lars and Kokkinakis, Dimitrios and Olsson, Leif-Jöran}, year = {2007}, pages = {1--8}, } @techreport{Borin-Lars2007-53590, title = {Empowering the patient with language technology}, author = {Borin, Lars and Grabar, Natalia and Hallett, Catalina and Hardcastle, David and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios and Williams, Sandra and Willis, Alistair}, year = {2007}, institution = {Göteborg University}, address = {Göteborg}, } @article{Kokkinakis-Dimitrios2007-45193, title = {Anonymisation of Swedish Clinical Data}, abstract = {There is a constantly growing demand for exchanging clinical and health-related information electronically. In the era of the Electronic Health Record the release of individual data for research, health care statistics, monitoring of new diagnostic tests and tracking disease outbreak alerts are some of the areas in which the protection of (patient) privacy has become an important concern. 
In this paper we present a system for automatic anonymisation of Swedish clinical free text, in the form of discharge letters, by applying generic named entity recognition technology.}, author = {Kokkinakis, Dimitrios and Thurin, Anders}, year = {2007}, volume = {4594}, pages = {237--241}, } @article{Kokkinakis-Dimitrios2007-45195, title = {Identification of Entity References in Hospital Discharge Letters}, abstract = {In the era of the Electronic Health Record the release of medical narrative textual data for research, for health care statistics, for monitoring of new diagnostic tests and for tracking disease outbreak alerts imposes tough restrictions by various public authority bodies for the protection of (patient) privacy. In this paper we present a system for automatic identification of named entities in Swedish clinical free text, in the form of discharge letters, by applying generic named entity recognition technology with minor adaptations}, author = {Kokkinakis, Dimitrios and Thurin, Anders}, year = {2007}, } @article{Kokkinakis-Dimitrios2007-45194, title = {Lexical Parameters, Based on Corpus Analysis of English and Swedish Cancer Data, of Relevance for NLG}, abstract = {This paper reports on a corpus-based, contrastive study of the Swedish and English medical language in the cancer sub-domain. It is focused on the examination of a number of linguistic parameters differentiating two types of cancer-related textual material, one intended for medical experts and one for laymen. Language-dependent and language independent characteristics of the textual data between the two languages and the two registers are examined and compared. 
The aim of the work is to gain insights into the differences between lay and expert texts in order to support natural language generation (NLG) systems.}, author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria and Hallett, Catalina and Hardcastle, David}, year = {2007}, } @inProceedings{Kokkinakis-Dimitrios2007-47933, title = {Automatic Indexing using the English and Swedish MeSH®, a Note on Coverage}, abstract = {The identification and mapping of terminology onto a concept hierarchy is the very first stage of semantic, deeper analysis of textual documents. Work regarding automatic terminology recognition using the Swedish MeSH® thesaurus (Medical Subject Headings, edition 2006) and its corresponding English source is reported. A number of transformations and refinements were applied to the original lexical database in order to enhance the automatic process of mapping the extensive variability of lexical terms in authentic data to structured MeSH codes. Means to increase the coverage of both thesauruses for automatic indexing of Swedish medical data are investigated.}, booktitle = {Svenska Läkaresällskapets Riksstämma 2007}, author = {Kokkinakis, Dimitrios}, year = {2007}, } @techreport{Åhlfelt-Hans2006-34047, title = {Literature Review on Patient-Friendly Documentation Systems}, author = {Åhlfelt, Hans and Borin, Lars and Daumke, Philipp and Grabar, Natalia and Hallett, Catalina and Hardcastle, David and Kokkinakis, Dimitrios and Mancini, Clara and Marko, Kornel and Merkel, Magnus and Pietsch, Christian and Power, Richard and Scott, Donia and Silvervarg, Annika and Toporowska Gronostaj, Maria and Williams, Sandra and Willis, Alistair}, year = {2006}, institution = {Göteborg University}, address = {Göteborg}, } @inProceedings{Kokkinakis-Dimitrios2006-33936, title = {Recognizing Acronyms and their Definitions in Swedish Medical Texts}, abstract = {This paper addresses the task of recognizing acronym-definition pairs in Swedish (medical) texts as well as the 
compilation of a freely available sample of such manually annotated pairs. A material suitable not only for supervised learning experiments, but also as a testbed for the evaluation of the quality of future acronym-definition recognition systems. There are a number of approaches to the identification described in the literature, particularly within the biomedical domain, but none of those addresses the variation and complexity exhibited in a language other than English. This is realized by the fact that we can have a mixture of two languages in the same document and/or sentence, i.e. Swedish and English; that Swedish is a compound language that significantly deteriorates the performance of previous approaches (without adaptations) and, most importantly, the fact that there is a large variation of possible acronym-definition permutations realized in the analysed corpora, a variation that is usually ignored in previous studies. }, booktitle = {Proceedings of the 5th Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios and Dannélls, Dana}, year = {2006}, } @inProceedings{Kokkinakis-Dimitrios2006-33937, title = {Collection, Encoding and Linguistic Processing of a Swedish Medical Corpus - The MEDLEX Experience.}, abstract = {Corpora annotated with structural and linguistic characteristics play a major role in nearly every area of language processing. During recent years a number of corpora and large data sets became known and available to research even in specialized fields such as medicine, but still however, targeted predominantly for the English language. This paper provides a description of the collection, encoding and linguistic processing of an ever growing Swedish medical corpus, the MEDLEX Corpus. MEDLEX consists of a variety of text-documents related to various medical text genres. 
The MEDLEX Corpus has been structurally annotated using the Corpus Encoding Standard for XML (XCES), lemmatized and automatically annotated with part-of-speech and semantic information (extended named entities and the Medical Subject Headings, MeSH, terminology). The results from the processing stages (part-of-speech, entities and terminology) have been merged into a single representation format and syntactically analysed using a cascaded finite state parser. Finally, the parser’s results are converted into a tree structure that follows the TIGER-XML coding scheme, resulting a suitable for further exploration and fairly large Treebank of Swedish medical texts. }, booktitle = {Proceedings of the 5th Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios}, year = {2006}, } @inProceedings{Kokkinakis-Dimitrios2006-33938, title = {Lay Language versus Professional Language within the Cardiovascular Subdomain - a Contrastive Study}, abstract = {This paper reports on a corpus-based, contrastive study of Swedish medical language. It is focused on the vocabulary used in two types of medical textual material: professional portals and web-based consumer sites within the domain of cardiovascular disorders. Linguistic, statistical and quantitatively based readability studies are considered in order to find the typical language-dependent and, possibly, language independent characteristics of the material examined and suggest concrete measures that might bridge the gap in medical vocabulary as used by laypersons/consumers and professionals. }, booktitle = {Proceedings of the 2006 WSEAS Int. Conf. 
on Cellular \& Molecular Biology, Biophysics \& Bioengineering}, author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria}, year = {2006}, } @inProceedings{Kokkinakis-Dimitrios2006-33925, title = {Developing Resources for Swedish Bio-Medical Text Mining}, abstract = {Collection and annotation of corpora in specialized fields, such as medicine, and particularly for lesser-spoken languages, than for instance English, is an important enterprise for the continuous development and growth of language technology research, for resource development and for the implementation of practical applications for these languages. In this paper, we describe our ongoing efforts to build a large Swedish medical corpus, the MEDLEX Corpus, how we combine generic named entity and terminology recognition for the detailed annotation of the corpus, and how these annotations are further utilized by an annotations-aware cascaded finite-state parser. }, booktitle = {Proceedings of the 2nd International Symposium on Semantic Mining in Biomedicine (SMBM)}, author = {Kokkinakis, Dimitrios}, year = {2006}, } @article{Kokkinakis-Dimitrios2006-34032, title = {Comparing Lay and Professional Language in Cardiovascular Disorders Corpora.}, abstract = {This paper reports on a corpus-based, contrastive study of Swedish medical language. It is focused on the vocabulary used in two types of medical textual material: professional portals and web-based consumer sites within the domain of cardiovascular disorders. Linguistic, statistical and quantitatively based readability studies are considered in order to find the typical language-dependent and, possibly, language independent characteristics of the material examined and suggest concrete measures that might bridge the gap in medical vocabulary as used by laypersons/consumers and professionals. 
}, author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria}, year = {2006}, volume = {3}, number = {6}, pages = {429--437}, } @incollection{Kokkinakis-Dimitrios2006-56225, title = {Att bygga en språkbro mellan allmänhet och vårdpersonal - språket i texter om hjärt-kärlsjukdomar}, booktitle = {Humanistdag-boken}, author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria and Johansson Kokkinakis, Sofie}, year = {2006}, publisher = {Göteborgs universitet}, address = {Göteborg}, } @article{Kokkinakis-Dimitrios2006-45197, title = {Towards a Swedish Medical Treebank}, abstract = {In this paper, we present our current activities towards the compilation and the multi-layered annotation of a domain-dependent corpus for Swedish in the area of medicine. The focus of the paper is based on the description of the constituent structure and functionally oriented annotation of the corpus. Moreover, the annotation scheme adopted, which incorporates three main layers of linguistic processing, lexical analysis, shallow semantic analysis and syntactic processing, will be exemplified. For the syntactic analysis we use a cascaded finite-state parser, aware of the shallow semantic annotations produced. The result of this analysis, including syntactic parsing and shallow semantic analysis, is transformed into the TIGER-XML interchange format. 
Our goal is to produce a large, rich in annotations, medical treebank suitable for both corpus-based grammar learning systems, for semantic relation extraction and for linguistic exploration of theoretical nature.}, author = {Kokkinakis, Dimitrios}, year = {2006}, } @inProceedings{Kokkinakis-Dimitrios2006-34033, title = {Towards a Swedish Medical Treebank}, booktitle = {5th Conference on Treebanks and Linguistic Theories}, author = {Kokkinakis, Dimitrios}, year = {2006}, } @inProceedings{Marko-Kornel2006-34049, title = {Cross-Lingual Alignment of Medical Lexicons}, abstract = {We present an approach for the creation of a multilingual medical dictionary for the biomedical domain. In a first step, available monolingual lexical resources are compiled into a common interchange format. Secondly, according to a linking format decided by the authors, the cross-lingual mappings of lexical entries are added. We show how these mappings can be generated using a morpho-semantic term normalization engine, which captures intra- as well as interlingual synonymy relationships on the level of subwords.}, booktitle = {Language Resources and Evaluation}, author = {Marko, Kornel and Baud, Robert and Zweigenbaum, Pierre and Merkel, Magnus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios and Schulz, Stefan}, year = {2006}, volume = {2006}, pages = {5--8}, } @article{Kokkinakis-Dimitrios2005-33934, title = {Identification of Named Entities and Medical Terminology in Swedish Patient Records.}, abstract = {An anonymisation or de-identification system can provide a broad spectrum of services related to the growing demands for better forms of dissemination of information about individuals found in electronic patient records. 
The range of these services includes: health care statistics and sharing clinical information across institutions; validation and monitoring of new diagnostic tests; release of individual data by protecting identities or hints that can identify individuals, and appropriate mechanisms to provide only the information necessary to the professional who has the need to know. This paper describes our first experiments intended for automatic anonymisation of Swedish electronic patient records using a generic system for Named Entity Recognition. There are eight main types of entities that the system recognizes: “person”, “location”, “organisation”, “event”, “object”, “work & art”, “time” and “measure”. To this set, two new modules have been recently developed. One is dedicated to animacy recognition, a modules based on a number of clues (such as key words utilized in the person’s module grammar and verbs requiring animate subject), and another one designated to identify and annotate medical terminology. The latter module annotates names of drugs and chemical substances, diseases, symptoms, organisms and anatomical terms. A detailed evaluation of the system, on authentic patient records, is given both for the named, medical and animate entities. }, journal = {WSEAS Transactions on BIOLOGY and BIOMEDICINE}, author = {Kokkinakis, Dimitrios}, year = {2005}, volume = {2}, number = {3}, pages = {312--317}, } @inProceedings{Kokkinakis-Dimitrios2004-33928, title = {Reducing the Effect of Name Explosion.}, abstract = {The problem of new vocabulary is particularly frustrating once one begins to work with large corpora of real texts. The identification of unknown proper nouns, chains of non-proper nouns and even common words that function as names (i.e. named entities) in unrestricted text, and their subsequent classification into some sort of semantic type is a challenging and difficult problem in Natural Language Processing (NLP). 
Systems that perform Information Extraction, Information Retrieval, Question-Answering, Topic Detection, Text Mining, Machine Translation and annotation for the Semantic Web have highlighted the need for the automatic recognition of such entities, since their constant introduction in any domain, however narrow, is very common and needs special attention. Proper names are usually not listed in defining or other common types of dictionaries, they may appear in many alias forms and abbreviated variations, which makes their listing infeasible. This paper deals with some extensions to the “traditional” named entity recognition approaches. It puts emphasis on more name classes and their further subclassification into finer sets. An operative system that can be tested and evaluated on-line implements the ideas described in this paper.}, booktitle = {Proceedings of the LREC Workshop: Beyond Named Entity Recognition, Semantic labelling for NLP tasks. ourth Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios}, year = {2004}, } @inProceedings{Kokkinakis-Dimitrios2004-33932, title = {Intelligent Building of Language Resources for HLT Applications}, booktitle = {Proceedings of the LREC Workshop: Amazing Utility of Parallel and Comparable Corpora. 
Fourth Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios and Samiotou, Anna and Kranias, Lambros}, year = {2004}, } @book{Kokkinakis-Dimitrios2001-125224, title = {A Framework for the Acquisition of Lexical Knowledge; Description and Application}, author = {Kokkinakis, Dimitrios}, year = {2001}, adress = {Göteborg}, ISBN = {LIBRIS-ID:8245865}, } @article{JohanssonKokkinakis-Sofie1999-55910, title = {Beskrivning av några problem vid automatisk analys av text}, author = {Johansson Kokkinakis, Sofie and Kokkinakis, Dimitrios}, year = {1999}, volume = {No 25}, pages = {88--95}, } @article{Kokkinakis-Dimitrios1999-56218, title = {Automatisk betydelseidentifiering på cykelnivå m.h.a. GLDB}, author = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie}, year = {1999}, } @article{Kokkinakis-Dimitrios1999-56216, title = {A Cascaded Finite-State Parser for Syntactic Analysis of Swedish}, author = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie}, year = {1999}, } @techreport{Kokkinakis-Dimitrios1999-56213, title = {Sense Tagging at the Cycle-Level Using GLDB}, author = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie}, year = {1999}, } @techreport{Kokkinakis-Dimitrios1998-56209, title = {A Cascaded Finite-State Parser for Syntactic Analysis of Swedish}, author = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie}, year = {1998}, publisher = {Svenska språket}, adress = {Göteborg}, }