@inProceedings{albertin-kokkinakis-2024-defining-342782, title = {Defining Cohesion Features in the Study of Discourse Properties in Cognitive Impairment }, abstract = {The analysis of discourse and pragmatics, which deteriorate alongside other linguistic levels in cognitive decline, can enhance our understanding of dementia-related language patterns and contribute to the improvement of automated diagnostic tools. This study focuses on discourse cohesion, specifically investigating three linguistic phenomena: reference, lexical repetition, and connectives. Six features related to these categories were defined and automatically extracted from an Italian corpus of semi-spontaneous speech, collected from patients with early dementia, MCI subjects, and healthy controls. Some of these features proved significant in distinguishing among the three groups. Additional quantitative analysis revealed notable differences in the use of these elements, suggesting a potential link between their degradation and cognitive decline.}, booktitle = {Tenth Swedish Language Technology Conference (SLTC)}, author = {Albertin, Giorgia and Kokkinakis, Dimitrios}, year = {2024}, pages = {4}, } @inProceedings{kokkinakis-2024-from-336089, title = {From Zipf distribution to Universal Dependencies - Interactive Notebooks for Swedish Text Analysis }, abstract = {Notebook-based environments are powerful (web-based) interactive development resources for conducting exploratory (textual) data analysis (EDA). These environments allow the embedding of code (code snippets in ‛code cells’) which can be easily executed with the results immediately presented into the user’s window. 
This paper introduces some basic exploratory tools and techniques using JupyterLab notebooks, applied to Swedish using a subcorpus that address various topics related to the COVID-19 pandemic published during January-December 2021}, booktitle = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden}, author = {Kokkinakis, Dimitrios}, year = {2024}, publisher = {Linköping University Electronic Press}, address = {Linköping }, ISBN = {978-91-8075-512-2}, } @inProceedings{kokkinakis-etal-2023-analysis-330230, title = {Analysis of mRNA-vaccine posts on Swedish Twitter data }, abstract = {The aim of this study was to use Swedish social media data to capture public perspectives and sentiments regarding the abovementioned study on possible effect of the novel mRNA vaccines that became massively available to the public during late 2021. The intention is to understand the key issues (topics/themes) that have captured public attention in Sweden, as well as the barriers and facilitators to successful or not mRNA vaccines.}, booktitle = {14th International Conference of Experimental Linguistics,18-20 October 2023, Athens, Greece}, author = {Kokkinakis, Dimitrios and Bruinsma, Bastian and Hammarlin, Mia-Marie}, year = {2023}, } @inProceedings{beccaria-etal-2023-extraction-334169, title = {Extraction and Analysis of Acoustic Features from Italian-Speaking Children with Autism Spectrum Disorder }, abstract = {Background: The persistent difficulties in social interaction and communication that characterize Autism Spectrum Disorder can be accessed by investigating the quality of language. Indeed, these deficits involve the presence of anomalies in speech production and understanding, which find an expression at the acoustic and prosodic levels of linguistic analysis. 
Objectives: The main aim of this work is to propose a speech pipeline for the extraction of Italian speech biomarkers typical of ASD by conducting an acoustic and phonological analysis. Moreover, we will highlight the strengths and difficulties of this kind of investigation introducing new topics for further research. Methods: The poster will present the analysis of a speech corpus of 14 Italian-speaking children with ASD and 14 controls (C). The corpus is demographically balanced (age 6-10, 8;1 ± 1;3. Sex: 3F, 11 M) and homogeneous at the diatopic level (origin: Prato, Pistoia, Florence). First, we extracted the acoustic features by using eGeMAPS (openSMILE; Eyben et al., 2015), specifically ideated for the study of impaired speech. Then, we implemented the Mann-Whitney U-test to select the features with the most statistically significant distance in the production of the two groups. Secondly, we conducted a parallel extraction regarding the pitch (F0 mean and standard deviation). We propose this additional analysis because pitch varies according to some demographic traits of the speaker (sex, age, height) and the literature presents opposite trends. For this task, we used Praat to have more flexibility in the manipulation of the extraction. We set the F0 range between 70 and 400 Hz (Patel et al., 2020). Finally, we conducted a comparison between the results of the two methods excluding female participants to verify if the trend of pitch changes when the participants are not mixed. Results: Table 1 shows the features selected between the ones extracted. They are related to prosody, quality of voice, loudness, and spectral distribution. Jitter, shimmer and HNR are usually investigated together to describe the emotional prosody and the quality of voice. The same trend found on our corpus is recorded in previous studies on languages other than Italian (Bone et al. 2015; Kissine & Geelhand 2019). 
Moreover, spectral flux is usually investigated together with shimmer and jitter to describe speech impairments (Haider et al., 2019). Nevertheless, if we consider the studies related to autistic speech, there are few that describe this feature because of the different methodologies used during the extraction. Finally, the values of pitch extracted by eGeMAPS and Praat show the same trend. It is higher in ASD than in controls, both if we considered the corpus mixed and the one with only the male speakers. However, the pitch does not show a statistically significant difference between the two groups (Table 2). Conclusions: These results, although preliminary, seem to confirm the presence of phonetic alterations of speech associated with the disorder. Further studies could improve the accuracy of the pipeline proposed by doing a qualitative analysis of the results and considering other linguistic and paralinguistic domains (e.g., morphological, pragmatic, and gestural analysis). }, booktitle = {The 22nd International Society for Autism Research (INSAR), May 3-4, Stockholm, Sweden}, author = {Beccaria, Federica and Gagliardi, Gloria and Kokkinakis, Dimitrios}, year = {2023}, } @book{borin-etal-2024-vaccine-341185, title = {Vaccine Hesitancy in the Nordic Countries: Trust and Distrust During the COVID-19 Pandemic}, abstract = {Bringing together studies from across the Nordic region, this book examines the challenges brought by the COVID-19 pandemic, with a particular focus on vaccine hesitancy. 
Shedding light on the political tensions that emerged as a result of the pandemic and the debates that ensued both within and between the Nordic nations, it investigates the vociferous discussions surrounding the COVID-19 vaccines and their presumed negative side effects through the lens of trust; trust in and between the neighbouring countries, in healthcare systems, fellow citizens, and experts; in public authorities, politicians, researchers, journalists, and pharmaceutical companies. The first volume to explore vaccine hesitancy in the Scandinavian context, this ground-breaking volume offers fresh perspectives on vaccine scepticism not as a form of ignorance or lack of knowledge, but as a manifestation of a more fundamental lack of faith in modern government and science. As such, it will appeal to scholars of sociology, politics, anthropology, media studies, communication and cultural studies with interests in public health, popular and political discourse and questions of public trust. }, author = {Borin, Lars and Hammarlin, Mia Marie and Kokkinakis, Dimitrios and Miegel, Fredrik}, year = {2024}, publisher = {Taylor and Francis}, ISBN = {9781040011614}, } @inProceedings{belmonte-etal-2024-automatic-336253, title = {Automatic Detection of Rhythmic Features in Pathological Speech of MCI and Dementia Patients }, abstract = {The presence of linguistic alterations represents one of the prodromal signs of cognitive decline associated with dementia. In recent years, a growing body of work has been devoted to the development of algorithms for the automatic linguistic analysis of both oral and written texts, with diagnostic purposes. The extraction of Digital Linguistic Biomarkers from patients' verbal productions can indeed provide a rapid, ecological, and cost-effective system for large-scale screening of the pathology. 
This article contributes to the ongoing research in the field by exploring a traditionally less studied aspect of language in dementia, namely the rhythmic characteristics of speech. In particular, the paper focuses on the automatic detection of rhythmic features in Italian connected speech. A landmark-based system was developed and evaluated to segment the speech flow into vocalic and consonantal intervals and to calculate several rhythmic metrics. Additionally, the reliability of these metrics in identifying MCI and dementia patients was tested.}, booktitle = {RaPID-5: Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments}, author = {Belmonte, Marica and Gagliardi, Gloria and Kokkinakis, Dimitrios and Tamburini, Fabio}, year = {2024}, publisher = {European Language Resources Association (ELRA)}, ISBN = {978-2-493814-11-1}, } @inProceedings{kokkinakis-hammarlin-2024-cluster-338476, title = {Cluster-Based BERTopic Modeling on Swedish COVID-19 Vaccine Posts}, abstract = {This paper explores the prevalent themes across multiple threads on the popular Swedish discussion forum Flashback. Among its diverse array of topics, the forum actively engages users in addressing and debating questions pertaining to COVID-19 vaccines and vaccination. Through distinguishing between positive and negative perspectives within posts across 14 relevant thread discussions, we employ BERTopic, a modular topic modeling framework, which utilizes pre-trained language models and applies clustering techniques to identify prevailing topics. 
This enables us to conduct a nuanced exploration of overarching themes, offering valuable insights into the multifaceted nature of the discussions regarding COVID-19 vaccines and vaccination in Sweden.}, booktitle = {The 34th Medical Informatics Europe Conference}, author = {Kokkinakis, Dimitrios and Hammarlin, Mia-Marie}, year = {2024}, publisher = {IOS Press}, address = {Amsterdam • Washington, DC}, } @inProceedings{hammarlin-etal-2023-fearing-327373, title = {Fearing mRNA: A Mixed Methods Study of Vaccine Rumours}, abstract = {The first mass-distributed vaccines based on mRNA technology were launched in 2021 to protect against COVID-19, sparking rumours among vaccine critical individuals that these “new” vaccines might be more dangerous to the health than other, “traditional” vaccines. Drawing on rumour theories and social cognitive perspectives, the aim of this chapter is to account for the purpose and the spreading of medical rumours that encircle mRNA COVID-19 vaccines. We ask: How are rumours concerning mRNA expressed and established? In terms of trust and distrust, what function do the rumours have? We take as our empirical case the fast spreading of a medical journal article written by a group of infectious medicine researchers at Lund University, Sweden, that spawned an already established vaccine rumour, and analyse Swedish-language tweets discussing mRNA vaccines posted between February 10, 2022 and November 10, 2022. Our study follows a mixed methods sequential explanatory design consisting of an initial computational distant reading analysis based on structural topic modeling, followed by a close qualitative reading and thematic analysis of the results. Our analysis shows how mRNA rumours are not primarily based on ignorance, but rather on distrust regarding the officially sanctioned, positive narrative of new vaccine technologies, expressed through what we term counter-scientific argumentation.}, booktitle = {NordMedia23: "Technological Takeover? 
Social and Cultural Implications – Promises and Pitfalls", 16–18 August 2023, Bergen, Norway}, author = {Hammarlin, Mia-Marie and Kokkinakis, Dimitrios and Miegel, Fredrik and Stoencheva, Jullietta}, year = {2023}, address = {Bergen, Norway}, } @inProceedings{kokkinakis-etal-2023-investigating-325628, title = {Investigating the Effects of MWE Identification in Structural Topic Modelling }, abstract = {Multiword expressions (MWEs) are common word combinations which exhibit idiosyncrasies in various linguistic levels. For various downstream natural language processing applications and tasks, the identification and discovery of MWEs has been proven to be potentially practical and useful, but still challenging to codify. In this paper we investigate various, relevant to MWE, resources and tools for Swedish, and, within a specific application scenario, we apply structural topic modelling to investigate whether there are any interpretative advantages of identifying MWEs.}, booktitle = {The 19th Workshop on Multiword Expressions (MWE 2023)}, author = {Kokkinakis, Dimitrios and Muñoz Sánchez, Ricardo and Bruinsma, Sebastianus C. J. and Hammarlin, Mia-Marie}, year = {2023}, publisher = {ACL}, ISBN = {978-1-959429-59-3}, } @inProceedings{kokkinakis-etal-2023-scaling-326698, title = {Scaling-up the Resources for a Freely Available Swedish VADER (svVADER) }, abstract = {With widespread commercial applications in various domains, sentiment analysis has become a success story for Natural Language Processing (NLP). Still, although sentiment analysis has rapidly progressed during the last years, mainly due to the application of modern AI technologies, many approaches apply knowledge-based strategies, such as lexicon-based, to the task. This is particularly true for analyzing short social media content, e.g., tweets. Moreover, lexicon-based sentiment analysis approaches are usually preferred over learning-based methods when training data is unavailable or insufficient. 
Therefore, our main goal is to scale-up and apply a lexicon-based approach which can be used as a baseline to Swedish sentiment analysis. All scaled-up resources are made available, while the performance of this enhanced tool is evaluated on two short datasets, achieving adequate results. }, booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, author = {Kokkinakis, Dimitrios and Muñoz Sánchez, Ricardo and Hammarlin, Mia-Marie}, year = {2023}, } @misc{volodina-etal-2024-proceedings-335190, title = {Proceedings of the Huminfra Conference (HiC 2024), 10-11 January, 2024, Gothenburg, Sweden}, author = {Volodina, Elena and Bouma, Gerlof and Forsberg, Markus and Kokkinakis, Dimitrios and Alfter, David and Fridlund, Mats and Horn, Christian and Ahrenberg, Lars and Blåder, Anna}, year = {2024}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-8075-512-2}, } @misc{dunabeitia-etal-2023-editorial-333441, title = {Editorial: Digital Linguistic Biomarkers: Beyond Paper and Pencil Tests -Volume II }, abstract = {Our first volume laid the foundation for understanding the potential of digital linguistic biomarkers in assessing various cognitive and psychological aspects. In this second volume, we witness a significant advancement in both the scope and depth of research in this area. The featured articles in this volume contribute to our understanding of how linguistic biomarkers can transcend traditional paper-and-pencil tests, offering a more nuanced and comprehensive approach to the assessment of cognitive function and psychological well-being.In the first study of the volume [Gonzalez-Recober et al., 2023], the authors employed automated methods to investigate speech production during category and letter fluency tasks, commonly used neuropsychological assessments for evaluating lexical retrieval abilities. 
Their analysis encompassed a diverse range of linguistic and acoustic features, providing a more comprehensive perspective on these tasks than previous studies. As expected, participants produced more words during the category fluency task than during the letter fluency task. Moreover, several linguistic and acoustic measures displayed distinctions between the two tasks. The automated techniques employed in this study offer a reproducible and scalable approach for analyzing fluency tasks, with potential applications in clinical settings. By implementing these methods, future research endeavors are expected to expand our knowledge of speech feature differences, not only in terms of total scores but also across various speech measures, particularly among clinical populations.In the second article of the volume [Sánchez-Vincitore et al. 2023], the authors present a longitudinal analysis of linguistic biomarkers to detect cognitive decline. Their study underscores the potential of natural language processing techniques in identifying subtle cognitive changes over time. They examined data from over 3,000 participants aged 45 and older to investigate the relationship between age, gender, and language-mediated working memory processes using commercial cognitive tests (in their case, scientific tests developed by CogniFit Inc.). The findings revealed that age negatively predicted working memory performance, highlighting the potential of computerized assessments in predicting cognitive functions during aging and the need for further research on gender effects in cognitive aging. This study contributed to the growing body of evidence supporting the utility of linguistic biomarkers in early cognitive assessment.In the third study of our volume [Kim et al. 2023], the focus shifts to postoperative delirium (POD) in elderly patients following spinal surgery. 
POD has been linked to adverse outcomes in this demographic, prompting researchers to explore potential biomarkers for degenerative cerebral dysfunctions like mild cognitive impairment and dementia. The authors used electroencephalography (EEG) to measure an EEG biomarker reflecting idle cortical states through intrinsic alpha oscillations in the prefrontal regions. Cognitive follow-ups were performed using the Telephone Interview for Cognitive Status™ (TICS). The study observed that among patients diagnosed with POD, neurocognitive disorders could persist for up to 1 year postsurgery. These findings suggest that EEG has the potential to be a novel and valuable tool for identifying elderly surgical patients at a higher risk of developing postoperative delirium, offering opportunities for early intervention and improved patient outcomes.As the fourth article in our volume, the study by [Saccone et al. 2023] delves into the realm of schizophrenia, examining how it affects speech prosody and pragmatic functions. The study conducted corpus-based research, focusing on real-life spontaneous interactions to shed light on the prosodic features of schizophrenia. Notably, the speech patterns of patients revealed distinct characteristics. Their speech was organized into smaller, less structured information chunks, punctuated by frequent silences and extended pauses during turn-taking. Fluency was disrupted by retracing phenomena, particularly in complex information structures. Besides, comparing Topic and Comment-prominences between patients and non-pathological individuals revealed a consistent pattern. Patients exhibited higher values for Topic-prominence across all parameters, while the non-pathological group displayed the opposite trend. 
These findings provide valuable insights into the prosodic and pragmatic aspects of speech in schizophrenia, emphasizing the importance of understanding these linguistic manifestations in the context of the disorder's impact on communication.In closing, the second volume of "Digital Linguistic Biomarkers: Beyond Paper and Pencil Tests" presents a short yet diverse and comprehensive array of research articles that collectively advance the field. These contributions not only underscore the relevance and timeliness of linguistic biomarkers in the digital age but also highlight their potential to revolutionize the way we assess cognitive function, psychological well-being, and aging across diverse populations, extending to pathological and clinical samples.}, author = {Dunabeitia, Jon Andoni and Kokkinakis, Dimitrios and Gagliardi, Gloria}, year = {2023}, volume = {14}, } @inProceedings{fraser-etal-2017-analysis-257840, title = {An analysis of eye-movements during reading for the detection of mild cognitive impairment}, abstract = {We present a machine learning analysis of eye-tracking data for the detection of mild cognitive impairment, a decline in cognitive abilities that is associated with an increased risk of developing dementia. We compare two experimental configurations (reading aloud versus reading silently), as well as two methods of combining information from the two trials (concatenation and merging). Additionally, we annotate the words being read with information about their frequency and syntactic category, and use these annotations to generate new features. Ultimately, we are able to distinguish between participants with and without cognitive impairment with up to 86% accuracy.}, booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing. 
September 9-11, 2017 Copenhagen, Denmark / Editors Martha Palmer, Rebecca Hwa, Sebastian Riedel }, author = {Fraser, Kathleen and Lundholm Fors, Kristina and Kokkinakis, Dimitrios and Nordlund, Arto}, year = {2017}, publisher = {Association for Computational Linguistics }, ISBN = {978-1-945626-83-8}, } @article{themistocleous-etal-2020-voice-295469, title = {Voice quality and speech fluency distinguish individuals with Mild Cognitive Impairment from Healthy Controls}, abstract = {Mild Cognitive Impairment (MCI) is a syndrome characterized by cognitive decline greater than expected for an individual's age and education level. This study aims to determine whether voice quality and speech fluency distinguish patients with MCI from healthy individuals to improve diagnosis of patients with MCI. We analyzed recordings of the Cookie Theft picture description task produced by 26 patients with MCI and 29 healthy controls from Sweden and calculated measures of voice quality and speech fluency. The results show that patients with MCI differ significantly from HC with respect to acoustic aspects of voice quality, namely H1-A3, cepstral peak prominence, center of gravity, and shimmer; and speech fluency, namely articulation rate and averaged speaking time. 
The method proposed along with the obtainability of connected speech productions can enable quick and easy analysis of speech fluency and voice quality, providing accessible and objective diagnostic markers of patients with MCI.}, journal = {PloS one}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2020}, volume = {15}, number = {7}, pages = {e0236009}, } @inProceedings{kokkinakis-etal-2017-data-256955, title = {Data Collection from Persons with Mild Forms of Cognitive Impairment and Healthy Controls - Infrastructure for Classification and Prediction of Dementia}, abstract = {Cognitive and mental deterioration, such as difficulties with memory and language, are some of the typical phenotypes for most neurodegenerative diseases including Alzheimer’s disease and other dementia forms. This paper describes the first phases of a project that aims at collecting various types of cognitive data, acquired from human subjects in order to study relationships among linguistic and extra-linguistic observations. The project’s aim is to identify, extract, process, correlate, evaluate, and disseminate various linguistic phenotypes and measurements and thus contribute with complementary knowledge in early diagnosis, monitor progression, or predict individuals at risk. In the near future, automatic analysis of these data will be used to extract various types of features for training, testing and evaluating automatic classifiers that could be used to differentiate individuals with mild symptoms of cognitive impairment from healthy, age-matched controls and identify possible indicators for the early detection of mild forms of cognitive impairment. 
Features will be extracted from audio recordings (speech signal), the transcription of the audio signals (text) and the raw eye-tracking data.}, booktitle = {Proceedings of the 21st Nordic Conference on Computational Linguistics, NoDaLiDa, 22-24 May 2017, Gothenburg, Sweden}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Björkner, Eva and Nordlund, Arto}, year = {2017}, publisher = {Linköping University Electronic Press, Linköpings universitet}, address = {Linköping}, ISBN = {978-91-7685-601-7}, } @misc{gagliardi-etal-2021-editorial-307124, title = {Editorial: Digital Linguistic Biomarkers: Beyond Paper and Pencil Test}, abstract = {Over the last decades, a growing body of linguistic studies have been devoted to the clinical domain (Perkins 2011), while the amount of experimental linguistic research focusing on neuroscience and mental health has increased exponentially during the last few years. Considering that many of the factors underlying cognitive and neuropsychiatric disorders may yield to late symptoms that are hard to foresee, it is often difficult to predict the existence of a presence or risk of a disease, as well as the disease’s trajectory. In this context, interdisciplinary approaches gain increasing popularity, and the analysis of complex behaviour – such as speech and language – emerges as a natural candidate to identify and analyse the extent to which a given neuropathology can impact the cognitive system at the very early stages. In this context, the development of cognitive evaluation and intervention tools focusing on linguistic biomarkers becomes a critical scientific arena both in and outside the clinic and laboratory (see Petrizzo & Popolo, 2020). 
Recent international research has demonstrated that automated collected and analysed quantitative linguistic features, easily extractable from a patient’s verbal productions, can be very useful in separating people with various cognitive or mental impairment from healthy subjects, even at a very early stage (see Bedi et al., 2015), and even to predict the outcomes of clinical interventions (see Carrillo et al., 2018). In this line, machine learning-based language technology methods and tools based on artificial intelligence are particularly promising to address this task (Locke et al. 2021; Sigman et al., 2021). Indeed, subtle language disruptions can be employed as digital linguistic biomarkers, namely objective, quantifiable behavioural data that can be collected and measured by means of digital devices, allowing for a low-cost pathology detection, classification and monitoring. Compared to classical pen-and-paper neuropsychological tests, the use of these instruments shows many advantages – such as its non-intrusive and time-effective application – providing not only offline, but also online measures that serve as a proxy for cognitive processing and its underlying mechanisms. The aim of the Research Topic Digital Linguistic Biomarkers: Beyond Paper and Pencil Tests is to provide a state-of-the-art overview of this multidisciplinary and constantly evolving area of research, bringing together contributions from different quarters of the cognitive sciences. The collection comprises one systematic review, six original research papers, and one opinion paper. The articles are based on empirical and theoretical research from several disciplines (i.e., linguistics, psychology, Artificial Intelligence), and they tackle a range of developmental and acquired disorders. 
Most probably, dementia assessment has been one of the most rapidly evolving domain of Natural Language Processing (NLP) application for medical science (Petti, Baker & Korhonen 2020), but this approach is spreading rapidly through the community, with encouraging results on both developmental and acquired pathologies, as shown in the current article collection (i.e., autism, developmental language disorder, attention-deficit hyperactivity disorder, Alzheimer’s disease and mild cognitive impairment, or Parkinson’s disease). Furthermore, this Research Topic covers a variety of test languages showing the degree of internationalization of the research on the analysis verbal productions (i.e., English, Italian, German, and Japanese).}, author = {Gagliardi, Gloria and Kokkinakis, Dimitrios and Dunabeitia, Jon Andoni}, year = {2021}, volume = {12}, pages = {752238}, } @article{antonsson-etal-2021-using-301490, title = {Using a Discourse Task to Explore Semantic Ability in Persons With Cognitive Impairment.}, abstract = {This paper uses a discourse task to explore aspects of semantic production in persons with various degree of cognitive impairment and healthy controls. The purpose of the study was to test if an in-depth semantic analysis of a cognitive-linguistic challenging discourse task could differentiate persons with a cognitive decline from those with a stable cognitive impairment. Both quantitative measures of semantic ability, using tests of oral lexical retrieval, and qualitative analysis of a narrative were used to detect semantic difficulties. Besides group comparisons a classification experiment was performed to investigate if the discourse features could be used to improve classification of the participants who had a stable cognitive impairment from those who had cognitively declined. 
In sum, both types of assessment methods captured difficulties between the groups, but tests of oral lexical retrieval most successfully differentiated between the cognitively stable and the cognitively declined group. Discourse features improved classification accuracy and the best combination of features discriminated between participants with a stable cognitive impairment and those who had cognitively declined with an area under the curve (AUC) of 0.93.}, journal = {Frontiers in aging neuroscience}, author = {Antonsson, Malin and Lundholm Fors, Kristina and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2021}, volume = {12}, } @misc{themistocleous-etal-2023-assessing-331090, title = {Assessing Language Disorders using Artificial Intelligence: a Paradigm Shift }, abstract = {Speech, language, and communication deficits are present in most neurodegenerative syndromes. They enable the early detection, diagnosis, treatment planning, and monitoring of neurocognitive disease progression as part of traditional neurological assessment. Nevertheless, standard speech and language evaluation is time-consuming and resource-intensive for clinicians. We argue that using machine learning methodologies, natural language processing, and modern artificial intelligence (AI) for Language Assessment is an improvement over conventional manual assessment. Using these methodologies, Computational Language Assessment (CLA) accomplishes three goals: (i) provides a neuro-cognitive evaluation of speech, language, and communication in elderly and high-risk individuals for dementia; (ii) facilitates the diagnosis, prognosis, and therapy efficacy in at-risk and language-impaired populations; and (iii) allows easier extensibility to assess patients from a wide range of languages. By employing AI models, CLA may inform neurocognitive theory on the relationship between language symptoms and their neural bases. 
Finally, it signals a paradigm shift by significantly advancing our ability to optimize the prevention and treatment of elderly individuals with communication disorders, allowing them to age gracefully with social engagement.}, author = {Themistocleous, Charalambos and Tsapkini, Kyrana and Kokkinakis, Dimitrios}, year = {2023}, publisher = {arXiv.org}, } @inProceedings{kokkinakis-etal-2023-prevalence-324818, title = {The Prevalence of mRNA Related Discussions during the Post-COVID-19 Era}, abstract = {Vaccinations are one of the most significant interventions to public health, but vaccine hesitancy and skepticism are raising serious concerns for a portion of the population in many countries, including Sweden. In this study, we use Swedish social media data and structural topic modeling to automatically identify mRNA-vaccine related discussion themes and gain deeper insights into how people’s refusal or acceptance of the mRNA technology affects vaccine uptake. Our point of departure is a scientific study published in February 2022, which seems to have once again sparked further suspicion and concern and highlighted the necessity to focus on issues about the nature and trustworthiness in vaccine safety. Structural topic modelling is a statistical method that facilitates the study of topic prevalence, temporal topic evolution, and topic correlation automatically. Using such a method, our research goal is to identify the current understanding of the mechanisms on how the public perceives the mRNA vaccine in the light of new experimental findings.}, booktitle = {Caring is Sharing – Exploiting the Value in Data for Health and Innovation / M. Hägglund et al. (eds.)
Proceedings of the 33rd Medical Informatics Europe Conference (MIE2023), Gothenburg, Sweden, 22-25 May 2023}, author = {Kokkinakis, Dimitrios and Bruinsma, Sebastianus Cornelis Jacobus and Hammarlin, Mia-Marie}, year = {2023}, publisher = {IOS Press}, ISBN = {978-1-64368-388-1}, } @article{themistocleous-etal-2018-identification-273026, title = {Identification of Mild Cognitive Impairment From Speech in Swedish Using Deep Sequential Neural Networks}, abstract = {While people with mild cognitive impairment (MCI) portray noticeably incipient memory difficulty in remembering events and situations along with problems in decision making, planning, and finding their way in familiar environments, detailed neuropsychological assessments also indicate deficits in language performance. To this day, there is no cure for dementia but early-stage treatment can delay the progression of MCI; thus, the development of valid tools for identifying early cognitive changes is of great importance. In this study, we provide an automated machine learning method, using Deep Neural Network Architectures, that aims to identify MCI. Speech materials were obtained using a reading task during evaluation sessions, as part of the Gothenburg MCI research study. Measures of vowel duration, vowel formants (F1 to F5), and fundamental frequency were calculated from speech signals. To learn the acoustic characteristics associated with MCI vs. healthy controls, we have trained and evaluated ten Deep Neural Network Architectures and measured how accurately they can diagnose participants that are unknown to the model. We evaluated the models using two evaluation tasks: a 5-fold crossvalidation and by splitting the data into 90% training and 10% evaluation set. 
The findings suggest first, that the acoustic features provide significant information for the identification of MCI; second, the best Deep Neural Network Architectures can classify MCI and healthy controls with high classification accuracy (M = 83%); and third, the model has the potential to offer higher accuracy than 84% if trained with more data (cf., SD≈15%). The Deep Neural Network Architecture proposed here constitutes a method that contributes to the early diagnosis of cognitive decline, quantify the progression of the condition, and enable suitable therapeutics.}, journal = {Frontiers in Neurology}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2018}, volume = {9}, pages = {1--10}, } @article{fraser-etal-2019-predicting-282807, title = {Predicting MCI Status From Multimodal Language Data Using Cascaded Classifiers}, abstract = {Recent work has indicated the potential utility of automated language analysis for the detection of mild cognitive impairment (MCI). Most studies combining language processing and machine learning for the prediction of MCI focus on a single language task; here, we consider a cascaded approach to combine data from multiple language tasks. A cohort of 26 MCI participants and 29 healthy controls completed three language tasks: picture description, reading silently, and reading aloud. Information from each task is captured through different modes (audio, text, eye-tracking, and comprehension questions). Features are extracted from each mode, and used to train a series of cascaded classifiers which output predictions at the level of features, modes, tasks, and finally at the overall session level. The best classification result is achieved through combining the data at the task level (AUC = 0.88, accuracy = 0.83). 
This outperforms a classifier trained on neuropsychological test scores (AUC = 0.75, accuracy = 0.65) as well as the "early fusion" approach to multimodal classification (AUC = 0.79, accuracy = 0.70). By combining the predictions from the multimodal language classifier and the neuropsychological classifier, this result can be further improved to AUC = 0.90 and accuracy = 0.84. In a correlation analysis, language classifier predictions are found to be moderately correlated (rho = 0.42) with participant scores on the Rey Auditory Verbal Learning Test (RAVLT). The cascaded approach for multimodal classification improves both system performance and interpretability. This modular architecture can be easily generalized to incorporate different types of classifiers as well as other heterogeneous sources of data (imaging, metabolic, etc.).}, journal = {Frontiers in Aging Neuroscience}, author = {Fraser, Kathleen and Lundholm Fors, Kristina and Eckerström, Marie and Öhman, Fredrik and Kokkinakis, Dimitrios}, year = {2019}, volume = {11}, number = {205}, } @inProceedings{lundholmfors-etal-2018-automated-263790, title = {Automated Syntactic Analysis of Language Abilities in Persons with Mild and Subjective Cognitive Impairment}, abstract = {In this work we analyze the syntactic complexity of transcribed picture descriptions using a variety of automated syntactic features, and investigate the features’ predictive power in classifying narratives from people with subjective and mild cognitive impairment and healthy controls. Our results indicate that while there are no statistically significant differences, syntactic features can still be moderately successful at distinguishing the participant groups when used in a machine learning framework.}, booktitle = {Building continents of knowledge in oceans of data : the future of co-created eHealth: proceedings of MIE2018, 24-26 April 2018, Gothenburg, Sweden}, editor = {Adrien Ugon and Daniel Karlsson and Gunnar O. 
Klein and Anne Moen}, author = {Lundholm Fors, Kristina and Fraser, Kathleen and Kokkinakis, Dimitrios}, year = {2018}, publisher = {IOS Press}, address = {Amsterdam}, ISBN = {978-1-61499-851-8}, } @article{hammarlin-etal-2023-covid-329784, title = {COVID-19 Vaccine Hesitancy: A Mixed Methods Investigation of Matters of Life and Death}, abstract = {In this article, hesitancy towards COVID-19 vaccinations is investigated as a phenomenon touching upon existential questions. We argue that it encompasses ideas of illness and health, and also of dying and fear of suffering. Building on a specific strand within anti-vaccination studies, we conjecture that vaccine hesitancy is, to some extent, reasonable, and that this scepticism should be studied with compassion. Through a mixed methods approach, vaccine hesitancy, as it is being expressed in a Swedish digital open forum, is investigated and understood as, on the one hand, a perceived need of protecting one’s body from techno-scientific experiments, and thus the risk of becoming a victim of medicine itself. On the other hand, the community members express what we call a tacit belief in modern medicine by demonstrating their own “expert” pandemic knowledge. The analysis also shows how the COVID-19 pandemic triggers memories of another pandemic, namely the swine flu in 2009–2010, and what we term a medical crisis that occurred then, due to a vaccine that caused a rare but severe side effect in Sweden and elsewhere.}, journal = {Journal of Digital Social Research (JDSR)}, author = {Hammarlin, Mia-Marie and Kokkinakis, Dimitrios and Borin, Lars}, year = {2023}, volume = {5}, number = {4}, pages = {31--61}, } @misc{kokkinakis-etal-2022-proceedings-317658, title = {Proceedings of LREC 2022 Workshop: Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments. (RaPID-4), Saturday 25th of June 2022.
}, abstract = {RaPID-4 aims to be an interdisciplinary forum for researchers to share information, findings, methods, models and experience on the collection and processing of data produced by people with various forms of mental, cognitive, neuropsychiatric, or neurodegenerative impairments, such as aphasia, dementia, autism, bipolar disorder, Parkinson's disease or schizophrenia. Particularly, the workshop's focus is on creation, processing and application of data resources from individuals at various stages of these impairments and with varying degrees of severity. Creation of resources includes e.g. the annotation, description, analysis and interpretation of linguistic, paralinguistic and extra-linguistic aspects of such data (i.e. spontaneous spoken language, transcripts, eye tracking, wearable and sensor measurements, digital biomarkers, etc.). Processing of such data can be used to identify, extract, correlate, evaluate and disseminate various linguistic or multimodal phenotypes and measurements, which then can be applied to aid diagnosis, monitor the progression or predict individuals at risk. A central aim is to facilitate the study of the relationships among various levels of linguistic, paralinguistic and extra-linguistic observations (e.g., acoustic measures; phonological, syntactic and semantic features; eye tracking, sensors, signs and multimodal signals). Submission of papers are invited in all of the aforementioned areas, particularly emphasizing multidisciplinary aspects of processing such data and the interplay between clinical/nursing/medical sciences, language technology, computational linguistics, natural language processing (NLP) and computer science. The workshop will act as a stimulus for the discussion of several ongoing research questions driving current and future research by bringing together researchers from various research communities. }, author = {Kokkinakis, Dimitrios and Themistocleous, Charalambos K. 
and Lundholm Fors, Kristina and Tsanas, Athanasios and Fraser, Kathleen C.}, year = {2022}, publisher = {European Language Resources Association}, address = {Paris}, ISBN = {979-10-95546-77-1}, } @inProceedings{kokkinakis-etal-2022-necessity-321865, title = {The necessity of digital health communication in social media to boost COVID-19 vaccine acceptance}, booktitle = {ICA Post Conference: Digital Health Communication: Issues and Perspectives. University of Burgundy Franche-Comté, Dijon, France}, author = {Kokkinakis, Dimitrios and Hammarlin, Mia-Marie and Borin, Lars and Miegel, Fredrik}, year = {2022}, } @inProceedings{kokkinakis-hammarlin-2022-negative-321864, title = {Negative vaccine voices in Swedish social media}, abstract = {Vaccinations are one of the most significant interventions to public health, but vaccine hesitancy creates concerns for a portion of the population in many countries, including Sweden. Since discussions on vaccine hesitancy are often taken on social networking sites, data from Swedish social media are used to study and quantify the sentiment among the discussants on the vaccination-or-not topic during phases of the COVID-19 pandemic. Out of all the posts analyzed a majority showed a stronger negative sentiment, prevailing throughout the whole of the examined period, with some spikes or jumps due to the occurrence of certain vaccine-related events distinguishable in the results. Sentiment analysis can be a valuable tool to track public opinions regarding the use, efficacy, safety, and importance of vaccination.
}, booktitle = {Proceedings of the 13th International Conference of Experimental Linguistics}, author = {Kokkinakis, Dimitrios and Hammarlin, Mia-Marie}, year = {2022}, } @inProceedings{hammarlin-etal-2021-vaccine-307227, title = {Vaccine hesitancy – trust and distrust in medical expertise and authorities}, abstract = {The increase of vaccine hesitancy is singled out by WHO as one of the ten most important and urgent threats to global health (https://www.who.int/emergencies/ten-threats-to-global-health-in-2019). Diseases like measles are returning in different parts of Europe, partly as a result of the activities of the anti-vaccination movement. The herd immunity in most Western countries is high but even a small decrease in vaccination would have immediate negative effects for the population. Sweden offers a perfect site for future anti-vaccination studies due to its high vaccination covering. A decline in the numbers of children vaccinated has had immediate effects. For example, the incident rate in the country of pertussis rose from 700 cases to 3,200 cases per 100,000 children in 4 years due to a rather small decrease in vaccinations. This constitutes a strong argument for the civic importance of the case. The aim of this presentation is to introduce a new 4-year research project (2020–2023), independently financed by the Bank of Sweden Foundation (Riksbankens jubileumsfond), with the goal to investigate the role and importance of rumouring for the vaccination skepticism growing on the internet, and how it can be understood as an expression of civic engagement in the present digital times entailing crucial transformations for everyday civic culture. Theoretically, the project builds upon, and develop, media researcher Dahlgren’s work on civic culture and Kitta’s studies of the anti-vaccination movement. 
The overarching research question is: How have the everyday practice and experience of, and the conditions for, rumours been shaped and reshaped in the digital age, and what do these processes mean for civic engagement and participation? The project will offer an understanding of how everyday interaction on the internet has a powerful impact on the spreading of false information, which in the long run may challenge democracy. On a more concrete level the project will answer the following questions in relation to the case of vaccine skepticism: How are rumours about alleged risks and dangers of vaccination propagated and established on the internet? Are there specific patterns and correlations connecting topics, assumptions, myths, argumentation schemes, popularity and time? What do everyday practices, on- and offline, of rumouring mean for its adherents’ civic engagement in the anti-vaccination movement? Which are the civic implications of the spreading and circulation of vaccination hostile rumours on individual citizens and society at large?}, booktitle = {8th European Communication Conference (ECREA)}, author = {Hammarlin, Mia-Marie and Miegel, Fredrik and Borin, Lars and Kokkinakis, Dimitrios and Jaakonaho, Anna}, year = {2021}, } @incollection{johansson-etal-2021-semantic-310775, title = {Semantic Role Labeling}, booktitle = {The Swedish FrameNet++. 
Harmonization, integration, method development and practical language technology applications}, editor = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin}, author = {Johansson, Richard and Friberg Heppin, Karin and Kokkinakis, Dimitrios}, year = {2021}, publisher = {John Benjamins Publishing Company}, address = {Amsterdam / Philadelphia}, ISBN = {978-90-272-5848-9}, pages = {264--280}, } @inProceedings{kokkinakis-2021-insights-307200, title = {Insights on a Swedish Covid-19 corpus}, abstract = {The COVID-19 pandemic has had a serious impact on people all over the world, from mental and physical health to economic downturn to education and social relationships, while political decisions in many countries have had a profound impact on the lives of all people regardless of age. Many of these effects can be studied with statistical and qualitative data such as collected questionnaires and sickness absence rates. But large-scale studies require expertise in multiple domains and from many points of view. Språkbanken Text continuously collects text from various sources. In order to fill the gap in the lack of an available Swedish COVID-19-related dataset, we started to build a Swedish COVID-19 corpus (sv-COVID-19). Various tools for e.g. lexical, semantic or pragmatic/discourse analyses can be then applied in order to answer relevant questions on e.g. how people, on a larger scale than what can be obtained through qualitative studies, experienced their everyday life through the different phases of COVID-19 crisis, or how political decisions and their consequences are described and discussed.}, booktitle = {CLARIN Annual Conference (Virtual Event). 27 – 29 September 2021. Monica Monachini, Maria Eskevich (red.)}, pages = {31--34}, author = {Kokkinakis, Dimitrios}, year = {2021}, } @misc{kokkinakis-etal-2020-proceedings-305214, title = {Proceedings of the LREC 2020.
Workshop on: Resources and Processing of Linguistic, Para-linguistic and Extra-linguistic Data from People with Various Forms of Cognitive/Psychiatric/Developmental Impairments (RaPID-3), May 11-16, 2020, Marseille, France}, abstract = {RaPID-3 aims to be an interdisciplinary forum for researchers to share information, findings, methods, models and experience on the collection and processing of data produced by people with various forms of mental, cognitive, neuropsychiatric, or neurodegenerative impairments, such as aphasia, dementia, autism, bipolar disorder, Parkinson’s disease or schizophrenia. Particularly, the workshop’s focus is on creation, processing and application of data resources from individuals at various stages of these impairments and with varying degrees of severity. Creation of resources includes e.g. annotation, description, analysis and interpretation of linguistic, paralinguistic and extra-linguistic data (such as spontaneous spoken language, transcripts, eyetracking measurements, wearable and sensor data, etc). Processing is done to identify, extract, correlate, evaluate and disseminate various linguistic or multimodal phenotypes and measurements, which then can be applied to aid diagnosis, monitor the progression or predict individuals at risk. A central aim is to facilitate the study of the relationships among various levels of linguistic, paralinguistic and extra-linguistic observations (e.g., acoustic measures; phonological, syntactic and semantic features; eye tracking measurements; sensors, signs and multimodal signals). Submission of papers are invited in all of the aforementioned areas, particularly emphasizing multidisciplinary aspects of processing such data and the interplay between clinical/nursing/medical sciences, language technology, computational linguistics, natural language processing (NLP) and computer science. 
The workshop will act as a stimulus for the discussion of several ongoing research questions driving current and future research by bringing together researchers from various research communities. }, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Themistocleous, Charalambos and Antonsson, Malin and Eckerström, Marie}, year = {2020}, publisher = {European Language Resources Association (ELRA)}, address = {Paris}, ISBN = {979-10-95546-45-0}, } @inProceedings{themistocleous-etal-2020-automatic-305224, title = {Automatic analysis of voice quality and prosody in patients with Mild Cognitive Impairment.}, abstract = {http://demo.spraakdata.gu.se/svedk/pbl/SNL2020.pdf}, booktitle = {The 12th Annual Society for the Neurobiology of Language Meeting (SNL) -- virtual conference}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2020}, } @inProceedings{themistocleous-etal-2020-automated-305223, title = {Automated speech analysis improves MCI diagnosis}, abstract = {Mild Cognitive Impairment (MCI) is a condition characterized by cognitive decline greater than expected for an individual's age and education level. In this study, we are investigating whether acoustic properties of speech production can improve the classification of individuals with MCI from healthy controls augmenting the Mini Mental State Examination, a traditional screening tool, with automatically extracted acoustic information. We found that just one acoustic feature, can improve the AUC score (measuring a trade-off between sensitivity and specificity) from 0.77 to 0.89 in a boosting classification task. 
These preliminary results suggest that computerized language analysis can improve the accuracy of traditional screening tools.}, booktitle = {Proceedings of the 11th Experimental Linguistics Conference (ExLing)}, author = {Themistocleous, Charalambos and Eckerström, Marie and Kokkinakis, Dimitrios}, year = {2020}, } @inProceedings{themistocleous-etal-2020-improving-305222, title = {Improving the Diagnosis of Mild Cognitive Impairment in elderly individuals using a multifactorial automatic analysis of voice quality and prosody}, abstract = {http://demo.spraakdata.gu.se/svedk/pbl/AEC-30-Paper.JPG}, booktitle = {30th Alzheimer Europe Conference \#30AEC -- virtual conference}, author = {Themistocleous, Charalambos and Eckerström, Marie and Lundholm Fors, Kristina and Kokkinakis, Dimitrios}, year = {2020}, } @inProceedings{kokkinakis-lundholmfors-2020-digital-295582, title = {Digital Neuropsychological Tests and Biomarkers: Resources for NLP and AI Exploration in the Neuropsychological Domain}, abstract = {Non-invasive, time and cost-effective, easy-to-measure techniques for the early diagnosis or monitoring the progression of brain and mental disorders are at the forefront of recent research in this field. Natural Language Processing and Artificial Intelligence can play an important role in supporting and enhancing data driven approaches to improve the accuracy of prediction and classification. However, large datasets of e.g. recorded speech in the domain of cognitive health are limited. To improve the performance of existing models we need to train them on larger datasets, which could raise the accuracy of clinical diagnosis, and contribute to the detection of early signs at scale. In this paper, we outline our ongoing work to collect such data from a large population in order to support and conduct future research for modelling speech and language features in a cross-disciplinary manner.
The final goal is to explore and combine linguistic with multimodal biomarkers from the same population and compare hybrid models that could increase the predictive accuracy of the algorithms that operate on them.}, booktitle = {CLARIN Annual Conference 2020 in Virtual Form}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2020}, } @article{kokkinakis-lundholmfors-2020-manga-294522, title = {Hur många djur du kommer på kan avslöja hur din hjärna mår}, journal = {Språkbruk}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2020}, volume = {2}, pages = {48--51}, } @inProceedings{fraser-etal-2019-multilingual-280280, title = {Multilingual prediction of Alzheimer’s disease through domain adaptation and concept-based language modelling}, abstract = {There is growing evidence that changes in speech and language may be early markers of dementia, but much of the previous NLP work in this area has been limited by the size of the available datasets. Here, we compare several methods of domain adaptation to augment a small French dataset of picture descriptions (n = 57) with a much larger English dataset (n = 550), for the task of automatically distinguishing participants with dementia from controls. The first challenge is to identify a set of features that transfer across languages; in addition to previously used features based on information units, we introduce a new set of features to model the order in which information units are produced by dementia patients and controls. 
These concept-based language model features improve classification performance in both English and French separately, and the best result (AUC = 0.89) is achieved using the multilingual training set with a combination of information and language model features.}, booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), June 2 - June 7, 2019, Minneapolis, Minnesota / Jill Burstein, Christy Doran, Thamar Solorio (Editors) }, author = {Fraser, Kathleen and Linz, Nicklas and Lundholm Fors, Kristina and Rudzicz, Frank and König, Alexandra and Alexandersson, Jan and Robert, Philippe and Kokkinakis, Dimitrios}, year = {2019}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA }, ISBN = {978-1-950737-13-0}, } @inProceedings{lundholmfors-etal-2019-reading-284036, title = {Reading and mild cognitive impairment}, abstract = {In the present study, we investigated the discriminatory power of eye-tracking features in distinguishing between individuals with mild cognitive impairment (MCI) and healthy controls (HC). The eye movements of the study participants were recorded at two different time points, 18 months apart. Using a machine learning approach with leave-one-out cross-validation, we were able to discriminate between the groups with 73.6 AUC. However, somewhat surprisingly the classification was less successful using data from the second recording session, which might be attributed to the non-static nature of cognitive status. Still, the outcome suggests that eye-tracking measures can be exploited as useful markers of MCI. 
}, booktitle = {Proceedings of the 10th International Conference of Experimental Linguistics, 25-27 September 2019, Lisbon, Portugal}, editor = {Antonis Botinis}, author = {Lundholm Fors, Kristina and Antonsson, Malin and Kokkinakis, Dimitrios and Fraser, Kathleen}, year = {2019}, ISBN = {978-618-84585-0-5}, } @inProceedings{johansson-etal-2019-lexical-284330, title = {Lexical diversity and mild cognitive impairment}, abstract = {This paper explores the role that various lexical-based measures play for differentiating between individuals with mild forms of cognitive impairment (MCI) and healthy controls (HC). Recent research underscores the importance of language and linguistic analysis as essential components that can contribute to a variety of sensitive cognitive measures for the identification of milder forms of cognitive impairment. Subtle language changes serve as a sign that an individual’s cognitive functions have been impacted, potentially leading to early diagnosis. Our research aims to identify linguistic biomarkers that could distinguish between individuals with MCI and HC and also be useful in predicting MCI.}, booktitle = {Proceedings of the 10th International Conference of Experimental Linguistics, 25-27 September 2019, Lisbon, Portugal}, editor = {Antonis Botinis}, author = {Johansson, Sofie and Lundholm Fors, Kristina and Antonsson, Malin and Kokkinakis, Dimitrios}, year = {2019}, publisher = {ExLing Society}, address = {Athens, Greece}, ISBN = {978-618-84585-0-5}, } @inProceedings{themistocleous-kokkinakis-2019-speech-289021, title = {Speech and Mild Cognitive Impairment detection}, abstract = {It is of great importance to detect objective markers that can enable the early and fast identification of individuals with Mild Cognitive Impairment (MCI) from healthy individuals to inform, patient care, family and treatment planning. Connected speech productions can offer such markers. 
This study analyses recordings from picture description tasks by Swedish individuals with MCI and healthy control individuals (HC) and shows that voice quality, periodicity, and speech rate distinguish individuals with MCI from HC.}, booktitle = {Proceedings of the 10th International Conference of Experimental Linguistics, 25-27 September 2019, Lisbon, Portugal}, editor = {Antonis Botinis}, author = {Themistocleous, Charalambos and Kokkinakis, Dimitrios}, year = {2019}, publisher = {ExLing Society}, ISBN = {978-618-84585-0-5}, } @inProceedings{antonsson-etal-2019-discourse-284038, title = {Discourse in Mild Cognitive Impairment}, abstract = {This paper reports on how persons with mild cognitive impairment (MCI) perform on two types of narrative tasks compared to a group of healthy controls (HC). The first task is a widely used picture description task and the other task is a more complex discourse task. Since the latter task puts higher demands on cognitive linguistic skills, as seen in previous research, we expected this task to be more efficient in discriminating between the two groups. The results confirm this hypothesis.}, booktitle = {Proceedings of the 10th International Conference of Experimental Linguistics, 25-27 September 2019, Lisbon, Portugal}, editor = {Antonis Botinis}, author = {Antonsson, Malin and Lundholm Fors, Kristina and Kokkinakis, Dimitrios}, year = {2019}, publisher = {ExLing Society}, ISBN = {978-618-84585-0-5}, } @inProceedings{linz-etal-2019-temporal-279131, title = {Temporal Analysis of Semantic Verbal Fluency Tasks in Persons with Subjective and Mild Cognitive Impairment}, abstract = {The Semantic Verbal Fluency (SVF) task is a classical neuropsychological assessment where persons are asked to produce words belonging to a semantic category (e.g., animals) in a given time.
This paper introduces a novel method of temporal analysis for SVF tasks utilizing time intervals and applies it to a corpus of elderly Swedish subjects (mild cognitive impairment, subjective cognitive impairment and healthy controls). A general decline in word count and lexical frequency over the course of the task is revealed, as well as an increase in word transition times. Persons with subjective cognitive impairment had a higher word count during the last intervals, but produced words of the same lexical frequencies. Persons with MCI had a steeper decline in both word count and lexical frequencies during the third interval. Additional correlations with neuropsychological scores suggest these findings are linked to a person’s overall vocabulary size and processing speed, respectively. Classification results improved when adding the novel features (AUC = 0.72), supporting their diagnostic value.}, booktitle = {Sixth Workshop on Computational Linguistics and Clinical Psychology: Reconciling Outcomes. 
Minneapolis, Minnesota, USA, June 6, 2019 / Kate Niederhoffer, Kristy Hollingshead, Philip Resnik, Rebecca Resnik, Kate Loveys (Editors)}, author = {Linz, Nicklas and Lundholm Fors, Kristina and Lindsay, Hali and Eckerström, Marie and Alexandersson, Jan and Kokkinakis, Dimitrios}, year = {2019}, publisher = {Association for Computational Linguistics}, address = {Stroudsburg, PA}, ISBN = {978-1-948087-95-7}, } @article{kokkinakis-edstrom-2019-alderism-284251, title = {Ålderism i dagens mediala Sverige}, journal = {Språkbruk}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, number = {3}, pages = {22--27}, } @inProceedings{kokkinakis-lundholmfors-2019-"hund-279384, title = {"hund, katt, ko...": Semantiskt ordflödestest som indikator på kognitiv nedsättning hos äldre}, abstract = {Ordflödestest är en typ av test som ofta ingår vid språkliga och neuropsykologiska utredningar, och de används för att bedöma språkliga förmågor, så som ordmobilisering, och exekutiva funktioner, så som verbalt arbetsminne och bearbetningshastighet. Vid ett fonologiskt ordflödestest får personen i uppgift att på en begränsad tid (oftast 60 sekunder) producera så många ord som möjligt som börjar med en viss bokstav (ofta F, A och S), medan vid ett semantiskt ordflödestest får personen istället i uppgift att producera ord som tillhör en viss kategori (t ex djur eller grönsaker). Dessa tester tar liten tid att genomföra, är lätta att administrera och ger värdefull information om kognitiva färdigheter och begränsningar. Tidigare forskning har visat att ordflödestester har hög reliabilitet och är känsliga för kognitiva nedsättningar.
Vid analys av testen mäts traditionellt enbart antalet korrekta ord som producerats, men med hjälp av digital ljudinspelning samt den utveckling som skett inom språkteknologi kan man nu göra mer detaljerade analyser och få ny information om de strategier man använder vid exempelvis ordgenereringen; nämligen klustring (produktion av en grupp relaterade ord inom den redan identifierade subkategorin) och växling (sökning efter och växling till nya subkategorier). I vår forskning studerar vi bl.a. semantiskt ordflödestest som nyanserad indikator på olika aspekter av exekutiva och språkliga förmågor hos personer med degenerativa lindriga eller milda kognitiva nedsättningar samt en kontrollgrupp med kognitivt friska individer. Studien kommer presentera detaljer av vår språkteknologiska analys, visa på de skillnader som finns mellan grupperna och de samband som eventuellt finns med andra, redan genomförda, neuropsykiatriska tester för samma population.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina}, year = {2019}, } @inProceedings{kokkinakis-edstrom-2019-alderism-279386, title = {Ålderism i svenska nyhetsmedier.}, abstract = {Ålderdom existerar inte. Det finns människor som är mindre unga än andra. Det är allt.” (Simone de Beauvoir, 1908-1986). Ålderism syftar till “fördomar eller stereotypa föreställningar som utgår från en människas ålder och som kan leda till diskriminering”. Ålderism och media är ett område som under de senaste åren har uppmärksammats på ett sätt som aldrig tidigare skett (WHO). Detta antyder på att stereotypa beskrivningar och diskriminering av individer eller grupper av individer på grund av sin kronologiska ålder i (tryckta) nyhetsmedier är ett stort problem. För ålderismstudier är det värdefullt och viktigt att förstå hur olika typer av texter och medier beskriver eller presenterar åldrande och ålderdom. 
Därmed är syftet med denna forskning att samla och sammanställa korpusbaserade data från olika publicerade svenska mediekällor för att kunna svara på frågan om hur utbrett fenomenet är i den svenska verkligheten och därmed kunna frambringa en mer omfattande empirisk bevisning rörande fenomenet. Två pilotstudier har genomförts; en som använde förnamn och deras frekvenser av bärarnas ålder enligt Statistiska centralbyrån (SCB) i olika synkrona on-line tidningskällor och en som använde generella mönstermatchningstekniker som tillämpades på 13 utgåvor av Göteborgs Posten (1994, 2001-13). Äldre, i vår studie, är personer ≥60 år. Preliminära, kvantitativa, resultat tyder på att det finns tydliga och konsekventa skillnader i hur olika åldersgrupper representeras i dessa medier. Ett tydligt band visar att omnämnanden av 25-52-åringar är mycket överrepresenterat än den svenska befolkningspyramiden säger att de borde (SCB). Medan 0-24-åringar och personer över 52 är underrepresenterade. Mönstermatchning pekar åt liknande resultat med undantag av dödsannonser där omnämnanden om äldre är mycket vanligare. Vår pilotstudie bekräftar den introspektiva synen på underrepresentation av ålderdom och äldre i synkrona mediekällor. 
Men fler studier krävs och inom den närmaste tiden planerar vi att förbättra, skala upp och tillämpa språkteknologisk metodik på både synkronisk och diakronisk textkorpora och därmed få ett nytt och bredare perspektiv på skillnader och trender om åldrandet och äldre och vad olika publicerade källor ur en större tidsperiod kan avslöja.}, booktitle = {Svenskans beskrivning 37, 8–10.5.2019, Åbo, Finland.}, author = {Kokkinakis, Dimitrios and Edström, Maria}, year = {2019}, } @article{smith-etal-2014-readability-188146, title = {Readability, suitability and comprehensibility in patient education materials for Swedish patients with colorectal cancer undergoing elective surgery: A mixed method design.}, abstract = {To characterize education materials provided to patients undergoing colorectal cancer surgery to gain a better understanding of how to design readable, suitable, comprehensible materials.}, journal = {Patient Education and Counseling}, author = {Smith, Frida and Carlsson, Eva and Kokkinakis, Dimitrios and Forsberg, Markus and Kodeda, Karl and Sawatzky, Richard and Friberg, Febe and Öhlén, Joakim}, year = {2014}, volume = {94}, number = {2}, pages = {202--209}, } @inProceedings{kokkinakis-etal-2019-multifaceted-278217, title = {A Multifaceted Corpus for the Study of Cognitive Decline in a Swedish Population}, abstract = {A potential, early-stage diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, is the onset of language disturbances which is often characterized by subtle word-finding difficulties, impaired spontaneous speech, slight speech hesitancy, object naming difficulties and phonemic errors. Connected speech provides valuable information in a non-invasive and easy-to-assess way for determining aspects of the severity of language impairment. 
Data elicitation is an established method of obtaining highly constrained samples of connected speech that allows us to study the intricate interactions between various linguistic levels and cognition. In the paper, we describe the collection and content of a corpus consisting of spontaneous Swedish speech from individuals with Mild Cognitive Impairment (MCI), with Subjective Cognitive Impairment SCI) and healthy, age-matched controls (HC). The subjects were pooled across homogeneous subgroups for age and education, a sub-cohort from the Gothenburg-MCI study. The corpus consists of high quality audio recordings (including transcriptions) of several tasks, namely: (i) a picture description task – the Cookie-theft picture, an ecologically valid approximation to spontaneous discourse that has been widely used to elicitate speech from speakers with different types of language and communication disorders; (ii) a read aloud task (including registration of eye movements) – where participants read a text from the IREST collection twice, both on a computer screen (while eye movements are registered), and the same text on paper; (iii) a complex planning task – a subset of executive functioning that tests the ability to identify, organize and carry out (complex) steps and elements that are required to achieve a goal; (iv) a map task – a spontaneous speech production/semi-structured conversation in which the participants are encouraged to talk about a predefined, cooperative task-oriented topic; (v) a semantic verbal fluency task – category animals: where participants have to produce as many words as possible from a category in a given time (60 seconds). The fluency tests require an elaborate retrieval of words from conceptual (semantic) and lexical (phonetic) memory involving specific areas of the brain in a restricted timeframe. All samples are produced by Swedish speakers after obtaining written consent approved by the local ethics committee. 
Tasks (i) and (ii) have been collected twice in a diachronically apart period of 18 months between 2016 and 2018. The corpus represents an approximation to speech in a natural setting: The material for elicitation is controlled in the sense that the speakers are given specific tasks to talk about, and they do so in front of a microphone. The corpus may serve as a basis for many linguistic and/or speech technological investigations and has being already used for various investigations of language features.}, booktitle = {CLARe4 : Corpora for Language and Aging Research, 27 February – 1 March 2019, Helsinki, Finland}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Eckerström, Marie and Horn, Greta and Themistocleous, Charalambos}, year = {2019}, } @inProceedings{fraser-etal-2018-improving-264397, title = {Improving the Sensitivity and Specificity of MCI Screening with Linguistic Information.}, abstract = {The Mini-Mental State Exam (MMSE) is a screening tool for cognitive impairment. It has been extensively validated and is widely used, but has been criticized as not being effective in detecting mild cognitive impairment (MCI). In this study, we examine the utility of augmenting MMSE scores with automatically extracted linguistic information from a narrative speech task to better differentiate between individuals with MCI and healthy controls in a Swedish population. We find that with the addition of just four linguistic features, the F score (measuring a trade-off between sensitivity and specificity) is improved from 0.67 to 0.81 in logistic regression classification. These preliminary results suggest that the accuracy of traditional screening tools may be improved through the addition of computerized language analysis.}, booktitle = {Proceedings of the LREC workshop: Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments (RaPID-2). 
8th of May 2018, Miyazaki, Japan / Dimitrios Kokkinakis (ed.)}, author = {Fraser, Kathleen and Lundholm Fors, Kristina and Eckerström, Marie and Themistocleous, Charalambos and Kokkinakis, Dimitrios}, year = {2018}, ISBN = {979-10-95546-26-9}, } @article{fraser-etal-2019-multilingual-270713, title = {Multilingual word embeddings for the assessment of narrative speech in mild cognitive impairment}, abstract = {We analyze the information content of narrative speech samples from individuals with mild cognitive impairment (MCI), in both English and Swedish, using a combination of supervised and unsupervised learning techniques. We extract information units using topic models trained on word embeddings in monolingual and multilingual spaces, and find that the multilingual approach leads to significantly better classification accuracies than training on the target language alone. In many cases, we find that augmenting the topic model training corpus with additional clinical data from a different language is more effective than training on additional monolingual data from healthy controls. Ultimately we are able to distinguish MCI speakers from healthy older adults with accuracies of up to 63\% (English) and 72\% (Swedish) on the basis of information content alone. We also compare our method against previous results measuring information content in Alzheimer's disease, and report an improvement over other topic-modeling approaches. 
Furthermore, our results support the hypothesis that subtle differences in language can be detected in narrative speech, even at the very early stages of cognitive decline, when scores on screening tools such as the Mini-Mental State Exam are still in the “normal” range.}, journal = {Computer Speech and Language}, author = {Fraser, Kathleen and Lundholm Fors, Kristina and Kokkinakis, Dimitrios}, year = {2019}, volume = {53}, pages = {121--139}, } @inProceedings{themistocleous-etal-2018-effects-270215, title = {Effects of Mild Cognitive Impairment on vowel duration}, abstract = {Mild cognitive impairment (MCI) is a neurological condition, which is characterized by a noticeable decline of cognitive abilities, including communicative and linguistic skills. In this study, we have measured the duration of vowels produced in a reading task by 55 speakers— 30 healthy controls and 25 MCI—. The main results showed that MCI speakers differed significantly from HC in vowel duration as MCI speakers produced overall longer vowels. Also, we found that gender effects on vowel duration were different in MCI and HC. One significant aspect of this finding is that they highlight the contribution of vowel acoustic features as markers of MCI.}, booktitle = {Proceedings of the 9th Tutorial \& Research Workshop on Experimental Linguistics, 28 - 30 August 2018, Paris, France}, editor = {Botinis, Antonis}, author = {Themistocleous, Charalambos and Kokkinakis, Dimitrios and Eckerström, Marie and Fraser, Kathleen and Lundholm Fors, Kristina}, year = {2018}, ISBN = {978-960-466-162-6}, } @misc{kokkinakis-2018-resources-265118, title = {Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments (RaPID-2)}, abstract = {Proceedings of the second RaPID: "Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments". 
An LREC workshop. 8th of May 2018, Miyazaki, Japan}, author = {Kokkinakis, Dimitrios}, year = {2018}, ISBN = {979-10-95546-26-9}, } @inProceedings{themistocleous-kokkinakis-2018-themis-265112, title = {THEMIS-SV: Automatic classification of language disorders from speech signals}, abstract = {Background and Aims: Brain injuries resulting from stroke can affect the production of speech resulting in different types of language impairments, such as aphasia. Studying these productions manually is an extremely cumbersome and time consuming process. The aim of this paper is to present THEMIS-SV: a system that enables the automatic transcription of speech signals and the segmentation of vowels and consonants in Swedish. Method: The input of the system are recordings of speech. The system processes the recordings and returns an output with three tiers: the utterance tier, the word tier, and the vowels/consonants tier. Results: The output of the system is a fast and reliable transcription and segmentation of speech, which is very close to transcriptions and segmentations performed manually. The automatic segmentation of speech enables targeted acoustic measurements, such as measurements of consonant spectra, formant frequencies of vowels, fundamental frequency, pauses, speech rate, etc. and other acoustic measurements that have been known to differentiate between the different types of language disorders. Conclusion: The method proposed here can be employed for the analysis of speech of individuals with post-stroke aphasia and other language disorders and constitutes a promising step towards a fully automated differential diagnostic tool for language disorders. }, booktitle = {Abstracts of the 4th European Stroke Organisation Conference (ESOC 2018). Gothenburg, Sweden, 16-18 May, 2018. 
}, author = {Themistocleous, Charalambos and Kokkinakis, Dimitrios}, year = {2018}, } @inProceedings{lundholmfors-etal-2018-voice-264400, title = {Eye-voice span in adults with mild cognitive impairment (MCI) and healthy controls. }, abstract = {Objectives: This study is part of a larger project focused on developing new techniques for identification of early linguistic and extra-linguistic signs of cognitive impairment, with the overall goal of identifying dementia in the preclinical stage. In a previous study, we found that eye movements during reading can be used to distinguish between subjects with mild cognitive impairment (MCI) and healthy controls with up to 86\% accuracy. In this study, we are investigating the process of reading aloud, by exploring the eye-voice span in subjects with and without cognitive impairment. The aim of the study is to identify differences in the reading processes and evaluate whether these differences can be used to discriminate between the two groups. Methods: The eye-voice span is a measurement of the temporal and spatial organization between the eye and the voice, and is affected by for example working memory and automaticity, but also by the familiarity and length of words. In previous work, differences between eye movements when reading in healthy controls and subjects with cognitive impairments have been identified, and it has been shown that subjects with Alzheimer’s disease show impairments when reading aloud, specifically with regards to speech and articulation rate. Results: We present a quantitative and qualitative analysis of the reading process in the subjects, focusing both on general measures of eye-voice span, but also specifically on instances of hesitation and mistakes in the speech, and the correlated eye movements. 
Conclusions/Take home message: Early detection of dementia is important for a number of reasons, such as giving the person access to interventions and medications, and allowing the individual and families time to prepare. By expanding the knowledge about reading processes in subjects with MCI, we are adding to the potential of using reading analysis as an avenue of detecting early signs of dementia.}, booktitle = {Book of Abstracts 10th CPLOL Congress 10-12 May 2018, Cascais, Portugal / editor : Trinite, Baiba }, author = {Lundholm Fors, Kristina and Fraser, Kathleen and Kokkinakis, Dimitrios}, year = {2018}, } @inProceedings{kokkinakis-etal-2018-textforskning-265113, title = {Kan textforskning bidra till tidigare och säkrare demensdiagnostik?}, abstract = {Tidigare forskning har visat att subtila språkstörningar kan finnas vid de tidigaste förstadierna till demens, flera år innan en klinisk diagnos kan ställas. Inom ramen för projektet ”Språkliga och extra-lingvistiska parametrar för tidig upptäckt av kognitiv svikt” (finansierat av Riksbankens Jubileumsutlysning, 2016-19) undersöker vi med hjälp av språkteknologi och språkanalysstudier hur dessa språkstörningar yttrar sig. Kan språkteknologi användas för att upptäcka dessa tidiga språkrelaterade symtom och därmed bidra med nyanserad, komplementär och användbar kunskap? Kan användning av språkteknologi särskilja personer med de allra tidigaste kognitiva avvikelserna från personer med mer godartad, åldersrelaterad kognitiv svikt? Vilka språkliga förmågor drabbas? Hur yttrar sig dessa förändringar och vilka slags empiriska material finns att tillgå? Dessa är några av de frågor vi söker svar på. Vi gör inspelningar som vi analyserar för att kunna ta fram ny kunskap om subtila språkliga kännetecken som kan föregå demensutveckling. 
Denna kunskap kan användas för att eventuellt kunna förutsäga vilka individer som befinner sig i riskzonen för att utveckla demens, och kan vara användbar som komplementerande beslutsunderlag till domänexperter. Vi utvinner, analyserar och undersöker om det finns samband mellan olika språkrelaterade parametrar från spontan talinteraktion, transkriptioner men även ögonrörelser och neuropsykologiska tester från personer med subjektiv eller lindrig kognitiv nedsättning och friska kontrollpersoner. Många gånger är det svårt att avgöra huruvida lindriga kognitiva symtom är en del av det normala åldrandet eller början på en neurodegenerativ process. Vi förväntar oss inte heller att varje enskild person med kognitiv nedsättning kommer att uttrycka sig eller läsa på samma sätt utan snarare att dessa personer tidigt i sjukdomsförloppet kommer att börja uppvisa olika slags avvikande läsmönster, eller göra fonologiska, lexikala, syntaktiska eller semantiska fel. I studien utvecklar vi verktyg för att automatiskt hitta dessa avvikelser, och målet är att detta sedan ska kunna användas som komplement till tidig diagnostik samt som prognostiskt eller screeningverktyg. Deltagarna i vår studie har rekryterats från en pågående longitudinell studie, ”Demens i Tidigt Skede”, (eng. ”The Gothenburg MCI study”) på Minnesmottagningen i Göteborg, och vårt projekt har godkänts av den lokala etiknämnden. Alla deltagare i studien (kontrollgruppen [HC], personer med subjektiv kognitiv nedsättning [SCI] och personer med mild kognitiv nedsättning [MCI]) har genomgått baslinjeundersökning och gett informerat skriftligt samtycke (demografisk information finns i tabell 1). Vårt projekt är f.n. pågående och vi kommer presentera resultat baserade på inspelningstillfälle nr ett (aug. 2016-mars 2017). En ny inspelningsomgång, med samma deltagare, började i februari 2018 och förväntas vara avslutat i december 2018. 
Under presentationen kommer vi ge exempel på olika tal-, text- och ögonrörelseanalyser vi har genomfört och diskutera metodval och resultat baserade på studiens första fas. Vi kommer vidare ge en kort inblick i den nya, pågående inspelningsomgången och de nya testmoment vi använder. Vi vill med vårt arbete visa hur språkteknologisk analys kan bidra till att utöka vår kunskap inom området så att den kan vara användbar för tidig diagnostik och optimal omvårdnad. Enligt Socialstyrelsen (2017) finns det i Sverige över 160 000 personer med någon demenssjukdom. Våra resultat kan ha en betydelse för vårdpersonal som snabbare vill diagnostisera och identifiera individer med olika former av kognitiv funktionsnedsättning innan allvarliga symtom blir påtagliga. Utvecklingsmöjligheterna är många: nya eller förbättrade kognitiva screeningtester som skulle kunna användas inom primär- och specialistvården, samt utveckling och tillämpning av insatser som kan påverka beteendemönster och träna upp individens kommunikativa förmåga, kan på sikt leda till positiva konsekvenser som minskade vårdköer samt effektivare behandling avseende kostnader och behandlingsutfall.}, booktitle = {Forum för textforskning 13 , Lund 7 – 8 juni 2018}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Eckerström, Marie and Themistocleous, Charalambos}, year = {2018}, } @inProceedings{edstrom-etal-2018-ageism-267250, title = {Ageism and Swedish news media}, abstract = {Ageism can be seen as a “social disease”, a casual or systematic prejudice, stereotyping and discriminating against individuals or groups on the basis of their age. This is an area of growing concern, particularly the role of mainstream media in relationship to ageism. A valuable and important step is to understand the presence of ageing and older age how different types of online news media. 
The main objective of this pilot work is to test, collate and produce evidence from Swedish news media representations of older ages and ageing. METHOD(S) Two pilot studies/experiments; first names and their frequencies of the carriers’ age according to Statistics Sweden (SCB) and their presence in 39 online news between 2015 and 2018. ( 4, 7 millions texts). using general pattern matching techniques with regular expressions and applying them to 13 issues (1994, 2001-13) of Göteborgs-Posten (Swedish news corpora). Definition: Older persons ≥60 years. (25 \% of the population in Sweden is over 60 years). RESULTS AND CONCLUSIONS: Clear and consistent differences of how various age spans are represented in the news. 20-50 year olds is highly over represented compared with the Swedish population, while 0-24 and people over 54 are underrepresented, especially women. Pattern matching exhibits similar characteristics with the exception of obituaries where the elderly mentions are much more frequent. Our pilot studies confirm the introspective view of underrepresentation of old age and older people in synchronic media sources. More studies are required and in the near future we plan to improve, scale and apply our methodology on both synchronic and diachronic data using e.g. available text corpora and try to get a solid perspective on whether any differences or trends can be revealed within a larger time span }, booktitle = {24th Nordic Congress of Gerontology (NKG). 
Oslo, Norway: 2-4 May 2018 }, author = {Edström, Maria and Kokkinakis, Dimitrios and Berggren, Max}, year = {2018}, } @inProceedings{kokkinakis-etal-2018-swedish-262851, title = {A Swedish Cookie-Theft Corpus}, abstract = {Language disturbances can be a diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, at earlier stages, and connected speech analysis provides a non-invasive and easy-to-assess measure for determining aspects of the severity of language impairment. In this paper we focus on the development of a corpus consisting of audio recordings of picture descriptions of the Cookie-theft, produced by Swedish speakers, and accompanying transcriptions. The speech elicitation procedure provides an established method of obtaining highly constrained samples of connected speech that can allow us to study the intricate interactions between various linguistic levels and cognition. We chose the Cookie-theft picture since it is a standardized test that has been used in various studies in the past, and therefore comparisons can be made based on previous results. This type of picture description task might be useful for detecting subtle language deficits in patients with subjective and mild cognitive impairment. The resulting corpus is a new, rich and multi-faceted resource for the investigation of linguistic characteristics of connected speech and a unique data set that provides a rich resource for (future) research and experimentation in many areas, and of language impairment in particular. 
The information in the corpus can also be combined and correlated with other collected data about the speakers, such as neuropsychological tests, imaging and brain physiology markers and cerebrospinal fluid markers.}, booktitle = {LREC 2018, 11th edition of the Language Resources and Evaluation Conference, 7-12 May 2018, Miyazaki (Japan) / Editors: Nicoletta Calzolari (Conference chair), Khalid Choukri, Christopher Cieri, Thierry Declerck, Sara Goggi, Koiti Hasida, Hitoshi Isahara, Bente Maegaard, Joseph Mariani, Hélène Mazo, Asuncion Moreno, Jan Odijk, Stelios Piperidis, Takenobu Tokunaga}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Fraser, Kathleen and Nordlund, Arto}, year = {2018}, publisher = {European Language Resources Association}, ISBN = {979-10-95546-00-9}, } @misc{kokkinakis-2016-proceedings-252412, title = {Proceedings of LREC 2016 Workshop: Resources and Processing of Linguistic and Extra-Linguistic Data from People with Various Forms of Cognitive/Psychiatric Impairments (RaPID-2016), Monday 23rd of May 2016. Linköping electronic conference proceedings.}, abstract = {The purpose of the Workshop on “Resources and ProcessIng of linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric impairments” (RaPID-2016) was to provide a snapshot view of some of the current technological landscape, resources, data samples and also needs and challenges in the area of processing various data from individuals with various types of mental and neurological health impairments and similar conditions at various stages; increase the knowledge, understanding, awareness and ability to achieve useful outcomes in this area and strengthen the collaboration between researchers and workers in the field of clinical/nursing/medical sciences and those in the field of language technology/computational linguistics/Natural Language Processing (NLP). 
Although many of the causes of cognitive and neuropsychiatric impairments are difficult to foresee and accurately predict, physicians and clinicians work with a wide range of factors that potentially contribute to such impairments, e.g., traumatic brain injuries, genetic predispositions, side effects of medication, and congenital anomalies. In this context, there is new evidence that the acquisition and processing of linguistic data (e.g., spontaneous story telling) and extra-linguistic and production measures (e.g., eye tracking) could be used as a complement to clinical diagnosis and provide the foundation for future development of objective criteria to be used for identifying progressive decline or degeneration of normal mental and brain functioning. An important new area of research in NLP emphasizes the processing, analysis, and interpretation of such data and current research in this field, based on linguistic-oriented analysis of text and speech produced by such a population and compared to healthy adults, has shown promising outcomes. This is manifested in early diagnosis and prediction of individuals at risk, the differentiation of individuals with various degrees of severity forms of brain and mental illness, and for the monitoring of the progression of such conditions through the diachronic analysis of language samples or other extralinguistic measurements. Initially, work was based on written data but there is a rapidly growing body of research based on spoken samples and other modalities. 
Nevertheless, there remains significant work to be done to arrive at more accurate estimates for prediction purposes in the future and more research is required in order to reliably complement the battery of medical and clinical examinations currently undertaken for the early diagnosis or monitoring of, e.g., neurodegenerative and other brain and mental disorders and accordingly, aid the development of new, non-invasive, time and cost-effective and objective (future) clinical tests in neurology, psychology, and psychiatry.}, author = {Kokkinakis, Dimitrios}, year = {2016}, publisher = {Linköping University Electronic Press}, address = {Linköping}, ISBN = {978-91-7685-730-4}, } @inProceedings{bjorkner-etal-2017-voice-256522, title = {Voice acoustic parameters for detecting signs of early cognitive impairment}, abstract = {Aiding the detection of very early cognitive impairment in Alzheimer's disease (AD) and assessing the disease progression are essential foundations for effective psychological assessment, diagnosis and planning. Efficient tools for routine dementia screening in primary health care, particularly non-invasive and cost-effective methods, are desirable. The aim of this study is to find out if voice acoustic analysis can be a useful tool for detecting signs of early cognitive impairment.}, booktitle = {PEVOC (PanEuropean Voice Conference) 12, August 30th - September 1st 2017, Ghent, Belgium}, author = {Björkner, Eva and Lundholm Fors, Kristina and Kokkinakis, Dimitrios and Nordlund, Arto}, year = {2017}, } @article{smith-etal-2012-studie-170897, title = {Ny studie visar hur information till patienter med kolorektal cancer kan förbättras}, abstract = {Skriftligt informationsmaterial är ofta skrivet på för hög nivå och ställer höga krav på den tänkta läsaren (patienten). Förutom läsbarhet finns det fler faktorer att utvärdera för att se om materialet är lämpligt. 
Innehåll, struktur, layout och typsnitt, illustrationer och lärande och motivation är sådant som bör tas hänsyn till. Ett lämpligare, bättre anpassat material kan hjälpa personer med sjukdom att ställa bättre frågor när de har samtal med vårdpersonal och det kan göra personen mindre osäker och orolig för det okända som väntar. En ny studie som ingår i forskningsprojektet PINCORE (personcentred information and communication in colorectal cancer care) syftar till att förbättra information och kommunikation vid kolorektal cancer.}, journal = {Cancervården}, author = {Smith, Frida and Öhlén, Joakim and Carlsson, Eva and Friberg, Febe and Forsberg, Markus and Kokkinakis, Dimitrios}, year = {2012}, number = {5}, pages = {18--21}, } @inProceedings{kokkinakis-etal-2016-data-243069, title = {Data Resource Acquisition from People at Various Stages of Cognitive Decline – Design and Exploration Considerations}, abstract = {In this paper we are introducing work in progress towards the development of an infrastructure (i.e., design, methodology, creation and description) of linguistic and extra-linguistic data samples acquired from people diagnosed with subjective or mild cognitive impairment and healthy, age-matched controls. The data we are currently collecting consists of various types of modalities; i.e. audio-recorded spoken language samples; transcripts of the audio recordings (text) and eye tracking measurements. The integration of the extra-linguistic information with the linguistic phenotypes and measurements elicited from audio and text, will be used to extract, evaluate and model features to be used in machine learning experiments. In these experiments, classification models that will be trained, that will be able to learn from the whole or a subset of the data to make predictions on new data in order to test how well a differentiation between the aforementioned groups can be made. Features will be also correlated with measured outcomes from e.g. 
language-related scores, such as word fluency, in order to investigate whether there are relationships between various variables.}, booktitle = {The Seventh International Workshop on Health Text Mining and Information Analysis (Louhi). November 5, 2016, Austin, Texas, USA}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Nordlund, Arto}, year = {2016}, } @inProceedings{kokkinakis-2016-linguistic-243100, title = {Linguistic and extra-linguistic parameters for early detection of cognitive impairment}, abstract = {AIM: to adapt, develop and test methods that in isolation have shown promising outcomes on tasks related to (early) detection of dementia, differentiating between various dementia types and controls and also increase our understanding of the cognitive processes that underlie written text and certain forms of spoken language production. Unlike previous models, based solely on a certain aspect of language abilities (i.e. on written or spoken language alone), the project is comprehensive and more likely to provide new insights in the area of dementia detection and improve practices applied so far. The project builds on the success stories of the past and focus on the interplay between various types of technologies that hold the potential to provide reliable estimates for the detection of cognitive decline. 
The project emphasizes its interdisciplinary nature, by bringing together researchers from humanities (computational linguistics / language technology), computer science and medicine, and foresees the development of a comprehensive set of novel analytic approaches not explored jointly in the past GOAL: discovering evidence about linguistic performance and identifying whether the addition of new ways for investigating, combining and evaluating measurement and other parameters for improvement of established models can advance our understanding of: i) the boundaries between normal aging and dementia; ii) its effects on linguistic performance extrapolated from various sources and iii) whether effects of cognitive decline can be seen across (daily) language production. }, booktitle = {European Summer School on Eye Movements (ESSEM), 11-17 september, 2016 Athens, Greece.}, author = {Kokkinakis, Dimitrios}, year = {2016}, } @inProceedings{kokkinakis-etal-2016-specifications-243183, title = {Specifications and Methodology for Language-Related Data Acquisition and Analysis in the Domain of Dementia Diagnostics}, abstract = {This paper outlines the initial stages of a project that aims to build and use a corpus with data samples acquired from people diagnosed with subjective or mild cognitive impairment and healthy, age-matched controls. The data we are currently collecting consists of audio-recorded spoken language samples; transcripts of the audio recordings and eye tracking measurements. From these data we plan to extract, evaluate and model features to be used for learning classification models in order to test how well a differentiation between the aforementioned subject groups can be made. Features will be also correlated with outcomes from e.g. 
other language-related scores, such as word fluency, in order to investigate whether there are relationships between various variables.}, booktitle = { The Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016}, author = {Kokkinakis, Dimitrios and Lundholm Fors, Kristina and Björkner, Eva and Nordlund, Arto}, year = {2016}, } @inProceedings{kokkinakis-etal-2012-literacy-164587, title = {Literacy Demands and Information to Cancer Patients}, abstract = {This study examines language complexity of written health information materials for patients undergoing colorectal cancer surgery. Written and printed patient information from 28 Swedish clinics are automatically analyzed by means of language technology. The analysis reveals different problematic issues that might have impact on readability. The study is a first step, and part of a larger project about patients’ health information seeking behavior in relation to written information material. Our study aims to provide support for producing more individualized, person centered information materials according to preferences for complex and detailed or legible texts and thus enhance a movement from receiving information and instructions to participating in knowing. 
In the near future the study will continue by integrating focus groups with patients that may provide valuable feedback and enhance our knowledge about patients’ use and preferences of different information material.}, booktitle = {Proceedings of the 15th International Conference on Text, Speech and Dialogue}, author = {Kokkinakis, Dimitrios and Forsberg, Markus and Johansson Kokkinakis, Sofie and Smith, Frida and Öhlén, Joakim}, year = {2012}, ISBN = {978-3-642-32789-6}, } @inProceedings{borin-etal-2013-mining-188846, title = {Mining semantics for culturomics: towards a knowledge-based approach}, abstract = {The massive amounts of text data made available through the Google Books digitization project have inspired a new field of big-data textual research. Named culturomics, this field has attracted the attention of a growing number of scholars over recent years. However, initial studies based on these data have been criticized for not referring to relevant work in linguistics and language technology. This paper provides some ideas, thoughts and first steps towards a new culturomics initiative, based this time on Swedish data, which pursues a more knowledge-based approach than previous work in this emerging field. The amount of new Swedish text produced daily and older texts being digitized in cultural heritage projects grows at an accelerating rate. These volumes of text being available in digital form have grown far beyond the capacity of human readers, leaving automated semantic processing of the texts as the only realistic option for accessing and using the information contained in them. 
The aim of our recently initiated research program is to advance the state of the art in language technology resources and methods for semantic processing of Big Swedish text and focus on the theoretical and methodological advancement of the state of the art in extracting and correlating information from large volumes of Swedish text using a combination of knowledge-based and statistical methods.}, booktitle = {2013 ACM International Workshop on Mining Unstructured Big Data Using Natural Language Processing, UnstructureNLP 2013, Held at 22nd ACM International Conference on Information and Knowledge Management, CIKM 2013; San Francisco, CA; United States; 28 October 2013 through 28 October 2013}, author = {Borin, Lars and Dubhashi, Devdatt and Forsberg, Markus and Johansson, Richard and Kokkinakis, Dimitrios and Nugues, Pierre}, year = {2013}, ISBN = {978-1-4503-2415-1}, pages = {3--10}, } @inProceedings{kokkinakis-malm-2015-detecting-225762, title = {Detecting Reuse of Biblical Quotes in Swedish 19th Century Fiction using Sequence Alignment}, abstract = {Text reuse, a form of text repetition, recycling or borrowing, is a theoretically and practically interesting problem that has attracted considerable attention during the last years e.g. in the cultural heritage context (historical and comparative linguistics); in the context of social network propagation of ideas and in the measuring of journalistic reuse. In this paper we briefly outline and experiment with a method used for biological sequence alignment that have been also used in humanities research for e.g. the detection of similar passages in the complete works of Voltaire and 18th century French encyclopedias or for tracing how and which ideas spread in 19th century US-newspaper collections. 
We use available software (text-PAIR: Pairwise Alignment for Intertextual Relations) and experiment with the Charles XII Bible translation into Swedish, completed in 1703, against the content of the Swedish prose fiction 1800-1900, in order to automatically detect passages taken from this particular Bible translation in the prose fiction corpus.}, booktitle = {Corpus-based Research in the Humanities workshop (CRH), 10 December 2015 Warsaw, Poland }, author = {Kokkinakis, Dimitrios and Malm, Mats}, year = {2015}, ISBN = {978-83-63159-19-1}, pages = {79--86}, } @inProceedings{moradi-etal-2014-graph-197533, title = {A Graph-Based Analysis of Medical Queries of a Swedish Health Care Portal}, abstract = {Today web portals play an increasingly important role in health care allowing information seekers to learn about diseases and treatments, and to administrate their care. Therefore, it is important that the portals are able to support this process as well as possible. In this paper, we study the search logs of a public Swedish health portal to address the questions if health information seeking differs from other types of Internet search and if there is a potential for utilizing network analysis methods in combination with semantic annotation to gain insights into search behaviors. Using a semantic-based method and a graph-based analysis of word cooccurrences in queries, we show there is an overlap among the results indicating a potential role of these types of methods to gain insights and facilitate improved information search. In addition we show that samples, windows of a month, of search logs may be sufficient to obtain similar results as using larger windows. 
We also show that medical queries share the same structural properties found for other types of information searches, thereby indicating an ability to reuse existing analysis methods for this type of search data.}, booktitle = {The Fifth International Workshop on Health Text Mining and Information Analysis (Louhi)}, author = {Moradi, Farnaz and Eklund, Ann-Marie and Kokkinakis, Dimitrios and Olovsson, Tomas and Tsigas, Philippas}, year = {2014}, ISBN = {978-1-937284-90-9}, pages = {2--10}, } @inProceedings{grahn-kokkinakis-2014-legitimating-216142, title = {Legitimating the visit - a recurrent challenge among patients with medically unexplained symptoms}, abstract = {The doctor’s evaluation of presented symptoms as doctorable, is a legitimation of the patient’s decision to seek medical care. It is also a confirmation of the rational, and even the moral, status of the patient, since consulting a doctor without good reasons is considered irrational. The analysis focuses on how patients take initiatives to present problems and on the doctors’ responses and evaluations regarding the doctorability. Situations where participants seem to have different views of the doctorability of the problems are examined in relation to conversational practices and social actions. The analyses shows that the doctor as well as the patient orients to the potential doctorability of the problems and to the moral challenges related to it, but that their different expectations and roles lead to communicatively unclear situations. 
Further analyses will illustrate in what ways the MUS-patients’ recurrent challenge of legitimating their visits could be influenced by the interaction, and hence in what ways conscious conversational practices from the care givers might facilitate these situations.}, booktitle = {Conference on Communication, Medicine and Ethics (COMET), Lugano, 26-28 June 2014}, author = {Grahn, Inga-Lill and Kokkinakis, Dimitrios}, year = {2014}, } @inProceedings{kokkinakis-etal-2015-gender-215535, title = {Gender-Based Vocation Identification in Swedish 19th Century Prose Fiction using Linguistic Patterns, NER and CRF Learning}, abstract = {This paper investigates how literature could be used as a means to expand our understanding of history. By applying macroanalytic techniques we are aiming to investigate how women enter literature and particularly which functions they assume, their working patterns and if we can spot differences in how often male and female characters are mentioned with various types of occupational titles (vocation) in Swedish literary texts. Modern historiography, and especially feminist and women’s history has emphasized a relative invisibility of women’s work and women workers. The reasons behind this are manifold, and the extent, the margin of error in terms of women’s work activities is of course hard to assess. Therefore, vocation identification can be used as an indicator for such exploration and we present a hybrid system for automatic annotation of vocational signals in 19th century Swedish prose fiction. Beside vocations, the system also assigns gender (male, female or unknown) to the vocation words, a prerequisite for the goals of the study and future in-depth explorations of the corpora.}, booktitle = {Proceedings of the Fourth Workshop on Computational Linguistics for Literature (Clfl). Co-located with the NAACL/HLT. 
Denver, Colorado, USA}, author = {Kokkinakis, Dimitrios and Ighe, Ann and Malm, Mats}, year = {2015}, pages = {9}, } @inProceedings{ahlberg-etal-2014-swedish-210083, title = {Swedish FrameNet++ The Beginning of the End and the End of the Beginning}, booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014}, author = {Ahlberg, Malin and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Friberg Heppin, Karin and Johansson, Richard and Kokkinakis, Dimitrios and Olsson, Leif-Jöran and Uppström, Jonatan}, year = {2014}, } @inProceedings{kokkinakis-grahn-2014-corpus-209807, title = {A corpus-based approach to the identification of non-literal language in a medical setting.}, abstract = {Automated processing of clinical texts is commonly faced with various less exposed, and not so regularly discussed linguistically complex problems that need to be addressed. One of these issues concerns the usage of figurative language. Figurative language implies the use of words that go beyond their ordinary meaning, a linguistically complex and challenging problem and also a problem that causes great difficulty for the field of natural language processing (NLP). The problem is equally prevalent in both general language and also in various sublanguages, such as clinical medicine. Therefore we believe that a comprehensive model of e.g. clinical language processing needs to account for figurative language usage, and this paper provides a description, and preliminary results towards this goal. Since the empirical, clinical data used in the study is limited in size, there is no formal distinction made between different sub-classifications of figurative language. e.g., metaphors, idioms or simile. We illustrate several types of figurative expressions in the clinical discourse and apply a rather quantitative and corpus-based level analysis. 
The main research questions that this paper asks are whether there are traces of figurative language (or at least a subset of such types) in patient-doctor and patient-nurse interactions, how can they be found in a convenient way and whether these are transferred in the electronic health records and to what degree.}, booktitle = {Proceedings of the Conference on Communication, Medicine and Ethics (COMET), Lugano, 26-28 June 2014}, author = {Kokkinakis, Dimitrios and Grahn, Inga-Lill}, year = {2014}, pages = {1}, } @inProceedings{kokkinakis-etal-2014-hfst-209800, title = {HFST-SweNER -- A New NER Resource for Swedish}, abstract = {Named entity recognition (NER) is a knowledge-intensive information extraction task that is used for recognizing textual mentions of entities that belong to a predefined set of categories, such as locations, organizations and time expressions. NER is a challenging, difficult, yet essential preprocessing technology for many natural language processing applications, and particularly crucial for language understanding. NER has been actively explored in academia and in industry especially during the last years due to the advent of social media data. This paper describes the conversion, modeling and adaptation of a Swedish NER system from a hybrid environment, with integrated functionality from various processing components, to the Helsinki Finite-State Transducer Technology (HFST) platform. 
This new HFST-based NER (HFST-SweNER) is a full-fledged open source implementation that supports a variety of generic named entity types and consists of multiple, reusable resource layers, e.g., various n-gram-based named entity lists (gazetteers).}, booktitle = {Proceedings of the 9th edition of the Language Resources and Evaluation Conference (LREC), Reykjavik 26 - 31 May 2014}, author = {Kokkinakis, Dimitrios and Niemi, Jyrki and Hardwick, Sam and Lindén, Krister and Borin, Lars}, year = {2014}, ISBN = {978-2-9517408-8-4}, pages = {2537--2543}, } @inProceedings{kokkinakis-etal-2014-vocation-209808, title = {Vocation Identification in Swedish Fiction}, abstract = {This paper presents a system for automatic annotation of vocational signals in 19th century Swedish prose fiction. Besides vocation identification, the system assigns gender (male, female, unknown) to the vocation words. Since gender is a prominent attribute of first names, we apply a named-entity recognizer (NER) that uses first name gazetteers where each name has been pre-assigned gender, which aids gender assignment to vocations with unknown gender if appropriate context is available. We also use a statistical modelling method, conditional random fields (CRF), for learning gender-assigned vocations in combination with the results of the NER and other pattern matching techniques. The purpose of this work is to develop and apply tools to literature as means to expand our understanding of history in the area of literature-based gender studies, e.g. investigate how women enter literature, which functions do they assume and their working patterns. 
Vocation identification can be used as one such indicator for achieving some of these goals.}, booktitle = {Proceedings of the Fifth Swedish Language Technology Conference (SLTC)}, author = {Kokkinakis, Dimitrios and Ighe, Ann and Malm, Mats}, year = {2014}, pages = {3}, } @inProceedings{kokkinakis-etal-2014-semantics-209802, title = {Semantics in Storytelling in Swedish Fiction}, abstract = {In this paper, we aim to define foundations and research questions for future large scale exploration of various types of semantic relationships in literature, namely Swedish prose fiction. More specifically, we are interested to get an in-depth understanding of storytelling in Swedish fiction by analyzing and mining the narrative discourse in a small sample of such data, focusing on interpersonal relationships and answering various questions such as how to recognize and assess gender patterns. Our intention is to apply our findings into a much larger scale in the near future in order to obtain useful insights about the social relations, structures, behavior and everyday life of characters found in literary works, thus enhancing the use of prose fiction as a source for research within the humanities and social sciences. Our work is inspired by the notions of distant reading and macroanalysis, a relatively new and often contested paradigm of literary research. In order to achieve our goal we strive for a combination of natural language processing techniques and simple visualizations that allow the user to rapidly focus on key areas of interest and provide the ability to discover latent semantic patterns and structures. 
}, booktitle = {Proceedings of the Digital Access to textual Cultural Heritage (DATeCH).}, author = {Kokkinakis, Dimitrios and Malm, Mats and Bergenmar, Jenny and Ighe, Ann}, year = {2014}, ISBN = {978-1-4503-2588-2}, pages = {6}, } @inProceedings{kokkinakis-eklund-2013-query-189552, title = {Query Logs as a Corpus.}, abstract = {This paper provides a detailed description of a large Swedish health-related query log corpus and explores means to derive useful statistics, their distributions and analytics from its content across several dimensions. Information acquisition from query logs can be useful for several purposes and potential types of users, such as terminologists, infodemiologists / epidemiologists, medical data and web analysts, specialists in NLP technologies such as information retrieval and text mining but also public officials in health and safety organizations.}, booktitle = {Corpus Linguistics 2013 : abstract book. Lancaster: UCREL}, editor = {Andrew Hardie and Robbie Love}, author = {Kokkinakis, Dimitrios and Eklund, Ann-Marie}, year = {2013}, pages = {329}, } @inProceedings{kokkinakis-2013-annotation-189536, title = {Annotation of interpersonal relations in Swedish prose fiction.}, abstract = {This paper describes the manual annotation of a small sample of Swedish 19th and 20th century prose fiction with interpersonal relations between characters in six literary works. An interpersonal relationship is an association between two or more people that may range in duration from brief to enduring. The annotation is guided by a named entity recognition step. Our goal is to get an in-depth understanding of the difficulties of such a task and elaborate a model that can be applied for similar annotation on a larger scale, both manually as well as automatically. 
The identification of interpersonal relations can, hopefully, aid the reader of a Swedish literary work to better understand its content and plot, and get a bird’s eye view on the landscape of the core story. Our aim is to use such annotations in a hybrid context, i.e., using machine learning and rule-based methods, which, in conjunction with named entity recognition, can provide the necessary infrastructure for creating detailed biographical sketches and extracting facts for various named entities which can be exploited in various possible ways by Natural Language Processing (NLP) technologies such as summarization, question answering, as well as visual analytic techniques.}, booktitle = {Proceedings of the 3rd Workshop on Annotation of Corpora for Research in the Humanities (ACRH-3). Sofia, Bulgaria.}, author = {Kokkinakis, Dimitrios}, year = {2013}, ISBN = {978-954-91700-5-4}, pages = {37--47}, } @inProceedings{kokkinakis-malm-2013-macroanalytic-188518, title = {A Macroanalytic View of Swedish Literature using Topic Modeling.}, abstract = {New research opportunities are plentiful for digital and literature scholars who are currently faced with increasingly large portions of large digitized archives produced during the last decades. Conventional methods of analysis involving a so called close reading view are not enough. Distant reading or macroanalysis is proposed instead, as a better, viable and more pragmatic alternative to the traditional methods of analyzing e.g., literature. According to this view, understanding literature is not accomplished by studying individual texts, but by aggregating and analyzing massive amounts of data. Therefore, applying macroanalytic methods and technologies is a priority among many research groups in the humanities worldwide. In this paper we explore topic modeling, an increasingly popular statistical method used for uncovering themes, topics and patterns in large amounts of text. 
We use available topic modeling software and, as empirical data, the content of the Swedish literature bank, a constantly growing body of Swedish fiction corpus from the 18th and 19th century. We present preliminary results on a sample of this corpus and discuss how humanistic research can be conducted through this type of computation, as a means to identify potential issues of interest e.g., for historians.}, booktitle = {Corpus Linguistics 2013 : abstract book (Lancaster)}, editor = {Andrew Hardie and Robbie Love}, author = {Kokkinakis, Dimitrios and Malm, Mats}, year = {2013}, } @inProceedings{kokkinakis-2013-figurative-168227, title = {Figurative Language in Swedish Clinical Texts}, abstract = {Automated processing of clinical texts with the intention to link all important text fragments to various established terminologies and ontologies for relation or event extraction is commonly faced with various less exposed, and not so regularly discussed linguistically motivated issues that needs to be addressed. One of these issues is the usage of figurative language. Figurative language, that is the use of words that go beyond their ordinary meaning, is not only a linguistically complex and challenging problem but also a problem that causes great difficulty for the field of natural language processing (NLP), both for the processing of general language and of various sublanguages, such as clinical medicine. Therefore, a comprehensive model of e.g. clinical language processing needs to account for figurative language usage and this paper provides a description towards this goal. Since the empirical, clinical data used in the study is limited in size, there is no formal distinction made between different sub-classifications of figurative language. e.g., metaphors, idioms or simile. 
As a matter of fact, all these types of expressions form a continuum with fuzzy boundaries, and most of the NLP-oriented approaches discussed in the past have used either very large data for the analysis or hand annotates samples, a situation that has been prohibitive so far in our project. Therefore distinction is solely based on a more general level, namely between literal versus figurative language, and on a more quantitative and corpus-based level, supported with concrete examples that illustrate several types of figurative expressions in the clinical discourse. The main research questions that this paper asks are whether there are traces of figurative language (or at least a subset of such types) in patient doctor and patient nurse interactions, how can they be found in a convenient way and whether these are transferred in the electronic health records and to what degree. }, booktitle = {Computational Semantics in Clinical Text workshop. Part of the 10th International Conference on Computational Semantics, Potsdam, Germany}, author = {Kokkinakis, Dimitrios}, year = {2013}, ISBN = {978-1-62748-398-8}, pages = {6}, } @inProceedings{hamon-etal-2013-medication-189545, title = {Medication Extraction and Guessing in Swedish, French and English}, abstract = {Extraction of information related to the medication is an important task within the biomedical area. While the elaboration and updating of the drug vocabularies cannot follow the rapid evolution of the drug development, we propose an automatic method for the extraction of known and new drug names. Our method combines internal and contextual clues. The method is applied to different types of documents in three languages (Swedish, French and English). The results indicate that with this kind of approach, we can efficiently update and enrich the existing drug vocabularies (probably with rapid manual browsing). Precision and recall scores varied between 81%-91% for precision and 85%-100% for recall. 
As a future work we intend to continuously refine the approach, by for instance better integration of semantic patterns and fuzzy matching that should hopefully enable further increase of the obtained results.}, booktitle = {Proceedings of the 14th World Congress on Medical and Health Informatics (MEDINFO). Studies in Health Technology and Informatics. Copenhagen, Denmark.}, author = {Hamon, Thierry and Grabar, Natalia and Kokkinakis, Dimitrios}, year = {2013}, volume = {192}, } @inProceedings{kokkinakis-2013-terminologihantering-189541, title = {Terminologihantering i medicinska loggfiler}, booktitle = {Proceedings of the "Nationell termkonferens". Göteborg}, author = {Kokkinakis, Dimitrios}, year = {2013}, } @inProceedings{kokkinakis-2013-medical-188517, title = {Medical Event Extraction using Frame Semantics - Challenges and Opportunities}, abstract = {The aim of this paper is to present some findings from a study into how a large scale semantic resource, FrameNet, can be applied for event extraction in the (Swedish) biomedical domain. Combining lexical resources with domain specific knowledge provide a powerful modeling mechanism that can be utilized for event extraction and other advanced text mining-related activities. The results, from developing a rule-based approach, showed that only small discrepancies and omissions were found between the semantic descriptions, the corpus data examined and the domain-specific semantics provided by SNOMED CT (medical terminology), NPL (medicinal products) and various semi-automatically developed clue lists (e.g., domain-related abbreviations). 
Although the described experiment is only based on four different domain-specific frames, the methodology is extendable to the rest ones and there is much room for improvements, for instance by combining rule-based with machine learning techniques, and using more advanced syntactic representations.}, booktitle = {Proceedings of the 14th International Conference on Intelligent Text Processing and Computational Linguistics (CICLing), Samos, Greece}, author = {Kokkinakis, Dimitrios}, year = {2013}, } @article{oelke-etal-2013-fingerprint-181484, title = {Fingerprint Matrices: Uncovering the dynamics of social networks in prose literature}, abstract = {In prose literature often complex dynamics of interpersonal relationships can be observed between the different characters. Traditionally, node-link diagrams are used to depict the social network of a novel. However, static graphs can only visualize the overall social network structure but not the development of the networks over the course of the story, while dynamic graphs have the serious problem that there are many sudden changes between different portions of the overall social network. In this paper we explore means to show the relationships between the characters of a plot and at the same time their development over the course of a novel. Based on a careful exploration of the design space, we suggest a new visualization technique called Fingerprint Matrices. A case study exemplifies the usage of Fingerprint Matrices and shows that they are an effective means to analyze prose literature with respect to the development of relationships between the different characters.}, journal = {Computer Graphics Forum}, author = {Oelke, D. and Kokkinakis, Dimitrios and Keim, D. 
A.}, year = {2013}, volume = {32}, number = {3}, pages = {371--380}, } @inProceedings{smith-etal-2012-forbattra-170895, title = {Hur kan vi förbättra skriftligt informations- och utbildningsmaterial för patienter som opereras elektivt för kolorektal cancer?}, abstract = {Kolorektal cancer (KRC) är den tredje största cancerdiagnosen i Sverige med drygt 5500 drabbade årligen. Primär behandling är kirurgi kompletterad av pre- och postoperativ onkologisk behandling. Standardiserade koncept för accelererat vårdförlopp med kortare vårdtider lägger mycket fokus på fysisk rehabilitering, men mindre på den psykiska påfrestning det innebär att bli opererad för en cancerdiagnos. Patienter förväntas ta stort ansvar för sin rehabilitering, både på sjukhuset och hemma. För att vara förberedd behövs både skriftlig och muntlig information. Syftet med studien var att kartlägga och karaktärisera det skriftliga informations- och utbildningsmaterial (IOU) som används till patienter som opereras elektivt för KRC. Vidare var syftet att beskriva patienters uppfattning om struktur och innehåll på IOU. IOU från 28 kliniker som opererar patienter med KRC samlades in (totalt 220 st). För att kunna ge ett mått på texternas svårighetsgrad gjordes språkteknologisk analys på samtliga IOU, där bl.a. ordlängd, meningsbyggnad och jämförelse med annan typ av litteratur mättes På 117 st gjordes en suitabilityanalys med instrumentet SAM+CAM där domän som innehåll, läsbarhet, bilder, layout samt stimulans och motivation för lärande bedömdes. Fem fokusgrupper med patienter genomfördes där patienterna uppmanades att berätta om vad de tycker utmärker ett bra respektive dåligt IOU, vad de saknar i innehåll och när och på vilket sätt de vill ha materialet utlämnat. Resultatet av språkteknologiska- och suitabilityanalysen visar att de flesta IOU bedömdes som ”adequate”, men spridningen var stor. 
Patienterna hade önskemål om mer nivåuppdelat/nivåriktat material, där man själv kan välja hur mycket information man vill ha vid ett visst tillfälle. Flera ämnen saknades, eller var för otydligt beskrivna för att patienterna skulle känna sig trygga vid hemgång. Resultatet av de tre analysmetoderna bör kunna användas för att utveckla en ”verktygslåda” för att i framtiden kunna utforma bättre riktat IOU för patientgruppen. }, booktitle = {Nationella konferensen i Cancervård, 24-25 maj 2012, Stockholm}, author = {Smith, Frida and Öhlén, Joakim and Carlsson, Eva and Forsberg, Markus and Kokkinakis, Dimitrios and Friberg, Febe}, year = {2012}, } @inProceedings{eklund-kokkinakis-2012-drug-165309, title = {Drug interests revealed by a public health portal}, abstract = {Online health information seeking has become an important part of people's everyday lives. However, studies have shown that many of those have problems forming effective queries. In order to develop better support and tools for assisting people in health-related query formation we have to gain a deeper understanding into their information seeking behaviour in relation to key issues, such as medication and drugs. The present study attempts to understand the semantics of the users' information needs with respect to medication-related information. Search log queries from the Swedish 1177.se health portal were automatically annotated and categorized according to relevant background knowledge sources. Understanding the semantics of information needs can enable optimization and tailoring of (official) health related information presented to the online consumer, provide better terminology support and thematic coding of the queries and in the long run better models of consumers’ information needs. }, booktitle = {Proceedings of the SLTC-Workshop: Exploratory Query-log Analysis. 
Lund, Sweden.}, author = {Eklund, Ann-Marie and Kokkinakis, Dimitrios}, year = {2012}, pages = {2}, } @inProceedings{kokkinakis-2012-initial-164788, title = {Initial Experiments of Medication Event Extraction Using Frame Semantics}, abstract = {Semantic annotation of text corpora for mining complex relations and events has gained a considerable growing attention in the medical domain. The goal of this paper is to present a snapshot of ongoing work that aims to develop and apply an appropriate infrastructure for automatic event labelling and extraction in the Swedish medical domain. Annotated text samples, appropriate lexical resources (e.g. term lists and the Swedish Frame-Net++) and hybrid techniques are currently developed in order to alleviate some of the difficulties of the task. As a case study this paper presents a pilot approach based on the application of the theory of frame semantics to automatically identify and extract detailed medication information from medical texts. Medication information is often written in narrative form (e.g. in clinical records) and is therefore difficult to be acquired and used in computerized systems (e.g. decision support). Currently our approach uses a combination of generic entity and terminology taggers, specifically designed medical frames and various frame-related patterns. 
Future work intends to improve and enhance current results by using more annotated samples, more medically-relevant frames and combination of supervised learning techniques with the regular expression patterns.}, booktitle = {Scandinavian Conference on Health Informatics (SHI)}, author = {Kokkinakis, Dimitrios}, year = {2012}, series = {Linköping Electronic Conference Proceedings}, ISBN = {978-91-7519-758-6}, pages = {41--47}, } @inProceedings{kokkinakis-2011-what-141312, title = {What is the Coverage of SNOMED CT® on Scientific Medical Corpora?}, abstract = {This paper reports on the results of a large scale mapping of SNOMED CT on scientific medical corpora. The aim is to automatically access the validity, reliability and coverage of the Swedish SNOMED-CT translation, the largest, most extensive available resource of medical terminology. The method described here is based on the generation of predominantly safe harbor term variants which together with simple linguistic processing and the already available SNOMED term content are mapped to large corpora. The results show that term variations are very frequent and this may have implication on technological applications (such as indexing and information retrieval, decision support systems, text mining) using SNOMED CT. Naïve approaches to terminology mapping and indexing would critically affect the performance, success and results of such applications.
SNOMED CT appears not well-suited for automatically capturing the enormous variety of concepts in scientific corpora (only 6,3% of all SNOMED terms could be directly matched to the corpus) unless extensive variant forms are generated and fuzzy and partial matching techniques are applied with the risk of allowing the recognition of a large number of false positives and spurious results.}, booktitle = {Studies in Health Technology and Informatics / XXIII International Conference of the European Federation for Medical Informatics}, author = {Kokkinakis, Dimitrios}, year = {2011}, volume = {169}, } @inProceedings{johansson-etal-2012-semantic-156400, title = {Semantic Role Labeling with the Swedish FrameNet}, abstract = {We present the first results on semantic role labeling using the Swedish FrameNet, which is a lexical resource currently in development. Several aspects of the task are investigated, including the selection of machine learning features, the effect of choice of syntactic parser, and the ability of the system to generalize to new frames and new genres. In addition, we evaluate two methods to make the role label classifier more robust: cross-frame generalization and cluster-based features. Although the small amount of training data limits the performance achievable at the moment, we reach promising results. 
In particular, the classifier that extracts the boundaries of arguments works well for new frames, which suggests that it already at this stage can be useful in a semi-automatic setting.}, booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12); Istanbul, Turkey; May 23-25}, author = {Johansson, Richard and Friberg Heppin, Karin and Kokkinakis, Dimitrios}, year = {2012}, ISBN = {978-2-9517408-7-7}, pages = {3697--3700}, } @inProceedings{oelke-etal-2012-advanced-155493, title = { Advanced Visual Analytics Methods for Literature Analysis}, abstract = {The volumes of digitized literary collections in various languages increase at a rapid pace, which results also in a growing demand for computational support to analyze such linguistic data. This paper combines robust text analysis with advanced visual analytics and bring a new set of tools to literature analysis. Visual analytics techniques can offer new and unexpected insights and knowledge to the literary scholar. We analyzed a small subset of a large literary collection, the Swedish Literature Bank, by focusing on the extraction of persons’ names, their gender and their normalized, linked form, including mentions of theistic beings (e.g., Gods’ names and mythological figures), and examined their appearance over the course of the novel. A case study based on 13 novels, from the aforementioned collection, shows a number of interesting applications of visual analytics methods to literature problems, where named entities can play a prominent role, demonstrating the advantage of visual literature analysis. Our work is inspired by the notion of distant reading or macroanalysis for the analyses of large literature collections. }, booktitle = {Language Technology for Cultural Heritage, Social Sciences, and Humanities (LaTeCH). An EACL 2012 workshop. 
Avignon, France.}, author = {Oelke, Daniela and Kokkinakis, Dimitrios and Malm, Mats}, year = {2012}, note = {Accepted}, pages = {10}, } @inProceedings{oelke-etal-2012-visual-155495, title = {Visual Analytics and the Language of Web Query Logs - A Terminology Perspective}, abstract = {This paper explores means to integrate natural language processing methods for terminology and entity identification in medical web session logs with visual analytics techniques. The aim of the study is to examine whether the vocabulary used in queries posted to a Swedish regional health web site can be assessed in a way that will enable a terminologist or medical data analysts to instantly identify new term candidates and their relations based on significant co-occurrence patterns. We provide an example application in order to illustrate how the visualizations of co-occurrence relationships between medical and general entities occurring in such logs can be visualized, accessed and explored. To enable a visual exploration of the generated co-occurrence graphs, we employ a general purpose social network analysis tool, Visone (http://visone.info), that permits to visualize and analyze various types of graph structures. Our examples show that visual analytics based on co-occurrence analysis provides insights into the use of layman language in relation to established (professional) terminologies, which may help terminologists decide which terms to include in future terminologies. Increased understanding of the used querying language is also of interest in the context of public health web sites. The query results should reflect the intentions of the information seekers, who may express themselves in layman language that differs from the one used on the available web sites provided by medical professionals.}, booktitle = {The 15th EURALEX International Congress (European Association of Lexicography).
Oslo, Norway.}, author = {Oelke, Daniela and Eklund, Ann-Marie and Marinov, Svetoslav and Kokkinakis, Dimitrios}, year = {2012}, pages = {8}, } @inProceedings{kokkinakis-oelke-2012-women-155537, title = {Men, Women and Gods: Distant Reading in Literary Collections - Combining Visual Analytics with Language Technology}, abstract = {The volumes of digitized literary collections in various languages increase at a rapid pace and so increases the need to computationally support the analysis of such data. Literature can be studied in a number of different ways and from many different perspectives and text analysis make up a central component of literature studies. If such analysis can be integrated with advanced visual methods and fed back to the daily work of the literature researcher, then it is likely to reveal the presence of useful and nuanced insights into the complex daily lives, ideas and beliefs of the main characters found in many of the literary works. In this paper we describe the combination of robust text analysis with visual analytics and bring a new set of tools to literary analysis. As a show case, we analyzed a small subset (13 novels of a single author) taken from a large literary collection, the Swedish Literature Bank <http://litteraturbanken.se/#!om/inenglish>. The analysis is based upon two levels of inquiry, namely by focusing on mentions of theistic beings (e.g. Gods' names) as well as mentions of persons' names, including their gender and their normalized, linked variant forms, and examining their appearance in sentences, paragraphs and chapters. The case study shows several successful applications of visual analytics methods to various literature problems and demonstrates the advantages of the implementation of visual literature fingerprinting. Our work is inspired by the notion of distant reading or macronalysis for the analyses of literature collections. 
We start by recognizing all characters in the novels using a mature language technology (named entity recognition) which can be turned into a tool in aid of text analysis in this field. We apply context cues, lists of animacy and gender markers and inspired by the document centered approach and the labelled consistency principle which is a form of on-line learning from documents under processing which looks at unambiguous usages of words or names for assigning annotations in ambiguous words or names. For instance, if in an unambiguous context where there is a strong gender indicator, such as 'Mrs Alexander' the name 'Alexander' is assigned a feminine gender, then subsequent mentions of the same name in the same discourse will be assigned the feminine gender as well unless there is a conflict with another person with the same name. We argue, that the integration of text analysis such as the one briefly outlined and visualization techniques, such as higher resolution pixel-based fingerprinting, could be put to effective use also in literature studies. We also see an opportunity to devise new ways of exploring the large volumes of literary texts being made available through national cultural heritage digitization projects, for instance by exploring the possibility to show several literary texts (novels) at once. 
We will illustrate some of the applied techniques using several examples from our case study, such as summary plots based on all the characters in these novels as well as fingerprints based on the distribution of characters across the novels.}, booktitle = {Proceedings of the Advances in Visual Methods for Linguistics (AVML)}, author = {Kokkinakis, Dimitrios and Oelke, Daniela}, year = {2012}, note = {Accepted}, } @inProceedings{kokkinakis-2012-journal-155893, title = {The Journal of the Swedish Medical Association - a Corpus Resource for Biomedical Text Mining in Swedish.}, abstract = {Biomedical text mining applications are largely dependent on high quality knowledge resources. Traditionally, these include lexical databases, terminologies, nomenclatures and ontologies and, during the last decade, also corpora of various sizes, variety and diversity. Some of these corpora are annotated with an expanding range of information types and metadata while others become available with a minimal set of annotations. At the same time, it is of great importance that biomedical corpora for lesser-spoken languages also get developed in order to support and facilitate the implementation of practical applications for such languages and to stimulate the development of language technology research and innovation infrastructures in the domain. This paper provides a detailed description of a Swedish biomedical corpus based on the electronic editions of the Journal of the Swedish Medical Association "Läkartidningen" of the years 1996-2010. The corpus consists of a variety of documents that can be related to different medical domains, developed as a response to the increasing needs for large and reliable medical information for Swedish biomedical NLP.
The corpus has been structurally annotated with a minimal set of meta information and automatically indexed with the largest and systematically organised computer processable collection of medical terminology, the Swedish SNOMED CT (Systematized Nomenclature of Medicine -- Clinical Terms). This way topic-focused subcorpora, e.g. with diabetes-related content, can be easily developed.}, booktitle = {The Third Workshop on Building and Evaluating Resources for Biomedical Text Mining (BioTxtM), an LREC Workshop. Turkey.}, author = {Kokkinakis, Dimitrios}, year = {2012}, note = {Accepted}, } @inProceedings{kokkinakis-etal-2012-contextualisation-155530, title = {Contextualisation of functional symptoms in primary health care}, abstract = {Background: a number of patients consulting primary health care have physical symptoms that may be labeled “medically unexplained”, i.e. absence of a demonstrable organic etiology. Common functional somatic symptoms (FSS) are irritable bowel, tension headache and chronic fatigue. FSS-patients are generally frustrated with the inability of health care to alleviate their illness. Health care staff often also feel frustration. The communication between patient and care giver is the key for coming to terms with the problem. Objective: to investigate how complex, vague and long-standing symptoms with no identified organic cause are put into context, interpreted and acted upon in primary health-care interactions. Two types of interventions are envisaged (i) methods for early identification of patients at risk of entering a vicious circle of functional symptoms and (ii) methods for re-interpreting symptoms in alternative and more purposeful ways. Methods: the project studies interactions between patients and nurses giving advice over telephone, consultations between patients and physicians, interviews and study patients' medical case notes. Eligible patients (18-65 y.o.)
contact their primary health care centre by telephone, have had at least eight physical consultations with nurses or physicians in the last 12 months and if a majority of the symptoms within this time span had no clear organic or psychiatric cause. The project contains a number of subprojects, according to the type of data collected. Several methods of analysis will be used, mainly critical discourse analysis, phenomenologic-hermeneutic and computation linguistic analyses. (Expected) Results: using the collected data, we describe characteristics of the communication that takes place in these settings and the way symptoms and diseases are represented. This will facilitate the development of future interventions aimed at decreasing the morbidity due to FSS and give further insights into the problem. }, booktitle = {The 5th GENEVA Conference on Person-Centered Medicine. Geneva, Switzerland. }, author = {Kokkinakis, Dimitrios and Lidén, Eva and Svensson, Staffan and Björk Brämberg, Elisabeth and Määttä, Sylvia}, year = {2012}, } @inProceedings{smith-etal-2011-developing-152723, title = {Developing a toolkit for written information materials for patients with colorectal cancer undergoing elective surgery}, abstract = {This study examines language complexity, readability and suitability, of written health information materials given to patients undergoing colorectal cancer (CRC) surgery. The overall aim is to investigate whether the implementation of adapted, person-centred information and communication for patients with CRC undergoing elective surgery, can enhance the patients’ self-care beliefs and well-being during recovery in the phase following diagnosis and initial treatment. Several explorative, qualitative studies are planned and will function both as a basis for the proposed interventions and provide explanations for the actual processes leading to the desired outcomes. 
Patients’ knowledge enablement will be reached by several interrelated intervention strategies and specific activities. One of these strategies deals with means to facilitate patients’ information seeking patterns and the goal is to provide patients with written information materials according to preferences for complex and detailed or legible texts. Thus, the interventions planed aim to enhance a movement from receiving information and instructions to participating in knowing. Written and printed patient information material from 28 Swedish clinics for patients diagnosed with CRC undergoing elective surgery were selected for analysis by means of standard metrics and more elaborate language technology techniques. Various text parameters such as lexical variation, frequency bands and the use of terminology were examined. The material was also analysed using a Suitability Assessment Instrument in order to examine content, literacy demand, graphic illustrations, layout and typography, learning stimulation and finally cultural appropriateness. In addition, five focusgroups were conducted where patients were asked to give their experiences of using written information. Results from the language technology analysis showed a variety in materials, where it could be divided in to easy, medium and difficult to read and comprehend. Patients in focusgroups told they would like written materials to be levelled in order to gain information stepwise, but also stressed the importance of information given both orally and in writing, and that they must correspond. Using the SAM-instrument was a good complement for deeper understanding, and taking all three analyses in account, we aim to design a balanced toolkit for how to best design written information materials where a person tailored approach can be offered. 
}, booktitle = {Svenska Läkaresällskapets Riksstämman}, author = {Smith, Frida and Carlsson, Eva and Friberg, Febe and Kokkinakis, Dimitrios and Forsberg, Markus and Öhrn, Matilda and Öhlén, Joakim}, year = {2011}, } @inProceedings{kokkinakis-2011-medicinska-149931, title = {Medicinska terminologier - officiella standarder och verklighet}, abstract = {Officiella medicinska termlistor hinner aldrig bli helt kompletta eller uppdaterade i tid med de senaste upptäckterna inom det (bio)medicinska fältet Växande behov av koppling mellan fack- och allmänspråk för praktiska (medicinskt orienterade) tillämpningar, t.ex. "din journal på nätet"-projektet Applikationer med indata som innehåller både fackspråk och allmänspråk - brist på täckande medicinska (elektroniska) ordböcker/termlistor med integrerad utförlig språklig och medicinsk information för lekmän finns inte transkriberade patient-läkarsamtal Använda existerande medicinska terminologier i språkteknologisk forskning som stöd för informationsutvinning - skapa strukturerade representationer av texter (samförekomstanalys; faktaextraktion och syntes; relation- och händelseextraktion; t.ex. mellan sjukdom - behandling - utfall få att kunna få ett bra underlag för att kunna förutsäga hur framtida behandlingar slår) Använda terminologin som ett medium för att underlätta kommunikationen mellan hälsotagare och hälsogivare t.ex. underlätta förståelse av medicinska termer av allmänheten }, booktitle = {Terminologiworkshop i Karlstad}, author = {Kokkinakis, Dimitrios}, year = {2011}, } @inProceedings{kokkinakis-2011-natural-149930, title = {Natural language processing of clinical data with a focus on diffuse symptoms}, abstract = {The medical domain is well supported with a wealth of large, rich and varied controlled vocabularies and terminological resources.
This paper investigates the extent by which the largest available medical nomenclature for Swedish, the Systematized Nomenclature of Medicine Clinical Terms (SNOMED CT), can handle a particularly challenging and difficult to automatically acquire type of terminology, namely (clinical) phenotypes. The aim of the study is to better understand phenotype contextualization in order to improve and enhance our knowledge of communicative events in various healthcare settings. Our approach can be seen as an exploratory one in which we believe to yield useful insights into the nature of how findings, symptoms and signs (i.e. clinical phenotypes in general) are expressed in real data. This study is initiated in the context of the project "Interpretation and understanding of functional symptoms in primary health care". The main research goal of which is to study health care interactions with patients suffering from Functional Somatic Syndromes (FSS). FSS are characterized by particular constellations of medically unexplained, often chronic symptoms, such as dizziness, fatigue, dyspepsia, muscle and joint pain. We use methods from the natural language processing field in order to investigate how symptom mentions are expressed and how available successful automated means are for capturing symptom descriptions both on collected written (patient records) and transcribed material (patient/nurse and patient/doctor encounters). We manually evaluated the content of the resource on the collected data and our results indicate that a large number of such phenotypes are expressed using figurative language, or contextualized using a number of variant expressions. SNOMED CT cannot easily accommodate for such variation and vagueness expressed in real text data, unless we devise means to handle such variation, e.g. by the use of near synonym dictionaries, development and linking of consumer health vocabularies.
The presented research has several implications since accurate identification of phenotypes can for instance increase the value of available data in decision making and thus allow automatic systems to dynamically correct inappropriate domain decisions. We have evaluated the content of a large controlled vocabulary for Swedish on symptom descriptions in clinical texts.}, booktitle = {Läkaresällskapets Riksstämman}, author = {Kokkinakis, Dimitrios}, year = {2011}, } @inProceedings{kokkinakis-2008-semantic-73975, title = {Semantic Relation Mining of Solid Compounds in Medical Corpora.}, abstract = {In the context of scientific and technical texts, meaning is usually embedded in noun compounds and the semantic interpretation of these compounds deals with the detection and semantic classification of the relation that holds between the compound’s constituents. Semantic relation mining, the technology applied for marking up, interpreting, extracting and classifying relations that hold between pairs of words, is an important enterprise that contribute to deeper means of enhancing document understanding technologies, such as Information Extraction, Question Answering, Summarization, Paraphrasing, Ontology Building and Textual Entailment. This paper explores the application of assigning semantic descriptors taken from a multilingual medical thesaurus to a large sample of solid (closed form) compounds taken from large Swedish medical corpora, and determining the relation(s) that may hold between the compound constituents. Our work is inspired by previous research in the area of using lexical hierarchies for identifying relations between two-word noun compounds in the medical domain. In contrast to previous research, Swedish, as other Germanic languages, require further means of analysis, since compounds are written as one sequence with no white space between the words, e.g. virus diseases vs.
virussjukdomar, which makes the problem more challenging, since solid compounds are harder to identify and segment.}, booktitle = {Proceedings of the 21st Conference on the European Federation for Medical Informatics (MIE 2008)}, author = {Kokkinakis, Dimitrios}, year = {2008}, ISBN = {9786611733414}, } @inProceedings{kokkinakis-malm-2011-character-143875, title = {Character Profiling in 19th Century Fiction}, abstract = {This paper describes the way in which personal relationships between main characters in 19th century Swedish prose fiction can be identified using information guided by named entities, provided by a entity recognition system adapted to the 19th century Swedish language characteristics. Interpersonal relation extraction is based on the context between two relevant, identified person entities. The extraction process of the relationships also utilize the content of on-line available lexical semantic resources (suitable vocabularies) and fairly standard context matching methods that provide a basic mechanism for identifying a wealth of interpersonal relations that hopefully can aid the reader of a 19th-century Swedish literary work to better understand its content and plot, and get a bird’s eye view on the landscape of the core story.}, booktitle = {Workshop: Language Technologies for Digital Humanities and Cultural Heritage in conjunction with the Recent Advances in Natural Language Processing (RANLP). Hissar, Bulgaria.}, author = {Kokkinakis, Dimitrios and Malm, Mats}, year = {2011}, } @inProceedings{kokkinakis-2011-reducing-143877, title = {Reducing Complexity in Parsing Scientific Medical Data, a Diabetes Case Study}, abstract = {The aim of this study is to assemble and deploy various NLP components and resources in order to parse scientific medical data and evaluate the degree in which these resources contribute to the overall parsing performance.
With parsing we limit our efforts to the identification of unrestricted noun phrases with full phrase structure and investigate the effects of using layers of semantic annotations prior to parsing. Scientific medical texts exhibit complex linguistic structure but also regularities that can be captured by pre-processing the texts with specialized semantically-aware tools. Our results show evidence of improved performance while the complexity of parsing is reduced. Parsed scientific texts and inferred syntactic information can be leveraged to improve the accuracy of higher-level tasks such as information extraction and enhance the acquisition of semantic relations and events.}, booktitle = {Workshop: Biomedical Natural Language Processing in conjunction with Recent Advances in Natural Language Processing (RANLP). Hissar, Bulgaria.}, author = {Kokkinakis, Dimitrios}, year = {2011}, } @article{kokkinakis-2010-complementary-125644, title = {Complementary Methods for De-identifying Sensitive Data with a focus on Clinical Discourse}, abstract = {In the era of the Electronic Health Record (EHR) the release of individual data for research, public health planning, health care statistics, monitoring of diagnostic tests, automated data collection for health care registries and tracking disease outbreaks are some of the areas in which the protection of Personal Health Information (PHI) has become an important concern. The purpose of this study is to adapt and apply synergetic methods to document de-identification, particularly clinical, or other sources of sensitive data.
The main challenge and goal of this research is to retain important concepts and PHI in the documents in a standardized and neutral manner as means of encryption without violating the integrity of the PHI and without sacrificing the quality and intended meaning of the authors.}, journal = {Revista de Procesamiento de Lenguaje Natural (SEPLN)}, author = {Kokkinakis, Dimitrios}, year = {2010}, volume = {45}, pages = {243--246}, } @article{kokkinakis-2009-shallow-105133, title = {Shallow Features for Differentiating Disease-Treatment Relations using Supervised Learning; a pilot study.}, abstract = {Clinical narratives provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a challenge for practitioners in the language technology field, particularly because of the nature of the texts (excessive use of terminology, abbreviations, orthographic term variation), the significant opportunities for clinical research that such material can provide and the potentially broad impact that clinical findings may have in every day life. It is therefore recognized that the capability to automatically extract key concepts and their relationships from such data will allow systems to properly understand the content and knowledge embedded in the free text which can be of great value for applications such as information extraction and question & answering. This paper gives a brief presentation of such textual data and its semantic annotation, and discusses the set of semantic relations that can be observed between diseases and treatments in the sample. The problem is then designed as a supervised machine learning task in which the relations are tried to be learned using pre-annotated data. 
The challenges designing the problem and empirical results are presented.}, journal = {Lecture Notes in Computer Science}, author = {Kokkinakis, Dimitrios}, year = {2009}, volume = {5729}, pages = {395--402}, } @article{kokkinakis-thurin-2007-anonymisation-45193, title = {Anonymisation of Swedish Clinical Data}, abstract = {There is a constantly growing demand for exchanging clinical and health-related information electronically. In the era of the Electronic Health Record the release of individual data for research, health care statistics, monitoring of new diagnostic tests and tracking disease outbreak alerts are some of the areas in which the protection of (patient) privacy has become an important concern. In this paper we present a system for automatic anonymisation of Swedish clinical free text, in the form of discharge letters, by applying generic named entity recognition technology.}, journal = {Lecture Notes in Computer Science}, author = {Kokkinakis, Dimitrios and Thurin, Anders}, year = {2007}, volume = {4594}, pages = {237--241}, } @article{kokkinakis-toporowskagronostaj-2006-comparing-34032, title = {Comparing Lay and Professional Language in Cardiovascular Disorders Corpora.}, abstract = {This paper reports on a corpus-based, contrastive study of Swedish medical language. It is focused on the vocabulary used in two types of medical textual material: professional portals and web-based consumer sites within the domain of cardiovascular disorders. Linguistic, statistical and quantitatively based readability studies are considered in order to find the typical language-dependent and, possibly, language independent characteristics of the material examined and suggest concrete measures that might bridge the gap in medical vocabulary as used by laypersons/consumers and professionals. 
}, journal = {WSEAS Transactions on BIOLOGY and BIOMEDICINE}, author = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria}, year = {2006}, volume = {3}, number = {6}, pages = {429--437}, } @inProceedings{borin-etal-2010-past-110368, title = {The past meets the present in Swedish FrameNet++}, abstract = {The paper is about a recently initiated project which aims at the development of a Swedish FrameNet as an integral part of a larger lexical resource, hence the name “Swedish FrameNet++” (SweFN++). It focuses on reuse of free electronic resources and their role in the acquisition and population of Swedish frames. After a brief overview of Swedish resources, we reflect on three approaches to recycling the available lexical data in a semi-automatic manner. SweFN++ will be a multi-functional resource supporting research within lexicology and linguistics as well as different applications within computational lexicography and language technology, not to mention e-science.}, booktitle = {14th EURALEX International Congress}, author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios}, year = {2010}, pages = {269--281}, } @inProceedings{kokkinakis-2011-health-141311, title = {Health Portals and Clinical Phenotypes - Recognition using SNOMED CT}, abstract = {The medical domain is particularly well endowed with various sources of terminology. Usually, such sources vary with respect to size, structure, depth and breadth of descriptive power, granularity and applicability. This paper investigates the extent by which the largest available medical nomenclature for Swedish can cope with a particularly challenging and difficult to automatically acquire type of terminology, namely (clinical) phenotypes. We evaluated the content of the resource on extracted reference symptom lists from several popular health portals. 
The results indicate that a large number of such phenotypes are expressed using figurative language, or contextualized using a number of variant expressions. SNOMED CT cannot easily accommodate for such variation and vagueness expressed in real text data, unless we devise means to handle such variation, e.g. by the use of near synonym dictionaries, development and linking of consumer health vocabularies. The presented research has several implications since accurate identification of phenotypes can for instance increase the value of available data in decision making and thus allow automatic systems to dynamically correct inappropriate domain decisions.},
  booktitle = {9th Scandinavian Conference on Health Informatics},
  author    = {Kokkinakis, Dimitrios},
  year      = {2011},
}

% The ampersand in the title is escaped so LaTeX does not read it as an alignment tab.
@inproceedings{kokkinakis-2011-evaluating-139977,
  title    = {Evaluating the Coverage of three Controlled Health Vocabularies with Focus on Findings, Signs \& Symptoms},
  abstract = {The medical domain is blessed with a magnitude of terminological resources of various characteristics, sizes, structure, depth and breadth of descriptive power, granularity etc. In this domain a particularly interesting and difficult entity type are signs, symptoms and findings which to a large extent are expressed in a periphrastic manner, sometimes by the use of figurative or metaphorical language, or contextualized using a wealth of vague variant expressions. We hypothesize therefore that no major official terminology source alone can accommodate for the variation and complexity present in real text data, such as electronic medical records, notes or health related documents. In this paper we evaluate the content of the three largest medical control vocabularies available for Swedish on extracted reference symptom lists and initiate a discussion on how we should proceed in order to accommodate for increased coverage on similar genres.
},
  booktitle = {Workshop on Creation, Harmonization and Application of Terminology Resources Co-located with NODALIDA 2011},
  author    = {Kokkinakis, Dimitrios},
  year      = {2011},
  pages     = {5},
}

@article{kokkinakis-gerdin-2010-lakartidningens-120480,
  title    = {Läkartidningens arkiv i en ny skepnad - En resurs för forskare, läkare och allmänhet},
  abstract = {I Sverige har det tagits fram en medicinsk korpus baserad på Läkartidningens digitala arkiv. Denna resurs möjliggör precisa sökningar och värdefull tillgång till medicinsk terminologisk information på olika nivåer. Dimitrios Kokkinakis från Göteborgs universitet och Ulla Gerdin från Socialstyrelsen presenterar projektet.},
  journal  = {Språkbruk},
  author   = {Kokkinakis, Dimitrios and Gerdin, Ulla},
  year     = {2010},
  volume   = {1/2010},
  pages    = {22--28},
}

% NOTE(review): the citation key below contains a double quote, which some BibTeX/babel
% toolchains reject; it is kept unchanged so that existing citations still resolve.
% Conference contribution: typed inproceedings with the venue in booktitle (was article/journal).
% The straight quotes in the title are replaced by typographic ones, and the stray "." after "?" is dropped.
@inproceedings{kokkinakis-2010-"data-130213,
  title     = {Är ”data scrubbing” en användbar metod för att anonymisera känsliga patientdata?},
  abstract  = {De senaste årens ökande användning av modern informationsteknik inom sjukvården har medfört en kraftig ökning av elektronisk dokumentation som rör patientens hälsotillstånd, vård och behandling. Vårddokumentationen blir både mer detaljerad och mer individuell, samtidigt som den uppdateras och förändras regelbundet. Patientjournalen är i första hand till för att bidra till en god och säker vård av patienten, men också en viktig informationskälla för FoU. Ett stort hinder för utnyttjandet av journalinformation som forskningskälla är de etiska och rättsliga problemen.
För att kunna hantera och utnyttja dessa stora och ständigt växande informationsmängder ställs därmed högre krav på säker, skyddad och effektiv informationshantering.},
  booktitle = {Svenska Läkaresällskapets Riksstämman},
  author    = {Kokkinakis, Dimitrios},
  year      = {2010},
}

% Conference contribution: typed inproceedings with the venue in booktitle (was article/journal).
@inproceedings{kokkinakis-2010-data-130212,
  title     = {Is data scrubbing useful for anonymizing sensitive data?},
  abstract  = {The release of individual data for research, public health planning, health care statistics, monitoring of diagnostic tests, automated data collection for health care registries and tracking disease outbreaks are some of the areas in which the protection of Personal Health Information (PHI) has become an important concern. The purpose of this study is to adapt and apply synergetic methods to document de-identification, particularly in the clinical setting. The main challenge is to retain important concepts and PHI in the documents in a standardized and neutral manner as means of encryption without violating the integrity of the PHI and without sacrificing the quality and intended meaning of the authors.},
  booktitle = {The Third Swedish Language Technology Conference},
  author    = {Kokkinakis, Dimitrios},
  year      = {2010},
}

% Conference contribution: typed inproceedings with the venue in booktitle (was article/journal).
@inproceedings{kokkinakis-2010-initiala-130210,
  title     = {Initiala resultat av en storskalig automatisk indexering av vetenskaplig litteratur med hela det svenska SNOMED CT - problem och möjligheter.},
  abstract  = {Syftet med denna studie är dels att skapa en stor samling svenska medicinska elektroniska texter, en korpus, och dels att validera och kvalitetssäkra existerande termer ur SNOMED CT (the Systematized NOmenclature of MEDicine - Clinical Terms) gentemot korpusinnehållet. På det sättet kan man få en objektiv uppfattning om SNOMED CT:s validitet, täckning och reliabilitet.
Man kan även berika terminologin med nya termer eller termvarianter genom att automatiskt extrahera termkandidater inom olika delfackområden från korpusen med hjälp av olika statistiska och lingvistiska metoder. Resultat av de korpusbaserade, empiriska studierna ska kunna användas av terminologer i deras arbete med att göra SNOMED CT mer täckande, pålitlig och enhetlig. Samtidigt, genom användning av autentisk data, kan man försäkra sig om att termvarianterna (existerande eller nya) är vedertagna termer hos fackmän. I fall flera etablerade termvarianter (nya termkandidater) förekommer i korpusen kan dessa införas efter manuell granskning som synonymer till rekommenderade termer (med stöd av ett lämpligt granskningsgränssnitt) och därmed vidare utveckla innehållet i SNOMED CT. Följaktligen kommer vår presentation att innehålla en redovisning som bygger på tre huvudpelare – korpusuppbyggnad – termvalidering – termextrahering. Korpusen samlades in från två källor efter erhållet tillstånd. Texternas ursprung i korpusen kommer dels från Läkartidningens (LT) digitala arkiv <http://ltarkiv.lakartidningen.se> och dels från DiabetologNytts (DN) digitala arkiv <http://diabetolognytt.se/aterkommande/arkivet.html>.},
  booktitle = {2010-års nationella termkonferens: Professionen i språket - språket i professionen},
  author    = {Kokkinakis, Dimitrios},
  year      = {2010},
}

% Conference contribution: typed inproceedings with the venue in booktitle (was article/journal).
@inproceedings{borin-etal-2010-swedish-129126,
  title     = {Swedish FrameNet++},
  booktitle = {Swedish Language Technology Conference 2010},
  author    = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  year      = {2010},
}

@inproceedings{allvin-etal-2010-characteristics-120479,
  title    = {Characteristics and Analysis of Finnish and Swedish Clinical Intensive Care Nursing Narratives},
  abstract = {We present a comparative study of Finnish and Swedish free-text nursing narratives from intensive care.
Although the two languages are linguistically very dissimilar, our hypothesis is that there are similarities that are important and interesting from a language technology point of view. This may have implications when building tools to support producing and using health care documentation. We perform a comparative qualitative analysis based on structure and content, as well as a comparative quantitative analysis on Finnish and Swedish Intensive Care Unit (ICU) nursing narratives. Our findings are that ICU nursing narratives in Finland and Sweden have many properties in common, but that many of these are challenging when it comes to developing language technology tools. },
  booktitle = {Proceedings of the NAACL HLT 2010 Second Louhi Workshop on Text and Data Mining of Health Documents},
  author    = {Allvin, H. and Carlsson, E. and Dalianis, H. and Danielsson-Ojala, R. and Daudaravicius, V. and Hassel, M. and Kokkinakis, Dimitrios and Lundgren-Laine, H. and Nilsson, G. and Nytrø, Ø. and Salanterä, S. and Skeppstedt, M. and Suominen, H.
and Velupillai, S.},
  year      = {2010},
  pages     = {53--60},
}

% The LIBRIS record id is not an ISBN, so it is stored in a separate `libris` field
% (unknown fields are ignored by standard styles) instead of `isbn`.
% NOTE(review): no publisher is recorded for this book.
@book{kokkinakis-2001-framework-125224,
  title   = {A Framework for the Acquisition of Lexical Knowledge; Description and Application},
  author  = {Kokkinakis, Dimitrios},
  year    = {2001},
  address = {Göteborg},
  libris  = {8245865},
}

@incollection{borin-kokkinakis-2010-literary-124517,
  title     = {Literary onomastics and language technology},
  booktitle = {Literary education and digital learning},
  author    = {Borin, Lars and Kokkinakis, Dimitrios},
  year      = {2010},
  publisher = {Information Science Reference},
  address   = {Hershey - New York},
  isbn      = {978-1-60566-932-8},
  pages     = {53--78},
}

@inproceedings{kokkinakis-2010-korpus-119444,
  title     = {Korpus för vårdens och omsorgens fackspråk.},
  abstract  = {Inom ramen för regeringens satsning ”Nationell IT-strategi för vård och omsorg” har Socialstyrelsen fått i uppdrag att översätta och anpassa begreppssystemet ’the Systematized Nomenclature of Medicine, Clinical Terms’ till svenska. Med hjälp av Läkartidningens digitala arkiv har vi utvecklat metoder för att effektivisera kvalitetssäkringen av terminnehållet.},
  booktitle = {Humanistdagen 2010 - humaniora i dagens samhälle},
  author    = {Kokkinakis, Dimitrios},
  year      = {2010},
}

@inproceedings{kokkinakis-toporowskagronostaj-2010-linking-119441,
  title    = {Linking SweFN++ with Medical Resources, towards a MedFrameNet for Swedish},
  abstract = {In this pilot study we define and apply a methodology for building an event extraction system for the Swedish scientific medical and clinical language. Our aim is to find and describe linguistic expressions which refer to medical events, such as events related to diseases, symptoms and drug effects.
In order to achieve this goal we have initiated actions that aim to extend and refine parts of the ongoing compilation of the Swedish FrameNet++ (SFN++), which, as its English original predecessor, is grounded in Frame Semantics which provides a sound theoretical ground for modeling and linking linguistic structures encountered in general language and in specific domains (after specialization). Using such resource we manually annotate domain texts to be used as training data for automatic event extraction by automated techniques.},
  booktitle = {Proceedings of the Second Louhi Workshop on Text and Data Mining of Health Documents. A NAACL-HLT Workshop},
  author    = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria},
  year      = {2010},
}

@inproceedings{borin-etal-2010-diabase-118907,
  title     = {Diabase: Towards a diachronic BLARK in support of historical studies},
  booktitle = {Proceedings of LREC 2010},
  author    = {Borin, Lars and Forsberg, Markus and Kokkinakis, Dimitrios},
  year      = {2010},
}

@inproceedings{kokkinakis-gerdin-2010-swedish-113194,
  title    = {A Swedish Scientific Medical Corpus for Terminology Management and Linguistic Exploration},
  abstract = {This paper describes the development of a new Swedish scientific medical corpus. We provide a detailed description of the characteristics of this new collection as well results for a number of term management tasks, including terminology validation and terminology extraction based on this material. Although the corpus is representative for the scientific medical domain it still covers a lot of specialised sub-disciplines such as “diabetes” and “osteoporosis” which makes it suitable for facilitating the production of smaller and more focused subcorpora.
We have tried to address this issue by making explicit some features of the corpus in order to demonstrate the corpus usefulness particularly for the quality assessment of official terminologies such as the Systematized NOmenclature of MEDicine - Clinical Terms (SNOMED CT).},
  booktitle = {Proceedings of the 7th international conference on Language Resources and Evaluation (LREC), Malta},
  author    = {Kokkinakis, Dimitrios and Gerdin, Ulla},
  year      = {2010},
}

@inproceedings{borin-etal-2007-medical-44951,
  title     = {Medical frames as target and tool},
  booktitle = {FRAME 2007: Building Frame Semantics resources for Scandinavian and Baltic languages. (Nodalida 2007 workshop proceedings)},
  author    = {Borin, Lars and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  year      = {2007},
  isbn      = {978-91-976939-0-5},
  pages     = {11--18},
}

% Technical reports name the issuing body in `institution` (required for this entry type), not `publisher`.
% The bare underscore in the title was replaced by a hyphen: an unescaped "_" is a LaTeX error in text mode.
% NOTE(review): hyphen assumed to be the intended character; confirm against the report itself.
@techreport{ahlfelt-etal-2006-literature-34047,
  title       = {Literature Review on Patient-Friendly Documentation Systems},
  author      = {Åhlfelt, Hans and Borin, Lars and Daumke, Philipp and Grabar, Natalia and Hallett, Catalina and Hardcastle, David and Kokkinakis, Dimitrios and Mancini, Clara and Marko, Kornel and Merkel, Magnus and Pietsch, Christian and Power, Richard and Scott, Donia and Silvervarg, Annika and Toporowska Gronostaj, Maria and Williams, Sandra and Willis, Alistair},
  year        = {2006},
  institution = {Göteborg University},
  address     = {Göteborg},
}

@inproceedings{marko-etal-2006-cross-34049,
  title    = {Cross-Lingual Alignment of Medical Lexicons},
  abstract = {We present an approach for the creation of a multilingual medical dictionary for the biomedical domain. In a first step, available monolingual lexical resources are compiled into a common interchange format. Secondly, according to a linking format decided by the authors, the cross-lingual mappings of lexical entries are added.
We show how these mappings can be generated using a morpho-semantic term normalization engine, which captures intra- as well as interlingual synonymy relationships on the level of subwords.},
  booktitle = {Language Resources and Evaluation},
  author    = {Marko, Kornel and Baud, Robert and Zweigenbaum, Pierre and Merkel, Magnus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios and Schulz, Stefan},
  year      = {2006},
  volume    = {2006},
  pages     = {5--8},
}

% Conference paper: typed inproceedings with the venue in booktitle (was article/journal),
% and the truncated venue name "roceedings" is restored to "Proceedings".
@inproceedings{kokkinakis-etal-2007-lexical-45194,
  title     = {Lexical Parameters, Based on Corpus Analysis of English and Swedish Cancer Data, of Relevance for NLG},
  abstract  = {This paper reports on a corpus-based, contrastive study of the Swedish and English medical language in the cancer sub-domain. It is focused on the examination of a number of linguistic parameters differentiating two types of cancer-related textual material, one intended for medical experts and one for laymen. Language-dependent and language independent characteristics of the textual data between the two languages and the two registers are examined and compared. The aim of the work is to gain insights into the differences between lay and expert texts in order to support natural language generation (NLG) systems.},
  booktitle = {Proceedings of the 16th Nordic Conference of Computational Linguistics (NODALIDA)},
  author    = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria and Hallett, Catalina and Hardcastle, David},
  year      = {2007},
}

@inproceedings{kokkinakis-toporowskagronostaj-2006-language-33938,
  title    = {Lay Language versus Professional Language within the Cardiovascular Subdomain - a Contrastive Study},
  abstract = {This paper reports on a corpus-based, contrastive study of Swedish medical language. It is focused on the vocabulary used in two types of medical textual material: professional portals and web-based consumer sites within the domain of cardiovascular disorders.
Linguistic, statistical and quantitatively based readability studies are considered in order to find the typical language-dependent and, possibly, language independent characteristics of the material examined and suggest concrete measures that might bridge the gap in medical vocabulary as used by laypersons/consumers and professionals.},
  booktitle = {Proceedings of the 2006 WSEAS Int. Conf. on Cellular \& Molecular Biology, Biophysics \& Bioengineering},
  author    = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria},
  year      = {2006},
}

% Abstract cleaned of extraction damage (a stray footnote digit and two fused word pairs); its ampersand is escaped for LaTeX.
@inproceedings{borin-etal-2009-thinking-110343,
  title    = {Thinking Green: Toward Swedish FrameNet++},
  abstract = {Access to multi-layered lexical, grammatical and semantic information representing text content is a prerequisite for efficient automatic understanding and generation of natural language. A FrameNet is considered a valuable resource for both linguistics and language technology research that may contribute to the achievement of these goals. Currently, FrameNet-like resources exist for a few languages, including some domain-specific and multilingual initiatives (Dolbey et al., 2006; Boas, 2009; Uematsu et al., 2009; Venturi et al., 2009), but are unavailable for most languages, including Swedish, although there have been some pilot studies exploring the semi-automatic acquisition of Swedish frames (Johansson \& Nugues, 2006; Borin et al., 2007). At the University of Gothenburg, we are now embarking on a project to build a Swedish FrameNet-like resource. A novel feature of this project is that the Swedish FrameNet will be an integral part of a larger many-faceted lexical resource. Hence the name Swedish FrameNet++ (SweFN++).
},
  booktitle = {FrameNet Masterclass and Workshop},
  author    = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  year      = {2009},
}

% Technical reports name the issuing body in `institution` (required for this entry type), not `publisher`.
@techreport{borin-etal-2007-empowering-53590,
  title       = {Empowering the patient with language technology},
  author      = {Borin, Lars and Grabar, Natalia and Hallett, Catalina and Hardcastle, David and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios and Williams, Sandra and Willis, Alistair},
  year        = {2007},
  institution = {Göteborg University},
  address     = {Göteborg},
}

@inproceedings{kokkinakis-toporowskagronostaj-2008-medlex+-73976,
  title    = {MEDLEX+: An Integrated Corpus-Lexicon Medical Workbench for Swedish},
  abstract = {This paper reports on ongoing work on developing a medical corpus-lexicon workbench for Swedish, MedLex+. At the moment the workbench incorporates: (i) an annotated collection of medical texts, 25 million tokens, 50,000 documents, (ii) a number of language processing components, including tools for collocation extraction, compound segmentation and thesaurus-based semantic annotation, and (iii) a lexical database of medical terms (5,000 entries). MedLex+ is a multifunctional lexical resource due to its structural design and content which can be easily queried. The medical workbench is intended to support lexicographers in their work on compiling lexicons and also lexicon users more or less initiated in the medical domain. It can also assist researchers working in the fields of lexical semantics and natural language processing (NLP) with focus on medical language. The linguistically and semantically annotated medical texts in combination with a set of queries turn the corpus into a rich repository of semasiological and onomasiological knowledge about medical terminology and their linguistic, lexical and pragmatic properties. These properties are recorded in the lexical database with a cognitive profile.
The MedLex+ workbench seems to offer constructive help in many different lexical tasks.},
  booktitle = {Proceedings of the 13th EURALEX},
  author    = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria},
  year      = {2008},
}

@incollection{kokkinakis-etal-2006-bygga-56225,
  title     = {Att bygga en språkbro mellan allmänhet och vårdpersonal - språket i texter om hjärt-kärlsjukdomar},
  booktitle = {Humanistdag-boken},
  author    = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria and Johansson Kokkinakis, Sofie},
  year      = {2006},
  publisher = {Göteborgs universitet},
  address   = {Göteborg},
}

% The volume holds the bare number (the "No" label is supplied by the style), and the straight
% quotes in the venue name are replaced by typographic ones.
% NOTE(review): the venue looks like a festschrift in the MISS series; consider an incollection record.
@article{johanssonkokkinakis-kokkinakis-1999-beskrivning-55910,
  title   = {Beskrivning av några problem vid automatisk analys av text},
  journal = {Från dataskärm och forskarpärm, ”Språkliga studier tillägnade Birgitta Ernby”, MISS, Göteborgs universitet},
  author  = {Johansson Kokkinakis, Sofie and Kokkinakis, Dimitrios},
  year    = {1999},
  volume  = {25},
  pages   = {88--95},
}

% Conference paper: typed inproceedings with the venue in booktitle (was article/journal).
@inproceedings{kokkinakis-johanssonkokkinakis-1999-automatisk-56218,
  title     = {Automatisk betydelseidentifiering på cykelnivå m.h.a. GLDB},
  booktitle = {Proceedings från NFL Symposiet (Nordisk Förening i Lexikografi) och Nordiska studier i Lexikografi},
  author    = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie},
  year      = {1999},
}

% Conference paper: typed inproceedings with the venue in booktitle (was article/journal).
@inproceedings{kokkinakis-johanssonkokkinakis-1999-cascaded-56216,
  title     = {A Cascaded Finite-State Parser for Syntactic Analysis of Swedish},
  booktitle = {European Chapter of the Association of Computational Linguistics (EACL)},
  author    = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie},
  year      = {1999},
}

% Technical reports name the issuing body in `institution` (required for this entry type), not `publisher`.
@techreport{kokkinakis-johanssonkokkinakis-1998-cascaded-56209,
  title       = {A Cascaded Finite-State Parser for Syntactic Analysis of Swedish},
  author      = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie},
  year        = {1998},
  institution = {Svenska språket},
  address     = {Göteborg},
}

% NOTE(review): no institution is recorded for this report; the field is required for this entry type.
@techreport{kokkinakis-johanssonkokkinakis-1999-sense-56213,
  title  = {Sense Tagging at the Cycle-Level Using GLDB},
  author = {Kokkinakis, Dimitrios and Johansson Kokkinakis, Sofie},
  year   = {1999},
}

@inproceedings{borin-etal-2007-naming-44954,
  title     = {Naming the past: Named entity and animacy recognition in 19th century Swedish literature},
  booktitle = {ACL 2007 Workshop on Language Technology for Cultural Heritage Data (LaTeCH 2007)},
  author    = {Borin, Lars and Kokkinakis, Dimitrios and Olsson, Leif-Jöran},
  year      = {2007},
  pages     = {1--8},
}

@inproceedings{kokkinakis-gerdin-2009-kvalitetssakring-105141,
  title    = {Kvalitetssäkring av SNOMED CT med hjälp av Läkartidningens arkiv.},
  abstract = {Inom ramen för regeringens satsning ”Nationell IT-strategi för vård och omsorg” har Socialstyrelsen fått i uppdrag att översätta och anpassa begreppssystemet ’the Systematized Nomenclature of Medicine, Clinical Terms’ (SNOMED CT) till svenska. Arbetet är både omfattande och tidskrävande samtidigt som uppdragstagaren har krav om kvalitetssäkring av översättningen. Hur kan Läkartidningens arkiv bidra till kvalitetssäkringen?
Med hjälp av Läkartidningens digitala arkiv, LDA, (årgångarna 1996-2009) har vi utvecklat metoder för att effektivisera kvalitetssäkringen av olika SNOMED CT-urval (t.ex. diabetestermer). Det innebär att vi underlättar för utförandet av empiriska, SNOMED CT-relaterade studier, som t.ex. framtagning av underlag om termernas användning, variation och frekvensdistribution över tid. Arkivets förädling: LDA:t omvandlades till ett enhetligt textbaserat format och textinnehållet normaliserades med avseenden på dokumentformat och teckenkodning för att kunna skapa ett bra underlag för den efterföljande språkteknologiska analysen. Alla artiklar i varje publicerad årgång extraherades och märktes upp dels med olika slags metainformation (t.ex. genretillhörighet) dels med lingvistisk och semantisk information, sammanlagt 27 000 artiklar. Den språkteknologiska bearbetningen innefattade automatiskt tillägg av lingvistisk information som t.ex. ordklasstillhörighet för varje ord i korpusen och automatiskt, semantisk mappning dels till den svenska MeSH-tesaurusen och dels till delar av den svensköversatta SNOMED-hierarkin. LDA i en ny skepnad: LDA utgör sedan länge en värdefull svensk medicinsk resurs för alla som yrkesmässigt jobbar med termer och språk. Vi har dock bidragit med att göra textmaterialet ännu mer välstrukturerat och förädlat, som kan vara till hjälp för explorativa studier där sökningar kan förfinas på ett flertal sätt och därmed ge forskare möjligheter att göra djupare innehållsanalyser av texterna och samla grundläggande kunskaper inom olika ämnesområden. Kombinationen av enstaka termer och ord med lingvistisk och semantisk information ger unika möjligheter till att skaffa information och generera fakta som kan leda till nya hypoteser och eventuellt ny kunskap om olika aspekter som gäller termanvändning och variation och vi kommer att redovisa exempel på sådana analyser.
},
  booktitle = {Svenska Läkaresällskapets Riksstämman},
  author    = {Kokkinakis, Dimitrios and Gerdin, Ulla},
  year      = {2009},
}

% Workshop paper: typed inproceedings with the venue in booktitle (was article/journal).
@inproceedings{kokkinakis-gerdin-2009-issues-105140,
  title     = {Issues on Quality Assessment of SNOMED CT® Subsets - Term Validation and Term Extraction},
  abstract  = {The aim of this paper is to apply and develop methods based on Natural Language Processing for automatically testing the validity, reliability and coverage of various Swedish SNOMED-CT subsets, the Systematized NOmenclature of MEDicine - Clinical Terms a multiaxial, hierarchical classification system which is currently being translated from English to Swedish. Our work has been developed across two dimensions. Initially a Swedish electronic text collection of scientific medical documents has been collected and processed to a uniform format. Secondly, a term processing activity has been taken place. In the first phase of this activity, various SNOMED CT subsets have been mapped to the text collection for evaluating the validity and reliability of the translated terms. In parallel, a large number of term candidates have been extracted from the corpus in order to examine the coverage of SNOMED CT. Term candidates that are currently not included in the Swedish SNOMED CT can be either parts of compounds, parts of potential multiword terms, terms that are not yet been translated or potentially new candidates. In order to achieve these goals a number of automatic term recognition algorithms have been applied to the corpus. The results of the later process is to be reviewed by domain experts (relevant to the subsets extracted) through a relevant interface who can decide whether a new set of terms can be incorporated in the Swedish translation of SNOMED CT or not.},
  booktitle = {Proceedings of RANLP-2009 Workshop: Biomedical Information Extraction},
  author    = {Kokkinakis, Dimitrios and Gerdin, Ulla},
  year      = {2009},
}

% Conference contribution: typed inproceedings with the venue in booktitle (was article/journal).
@inproceedings{kokkinakis-gerdin-2009-uppbyggandet-105136,
  title     = {Uppbyggandet av en svensk medicinsk korpus för termvalidering och termextrahering - hur bra täcker SNOMED CT olika delfackområden?},
  abstract  = {Syftet med denna studie är dels att skapa en stor samling svenska medicinska elektroniska texter, en korpus, och dels att validera och kvalitetssäkra existerande termer ur SNOMED CT (the Systematized NOmenclature of MEDicine - Clinical Terms) gentemot korpusinnehållet. På det sättet kan man få en objektiv uppfattning om SNOMED CT:s validitet, täckning och reliabilitet. Man kan även berika terminologin med nya termer eller termvarianter genom att automatiskt extrahera termkandidater inom olika delfackområden från korpusen med hjälp av olika statistiska och lingvistiska metoder. Resultat av de korpusbaserade, empiriska studierna ska kunna användas av terminologer i deras arbete med att göra SNOMED CT mer täckande, pålitlig och enhetlig. Samtidigt, genom användning av autentisk data, kan man försäkra sig om att termvarianterna (existerande eller nya) är vedertagna termer hos fackmän. I fall flera etablerade termvarianter (nya termkandidater) förekommer i korpusen kan dessa införas efter manuell granskning som synonymer till rekommenderade termer (med stöd av ett lämpligt granskningsgränssnitt) och därmed vidare utveckla innehållet i SNOMED CT. Följaktligen kommer vår presentation att innehålla en redovisning som bygger på tre huvudpelare – korpusuppbyggnad – termvalidering – termextrahering. Korpusen samlades in från två källor efter erhållet tillstånd. Texternas ursprung i korpusen kommer dels från Läkartidningens (LT) digitala arkiv <http://ltarkiv.lakartidningen.se> och dels från DiabetologNytts (DN) digitala arkiv <http://diabetolognytt.se/aterkommande/arkivet.html>.},
  booktitle = {2009 års nationella termkonferens Språk och Kommunikation},
  author    = {Kokkinakis, Dimitrios and Gerdin, Ulla},
  year      = {2009},
}

@incollection{kokkinakis-2009-lexical-73979,
  title    = {Lexical granularity for automatic indexing and means to achieve it - the case of Swedish MeSH®},
  abstract = {The identification and mapping of terminology from large repositories of life science data onto concept hierarchies constitute an important initial step for a deeper semantic exploration of unstructured textual content. Accurate and efficient mapping of this kind is likely to provide better means of enhancing indexing and retrieval of text, uncovering subtle differences, similarities and useful patterns, and hopefully new knowledge, among complex surface realisations, overlooked by shallow techniques based on various forms of lexicon look-up approaches. However, a finer-grained level of mapping between terms as they occur in natural language and domain concepts is a cumbersome enterprise that requires various levels of processing in order to make explicit relevant linguistic structures. This chapter highlights some of the challenges encountered in the process of bridging free to controlled vocabularies and thesauri and vice versa.
We investigate how the extensive variability of lexical terms in authentic data can be efficiently projected to hierarchically structured codes, while means to increase the coverage of the underlying lexical resources are also investigated.},
  booktitle = {Information Retrieval in Biomedicine: Natural Language Processing for Knowledge Integration},
  author    = {Kokkinakis, Dimitrios},
  year      = {2009},
  publisher = {IGI Global},
  address   = {Hershey, Pennsylvania},
}

% The ampersand in the abstract is escaped so the field is safe if a style typesets it.
@inproceedings{kokkinakis-2009-shallow-94705,
  title     = {Shallow Features for Differentiating Disease-Treatment Relations using Supervised Learning, a pilot study},
  abstract  = {Clinical narratives provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a challenge for practitioners in the language technology field, particularly because of the nature of the texts (excessive use of terminology, abbreviations, orthographic term variation), the significant opportunities for clinical research that such material can provide and the potentially broad impact that clinical findings may have in every day life. It is therefore recognized that the capability to automatically extract key concepts and their relationships from such data will allow systems to properly understand the content and knowledge embedded in the free text which can be of great value for applications such as information extraction and question \& answering. This paper gives a brief presentation of such textual data and its semantic annotation, and discuss the set of semantic relations that can be observed between diseases and treatments in the sample. The problem is then designed as a machine learning task in which the relations are tried to be learned in a supervised fashion, using pre-annotated data. The challenges designing the problem and empirical results are presented.},
  booktitle = {Proceedings of the 12th International Conference TSD (Text, Speech and Dialogue).
Springer Verlag, LNCS/LNAI series.},
  author = {Kokkinakis, Dimitrios},
  year   = {2009},
}

@inproceedings{kokkinakis-2008-semantically-73974,
  title    = {A Semantically Annotated Swedish Medical Corpus},
  abstract = {With the information overload in the life sciences there is an increasing need for annotated corpora, particularly with biological and biomedical entities, which is the driving force for data-driven language processing applications and the empirical approach to language study. Inspired by the work in the GENIA Corpus, which is one of the very few of such corpora, extensively used in the biomedical field, and in order to fulfil the needs of our research, we have collected a Swedish medical corpus, the MEDLEX Corpus. MEDLEX is a large structurally and linguistically annotated document collection, consisting of a variety of text documents related to various medical text subfields, and does not focus at a particular medical genre, due to the lack of large Swedish resources within a particular medical subdomain. Out of this collection we selected 300 documents which were manually examined by two human experts who inspected, corrected and/or accordingly modified the automatically provided annotations according to a set of provided labelling guidelines.
The annotations consist of medical terminology provided by the Swedish and English MeSH® (Medical Subject Headings) thesauri as well as named entity labels provided by an enhanced named entity recognition software.}, booktitle = {Proceedings of the 6th Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios}, year = {2008}, } @inProceedings{kokkinakis-2008-semantic-73977, title = {Semantic Pre-processing for Complexity Reduction in Parsing Medical Texts}, abstract = {Collection and multilayer annotation of textual corpora in specialized fields, such as (bio-) medicine is an important enterprise for empirically-based, data-driven language processing, human language technologies and linguistic research. One of the most important and difficult to achieve piece of annotation that can be made available is at the syntactic and functional level, i.e. parsing, particularly in sublanguages where specialized tools have to be adapted which is considered too expensive for many applications. In this paper, we describe a way to reduce the complexity of parsing in medical discourse by the use of a semantic pre-processing stage guided by annotations provided by medical thesauri and other domain-specific lexical resources. 
Parsing biomedical texts, apart from the challenge it possesses (deviant and idiosyncratic uses of vocabulary and syntax), is required in order to support and improve technologies such as Information Extraction and Retrieval, enhance the acquisition of relations between terminology support terminology management and population of medical semantic resources.}, booktitle = {Proceedings of the 21st Conference on the European Federation for Medical Informatics (MIE 2008)}, author = {Kokkinakis, Dimitrios}, year = {2008}, } @inProceedings{kokkinakis-2008-mesh(R)-73973, title = {MeSH® - From a Controlled Vocabulary to a Processable Resource}, abstract = {Large repositories of life science data in the form of domain-specific literature, textual databases and other large specialised textual collections (corpora) in electronic form increase on a daily basis to a level beyond the human mind can grasp and interpret. As the volume of data continues to increase, substantial support from new information technologies and computational techniques grounded in the form of the ever increasing applications of the mining paradigm is becoming apparent. These emerging technologies play an increasingly critical role in aiding research productivity, and they provide the means for reducing the workload for information access and decision support and for speeding up and enhancing the knowledge discovery process. In order to accomplish these higher level goals and support the mining approach however, a fundamental and unavoidable starting point is the identification and mapping of terminology from the textual, unstructured data onto biomedical knowledge sources and concept hierarchies. In this paper, we provide a description of the work regarding terminology recognition using the Swedish MeSH® thesaurus and its corresponding English original source. 
We explain the various transformation and refinement steps applied to the original database tables into a fully-fledged processing oriented annotating resource. Particular attention has been given to a number of these steps in order to automatically map the extensive variability of lexical terms to structured MeSH® nodes. Issues on annotation and coverage are also discussed. }, booktitle = {Proceedings of the 6th Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios}, year = {2008}, } @inProceedings{kokkinakis-thurin-2008-applying-73972, title = {Applying MeSH® to the (Swedish) Clinical Domain - Evaluation and Lessons learned}, abstract = {Medical discharge summaries and clinical notes provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a potential goldmine for both medical scientists as well as practitioners in the language technology field. The capability to extract the key concepts and their relationships from such data can be of great value for knowledge management tasks such as indexing, data interchange, data aggregation and clinical decision support. The purpose of this work is to get insights into the feasibility of applying the content of a controlled vocabulary, the Medical Subject Headings (MeSH) to a sample of electronic discharge letters (i.e. free text clinical notes). We explore the application of natural language processing (NLP) techniques to the challenge of efficiently detecting the terminology, as encoded in MeSH and we evaluate MeSH in this setting, showing that a lot of work remains to be done in order to increase the coverage of the resource both in terms of its breadth and depth. 
}, booktitle = {Proceedings of the 6th Scandinavian Health Informatics and the 12th Swedish National Term Conference}, author = {Kokkinakis, Dimitrios and Thurin, Anders}, year = {2008}, } @inProceedings{kokkinakis-2007-automatic-47933, title = {Automatic Indexing using the English and Swedish MeSH®, a Note on Coverage}, abstract = {The identification and mapping of terminology onto a concept hierarchy is the very first stage of semantic, deeper analysis of textual documents. Work regarding automatic terminology recognition using the Swedish MeSH® thesaurus (Medical Subject Headings, edition 2006) and its corresponding English source is reported. A number of transformations and refinements were applied to the original lexical database in order to enhance the automatic process of mapping the extensive variability of lexical terms in authentic data to structured MeSH codes. Means to increase the coverage of both thesauruses for automatic indexing of Swedish medical data are investigated.}, booktitle = {Svenska Läkaresällskapets Riksstämma 2007}, author = {Kokkinakis, Dimitrios}, year = {2007}, } @inProceedings{kokkinakis-thurin-2007-identification-45195, title = {Identification of Entity References in Hospital Discharge Letters}, abstract = {In the era of the Electronic Health Record the release of medical narrative textual data for research, for health care statistics, for monitoring of new diagnostic tests and for tracking disease outbreak alerts imposes tough restrictions by various public authority bodies for the protection of (patient) privacy. 
In this paper we present a system for automatic identification of named entities in Swedish clinical free text, in the form of discharge letters, by applying generic named entity recognition technology with minor adaptations}, booktitle = {Proceedings of the 16th Nordic Conference of Computational Linguistics (NODALIDA)}, author = {Kokkinakis, Dimitrios and Thurin, Anders}, year = {2007}, } @inProceedings{kokkinakis-etal-2004-intelligent-33932, title = {Intelligent Building of Language Resources for HLT Applications}, booktitle = {Proceedings of the LREC Workshop: Amazing Utility of Parallel and Comparable Corpora. Fourth Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios and Samiotou, Anna and Kranias, Lambros}, year = {2004}, } @inProceedings{kokkinakis-2006-towards-45197, title = {Towards a Swedish Medical Treebank}, abstract = {In this paper, we present our current activities towards the compilation and the multi-layered annotation of a domain-dependent corpus for Swedish in the area of medicine. The focus of the paper is based on the description of the constituent structure and functionally oriented annotation of the corpus. Moreover, the annotation scheme adopted, which incorporates three main layers of linguistic processing, lexical analysis, shallow semantic analysis and syntactic processing, will be exemplified. For the syntactic analysis we use a cascaded finite-state parser, aware of the shallow semantic annotations produced. The result of this analysis, including syntactic parsing and shallow semantic analysis, is transformed into the TIGER-XML interchange format. 
Our goal is to produce a large, rich in annotations, medical treebank suitable for both corpus-based grammar learning systems, for semantic relation extraction and for linguistic exploration of theoretical nature.}, booktitle = {Proceedings of the 5th Conference on Treebanks and Linguistic Theories}, author = {Kokkinakis, Dimitrios}, year = {2006}, } @inProceedings{kokkinakis-2006-towards-34033, title = {Towards a Swedish Medical Treebank}, booktitle = {5th Conference on Treebanks and Linguistic Theories}, author = {Kokkinakis, Dimitrios}, year = {2006}, } @inProceedings{kokkinakis-2006-collection-33937, title = {Collection, Encoding and Linguistic Processing of a Swedish Medical Corpus - The MEDLEX Experience.}, abstract = {Corpora annotated with structural and linguistic characteristics play a major role in nearly every area of language processing. During recent years a number of corpora and large data sets became known and available to research even in specialized fields such as medicine, but still however, targeted predominantly for the English language. This paper provides a description of the collection, encoding and linguistic processing of an ever growing Swedish medical corpus, the MEDLEX Corpus. MEDLEX consists of a variety of text-documents related to various medical text genres. The MEDLEX Corpus has been structurally annotated using the Corpus Encoding Standard for XML (XCES), lemmatized and automatically annotated with part-of-speech and semantic information (extended named entities and the Medical Subject Headings, MeSH, terminology). The results from the processing stages (part-of-speech, entities and terminology) have been merged into a single representation format and syntactically analysed using a cascaded finite state parser. Finally, the parsers results are converted into a tree structure that follows the TIGER-XML coding scheme, resulting a suitable for further exploration and fairly large Treebank of Swedish medical texts. 
}, booktitle = {Proceedings of the 5th Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios}, year = {2006}, } @inProceedings{kokkinakis-dannells-2006-recognizing-33936, title = {Recognizing Acronyms and their Definitions in Swedish Medical Texts}, abstract = {This paper addresses the task of recognizing acronym-definition pairs in Swedish (medical) texts as well as the compilation of a freely available sample of such manually annotated pairs. A material suitable not only for supervised learning experiments, but also as a testbed for the evaluation of the quality of future acronym-definition recognition systems. There are a number of approaches to the identification described in the literature, particularly within the biomedical domain, but none of those addresses the variation and complexity exhibited in a language other than English. This is realized by the fact that we can have a mixture of two languages in the same document and/or sentence, i.e. Swedish and English; that Swedish is a compound language that significantly deteriorates the performance of previous approaches (without adaptations) and, most importantly, the fact that there is a large variation of possible acronym-definition permutations realized in the analysed corpora, a variation that is usually ignored in previous studies. }, booktitle = {Proceedings of the 5th Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios and Dannélls, Dana}, year = {2006}, } @article{kokkinakis-2005-identification-33934, title = {Identification of Named Entities and Medical Terminology in Swedish Patient Records.}, abstract = {An anonymisation or de-identification system can provide a broad spectrum of services related to the growing demands for better forms of dissemination of information about individuals found in electronic patient records. 
The range of these services includes: health care statistics and sharing clinical information across institutions; validation and monitoring of new diagnostic tests; release of individual data by protecting identities or hints that can identify individuals, and appropriate mechanisms to provide only the information necessary to the professional who has the need to know. This paper describes our first experiments intended for automatic anonymisation of Swedish electronic patient records using a generic system for Named Entity Recognition. There are eight main types of entities that the system recognizes: person, location, organisation, event, object, work & art, time and measure. To this set, two new modules have been recently developed. One is dedicated to animacy recognition, a modules based on a number of clues (such as key words utilized in the persons module grammar and verbs requiring animate subject), and another one designated to identify and annotate medical terminology. The latter module annotates names of drugs and chemical substances, diseases, symptoms, organisms and anatomical terms. A detailed evaluation of the system, on authentic patient records, is given both for the named, medical and animate entities. }, journal = {WSEAS Transactions on BIOLOGY and BIOMEDICINE}, author = {Kokkinakis, Dimitrios}, year = {2005}, volume = {2}, number = {3}, pages = {312--317}, } @inProceedings{kokkinakis-2004-reducing-33928, title = {Reducing the Effect of Name Explosion.}, abstract = {The problem of new vocabulary is particularly frustrating once one begins to work with large corpora of real texts. The identification of unknown proper nouns, chains of non-proper nouns and even common words that function as names (i.e. named entities) in unrestricted text, and their subsequent classification into some sort of semantic type is a challenging and difficult problem in Natural Language Processing (NLP). 
Systems that perform Information Extraction, Information Retrieval, Question-Answering, Topic Detection, Text Mining, Machine Translation and annotation for the Semantic Web have highlighted the need for the automatic recognition of such entities, since their constant introduction in any domain, however narrow, is very common and needs special attention. Proper names are usually not listed in defining or other common types of dictionaries, they may appear in many alias forms and abbreviated variations, which makes their listing infeasible. This paper deals with some extensions to the traditional named entity recognition approaches. It puts emphasis on more name classes and their further subclassification into finer sets. An operative system that can be tested and evaluated on-line implements the ideas described in this paper.}, booktitle = {Proceedings of the LREC Workshop: Beyond Named Entity Recognition, Semantic labelling for NLP tasks. Fourth Language Resources and Evaluation Conference (LREC)}, author = {Kokkinakis, Dimitrios}, year = {2004}, } @inProceedings{kokkinakis-2006-developing-33925, title = {Developing Resources for Swedish Bio-Medical Text Mining}, abstract = {Collection and annotation of corpora in specialized fields, such as medicine, and particularly for lesser-spoken languages, than for instance English, is an important enterprise for the continuous development and growth of language technology research, for resource development and for the implementation of practical applications for these languages. In this paper, we describe our ongoing efforts to build a large Swedish medical corpus, the MEDLEX Corpus, how we combine generic named entity and terminology recognition for the detailed annotation of the corpus, and how these annotations are further utilized by an annotations-aware cascaded finite-state parser. 
}, booktitle = {Proceedings of the 2nd International Symposium on Semantic Mining in Biomedicine (SMBM)}, author = {Kokkinakis, Dimitrios}, year = {2006}, }