@inproceedings{Borin-Lars2009-110343,
  title     = {Thinking Green: Toward {Swedish} {FrameNet++}},
  abstract  = {Access to multi-layered lexical, grammatical and semantic information representing text content is a prerequisite for efficient automatic understanding and generation of natural language. A FrameNet is considered a valuable resource for both linguistics and language technology research that may contribute to the achievement of these goals. Currently, FrameNet-like resources exist for a few languages, including some domain-specific and multilingual initiatives (Dolbey et al., 2006; Boas, 2009; Uematsu et al., 2009; Venturi et al., 2009), but are unavailable for most languages, including Swedish, although there have been some pilot studies exploring the semi-automatic acquisition of Swedish frames (Johansson & Nugues, 2006; Borin et al., 2007). At the University of Gothenburg, we are now embarking on a project to build a Swedish FrameNet-like resource. A novel feature of this project is that the Swedish FrameNet will be an integral part of a larger many-faceted lexical resource. Hence the name Swedish FrameNet++ (SweFN++).},
  booktitle = {FrameNet Masterclass and Workshop},
  author    = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Kokkinakis, Dimitrios},
  year      = {2009},
}

@incollection{Kokkinakis-Dimitrios2009-73979,
  title     = {Lexical granularity for automatic indexing and means to achieve it - the case of {Swedish} {MeSH}®},
  abstract  = {The identification and mapping of terminology from large repositories of life science data onto concept hierarchies constitute an important initial step for a deeper semantic exploration of unstructured textual content. Accurate and efficient mapping of this kind is likely to provide better means of enhancing indexing and retrieval of text, uncovering subtle differences, similarities and useful patterns, and hopefully new knowledge, among complex surface realisations, overlooked by shallow techniques based on various forms of lexicon look-up approaches. However, a finer-grained level of mapping between terms as they occur in natural language and domain concepts is a cumbersome enterprise that requires various levels of processing in order to make explicit relevant linguistic structures. This chapter highlights some of the challenges encountered in the process of bridging free to controlled vocabularies and thesauri and vice versa. We investigate how the extensive variability of lexical terms in authentic data can be efficiently projected to hierarchically structured codes, while means to increase the coverage of the underlying lexical resources are also investigated.},
  booktitle = {Information Retrieval in Biomedicine: Natural Language Processing for Knowledge Integration},
  author    = {Kokkinakis, Dimitrios},
  year      = {2009},
  publisher = {IGI Global},
  address   = {Hershey, Pennsylvania},
}

@article{Kokkinakis-Dimitrios2009-105133,
  title         = {Shallow Features for Differentiating Disease-Treatment Relations using Supervised Learning; a pilot study},
  abstract      = {Clinical narratives provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a challenge for practitioners in the language technology field, particularly because of the nature of the texts (excessive use of terminology, abbreviations, orthographic term variation), the significant opportunities for clinical research that such material can provide and the potentially broad impact that clinical findings may have in every day life. It is therefore recognized that the capability to automatically extract key concepts and their relationships from such data will allow systems to properly understand the content and knowledge embedded in the free text which can be of great value for applications such as information extraction and question & answering. This paper gives a brief presentation of such textual data and its semantic annotation, and discusses the set of semantic relations that can be observed between diseases and treatments in the sample. The problem is then designed as a supervised machine learning task in which the relations are tried to be learned using pre-annotated data. The challenges designing the problem and empirical results are presented.},
  author        = {Kokkinakis, Dimitrios},
  journal       = {Lecture Notes in Computer Science},
  year          = {2009},
  volume        = {5729},
  pages         = {395--402},
  internal-note = {Same work as Kokkinakis-Dimitrios2009-94705 (TSD 2009, LNCS 5729) -- consider deduplicating; journal name inferred from the duplicate's booktitle, TODO confirm},
}

@article{Kokkinakis-Dimitrios2009-105136,
  title         = {Uppbyggandet av en svensk medicinsk korpus för termvalidering och termextrahering - hur bra täcker {SNOMED CT} olika delfackområden?},
  abstract      = {Syftet med denna studie är dels att skapa en stor samling svenska medicinska elektroniska texter, en korpus, och dels att validera och kvalitetssäkra existerande termer ur SNOMED CT (the Systematized NOmenclature of MEDicine - Clinical Terms) gentemot korpusinnehållet. På det sättet kan man få en objektiv uppfattning om SNOMED CT:s validitet, täckning och reliabilitet. Man kan även berika terminologin med nya termer eller termvarianter genom att automatiskt extrahera termkandidater inom olika delfackområden från korpusen med hjälp av olika statistiska och lingvistiska metoder. Resultat av de korpusbaserade, empiriska studierna ska kunna användas av terminologer i deras arbete med att göra SNOMED CT mer täckande, pålitlig och enhetlig. Samtidigt, genom användning av autentisk data, kan man försäkra sig om att termvarianterna (existerande eller nya) är vedertagna termer hos fackmän. I fall flera etablerade termvarianter (nya termkandidater) förekommer i korpusen kan dessa införas efter manuell granskning som synonymer till rekommenderade termer (med stöd av ett lämpligt granskningsgränssnitt) och därmed vidare utveckla innehållet i SNOMED CT. Följaktligen kommer vår presentation att innehålla en redovisning som bygger på tre huvudpelare – korpusuppbyggnad – termvalidering – termextrahering. Korpusen samlades in från två källor efter erhållet tillstånd. Texternas ursprung i korpusen kommer dels från Läkartidningens (LT) digitala arkiv och dels från DiabetologNytts (DN) digitala arkiv.},
  author        = {Kokkinakis, Dimitrios and Gerdin, Ulla},
  year          = {2009},
  internal-note = {Required journal field missing -- TODO confirm publication venue},
}

@inproceedings{Kokkinakis-Dimitrios2009-105141,
  title     = {Kvalitetssäkring av {SNOMED CT} med hjälp av {Läkartidningens} arkiv},
  abstract  = {Inom ramen för regeringens satsning ”Nationell IT-strategi för vård och omsorg” har Socialstyrelsen fått i uppdrag att översätta och anpassa begreppssystemet ’the Systematized Nomenclature of Medicine, Clinical Terms’ (SNOMED CT) till svenska. Arbetet är både omfattande och tidskrävande samtidigt som uppdragstagaren har krav om kvalitetssäkring av översättningen. Hur kan Läkartidningens arkiv bidra till kvalitetssäkringen? Med hjälp av Läkartidningens digitala arkiv, LDA, (årgångarna 1996-2009) har vi utvecklat metoder för att effektivisera kvalitetssäkringen av olika SNOMED CT-urval (t.ex. diabetestermer). Det innebär att vi underlättar för utförandet av empiriska, SNOMED CT-relaterade studier, som t.ex. framtagning av underlag om termernas användning, variation och frekvensdistribution över tid. Arkivets förädling: LDA:t omvandlades till ett enhetligt textbaserat format och textinnehållet normaliserades med avseenden på dokumentformat och teckenkodning för att kunna skapa ett bra underlag för den efterföljande språkteknologiska analysen. Alla artiklar i varje publicerad årgång extraherades och märktes upp dels med olika slags metainformation (t.ex. genretillhörighet) dels med lingvistisk och semantisk information, sammanlagt 27 000 artiklar. Den språkteknologiska bearbetningen innefattade automatiskt tillägg av lingvistisk information som t.ex. ordklasstillhörighet för varje ord i korpusen och automatiskt, semantisk mappning dels till den svenska MeSH-tesaurusen och dels till delar av den svensköversatta SNOMED-hierarkin. LDA i en ny skepnad: LDA utgör sedan länge en värdefull svensk medicinsk resurs för alla som yrkesmässigt jobbar med termer och språk. Vi har dock bidragit med att göra textmaterialet ännu mer välstrukturerat och förädlat, som kan vara till hjälp för explorativa studier där sökningar kan förfinas på ett flertal sätt och därmed ge forskare möjligheter att göra djupare innehållsanalyser av texterna och samla grundläggande kunskaper inom olika ämnesområden. Kombinationen av enstaka termer och ord med lingvistisk och semantisk information ger unika möjligheter till att skaffa information och generera fakta som kan leda till nya hypoteser och eventuellt ny kunskap om olika aspekter som gäller termanvändning och variation och vi kommer att redovisa exempel på sådana analyser.},
  booktitle = {Svenska Läkaresällskapets Riksstämman},
  author    = {Kokkinakis, Dimitrios and Gerdin, Ulla},
  year      = {2009},
}

@inproceedings{Kokkinakis-Dimitrios2009-94705,
  title         = {Shallow Features for Differentiating Disease-Treatment Relations using Supervised Learning, a pilot study},
  abstract      = {Clinical narratives provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a challenge for practitioners in the language technology field, particularly because of the nature of the texts (excessive use of terminology, abbreviations, orthographic term variation), the significant opportunities for clinical research that such material can provide and the potentially broad impact that clinical findings may have in every day life. It is therefore recognized that the capability to automatically extract key concepts and their relationships from such data will allow systems to properly understand the content and knowledge embedded in the free text which can be of great value for applications such as information extraction and question & answering. This paper gives a brief presentation of such textual data and its semantic annotation, and discuss the set of semantic relations that can be observed between diseases and treatments in the sample. The problem is then designed as a machine learning task in which the relations are tried to be learned in a supervised fashion, using pre-annotated data. The challenges designing the problem and empirical results are presented.},
  booktitle     = {Proceedings of the 12th International Conference on Text, Speech and Dialogue ({TSD})},
  series        = {Lecture Notes in Computer Science},
  volume        = {5729},
  pages         = {395--402},
  publisher     = {Springer},
  author        = {Kokkinakis, Dimitrios},
  year          = {2009},
  internal-note = {Same work as Kokkinakis-Dimitrios2009-105133 -- consider deduplicating; volume/pages taken from that entry, TODO confirm},
}

@article{Kokkinakis-Dimitrios2009-105140,
  title         = {Issues on Quality Assessment of {SNOMED CT}® Subsets - Term Validation and Term Extraction},
  abstract      = {The aim of this paper is to apply and develop methods based on Natural Language Processing for automatically testing the validity, reliability and coverage of various Swedish SNOMED-CT subsets, the Systematized NOmenclature of MEDicine - Clinical Terms a multiaxial, hierarchical classification system which is currently being translated from English to Swedish. Our work has been developed across two dimensions. Initially a Swedish electronic text collection of scientific medical documents has been collected and processed to a uniform format. Secondly, a term processing activity has been taken place. In the first phase of this activity, various SNOMED CT subsets have been mapped to the text collection for evaluating the validity and reliability of the translated terms. In parallel, a large number of term candidates have been extracted from the corpus in order to examine the coverage of SNOMED CT. Term candidates that are currently not included in the Swedish SNOMED CT can be either parts of compounds, parts of potential multiword terms, terms that are not yet been translated or potentially new candidates. In order to achieve these goals a number of automatic term recognition algorithms have been applied to the corpus. The results of the later process is to be reviewed by domain experts (relevant to the subsets extracted) through a relevant interface who can decide whether a new set of terms can be incorporated in the Swedish translation of SNOMED CT or not.},
  author        = {Kokkinakis, Dimitrios and Gerdin, Ulla},
  year          = {2009},
  internal-note = {Required journal field missing -- TODO confirm publication venue},
}