@comment{
  Conference papers (co-)authored by Dimitrios Kokkinakis, all dated 2008.
  Conventions used below: one field per line; lowercase entry types and
  field names; braces around acronyms and proper nouns in titles so that
  sentence-casing bibliography styles keep their capitalisation; abstracts
  are kept verbatim from the original records.
}

@inproceedings{Kokkinakis-Dimitrios2008-73972,
  author    = {Kokkinakis, Dimitrios and Thurin, Anders},
  title     = {Applying {MeSH}® to the ({Swedish}) Clinical Domain - Evaluation and Lessons learned},
  booktitle = {Proceedings of the 6th Scandinavian Health Informatics and the 12th Swedish National Term Conference},
  year      = {2008},
  abstract  = {Medical discharge summaries and clinical notes provide an information rich, nearly unexplored corpus of evidential knowledge that is considered as a potential goldmine for both medical scientists as well as practitioners in the language technology field. The capability to extract the key concepts and their relationships from such data can be of great value for knowledge management tasks such as indexing, data interchange, data aggregation and clinical decision support. The purpose of this work is to get insights into the feasibility of applying the content of a controlled vocabulary, the Medical Subject Headings (MeSH) to a sample of electronic discharge letters (i.e. free text clinical notes). We explore the application of natural language processing (NLP) techniques to the challenge of efficiently detecting the terminology, as encoded in MeSH and we evaluate MeSH in this setting, showing that a lot of work remains to be done in order to increase the coverage of the resource both in terms of its breadth and depth.},
}

@inproceedings{Kokkinakis-Dimitrios2008-73973,
  author    = {Kokkinakis, Dimitrios},
  title     = {{MeSH}® - From a Controlled Vocabulary to a Processable Resource},
  booktitle = {Proceedings of the 6th Language Resources and Evaluation Conference ({LREC})},
  year      = {2008},
  abstract  = {Large repositories of life science data in the form of domain-specific literature, textual databases and other large specialised textual collections (corpora) in electronic form increase on a daily basis to a level beyond the human mind can grasp and interpret. As the volume of data continues to increase, substantial support from new information technologies and computational techniques grounded in the form of the ever increasing applications of the mining paradigm is becoming apparent. These emerging technologies play an increasingly critical role in aiding research productivity, and they provide the means for reducing the workload for information access and decision support and for speeding up and enhancing the knowledge discovery process. In order to accomplish these higher level goals and support the mining approach however, a fundamental and unavoidable starting point is the identification and mapping of terminology from the textual, unstructured data onto biomedical knowledge sources and concept hierarchies. In this paper, we provide a description of the work regarding terminology recognition using the Swedish MeSH® thesaurus and its corresponding English original source. We explain the various transformation and refinement steps applied to the original database tables into a fully-fledged processing oriented annotating resource. Particular attention has been given to a number of these steps in order to automatically map the extensive variability of lexical terms to structured MeSH® nodes. Issues on annotation and coverage are also discussed.},
}

@inproceedings{Kokkinakis-Dimitrios2008-73977,
  author    = {Kokkinakis, Dimitrios},
  title     = {Semantic Pre-processing for Complexity Reduction in Parsing Medical Texts},
  booktitle = {Proceedings of the 21st Conference on the European Federation for Medical Informatics ({MIE} 2008)},
  year      = {2008},
  abstract  = {Collection and multilayer annotation of textual corpora in specialized fields, such as (bio-) medicine is an important enterprise for empirically-based, data-driven language processing, human language technologies and linguistic research. One of the most important and difficult to achieve piece of annotation that can be made available is at the syntactic and functional level, i.e. parsing, particularly in sublanguages where specialized tools have to be adapted which is considered too expensive for many applications. In this paper, we describe a way to reduce the complexity of parsing in medical discourse by the use of a semantic pre-processing stage guided by annotations provided by medical thesauri and other domain-specific lexical resources. Parsing biomedical texts, apart from the challenge it possesses (deviant and idiosyncratic uses of vocabulary and syntax), is required in order to support and improve technologies such as Information Extraction and Retrieval, enhance the acquisition of relations between terminology support terminology management and population of medical semantic resources.},
}

@inproceedings{Kokkinakis-Dimitrios2008-73976,
  author    = {Kokkinakis, Dimitrios and Toporowska Gronostaj, Maria},
  title     = {{MEDLEX+}: An Integrated Corpus-Lexicon Medical Workbench for {Swedish}},
  booktitle = {Proceedings of the 13th {EURALEX}},
  year      = {2008},
  abstract  = {This paper reports on ongoing work on developing a medical corpus-lexicon workbench for Swedish, MedLex+. At the moment the workbench incorporates: (i) an annotated collection of medical texts, 25 million tokens, 50,000 documents, (ii) a number of language processing components, including tools for collocation extraction, compound segmentation and thesaurus-based semantic annotation, and (iii) a lexical database of medical terms (5,000 entries). MedLex+ is a multifunctional lexical resource due to its structural design and content which can be easily queried. The medical workbench is intended to support lexicographers in their work on compiling lexicons and also lexicon users more or less initiated in the medical domain. It can also assist researchers working in the fields of lexical semantics and natural language processing (NLP) with focus on medical language. The linguistically and semantically annotated medical texts in combination with a set of queries turn the corpus into a rich repository of semasiological and onomasiological knowledge about medical terminology and their linguistic, lexical and pragmatic properties. These properties are recorded in the lexical database with a cognitive profile. The MedLex+ workbench seems to offer constructive help in many different lexical tasks.},
}

@inproceedings{Kokkinakis-Dimitrios2008-73975,
  author    = {Kokkinakis, Dimitrios},
  title     = {Semantic Relation Mining of Solid Compounds in Medical Corpora},
  booktitle = {Proceedings of the 21st Conference on the European Federation for Medical Informatics ({MIE} 2008)},
  year      = {2008},
  isbn      = {9786611733414},
  abstract  = {In the context of scientific and technical texts, meaning is usually embedded in noun compounds and the semantic interpretation of these compounds deals with the detection and semantic classification of the relation that holds between the compound’s constituents. Semantic relation mining, the technology applied for marking up, interpreting, extracting and classifying relations that hold between pairs of words, is an important enterprise that contribute to deeper means of enhancing document understanding technologies, such as Information Extraction, Question Answering, Summarization, Paraphrasing, Ontology Building and Textual Entailment. This paper explores the application of assigning semantic descriptors taken from a multilingual medical thesaurus to a large sample of solid (closed form) compounds taken from large Swedish medical corpora, and determining the relation(s) that may hold between the compound constituents. Our work is inspired by previous research in the area of using lexical hierarchies for identifying relations between two-word noun compounds in the medical domain. In contrast to previous research, Swedish, as other Germanic languages, require further means of analysis, since compounds are written as one sequence with no white space between the words, e.g. virus diseases vs. virussjukdomar, which makes the problem more challenging, since solid compounds are harder to identify and segment.},
}

@inproceedings{Kokkinakis-Dimitrios2008-73974,
  author    = {Kokkinakis, Dimitrios},
  title     = {A Semantically Annotated {Swedish} Medical Corpus},
  booktitle = {Proceedings of the 6th Language Resources and Evaluation Conference ({LREC})},
  year      = {2008},
  abstract  = {With the information overload in the life sciences there is an increasing need for annotated corpora, particularly with biological and biomedical entities, which is the driving force for data-driven language processing applications and the empirical approach to language study. Inspired by the work in the GENIA Corpus, which is one of the very few of such corpora, extensively used in the biomedical field, and in order to fulfil the needs of our research, we have collected a Swedish medical corpus, the MEDLEX Corpus. MEDLEX is a large structurally and linguistically annotated document collection, consisting of a variety of text documents related to various medical text subfields, and does not focus at a particular medical genre, due to the lack of large Swedish resources within a particular medical subdomain. Out of this collection we selected 300 documents which were manually examined by two human experts who inspected, corrected and/or accordingly modified the automatically provided annotations according to a set of provided labelling guidelines. The annotations consist of medical terminology provided by the Swedish and English MeSH® (Medical Subject Headings) thesauri as well as named entity labels provided by an enhanced named entity recognition software.},
}