% Bibliography entries; every complete entry below carries year = 2013.
% Layout: one field per line, names in "Last, First" form joined by " and ".
% Braces inside titles protect acronyms and proper nouns from style recasing.
% Citation keys are kept exactly as they were so existing citations keep resolving.

@inproceedings{volodina-johanssonkokkinakis-2013-compiling-188550,
  author      = {Volodina, Elena and Johansson Kokkinakis, Sofie},
  title       = {Compiling a corpus of {CEFR}-related texts},
  booktitle   = {Proceedings of the Language Testing and CEFR conference, Antwerpen, Belgium, May 27--29, 2013},
  year        = {2013},
  abstract    = {This paper reports on initial efforts to compile a corpus of course book texts used for teaching CEFR-based courses of Swedish to adult immigrants. The research agenda behind compiling such a corpus comprises the study of normative “input” texts that can reveal a number of facts about what is being taught in terms of explicit grammar, receptive vocabulary, text and sentence readability; as well as build insights into linguistic characteristics of normative texts which can help anticipate learner performance in terms of active vocabulary, grammatical competence, etc. in classroom and testing settings. The CEFR “can-do” statements are known to offer flexibility in interpreting them for different languages and target groups. However, they are nonspecific and therefore it is difficult to associate different kinds of competences and levels of accuracy learners need in order to perform the communicative tasks with the different CEFR levels. To address this problem a systematic study needs to be performed for each individual language, both for “input” normative texts and “output” learner-produced texts. In this project we take the first step to collect and study normative texts for Swedish. The article describes the process of corpus compilation, annotation scheme of CEFR-relevant parameters, and methods proposed for text analysis, namely statistic and empiric methods, as well as techniques coming from computational linguistics/machine learning.},
}

@inproceedings{pijetlovic-volodina-2013-developing-188543,
  author      = {Pijetlovic, Dijana and Volodina, Elena},
  title       = {Developing a {Swedish} spelling game on an {ICALL} platform},
  booktitle   = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, 11th to 14th September 2013, Évora, Portugal, Proceedings},
  year        = {2013},
  isbn        = {978-1-908416-12-4},
  abstract    = {In this project we developed web services on the ICALL platform Lärka for automatic generation of Swedish spelling exercises using Text-To-Speech (TTS) technology which allows L2 learners to train their spelling and listening individually performance based levels. The embedded avatar pronounces a random item of the desired level, which the user has to spell. Furthermore, the users have the possibility to train their own words for different linguistic levels. A result tracker containing a total and correct answer score keeps track of the language learner’s performance. In order to analyse typical spelling errors and provide better feedback, misspellings are collected in a database. The usability of the spelling exercises, concerning the different linguistic levels and the quality of speech, has been evaluated through a questionnaire with 10 participants.},
}

@inproceedings{ghosh-etal-2013-mining-188844,
  author      = {Ghosh, Sucheta and Tonelli, Sara and Johansson, Richard},
  title       = {Mining Fine-grained Opinion Expressions with Shallow Parsing},
  booktitle   = {Proceedings of the International Conference Recent Advances in Natural Language Processing},
  pages       = {302--310},
  year        = {2013},
  abstract    = {Opinion analysis deals with public opinions and trends, but subjective language is highly ambiguous. In this paper, we follow a simple data-driven technique to learn fine-grained opinions. We select an intersection set of Wall Street Journal documents that is included both in the Penn Discourse Tree Bank (PDTB) and in the Multi-Perspective Question Answering (MPQA) corpus. This is done in order to explore the usefulness of discourse-level structure to facilitate the extraction of fine-grained opinion expressions. Here we perform shallow parsing of MPQA expressions with connective based discourse structure, and then also with Named Entities (NE) and some syntax features using conditional random fields; the latter feature set is basically a collection of NEs and a bundle of features that is proved to be useful in a shallow discourse parsing task. We found that both of the feature-sets are useful to improve our baseline at different levels of this fine-grained opinion expression mining task.},
}

@inproceedings{bouma-adesam-2013-experiments-177631,
  author      = {Bouma, Gerlof and Adesam, Yvonne},
  title       = {Experiments on sentence segmentation in {Old Swedish} editions},
  booktitle   = {NEALT Proceedings Series},
  volume      = {18},
  year        = {2013},
  isbn        = {978-91-7519-587-2},
}

@inproceedings{volodina-etal-2013-towards-188549,
  author      = {Volodina, Elena and Pijetlovic, Dijana and Pilán, Ildikó and Johansson Kokkinakis, Sofie},
  title       = {Towards a gold standard for {Swedish} {CEFR}-based {ICALL}},
  booktitle   = {Proceedings of the Second Workshop on NLP for Computer-Assisted Language Learning. NEALT Proceedings Series 17. Nodalida 2013, Oslo, Norway},
  year        = {2013},
  isbn        = {978-91-7519-588-9},
  abstract    = {In qualitative projects on ICALL (Intelligent Computer-Assisted Language Learning), research and development always go hand in hand: development both depends upon the research results and dictates the research agenda. Likewise, in the development of the Swedish ICALL platform Lärka, the practical issues of development have dictated its research agenda. With NLP approaches, sooner or later, the necessity for reliable training data becomes unavoidable. At the moment Lärka's research agenda cannot be addressed without access to reliable training data, so-called “gold standard”. This paper gives an overview of the current state of the Swedish ICALL platform development and related research agenda, and describes the first attempts to collect the reference corpus (“gold standard”) coming from course books used in CEFR-based language teaching.},
}

@inproceedings{pilan-etal-2013-automatic-188465,
  author      = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard},
  title       = {Automatic Selection of Suitable Sentences for Language Learning Exercises},
  booktitle   = {20 Years of EUROCALL: Learning from the Past, Looking to the Future. 2013 EUROCALL Conference, 11th to 14th September 2013, Évora, Portugal, Proceedings},
  pages       = {218--225},
  year        = {2013},
  isbn        = {978-1-908416-12-4},
  abstract    = {In this study we investigated second and foreign language (L2) sentence readability, an area little explored so far in the case of several languages, including Swedish. The outcome of our research consists of two methods for sentence selection from native language corpora based on Natural Language Processing (NLP) and machine learning (ML) techniques. The two approaches have been made available online within Lärka, an Intelligent CALL (ICALL) platform offering activities for language learners and students of linguistics. Such an automatic selection of suitable sentences can be valuable for L2 teachers during the creation of new teaching materials, for L2 students who look for additional self-study exercises as well as for lexicographers in search of example sentences to illustrate the meaning of a vocabulary item. Members from all these potential user groups evaluated our methods and found the majority of the sentences selected suitable for L2 learning purposes.},
}

@proceedings{borin-etal-2013-proceedings-190260,
  title       = {Proceedings of the workshop on lexical semantic resources for {NLP} at {NODALIDA} 2013, {May} 22--24, 2013, {Oslo}, {Norway}},
  editor      = {Borin, Lars and Fjeld, Ruth Vatvedt and Forsberg, Markus and Nimb, Sanni and Nugues, Pierre and Pedersen, Bolette Sandford},
  year        = {2013},
  publisher   = {Linköping University Electronic Press},
  address     = {Linköping},
  isbn        = {978-91-7519-586-5},
}

@inproceedings{ahlberg-etal-2013-korp-178355,
  author      = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Hammarstedt, Martin and Olsson, Leif-Jöran and Olsson, Olof and Roxendal, Johan and Uppström, Jonatan},
  title       = {{Korp} and {Karp} – a bestiary of language resources: the research infrastructure of {Språkbanken}},
  booktitle   = {Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013), May 22–24, 2013, Oslo University, Norway. NEALT Proceedings Series 16},
  year        = {2013},
  publisher   = {Linköping University Electronic Press},
  address     = {Linköping},
  abstract    = {A central activity in Språkbanken, an R\&D unit at the University of Gothenburg, is the systematic construction of a research infrastructure based on interoperability and widely accepted standards for metadata and data. The two main components of this infrastructure deal with text corpora and with lexical resources. For modularity and flexibility, both components have a backend, or server-side part, accessed through an API made up of a set of well-defined web services. This means that there can be any number of different user interfaces to these components, corresponding, e.g., to different research needs. Here, we will demonstrate the standard corpus and lexicon search interfaces, designed primarily for linguistic searches: Korp and Karp.},
}

@inproceedings{borin-etal-2013-mining-188846,
  author      = {Borin, Lars and Dubhashi, Devdatt and Forsberg, Markus and Johansson, Richard and Kokkinakis, Dimitrios and Nugues, Pierre},
  title       = {Mining semantics for culturomics: towards a knowledge-based approach},
  booktitle   = {2013 ACM International Workshop on Mining Unstructured Big Data Using Natural Language Processing, UnstructureNLP 2013, Held at 22nd ACM International Conference on Information and Knowledge Management, CIKM 2013; San Francisco, CA; United States; 28 October 2013 through 28 October 2013},
  pages       = {3--10},
  year        = {2013},
  isbn        = {978-1-4503-2415-1},
  abstract    = {The massive amounts of text data made available through the Google Books digitization project have inspired a new field of big-data textual research. Named culturomics, this field has attracted the attention of a growing number of scholars over recent years. However, initial studies based on these data have been criticized for not referring to relevant work in linguistics and language technology. This paper provides some ideas, thoughts and first steps towards a new culturomics initiative, based this time on Swedish data, which pursues a more knowledge-based approach than previous work in this emerging field. The amount of new Swedish text produced daily and older texts being digitized in cultural heritage projects grows at an accelerating rate. These volumes of text being available in digital form have grown far beyond the capacity of human readers, leaving automated semantic processing of the texts as the only realistic option for accessing and using the information contained in them. The aim of our recently initiated research program is to advance the state of the art in language technology resources and methods for semantic processing of Big Swedish text and focus on the theoretical and methodological advancement of the state of the art in extracting and correlating information from large volumes of Swedish text using a combination of knowledge-based and statistical methods.},
}

@article{borin-etal-2013-saldo-188604,
  author      = {Borin, Lars and Forsberg, Markus and Lönngren, Lennart},
  title       = {{SALDO}: a touch of yin to {WordNet}'s yang},
  journal     = {Language Resources and Evaluation},
  volume      = {47},
  number      = {4},
  pages       = {1191--1211},
  year        = {2013},
  abstract    = {The English-language Princeton WordNet (PWN) and some wordnets for other languages have been extensively used as lexical–semantic knowledge sources in language technology applications, due to their free availability and their size. The ubiquitousness of PWN-type wordnets tends to overshadow the fact that they represent one out of many possible choices for structuring a lexical-semantic resource, and it could be enlightening to look at a differently structured resource both from the point of view of theoretical–methodological considerations and from the point of view of practical text processing requirements. The resource described here—SALDO—is such a lexical–semantic resource, intended primarily for use in language technology applications, and offering an alternative organization to PWN-style wordnets. We present our work on SALDO, compare it with PWN, and discuss some implications of the differences. We also describe an integrated infrastructure for computational lexical resources where SALDO forms the central component.},
}

@article{andersson-ahlberg-2013-towards-181972,
  author      = {Andersson, Peter and Ahlberg, Malin},
  title       = {Towards automatic tracking of lexical change: linking historical lexical resources},
  journal     = {NEALT Proceedings Series},
  volume      = {18},
  year        = {2013},
}

@inproceedings{backstrom-etal-2013-automatic-178351,
  author      = {Bäckström, Linnéa and Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Prentice, Julia and Sköldberg, Emma},
  title       = {Automatic identification of construction candidates for a {Swedish} constructicon},
  booktitle   = {Proceedings of the workshop on lexical semantic resources for NLP at NODALIDA 2013, May 22--24, 2013, Oslo, Norway. NEALT Proceedings Series 19},
  pages       = {2--11},
  year        = {2013},
  abstract    = {We present an experiment designed for extracting construction candidates for a Swedish constructicon from text corpora. We have explored the use of hybrid n-grams with the practical goal to discover previously undescribed partially schematic constructions. The experiment was successful, in that quite a few new constructions were discovered. The precision is low, but as a push-button tool for construction discovery, it has proven a valuable tool for the work on a Swedish constructicon.},
}

% NOTE(review): techreport entries need an "institution" field; the value that
% used to sit in "publisher" (MIT) was moved there unchanged. Confirm that MIT
% rather than W3C is the intended issuing body.
@techreport{roxendal-2013-state-189376,
  author      = {Roxendal, Johan},
  title       = {State Chart {XML} ({SCXML}): State Machine Notation for Control Abstraction – {W3C} Last Call Working Draft 1 {August} 2013},
  institution = {MIT},
  address     = {Cambridge, USA},
  year        = {2013},
}

@inproceedings{pedersen-etal-2013-nordic-178357,
  author      = {Pedersen, Bolette and Borin, Lars and Forsberg, Markus and Kahusk, Neeme and Lindén, Krister and Niemi, Jyrki and Nisbeth, Niklas and Nygaard, Lars and Orav, Heili and Rögnvaldsson, Eiríkur and Seaton, Mitchel and Vider, Kadri and Voionmaa, Kaarlo},
  title       = {{Nordic} and {Baltic} wordnets aligned and compared through “{WordTies}”},
  booktitle   = {Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013), May 22–24, 2013, Oslo University, Norway. NEALT Proceedings Series 16},
  number      = {16},
  pages       = {147--162},
  year        = {2013},
  abstract    = {During the last few years, extensive wordnets have been built locally for the Nordic and Baltic languages applying very different compilation strategies. The aim of the present investigation is to consolidate and examine these wordnets through an alignment via Princeton Core WordNet and thereby compare them along the measures of taxonomical structure, synonym structure, and assigned relations to approximate to a best practice. A common web interface and visualizer “WordTies” is developed to facilitate this purpose. Four bilingual wordnets are automatically processed and evaluated exposing interesting differences between the wordnets. Even if the alignments are judged to be of a good quality, the precision of the translations vary due to considerable differences in hyponymy depth and interpretation of the synset. All seven monolingual and four bilingual wordnets as well as WordTies have been made available via META-SHARE through the META-NORD project.},
}

@article{borin-etal-2013-close-187063,
  author      = {Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin},
  title       = {Close encounters of the fifth kind: Some linguistic and computational aspects of the {Swedish} {FrameNet++} project},
  journal     = {Veredas},
  volume      = {17},
  number      = {1},
  pages       = {28--43},
  year        = {2013},
  abstract    = {The Swedish FrameNet++ (SweFN++) project aims at developing an integrated Swedish lexical macro-resource to be used primarily in language technology R\&D to build natural language processing (NLP) applications. Most of the component resources making up SweFN++ are existing digital lexical resources; in their case the central project effort is directed at making them interoperable on as many levels as possible. An important new resource being created in the project is a Swedish framenet. Now a sister project is starting with the aim of adding a Swedish constructicon (SweCxn) to the macro-resource. In this paper, we discuss some theoretical and conceptual issues which have arisen in the course of our work on the SweFN++ and the planning of the SweCxn, in the close encounter between the practical requirements of NLP and the theory and practice of linguistic – lexical and grammatical – description.},
}

@inproceedings{skoldberg-etal-2013-between-186041,
  author      = {Sköldberg, Emma and Bäckström, Linnéa and Borin, Lars and Forsberg, Markus and Lyngfelt, Benjamin and Olsson, Leif-Jöran and Prentice, Julia and Rydstedt, Rudolf and Tingsell, Sofia and Uppström, Jonatan},
  title       = {Between Grammars and Dictionaries: a {Swedish} Constructicon},
  booktitle   = {Electronic lexicography in the 21st century: thinking outside the paper. Proceedings of the eLex 2013 conference, 17--19 October 2013, Tallinn, Estonia},
  editor      = {Kosem, I. and Kallas, J. and Gantar, P. and Krek, S. and Langemets, M. and Tuulik, M.},
  pages       = {310--327},
  year        = {2013},
  publisher   = {Trojina, Institute for Applied Slovene Studies / Eesti Keele Instituut},
  address     = {Ljubljana/Tallinn},
  abstract    = {This paper introduces the Swedish Constructicon (SweCxn), a database of Swedish constructions currently under development. We also present a small study of the treatment of constructions in Swedish (paper) dictionaries, thus illustrating the need for a constructionist approach, and discuss three different methods used to identify potential constructions for inclusion in the constructicon. SweCxn is a freely available electronic resource, with a particular focus on semi-general linguistic patterns of the type that are difficult to account for from a purely lexicographic or a purely grammatical perspective, and which therefore have tended to be neglected in both dictionaries and grammars. Far from being a small set of borderline cases, such constructions are both numerous and common. They are also quite problematic for second language acquisition as well as LT applications. Accordingly, various kinds of multi-word units have received more attention in recent years, not least from a lexicographic perspective. The coverage, however, is only partial, and the productivity of many constructions is hard to capture from a lexical viewpoint. To identify constructions for SweCxn, we use a combination of methods, such as working from existing construction descriptions for Swedish and other languages, applying LT tools to discover recurring patterns in texts, and extrapolating constructional information from dictionaries.},
}

@inproceedings{dannells-etal-2013-mapserver-178095,
  author      = {Dannélls, Dana and Borin, Lars and Olsson, Leif-Jöran},
  title       = {{MapServer} for {Swedish} Language Technology},
  booktitle   = {Digital Humanities},
  year        = {2013},
  abstract    = {The MapServer application used by the Swedish Language Bank provides new opportunities for visualizing geographical information found in its large repository of written texts, in particular literary texts. The application is capable of performing coordinate search on the basis of recognized place names and rendering both static and dynamic maps that display their geographical locations.},
}

@book{borin-saxena-2013-approaches-184757,
  title       = {Approaches to Measuring Linguistic Differences},
  editor      = {Borin, Lars and Saxena, Anju},
  year        = {2013},
  publisher   = {De Gruyter Mouton},
  address     = {Berlin},
  isbn        = {978-3-11-030525-8},
  abstract    = {The present volume collects contributions addressing different aspects of the measurement of linguistic differences, a topic which probably is as old as language itself but at the same time has acquired renewed interest over the last decade or so, reflecting a rapid development of data-intensive computing in all fields of research, including linguistics.},
}

@incollection{borin-2013-measuring-184758,
  author      = {Borin, Lars},
  title       = {The why and how of measuring linguistic differences},
  booktitle   = {Approaches to Measuring Linguistic Differences},
  editor      = {Borin, Lars and Saxena, Anju},
  pages       = {3--26},
  year        = {2013},
  publisher   = {De Gruyter Mouton},
  address     = {Berlin},
  isbn        = {978-3-11-030525-8},
}

@inproceedings{kokkinakis-2013-annotation-189536,
  author      = {Kokkinakis, Dimitrios},
  title       = {Annotation of interpersonal relations in {Swedish} prose fiction},
  booktitle   = {Proceedings of the 3rd Workshop on Annotation of Corpora for Research in the Humanities (ACRH-3). Sofia, Bulgaria},
  pages       = {37--47},
  year        = {2013},
  isbn        = {978-954-91700-5-4},
  abstract    = {This paper describes the manual annotation of a small sample of Swedish 19th and 20th century prose fiction with interpersonal relations between characters in six literary works. An interpersonal relationship is an association between two or more people that may range in duration from brief to enduring. The annotation is guided by a named entity recognition step. Our goal is to get an in-depth understanding of the difficulties of such a task and elaborate a model that can be applied for similar annotation on a larger scale, both manually as well as automatically. The identification of interpersonal relations can, hopefully, aid the reader of a Swedish literary work to better understand its content and plot, and get a bird’s eye view on the landscape of the core story. Our aim is to use such annotations in a hybrid context, i.e., using machine learning and rule-based methods, which, in conjunction with named entity recognition, can provide the necessary infrastructure for creating detailed biographical sketches and extracting facts for various named entities which can be exploited in various possible ways by Natural Language Processing (NLP) technologies such as summarization, question answering, as well as visual analytic techniques.},
}

% The editor field below was added to match the two sibling chapters of the
% same volume (same booktitle, publisher and ISBN).
@incollection{saxena-borin-2013-carving-184759,
  author      = {Saxena, Anju and Borin, Lars},
  title       = {Carving {Tibeto-Kanauri} by its joints: Using basic vocabulary lists for genetic grouping of languages},
  booktitle   = {Approaches to Measuring Linguistic Differences},
  editor      = {Borin, Lars and Saxena, Anju},
  pages       = {175--198},
  year        = {2013},
  publisher   = {De Gruyter Mouton},
  address     = {Berlin},
  isbn        = {978-3-11-030525-8},
}

@proceedings{volodina-etal-2013-proceedings-188675,
  title       = {Proceedings of the second workshop on {NLP} for computer-assisted language learning at {NODALIDA} 2013, {May} 22--24, 2013, {Oslo}, {Norway}},
  editor      = {Volodina, Elena and Borin, Lars and Loftsson, Hrafn},
  year        = {2013},
  publisher   = {Linköping University Press},
  address     = {Linköping, Sweden},
  isbn        = {978-91-7519-588-9},
}

@incollection{borin-etal-2013-intercontinental-184760,
  author      = {Borin, Lars and Comrie, Bernard and Saxena, Anju},
  title       = {The {Intercontinental Dictionary Series} – a rich and principled database for language comparison},
  booktitle   = {Approaches to Measuring Linguistic Differences},
  editor      = {Borin, Lars and Saxena, Anju},
  pages       = {285--302},
  year        = {2013},
  publisher   = {De Gruyter Mouton},
  address     = {Berlin},
  isbn        = {978-3-11-030525-8},
}

@proceedings{desmedt-etal-2013-proceedings-190263,
  title       = {Proceedings of the workshop on {Nordic} language research infrastructure at {NODALIDA} 2013, {May} 22--24, 2013, {Oslo}, {Norway}},
  editor      = {De Smedt, Koenraad and Borin, Lars and Lindén, Krister and Maegaard, Bente and Rögnvaldsson, Eiríkur and Vider, Kadri},
  year        = {2013},
  publisher   = {Linköping University Electronic Press},
  address     = {Linköping},
  isbn        = {978-91-7519-585-8},
}

@inproceedings{borin-etal-2013-lexical-186032,
  author      = {Borin, Lars and Forsberg, Markus and Olsson, Leif-Jöran and Olsson, Olof and Uppström, Jonatan},
  title       = {The lexical editing system of {Karp}},
  booktitle   = {Electronic lexicography in the 21st century: thinking outside the paper. Proceedings of the eLex 2013 conference, 17--19 October 2013, Tallinn, Estonia},
  editor      = {Kosem, I. and Kallas, J. and Gantar, P. and Krek, S. and Langemets, M. and Tuulik, M.},
  year        = {2013},
  publisher   = {Trojina, Institute for Applied Slovene Studies / Eesti Keele Instituut},
  address     = {Ljubljana/Tallinn},
  isbn        = {978-961-93594-0-2},
  abstract    = {Karp is the open lexical infrastructure of Språkbanken (the Swedish Language Bank). The infrastructure has three main functions: (1) to support the work on creating, curating, and integrating our various lexical resources; (2) to publish the resources, making them searchable and downloadable; and (3) to offer advanced editing functionalities. An important feature of the lexical infrastructure is also that we maintain a strong bidirectional connection to our corpus infrastructure. At the heart of the infrastructure is the SweFN++ project with the goal to create free Swedish lexical resources geared towards language technology applications. The infrastructure currently hosts 23 Swedish lexical resources. The resources are integrated through links to a pivot lexical resource, SALDO, a large morphological and lexical-semantic resource for modern Swedish.},
}

% NOTE(review): the "?" in the next citation key looks like a failed
% transliteration of the first editor's surname; the key is deliberately left
% unchanged so that existing citations still resolve.
@proceedings{ey?orsson-etal-2013-proceedings-190256,
  title       = {Proceedings of the workshop on computational historical linguistics at {NODALIDA} 2013, {May} 22--24, 2013, {Oslo}, {Norway}},
  editor      = {Eyþórsson, Þórhallur and Borin, Lars and Haug, Dag and Rögnvaldsson, Eiríkur},
  year        = {2013},
  publisher   = {Linköping University Electronic Press},
  address     = {Linköping},
  isbn        = {978-91-7519-587-2},
}

@article{oelke-etal-2013-fingerprint-181484,
  author      = {Oelke, D. and Kokkinakis, Dimitrios and Keim, D. A.},
  title       = {Fingerprint Matrices: Uncovering the dynamics of social networks in prose literature},
  journal     = {Computer Graphics Forum},
  volume      = {32},
  number      = {3},
  pages       = {371--380},
  year        = {2013},
  abstract    = {In prose literature often complex dynamics of interpersonal relationships can be observed between the different characters. Traditionally, node-link diagrams are used to depict the social network of a novel. However, static graphs can only visualize the overall social network structure but not the development of the networks over the course of the story, while dynamic graphs have the serious problem that there are many sudden changes between different portions of the overall social network. In this paper we explore means to show the relationships between the characters of a plot and at the same time their development over the course of a novel. Based on a careful exploration of the design space, we suggest a new visualization technique called Fingerprint Matrices. A case study exemplifies the usage of Fingerprint Matrices and shows that they are an effective means to analyze prose literature with respect to the development of relationships between the different characters.},
}

@inproceedings{ju-etal-2013-learning-166990,
  author      = {Ju, Qi and Moschitti, Alessandro and Johansson, Richard},
  title       = {Learning to Rank from Structures in Hierarchical Text Classification},
  booktitle   = {Advances in Information Retrieval; 35th European Conference on IR Research, ECIR 2013, Moscow, Russia, March 24--27, 2013},
  editor      = {Serdyukov, P. and others},
  series      = {Lecture Notes in Computer Science},
  volume      = {7814},
  pages       = {183--194},
  year        = {2013},
  isbn        = {978-3-642-36972-8},
  abstract    = {In this paper, we model learning to rank algorithms based on structural dependencies in hierarchical multi-label text categorization (TC). Our method uses the classification probability of the binary classifiers of a standard top-down approach to generate k-best hypotheses. The latter are generated according to their global probability while at the same time satisfy the structural constraints between father and children nodes. The rank is then refined using Support Vector Machines and tree kernels applied to a structural representation of hypotheses, i.e., a hierarchy tree in which the outcome of binary one-vs-all classifiers is directly marked in its nodes. Our extensive experiments on the whole Reuters Corpus Volume 1 show that our models significantly improve over the state of the art in TC, thanks to the use of structural dependencies.},
}

@inproceedings{skadina-etal-2013-baltic-194532,
  author      = {Skadina, Inguna and Vasiljevs, Andrejs and Borin, Lars and Lindén, Krister and Losnegaard, Gyri and Pedersen, Bolette Sandford and Rozis, Roberts and De Smedt, Koenraad},
  title       = {{Baltic} and {Nordic} parts of the {European} linguistic infrastructure},
  booktitle   = {71. Proceedings of the 19th Nordic Conference of Computational Linguistics (NODALIDA 2013) 22--24, May 2013 Oslo, Norway},
  pages       = {195--211},
  year        = {2013},
  isbn        = {978-91-7519-589-6},
}

@inproceedings{akesson-lindh-2013-describing-188836,
  author      = {Åkesson, Joel and Lindh, Jonas},
  title       = {Describing a database collection procedure for studying ‘double filtering’ effects},
  booktitle   = {22nd conference of the International Association for Forensic Phonetics and Acoustics (IAFPA). July 21st-24th, 2013, Tampa, Florida, USA},
  year        = {2013},
}

% NOTE(review): pages used to read "060204----6"; the separator was reduced to
% a normal "--". Presumably this is article number 060204 with 6 pages --
% confirm against the publisher record.
@inproceedings{gustavsson-etal-2013-neural-177670,
  author      = {Gustavsson, Lisa and Kallioinen, Petter and Klintfors, Eeva and Lindh, Jonas},
  title       = {Neural processing of voices - Familiarity},
  booktitle   = {Proceedings of 21st International Congress on Acoustics},
  volume      = {19},
  number      = {I},
  pages       = {060204--6},
  year        = {2013},
  abstract    = {Brain responses to familiar and unfamiliar voices were investigated with ERPs (Event Related Potentials). Presentation of a stream of one syllable utterances from a female voice established a standard expectation, and similar samples from four other male voices where inserted as unexpected deviants in a typical mismatch paradigm. The participants were 12 students from the basic course in linguistics. Two of the deviant voices were familiar voices of their teachers. The two other deviant voices were matched (same age, sex and dialect) but unfamiliar to the participants. A typical MMN (Mismatch Negativity) was elicited, i.e. a more negative response to the deviants compared to the standards. In contrast to verbal reports, where only one participant identified any of the deviant voices, the MMN response differed on group level between familiar and unfamiliar voices. MMN to familiar voices was larger. Using teachers' voices ensured naturalistic long term exposure, but did not allow for random assignment to conditions of familiarity making the design quasi-experimental. Thus acoustic analysis of voice characteristics as well as follow up studies with randomized exposure to voices are needed to rule out possible confounds and establish a causal effect of voice familiarity.},
}

@inproceedings{lindh-akesson-2013-pilot-188837,
  author      = {Lindh, Jonas and Åkesson, Joel},
  title       = {A pilot study on the effect of different phonetic acoustic input to a {GMM} - {UBM} system for voice comparison},
  booktitle   = {22nd conference of the International Association for Forensic Phonetics and Acoustics (IAFPA). July 21st-24th, 2013, Tampa, Florida, USA},
  year        = {2013},
}

@inproceedings{kokkinakis-malm-2013-macroanalytic-188518,
  author      = {Kokkinakis, Dimitrios and Malm, Mats},
  title       = {A Macroanalytic View of {Swedish} Literature using Topic Modeling},
  booktitle   = {Corpus Linguistics 2013: abstract book (Lancaster)},
  editor      = {Hardie, Andrew and Love, Robbie},
  year        = {2013},
  abstract    = {New research opportunities are plentiful for digital and literature scholars who are currently faced with increasingly large portions of large digitized archives produced during the last decades. Conventional methods of analysis involving a so called close reading view are not enough. Distant reading or macroanalysis is proposed instead, as a better, viable and more pragmatic alternative to the traditional methods of analyzing e.g., literature. According to this view, understanding literature is not accomplished by studying individual texts, but by aggregating and analyzing massive amounts of data. Therefore, applying macroanalytic methods and technologies is a priority among many research groups in the humanities worldwide. In this paper we explore topic modeling, an increasingly popular statistical method used for uncovering themes, topics and patterns in large amounts of text. We use available topic modeling software and, as empirical data, the content of the Swedish literature bank, a constantly growing body of Swedish fiction corpus from the 18th and 19th century. We present preliminary results on a sample of this corpus and discuss how humanistic research can be conducted through this type of computation, as a means to identify potential issues of interest e.g., for historians.},
}

@article{johansson-moschitti-2013-relational-158811,
  author      = {Johansson, Richard and Moschitti, Alessandro},
  title       = {Relational Features in Fine-grained Opinion Analysis},
  journal     = {Computational Linguistics},
  volume      = {39},
  number      = {3},
  pages       = {473--509},
  year        = {2013},
  abstract    = {Fine-grained opinion analysis often makes use of linguistic features but typically does not take the interaction between opinions into account. This article describes a set of experiments that demonstrate that relational features, mainly derived from dependency-syntactic and semantic role structures, can significantly improve the performance of automatic systems for a number of fine-grained opinion analysis tasks: marking up opinion expressions, finding opinion holders, and determining the polarities of opinion expressions. These features make it possible to model the way opinions expressed in natural-language discourse interact in a sentence over arbitrary distances. The use of relations requires us to consider multiple opinions simultaneously, which makes exact inference intractable. However, a reranker can be used as a sufficiently accurate and efficient approximation. A number of feature sets and machine learning approaches for the rerankers are evaluated. For the task of opinion expression extraction, the best model shows a 10-point absolute improvement in soft recall on the MPQA corpus over a conventional sequence labeler based on local contextual features, while precision decreases only slightly. Significant improvements are also seen for the extended tasks where holders and polarities are considered: 10 and 7 points in recall, respectively. In addition, the systems outperform previously published results for unlabeled (6 F-measure points) and polarity-labeled (10–15 points) opinion expression extraction. Finally, as an extrinsic evaluation, the extracted MPQA-style opinion expressions are used in practical opinion mining tasks. In all scenarios considered, the machine learning features derived from the opinion expressions lead to statistically significant improvement.},
}

@inproceedings{johansson-2013-training-173587,
  author      = {Johansson, Richard},
  title       = {Training Parsers on Incompatible Treebanks},
  booktitle   = {Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  pages       = {127--137},
  year        = {2013},
  abstract    = {We consider the problem of training a statistical parser in the situation when there are multiple treebanks available, and these treebanks are annotated according to different linguistic conventions. To address this problem, we present two simple adaptation methods: the first method is based on the idea of using a shared feature representation when parsing multiple treebanks, and the second method on guided parsing where the output of one parser provides features for a second one. To evaluate and analyze the adaptation methods, we train parsers on treebank pairs in four languages: German, Swedish, Italian, and English. We see significant improvements for all eight treebanks when training on the full training sets. However, the clearest benefits are seen when we consider smaller training sets. Our experiments were carried out with unlabeled dependency parsers, but the methods can easily be generalized to other feature-based parsers.},
}

% NOTE(review): the booktitle below holds what looks like a series name; the
% actual proceedings title is not visible here -- confirm before relying on it.
@inproceedings{bennaceur-etal-2013-automatic-158812,
  author      = {Bennaceur, Amel and Johansson, Richard and Moschitti, Alessandro and Sykes, Daniel and Issarny, Valérie},
  title       = {Automatic Service Categorisation through Machine Learning in Emergent Middleware},
  booktitle   = {Lecture notes in computer sciences},
  volume      = {7542},
  pages       = {133--149},
  year        = {2013},
}

@inProceedings{kokkinakis-eklund-2013-query-189552, title = {Query Logs as a Corpus.}, abstract = {This paper provides a detailed description of a large Swedish health-related query log corpus and explores means to derive useful statistics, their distributions and analytics from its content across several dimensions. 
Information acquisition from query logs can be useful for several purposes and potential types of users, such as terminologists, infodemiologists / epidemiologists, medical data and web analysts, specialists in NLP technologies such as information retrieval and text mining but also public officials in health and safety organizations.}, booktitle = {Corpus Linguistics 2013 : abstract book. Lancaster: UCREL}, editor = {Andrew Hardie and Robbie Love}, author = {Kokkinakis, Dimitrios and Eklund, Ann-Marie}, year = {2013}, pages = {329}, } @techreport{roxendal-2013-state-189377, title = {State Chart XML (SCXML): State Machine Notation for Control Abstraction – W3C Working Draft 6 December 2012}, author = {Roxendal, Johan}, year = {2013}, publisher = {MIT}, address = {Cambridge, USA}, } @inProceedings{kokkinakis-2013-figurative-168227, title = {Figurative Language in Swedish Clinical Texts. Potsdam, Germany}, abstract = {Automated processing of clinical texts with the intention to link all important text fragments to various established terminologies and ontologies for relation or event extraction is commonly faced with various less exposed, and not so regularly discussed linguistically motivated issues that needs to be addressed. One of these issues is the usage of figurative language. Figurative language, that is the use of words that go beyond their ordinary meaning, is not only a linguistically complex and challenging problem but also a problem that causes great difficulty for the field of natural language processing (NLP), both for the processing of general language and of various sublanguages, such as clinical medicine. Therefore, a comprehensive model of e.g. clinical language processing needs to account for figurative language usage and this paper provides a description towards this goal. Since the empirical, clinical data used in the study is limited in size, there is no formal distinction made between different sub-classifications of figurative language. 
e.g., metaphors, idioms or simile. As a matter of fact, all these types of expressions form a continuum with fuzzy boundaries, and most of the NLP-oriented approaches discussed in the past have used either very large data for the analysis or hand annotates samples, a situation that has been prohibitive so far in our project. Therefore distinction is solely based on a more general level, namely between literal versus figurative language, and on a more quantitative and corpus-based level, supported with concrete examples that illustrate several types of figurative expressions in the clinical discourse. The main research questions that this paper asks are whether there are traces of figurative language (or at least a subset of such types) in patient doctor and patient nurse interactions, how can they be found in a convenient way and whether these are transferred in the electronic health records and to what degree. }, booktitle = {Computational Semantics in Clinical Text workshop. Part of the 10th International Conference on Computational Semantics}, author = {Kokkinakis, Dimitrios}, year = {2013}, ISBN = {978-1-62748-398-8}, pages = {6}, } @inProceedings{hamon-etal-2013-medication-189545, title = {Medication Extraction and Guessing in Swedish, French and English. }, abstract = {Extraction of information related to the medication is an im-portant task within the biomedical area. While the elaboration and updating of the drug vocabularies cannot follow the rap-id evolution of the drug development, we propose an automat-ic method for the extraction of known and new drug names. Our method combines internal and contextual clues. The method is applied to different types of documents in three languages (Swedish, French and English). The results indi-cate that with this kind of approach, we can efficiently update and enrich the existing drug vocabularies (probably with rap-id manual browsing). 
Precision and recall scores varied be-tween 81%-91% for precision and 85%-100% for recall. As a future work we intend to continuously refine the approach, by for instance better integration of semantic patterns and fuzzy matching that should hopefully enable further increase of the obtained results.}, booktitle = {Proceedings of the 14th World Congress on Medical and Health Informatics (MEDINFO). Studies in Health Technology and Informatics. Copenhagen, Denmark.}, author = {Hamon, Thierry and Grabar, Natalia and Kokkinakis, Dimitrios}, year = {2013}, volume = {192}, } @inProceedings{kokkinakis-2013-terminologihantering-189541, title = {Terminologihantering i medicinska loggfiler.}, booktitle = {Proceedings of the "Nationell termkonferens". Göteborg}, author = {Kokkinakis, Dimitrios}, year = {2013}, } @inProceedings{kokkinakis-2013-medical-188517, title = {Medical Event Extraction using Frame Semantics - Challenges and Opportunities. Samos, Greece}, abstract = {Abstract. The aim of this paper is to present some findings from a study into how a large scale semantic resource, FrameNet, can be applied for event extraction in the (Swedish) biomedical domain. Combining lexical resources with domain specific knowledge provide a powerful modeling mechanism that can be utilized for event extraction and other advanced text mining-related activities. The results, from developing a rule-based approach, showed that only small discrepancies and omissions were found between the semantic descriptions, the corpus data examined and the domain-specific semantics provided by SNOMED CT (medical terminology), NPL (medicinal products) and various semi-automatically developed clue lists (e. g., domain-related abbreviations). 
Although the described experiment is only based on four different domain-specific frames, the methodology is extendable to the rest ones and there is much room for improvements, for instance by combining rule-based with machine learning techniques, and using more advanced syntactic representations.}, booktitle = {Proceedings of the 14th International Conference on Intelligent Text Processing and Computational Linguistics (CICLing)}, author = {Kokkinakis, Dimitrios}, year = {2013}, }