@inproceedings{kokkinakis-etal-2014-vocation-209808,
  title = {Vocation Identification in Swedish Fiction},
  abstract = {This paper presents a system for automatic annotation of vocational signals in 19th century Swedish prose fiction. Besides vocation identification, the system assigns gender (male, female, unknown) to the vocation words. Since gender is a prominent attribute of first names, we apply a named-entity recognizer (NER) that uses first name gazetteers where each name has been pre-assigned gender, which aids gender assignment to vocations with unknown gender if appropriate context is available. We also use a statistical modelling method, conditional random fields (CRF), for learning gender-assigned vocations in combination with the results of the NER and other pattern matching techniques. The purpose of this work is to develop and apply tools to literature as means to expand our understanding of history in the area of literature-based gender studies, e.g. investigate how women enter literature, which functions do they assume and their working patterns. Vocation identification can be used as one such indicator for achieving some of these goals.},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference (SLTC)},
  author = {Kokkinakis, Dimitrios and Ighe, Ann and Malm, Mats},
  year = {2014},
  pages = {3},
}

% NOTE(review): pages = {3} in the entry above looks like a page count rather
% than a page range -- confirm against the proceedings.

@article{kilgariff-etal-2014-corpus-188541,
  title = {Corpus-Based Vocabulary Lists for Language Learners for Nine Languages},
  abstract = {We present the KELLY project and its work on developing monolingual and bilingual word lists for language learning, using corpus methods, for nine languages and thirty-six language pairs. We describe the method and discuss the many challenges encountered. We have loaded the data into an online database to make it accessible for anyone to explore and we present our own first explorations of it.
The focus of the paper is thus twofold, covering pedagogical and methodological aspects of the lists’ construction, and linguistic aspects of the by-product of the project, the KELLY database.},
  journal = {Language Resources and Evaluation},
  author = {Kilgarriff, Adam and Charalabopoulou, Frieda and Gavrilidou, Maria and Bondi Johannessen, Janne and Khalil, Saussan and Johansson Kokkinakis, Sofie and Lew, Robert and Sharoff, Serge and Vadlapudi, R. and Volodina, Elena},
  year = {2014},
  volume = {48},
  number = {1},
  pages = {121--163},
}

@article{borin-etal-2014-geographic-198286,
  title = {Geographic visualization of place names in Swedish literary texts},
  abstract = {This article describes the development of a geographical information system (GIS) at Språkbanken as part of a visualization solution to be used in an archive of historical Swedish literary texts. The research problems we are aiming to address concern orthographic and morphological variation, missing place names, and missing place name coordinates. Some of these problems form a central part in the development of methods and tools for the automatic analysis of historical Swedish literary texts at our research unit. We discuss the advantages and challenges of covering large-scale spelling variation in place names from different sources and in generating maps with focus on different time periods.},
  journal = {Literary \& Linguistic Computing},
  author = {Borin, Lars and Dannélls, Dana and Olsson, Leif-Jöran},
  year = {2014},
  volume = {29},
  number = {3},
  pages = {400--404},
}

@inproceedings{hu-lindh-2014-effects-203082,
  title = {Effects of initial sounds on the perception of Chinese disyllable tones by Swedish students of Chinese},
  abstract = {This paper extends previous research on the effects of initial sounds on perception of Chinese disyllable tones.
A perception test was performed on Swedish adult students of Chinese using disyllable words (most previous studies have been made using solely monosyllable words). The main results indicate that voiced initial sounds e.g. [l] have a strong connection to the tone confusion pattern Tone 2 perceived as Tone 3. On the contrary, a voiceless aspirated initial sound e.g. [th] is mostly connected to misidentifications between Tone 3 to Tone 2. Unvoiced unaspirated initial sounds affect tone perception heavily, especially when they occur in the second syllable of a disyllabic word.},
  booktitle = {2014 International Conference on Phonetic Research and Language Learning (ICPRLL) \& English Phonetic Conference in China (EPCC)},
  author = {Hu, Guohua and Lindh, Jonas},
  year = {2014},
}

@article{rama-borin-2014-gram-187121,
  title = {N-Gram Approaches to the Historical Dynamics of Basic Vocabulary},
  journal = {Journal of Quantitative Linguistics},
  author = {Rama, Taraka and Borin, Lars},
  year = {2014},
  volume = {21},
  number = {1},
  pages = {50--64},
}

@inproceedings{borin-etal-2014-linguistic-198551,
  title = {Linguistic landscaping of South Asia using digital language resources: Genetic vs. areal linguistics},
  booktitle = {Proceedings of LREC, May 26-31, 2014, Reykjavik, Iceland},
  author = {Borin, Lars and Saxena, Anju and Rama, Taraka and Comrie, Bernard},
  year = {2014},
  isbn = {978-2-9517408-8-4},
  pages = {3137--3144},
}

@article{morrison-etal-2014-likelihood-188784,
  title = {Likelihood ratio calculation for a disputed-utterance analysis with limited available data},
  abstract = {We present a disputed-utterance analysis using relevant data, quantitative measurements and statistical models to calculate likelihood ratios. The acoustic data were taken from an actual forensic case in which the amount of data available to train the statistical models was small and the data point from the disputed word was far out on the tail of one of the modelled distributions.
A procedure based on single multivariate Gaussian models for each hypothesis led to an unrealistically high likelihood ratio value with extremely poor reliability, but a procedure based on Hotelling’s $T^2$ statistic and a procedure based on calculating a posterior predictive density produced more acceptable results. The Hotelling’s $T^2$ procedure attempts to take account of the sampling uncertainty of the mean vectors and covariance matrices due to the small number of tokens used to train the models, and the posterior-predictive-density analysis integrates out the values of the mean vectors and covariance matrices as nuisance parameters. Data scarcity is common in forensic speech science and we argue that it is important not to accept extremely large calculated likelihood ratios at face value, but to consider whether such values can be supported given the size of the available data and modelling constraints.},
  journal = {Speech Communication},
  author = {Morrison, Geoffrey Stewart and Lindh, Jonas and Curran, James M.},
  year = {2014},
  volume = {58},
  pages = {81--90},
}

@inproceedings{dannells-etal-2014-multilingual-204733,
  title = {A Multilingual SPARQL-Based Retrieval Interface for Cultural Heritage Objects},
  booktitle = {Proceedings of the ISWC 2014 Posters \& Demonstrations Track, a track within the 13th International Semantic Web Conference (ISWC 2014)},
  author = {Dannélls, Dana and Enache, Ramona and Damova, Mariana},
  year = {2014},
  volume = {1272},
  pages = {205--208},
}

@incollection{damova-etal-2014-natural-178094,
  title = {Natural Language Interaction with Semantic Web Knowledge Bases and Linked Open Data},
  abstract = {Cultural heritage appears to be a very useful use case for Semantic Web technologies. The domain provides with plenty of circumstances where linkages between different knowledge sources are required to ensure access to rich information and respond to the needs of professionals dealing with cultural heritage content.
Semantic Web technologies offer the technological backbone to meet the requirement of integrating heterogeneous data easily, but they are still more adapted to be consumed by computers than by humans, especially non-engineers or developers. This chapter is about a technique which allows interaction in natural language with semantic knowledge bases. The proposed technique offers a method that allows querying a semantic repository in natural language and obtaining results from it as a coherent text. This unique solution includes several steps of transition from natural language to SPARQL and from RDF to coherent multilingual descriptions, using the Grammatical Framework, GF. The approach builds on a semantic knowledge infrastructure in RDF, it is based on OWLIM-SE and the data integration method Reason-able View supplied with an ontological reference layer. The latter is connected via formal rules with abstract representations derived from the syntactic trees of natural language input using the GF resource grammar library.},
  booktitle = {Towards multilingual Semantic Web},
  author = {Damova, Mariana and Dannélls, Dana and Mateva, Maria and Enache, Ramona and Ranta, Aarne},
  year = {2014},
  publisher = {Springer},
  address = {Berlin},
  isbn = {978-3-662-43585-4},
  pages = {211--226},
}

@inproceedings{kokkinakis-etal-2014-hfst-209800,
  title = {HFST-SweNER. A New NER Resource for Swedish},
  abstract = {Named entity recognition (NER) is a knowledge-intensive information extraction task that is used for recognizing textual mentions of entities that belong to a predefined set of categories, such as locations, organizations and time expressions. NER is a challenging, difficult, yet essential preprocessing technology for many natural language processing applications, and particularly crucial for language understanding. NER has been actively explored in academia and in industry especially during the last years due to the advent of social media data.
This paper describes the conversion, modeling and adaptation of a Swedish NER system from a hybrid environment, with integrated functionality from various processing components, to the Helsinki Finite-State Transducer Technology (HFST) platform. This new HFST-based NER (HFST-SweNER) is a full-fledged open source implementation that supports a variety of generic named entity types and consists of multiple, reusable resource layers, e.g., various n-gram-based named entity lists (gazetteers).},
  booktitle = {Proceedings of the 9th edition of the Language Resources and Evaluation Conference (LREC), Reykjavik 26--31 May 2014},
  author = {Kokkinakis, Dimitrios and Niemi, Jyrki and Hardwick, Sam and Lindén, Krister and Borin, Lars},
  year = {2014},
  isbn = {978-2-9517408-8-4},
  pages = {2537--2543},
}

@inproceedings{dannells-etal-2014-using-201951,
  title = {Using language technology resources and tools to construct Swedish FrameNet},
  abstract = {Having access to large lexical and grammatical resources when creating a new language resource is essential for its enhancement and enrichment. This paper describes the interplay and interactive utilization of different language technology tools and resources, in particular the Swedish lexicon SALDO and Swedish Constructicon, in the creation of Swedish FrameNet. We show how integrating resources in a larger infrastructure is much more than the sum of the parts.
},
  booktitle = {Proceedings of the Workshop on Lexical and Grammatical Resources for Language Processing, Dublin Ireland, August 24, 2014},
  author = {Dannélls, Dana and Friberg Heppin, Karin and Ehrlemark, Anna},
  year = {2014},
  isbn = {978-1-873769-44-7},
  pages = {8--17},
}

@article{johansson-2014-automatic-201874,
  title = {Automatic Expansion of the Swedish FrameNet Lexicon},
  abstract = {We evaluate several lexicon-based and corpus-based methods to automatically induce new lexical units for the Swedish FrameNet, and we see that the best-performing setup uses a combination of both types of methods. A particular challenge for Swedish is the absence of a lexical resource such as WordNet; however, we show that the semantic network SALDO, which is organized according to lexicographical principles quite different from those of WordNet, is very useful for our purposes.},
  journal = {Constructions and Frames},
  author = {Johansson, Richard},
  year = {2014},
  volume = {6},
  number = {1},
  pages = {92--113},
}

@article{borin-etal-2014-introduction-202127,
  title = {Introduction: Constructions and frames meet language technology},
  journal = {Constructions and Frames},
  author = {Borin, Lars and de Melo, Gerard and Friberg Heppin, Karin and Torrent, Tiago Timponi},
  year = {2014},
  volume = {6},
  number = {1},
  pages = {1--8},
}

@incollection{ribeck-borin-2014-lexical-201965,
  title = {Lexical Bundles in Swedish Secondary School Textbooks},
  abstract = {The present paper describes the process of identifying lexical bundles, i.e., frequently recurring word sequences such as by means of and in the end of, in secondary school history and physics textbooks. In its determination of finding genuine lexical bundles, i.e. the word boundaries between lexical bundles and surrounding arbitrary words, it proposes a new approach to come to terms with the problem of extracting overlapping bundles of different lengths. 
The results of the structural classification indicate that history uses more NP/PP-based and less dependent-clause-based bundles than physics. The comparative analysis manages to restrict this difference to the referential function. History almost only refers to phrases, i.e. within clauses, while physics much more tends to make references across clauses. The article also includes a report on an extension of the study, ongoing work where the automatic identification of multi-word expressions in general is in focus.},
  booktitle = {Human Language Technology Challenges for Computer Science and Linguistics: 5th Language and Technology Conference, LTC 2011, Poznań, Poland, November 25--27, 2011, Revised Selected Papers},
  editor = {Vetulani, Zygmunt and Mariani, Joseph},
  author = {Ribeck, Judy Carola and Borin, Lars},
  year = {2014},
  publisher = {Springer International Publishing},
  volume = {2014},
  number = {XVI},
  address = {Cham},
  isbn = {978-3-319-08958-4},
  pages = {238--249},
}

% NOTE(review): in the entry above, volume = {2014} repeats the year and
% number = {XVI} looks like a front-matter page count; both are probably export
% artefacts -- verify the real series volume before relying on them.

@inproceedings{volodina-etal-2014-flexible-201885,
  title = {A flexible language learning platform based on language resources and web services},
  abstract = {We present Lärka, the language learning platform of Språkbanken (the Swedish Language Bank). It consists of an exercise generator which reuses resources available through Språkbanken: mainly Korp, the corpus infrastructure, and Karp, the lexical infrastructure. Through Lärka we reach new user groups – students and teachers of Linguistics as well as second language learners and their teachers – and this way bring Språkbanken's resources in a relevant format to them. Lärka can therefore be viewed as a case of a real-life language resource evaluation with end users. In this article we describe Lärka's architecture, its user interface, and the five exercise types that have been released for users so far. The first user evaluation following in-class usage with students of linguistics, speech therapy and teacher candidates are presented.
The outline of future work concludes the paper.},
  booktitle = {Proceedings of LREC 26-31 May 2014, Reykjavik, Iceland},
  author = {Volodina, Elena and Pilán, Ildikó and Borin, Lars and Lindström Tiedemann, Therese},
  year = {2014},
  isbn = {978-2-9517408-8-4},
  pages = {3973--3978},
}

@article{forsberg-etal-2014-from-208123,
  title = {From construction candidates to constructicon entries: An experiment using semi-automatic methods for identifying constructions in corpora},
  abstract = {We present an experiment where natural language processing tools are used to automatically identify potential constructions in a corpus. The experiment was conducted as part of the ongoing efforts to develop a Swedish constructicon. Using an automatic method to suggest constructions has advantages not only for efficiency but also methodologically: it forces the analyst to look more objectively at the constructions actually occurring in corpora, as opposed to focusing on “interesting” constructions only. As a heuristic for identifying potential constructions, the method has proved successful, yielding about 200 (out of 1,200) highly relevant construction candidates.},
  journal = {Constructions and Frames},
  author = {Forsberg, Markus and Johansson, Richard and Bäckström, Linnéa and Borin, Lars and Lyngfelt, Benjamin and Olofsson, Joel and Prentice, Julia},
  year = {2014},
  volume = {6},
  number = {1},
  pages = {114--135},
}

@inproceedings{dannells-gruzitis-2014-controlled-201944,
  title = {Controlled Natural Language Generation from a Multilingual FrameNet-based Grammar},
  abstract = {This paper presents a currently bilingual but potentially multilingual FrameNet-based grammar library implemented in Grammatical Framework. The contribution of this paper is two-fold. First, it offers a methodological approach to automatically generate the grammar based on semantico-syntactic valence patterns extracted from FrameNet-annotated corpora.
Second, it provides a proof of concept for two use cases illustrating how the acquired multilingual grammar can be exploited in different CNL applications in the domains of arts and tourism.},
  booktitle = {Lecture Notes in Computer Science},
  author = {Dannélls, Dana and Gruzitis, Normunds},
  year = {2014},
  volume = {8625},
  isbn = {978-3-319-10222-1},
  pages = {155--166},
}

% NOTE(review): booktitle in the entry above holds a series name (Lecture Notes
% in Computer Science). The proceedings title probably belongs in booktitle and
% the series name in a series field -- confirm against the publisher record.

% Entry type changed from the non-standard "edited_book" to the standard
% proceedings type so that styles print the editor list; the key is unchanged.
@proceedings{volodina-etal-2014-proceedings-206135,
  title = {Proceedings of the third workshop on NLP for computer-assisted language learning at SLTC 2014, Uppsala University},
  abstract = {The workshop series on NLP for Computer-Assisted Language Learning (NLP4CALL) is a meeting place for researchers working on the integration of Natural Language Processing and Speech technologies in CALL systems and exploring the theoretical and methodological issues arising in this connection. The papers in the proceedings volume from the third NLP4CALL workshop cover three main topic areas: resources for development of ICALL applications (e.g., learner corpora and coursebook corpora), tools and algorithms for the analysis of learner language (e.g., focusing on collocations, reading tasks, cloze items, pronunciation, spelling, level classification of learner production), and the generation of learning materials (e.g., exercise generators).},
  editor = {Volodina, Elena and Borin, Lars and Pilán, Ildikó},
  year = {2014},
  publisher = {Linköping University Press},
  address = {Linköping},
  isbn = {978-91-7519-175-1},
}

@inproceedings{adesam-etal-2014-koala-211376,
  title = {Koala – Korp’s Linguistic Annotations: Developing an infrastructure for text-based research with high-quality annotations},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
  author = {Adesam, Yvonne and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Johansson, Richard},
  year = {2014},
}

@inproceedings{rehm-etal-2014-strategic-198556,
  title = {The strategic impact of META-NET on the
regional, national and international level},
  booktitle = {Proceedings of LREC 2014, 26-31 May, Reykjavik, Iceland},
  author = {Rehm, Georg and Uszkoreit, Hans and Ananiadou, Sophia and Bel, Núria and Bieleviciene, Audrone and Borin, Lars and Branco, António and Budin, Gerhard and Calzolari, Nicoletta and Daelemans, Walter and Garabík, Radovan and Grobelnik, Marko and Garcia-Mateo, Carmen and van Genabith, Josef and Hajic, Jan and Hernaez, Inma and Judge, John and Koeva, Svetla and Krek, Simon and Krstev, Cvetana and Lindén, Krister and Magnini, Bernardo and Mariani, Joseph and McNaught, John and Melero, Maite and Monachini, Monica and Moreno, Asuncion and Odijk, Jan and Ogrodniczuk, Maciej and Pezik, Piotr and Piperidis, Stelios and Przepiórkowski, Adam and Rögnvaldsson, Eiríkur and Rosner, Michael and Pedersen, Bolette Sandford and Skadina, Inguna and De Smedt, Koenraad and Tadić, Marko and Thompson, Paul and Tufiș, Dan and Váradi, Tamás and Vasiljevs, Andrejs and Vider, Kadri and Zabarskaite, Jolanta},
  year = {2014},
  isbn = {978-2-9517408-8-4},
  pages = {1517--1524},
}

@inproceedings{borin-etal-2014-representing-204731,
  title = {Representing Swedish Lexical Resources in RDF with lemon},
  abstract = {The paper presents an ongoing project which aims to publish Swedish lexical-semantic resources using Semantic Web and Linked Data technologies.
In this article, we highlight the practical conversion methods and challenges of converting three of the Swedish language resources in RDF with lemon.},
  booktitle = {Proceedings of the ISWC 2014 Posters \& Demonstrations Track, a track within the 13th International Semantic Web Conference (ISWC 2014)},
  author = {Borin, Lars and Dannélls, Dana and Forsberg, Markus and McCrae, John P.},
  year = {2014},
  volume = {1272},
  pages = {329--332},
}

@inproceedings{agfjord-etal-2014-grammar-208776,
  title = {Grammar-based Suggestion Engine with Keyword Search},
  booktitle = {The Fifth Swedish Language Technology Conference},
  author = {Agfjord, Martin and Angelov, Krasimir and Fredelius, Per and Marinov, Svetoslav},
  year = {2014},
}

@inproceedings{ahlberg-etal-2014-swedish-210083,
  title = {Swedish FrameNet++: The Beginning of the End and the End of the Beginning},
  booktitle = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
  author = {Ahlberg, Malin and Borin, Lars and Dannélls, Dana and Forsberg, Markus and Toporowska Gronostaj, Maria and Friberg Heppin, Karin and Johansson, Richard and Kokkinakis, Dimitrios and Olsson, Leif-Jöran and Uppström, Jonatan},
  year = {2014},
}

% NOTE(review): the journal value in the next entry reads like the title of an
% edited volume; if so it should be an incollection entry with booktitle,
% publisher and pages -- confirm before changing.
@article{borin-johansson-2014-kulturomik-192931,
  title = {Kulturomik: Att spana efter språkliga och kulturella förändringar i digitala textarkiv},
  journal = {Historia i en digital värld},
  author = {Borin, Lars and Johansson, Richard},
  year = {2014},
}

@inproceedings{lindh-akesson-2014-effect-218075,
  title = {Effect of the Double-Filtering effect on Automatic Voice Comparison},
  abstract = {In forensic casework today it is not uncommon to receive material recorded with mobile phones or other handheld recording devices. From experience we know most people do not treat recordings with as much care as a person well versed in audio technology. Especially given the varying circumstances under which the material can be recorded.
Thus it is important we learn more about what sort of acoustic effects take place under particular conditions and how these effects can influence Automatic Voice Comparison (AVC). The current study aims at evaluating the effects of recording material consisting of what could be described as ‘doublefiltered’ sound, henceforth referred to as DF, e.g. when a phone call is recorded using a handheld recorder placed in the vicinity of the mobile device. This filtering effect constitutes sound transmitted via GSM communication (1st filter) which then passes an indeterminable distance through the air before being captured by another recording device, such as a mobile phone or handheld recorder’s microphone (2nd filter). This effect affects the energy in the signal. The energy decreases in both the low and the high frequencies, while the middle frequencies are boosted. In this study we have used a database consisting of 150 female speakers of Swedish, all students of speech and language pathology. The recordings were made in a sound treated recording booth using a setup of one computer equipped with an internal MAudio soundcard and a high quality headset microphone. Each recording consists of solicited spontaneous speech together with read speech material (Swedish standard reading passage called ‘Ett svårt fall’). Each speaker is informed and encouraged to finish the task at their own pace. Mean duration of the full recording among the speakers was 69.3 seconds (std 16 seconds).},
  booktitle = {Proceedings of IAFPA 2014. International Association for Forensic Phonetics and Acoustics Annual Conference 31 August - 3 September 2014},
  author = {Lindh, Jonas and Åkesson, Joel},
  year = {2014},
  pages = {2},
}

% NOTE(review): pages = {2} in the entry above looks like a page count rather
% than a page range -- confirm against the proceedings.

@inproceedings{pilan-volodina-2014-reusing-200967,
  title = {Reusing Swedish FrameNet for training semantic roles},
  abstract = {In this article we present the first experiences of reusing the Swedish FrameNet (SweFN) as a resource for training semantic roles. 
We give an account of the procedure we used to adapt SweFN to the needs of students of Linguistics in the form of an automatically generated exercise. During this adaptation, the mapping of the fine-grained distinction of roles from SweFN into learner-friendlier coarse-grained roles presented a major challenge. Besides discussing the details of this mapping, we describe the resulting multiple-choice exercise and its graphical user interface. The exercise was made available through Lärka, an online platform for students of Linguistics and learners of Swedish as a second language. We outline also aspects underlying the selection of the incorrect answer options which include semantic as well as frequency-based criteria. Finally, we present our own observations and initial user feedback about the applicability of such a resource in the pedagogical domain. Students' answers indicated an overall positive experience, the majority found the exercise useful for learning semantic roles.},
  booktitle = {Proceedings of LREC 2014, May 26-31, 2014, Reykjavik, Iceland},
  author = {Pilán, Ildikó and Volodina, Elena},
  year = {2014},
  isbn = {978-2-9517408-8-4},
  pages = {1359--1363},
}

@article{smith-etal-2014-readability-188146,
  title = {Readability, suitability and comprehensibility in patient education materials for Swedish patients with colorectal cancer undergoing elective surgery: A mixed method design},
  abstract = {To characterize education materials provided to patients undergoing colorectal cancer surgery to gain a better understanding of how to design readable, suitable, comprehensible materials.},
  journal = {Patient Education and Counseling},
  author = {Smith, Frida and Carlsson, Eva and Kokkinakis, Dimitrios and Forsberg, Markus and Kodeda, Karl and Sawatzky, Richard and Friberg, Febe and Öhlén, Joakim},
  year = {2014},
  volume = {94},
  number = {2},
  pages = {202--209},
}

@inproceedings{volodina-lindstromtiedemann-2014-evaluating-206141,
  title = {Evaluating students'
metalinguistic knowledge with Lärka},
  booktitle = {Proceedings of the 5th Swedish Language Technology Conference, Uppsala University 13-14 November 2014},
  author = {Volodina, Elena and Lindström Tiedemann, Therese},
  year = {2014},
}

@inproceedings{lyngfelt-etal-2014-svenskt-208457,
  title = {Ett svenskt konstruktikon. Grammatik möter lexikon},
  booktitle = {Svenskans beskrivning : Förhandlingar vid Trettiotredje sammankomsten för svenskans beskrivning. Helsingfors den 15–17 maj 2013},
  author = {Lyngfelt, Benjamin and Borin, Lars and Bäckström, Linnéa and Forsberg, Markus and Olsson, Leif-Jöran and Prentice, Julia and Rydstedt, Rudolf and Sköldberg, Emma and Tingsell, Sofia and Uppström, Jonatan},
  year = {2014},
  volume = {33},
  isbn = {978-951-51-0120-4},
  pages = {268--279},
}

@inproceedings{gunther-etal-2014-rtrgo-201512,
  title = {RTRGO: Enhancing the GU-MLT-LT System for Sentiment Analysis of Short Messages},
  abstract = {This paper describes the enhancements made to our GU-MLT-LT system (Günther and Furrer, 2013) for the SemEval-2014 re-run of the SemEval-2013 shared task on sentiment analysis in Twitter. The changes include the usage of a Twitter-specific tokenizer, additional features and sentiment lexica, feature weighting and random subspace learning. The improvements result in an increase of 4.18 F-measure points on this year’s Twitter test set, ranking 3rd.},
  booktitle = {Proceedings of the 8th International Workshop on Semantic Evaluation (SemEval 2014) August 23-24, 2014 Dublin, Ireland},
  author = {Günther, Tobias and Vancoppenolle, Jean and Johansson, Richard},
  year = {2014},
  isbn = {978-1-941643-24-2},
  pages = {497--502},
}

@article{fribergheppin-toporowskagronostaj-2014-exploiting-210058,
  title = {Exploiting FrameNet for Swedish: Mismatch?},
  abstract = {This paper presents work on developing Swedish FrameNet (SweFN) as a resource analogous to the original Berkeley-based FrameNet.
We describe the theoretical and practical basics of FrameNet, and articulate some multilingual issues that arise in expanding a linguistic resource from one language to another. SweFN uses FrameNet as a starting point in order to save time and effort, and to make it compatible with other FrameNet-based resources. The lexical units are from the pivot lexicon SALDO, making SweFN compatible with other resources of the larger project SweFN++. It is a corpus-based resource, meant to support tasks within natural language processing relying on semantic data.},
  journal = {Constructions and Frames},
  author = {Friberg Heppin, Karin and Toporowska Gronostaj, Maria},
  year = {2014},
  volume = {6},
  number = {1},
  pages = {52--72},
}

@inproceedings{ahlberg-etal-2014-semi-198791,
  title = {Semi-supervised learning of morphological paradigms and lexicons},
  abstract = {We present a semi-supervised approach to the problem of paradigm induction from inflection tables. Our system extracts generalizations from inflection tables, representing the resulting paradigms in an abstract form. The process is intended to be language-independent, and to provide human-readable generalizations of paradigms. The tools we provide can be used by linguists for the rapid creation of lexical resources. We evaluate the system through an inflection table reconstruction task using Wiktionary data for German, Spanish, and Finnish. With no additional corpus information available, the evaluation yields per word form accuracy scores on inflecting unseen base forms in different languages ranging from 87.81\% (German nouns) to 99.52\% (Spanish verbs); with additional unlabeled text corpora available for training the scores range from 91.81\% (German nouns) to 99.58\% (Spanish verbs).
We separately evaluate the system in a simulated task of Swedish lexicon creation, and show that on the basis of a small number of inflection tables, the system can accurately collect from a list of noun forms a lexicon with inflection information ranging from 100.0\% correct (collect 100 words), to 96.4\% correct (collect 1000 words).},
  booktitle = {Proceedings of the 14th Conference of the European Chapter of the Association for Computational Linguistics, Gothenburg, Sweden 26–30 April 2014},
  author = {Ahlberg, Malin and Forsberg, Markus and Hulden, Mans},
  year = {2014},
  isbn = {978-1-937284-78-7},
  pages = {569--578},
}

@inproceedings{volodina-etal-2014-what-206132,
  title = {You get what you annotate: a pedagogically annotated corpus of coursebooks for Swedish as a Second Language},
  abstract = {We present the COCTAILL corpus, containing over 700.000 tokens of Swedish texts from 12 coursebooks aimed at second/foreign language (L2) learning. Each text in the corpus is labelled with a proficiency level according to the CEFR proficiency scale. Genres, topics, associated activities, vocabulary lists and other types of information are annotated in the coursebooks to facilitate Second Language Acquisition (SLA)-aware studies and experiments aimed at Intelligent Computer-Assisted Language Learning (ICALL). Linguistic annotation in the form of parts-of-speech (POS; e.g. nouns, verbs), base forms (lemmas) and syntactic relations (e.g. subject, object) has been also added to the corpus. In the article we describe our annotation scheme and the editor we have developed for the content mark-up of the coursebooks, including the taxonomy of pedagogical activities and linguistic skills. Inter-annotator agreement has been computed and reported on a subset of the corpus. Surprisingly, we have not found any other examples of pedagogically marked-up corpora based on L2 coursebooks to draw on existing experiences.
Hence, our work may be viewed as “groping in the darkness” and eventually a starting point for others. The paper also presents our first quantitative exploration of the corpus where we focus on textually and pedagogically annotated features of the coursebooks to exemplify what types of studies can be performed using the presented annotation scheme. We explore trends shown in use of topics and genres over proficiency levels and compare pedagogical focus of exercises across levels. The final section of the paper summarises the potential this corpus holds for research within SLA and various ICALL tasks. },
  booktitle = {NEALT Proceedings Series},
  author = {Volodina, Elena and Pilán, Ildikó and Rødven-Eide, Stian and Heidarsson, Hannes},
  year = {2014},
  volume = {22},
  isbn = {978-91-7519-175-1},
  pages = {128--144},
}

% NOTE(review): booktitle in the entry above holds a series name (NEALT
% Proceedings Series); the proceedings title probably belongs in booktitle and
% the series name in a series field -- confirm.

@inproceedings{adesam-etal-2014-computer-198794,
  title = {Computer-aided Morphology Expansion for Old Swedish},
  abstract = {In this paper we describe and evaluate a tool for paradigm induction and lexicon extraction that has been applied to Old Swedish. The tool is semi-supervised and uses a small seed lexicon and unannotated corpora to derive full inflection tables for input lemmata. In the work presented here, the tool has been modified to deal with the rich spelling variation found in Old Swedish texts. 
We also present some initial experiments, which are the first steps towards creating a large-scale morphology for Old Swedish.}, booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14) May 26-31, 2014 Reykjavik, Iceland }, author = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Bouma, Gerlof and Forsberg, Markus and Hulden, Mans}, year = {2014}, ISBN = { 978-2-9517408-8-4}, pages = {1102--1105}, } @inProceedings{dannells-gruzitis-2014-extracting-198499, title = {Extracting a bilingual semantic grammar from FrameNet-annotated corpora}, abstract = {We present the creation of an English-Swedish FrameNet-based grammar in Grammatical Framework. The aim of this research is to make existing framenets computationally accessible for multilingual natural language applications via a common semantic grammar API, and to facilitate the porting of such grammar to other languages. In this paper, we describe the abstract syntax of the semantic grammar while focusing on its automatic extraction possibilities. We have extracted a shared abstract syntax from ~58,500 annotated sentences in Berkeley FrameNet (BFN) and ~3,500 annotated sentences in Swedish FrameNet (SweFN). The abstract syntax defines 769 frame-specific valence patterns that cover 77,8% examples in BFN and 74,9% in SweFN belonging to the shared set of 471 frames. 
As a side result, we provide a unified method for comparing semantic and syntactic valence patterns across framenets.}, booktitle = {Proceedings of the 9th International Conference on Language Resources and Evaluation (LREC)}, author = {Dannélls, Dana and Gruzitis, Normunds}, year = {2014}, publisher = {European Language Resources Association}, ISBN = {978-2-9517408-8-4}, } @inProceedings{borin-forsberg-2014-swesaurus;-193085, title = {Swesaurus; or, The Frankenstein Approach to Wordnet Construction}, abstract = {Swesaurus is a freely available (under a CC-BY license) Swedish wordnet under construction, built primarily by scavenging and recycling information from a number of existing lexical resources. Among its more unusual characteristics are graded lexical-semantic relations and inclusion of all parts of speech, not only open-class items. }, booktitle = {Proceedings of the Seventh Global WordNet Conference (GWC 2014)}, author = {Borin, Lars and Forsberg, Markus}, year = {2014}, ISBN = {978-9949-32-492-7}, } @inProceedings{kokkinakis-etal-2014-semantics-209802, title = {Semantics in Storytelling in Swedish Fiction}, abstract = {In this paper, we aim to define foundations and research questions for future large scale exploration of various types of semantic relationships in literature, namely Swedish prose fiction. More specifically, we are interested to get an in-depth understanding of storytelling in Swedish fiction by analyzing and mining the narrative discourse in a small sample of such data, focusing on interpersonal relationships and answering various questions such as how to recognize and assess gender patterns. Our intention is to apply our findings into a much larger scale in the near future in order to obtain useful insights about the social relations, structures, behavior and everyday life of characters found in literary works, thus enhancing the use of prose fiction as a source for research within the humanities and social sciences. 
Our work is inspired by the notions of distant reading and macroanalysis, a relatively new and often contested paradigm of literary research. In order to achieve our goal we strive for a combination of natural language processing techniques and simple visualizations that allow the user to rapidly focus on key areas of interest and provide the ability to discover latent semantic patterns and structures. }, booktitle = {Proceedings of the Digital Access to textual Cultural Heritage (DATeCH).}, author = {Kokkinakis, Dimitrios and Malm, Mats and Bergenmar, Jenny and Ighe, Ann}, year = {2014}, ISBN = {978-1-4503-2588-2}, pages = {6}, } @inProceedings{kageback-etal-2014-extractive-210878, title = {Extractive Summarization using Continuous Vector Space Models}, abstract = {Automatic summarization can help users extract the most important pieces of information from the vast amount of text digitized into electronic form everyday. Central to automatic summarization is the notion of similarity between sentences in text. In this paper we propose the use of continuous vector representations for semantically aware representations of sentences as a basis for measuring similarity. We evaluate different compositions for sentence representation on a standard dataset using the ROUGE evaluation measures. Our experiments show that the evaluated methods improve the performance of a state-of-the-art summarization framework and strongly indicate the benefits of continuous word vector representations for automatic summarization.}, booktitle = {Proceedings of the 2nd Workshop on Continuous Vector Space Models and their Compositionality (CVSC) EACL, April 26-30, 2014 Gothenburg, Sweden}, author = {Kågebäck, Mikael and Mogren, Olof and Tahmasebi, Nina and Dubhashi, Devdatt}, year = {2014}, ISBN = {978-1-937284-94-7}, pages = {31--39}, } @inProceedings{borin-etal-2014-bring-198549, title = {Bring vs. 
MTRoget: Evaluating automatic thesaurus translation}, booktitle = {Proceedings of LREC 2014, May 26-31, 2014 Reykjavik, Iceland}, author = {Borin, Lars and Allwood, Jens and de Melo, Gerard}, year = {2014}, publisher = {European Language Resources Association}, ISBN = {978-2-9517408-8-4}, } @article{dupplaw-etal-2014-information-195563, title = {Information extraction from multimedia web documents: an open-source platform and testbed}, abstract = {The LivingKnowledge project aimed to enhance the current state of the art in search, retrieval and knowledge management on the web by advancing the use of sentiment and opinion analysis within multimedia applications. To achieve this aim, a diverse set of novel and complementary analysis techniques have been integrated into a single, but extensible software platform on which such applications can be built. The platform combines state-of-the-art techniques for extracting facts, opinions and sentiment from multimedia documents, and unlike earlier platforms, it exploits both visual and textual techniques to support multimedia information retrieval. Foreseeing the usefulness of this software in the wider community, the platform has been made generally available as an open-source project. 
This paper describes the platform design, gives an overview of the analysis algorithms integrated into the system and describes two applications that utilise the system for multimedia information retrieval.}, journal = {International Journal of Multimedia Information Retrieval}, author = {Dupplaw, David and Matthews, Michael and Johansson, Richard and Boato, Giulia and Costanzo, Andrea and Fontani, Marco and Minack, Enrico and Demidova, Elena and Blanco, Roi and Griffiths, Thomas and Lewis, Paul and Hare, Jonathon and Moschitti, Alessandro}, year = {2014}, volume = {3}, number = {2}, pages = {97--111}, } @inProceedings{pilan-etal-2014-rule-210940, title = {Rule-based and machine learning approaches for second language sentence-level readability}, abstract = {We present approaches for the identification of sentences understandable by second language learners of Swedish, which can be used in automatically generated exercises based on corpora. In this work we merged methods and knowledge from machine learning-based readability research, from rule-based studies of Good Dictionary Examples and from second language learning syllabuses. The proposed selection methods have also been implemented as a module in a free web-based language learning platform. Users can use different parameters and linguistic filters to personalize their sentence search with or without a machine learning component assessing readability. The sentences selected have already found practical use as multiple-choice exercise items within the same platform. Out of a number of deep linguistic indicators explored, we found mainly lexical-morphological and semantic features informative for second language sentence-level readability. We obtained a readability classification accuracy result of 71%, which approaches the performance of other models used in similar tasks. 
Furthermore, during an empirical evaluation with teachers and students, about seven out of ten sentences selected were considered understandable, the rule-based approach slightly outperforming the method incorporating the machine learning model.}, booktitle = {Proceedings of the Ninth Workshop on Innovative Use of NLP for Building Educational Applications, June 26, 2014 Baltimore, Maryland, USA}, author = {Pilán, Ildikó and Volodina, Elena and Johansson, Richard}, year = {2014}, ISBN = {978-1-941643-03-7}, pages = {174----184}, } @inProceedings{lenkiewicz-etal-2014-dwan-216695, title = {The DWAN framework: Application of a web annotation framework for the general humanities to the domain of language resources}, abstract = {Researchers share large amounts of digital resources, which offer new chances for cooperation. Collaborative annotation systems are meant to support this. Often, these systems are targeted at a specific task or domain, e.g., annotation of a corpus. The DWAN framework for web annotation is generic and can support a wide range of tasks and domains. A key feature of the framework is its support for caching representations of the annotated resource. This allows showing the context of the annotation even if the resource has changed or has been removed. The paper describes the design and implementation of the framework. Use cases provided by researchers are well in line with the key characteristics of the DWAN annotation framework.}, booktitle = {LREC 2014, Reykjavik, Iceland; http://lrec2014.lrec-conf.org/en/conference-programme/list-accepted-papers/}, author = {Lenkiewicz, Przemyslaw and Shkaravska, Olha and Goosen, Twan and Windhouwer, Menzo and Broeder, Daan and Roth, Stephanie S. 
and Olsson, Olof}, year = {2014}, } @inProceedings{grahn-kokkinakis-2014-legitimating-216142, title = {Legitimating the visit - a recurrent challenge among patients with medically unexplained symptoms}, abstract = {The doctor’s evaluation of presented symptoms as doctorable, is a legitimation of the patient’s decision to seek medical care. It is also a confirmation of the rational, and even the moral, status of the patient, since consulting a doctor without good reasons is considered irrational. The analysis focuses on how patients take initiatives to present problems and on the doctors’ responses and evaluations regarding the doctorability. Situations where participants seem to have different views of the doctorability of the problems are examined in relation to conversational practices and social actions. The analyses shows that the doctor as well as the patient orients to the potential doctorability of the problems and to the moral challenges related to it, but that their different expectations and roles lead to communicatively unclear situations. Further analyses will illustrate in what ways the MUS-patients’ recurrent challenge of legitimating their visits could be influenced by the interaction, and hence in what ways conscious conversational practices from the care givers might facilitate these situations.}, booktitle = {Conference on Communication, Medicine and Ethics (COMET), Lugano, 26-28 June 2014}, author = {Grahn, Inga-Lill and Kokkinakis, Dimitrios}, year = {2014}, } @inProceedings{moradi-etal-2014-graph-197533, title = {A Graph-Based Analysis of Medical Queries of a Swedish Health Care Portal}, abstract = {Today web portals play an increasingly important role in health care allowing information seekers to learn about diseases and treatments, and to administrate their care. Therefore, it is important that the portals are able to support this process as well as possible. 
In this paper, we study the search logs of a public Swedish health portal to address the questions if health information seeking differs from other types of Internet search and if there is a potential for utilizing network analysis methods in combination with semantic annotation to gain insights into search behaviors. Using a semantic-based method and a graph-based analysis of word cooccurrences in queries, we show there is an overlap among the results indicating a potential role of these types of methods to gain insights and facilitate improved information search. In addition we show that samples, windows of a month, of search logs may be sufficient to obtain similar results as using larger windows. We also show that medical queries share the same structural properties found for other types of information searches, thereby indicating an ability to reuse existing analysis methods for this type of search data.}, booktitle = {The Fifth International Workshop on Health Text Mining and Information Analysis (Louhi)}, author = {Moradi, Farnaz and Eklund, Ann-Marie and Kokkinakis, Dimitrios and Olovsson, Tomas and Tsigas, Philippas}, year = {2014}, ISBN = {978-1-937284-90-9}, pages = {2--10}, } @inProceedings{kokkinakis-grahn-2014-corpus-209807, title = {A corpus-based approach to the identification of non-literal language in a medical setting.}, abstract = {Automated processing of clinical texts is commonly faced with various less exposed, and not so regularly discussed linguistically complex problems that need to be addressed. One of these issues concerns the usage of figurative language. Figurative language implies the use of words that go beyond their ordinary meaning, a linguistically complex and challenging problem and also a problem that causes great difficulty for the field of natural language processing (NLP). The problem is equally prevalent in both general language and also in various sublanguages, such as clinical medicine. 
Therefore we believe that a comprehensive model of e.g. clinical language processing needs to account for figurative language usage, and this paper provides a description, and preliminary results towards this goal. Since the empirical, clinical data used in the study is limited in size, there is no formal distinction made between different sub-classifications of figurative language. e.g., metaphors, idioms or simile. We illustrate several types of figurative expressions in the clinical discourse and apply a rather quantitative and corpus-based level analysis. The main research questions that this paper asks are whether there are traces of figurative language (or at least a subset of such types) in patient-doctor and patient-nurse interactions, how can they be found in a convenient way and whether these are transferred in the electronic health records and to what degree.}, booktitle = {Proceedings of the Conference on Communication, Medicine and Ethics (COMET), Lugano, 26-28 June 2014}, author = {Kokkinakis, Dimitrios and Grahn, Inga-Lill}, year = {2014}, pages = {1}, }