@comment{Layout convention for the entries below: one entry per paragraph, one field per line, person names in "Last, First" form, page ranges written with a double hyphen.}

@techreport{barnett-etal-2015-state-234687,
  title = {State Chart XML (SCXML): State Machine Notation for Control Abstraction},
  abstract = {This document describes SCXML, or the "State Chart extensible Markup Language". SCXML provides a generic state-machine based execution environment based on CCXML and Harel State Tables.},
  author = {Barnett, Jim and Akolkar, Rahul and Auburn, RJ and Bodell, Michael and Burnett, Daniel C. and Carter, Jerry and McGlashan, Scott and Lager, Torbjörn and Helbing, Mark and Hosn, Rafah and Raman, T. V. and Reifenrath, Klaus and Rosenthal, No'am and Roxendal, Johan},
  year = {2015},
  institution = {World Wide Web Consortium},
  publisher = {World Wide Web Consortium},
  address = {Massachusetts, USA},
}

@inProceedings{lindh-2015-forensic-222517,
  title = {Forensic speaker comparison evaluations},
  booktitle = {Proceedings of Roundtable in Forensic Linguistics 2015, September 4th- 6th, Mainz, Germany},
  author = {Lindh, Jonas},
  year = {2015},
}

@inProceedings{kageback-etal-2015-neural-217864,
  title = {Neural context embeddings for automatic discovery of word senses},
  abstract = {Word sense induction (WSI) is the problem of automatically building an inventory of senses for a set of target words using only a text corpus. We introduce a new method for embedding word instances and their context, for use in WSI. The method, Instance-context embedding (ICE), leverages neural word embeddings, and the correlation statistics they capture, to compute high quality embeddings of word contexts. In WSI, these context embeddings are clustered to find the word senses present in the text. ICE is based on a novel method for combining word embeddings using continuous Skip-gram, based on both semantic and a temporal aspects of context words. ICE is evaluated both in a new system, and in an extension to a previous system for WSI. In both cases, we surpass previous state-of-the-art, on the WSI task of SemEval-2013, which highlights the generality of ICE. Our proposed system achieves a 33% relative improvement.},
  booktitle = {Proceedings of the 1st Workshop on Vector Space Modeling for Natural Language Processing. Denver, United States},
  author = {Kågebäck, Mikael and Johansson, Fredrik and Johansson, Richard and Dubhashi, Devdatt},
  year = {2015},
  pages = {25--32},
}

@inProceedings{ghanimifard-johansson-2015-enriching-222749,
  title = {Enriching Word-sense Embeddings with Translational Context},
  abstract = {Vector-space models derived from corpora are an effective way to learn a representation of word meaning directly from data, and these models have many uses in practical applications. A number of unsupervised approaches have been proposed to automatically learn representations of word senses directly from corpora, but since these methods use no information but the words themselves, they sometimes miss distinctions that could be possible to make if more information were available. In this paper, we present a general framework that we call context enrichment that incorporates external information during the training of multi-sense vector-space models. Our approach is agnostic as to which external signal is used to enrich the context, but in this work we consider the use of translations as the source of enrichment. We evaluated the models trained using the translation-enriched context using several similarity benchmarks and a word analogy test set. In all our evaluations, the enriched model outperformed the purely word-based baseline soundly.},
  booktitle = {Proceedings of Recent Advances in Natural Language Processing. International Conference, Hissar, Bulgaria, 7–9 September 2015},
  editor = {Angelova, Galia and Bontcheva, Kalina and Mitkov, Ruslan},
  author = {Ghanimifard, Mehdi and Johansson, Richard},
  year = {2015},
  pages = {208--215},
}

@inProceedings{borin-etal-2015-here-217351,
  title = {Here be dragons? The perils and promises of inter-resource lexical-semantic mapping},
  abstract = {Lexical-semantic knowledge sources are a stock item in the language technologist’s toolbox, having proved their practical worth in many and diverse natural language processing (NLP) applications. In linguistics, lexical semantics comes in many flavors, but in the NLP world, wordnets reign more or less supreme. There has been some promising work utilizing Roget-style thesauruses instead, but wider experimentation is hampered by the limited availability of such resources. The work presented here is a first step in the direction of creating a freely available Roget-style lexical resource for modern Swedish. Here, we explore methods for automatic disambiguation of interresource mappings with the longer-term goal of utilizing similar techniques for automatic enrichment of lexical-semantic resources.},
  booktitle = {Linköping Electronic Conference Proceedings. Semantic resources and semantic annotation for Natural Language Processing and the Digital Humanities. Workshop at NODALIDA, May 11, 13-18 2015, Vilnius},
  author = {Borin, Lars and Nieto Piña, Luis and Johansson, Richard},
  year = {2015},
  volume = {112},
  ISBN = {978-91-7519-049-5},
  pages = {1--11},
}

@inProceedings{lindh-2015-forensic-222514,
  title = {Forensic speaker comparison using machine and mind},
  booktitle = {24th Annual Conference of the International Association for Forensic Phonetics and Acoustics, 8 - 10 July 2015, Leiden, Netherlands},
  author = {Lindh, Jonas},
  year = {2015},
}

@misc{andersen-etal-2015-sibirientyska-215757,
  title = {Sibirientyska kvinnor (Siberian German women)},
  abstract = {Siberian German women The corpus consists of dialogs between four women born in 1927 to 1937 in the Soviet Volga Republic. Their mother tongue is a German variety spoken in Russia since the second half of the 18th century. Since the end of the Second World War, the women have lived in the region of Krasnoyarsk. They talk about their backgrounds and their everyday lives in the village. The corpus consists of about 16 000 words. Russian words and hybrids are given in [brackets], the turns of the interviewers are in {brackets}; all verb forms have got the attribute FINIT or INFINIT. More information on the research project see Syntax in contact.},
  author = {Andersen, Christiane and Forsberg, Markus and Hammarstedt, Martin and Pankow, Alexander},
  year = {2015},
  publisher = {University of Gothenburg},
  address = {Göteborg},
}

@article{tahmasebi-etal-2015-visions-212969,
  title = {Visions and open challenges for a knowledge-based culturomics},
  abstract = {The concept of culturomics was born out of the availability of massive amounts of textual data and the interest to make sense of cultural and language phenomena over time. Thus far however, culturomics has only made use of, and shown the great potential of, statistical methods. In this paper, we present a vision for a knowledge-based culturomics that complements traditional culturomics. We discuss the possibilities and challenges of combining knowledge-based methods with statistical methods and address major challenges that arise due to the nature of the data; diversity of sources, changes in language over time as well as temporal dynamics of information in general. We address all layers needed for knowledge-based culturomics, from natural language processing and relations to summaries and opinions.},
  journal = {International Journal on Digital Libraries},
  author = {Tahmasebi, Nina and Borin, Lars and Capannini, Gabriele and Dubhashi, Devdatt and Exner, Peter and Forsberg, Markus and Gossen, Gerhard and Johansson, Fredrik and Johansson, Richard and Kågebäck, Mikael and Mogren, Olof and Nugues, Pierre and Risse, Thomas},
  year = {2015},
  volume = {15},
  number = {2-4},
  pages = {169--187},
}

@inProceedings{aristodemou-etal-2015-acoustics-239890,
  title = {The Acoustics of Cypriot Greek Fricatives},
  booktitle = {Proceedings of the 6th ISEL Conference on Experimental Linguistics ExLing 2015 26 - 27 June 2015 Athens, Greece Edited by Antonis Botinis},
  author = {Aristodemou, Andrie and Savva, Angelandria and Themistocleous, Charalambos},
  year = {2015},
  publisher = {University of Athens},
  address = {Athens},
  pages = {9--12},
}

@inProceedings{themistocleous-muller-2015-intonation-232414,
  title = {The intonation of Albanian polar questions and statements},
  abstract = {This study aims to provide an account of the effects of sentence type (statements vs. polar questions) on Standard Albanian prenuclear rises through a polynomial model representing the dynamic characteristics of tonal contours. Results show that the main difference in contour shape between Albanian statements and polar questions is located in the shape of the prenuclear rise, and this difference was significant; onset timing of the prenuclear rise, however, did not differ significantly between the two types of sentence.},
  booktitle = {6th International Conference of Experimental Linguistics. ExLing 2015, 26-27 June 2015, Athens, Greece / Edited by Antonis Botinis},
  author = {Themistocleous, Charalambos and Müller, Daniela},
  year = {2015},
  publisher = {University of Athens},
  address = {Athens},
  ISBN = {978-960-466-160-2},
}

@inProceedings{forsberg-etal-2015-speaker-220340,
  title = {Speaker comparison evaluation using a new corpus of urban speech},
  booktitle = {24th Annual Conference of the International Association for Forensic Phonetics and Acoustics, 8-10/7 2015, Leiden},
  author = {Forsberg, Julia and Gross, Johan and Lindh, Jonas and Åkesson, Joel},
  year = {2015},
  pages = {46--47},
}

@inProceedings{johansson-nietopina-2015-combining-216865,
  title = {Combining Relational and Distributional Knowledge for Word Sense Disambiguation},
  abstract = {We present a new approach to word sense disambiguation derived from recent ideas in distributional semantics. The input to the algorithm is a large unlabeled corpus and a graph describing how senses are related; no sense-annotated corpus is needed. The fundamental idea is to embed meaning representations of senses in the same continuous-valued vector space as the representations of words. In this way, the knowledge encoded in the lexical resource is combined with the information derived by the distributional methods. Once this step has been carried out, the sense representations can be plugged back into e.g. the skip-gram model, which allows us to compute scores for the different possible senses of a word in a given context. We evaluated the new word sense disambiguation system on two Swedish test sets annotated with senses defined by the SALDO lexical resource. In both evaluations, our system soundly outperformed random and first-sense baselines. Its accuracy was slightly above that of a well-known graph-based system, while being computationally much more efficient.},
  booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, May 12-13, Vilnius, Lithuania. Linköping Electronic Conference Proceedings 109, Linköping University Electronic Press},
  author = {Johansson, Richard and Nieto Piña, Luis},
  year = {2015},
  ISBN = {978-91-7519-098-3},
  pages = {69--78},
}

@inProceedings{johansson-nietopina-2015-embedding-217863,
  title = {Embedding a Semantic Network in a Word Space},
  abstract = {We present a framework for using continuous-space vector representations of word meaning to derive new vectors representing the meaning of senses listed in a semantic network. It is a post-processing approach that can be applied to several types of word vector representations. It uses two ideas: first, that vectors for polysemous words can be decomposed into a convex combination of sense vectors; secondly, that the vector for a sense is kept similar to those of its neighbors in the network. This leads to a constrained optimization problem, and we present an approximation for the case when the distance function is the squared Euclidean. We applied this algorithm on a Swedish semantic network, and we evaluate the quality of the resulting sense representations extrinsically by showing that they give large improvements when used in a classifier that creates lexical units for FrameNet frames.},
  booktitle = {Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies. Denver, United States, May 31 – June 5, 2015},
  author = {Johansson, Richard and Nieto Piña, Luis},
  year = {2015},
  ISBN = {978-1-941643-49-5},
  pages = {1428--1433},
}

@inProceedings{nietopina-johansson-2015-simple-222611,
  title = {A Simple and Efficient Method to Generate Word Sense Representations},
  abstract = {Distributed representations of words have boosted the performance of many Natural Language Processing tasks. However, usually only one representation per word is obtained, not acknowledging the fact that some words have multiple meanings. This has a negative effect on the individual word representations and the language model as a whole. In this paper we present a simple model that enables recent techniques for building word vectors to represent distinct senses of polysemic words. In our assessment of this model we show that it is able to effectively discriminate between words’ senses and to do so in a computationally efficient manner.},
  booktitle = {Proceedings of International Conference in Recent Advances in Natural Language Processing, Hissar, Bulgaria, 7–9 September 2015},
  editor = {Angelova, Galia and Bontcheva, Kalina and Mitkov, Ruslan},
  author = {Nieto Piña, Luis and Johansson, Richard},
  year = {2015},
  pages = {465--472},
}

@inProceedings{ahlberg-etal-2015-case-217988,
  title = {A case study on supervised classification of Swedish pseudo-coordination},
  abstract = {We present a case study on supervised classification of Swedish pseudo-coordination (SPC). The classification is attempted on the type-level with data collected from two data sets: a blog corpus and a fiction corpus. Two small experiments were designed to evaluate the feasibility of this task. The first experiment explored a classifier’s ability to discriminate pseudo-coordinations from ordinary verb coordinations, given a small labeled data set created during the experiment. The second experiment evaluated how well the classifier performed at detecting and ranking SPCs in a set of unlabeled verb coordinations, to investigate if it could be used as a semi-automatic discovery procedure to find new SPCs.},
  booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, NODALIDA 2015, May 11-13, 2015, Vilnius, Lithuania},
  author = {Ahlberg, Malin and Andersson, Peter and Forsberg, Markus and Tahmasebi, Nina},
  year = {2015},
  publisher = {Linköping University Electronic Press},
  address = {Linköpings universitet},
  ISBN = {978-91-7519-098-3},
}

@inProceedings{fribergheppin-dannells-2015-polysemy-218276,
  title = {Polysemy and questions of lumping or splitting in the construction of Swedish FrameNet},
  abstract = {When working on a lexical resource, such as Swedish FrameNet (SweFN), assumptions based on linguistic theories are made, and methodological directions based upon them are taken. These directions often need to be revised when not beforehand foreseen problems arise. One assumption that was made already in the early development stages of SweFN was that each lexical entry from the reference lexicon, SALDO, would evoke only one semantic frame in SweFN. If a lexical entry evoked more than one frame, it entailed more than one sense and therefore required a new entry in the lexicon. As work progressed, this inclination towards splitting, in the perpetual lumpers and splitters discussion proved to be progressively untenable. This paper will give an account of the problems which were encountered and suggestions for solutions on polysemy issues forcing a discussion on lumping or splitting.},
  booktitle = {Proceedings of the Workshop on Semantic resources and Semantic Annotation for Natural Language Processing and the Digital Humanities at NODALIDA 2015, Vilnius, 11th May, 2015},
  author = {Friberg Heppin, Karin and Dannélls, Dana},
  year = {2015},
  pages = {12--20},
}

@proceedings{volodina-etal-2015-proceedings-226574,
  title = {Proceedings of the 4th workshop on NLP for computer assisted language learning at Nodalida 2015, Vilnius, 11th May, 2015},
  editor = {Volodina, Elena and Borin, Lars and Pilán, Ildikó},
  year = {2015},
  publisher = {Linköping University Press},
  address = {Linköping},
  ISBN = {978-91-7519-036-5},
}

@inProceedings{gruzitis-etal-2015-formalising-220419,
  title = {Formalising the Swedish Constructicon in Grammatical Framework},
  abstract = {This paper presents a semi-automatic approach to acquire a computational construction grammar from the semi-formal Swedish Constructicon. The implementation is based on the resource grammar library provided by Grammatical Framework and can be seen as an extension to the existing Swedish resource grammar. An important consequence of this work is that it generates feedback, explicit and implicit, on how to improve the annotation consistency and adequacy of the original construction resource.},
  booktitle = {Proceedings of the Grammar Engineering Across Frameworks (GEAF) Workshop, 53rd Annual Meeting of the ACL and 7th IJCNLP, Beijing, China, July 26-31, 2015},
  author = {Gruzitis, Normunds and Dannélls, Dana and Lyngfelt, Benjamin and Ranta, Aarne},
  year = {2015},
  ISBN = {978-1-932432-66-4},
  pages = {49--56},
}

@misc{wilhelmsson-2015-autentiska-249227,
  title = {Autentiska och artificiella frågor till svensk text Automatisk frågegenerering jämfört med användares frågor för informationsåtkomst},
  abstract = {Informationssökning mot ostrukturerade datakällor som fri text är ett av de områden där användargränssnitt med fri formulering i naturligt språk har tagits fram. I ett sådant, eventuellt AI-betonat, system kan några grundläggande svårigheter från användarperspektivet märkas. En sådan svårighet är att en användare inte känner till huruvida en fråga som hon avser att ställa egentligen kan besvaras av den aktuella texten. Denna svårighet, tillsammans med andra, som de kraftiga variationsmöjligheterna för formen för ett giltigt svar på en ställd fråga, riskerar att leda till att användarintrycken av systemtypen blir negativa. De moment som behöver ingå i ett sådant frågebaserat informationssystems funktionssätt måste på något sätt inbegripa en mappning av frågeled i frågan (t.ex. när) till den form och grammatisk funktion som svaret i texten måste ha (för frågan när normalt ett tidsadverbial). Bland annat denna iakttagelse inbjuder till användning av automatisk frågegenerering (question generation, QG). Frågegenerering innebär att frågor som en naturlig text besvarar initialt utvinns av ett program som samlar in dem i explicit form. Tanken för användning i informationssökning är att en användare i gränssnittet enbart ska kunna ställa just dessa frågor, vilka faktiskt besvaras av texten. Denna studie gäller just de frågor som ett automatiskt frågegenereringssystem för svenska kan, och genom vidare utveckling, skulle kunna generera för godtycklig digital svensk text. Även om mängden automatiskt genererade frågor och frågeformuleringar kan bli mycket stor, utrymmesmässigt många gånger större än ursprungstexten, så är det tydligt att den beskrivna metoden för frågegenerering för svenska inte kan och troligen inte heller kommer att kunna förmås att skapa alla de frågor och frågeformuleringar som en vanlig användare skulle anse att en viss text besvarar. Men hur väl fungerar då automatiskt genererade frågor i detta sammanhang? Denna uppsats kretsar kring en användarundersökning där undersökningsdeltagare har ombetts att formulera frågor som texter besvarar, och som anses vara relevanta frågor. Den resulterande samlingen frågor undersöktes och kategoriserades. Resultatet av undersökningens huvudfråga visar att bara 20-25 % av användarnas frågeformuleringar skulle kunna genereras direkt automatiskt med aktuell ansats – utan vissa informationstekniska förbättringar. Uppsatsen föreslår viss ny terminologi för detta outforskade område, bl.a. för att skilja mellan de olika grader av processkrav som generering av olika frågeslag från text kräver.},
  author = {Wilhelmsson, Kenneth},
  year = {2015},
  address = {Göteborgs universitet, Inst för tillämpad IT},
}

@inProceedings{ahlberg-etal-2015-paradigm-217987,
  title = {Paradigm classification in supervised learning of morphology},
  abstract = {Supervised morphological paradigm learning by identifying and aligning the longest common subsequence found in inflection tables has recently been proposed as a simple yet competitive way to induce morphological patterns. We combine this non-probabilistic strategy of inflection table generalization with a discriminative classifier to permit the reconstruction of complete inflection tables of unseen words. Our system learns morphological paradigms from labeled examples of inflection patterns (inflection tables) and then produces inflection tables from unseen lemmas or base forms. We evaluate the approach on datasets covering 11 different languages and show that this approach results in consistently higher accuracies vis-a-vis other methods on the same task, thus indicating that the general method is a viable approach to quickly creating high-accuracy morphological resources.},
  booktitle = {Proceedings of the 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  author = {Ahlberg, Malin and Forsberg, Markus and Huldén, Måns},
  year = {2015},
}

@inProceedings{bergenmar-olsson-2015-tracing-228773,
  title = {Tracing Cultural Transfer Through Multiple Translation Analysis. The Case of the Swedish 19th-Century Bourgeois Novel in German and Czech},
  abstract = {In the last decades, Comparative Literature has become more directed towards questions of transculturality. This renders translations of literary texts an important role as a vehicle not just for the transfer of text and language, but also of ideas and cultures. Digital methods for comparing multiple translations within and across languages might prove to be important for exploring how, for example, a Swedish 19th century bourgeois novel is reframed in Czech translations. The chosen example is A Merchant House (1859) by Emilie Flygare–Carlén (1807–1892) who was one of the most popular authors in Czech speaking regions in the late 19th Century. In this paper existing collation tools are used for comparing two different Czech translations (1872 and 1910), by two different translators. This might both reveal how the gender, context and position of the translator colours the literary text and how the translations are adapted to changing literary trends. Furthermore, parallel text alignment is tried as a method for comparing across languages, since the Czech translation is made from a German translation. Are the Czech translations subject to “foreignization” or “domestication”? Or do they retain the same traits as the German translation, which is the source of the first Czech translation? Does the systematic comparison of multiple translations contribute to the understanding of how texts move from certain gendered cultural contexts and ideologies to others?},
  booktitle = {Digital Literary Studies. International Conference May 14-15 2015, Coimbra, Portugal},
  author = {Bergenmar, Jenny and Olsson, Leif-Jöran},
  year = {2015},
}

@inProceedings{forsberg-etal-2015-forensic-222113,
  title = {A forensic and sociophonetic perspective on a new corpus of young urban Swedish},
  booktitle = {10th UK Language Variation and Change (UKLVC) conference 1-3/9 2015, York, UK},
  author = {Forsberg, Julia and Gross, Johan and Lindh, Jonas and Åkesson, Joel},
  year = {2015},
}

@incollection{rama-borin-2015-comparative-197484,
  title = {Comparative evaluation of string similarity measures for automatic language classification},
  booktitle = {Sequences in Language and Text},
  author = {Rama, Taraka and Borin, Lars},
  year = {2015},
  publisher = {De Gruyter Mouton},
  ISBN = {978-3-11-036287-9},
}

@inProceedings{pilan-2015-helping-227313,
  title = {Helping Swedish words come to their senses: word-sense disambiguation based on sense associations from the SALDO lexicon},
  abstract = {This paper describes a knowledge-based approach to word-sense disambiguation using a lexical-semantic resource, SALDO. This hierarchically organized lexicon defining senses in terms of other related senses has not been previously explored for this purpose. The proposed method is based on maximizing the overlap between associated word senses of nouns and verbs co-occurring within a sentence. The results of a small-scale experiment using this method are also reported. Overall, the approach proved more efficient for nouns, since not only was the accuracy score higher for this category (56%) than for verbs (46%), but for nouns in 22% more of the cases was a sense overlap found. As a result of an in-depth analysis of the predictions, we identified a number of ways the system could be modified or extended for an improved performance.},
  booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics (NODALIDA 2015). May 11–13, 2015, Vilnius, Lithuania},
  editor = {Megyesi, Beáta},
  author = {Pilán, Ildikó},
  year = {2015},
  number = {109},
  ISBN = {978-91-7519-098-3},
  pages = {275--279},
}

@inProceedings{volodina-pijetlovic-2015-lark-226543,
  title = {Lark Trills for Language Drills: Text-to-speech technology for language learners},
  abstract = {This paper reports on the development and the initial evaluation of a dictation&spelling prototype exercise for second language (L2) learners of Swedish based on text-to-speech (TTS) technology. Implemented on an already existing Intelligent Computer-Assisted Language Learning (ICALL) platform, the exercise has not only served as a test case for TTS in L2 environment, but has also shown a potential to train listening and orthographic skills, as well as has become a way of collecting learner-specific spelling errors into a database. Exercise generation re-uses well-annotated corpora, lexical resources, and text-to-speech technology with an accompanying talking head.},
  booktitle = {Proceedings of the Ninth Workshop on Innovative Use of NLP for Building Educational Applications, June 4, 2015, Denver, Colorado, USA},
  author = {Volodina, Elena and Pijetlovic, Dijana},
  year = {2015},
  ISBN = {978-1-941643-35-8},
  pages = {107--117},
}

@inProceedings{adesam-etal-2015-multiwords-228833,
  title = {Multiwords, Word Senses and Multiword Senses in the Eukalyptus Treebank of Written Swedish},
  abstract = {Multiwords reside at the intersection of the lexicon and syntax and in an annotation project, they will affect both levels. In the Eukalyptus treebank of written Swedish, we treat multiwords formally as syntactic objects, which are assigned a lexical type and sense. With the help of a simple dichotomy, analyzed vs unanalyzed multiwords, and the expressiveness of the syntactic annotation formalism employed, we are able to flexibly handle most multiword types and usages.},
  booktitle = {Proceedings of the Fourteenth International Workshop on Treebanks and Linguistic Theories (TLT14), 11–12 December 2015 Warsaw, Poland},
  author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
  year = {2015},
  ISBN = {978-83-63159-18-4},
  pages = {3--12},
}

@article{holzmann-etal-2015-named-209780,
  title = {Named entity evolution recognition on the Blogosphere},
  abstract = {Advancements in technology and culture lead to changes in our language. These changes create a gap between the language known by users and the language stored in digital archives. It affects user’s possibility to firstly find content and secondly interpret that content. In a previous work, we introduced our approach for named entity evolution recognition (NEER) in newspaper collections. Lately, increasing efforts in Web preservation have led to increased availability of Web archives covering longer time spans. However, language on the Web is more dynamic than in traditional media and many of the basic assumptions from the newspaper domain do not hold for Web data. In this paper we discuss the limitations of existing methodology for NEER. We approach these by adapting an existing NEER method to work on noisy data like the Web and the Blogosphere in particular. We develop novel filters that reduce the noise and make use of Semantic Web resources to obtain more information about terms. Our evaluation shows the potentials of the proposed approach.},
  journal = {International Journal on Digital Libraries},
  author = {Holzmann, Helge and Tahmasebi, Nina and Risse, Thomas},
  year = {2015},
  volume = {15},
  number = {2-4},
  pages = {209--235},
}

@inProceedings{kokkinakis-etal-2015-gender-215535,
  title = {Gender-Based Vocation Identification in Swedish 19th Century Prose Fiction using Linguistic Patterns, NER and CRF Learning},
  abstract = {This paper investigates how literature could be used as a means to expand our understanding of history. By applying macroanalytic techniques we are aiming to investigate how women enter literature and particularly which functions they assume, their working patterns and if we can spot differences in how often male and female characters are mentioned with various types of occupational titles (vocation) in Swedish literary texts. Modern historiography, and especially feminist and women’s history has emphasized a relative invisibility of women’s work and women workers. The reasons behind this are manifold, and the extent, the margin of error in terms of women’s work activities is of course hard to assess. Therefore, vocation identification can be used as an indicator for such exploration and we present a hybrid system for automatic annotation of vocational signals in 19th century Swedish prose fiction. Beside vocations, the system also assigns gender (male, female or unknown) to the vocation words, a prerequisite for the goals of the study and future in-depth explorations of the corpora.},
  booktitle = {Proceedings of the Fourth Workshop on Computational Linguistics for Literature (Clfl). Co-located with the NAACL/HLT. Denver, Colorado, USA},
  author = {Kokkinakis, Dimitrios and Ighe, Ann and Malm, Mats},
  year = {2015},
  pages = {9},
}

@inProceedings{adesam-etal-2015-defining-217815,
  title = {Defining the Eukalyptus forest – the Koala treebank of Swedish},
  abstract = {This paper details the design of the lexical and syntactic layers of a new annotated corpus of Swedish contemporary texts. In order to make the corpus adaptable into a variety of representations, the annotation is of a hybrid type with head-marked constituents and function-labeled edges, and with a rich annotation of non-local dependencies. The source material has been taken from public sources, to allow the resulting corpus to be made freely available.},
  booktitle = {Proceedings of the 20th Nordic Conference of Computational Linguistics, NODALIDA 2015, May 11-13, 2015, Vilnius, Lithuania. Edited by Beáta Megyesi},
  author = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
  year = {2015},
  ISBN = {978-91-7519-098-3},
  pages = {1--9},
}

@inProceedings{kokkinakis-malm-2015-detecting-225762,
  title = {Detecting Reuse of Biblical Quotes in Swedish 19th Century Fiction using Sequence Alignment},
  abstract = {Text reuse, a form of text repetition, recycling or borrowing, is a theoretically and practically interesting problem that has attracted considerable attention during the last years e.g. in the cultural heritage context (historical and comparative linguistics); in the context of social network propagation of ideas and in the measuring of journalistic reuse. In this paper we briefly outline and experiment with a method used for biological sequence alignment that have been also used in humanities research for e.g. the detection of similar passages in the complete works of Voltaire and 18th century French encyclopedias or for tracing how and which ideas spread in 19th century US-newspaper collections. We use available software (text-PAIR: Pairwise Alignment for Intertextual Relations) and experiment with the Charles XII Bible translation into Swedish, completed in 1703, against the content of the Swedish prose fiction 1800-1900, in order to automatically detect passages taken from this particular Bible translation in the prose fiction corpus.},
  booktitle = {Corpus-based Research in the Humanities workshop (CRH), 10 December 2015 Warsaw, Poland},
  author = {Kokkinakis, Dimitrios and Malm, Mats},
  year = {2015},
  ISBN = {978-83-63159-19-1},
  pages = {79--86},
}