Skip to main content


	title        = {FSvReader – Exploring Old Swedish Cultural Heritage Texts},
	abstract     = {This paper describes FSvReader, a tool for easier access to Old Swedish (13th–16th century) texts. Through automatic fuzzy linking of words in a text to a dictionary describing the language of the time, the reader has direct access to dictionary pop-up definitions, in spite of the large amount of  morphological and spelling variation. The linked dictionary entries can also be used for simple searches in the text, highlighting possible further instances of the same entry. },
	booktitle    = {CEUR Workshop Proceedings, vol. 2084. Proceedings of the Digital Humanities in the Nordic Countries 3rd Conference Helsinki, Finland, March 7-9, 2018. Edited by Eetu Mäkelä, Mikko Tolonen, Jouni Tuominen},
	author       = {Adesam, Yvonne and Ahlberg, Malin and Bouma, Gerlof},
	year         = {2018},
	publisher    = {University of Helsinki, Faculty of Arts},
	address      = {Helsinki},

	title        = {A best-first anagram hashing filter for approximate string matching with generalized edit distance},
	abstract     = {This paper presents an efficient method for approximate string matching against a lexicon. We
define a filter that for each source word selects a small set of target lexical entries, from which
the best match is then selected using generalized edit distance, where edit operations can be
assigned an arbitrary weight. The filter combines a specialized hash function with best-first
search. Our work extends and improves upon a previously proposed hash-based filter, developed
for matching with uniform-weight edit distance. We evaluate an approximate matching system
implemented with the new best-first filter, by conducting several experiments on a historical
corpus and a set of weighted rules taken from the literature. We present running times and
discuss how performance varies using different stopping criteria and target lexica. The results
show that the filter is suitable for large rule sets and million word corpora, and encourage
further development.},
	booktitle    = {24th International Conference on Computational Linguistics COLING,  8-15 December 2012, Mumbai, India. Proceedings},
	author       = {Ahlberg, Malin and Bouma, Gerlof},
	year         = {2012},

	title        = {Språkteknologi för svenska språket genom tiderna},
	abstract     = {Språkbanken, the Swedish Language Bank, is a language technology research unit at the Department of Swedish, University of Gothenburg. We develop language resources – such as corpora, lexical resources, and analytical tools – for all variants of Swedish, from Old Swedish laws to present-day social media. Historical texts offer exciting theoretical and methodological challenges for language technology because they often defy the assumption inherent in most automatic analysis tools that the texts contain a standardized written language. In this article, we describe our ongoing work on the development of annotated historical corpora, as well as our efforts on linking various resources (both corpora and lexical resources).
This research advances the state of the art of language technology as well as enables new research for scholars in other disciplines.},
	journal      = {Kungliga Skytteanska Samfundets Handlingar},
	author       = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Borin, Lars and Bouma, Gerlof and Forsberg, Markus},
	year         = {2016},
	volume       = {76},
	number       = {Studier i svensk språkhistoria 13},
	pages        = {65--87},

	title        = {Computer-aided Morphology Expansion for Old Swedish},
	abstract     = {In this paper we describe and evaluate a tool for paradigm induction and lexicon extraction that has been applied to Old Swedish. The tool is semi-supervised and uses a small seed lexicon and unannotated corpora to derive full inflection tables for input lemmata. In the work presented here, the tool has been modified to deal with the rich spelling variation found in Old Swedish texts. We also present some initial experiments, which are the first steps towards creating a large-scale morphology for Old Swedish.},
	booktitle    = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14) May 26-31, 2014 Reykjavik, Iceland  },
	author       = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Bouma, Gerlof and Forsberg, Markus and Hulden, Mans},
	year         = {2014},
	ISBN         = {978-2-9517408-8-4},
	pages        = {1102--1105},

	title        = {Processing spelling variation in historical text},
	booktitle    = {Proceedings of the Fourth Swedish Language Technology Conference (SLTC)},
	author       = {Adesam, Yvonne and Ahlberg, Malin and Bouma, Gerlof},
	year         = {2012},

	title        = {bokstaffua, bokstaffwa, bokstafwa, bokstaua, bokstawa... Towards lexical link-up for a corpus of Old Swedish},
	booktitle    = {Proceedings of the LTHist workshop at Konvens},
	author       = {Adesam, Yvonne and Ahlberg, Malin and Bouma, Gerlof},
	year         = {2012},

	title        = {The Koala Part-of-Speech Tagset},
	abstract     = {We present the Koala part-of-speech tagset for written Swedish. The categorization takes the Swedish Academy Grammar (SAG) as its main starting point, to fit with the current descriptive view on Swedish grammar. We argue that neither SAG, as is, nor any of the existing part-of-speech tagsets meet our requirements for a broadly applicable categorization. Our proposal is outlined and compared to the other descriptions, and motivations for both the tagset as a whole as well as decisions about individual tags are discussed.},
	journal      = {Northern European Journal of Language Technology},
	author       = {Adesam, Yvonne and Bouma, Gerlof},
	year         = {2019},
	volume       = {6},
	pages        = {5--41},

	title        = {The Eukalyptus Treebank of Written Swedish},
	booktitle    = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018},
	author       = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard and Borin, Lars and Forsberg, Markus},
	year         = {2018},

	title        = {The Koala Part-of-Speech and Morphological Tagset for Swedish},
	booktitle    = {Seventh Swedish Language Technology Conference (SLTC), Stockholm, 7–9 November 2018},
	author       = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
	year         = {2018},

	title        = {Proceedings of the NoDaLiDa 2017 Workshop on Processing Historical Language},
	author       = {Bouma, Gerlof and Adesam, Yvonne},
	year         = {2017},
	publisher    = {Linköping University Electronic Press, Linköpings universitet},
	address      = {Linköping},
	ISBN         = {978-91-7685-503-4},

	title        = {Part-of-speech and Morphology Tagging Old Swedish},
	booktitle    = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016},
	author       = {Bouma, Gerlof and Adesam, Yvonne},
	year         = {2016},

	title        = {SWORD: Towards Cutting-Edge Swedish Word Processing},
	abstract     = {Despite many years of research on Swedish language technology, there is still no well-documented standard for Swedish word processing covering the whole spectrum from low-level tokenization to morphological analysis and disambiguation. SWORD is a new initiative within the SWE-CLARIN consortium aiming to develop documented standards for Swedish word processing. In this paper, we report on a pilot study of Swedish tokenization, where we compare the output of six different tokenizers on four different text types.  For one text type (Wikipedia articles), we also compare to the tokenization produced by six manual annotators.},
	booktitle    = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016},
	author       = {Cap, Fabienne and Adesam, Yvonne and Ahrenberg, Lars and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Kann, Viggo and Östling, Robert and Smith, Aaron and Wirén, Mats and Nivre, Joakim},
	year         = {2016},

	title        = {Multiword Annotation in the Eukalyptus Treebank of Written Swedish},
	booktitle    = {PARSEME, 6th general meeting, 7-8 April 2016, Struga, FYR Macedonia },
	author       = {Bouma, Gerlof and Adesam, Yvonne},
	year         = {2016},

	title        = {Old Swedish Part-of-Speech Tagging between Variation and External Knowledge},
	booktitle    = {Proceedings of the 10th SIGHUM Workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities, Berlin, Germany, August 11, 2016},
	author       = {Adesam, Yvonne and Bouma, Gerlof},
	year         = {2016},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {978-1-945626-09-8},

	title        = {A Multi-domain Corpus of Swedish Word Sense Annotation},
	abstract     = {We describe the word sense annotation layer in Eukalyptus, a freely available five-domain corpus of contemporary Swedish with several annotation layers. The annotation uses the SALDO lexicon to define the sense inventory, and allows word sense annotation of compound segments and multiword units. We give an overview of the new annotation tool developed for this project, and finally present an analysis of the inter-annotator agreement between two annotators.},
	booktitle    = {10th edition of the Language Resources and Evaluation Conference, 23-28 May 2016, Portorož (Slovenia)},
	author       = {Johansson, Richard and Adesam, Yvonne and Bouma, Gerlof and Hedberg, Karin},
	year         = {2016},
	publisher    = {European Language Resources Association},
	ISBN         = {978-2-9517408-9-1},

	title        = {Multiwords, Word Senses and Multiword Senses in the Eukalyptus Treebank of Written Swedish},
	abstract     = {Multiwords reside at the intersection of the lexicon and syntax and in an annotation project, they will affect both levels.  In the Eukalyptus treebank of written Swedish, we treat multiwords formally as syntactic objects, which are assigned a lexical type and sense. With the help of a simple dichotomy, analyzed vs unanalyzed multiwords, and the expressiveness of the syntactic annotation formalism employed, we are able to flexibly handle most multiword types and usages.},
	booktitle    = {Proceedings of the Fourteenth International Workshop on Treebanks and Linguistic Theories (TLT14), 11–12 December 2015 Warsaw, Poland},
	author       = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
	year         = {2015},
	ISBN         = {978-83-63159-18-4},
	pages        = {3--12},

	title        = {Defining the Eukalyptus forest – the Koala treebank of Swedish},
	abstract     = {This paper details the design of the lexical and syntactic layers of a new annotated corpus of Swedish contemporary texts. In order to make the corpus adaptable into a variety of representations, the annotation is of a hybrid type with head-marked constituents and function-labeled edges, and with a rich annotation of non-local dependencies. The source material has been taken from public sources, to allow the resulting corpus to be made freely available.},
	booktitle    = {Proceedings of the 20th Nordic Conference of Computational Linguistics, NODALIDA 2015, May 11-13, 2015, Vilnius, Lithuania. Edited by Beáta Megyesi},
	author       = {Adesam, Yvonne and Bouma, Gerlof and Johansson, Richard},
	year         = {2015},
	ISBN         = {978-91-7519-098-3},
	pages        = {1--9},

	title        = {Koala – Korp’s Linguistic Annotations: Developing an infrastructure for text-based research with high-quality annotations},
	booktitle    = {Proceedings of the Fifth Swedish Language Technology Conference, Uppsala, 13-14 November 2014},
	author       = {Adesam, Yvonne and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Johansson, Richard},
	year         = {2014},

	title        = {Experiments on sentence segmentation in Old Swedish editions},
	booktitle    = {NEALT Proceedings Series },
	author       = {Bouma, Gerlof and Adesam, Yvonne},
	year         = {2013},
	volume       = {18},
	ISBN         = {978-91-7519-587-2},

	title        = {A free cloud service for OCR / En fri molntjänst för OCR},
	author       = {Borin, Lars and Bouma, Gerlof and Dannélls, Dana},
	year         = {2016},
	publisher    = {University of Gothenburg},
	address      = {Göteborg},

	title        = {Building a Diachronic and Contrastive Parallel Corpus – and an Intended Application in the Form of a Study of Germanic Complex Verb Constructions },
	abstract     = {We present a parallel corpus under construction, which is parallel diachronically (through time) as well as contrastively (between languages). The corpus is made up of Bible texts spanning almost 6 centuries in 4 languages. Our project's direct purpose of building the corpus is to track the development of verb combinations containing multiple auxiliary verbs through time in German, Dutch, English and Swedish. We will also make the corpus available to other researchers.

In this poster, we discuss the design of the corpus, our selection of sources, issues with bringing together a wide variety of sources, and alignment of the data. We will also touch upon intended future work concerning the automatic linguistic processing needed to facilitate the study of verb constructions, and the methodological challenges of doing corpus linguistic research on the varying quality of annotations produced by automatic methods on materials from such a wide range of origins.},
	booktitle    = {Digital Humanities 2019, 9–12 July 2019, Utrecht, the Netherlands},
	author       = {Bouma, Gerlof and Coussé, Evie and de Kooter, Dirk-Jan and van der Sijs, Nicoline},
	year         = {2019},

	title        = {Exploring Combining Training Datasets for the CLIN 2019 Shared Task on Cross-genre Gender Detection in Dutch},
	abstract     = {We present our entries to the Shared Task on Cross-genre Gender Detection in Dutch at CLIN 2019. We start from a simple logistic regression model with commonly used features, and consider two ways of combining training data from different sources. Our in-genre models do reasonably well, but the cross-genre models are a lot worse. Post-task experiments show no clear systematic advantage of one way of combining training data sources over the other, but do suggest accuracy can be gained from a better way of setting model hyperparameters.},
	booktitle    = {CEUR Workshop Proceedings, vol 2453. Proceedings of the Shared Task on Cross-Genre Gender Prediction in Dutch at CLIN29 (GxG-CLIN29) co-located with the 29th Conference on Computational Linguistics in The Netherlands (CLIN29). Groningen, The Netherlands, January 31, 2019. Edited by Hessel Haagsma, Tim Kreutz, Masha Medvedeva, Walter Daelemans and Malvina Nissim},
	author       = {Bouma, Gerlof},
	year         = {2019},
	publisher    = {CEUR-WS.org},
	address      = {Aachen},

	title        = {The PROIEL treebank family: a standard for early attestations of Indo-European languages},
	abstract     = {This article describes a family of dependency treebanks of early attestations of Indo-European languages originating in the parallel treebank built by the members of the project pragmatic resources in old Indo-European languages. The treebanks all share a set of open-source software tools, including a web annotation interface, and a set of annotation schemes and guidelines developed especially for the project languages. The treebanks use an enriched dependency grammar scheme complemented by detailed morphological tags, which have proved sufficient to give detailed descriptions of these richly inflected languages, and which have been easy to adapt to new languages. We describe the tools and annotation schemes and discuss some challenges posed by the various languages that have been annotated. We also discuss problems with tokenisation, sentence division and lemmatisation, commonly encountered in ancient and mediaeval texts, and challenges associated with low levels of standardisation and ongoing morphological and syntactic change.},
	journal      = {Language Resources and Evaluation},
	author       = {Eckhoff, Hanne and Bech, Kristin and Bouma, Gerlof and Eide, Kristine and Haug, Dag and Haugen, Odd Einar and Jøhndal, Marius},
	year         = {2018},
	volume       = {52},
	number       = {1},
	pages        = {29--65},

	title        = {Two for the price of one: an LFG treatment of sentence initial object es in German},
	abstract     = {We present an analysis of sentence initial object es ‘it’ in German. The
weak pronoun es may only realize such an object under specific information
structural conditions. We follow recent work suggesting these conditions are
exactly those that licence the use of the presentational construction, marked
by a sentence initial dummy es. We propose that the initial objects are an
example of function amalgamation, show that only objects that may also
appear in the clause-internal postverbal domain can participate in this fusion
and make this precise in LFG. We end the paper with a contrastive discussion.},
	booktitle    = {Proceedings of LFG'12. Miriam Butt and Tracy Holloway King (Eds.)},
	author       = {Theiler, Nadine and Bouma, Gerlof},
	year         = {2012},
	pages        = {603--623},

	title        = {Real-Time Persistent Queues and Deques with Logic Variables (Declarative Pearl)},
	abstract     = {We present a Prolog implementation of real-time persistent
queues and double-ended queues. Our implementation is inspired by
Okasaki’s lazy-functional approach, but relies only on standard Prolog,
comprising of the pure subset plus if-then-else constructs to efficiently
implement guards and meta-calls for convenience. The resulting data
structure is a nice demonstration of the fact that the use of logic variables
to hold the outcome of an unfinished computation can sometimes give
the same kind of elegant and compact solutions as lazy evaluation.},
	booktitle    = {Proceedings of the 11th International Symposium on Functional and Logic Programming (FLOPS 2012)},
	author       = {Bouma, Gerlof},
	year         = {2012},
	ISBN         = {978-3-642-29821-9},
	pages        = {62--73},

	title        = {A German Grammar for Generation in OpenCCG},
	abstract     = {We present a freely available CCG fragment for German that is being developed for natural language generation tasks in the domain of share price statistics. It is implemented in OpenCCG, an open source Java implementation of the computationally attractive CCG formalism. Since generation requires lexical categories to have semantic representations, so that possible realizations can be produced, the underlying grammar needs to define semantics. Hybrid Logic Dependency Semantics, a logic calculus especially suited for encoding linguistic meaning, is used to declare the semantics layer. To our knowledge, related work on German CCG development has not yet focused on the semantics layer. In terms of syntax, we concentrate on aspects of German as a partially free constituent order language. Special attention is paid to scrambling, where we employ CCG's type-changing mechanism in a manner that is somewhat unusual, but allows us to a) minimize the amount of syntactic categories that are needed to model scrambling, compared to providing categories for all possible argument orders, and b) retain enough control to impose restrictions on scrambling.},
	booktitle    = {H. Hedeland, T. Schmidt, K. Wörner (eds.): Multilingual Resources and Multilingual Applications. Proc. of the Conference of the German Society for Computational Linguistics and Language Technology (GSCL), Hamburg, 2011. Working Papers in Multilingualism, Series B },
	author       = {Vancoppenolle, Jean and Tabbert, Eric and Bouma, Gerlof and Stede, Manfred},
	year         = {2011},
	number       = {96},
	pages        = {145--150},

	title        = {The EDGeS Diachronic Bible Corpus},
	abstract     = {We present the EDGeS Diachronic Bible Corpus: a diachronically and synchronically parallel corpus of Bible translations in Dutch, English, German and Swedish, with texts from the 14th century until today. It is compiled in the context of an intended longitudinal and contrastive study of complex verb constructions in Germanic. The paper discusses the corpus design principles, its selection of 36 Bibles, and the information and metadata encoded for the corpus texts. The EDGeS corpus will be available in two forms: the whole corpus will be accessible for researchers behind a login in the well-known OPUS search infrastructure, and the open subpart of the corpus will be available for download.},
	booktitle    = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), May 11-16, 2020, Marseille, France},
	author       = {Bouma, Gerlof and Coussé, Evie and Dijkstra, Trude and van der Sijs, Nicoline},
	year         = {2020},
	publisher    = {European Language Resources Association (ELRA)},
	ISBN         = {979-10-95546-34-4},

	title        = {The Swedish Winogender Dataset},
	abstract     = {We introduce the SweWinogender test set, a diagnostic dataset to measure gender bias in coreference resolution. It is modelled after the English Winogender benchmark, and is released with reference statistics on the distribution of men and women between occupations and the association between gender and occupation in modern corpus material. The paper discusses the design and creation of the dataset, and presents a small investigation of the supplementary statistics.},
	booktitle    = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31 - June 2, 2021, Reykjavik, Iceland (online)},
	author       = {Hansson, Saga and Mavromatakis, Konstantinos and Adesam, Yvonne and Bouma, Gerlof and Dannélls, Dana},
	year         = {2021},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},

	title        = {A lexical resource for computational historical linguistics},
	abstract     = {In this chapter we present the diachronic dimension of Swedish FrameNet++. We describe the historical lexical resources currently available for Swedish, linked to the Contemporary Swedish lexicon Saldo. We present a case study of how interlinking the dictionaries simultaneously allows us to study lexical change. We also present a method of linking text words to lexicon entries, facilitating interactive exploration of historical texts. Diachronical language resources present both a high-variation challenge from a wider language technology perspective, and an interesting object of linguistic study. While a number of improvements of the parts of the diachronic lexical macroresource are still needed, this resource is invaluable for analysing and accessing historical texts, as well as for both synchronic historical and diachronic lexical studies.},
	booktitle    = {The Swedish FrameNet++. Harmonization, integration, method development and practical language technology applications},
	author       = {Adesam, Yvonne and Andersson, Peter and Borin, Lars and Bouma, Gerlof},
	year         = {2021},
	publisher    = {John Benjamins Publishing Company},
	address      = {Amsterdam / Philadelphia},
	ISBN         = {978-90-272-5848-9},
	pages        = {98--121},

	title        = {Hulpwerkwoorden stapelen – toen en nu},
	booktitle    = {Wat gebeurt er in het Nederlands? : over taal, frequentie en variatie / Redactie Nicoline van der Sijs, Lauren Fonteyn en Marten van der Meulen},
	author       = {Bouma, Gerlof and Coussé, Evie},
	year         = {2021},
	publisher    = {Sterck \& de Vreese},
	address      = {Gorredijk},
	ISBN         = {9789056158033},
	pages        = {36--40},

	title        = {Semantic scope restrictions in complex verb constructions in Dutch},
	abstract     = {This article addresses the question of how and why verbs combine in complex verb constructions in Dutch. We discuss introspective data reported in reference grammars and add evidence from corpus data to uncover the systematic ways in which Dutch verbs combine. Our analysis shows that verbs expressing meanings such as tense, aspect, modality and evidentiality are organized in a semantic scope hierarchy; that is, some verb meanings systematically have scope over others but not the other way round. We argue that this scope hierarchy reflects hierarchies of functional categories, elaborated in both functional and generative frameworks.},
	journal      = {Linguistics},
	author       = {Coussé, Evie and Bouma, Gerlof},
	year         = {2022},
	volume       = {60},
	number       = {1},
	pages        = {123--176},

	title        = {Counting dirty words: The effect of OCR quality on token statistics in historical Swedish corpora},
	booktitle    = {Live and learn: Festschrift in honor of Lars Borin / Editors: Elena Volodina, Dana Dannélls, Aleksandrs Berdicevskis, Markus Forsberg, Shafqat Virk},
	author       = {Bouma, Gerlof and Adesam, Yvonne},
	year         = {2022},
	publisher    = {University of Gothenburg},
	address      = {Gothenburg},
	ISBN         = {978-91-87850-83-7},
	pages        = {17--24},

	title        = {DaLAJ-GED – a dataset for Grammatical Error Detection tasks on Swedish},
	booktitle    = {Proceedings of the 12th Workshop on Natural Language Processing for Computer Assisted Language Learning (NLP4CALL 2023)  / edited by David Alfter, Elena Volodina, Thomas François, Arne Jönsson and Evelina Rennes},
	author       = {Volodina, Elena and Ali Mohammed, Yousuf and Berdicevskis, Aleksandrs and Bouma, Gerlof and Öhman, Joey},
	year         = {2023},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping},
	ISBN         = {978-91-8075-250-3},