% Conference contribution at Digital Humanities 2019; the abstract describes it as a poster.
% Conference dates and location are kept in booktitle (classic BibTeX has no
% separate event fields); address is reserved for the publisher's city.
@inproceedings{bouma-etal-2019-building-289485,
  author    = {Bouma, Gerlof and Couss{\'e}, Evie and de Kooter, Dirk-Jan and van der Sijs, Nicoline},
  title     = {Building a Diachronic and Contrastive Parallel Corpus -- and an Intended Application in the Form of a Study of {Germanic} Complex Verb Constructions},
  booktitle = {Digital Humanities 2019, 9--12 July 2019, Utrecht, the Netherlands},
  year      = {2019},
  abstract  = {We present a parallel corpus under construction, which is parallel diachronically (through time) as well as contrastively (between languages). The corpus is made up of Bible texts spanning almost 6 centuries in 4 languages. Our project's direct purpose of building the corpus is to track the development of verb combinations containing multiple auxiliary verbs through time in German, Dutch, English and Swedish. We will also make the corpus available to other researchers. In this poster, we discuss the design of the corpus, our selection of sources, issues with bringing together a wide variety of sources, and alignment of the data. We will also touch upon intended future work concerning the automatic linguistic processing needed to facilitate the study of verb constructions, and the methodological challenges of doing corpus linguistic research on the varying quality of annotations produced by automatic methods on materials from such a wide range of origins.},
}

% Shared-task paper in a CEUR Workshop Proceedings volume.
% Series, volume number and editors each have their own field; booktitle holds
% the proceedings title together with the event location and date.
@inproceedings{bouma-2019-exploring-289484,
  author    = {Bouma, Gerlof},
  title     = {Exploring Combining Training Datasets for the {CLIN} 2019 Shared Task on Cross-genre Gender Detection in {Dutch}},
  booktitle = {Proceedings of the Shared Task on Cross-Genre Gender Prediction in Dutch at {CLIN29} ({GxG-CLIN29}) co-located with the 29th Conference on Computational Linguistics in The Netherlands ({CLIN29}), Groningen, The Netherlands, January 31, 2019},
  editor    = {Haagsma, Hessel and Kreutz, Tim and Medvedeva, Masha and Daelemans, Walter and Nissim, Malvina},
  series    = {CEUR Workshop Proceedings},
  volume    = {2453},
  publisher = {CEUR-WS.org},
  address   = {Aachen},
  year      = {2019},
  abstract  = {We present our entries to the Shared Task on Cross-genre Gender Detection in Dutch at CLIN 2019. We start from a simple logistic regression model with commonly used features, and consider two ways of combining training data from different sources. Our in-genre models do reasonably well, but the cross-genre models are a lot worse. Post-task experiments show no clear systematic advantage of one way of combining training data sources over the other, but do suggest accuracy can be gained from a better way of setting model hyperparameters.},
}

% Journal article describing a part-of-speech tagset for written Swedish.
@article{adesam-bouma-2019-koala-288026,
  author   = {Adesam, Yvonne and Bouma, Gerlof},
  title    = {The {Koala} Part-of-Speech Tagset},
  journal  = {Northern European Journal of Language Technology},
  year     = {2019},
  volume   = {6},
  pages    = {5--41},
  abstract = {We present the Koala part-of-speech tagset for written Swedish. The categorization takes the Swedish Academy Grammar (SAG) as its main starting point, to fit with the current descriptive view on Swedish grammar. We argue that neither SAG, as is, nor any of the existing part-of-speech tagsets meet our requirements for a broadly applicable categorization. Our proposal is outlined and compared to the other descriptions, and motivations for both the tagset as a whole as well as decisions about individual tags are discussed.},
}