Skip to main content

BibTeX

@article{hengchen-tahmasebi-2021-collection-301262,
	title        = {A Collection of Swedish Diachronic Word Embedding Models Trained on Historical Newspaper Data},
	abstract     = {This paper describes the creation of several word embedding models based on a large collection of diachronic Swedish newspaper material available through Språkbanken Text, the Swedish language bank. This data was produced in the context of Språkbanken Text’s continued mission to collaborate with humanities and natural language processing (NLP) researchers and to provide freely available language resources, for the development of state-of-the-art NLP methods and tools.},
	journal      = {Journal of Open Humanities Data},
	author       = {Hengchen, Simon and Tahmasebi, Nina},
	year         = {2021},
	volume       = {7},
	number       = {2},
	pages        = {1--7},
}

@inProceedings{marjanen-etal-2021-topic-304736,
	title        = {Topic Modelling Discourse Dynamics in Historical Newspapers
},
	abstract     = {This paper addresses methodological issues in diachronic data analysis for historical research. We apply two families of topic models (LDA and DTM) on a relatively large set of historical newspapers, with the aim of capturing and understanding discourse dynamics. Our case study focuses on newspapers and periodicals published in Finland between 1854 and 1917, but our method can easily be transposed to any diachronic data. Our main contributions are a) a combined sampling, training and inference procedure for applying topic models to huge and imbalanced diachronic text collections; b) a discussion on the differences between two topic models for this type of data; c) quantifying topic prominence for a period and thus a generalization of document-wise topic assignment to a discourse level; and d) a discussion of the role of humanistic interpretation with regard to analysing discourse dynamics through topic models.
},
	booktitle    = {CEUR Workshop Proceedings. Post-Proceedings of the 5th Conference Digital Humanities in the Nordic Countries (DHN 2020), Riga, Latvia, October 21-23, 2020},
	author       = {Marjanen, Jani and Zosa, Elaine and Hengchen, Simon and Pivovarova, Lidia and Tolonen, Mikko},
	year         = {2021},
	publisher    = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik},
	address      = {Aachen },
}

@misc{romanello-hengchen-2021-detecting-304990,
	title        = {Detecting Text Reuse with Passim},
	abstract     = {In this lesson you will learn about text reuse detection – the automatic identification of reused passages in texts – and why you might want to use it in your research. Through a detailed installation guide and two case studies, this lesson will teach you the ropes of Passim, an open source and scalable tool for text reuse detection.},
	author       = {Romanello, Matteo and Hengchen, Simon},
	year         = {2021},
	volume       = {10},
}