Skip to main content

BibTeX

@article{hengchen-tahmasebi-2021-collection-301262,
	title        = {A Collection of {Swedish} Diachronic Word Embedding Models Trained on Historical Newspaper Data},
	abstract     = {This paper describes the creation of several word embedding models based on a large collection of diachronic Swedish newspaper material available through Språkbanken Text, the Swedish language bank. This data was produced in the context of Språkbanken Text’s continued mission to collaborate with humanities and natural language processing (NLP) researchers and to provide freely available language resources, for the development of state-of-the-art NLP methods and tools.},
	journal      = {Journal of Open Humanities Data},
	author       = {Hengchen, Simon and Tahmasebi, Nina},
	year         = {2021},
	volume       = {7},
	number       = {2},
	pages        = {1--7},
}

@misc{romanello-hengchen-2021-detecting-304990,
	title        = {Detecting Text Reuse with {Passim}},
	abstract     = {In this lesson you will learn about text reuse detection – the automatic identification of reused passages in texts – and why you might want to use it in your research. Through a detailed installation guide and two case studies, this lesson will teach you the ropes of Passim, an open source and scalable tool for text reuse detection.},
	author       = {Romanello, Matteo and Hengchen, Simon},
	year         = {2021},
	volume       = {10},
}

@inproceedings{marjanen-etal-2021-topic-304736,
	title        = {Topic Modelling Discourse Dynamics in Historical Newspapers},
	abstract     = {This paper addresses methodological issues in diachronic data analysis for historical research. We apply two families of topic models (LDA and DTM) on a relatively large set of historical newspapers, with the aim of capturing and understanding discourse dynamics. Our case study focuses on newspapers and periodicals published in Finland between 1854 and 1917, but our method can easily be transposed to any diachronic data. Our main contributions are a) a combined sampling, training and inference procedure for applying topic models to huge and imbalanced diachronic text collections; b) a discussion on the differences between two topic models for this type of data; c) quantifying topic prominence for a period and thus a generalization of document-wise topic assignment to a discourse level; and d) a discussion of the role of humanistic interpretation with regard to analysing discourse dynamics through topic models.},
	booktitle    = {CEUR Workshop Proceedings. Post-Proceedings of the 5th Conference Digital Humanities in the Nordic Countries (DHN 2020), Riga, Latvia, October 21--23, 2020},
	author       = {Marjanen, Jani and Zosa, Elaine and Hengchen, Simon and Pivovarova, Lidia and Tolonen, Mikko},
	year         = {2021},
	publisher    = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik},
	address      = {Aachen},
}

@inproceedings{hengchen-tahmasebi-2021-supersim-305157,
	title        = {{SuperSim}: a test set for word similarity and relatedness in {Swedish}},
	abstract     = {Language models are notoriously difficult to evaluate. 
We release SuperSim, a large-scale similarity and relatedness test set for Swedish built with expert human judgments. The test set is composed of 1,360 word-pairs independently judged for both relatedness and similarity by five annotators. We evaluate three different models (Word2Vec, fastText, and GloVe) trained on two separate Swedish datasets, namely the Swedish Gigaword corpus and a Swedish Wikipedia dump, to provide a baseline for future comparison. 
We release the fully annotated test set, code, baseline models, and data.},
	booktitle    = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31--June 2, 2021, Reykjavik, Iceland (online)},
	author       = {Hengchen, Simon and Tahmasebi, Nina},
	year         = {2021},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
}

@inproceedings{duong-etal-2021-unsupervised-305156,
	title        = {An Unsupervised method for {OCR} Post-Correction and Spelling Normalisation for {Finnish}},
	abstract     = {Historical corpora are known to contain errors introduced by OCR (optical character recognition) methods used in the digitization process, often said to be degrading the performance of NLP systems. Correcting these errors manually is a time-consuming process and a great part of the automatic approaches have been relying on rules or supervised machine learning. We build on previous work on fully automatic unsupervised extraction of parallel data to train a character-based sequence-to-sequence NMT (neural machine translation) model to conduct OCR error correction designed for English, and adapt it to Finnish by proposing solutions that take the rich morphology of the language into account. Our new method shows increased performance while remaining fully unsupervised, with the added benefit of spelling normalisation. The source code and models are available on GitHub and Zenodo.},
	booktitle    = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31--June 2, 2021, Reykjavik, Iceland (online)},
	author       = {Duong, Quan and Hämäläinen, Mika and Hengchen, Simon},
	year         = {2021},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
}

@inproceedings{hengchen-etal-2021-sbx--305550,
	title        = {{SBX-HY} at {RuShiftEval} 2021: Доверяй, но проверяй},
	abstract     = {Research in computational lexical semantic change, due to the inherent nature of language change, has been notoriously difficult to evaluate. This led to the creation of many new exciting models that cannot be easily compared. In this system paper, we describe our submissions at RuShiftEval 2021 – one of the few recently shared tasks that enable researchers, through a standard evaluation set and control conditions, to systematically compare models and gain insights from previous work. We show that despite top results in similar tasks on other languages, Temporal Referencing does not seem to perform as well on Russian.},
	booktitle    = {Computational Linguistics and Intellectual Technologies: Proceedings of the International Conference “Dialogue 2021,” Moscow, June 16–19, 2021},
	author       = {Hengchen, Simon and Viloria, Kate and Indukaev, Andrey},
	year         = {2021},
	publisher    = {Rossiiskii Gosudarstvennyi Gumanitarnyi Universitet},
	address      = {Moscow},
}

@book{tahmasebi-etal-2021-computational-306968,
	title        = {Computational approaches to semantic change},
	abstract     = {Semantic change — how the meanings of words change over time — has preoccupied scholars since well before modern linguistics emerged in the late 19th and early 20th century, ushering in a new methodological turn in the study of language change. Compared to changes in sound and grammar, semantic change is the least  understood. Ever since, the study of semantic change has progressed steadily, accumulating a vast store of knowledge for over a century, encompassing many languages and language families.

Historical linguists also early on realized the potential of computers as research tools, with papers at the very first international conferences in computational linguistics in the 1960s. Such computational studies still tended to be small-scale, method-oriented, and qualitative. However, recent years have witnessed a sea-change in this regard. Big-data empirical quantitative investigations are now coming to the forefront, enabled by enormous advances in storage capability and processing power. Diachronic corpora have grown beyond imagination, defying exploration by traditional manual qualitative methods, and language technology has become increasingly data-driven and semantics-oriented. These developments present a golden opportunity for the empirical study of semantic change over both long and short time spans.

A major challenge presently is to integrate the hard-earned  knowledge and expertise of traditional historical linguistics with  cutting-edge methodology explored primarily in computational linguistics.

The idea for the present volume came out of a concrete response to this challenge.  The 1st International Workshop on Computational Approaches to Historical Language Change (LChange'19), at ACL 2019, brought together scholars from both fields.

This volume offers a survey of this exciting new direction in the study of semantic change, a discussion of the many remaining challenges that we face in pursuing it, and considerably updated and extended versions of a selection of the contributions to the LChange'19 workshop, addressing both more theoretical problems —  e.g., discovery of "laws of semantic change" — and practical applications, such as information retrieval in longitudinal text archives.},
	editor       = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
	year         = {2021},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-98554-008-2},
}

@incollection{hengchen-etal-2021-challenges-306972,
	title        = {Challenges for computational lexical semantic change},
	abstract     = {The computational study of lexical semantic change (LSC) has taken off in the past few years and we are seeing increasing interest in the field, from both computational sciences and linguistics. Most of the research so far has focused on methods for modelling and detecting semantic change using large diachronic textual data, with the majority of the approaches employing neural embeddings. While methods that offer easy modelling of diachronic text are one of the main reasons for the spiking interest in LSC, neural models leave many aspects of the problem unsolved. The field has several open and complex challenges. In this chapter, we aim to describe the most important of these challenges and outline future directions.},
	booktitle    = {Computational approaches to semantic change},
	editor       = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
	author       = {Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik and Dubossarsky, Haim},
	year         = {2021},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-98554-008-2},
	pages        = {341--372},
}

@incollection{perrone-etal-2021-lexical-306974,
	title        = {Lexical semantic change for {Ancient Greek} and {Latin}},
	abstract     = {Change and its precondition, variation, are inherent in languages. Over time, new words enter the lexicon, others become obsolete, and existing words acquire new senses. Associating a word with its correct meaning in its historical context is a central challenge in diachronic research. Historical corpora of classical languages, such as Ancient Greek and Latin, typically come with rich metadata, and existing models are limited by their inability to exploit contextual information beyond the document timestamp. While embedding-based methods feature among the current state of the art systems, they are lacking in their interpretative power. In contrast, Bayesian models provide explicit and interpretable representations of semantic change phenomena. In this chapter we build on GASC, a recent computational approach to semantic change based on a dynamic Bayesian mixture model. In this model, the evolution of word senses over time is based not only on distributional information of lexical nature, but also on text genres. We provide a systematic comparison of dynamic Bayesian mixture models for semantic change with state-of-the-art embedding-based models. On top of providing a full description of meaning change over time, we show that Bayesian mixture models are highly competitive approaches to detect binary semantic change in both Ancient Greek and Latin.},
	booktitle    = {Computational approaches to semantic change},
	editor       = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
	author       = {Perrone, Valerio and Hengchen, Simon and Palma, Marco and Vatri, Alessandro and Smith, Jim Q. and McGillivray, Barbara},
	year         = {2021},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-98554-008-2},
	pages        = {287--310},
}

@article{hengchen-etal-2021-data-309329,
	title        = {A data-driven approach to studying changing vocabularies in historical newspaper collections},
	abstract     = {Nation and nationhood are among the most frequently studied concepts in the field of intellectual history. At the same time, the word ‘nation’ and its historical usage are very vague. The aim in this article was to develop a data-driven method using dependency parsing and neural word embeddings to clarify some of the vagueness in the evolution of this concept. To this end, we propose the following two-step method. First, using linguistic processing, we create a large set of words pertaining to the topic of nation. Second, we train diachronic word embeddings and use them to quantify the strength of the semantic similarity between these words and thereby create meaningful clusters, which are then aligned diachronically. To illustrate the robustness of the study across languages, time spans, as well as large datasets, we apply it to the entirety of five historical newspaper archives in Dutch, Swedish, Finnish, and English. To our knowledge, thus far there have been no large-scale comparative studies of this kind that purport to grasp long-term developments in as many as four different languages in a data-driven way. A particular strength of the method we describe in this article is that, by design, it is not limited to the study of nationhood, but rather expands beyond it to other research questions and is reusable in different contexts.},
	journal      = {Digital Scholarship in the Humanities},
	author       = {Hengchen, Simon and Ros, Ruben and Marjanen, Jani and Tolonen, Mikko},
	year         = {2021},
	volume       = {36},
	number       = {Supplement 2},
	pages        = {109--126},
}