@comment{Publication list (BibTeX export). Web-page artifacts removed; BibTeX ignores text outside entries.}

@inproceedings{dubossarsky-etal-2019-time-281304,
	title        = {Time-Out: {Temporal Referencing} for Robust Modeling of Lexical Semantic Change},
	abstract     = {State-of-the-art models of lexical semantic change detection suffer from noise stemming from vector space alignment. We have empirically tested the Temporal Referencing method for lexical semantic change and show that, by avoiding alignment, it is less affected by this noise. We show that, trained on a diachronic corpus, the skip-gram with negative sampling architecture with temporal referencing outperforms alignment models on a synthetic task as well as a manual testset. We introduce a principled way to simulate lexical semantic change and systematically control for possible biases.},
	booktitle    = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Florence, Italy, July 28 -- August 2, 2019},
	editor       = {Korhonen, Anna and Traum, David and M{\`a}rquez, Llu{\'i}s},
	author       = {Dubossarsky, Haim and Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik},
	year         = {2019},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	isbn         = {978-1-950737-48-2},
}

@inproceedings{dubossarsky-etal-2019-time-295438,
	title        = {Time for Change: Evaluating Models of Semantic Change Without Evaluation Tasks},
	booktitle    = {Cambridge Language Sciences Annual Symposium 2019: Perspectives on Language Change},
	author       = {Dubossarsky, Haim and Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik},
	year         = {2019},
}

@article{hengchen-tahmasebi-2021-collection-301262,
	title        = {A Collection of {Swedish} Diachronic Word Embedding Models Trained on Historical Newspaper Data},
	abstract     = {This paper describes the creation of several word embedding models based on a large collection of diachronic Swedish newspaper material available through Språkbanken Text, the Swedish language bank. This data was produced in the context of Språkbanken Text’s continued mission to collaborate with humanities and natural language processing (NLP) researchers and to provide freely available language resources, for the development of state-of-the-art NLP methods and tools.},
	journal      = {Journal of Open Humanities Data},
	author       = {Hengchen, Simon and Tahmasebi, Nina},
	year         = {2021},
	volume       = {7},
	number       = {2},
	pages        = {1--7},
}

@inproceedings{marjanen-etal-2021-topic-304736,
	title        = {Topic Modelling Discourse Dynamics in Historical Newspapers},
	abstract     = {This paper addresses methodological issues in diachronic data analysis for historical research. We apply two families of topic models (LDA and DTM) on a relatively large set of historical newspapers, with the aim of capturing and understanding discourse dynamics. Our case study focuses on newspapers and periodicals published in Finland between 1854 and 1917, but our method can easily be transposed to any diachronic data. Our main contributions are a) a combined sampling, training and inference procedure for applying topic models to huge and imbalanced diachronic text collections; b) a discussion on the differences between two topic models for this type of data; c) quantifying topic prominence for a period and thus a generalization of document-wise topic assignment to a discourse level; and d) a discussion of the role of humanistic interpretation with regard to analysing discourse dynamics through topic models.},
	booktitle    = {CEUR Workshop Proceedings. Post-Proceedings of the 5th Conference Digital Humanities in the Nordic Countries (DHN 2020), Riga, Latvia, October 21-23, 2020},
	author       = {Marjanen, Jani and Zosa, Elaine and Hengchen, Simon and Pivovarova, Lidia and Tolonen, Mikko},
	year         = {2021},
	publisher    = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik},
	address      = {Aachen},
}

@misc{romanello-hengchen-2021-detecting-304990,
	title        = {Detecting Text Reuse with {Passim}},
	abstract     = {In this lesson you will learn about text reuse detection – the automatic identification of reused passages in texts – and why you might want to use it in your research. Through a detailed installation guide and two case studies, this lesson will teach you the ropes of Passim, an open source and scalable tool for text reuse detection.},
	author       = {Romanello, Matteo and Hengchen, Simon},
	year         = {2021},
	volume       = {10},
	internal-note = {NOTE(review): volume with no journal/venue field — the abstract reads like a Programming Historian lesson; confirm venue and consider retyping as @article with journal.},
}

@inproceedings{schlechtweg-etal-2020-semeval-295463,
	title        = {{SemEval-2020} Task 1: Unsupervised Lexical Semantic Change Detection},
	abstract     = {Lexical Semantic Change detection, i.e., the task of identifying words that change meaning over time, is a very active research area, with applications in NLP, lexicography, and linguistics. Evaluation is currently the most pressing problem in Lexical Semantic Change detection, as no gold standards are available to the community, which hinders progress. We present the results of the first shared task that addresses this gap by providing researchers with an evaluation framework and manually annotated, high-quality datasets for English, German, Latin, and Swedish. 33 teams submitted 186 systems, which were evaluated on two subtasks.},
	booktitle    = {Proceedings of the Fourteenth Workshop on Semantic Evaluation (SemEval-2020), Barcelona, Spain (Online), December 12, 2020},
	author       = {Schlechtweg, Dominik and McGillivray, Barbara and Hengchen, Simon and Dubossarsky, Haim and Tahmasebi, Nina},
	year         = {2020},
	publisher    = {Association for Computational Linguistics},
}

@misc{schlechtweg-etal-2020-post-295466,
	title        = {Post-Evaluation Data for {SemEval-2020} Task 1: Unsupervised Lexical Semantic Change Detection},
	abstract     = {This data collection contains the post-evaluation data for SemEval-2020 Task 1: Unsupervised Lexical Semantic Change Detection: (1) the starting kit to download data, and examples for competing in the CodaLab challenge including baselines; (2) the true binary change scores of the targets for Subtask 1, and their true graded change scores for Subtask 2 (test_data_truth/); (3)the scoring program used to score submissions against the true test data in the evaluation and post-evaluation phase (scoring_program/); and (4) the results of the evaluation phase including, for example, analysis plots (plots/) displaying the results:},
	author       = {Schlechtweg, Dominik and McGillivray, Barbara and Hengchen, Simon and Dubossarsky, Haim and Tahmasebi, Nina},
	year         = {2020},
	publisher    = {Zenodo},
}

@article{tahmasebi-hengchen-2019-strengths-291189,
	title        = {The Strengths and Pitfalls of Large-Scale Text Mining for Literary Studies},
	abstract     = {This paper is an overview of the opportunities and challenges of using large-scale text mining to answer research questions that stem from the humanities in general and literature specifically.  In  this  paper,  we  will  discuss  a  data-intensive  research  methodology  and  how  different  views of digital text affect answers to research questions. We will discuss results derived from text mining, how these results can be evaluated, and their relation to hypotheses and research questions. Finally, we will discuss some pitfalls of computational literary analysis and give some pointers as to how these can be avoided.},
	journal      = {Samlaren : tidskrift för svensk litteraturvetenskaplig forskning},
	author       = {Tahmasebi, Nina and Hengchen, Simon},
	year         = {2019},
	volume       = {140},
	pages        = {198--227},
}

@misc{tahmasebi-etal-2020-swedish-295465,
	title        = {{Swedish} Test Data for {SemEval} 2020 Task 1: Unsupervised Lexical Semantic Change Detection},
	abstract     = {This data collection contains the Swedish test data for SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection. It consists of a Swedish text corpus pair (corpus1/, corpus2/) and 31 lemmas which have been annotated for their lexical semantic change between the two corpora (targets.txt). We sample from the KubHist2 corpus, digitized by the National Library of Sweden, and available through the Språkbanken corpus infrastructure Korp (Borin et al., 2012). The full corpus is available through a CC BY (attribution) license. Each word for which the lemmatizer in the Korp pipeline has found a lemma is replaced with the lemma. In cases where the lemmatizer cannot find a lemma, we leave the word as is (i.e., unlemmatized, no lower-casing). KubHist contains very frequent OCR errors, especially for the older data.More detail about the properties and quality of the Kubhist corpus can be found in (Adesam et al., 2019).},
	author       = {Tahmasebi, Nina and Hengchen, Simon and Schlechtweg, Dominik and McGillivray, Barbara and Dubossarsky, Haim},
	year         = {2020},
}

@inproceedings{hamalainen-hengchen-2019-from-293917,
	title        = {From the Paft to the Fiiture: A Fully Automatic {NMT} and Word Embeddings Method for {OCR} Post-Correction},
	abstract     = {A great deal of historical corpora suffer from errors introduced by the OCR (optical character recognition) methods used in the digitization process. Correcting these errors manually is a time-consuming process and a great part of the automatic approaches have been relying on rules or supervised machine learning. We present a fully automatic unsupervised way of extracting parallel data for training a character-based sequence-to-sequence NMT (neural machine translation) model to conduct OCR error correction.},
	booktitle    = {International Conference Recent Advances in Natural Language Processing, RANLP, Varna, Bulgaria, 2--4 September, 2019},
	author       = {Hämäläinen, Mika and Hengchen, Simon},
	year         = {2019},
	isbn         = {978-954-452-056-4},
}

@inproceedings{perrone-etal-2019-gasc-293918,
	title        = {{GASC}: Genre-Aware Semantic Change for {Ancient Greek}},
	booktitle    = {Proceedings of the 1st International Workshop on Computational Approaches to Historical Language Change},
	author       = {Perrone, Valerio and Palma, Marco and Hengchen, Simon and Vatri, Alessandro and Smith, Jim Q. and McGillivray, Barbara},
	year         = {2019},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	isbn         = {978-1-950737-31-4},
}

@inproceedings{frossard-etal-2020-dataset-293923,
	title        = {Dataset for Temporal Analysis of {English-French} Cognates},
	abstract     = {Languages change over time and, thanks to the abundance of digital corpora, their evolutionary analysis using computational techniques has recently gained much research attention. In this paper, we focus on creating a dataset to support investigating the similarity in evolution between different languages. We look in particular into the similarities and differences between the use of corresponding words across time in English and French, two languages from different linguistic families yet with shared syntax and close contact. For this we select a set of cognates in both languages and study their frequency changes and correlations over time. We propose a new dataset for computational approaches of synchronized diachronic investigation of language pairs, and subsequently show novel findings stemming from the cognate-focused diachronic comparison of the two chosen languages. To the best of our knowledge, the present study is the first in the literature to use computational approaches and large data to make a cross-language diachronic analysis.},
	booktitle    = {Proceedings of The 12th Language Resources and Evaluation Conference},
	author       = {Frossard, Esteban and Coustaty, Mickael and Doucet, Antoine and Jatowt, Adam and Hengchen, Simon},
	year         = {2020},
	publisher    = {European Language Resources Association},
	address      = {Marseille, France},
	isbn         = {979-10-95546-34-4},
}

@article{hill-hengchen-2019-quantifying-293919,
	title        = {Quantifying the Impact of Dirty {OCR} on Historical Text Analysis: {Eighteenth Century Collections Online} as a Case Study},
	abstract     = {This article aims to quantify the impact optical character recognition (OCR) has on the quantitative analysis of historical documents. Using Eighteenth Century Collections Online as a case study, we first explore and explain the differences between the OCR corpus and its keyed-in counterpart, created by the Text Creation Partnership. We then conduct a series of specific analyses common to the digital humanities: topic modelling, authorship attribution, collocation analysis, and vector space modelling. The article concludes by offering some preliminary thoughts on how these conclusions can be applied to other datasets, by reflecting on the potential for predicting the quality of OCR where no ground-truth exists.},
	journal      = {Digital Scholarship in the Humanities},
	author       = {Hill, Mark J. and Hengchen, Simon},
	year         = {2019},
	volume       = {34},
	number       = {4},
	pages        = {825--843},
}

@misc{mcgillivray-etal-2020-challenges-295208,
	title        = {The Challenges and Prospects of the Intersection of Humanities and Data Science: A {White Paper} from {The Alan Turing Institute}},
	abstract     = {Since their beginnings, the digital humanities have engaged in an energetic debate about their scope, defining features, and relationship to the wider humanities, and have established themselves as a community of practice (Schreibman et al., 2004; Terras, 2010; Terras, 2013; Terras et al., 2013; Gold and Klein, 2016; The Digital Humanities Manifesto 2.0). The computational focus has characterised the field from its initial explorations (Hockey, 2004; Vanhoutte, 2013; Nyhan and Flinn, 2016) and the shift from the label ‘Humanities Computing’ to ‘Digital Humanities’ was a catalyst for change. In the history of the field, recurring cycles and productive tensions have arisen from the interfolding of computational methodologies and approaches with hermeneutic and critical modes of analysis (see McCarty, 2005; Rockwell and Sinclair, 2016; Jones, 2016). This document postulates that we are currently witnessing another one of these junctures, one that is calling for a critical involvement with data science.
In many ways, we are seeing earlier methods blending into, or being extended by data science.
Digitisation workflows are being augmented with automatic information extraction, data analysis, automated transcription of handwritten documents, and visualisation of transcribed content. Techniques developed for history, literary studies, and linguistics are being scaled towards larger datasets and more complex problems raising the bar of interpretability and questioning the validity of data collection and analysis methods. On the other hand, the field of data science has recently started to engage with non-STEM (Science, Technology, Engineering, and Mathematics) disciplines, by offering new data-driven modelling frameworks for addressing
long-standing research questions (Kitchin, 2014; Lazer et al., 2009) and proposing so-called ‘human-centred approaches’ to data science, focussed on the interpretability of machine learning models and a more active role for human input in algorithms (See Chen et al., 2016).
Moreover, in the current historical context we are witnessing an increased awareness of the questions of diversity and inclusion in research and academia, and we are seeing the creation of a strong movement aimed at addressing such issues globally. We believe that this paper can play a role in reinforcing a positive message in this respect.},
	author       = {McGillivray, Barbara and Alex, Beatrice and Ames, Sarah and Armstrong, Guyda and Beavan, David and Ciula, Arianna and Colavizza, Giovanni and Cummings, James and De Roure, David and Farquhar, Adam and Hengchen, Simon and Lang, Anouk and Loxley, James and Goudarouli, Eirini and Nanni, Federico and Nini, Andrea and Nyhan, Julianne and Osborne, Nicola and Poibeau, Thierry and Ridge, Mia and Ranade, Sonia and Smithies, James and Terras, Melissa and Vlachidis, Andreas and Willcox, Pip},
	year         = {2020},
}

@phdthesis{hengchen-2017-when-294147,
	title        = {When Does it Mean? Detecting Semantic Change in Historical Texts},
	author       = {Hengchen, Simon},
	year         = {2017},
	school       = {Université libre de Bruxelles},
	address      = {Brussels},
	internal-note = {NOTE(review): retyped from @book — the "publisher" was a university, which matches a doctoral dissertation; confirm and revert to @book if this is a monograph.},
}