@inproceedings{dubossarsky-etal-2019-time-295438,
  title     = {Time for change: Evaluating models of semantic change without evaluation tasks},
  booktitle = {Cambridge Language Sciences Annual Symposium 2019: Perspectives on Language Change},
  author    = {Dubossarsky, Haim and Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik},
  year      = {2019},
}

@inproceedings{dubossarsky-etal-2019-time-281304,
  title     = {Time-Out: Temporal Referencing for Robust Modeling of Lexical Semantic Change},
  abstract  = {State-of-the-art models of lexical semantic change detection suffer from noise stemming from vector space alignment. We have empirically tested the Temporal Referencing method for lexical semantic change and show that, by avoiding alignment, it is less affected by this noise. We show that, trained on a diachronic corpus, the skip-gram with negative sampling architecture with temporal referencing outperforms alignment models on a synthetic task as well as a manual testset. We introduce a principled way to simulate lexical semantic change and systematically control for possible biases.},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Florence, Italy, July 28--August 2, 2019},
  editor    = {Korhonen, Anna and Traum, David and Màrquez, Lluís},
  author    = {Dubossarsky, Haim and Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik},
  year      = {2019},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  ISBN      = {978-1-950737-48-2},
}

@article{tahmasebi-hengchen-2019-strengths-291189,
  title    = {The Strengths and Pitfalls of Large-Scale Text Mining for Literary Studies},
  abstract = {This paper is an overview of the opportunities and challenges of using large-scale text mining to answer research questions that stem from the humanities in general and literature specifically. In this paper, we will discuss a data-intensive research methodology and how different views of digital text affect answers to research questions. We will discuss results derived from text mining, how these results can be evaluated, and their relation to hypotheses and research questions. Finally, we will discuss some pitfalls of computational literary analysis and give some pointers as to how these can be avoided.},
  journal  = {Samlaren: tidskrift för svensk litteraturvetenskaplig forskning},
  author   = {Tahmasebi, Nina and Hengchen, Simon},
  year     = {2019},
  volume   = {140},
  pages    = {198--227},
}

@inproceedings{hamalainen-hengchen-2019-from-293917,
  title     = {From the paft to the fiiture: A fully automatic NMT and word embeddings method for OCR post-correction},
  abstract  = {A great deal of historical corpora suffer from errors introduced by the OCR (optical character recognition) methods used in the digitization process. Correcting these errors manually is a time-consuming process and a great part of the automatic approaches have been relying on rules or supervised machine learning. We present a fully automatic unsupervised way of extracting parallel data for training a character-based sequence-to-sequence NMT (neural machine translation) model to conduct OCR error correction.},
  booktitle = {International Conference Recent Advances in Natural Language Processing, RANLP, Varna, Bulgaria, 2--4 September 2019},
  author    = {Hämäläinen, Mika and Hengchen, Simon},
  year      = {2019},
  ISBN      = {978-954-452-056-4},
}

@inproceedings{perrone-etal-2019-gasc-293918,
  title     = {GASC: Genre-Aware Semantic Change for Ancient Greek},
  booktitle = {Proceedings of the 1st International Workshop on Computational Approaches to Historical Language Change},
  author    = {Perrone, Valerio and Palma, Marco and Hengchen, Simon and Vatri, Alessandro and Smith, Jim Q. and McGillivray, Barbara},
  year      = {2019},
  publisher = {Association for Computational Linguistics},
  address   = {Stroudsburg, PA},
  ISBN      = {978-1-950737-31-4},
}

@article{hill-hengchen-2019-quantifying-293919,
  title    = {Quantifying the impact of dirty OCR on historical text analysis: Eighteenth Century Collections Online as a case study},
  abstract = {This article aims to quantify the impact optical character recognition (OCR) has on the quantitative analysis of historical documents. Using Eighteenth Century Collections Online as a case study, we first explore and explain the differences between the OCR corpus and its keyed-in counterpart, created by the Text Creation Partnership. We then conduct a series of specific analyses common to the digital humanities: topic modelling, authorship attribution, collocation analysis, and vector space modelling. The article concludes by offering some preliminary thoughts on how these conclusions can be applied to other datasets, by reflecting on the potential for predicting the quality of OCR where no ground-truth exists.},
  journal  = {Digital Scholarship in the Humanities},
  author   = {Hill, Mark J. and Hengchen, Simon},
  year     = {2019},
  volume   = {34},
  number   = {4},
  pages    = {825--843},
}