@comment{Web-export residue: "Hoppa till huvudinnehåll" (Swedish for "Skip to main content").}

BibTeX

@article{Tahmasebi-Nina2019-291189,
	title        = {The Strengths and Pitfalls of Large-Scale Text Mining for Literary Studies},
	abstract     = {This paper is an overview of the opportunities and challenges of using large-scale text mining to answer research questions that stem from the humanities in general and literature specifically. In this paper, we will discuss a data-intensive research methodology and how different views of digital text affect answers to research questions. We will discuss results derived from text mining, how these results can be evaluated, and their relation to hypotheses and research questions. Finally, we will discuss some pitfalls of computational literary analysis and give some pointers as to how these can be avoided.},
	journal      = {Samlaren : tidskrift för svensk litteraturvetenskaplig forskning},
	author       = {Tahmasebi, Nina and Hengchen, Simon},
	year         = {2019},
	volume       = {140},
	pages        = {198--227},
}

@inProceedings{Hämäläinen-Mika2019-293917,
	title        = {From the paft to the fiiture: A fully automatic {NMT} and word embeddings method for {OCR} post-correction},
	abstract     = {A great deal of historical corpora suffer from errors introduced by the OCR (optical character recognition) methods used in the digitization process. Correcting these errors manually is a time-consuming process and a great part of the automatic approaches have been relying on rules or supervised machine learning. We present a fully automatic unsupervised way of extracting parallel data for training a character-based sequence-to-sequence NMT (neural machine translation) model to conduct OCR error correction.},
	booktitle    = {International Conference Recent Advances in Natural Language Processing, RANLP, Varna, Bulgaria, 2–4 September, 2019},
	author       = {Hämäläinen, Mika and Hengchen, Simon},
	year         = {2019},
	ISBN         = {978-954-452-056-4},
}

@misc{Schlechtweg-Dominik2020-295466,
	title        = {Post-Evaluation Data for {SemEval-2020} Task 1: Unsupervised Lexical Semantic Change Detection},
	abstract     = {This data collection contains the post-evaluation data for SemEval-2020 Task 1: Unsupervised Lexical Semantic Change Detection: (1) the starting kit to download data, and examples for competing in the CodaLab challenge including baselines; (2) the true binary change scores of the targets for Subtask 1, and their true graded change scores for Subtask 2 (test\_data\_truth/); (3) the scoring program used to score submissions against the true test data in the evaluation and post-evaluation phase (scoring\_program/); and (4) the results of the evaluation phase including, for example, analysis plots (plots/) displaying the results.},
	author       = {Schlechtweg, Dominik and McGillivray, Barbara and Hengchen, Simon and Dubossarsky, Haim and Tahmasebi, Nina},
	year         = {2020},
	publisher    = {Zenodo},
}

@misc{Tahmasebi-Nina2020-295465,
	title        = {{Swedish} Test Data for {SemEval} 2020 Task 1: Unsupervised Lexical Semantic Change Detection},
	abstract     = {This data collection contains the Swedish test data for SemEval 2020 Task 1: Unsupervised Lexical Semantic Change Detection. It consists of a Swedish text corpus pair (corpus1/, corpus2/) and 31 lemmas which have been annotated for their lexical semantic change between the two corpora (targets.txt). We sample from the KubHist2 corpus, digitized by the National Library of Sweden, and available through the Språkbanken corpus infrastructure Korp (Borin et al., 2012). The full corpus is available through a CC BY (attribution) license. Each word for which the lemmatizer in the Korp pipeline has found a lemma is replaced with the lemma. In cases where the lemmatizer cannot find a lemma, we leave the word as is (i.e., unlemmatized, no lower-casing). KubHist contains very frequent OCR errors, especially for the older data. More detail about the properties and quality of the Kubhist corpus can be found in (Adesam et al., 2019).},
	author       = {Tahmasebi, Nina and Hengchen, Simon and Schlechtweg, Dominik and McGillivray, Barbara and Dubossarsky, Haim},
	year         = {2020},
}

@inProceedings{Perrone-Valerio2019-293918,
	title        = {{GASC}: Genre-Aware Semantic Change for {Ancient Greek}},
	booktitle    = {Proceedings of the 1st International Workshop on Computational Approaches to Historical Language Change},
	author       = {Perrone, Valerio and Palma, Marco and Hengchen, Simon and Vatri, Alessandro and Smith, Jim Q. and McGillivray, Barbara},
	year         = {2019},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {978-1-950737-31-4},
}

@inProceedings{Frossard-Esteban2020-293923,
	title        = {Dataset for Temporal Analysis of {English-French} Cognates},
	abstract     = {Languages change over time and, thanks to the abundance of digital corpora, their evolutionary analysis using computational techniques has recently gained much research attention. In this paper, we focus on creating a dataset to support investigating the similarity in evolution between different languages. We look in particular into the similarities and differences between the use of corresponding words across time in English and French, two languages from different linguistic families yet with shared syntax and close contact. For this we select a set of cognates in both languages and study their frequency changes and correlations over time. We propose a new dataset for computational approaches of synchronized diachronic investigation of language pairs, and subsequently show novel findings stemming from the cognate-focused diachronic comparison of the two chosen languages. To the best of our knowledge, the present study is the first in the literature to use computational approaches and large data to make a cross-language diachronic analysis.},
	booktitle    = {Proceedings of The 12th Language Resources and Evaluation Conference},
	author       = {Frossard, Esteban and Coustaty, Mickael and Doucet, Antoine and Jatowt, Adam and Hengchen, Simon},
	year         = {2020},
	publisher    = {European Language Resources Association},
	address      = {Marseille, France},
	ISBN         = {979-10-95546-34-4},
}

@inProceedings{Dubossarsky-Haim2019-281304,
	title        = {Time-Out: Temporal Referencing for Robust Modeling of Lexical Semantic Change},
	abstract     = {State-of-the-art models of lexical semantic change detection suffer from noise stemming from vector space alignment. We have empirically tested the Temporal Referencing method for lexical semantic change and show that, by avoiding alignment, it is less affected by this noise. We show that, trained on a diachronic corpus, the skip-gram with negative sampling architecture with temporal referencing outperforms alignment models on a synthetic task as well as a manual testset. We introduce a principled way to simulate lexical semantic change and systematically control for possible biases.},
	booktitle    = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, Florence, Italy, July 28--August 2, 2019},
	editor       = {Korhonen, Anna and Traum, David and Màrquez, Lluís},
	author       = {Dubossarsky, Haim and Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik},
	year         = {2019},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {978-1-950737-48-2},
}

@inProceedings{Schlechtweg-Dominik2020-295463,
	title        = {{SemEval-2020} Task 1: Unsupervised Lexical Semantic Change Detection},
	abstract     = {Lexical Semantic Change detection, i.e., the task of identifying words that change meaning over time, is a very active research area, with applications in NLP, lexicography, and linguistics. Evaluation is currently the most pressing problem in Lexical Semantic Change detection, as no gold standards are available to the community, which hinders progress. We present the results of the first shared task that addresses this gap by providing researchers with an evaluation framework and manually annotated, high-quality datasets for English, German, Latin, and Swedish. 33 teams submitted 186 systems, which were evaluated on two subtasks.},
	booktitle    = {Proceedings of the Fourteenth Workshop on Semantic Evaluation (SemEval2020), Barcelona, Spain (Online), December 12, 2020},
	author       = {Schlechtweg, Dominik and McGillivray, Barbara and Hengchen, Simon and Dubossarsky, Haim and Tahmasebi, Nina},
	year         = {2020},
	publisher    = {ACL},
}

@inProceedings{Dubossarsky-Haim2019-295438,
	title        = {Time for change: Evaluating models of semantic change without evaluation tasks},
	booktitle    = {Cambridge Language Sciences Annual Symposium 2019 : Perspectives on Language Change},
	author       = {Dubossarsky, Haim and Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik},
	year         = {2019},
}

@article{Hill-MarkJ.2019-293919,
	title        = {Quantifying the impact of dirty {OCR} on historical text analysis: {Eighteenth Century Collections Online} as a case study},
	abstract     = {This article aims to quantify the impact optical character recognition (OCR) has on the quantitative analysis of historical documents. Using Eighteenth Century Collections Online as a case study, we first explore and explain the differences between the OCR corpus and its keyed-in counterpart, created by the Text Creation Partnership. We then conduct a series of specific analyses common to the digital humanities: topic modelling, authorship attribution, collocation analysis, and vector space modelling. The article concludes by offering some preliminary thoughts on how these conclusions can be applied to other datasets, by reflecting on the potential for predicting the quality of OCR where no ground-truth exists.},
	journal      = {Digital Scholarship in the Humanities},
	author       = {Hill, Mark J. and Hengchen, Simon},
	year         = {2019},
	volume       = {34},
	number       = {4},
	pages        = {825--843},
}

@misc{McGillivray-Barbara2020-295208,
	title        = {The challenges and prospects of the intersection of humanities and data science: A White Paper from {The Alan Turing Institute}},
	abstract     = {Since their beginnings, the digital humanities have engaged in an energetic debate about their scope, defining features, and relationship to the wider humanities, and have established themselves as a community of practice (Schreibman et al., 2004; Terras, 2010; Terras, 2013; Terras et al., 2013; Gold and Klein, 2016; The Digital Humanities Manifesto 2.0). The computational focus has characterised the field from its initial explorations (Hockey, 2004; Vanhoutte, 2013; Nyhan and Flinn, 2016) and the shift from the label ‘Humanities Computing’ to ‘Digital Humanities’ was a catalyst for change. In the history of the field, recurring cycles and productive tensions have arisen from the interfolding of computational methodologies and approaches with hermeneutic and critical modes of analysis (see McCarty, 2005; Rockwell and Sinclair, 2016; Jones, 2016). This document postulates that we are currently witnessing another one of these junctures, one that is calling for a critical involvement with data science.
In many ways, we are seeing earlier methods blending into, or being extended by data science.
Digitisation workflows are being augmented with automatic information extraction, data analysis, automated transcription of handwritten documents, and visualisation of transcribed content. Techniques developed for history, literary studies, and linguistics are being scaled towards larger datasets and more complex problems raising the bar of interpretability and questioning the validity of data collection and analysis methods. On the other hand, the field of data science has recently started to engage with non-STEM (Science, Technology, Engineering, and Mathematics) disciplines, by offering new data-driven modelling frameworks for addressing long-standing research questions (Kitchin, 2014; Lazer et al., 2009) and proposing so-called ‘human-centred approaches’ to data science, focussed on the interpretability of machine learning models and a more active role for human input in algorithms (See Chen et al., 2016).
Moreover, in the current historical context we are witnessing an increased awareness of the questions of diversity and inclusion in research and academia, and we are seeing the creation of a strong movement aimed at addressing such issues globally. We believe that this paper can play a role in reinforcing a positive message in this respect.},
	author       = {McGillivray, Barbara and Alex, Beatrice and Ames, Sarah and Armstrong, Guyda and Beavan, David and Ciula, Arianna and Colavizza, Giovanni and Cummings, James and De Roure, David and Farquhar, Adam and Hengchen, Simon and Lang, Anouk and Loxley, James and Goudarouli, Eirini and Nanni, Federico and Nini, Andrea and Nyhan, Julianne and Osborne, Nicola and Poibeau, Thierry and Ridge, Mia and Ranade, Sonia and Smithies, James and Terras, Melissa and Vlachidis, Andreas and Willcox, Pip},
	year         = {2020},
}

@book{Hengchen-Simon2017-294147,
  author    = {Hengchen, Simon},
  title     = {When Does it Mean? Detecting Semantic Change in Historical Texts},
  publisher = {Université libre de Bruxelles},
  address   = {Brussels},
  year      = 2017,
}

@article{Hengchen-Simon2021-301262,
	title        = {A Collection of {Swedish} Diachronic Word Embedding Models Trained on Historical Newspaper Data},
	abstract     = {This paper describes the creation of several word embedding models based on a large collection of diachronic Swedish newspaper material available through Språkbanken Text, the Swedish language bank. This data was produced in the context of Språkbanken Text’s continued mission to collaborate with humanities and natural language processing (NLP) researchers and to provide freely available language resources, for the development of state-of-the-art NLP methods and tools.},
	journal      = {Journal of Open Humanities Data},
	author       = {Hengchen, Simon and Tahmasebi, Nina},
	year         = {2021},
	volume       = {7},
	number       = {2},
	pages        = {1--7},
}

@misc{Romanello-Matteo2021-304990,
	title        = {Detecting Text Reuse with {Passim}},
	abstract     = {In this lesson you will learn about text reuse detection – the automatic identification of reused passages in texts – and why you might want to use it in your research. Through a detailed installation guide and two case studies, this lesson will teach you the ropes of Passim, an open source and scalable tool for text reuse detection.},
	author       = {Romanello, Matteo and Hengchen, Simon},
	year         = {2021},
	volume       = {10},
}

@inProceedings{Marjanen-Jani2021-304736,
	title        = {Topic Modelling Discourse Dynamics in Historical Newspapers},
	abstract     = {This paper addresses methodological issues in diachronic data analysis for historical research. We apply two families of topic models (LDA and DTM) on a relatively large set of historical newspapers, with the aim of capturing and understanding discourse dynamics. Our case study focuses on newspapers and periodicals published in Finland between 1854 and 1917, but our method can easily be transposed to any diachronic data. Our main contributions are a) a combined sampling, training and inference procedure for applying topic models to huge and imbalanced diachronic text collections; b) a discussion on the differences between two topic models for this type of data; c) quantifying topic prominence for a period and thus a generalization of document-wise topic assignment to a discourse level; and d) a discussion of the role of humanistic interpretation with regard to analysing discourse dynamics through topic models.},
	booktitle    = {CEUR Workshop Proceedings. Post-Proceedings of the 5th Conference Digital Humanities in the Nordic Countries (DHN 2020), Riga, Latvia, October 21--23, 2020},
	author       = {Marjanen, Jani and Zosa, Elaine and Hengchen, Simon and Pivovarova, Lidia and Tolonen, Mikko},
	year         = {2021},
	publisher    = {M. Jeusfeld c/o Redaktion Sun SITE, Informatik},
	address      = {Aachen},
}

@inProceedings{Hengchen-Simon2021-305157,
	title        = {{SuperSim}: a test set for word similarity and relatedness in {Swedish}},
	abstract     = {Language models are notoriously difficult to evaluate. We release SuperSim, a large-scale similarity and relatedness test set for Swedish built with expert human judgments. The test set is composed of 1,360 word-pairs independently judged for both relatedness and similarity by five annotators. We evaluate three different models (Word2Vec, fastText, and GloVe) trained on two separate Swedish datasets, namely the Swedish Gigaword corpus and a Swedish Wikipedia dump, to provide a baseline for future comparison. We release the fully annotated test set, code, baseline models, and data.},
	booktitle    = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31--June 2, 2021, Reykjavik, Iceland (online)},
	author       = {Hengchen, Simon and Tahmasebi, Nina},
	year         = {2021},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
}

@inProceedings{Duong-Quan2021-305156,
	title        = {An Unsupervised method for {OCR} Post-Correction and Spelling Normalisation for {Finnish}},
	abstract     = {Historical corpora are known to contain errors introduced by OCR (optical character recognition) methods used in the digitization process, often said to be degrading the performance of NLP systems. Correcting these errors manually is a time-consuming process and a great part of the automatic approaches have been relying on rules or supervised machine learning. We build on previous work on fully automatic unsupervised extraction of parallel data to train a character-based sequence-to-sequence NMT (neural machine translation) model to conduct OCR error correction designed for English, and adapt it to Finnish by proposing solutions that take the rich morphology of the language into account. Our new method shows increased performance while remaining fully unsupervised, with the added benefit of spelling normalisation. The source code and models are available on GitHub and Zenodo.},
	booktitle    = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), May 31--June 2, 2021, Reykjavik, Iceland (online)},
	author       = {Duong, Quan and Hämäläinen, Mika and Hengchen, Simon},
	year         = {2021},
	publisher    = {Linköping Electronic Conference Proceedings},
	address      = {Linköping},
	ISBN         = {978-91-7929-614-8},
}

@inProceedings{Hengchen-Simon2021-305550,
	title        = {{SBX-HY} at {RuShiftEval} 2021: Доверяй, но проверяй},
	abstract     = {Research in computational lexical semantic change, due to the inherent nature of language change, has been notoriously difficult to evaluate. This led to the creation of many new exciting models that cannot be easily compared. In this system paper, we describe our submissions at RuShiftEval 2021 – one of the few recently shared tasks that enable researchers, through a standard evaluation set and control conditions, to systematically compare models and gain insights from previous work. We show that despite top results in similar tasks on other languages, Temporal Referencing does not seem to perform as well on Russian.},
	booktitle    = {Computational Linguistics and Intellectual Technologies: Proceedings of the International Conference “Dialogue 2021,” Moscow, June 16–19, 2021},
	author       = {Hengchen, Simon and Viloria, Kate and Indukaev, Andrey},
	year         = {2021},
}

@book{Tahmasebi-Nina2021-306968,
	title        = {Computational approaches to semantic change},
	abstract     = {Semantic change — how the meanings of words change over time — has preoccupied scholars since well before modern linguistics emerged in the late 19th and early 20th century, ushering in a new methodological turn in the study of language change. Compared to changes in sound and grammar, semantic change is the least understood. Ever since, the study of semantic change has progressed steadily, accumulating a vast store of knowledge for over a century, encompassing many languages and language families.

Historical linguists also early on realized the potential of computers as research tools, with papers at the very first international conferences in computational linguistics in the 1960s. Such computational studies still tended to be small-scale, method-oriented, and qualitative. However, recent years have witnessed a sea-change in this regard. Big-data empirical quantitative investigations are now coming to the forefront, enabled by enormous advances in storage capability and processing power. Diachronic corpora have grown beyond imagination, defying exploration by traditional manual qualitative methods, and language technology has become increasingly data-driven and semantics-oriented. These developments present a golden opportunity for the empirical study of semantic change over both long and short time spans.

A major challenge presently is to integrate the hard-earned knowledge and expertise of traditional historical linguistics with cutting-edge methodology explored primarily in computational linguistics.

The idea for the present volume came out of a concrete response to this challenge. The 1st International Workshop on Computational Approaches to Historical Language Change (LChange'19), at ACL 2019, brought together scholars from both fields.

This volume offers a survey of this exciting new direction in the study of semantic change, a discussion of the many remaining challenges that we face in pursuing it, and considerably updated and extended versions of a selection of the contributions to the LChange'19 workshop, addressing both more theoretical problems — e.g., discovery of “laws of semantic change” — and practical applications, such as information retrieval in longitudinal text archives.},
	editor       = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
	year         = {2021},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-98554-008-2},
}

@incollection{Hengchen-Simon2021-306972,
	title        = {Challenges for computational lexical semantic change},
	abstract     = {The computational study of lexical semantic change (LSC) has taken off in the past few years and we are seeing increasing interest in the field, from both computational sciences and linguistics. Most of the research so far has focused on methods for modelling and detecting semantic change using large diachronic textual data, with the majority of the approaches employing neural embeddings. While methods that offer easy modelling of diachronic text are one of the main reasons for the spiking interest in LSC, neural models leave many aspects of the problem unsolved. The field has several open and complex challenges. In this chapter, we aim to describe the most important of these challenges and outline future directions.},
	booktitle    = {Computational approaches to semantic change},
	editor       = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
	author       = {Hengchen, Simon and Tahmasebi, Nina and Schlechtweg, Dominik and Dubossarsky, Haim},
	year         = {2021},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-98554-008-2},
	pages        = {341--372},
}

@incollection{Perrone-Valerio2021-306974,
	title        = {Lexical semantic change for {Ancient Greek} and {Latin}},
	abstract     = {Change and its precondition, variation, are inherent in languages. Over time, new words enter the lexicon, others become obsolete, and existing words acquire new senses. Associating a word with its correct meaning in its historical context is a central challenge in diachronic research. Historical corpora of classical languages, such as Ancient Greek and Latin, typically come with rich metadata, and existing models are limited by their inability to exploit contextual information beyond the document timestamp. While embedding-based methods feature among the current state of the art systems, they are lacking in their interpretative power. In contrast, Bayesian models provide explicit and interpretable representations of semantic change phenomena. In this chapter we build on GASC, a recent computational approach to semantic change based on a dynamic Bayesian mixture model. In this model, the evolution of word senses over time is based not only on distributional information of lexical nature, but also on text genres. We provide a systematic comparison of dynamic Bayesian mixture models for semantic change with state-of-the-art embedding-based models. On top of providing a full description of meaning change over time, we show that Bayesian mixture models are highly competitive approaches to detect binary semantic change in both Ancient Greek and Latin.},
	booktitle    = {Computational approaches to semantic change},
	editor       = {Tahmasebi, Nina and Borin, Lars and Jatowt, Adam and Xu, Yang and Hengchen, Simon},
	author       = {Perrone, Valerio and Hengchen, Simon and Palma, Marco and Vatri, Alessandro and Smith, Jim Q. and McGillivray, Barbara},
	year         = {2021},
	publisher    = {Language Science Press},
	address      = {Berlin},
	ISBN         = {978-3-98554-008-2},
	pages        = {287--310},
}

@misc{Romanello-Matteo2021-307547,
	title        = {Détecter la réutilisation de texte avec {Passim}},
	abstract     = {Dans cette leçon, vous serez initié à la détection automatique de la réutilisation des textes avec la bibliothèque Passim. Vous apprendrez comment installer et exécuter Passim et ses dépendances, comment préparer vos textes en tant que fichiers d’entrée adaptés à l’utilisation de Passim et, enfin, comment traiter la sortie générée par Passim pour effectuer des analyses de base.},
	author       = {Romanello, Matteo and Hengchen, Simon},
	year         = {2021},
	publisher    = {The Programming Historian en français},
	volume       = {3},
}

@article{Hengchen-Simon2021-309329,
	title        = {A data-driven approach to studying changing vocabularies in historical newspaper collections},
	abstract     = {Nation and nationhood are among the most frequently studied concepts in the field of intellectual history. At the same time, the word ‘nation’ and its historical usage are very vague. The aim in this article was to develop a data-driven method using dependency parsing and neural word embeddings to clarify some of the vagueness in the evolution of this concept. To this end, we propose the following two-step method. First, using linguistic processing, we create a large set of words pertaining to the topic of nation. Second, we train diachronic word embeddings and use them to quantify the strength of the semantic similarity between these words and thereby create meaningful clusters, which are then aligned diachronically. To illustrate the robustness of the study across languages, time spans, as well as large datasets, we apply it to the entirety of five historical newspaper archives in Dutch, Swedish, Finnish, and English. To our knowledge, thus far there have been no large-scale comparative studies of this kind that purport to grasp long-term developments in as many as four different languages in a data-driven way. A particular strength of the method we describe in this article is that, by design, it is not limited to the study of nationhood, but rather expands beyond it to other research questions and is reusable in different contexts.},
	journal      = {Digital Scholarship in the Humanities},
	author       = {Hengchen, Simon and Ros, Ruben and Marjanen, Jani and Tolonen, Mikko},
	year         = {2021},
	volume       = {36},
	number       = {Supplement 2},
	pages        = {109--126},
}

@proceedings{Tahmasebi-Nina2022-316661,
	title        = {Proceedings of the 3rd Workshop on Computational Approaches to Historical Language Change, May 26--27, 2022, Dublin, Ireland},
	editor       = {Tahmasebi, Nina and Montariol, Syrielle and Kutuzov, Andrey and Hengchen, Simon and Dubossarsky, Haim and Borin, Lars},
	year         = {2022},
	publisher    = {Association for Computational Linguistics},
	address      = {Stroudsburg, PA},
	ISBN         = {978-1-955917-42-1},
}