Skip to main content


	title        = {Quantifying the impact of dirty {OCR} on historical text analysis: {Eighteenth Century Collections Online} as a case study},
	abstract     = {This article aims to quantify the impact optical character recognition (OCR) has on the quantitative analysis of historical documents. Using Eighteenth Century Collections Online as a case study, we first explore and explain the differences between the OCR corpus and its keyed-in counterpart, created by the Text Creation Partnership. We then conduct a series of specific analyses common to the digital humanities: topic modelling, authorship attribution, collocation analysis, and vector space modelling. The article concludes by offering some preliminary thoughts on how these conclusions can be applied to other datasets, by reflecting on the potential for predicting the quality of OCR where no ground-truth exists.},
	journal      = {Digital Scholarship in the Humanities},
	author       = {Hill, Mark J. and Hengchen, Simon},
	year         = {2019},
	volume       = {34},
	number       = {4},
	pages        = {825--843},