% Bibliography database, UTF-8 encoded (names and titles contain literal
% non-ASCII characters). Citation keys are kept exactly as exported because
% documents may already cite them; note that one key contains a non-ASCII
% character, which some classic BibTeX toolchains may not accept.
%
% Conventions used below:
%   - one field per line, lowercase entry types and field names;
%   - tech reports name the issuing body in "institution" (the field that
%     entry type requires), not "publisher";
%   - "address" holds the publisher's city;
%   - words that must keep their capitalisation under sentence-casing
%     styles are wrapped in braces.

% Korp 6 user manual (Swedish-language report).
@techreport{Hammarstedt-Martin2017-256056,
  author      = {Hammarstedt, Martin and Borin, Lars and Forsberg, Markus and Roxendal, Johan and Schumacher, Anne and Öhrman, Maria},
  title       = {{Korp} 6 - Användarmanual},
  institution = {Institutionen för svenska språket, Göteborgs universitet},
  year        = {2017},
}

% Korp 6 technical report (same author set as the manual, different order).
@techreport{Hammarstedt-Martin2017-256055,
  author      = {Hammarstedt, Martin and Roxendal, Johan and Öhrman, Maria and Borin, Lars and Forsberg, Markus and Schumacher, Anne},
  title       = {{Korp} 6 - Technical Report},
  institution = {Institutionen för svenska språket, Göteborgs universitet},
  year        = {2017},
}

% The booktitle still embeds the volume editors ("... (red.)") as exported;
% the ampersand between them is escaped so LaTeX does not read it as an
% alignment character.
% NOTE(review): consider moving the editors into a separate editor field
% once the surname boundaries have been confirmed.
@inproceedings{Nord-Andreas2017-259902,
  author    = {Nord, Andreas and Forsberg, Markus},
  title     = {Enklare efter klarspråk? {Myndighetstexter} före och efter ett klarspråksprojekt},
  booktitle = {Saga Bendegard, Ulla Melander Marttala \& Maria Westman (red.), Språk och norm: Rapport från ASLA:s symposium, Uppsala universitet 21–22 april 2016},
  year      = {2017},
  publisher = {ASLA},
  address   = {Uppsala},
  isbn      = {978-91-87884-26-9},
}

% The abstract is reproduced verbatim from the export (only the line break
% inside it was joined).
@inproceedings{Hammarström-Harald2017-261851,
  author    = {Hammarström, Harald and Virk, Shafqat and Forsberg, Markus},
  title     = {Poor man's {OCR} post-correction: Unsupervised recognition of variant spelling applied to a multilingual document collection},
  booktitle = {DATeCH2017, Proceedings of the 2nd International Conference on Digital Access to Textual Cultural Heritage, Göttingen, Germany — June 01 - 02, 2017},
  year      = {2017},
  publisher = {Association for Computing Machinery (ACM)},
  address   = {New York},
  isbn      = {978-1-4503-5265-9},
  abstract  = {© 2017 Copyright held by the owner/author(s). The accuracy of Optical Character Recognition (OCR) is sets the limit for the success of subsequent applications used in text analyzing pipeline. Recent models of OCR postprocessing significantly improve the quality of OCR-generated text but require engineering work or resources such as humanlabeled data or a dictionary to perform with such accuracy on novel datasets. In the present paper we introduce a technique for OCR post-processing that runs off-the-shelf with no resources or parameter tuning required. In essence, words which are similar in form that are also distributionally more similar than expected at random are deemed OCR-variants. As such it can be applied to any language or genre (as long as the orthography segments the language at the word-level). The algorithm is illustrated and evaluated using a multilingual document collection and a benchmark English dataset.},
}