% Conference paper in the TSD 2017 proceedings.
% The volume editors live in the editor field rather than inside booktitle,
% and address holds the publisher's city (not the conference venue).
@inproceedings{Virk-Shafqat2017-261789,
  author    = {Virk, Shafqat and Borin, Lars and Saxena, Anju and Hammarström, Harald},
  title     = {Automatic extraction of typological linguistic features from descriptive grammars},
  booktitle = {Text, Speech, and Dialogue: 20th International Conference, {TSD} 2017, Prague, Czech Republic, August 27--31, 2017, Proceedings},
  editor    = {Ekštein, Kamil and Matoušek, Václav},
  year      = {2017},
  publisher = {Springer International Publishing},
  address   = {Cham},
  isbn      = {978-3-319-64205-5},
  abstract  = {The present paper describes experiments on automatically extracting typological linguistic features of natural languages from traditional written descriptive grammars. The feature-extraction task has high potential value in typological, genealogical, historical, and other related areas of linguistics that make use of databases of structural features of languages. Until now, extraction of such features from grammars has been done manually, which is highly time and labor consuming and becomes prohibitive when extended to the thousands of languages for which linguistic descriptions are available. The system we describe here starts from semantically parsed text over which a set of rules are applied in order to extract feature values. We evaluate the system’s performance on the manually curated Grambank database as the gold standard and report the first measures of precision and recall for this problem.},
}

% Conference paper in the DATeCH2017 proceedings.
% NOTE(review): the citation key contains a non-ASCII character; it is kept
% unchanged so existing citations still resolve, but classic 8-bit BibTeX
% toolchains may not handle it. Renaming it needs a coordinated change in
% every document that cites this entry.
% The abstract is reproduced verbatim from the export.
@inproceedings{Hammarström-Harald2017-261851,
  author    = {Hammarström, Harald and Virk, Shafqat and Forsberg, Markus},
  title     = {Poor man's {OCR} post-correction: Unsupervised recognition of variant spelling applied to a multilingual document collection},
  booktitle = {{DATeCH2017}, Proceedings of the 2nd International Conference on Digital Access to Textual Cultural Heritage, Göttingen, Germany, June 01--02, 2017},
  year      = {2017},
  publisher = {Association for Computing Machinery (ACM)},
  address   = {New York},
  isbn      = {978-1-4503-5265-9},
  abstract  = {© 2017 Copyright held by the owner/author(s). The accuracy of Optical Character Recognition (OCR) is sets the limit for the success of subsequent applications used in text analyzing pipeline. Recent models of OCR postprocessing significantly improve the quality of OCR-generated text but require engineering work or resources such as humanlabeled data or a dictionary to perform with such accuracy on novel datasets. In the present paper we introduce a technique for OCR post-processing that runs off-the-shelf with no resources or parameter tuning required. In essence, words which are similar in form that are also distributionally more similar than expected at random are deemed OCR-variants. As such it can be applied to any language or genre (as long as the orthography segments the language at the word-level). The algorithm is illustrated and evaluated using a multilingual document collection and a benchmark English dataset.},
}