Språkbanken Text är en avdelning inom Språkbanken.
BibTeX

% Conference paper on post-OCR error detection, published in the 2021
% proceedings of Recent Advances in Natural Language Processing.
% The volume editors are stored in the editor field rather than inside booktitle,
% so that styles can format them; the date range uses "--" for portability.
% NOTE(review): the third author is split as "Sheikh Muhammad, Azam" in another
% entry of this file -- confirm which surname form is correct.
@inproceedings{virk-etal-2021-novel-306962,
	author       = {Virk, Shafqat and Dannélls, Dana and Muhammad, Azam Sheikh},
	title        = {A Novel Machine Learning Based Approach for {Post-OCR} Error Detection},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1--3 September, 2021},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	isbn         = {978-954-452-072-4},
	abstract     = {Post processing is the most conventional approach for correcting errors that are caused by Optical Character Recognition (OCR) systems. Two steps are usually taken to correct OCR errors: detection and corrections. For the first task, supervised machine learning methods have shown state-of-the-art performances. Previously proposed approaches have focused most prominently on combining lexical, contextual and statistical features for detecting errors. In this study, we report a novel system to error detection which is based merely on the n-gram counts of a candidate token. In addition to being simple and computationally less expensive, our proposed system beats previous systems reported in the ICDAR2019 competition on OCR-error detection with notable margins. We achieved state-of-the-art F1-scores for eight out of the ten involved European languages. The maximum improvement is for Spanish which improved from 0.69 to 0.90, and the minimum for Polish from 0.82 to 0.84.},
}

% Conference paper on a data-driven, semi-automatic framenet construction
% methodology, published in the 2021 proceedings of Recent Advances in
% Natural Language Processing.
% The volume editors are stored in the editor field rather than inside booktitle,
% so that styles can format them; the date range uses "--" for portability.
@inproceedings{virk-etal-2021-data-306964,
	author       = {Virk, Shafqat and Dannélls, Dana and Borin, Lars and Forsberg, Markus},
	title        = {A Data-Driven Semi-Automatic Framenet Development Methodology},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1--3 September, 2021},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	isbn         = {978-954-452-072-4},
	abstract     = {FrameNet is a lexical semantic resource based on the linguistic theory of frame semantics. A number of framenet development strategies have been reported previously and all of them involve exploration of corpora and a fair amount of manual work. Despite previous efforts, there does not exist a well-thought-out automatic/semi-automatic methodology for frame construction. In this paper we propose a data-driven methodology for identification and semi-automatic construction of frames. As a proof of concept, we report on our initial attempts to build a wider-scale framenet for the legal domain (LawFN) using the proposed methodology. The constructed frames are stored in a lexical database and together with the annotated example sentences they have been made available through a web interface.},
}

% Journal article on areal versus genealogical relationships among South Asian
% languages, based on data from Grierson's Linguistic Survey of India.
% The title is kept on one line, and the proper noun and acronym are braced so
% that sentence-casing styles do not lowercase them.
% NOTE(review): the citation key contains an apostrophe, which is outside the
% conventionally safe key character set; it is left unchanged so that existing
% citations keep resolving.
@article{borin-etal-2021-bird's-309082,
	author       = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard},
	title        = {A bird’s-eye view on {South Asian} languages through {LSI}: Areal or genetic relationships?},
	journal      = {Journal of South Asian Languages and Linguistics},
	year         = {2021},
	volume       = {7},
	number       = {2},
	pages        = {151--185},
	abstract     = {We present initial exploratory work on illuminating the long-standing question of areal versus genealogical connections in South Asia using computational data visualization tools. With respect to genealogy, we focus on the subclassification of Indo-Aryan, the most ubiquitous language family of South Asia. The intent here is methodological: we explore computational methods for visualizing large datasets of linguistic features, in our case 63 features from 200 languages representing four language families of South Asia, coming out of a digitized version of Grierson’s Linguistic Survey of India. To this dataset we apply phylogenetic software originally developed in the context of computational biology for clustering the languages and displaying the clusters in the form of networks. We further explore multiple correspondence analysis as a way of illustrating how linguistic feature bundles correlate with extrinsically defined groupings of languages (genealogical and geographical). Finally, map visualization of combinations of linguistic features and language genealogy is suggested as an aid in distinguishing genealogical and areal features. On the whole, our results are in line with the conclusions of earlier studies: Areality and genealogy are strongly intertwined in South Asia, the traditional lower-level subclassification of Indo-Aryan is largely upheld, and there is a clearly discernible areal east–west divide cutting across language families.},
}

% Conference paper on supervised post-OCR error detection for historical text,
% from the selected contributions of the Eighth Swedish Language Technology
% Conference (SLTC-2020).
% Percent signs in the abstract are escaped so the field is safe to typeset;
% trailing spaces are removed and the date range uses "--".
@inproceedings{dannells-virk-2021-supervised-310123,
	author       = {Dannélls, Dana and Virk, Shafqat},
	title        = {A Supervised Machine Learning Approach for {Post-OCR} Error Detection for Historical Text},
	booktitle    = {Linköping Electronic Press Workshop and Conference Collection. Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25--27 November, 2020},
	year         = {2021},
	publisher    = {Linköping Electronic Press},
	address      = {Linköping},
	abstract     = {Training machine learning models with high accuracy requires careful feature engineering, which involves finding the best feature combinations and extracting their values from the data. The task becomes extremely laborious for specific problems such as post Optical Character Recognition (OCR) error detection because of the diversity of errors in the data. In this paper we present a machine learning approach which exploits character n-gram statistics as the only feature for the OCR error detection task. Our method achieves a significant improvement over the baseline reaching state-of-the-art results of 91\% and 89\% F1 measure on English and Swedish datasets respectively. We report various experiments to select the appropriate machine learning algorithm and to compare our approach to previously reported traditional approaches.},
}

% Book chapter in the edited volume "The Swedish FrameNet++".
% The volume editors are stored in the editor field rather than inside booktitle,
% and the page range uses "--" instead of a Unicode en-dash.
@incollection{borin-etal-2021-swedish-311387,
	author       = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard},
	title        = {{Swedish FrameNet++} and comparative linguistics},
	booktitle    = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	year         = {2021},
	publisher    = {John Benjamins},
	address      = {Amsterdam},
	pages        = {139--165},
	isbn         = {9789027209900},
}

% Conference paper on a deep learning system that extracts typological
% linguistic features from descriptive grammars, published in the 2021
% proceedings of Recent Advances in Natural Language Processing.
% This entry has the same ISBN as the other entries for these proceedings in
% this file, so booktitle, publisher and address are aligned with them.
% The volume editors are stored in the editor field rather than inside booktitle.
% NOTE(review): the source record named "Association for Computational
% Linguistics (ACL)" as publisher -- confirm that INCOMA is correct.
% NOTE(review): the third author is split as "Muhammad, Azam Sheikh" in another
% entry of this file -- confirm which surname form is correct.
@inproceedings{virk-etal-2021-deep-319450,
	author       = {Virk, Shafqat and Foster, Daniel and Sheikh Muhammad, Azam and Saleem, Raheela},
	title        = {A Deep Learning System for Automatic Extraction of Typological Linguistic Information from Descriptive Grammars},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1--3 September, 2021},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	isbn         = {978-954-452-072-4},
	abstract     = {Linguistic typology is an area of linguistics concerned with analysis of and comparison between natural languages of the world based on their certain linguistic features. For that purpose, historically, the area has relied on manual extraction of linguistic feature values from textural descriptions of languages. This makes it a laborious and time expensive task and is also bound by human brain capacity. In this study, we present a deep learning system for the task of automatic extraction of linguistic features from textual descriptions of natural languages. First, textual descriptions are manually annotated with special structures called semantic frames. Those annotations are learned by a recurrent neural network, which is then used to annotate un-annotated text. Finally, the annotations are converted to linguistic feature values using a separate rule based module. Word embeddings, learned from general purpose text, are used as a major source of knowledge by the recurrent neural network. We compare the proposed deep learning system to a previously reported machine learning based system for the same task, and the deep learning system wins in terms of F1 scores with a fair margin. Such a system is expected to be a useful contribution for the automatic curation of typological databases, which otherwise are manually developed},
}
Sidansvarig: sb-webb