Hoppa till huvudinnehåll

BibTeX

@article{borin-etal-2021-bird's-309082,
	author       = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard},
	title        = {A bird’s-eye view on {South Asian} languages through {LSI}: {Areal} or genetic relationships?},
	journal      = {Journal of South Asian Languages and Linguistics},
	year         = {2021},
	volume       = {7},
	number       = {2},
	pages        = {151--185},
	abstract     = {We present initial exploratory work on illuminating the long-standing question of areal versus genealogical connections in South Asia using computational data visualization tools. With respect to genealogy, we focus on the subclassification of Indo-Aryan, the most ubiquitous language family of South Asia. The intent here is methodological: we explore computational methods for visualizing large datasets of linguistic features, in our case 63 features from 200 languages representing four language families of South Asia, coming out of a digitized version of Grierson’s Linguistic Survey of India. To this dataset we apply phylogenetic software originally developed in the context of computational biology for clustering the languages and displaying the clusters in the form of networks. We further explore multiple correspondence analysis as a way of illustrating how linguistic feature bundles correlate with extrinsically defined groupings of languages (genealogical and geographical). Finally, map visualization of combinations of linguistic features and language genealogy is suggested as an aid in distinguishing genealogical and areal features. On the whole, our results are in line with the conclusions of earlier studies: Areality and genealogy are strongly intertwined in South Asia, the traditional lower-level subclassification of Indo-Aryan is largely upheld, and there is a clearly discernible areal east–west divide cutting across language families.},
}

@inproceedings{virk-etal-2021-novel-306962,
	author       = {Virk, Shafqat and Dannélls, Dana and Muhammad, Azam Sheikh},
	title        = {A Novel Machine Learning Based Approach for {Post-OCR} Error Detection},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	year         = {2021},
	eventdate    = {2021-09-01/2021-09-03},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	isbn         = {978-954-452-072-4},
	abstract     = {Post processing is the most conventional approach for correcting errors that are caused by Optical Character Recognition (OCR) systems. Two steps are usually taken to correct OCR errors: detection and corrections. For the first task, supervised machine learning methods have shown state-of-the-art performances. Previously proposed approaches have focused most prominently on combining lexical, contextual and statistical features for detecting errors. In this study, we report a novel system to error detection which is based merely on the n-gram counts of a candidate token. In addition to being simple and computationally less expensive, our proposed system beats previous systems reported in the ICDAR2019 competition on OCR-error detection with notable margins. We achieved state-of-the-art F1-scores for eight out of the ten involved European languages. The maximum improvement is for Spanish which improved from 0.69 to 0.90, and the minimum for Polish from 0.82 to 0.84.},
}

@inproceedings{virk-etal-2021-data-306964,
	author       = {Virk, Shafqat and Dannélls, Dana and Borin, Lars and Forsberg, Markus},
	title        = {A Data-Driven Semi-Automatic Framenet Development Methodology},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	year         = {2021},
	eventdate    = {2021-09-01/2021-09-03},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	isbn         = {978-954-452-072-4},
	abstract     = {FrameNet is a lexical semantic resource based on the linguistic theory of frame semantics. A number of framenet development strategies have been reported previously and all of them involve exploration of corpora and a fair amount of manual work. Despite previous efforts, there does not exist a well-thought-out automatic/semi-automatic methodology for frame construction. In this paper we propose a data-driven methodology for identification and semi-automatic construction of frames. As a proof of concept, we report on our initial attempts to build a wider-scale framenet for the legal domain (LawFN) using the proposed methodology. The constructed frames are stored in a lexical database and together with the annotated example sentences they have been made available through a web interface.},
}