Skip to main content
Språkbanken Text is a part of Språkbanken.

BibTeX

@inProceedings{skoldberg-etal-2024-revealing-341866,
	title        = {Revealing Semantic Variation in {Swedish} Using Computational Models of Semantic Proximity–Results From Lexicographical Experiments},
	abstract     = {The paper reports a pilot study on the detection of lexical semantic variation in
modern Swedish. The starting point of the study is the meaning descriptions of around 65,000
headwords in ’The Contemporary Dictionary of the Swedish Academy’ (SO, 2021) covering
approximately 100,000 different senses. In our work, we aim to explore the potential of the
latest computational methods to discover outdated definitions in SO and update them. For
this, we make use of the DURel tool (Schlechtweg et al., 2018, 2024) which relies on
state-of-the-art language models for the automatic semantic analysis of word usages. The work
resulted in drawing lexicographers’ attention to both main senses and subsenses that should
be added to the dictionary. It has also demonstrated that certain meaning descriptions in SO
are too general and should be split in accordance with the current principles for the semantic
descriptions in the dictionary.},
	booktitle    = {Lexicography and Semantics. Proceedings of the XXI EURALEX International Congress 8–12 October 2024 Cavtat, Croatia},
	editor       = {Š. Despot, Kristina and Ostroški Anić, Ana and Brač, Ivana},
	author       = {Sköldberg, Emma and Virk, Shafqat and Sander, Pauline and Hengchen, Simon and Schlechtweg, Dominik},
	year         = {2024},
	publisher    = {Institut za hrvatski jezik},
	ISBN         = {978-953-7967-77-2},
}

@inProceedings{schlechtweg-etal-2024-durel-336715,
	title        = {The {DURel} Annotation Tool: Human and Computational Measurement of Semantic Proximity, Sense Clusters and Semantic Change},
	abstract     = {We present the DURel tool implementing the annotation of semantic proximity between word uses into an online, open source interface. The tool supports standardized human annotation as well as computational annotation, building on recent advances with Word-in-Context models. Annotator judgments are clustered with automatic graph clustering techniques and visualized for analysis. This allows to measure word senses with simple and intuitive micro-task judgments between use pairs, requiring minimal preparation efforts. The tool offers additional functionalities to compare the agreement between annotators to guarantee the inter-subjectivity of the obtained judgments and to calculate summary statistics over the annotated data giving insights into sense frequency distributions, semantic variation or changes of senses over time.},
	booktitle    = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations, March 17–22, 2024, St. Julians, Malta},
	author       = {Schlechtweg, Dominik and Virk, Shafqat and Sander, Pauline and Sköldberg, Emma and Theuer Linke, Lukas and Zhang, Tuo and Tahmasebi, Nina and Schulte im Walde, Sabine},
	year         = {2024},
	publisher    = {Association for Computational Linguistics},
	ISBN         = {979-8-89176-091-2},
}

@inProceedings{sander-etal-2024-durel-341867,
	title        = {The {DURel} Annotation Tool},
	booktitle    = {Book of Abstracts of the Workshop Large Language Models and Lexicography, 8 October 2024 Cavtat, Croatia},
	editor       = {Krek, Simon},
	author       = {Sander, Pauline and Hengchen, Simon and Zhao, Wei and Ma, Xiaocheng and Sköldberg, Emma and Virk, Shafqat and Schlechtweg, Dominik},
	year         = {2024},
}

@incollection{virk-etal-2023-lingfn-337386,
	title        = {{LingFN}: A Framenet for the Linguistic Domain},
	abstract     = {Frame semantics is a theory of meaning in natural language, which defines the structure of the lexical semantic resources known as framenets. Both framenets and frame semantics have proved useful for a number of natural language processing (NLP) tasks. However, in this connection framenets have often been criticized for their limited coverage. A proposed reasonable-effort solution to this problem is to develop domain-specific (sublanguage) framenets to complement the corresponding general-language framenets for particular NLP tasks, and in the literature we find such initiatives covering domains such as medicine, soccer, and tourism. In this paper, we report on building a framenet to cover the terms and concepts encountered in descriptive linguistic grammars (written in English) i.e. a framenet for the linguistic domain (LingFN) to complement the general-language BFN.},
	booktitle    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
	author       = {Virk, Shafqat and Klang, Per and Borin, Lars and Saxena, Anju},
	year         = {2023},
	ISBN         = {9783031243363},
	pages        = {367--379},
}

@incollection{borin-etal-2021-swedish-311387,
	title        = {{Swedish FrameNet++} and comparative linguistics},
	booktitle    = {The Swedish FrameNet++: Harmonization, integration, method development and practical language technology applications},
	editor       = {Dannélls, Dana and Borin, Lars and Friberg Heppin, Karin},
	author       = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard},
	year         = {2021},
	publisher    = {John Benjamins},
	address      = {Amsterdam},
	ISBN         = {9789027209900},
	pages        = {139--165},
}

@inProceedings{virk-etal-2017-automatic-261789,
	title        = {Automatic extraction of typological linguistic features from descriptive grammars},
	abstract     = {The present paper describes experiments on automatically extracting typological linguistic features of natural languages from traditional written descriptive grammars. The feature-extraction task has high potential value in typological, genealogical, historical, and other related areas of linguistics that make use of databases of structural features of languages. Until now, extraction of such features from grammars has been done manually, which is highly time and labor consuming and becomes prohibitive when extended to the thousands of languages for which linguistic descriptions are available. The system we describe here starts from semantically parsed text over which a set of rules are applied in order to extract feature values. We evaluate the system’s performance on the manually curated Grambank database as the gold standard and report the first measures of precision and recall for this problem.},
	booktitle    = {Text, Speech, and Dialogue 20th International Conference, TSD 2017, Prague, Czech Republic, August 27–31, 2017, Proceedings},
	editor       = {Ekštein, Kamil and Matoušek, Václav},
	author       = {Virk, Shafqat and Borin, Lars and Saxena, Anju and Hammarström, Harald},
	year         = {2017},
	publisher    = {Springer International Publishing},
	address      = {Cham},
	ISBN         = {978-3-319-64205-5},
}

@inProceedings{virk-etal-2021-data-306964,
	title        = {A Data-Driven Semi-Automatic Framenet Development Methodology},
	abstract     = {FrameNet is a lexical semantic resource based on the linguistic theory of frame semantics. A number of framenet development strategies have been reported previously and all of them involve exploration of corpora and a fair amount of manual work. Despite previous efforts, there does not exist
a well-thought-out automatic/semi-automatic methodology for frame construction. In this paper we propose a data-driven methodology for identification and semi-automatic construction of frames. As a proof of concept, we report on our initial attempts to build a wider-scale framenet for the legal domain (LawFN) using the proposed methodology. The constructed frames are stored in a lexical database
and together with the annotated example sentences they have been made available through a web interface.},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	author       = {Virk, Shafqat and Dannélls, Dana and Borin, Lars and Forsberg, Markus},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	ISBN         = {978-954-452-072-4},
}

@inproceedings{ohlsson-etal-2023-going-329710,
	author       = {Ohlsson, Claes and Virk, Shafqat and Tahmasebi, Nina},
	title        = {Going to the market together. A presentation of a mixed methods project},
	booktitle    = {TwinTalks Workshop at DH2023, 10 July, Graz, Austria},
	year         = {2023},
}

@inProceedings{virk-etal-2021-deep-319450,
	title        = {A Deep Learning System for Automatic Extraction of Typological Linguistic Information from Descriptive Grammars},
	abstract     = {Linguistic typology is an area of linguistics concerned with analysis of and comparison between natural languages of the world based on their certain linguistic features. For that purpose, historically, the area has relied on manual extraction of linguistic feature values from textual descriptions of languages. This makes it a laborious and time expensive task and is also bound by human brain capacity. In this study, we present a deep learning system for the task of automatic extraction of linguistic features from textual descriptions of natural languages. First, textual descriptions are manually annotated with special structures called semantic frames. Those annotations are learned by a recurrent neural network, which is then used to annotate un-annotated text. Finally, the annotations are converted to linguistic feature values using a separate rule based module. Word embeddings, learned from general purpose text, are used as a major source of knowledge by the recurrent neural network. We compare the proposed deep learning system to a previously reported machine learning based system for the same task, and the deep learning system wins in terms of F1 scores with a fair margin. Such a system is expected to be a useful contribution for the automatic curation of typological databases, which otherwise are manually developed.},
	booktitle    = {Proceedings of Recent Advances in Natural Language Processing, Sep 1–3, 2021},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	author       = {Virk, Shafqat and Foster, Daniel and Muhammad, Azam Sheikh and Saleem, Raheela},
	year         = {2021},
	publisher    = {Association for Computational Linguistics (ACL)},
	ISBN         = {978-954-452-072-4},
}

@inProceedings{virk-etal-2021-novel-306962,
	title        = {A Novel Machine Learning Based Approach for {Post-OCR} Error Detection},
	abstract     = {Post processing is the most conventional approach for correcting errors that are caused
by Optical Character Recognition (OCR) systems. Two steps are usually taken to correct
OCR errors: detection and corrections. For the first task, supervised machine learning methods have shown state-of-the-art performances. Previously proposed approaches have focused
most prominently on combining lexical, contextual and statistical features for detecting errors. In this study, we report a novel system to error detection which is based merely on the n-gram counts of a candidate token. In addition to being simple and computationally less expensive, our proposed system beats previous systems reported in the ICDAR2019 competition on OCR-error detection with notable margins. We achieved state-of-the-art F1-scores for eight out of the ten involved European languages. The maximum improvement is for Spanish which improved from 0.69 to 0.90, and the minimum for Polish from 0.82 to 0.84.},
	booktitle    = {Proceedings of the International Conference on Recent Advances in Natural Language Processing, 1–3 September, 2021},
	editor       = {Angelova, Galia and Kunilovskaya, Maria and Mitkov, Ruslan and Nikolova-Koleva, Ivelina},
	author       = {Virk, Shafqat and Dannélls, Dana and Muhammad, Azam Sheikh},
	year         = {2021},
	publisher    = {INCOMA},
	address      = {Shoumen, Bulgaria},
	ISBN         = {978-954-452-072-4},
}

@book{volodina-etal-2022-live-320415,
	title        = {Live and Learn – Festschrift in honor of {Lars Borin}},
	abstract     = {This Festschrift has been compiled to honor Professor Lars Borin on his 65th anniversary. It consists of 30 articles which reflect a fraction of Lars’ scholarly interests within computational linguistics and related fields. They come from his friends and colleagues around the world and deal with topics that have been – in one way or another – inspired by his work. A common theme for the articles is the never-ending need to learn, which is alluded to in the title of the volume, Live and Learn.},
	editor       = {Volodina, Elena and Dannélls, Dana and Berdicevskis, Aleksandrs and Forsberg, Markus and Virk, Shafqat},
	year         = {2022},
	publisher    = {Institutionen för svenska, flerspråkighet och språkteknologi, Göteborgs universitet},
	address      = {Göteborg},
	ISBN         = {978-91-87850-83-7},
}

@inProceedings{dannells-virk-2021-supervised-310123,
	title        = {A Supervised Machine Learning Approach for {Post-OCR} Error Detection for Historical Text},
	abstract     = {Training machine learning models with high accuracy requires careful feature engineering, which involves finding the best feature combinations and extracting their values from the data. The task becomes extremely laborious for specific problems such as post Optical Character Recognition (OCR) error detection because of the diversity of errors in the data. In this paper we present a machine learning approach which exploits character n-gram statistics as the only feature for the OCR error detection task. Our method achieves a significant improvement over the baseline reaching state-of-the-art results of 91% and 89% F1 measure on English and Swedish datasets respectively. We report various experiments to select the appropriate machine learning algorithm and to compare our approach to previously reported traditional approaches.},
	booktitle    = {Linköping Electronic Press Workshop and Conference Collection. Selected contributions from the Eighth Swedish Language Technology Conference (SLTC-2020), 25–27 November, 2020},
	author       = {Dannélls, Dana and Virk, Shafqat},
	year         = {2021},
	publisher    = {Linköping Electronic Press},
	address      = {Linköping},
}

@article{borin-etal-2021-bird's-309082,
	title        = {A bird’s-eye view on {South Asian} languages through {LSI}: Areal or genetic relationships?},
	abstract     = {We present initial exploratory work on illuminating the long-standing question of areal versus genealogical connections in South Asia using computational data visualization tools. With respect to genealogy, we focus on the subclassification of Indo-Aryan, the most ubiquitous language family of South Asia. The intent here is methodological: we explore computational methods for visualizing large datasets of linguistic features, in our case 63 features from 200 languages representing four language families of South Asia, coming out of a digitized version of Grierson’s Linguistic Survey of India. To this dataset we apply phylogenetic software originally developed in the context of computational biology for clustering the languages and displaying the clusters in the form of networks. We further explore multiple correspondence analysis as a way of illustrating how linguistic feature bundles correlate with extrinsically defined groupings of languages (genealogical and geographical). Finally, map visualization of combinations of linguistic features and language genealogy is suggested as an aid in distinguishing genealogical and areal features. On the whole, our results are in line with the conclusions of earlier studies: Areality and genealogy are strongly intertwined in South Asia, the traditional lower-level subclassification of Indo-Aryan is largely upheld, and there is a clearly discernible areal east–west divide cutting across language families.},
	journal      = {Journal of South Asian Languages and Linguistics},
	author       = {Borin, Lars and Saxena, Anju and Virk, Shafqat and Comrie, Bernard},
	year         = {2021},
	volume       = {7},
	number       = {2},
	pages        = {151--185},
}

@inProceedings{wichmann-virk-2020-towards-298431,
	title        = {Towards a data-driven network of linguistic terms},
	abstract     = {Starting from close to 20,000 text documents from the literature of language descriptions, from documents either born digitally or scanned and OCR’d, we extract keywords and pass them through a pruning pipeline where mainly keywords that can be considered as belonging to linguistic terminology survive. Subsequently we quantify relations among those terms using Normalized Pointwise Mutual Information (NPMI) and use the resulting measures, in conjunction with the Google PageRank (GPR), to build networks of linguistic terms. Two uses of the work are envisaged: (1) developing a search machine adapted to the large DReaM corpus of linguistic descriptive literature and (2) getting insights into how a data-driven ontology of linguistic terminology might be built.},
	booktitle    = {Swedish Language Technology Conference (SLTC)},
	author       = {Wichmann, Søren and Virk, Shafqat},
	year         = {2020},
}

@inProceedings{dannells-virk-2020-error-297714,
	title        = {{OCR} Error Detection on Historical Text Using Uni-Feature and Multi-Feature Based Machine Learning Models},
	abstract     = {Detecting errors that are caused by Optical Character Recognition (OCR) systems is a challenging task that has received much attention over the years. Recent work has explored machine learning methods using hand-crafted feature engineering, which, in addition to the difficulty in identifying the best feature combinations, is often very time and resources expensive. This raises the question: Do we always need many features to achieve better results? This is an open-ended question and its answer might depend on the task at hand. For OCR error detection, we experimented and found that interestingly a uni-feature based system conquered multi-feature based systems on a Swedish data set achieving state-of-the-art results, and performed equally well on an English dataset. We also experimented to find which machine learning algorithm is more suitable for the task at hand by comparing the performance of five well-known machine learning algorithms, namely Logistic regression, Decision Trees, Bernoulli Naive Bayes, Naive Bayes, and Support Vector Machines.},
	booktitle    = {Swedish Language Technology Conference (SLTC), 25–27 November 2020, University of Gothenburg},
	author       = {Dannélls, Dana and Virk, Shafqat},
	year         = {2020},
}

@inProceedings{virk-prasad-2018-towards-295336,
	title        = {Towards {Hindi/Urdu} {FrameNets} via the {Multilingual FrameNet}},
	booktitle    = {Proceedings of the LREC 2018 Workshop. International FrameNet Workshop 2018 : Multilingual Framenets and Constructicons, 12 May 2018 – Miyazaki, Japan},
	editor       = {Torrent, Tiago Timponi and Borin, Lars and Baker, Collin F.},
	author       = {Virk, Shafqat and Prasad, K. V. S.},
	year         = {2018},
	publisher    = {European Language Resources Association (ELRA)},
	ISBN         = {979-10-95546-00-9},
}

@inProceedings{virk-etal-2020-dream-295338,
	title        = {The {DReaM} Corpus: A Multilingual Annotated Corpus of Grammars for the World’s Languages},
	booktitle    = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), Marseille, 11–16 May 2020},
	editor       = {Calzolari, Nicoletta and Béchet, Frédéric and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, Hélène and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
	author       = {Virk, Shafqat and Hammarström, Harald and Forsberg, Markus and Wichmann, Søren},
	year         = {2020},
	publisher    = {European Language Resources Association},
	address      = {Paris},
	ISBN         = {979-10-95546-34-4},
}

@inProceedings{virk-etal-2020-from-295339,
	title        = {From Linguistic Descriptions to Language Profiles},
	abstract     = {Language catalogues and typological databases are two important types of resources containing different types of knowledge about the world’s natural languages. The former provide metadata such as number of speakers, location (in prose descriptions and/or GPS coordinates), language code, literacy, etc., while the latter contain information about a set of structural and functional attributes of languages. Given that both types of resources are developed and later maintained manually, there are practical limits as to the number of languages and the number of features that can be surveyed. We introduce the concept of a language profile, which is intended to be a structured representation of various types of knowledge about a natural language extracted semi-automatically from descriptive documents and stored at a central location. It has three major parts: (1) an introductory; (2) an attributive; and (3) a reference part, each containing different types of knowledge about a given natural language. As a case study, we develop and present a language profile of an example language. At this stage, a language profile is an independent entity, but in the future it is envisioned to become part of a network of language profiles connected to each other via various types of relations. Such a representation is expected to be suitable both for humans and machines to read and process for further deeper linguistic analyses and/or comparisons.},
	booktitle    = {Proceedings of the 7th Workshop on Linked Data in Linguistics (LDL-2020). Language Resources and Evaluation Conference (LREC 2020), Marseille, 11–16 May 2020},
	editor       = {Ionov, Maxim and McCrae, John P. and Chiarcos, Christian and Declerck, Thierry and Bosque-Gil, Julia and Gracia, Jorge},
	author       = {Virk, Shafqat and Hammarström, Harald and Borin, Lars and Forsberg, Markus and Wichmann, Søren},
	year         = {2020},
	publisher    = {European Language Resources Association},
	address      = {Paris},
	ISBN         = {979-10-95546-36-8},
}

@inProceedings{borin-etal-2018-language-290841,
	title        = {Language technology for digital linguistics: Turning the {Linguistic Survey of India} into a rich source of linguistic information},
	abstract     = {We present our work aiming at turning the linguistic material available in Grierson’s classical Linguistic Survey of India (LSI) from a printed discursive textual description into a formally structured digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. While doing so, we develop state-of-the-art language technology for automatically extracting the relevant grammatical information from the text of the LSI, and interactive linguistic information visualization tools for better analysis and comparisons of languages based on their structural and functional features.},
	booktitle    = {Computational Linguistics and Intelligent Text Processing, 18th International Conference, CICLing 2017, Budapest, Hungary, April 17–23, 2017},
	series       = {Lecture Notes in Computer Science},
	author       = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
	year         = {2018},
	publisher    = {Springer},
	address      = {Cham},
}

@inProceedings{virk-etal-2019-exploiting-290903,
	title        = {Exploiting frame semantics and frame-semantic parsing for automatic extraction of typological information from descriptive grammars of natural languages},
	abstract     = {We describe a novel system for automatic extraction of typological linguistic information from descriptive grammars of natural languages, applying the theory of frame semantics in the form of frame-semantic parsing. The current proof-of-concept system covers a few selected linguistic features, but the methodology is general and can be extended not only to other typological features but also to descriptive grammars written in languages other than English. Such a system is expected to be a useful assistance for automatic curation of typological databases which otherwise are built manually, a very labor and time consuming as well as cognitively taxing enterprise.},
	booktitle    = {12th International Conference on Recent Advances in Natural Language Processing, RANLP 2019, Varna, Bulgaria, 2–4 September 2019},
	author       = {Virk, Shafqat and Muhammad, Azam Sheikh and Borin, Lars and Aslam, Muhammad Irfan and Iqbal, Saania and Khurram, Nazia},
	year         = {2019},
	publisher    = {INCOMA Ltd.},
	address      = {Shoumen, Bulgaria},
	ISBN         = {978-954-452-055-7},
}

@inProceedings{malm-etal-2018-lingfn-267404,
	title        = {{LingFN}: Towards a framenet for the linguistics domain},
	abstract     = {Framenets and frame semantics have proved useful for a number of natural language processing (NLP) tasks. However, in this connection framenets have often been criticized for limited coverage. A proposed reasonable-effort solution to this problem is to develop domain-specific (sublanguage) framenets to complement the corresponding general-language framenets for particular NLP tasks, and in the literature we find such initiatives covering, e.g., medicine, soccer, and tourism. In this paper, we report on our experiments and first results on building a framenet to cover the terms and concepts encountered in descriptive linguistic grammars. A contextual statistics based approach is used to judge the polysemous nature of domain-specific terms, and to design new domain-specific frames. The work is part of a more extensive research undertaking where we are developing NLP methodologies for automatic extraction of linguistic information from traditional linguistic descriptions to build typological databases, which otherwise are populated using a labor intensive
manual process.},
	booktitle    = {Proceedings : LREC 2018 Workshop, International FrameNet Workshop 2018. Multilingual Framenets and Constructicons, May 12, 2018, Miyazaki, Japan},
	editor       = {Torrent, Tiago Timponi and Borin, Lars and Baker, Collin F.},
	author       = {Malm, Per and Virk, Shafqat and Borin, Lars and Saxena, Anju},
	year         = {2018},
	publisher    = {ELRA},
	address      = {Paris},
	ISBN         = {979-10-95546-04-7},
}

@inProceedings{borin-etal-2018-many-267534,
	title        = {Many a little makes a mickle – infrastructure component reuse for a massively multilingual linguistic study},
	abstract     = {We present ongoing work aiming at turning the linguistic material available in Grierson’s classical Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia and studies relating to language typology and contact linguistics. The project has two concrete main aims: (1) to conduct a linguistic investigation of the claim that South Asia constitutes a linguistic area; (2) to develop state-of-the-art language technology for automatically extracting the relevant information from the text of the LSI. In this presentation we focus on how, in the first part of the project, a number of existing research infrastructure components provided by Swe-Clarin, the Swedish CLARIN consortium, have been ‘recycled’ in order to allow the linguists involved in the project to quickly orient themselves in the vast LSI material, and to be able to provide input to the language technologists designing the tools for information extraction from the descriptive grammars.},
	booktitle    = {Selected papers from the CLARIN Annual Conference 2017, Budapest, 18–20 September 2017},
	author       = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
	year         = {2018},
	publisher    = {Linköping University Electronic Press},
	address      = {Linköping},
	ISBN         = {978-91-7685-273-6},
}

@inProceedings{hammarstrom-etal-2017-poor-261851,
	title        = {Poor man's {OCR} post-correction: Unsupervised recognition of variant spelling applied to a multilingual document collection},
	abstract     = {The accuracy of Optical Character Recognition (OCR) sets the limit for the success of subsequent applications used in text analyzing pipeline. Recent models of OCR postprocessing significantly improve the quality of OCR-generated text but require engineering work or resources such as human-labeled data or a dictionary to perform with such accuracy on novel datasets. In the present paper we introduce a technique for OCR post-processing that runs off-the-shelf with no resources or parameter tuning required. In essence, words which are similar in form that are also distributionally more similar than expected at random are deemed OCR-variants. As such it can be applied to any language or genre (as long as the orthography segments the language at the word-level). The algorithm is illustrated and evaluated using a multilingual document collection and a benchmark English dataset.},
	booktitle    = {DATeCH2017, Proceedings of the 2nd International Conference on Digital Access to Textual Cultural Heritage, Göttingen, Germany — June 1–2, 2017},
	author       = {Hammarström, Harald and Virk, Shafqat and Forsberg, Markus},
	year         = {2017},
	publisher    = {Association for Computing Machinery (ACM)},
	address      = {New York},
	ISBN         = {978-1-4503-5265-9},
	copyright    = {© 2017 Copyright held by the owner/author(s).},
}

@inProceedings{borin-etal-2016-towards-253952,
	title        = {Towards a Big Data View on {South Asian} Linguistic Diversity},
	abstract     = {South Asia with its rich and diverse linguistic tapestry of hundreds of languages, including many from four major language families, and a long history of intensive language contact, provides rich empirical data for studies of linguistic genealogy, linguistic typology, and language contact. South Asia is often referred to as a linguistic area, a region where, due to close contact and widespread multilingualism, languages have influenced one another to the extent that both related and unrelated languages are more similar on many linguistic levels than we would expect. However, with some rare exceptions, most studies are largely impressionistic, drawing examples from a few languages. In this paper we present our ongoing work aiming at turning the linguistic material available in Grierson’s Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. In addition to this, we aim to contribute to the methodological development of large-scale comparative linguistics drawing on digital language resources, by exploring NLP techniques for extracting linguistic information from free-text language descriptions of the kind found in the LSI.},
	booktitle    = {WILDRE-3 – 3rd Workshop on Indian Language Data: Resources and Evaluation},
	author       = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
	year         = {2016},
	publisher    = {ELRA},
	address      = {Paris},
}

% Workshop paper (2014) on a WordNet-based interlingual lexicon and word sense disambiguation for GF translation.
% {WordNets} and {Grammatical Framework} are braced so that sentence-casing styles keep their capitals.
@inProceedings{virk-etal-2014-developing-202573,
	title        = {Developing an interlingual translation lexicon using {WordNets} and {Grammatical Framework}},
	abstract     = {The Grammatical Framework (GF) offers perfect translation between controlled subsets
of natural languages. E.g., an abstract syntax for a set of sentences in school mathematics
is the interlingua between the corresponding sentences in English and Hindi, say. GF
“resource grammars” specify how to say something in English or Hindi; these are reused
with “application grammars” that specify what can be said (mathematics, tourist
phrases, etc.). More recent robust parsing and parse-tree disambiguation allow GF to
parse arbitrary English text. We report here an experiment to linearise the resulting
tree directly to other languages (e.g. Hindi, German, etc.), i.e., we use a language independent
resource grammar as the interlingua. We focus particularly on the last part
of the translation system, the interlingual lexicon and word sense disambiguation (WSD).
We improved the quality of the wide coverage interlingual translation lexicon by using
the Princeton and Universal WordNet data. We then integrated an existing WSD tool
and replaced the usual GF style lexicons, which give one target word per source word,
by the WordNet based lexicons. These new lexicons and WSD improve the quality of
translation in most cases, as we show by examples. Both WordNets and WSD in general
are well known, but this is the first use of these tools with GF.},
	booktitle    = {Proceedings of the Fifth Workshop on South and Southeast Asian Natural Language Processing},
	author       = {Virk, Shafqat and Prasad, K. V. S. and Ranta, Aarne and Angelov, Krasimir},
	year         = {2014},
}

% Monograph (2013) on GF resource grammars and lexicons for six Indo-Iranian languages.
% NOTE(review): the abstract calls this work a thesis; a thesis entry type with a school field may fit better — confirm the degree type before changing it.
% {Indo-Iranian} is braced in the title so that sentence-casing styles keep its capitals.
@book{virk-2013-computational-176382,
	title        = {Computational Linguistics Resources for {Indo-Iranian} Languages},
	abstract     = {Can computers process human languages? During the last fifty years, two main approaches have been used to find an answer to this question: data-driven (i.e. statistics based) and knowledge-driven (i.e. grammar based). The former relies on the availability of a vast amount of electronic linguistic data and the processing capabilities of modern-age computers, while the latter builds on grammatical rules and classical linguistic theories of language.
In this thesis, we use mainly the second approach and elucidate the development of computational (“resource”) grammars for six Indo-Iranian languages: Urdu, Hindi, Punjabi, Persian, Sindhi, and Nepali. We explore different lexical and syntactical aspects of these languages and build their resource grammars using the Grammatical Framework (GF) – a type theoretical grammar formalism tool.
We also provide computational evidence of the similarities/differences between Hindi and Urdu, and report a mechanical development of a Hindi resource grammar starting from an Urdu resource grammar. We use a functor style implementation that makes it possible to share the commonalities between the two languages. Our analysis shows that this sharing is possible up to 94% at the syntax level, whereas at the lexical level Hindi and Urdu differed in 18% of the basic words, in 31% of tourist phrases, and in 92% of school mathematics terms.
Next, we describe the development of wide-coverage morphological lexicons for some of the Indo-Iranian languages. We use existing linguistic data from different resources (i.e. dictionaries and WordNets) to build uni-sense and multi-sense lexicons.
Finally, we demonstrate how we used the reported grammatical and lexical resources to add support for Indo-Iranian languages in a few existing GF application grammars. These include the Phrasebook, the mathematics grammar library, and the Attempto controlled English grammar. Further, we give the experimental results of developing a wide-coverage grammar based arbitrary text translator using these resources. These applications show the importance of such linguistic resources, and open new doors for future research on these languages.},
	author       = {Virk, Shafqat},
	year         = {2013},
	publisher    = {University of Gothenburg},
	address      = {Göteborg},
	ISBN         = {978-91-628-8706-3},
}

% Workshop paper (2012) giving translation-based evidence that Hindi and Urdu share a grammar but differ in lexicon.
% {Hindi} and {Urdu} are braced in the title so that sentence-casing styles keep their capitals.
@inProceedings{prasad-virk-2012-computational-170274,
	title        = {Computational evidence that {Hindi} and {Urdu} share a grammar but not the lexicon},
	abstract     = {Hindi and Urdu share a grammar and a basic vocabulary, but are often mutually unintelligible because they use different words in higher registers and sometimes even in quite
ordinary situations. We report computational translation evidence of this unusual relationship (it differs from the usual pattern, that related languages share the advanced vocabulary
and differ in the basics). We took a GF resource grammar for Urdu and adapted it mechanically for Hindi, changing essentially only the script (Urdu is written in Perso-Arabic,
and Hindi in Devanagari) and the lexicon where needed. In evaluation, the Urdu grammar
and its Hindi twin either both correctly translated an English sentence, or failed in exactly
the same grammatical way, thus confirming computationally that Hindi and Urdu share a
grammar. But the evaluation also found that the Hindi and Urdu lexicons differed in 18%
of the basic words, in 31% of tourist phrases, and in 92% of school mathematics terms.},
	booktitle    = {3rd Workshop on South and Southeast Asian Natural Language Processing (SANLP), collocated with COLING 12},
	author       = {Prasad, K. V. S. and Virk, Shafqat},
	year         = {2012},
}

% Conference paper (SLTC 2012) giving an overview of the MOLTO project's translation tools.
% The project name is written as the acronym MOLTO, as in the abstract, and braced so that sentence-casing styles keep its capitals.
@inProceedings{caprotti-etal-2012-high-178183,
	title        = {High-quality translation: {MOLTO} tools and applications},
	abstract     = {MOLTO (Multilingual On Line Translation, FP7-ICT-247914, www.molto-project.eu) is a European project focusing on translation on the web. MOLTO targets translation that has production quality, that is, usable for quick and reliable dissemination of information. MOLTO’s main focus is to increase the productivity of such translation systems, building on the technology of GF (Grammatical Framework) and its Resource Grammar Library. But MOLTO also develops hybrid methods which increase
the quality of Statistical Machine Translation (SMT) by adding linguistic information, or bootstrap grammatical models from statistical models. This paper gives a brief overview of MOLTO’s latest achievements, many of which are more thoroughly described in separate papers and available as web-based demos and as open-source software.},
	booktitle    = {The fourth Swedish Language Technology Conference (SLTC)},
	author       = {Caprotti, Olga and Ranta, Aarne and Angelov, Krasimir and Enache, Ramona and Camilleri, John J. and Dannélls, Dana and Détrez, Grégoire and Hallgren, Thomas and Prasad, K. V. S. and Virk, Shafqat},
	year         = {2012},
}

% Book-type record (2012), University of Gothenburg; no abstract is recorded for it.
% {Indo-Iranian} is braced in the title so that sentence-casing styles keep its capitals.
@book{virk-2012-computational-170273,
	title        = {Computational Grammar Resources for {Indo-Iranian} Languages},
	author       = {Virk, Shafqat},
	year         = {2012},
	publisher    = {University of Gothenburg},
	address      = {Göteborg},
}

% Workshop paper (2010) on an open-source resource grammar for Urdu.
% {Urdu} is braced in the title so that sentence-casing styles keep its capital.
@inProceedings{virk-etal-2010-open-131249,
	title        = {An Open Source {Urdu} Resource Grammar},
	booktitle    = {Proceedings of the 8th Workshop on Asian Language Resources (Coling 2010 workshop)},
	author       = {Virk, Shafqat and Humayoun, Muhammad and Ranta, Aarne},
	year         = {2010},
}

% Conference paper (RANLP 2011) on an open-source resource grammar for Punjabi.
% {Punjabi} is braced in the title so that sentence-casing styles keep its capital; the date range in the booktitle uses "--" like the page range.
@inProceedings{virk-etal-2011-open-151566,
	title        = {An Open-Source {Punjabi} Resource Grammar},
	booktitle    = {Proceedings of RANLP-2011, Recent Advances in Natural Language Processing, Hissar, Bulgaria, 12--14 September, 2011},
	author       = {Virk, Shafqat and Humayoun, Muhammad and Ranta, Aarne},
	year         = {2011},
	pages        = {70--76},
}

% Conference paper (LREC'12) describing a GF computational grammar for Persian.
% Author names are stored in normal capitalisation rather than forced capitals; {Persian} is braced in the title so that sentence-casing styles keep its capital.
@inProceedings{virk-abolahrar-2012-open-170271,
	title        = {An Open Source {Persian} Computational Grammar},
	abstract     = {In this paper, we describe a multilingual open-source computational grammar of Persian, developed in Grammatical Framework
(GF) – A type-theoretical grammar formalism. We discuss in detail the structure of different syntactic (i.e. noun phrases, verb
phrases, adjectival phrases, etc.) categories of Persian. First, we show how to structure and construct these categories individually.
Then we describe how they are glued together to make well-formed sentences in Persian, while maintaining the grammatical features
such as agreement, word order, etc. We also show how some of the distinctive features of Persian, such as the ezafe construction, are
implemented in GF. In order to evaluate the grammar’s correctness, and to demonstrate its usefulness, we have added support for
Persian in a multilingual application grammar (the Tourist Phrasebook) using the reported resource grammar.},
	booktitle    = {Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12)},
	author       = {Virk, Shafqat and Abolahrar, Elnaz},
	year         = {2012},
}