@comment{Publication list (Virk / Borin / Saxena et al.). Cleaned: `adress` -> `address`
  (the misspelled field name was silently ignored), one field per line, entries
  normalised to lowercase types, acronyms/proper nouns brace-protected in titles,
  PDF-extraction artefacts in abstracts repaired, `%` escaped. Note the duplicate
  pair Malm-Per2018-267404 / Malm-Per2018-269086 flagged below.}

@inproceedings{Malm-Per2018-267404,
  author    = {Malm, Per and Virk, Shafqat and Borin, Lars and Saxena, Anju},
  title     = {{LingFN}: Towards a framenet for the linguistics domain},
  booktitle = {Proceedings: LREC 2018 Workshop, International FrameNet Workshop 2018. Multilingual Framenets and Constructicons, May 12, 2018, Miyazaki, Japan / Edited by Tiago Timponi Torrent, Lars Borin and Collin F. Baker},
  publisher = {ELRA},
  address   = {Miyazaki},
  year      = {2018},
  isbn      = {979-10-95546-04-7},
  abstract  = {Framenets and frame semantics have proved useful for a number of natural language processing (NLP) tasks. However, in this connection framenets have often been criticized for limited coverage. A proposed reasonable-effort solution to this problem is to develop domain-specific (sublanguage) framenets to complement the corresponding general-language framenets for particular NLP tasks, and in the literature we find such initiatives covering, e.g., medicine, soccer, and tourism. In this paper, we report on our experiments and first results on building a framenet to cover the terms and concepts encountered in descriptive linguistic grammars. A contextual statistics based approach is used to judge the polysemous nature of domain-specific terms, and to design new domain-specific frames. The work is part of a more extensive research undertaking where we are developing NLP methodologies for automatic extraction of linguistic information from traditional linguistic descriptions to build typological databases, which otherwise are populated using a labor intensive manual process.},
}

@inproceedings{Borin-Lars2018-267534,
  author    = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
  title     = {Many a little makes a mickle -- infrastructure component reuse for a massively multilingual linguistic study},
  booktitle = {Selected papers from the CLARIN Annual Conference 2017, Budapest, 18–20 September 2017},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  year      = {2018},
  isbn      = {978-91-7685-273-6},
  abstract  = {We present ongoing work aiming at turning the linguistic material available in Grierson’s classical Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia and studies relating to language typology and contact linguistics. The project has two concrete main aims: (1) to conduct a linguistic investigation of the claim that South Asia constitutes a linguistic area; (2) to develop state-of-the-art language technology for automatically extracting the relevant information from the text of the LSI. In this presentation we focus on how, in the first part of the project, a number of existing research infrastructure components provided by Swe-Clarin, the Swedish CLARIN consortium, have been ‘recycled’ in order to allow the linguists involved in the project to quickly orient themselves in the vast LSI material, and to be able to provide input to the language technologists designing the tools for information extraction from the descriptive grammars.},
}

@inproceedings{Malm-Per2018-269086,
  author        = {Malm, Per and Virk, Shafqat and Borin, Lars and Saxena, Anju},
  title         = {{LingFN}: Towards a framenet for the linguistics domain},
  booktitle     = {Proceedings of the LREC 2018 workshop: International FrameNet Workshop 2018 – Multilingual Framenets and Constructicons},
  publisher     = {ELRA},
  address       = {Miyazaki},
  year          = {2018},
  isbn          = {979-10-95546-04-7},
  internal-note = {Duplicate of Malm-Per2018-267404 (same paper, differently rendered booktitle); key kept because it may be cited elsewhere -- consider merging.},
  abstract      = {Framenets and frame semantics have proved useful for a number of natural language processing (NLP) tasks. However, in this connection framenets have often been criticized for limited coverage. A proposed reasonable-effort solution to this problem is to develop domain-specific (sublanguage) framenets to complement the corresponding general-language framenets for particular NLP tasks, and in the literature we find such initiatives covering, e.g., medicine, soccer, and tourism. In this paper, we report on our experiments and first results on building a framenet to cover the terms and concepts encountered in descriptive linguistic grammars. A contextual statistics based approach is used to judge the polysemous nature of domain-specific terms, and to design new domain-specific frames. The work is part of a more extensive research undertaking where we are developing NLP methodologies for automatic extraction of linguistic information from traditional linguistic descriptions to build typological databases, which otherwise are populated using a labor intensive manual process.},
}

@inproceedings{Virk-Shafqat2017-261789,
  author    = {Virk, Shafqat and Borin, Lars and Saxena, Anju and Hammarström, Harald},
  title     = {Automatic extraction of typological linguistic features from descriptive grammars},
  booktitle = {Text, Speech, and Dialogue: 20th International Conference, TSD 2017, Prague, Czech Republic, August 27--31, 2017, Proceedings / edited by Kamil Ekštein, Václav Matoušek},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2017},
  isbn      = {978-3-319-64205-5},
  abstract  = {The present paper describes experiments on automatically extracting typological linguistic features of natural languages from traditional written descriptive grammars. The feature-extraction task has high potential value in typological, genealogical, historical, and other related areas of linguistics that make use of databases of structural features of languages. Until now, extraction of such features from grammars has been done manually, which is highly time and labor consuming and becomes prohibitive when extended to the thousands of languages for which linguistic descriptions are available. The system we describe here starts from semantically parsed text over which a set of rules are applied in order to extract feature values. We evaluate the system’s performance on the manually curated Grambank database as the gold standard and report the first measures of precision and recall for this problem.},
}

@inproceedings{Hammarström-Harald2017-261851,
  author    = {Hammarström, Harald and Virk, Shafqat and Forsberg, Markus},
  title     = {Poor man's {OCR} post-correction: Unsupervised recognition of variant spelling applied to a multilingual document collection},
  booktitle = {DATeCH2017: Proceedings of the 2nd International Conference on Digital Access to Textual Cultural Heritage, Göttingen, Germany, June 01--02, 2017},
  publisher = {Association for Computing Machinery (ACM)},
  address   = {New York},
  year      = {2017},
  isbn      = {978-1-4503-5265-9},
  abstract  = {The accuracy of Optical Character Recognition (OCR) sets the limit for the success of subsequent applications used in text analyzing pipeline. Recent models of OCR postprocessing significantly improve the quality of OCR-generated text but require engineering work or resources such as human-labeled data or a dictionary to perform with such accuracy on novel datasets. In the present paper we introduce a technique for OCR post-processing that runs off-the-shelf with no resources or parameter tuning required. In essence, words which are similar in form that are also distributionally more similar than expected at random are deemed OCR-variants. As such it can be applied to any language or genre (as long as the orthography segments the language at the word-level). The algorithm is illustrated and evaluated using a multilingual document collection and a benchmark English dataset.},
}

@inproceedings{Borin-Lars2016-253952,
  author    = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
  title     = {Towards a {Big} {Data} View on {South} {Asian} Linguistic Diversity},
  booktitle = {WILDRE-3 – 3rd Workshop on Indian Language Data: Resources and Evaluation},
  publisher = {ELRA},
  address   = {Paris},
  year      = {2016},
  abstract  = {South Asia with its rich and diverse linguistic tapestry of hundreds of languages, including many from four major language families, and a long history of intensive language contact, provides rich empirical data for studies of linguistic genealogy, linguistic typology, and language contact. South Asia is often referred to as a linguistic area, a region where, due to close contact and widespread multilingualism, languages have influenced one another to the extent that both related and unrelated languages are more similar on many linguistic levels than we would expect. However, with some rare exceptions, most studies are largely impressionistic, drawing examples from a few languages. In this paper we present our ongoing work aiming at turning the linguistic material available in Grierson’s Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. In addition to this, we aim to contribute to the methodological development of large-scale comparative linguistics drawing on digital language resources, by exploring NLP techniques for extracting linguistic information from free-text language descriptions of the kind found in the LSI.},
}

@inproceedings{Virk-Shafqat2014-202573,
  author    = {Virk, Shafqat and Prasad, K. V. S. and Ranta, Aarne and Angelov, Krasimir},
  title     = {Developing an interlingual translation lexicon using {WordNets} and {Grammatical} {Framework}},
  booktitle = {Proceedings of the Fifth Workshop on South and Southeast Asian Natural Language Processing},
  year      = {2014},
  abstract  = {The Grammatical Framework (GF) offers perfect translation between controlled subsets of natural languages. E.g., an abstract syntax for a set of sentences in school mathematics is the interlingua between the corresponding sentences in English and Hindi, say. GF “resource grammars” specify how to say something in English or Hindi; these are reused with “application grammars” that specify what can be said (mathematics, tourist phrases, etc.). More recent robust parsing and parse-tree disambiguation allow GF to parse arbitrary English text. We report here an experiment to linearise the resulting tree directly to other languages (e.g. Hindi, German, etc.), i.e., we use a language independent resource grammar as the interlingua. We focus particularly on the last part of the translation system, the interlingual lexicon and word sense disambiguation (WSD). We improved the quality of the wide coverage interlingual translation lexicon by using the Princeton and Universal WordNet data. We then integrated an existing WSD tool and replaced the usual GF style lexicons, which give one target word per source word, by the WordNet based lexicons. These new lexicons and WSD improve the quality of translation in most cases, as we show by examples. Both WordNets and WSD in general are well known, but this is the first use of these tools with GF.},
}

@phdthesis{Virk-Shafqat2013-176382,
  author   = {Virk, Shafqat},
  title    = {Computational Linguistics Resources for {Indo-Iranian} Languages},
  school   = {University of Gothenburg},
  address  = {Göteborg},
  year     = {2013},
  isbn     = {978-91-628-8706-3},
  abstract = {Can computers process human languages? During the last fifty years, two main approaches have been used to find an answer to this question: data-driven (i.e. statistics based) and knowledge-driven (i.e. grammar based). The former relies on the availability of a vast amount of electronic linguistic data and the processing capabilities of modern-age computers, while the latter builds on grammatical rules and classical linguistic theories of language. In this thesis, we use mainly the second approach and elucidate the development of computational (“resource”) grammars for six Indo-Iranian languages: Urdu, Hindi, Punjabi, Persian, Sindhi, and Nepali. We explore different lexical and syntactical aspects of these languages and build their resource grammars using the Grammatical Framework (GF) – a type theoretical grammar formalism tool. We also provide computational evidence of the similarities/differences between Hindi and Urdu, and report a mechanical development of a Hindi resource grammar starting from an Urdu resource grammar. We use a functor style implementation that makes it possible to share the commonalities between the two languages. Our analysis shows that this sharing is possible upto 94\% at the syntax level, whereas at the lexical level Hindi and Urdu differed in 18\% of the basic words, in 31\% of tourist phrases, and in 92\% of school mathematics terms. Next, we describe the development of wide-coverage morphological lexicons for some of the Indo-Iranian languages. We use existing linguistic data from different resources (i.e. dictionaries and WordNets) to build uni-sense and multi-sense lexicons. Finally, we demonstrate how we used the reported grammatical and lexical resources to add support for Indo-Iranian languages in a few existing GF application grammars. These include the Phrasebook, the mathematics grammar library, and the Attempto controlled English grammar. Further, we give the experimental results of developing a wide-coverage grammar based arbitrary text translator using these resources. These applications show the importance of such linguistic resources, and open new doors for future research on these languages.},
}

@inproceedings{Caprotti-Olga2012-178183,
  author    = {Caprotti, Olga and Ranta, Aarne and Angelov, Krasimir and Enache, Ramona and Camilleri, John J. and Dannélls, Dana and Détrez, Grégoire and Hallgren, Thomas and Prasad, K. V. S. and Virk, Shafqat},
  title     = {High-quality translation: {Molto} tools and applications},
  booktitle = {The fourth Swedish Language Technology Conference (SLTC)},
  year      = {2012},
  abstract  = {MOLTO (Multilingual On Line Translation, FP7-ICT-247914, www.molto-project.eu) is a European project focusing on translation on the web. MOLTO targets translation that has production quality, that is, usable for quick and reliable dissemination of information. MOLTO’s main focus is to increase the productivity of such translation systems, building on the technology of GF (Grammatical Framework) and its Resource Grammar Library. But MOLTO also develops hybrid methods which increase the quality of Statistical Machine Translation (SMT) by adding linguistic information, or bootstrap grammatical models from statistical models. This paper gives a brief overview of MOLTO’s latest achievements, many of which are more thoroughly described in separate papers and available as web-based demos and as open-source software.},
}

@inproceedings{Prasad-K.V.S.2012-170274,
  author    = {Prasad, K. V. S. and Virk, Shafqat},
  title     = {Computational evidence that {Hindi} and {Urdu} share a grammar but not the lexicon},
  booktitle = {3rd Workshop on South and Southeast Asian Natural Language Processing (SANLP), collocated with COLING 2012},
  year      = {2012},
  abstract  = {Hindi and Urdu share a grammar and a basic vocabulary, but are often mutually unintelligible because they use different words in higher registers and sometimes even in quite ordinary situations. We report computational translation evidence of this unusual relationship (it differs from the usual pattern, that related languages share the advanced vocabulary and differ in the basics). We took a GF resource grammar for Urdu and adapted it mechanically for Hindi, changing essentially only the script (Urdu is written in Perso-Arabic, and Hindi in Devanagari) and the lexicon where needed. In evaluation, the Urdu grammar and its Hindi twin either both correctly translated an English sentence, or failed in exactly the same grammatical way, thus confirming computationally that Hindi and Urdu share a grammar. But the evaluation also found that the Hindi and Urdu lexicons differed in 18\% of the basic words, in 31\% of tourist phrases, and in 92\% of school mathematics terms.},
}

@book{Virk-Shafqat2012-170273,
  author    = {Virk, Shafqat},
  title     = {Computational Grammar Resources for {Indo-Iranian} Languages},
  publisher = {University of Gothenburg},
  address   = {Göteborg},
  year      = {2012},
}

@inproceedings{Virk-Shafqat2012-170271,
  author    = {Virk, Shafqat and Abolahrar, Elnaz},
  title     = {An Open Source {Persian} Computational Grammar},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)},
  year      = {2012},
  abstract  = {In this paper, we describe a multilingual open-source computational grammar of Persian, developed in Grammatical Framework (GF) – A type-theoretical grammar formalism. We discuss in detail the structure of different syntactic (i.e. noun phrases, verb phrases, adjectival phrases, etc.) categories of Persian. First, we show how to structure and construct these categories individually. Then we describe how they are glued together to make well-formed sentences in Persian, while maintaining the grammatical features such as agreement, word order, etc. We also show how some of the distinctive features of Persian, such as the ezafe construction, are implemented in GF. In order to evaluate the grammar’s correctness, and to demonstrate its usefulness, we have added support for Persian in a multilingual application grammar (the Tourist Phrasebook) using the reported resource grammar.},
}

@inproceedings{Virk-Shafqat2011-151566,
  author    = {Virk, Shafqat and Humayoun, Muhammad and Ranta, Aarne},
  title     = {An Open-Source {Punjabi} Resource Grammar},
  booktitle = {Proceedings of RANLP-2011, Recent Advances in Natural Language Processing, Hissar, Bulgaria, 12--14 September, 2011},
  year      = {2011},
  pages     = {70--76},
}

@inproceedings{Virk-Shafqat2010-131249,
  author    = {Virk, Shafqat and Humayoun, Muhammad and Ranta, Aarne},
  title     = {An Open Source {Urdu} Resource Grammar},
  booktitle = {Proceedings of the 8th Workshop on Asian Language Resources (Coling 2010 workshop)},
  year      = {2010},
}