@inProceedings{borin-etal-2016-towards-253952, title = {Towards a Big Data View on South Asian Linguistic Diversity}, abstract = {South Asia with its rich and diverse linguistic tapestry of hundreds of languages, including many from four major language families, and a long history of intensive language contact, provides rich empirical data for studies of linguistic genealogy, linguistic typology, and language contact. South Asia is often referred to as a linguistic area, a region where, due to close contact and widespread multilingualism, languages have influenced one another to the extent that both related and unrelated languages are more similar on many linguistic levels than we would expect. However, with some rare exceptions, most studies are largely impressionistic, drawing examples from a few languages. In this paper we present our ongoing work aiming at turning the linguistic material available in Grierson’s Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. In addition to this, we aim to contribute to the methodological development of large-scale comparative linguistics drawing on digital language resources, by exploring NLP techniques for extracting linguistic information from free-text language descriptions of the kind found in the LSI.}, booktitle = {WILDRE-3 – 3rd Workshop on Indian Language Data: Resources and Evaluation}, author = {Borin, Lars and Virk, Shafqat and Saxena, Anju}, year = {2016}, publisher = {ELRA}, address = {Paris}, }