% Conference papers published in 2020 (SLTC and LREC venues).
% Conventions used in this file:
%   - UTF-8 is written directly, so process it with a UTF-8 aware backend.
%   - Acronyms in titles are brace-protected so sentence-casing styles keep them.
%   - Proceedings editors are stored in the editor field, not in booktitle.

@inproceedings{wichmann-virk-2020-towards-298431,
  author    = {Wichmann, Søren and Virk, Shafqat},
  title     = {Towards a data-driven network of linguistic terms},
  booktitle = {Swedish Language Technology Conference (SLTC)},
  year      = {2020},
  abstract  = {Starting from close to 20,000 text documents from the literature of language descriptions, from documents either born digitally or scanned and OCR’d, we extract keywords and pass them through a pruning pipeline where mainly keywords that can be considered as belonging to linguistic terminology survive. Subsequently we quantify relations among those terms using Normalized Pointwise Mutual Information (NPMI) and use the resulting measures, in conjunction with the Google PageRank (GPR), to build networks of linguistic terms. Two uses of the work are envisaged: (1) developing a search machine adapted to the large DReaM corpus of linguistic descriptive literature and (2) getting insights into how a data-driven ontology of linguistic terminology might be built.},
}

@inproceedings{dannells-virk-2020-error-297714,
  author    = {Dannélls, Dana and Virk, Shafqat},
  title     = {{OCR} Error Detection on Historical Text Using Uni-Feature and Multi-Feature Based Machine Learning Models},
  booktitle = {Swedish Language Technology Conference (SLTC), 25–27 November 2020, University of Gothenburg},
  year      = {2020},
  abstract  = {Detecting errors that are caused by Optical Character Recognition (OCR) systems is a challenging task that has received much attention over the years. Recent work has explored machine learning methods using hand-crafted feature engineering, which, in addition to the difficulty in identifying the best feature combinations, is often very time and resources expensive. This raises the question: Do we always need many features to achieve better results? This is an open-ended question and its answer might depend on the task at hand. For OCR error detection, we experimented and found that interestingly a uni-feature based system conquered multi-feature based systems on a Swedish data set achieving state-of-the-art results, and performed equally well on an English dataset.
               We also experimented to find which machine learning algorithm is more suitable for the task at hand by comparing the performance of five well-known machine learning algorithms, namely Logistic regression, Decision Trees, Bernoulli Naive Bayes, Naive Bayes, and Support Vector Machines.},
}

@inproceedings{virk-etal-2020-dream-295338,
  author    = {Virk, Shafqat and Hammarström, Harald and Forsberg, Markus and Wichmann, Søren},
  title     = {The {DReaM} Corpus: A Multilingual Annotated Corpus of Grammars for the World’s Languages},
  booktitle = {Proceedings of the 12th Conference on Language Resources and Evaluation (LREC 2020), Marseille, 11–16 May 2020},
  editor    = {Calzolari, Nicoletta and Béchet, Frédéric and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, Hélène and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  year      = {2020},
  publisher = {European Language Resources Association},
  address   = {Paris},
  isbn      = {979-10-95546-34-4},
}

@inproceedings{virk-etal-2020-from-295339,
  author    = {Virk, Shafqat and Hammarström, Harald and Borin, Lars and Forsberg, Markus and Wichmann, Søren},
  title     = {From Linguistic Descriptions to Language Profiles},
  booktitle = {Proceedings of the 7th Workshop on Linked Data in Linguistics (LDL-2020). Language Resources and Evaluation Conference (LREC 2020), Marseille, 11–16 May 2020},
  editor    = {Ionov, Maxim and McCrae, John P. and Chiarcos, Christian and Declerck, Thierry and Bosque-Gil, Julia and Gracia, Jorge},
  year      = {2020},
  publisher = {European Language Resources Association},
  address   = {Paris},
  isbn      = {979-10-95546-36-8},
  abstract  = {Language catalogues and typological databases are two important types of resources containing different types of knowledge about the world’s natural languages. The former provide metadata such as number of speakers, location (in prose descriptions and/or GPS coordinates), language code, literacy, etc., while the latter contain information about a set of structural and functional attributes of languages. Given that both types of resources are developed and later maintained manually, there are practical limits as to the number of languages and the number of features that can be surveyed.
               We introduce the concept of a language profile, which is intended to be a structured representation of various types of knowledge about a natural language extracted semi-automatically from descriptive documents and stored at a central location. It has three major parts: (1) an introductory; (2) an attributive; and (3) a reference part, each containing different types of knowledge about a given natural language. As a case study, we develop and present a language profile of an example language. At this stage, a language profile is an independent entity, but in the future it is envisioned to become part of a network of language profiles connected to each other via various types of relations. Such a representation is expected to be suitable both for humans and machines to read and process for further deeper linguistic analyses and/or comparisons.},
}