@techreport{Borin-Lars2016-233768,
  title       = {A free cloud service for {OCR} / En fri molntjänst för {OCR}},
  author      = {Borin, Lars and Bouma, Gerlof and Dannélls, Dana},
  year        = {2016},
  institution = {University of Gothenburg},
  address     = {Göteborg},
}

@inproceedings{Tahmasebi-Nina2016-233899,
  title     = {{SWE-CLARIN} – the {Swedish} {CLARIN} project – aims and activities},
  booktitle = {Digital Humanities in the Nordic countries, Oslo, March 15-17 2016},
  author    = {Tahmasebi, Nina and Borin, Lars and Jordan, Caspar and Ekman, Stefan},
  year      = {2016},
  pages     = {122--123},
}

@inproceedings{Viklund-Jon2016-236738,
  title     = {How can big data help us study rhetorical history?},
  abstract  = {Rhetorical history is traditionally studied through rhetorical treatises or selected rhetorical practices, for example the speeches of major orators. Although valuable sources, these do not give us the answers to all our questions. Indeed, focus on a few canonical works or the major historical key figures might even lead us to reproduce cultural self-identifications and false generalizations. However, thanks to increasing availability of relevant digitized texts, we are now at a point where it is possible to see how new research questions can be formulated – and how old research questions can be addressed from a new angle or established results verified – on the basis of exhaustive collections of data, rather than small samples, but where a methodology has not yet established itself. The aim of this paper is twofold: (1) We wish to demonstrate the usefulness of large-scale corpus studies (“text mining”) in the field of rhetorical history, and hopefully point to some interesting research problems and how they can be analyzed using “big-data” methods. (2) In doing this, we also aim to make a contribution to method development in e-science for the humanities and social sciences, and in particular in the framework of CLARIN.},
  booktitle = {Linköping Electronic Conference Proceedings, No. 123. Edited by Koenraad De Smedt. Selected Papers from the CLARIN Annual Conference 2015. October 14–16, 2015, Wroclaw, Poland},
  author    = {Viklund, Jon and Borin, Lars},
  year      = {2016},
  volume    = {123},
  isbn      = {978-91-7685-765-6},
  pages     = {79--93},
}

@article{Rehm-Georg2016-237609,
  title    = {The strategic impact of {META-NET} on the regional, national and international level},
  abstract = {This article provides an overview of the dissemination work carried out in META-NET from 2010 until 2015; we describe its impact on the regional, national and international level, mainly with regard to politics and the funding situation for LT topics. The article documents the initiative’s work throughout Europe in order to boost progress and innovation in our field.},
  author   = {Rehm, Georg and Uszkoreit, Hans and Ananiadou, Sophia and Bel, Núria and Bielevičienė, Audronė and Borin, Lars and Branco, António and Budin, Gerhard and Calzolari, Nicoletta and Daelemans, Walter and Garabík, Radovan and Grobelnik, Marko and García-Mateo, Carmen and Genabith, Josef Van and Hajič, Jan and Hernáez, Inma and Judge, John and Koeva, Svetla and Krek, Simon and Krstev, Cvetana and Lindén, Krister and Magnini, Bernardo and Mariani, Joseph and Mcnaught, John and Melero, Maite and Monachini, Monica and Moreno, Asunción and Odijk, Jan and Ogrodniczuk, Maciej and Pęzik, Piotr and Piperidis, Stelios and Przepiórkowski, Adam and Rögnvaldsson, Eiríkur and Rosner, Mike and Pedersen, Bolette Sandford and Skadiņa, Inguna and De Smedt, Koenraad and Tadić, Marko and Thompson, Paul and Tufiş, Dan and Váradi, Tamás and Vasiļjevs, Andrejs and Vider, Kadri and Zabarskaitė, Jolanta},
  journal  = {Language Resources and Evaluation},
  year     = {2016},
  volume   = {50},
  number   = {2},
  pages    = {351--374},
}

@article{Adesam-Yvonne2016-237884,
  internal-note = {Required journal field is missing -- volume 76 / "Studier i svensk språkhistoria 13" suggests a series publication; TODO verify and add journal},
  title    = {Språkteknologi för svenska språket genom tiderna},
  abstract = {Språkbanken, the Swedish Language Bank, is a language technology research unit at the Department of Swedish, University of Gothenburg. We develop language resources – such as corpora, lexical resources, and analytical tools – for all variants of Swedish, from Old Swedish laws to present-day social media. Historical texts offer exciting theoretical and methodological challenges for language technology because they often defy the assumption inherent in most automatic analysis tools that the texts contain a standardized written language. In this article, we describe our ongoing work on the development of annotated historical corpora, as well as our efforts on linking various resources (both corpora and lexical resources). This research advances the state of the art of language technology as well as enables new research for scholars in other disciplines.},
  author   = {Adesam, Yvonne and Ahlberg, Malin and Andersson, Peter and Borin, Lars and Bouma, Gerlof and Forsberg, Markus},
  year     = {2016},
  volume   = {76},
  number   = {Studier i svensk språkhistoria 13},
  pages    = {65--87},
}

@inproceedings{Borin-Lars2016-238147,
  title     = {Towards interactive visualization of public discourse in time and space},
  abstract  = {We report on a proof-of-concept study where we (1) apply NLP tools for extracting political-discourse topics from a large Swedish Twitter dataset; and (2) design an interactive spatiotemporal visualization application allowing humanities and social-science scholars to explore how the tweet topics vary over space and time.},
  booktitle = {Linköping Electronic Conference Proceedings},
  author    = {Borin, Lars and Kosiński, Tomasz},
  year      = {2016},
  volume    = {126},
  isbn      = {978-91-7685-733-5},
  pages     = {1--7},
}

@misc{Eide-StianRødven2016-238134,
  internal-note = {Apparent duplicate of Eide-StianRødven2016-250073 (same title and authors) -- TODO verify and merge},
  title  = {The {Swedish} Culturomics Gigaword Corpus: A One Billion Word {Swedish} Reference Dataset for {NLP}},
  author = {Eide, Stian Rødven and Tahmasebi, Nina and Borin, Lars},
  year   = {2016},
  volume = {126},
  number = {002},
  isbn   = {978-91-7685-733-5},
  pages  = {8--12},
}

@incollection{Borin-Lars2016-246607,
  title     = {Lexikografi för maskiner och lexikografi för människor},
  booktitle = {Framtidens lexikografi: Rapport från ett symposium i Göteborg 5 oktober 2012},
  author    = {Borin, Lars},
  year      = {2016},
  publisher = {Meijerbergs institut vid Göteborgs universitet},
  address   = {Göteborg},
  isbn      = {978-91-87850-01-1},
  pages     = {9--27},
}

@inproceedings{Ahlberg-Malin2016-246072,
  title     = {{Karp}: {Språkbanken}’s Open Lexical Infrastructure},
  booktitle = {Globalex 2016},
  author    = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Olsson, Olof and Schumacher, Anne and Uppström, Jonatan},
  year      = {2016},
}

@inproceedings{Ahlberg-Malin2016-246063,
  title     = {{Språkbanken}’s Open Lexical Infrastructure},
  abstract  = {Karp is an open lexical infrastructure and a web based tool for searching, exploring and developing lexical resources. Språkbanken currently hosts a number of lexicons in Karp and on-going work aims at broadening the type of resources that can be developed in the system. This abstract gives a short overview of Karp's basic functionality, and describes some current projects and on-going work.},
  booktitle = {SLTC 2016. The Sixth Swedish Language Technology Conference. Umeå University, 17-18 November, 2016},
  author    = {Ahlberg, Malin and Borin, Lars and Forsberg, Markus and Olsson, Olof and Schumacher, Anne and Uppström, Jonatan},
  year      = {2016},
}

@inproceedings{Borin-Lars2016-246053,
  title     = {{Sparv}: {Språkbanken}’s corpus annotation pipeline infrastructure},
  abstract  = {Sparv is Språkbanken's corpus annotation pipeline infrastructure. The easiest way to use the pipeline is from its web interface with a plain text document. The pipeline uses in-house and external tools on the text to segment it into sentences and paragraphs, tokenise, tag parts-of-speech, look up in dictionaries and analyse compounds. The pipeline can also be run using a web API with XML results, and it is run locally at Språkbanken to prepare the documents in Korp, our corpus search tool. While the most sophisticated support is for modern Swedish, the pipeline supports 15 languages.},
  booktitle = {SLTC 2016. The Sixth Swedish Language Technology Conference, Umeå University, 17-18 November, 2016},
  author    = {Borin, Lars and Forsberg, Markus and Hammarstedt, Martin and Rosén, Dan and Schäfer, Roland and Schumacher, Anne},
  year      = {2016},
}

@misc{Volodina-Elena2016-248087,
  title    = {Preface. Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, Umeå, 16th November 2016},
  abstract = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) \& NLP for Language Acquisition (LA) – shorthand NLP4CALL\&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL\&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
  author   = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
  year     = {2016},
  number   = {130},
  pages    = {i--viii},
}

@misc{Volodina-Elena2016-248081,
  title     = {Proceedings of the joint workshop on {NLP} for Computer Assisted Language Learning and {NLP} for Language Acquisition at {SLTC}, Umeå, 16th November 2016},
  abstract  = {The joint workshop on Natural Language Processing (NLP) for Computer-Assisted Language Learning (CALL) \& NLP for Language Acquisition (LA) – shorthand NLP4CALL\&LA – is an effort to provide a debate space and collaboration between two closely related areas. Both focus on language acquisition, related resources and technologies, that can support research of the language learning process as well as aim to bring interdisciplinary advantage to the field. Individual workshop areas are outlined below. The area of NLP4CALL is applied in essence, where tools, algorithms, and ready-to-use programs play an important role. It has a traditional focus on second or foreign language learning, and the target age group of school children or older. The intersection of Natural Language Processing and Speech Technology, with Computer-Assisted Language Learning (CALL) brings “understanding” of language to CALL tools, thus making CALL intelligent. This fact has provided the name for this area of research – Intelligent CALL, ICALL. As the definition suggests, apart from having excellent knowledge of Natural Language Processing and/or Speech Technology, ICALL researchers need good insights into second language acquisition (SLA) theories and practices, second language assessment, as well as knowledge of L2 pedagogy and didactics. The workshop on Language Processing for Research in Language Acquisition (NLP4LA) broadens the scope of the joint workshop to also include theoretical, empirical, and experimental investigation of first, second and bilingual language acquisition. NLP4LA aims to foster collaboration between the NLP, linguistics, psychology and cognitive science communities. The workshop is targeted at anyone interested in the relevance of computational techniques for first, second and bilingual language acquisition. The joint workshop series on NLP4CALL\&LA has arisen in 2016 and has become a meeting place for researchers working on the integration of Natural Language Processing and Speech Technologies in systems supporting language learning and research around it, and exploring the theoretical and methodological issues arising during language acquisition.},
  author    = {Volodina, Elena and Grigonytė, Gintarė and Pilán, Ildikó and Nilsson Björkenstam, Kristina and Borin, Lars},
  year      = {2016},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-633-8},
}

@inproceedings{Eide-StianRødven2016-250073,
  title     = {The {Swedish} Culturomics Gigaword Corpus: A One Billion Word {Swedish} Reference Dataset for {NLP}},
  abstract  = {In this paper we present a dataset of contemporary Swedish containing one billion words. The dataset consists of a wide range of sources, all annotated using a state-of-the-art corpus annotation pipeline, and is intended to be a static and clearly versioned dataset. This will facilitate reproducibility of experiments across institutions and make it easier to compare NLP algorithms on contemporary Swedish. The dataset contains sentences from 1950 to 2015 and has been carefully designed to feature a good mix of genres balanced over each included decade. The sources include literary, journalistic, academic and legal texts, as well as blogs and web forum entries.},
  booktitle = {Linköping Electronic Conference Proceedings. Digital Humanities 2016. From Digitization to Knowledge 2016: Resources and Methods for Semantic Processing of Digital Works/Texts, July 11, 2016, Krakow, Poland},
  author    = {Eide, Stian Rødven and Tahmasebi, Nina and Borin, Lars},
  year      = {2016},
  publisher = {Linköping University Electronic Press},
  address   = {Linköping},
  isbn      = {978-91-7685-733-5},
}

@inproceedings{Borin-Lars2016-253952,
  title     = {Towards a Big Data View on {South Asian} Linguistic Diversity},
  abstract  = {South Asia with its rich and diverse linguistic tapestry of hundreds of languages, including many from four major language families, and a long history of intensive language contact, provides rich empirical data for studies of linguistic genealogy, linguistic typology, and language contact. South Asia is often referred to as a linguistic area, a region where, due to close contact and widespread multilingualism, languages have influenced one another to the extent that both related and unrelated languages are more similar on many linguistic levels than we would expect. However, with some rare exceptions, most studies are largely impressionistic, drawing examples from a few languages. In this paper we present our ongoing work aiming at turning the linguistic material available in Grierson’s Linguistic Survey of India (LSI) into a digital language resource, a database suitable for a broad array of linguistic investigations of the languages of South Asia. In addition to this, we aim to contribute to the methodological development of large-scale comparative linguistics drawing on digital language resources, by exploring NLP techniques for extracting linguistic information from free-text language descriptions of the kind found in the LSI.},
  booktitle = {WILDRE-3 – 3rd Workshop on Indian Language Data: Resources and Evaluation},
  author    = {Borin, Lars and Virk, Shafqat and Saxena, Anju},
  year      = {2016},
  publisher = {ELRA},
  address   = {Paris},
}

@inproceedings{Cap-Fabienne2016-254388,
  title     = {{SWORD}: Towards Cutting-Edge {Swedish} Word Processing},
  abstract  = {Despite many years of research on Swedish language technology, there is still no well-documented standard for Swedish word processing covering the whole spectrum from low-level tokenization to morphological analysis and disambiguation. SWORD is a new initiative within the SWE-CLARIN consortium aiming to develop documented standards for Swedish word processing. In this paper, we report on a pilot study of Swedish tokenization, where we compare the output of six different tokenizers on four different text types. For one text type (Wikipedia articles), we also compare to the tokenization produced by six manual annotators.},
  booktitle = {Proceedings of the Sixth Swedish Language Technology Conference (SLTC) Umeå University, 17-18 November, 2016},
  author    = {Cap, Fabienne and Adesam, Yvonne and Ahrenberg, Lars and Borin, Lars and Bouma, Gerlof and Forsberg, Markus and Kann, Viggo and Östling, Robert and Smith, Aaron and Wirén, Mats and Nivre, Joakim},
  year      = {2016},
}